llama_cpp 0.12.2 → 0.12.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +68 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -2
- data/vendor/tmp/llama.cpp/Makefile +25 -3
- data/vendor/tmp/llama.cpp/ggml-alloc.c +87 -27
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +176 -18
- data/vendor/tmp/llama.cpp/ggml-backend.h +14 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1990 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.h +46 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +144 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +303 -4
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +95 -3
- data/vendor/tmp/llama.cpp/ggml-opencl.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +736 -59
- data/vendor/tmp/llama.cpp/ggml-quants.h +20 -1
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +15255 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.h +29 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +60854 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5270 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +34 -0
- data/vendor/tmp/llama.cpp/ggml.c +664 -117
- data/vendor/tmp/llama.cpp/ggml.h +46 -11
- data/vendor/tmp/llama.cpp/llama.cpp +1426 -341
- data/vendor/tmp/llama.cpp/llama.h +24 -15
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +10 -3
data/vendor/tmp/llama.cpp/llama.h CHANGED
@@ -2,12 +2,8 @@
 #define LLAMA_H
 
 #include "ggml.h"
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
-#else
-#define LLAMA_MAX_DEVICES 1
-#endif // GGML_USE_CUBLAS
+#include "ggml-backend.h"
+
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -45,11 +41,6 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 4
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
-// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
-#define LLAMA_SUPPORTS_GPU_OFFLOAD
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -106,6 +97,8 @@ extern "C" {
     LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
 
     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 };
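The two new file types (LLAMA_FTYPE_MOSTLY_Q3_K_XS and LLAMA_FTYPE_MOSTLY_IQ3_XXS) plug into the existing quantization entry point. A minimal sketch, assuming the long-standing llama_model_quantize() / llama_model_quantize_default_params() API and using placeholder file names:

```c
#include "llama.h"

int main(void) {
    // Start from library defaults, then request the IQ3_XXS scheme added in this release.
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS;

    // File names are placeholders for illustration only.
    uint32_t rc = llama_model_quantize("model-f16.gguf", "model-iq3_xxs.gguf", &qparams);
    return rc == 0 ? 0 : 1;
}
```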
@@ -194,7 +187,7 @@
     // LLAMA_SPLIT_LAYER: ignored
     int32_t main_gpu;
 
-    // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+    // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
     const float * tensor_split;
 
     // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
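Because the tensor_split comment now refers to the runtime llama_max_devices() (introduced later in this diff) rather than the old compile-time LLAMA_MAX_DEVICES, split arrays can be sized at run time. A small illustrative sketch under that assumption; the even split and n_gpu_layers value are arbitrary:

```c
#include "llama.h"

struct llama_model * load_with_even_split(const char * path) {
    struct llama_model_params mparams = llama_model_default_params();

    // One entry per device; llama_max_devices() replaces the old LLAMA_MAX_DEVICES macro.
    static float split[16] = {0};           // static so it outlives this call; 16 is an arbitrary cap
    size_t n_dev = llama_max_devices();
    if (n_dev > 16) n_dev = 16;
    for (size_t i = 0; i < n_dev; ++i) {
        split[i] = 1.0f / (float) n_dev;    // even split across devices, purely illustrative
    }

    mparams.n_gpu_layers = 99;              // offload as many layers as possible
    mparams.tensor_split = split;

    return llama_load_model_from_file(path, mparams);
}
```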
@@ -231,6 +224,9 @@
     float yarn_beta_slow; // YaRN high correction dim
     uint32_t yarn_orig_ctx; // YaRN original context size
 
+    ggml_backend_sched_eval_callback cb_eval;
+    void * cb_eval_user_data;
+
     enum ggml_type type_k; // data type for K cache
     enum ggml_type type_v; // data type for V cache
 
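The new cb_eval / cb_eval_user_data context parameters expose the backend scheduler's evaluation callback to the caller (upstream uses this for imatrix-style tensor inspection). A hedged sketch, assuming ggml_backend_sched_eval_callback has the (tensor, ask, user_data) shape declared in ggml-backend.h:

```c
#include <stdbool.h>
#include <stdio.h>
#include "llama.h"

// Assumed callback contract: called with ask=true to ask whether this tensor should be
// observed, then with ask=false once its data has been computed.
static bool log_tensor_cb(struct ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data;
    if (ask) {
        return true;                       // observe every tensor
    }
    fprintf(stderr, "evaluated %s\n", t->name);
    return true;                           // returning false would stop evaluation
}

struct llama_context * new_ctx_with_eval_cb(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.cb_eval           = log_tensor_cb;
    cparams.cb_eval_user_data = NULL;
    return llama_new_context_with_model(model, cparams);
}
```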
@@ -328,9 +324,14 @@
 
     LLAMA_API int64_t llama_time_us(void);
 
-    LLAMA_API int  llama_max_devices(void);
-    LLAMA_API bool llama_mmap_supported (void);
-    LLAMA_API bool llama_mlock_supported(void);
+    LLAMA_API size_t llama_max_devices(void);
+
+    LLAMA_API bool llama_supports_mmap (void);
+    LLAMA_API bool llama_supports_mlock (void);
+    LLAMA_API bool llama_supports_gpu_offload(void);
+
+    LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
+    LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
 
     LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 
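With the compile-time LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD macros gone, capability checks become plain function calls and need no #ifdef guards. A minimal sketch using only the functions declared above:

```c
#include <stdio.h>
#include "llama.h"

int main(void) {
    printf("max devices : %zu\n", llama_max_devices());
    printf("mmap        : %s\n", llama_supports_mmap()        ? "yes" : "no");
    printf("mlock       : %s\n", llama_supports_mlock()       ? "yes" : "no");
    printf("GPU offload : %s\n", llama_supports_gpu_offload() ? "yes" : "no");
    return 0;
}
```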
@@ -770,6 +771,14 @@
             float p,
             size_t min_keep);
 
+    /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
+    LLAMA_API void llama_sample_entropy(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates_p,
+            float min_temp,
+            float max_temp,
+            float exponent_val);
+
     LLAMA_API void llama_sample_temp(
             struct llama_context * ctx,
             llama_token_data_array * candidates,
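llama_sample_entropy implements the dynamic-temperature scheme from the linked paper: the effective temperature is chosen between min_temp and max_temp according to the entropy of the candidate distribution, with exponent_val shaping the mapping. A hedged usage sketch, assuming the llama_token_data_array helpers found elsewhere in llama.h (llama_n_vocab, llama_get_logits_ith, llama_sample_token); the temperature range shown is arbitrary:

```c
#include <stdlib.h>
#include "llama.h"

// Sample one token from the logits at position idx using dynamic temperature.
static llama_token sample_dyntemp(struct llama_context * ctx, const struct llama_model * model, int32_t idx) {
    const int32_t n_vocab = llama_n_vocab(model);
    const float * logits  = llama_get_logits_ith(ctx, idx);

    // Build the candidate array expected by the sampling API.
    llama_token_data * cand = malloc(n_vocab * sizeof(llama_token_data));
    for (int32_t i = 0; i < n_vocab; ++i) {
        cand[i] = (llama_token_data){ i, logits[i], 0.0f };
    }
    llama_token_data_array arr = { cand, (size_t) n_vocab, false };

    // Scale temperature between 0.1 and 1.8 based on candidate entropy (illustrative values).
    llama_sample_entropy(ctx, &arr, 0.1f, 1.8f, 1.0f);

    llama_token tok = llama_sample_token(ctx, &arr);
    free(cand);
    return tok;
}
```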
data/vendor/tmp/llama.cpp/unicode.h CHANGED
@@ -2,8 +2,9 @@
 
 #include <cassert>
 #include <stdexcept>
-#include <vector>
+#include <string>
 #include <unordered_map>
+#include <vector>
 
 static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
 {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.12.2
+  version: 0.12.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-
+date: 2024-02-03 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -45,6 +45,8 @@ files:
 - vendor/tmp/llama.cpp/ggml-cuda.cu
 - vendor/tmp/llama.cpp/ggml-cuda.h
 - vendor/tmp/llama.cpp/ggml-impl.h
+- vendor/tmp/llama.cpp/ggml-kompute.cpp
+- vendor/tmp/llama.cpp/ggml-kompute.h
 - vendor/tmp/llama.cpp/ggml-metal.h
 - vendor/tmp/llama.cpp/ggml-metal.m
 - vendor/tmp/llama.cpp/ggml-metal.metal
@@ -54,6 +56,11 @@ files:
 - vendor/tmp/llama.cpp/ggml-opencl.h
 - vendor/tmp/llama.cpp/ggml-quants.c
 - vendor/tmp/llama.cpp/ggml-quants.h
+- vendor/tmp/llama.cpp/ggml-sycl.cpp
+- vendor/tmp/llama.cpp/ggml-sycl.h
+- vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp
+- vendor/tmp/llama.cpp/ggml-vulkan.cpp
+- vendor/tmp/llama.cpp/ggml-vulkan.h
 - vendor/tmp/llama.cpp/ggml.c
 - vendor/tmp/llama.cpp/ggml.h
 - vendor/tmp/llama.cpp/llama.cpp
@@ -84,7 +91,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.
+rubygems_version: 3.4.19
 signing_key:
 specification_version: 4
 summary: Ruby bindings for the llama.cpp.