llama_cpp 0.12.2 → 0.12.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,12 +2,8 @@
  #define LLAMA_H

  #include "ggml.h"
- #ifdef GGML_USE_CUBLAS
- #include "ggml-cuda.h"
- #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
- #else
- #define LLAMA_MAX_DEVICES 1
- #endif // GGML_USE_CUBLAS
+ #include "ggml-backend.h"
+
  #include <stddef.h>
  #include <stdint.h>
  #include <stdio.h>
@@ -45,11 +41,6 @@
  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
  #define LLAMA_SESSION_VERSION 4

- #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
- // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
- #define LLAMA_SUPPORTS_GPU_OFFLOAD
- #endif
-
  #ifdef __cplusplus
  extern "C" {
  #endif
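The compile-time GPU macros LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD are removed in favor of the runtime queries declared later in this diff (llama_max_devices(), llama_supports_gpu_offload()). A minimal migration sketch in C; only the two llama.h calls are taken from this diff, the surrounding program is illustrative:

    #include "llama.h"
    #include <stdio.h>

    int main(void) {
        // previously: #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD ... #endif at compile time
        // now: ask the linked build at runtime
        if (llama_supports_gpu_offload()) {
            printf("GPU offload available across up to %zu devices\n", llama_max_devices());
        } else {
            printf("CPU-only build\n");
        }
        return 0;
    }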
@@ -106,6 +97,8 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors

  LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
  };
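Two quantization types are added to the llama_ftype enum. A hedged sketch of requesting the new IQ3_XXS format through the existing llama_model_quantize() API; the file names are placeholders:

    #include "llama.h"

    // Quantize an existing GGUF file to the new IQ3_XXS format.
    // llama_model_quantize() / llama_model_quantize_default_params() are existing
    // llama.h entry points; the input/output paths below are placeholders.
    uint32_t quantize_to_iq3_xxs(void) {
        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; // added in this release

        // returns 0 on success
        return llama_model_quantize("model-f16.gguf", "model-iq3_xxs.gguf", &qparams);
    }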
@@ -194,7 +187,7 @@ extern "C" {
  // LLAMA_SPLIT_LAYER: ignored
  int32_t main_gpu;

- // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+ // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
  const float * tensor_split;

  // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
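tensor_split is now documented against llama_max_devices() rather than the removed LLAMA_MAX_DEVICES macro, so the array length is only known at runtime. A sketch, assuming the caller keeps the array alive while the model is in use; the equal-share policy is just an example:

    #include "llama.h"
    #include <stdlib.h>

    // Build model params with tensor_split sized by the runtime device count.
    // llama_model_default_params() and the tensor_split field are existing
    // llama.h API; the even split here is illustrative.
    struct llama_model_params make_even_split_params(void) {
        struct llama_model_params mparams = llama_model_default_params();

        const size_t n_dev = llama_max_devices();
        float * split = calloc(n_dev, sizeof(float)); // caller frees after the model is released
        for (size_t i = 0; i < n_dev; ++i) {
            split[i] = 1.0f / (float) n_dev;
        }
        mparams.tensor_split = split;
        return mparams;
    }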
@@ -231,6 +224,9 @@ extern "C" {
  float yarn_beta_slow; // YaRN high correction dim
  uint32_t yarn_orig_ctx; // YaRN original context size

+ ggml_backend_sched_eval_callback cb_eval;
+ void * cb_eval_user_data;
+
  enum ggml_type type_k; // data type for K cache
  enum ggml_type type_v; // data type for V cache

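llama_context_params gains an eval callback that is forwarded to the ggml backend scheduler. A sketch of a logging observer, assuming the ggml_backend_sched_eval_callback signature from ggml-backend.h, bool (*)(struct ggml_tensor * t, bool ask, void * user_data), where ask == true asks whether a node should be observed and ask == false delivers the computed node:

    #include "llama.h"
    #include <stdio.h>

    // Assumed callback shape (see above): return true when ask == true to
    // observe a node; when called again with ask == false, the node has been
    // evaluated and can be inspected. Returning false aborts the graph compute.
    static bool log_eval(struct ggml_tensor * t, bool ask, void * user_data) {
        (void) user_data;
        if (ask) {
            return true; // observe every node
        }
        fprintf(stderr, "evaluated node %s (op %s)\n", t->name, ggml_op_name(t->op));
        return true;
    }

    // wiring it up (illustrative):
    //   struct llama_context_params cparams = llama_context_default_params();
    //   cparams.cb_eval           = log_eval;
    //   cparams.cb_eval_user_data = NULL;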
@@ -328,9 +324,14 @@ extern "C" {

  LLAMA_API int64_t llama_time_us(void);

- LLAMA_API int32_t llama_max_devices(void);
- LLAMA_API bool llama_mmap_supported (void);
- LLAMA_API bool llama_mlock_supported(void);
+ LLAMA_API size_t llama_max_devices(void);
+
+ LLAMA_API bool llama_supports_mmap (void);
+ LLAMA_API bool llama_supports_mlock (void);
+ LLAMA_API bool llama_supports_gpu_offload(void);
+
+ LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
+ LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");

  LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);

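llama_max_devices() now returns size_t, and the mmap/mlock capability helpers are renamed; the old names remain as deprecated wrappers. A small migration sketch; use_mmap/use_mlock are the existing llama_model_params fields, the want_* flags stand in for user options:

    #include "llama.h"

    // Gate the mmap/mlock options on the new runtime queries instead of the
    // deprecated helpers.
    void apply_io_options(struct llama_model_params * mparams, bool want_mmap, bool want_mlock) {
        mparams->use_mmap  = want_mmap  && llama_supports_mmap();   // was llama_mmap_supported()
        mparams->use_mlock = want_mlock && llama_supports_mlock();  // was llama_mlock_supported()
    }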
@@ -770,6 +771,14 @@ extern "C" {
  float p,
  size_t min_keep);

+ /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
+ LLAMA_API void llama_sample_entropy(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates_p,
+ float min_temp,
+ float max_temp,
+ float exponent_val);
+
  LLAMA_API void llama_sample_temp(
  struct llama_context * ctx,
  llama_token_data_array * candidates,
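llama_sample_entropy() implements entropy-based dynamic temperature scaling. A hedged sketch of slotting it into the usual candidate-array flow; llama_get_logits(), llama_n_vocab(), and llama_sample_token() are pre-existing llama.h calls, and the temperature range and exponent are example values:

    #include "llama.h"
    #include <stdlib.h>

    // Build the candidates array from the latest logits, rescale with the
    // dynamic-temperature sampler, then pick a token. ctx/model come from the
    // usual setup; 0.1..1.8 and exponent 1.0 are arbitrary example parameters.
    llama_token sample_with_dynatemp(struct llama_context * ctx, const struct llama_model * model) {
        const float * logits  = llama_get_logits(ctx);
        const int32_t n_vocab = llama_n_vocab(model);

        llama_token_data * cand = malloc((size_t) n_vocab * sizeof(llama_token_data));
        for (llama_token id = 0; id < n_vocab; ++id) {
            cand[id].id    = id;
            cand[id].logit = logits[id];
            cand[id].p     = 0.0f;
        }
        llama_token_data_array cand_p = { cand, (size_t) n_vocab, false };

        llama_sample_entropy(ctx, &cand_p, 0.1f, 1.8f, 1.0f); // min_temp, max_temp, exponent_val

        const llama_token picked = llama_sample_token(ctx, &cand_p);
        free(cand);
        return picked;
    }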
@@ -2,8 +2,9 @@

  #include <cassert>
  #include <stdexcept>
- #include <vector>
+ #include <string>
  #include <unordered_map>
+ #include <vector>

  static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
  {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.12.2
+ version: 0.12.4
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2024-01-20 00:00:00.000000000 Z
+ date: 2024-02-03 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -45,6 +45,8 @@ files:
  - vendor/tmp/llama.cpp/ggml-cuda.cu
  - vendor/tmp/llama.cpp/ggml-cuda.h
  - vendor/tmp/llama.cpp/ggml-impl.h
+ - vendor/tmp/llama.cpp/ggml-kompute.cpp
+ - vendor/tmp/llama.cpp/ggml-kompute.h
  - vendor/tmp/llama.cpp/ggml-metal.h
  - vendor/tmp/llama.cpp/ggml-metal.m
  - vendor/tmp/llama.cpp/ggml-metal.metal
@@ -54,6 +56,11 @@ files:
  - vendor/tmp/llama.cpp/ggml-opencl.h
  - vendor/tmp/llama.cpp/ggml-quants.c
  - vendor/tmp/llama.cpp/ggml-quants.h
+ - vendor/tmp/llama.cpp/ggml-sycl.cpp
+ - vendor/tmp/llama.cpp/ggml-sycl.h
+ - vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp
+ - vendor/tmp/llama.cpp/ggml-vulkan.cpp
+ - vendor/tmp/llama.cpp/ggml-vulkan.h
  - vendor/tmp/llama.cpp/ggml.c
  - vendor/tmp/llama.cpp/ggml.h
  - vendor/tmp/llama.cpp/llama.cpp
@@ -84,7 +91,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.5.3
+ rubygems_version: 3.4.19
  signing_key:
  specification_version: 4
  summary: Ruby bindings for the llama.cpp.