llama_cpp 0.12.4 → 0.12.6
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/llama_cpp.cpp +46 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +146 -53
- data/vendor/tmp/llama.cpp/ggml-alloc.c +563 -490
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +250 -262
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +688 -270
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +2 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +386 -134
- data/vendor/tmp/llama.cpp/ggml-quants.h +68 -59
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +139 -145
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1516 -10656
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1777 -1238
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +14 -9
- data/vendor/tmp/llama.cpp/ggml.c +147 -70
- data/vendor/tmp/llama.cpp/ggml.h +26 -6
- data/vendor/tmp/llama.cpp/llama.cpp +920 -173
- data/vendor/tmp/llama.cpp/llama.h +7 -1
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
@@ -191,70 +191,74 @@ typedef struct {
|
|
191
191
|
} block_iq3_xxs;
|
192
192
|
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
193
193
|
|
194
|
+
#ifdef __cplusplus
|
195
|
+
extern "C" {
|
196
|
+
#endif
|
197
|
+
|
194
198
|
// Quantization
|
195
|
-
void quantize_row_q4_0_reference(const float *
|
196
|
-
void quantize_row_q4_1_reference(const float *
|
197
|
-
void quantize_row_q5_0_reference(const float *
|
198
|
-
void quantize_row_q5_1_reference(const float *
|
199
|
-
void quantize_row_q8_0_reference(const float *
|
200
|
-
void quantize_row_q8_1_reference(const float *
|
201
|
-
|
202
|
-
void quantize_row_q2_K_reference(const float *
|
203
|
-
void quantize_row_q3_K_reference(const float *
|
204
|
-
void quantize_row_q4_K_reference(const float *
|
205
|
-
void quantize_row_q5_K_reference(const float *
|
206
|
-
void quantize_row_q6_K_reference(const float *
|
207
|
-
void quantize_row_q8_K_reference(const float *
|
208
|
-
void quantize_row_iq3_xxs_reference(const float *
|
209
|
-
|
210
|
-
void quantize_row_q4_0(const float *
|
211
|
-
void quantize_row_q4_1(const float *
|
212
|
-
void quantize_row_q5_0(const float *
|
213
|
-
void quantize_row_q5_1(const float *
|
214
|
-
void quantize_row_q8_0(const float *
|
215
|
-
void quantize_row_q8_1(const float *
|
216
|
-
|
217
|
-
void quantize_row_q2_K(const float *
|
218
|
-
void quantize_row_q3_K(const float *
|
219
|
-
void quantize_row_q4_K(const float *
|
220
|
-
void quantize_row_q5_K(const float *
|
221
|
-
void quantize_row_q6_K(const float *
|
222
|
-
void quantize_row_q8_K(const float *
|
223
|
-
void quantize_row_iq3_xxs(const float *
|
199
|
+
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
|
200
|
+
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
|
201
|
+
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
|
202
|
+
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
|
203
|
+
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
|
204
|
+
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
|
205
|
+
|
206
|
+
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
|
207
|
+
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
|
208
|
+
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
|
209
|
+
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
|
210
|
+
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
|
211
|
+
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
|
212
|
+
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
213
|
+
|
214
|
+
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
215
|
+
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
216
|
+
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
217
|
+
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
218
|
+
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
219
|
+
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
220
|
+
|
221
|
+
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
222
|
+
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
223
|
+
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
224
|
+
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
225
|
+
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
226
|
+
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
227
|
+
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
224
228
|
|
225
229
|
// Dequantization
|
226
|
-
void dequantize_row_q4_0(const block_q4_0 *
|
227
|
-
void dequantize_row_q4_1(const block_q4_1 *
|
228
|
-
void dequantize_row_q5_0(const block_q5_0 *
|
229
|
-
void dequantize_row_q5_1(const block_q5_1 *
|
230
|
-
void dequantize_row_q8_0(const block_q8_0 *
|
231
|
-
//void dequantize_row_q8_1(const block_q8_1 *
|
232
|
-
|
233
|
-
void dequantize_row_q2_K(const block_q2_K *
|
234
|
-
void dequantize_row_q3_K(const block_q3_K *
|
235
|
-
void dequantize_row_q4_K(const block_q4_K *
|
236
|
-
void dequantize_row_q5_K(const block_q5_K *
|
237
|
-
void dequantize_row_q6_K(const block_q6_K *
|
238
|
-
void dequantize_row_q8_K(const block_q8_K *
|
239
|
-
void dequantize_row_iq2_xxs(const block_iq2_xxs *
|
240
|
-
void dequantize_row_iq2_xs (const block_iq2_xs *
|
241
|
-
void dequantize_row_iq3_xxs(const block_iq3_xxs *
|
230
|
+
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
231
|
+
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
232
|
+
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
233
|
+
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
234
|
+
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
235
|
+
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
236
|
+
|
237
|
+
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
238
|
+
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
239
|
+
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
240
|
+
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
241
|
+
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
242
|
+
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
243
|
+
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
244
|
+
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
245
|
+
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
242
246
|
|
243
247
|
// Dot product
|
244
|
-
void ggml_vec_dot_q4_0_q8_0(int n, float *
|
245
|
-
void ggml_vec_dot_q4_1_q8_1(int n, float *
|
246
|
-
void ggml_vec_dot_q5_0_q8_0(int n, float *
|
247
|
-
void ggml_vec_dot_q5_1_q8_1(int n, float *
|
248
|
-
void ggml_vec_dot_q8_0_q8_0(int n, float *
|
249
|
-
|
250
|
-
void ggml_vec_dot_q2_K_q8_K(int n, float *
|
251
|
-
void ggml_vec_dot_q3_K_q8_K(int n, float *
|
252
|
-
void ggml_vec_dot_q4_K_q8_K(int n, float *
|
253
|
-
void ggml_vec_dot_q5_K_q8_K(int n, float *
|
254
|
-
void ggml_vec_dot_q6_K_q8_K(int n, float *
|
255
|
-
void ggml_vec_dot_iq2_xxs_q8_K(int n, float *
|
256
|
-
void ggml_vec_dot_iq2_xs_q8_K (int n, float *
|
257
|
-
void ggml_vec_dot_iq3_xxs_q8_K(int n, float *
|
248
|
+
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
249
|
+
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
250
|
+
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
251
|
+
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
252
|
+
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
253
|
+
|
254
|
+
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
255
|
+
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
256
|
+
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
257
|
+
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
258
|
+
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
259
|
+
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
260
|
+
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
261
|
+
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
258
262
|
|
259
263
|
//
|
260
264
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
@@ -276,3 +280,8 @@ void iq2xs_init_impl(int grid_size);
|
|
276
280
|
void iq2xs_free_impl(int grid_size);
|
277
281
|
void iq3xs_init_impl(int grid_size);
|
278
282
|
void iq3xs_free_impl(int grid_size);
|
283
|
+
|
284
|
+
#ifdef __cplusplus
|
285
|
+
}
|
286
|
+
#endif
|
287
|
+
|