llama_cpp 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +29 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +235 -39
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +340 -109
- data/ext/llama_cpp/src/ggml.h +44 -6
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +484 -136
- data/ext/llama_cpp/src/llama.h +39 -8
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -241,6 +241,13 @@ extern "C" {
         GGML_TYPE_Q5_1 = 7,
         GGML_TYPE_Q8_0 = 8,
         GGML_TYPE_Q8_1 = 9,
+        // k-quantizations
+        GGML_TYPE_Q2_K = 10,
+        GGML_TYPE_Q3_K = 11,
+        GGML_TYPE_Q4_K = 12,
+        GGML_TYPE_Q5_K = 13,
+        GGML_TYPE_Q6_K = 14,
+        GGML_TYPE_Q8_K = 15,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
@@ -249,8 +256,8 @@ extern "C" {
     enum ggml_backend {
         GGML_BACKEND_CPU = 0,
-        [removed line 252 — content not captured in this extract]
-        [removed line 253 — content not captured in this extract]
+        GGML_BACKEND_GPU = 10,
+        GGML_BACKEND_GPU_SPLIT = 20,
     };

     // model file types
@@ -264,6 +271,11 @@ extern "C" {
     GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
     GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
     GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
     };

     // available tensor operations:
@@ -375,7 +387,9 @@ extern "C" {
     char name[GGML_MAX_NAME];

-    [removed line 378 — content not captured in this extract]
+    void * extra; // extra things e.g. for ggml-cuda.cu
+
+    char padding[4];
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -413,6 +427,25 @@ extern "C" {
     bool no_alloc; // don't allocate memory for the tensor data
     };

+    // compute types
+    enum ggml_task_type {
+        GGML_TASK_INIT = 0,
+        GGML_TASK_COMPUTE,
+        GGML_TASK_FINALIZE,
+    };
+
+    struct ggml_compute_params {
+        enum ggml_task_type type;
+
+        // ith = thread index, nth = number of threads
+        int ith, nth;
+
+        // work buffer for all threads
+        size_t wsize;
+        void * wdata;
+    };
+
     // misc

     GGML_API void ggml_time_init(void); // call this once at the beginning of the program
@@ -424,8 +457,10 @@ extern "C" {
     GGML_API void ggml_print_object (const struct ggml_object * obj);
    GGML_API void ggml_print_objects(const struct ggml_context * ctx);

-    GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
-    GGML_API [remainder of removed line truncated in this extract]
+    GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
+    GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
+    GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

     GGML_API int    ggml_blck_size (enum ggml_type type);
     GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -441,13 +476,16 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

+    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);

     // main

     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
-    GGML_API void [remainder of removed line truncated in this extract]
+    GGML_API void ggml_free(struct ggml_context * ctx);

     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);