llama_cpp 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +29 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +235 -39
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +340 -109
- data/ext/llama_cpp/src/ggml.h +44 -6
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +484 -136
- data/ext/llama_cpp/src/llama.h +39 -8
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -241,6 +241,13 @@ extern "C" {
|
|
241
241
|
GGML_TYPE_Q5_1 = 7,
|
242
242
|
GGML_TYPE_Q8_0 = 8,
|
243
243
|
GGML_TYPE_Q8_1 = 9,
|
244
|
+
// k-quantizations
|
245
|
+
GGML_TYPE_Q2_K = 10,
|
246
|
+
GGML_TYPE_Q3_K = 11,
|
247
|
+
GGML_TYPE_Q4_K = 12,
|
248
|
+
GGML_TYPE_Q5_K = 13,
|
249
|
+
GGML_TYPE_Q6_K = 14,
|
250
|
+
GGML_TYPE_Q8_K = 15,
|
244
251
|
GGML_TYPE_I8,
|
245
252
|
GGML_TYPE_I16,
|
246
253
|
GGML_TYPE_I32,
|
@@ -249,8 +256,8 @@ extern "C" {
|
|
249
256
|
|
250
257
|
enum ggml_backend {
|
251
258
|
GGML_BACKEND_CPU = 0,
|
252
|
-
|
253
|
-
|
259
|
+
GGML_BACKEND_GPU = 10,
|
260
|
+
GGML_BACKEND_GPU_SPLIT = 20,
|
254
261
|
};
|
255
262
|
|
256
263
|
// model file types
|
@@ -264,6 +271,11 @@ extern "C" {
|
|
264
271
|
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
265
272
|
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
266
273
|
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
274
|
+
GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
|
275
|
+
GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
|
276
|
+
GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
|
277
|
+
GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
|
278
|
+
GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
|
267
279
|
};
|
268
280
|
|
269
281
|
// available tensor operations:
|
@@ -375,7 +387,9 @@ extern "C" {
|
|
375
387
|
|
376
388
|
char name[GGML_MAX_NAME];
|
377
389
|
|
378
|
-
|
390
|
+
void * extra; // extra things e.g. for ggml-cuda.cu
|
391
|
+
|
392
|
+
char padding[4];
|
379
393
|
};
|
380
394
|
|
381
395
|
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
@@ -413,6 +427,25 @@ extern "C" {
|
|
413
427
|
bool no_alloc; // don't allocate memory for the tensor data
|
414
428
|
};
|
415
429
|
|
430
|
+
|
431
|
+
// compute types
|
432
|
+
enum ggml_task_type {
|
433
|
+
GGML_TASK_INIT = 0,
|
434
|
+
GGML_TASK_COMPUTE,
|
435
|
+
GGML_TASK_FINALIZE,
|
436
|
+
};
|
437
|
+
|
438
|
+
struct ggml_compute_params {
|
439
|
+
enum ggml_task_type type;
|
440
|
+
|
441
|
+
// ith = thread index, nth = number of threads
|
442
|
+
int ith, nth;
|
443
|
+
|
444
|
+
// work buffer for all threads
|
445
|
+
size_t wsize;
|
446
|
+
void * wdata;
|
447
|
+
};
|
448
|
+
|
416
449
|
// misc
|
417
450
|
|
418
451
|
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
|
@@ -424,8 +457,10 @@ extern "C" {
|
|
424
457
|
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
425
458
|
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
426
459
|
|
427
|
-
GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
|
428
|
-
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
460
|
+
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
461
|
+
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
462
|
+
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
463
|
+
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
|
429
464
|
|
430
465
|
GGML_API int ggml_blck_size (enum ggml_type type);
|
431
466
|
GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
@@ -441,13 +476,16 @@ extern "C" {
|
|
441
476
|
// TODO: temporary until model loading of ggml examples is refactored
|
442
477
|
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
443
478
|
|
479
|
+
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
480
|
+
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
481
|
+
|
444
482
|
// use this to compute the memory overhead of a tensor
|
445
483
|
GGML_API size_t ggml_tensor_overhead(void);
|
446
484
|
|
447
485
|
// main
|
448
486
|
|
449
487
|
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
450
|
-
GGML_API void ggml_free(struct ggml_context * ctx);
|
488
|
+
GGML_API void ggml_free(struct ggml_context * ctx);
|
451
489
|
|
452
490
|
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
|
453
491
|
|