llama_cpp 0.1.3 → 0.2.0
This diff compares publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +39 -8
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +242 -52
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +835 -82
- data/ext/llama_cpp/src/ggml.h +64 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +489 -134
- data/ext/llama_cpp/src/llama.h +43 -7
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
```diff
@@ -198,6 +198,7 @@
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_OPT           4
+#define GGML_MAX_NAME          32
 #define GGML_DEFAULT_N_THREADS 4
 
 #define GGML_ASSERT(x) \
```
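The new `GGML_MAX_NAME` constant replaces the hard-coded 32-byte name buffer in `struct ggml_tensor` (see the tensor-struct hunk further down). A minimal sketch of tagging a tensor, assuming the `ggml_set_name`/`ggml_get_name` helpers that ggml ships alongside this constant:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 16 * 1024 * 1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    // names longer than GGML_MAX_NAME - 1 bytes get truncated to fit
    // the fixed-size name buffer in struct ggml_tensor
    ggml_set_name(t, "embeddings.weight");
    printf("%s\n", ggml_get_name(t));

    ggml_free(ctx);
    return 0;
}
```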
```diff
@@ -240,6 +241,13 @@ extern "C" {
         GGML_TYPE_Q5_1 = 7,
         GGML_TYPE_Q8_0 = 8,
         GGML_TYPE_Q8_1 = 9,
+        // k-quantizations
+        GGML_TYPE_Q2_K = 10,
+        GGML_TYPE_Q3_K = 11,
+        GGML_TYPE_Q4_K = 12,
+        GGML_TYPE_Q5_K = 13,
+        GGML_TYPE_Q6_K = 14,
+        GGML_TYPE_Q8_K = 15,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
```
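Version 0.2.0 bundles the new k-quants (`k_quants.c`/`k_quants.h` in the file list above), and these enum values give them stable type IDs. A quick sketch that prints each new type's block layout via `ggml_blck_size`/`ggml_type_size`, both declared further down in this header; that the Q*_K types use 256-element super-blocks is a property of `k_quants.c`, not something this header guarantees:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    const enum ggml_type kq[] = {
        GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_K,
        GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_K,
    };
    for (size_t i = 0; i < sizeof(kq)/sizeof(kq[0]); ++i) {
        // elements per block and bytes per block for each k-quant type
        printf("%-6s blck=%3d bytes/blck=%zu\n",
               ggml_type_name(kq[i]), ggml_blck_size(kq[i]), ggml_type_size(kq[i]));
    }
    return 0;
}
```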
```diff
@@ -248,8 +256,8 @@ extern "C" {
 
     enum ggml_backend {
         GGML_BACKEND_CPU = 0,
-        GGML_BACKEND_CUDA = 1,
-        GGML_BACKEND_CL = 2,
+        GGML_BACKEND_GPU = 10,
+        GGML_BACKEND_GPU_SPLIT = 20,
     };
 
     // model file types
```
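The backend tag is now API-agnostic: the CUDA- and OpenCL-specific values give way to a generic `GGML_BACKEND_GPU` plus `GGML_BACKEND_GPU_SPLIT`. A sketch that branches on the tag, assuming the `backend` field of `struct ggml_tensor` and reading `GPU_SPLIT` as "rows split across devices", a semantic that comes from `ggml-cuda.cu` rather than this header:

```c
#include "ggml.h"

// Illustrative only: map a tensor's coarse backend tag to a label.
static const char * backend_str(const struct ggml_tensor * t) {
    switch (t->backend) {
        case GGML_BACKEND_CPU:       return "cpu";
        case GGML_BACKEND_GPU:       return "gpu";
        case GGML_BACKEND_GPU_SPLIT: return "gpu-split"; // rows split across devices (assumption)
        default:                     return "unknown";
    }
}
```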
```diff
@@ -263,6 +271,11 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
     };
 
     // available tensor operations:
```
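The file-type enum grows matching `MOSTLY_Q*_K` entries, and `ggml_ftype_to_ggml_type` (declared later in this header) maps each one to the tensor type used for all but the 1d tensors. A small sketch of that mapping:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    const enum ggml_ftype f[] = {
        GGML_FTYPE_MOSTLY_Q2_K, GGML_FTYPE_MOSTLY_Q3_K, GGML_FTYPE_MOSTLY_Q4_K,
        GGML_FTYPE_MOSTLY_Q5_K, GGML_FTYPE_MOSTLY_Q6_K,
    };
    for (size_t i = 0; i < sizeof(f)/sizeof(f[0]); ++i) {
        // file type -> tensor type used for the non-1d tensors
        printf("ftype %d -> %s\n", (int) f[i],
               ggml_type_name(ggml_ftype_to_ggml_type(f[i])));
    }
    return 0;
}
```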
```diff
@@ -372,11 +385,15 @@ extern "C" {
 
         void * data;
 
-        char name[32];
+        char name[GGML_MAX_NAME];
 
-        char padding[16];
+        void * extra; // extra things e.g. for ggml-cuda.cu
+
+        char padding[4];
     };
 
+    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
```
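`struct ggml_tensor` picks up an `extra` pointer for backend-specific data, and `GGML_TENSOR_SIZE` exposes the struct size so callers can budget context memory up front. A sketch of sizing a context for `n_tensors` f32 tensors, assuming `ggml_tensor_overhead()` (declared below) covers the per-tensor metadata:

```c
#include "ggml.h"

// Rough context budget: per-tensor metadata overhead plus the data
// payload of n_tensors f32 tensors of n_elems elements each.
static size_t estimate_ctx_size(int n_tensors, int64_t n_elems) {
    return (size_t) n_tensors * (ggml_tensor_overhead() + (size_t) n_elems * sizeof(float));
}
```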
```diff
@@ -410,6 +427,25 @@ extern "C" {
         bool   no_alloc;   // don't allocate memory for the tensor data
     };
 
+
+    // compute types
+    enum ggml_task_type {
+        GGML_TASK_INIT = 0,
+        GGML_TASK_COMPUTE,
+        GGML_TASK_FINALIZE,
+    };
+
+    struct ggml_compute_params {
+        enum ggml_task_type type;
+
+        // ith = thread index, nth = number of threads
+        int ith, nth;
+
+        // work buffer for all threads
+        size_t wsize;
+        void * wdata;
+    };
+
     // misc
 
     GGML_API void    ggml_time_init(void); // call this once at the beginning of the program
```
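`ggml_task_type` and `ggml_compute_params` were previously private to `ggml.c`; exposing them documents the threading contract a kernel must follow. A sketch of the usual row partitioning for a COMPUTE task, written in the style of ggml's own kernels but not copied from them:

```c
#include "ggml.h"

// Thread ith of nth handles rows [*ir0, *ir1) of an nr-row tensor;
// INIT and FINALIZE tasks run once, so they get an empty range here.
static void rows_for_thread(const struct ggml_compute_params * params,
                            int64_t nr, int64_t * ir0, int64_t * ir1) {
    if (params->type != GGML_TASK_COMPUTE) {
        *ir0 = *ir1 = 0;
        return;
    }
    const int64_t dr = (nr + params->nth - 1) / params->nth; // rows per thread, rounded up
    *ir0 = dr * params->ith;
    *ir1 = (*ir0 + dr < nr) ? *ir0 + dr : nr; // clamp the last thread's range
}
```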
```diff
@@ -421,14 +457,17 @@ extern "C" {
     GGML_API void    ggml_print_object (const struct ggml_object * obj);
     GGML_API void    ggml_print_objects(const struct ggml_context * ctx);
 
-    GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
 
     GGML_API int    ggml_blck_size (enum ggml_type type);
     GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
     GGML_API float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 
     GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_op_name  (enum ggml_op   op);
 
     GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);
 
```
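The size helpers are realigned and extended with `ggml_nrows` and `ggml_nbytes_split`; the latter presumably pairs with `GGML_BACKEND_GPU_SPLIT` row splitting, which is a reading of the name rather than anything this header states. A sketch using the new helpers (plus `ggml_get_name`, assumed from elsewhere in the header):

```c
#include <stdio.h>
#include "ggml.h"

// Report the shape-derived sizes of a tensor with the reorganized helpers.
static void print_sizes(const struct ggml_tensor * t) {
    printf("%s: %lld elements, %lld rows, %zu bytes\n",
           ggml_get_name(t),
           (long long) ggml_nelements(t),
           (long long) ggml_nrows(t),
           ggml_nbytes(t));
}
```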
```diff
@@ -437,14 +476,24 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
+    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+
+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
-    GGML_API void ggml_free(struct ggml_context * ctx);
+    GGML_API void                  ggml_free(struct ggml_context * ctx);
 
     GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);
 
-    GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API size_t  ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API void    ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+
+    GGML_API void *  ggml_get_mem_buffer(struct ggml_context * ctx);
+    GGML_API size_t  ggml_get_mem_size  (struct ggml_context * ctx);
 
     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,
```
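`ggml_set_no_alloc` toggles the context's no-alloc mode after `ggml_init`, and the two new accessors expose the context's backing buffer, e.g. for snapshotting it wholesale. A minimal sketch; dumping the raw buffer like this is an illustration, not a serialization format the header prescribes:

```c
#include <stdio.h>
#include "ggml.h"

// Write the entire memory region backing a context to a file.
static void dump_ctx(struct ggml_context * ctx, FILE * f) {
    fwrite(ggml_get_mem_buffer(ctx), 1, ggml_get_mem_size(ctx), f);
}
```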
```diff
@@ -484,6 +533,8 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
 
+    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
     GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
```
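`ggml_get_tensor` looks a tensor up by name within a context, so code can recover a tensor without keeping the pointer around. A sketch, assuming `ggml_set_name` from elsewhere in this header:

```c
#include <assert.h>
#include "ggml.h"

static void demo(struct ggml_context * ctx) {
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    ggml_set_name(w, "layer0.weight");

    // later, without holding on to w:
    struct ggml_tensor * found = ggml_get_tensor(ctx, "layer0.weight");
    assert(found == w);
}
```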
```diff
@@ -970,6 +1021,11 @@ extern "C" {
     GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
     GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
 
+    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+
+    GGML_API void               ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
+    GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
+
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
 
```
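`ggml_graph_export`/`ggml_graph_import` round-trip a computation graph through a file, and `ggml_graph_get_tensor` does name lookup within a graph. A sketch of the round trip; the `state.ggml` filename is illustrative:

```c
#include "ggml.h"

// Export a graph, then import it into two fresh contexts: one holding
// the graph's static data, one used for evaluation.
static struct ggml_cgraph reload(const struct ggml_cgraph * gf) {
    ggml_graph_export(gf, "state.ggml");

    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;
    struct ggml_cgraph gf2 = ggml_graph_import("state.ggml", &ctx_data, &ctx_eval);

    // e.g. locate a tensor in the reloaded graph by name:
    // struct ggml_tensor * out = ggml_graph_get_tensor(&gf2, "output");
    return gf2;
}
```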