llama_cpp 0.1.3 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +39 -8
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +242 -52
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +835 -82
- data/ext/llama_cpp/src/ggml.h +64 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +489 -134
- data/ext/llama_cpp/src/llama.h +43 -7
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -198,6 +198,7 @@
|
|
198
198
|
#define GGML_MAX_PARAMS 256
|
199
199
|
#define GGML_MAX_CONTEXTS 64
|
200
200
|
#define GGML_MAX_OPT 4
|
201
|
+
#define GGML_MAX_NAME 32
|
201
202
|
#define GGML_DEFAULT_N_THREADS 4
|
202
203
|
|
203
204
|
#define GGML_ASSERT(x) \
|
@@ -240,6 +241,13 @@ extern "C" {
|
|
240
241
|
GGML_TYPE_Q5_1 = 7,
|
241
242
|
GGML_TYPE_Q8_0 = 8,
|
242
243
|
GGML_TYPE_Q8_1 = 9,
|
244
|
+
// k-quantizations
|
245
|
+
GGML_TYPE_Q2_K = 10,
|
246
|
+
GGML_TYPE_Q3_K = 11,
|
247
|
+
GGML_TYPE_Q4_K = 12,
|
248
|
+
GGML_TYPE_Q5_K = 13,
|
249
|
+
GGML_TYPE_Q6_K = 14,
|
250
|
+
GGML_TYPE_Q8_K = 15,
|
243
251
|
GGML_TYPE_I8,
|
244
252
|
GGML_TYPE_I16,
|
245
253
|
GGML_TYPE_I32,
|
@@ -248,8 +256,8 @@ extern "C" {
|
|
248
256
|
|
249
257
|
enum ggml_backend {
|
250
258
|
GGML_BACKEND_CPU = 0,
|
251
|
-
|
252
|
-
|
259
|
+
GGML_BACKEND_GPU = 10,
|
260
|
+
GGML_BACKEND_GPU_SPLIT = 20,
|
253
261
|
};
|
254
262
|
|
255
263
|
// model file types
|
@@ -263,6 +271,11 @@ extern "C" {
|
|
263
271
|
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
264
272
|
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
265
273
|
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
274
|
+
GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
|
275
|
+
GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
|
276
|
+
GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
|
277
|
+
GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
|
278
|
+
GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
|
266
279
|
};
|
267
280
|
|
268
281
|
// available tensor operations:
|
@@ -372,11 +385,15 @@ extern "C" {
|
|
372
385
|
|
373
386
|
void * data;
|
374
387
|
|
375
|
-
char name[32];
|
388
|
+
char name[GGML_MAX_NAME];
|
376
389
|
|
377
|
-
|
390
|
+
void * extra; // extra things e.g. for ggml-cuda.cu
|
391
|
+
|
392
|
+
char padding[4];
|
378
393
|
};
|
379
394
|
|
395
|
+
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
396
|
+
|
380
397
|
// computation graph
|
381
398
|
struct ggml_cgraph {
|
382
399
|
int n_nodes;
|
@@ -410,6 +427,25 @@ extern "C" {
|
|
410
427
|
bool no_alloc; // don't allocate memory for the tensor data
|
411
428
|
};
|
412
429
|
|
430
|
+
|
431
|
+
// compute types
|
432
|
+
enum ggml_task_type {
|
433
|
+
GGML_TASK_INIT = 0,
|
434
|
+
GGML_TASK_COMPUTE,
|
435
|
+
GGML_TASK_FINALIZE,
|
436
|
+
};
|
437
|
+
|
438
|
+
struct ggml_compute_params {
|
439
|
+
enum ggml_task_type type;
|
440
|
+
|
441
|
+
// ith = thread index, nth = number of threads
|
442
|
+
int ith, nth;
|
443
|
+
|
444
|
+
// work buffer for all threads
|
445
|
+
size_t wsize;
|
446
|
+
void * wdata;
|
447
|
+
};
|
448
|
+
|
413
449
|
// misc
|
414
450
|
|
415
451
|
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
|
@@ -421,14 +457,17 @@ extern "C" {
|
|
421
457
|
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
422
458
|
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
423
459
|
|
424
|
-
GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
|
425
|
-
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
460
|
+
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
461
|
+
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
462
|
+
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
463
|
+
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
|
426
464
|
|
427
465
|
GGML_API int ggml_blck_size (enum ggml_type type);
|
428
466
|
GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
429
467
|
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
430
468
|
|
431
469
|
GGML_API const char * ggml_type_name(enum ggml_type type);
|
470
|
+
GGML_API const char * ggml_op_name (enum ggml_op op);
|
432
471
|
|
433
472
|
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
434
473
|
|
@@ -437,14 +476,24 @@ extern "C" {
|
|
437
476
|
// TODO: temporary until model loading of ggml examples is refactored
|
438
477
|
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
439
478
|
|
479
|
+
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
480
|
+
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
481
|
+
|
482
|
+
// use this to compute the memory overhead of a tensor
|
483
|
+
GGML_API size_t ggml_tensor_overhead(void);
|
484
|
+
|
440
485
|
// main
|
441
486
|
|
442
487
|
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
443
|
-
GGML_API void ggml_free(struct ggml_context * ctx);
|
488
|
+
GGML_API void ggml_free(struct ggml_context * ctx);
|
444
489
|
|
445
490
|
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
|
446
491
|
|
447
|
-
GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
|
492
|
+
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
|
493
|
+
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
|
494
|
+
|
495
|
+
GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
|
496
|
+
GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
|
448
497
|
|
449
498
|
GGML_API struct ggml_tensor * ggml_new_tensor(
|
450
499
|
struct ggml_context * ctx,
|
@@ -484,6 +533,8 @@ extern "C" {
|
|
484
533
|
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
485
534
|
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
|
486
535
|
|
536
|
+
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
537
|
+
|
487
538
|
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
488
539
|
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
489
540
|
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
@@ -970,6 +1021,11 @@ extern "C" {
|
|
970
1021
|
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
971
1022
|
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
972
1023
|
|
1024
|
+
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
1025
|
+
|
1026
|
+
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
|
1027
|
+
GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
|
1028
|
+
|
973
1029
|
// print info and performance information for the graph
|
974
1030
|
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
|
975
1031
|
|