llama_cpp 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -198,6 +198,7 @@
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_OPT           4
+#define GGML_MAX_NAME          32
 #define GGML_DEFAULT_N_THREADS 4
 
 #define GGML_ASSERT(x) \
@@ -240,6 +241,13 @@ extern "C" {
         GGML_TYPE_Q5_1 = 7,
         GGML_TYPE_Q8_0 = 8,
         GGML_TYPE_Q8_1 = 9,
+        // k-quantizations
+        GGML_TYPE_Q2_K = 10,
+        GGML_TYPE_Q3_K = 11,
+        GGML_TYPE_Q4_K = 12,
+        GGML_TYPE_Q5_K = 13,
+        GGML_TYPE_Q6_K = 14,
+        GGML_TYPE_Q8_K = 15,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
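The hunk above introduces the k-quant tensor types. As a quick orientation, the existing type-introspection functions (ggml_type_name, ggml_blck_size, ggml_type_size, which appear further down in this header) accept the new values as well; a minimal sketch, assuming the ggml.h shipped with 0.2.0 is on the include path:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // print block size and per-block byte size for each new k-quant type
        const enum ggml_type kq[] = {
            GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_K,
            GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_K,
        };
        for (size_t i = 0; i < sizeof(kq)/sizeof(kq[0]); ++i) {
            printf("%-8s blck_size=%d type_size=%zu\n",
                   ggml_type_name(kq[i]), ggml_blck_size(kq[i]), ggml_type_size(kq[i]));
        }
        return 0;
    }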
@@ -248,8 +256,8 @@ extern "C" {
 
     enum ggml_backend {
         GGML_BACKEND_CPU = 0,
-        GGML_BACKEND_CUDA = 1,
-        GGML_BACKEND_CL = 2,
+        GGML_BACKEND_GPU = 10,
+        GGML_BACKEND_GPU_SPLIT = 20,
     };
 
     // model file types
@@ -263,6 +271,11 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
     };
 
     // available tensor operations:
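The new file types presumably map to the matching k-quant tensor types through ggml_ftype_to_ggml_type (declared further down in this header); the expected result below is an inference from the existing Q4_0/Q5_0 mappings, not verified output:

    enum ggml_type t = ggml_ftype_to_ggml_type(GGML_FTYPE_MOSTLY_Q4_K);
    // presumably GGML_TYPE_Q4_K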
@@ -372,11 +385,15 @@ extern "C" {
 
         void * data;
 
-        char name[32];
+        char name[GGML_MAX_NAME];
 
-        char padding[16];
+        void * extra; // extra things e.g. for ggml-cuda.cu
+
+        char padding[4];
     };
 
+    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
@@ -410,6 +427,25 @@ extern "C" {
         bool no_alloc; // don't allocate memory for the tensor data
     };
 
+
+    // compute types
+    enum ggml_task_type {
+        GGML_TASK_INIT = 0,
+        GGML_TASK_COMPUTE,
+        GGML_TASK_FINALIZE,
+    };
+
+    struct ggml_compute_params {
+        enum ggml_task_type type;
+
+        // ith = thread index, nth = number of threads
+        int ith, nth;
+
+        // work buffer for all threads
+        size_t wsize;
+        void * wdata;
+    };
+
     // misc
 
     GGML_API void ggml_time_init(void); // call this once at the beginning of the program
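ggml_compute_params exposes the worker-thread context (thread index, thread count, shared work buffer) that operator implementations receive. A sketch of the usual split-by-rows pattern for a hypothetical operator; the function name and body are illustrative only, not part of the library:

    #include "ggml.h"

    // hypothetical op body: each thread handles a contiguous slice of rows
    static void my_op_f32(const struct ggml_compute_params * params,
                          const struct ggml_tensor * src, struct ggml_tensor * dst) {
        if (params->type != GGML_TASK_COMPUTE) {
            return; // nothing to do for this op in INIT/FINALIZE
        }

        const int     ith = params->ith;        // this thread's index
        const int     nth = params->nth;        // total number of threads
        const int64_t nr  = ggml_nrows(src);    // total rows to process
        const int64_t dr  = (nr + nth - 1)/nth; // rows per thread (rounded up)
        const int64_t ir0 = dr*ith;
        const int64_t ir1 = (ir0 + dr < nr) ? ir0 + dr : nr;

        for (int64_t ir = ir0; ir < ir1; ++ir) {
            // ... process row ir of src into dst ...
        }
        (void) dst;
    }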
@@ -421,14 +457,17 @@ extern "C" {
     GGML_API void ggml_print_object (const struct ggml_object * obj);
     GGML_API void ggml_print_objects(const struct ggml_context * ctx);
 
-    GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
 
     GGML_API int    ggml_blck_size (enum ggml_type type);
     GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
     GGML_API float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 
     GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_op_name  (enum ggml_op   op);
 
     GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
 
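The new ggml_op_name pairs naturally with ggml_type_name for debug output. A small sketch that walks a graph's nodes; it assumes the usual n_nodes/nodes fields of struct ggml_cgraph (only n_nodes is visible in this diff):

    #include <stdio.h>
    #include "ggml.h"

    // print one line per graph node: index, operation name, tensor type
    static void print_graph_ops(const struct ggml_cgraph * gf) {
        for (int i = 0; i < gf->n_nodes; ++i) {
            const struct ggml_tensor * node = gf->nodes[i];
            printf("%3d: %-12s %s\n", i, ggml_op_name(node->op), ggml_type_name(node->type));
        }
    }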
@@ -437,14 +476,24 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
+    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+
+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
-    GGML_API void ggml_free(struct ggml_context * ctx);
+    GGML_API void                  ggml_free(struct ggml_context * ctx);
 
     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
 
-    GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API size_t  ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API void    ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+
+    GGML_API void *  ggml_get_mem_buffer(struct ggml_context * ctx);
+    GGML_API size_t  ggml_get_mem_size  (struct ggml_context * ctx);
 
     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,
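ggml_tensor_overhead and ggml_set_no_alloc make it easier to size a context up front. A rough sketch of one way to do that; the sizing heuristic and tensor shapes are our own, not an official pattern, and a real caller would add more slack for alignment:

    #include "ggml.h"

    // reserve enough memory for two n x n F32 matrices plus their metadata
    static struct ggml_context * make_ctx_for_two_mats(int64_t n) {
        const size_t data_bytes = 2*(size_t)(n*n)*ggml_type_size(GGML_TYPE_F32);
        const size_t meta_bytes = 2*ggml_tensor_overhead();

        struct ggml_init_params params = {
            /*.mem_size   =*/ data_bytes + meta_bytes + 1024, // small alignment slack
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        return ggml_init(params);
    }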
@@ -484,6 +533,8 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
 
+    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
     GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
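Tensors can now be looked up by name within a context. A minimal sketch; the name is written directly into the name[GGML_MAX_NAME] field shown earlier, since no setter appears in this diff (one may exist elsewhere in the header):

    #include <stdio.h>
    #include "ggml.h"

    static void name_and_find(struct ggml_context * ctx) {
        struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
        snprintf(w->name, GGML_MAX_NAME, "layers.0.attention.wq");

        // later, retrieve the tensor by its name
        struct ggml_tensor * found = ggml_get_tensor(ctx, "layers.0.attention.wq");
        printf("found: %s\n", found ? found->name : "(not found)");
    }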
@@ -970,6 +1021,11 @@ extern "C" {
     GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
     GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
 
+    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+
+    GGML_API void               ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
+    GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
+
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
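Finally, whole computation graphs can be written to and read back from a file, and nodes can be fetched by name. A sketch of a round trip; the file name and tensor name are placeholders, and the two contexts returned by ggml_graph_import are assumed to be owned (and freed) by the caller:

    #include "ggml.h"

    static void roundtrip_graph(struct ggml_cgraph * gf) {
        ggml_graph_export(gf, "model.ggml-graph");

        struct ggml_context * ctx_data = NULL;
        struct ggml_context * ctx_eval = NULL;
        struct ggml_cgraph gf2 = ggml_graph_import("model.ggml-graph", &ctx_data, &ctx_eval);

        // look up a node by name, e.g. to read back an output tensor
        struct ggml_tensor * out = ggml_graph_get_tensor(&gf2, "output");
        (void) out;

        ggml_free(ctx_eval);
        ggml_free(ctx_data);
    }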