llama_cpp 0.1.3 → 0.2.0

The diff below shows the changes made to the vendored ggml.h header between llama_cpp gem versions 0.1.3 and 0.2.0: new k-quantization tensor types (Q2_K–Q8_K) and their file types, renamed GPU backends (GGML_BACKEND_CUDA/CL → GGML_BACKEND_GPU/GPU_SPLIT), a new GGML_MAX_NAME constant, an `extra` pointer on ggml_tensor, the ggml_compute_params struct, and several new API functions (ggml_nrows, ggml_op_name, ggml_tensor_overhead, ggml_get_tensor, ggml_graph_export/import, etc.).
@@ -198,6 +198,7 @@
198
198
  #define GGML_MAX_PARAMS 256
199
199
  #define GGML_MAX_CONTEXTS 64
200
200
  #define GGML_MAX_OPT 4
201
+ #define GGML_MAX_NAME 32
201
202
  #define GGML_DEFAULT_N_THREADS 4
202
203
 
203
204
  #define GGML_ASSERT(x) \
@@ -240,6 +241,13 @@ extern "C" {
240
241
  GGML_TYPE_Q5_1 = 7,
241
242
  GGML_TYPE_Q8_0 = 8,
242
243
  GGML_TYPE_Q8_1 = 9,
244
+ // k-quantizations
245
+ GGML_TYPE_Q2_K = 10,
246
+ GGML_TYPE_Q3_K = 11,
247
+ GGML_TYPE_Q4_K = 12,
248
+ GGML_TYPE_Q5_K = 13,
249
+ GGML_TYPE_Q6_K = 14,
250
+ GGML_TYPE_Q8_K = 15,
243
251
  GGML_TYPE_I8,
244
252
  GGML_TYPE_I16,
245
253
  GGML_TYPE_I32,
@@ -248,8 +256,8 @@ extern "C" {
248
256
 
249
257
  enum ggml_backend {
250
258
  GGML_BACKEND_CPU = 0,
251
- GGML_BACKEND_CUDA = 1,
252
- GGML_BACKEND_CL = 2,
259
+ GGML_BACKEND_GPU = 10,
260
+ GGML_BACKEND_GPU_SPLIT = 20,
253
261
  };
254
262
 
255
263
  // model file types
@@ -263,6 +271,11 @@ extern "C" {
263
271
  GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
264
272
  GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
265
273
  GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
274
+ GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
275
+ GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
276
+ GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
277
+ GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
278
+ GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
266
279
  };
267
280
 
268
281
  // available tensor operations:
@@ -372,11 +385,15 @@ extern "C" {
372
385
 
373
386
  void * data;
374
387
 
375
- char name[32];
388
+ char name[GGML_MAX_NAME];
376
389
 
377
- char padding[16];
390
+ void * extra; // extra things e.g. for ggml-cuda.cu
391
+
392
+ char padding[4];
378
393
  };
379
394
 
395
+ static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
396
+
380
397
  // computation graph
381
398
  struct ggml_cgraph {
382
399
  int n_nodes;
@@ -410,6 +427,25 @@ extern "C" {
410
427
  bool no_alloc; // don't allocate memory for the tensor data
411
428
  };
412
429
 
430
+
431
+ // compute types
432
+ enum ggml_task_type {
433
+ GGML_TASK_INIT = 0,
434
+ GGML_TASK_COMPUTE,
435
+ GGML_TASK_FINALIZE,
436
+ };
437
+
438
+ struct ggml_compute_params {
439
+ enum ggml_task_type type;
440
+
441
+ // ith = thread index, nth = number of threads
442
+ int ith, nth;
443
+
444
+ // work buffer for all threads
445
+ size_t wsize;
446
+ void * wdata;
447
+ };
448
+
413
449
  // misc
414
450
 
415
451
  GGML_API void ggml_time_init(void); // call this once at the beginning of the program
@@ -421,14 +457,17 @@ extern "C" {
421
457
  GGML_API void ggml_print_object (const struct ggml_object * obj);
422
458
  GGML_API void ggml_print_objects(const struct ggml_context * ctx);
423
459
 
424
- GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
425
- GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
460
+ GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
461
+ GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
462
+ GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
463
+ GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
426
464
 
427
465
  GGML_API int ggml_blck_size (enum ggml_type type);
428
466
  GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
429
467
  GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
430
468
 
431
469
  GGML_API const char * ggml_type_name(enum ggml_type type);
470
+ GGML_API const char * ggml_op_name (enum ggml_op op);
432
471
 
433
472
  GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
434
473
 
@@ -437,14 +476,24 @@ extern "C" {
437
476
  // TODO: temporary until model loading of ggml examples is refactored
438
477
  GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
439
478
 
479
+ GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
480
+ GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
481
+
482
+ // use this to compute the memory overhead of a tensor
483
+ GGML_API size_t ggml_tensor_overhead(void);
484
+
440
485
  // main
441
486
 
442
487
  GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
443
- GGML_API void ggml_free(struct ggml_context * ctx);
488
+ GGML_API void ggml_free(struct ggml_context * ctx);
444
489
 
445
490
  GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
446
491
 
447
- GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
492
+ GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
493
+ GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
494
+
495
+ GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
496
+ GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
448
497
 
449
498
  GGML_API struct ggml_tensor * ggml_new_tensor(
450
499
  struct ggml_context * ctx,
@@ -484,6 +533,8 @@ extern "C" {
484
533
  GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
485
534
  GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
486
535
 
536
+ GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
537
+
487
538
  GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
488
539
  GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
489
540
  GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
@@ -970,6 +1021,11 @@ extern "C" {
970
1021
  GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
971
1022
  GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
972
1023
 
1024
+ GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
1025
+
1026
+ GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
1027
+ GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
1028
+
973
1029
  // print info and performance information for the graph
974
1030
  GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
975
1031