llama_cpp 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -241,6 +241,13 @@ extern "C" {
241
241
  GGML_TYPE_Q5_1 = 7,
242
242
  GGML_TYPE_Q8_0 = 8,
243
243
  GGML_TYPE_Q8_1 = 9,
244
+ // k-quantizations
245
+ GGML_TYPE_Q2_K = 10,
246
+ GGML_TYPE_Q3_K = 11,
247
+ GGML_TYPE_Q4_K = 12,
248
+ GGML_TYPE_Q5_K = 13,
249
+ GGML_TYPE_Q6_K = 14,
250
+ GGML_TYPE_Q8_K = 15,
244
251
  GGML_TYPE_I8,
245
252
  GGML_TYPE_I16,
246
253
  GGML_TYPE_I32,
@@ -249,8 +256,8 @@ extern "C" {
249
256
 
250
257
  enum ggml_backend {
251
258
  GGML_BACKEND_CPU = 0,
252
- GGML_BACKEND_CUDA = 1,
253
- GGML_BACKEND_CL = 2,
259
+ GGML_BACKEND_GPU = 10,
260
+ GGML_BACKEND_GPU_SPLIT = 20,
254
261
  };
255
262
 
256
263
  // model file types
@@ -264,6 +271,11 @@ extern "C" {
264
271
  GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
265
272
  GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
266
273
  GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
274
+ GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
275
+ GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
276
+ GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
277
+ GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
278
+ GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
267
279
  };
268
280
 
269
281
  // available tensor operations:
@@ -375,7 +387,9 @@ extern "C" {
375
387
 
376
388
  char name[GGML_MAX_NAME];
377
389
 
378
- char padding[16];
390
+ void * extra; // extra things e.g. for ggml-cuda.cu
391
+
392
+ char padding[4];
379
393
  };
380
394
 
381
395
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -413,6 +427,25 @@ extern "C" {
413
427
  bool no_alloc; // don't allocate memory for the tensor data
414
428
  };
415
429
 
430
+
431
+ // compute types
432
+ enum ggml_task_type {
433
+ GGML_TASK_INIT = 0,
434
+ GGML_TASK_COMPUTE,
435
+ GGML_TASK_FINALIZE,
436
+ };
437
+
438
+ struct ggml_compute_params {
439
+ enum ggml_task_type type;
440
+
441
+ // ith = thread index, nth = number of threads
442
+ int ith, nth;
443
+
444
+ // work buffer for all threads
445
+ size_t wsize;
446
+ void * wdata;
447
+ };
448
+
416
449
  // misc
417
450
 
418
451
  GGML_API void ggml_time_init(void); // call this once at the beginning of the program
@@ -424,8 +457,10 @@ extern "C" {
424
457
  GGML_API void ggml_print_object (const struct ggml_object * obj);
425
458
  GGML_API void ggml_print_objects(const struct ggml_context * ctx);
426
459
 
427
- GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
428
- GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
460
+ GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
461
+ GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
462
+ GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
463
+ GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
429
464
 
430
465
  GGML_API int ggml_blck_size (enum ggml_type type);
431
466
  GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -441,13 +476,16 @@ extern "C" {
441
476
  // TODO: temporary until model loading of ggml examples is refactored
442
477
  GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
443
478
 
479
+ GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
480
+ GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
481
+
444
482
  // use this to compute the memory overhead of a tensor
445
483
  GGML_API size_t ggml_tensor_overhead(void);
446
484
 
447
485
  // main
448
486
 
449
487
  GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
450
- GGML_API void ggml_free(struct ggml_context * ctx);
488
+ GGML_API void ggml_free(struct ggml_context * ctx);
451
489
 
452
490
  GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
453
491