llama_cpp 0.1.4 → 0.2.0

@@ -241,6 +241,13 @@ extern "C" {
         GGML_TYPE_Q5_1 = 7,
         GGML_TYPE_Q8_0 = 8,
         GGML_TYPE_Q8_1 = 9,
+        // k-quantizations
+        GGML_TYPE_Q2_K = 10,
+        GGML_TYPE_Q3_K = 11,
+        GGML_TYPE_Q4_K = 12,
+        GGML_TYPE_Q5_K = 13,
+        GGML_TYPE_Q6_K = 14,
+        GGML_TYPE_Q8_K = 15,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
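
The six GGML_TYPE_Q*_K entries are the new k-quantization formats. Note that GGML_TYPE_I8/I16/I32 carry no explicit values, so their numeric values shift up by six. The new types work with the existing introspection helpers declared further down in this header; a minimal sketch of querying one (the printed values depend on the ggml build):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // ggml_blck_size: elements per block; ggml_type_size: bytes per block.
        printf("Q4_K: %d elements/block, %zu bytes/block\n",
               ggml_blck_size(GGML_TYPE_Q4_K),
               ggml_type_size(GGML_TYPE_Q4_K));
        return 0;
    }
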
@@ -249,8 +256,8 @@ extern "C" {
 
     enum ggml_backend {
         GGML_BACKEND_CPU = 0,
-        GGML_BACKEND_CUDA = 1,
-        GGML_BACKEND_CL = 2,
+        GGML_BACKEND_GPU = 10,
+        GGML_BACKEND_GPU_SPLIT = 20,
     };
 
     // model file types
@@ -264,6 +271,11 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
     };
 
     // available tensor operations:
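
The five new file types mirror the new tensor types one-to-one, and ggml_ftype_to_ggml_type (declared later in this header) presumably maps between them. An illustrative check:

    // Presumed mapping of a new file type to its tensor type.
    enum ggml_type t = ggml_ftype_to_ggml_type(GGML_FTYPE_MOSTLY_Q4_K);
    // expected: t == GGML_TYPE_Q4_K
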
@@ -375,7 +387,9 @@ extern "C" {
 
         char name[GGML_MAX_NAME];
 
-        char padding[16];
+        void * extra; // extra things e.g. for ggml-cuda.cu
+
+        char padding[4];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -413,6 +427,25 @@ extern "C" {
         bool no_alloc; // don't allocate memory for the tensor data
     };
 
+
+    // compute types
+    enum ggml_task_type {
+        GGML_TASK_INIT = 0,
+        GGML_TASK_COMPUTE,
+        GGML_TASK_FINALIZE,
+    };
+
+    struct ggml_compute_params {
+        enum ggml_task_type type;
+
+        // ith = thread index, nth = number of threads
+        int ith, nth;
+
+        // work buffer for all threads
+        size_t wsize;
+        void * wdata;
+    };
+
     // misc
 
     GGML_API void ggml_time_init(void); // call this once at the beginning of the program
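
ggml_task_type and ggml_compute_params describe the three-phase (init/compute/finalize) threading model that ggml hands to each worker when evaluating an op. A hedged sketch of the usual pattern, splitting rows evenly across threads by ith/nth (the function name and the rows_total parameter are illustrative, not part of this header):

    // Sketch: how an op implementation typically partitions its work.
    static void example_op_compute(const struct ggml_compute_params * params,
                                   int rows_total) {
        if (params->type != GGML_TASK_COMPUTE) {
            return; // INIT/FINALIZE phases handled elsewhere
        }
        // contiguous chunk of rows for this thread
        const int per_thread = (rows_total + params->nth - 1) / params->nth;
        const int row0 = per_thread * params->ith;
        const int row1 = row0 + per_thread < rows_total ? row0 + per_thread
                                                        : rows_total;
        for (int row = row0; row < row1; ++row) {
            // per-row work goes here; params->wdata/wsize is the
            // shared scratch buffer
        }
    }
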
@@ -424,8 +457,10 @@ extern "C" {
     GGML_API void ggml_print_object (const struct ggml_object * obj);
     GGML_API void ggml_print_objects(const struct ggml_context * ctx);
 
-    GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nrows     (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes    (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
 
     GGML_API int    ggml_blck_size (enum ggml_type type);
     GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
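
The size helpers compose in the obvious way. For a 2-D F32 tensor (created with ggml_new_tensor_2d, declared elsewhere in ggml.h, and assuming an initialized context ctx):

    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 8);

    int64_t n_elem  = ggml_nelements(t); // 64 * 8 = 512
    int64_t n_rows  = ggml_nrows(t);     // 8
    size_t  n_bytes = ggml_nbytes(t);    // 512 * sizeof(float) = 2048
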
@@ -441,13 +476,16 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
+    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
-    GGML_API void ggml_free(struct ggml_context * ctx);
+    GGML_API void                  ggml_free(struct ggml_context * ctx);
 
     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
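
Only whitespace alignment changed for ggml_free, but the init/free pair is the core context lifecycle. A minimal sketch, assuming the usual three-field ggml_init_params of this era (mem_size, mem_buffer, no_alloc):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            .mem_size   = 16u * 1024 * 1024, // 16 MiB arena
            .mem_buffer = NULL,              // let ggml allocate it
            .no_alloc   = false,             // also allocate tensor data
        };

        struct ggml_context * ctx = ggml_init(params);
        if (ctx == NULL) {
            return 1;
        }

        // ... build tensors and graphs here ...

        printf("used: %zu bytes\n", ggml_used_mem(ctx));
        ggml_free(ctx);
        return 0;
    }
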