llama_cpp 0.0.7 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -197,6 +197,14 @@
 #define GGML_MAX_OPT 4
 #define GGML_DEFAULT_N_THREADS 4

+#define GGML_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            abort(); \
+        } \
+    } while (0)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
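The new GGML_ASSERT macro is always active (it is not compiled out by NDEBUG like assert); it prints the file, line and failing expression to stderr and then aborts. A minimal hedged sketch of caller-side usage, assuming ggml.h is on the include path:

```cpp
#include "ggml.h"
#include <stdio.h>   // fprintf, used by the macro
#include <stdlib.h>  // abort, used by the macro

static float first_f32_element(const struct ggml_tensor * t) {
    GGML_ASSERT(t != NULL);                // on failure prints "GGML_ASSERT: <file>:<line>: t != NULL" and aborts
    GGML_ASSERT(t->type == GGML_TYPE_F32); // this sketch only handles F32 tensors
    return ((const float *) t->data)[0];
}
```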
@@ -212,6 +220,9 @@ extern "C" {
     GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
     GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);

+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+
     struct ggml_object;
     struct ggml_context;

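The new *_row helpers convert whole arrays between FP32 and FP16 in one call instead of element by element. A small sketch (the buffer length is illustrative):

```cpp
#include "ggml.h"
#include <vector>

void fp16_roundtrip_example() {
    std::vector<float>       src(128, 1.5f);
    std::vector<ggml_fp16_t> half(src.size());
    std::vector<float>       back(src.size());

    ggml_fp32_to_fp16_row(src.data(), half.data(), src.size());   // FP32 -> FP16, whole row
    ggml_fp16_to_fp32_row(half.data(), back.data(), half.size()); // FP16 -> FP32, whole row
}
```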
@@ -221,7 +232,7 @@ extern "C" {
         GGML_TYPE_Q4_0 = 2,
         GGML_TYPE_Q4_1 = 3,
         GGML_TYPE_Q4_2 = 4,
-        GGML_TYPE_Q4_3 = 5,
+        // GGML_TYPE_Q4_3 (5) support has been removed
         GGML_TYPE_Q5_0 = 6,
         GGML_TYPE_Q5_1 = 7,
         GGML_TYPE_Q8_0 = 8,
@@ -232,6 +243,20 @@ extern "C" {
         GGML_TYPE_COUNT,
     };

+    // model file types
+    enum ggml_ftype {
+        GGML_FTYPE_UNKNOWN     = -1,
+        GGML_FTYPE_ALL_F32     = 0,
+        GGML_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        GGML_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+    };
+
     // available tensor operations:
     enum ggml_op {
         GGML_OP_NONE = 0,
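ggml_ftype describes how a whole model file is quantized, as opposed to ggml_type, which describes a single tensor. A hedged illustration of picking the tag an application might record in its file header; the helper function is hypothetical, only the enum values come from the header:

```cpp
#include "ggml.h"
#include <string>

// Hypothetical helper (not part of ggml): map a user-facing quantization name
// to the ggml_ftype value a tool might store in a model file header.
static enum ggml_ftype ftype_from_string(const std::string & s) {
    if (s == "f32")  return GGML_FTYPE_ALL_F32;
    if (s == "f16")  return GGML_FTYPE_MOSTLY_F16;
    if (s == "q4_0") return GGML_FTYPE_MOSTLY_Q4_0;
    if (s == "q4_1") return GGML_FTYPE_MOSTLY_Q4_1;
    if (s == "q5_0") return GGML_FTYPE_MOSTLY_Q5_0;
    if (s == "q5_1") return GGML_FTYPE_MOSTLY_Q5_1;
    if (s == "q8_0") return GGML_FTYPE_MOSTLY_Q8_0;
    return GGML_FTYPE_UNKNOWN;
}
```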
@@ -269,6 +294,7 @@ extern "C" {
         GGML_OP_DIAG_MASK_INF,
         GGML_OP_SOFT_MAX,
         GGML_OP_ROPE,
+        GGML_OP_ALIBI,
         GGML_OP_CONV_1D_1S,
         GGML_OP_CONV_1D_2S,

@@ -324,7 +350,10 @@ extern "C" {
         int64_t perf_time_us;

         void * data;
-        char padding[8];
+
+        char name[32];
+
+        char padding[8]; // TODO: remove and add padding to name?
     };

     // computation graph
@@ -384,6 +413,9 @@ extern "C" {

     GGML_API bool ggml_is_quantized(enum ggml_type type);

+    // TODO: temporary until model loading of ggml examples is refactored
+    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
+
     // main

     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
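ggml_ftype_to_ggml_type maps the file-level ggml_ftype back to the per-tensor ggml_type used when loading weights (the header itself marks it as a temporary helper until the example loaders are refactored). A hedged sketch:

```cpp
#include "ggml.h"

void ftype_mapping_example() {
    enum ggml_ftype ftype = GGML_FTYPE_MOSTLY_Q4_0;          // e.g. read from a model file header
    enum ggml_type  wtype = ggml_ftype_to_ggml_type(ftype);  // expected to yield GGML_TYPE_Q4_0
    (void) wtype;  // per the enum comments, 1d tensors typically stay F32 regardless of the file type
}
```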
@@ -444,6 +476,9 @@ extern "C" {
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

+    GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
+    GGML_API void         ggml_set_name(struct ggml_tensor * tensor, const char * name);
+
     //
     // operations on tensors with backpropagation
     //
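These accessors expose the new 32-byte name field added to struct ggml_tensor above, which helps when debugging graphs or matching tensors against names stored in a model file. A hedged sketch (context size and tensor shape are illustrative):

```cpp
#include "ggml.h"
#include <stdio.h>

void tensor_name_example() {
    struct ggml_init_params params = {
        /* mem_size   */ 16u * 1024u * 1024u,
        /* mem_buffer */ NULL,
        /* no_alloc   */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    ggml_set_name(w, "layers.0.attention.wq");      // copied into the tensor's 32-byte name field
    printf("tensor name: %s\n", ggml_get_name(w));

    ggml_free(ctx);
}
```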
@@ -662,6 +697,14 @@ extern "C" {
             int                   n_dims,
             int                   mode);

+    // alibi position embedding
+    // in-place, returns view(a)
+    struct ggml_tensor * ggml_alibi(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_head);
+
     // padding = 1
     // TODO: we don't support extra parameters for now
     // that's why we are hard-coding the stride, padding, and dilation
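ggml_alibi backs the new GGML_OP_ALIBI op: it applies ALiBi (Attention with Linear Biases) position bias to an attention-score tensor in place and returns a view of it. A hedged sketch of building such a node; shapes and context size are illustrative, and the bias is only applied when the graph is actually computed:

```cpp
#include "ggml.h"

void alibi_example() {
    struct ggml_init_params params = { 64u * 1024u * 1024u, NULL, false };
    struct ggml_context * ctx = ggml_init(params);

    const int n_past = 0, n_tokens = 8, n_head = 4;
    // mock attention scores in the usual KQ layout: (n_past + n_tokens, n_tokens, n_head)
    struct ggml_tensor * kq = ggml_new_tensor_3d(ctx, GGML_TYPE_F32,
                                                 n_past + n_tokens, n_tokens, n_head);

    struct ggml_tensor * kq_biased = ggml_alibi(ctx, kq, n_past, n_head); // in-place op node, returns view(kq)
    (void) kq_biased;

    ggml_free(ctx);
}
```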
@@ -692,8 +735,8 @@ extern "C" {
             struct ggml_tensor  * c1);

     // Mapping operations
-    GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
-    GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+    typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);

     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
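Dropping GGML_API from the two callback typedefs fixes invalid declarations (a typedef is not an exported symbol, so it cannot carry an export attribute); the mapping operations themselves are unchanged. A hedged sketch of plugging a custom element-wise function into a graph, assuming ggml_map_unary_f32 takes the context, the input tensor and the callback:

```cpp
#include "ggml.h"
#include <math.h>

// Matches ggml_unary_op_f32_t: n elements, destination row, source row.
static void my_softsign(const int n, float * dst, const float * src) {
    for (int i = 0; i < n; ++i) {
        dst[i] = src[i] / (1.0f + fabsf(src[i]));
    }
}

void map_unary_example(struct ggml_context * ctx, struct ggml_tensor * x) {
    // y evaluates to softsign(x) when the graph is computed
    struct ggml_tensor * y = ggml_map_unary_f32(ctx, x, my_softsign);
    (void) y;
}
```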
@@ -834,7 +877,6 @@ extern "C" {
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
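ggml_quantize_q4_3 is removed along with the Q4_3 type; the remaining helpers keep the same shape: src holds n floats, k is the row length, dst receives the quantized blocks, hist accumulates a histogram of the quantized values, and the return value is the number of bytes written. A hedged sketch; the 16-bucket histogram and the row length are assumptions based on how the llama.cpp tools call these functions:

```cpp
#include "ggml.h"
#include <stdint.h>
#include <vector>

void quantize_example() {
    const int k = 128;                 // row length; assumed to be a multiple of the Q4_0 block size
    const int n = 4 * k;               // total number of floats (4 rows)

    std::vector<float>   src(n, 0.25f);
    std::vector<uint8_t> dst(n);       // generously sized output buffer (quantized data is smaller than the input)
    std::vector<int64_t> hist(16, 0);  // one bucket per possible 4-bit quantized value

    size_t bytes = ggml_quantize_q4_0(src.data(), dst.data(), n, k, hist.data());
    (void) bytes;                      // bytes actually written to dst
}
```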
@@ -14,6 +14,7 @@

 #include <string>
 #include <vector>
+#include <stdexcept>

 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -74,7 +75,7 @@ struct llama_file {
     llama_file(const char * fname, const char * mode) {
         fp = std::fopen(fname, mode);
         if (fp == NULL) {
-            throw format("failed to open %s: %s", fname, std::strerror(errno));
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
         }
         seek(0, SEEK_END);
         size = tell();
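This and the following hunks replace raw string throws with std::runtime_error (hence the new <stdexcept> include above): a thrown std::string is invisible to the usual catch (const std::exception &) handlers, while std::runtime_error is caught and carries the formatted message in what(). A hedged caller-side sketch; llama_file is the helper defined in this bundled utility header, whose include is omitted here:

```cpp
#include <cstdio>
#include <stdexcept>

bool try_open(const char * path) {
    try {
        llama_file file(path, "rb");               // now throws std::runtime_error on failure
        std::printf("opened %s (%zu bytes)\n", path, file.size);
        return true;
    } catch (const std::runtime_error & err) {     // also reachable via catch (const std::exception &)
        std::fprintf(stderr, "error: %s\n", err.what());
        return false;
    }
}
```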
@@ -107,10 +108,10 @@
         errno = 0;
         std::size_t ret = std::fread(ptr, size, 1, fp);
         if (ferror(fp)) {
-            throw format("read error: %s", strerror(errno));
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
         if (ret != 1) {
-            throw std::string("unexpectedly reached end of file");
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
         }
     }

@@ -133,7 +134,7 @@
         errno = 0;
         size_t ret = std::fwrite(ptr, size, 1, fp);
         if (ret != 1) {
-            throw format("write error: %s", strerror(errno));
+            throw std::runtime_error(format("write error: %s", strerror(errno)));
         }
     }

@@ -180,7 +181,7 @@ struct llama_mmap {
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
-            throw format("mmap failed: %s", strerror(errno));
+            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }

         if (prefetch) {
@@ -207,7 +208,7 @@
         DWORD error = GetLastError();

         if (hMapping == NULL) {
-            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
         }

         addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
@@ -215,7 +216,7 @@
         CloseHandle(hMapping);

         if (addr == NULL) {
-            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
         }

 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
@@ -243,8 +244,9 @@
 #else
     static constexpr bool SUPPORTED = false;

-    llama_mmap(struct llama_file *) {
-        throw std::string("mmap not supported");
+    llama_mmap(struct llama_file *, bool prefetch = true) {
+        (void)prefetch;
+        throw std::runtime_error(std::string("mmap not supported"));
     }
 #endif
 };
@@ -382,8 +384,13 @@ struct llama_mlock {
 #else
     static constexpr bool SUPPORTED = false;

-    void raw_lock(const void * addr, size_t size) {
+    size_t lock_granularity() {
+        return (size_t) 65536;
+    }
+
+    bool raw_lock(const void * addr, size_t size) {
         fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
     }

     void raw_unlock(const void * addr, size_t size) {}
@@ -395,6 +402,8 @@ struct llama_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;

+    llama_buffer() = default;
+
     void resize(size_t size) {
         delete[] addr;
         addr = new uint8_t[size];
@@ -404,5 +413,62 @@
     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };
+
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+struct llama_ctx_buffer {
+    uint8_t * addr = NULL;
+    bool is_cuda;
+    size_t size = 0;
+
+    llama_ctx_buffer() = default;
+
+    void resize(size_t size) {
+        free();
+
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        if (addr) {
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
+        }
+        this->size = size;
+    }
+
+    void free() {
+        if (addr) {
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
+        }
+        addr = NULL;
+    }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
+};
+#else
+typedef llama_buffer llama_ctx_buffer;
+#endif
+
 #endif
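llama_buffer and the new llama_ctx_buffer are made explicitly non-copyable and non-movable, which prevents the owned pointer from being deleted twice. In cuBLAS builds, llama_ctx_buffer prefers pinned (page-locked) host memory via ggml_cuda_host_malloc, which speeds up host-to-device transfers, and falls back to an ordinary new[] allocation; without GGML_USE_CUBLAS it is simply an alias for llama_buffer. A hedged usage sketch, assuming the bundled utility header is included and with an illustrative size:

```cpp
void scratch_buffer_example() {
    llama_ctx_buffer buf;                 // non-copyable, non-movable RAII buffer
    buf.resize(32u * 1024u * 1024u);      // 32 MiB: pinned host memory in cuBLAS builds, plain new[] otherwise

    // ... write to buf.addr, up to buf.size bytes ...

}   // destructor releases the memory (ggml_cuda_host_free or delete[])
```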