llama_cpp 0.0.7 → 0.1.0

@@ -197,6 +197,14 @@
 #define GGML_MAX_OPT 4
 #define GGML_DEFAULT_N_THREADS 4
 
+#define GGML_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            abort(); \
+        } \
+    } while (0)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
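A minimal sketch of how the new GGML_ASSERT macro is used (the caller below is hypothetical, not part of the diff): on failure it prints the file, line, and failing expression to stderr and aborts.

    #include <cstdio>   // fprintf, used by the macro expansion
    #include <cstdlib>  // abort, used by the macro expansion
    #include "ggml.h"   // provides GGML_ASSERT as defined above

    static void check_row_count(long long n_rows) {
        // prints e.g. "GGML_ASSERT: example.cpp:7: n_rows > 0" and calls abort() when false
        GGML_ASSERT(n_rows > 0);
    }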
@@ -212,6 +220,9 @@ extern "C" {
     GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
     GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
 
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+
     struct ggml_object;
     struct ggml_context;
 
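A small usage sketch for the new bulk converters (values and buffer size are arbitrary); they convert whole rows at once instead of calling the scalar ggml_fp16_to_fp32 / ggml_fp32_to_fp16 per element.

    #include "ggml.h"

    void fp16_round_trip() {
        const float src[4] = {0.5f, 1.0f, -2.0f, 3.25f};
        ggml_fp16_t half[4];
        float       back[4];

        ggml_fp32_to_fp16_row(src, half, 4);   // pack 4 floats into fp16
        ggml_fp16_to_fp32_row(half, back, 4);  // expand them back to fp32
    }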
@@ -221,7 +232,7 @@ extern "C" {
         GGML_TYPE_Q4_0 = 2,
         GGML_TYPE_Q4_1 = 3,
         GGML_TYPE_Q4_2 = 4,
-        GGML_TYPE_Q4_3 = 5,
+        // GGML_TYPE_Q4_3 (5) support has been removed
         GGML_TYPE_Q5_0 = 6,
         GGML_TYPE_Q5_1 = 7,
         GGML_TYPE_Q8_0 = 8,
@@ -232,6 +243,20 @@ extern "C" {
         GGML_TYPE_COUNT,
     };
 
+    // model file types
+    enum ggml_ftype {
+        GGML_FTYPE_UNKNOWN = -1,
+        GGML_FTYPE_ALL_F32 = 0,
+        GGML_FTYPE_MOSTLY_F16 = 1,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+    };
+
     // available tensor operations:
     enum ggml_op {
         GGML_OP_NONE = 0,
@@ -269,6 +294,7 @@ extern "C" {
         GGML_OP_DIAG_MASK_INF,
         GGML_OP_SOFT_MAX,
         GGML_OP_ROPE,
+        GGML_OP_ALIBI,
         GGML_OP_CONV_1D_1S,
         GGML_OP_CONV_1D_2S,
 
@@ -324,7 +350,10 @@ extern "C" {
         int64_t perf_time_us;
 
         void * data;
-        char padding[8];
+
+        char name[32];
+
+        char padding[8]; // TODO: remove and add padding to name?
     };
 
     // computation graph
@@ -384,6 +413,9 @@ extern "C" {
 
     GGML_API bool ggml_is_quantized(enum ggml_type type);
 
+    // TODO: temporary until model loading of ggml examples is refactored
+    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
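A sketch of how the new ggml_ftype enum and ggml_ftype_to_ggml_type() fit together: a model file type describes how the bulk of the 2d weights are stored, and the helper maps it back to a tensor type (the wrapper below is hypothetical).

    #include "ggml.h"

    // hypothetical helper: pick the tensor type used for most of a model's weights
    enum ggml_type weight_type_for(enum ggml_ftype ftype) {
        // e.g. GGML_FTYPE_MOSTLY_Q4_0 is expected to map to GGML_TYPE_Q4_0
        return ggml_ftype_to_ggml_type(ftype);
    }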
@@ -444,6 +476,9 @@ extern "C" {
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
+    GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
+    GGML_API void         ggml_set_name(struct ggml_tensor * tensor, const char * name);
+
     //
     // operations on tensors with backpropagation
     //
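The new name[32] field and its accessors let tensors be labeled for model loading and debug output. A minimal sketch (sizes and the name string are illustrative; the ggml_init_params layout {mem_size, mem_buffer, no_alloc} is an assumption about the rest of the header):

    #include <cstdio>
    #include "ggml.h"

    void name_example() {
        struct ggml_init_params params = {
            /* mem_size   = */ 16 * 1024 * 1024, // 16 MiB arena
            /* mem_buffer = */ NULL,
            /* no_alloc   = */ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
        ggml_set_name(w, "layers.0.attn.wq.weight"); // stored in the new name[32] field
        fprintf(stderr, "tensor: %s\n", ggml_get_name(w));

        ggml_free(ctx);
    }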
@@ -662,6 +697,14 @@ extern "C" {
             int n_dims,
             int mode);
 
+    // alibi position embedding
+    // in-place, returns view(a)
+    struct ggml_tensor * ggml_alibi(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past,
+            int n_head);
+
     // padding = 1
     // TODO: we don't support extra parameters for now
     // that's why we are hard-coding the stride, padding, and dilation
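GGML_OP_ALIBI / ggml_alibi() add ALiBi-style position handling: rather than rotating the queries and keys, a fixed per-head linear bias is added to the attention scores. A thin usage sketch (tensor names and shapes are illustrative, not from the diff):

    #include "ggml.h"

    // kq holds raw attention scores; per the header comment, ggml_alibi works
    // in place and returns a view of its input
    struct ggml_tensor * apply_alibi(struct ggml_context * ctx, struct ggml_tensor * kq,
                                     int n_past, int n_head) {
        return ggml_alibi(ctx, kq, n_past, n_head);
    }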
@@ -692,8 +735,8 @@ extern "C" {
             struct ggml_tensor * c1);
 
     // Mapping operations
-    GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
-    GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+    typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
 
     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
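GGML_API is dropped from the two typedefs, presumably because an export/visibility attribute has no business on a typedef; they are plain function-pointer types. A sketch of defining a custom elementwise op and mapping it over a tensor (softplus is just an example; the (count, dst, src) argument order is an assumption about how ggml invokes the callback):

    #include <math.h>
    #include "ggml.h"

    // matches ggml_unary_op_f32_t; assumed call order: element count, destination row, source row
    static void my_softplus(const int n, float * dst, const float * src) {
        for (int i = 0; i < n; ++i) {
            dst[i] = log1pf(expf(src[i])); // softplus(x) = log(1 + exp(x))
        }
    }

    struct ggml_tensor * map_softplus(struct ggml_context * ctx, struct ggml_tensor * a) {
        return ggml_map_unary_f32(ctx, a, my_softplus);
    }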
@@ -834,7 +877,6 @@ extern "C" {
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
@@ -14,6 +14,7 @@
 
 #include <string>
 #include <vector>
+#include <stdexcept>
 
 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -74,7 +75,7 @@ struct llama_file {
     llama_file(const char * fname, const char * mode) {
         fp = std::fopen(fname, mode);
         if (fp == NULL) {
-            throw format("failed to open %s: %s", fname, std::strerror(errno));
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
         }
         seek(0, SEEK_END);
         size = tell();
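This and the following hunks switch the error paths of the loading utilities from throwing raw format() strings or std::string to std::runtime_error (hence the new <stdexcept> include), so callers can catch a single exception type. A hedged sketch of a caller (the function and path handling are hypothetical; it assumes the header defining struct llama_file is included):

    #include <cstdio>
    #include <stdexcept>

    void open_model(const char * path) {
        try {
            llama_file file(path, "rb"); // now throws std::runtime_error on failure
            // ... read the model header from `file` ...
        } catch (const std::runtime_error & err) {
            fprintf(stderr, "error: %s\n", err.what());
        }
    }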
@@ -107,10 +108,10 @@ struct llama_file {
         errno = 0;
         std::size_t ret = std::fread(ptr, size, 1, fp);
         if (ferror(fp)) {
-            throw format("read error: %s", strerror(errno));
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
         if (ret != 1) {
-            throw std::string("unexpectedly reached end of file");
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
         }
     }
 
@@ -133,7 +134,7 @@ struct llama_file {
         errno = 0;
         size_t ret = std::fwrite(ptr, size, 1, fp);
         if (ret != 1) {
-            throw format("write error: %s", strerror(errno));
+            throw std::runtime_error(format("write error: %s", strerror(errno)));
         }
     }
 
@@ -180,7 +181,7 @@ struct llama_mmap {
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
-            throw format("mmap failed: %s", strerror(errno));
+            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
 
         if (prefetch) {
@@ -207,7 +208,7 @@
         DWORD error = GetLastError();
 
         if (hMapping == NULL) {
-            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
         }
 
         addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
@@ -215,7 +216,7 @@
         CloseHandle(hMapping);
 
         if (addr == NULL) {
-            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
         }
 
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
@@ -243,8 +244,9 @@ struct llama_mmap {
 #else
     static constexpr bool SUPPORTED = false;
 
-    llama_mmap(struct llama_file *) {
-        throw std::string("mmap not supported");
+    llama_mmap(struct llama_file *, bool prefetch = true) {
+        (void)prefetch;
+        throw std::runtime_error(std::string("mmap not supported"));
     }
 #endif
 };
@@ -382,8 +384,13 @@ struct llama_mlock {
 #else
     static constexpr bool SUPPORTED = false;
 
-    void raw_lock(const void * addr, size_t size) {
+    size_t lock_granularity() {
+        return (size_t) 65536;
+    }
+
+    bool raw_lock(const void * addr, size_t size) {
         fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
     }
 
     void raw_unlock(const void * addr, size_t size) {}
@@ -395,6 +402,8 @@ struct llama_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
 
+    llama_buffer() = default;
+
     void resize(size_t size) {
         delete[] addr;
         addr = new uint8_t[size];
@@ -404,5 +413,62 @@ struct llama_buffer {
     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };
+
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+struct llama_ctx_buffer {
+    uint8_t * addr = NULL;
+    bool is_cuda;
+    size_t size = 0;
+
+    llama_ctx_buffer() = default;
+
+    void resize(size_t size) {
+        free();
+
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        if (addr) {
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
+        }
+        this->size = size;
+    }
+
+    void free() {
+        if (addr) {
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
+        }
+        addr = NULL;
+    }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
+};
+#else
+typedef llama_buffer llama_ctx_buffer;
+#endif
+
 #endif
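When built with GGML_USE_CUBLAS, the new llama_ctx_buffer first tries pinned (page-locked) host memory via ggml_cuda_host_malloc(), which speeds up host-to-device copies, and falls back to an ordinary new[] allocation if that fails; without CUBLAS it is simply an alias for llama_buffer. A small usage sketch (the buffer size is arbitrary; it assumes this header is included):

    void buffer_example() {
        llama_ctx_buffer buf;
        buf.resize(64 * 1024 * 1024); // 64 MiB; under CUBLAS, buf.is_cuda records which path was taken
        // ... use buf.addr / buf.size ...
    }   // released by the destructor (ggml_cuda_host_free or delete[])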