llama_cpp 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -162,40 +162,16 @@ typedef void * thread_ret_t;

  #define GGML_PRINT(...) printf(__VA_ARGS__)

+ //
+ // end of logging block
+ //
+
  #ifdef GGML_USE_ACCELERATE
  // uncomment to use vDSP for soft max computation
  // note: not sure if it is actually faster
  //#define GGML_SOFT_MAX_ACCELERATE
  #endif

- //
- // logging
- //
-
- #if (GGML_DEBUG >= 1)
- #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
- #else
- #define GGML_PRINT_DEBUG(...)
- #endif
-
- #if (GGML_DEBUG >= 5)
- #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
- #else
- #define GGML_PRINT_DEBUG_5(...)
- #endif
-
- #if (GGML_DEBUG >= 10)
- #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
- #else
- #define GGML_PRINT_DEBUG_10(...)
- #endif
-
- #define GGML_PRINT(...) printf(__VA_ARGS__)
-
- //
- // end of logging block
- //
-
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
  #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
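The relocated logging block above is gated entirely at compile time by GGML_DEBUG. A minimal, hedged usage sketch; the -DGGML_DEBUG value, the variables, and the call sites are illustrative assumptions, not taken from this diff:

    /* build with e.g. -DGGML_DEBUG=5 to enable the first two debug prints */
    GGML_PRINT("loaded %d tensors\n", n_tensors);       /* always printed              */
    GGML_PRINT_DEBUG("node %d op %d\n", i, op);         /* printed if GGML_DEBUG >= 1  */
    GGML_PRINT_DEBUG_5("row %d sum %f\n", r, sum);      /* printed if GGML_DEBUG >= 5  */
    GGML_PRINT_DEBUG_10("chunk %d/%d\n", c, n_chunks);  /* printed if GGML_DEBUG >= 10 */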
@@ -4951,6 +4927,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  *result = (struct ggml_tensor) {
  /*.type =*/ type,
  /*.backend =*/ GGML_BACKEND_CPU,
+ /*.buffer =*/ NULL,
  /*.n_dims =*/ n_dims,
  /*.ne =*/ { 1, 1, 1, 1 },
  /*.nb =*/ { 0, 0, 0, 0 },
@@ -5517,6 +5494,39 @@ struct ggml_tensor * ggml_view_tensor(
  return result;
  }

+ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
+ struct ggml_object * obj = ctx->objects_begin;
+
+ char * const mem_buffer = ctx->mem_buffer;
+
+ while (obj != NULL) {
+ if (obj->type == GGML_OBJECT_TENSOR) {
+ return (struct ggml_tensor *)(mem_buffer + obj->offs);
+ }
+
+ obj = obj->next;
+ }
+
+ return NULL;
+ }
+
+ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+ struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
+ obj = obj->next;
+
+ char * const mem_buffer = ctx->mem_buffer;
+
+ while (obj != NULL) {
+ if (obj->type == GGML_OBJECT_TENSOR) {
+ return (struct ggml_tensor *)(mem_buffer + obj->offs);
+ }
+
+ obj = obj->next;
+ }
+
+ return NULL;
+ }
+
  struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
  struct ggml_object * obj = ctx->objects_begin;

@@ -8670,6 +8680,7 @@ void ggml_set_param(

  GGML_ASSERT(tensor->grad == NULL);
  tensor->grad = ggml_dup_tensor(ctx, tensor);
+ ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
  }

  // ggml_compute_forward_dup
@@ -11256,7 +11267,7 @@ static void ggml_compute_forward_silu_f32(

  #ifndef NDEBUG
  for (int k = 0; k < nc; k++) {
- const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+ const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
  UNUSED(x);
  assert(!isnan(x));
  assert(!isinf(x));
@@ -13082,24 +13093,22 @@ static void ggml_compute_forward_alibi_f32(
  return;
  }

- const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
+ //const int n_past = ((int32_t *) dst->op_params)[0];
  const int n_head = ((int32_t *) dst->op_params)[1];
  float max_bias;
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

- assert(n_past >= 0);
-
- const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
- const int ne1 = src0->ne[1]; // seq_len_without_past
- const int ne2 = src0->ne[2]; // n_head -> this is k
- //const int ne3 = src0->ne[3]; // 1 -> bsz
+ const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
+ const int64_t ne1 = src0->ne[1]; // seq_len_without_past
+ const int64_t ne2 = src0->ne[2]; // n_head -> this is k
+ //const int64_t ne3 = src0->ne[3]; // 1 -> bsz

- const int n = ggml_nrows(src0);
- const int ne2_ne3 = n/ne1; // ne2*ne3
+ const int64_t n = ggml_nrows(src0);
+ const int64_t ne2_ne3 = n/ne1; // ne2*ne3

- const int nb0 = src0->nb[0];
- const int nb1 = src0->nb[1];
- const int nb2 = src0->nb[2];
+ const size_t nb0 = src0->nb[0];
+ const size_t nb1 = src0->nb[1];
+ const size_t nb2 = src0->nb[2];
  //const int nb3 = src0->nb[3];

  GGML_ASSERT(nb0 == sizeof(float));
@@ -13111,9 +13120,9 @@ static void ggml_compute_forward_alibi_f32(
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

- for (int i = 0; i < ne0; i++) {
- for (int j = 0; j < ne1; j++) {
- for (int k = 0; k < ne2_ne3; k++) {
+ for (int64_t i = 0; i < ne0; i++) {
+ for (int64_t j = 0; j < ne1; j++) {
+ for (int64_t k = 0; k < ne2_ne3; k++) {
  float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
  float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);

@@ -13128,7 +13137,6 @@ static void ggml_compute_forward_alibi_f32(
  }

  pdst[0] = i * m_k + src[0];
-
  }
  }
  }
@@ -14454,7 +14462,7 @@ static void ggml_compute_forward_conv_2d_f16_f32(
  int64_t t0 = ggml_perf_time_us();
  UNUSED(t0);

- GGML_TENSOR_BINARY_OP_LOCALS
+ GGML_TENSOR_BINARY_OP_LOCALS;

  const int ith = params->ith;
  const int nth = params->nth;
@@ -20203,6 +20211,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  ggml_vec_cpy_f32(nx, xp, x);
  ggml_vec_cpy_f32(nx, gp, g);

+ // TODO: instead of passing &cancel here, use the return code of the linesearch
+ // to determine if the optimization should be cancelled
+ // this is a simple change, but not doing this atm, since I don't have a nice
+ // way to test and don't want to break something with so many changes lined up
  ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
  if (cancel) {
  return GGML_OPT_CANCEL;
@@ -326,7 +326,7 @@ extern "C" {
  GGML_TYPE_COUNT,
  };

- enum ggml_backend {
+ enum ggml_backend_type {
  GGML_BACKEND_CPU = 0,
  GGML_BACKEND_GPU = 10,
  GGML_BACKEND_GPU_SPLIT = 20,
@@ -479,8 +479,10 @@ extern "C" {

  // n-dimensional tensor
  struct ggml_tensor {
- enum ggml_type type;
- enum ggml_backend backend;
+ enum ggml_type type;
+ enum ggml_backend_type backend;
+
+ struct ggml_backend_buffer * buffer;

  int n_dims;
  int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -514,7 +516,7 @@ extern "C" {

  void * extra; // extra things e.g. for ggml-cuda.cu

- char padding[4];
+ char padding[12];
  };

  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
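The padding bump from 4 to 12 bytes offsets the new buffer pointer added to struct ggml_tensor earlier in this diff, keeping the overall struct size a multiple of ggml's memory alignment. A hedged sketch of the invariant this preserves; GGML_MEM_ALIGN being 16 is assumed from typical ggml builds and is not shown in this diff:

    static_assert(sizeof(struct ggml_tensor) % GGML_MEM_ALIGN == 0,
                  "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");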
@@ -702,6 +704,9 @@ extern "C" {
  GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
  GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);

+ // Context tensor enumeration and lookup
+ GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
+ GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
  GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

  GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
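The two new declarations above allow walking every tensor allocated in a context, in allocation order. A minimal usage sketch, assuming an already-initialized struct ggml_context * ctx; the print body is illustrative only:

    /* iterate all tensors in a ggml_context, in allocation order */
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx);
         t != NULL;
         t = ggml_get_next_tensor(ctx, t)) {
        printf("%s: %lld elements\n", t->name, (long long) ggml_nelements(t));
    }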
@@ -1358,7 +1363,7 @@ extern "C" {

  // alibi position embedding
  // in-place, returns view(a)
- struct ggml_tensor * ggml_alibi(
+ GGML_API struct ggml_tensor * ggml_alibi(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  int n_past,
@@ -1367,7 +1372,7 @@ extern "C" {

  // clamp
  // in-place, returns view(a)
- struct ggml_tensor * ggml_clamp(
+ GGML_API struct ggml_tensor * ggml_clamp(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  float min,
@@ -2102,7 +2107,7 @@ extern "C" {
  enum ggml_type vec_dot_type;
  } ggml_type_traits_t;

- ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+ GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);

  #ifdef __cplusplus
  }
@@ -29,7 +29,7 @@

  // 2-bit quantization
  // weight is represented as x = a * q + b
- // 16 blocks of 16 elemenets each
+ // 16 blocks of 16 elements each
  // Effectively 2.5625 bits per weight
  typedef struct {
  uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
@@ -41,7 +41,7 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w

  // 3-bit quantization
  // weight is represented as x = a * q
- // 16 blocks of 16 elemenets each
+ // 16 blocks of 16 elements each
  // Effectively 3.4375 bits per weight
  #ifdef GGML_QKK_64
  typedef struct {
@@ -62,7 +62,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 +
  #endif

  // 4-bit quantization
- // 16 blocks of 32 elements each
+ // 8 blocks of 32 elements each
  // weight is represented as x = a * q + b
  // Effectively 4.5 bits per weight
  #ifdef GGML_QKK_64
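The corrected comment matches the block geometry: in the standard build a super-block holds QK_K = 256 weights, i.e. 256/32 = 8 blocks of 32, and the 4.5 bits-per-weight figure follows from the size formula visible in the next hunk's static_assert for block_q4_K. A hedged worked example; QK_K = 256, K_SCALE_SIZE = 12, and sizeof(ggml_fp16_t) = 2 are the usual non-GGML_QKK_64 values, assumed here:

    /* bytes per block_q4_K = 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2
                            = 4 + 12 + 128 = 144 bytes for 256 weights
       bits per weight      = 144 * 8 / 256 = 4.5                          */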
@@ -83,7 +83,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
  #endif

  // 5-bit quantization
- // 16 blocks of 32 elements each
+ // 8 blocks of 32 elements each
  // weight is represented as x = a * q + b
  // Effectively 5.5 bits per weight
  #ifdef GGML_QKK_64
@@ -107,7 +107,7 @@ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/

  // 6-bit quantization
  // weight is represented as x = a * q
- // 16 blocks of 16 elemenets each
+ // 16 blocks of 16 elements each
  // Effectively 6.5625 bits per weight
  typedef struct {
  uint8_t ql[QK_K/2]; // quants, lower 4 bits