llama_cpp 0.7.0 → 0.7.1

@@ -162,40 +162,16 @@ typedef void * thread_ret_t;
 
  #define GGML_PRINT(...) printf(__VA_ARGS__)
 
+ //
+ // end of logging block
+ //
+
  #ifdef GGML_USE_ACCELERATE
  // uncomment to use vDSP for soft max computation
  // note: not sure if it is actually faster
  //#define GGML_SOFT_MAX_ACCELERATE
  #endif
 
- //
- // logging
- //
-
- #if (GGML_DEBUG >= 1)
- #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
- #else
- #define GGML_PRINT_DEBUG(...)
- #endif
-
- #if (GGML_DEBUG >= 5)
- #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
- #else
- #define GGML_PRINT_DEBUG_5(...)
- #endif
-
- #if (GGML_DEBUG >= 10)
- #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
- #else
- #define GGML_PRINT_DEBUG_10(...)
- #endif
-
- #define GGML_PRINT(...) printf(__VA_ARGS__)
-
- //
- // end of logging block
- //
-
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
  #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
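
This hunk in ggml.c only moves the logging block earlier in the file; the macros themselves are unchanged. As a minimal sketch of how the level-gated macros are meant to be used (the file name and the GGML_DEBUG value below are assumptions for illustration, not part of the diff):

    /* debug_levels_example.c -- hypothetical standalone file; build with
       -DGGML_DEBUG=5 to enable GGML_PRINT_DEBUG and GGML_PRINT_DEBUG_5,
       while GGML_PRINT_DEBUG_10 still compiles away to nothing. */
    #include <stdio.h>

    #ifndef GGML_DEBUG
    #define GGML_DEBUG 0
    #endif

    #if (GGML_DEBUG >= 1)
    #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
    #else
    #define GGML_PRINT_DEBUG(...)
    #endif

    #if (GGML_DEBUG >= 5)
    #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
    #else
    #define GGML_PRINT_DEBUG_5(...)
    #endif

    int main(void) {
        GGML_PRINT_DEBUG("coarse trace: n_threads = %d\n", 4); // printed when GGML_DEBUG >= 1
        GGML_PRINT_DEBUG_5("fine-grained trace\n");            // printed when GGML_DEBUG >= 5
        return 0;
    }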
@@ -4951,6 +4927,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  *result = (struct ggml_tensor) {
      /*.type =*/ type,
      /*.backend =*/ GGML_BACKEND_CPU,
+     /*.buffer =*/ NULL,
      /*.n_dims =*/ n_dims,
      /*.ne =*/ { 1, 1, 1, 1 },
      /*.nb =*/ { 0, 0, 0, 0 },
@@ -5517,6 +5494,39 @@ struct ggml_tensor * ggml_view_tensor(
      return result;
  }
 
+ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
+     struct ggml_object * obj = ctx->objects_begin;
+
+     char * const mem_buffer = ctx->mem_buffer;
+
+     while (obj != NULL) {
+         if (obj->type == GGML_OBJECT_TENSOR) {
+             return (struct ggml_tensor *)(mem_buffer + obj->offs);
+         }
+
+         obj = obj->next;
+     }
+
+     return NULL;
+ }
+
+ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+     struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
+     obj = obj->next;
+
+     char * const mem_buffer = ctx->mem_buffer;
+
+     while (obj != NULL) {
+         if (obj->type == GGML_OBJECT_TENSOR) {
+             return (struct ggml_tensor *)(mem_buffer + obj->offs);
+         }
+
+         obj = obj->next;
+     }
+
+     return NULL;
+ }
+
  struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
      struct ggml_object * obj = ctx->objects_begin;
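
The two new functions walk the object list of a context, so callers can enumerate every tensor without knowing their names in advance. A small usage sketch, assuming a context created by the caller (the 16 MB pool size and the tensor name "a" are arbitrary choices for the example):

    #include <stdio.h>
    #include "ggml.h"

    // Print the name and element count of every tensor allocated in a context.
    static void dump_tensors(struct ggml_context * ctx) {
        for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL;
             t = ggml_get_next_tensor(ctx, t)) {
            printf("%-24s %lld elements\n", t->name, (long long) ggml_nelements(t));
        }
    }

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_set_name(a, "a");

        dump_tensors(ctx); // prints "a" plus any other tensors in the context
        ggml_free(ctx);
        return 0;
    }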
 
@@ -8670,6 +8680,7 @@ void ggml_set_param(
 
      GGML_ASSERT(tensor->grad == NULL);
      tensor->grad = ggml_dup_tensor(ctx, tensor);
+     ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
  }
 
  // ggml_compute_forward_dup
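
With this addition the gradient tensor created by ggml_set_param inherits the parameter's name with a " (grad)" suffix, which makes graph dumps easier to read. A brief illustration (the context argument and the name "w" are assumptions for the example):

    #include <assert.h>
    #include <string.h>
    #include "ggml.h"

    // ggml_set_param duplicates the tensor for its gradient; after this change
    // the duplicate is also named "<param name> (grad)".
    static void grad_name_example(struct ggml_context * ctx) {
        struct ggml_tensor * w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_set_name(w, "w");
        ggml_set_param(ctx, w);
        assert(strcmp(w->grad->name, "w (grad)") == 0);
    }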
@@ -11256,7 +11267,7 @@ static void ggml_compute_forward_silu_f32(
 
  #ifndef NDEBUG
      for (int k = 0; k < nc; k++) {
-         const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+         const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
          UNUSED(x);
          assert(!isnan(x));
          assert(!isinf(x));
@@ -13082,24 +13093,22 @@ static void ggml_compute_forward_alibi_f32(
          return;
      }
 
-     const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
+     //const int n_past = ((int32_t *) dst->op_params)[0];
      const int n_head = ((int32_t *) dst->op_params)[1];
      float max_bias;
      memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
 
-     assert(n_past >= 0);
-
-     const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-     const int ne1 = src0->ne[1]; // seq_len_without_past
-     const int ne2 = src0->ne[2]; // n_head -> this is k
-     //const int ne3 = src0->ne[3]; // 1 -> bsz
+     const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
+     const int64_t ne1 = src0->ne[1]; // seq_len_without_past
+     const int64_t ne2 = src0->ne[2]; // n_head -> this is k
+     //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
 
-     const int n = ggml_nrows(src0);
-     const int ne2_ne3 = n/ne1; // ne2*ne3
+     const int64_t n = ggml_nrows(src0);
+     const int64_t ne2_ne3 = n/ne1; // ne2*ne3
 
-     const int nb0 = src0->nb[0];
-     const int nb1 = src0->nb[1];
-     const int nb2 = src0->nb[2];
+     const size_t nb0 = src0->nb[0];
+     const size_t nb1 = src0->nb[1];
+     const size_t nb2 = src0->nb[2];
      //const int nb3 = src0->nb[3];
 
      GGML_ASSERT(nb0 == sizeof(float));
@@ -13111,9 +13120,9 @@ static void ggml_compute_forward_alibi_f32(
      const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
      const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
 
-     for (int i = 0; i < ne0; i++) {
-         for (int j = 0; j < ne1; j++) {
-             for (int k = 0; k < ne2_ne3; k++) {
+     for (int64_t i = 0; i < ne0; i++) {
+         for (int64_t j = 0; j < ne1; j++) {
+             for (int64_t k = 0; k < ne2_ne3; k++) {
                  float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
                  float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
 
@@ -13128,7 +13137,6 @@ static void ggml_compute_forward_alibi_f32(
                  }
 
                  pdst[0] = i * m_k + src[0];
-
              }
          }
      }
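
The three hunks above only widen the index and stride types (int to int64_t / size_t) around the ALiBi bias loop; the math is untouched. For context, here is a sketch of how the per-head slope m_k is derived from m0 and m1 in the surrounding ggml_compute_forward_alibi_f32 code — a paraphrase based on the code around this hunk, not a verbatim copy of the function:

    #include <math.h>

    // ALiBi slope for head k: heads below the largest power of two not exceeding
    // n_head use successive powers of m0; the remaining heads interpolate with m1.
    // The bias added to each score is then i * m_k, as in pdst[0] = i*m_k + src[0].
    static float alibi_slope(int k, int n_head, float max_bias) {
        const int   n_heads_log2_floor = 1 << (int) floorf(log2f((float) n_head));
        const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

        if (k < n_heads_log2_floor) {
            return powf(m0, (float) (k + 1));
        }
        return powf(m1, (float) (2*(k - n_heads_log2_floor) + 1));
    }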
@@ -14454,7 +14462,7 @@ static void ggml_compute_forward_conv_2d_f16_f32(
  int64_t t0 = ggml_perf_time_us();
  UNUSED(t0);
 
- GGML_TENSOR_BINARY_OP_LOCALS
+ GGML_TENSOR_BINARY_OP_LOCALS;
 
  const int ith = params->ith;
  const int nth = params->nth;
@@ -20203,6 +20211,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  ggml_vec_cpy_f32(nx, xp, x);
  ggml_vec_cpy_f32(nx, gp, g);
 
+ // TODO: instead of passing &cancel here, use the return code of the linesearch
+ // to determine if the optimization should be cancelled
+ // this is a simple change, but not doing this atm, since I don't have a nice
+ // way to test and don't want to break something with so many changes lined up
  ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
  if (cancel) {
      return GGML_OPT_CANCEL;
@@ -326,7 +326,7 @@ extern "C" {
      GGML_TYPE_COUNT,
  };
 
- enum ggml_backend {
+ enum ggml_backend_type {
      GGML_BACKEND_CPU = 0,
      GGML_BACKEND_GPU = 10,
      GGML_BACKEND_GPU_SPLIT = 20,
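
Only the enum's tag name changes in this ggml.h hunk; the enumerators and their values (0, 10, 20) stay the same, so code that switches over a tensor's backend keeps working once any spelled-out uses of the old type name are updated. A minimal sketch:

    #include "ggml.h"

    // Map a tensor's backend to a printable label. Only the enum's type name
    // changed in this release; the cases below are unaffected.
    static const char * backend_name(const struct ggml_tensor * t) {
        switch (t->backend) {
            case GGML_BACKEND_CPU:       return "CPU";
            case GGML_BACKEND_GPU:       return "GPU";
            case GGML_BACKEND_GPU_SPLIT: return "GPU (split)";
        }
        return "unknown";
    }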
@@ -479,8 +479,10 @@ extern "C" {
 
  // n-dimensional tensor
  struct ggml_tensor {
-     enum ggml_type type;
-     enum ggml_backend backend;
+     enum ggml_type         type;
+     enum ggml_backend_type backend;
+
+     struct ggml_backend_buffer * buffer;
 
      int n_dims;
      int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -514,7 +516,7 @@ extern "C" {
 
      void * extra; // extra things e.g. for ggml-cuda.cu
 
-     char padding[4];
+     char padding[12];
  };
 
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -702,6 +704,9 @@ extern "C" {
  GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
  GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
 
+ // Context tensor enumeration and lookup
+ GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
+ GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
  GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
 
  GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@@ -1358,7 +1363,7 @@ extern "C" {
 
  // alibi position embedding
  // in-place, returns view(a)
- struct ggml_tensor * ggml_alibi(
+ GGML_API struct ggml_tensor * ggml_alibi(
      struct ggml_context * ctx,
      struct ggml_tensor * a,
      int n_past,
@@ -1367,7 +1372,7 @@ extern "C" {
 
  // clamp
  // in-place, returns view(a)
- struct ggml_tensor * ggml_clamp(
+ GGML_API struct ggml_tensor * ggml_clamp(
      struct ggml_context * ctx,
      struct ggml_tensor * a,
      float min,
@@ -2102,7 +2107,7 @@ extern "C" {
      enum ggml_type vec_dot_type;
  } ggml_type_traits_t;
 
- ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+ GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
 
  #ifdef __cplusplus
  }
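
Adding GGML_API to these declarations matters mainly for shared-library builds on Windows, where the macro expands to dllexport/dllimport attributes; without it the symbols are not exported and external callers cannot link against them. A small hedged usage sketch for the type-traits query, reading only the vec_dot_type field visible above:

    #include <stdio.h>
    #include "ggml.h"

    // For a quantized tensor type, vec_dot_type names the type the other
    // operand should be converted to before the optimized dot product is used.
    int main(void) {
        const ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_Q4_K);
        printf("vec_dot_type for GGML_TYPE_Q4_K: %d\n", (int) traits.vec_dot_type);
        return 0;
    }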
@@ -29,7 +29,7 @@
 
  // 2-bit quantization
  // weight is represented as x = a * q + b
- // 16 blocks of 16 elemenets each
+ // 16 blocks of 16 elements each
  // Effectively 2.5625 bits per weight
  typedef struct {
      uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
@@ -41,7 +41,7 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
 
  // 3-bit quantization
  // weight is represented as x = a * q
- // 16 blocks of 16 elemenets each
+ // 16 blocks of 16 elements each
  // Effectively 3.4375 bits per weight
  #ifdef GGML_QKK_64
  typedef struct {
@@ -62,7 +62,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 +
  #endif
 
  // 4-bit quantization
- // 16 blocks of 32 elements each
+ // 8 blocks of 32 elements each
  // weight is represented as x = a * q + b
  // Effectively 4.5 bits per weight
  #ifdef GGML_QKK_64
@@ -83,7 +83,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
  #endif
 
  // 5-bit quantization
- // 16 blocks of 32 elements each
+ // 8 blocks of 32 elements each
  // weight is represented as x = a * q + b
  // Effectively 5.5 bits per weight
  #ifdef GGML_QKK_64
@@ -107,7 +107,7 @@ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
 
  // 6-bit quantization
  // weight is represented as x = a * q
- // 16 blocks of 16 elemenets each
+ // 16 blocks of 16 elements each
  // Effectively 6.5625 bits per weight
  typedef struct {
      uint8_t ql[QK_K/2]; // quants, lower 4 bits
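
The corrected block counts in these k_quants.h comments can be cross-checked against the struct layouts: with QK_K = 256, a super-block's size in bytes times 8, divided by 256 weights, gives the "effective bits per weight" quoted in the comments. A small arithmetic check for block_q4_K, whose static_assert appears above (the QK_K and K_SCALE_SIZE constants restate values defined in k_quants.h):

    #include <stdio.h>

    // block_q4_K holds 8 blocks of 32 weights: 2 fp16 super-block scales (4 bytes),
    // 12 bytes of packed 6-bit sub-block scales/mins, and QK_K/2 bytes of 4-bit quants.
    int main(void) {
        const int QK_K         = 256;  // weights per super-block
        const int K_SCALE_SIZE = 12;   // packed sub-block scales/mins
        const int bytes        = 2*2 + K_SCALE_SIZE + QK_K/2;   // 144 bytes
        printf("block_q4_K: %d bytes -> %.4f bits per weight\n",
               bytes, 8.0 * bytes / QK_K);                      // prints 4.5000
        return 0;
    }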