llama_cpp 0.7.0 → 0.8.0

@@ -19,7 +19,7 @@
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif

- #define CL_DMMV_BLOCK_SIZE 32
+ #define CL_DMMV_LOCAL_SIZE 32

  #ifndef K_QUANTS_PER_ITERATION
  #define K_QUANTS_PER_ITERATION 1
@@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx,
  const int row = get_group_id(0);

  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  __global const struct block_q2_K * x = xx + ib0;

@@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx,
  const int row = get_group_id(0);

  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  __global const struct block_q3_K * x = xx + ib0;

@@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx,

  const int row = get_group_id(0);
  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15
  const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
@@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx,

  const int row = get_group_id(0);
  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  const int tid = get_local_id(0)/2; // 0...15
  const int ix = get_local_id(0)%2;
@@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
  const int row = get_group_id(0);

  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  __global const struct block_q6_K * x = xx + ib0;

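Note: get_global_offset(0) picks up the global work offset supplied by the host at enqueue time (see the clEnqueueNDRangeKernel change further down in this diff), which lets these kernels address one 2D slice of a GPU-resident quantized tensor. A minimal host-side sketch, with hypothetical names for the slice index and block count:

    /* hedged sketch: the offset is expressed in quantized blocks and reaches the
     * kernel through get_global_offset(0) */
    size_t offset = slice_index * blocks_per_slice;  /* hypothetical names */
    size_t local  = CL_DMMV_LOCAL_SIZE;              /* 32, per this diff */
    size_t global = nrows * local;                   /* one work-group per row */
    CL_CHECK(clEnqueueNDRangeKernel(queue, dmmv_kernel, 1, &offset, &global, &local, 0, NULL, NULL));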
@@ -745,19 +745,21 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {

  std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
  __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
- const int block_size = get_local_size(0);
+ const int local_size = get_local_size(0);
  const int row = get_group_id(0);
  const int tid = get_local_id(0);

  const uint qk = QUANT_K;
  const uint qr = QUANT_R;

+ const int col_step = local_size * 2;
  const int y_offset = qr == 1 ? 1 : qk/2;

+ x += get_global_offset(0);
+
  tmp[tid] = 0;

- for (int i = 0; i < ncols/block_size; i += 2) {
- const int col = i*block_size + 2*tid;
+ for (int col = tid*2; col < ncols; col += col_step) {
  const int ib = (row*ncols + col)/qk; // block index
  const int iqs = (col%qk)/qr; // quant index
  const int iybs = col - col%qk; // y block start index
@@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float

  // sum up partial sums and write back result
  barrier(CLK_LOCAL_MEM_FENCE);
- for (int s=block_size/2; s>0; s>>=1) {
+ for (int s=local_size/2; s>0; s>>=1) {
  if (tid < s) {
  tmp[tid] += tmp[tid + s];
  }
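For reference, the rewritten loop walks columns directly with a fixed stride of local_size * 2, which appears to relax the old requirement that ncols be a multiple of twice the work-group size (the host-side change later in this diff only requires ne00 % 2 == 0 for the vector path). A hedged scalar model of how one work-item visits columns:

    /* hedged sketch: the columns handled by work-item `tid` under the new loop */
    for (int col = tid * 2; col < ncols; col += local_size * 2) {
        /* each pass presumably dequantizes two quant values and accumulates
         * them into tmp[tid] against y[iybs + iqs] and y[iybs + iqs + y_offset];
         * the dequantization itself is elided from this hunk */
    }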
@@ -1393,75 +1395,46 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];
  const int64_t ne03 = src0->ne[3];
- const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
  const int64_t ne12 = src1->ne[2];
  const int64_t ne13 = src1->ne[3];
- const int64_t nb10 = src1->nb[0];
  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];
  size_t x_size;
  size_t d_size;

- cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
+ cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
  cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
- cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+ cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst


  for (int64_t i03 = 0; i03 < ne03; i03++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
- const int i0 = i03*ne02 + i02;
-
  cl_event ev;

  // copy src0 to device
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
-
- if (nb10 == sizeof(float)) {
- // Contiguous, avoid overhead from queueing many kernel runs
- const int64_t i13 = i03%ne13;
- const int64_t i12 = i02%ne12;
- const int i1 = i13*ne12*ne11 + i12*ne11;
-
- cl_int x_offset = 0;
- cl_int y_offset = i1*ne10;
- cl_int d_offset = 0;
-
- size_t global = ne00 * ne01;
- cl_int ky = ne10;
- CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
- CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
- } else {
- for (int64_t i01 = 0; i01 < ne01; i01++) {
- const int64_t i13 = i03%ne13;
- const int64_t i12 = i02%ne12;
- const int64_t i11 = i01%ne11;
- const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
-
- cl_int x_offset = i01*ne00;
- cl_int y_offset = i1*ne10;
- cl_int d_offset = i01*ne00;
-
- // compute
- size_t global = ne00;
- cl_int ky = ne10;
- CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
- CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
- }
- }
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
+
+ const int64_t i13 = i03%ne13;
+ const int64_t i12 = i02%ne12;
+ const int i1 = i13*ne12*ne11 + i12*ne11;
+
+ cl_int x_offset = 0;
+ cl_int y_offset = i1*ne10;
+ cl_int d_offset = 0;
+
+ size_t global = ne00 * ne01;
+ cl_int ky = ne10 * ne11;
+
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));

  CL_CHECK(clReleaseEvent(ev));
  CL_CHECK(clFinish(queue));
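With src1 kept resident on the device, the host now issues a single kernel launch per (i03, i02) plane and relies on the kernel's ky argument (ne10 * ne11 here) to broadcast src1 across rows. The mul_f32 kernel body itself is not shown in this diff; a hedged sketch of a kernel consistent with the arguments set above:

    /* hedged sketch only; argument order matches the clSetKernelArg calls above */
    __kernel void mul_f32(__global float * x, const int x_offset,
                          __global float * y, const int y_offset,
                          __global float * dst, const int d_offset,
                          const int ky) {
        const int i = get_global_id(0);
        dst[d_offset + i] = x[x_offset + i] * y[y_offset + i % ky];
    }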
@@ -1566,7 +1539,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  ggml_cl_pool_free(d_D, d_size);
  }

- static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
+ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
  GGML_ASSERT(fp16_support);

  const int64_t ne00 = src0->ne[0];
@@ -1596,6 +1569,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  const int y_ne = ne11 * ne10;
  const int d_ne = ne11 * ne01;

+ GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne);
+ GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne);
+ ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata;
+
  size_t x_size;
  size_t y_size;
  size_t d_size;
@@ -1632,7 +1609,6 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr

  // convert src1 to fp16
  // TODO: use multiple threads
- ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
  char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
  if (src1_cont_rows) {
  if (src1_cont_cols) {
@@ -1704,7 +1680,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];
  const ggml_type type = src0->type;
- const bool mul_mat_vec = ne11 == 1;
+ const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;

  const int64_t r2 = ne12 / ne02;
  const int64_t r3 = ne13 / ne03;
@@ -1737,7 +1713,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  GGML_ASSERT(to_fp32_cl != nullptr);

  const size_t global_denom = ggml_cl_global_denom(type);
- const size_t local = ggml_cl_local_size(type);
+ const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);

  size_t ev_idx = 0;
  std::vector<cl_event> events;
@@ -1770,8 +1746,8 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));

  // compute
- const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
- const size_t local = CL_DMMV_BLOCK_SIZE;
+ const size_t global = ne01 * local;
+ const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
  const cl_int ncols = ne00;
  events.emplace_back();
  CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1779,7 +1755,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
  CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
  CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
- CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
  } else { // general dequantization kernel + CLBlast matrix matrix multiplication
  // convert src0 to fp32 on device
  const size_t global = x_ne / global_denom;
@@ -1895,8 +1871,8 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor *
  }

  size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
- if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
- return ggml_nelements(src1) * sizeof(ggml_fp16_t);
+ if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
+ return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]);
  }
  return 0;
  }
@@ -162,40 +162,16 @@ typedef void * thread_ret_t;

  #define GGML_PRINT(...) printf(__VA_ARGS__)

+ //
+ // end of logging block
+ //
+
  #ifdef GGML_USE_ACCELERATE
  // uncomment to use vDSP for soft max computation
  // note: not sure if it is actually faster
  //#define GGML_SOFT_MAX_ACCELERATE
  #endif

- //
- // logging
- //
-
- #if (GGML_DEBUG >= 1)
- #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
- #else
- #define GGML_PRINT_DEBUG(...)
- #endif
-
- #if (GGML_DEBUG >= 5)
- #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
- #else
- #define GGML_PRINT_DEBUG_5(...)
- #endif
-
- #if (GGML_DEBUG >= 10)
- #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
- #else
- #define GGML_PRINT_DEBUG_10(...)
- #endif
-
- #define GGML_PRINT(...) printf(__VA_ARGS__)
-
- //
- // end of logging block
- //
-
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
  #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
@@ -4951,6 +4927,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  *result = (struct ggml_tensor) {
  /*.type =*/ type,
  /*.backend =*/ GGML_BACKEND_CPU,
+ /*.buffer =*/ NULL,
  /*.n_dims =*/ n_dims,
  /*.ne =*/ { 1, 1, 1, 1 },
  /*.nb =*/ { 0, 0, 0, 0 },
@@ -5517,6 +5494,39 @@ struct ggml_tensor * ggml_view_tensor(
  return result;
  }

+ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
+ struct ggml_object * obj = ctx->objects_begin;
+
+ char * const mem_buffer = ctx->mem_buffer;
+
+ while (obj != NULL) {
+ if (obj->type == GGML_OBJECT_TENSOR) {
+ return (struct ggml_tensor *)(mem_buffer + obj->offs);
+ }
+
+ obj = obj->next;
+ }
+
+ return NULL;
+ }
+
+ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+ struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
+ obj = obj->next;
+
+ char * const mem_buffer = ctx->mem_buffer;
+
+ while (obj != NULL) {
+ if (obj->type == GGML_OBJECT_TENSOR) {
+ return (struct ggml_tensor *)(mem_buffer + obj->offs);
+ }
+
+ obj = obj->next;
+ }
+
+ return NULL;
+ }
+
  struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
  struct ggml_object * obj = ctx->objects_begin;

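The two helpers added above make it possible to walk every tensor allocated in a context without knowing its name; a short usage sketch (assuming a valid ctx):

    // enumerate all tensors in a ggml context with the new helpers
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        printf("%s: %lld elements\n", t->name, (long long) ggml_nelements(t));
    }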
@@ -8670,6 +8680,7 @@ void ggml_set_param(

  GGML_ASSERT(tensor->grad == NULL);
  tensor->grad = ggml_dup_tensor(ctx, tensor);
+ ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
  }

  // ggml_compute_forward_dup
@@ -11256,7 +11267,7 @@ static void ggml_compute_forward_silu_f32(

  #ifndef NDEBUG
  for (int k = 0; k < nc; k++) {
- const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+ const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
  UNUSED(x);
  assert(!isnan(x));
  assert(!isinf(x));
@@ -13082,24 +13093,22 @@ static void ggml_compute_forward_alibi_f32(
  return;
  }

- const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
+ //const int n_past = ((int32_t *) dst->op_params)[0];
  const int n_head = ((int32_t *) dst->op_params)[1];
  float max_bias;
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

- assert(n_past >= 0);
-
- const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
- const int ne1 = src0->ne[1]; // seq_len_without_past
- const int ne2 = src0->ne[2]; // n_head -> this is k
- //const int ne3 = src0->ne[3]; // 1 -> bsz
+ const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
+ const int64_t ne1 = src0->ne[1]; // seq_len_without_past
+ const int64_t ne2 = src0->ne[2]; // n_head -> this is k
+ //const int64_t ne3 = src0->ne[3]; // 1 -> bsz

- const int n = ggml_nrows(src0);
- const int ne2_ne3 = n/ne1; // ne2*ne3
+ const int64_t n = ggml_nrows(src0);
+ const int64_t ne2_ne3 = n/ne1; // ne2*ne3

- const int nb0 = src0->nb[0];
- const int nb1 = src0->nb[1];
- const int nb2 = src0->nb[2];
+ const size_t nb0 = src0->nb[0];
+ const size_t nb1 = src0->nb[1];
+ const size_t nb2 = src0->nb[2];
  //const int nb3 = src0->nb[3];

  GGML_ASSERT(nb0 == sizeof(float));
@@ -13111,9 +13120,9 @@ static void ggml_compute_forward_alibi_f32(
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

- for (int i = 0; i < ne0; i++) {
- for (int j = 0; j < ne1; j++) {
- for (int k = 0; k < ne2_ne3; k++) {
+ for (int64_t i = 0; i < ne0; i++) {
+ for (int64_t j = 0; j < ne1; j++) {
+ for (int64_t k = 0; k < ne2_ne3; k++) {
  float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
  float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);

@@ -13128,7 +13137,6 @@ static void ggml_compute_forward_alibi_f32(
  }

  pdst[0] = i * m_k + src[0];
-
  }
  }
  }
@@ -13529,7 +13537,7 @@ static void ggml_compute_forward_rope_f16(
  dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
  dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
  }
- } if (!is_neox) {
+ } else if (!is_neox) {
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
  const float cos_theta = cosf(theta);
  const float sin_theta = sinf(theta);
@@ -14454,7 +14462,7 @@ static void ggml_compute_forward_conv_2d_f16_f32(
  int64_t t0 = ggml_perf_time_us();
  UNUSED(t0);

- GGML_TENSOR_BINARY_OP_LOCALS
+ GGML_TENSOR_BINARY_OP_LOCALS;

  const int ith = params->ith;
  const int nth = params->nth;
@@ -19162,6 +19170,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {

  if (idx == -1) {
  fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
+ fclose(fout);
  return;
  }

@@ -20203,6 +20212,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  ggml_vec_cpy_f32(nx, xp, x);
  ggml_vec_cpy_f32(nx, gp, g);

+ // TODO: instead of passing &cancel here, use the return code of the linesearch
+ // to determine if the optimization should be cancelled
+ // this is a simple change, but not doing this atm, since I don't have a nice
+ // way to test and don't want to break something with so many changes lined up
  ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
  if (cancel) {
  return GGML_OPT_CANCEL;
@@ -20832,7 +20845,7 @@ struct gguf_kv {
  };

  struct gguf_header {
- uint32_t magic;
+ char magic[4];
  uint32_t version;
  uint64_t n_tensors; // GGUFv2
  uint64_t n_kv; // GGUFv2
@@ -20902,7 +20915,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset)
  struct gguf_context * gguf_init_empty(void) {
  struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));

- ctx->header.magic = GGUF_MAGIC;
+ memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
  ctx->header.version = GGUF_VERSION;
  ctx->header.n_tensors = 0;
  ctx->header.n_kv = 0;
@@ -20928,16 +20941,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  // offset from start of file
  size_t offset = 0;

- uint32_t magic = 0;
+ char magic[4];

  // check the magic before making allocations
  {
  gguf_fread_el(file, &magic, sizeof(magic), &offset);

- if (magic != GGUF_MAGIC) {
- fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
- fclose(file);
- return NULL;
+ for (uint32_t i = 0; i < sizeof(magic); i++) {
+ if (magic[i] != GGUF_MAGIC[i]) {
+ fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
+ fclose(file);
+ return NULL;
+ }
  }
  }

@@ -20947,7 +20962,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

  // read the header
  {
- ctx->header.magic = magic;
+ strncpy(ctx->header.magic, magic, 4);
+

  ctx->kv = NULL;
  ctx->infos = NULL;
@@ -231,8 +231,9 @@
  #define GGML_EXIT_SUCCESS 0
  #define GGML_EXIT_ABORTED 1

- #define GGUF_MAGIC 0x46554747 // "GGUF"
- #define GGUF_VERSION 2
+ #define GGUF_MAGIC "GGUF"
+
+ #define GGUF_VERSION 3

  #define GGUF_DEFAULT_ALIGNMENT 32

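GGUF_MAGIC is now the literal string "GGUF" (rather than the little-endian integer 0x46554747) and the format version moves to 3. A hedged sketch of how the first bytes of a GGUF file line up with these defines (layout only, not the library's writer API):

    /* hedged sketch: the file starts with the 4 magic bytes, then a uint32 version */
    FILE * fout = fopen("model.gguf", "wb");   /* hypothetical file name */
    fwrite(GGUF_MAGIC, 1, 4, fout);            /* 'G','G','U','F', no terminating NUL */
    const uint32_t version = GGUF_VERSION;     /* 3 as of this release */
    fwrite(&version, sizeof(version), 1, fout);
    fclose(fout);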
@@ -326,7 +327,7 @@ extern "C" {
  GGML_TYPE_COUNT,
  };

- enum ggml_backend {
+ enum ggml_backend_type {
  GGML_BACKEND_CPU = 0,
  GGML_BACKEND_GPU = 10,
  GGML_BACKEND_GPU_SPLIT = 20,
@@ -479,8 +480,10 @@ extern "C" {

  // n-dimensional tensor
  struct ggml_tensor {
- enum ggml_type type;
- enum ggml_backend backend;
+ enum ggml_type type;
+ enum ggml_backend_type backend;
+
+ struct ggml_backend_buffer * buffer;

  int n_dims;
  int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -514,7 +517,7 @@ extern "C" {

  void * extra; // extra things e.g. for ggml-cuda.cu

- char padding[4];
+ char padding[12];
  };

  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -702,6 +705,9 @@ extern "C" {
  GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
  GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);

+ // Context tensor enumeration and lookup
+ GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
+ GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
  GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

  GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@@ -1358,7 +1364,7 @@ extern "C" {

  // alibi position embedding
  // in-place, returns view(a)
- struct ggml_tensor * ggml_alibi(
+ GGML_API struct ggml_tensor * ggml_alibi(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  int n_past,
@@ -1367,7 +1373,7 @@ extern "C" {

  // clamp
  // in-place, returns view(a)
- struct ggml_tensor * ggml_clamp(
+ GGML_API struct ggml_tensor * ggml_clamp(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  float min,
@@ -2102,7 +2108,7 @@ extern "C" {
  enum ggml_type vec_dot_type;
  } ggml_type_traits_t;

- ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+ GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);

  #ifdef __cplusplus
  }
@@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <intrin.h>
  #else
- #if !defined(__riscv)
+ #if !defined(__riscv) && !defined(__s390__)
  #include <immintrin.h>
  #endif
  #endif
@@ -462,12 +462,9 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
  }

  size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
- const int nb = k / QK_K;
-
- // TODO - collect histograms - although, at a second thought, I don't really care about them
- (void)hist;
+ (void)hist; // TODO: collect histograms

- for (int j = 0; j < nb; j += k) {
+ for (int j = 0; j < n; j += k) {
  block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
  quantize_row_q2_K_reference(src + j, y, k);
  }
@@ -678,12 +675,9 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
  }

  size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
- const int nb = k / QK_K;
-
- // TODO - collect histograms - although, at a second thought, I don't really care about them
- (void)hist;
+ (void)hist; // TODO: collect histograms

- for (int j = 0; j < nb; j += k) {
+ for (int j = 0; j < n; j += k) {
  block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
  quantize_row_q3_K_reference(src + j, y, k);
  }
@@ -846,9 +840,9 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {

  size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
  assert(k % QK_K == 0);
- const int nb = k / QK_K;
  (void)hist; // TODO: collect histograms
- for (int j = 0; j < nb; j += k) {
+
+ for (int j = 0; j < n; j += k) {
  block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
  quantize_row_q4_K_reference(src + j, y, k);
  }
@@ -1052,9 +1046,9 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {

  size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
  assert(k % QK_K == 0);
- const int nb = k / QK_K;
- (void)hist;
- for (int j = 0; j < nb; j += k) {
+ (void)hist; // TODO: collect histograms
+
+ for (int j = 0; j < n; j += k) {
  block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
  quantize_row_q5_K_reference(src + j, y, k);
  }
@@ -1200,11 +1194,9 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {

  size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
  assert(k % QK_K == 0);
- const int nb = k / QK_K;
-
- (void)hist; // TODO
+ (void)hist; // TODO: collect histograms

- for (int j = 0; j < nb; j += k) {
+ for (int j = 0; j < n; j += k) {
  block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
  quantize_row_q6_K_reference(src + j, y, k);
  }
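The corrected loops now cover all n input values in steps of k (the per-call chunk size, which must remain a multiple of QK_K); the old bound of nb = k / QK_K meant only the first chunk was ever quantized. A hedged usage sketch for a hypothetical nrows x ncols f32 weight matrix:

    /* hedged sketch: quantize an entire matrix in one call, one row per chunk */
    int64_t hist[16] = {0};                     /* histogram buffer (currently unused by the k-quants) */
    size_t bytes = ggml_quantize_q6_K(src_f32,  /* n = nrows*ncols input floats     */
                                      dst_q6k,  /* destination buffer of block_q6_K */
                                      nrows * ncols, ncols, hist);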