llama_cpp 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,7 +19,7 @@
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif

- #define CL_DMMV_BLOCK_SIZE 32
+ #define CL_DMMV_LOCAL_SIZE 32

  #ifndef K_QUANTS_PER_ITERATION
  #define K_QUANTS_PER_ITERATION 1
@@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx,
  const int row = get_group_id(0);

  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  __global const struct block_q2_K * x = xx + ib0;

@@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx,
  const int row = get_group_id(0);

  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  __global const struct block_q3_K * x = xx + ib0;

@@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx,

  const int row = get_group_id(0);
  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15
  const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
@@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx,

  const int row = get_group_id(0);
  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  const int tid = get_local_id(0)/2; // 0...15
  const int ix = get_local_id(0)%2;
@@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
  const int row = get_group_id(0);

  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  __global const struct block_q6_K * x = xx + ib0;

@@ -745,19 +745,21 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {

  std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
  __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
- const int block_size = get_local_size(0);
+ const int local_size = get_local_size(0);
  const int row = get_group_id(0);
  const int tid = get_local_id(0);

  const uint qk = QUANT_K;
  const uint qr = QUANT_R;

+ const int col_step = local_size * 2;
  const int y_offset = qr == 1 ? 1 : qk/2;

+ x += get_global_offset(0);
+
  tmp[tid] = 0;

- for (int i = 0; i < ncols/block_size; i += 2) {
- const int col = i*block_size + 2*tid;
+ for (int col = tid*2; col < ncols; col += col_step) {
  const int ib = (row*ncols + col)/qk; // block index
  const int iqs = (col%qk)/qr; // quant index
  const int iybs = col - col%qk; // y block start index
@@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float

  // sum up partial sums and write back result
  barrier(CLK_LOCAL_MEM_FENCE);
- for (int s=block_size/2; s>0; s>>=1) {
+ for (int s=local_size/2; s>0; s>>=1) {
  if (tid < s) {
  tmp[tid] += tmp[tid + s];
  }
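
The rewritten template kernel above renames `block_size` to `local_size`, offsets `x` by `get_global_offset(0)`, and replaces the fixed `ncols/block_size` iteration with a strided column loop, so it only requires `ncols` to be even (matching the `ne00%2 == 0` host-side check added further down) instead of a multiple of `2*block_size`. A minimal C sketch of the iteration pattern only, with the actual dequantization elided and `x_row` standing in for already-dequantized values:

    /* Sketch only: each of `local_size` lanes starts at column lane*2 and
     * strides by local_size*2, so any even ncols is covered. The accumulation
     * over lanes stands in for the work-group tree reduction in local memory. */
    static float dmmv_row_sketch(const float *x_row, const float *y, int ncols, int local_size) {
        float row_sum = 0.0f;
        for (int lane = 0; lane < local_size; lane++) {
            float partial = 0.0f;
            for (int col = lane * 2; col < ncols; col += local_size * 2) {
                partial += x_row[col] * y[col] + x_row[col + 1] * y[col + 1];
            }
            row_sum += partial;
        }
        return row_sum;
    }
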
@@ -1393,75 +1395,46 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];
  const int64_t ne03 = src0->ne[3];
- const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
  const int64_t ne12 = src1->ne[2];
  const int64_t ne13 = src1->ne[3];
- const int64_t nb10 = src1->nb[0];
  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];
  size_t x_size;
  size_t d_size;

- cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
+ cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
  cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
- cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+ cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst


  for (int64_t i03 = 0; i03 < ne03; i03++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
- const int i0 = i03*ne02 + i02;
-
  cl_event ev;

  // copy src0 to device
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
-
- if (nb10 == sizeof(float)) {
- // Contiguous, avoid overhead from queueing many kernel runs
- const int64_t i13 = i03%ne13;
- const int64_t i12 = i02%ne12;
- const int i1 = i13*ne12*ne11 + i12*ne11;
-
- cl_int x_offset = 0;
- cl_int y_offset = i1*ne10;
- cl_int d_offset = 0;
-
- size_t global = ne00 * ne01;
- cl_int ky = ne10;
- CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
- CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
- } else {
- for (int64_t i01 = 0; i01 < ne01; i01++) {
- const int64_t i13 = i03%ne13;
- const int64_t i12 = i02%ne12;
- const int64_t i11 = i01%ne11;
- const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
-
- cl_int x_offset = i01*ne00;
- cl_int y_offset = i1*ne10;
- cl_int d_offset = i01*ne00;
-
- // compute
- size_t global = ne00;
- cl_int ky = ne10;
- CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
- CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
- }
- }
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
+
+ const int64_t i13 = i03%ne13;
+ const int64_t i12 = i02%ne12;
+ const int i1 = i13*ne12*ne11 + i12*ne11;
+
+ cl_int x_offset = 0;
+ cl_int y_offset = i1*ne10;
+ cl_int d_offset = 0;
+
+ size_t global = ne00 * ne01;
+ cl_int ky = ne10 * ne11;
+
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));

  CL_CHECK(clReleaseEvent(ev));
  CL_CHECK(clFinish(queue));
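
With `d_Y` already resident on the device, the rewrite above uploads one `ne00*ne01` plane of `src0` at a time, drops the non-contiguous per-row fallback, and passes `ky = ne10 * ne11` so `src1` is broadcast per plane rather than per row. A hedged sketch of plane-level broadcasting, assuming an element-wise kernel of the form `dst[i] = x[i] * y[y_offset + i % ky]` (a simplification for illustration, not the exact kernel source):

    #include <stddef.h>

    /* Illustration only: repeat a src1 plane of ky elements across a src0
     * plane of n elements. One launch per (i02, i03) plane replaces the
     * per-row launches of the old non-contiguous path. */
    static void mul_broadcast_plane_sketch(float *dst, const float *x, const float *y,
                                           size_t n, size_t y_offset, size_t ky) {
        for (size_t i = 0; i < n; i++) {
            dst[i] = x[i] * y[y_offset + i % ky];
        }
    }
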
@@ -1566,7 +1539,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  ggml_cl_pool_free(d_D, d_size);
  }

- static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
+ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
  GGML_ASSERT(fp16_support);

  const int64_t ne00 = src0->ne[0];
@@ -1596,6 +1569,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  const int y_ne = ne11 * ne10;
  const int d_ne = ne11 * ne01;

+ GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne);
+ GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne);
+ ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata;
+
  size_t x_size;
  size_t y_size;
  size_t d_size;
@@ -1632,7 +1609,6 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr

  // convert src1 to fp16
  // TODO: use multiple threads
- ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
  char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
  if (src1_cont_rows) {
  if (src1_cont_cols) {
@@ -1704,7 +1680,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];
  const ggml_type type = src0->type;
- const bool mul_mat_vec = ne11 == 1;
+ const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;

  const int64_t r2 = ne12 / ne02;
  const int64_t r3 = ne13 / ne03;
@@ -1737,7 +1713,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  GGML_ASSERT(to_fp32_cl != nullptr);

  const size_t global_denom = ggml_cl_global_denom(type);
- const size_t local = ggml_cl_local_size(type);
+ const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);

  size_t ev_idx = 0;
  std::vector<cl_event> events;
@@ -1770,8 +1746,8 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));

  // compute
- const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
- const size_t local = CL_DMMV_BLOCK_SIZE;
+ const size_t global = ne01 * local;
+ const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
  const cl_int ncols = ne00;
  events.emplace_back();
  CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1779,7 +1755,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
  CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
  CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
- CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
  } else { // general dequantization kernel + CLBlast matrix matrix multiplication
  // convert src0 to fp32 on device
  const size_t global = x_ne / global_denom;
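
The enqueue above now passes `&offset` as the global work offset (previously `NULL`); inside the kernels that value comes back through `get_global_offset(0)`, which is what the `ib0 = row*num_blocks_per_row + get_global_offset(0)` changes rely on. A minimal host-side sketch of that mechanism, with hypothetical variable names:

    #include <CL/cl.h>

    /* Sketch: launch one work group of `local` items per output row, starting
     * the global index space at `plane_offset` so the kernel can address the
     * right plane of an already-uploaded src0 via get_global_offset(0). */
    static cl_int enqueue_dmmv_sketch(cl_command_queue queue, cl_kernel dmmv,
                                      size_t n_rows, size_t local, size_t plane_offset) {
        const size_t global = n_rows * local;
        return clEnqueueNDRangeKernel(queue, dmmv, 1,
                                      &plane_offset, /* global work offset */
                                      &global, &local,
                                      0, NULL, NULL);
    }
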
@@ -1895,8 +1871,8 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor *
  }

  size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
- if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
- return ggml_nelements(src1) * sizeof(ggml_fp16_t);
+ if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
+ return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]);
  }
  return 0;
  }
@@ -162,40 +162,16 @@ typedef void * thread_ret_t;

  #define GGML_PRINT(...) printf(__VA_ARGS__)

+ //
+ // end of logging block
+ //
+
  #ifdef GGML_USE_ACCELERATE
  // uncomment to use vDSP for soft max computation
  // note: not sure if it is actually faster
  //#define GGML_SOFT_MAX_ACCELERATE
  #endif

- //
- // logging
- //
-
- #if (GGML_DEBUG >= 1)
- #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
- #else
- #define GGML_PRINT_DEBUG(...)
- #endif
-
- #if (GGML_DEBUG >= 5)
- #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
- #else
- #define GGML_PRINT_DEBUG_5(...)
- #endif
-
- #if (GGML_DEBUG >= 10)
- #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
- #else
- #define GGML_PRINT_DEBUG_10(...)
- #endif
-
- #define GGML_PRINT(...) printf(__VA_ARGS__)
-
- //
- // end of logging block
- //
-
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
  #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
@@ -4951,6 +4927,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  *result = (struct ggml_tensor) {
  /*.type =*/ type,
  /*.backend =*/ GGML_BACKEND_CPU,
+ /*.buffer =*/ NULL,
  /*.n_dims =*/ n_dims,
  /*.ne =*/ { 1, 1, 1, 1 },
  /*.nb =*/ { 0, 0, 0, 0 },
@@ -5517,6 +5494,39 @@ struct ggml_tensor * ggml_view_tensor(
  return result;
  }

+ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
+ struct ggml_object * obj = ctx->objects_begin;
+
+ char * const mem_buffer = ctx->mem_buffer;
+
+ while (obj != NULL) {
+ if (obj->type == GGML_OBJECT_TENSOR) {
+ return (struct ggml_tensor *)(mem_buffer + obj->offs);
+ }
+
+ obj = obj->next;
+ }
+
+ return NULL;
+ }
+
+ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+ struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
+ obj = obj->next;
+
+ char * const mem_buffer = ctx->mem_buffer;
+
+ while (obj != NULL) {
+ if (obj->type == GGML_OBJECT_TENSOR) {
+ return (struct ggml_tensor *)(mem_buffer + obj->offs);
+ }
+
+ obj = obj->next;
+ }
+
+ return NULL;
+ }
+
  struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
  struct ggml_object * obj = ctx->objects_begin;

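The new `ggml_get_first_tensor`/`ggml_get_next_tensor` pair (also declared in the public header further down) walks the objects of a context in allocation order. A small usage sketch, assuming the usual `ggml.h` helpers:

    #include <stdio.h>
    #include "ggml.h"

    /* Print every tensor allocated in a context, in allocation order. */
    static void print_context_tensors(struct ggml_context * ctx) {
        for (struct ggml_tensor * t = ggml_get_first_tensor(ctx);
             t != NULL;
             t = ggml_get_next_tensor(ctx, t)) {
            printf("%-32s %lld elements\n", ggml_get_name(t), (long long) ggml_nelements(t));
        }
    }
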
@@ -8670,6 +8680,7 @@ void ggml_set_param(

  GGML_ASSERT(tensor->grad == NULL);
  tensor->grad = ggml_dup_tensor(ctx, tensor);
+ ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
  }

  // ggml_compute_forward_dup
@@ -11256,7 +11267,7 @@ static void ggml_compute_forward_silu_f32(

  #ifndef NDEBUG
  for (int k = 0; k < nc; k++) {
- const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+ const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
  UNUSED(x);
  assert(!isnan(x));
  assert(!isinf(x));
@@ -13082,24 +13093,22 @@ static void ggml_compute_forward_alibi_f32(
  return;
  }

- const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
+ //const int n_past = ((int32_t *) dst->op_params)[0];
  const int n_head = ((int32_t *) dst->op_params)[1];
  float max_bias;
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

- assert(n_past >= 0);
-
- const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
- const int ne1 = src0->ne[1]; // seq_len_without_past
- const int ne2 = src0->ne[2]; // n_head -> this is k
- //const int ne3 = src0->ne[3]; // 1 -> bsz
+ const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
+ const int64_t ne1 = src0->ne[1]; // seq_len_without_past
+ const int64_t ne2 = src0->ne[2]; // n_head -> this is k
+ //const int64_t ne3 = src0->ne[3]; // 1 -> bsz

- const int n = ggml_nrows(src0);
- const int ne2_ne3 = n/ne1; // ne2*ne3
+ const int64_t n = ggml_nrows(src0);
+ const int64_t ne2_ne3 = n/ne1; // ne2*ne3

- const int nb0 = src0->nb[0];
- const int nb1 = src0->nb[1];
- const int nb2 = src0->nb[2];
+ const size_t nb0 = src0->nb[0];
+ const size_t nb1 = src0->nb[1];
+ const size_t nb2 = src0->nb[2];
  //const int nb3 = src0->nb[3];

  GGML_ASSERT(nb0 == sizeof(float));
@@ -13111,9 +13120,9 @@ static void ggml_compute_forward_alibi_f32(
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

- for (int i = 0; i < ne0; i++) {
- for (int j = 0; j < ne1; j++) {
- for (int k = 0; k < ne2_ne3; k++) {
+ for (int64_t i = 0; i < ne0; i++) {
+ for (int64_t j = 0; j < ne1; j++) {
+ for (int64_t k = 0; k < ne2_ne3; k++) {
  float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
  float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);

@@ -13128,7 +13137,6 @@ static void ggml_compute_forward_alibi_f32(
  }

  pdst[0] = i * m_k + src[0];
-
  }
  }
  }
@@ -13529,7 +13537,7 @@ static void ggml_compute_forward_rope_f16(
  dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
  dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
  }
- } if (!is_neox) {
+ } else if (!is_neox) {
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
  const float cos_theta = cosf(theta);
  const float sin_theta = sinf(theta);
@@ -14454,7 +14462,7 @@ static void ggml_compute_forward_conv_2d_f16_f32(
  int64_t t0 = ggml_perf_time_us();
  UNUSED(t0);

- GGML_TENSOR_BINARY_OP_LOCALS
+ GGML_TENSOR_BINARY_OP_LOCALS;

  const int ith = params->ith;
  const int nth = params->nth;
@@ -19162,6 +19170,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {

  if (idx == -1) {
  fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
+ fclose(fout);
  return;
  }

@@ -20203,6 +20212,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  ggml_vec_cpy_f32(nx, xp, x);
  ggml_vec_cpy_f32(nx, gp, g);

+ // TODO: instead of passing &cancel here, use the return code of the linesearch
+ // to determine if the optimization should be cancelled
+ // this is a simple change, but not doing this atm, since I don't have a nice
+ // way to test and don't want to break something with so many changes lined up
  ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
  if (cancel) {
  return GGML_OPT_CANCEL;
@@ -20832,7 +20845,7 @@ struct gguf_kv {
  };

  struct gguf_header {
- uint32_t magic;
+ char magic[4];
  uint32_t version;
  uint64_t n_tensors; // GGUFv2
  uint64_t n_kv; // GGUFv2
@@ -20902,7 +20915,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset)
  struct gguf_context * gguf_init_empty(void) {
  struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));

- ctx->header.magic = GGUF_MAGIC;
+ memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
  ctx->header.version = GGUF_VERSION;
  ctx->header.n_tensors = 0;
  ctx->header.n_kv = 0;
@@ -20928,16 +20941,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  // offset from start of file
  size_t offset = 0;

- uint32_t magic = 0;
+ char magic[4];

  // check the magic before making allocations
  {
  gguf_fread_el(file, &magic, sizeof(magic), &offset);

- if (magic != GGUF_MAGIC) {
- fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
- fclose(file);
- return NULL;
+ for (uint32_t i = 0; i < sizeof(magic); i++) {
+ if (magic[i] != GGUF_MAGIC[i]) {
+ fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
+ fclose(file);
+ return NULL;
+ }
  }
  }

@@ -20947,7 +20962,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

  // read the header
  {
- ctx->header.magic = magic;
+ strncpy(ctx->header.magic, magic, 4);
+

  ctx->kv = NULL;
  ctx->infos = NULL;
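
Storing the magic as four characters (see the `GGUF_MAGIC` change in the header below) makes the check independent of host byte order: the old `uint32_t` constant `0x46554747` only spells "GGUF" when the bytes are reinterpreted little-endian. A hedged sketch of an equivalent check (the diff uses a per-character loop; `memcmp` over the 4 bytes behaves the same, and note that `magic` is not NUL-terminated):

    #include <string.h>

    #define GGUF_MAGIC "GGUF"

    /* Return non-zero when the first four bytes read from the file match "GGUF". */
    static int gguf_magic_ok(const char magic[4]) {
        return memcmp(magic, GGUF_MAGIC, 4) == 0;
    }
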
@@ -231,8 +231,9 @@
  #define GGML_EXIT_SUCCESS 0
  #define GGML_EXIT_ABORTED 1

- #define GGUF_MAGIC 0x46554747 // "GGUF"
- #define GGUF_VERSION 2
+ #define GGUF_MAGIC "GGUF"
+
+ #define GGUF_VERSION 3

  #define GGUF_DEFAULT_ALIGNMENT 32

@@ -326,7 +327,7 @@ extern "C" {
  GGML_TYPE_COUNT,
  };

- enum ggml_backend {
+ enum ggml_backend_type {
  GGML_BACKEND_CPU = 0,
  GGML_BACKEND_GPU = 10,
  GGML_BACKEND_GPU_SPLIT = 20,
@@ -479,8 +480,10 @@ extern "C" {

  // n-dimensional tensor
  struct ggml_tensor {
- enum ggml_type type;
- enum ggml_backend backend;
+ enum ggml_type type;
+ enum ggml_backend_type backend;
+
+ struct ggml_backend_buffer * buffer;

  int n_dims;
  int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -514,7 +517,7 @@ extern "C" {

  void * extra; // extra things e.g. for ggml-cuda.cu

- char padding[4];
+ char padding[12];
  };

  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -702,6 +705,9 @@ extern "C" {
  GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
  GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);

+ // Context tensor enumeration and lookup
+ GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
+ GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
  GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

  GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@@ -1358,7 +1364,7 @@ extern "C" {

  // alibi position embedding
  // in-place, returns view(a)
- struct ggml_tensor * ggml_alibi(
+ GGML_API struct ggml_tensor * ggml_alibi(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  int n_past,
@@ -1367,7 +1373,7 @@ extern "C" {

  // clamp
  // in-place, returns view(a)
- struct ggml_tensor * ggml_clamp(
+ GGML_API struct ggml_tensor * ggml_clamp(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  float min,
@@ -2102,7 +2108,7 @@ extern "C" {
  enum ggml_type vec_dot_type;
  } ggml_type_traits_t;

- ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+ GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);

  #ifdef __cplusplus
  }
@@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <intrin.h>
  #else
- #if !defined(__riscv)
+ #if !defined(__riscv) && !defined(__s390__)
  #include <immintrin.h>
  #endif
  #endif
@@ -462,12 +462,9 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
  }

  size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
- const int nb = k / QK_K;
-
- // TODO - collect histograms - although, at a second thought, I don't really care about them
- (void)hist;
+ (void)hist; // TODO: collect histograms

- for (int j = 0; j < nb; j += k) {
+ for (int j = 0; j < n; j += k) {
  block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
  quantize_row_q2_K_reference(src + j, y, k);
  }
@@ -678,12 +675,9 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
  }

  size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
- const int nb = k / QK_K;
-
- // TODO - collect histograms - although, at a second thought, I don't really care about them
- (void)hist;
+ (void)hist; // TODO: collect histograms

- for (int j = 0; j < nb; j += k) {
+ for (int j = 0; j < n; j += k) {
  block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
  quantize_row_q3_K_reference(src + j, y, k);
  }
@@ -846,9 +840,9 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {

  size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
  assert(k % QK_K == 0);
- const int nb = k / QK_K;
  (void)hist; // TODO: collect histograms
- for (int j = 0; j < nb; j += k) {
+
+ for (int j = 0; j < n; j += k) {
  block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
  quantize_row_q4_K_reference(src + j, y, k);
  }
@@ -1052,9 +1046,9 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {

  size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
  assert(k % QK_K == 0);
- const int nb = k / QK_K;
- (void)hist;
- for (int j = 0; j < nb; j += k) {
+ (void)hist; // TODO: collect histograms
+
+ for (int j = 0; j < n; j += k) {
  block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
  quantize_row_q5_K_reference(src + j, y, k);
  }
@@ -1200,11 +1194,9 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {

  size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
  assert(k % QK_K == 0);
- const int nb = k / QK_K;
-
- (void)hist; // TODO
+ (void)hist; // TODO: collect histograms

- for (int j = 0; j < nb; j += k) {
+ for (int j = 0; j < n; j += k) {
  block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
  quantize_row_q6_K_reference(src + j, y, k);
  }
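
The `ggml_quantize_q*_K` hunks above all make the same fix: the loop previously ran while `j < nb` with `nb = k/QK_K` yet stepped by `k`, so only the first `k` values were ever quantized; bounding it by `n` covers the whole input. A worked example, assuming `QK_K == 256`: for `n == 4096` and `k == 256`, the old bound was `j < 1`, giving a single 256-element chunk, while `j < n` gives all 16 chunks. A minimal sketch of the corrected driver loop, with a hypothetical per-chunk callback:

    /* Sketch only: quantize n floats in chunks of k (k a multiple of qk_k),
     * writing one block per qk_k input values via a caller-supplied callback. */
    typedef void (*quantize_chunk_fn)(const float *src, void *blocks, int k);

    static void quantize_all_sketch(const float *src, void *dst, int n, int k,
                                    int qk_k, size_t block_size, quantize_chunk_fn fn) {
        for (int j = 0; j < n; j += k) {
            void *y = (char *) dst + (size_t)(j / qk_k) * block_size;
            fn(src + j, y, k);
        }
    }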