llama_cpp 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -38,7 +38,7 @@ kernel void kernel_add_row(
  device const float4 * src0,
  device const float4 * src1,
  device float4 * dst,
- constant int64_t & nb,
+ constant int64_t & nb,
  uint tpig[[thread_position_in_grid]]) {
  dst[tpig] = src0[tpig] + src1[tpig % nb];
  }
@@ -118,7 +118,7 @@ kernel void kernel_soft_max(
  device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

  // parallel max
- float lmax = psrc0[tpitg[0]];
+ float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY;
  for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
  lmax = MAX(lmax, psrc0[i00]);
  }
@@ -158,7 +158,7 @@ kernel void kernel_soft_max_4(
  device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);

  // parallel max
- float4 lmax4 = psrc4[tpitg[0]];
+ float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY;
  for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
  lmax4 = fmax(lmax4, psrc4[i00]);
  }
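The two soft_max hunks above guard the initial value of the per-thread maximum: when a threadgroup has more threads than the row has elements (ne00, or ne00/4 in the float4 variant), the extra threads previously seeded the reduction with an out-of-range load. Starting them at -INFINITY makes them neutral under max. A minimal CPU-side sketch of the same pattern (plain C, illustration only, not the Metal kernel):

    /* Each of `nlanes` lanes reduces a strided slice of the row. Starting every
     * lane at -INFINITY keeps lanes whose first index falls past `n` from
     * contributing a stale or out-of-range value to the final max. */
    #include <math.h>
    #include <stdio.h>

    static float strided_max(const float *x, int n, int nlanes) {
        float best = -INFINITY;
        for (int lane = 0; lane < nlanes; ++lane) {
            float lmax = lane < n ? x[lane] : -INFINITY;   // the guarded init
            for (int i = lane + nlanes; i < n; i += nlanes) {
                lmax = fmaxf(lmax, x[i]);
            }
            best = fmaxf(best, lmax);   // stand-in for the threadgroup reduction
        }
        return best;
    }

    int main(void) {
        const float row[3] = { -1.0f, 4.0f, 2.0f };
        printf("%f\n", strided_max(row, 3, 8));   // 8 "lanes", only 3 elements -> 4.0
        return 0;
    }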
@@ -523,6 +523,79 @@ kernel void kernel_mul_mat_q8_0_f32(
  }
  }

+ #define N_F32_F32 4
+
+ kernel void kernel_mul_mat_f32_f32(
+ device const char * src0,
+ device const char * src1,
+ device float * dst,
+ constant int64_t & ne00,
+ constant int64_t & ne01,
+ constant int64_t & ne02,
+ constant uint64_t & nb00,
+ constant uint64_t & nb01,
+ constant uint64_t & nb02,
+ constant int64_t & ne10,
+ constant int64_t & ne11,
+ constant int64_t & ne12,
+ constant uint64_t & nb10,
+ constant uint64_t & nb11,
+ constant uint64_t & nb12,
+ constant int64_t & ne0,
+ constant int64_t & ne1,
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint tiisg[[thread_index_in_simdgroup]]) {
+
+ const int64_t r0 = tgpig.x;
+ const int64_t rb = tgpig.y*N_F32_F32;
+ const int64_t im = tgpig.z;
+
+ device const float * x = (device const float *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
+
+ if (ne00 < 128) {
+ for (int row = 0; row < N_F32_F32; ++row) {
+ int r1 = rb + row;
+ if (r1 >= ne11) {
+ break;
+ }
+
+ device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+ float sumf = 0;
+ for (int i = tiisg; i < ne00; i += 32) {
+ sumf += (float) x[i] * (float) y[i];
+ }
+
+ float all_sum = simd_sum(sumf);
+ if (tiisg == 0) {
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+ }
+ }
+ } else {
+ device const float4 * x4 = (device const float4 *)x;
+ for (int row = 0; row < N_F32_F32; ++row) {
+ int r1 = rb + row;
+ if (r1 >= ne11) {
+ break;
+ }
+
+ device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+ device const float4 * y4 = (device const float4 *) y;
+
+ float sumf = 0;
+ for (int i = tiisg; i < ne00/4; i += 32) {
+ for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+ }
+
+ float all_sum = simd_sum(sumf);
+ if (tiisg == 0) {
+ for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+ }
+ }
+ }
+ }
+
  kernel void kernel_mul_mat_f16_f32_1row(
  device const char * src0,
  device const char * src1,
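The new kernel_mul_mat_f32_f32 picks between two paths: for short rows (ne00 < 128) each SIMD lane accumulates a plain strided dot product, while for longer rows it reads float4 vectors and thread 0 folds in a scalar tail for the elements past the last multiple of four. A CPU-side sketch of that vector-body-plus-scalar-tail idea (plain C, illustration only):

    /* The unrolled inner loop stands in for the float4 accesses; the second
     * loop is the scalar tail for row lengths that are not multiples of 4. */
    #include <stdio.h>

    static float dot_f32(const float *x, const float *y, int n) {
        float sum = 0.0f;
        int i = 0;
        for (; i + 4 <= n; i += 4) {            // main body, 4 elements per step
            for (int k = 0; k < 4; ++k) {
                sum += x[i + k] * y[i + k];
            }
        }
        for (; i < n; ++i) {                    // scalar tail
            sum += x[i] * y[i];
        }
        return sum;
    }

    int main(void) {
        const float x[6] = { 1, 2, 3, 4, 5, 6 };
        const float y[6] = { 1, 1, 1, 1, 1, 1 };
        printf("%f\n", dot_f32(x, y, 6));       // 21.0
        return 0;
    }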
@@ -1321,7 +1394,6 @@ kernel void kernel_mul_mat_q3_K_f32(
  dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row];
  }
  }
-
  }
  #else
  kernel void kernel_mul_mat_q3_K_f32(
@@ -1400,13 +1472,13 @@ kernel void kernel_mul_mat_q4_K_f32(
  device const float * src1,
  device float * dst,
  constant int64_t & ne00,
- constant int64_t & ne01[[buffer(4)]],
- constant int64_t & ne02[[buffer(5)]],
- constant int64_t & ne10[[buffer(9)]],
- constant int64_t & ne12[[buffer(11)]],
- constant int64_t & ne0[[buffer(15)]],
- constant int64_t & ne1[[buffer(16)]],
- constant uint & gqa[[buffer(17)]],
+ constant int64_t & ne01 [[buffer(4)]],
+ constant int64_t & ne02 [[buffer(5)]],
+ constant int64_t & ne10 [[buffer(9)]],
+ constant int64_t & ne12 [[buffer(11)]],
+ constant int64_t & ne0 [[buffer(15)]],
+ constant int64_t & ne1 [[buffer(16)]],
+ constant uint & gqa [[buffer(17)]],
  uint3 tgpig[[threadgroup_position_in_grid]],
  uint tiisg[[thread_index_in_simdgroup]],
  uint sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -1865,6 +1937,15 @@ kernel void kernel_mul_mat_q6_K_f32(

  //============================= templates and their specializations =============================

+ // NOTE: this is not dequantizing - we are simply fitting the template
+ template <typename type4x4>
+ void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
+ float4x4 temp = *(((device float4x4 *)src));
+ for (int i = 0; i < 16; i++){
+ reg[i/4][i%4] = temp[i/4][i%4];
+ }
+ }
+
  template <typename type4x4>
  void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
  half4x4 temp = *(((device half4x4 *)src));
@@ -1875,7 +1956,6 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg)

  template <typename type4x4>
  void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
-
  device const uint16_t * qs = ((device const uint16_t *)xb + 1);
  const float d1 = il ? (xb->d / 16.h) : xb->d;
  const float d2 = d1 / 256.f;
@@ -1887,12 +1967,10 @@ void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg
  reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
  reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
  }
-
  }

  template <typename type4x4>
  void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
-
  device const uint16_t * qs = ((device const uint16_t *)xb + 2);
  const float d1 = il ? (xb->d / 16.h) : xb->d;
  const float d2 = d1 / 256.f;
@@ -1964,7 +2042,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
  for (int i = 0; i < 16; ++i) {
  reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
  }
-
  #else
  float kcoef = il&1 ? 1.f/16.f : 1.f;
  uint16_t kmask = il&1 ? 0xF0 : 0x0F;
@@ -2008,7 +2085,6 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
  for (int i = 0; i < 16; ++i) {
  reg[i/4][i%4] = dl * (q[i] & mask) - ml;
  }
-
  }

  template <typename type4x4>
@@ -2110,22 +2186,25 @@ kernel void kernel_get_rows(
  // each block_q contains 16*nl weights
  template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
  kernel void kernel_mul_mm(device const uchar * src0,
- device const float * src1,
- device float * dst,
- constant int64_t & ne00,
- constant int64_t & ne02,
- constant int64_t & nb01,
- constant int64_t & nb02,
- constant int64_t & ne12,
- constant int64_t & ne0,
- constant int64_t & ne1,
- constant uint & gqa,
- threadgroup uchar * shared_memory [[threadgroup(0)]],
- uint3 tgpig[[threadgroup_position_in_grid]],
- uint tiitg[[thread_index_in_threadgroup]],
- uint sgitg[[simdgroup_index_in_threadgroup]]) {
-
- threadgroup half * sa = ((threadgroup half *)shared_memory);
+ device const uchar * src1,
+ device float * dst,
+ constant int64_t & ne00,
+ constant int64_t & ne02,
+ constant int64_t & nb01,
+ constant int64_t & nb02,
+ constant int64_t & ne12,
+ constant int64_t & nb10,
+ constant int64_t & nb11,
+ constant int64_t & nb12,
+ constant int64_t & ne0,
+ constant int64_t & ne1,
+ constant uint & gqa,
+ threadgroup uchar * shared_memory [[threadgroup(0)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint tiitg[[thread_index_in_threadgroup]],
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
+
+ threadgroup half * sa = (threadgroup half *)(shared_memory);
  threadgroup float * sb = (threadgroup float *)(shared_memory + 4096);

  const uint r0 = tgpig.y;
@@ -2138,7 +2217,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
  short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
  short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;

- simdgroup_half8x8 ma[4];
+ simdgroup_half8x8 ma[4];
  simdgroup_float8x8 mb[2];
  simdgroup_float8x8 c_res[8];
  for (int i = 0; i < 8; i++){
@@ -2146,10 +2225,15 @@ kernel void kernel_mul_mm(device const uchar * src0,
  }

  short il = (tiitg % THREAD_PER_ROW);
- uint offset0 = im/gqa*nb02; ushort offset1 = il/nl;
- device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
- device const float * y = src1 + (r1 * BLOCK_SIZE_N + thread_col) * ne00 \
- + BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL) + im * ne00 * ne1;
+
+ uint offset0 = im/gqa*nb02;
+ ushort offset1 = il/nl;
+
+ device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
+ device const float * y = (device const float *)(src1
+ + nb12 * im
+ + nb11 * (r1 * BLOCK_SIZE_N + thread_col)
+ + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));

  for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
  //load data and store to threadgroup memory
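In the kernel_mul_mm hunks above, src1 changes from a typed float pointer with an implicit ne00 row pitch to a raw byte pointer addressed through the nb10/nb11/nb12 byte strides, so the kernel no longer assumes the second operand is contiguous. The same byte-stride addressing convention, sketched on the CPU with a toy struct (not ggml's actual ggml_tensor):

    /* nb0/nb1/nb2 are strides in bytes, so indexing goes through a char pointer
     * before the final cast to the element type. */
    #include <stdint.h>
    #include <stdio.h>

    struct toy_tensor {
        const void *data;
        size_t nb0, nb1, nb2;   // byte strides for dims 0, 1, 2
    };

    static float get_f32(const struct toy_tensor *t, int64_t i0, int64_t i1, int64_t i2) {
        const char *base = (const char *) t->data;
        return *(const float *)(base + i2*t->nb2 + i1*t->nb1 + i0*t->nb0);
    }

    int main(void) {
        float buf[2][3] = { { 1, 2, 3 }, { 4, 5, 6 } };   // 2 rows of 3, contiguous
        struct toy_tensor t = { buf, sizeof(float), 3*sizeof(float), 6*sizeof(float) };
        printf("%f\n", get_f32(&t, 2, 1, 0));             // element (col 2, row 1) -> 6.0
        return 0;
    }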
@@ -2229,6 +2313,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
  typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \
  constant uint64_t &, constant uint64_t &, uint, uint, uint);

+ template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows<float4x4, 1, dequantize_f32>;
  template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows<half4x4, 1, dequantize_f16>;
  template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>;
  template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>;
@@ -2239,14 +2324,28 @@ template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows
  template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows<block_q5_K, QK_NL, dequantize_q5_K>;
  template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>;

- typedef void (mat_mm_t)(device const uchar *, device const float *, device float *, constant int64_t &,\
- constant int64_t &, constant int64_t &, constant int64_t &, constant int64_t &, \
- constant int64_t &, constant int64_t &, constant uint &, threadgroup uchar *, uint3, uint, uint);
-
- template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>;
- template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>;
- template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>;
- template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2, dequantize_q8_0>;
+ typedef void (mat_mm_t)(
+ device const uchar * src0,
+ device const uchar * src1,
+ device float * dst,
+ constant int64_t & ne00,
+ constant int64_t & ne02,
+ constant int64_t & nb01,
+ constant int64_t & nb02,
+ constant int64_t & ne12,
+ constant int64_t & nb10,
+ constant int64_t & nb11,
+ constant int64_t & nb12,
+ constant int64_t & ne0,
+ constant int64_t & ne1,
+ constant uint & gqa,
+ threadgroup uchar *, uint3, uint, uint);
+
+ template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm<float4x4, 1, dequantize_f32>;
+ template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>;
+ template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>;
+ template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>;
+ template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2, dequantize_q8_0>;
  template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q2_K, QK_NL, dequantize_q2_K>;
  template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q3_K, QK_NL, dequantize_q3_K>;
  template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_K, QK_NL, dequantize_q4_K>;
@@ -847,7 +847,7 @@ std::array<std::string, 2> mul_str_values = {
  "mul_f32", "float"
  };

- std::string& replace(std::string& s, const std::string& from, const std::string& to) {
+ static std::string& replace(std::string& s, const std::string& from, const std::string& to) {
  size_t pos = 0;
  while ((pos = s.find(from, pos)) != std::string::npos) {
  s.replace(pos, from.length(), to);
@@ -856,7 +856,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
  return s;
  }

- std::string generate_kernels() {
+ static std::string generate_kernels() {
  std::stringstream src;
  src << program_source << '\n';
  src << k_quants_source << '\n';
@@ -1788,7 +1788,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
  return false;
  }

- bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
+ static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
  // If device doesn't support FP16
  if (!fp16_support) {
  return false;
@@ -4303,10 +4303,21 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
  }

  size_t ggml_nbytes(const struct ggml_tensor * tensor) {
- size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
- for (int i = 1; i < GGML_MAX_DIMS; ++i) {
- nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+ size_t nbytes;
+ size_t blck_size = ggml_blck_size(tensor->type);
+ if (blck_size == 1) {
+ nbytes = ggml_type_size(tensor->type);
+ for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+ nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+ }
+ }
+ else {
+ nbytes = tensor->ne[0]*tensor->nb[0]/blck_size;
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
+ nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+ }
  }
+
  return nbytes;
  }

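The reworked ggml_nbytes distinguishes plain types (block size 1) from block-quantized ones: for plain types it starts from the element size and adds (ne[i]-1)*nb[i] over every dimension, which stays correct for permuted or otherwise non-contiguous views, while block-quantized types keep the previous formula. A small stand-alone check of the non-block branch (toy arrays, not ggml's structs):

    /* Element size plus (ne[i]-1)*nb[i] over all dims, as in the new branch. */
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_DIMS 4

    static size_t nbytes_nonblock(size_t type_size, const int64_t ne[MAX_DIMS], const size_t nb[MAX_DIMS]) {
        size_t nbytes = type_size;
        for (int i = 0; i < MAX_DIMS; ++i) {
            nbytes += (ne[i] - 1) * nb[i];
        }
        return nbytes;
    }

    int main(void) {
        // contiguous 5 x 3 F32 tensor: nb = {4, 20, 60, 60}
        const int64_t ne[MAX_DIMS] = { 5, 3, 1, 1 };
        const size_t  nb[MAX_DIMS] = { 4, 20, 60, 60 };
        printf("%zu\n", nbytes_nonblock(4, ne, nb));   // 60 bytes = 5*3 floats
        return 0;
    }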
@@ -17283,10 +17294,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  } else {
  // wait for other threads to finish
  const int last = node_n;
- do {
- //sched_yield();
+ while (true) {
+ // TODO: this sched_yield can have significant impact on the performance - either positive or negative
+ // depending on the workload and the operating system.
+ // since it is not clear what is the best approach, it should potentially become user-configurable
+ // ref: https://github.com/ggerganov/ggml/issues/291
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+ sched_yield();
+ #endif
+
  node_n = atomic_load(&state->shared->node_n);
- } while (node_n == last);
+ if (node_n != last) break;
+ };
  }

  // check if we should stop
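The compute-thread wait loop changes from a bare do/while spin into a while (true) loop whose sched_yield is only compiled in for Accelerate/OpenBLAS builds; the linked issue notes that yielding can help or hurt depending on workload and OS. A rough stand-alone sketch of that loop shape using POSIX threads and C11 atomics (GGML_SPIN_YIELD is a hypothetical stand-in for the BLAS-only condition in the real code):

    #include <pthread.h>
    #include <sched.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <unistd.h>

    static atomic_int node_n = 0;

    static void *producer(void *arg) {
        (void) arg;
        usleep(1000);                    // pretend to finish a graph node
        atomic_store(&node_n, 1);
        return NULL;
    }

    int main(void) {
        pthread_t th;
        pthread_create(&th, NULL, producer, NULL);

        const int last = 0;
        while (true) {
    #ifdef GGML_SPIN_YIELD               // hypothetical switch; see the #if above
            sched_yield();
    #endif
            int cur = atomic_load(&node_n);
            if (cur != last) break;      // another thread moved on to a new node
        }

        pthread_join(th, NULL);
        printf("observed node_n = %d\n", atomic_load(&node_n));
        return 0;                        // build with: cc -pthread
    }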
@@ -18337,10 +18356,11 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
  for (int i = 0; i < cgraph->n_leafs; i++) {
  struct ggml_tensor * node = cgraph->leafs[i];

- GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
+ GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
  i,
  node->ne[0], node->ne[1],
- ggml_op_name(node->op));
+ ggml_op_name(node->op),
+ ggml_get_name(node));
  }

  for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -20099,27 +20119,27 @@ const char * gguf_type_name(enum gguf_type type) {
  return GGUF_TYPE_NAME[type];
  }

- int gguf_get_version(struct gguf_context * ctx) {
+ int gguf_get_version(const struct gguf_context * ctx) {
  return ctx->header.version;
  }

- size_t gguf_get_alignment(struct gguf_context * ctx) {
+ size_t gguf_get_alignment(const struct gguf_context * ctx) {
  return ctx->alignment;
  }

- size_t gguf_get_data_offset(struct gguf_context * ctx) {
+ size_t gguf_get_data_offset(const struct gguf_context * ctx) {
  return ctx->offset;
  }

- void * gguf_get_data(struct gguf_context * ctx) {
+ void * gguf_get_data(const struct gguf_context * ctx) {
  return ctx->data;
  }

- int gguf_get_n_kv(struct gguf_context * ctx) {
+ int gguf_get_n_kv(const struct gguf_context * ctx) {
  return ctx->header.n_kv;
  }

- int gguf_find_key(struct gguf_context * ctx, const char * key) {
+ int gguf_find_key(const struct gguf_context * ctx, const char * key) {
  // return -1 if key not found
  int keyfound = -1;

@@ -20135,85 +20155,85 @@ int gguf_find_key(struct gguf_context * ctx, const char * key) {
  return keyfound;
  }

- const char * gguf_get_key(struct gguf_context * ctx, int i) {
+ const char * gguf_get_key(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].key.data;
  }

- enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
+ enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].type;
  }

- enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
+ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.arr.type;
  }

- const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
+ const void * gguf_get_arr_data(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.arr.data;
  }

- const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
+ const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
  struct gguf_kv * kv = &ctx->kv[key_id];
  struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
  return str->data;
  }

- int gguf_get_arr_n(struct gguf_context * ctx, int i) {
+ int gguf_get_arr_n(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.arr.n;
  }

- uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
+ uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.uint8;
  }

- int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
+ int8_t gguf_get_val_i8(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.int8;
  }

- uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
+ uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.uint16;
  }

- int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
+ int16_t gguf_get_val_i16(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.int16;
  }

- uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
+ uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.uint32;
  }

- int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
+ int32_t gguf_get_val_i32(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.int32;
  }

- float gguf_get_val_f32(struct gguf_context * ctx, int i) {
+ float gguf_get_val_f32(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.float32;
  }

- uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
+ uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.uint64;
  }

- int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
+ int64_t gguf_get_val_i64(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.int64;
  }

- double gguf_get_val_f64(struct gguf_context * ctx, int i) {
+ double gguf_get_val_f64(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.float64;
  }

- bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
+ bool gguf_get_val_bool(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.bool_;
  }

- const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
+ const char * gguf_get_val_str (const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.str.data;
  }

- int gguf_get_n_tensors(struct gguf_context * ctx) {
+ int gguf_get_n_tensors(const struct gguf_context * ctx) {
  return ctx->header.n_tensors;
  }

- int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
+ int gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
  // return -1 if tensor not found
  int tensorfound = -1;

@@ -20229,11 +20249,11 @@ int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
  return tensorfound;
  }

- size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
+ size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
  return ctx->infos[i].offset;
  }

- char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
+ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
  return ctx->infos[i].name.data;
  }

@@ -20516,7 +20536,7 @@ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_si
  buf->offset += el_size;
  }

- static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
+ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
  // write header
  gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
  gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
@@ -20631,7 +20651,7 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
  }
  }

- void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
+ void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
  FILE * file = fopen(fname, "wb");
  if (!file) {
  GGML_ASSERT(false && "failed to open file for writing");
@@ -20648,7 +20668,7 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only
  fclose(file);
  }

- size_t gguf_get_meta_size(struct gguf_context * ctx) {
+ size_t gguf_get_meta_size(const struct gguf_context * ctx) {
  // no allocs - only compute size
  struct gguf_buf buf = gguf_buf_init(0);

@@ -20657,7 +20677,7 @@ size_t gguf_get_meta_size(struct gguf_context * ctx) {
  return buf.offset;
  }

- void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
+ void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
  struct gguf_buf buf = gguf_buf_init(16*1024);

  gguf_write_to_buf(ctx, &buf, true);
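The long run of gguf hunks above is a const-correctness pass: every read-only accessor (gguf_get_version, the gguf_get_val_* getters, gguf_get_tensor_name, gguf_write_to_file, and so on) now takes const struct gguf_context *, so callers that only inspect a context can pass a const pointer all the way down. A toy illustration of the pattern (hypothetical types, not the gguf API):

    /* With a non-const parameter, read-only helpers cannot be called through a
     * const pointer; making the getter const-correct fixes that. */
    #include <stdio.h>

    struct toy_ctx { int n_kv; };

    static int get_n_kv(const struct toy_ctx *ctx) {   // const-correct getter
        return ctx->n_kv;
    }

    /* A caller that promises not to modify the context can take it as const
     * and still use every getter. */
    static void dump(const struct toy_ctx *ctx) {
        printf("n_kv = %d\n", get_n_kv(ctx));
    }

    int main(void) {
        const struct toy_ctx ctx = { .n_kv = 3 };
        dump(&ctx);
        return 0;
    }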
@@ -20733,6 +20753,14 @@ int ggml_cpu_has_arm_fma(void) {
  #endif
  }

+ int ggml_cpu_has_metal(void) {
+ #if defined(GGML_USE_METAL)
+ return 1;
+ #else
+ return 0;
+ #endif
+ }
+
  int ggml_cpu_has_f16c(void) {
  #if defined(__F16C__)
  return 1;
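The final hunk adds ggml_cpu_has_metal(), mirroring the other compile-time capability helpers: it returns 1 when the library was built with GGML_USE_METAL and 0 otherwise. A short usage sketch, assuming ggml.h is on the include path and the program links against ggml:

    /* Build-capability report using the feature helpers visible in this diff
     * (ggml_cpu_has_metal is new in this version). */
    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        printf("ARM FMA : %d\n", ggml_cpu_has_arm_fma());
        printf("F16C    : %d\n", ggml_cpu_has_f16c());
        printf("Metal   : %d\n", ggml_cpu_has_metal());
        return 0;
    }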