llama_cpp 0.5.2 → 0.5.3

@@ -38,7 +38,7 @@ kernel void kernel_add_row(
  device const float4 * src0,
  device const float4 * src1,
  device float4 * dst,
- constant int64_t & nb,
+ constant int64_t & nb,
  uint tpig[[thread_position_in_grid]]) {
  dst[tpig] = src0[tpig] + src1[tpig % nb];
  }
@@ -118,7 +118,7 @@ kernel void kernel_soft_max(
  device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

  // parallel max
- float lmax = psrc0[tpitg[0]];
+ float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY;
  for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
  lmax = MAX(lmax, psrc0[i00]);
  }
@@ -158,7 +158,7 @@ kernel void kernel_soft_max_4(
  device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);

  // parallel max
- float4 lmax4 = psrc4[tpitg[0]];
+ float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY;
  for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
  lmax4 = fmax(lmax4, psrc4[i00]);
  }
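The two soft_max hunks above guard the seed of the parallel max: when a threadgroup has more threads than the row has elements, the initial value must fall back to -INFINITY instead of reading past ne00. As a rough illustration (not part of the diff, names are made up), the same strided reduction looks like this on the CPU:

    #include <math.h>

    /* Illustrative only: thread `tid` of `nth` seeds its partial max safely,
     * mirroring the guarded initialization added to kernel_soft_max. */
    static float partial_row_max(const float * row, int n, int tid, int nth) {
        float lmax = tid < n ? row[tid] : -INFINITY;
        for (int i = tid + nth; i < n; i += nth) {
            lmax = fmaxf(lmax, row[i]);
        }
        return lmax; /* the per-thread results are then reduced once more */
    }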
@@ -523,6 +523,79 @@ kernel void kernel_mul_mat_q8_0_f32(
  }
  }

+ #define N_F32_F32 4
+
+ kernel void kernel_mul_mat_f32_f32(
+ device const char * src0,
+ device const char * src1,
+ device float * dst,
+ constant int64_t & ne00,
+ constant int64_t & ne01,
+ constant int64_t & ne02,
+ constant uint64_t & nb00,
+ constant uint64_t & nb01,
+ constant uint64_t & nb02,
+ constant int64_t & ne10,
+ constant int64_t & ne11,
+ constant int64_t & ne12,
+ constant uint64_t & nb10,
+ constant uint64_t & nb11,
+ constant uint64_t & nb12,
+ constant int64_t & ne0,
+ constant int64_t & ne1,
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint tiisg[[thread_index_in_simdgroup]]) {
+
+ const int64_t r0 = tgpig.x;
+ const int64_t rb = tgpig.y*N_F32_F32;
+ const int64_t im = tgpig.z;
+
+ device const float * x = (device const float *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
+
+ if (ne00 < 128) {
+ for (int row = 0; row < N_F32_F32; ++row) {
+ int r1 = rb + row;
+ if (r1 >= ne11) {
+ break;
+ }
+
+ device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+ float sumf = 0;
+ for (int i = tiisg; i < ne00; i += 32) {
+ sumf += (float) x[i] * (float) y[i];
+ }
+
+ float all_sum = simd_sum(sumf);
+ if (tiisg == 0) {
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+ }
+ }
+ } else {
+ device const float4 * x4 = (device const float4 *)x;
+ for (int row = 0; row < N_F32_F32; ++row) {
+ int r1 = rb + row;
+ if (r1 >= ne11) {
+ break;
+ }
+
+ device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+ device const float4 * y4 = (device const float4 *) y;
+
+ float sumf = 0;
+ for (int i = tiisg; i < ne00/4; i += 32) {
+ for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+ }
+
+ float all_sum = simd_sum(sumf);
+ if (tiisg == 0) {
+ for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+ dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+ }
+ }
+ }
+ }
+
  kernel void kernel_mul_mat_f16_f32_1row(
  device const char * src0,
  device const char * src1,
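The new kernel_mul_mat_f32_f32 assigns one SIMD-group per output row, lets each of the 32 lanes stride over the shared dimension, and combines the partial sums with simd_sum (switching to a float4 path once ne00 >= 128). A serial C sketch of the same lane-strided dot product, purely for illustration:

    /* Illustrative only: the 32-lane strided dot product from
     * kernel_mul_mat_f32_f32, unrolled serially on the CPU. */
    static float dot_lane_strided(const float * x, const float * y, int ne00) {
        float all_sum = 0.0f;
        for (int lane = 0; lane < 32; ++lane) {   /* lane plays the role of tiisg */
            float sumf = 0.0f;
            for (int i = lane; i < ne00; i += 32) {
                sumf += x[i] * y[i];
            }
            all_sum += sumf;                      /* plays the role of simd_sum(sumf) */
        }
        return all_sum;
    }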
@@ -1321,7 +1394,6 @@ kernel void kernel_mul_mat_q3_K_f32(
  dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row];
  }
  }
-
  }
  #else
  kernel void kernel_mul_mat_q3_K_f32(
@@ -1400,13 +1472,13 @@ kernel void kernel_mul_mat_q4_K_f32(
  device const float * src1,
  device float * dst,
  constant int64_t & ne00,
- constant int64_t & ne01[[buffer(4)]],
- constant int64_t & ne02[[buffer(5)]],
- constant int64_t & ne10[[buffer(9)]],
- constant int64_t & ne12[[buffer(11)]],
- constant int64_t & ne0[[buffer(15)]],
- constant int64_t & ne1[[buffer(16)]],
- constant uint & gqa[[buffer(17)]],
+ constant int64_t & ne01 [[buffer(4)]],
+ constant int64_t & ne02 [[buffer(5)]],
+ constant int64_t & ne10 [[buffer(9)]],
+ constant int64_t & ne12 [[buffer(11)]],
+ constant int64_t & ne0 [[buffer(15)]],
+ constant int64_t & ne1 [[buffer(16)]],
+ constant uint & gqa [[buffer(17)]],
  uint3 tgpig[[threadgroup_position_in_grid]],
  uint tiisg[[thread_index_in_simdgroup]],
  uint sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -1865,6 +1937,15 @@ kernel void kernel_mul_mat_q6_K_f32(

  //============================= templates and their specializations =============================

+ // NOTE: this is not dequantizing - we are simply fitting the template
+ template <typename type4x4>
+ void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
+ float4x4 temp = *(((device float4x4 *)src));
+ for (int i = 0; i < 16; i++){
+ reg[i/4][i%4] = temp[i/4][i%4];
+ }
+ }
+
  template <typename type4x4>
  void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
  half4x4 temp = *(((device half4x4 *)src));
@@ -1875,7 +1956,6 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg)

  template <typename type4x4>
  void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
-
  device const uint16_t * qs = ((device const uint16_t *)xb + 1);
  const float d1 = il ? (xb->d / 16.h) : xb->d;
  const float d2 = d1 / 256.f;
@@ -1887,12 +1967,10 @@ void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg
  reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
  reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
  }
-
  }

  template <typename type4x4>
  void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
-
  device const uint16_t * qs = ((device const uint16_t *)xb + 2);
  const float d1 = il ? (xb->d / 16.h) : xb->d;
  const float d2 = d1 / 256.f;
@@ -1964,7 +2042,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
  for (int i = 0; i < 16; ++i) {
  reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
  }
-
  #else
  float kcoef = il&1 ? 1.f/16.f : 1.f;
  uint16_t kmask = il&1 ? 0xF0 : 0x0F;
@@ -2008,7 +2085,6 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
  for (int i = 0; i < 16; ++i) {
  reg[i/4][i%4] = dl * (q[i] & mask) - ml;
  }
-
  }

  template <typename type4x4>
@@ -2110,22 +2186,25 @@ kernel void kernel_get_rows(
  // each block_q contains 16*nl weights
  template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
  kernel void kernel_mul_mm(device const uchar * src0,
- device const float * src1,
- device float * dst,
- constant int64_t & ne00,
- constant int64_t & ne02,
- constant int64_t & nb01,
- constant int64_t & nb02,
- constant int64_t & ne12,
- constant int64_t & ne0,
- constant int64_t & ne1,
- constant uint & gqa,
- threadgroup uchar * shared_memory [[threadgroup(0)]],
- uint3 tgpig[[threadgroup_position_in_grid]],
- uint tiitg[[thread_index_in_threadgroup]],
- uint sgitg[[simdgroup_index_in_threadgroup]]) {
-
- threadgroup half * sa = ((threadgroup half *)shared_memory);
+ device const uchar * src1,
+ device float * dst,
+ constant int64_t & ne00,
+ constant int64_t & ne02,
+ constant int64_t & nb01,
+ constant int64_t & nb02,
+ constant int64_t & ne12,
+ constant int64_t & nb10,
+ constant int64_t & nb11,
+ constant int64_t & nb12,
+ constant int64_t & ne0,
+ constant int64_t & ne1,
+ constant uint & gqa,
+ threadgroup uchar * shared_memory [[threadgroup(0)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint tiitg[[thread_index_in_threadgroup]],
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
+
+ threadgroup half * sa = (threadgroup half *)(shared_memory);
  threadgroup float * sb = (threadgroup float *)(shared_memory + 4096);

  const uint r0 = tgpig.y;
@@ -2138,7 +2217,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
  short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
  short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;

- simdgroup_half8x8 ma[4];
+ simdgroup_half8x8 ma[4];
  simdgroup_float8x8 mb[2];
  simdgroup_float8x8 c_res[8];
  for (int i = 0; i < 8; i++){
@@ -2146,10 +2225,15 @@ kernel void kernel_mul_mm(device const uchar * src0,
  }

  short il = (tiitg % THREAD_PER_ROW);
- uint offset0 = im/gqa*nb02; ushort offset1 = il/nl;
- device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
- device const float * y = src1 + (r1 * BLOCK_SIZE_N + thread_col) * ne00 \
- + BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL) + im * ne00 * ne1;
+
+ uint offset0 = im/gqa*nb02;
+ ushort offset1 = il/nl;
+
+ device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
+ device const float * y = (device const float *)(src1
+ + nb12 * im
+ + nb11 * (r1 * BLOCK_SIZE_N + thread_col)
+ + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));

  for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
  //load data and store to threadgroup memory
@@ -2229,6 +2313,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
  typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \
  constant uint64_t &, constant uint64_t &, uint, uint, uint);

+ template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows<float4x4, 1, dequantize_f32>;
  template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows<half4x4, 1, dequantize_f16>;
  template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>;
  template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>;
@@ -2239,14 +2324,28 @@ template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows
  template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows<block_q5_K, QK_NL, dequantize_q5_K>;
  template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>;

- typedef void (mat_mm_t)(device const uchar *, device const float *, device float *, constant int64_t &,\
- constant int64_t &, constant int64_t &, constant int64_t &, constant int64_t &, \
- constant int64_t &, constant int64_t &, constant uint &, threadgroup uchar *, uint3, uint, uint);
-
- template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>;
- template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>;
- template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>;
- template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2, dequantize_q8_0>;
+ typedef void (mat_mm_t)(
+ device const uchar * src0,
+ device const uchar * src1,
+ device float * dst,
+ constant int64_t & ne00,
+ constant int64_t & ne02,
+ constant int64_t & nb01,
+ constant int64_t & nb02,
+ constant int64_t & ne12,
+ constant int64_t & nb10,
+ constant int64_t & nb11,
+ constant int64_t & nb12,
+ constant int64_t & ne0,
+ constant int64_t & ne1,
+ constant uint & gqa,
+ threadgroup uchar *, uint3, uint, uint);
+
+ template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm<float4x4, 1, dequantize_f32>;
+ template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>;
+ template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>;
+ template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>;
+ template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2, dequantize_q8_0>;
  template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q2_K, QK_NL, dequantize_q2_K>;
  template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q3_K, QK_NL, dequantize_q3_K>;
  template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_K, QK_NL, dequantize_q4_K>;
@@ -847,7 +847,7 @@ std::array<std::string, 2> mul_str_values = {
  "mul_f32", "float"
  };

- std::string& replace(std::string& s, const std::string& from, const std::string& to) {
+ static std::string& replace(std::string& s, const std::string& from, const std::string& to) {
  size_t pos = 0;
  while ((pos = s.find(from, pos)) != std::string::npos) {
  s.replace(pos, from.length(), to);
@@ -856,7 +856,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
  return s;
  }

- std::string generate_kernels() {
+ static std::string generate_kernels() {
  std::stringstream src;
  src << program_source << '\n';
  src << k_quants_source << '\n';
@@ -1788,7 +1788,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
  return false;
  }

- bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
+ static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
  // If device doesn't support FP16
  if (!fp16_support) {
  return false;
@@ -4303,10 +4303,21 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
  }

  size_t ggml_nbytes(const struct ggml_tensor * tensor) {
- size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
- for (int i = 1; i < GGML_MAX_DIMS; ++i) {
- nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+ size_t nbytes;
+ size_t blck_size = ggml_blck_size(tensor->type);
+ if (blck_size == 1) {
+ nbytes = ggml_type_size(tensor->type);
+ for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+ nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+ }
+ }
+ else {
+ nbytes = tensor->ne[0]*tensor->nb[0]/blck_size;
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
+ nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+ }
  }
+
  return nbytes;
  }

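The reworked ggml_nbytes splits the size computation by block size: for non-quantized types (block size 1) it now takes the type size plus the byte offset of the last element, which stays correct for permuted or otherwise non-contiguous views, while block-quantized types keep the old formula. A small worked check with an illustrative shape (not taken from the diff):

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative check of the blck_size == 1 branch for a 4x3 f32 tensor
     * with standard strides; prints 48, i.e. 12 floats. */
    int main(void) {
        const int64_t ne[4] = { 4, 3, 1, 1 };
        const size_t  nb[4] = { 4, 16, 48, 48 };   /* byte strides for f32 */
        size_t nbytes = 4;                         /* ggml_type_size(f32) */
        for (int i = 0; i < 4; ++i) {
            nbytes += (size_t)(ne[i] - 1) * nb[i]; /* offset of the last element */
        }
        printf("%zu\n", nbytes);
        return 0;
    }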
@@ -17283,10 +17294,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  } else {
  // wait for other threads to finish
  const int last = node_n;
- do {
- //sched_yield();
+ while (true) {
+ // TODO: this sched_yield can have significant impact on the performance - either positive or negative
+ // depending on the workload and the operating system.
+ // since it is not clear what is the best approach, it should potentially become user-configurable
+ // ref: https://github.com/ggerganov/ggml/issues/291
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+ sched_yield();
+ #endif
+
  node_n = atomic_load(&state->shared->node_n);
- } while (node_n == last);
+ if (node_n != last) break;
+ };
  }

  // check if we should stop
@@ -18337,10 +18356,11 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
  for (int i = 0; i < cgraph->n_leafs; i++) {
  struct ggml_tensor * node = cgraph->leafs[i];

- GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
+ GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
  i,
  node->ne[0], node->ne[1],
- ggml_op_name(node->op));
+ ggml_op_name(node->op),
+ ggml_get_name(node));
  }

  for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -20099,27 +20119,27 @@ const char * gguf_type_name(enum gguf_type type) {
  return GGUF_TYPE_NAME[type];
  }

- int gguf_get_version(struct gguf_context * ctx) {
+ int gguf_get_version(const struct gguf_context * ctx) {
  return ctx->header.version;
  }

- size_t gguf_get_alignment(struct gguf_context * ctx) {
+ size_t gguf_get_alignment(const struct gguf_context * ctx) {
  return ctx->alignment;
  }

- size_t gguf_get_data_offset(struct gguf_context * ctx) {
+ size_t gguf_get_data_offset(const struct gguf_context * ctx) {
  return ctx->offset;
  }

- void * gguf_get_data(struct gguf_context * ctx) {
+ void * gguf_get_data(const struct gguf_context * ctx) {
  return ctx->data;
  }

- int gguf_get_n_kv(struct gguf_context * ctx) {
+ int gguf_get_n_kv(const struct gguf_context * ctx) {
  return ctx->header.n_kv;
  }

- int gguf_find_key(struct gguf_context * ctx, const char * key) {
+ int gguf_find_key(const struct gguf_context * ctx, const char * key) {
  // return -1 if key not found
  int keyfound = -1;

@@ -20135,85 +20155,85 @@ int gguf_find_key(struct gguf_context * ctx, const char * key) {
  return keyfound;
  }

- const char * gguf_get_key(struct gguf_context * ctx, int i) {
+ const char * gguf_get_key(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].key.data;
  }

- enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
+ enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].type;
  }

- enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
+ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.arr.type;
  }

- const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
+ const void * gguf_get_arr_data(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.arr.data;
  }

- const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
+ const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
  struct gguf_kv * kv = &ctx->kv[key_id];
  struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
  return str->data;
  }

- int gguf_get_arr_n(struct gguf_context * ctx, int i) {
+ int gguf_get_arr_n(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.arr.n;
  }

- uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
+ uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.uint8;
  }

- int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
+ int8_t gguf_get_val_i8(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.int8;
  }

- uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
+ uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.uint16;
  }

- int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
+ int16_t gguf_get_val_i16(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.int16;
  }

- uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
+ uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.uint32;
  }

- int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
+ int32_t gguf_get_val_i32(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.int32;
  }

- float gguf_get_val_f32(struct gguf_context * ctx, int i) {
+ float gguf_get_val_f32(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.float32;
  }

- uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
+ uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.uint64;
  }

- int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
+ int64_t gguf_get_val_i64(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.int64;
  }

- double gguf_get_val_f64(struct gguf_context * ctx, int i) {
+ double gguf_get_val_f64(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.float64;
  }

- bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
+ bool gguf_get_val_bool(const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.bool_;
  }

- const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
+ const char * gguf_get_val_str (const struct gguf_context * ctx, int i) {
  return ctx->kv[i].value.str.data;
  }

- int gguf_get_n_tensors(struct gguf_context * ctx) {
+ int gguf_get_n_tensors(const struct gguf_context * ctx) {
  return ctx->header.n_tensors;
  }

- int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
+ int gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
  // return -1 if tensor not found
  int tensorfound = -1;

@@ -20229,11 +20249,11 @@ int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
  return tensorfound;
  }

- size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
+ size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
  return ctx->infos[i].offset;
  }

- char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
+ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
  return ctx->infos[i].name.data;
  }

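All gguf accessors above now take const struct gguf_context *, so read-only inspection can be written against a const pointer. A hedged usage sketch (assumes a context obtained elsewhere, e.g. from gguf_init_from_file, and that ggml.h is included; error handling omitted):

    #include <stdio.h>

    /* Illustrative only: dump key/value metadata through the const API. */
    static void dump_gguf_kv(const struct gguf_context * ctx) {
        const int n_kv = gguf_get_n_kv(ctx);
        for (int i = 0; i < n_kv; ++i) {
            printf("%3d: %s (type %d)\n", i, gguf_get_key(ctx, i), (int) gguf_get_kv_type(ctx, i));
        }
    }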
@@ -20516,7 +20536,7 @@ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_si
  buf->offset += el_size;
  }

- static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
+ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
  // write header
  gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
  gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
@@ -20631,7 +20651,7 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
  }
  }

- void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
+ void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
  FILE * file = fopen(fname, "wb");
  if (!file) {
  GGML_ASSERT(false && "failed to open file for writing");
@@ -20648,7 +20668,7 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only
  fclose(file);
  }

- size_t gguf_get_meta_size(struct gguf_context * ctx) {
+ size_t gguf_get_meta_size(const struct gguf_context * ctx) {
  // no allocs - only compute size
  struct gguf_buf buf = gguf_buf_init(0);

@@ -20657,7 +20677,7 @@ size_t gguf_get_meta_size(struct gguf_context * ctx) {
  return buf.offset;
  }

- void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
+ void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
  struct gguf_buf buf = gguf_buf_init(16*1024);

  gguf_write_to_buf(ctx, &buf, true);
@@ -20733,6 +20753,14 @@ int ggml_cpu_has_arm_fma(void) {
  #endif
  }

+ int ggml_cpu_has_metal(void) {
+ #if defined(GGML_USE_METAL)
+ return 1;
+ #else
+ return 0;
+ #endif
+ }
+
  int ggml_cpu_has_f16c(void) {
  #if defined(__F16C__)
  return 1;
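The new ggml_cpu_has_metal() follows the same compile-time probe pattern as the existing ggml_cpu_has_* helpers. A minimal usage sketch (assumes ggml.h is included):

    #include <stdio.h>

    /* Illustrative only: report whether this build includes the Metal backend. */
    void report_metal_support(void) {
        printf("metal = %d\n", ggml_cpu_has_metal());  /* 1 when built with GGML_USE_METAL */
    }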