llama_cpp 0.5.2 → 0.5.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/llama_cpp.cpp +2 -2
- data/ext/llama_cpp/src/ggml-alloc.c +6 -6
- data/ext/llama_cpp/src/ggml-cuda.cu +99 -46
- data/ext/llama_cpp/src/ggml-metal.m +37 -10
- data/ext/llama_cpp/src/ggml-metal.metal +144 -45
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -3
- data/ext/llama_cpp/src/ggml.c +68 -40
- data/ext/llama_cpp/src/ggml.h +43 -33
- data/ext/llama_cpp/src/llama.cpp +420 -57
- data/ext/llama_cpp/src/llama.h +5 -1
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/ggml-metal.metal
CHANGED
@@ -38,7 +38,7 @@ kernel void kernel_add_row(
         device const float4 * src0,
         device const float4 * src1,
         device float4 * dst,
-        constant
+        constant int64_t & nb,
         uint tpig[[thread_position_in_grid]]) {
     dst[tpig] = src0[tpig] + src1[tpig % nb];
 }
@@ -118,7 +118,7 @@ kernel void kernel_soft_max(
     device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
 
     // parallel max
-    float lmax = psrc0[tpitg[0]];
+    float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY;
     for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
         lmax = MAX(lmax, psrc0[i00]);
     }
@@ -158,7 +158,7 @@ kernel void kernel_soft_max_4(
     device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
 
     // parallel max
-    float4 lmax4 = psrc4[tpitg[0]];
+    float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY;
     for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
         lmax4 = fmax(lmax4, psrc4[i00]);
     }
@@ -523,6 +523,79 @@ kernel void kernel_mul_mat_q8_0_f32(
     }
 }
 
+#define N_F32_F32 4
+
+kernel void kernel_mul_mat_f32_f32(
+        device const char * src0,
+        device const char * src1,
+        device float * dst,
+        constant int64_t & ne00,
+        constant int64_t & ne01,
+        constant int64_t & ne02,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant int64_t & ne10,
+        constant int64_t & ne11,
+        constant int64_t & ne12,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant int64_t & ne0,
+        constant int64_t & ne1,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]]) {
+
+    const int64_t r0 = tgpig.x;
+    const int64_t rb = tgpig.y*N_F32_F32;
+    const int64_t im = tgpig.z;
+
+    device const float * x = (device const float *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
+
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F32_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00; i += 32) {
+                sumf += (float) x[i] * (float) y[i];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else {
+        device const float4 * x4 = (device const float4 *)x;
+        for (int row = 0; row < N_F32_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+            device const float4 * y4 = (device const float4 *) y;
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00/4; i += 32) {
+                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
+}
+
 kernel void kernel_mul_mat_f16_f32_1row(
         device const char * src0,
         device const char * src1,
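
Note: kernel_mul_mat_f32_f32 above adds a plain F32 x F32 matrix-multiplication path: each threadgroup handles one src0 row (r0), up to N_F32_F32 src1 rows (r1) and one broadcast batch index (im = tgpig.z), and every 32-lane simdgroup strides a dot product over ne00 elements before reducing it with simd_sum. The plain-C sketch below only restates the result the kernel writes, assuming contiguous row-major tensors; the helper name is made up and is not part of the gem or of llama.cpp.

    #include <stdint.h>

    /* Illustrative reference (contiguous layout assumed): for every batch im, src1 row r1 and
     * src0 row r0, dst[im*ne1*ne0 + r1*ne0 + r0] receives the dot product over ne00 elements.
     * src0 holds ne02 matrices and is broadcast over the ne12 batches of src1. */
    static void mul_mat_f32_f32_ref(const float * src0, const float * src1, float * dst,
                                    int64_t ne00, int64_t ne01, int64_t ne02,
                                    int64_t ne11, int64_t ne12,
                                    int64_t ne0,  int64_t ne1) {
        for (int64_t im = 0; im < ne12; ++im) {
            const float * xb = src0 + (im/(ne12/ne02))*ne01*ne00; /* broadcast src0 batch */
            for (int64_t r1 = 0; r1 < ne11; ++r1) {
                const float * y = src1 + (im*ne11 + r1)*ne00;
                for (int64_t r0 = 0; r0 < ne01; ++r0) {
                    const float * x = xb + r0*ne00;
                    float sum = 0.0f;
                    for (int64_t i = 0; i < ne00; ++i) {
                        sum += x[i]*y[i];
                    }
                    dst[im*ne1*ne0 + r1*ne0 + r0] = sum;
                }
            }
        }
    }
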
@@ -1321,7 +1394,6 @@ kernel void kernel_mul_mat_q3_K_f32(
             dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row];
         }
     }
-
 }
 #else
 kernel void kernel_mul_mat_q3_K_f32(
@@ -1400,13 +1472,13 @@ kernel void kernel_mul_mat_q4_K_f32(
         device const float * src1,
         device float * dst,
         constant int64_t & ne00,
-        constant int64_t & ne01[[buffer(4)]],
-        constant int64_t & ne02[[buffer(5)]],
-        constant int64_t & ne10[[buffer(9)]],
-        constant int64_t & ne12[[buffer(11)]],
-        constant int64_t & ne0[[buffer(15)]],
-        constant int64_t & ne1[[buffer(16)]],
-        constant uint & gqa[[buffer(17)]],
+        constant int64_t & ne01 [[buffer(4)]],
+        constant int64_t & ne02 [[buffer(5)]],
+        constant int64_t & ne10 [[buffer(9)]],
+        constant int64_t & ne12 [[buffer(11)]],
+        constant int64_t & ne0 [[buffer(15)]],
+        constant int64_t & ne1 [[buffer(16)]],
+        constant uint & gqa [[buffer(17)]],
         uint3 tgpig[[threadgroup_position_in_grid]],
         uint tiisg[[thread_index_in_simdgroup]],
         uint sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -1865,6 +1937,15 @@ kernel void kernel_mul_mat_q6_K_f32(
 
 //============================= templates and their specializations =============================
 
+// NOTE: this is not dequantizing - we are simply fitting the template
+template <typename type4x4>
+void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
+    float4x4 temp = *(((device float4x4 *)src));
+    for (int i = 0; i < 16; i++){
+        reg[i/4][i%4] = temp[i/4][i%4];
+    }
+}
+
 template <typename type4x4>
 void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
     half4x4 temp = *(((device half4x4 *)src));
@@ -1875,7 +1956,6 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg)
 
 template <typename type4x4>
 void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
-
     device const uint16_t * qs = ((device const uint16_t *)xb + 1);
     const float d1 = il ? (xb->d / 16.h) : xb->d;
     const float d2 = d1 / 256.f;
@@ -1887,12 +1967,10 @@ void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg
         reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
         reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
     }
-
 }
 
 template <typename type4x4>
 void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
-
     device const uint16_t * qs = ((device const uint16_t *)xb + 2);
     const float d1 = il ? (xb->d / 16.h) : xb->d;
     const float d2 = d1 / 256.f;
@@ -1964,7 +2042,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
     for (int i = 0; i < 16; ++i) {
         reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
     }
-
 #else
     float kcoef = il&1 ? 1.f/16.f : 1.f;
     uint16_t kmask = il&1 ? 0xF0 : 0x0F;
@@ -2008,7 +2085,6 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
     for (int i = 0; i < 16; ++i) {
         reg[i/4][i%4] = dl * (q[i] & mask) - ml;
     }
-
 }
 
 template <typename type4x4>
@@ -2110,22 +2186,25 @@ kernel void kernel_get_rows(
 // each block_q contains 16*nl weights
 template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
 kernel void kernel_mul_mm(device const uchar * src0,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                          device const uchar * src1,
+                          device float * dst,
+                          constant int64_t & ne00,
+                          constant int64_t & ne02,
+                          constant int64_t & nb01,
+                          constant int64_t & nb02,
+                          constant int64_t & ne12,
+                          constant int64_t & nb10,
+                          constant int64_t & nb11,
+                          constant int64_t & nb12,
+                          constant int64_t & ne0,
+                          constant int64_t & ne1,
+                          constant uint & gqa,
+                          threadgroup uchar * shared_memory [[threadgroup(0)]],
+                          uint3 tgpig[[threadgroup_position_in_grid]],
+                          uint tiitg[[thread_index_in_threadgroup]],
+                          uint sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    threadgroup half * sa = (threadgroup half *)(shared_memory);
     threadgroup float * sb = (threadgroup float *)(shared_memory + 4096);
 
     const uint r0 = tgpig.y;
@@ -2138,7 +2217,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
     short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
     short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
 
-    simdgroup_half8x8
+    simdgroup_half8x8 ma[4];
     simdgroup_float8x8 mb[2];
     simdgroup_float8x8 c_res[8];
     for (int i = 0; i < 8; i++){
@@ -2146,10 +2225,15 @@ kernel void kernel_mul_mm(device const uchar * src0,
     }
 
     short il = (tiitg % THREAD_PER_ROW);
-
-
-
-
+
+    uint offset0 = im/gqa*nb02;
+    ushort offset1 = il/nl;
+
+    device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
+    device const float * y = (device const float *)(src1
+        + nb12 * im
+        + nb11 * (r1 * BLOCK_SIZE_N + thread_col)
+        + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
 
     for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
         //load data and store to threadgroup memory
@@ -2229,6 +2313,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
 typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \
                           constant uint64_t &, constant uint64_t &, uint, uint, uint);
 
+template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows<float4x4, 1, dequantize_f32>;
 template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows<half4x4, 1, dequantize_f16>;
 template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>;
 template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>;
@@ -2239,14 +2324,28 @@ template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows
 template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows<block_q5_K, QK_NL, dequantize_q5_K>;
 template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>;
 
-typedef void (mat_mm_t)(
-
-
-
-
-
-
-
+typedef void (mat_mm_t)(
+        device const uchar * src0,
+        device const uchar * src1,
+        device float * dst,
+        constant int64_t & ne00,
+        constant int64_t & ne02,
+        constant int64_t & nb01,
+        constant int64_t & nb02,
+        constant int64_t & ne12,
+        constant int64_t & nb10,
+        constant int64_t & nb11,
+        constant int64_t & nb12,
+        constant int64_t & ne0,
+        constant int64_t & ne1,
+        constant uint & gqa,
+        threadgroup uchar *, uint3, uint, uint);
+
+template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm<float4x4, 1, dequantize_f32>;
+template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>;
+template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>;
+template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>;
+template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2, dequantize_q8_0>;
 template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q2_K, QK_NL, dequantize_q2_K>;
 template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q3_K, QK_NL, dequantize_q3_K>;
 template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_K, QK_NL, dequantize_q4_K>;
data/ext/llama_cpp/src/ggml-opencl.cpp
CHANGED
@@ -847,7 +847,7 @@ std::array<std::string, 2> mul_str_values = {
     "mul_f32", "float"
 };
 
-std::string& replace(std::string& s, const std::string& from, const std::string& to) {
+static std::string& replace(std::string& s, const std::string& from, const std::string& to) {
     size_t pos = 0;
     while ((pos = s.find(from, pos)) != std::string::npos) {
         s.replace(pos, from.length(), to);
@@ -856,7 +856,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
     return s;
 }
 
-std::string generate_kernels() {
+static std::string generate_kernels() {
     std::stringstream src;
     src << program_source << '\n';
     src << k_quants_source << '\n';
@@ -1788,7 +1788,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     return false;
 }
 
-bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
+static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
     // If device doesn't support FP16
     if (!fp16_support) {
         return false;
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -4303,10 +4303,21 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
 }
 
 size_t ggml_nbytes(const struct ggml_tensor * tensor) {
-    size_t nbytes
-
-
+    size_t nbytes;
+    size_t blck_size = ggml_blck_size(tensor->type);
+    if (blck_size == 1) {
+        nbytes = ggml_type_size(tensor->type);
+        for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+            nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+        }
+    }
+    else {
+        nbytes = tensor->ne[0]*tensor->nb[0]/blck_size;
+        for (int i = 1; i < GGML_MAX_DIMS; ++i) {
+            nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+        }
     }
+
     return nbytes;
 }
 
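
Note: ggml_nbytes now distinguishes scalar types (block size 1) from block-quantized types, where the first dimension contributes ne[0]*nb[0]/blck_size bytes. Below is a small worked example in C using the usual Q4_0 constants (32 weights per 18-byte block) and a contiguous 4096 x 8 tensor; the concrete numbers are an illustration, not taken from the diff.

    #include <stdint.h>
    #include <stddef.h>

    /* Same arithmetic as the quantized branch above, with Q4_0-style constants plugged in by hand. */
    static size_t nbytes_q4_0_example(void) {
        const int64_t ne[2]     = {4096, 8};
        const size_t  blck_size = 32;                    /* ggml_blck_size(GGML_TYPE_Q4_0) */
        const size_t  type_size = 18;                    /* ggml_type_size(GGML_TYPE_Q4_0) */
        const size_t  nb0       = type_size;             /* stride between blocks */
        const size_t  nb1       = nb0*(ne[0]/blck_size); /* 2304 bytes per row */
        return ne[0]*nb0/blck_size + (ne[1] - 1)*nb1;    /* 2304 + 7*2304 = 18432 bytes */
    }
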
@@ -17283,10 +17294,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         } else {
             // wait for other threads to finish
             const int last = node_n;
-
-            //sched_yield
+            while (true) {
+                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
+                //       depending on the workload and the operating system.
+                //       since it is not clear what is the best approach, it should potentially become user-configurable
+                //       ref: https://github.com/ggerganov/ggml/issues/291
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+                sched_yield();
+#endif
+
                 node_n = atomic_load(&state->shared->node_n);
-
+                if (node_n != last) break;
+            };
         }
 
         // check if we should stop
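
Note: the rewritten wait loop above busy-spins on the shared node counter and only calls sched_yield() when a BLAS backend is enabled, since yielding can help or hurt depending on the workload and OS (see the referenced ggml issue). The following stand-alone C11 sketch of that pattern is an illustration only, not the gem's code, and the names are made up.

    #include <stdatomic.h>
    #include <sched.h>   /* sched_yield */

    /* Spin until the shared counter moves past the value we last observed,
     * optionally yielding the CPU on each iteration. */
    static int wait_for_next_node(atomic_int * node_n, int last, int use_yield) {
        for (;;) {
            if (use_yield) {
                sched_yield();
            }
            int cur = atomic_load(node_n);
            if (cur != last) {
                return cur;
            }
        }
    }
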
@@ -18337,10 +18356,11 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_leafs; i++) {
         struct ggml_tensor * node = cgraph->leafs[i];
 
-        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
+        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
                 i,
                 node->ne[0], node->ne[1],
-                ggml_op_name(node->op)
+                ggml_op_name(node->op),
+                ggml_get_name(node));
     }
 
     for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -20099,27 +20119,27 @@ const char * gguf_type_name(enum gguf_type type) {
     return GGUF_TYPE_NAME[type];
 }
 
-int gguf_get_version(struct gguf_context * ctx) {
+int gguf_get_version(const struct gguf_context * ctx) {
     return ctx->header.version;
 }
 
-size_t gguf_get_alignment(struct gguf_context * ctx) {
+size_t gguf_get_alignment(const struct gguf_context * ctx) {
     return ctx->alignment;
 }
 
-size_t gguf_get_data_offset(struct gguf_context * ctx) {
+size_t gguf_get_data_offset(const struct gguf_context * ctx) {
     return ctx->offset;
 }
 
-void * gguf_get_data(struct gguf_context * ctx) {
+void * gguf_get_data(const struct gguf_context * ctx) {
     return ctx->data;
 }
 
-int gguf_get_n_kv(struct gguf_context * ctx) {
+int gguf_get_n_kv(const struct gguf_context * ctx) {
     return ctx->header.n_kv;
 }
 
-int gguf_find_key(struct gguf_context * ctx, const char * key) {
+int gguf_find_key(const struct gguf_context * ctx, const char * key) {
     // return -1 if key not found
     int keyfound = -1;
 
@@ -20135,85 +20155,85 @@ int gguf_find_key(struct gguf_context * ctx, const char * key) {
     return keyfound;
 }
 
-const char * gguf_get_key(struct gguf_context * ctx, int i) {
+const char * gguf_get_key(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].key.data;
 }
 
-enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
+enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].type;
 }
 
-enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
+enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.arr.type;
 }
 
-const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
+const void * gguf_get_arr_data(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.arr.data;
 }
 
-const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
+const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
     struct gguf_kv * kv = &ctx->kv[key_id];
     struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
     return str->data;
 }
 
-int gguf_get_arr_n(struct gguf_context * ctx, int i) {
+int gguf_get_arr_n(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.arr.n;
 }
 
-uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
+uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.uint8;
 }
 
-int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
+int8_t gguf_get_val_i8(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.int8;
 }
 
-uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
+uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.uint16;
 }
 
-int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
+int16_t gguf_get_val_i16(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.int16;
 }
 
-uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
+uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.uint32;
 }
 
-int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
+int32_t gguf_get_val_i32(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.int32;
 }
 
-float gguf_get_val_f32(struct gguf_context * ctx, int i) {
+float gguf_get_val_f32(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.float32;
 }
 
-uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
+uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.uint64;
 }
 
-int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
+int64_t gguf_get_val_i64(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.int64;
 }
 
-double gguf_get_val_f64(struct gguf_context * ctx, int i) {
+double gguf_get_val_f64(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.float64;
 }
 
-bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
+bool gguf_get_val_bool(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.bool_;
 }
 
-const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
+const char * gguf_get_val_str (const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.str.data;
 }
 
-int gguf_get_n_tensors(struct gguf_context * ctx) {
+int gguf_get_n_tensors(const struct gguf_context * ctx) {
     return ctx->header.n_tensors;
 }
 
-int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
+int gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
     // return -1 if tensor not found
     int tensorfound = -1;
 
@@ -20229,11 +20249,11 @@ int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
     return tensorfound;
 }
 
-size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
+size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].offset;
 }
 
-char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
+char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].name.data;
 }
 
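
Note: every read-only GGUF accessor above now takes const struct gguf_context *, so inspection code can hold a pointer-to-const context. The following minimal usage sketch in C assumes the public ggml.h API; the file name and metadata key are placeholders, not taken from the diff.

    #include <stdbool.h>
    #include <stdio.h>
    #include "ggml.h"

    /* Open a GGUF file without allocating tensor data and read a couple of fields
     * through the now const-qualified getters. */
    static void dump_gguf(const char * fname) {
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
        struct gguf_context * raw = gguf_init_from_file(fname, params);
        if (raw == NULL) {
            return;
        }
        const struct gguf_context * ctx = raw;  /* a const view is now enough for the getters */

        printf("gguf version: %d, tensors: %d\n", gguf_get_version(ctx), gguf_get_n_tensors(ctx));

        const int key = gguf_find_key(ctx, "general.name");  /* placeholder key */
        if (key >= 0) {
            printf("general.name = %s\n", gguf_get_val_str(ctx, key));
        }

        gguf_free(raw);
    }
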
@@ -20516,7 +20536,7 @@ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_si
     buf->offset += el_size;
 }
 
-static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
+static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
     // write header
     gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
     gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
@@ -20631,7 +20651,7 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
     }
 }
 
-void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
+void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
     FILE * file = fopen(fname, "wb");
     if (!file) {
         GGML_ASSERT(false && "failed to open file for writing");
@@ -20648,7 +20668,7 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only
     fclose(file);
 }
 
-size_t gguf_get_meta_size(struct gguf_context * ctx) {
+size_t gguf_get_meta_size(const struct gguf_context * ctx) {
     // no allocs - only compute size
     struct gguf_buf buf = gguf_buf_init(0);
 
@@ -20657,7 +20677,7 @@ size_t gguf_get_meta_size(struct gguf_context * ctx) {
     return buf.offset;
 }
 
-void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
+void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
     struct gguf_buf buf = gguf_buf_init(16*1024);
 
     gguf_write_to_buf(ctx, &buf, true);
@@ -20733,6 +20753,14 @@ int ggml_cpu_has_arm_fma(void) {
 #endif
 }
 
+int ggml_cpu_has_metal(void) {
+#if defined(GGML_USE_METAL)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_f16c(void) {
 #if defined(__F16C__)
     return 1;