llama_cpp 0.5.2 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/llama_cpp.cpp +2 -2
- data/ext/llama_cpp/src/ggml-alloc.c +6 -6
- data/ext/llama_cpp/src/ggml-cuda.cu +99 -46
- data/ext/llama_cpp/src/ggml-metal.m +37 -10
- data/ext/llama_cpp/src/ggml-metal.metal +144 -45
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -3
- data/ext/llama_cpp/src/ggml.c +68 -40
- data/ext/llama_cpp/src/ggml.h +43 -33
- data/ext/llama_cpp/src/llama.cpp +420 -57
- data/ext/llama_cpp/src/llama.h +5 -1
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
@@ -38,7 +38,7 @@ kernel void kernel_add_row(
         device const float4 * src0,
         device const float4 * src1,
         device       float4 * dst,
-        constant
+        constant   int64_t & nb,
         uint tpig[[thread_position_in_grid]]) {
     dst[tpig] = src0[tpig] + src1[tpig % nb];
 }
@@ -118,7 +118,7 @@ kernel void kernel_soft_max(
     device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
 
     // parallel max
-    float lmax = psrc0[tpitg[0]];
+    float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY;
     for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
         lmax = MAX(lmax, psrc0[i00]);
     }
@@ -158,7 +158,7 @@ kernel void kernel_soft_max_4(
     device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
 
     // parallel max
-    float4 lmax4 = psrc4[tpitg[0]];
+    float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY;
     for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
         lmax4 = fmax(lmax4, psrc4[i00]);
     }
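The two soft_max hunks above guard the initial value of the parallel max: when a threadgroup has more threads than the row has elements, thread index tpitg[0] would read past the end of the row, so out-of-range threads now start from -INFINITY, the identity for max. A minimal CPU sketch of that padded-reduction idea (illustrative only, not ggml code; names are hypothetical):

#include <math.h>
#include <stddef.h>

/* Each "thread" t starts from -INFINITY when its first index is already past
 * the end of the row, then strides over the data; the final combine stands in
 * for the threadgroup/simdgroup reduction done on the GPU. */
static float row_max(const float * x, size_t n, size_t nthreads) {
    float m = -INFINITY;
    for (size_t t = 0; t < nthreads; ++t) {
        float lmax = (t < n) ? x[t] : -INFINITY;   /* the guard added by this patch */
        for (size_t i = t + nthreads; i < n; i += nthreads) {
            lmax = lmax > x[i] ? lmax : x[i];
        }
        m = m > lmax ? m : lmax;
    }
    return m;
}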
@@ -523,6 +523,79 @@ kernel void kernel_mul_mat_q8_0_f32(
     }
 }
 
+#define N_F32_F32 4
+
+kernel void kernel_mul_mat_f32_f32(
+        device const char * src0,
+        device const char * src1,
+        device      float * dst,
+        constant  int64_t & ne00,
+        constant  int64_t & ne01,
+        constant  int64_t & ne02,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant  int64_t & ne10,
+        constant  int64_t & ne11,
+        constant  int64_t & ne12,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant  int64_t & ne0,
+        constant  int64_t & ne1,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]]) {
+
+    const int64_t r0 = tgpig.x;
+    const int64_t rb = tgpig.y*N_F32_F32;
+    const int64_t im = tgpig.z;
+
+    device const float * x = (device const float *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
+
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F32_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00; i += 32) {
+                sumf += (float) x[i] * (float) y[i];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else {
+        device const float4 * x4 = (device const float4 *)x;
+        for (int row = 0; row < N_F32_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float  * y  = (device const float  *) (src1 + r1*nb11 + im*nb12);
+            device const float4 * y4 = (device const float4 *) y;
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00/4; i += 32) {
+                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
+}
+
 kernel void kernel_mul_mat_f16_f32_1row(
         device const char * src0,
         device const char * src1,
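The new kernel_mul_mat_f32_f32 above computes, for each output element, a dot product between a row of src0 and a row of src1, with each simdgroup lane handling a strided subset of the row; long rows use float4 loads plus a scalar tail that is folded in by lane 0 after the simd_sum. A minimal CPU sketch of that per-element computation, under the assumption that both inputs are plain contiguous float rows (the helper name is hypothetical, not part of ggml):

#include <stddef.h>

/* dst element = dot(src0 row, src1 row): 4-wide main loop mirroring the
 * float4 path, then a scalar loop over the 4*(ne00/4)..ne00 leftovers. */
static float dot_f32(const float * x, const float * y, size_t ne00) {
    float sum = 0.0f;
    size_t i = 0;
    for (; i + 4 <= ne00; i += 4) {
        sum += x[i+0]*y[i+0] + x[i+1]*y[i+1]
             + x[i+2]*y[i+2] + x[i+3]*y[i+3];
    }
    for (; i < ne00; ++i) {   /* tail, like the 4*(ne00/4) remainder above */
        sum += x[i]*y[i];
    }
    return sum;
}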
@@ -1321,7 +1394,6 @@ kernel void kernel_mul_mat_q3_K_f32(
             dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row];
         }
     }
-
 }
 #else
 kernel void kernel_mul_mat_q3_K_f32(
@@ -1400,13 +1472,13 @@ kernel void kernel_mul_mat_q4_K_f32(
         device const float * src1,
         device       float * dst,
         constant   int64_t & ne00,
-        constant   int64_t & ne01[[buffer(4)]],
-        constant   int64_t & ne02[[buffer(5)]],
-        constant   int64_t & ne10[[buffer(9)]],
-        constant   int64_t & ne12[[buffer(11)]],
-        constant   int64_t & ne0[[buffer(15)]],
-        constant   int64_t & ne1[[buffer(16)]],
-        constant   uint    & gqa[[buffer(17)]],
+        constant   int64_t & ne01 [[buffer(4)]],
+        constant   int64_t & ne02 [[buffer(5)]],
+        constant   int64_t & ne10 [[buffer(9)]],
+        constant   int64_t & ne12 [[buffer(11)]],
+        constant   int64_t & ne0  [[buffer(15)]],
+        constant   int64_t & ne1  [[buffer(16)]],
+        constant   uint    & gqa  [[buffer(17)]],
         uint3 tgpig[[threadgroup_position_in_grid]],
         uint  tiisg[[thread_index_in_simdgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -1865,6 +1937,15 @@ kernel void kernel_mul_mat_q6_K_f32(
 
 //============================= templates and their specializations =============================
 
+// NOTE: this is not dequantizing - we are simply fitting the template
+template <typename type4x4>
+void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
+    float4x4 temp = *(((device float4x4 *)src));
+    for (int i = 0; i < 16; i++){
+        reg[i/4][i%4] = temp[i/4][i%4];
+    }
+}
+
 template <typename type4x4>
 void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
     half4x4 temp = *(((device half4x4 *)src));
@@ -1875,7 +1956,6 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg)
 
 template <typename type4x4>
 void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
-
     device const uint16_t * qs = ((device const uint16_t *)xb + 1);
     const float d1 = il ? (xb->d / 16.h) : xb->d;
     const float d2 = d1 / 256.f;
@@ -1887,12 +1967,10 @@ void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg
         reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
         reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
     }
-
 }
 
 template <typename type4x4>
 void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
-
     device const uint16_t * qs = ((device const uint16_t *)xb + 2);
     const float d1 = il ? (xb->d / 16.h) : xb->d;
     const float d2 = d1 / 256.f;
@@ -1964,7 +2042,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
     for (int i = 0; i < 16; ++i) {
         reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
     }
-
 #else
     float    kcoef = il&1 ? 1.f/16.f : 1.f;
     uint16_t kmask = il&1 ? 0xF0 : 0x0F;
@@ -2008,7 +2085,6 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
     for (int i = 0; i < 16; ++i) {
         reg[i/4][i%4] = dl * (q[i] & mask) - ml;
     }
-
 }
 
 template <typename type4x4>
@@ -2110,22 +2186,25 @@ kernel void kernel_get_rows(
 // each block_q contains 16*nl weights
 template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
 kernel void kernel_mul_mm(device const uchar * src0,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                          device const uchar * src1,
+                          device       float * dst,
+                          constant    int64_t & ne00,
+                          constant    int64_t & ne02,
+                          constant    int64_t & nb01,
+                          constant    int64_t & nb02,
+                          constant    int64_t & ne12,
+                          constant    int64_t & nb10,
+                          constant    int64_t & nb11,
+                          constant    int64_t & nb12,
+                          constant    int64_t & ne0,
+                          constant    int64_t & ne1,
+                          constant       uint & gqa,
+                          threadgroup   uchar * shared_memory [[threadgroup(0)]],
+                          uint3 tgpig[[threadgroup_position_in_grid]],
+                          uint  tiitg[[thread_index_in_threadgroup]],
+                          uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    threadgroup half  * sa = (threadgroup half  *)(shared_memory);
     threadgroup float * sb = (threadgroup float *)(shared_memory + 4096);
 
     const uint r0 = tgpig.y;
@@ -2138,7 +2217,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
     short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
     short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
 
-    simdgroup_half8x8
+    simdgroup_half8x8  ma[4];
     simdgroup_float8x8 mb[2];
     simdgroup_float8x8 c_res[8];
     for (int i = 0; i < 8; i++){
@@ -2146,10 +2225,15 @@ kernel void kernel_mul_mm(device const uchar * src0,
     }
 
     short il = (tiitg % THREAD_PER_ROW);
-
-
-
-
+
+    uint   offset0 = im/gqa*nb02;
+    ushort offset1 = il/nl;
+
+    device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
+    device const float   * y = (device const float   *)(src1
+        + nb12 * im
+        + nb11 * (r1 * BLOCK_SIZE_N + thread_col)
+        + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
 
     for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
         //load data and store to threadgroup memory
@@ -2229,6 +2313,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
 typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \
                           constant uint64_t &, constant uint64_t &, uint, uint, uint);
 
+template [[host_name("kernel_get_rows_f32")]]  kernel get_rows_t kernel_get_rows<float4x4,   1, dequantize_f32>;
 template [[host_name("kernel_get_rows_f16")]]  kernel get_rows_t kernel_get_rows<half4x4,    1, dequantize_f16>;
 template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>;
 template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>;
@@ -2239,14 +2324,28 @@ template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows
 template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows<block_q5_K, QK_NL, dequantize_q5_K>;
 template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>;
 
-typedef void (mat_mm_t)(
-
-
-
-
-
-
-
+typedef void (mat_mm_t)(
+        device const uchar * src0,
+        device const uchar * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne02,
+        constant   int64_t & nb01,
+        constant   int64_t & nb02,
+        constant   int64_t & ne12,
+        constant   int64_t & nb10,
+        constant   int64_t & nb11,
+        constant   int64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant      uint & gqa,
+        threadgroup uchar *, uint3, uint, uint);
+
+template [[host_name("kernel_mul_mm_f32_f32")]]  kernel mat_mm_t kernel_mul_mm<float4x4,   1, dequantize_f32>;
+template [[host_name("kernel_mul_mm_f16_f32")]]  kernel mat_mm_t kernel_mul_mm<half4x4,    1, dequantize_f16>;
+template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>;
+template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>;
+template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2, dequantize_q8_0>;
 template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q2_K, QK_NL, dequantize_q2_K>;
 template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q3_K, QK_NL, dequantize_q3_K>;
 template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_K, QK_NL, dequantize_q4_K>;
data/ext/llama_cpp/src/ggml-opencl.cpp
CHANGED

@@ -847,7 +847,7 @@ std::array<std::string, 2> mul_str_values = {
     "mul_f32", "float"
 };
 
-std::string& replace(std::string& s, const std::string& from, const std::string& to) {
+static std::string& replace(std::string& s, const std::string& from, const std::string& to) {
     size_t pos = 0;
     while ((pos = s.find(from, pos)) != std::string::npos) {
         s.replace(pos, from.length(), to);
@@ -856,7 +856,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
     return s;
 }
 
-std::string generate_kernels() {
+static std::string generate_kernels() {
     std::stringstream src;
     src << program_source << '\n';
     src << k_quants_source << '\n';
@@ -1788,7 +1788,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
         return false;
     }
 
-bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
+static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
     // If device doesn't support FP16
     if (!fp16_support) {
         return false;
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -4303,10 +4303,21 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
 }
 
 size_t ggml_nbytes(const struct ggml_tensor * tensor) {
-    size_t nbytes
-
-
+    size_t nbytes;
+    size_t blck_size = ggml_blck_size(tensor->type);
+    if (blck_size == 1) {
+        nbytes = ggml_type_size(tensor->type);
+        for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+            nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+        }
+    }
+    else {
+        nbytes = tensor->ne[0]*tensor->nb[0]/blck_size;
+        for (int i = 1; i < GGML_MAX_DIMS; ++i) {
+            nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+        }
     }
+
     return nbytes;
 }
 
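The new ggml_nbytes splits the calculation by block size: scalar types (block size 1) start from the type size, while block-quantized types count whole blocks along the first dimension; either way the remaining dimensions contribute (ne[i] - 1)*nb[i]. A standalone worked example of the second branch, using illustrative numbers for a hypothetical contiguous 2-D q4_0 tensor (QK4_0 = 32 weights per 18-byte block):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int64_t ne[2]      = { 4096, 32 };  /* elements per dimension          */
    const size_t  blck_size  = 32;            /* weights per quantization block  */
    const size_t  type_size  = 18;            /* bytes per block (q4_0)          */
    const size_t  nb0        = type_size;
    const size_t  nb1        = ne[0]/blck_size*type_size;  /* 128 blocks * 18 = 2304 */

    size_t nbytes = ne[0]*nb0/blck_size;      /* first dimension, in whole blocks */
    nbytes += (ne[1] - 1)*nb1;                /* remaining dimensions via strides */
    printf("%zu bytes\n", nbytes);            /* 2304 + 31*2304 = 73728 */
    return 0;
}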
@@ -17283,10 +17294,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         } else {
             // wait for other threads to finish
             const int last = node_n;
-
-            //sched_yield
+            while (true) {
+                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
+                // depending on the workload and the operating system.
+                // since it is not clear what is the best approach, it should potentially become user-configurable
+                // ref: https://github.com/ggerganov/ggml/issues/291
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+                sched_yield();
+#endif
+
                 node_n = atomic_load(&state->shared->node_n);
-
+                if (node_n != last) break;
+            };
         }
 
         // check if we should stop
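The hunk above restructures the worker wait as an explicit poll loop on the shared node counter, with sched_yield() compiled in only for the Accelerate/OpenBLAS builds, since yielding can help or hurt depending on workload and OS. A minimal C sketch of that wait-loop shape (the function name and the USE_SCHED_YIELD macro are stand-ins, not ggml symbols):

#include <stdatomic.h>
#include <sched.h>

/* Spin on a shared counter until it moves past `last`, optionally yielding
 * the CPU between polls. */
static int wait_for_next_node(atomic_int * shared_node_n, int last) {
    for (;;) {
#if defined(USE_SCHED_YIELD)   /* stand-in for the GGML_USE_ACCELERATE/OPENBLAS gate */
        sched_yield();
#endif
        int node_n = atomic_load(shared_node_n);
        if (node_n != last) {
            return node_n;
        }
    }
}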
@@ -18337,10 +18356,11 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_leafs; i++) {
         struct ggml_tensor * node = cgraph->leafs[i];
 
-        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
+        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
                 i,
                 node->ne[0], node->ne[1],
-                ggml_op_name(node->op)
+                ggml_op_name(node->op),
+                ggml_get_name(node));
     }
 
     for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -20099,27 +20119,27 @@ const char * gguf_type_name(enum gguf_type type) {
     return GGUF_TYPE_NAME[type];
 }
 
-int gguf_get_version(struct gguf_context * ctx) {
+int gguf_get_version(const struct gguf_context * ctx) {
     return ctx->header.version;
 }
 
-size_t gguf_get_alignment(struct gguf_context * ctx) {
+size_t gguf_get_alignment(const struct gguf_context * ctx) {
     return ctx->alignment;
 }
 
-size_t gguf_get_data_offset(struct gguf_context * ctx) {
+size_t gguf_get_data_offset(const struct gguf_context * ctx) {
     return ctx->offset;
 }
 
-void * gguf_get_data(struct gguf_context * ctx) {
+void * gguf_get_data(const struct gguf_context * ctx) {
     return ctx->data;
 }
 
-int gguf_get_n_kv(struct gguf_context * ctx) {
+int gguf_get_n_kv(const struct gguf_context * ctx) {
     return ctx->header.n_kv;
 }
 
-int gguf_find_key(struct gguf_context * ctx, const char * key) {
+int gguf_find_key(const struct gguf_context * ctx, const char * key) {
     // return -1 if key not found
     int keyfound = -1;
 
@@ -20135,85 +20155,85 @@ int gguf_find_key(struct gguf_context * ctx, const char * key) {
     return keyfound;
 }
 
-const char * gguf_get_key(struct gguf_context * ctx, int i) {
+const char * gguf_get_key(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].key.data;
 }
 
-enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
+enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].type;
 }
 
-enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
+enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.arr.type;
 }
 
-const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
+const void * gguf_get_arr_data(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.arr.data;
 }
 
-const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
+const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
     struct gguf_kv * kv = &ctx->kv[key_id];
     struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
     return str->data;
 }
 
-int gguf_get_arr_n(struct gguf_context * ctx, int i) {
+int gguf_get_arr_n(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.arr.n;
 }
 
-uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
+uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.uint8;
 }
 
-int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
+int8_t gguf_get_val_i8(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.int8;
 }
 
-uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
+uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.uint16;
 }
 
-int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
+int16_t gguf_get_val_i16(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.int16;
 }
 
-uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
+uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.uint32;
 }
 
-int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
+int32_t gguf_get_val_i32(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.int32;
 }
 
-float gguf_get_val_f32(struct gguf_context * ctx, int i) {
+float gguf_get_val_f32(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.float32;
 }
 
-uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
+uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.uint64;
 }
 
-int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
+int64_t gguf_get_val_i64(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.int64;
 }
 
-double gguf_get_val_f64(struct gguf_context * ctx, int i) {
+double gguf_get_val_f64(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.float64;
 }
 
-bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
+bool gguf_get_val_bool(const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.bool_;
 }
 
-const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
+const char * gguf_get_val_str (const struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.str.data;
 }
 
-int gguf_get_n_tensors(struct gguf_context * ctx) {
+int gguf_get_n_tensors(const struct gguf_context * ctx) {
     return ctx->header.n_tensors;
 }
 
-int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
+int gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
     // return -1 if tensor not found
     int tensorfound = -1;
 
@@ -20229,11 +20249,11 @@ int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
     return tensorfound;
 }
 
-size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
+size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].offset;
 }
 
-char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
+char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].name.data;
 }
 
@@ -20516,7 +20536,7 @@ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_si
     buf->offset += el_size;
 }
 
-static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
+static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
     // write header
     gguf_bwrite_el(buf, &ctx->header.magic,   sizeof(ctx->header.magic));
     gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
@@ -20631,7 +20651,7 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
     }
 }
 
-void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
+void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
     FILE * file = fopen(fname, "wb");
     if (!file) {
         GGML_ASSERT(false && "failed to open file for writing");
@@ -20648,7 +20668,7 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only
     fclose(file);
 }
 
-size_t gguf_get_meta_size(struct gguf_context * ctx) {
+size_t gguf_get_meta_size(const struct gguf_context * ctx) {
     // no allocs - only compute size
     struct gguf_buf buf = gguf_buf_init(0);
 
@@ -20657,7 +20677,7 @@ size_t gguf_get_meta_size(struct gguf_context * ctx) {
     return buf.offset;
 }
 
-void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
+void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
     struct gguf_buf buf = gguf_buf_init(16*1024);
 
     gguf_write_to_buf(ctx, &buf, true);
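The gguf hunks above const-qualify the read-only accessors, so inspection code can now take a `const struct gguf_context *`. A small dump helper as a usage sketch, assuming a context obtained elsewhere (for example via gguf_init_from_file); only accessors that appear in this diff are used:

#include <stdio.h>
#include "ggml.h"

static void dump_gguf(const struct gguf_context * ctx) {
    printf("gguf version: %d\n",  gguf_get_version(ctx));
    printf("alignment   : %zu\n", gguf_get_alignment(ctx));

    const int n_kv = gguf_get_n_kv(ctx);
    for (int i = 0; i < n_kv; ++i) {
        printf("kv[%d]: %s (type %s)\n", i,
               gguf_get_key(ctx, i),
               gguf_type_name(gguf_get_kv_type(ctx, i)));
    }

    const int n_tensors = gguf_get_n_tensors(ctx);
    for (int i = 0; i < n_tensors; ++i) {
        printf("tensor[%d]: %s @ offset %zu\n", i,
               gguf_get_tensor_name(ctx, i),
               gguf_get_tensor_offset(ctx, i));
    }
}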
@@ -20733,6 +20753,14 @@ int ggml_cpu_has_arm_fma(void) {
 #endif
 }
 
+int ggml_cpu_has_metal(void) {
+#if defined(GGML_USE_METAL)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_f16c(void) {
 #if defined(__F16C__)
     return 1;
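The new ggml_cpu_has_metal() follows the same compile-time probe pattern as the existing feature checks, returning 1 only when the library was built with GGML_USE_METAL. A minimal sketch of reporting build capabilities through these probes (only functions shown in this diff are called):

#include <stdio.h>
#include "ggml.h"

int main(void) {
    printf("metal   : %d\n", ggml_cpu_has_metal());
    printf("arm_fma : %d\n", ggml_cpu_has_arm_fma());
    printf("f16c    : %d\n", ggml_cpu_has_f16c());
    return 0;
}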