llama_cpp 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +41 -21
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +500 -78
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +396 -127
- data/ext/llama_cpp/src/ggml-metal.metal +290 -46
- data/ext/llama_cpp/src/ggml-opencl.cpp +47 -71
- data/ext/llama_cpp/src/ggml.c +71 -55
- data/ext/llama_cpp/src/ggml.h +15 -9
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +1851 -250
- data/ext/llama_cpp/src/llama.h +18 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -4
- metadata +5 -3
@@ -19,7 +19,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-#define CL_DMMV_BLOCK_SIZE 32
+#define CL_DMMV_LOCAL_SIZE 32

 #ifndef K_QUANTS_PER_ITERATION
 #define K_QUANTS_PER_ITERATION 1
@@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx,
     const int row = get_group_id(0);

     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);

     __global const struct block_q2_K * x = xx + ib0;

@@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx,
     const int row = get_group_id(0);

     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);

     __global const struct block_q3_K * x = xx + ib0;

@@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx,

     const int row = get_group_id(0);
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);

     const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15
     const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
@@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx,

     const int row = get_group_id(0);
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);

     const int tid = get_local_id(0)/2; // 0...15
     const int ix = get_local_id(0)%2;
@@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
     const int row = get_group_id(0);

     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);

     __global const struct block_q6_K * x = xx + ib0;

@@ -745,19 +745,21 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {

 std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
-    const int block_size = get_local_size(0);
+    const int local_size = get_local_size(0);
     const int row = get_group_id(0);
     const int tid = get_local_id(0);

     const uint qk = QUANT_K;
     const uint qr = QUANT_R;

+    const int col_step = local_size * 2;
     const int y_offset = qr == 1 ? 1 : qk/2;

+    x += get_global_offset(0);
+
     tmp[tid] = 0;

-    for (int i = 0; i < ncols/block_size; i += 2) {
-        const int col = i*block_size + 2*tid;
+    for (int col = tid*2; col < ncols; col += col_step) {
         const int ib = (row*ncols + col)/qk; // block index
         const int iqs = (col%qk)/qr; // quant index
         const int iybs = col - col%qk; // y block start index
@@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float

     // sum up partial sums and write back result
     barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=block_size/2; s>0; s>>=1) {
+    for (int s=local_size/2; s>0; s>>=1) {
         if (tid < s) {
             tmp[tid] += tmp[tid + s];
         }
@@ -1393,75 +1395,46 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-    const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
     const int64_t ne12 = src1->ne[2];
     const int64_t ne13 = src1->ne[3];
-    const int64_t nb10 = src1->nb[0];
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     size_t x_size;
     size_t d_size;

-    cl_mem d_X = ggml_cl_pool_malloc( …
+    cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
     cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
-    cl_mem d_D = ggml_cl_pool_malloc( …
+    cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst


     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
-            const int i0 = i03*ne02 + i02;
-
             cl_event ev;

             // copy src0 to device
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, …
-            [… 20 removed lines not preserved in this diff rendering …]
-                CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
-            } else {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const int64_t i13 = i03%ne13;
-                    const int64_t i12 = i02%ne12;
-                    const int64_t i11 = i01%ne11;
-                    const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
-
-                    cl_int x_offset = i01*ne00;
-                    cl_int y_offset = i1*ne10;
-                    cl_int d_offset = i01*ne00;
-
-                    // compute
-                    size_t global = ne00;
-                    cl_int ky = ne10;
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
-                    CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
-                }
-            }
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
+
+            const int64_t i13 = i03%ne13;
+            const int64_t i12 = i02%ne12;
+            const int i1 = i13*ne12*ne11 + i12*ne11;
+
+            cl_int x_offset = 0;
+            cl_int y_offset = i1*ne10;
+            cl_int d_offset = 0;
+
+            size_t global = ne00 * ne01;
+            cl_int ky = ne10 * ne11;
+
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+            CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));

             CL_CHECK(clReleaseEvent(ev));
             CL_CHECK(clFinish(queue));
@@ -1566,7 +1539,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     ggml_cl_pool_free(d_D, d_size);
 }

-static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
+static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
     GGML_ASSERT(fp16_support);

     const int64_t ne00 = src0->ne[0];
@@ -1596,6 +1569,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     const int y_ne = ne11 * ne10;
     const int d_ne = ne11 * ne01;

+    GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne);
+    GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne);
+    ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata;
+
     size_t x_size;
     size_t y_size;
     size_t d_size;
@@ -1632,7 +1609,6 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr

             // convert src1 to fp16
             // TODO: use multiple threads
-            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
             char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
             if (src1_cont_rows) {
                 if (src1_cont_cols) {
@@ -1704,7 +1680,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     const ggml_type type = src0->type;
-    const bool mul_mat_vec = ne11 == 1;
+    const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;

     const int64_t r2 = ne12 / ne02;
     const int64_t r3 = ne13 / ne03;
@@ -1737,7 +1713,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     GGML_ASSERT(to_fp32_cl != nullptr);

     const size_t global_denom = ggml_cl_global_denom(type);
-    const size_t local = ggml_cl_local_size(type);
+    const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);

     size_t ev_idx = 0;
     std::vector<cl_event> events;
@@ -1770,8 +1746,8 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));

                 // compute
-                const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
-                const size_t local = CL_DMMV_BLOCK_SIZE;
+                const size_t global = ne01 * local;
+                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                 const cl_int ncols = ne00;
                 events.emplace_back();
                 CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1779,7 +1755,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                 CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
                 CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
                 CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
             } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                 // convert src0 to fp32 on device
                 const size_t global = x_ne / global_denom;
@@ -1895,8 +1871,8 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor *
 }

 size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
-        return …
+    if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
+        return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]);
     }
     return 0;
 }
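Note on the ggml-opencl.cpp hunks above: the dequantize_mul_mat_vec kernels now add get_global_offset(0) to their block index, and the host passes an explicit global work offset and local size to clEnqueueNDRangeKernel, so a GPU-resident src0 can be addressed at a row/block offset without re-pointing the cl_mem. A minimal host-side sketch of that OpenCL pattern follows; the helper name and sizes are illustrative and not taken from the package:

#include <CL/cl.h>

// Illustrative helper: enqueue a 1-D kernel so that get_global_offset(0)
// inside the kernel returns first_item instead of 0.
static cl_int enqueue_with_global_offset(cl_command_queue queue, cl_kernel kernel,
                                         size_t first_item, size_t n_items, size_t local_size) {
    return clEnqueueNDRangeKernel(queue, kernel, 1,
                                  &first_item,  // global work offset
                                  &n_items,     // global work size
                                  &local_size,  // local work size
                                  0, NULL, NULL);
}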
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -162,40 +162,16 @@ typedef void * thread_ret_t;

 #define GGML_PRINT(...) printf(__VA_ARGS__)

+//
+// end of logging block
+//
+
 #ifdef GGML_USE_ACCELERATE
 // uncomment to use vDSP for soft max computation
 // note: not sure if it is actually faster
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif

-//
-// logging
-//
-
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
-//
-// end of logging block
-//
-
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
@@ -4951,6 +4927,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     *result = (struct ggml_tensor) {
         /*.type =*/ type,
         /*.backend =*/ GGML_BACKEND_CPU,
+        /*.buffer =*/ NULL,
         /*.n_dims =*/ n_dims,
         /*.ne =*/ { 1, 1, 1, 1 },
         /*.nb =*/ { 0, 0, 0, 0 },
@@ -5517,6 +5494,39 @@ struct ggml_tensor * ggml_view_tensor(
     return result;
 }

+struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
+    struct ggml_object * obj = ctx->objects_begin;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        if (obj->type == GGML_OBJECT_TENSOR) {
+            return (struct ggml_tensor *)(mem_buffer + obj->offs);
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
+struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+    struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
+    obj = obj->next;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        if (obj->type == GGML_OBJECT_TENSOR) {
+            return (struct ggml_tensor *)(mem_buffer + obj->offs);
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
 struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
     struct ggml_object * obj = ctx->objects_begin;

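The hunk above introduces ggml_get_first_tensor and ggml_get_next_tensor for walking every tensor allocated in a ggml_context. A minimal usage sketch, assuming a context that already holds tensors; the helper below is illustrative and not part of the package:

#include <stdio.h>
#include "ggml.h"

// Print the name and element count of every tensor in a context
// using the iteration API added in this release.
static void print_context_tensors(struct ggml_context * ctx) {
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL;
         t = ggml_get_next_tensor(ctx, t)) {
        printf("%s: %lld elements\n", t->name, (long long) ggml_nelements(t));
    }
}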
@@ -8670,6 +8680,7 @@ void ggml_set_param(

     GGML_ASSERT(tensor->grad == NULL);
     tensor->grad = ggml_dup_tensor(ctx, tensor);
+    ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
 }

 // ggml_compute_forward_dup
@@ -11256,7 +11267,7 @@ static void ggml_compute_forward_silu_f32(

 #ifndef NDEBUG
     for (int k = 0; k < nc; k++) {
-        const float x = ((float *) ((char *) dst->data + i1*( …
+        const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
         UNUSED(x);
         assert(!isnan(x));
         assert(!isinf(x));
@@ -13082,24 +13093,22 @@ static void ggml_compute_forward_alibi_f32(
         return;
     }

-    const int n_past = ((int32_t *) dst->op_params)[0];
+    //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

-    [… 2 removed lines not preserved in this diff rendering …]
-    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int ne1 = src0->ne[1]; // seq_len_without_past
-    const int ne2 = src0->ne[2]; // n_head -> this is k
-    //const int ne3 = src0->ne[3]; // 1 -> bsz
+    const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
+    const int64_t ne1 = src0->ne[1]; // seq_len_without_past
+    const int64_t ne2 = src0->ne[2]; // n_head -> this is k
+    //const int64_t ne3 = src0->ne[3]; // 1 -> bsz

-    const int n = ggml_nrows(src0);
-    const int ne2_ne3 = n/ne1; // ne2*ne3
+    const int64_t n = ggml_nrows(src0);
+    const int64_t ne2_ne3 = n/ne1; // ne2*ne3

-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
+    const size_t nb0 = src0->nb[0];
+    const size_t nb1 = src0->nb[1];
+    const size_t nb2 = src0->nb[2];
     //const int nb3 = src0->nb[3];

     GGML_ASSERT(nb0 == sizeof(float));
@@ -13111,9 +13120,9 @@ static void ggml_compute_forward_alibi_f32(
     const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

-    for (int i = 0; i < ne0; i++) {
-        for (int j = 0; j < ne1; j++) {
-            for (int k = 0; k < ne2_ne3; k++) {
+    for (int64_t i = 0; i < ne0; i++) {
+        for (int64_t j = 0; j < ne1; j++) {
+            for (int64_t k = 0; k < ne2_ne3; k++) {
                 float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
                 float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);

@@ -13128,7 +13137,6 @@ static void ggml_compute_forward_alibi_f32(
                 }

                 pdst[0] = i * m_k + src[0];
-
             }
         }
     }
@@ -13529,7 +13537,7 @@ static void ggml_compute_forward_rope_f16(
                 dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
                 dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
             }
-        } if (!is_neox) {
+        } else if (!is_neox) {
             for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                 const float cos_theta = cosf(theta);
                 const float sin_theta = sinf(theta);
@@ -14454,7 +14462,7 @@ static void ggml_compute_forward_conv_2d_f16_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);

-    GGML_TENSOR_BINARY_OP_LOCALS
+    GGML_TENSOR_BINARY_OP_LOCALS;

     const int ith = params->ith;
     const int nth = params->nth;
@@ -19162,6 +19170,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {

                 if (idx == -1) {
                     fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
+                    fclose(fout);
                     return;
                 }

@@ -20203,6 +20212,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         ggml_vec_cpy_f32(nx, xp, x);
         ggml_vec_cpy_f32(nx, gp, g);

+        // TODO: instead of passing &cancel here, use the return code of the linesearch
+        //       to determine if the optimization should be cancelled
+        //       this is a simple change, but not doing this atm, since I don't have a nice
+        //       way to test and don't want to break something with so many changes lined up
         ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
         if (cancel) {
             return GGML_OPT_CANCEL;
@@ -20832,7 +20845,7 @@ struct gguf_kv {
 };

 struct gguf_header {
-    uint32_t magic;
+    char magic[4];
     uint32_t version;
     uint64_t n_tensors; // GGUFv2
     uint64_t n_kv;      // GGUFv2
@@ -20902,7 +20915,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset)
 struct gguf_context * gguf_init_empty(void) {
     struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));

-    ctx->header.magic = GGUF_MAGIC;
+    memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
     ctx->header.version = GGUF_VERSION;
     ctx->header.n_tensors = 0;
     ctx->header.n_kv = 0;
@@ -20928,16 +20941,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     // offset from start of file
     size_t offset = 0;

-    [… 1 removed line not preserved in this diff rendering …]
+    char magic[4];

     // check the magic before making allocations
     {
         gguf_fread_el(file, &magic, sizeof(magic), &offset);

-        [… 4 removed lines not preserved in this diff rendering …]
+        for (uint32_t i = 0; i < sizeof(magic); i++) {
+            if (magic[i] != GGUF_MAGIC[i]) {
+                fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
+                fclose(file);
+                return NULL;
+            }
         }
     }

@@ -20947,7 +20962,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

     // read the header
     {
-        ctx->header.magic = magic;
+        strncpy(ctx->header.magic, magic, 4);
+

         ctx->kv = NULL;
         ctx->infos = NULL;
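The gguf hunks above change the GGUF header magic from a 32-bit value to the four characters "GGUF" and validate it byte by byte before any allocation, returning NULL on mismatch. A hedged sketch of reading a model's metadata through gguf_init_from_file as declared in this version's ggml.h; the file name is illustrative:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_context * meta = NULL;
    struct gguf_init_params params = {
        /*.no_alloc =*/ true,  // read metadata only
        /*.ctx      =*/ &meta,
    };

    // Returns NULL when the first four bytes of the file are not "GGUF".
    struct gguf_context * gguf = gguf_init_from_file("model.gguf", params);
    if (gguf == NULL) {
        fprintf(stderr, "not a GGUF file\n");
        return 1;
    }

    printf("gguf version %d, %d tensors\n", gguf_get_version(gguf), (int) gguf_get_n_tensors(gguf));

    gguf_free(gguf);
    ggml_free(meta);
    return 0;
}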
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -231,8 +231,9 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1

-#define GGUF_MAGIC   0x46554747 // "GGUF"
-#define GGUF_VERSION 2
+#define GGUF_MAGIC "GGUF"
+
+#define GGUF_VERSION 3

 #define GGUF_DEFAULT_ALIGNMENT 32

@@ -326,7 +327,7 @@ extern "C" {
         GGML_TYPE_COUNT,
     };

-    enum ggml_backend {
+    enum ggml_backend_type {
         GGML_BACKEND_CPU = 0,
         GGML_BACKEND_GPU = 10,
         GGML_BACKEND_GPU_SPLIT = 20,
@@ -479,8 +480,10 @@ extern "C" {

     // n-dimensional tensor
     struct ggml_tensor {
-        enum ggml_type    type;
-        enum ggml_backend backend;
+        enum ggml_type         type;
+        enum ggml_backend_type backend;
+
+        struct ggml_backend_buffer * buffer;

         int n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -514,7 +517,7 @@ extern "C" {

         void * extra; // extra things e.g. for ggml-cuda.cu

-        char padding[4];
+        char padding[12];
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -702,6 +705,9 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);

+    // Context tensor enumeration and lookup
+    GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
+    GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@@ -1358,7 +1364,7 @@ extern "C" {

     // alibi position embedding
     // in-place, returns view(a)
-    struct ggml_tensor * ggml_alibi(
+    GGML_API struct ggml_tensor * ggml_alibi(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int n_past,
@@ -1367,7 +1373,7 @@ extern "C" {

     // clamp
     // in-place, returns view(a)
-    struct ggml_tensor * ggml_clamp(
+    GGML_API struct ggml_tensor * ggml_clamp(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             float min,
@@ -2102,7 +2108,7 @@ extern "C" {
         enum ggml_type vec_dot_type;
     } ggml_type_traits_t;

-    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);

 #ifdef __cplusplus
 }
data/ext/llama_cpp/src/k_quants.c
CHANGED
@@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
-#if !defined(__riscv)
+#if !defined(__riscv) && !defined(__s390__)
 #include <immintrin.h>
 #endif
 #endif
@@ -462,12 +462,9 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
 }

 size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
-    [… 2 removed lines not preserved in this diff rendering …]
-    // TODO - collect histograms - although, at a second thought, I don't really care about them
-    (void)hist;
+    (void)hist; // TODO: collect histograms

-    for (int j = 0; j < …
+    for (int j = 0; j < n; j += k) {
         block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
         quantize_row_q2_K_reference(src + j, y, k);
     }
@@ -678,12 +675,9 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
 }

 size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
-    [… 2 removed lines not preserved in this diff rendering …]
-    // TODO - collect histograms - although, at a second thought, I don't really care about them
-    (void)hist;
+    (void)hist; // TODO: collect histograms

-    for (int j = 0; j < …
+    for (int j = 0; j < n; j += k) {
         block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
         quantize_row_q3_K_reference(src + j, y, k);
     }
@@ -846,9 +840,9 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {

 size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
     assert(k % QK_K == 0);
-    const int nb = k / QK_K;
     (void)hist; // TODO: collect histograms
-    [… 1 removed line not preserved in this diff rendering …]
+
+    for (int j = 0; j < n; j += k) {
         block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
         quantize_row_q4_K_reference(src + j, y, k);
     }
@@ -1052,9 +1046,9 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {

 size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
     assert(k % QK_K == 0);
-    [… 2 removed lines not preserved in this diff rendering …]
-    for (int j = 0; j < …
+    (void)hist; // TODO: collect histograms
+
+    for (int j = 0; j < n; j += k) {
         block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
         quantize_row_q5_K_reference(src + j, y, k);
     }
@@ -1200,11 +1194,9 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {

 size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK_K == 0);
-    [… 2 removed lines not preserved in this diff rendering …]
-    (void)hist; // TODO
+    (void)hist; // TODO: collect histograms

-    for (int j = 0; j < …
+    for (int j = 0; j < n; j += k) {
         block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
         quantize_row_q6_K_reference(src + j, y, k);
     }
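The k_quants.c hunks above make the ggml_quantize_q*_K entry points uniform: a single (void)hist note and a for (int j = 0; j < n; j += k) chunk loop. A hedged sketch of calling one of them through its public ggml.h declaration; the buffer sizes and the QK_K == 256 assumption are illustrative:

#include <stdint.h>
#include <stdio.h>
#include "ggml.h"

int main(void) {
    enum { N = 512 };            // two 256-element super-blocks (default QK_K)
    float src[N];
    for (int i = 0; i < N; i++) {
        src[i] = (float) i / N;
    }

    uint8_t dst[N];              // q2_K output is well under one byte per value
    int64_t hist[16] = {0};      // currently unused by the K-quant quantizers

    // With n == k the chunk loop in ggml_quantize_q2_K runs exactly once.
    size_t written = ggml_quantize_q2_K(src, dst, N, N, hist);
    printf("quantized %d floats into %zu bytes\n", N, written);
    return 0;
}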