llama_cpp 0.7.0 → 0.8.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +41 -21
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +500 -78
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +396 -127
- data/ext/llama_cpp/src/ggml-metal.metal +290 -46
- data/ext/llama_cpp/src/ggml-opencl.cpp +47 -71
- data/ext/llama_cpp/src/ggml.c +71 -55
- data/ext/llama_cpp/src/ggml.h +15 -9
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +1851 -250
- data/ext/llama_cpp/src/llama.h +18 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -4
- metadata +5 -3
data/ext/llama_cpp/src/ggml-opencl.cpp
CHANGED
@@ -19,7 +19,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-#define CL_DMMV_BLOCK_SIZE 32
+#define CL_DMMV_LOCAL_SIZE 32
 
 #ifndef K_QUANTS_PER_ITERATION
 #define K_QUANTS_PER_ITERATION 1
@@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx,
     const int row = get_group_id(0);
 
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     __global const struct block_q2_K * x = xx + ib0;
 
@@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx,
     const int row = get_group_id(0);
 
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     __global const struct block_q3_K * x = xx + ib0;
 
@@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx,
 
     const int row = get_group_id(0);
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION;  // 0...15
     const int ix  = get_local_id(0)%K_QUANTS_PER_ITERATION;
@@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx,
 
     const int row = get_group_id(0);
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     const int tid = get_local_id(0)/2;  // 0...15
     const int ix  = get_local_id(0)%2;
@@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
     const int row = get_group_id(0);
 
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     __global const struct block_q6_K * x = xx + ib0;
 
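All five k-quant dmmv kernels get the same one-line change: the starting block index ib0 now adds get_global_offset(0). This lets the host aim a single kernel launch at any 2D plane of a GPU-resident weight buffer by passing a global work offset, instead of copying each plane into a separate buffer first. A minimal host-side sketch of how such an offset reaches the kernel (illustrative names, not the gem's code):

    const size_t local  = 32;                           // CL_DMMV_LOCAL_SIZE
    const size_t global = n_rows * local;               // one work-group per row
    const size_t offset = plane_idx * blocks_per_plane; // first block of the plane
    // the non-NULL global_work_offset argument is what the kernel reads back
    // through get_global_offset(0)
    CL_CHECK(clEnqueueNDRangeKernel(queue, dmmv_kernel, 1, &offset, &global, &local, 0, NULL, NULL));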
@@ -745,19 +745,21 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
 
 std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
-    const int block_size = get_local_size(0);
+    const int local_size = get_local_size(0);
     const int row = get_group_id(0);
     const int tid = get_local_id(0);
 
     const uint qk = QUANT_K;
     const uint qr = QUANT_R;
 
+    const int col_step = local_size * 2;
     const int y_offset = qr == 1 ? 1 : qk/2;
 
+    x += get_global_offset(0);
+
     tmp[tid] = 0;
 
-    for (int i = 0; i < ncols/block_size; i += 2) {
-        const int col = i*block_size + 2*tid;
+    for (int col = tid*2; col < ncols; col += col_step) {
         const int ib = (row*ncols + col)/qk; // block index
         const int iqs = (col%qk)/qr; // quant index
         const int iybs = col - col%qk; // y block start index
@@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
 
     // sum up partial sums and write back result
     barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=block_size/2; s>0; s>>=1) {
+    for (int s=local_size/2; s>0; s>>=1) {
         if (tid < s) {
             tmp[tid] += tmp[tid + s];
         }
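Besides the block_size to local_size rename, the rewritten column loop relaxes an alignment requirement: each work-item starts at its own column pair and strides by col_step = 2*local_size, so ncols only needs to be even instead of a multiple of 2*local_size. An illustrative CPU model of the new mapping (assuming local_size = 32):

    // work-item `tid` covers columns {2*tid, 2*tid+1}, then jumps 64 columns
    // ahead; a trailing partial stride is handled naturally by the bound
    for (int col = tid*2; col < ncols; col += 2*32 /* col_step */) {
        // dequantize the two values at col, col+1 and accumulate into tmp[tid]
    }

This is also why the host-side change further down only routes matrices with ne00 % 2 == 0 through this kernel.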
@@ -1393,75 +1395,46 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-    const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
     const int64_t ne12 = src1->ne[2];
     const int64_t ne13 = src1->ne[3];
-    const int64_t nb10 = src1->nb[0];
     const int nb2  = dst->nb[2];
     const int nb3  = dst->nb[3];
     size_t x_size;
     size_t d_size;
 
-    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
+    cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
     cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
-    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+    cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
 
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
-            const int i0 = i03*ne02 + i02;
-
             cl_event ev;
 
             // copy src0 to device
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
-
-            if (nb10 == sizeof(float)) {
-                // Contiguous, avoid overhead from queueing many kernel runs
-                const int64_t i13 = i03%ne13;
-                const int64_t i12 = i02%ne12;
-                const int i1 = i13*ne12*ne11 + i12*ne11;
-
-                cl_int x_offset = 0;
-                cl_int y_offset = i1*ne10;
-                cl_int d_offset = 0;
-
-                size_t global = ne00 * ne01;
-                cl_int ky = ne10 * ne11;
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
-            } else {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const int64_t i13 = i03%ne13;
-                    const int64_t i12 = i02%ne12;
-                    const int64_t i11 = i01%ne11;
-                    const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
-
-                    cl_int x_offset = i01*ne00;
-                    cl_int y_offset = i1*ne10;
-                    cl_int d_offset = i01*ne00;
-
-                    // compute
-                    size_t global = ne00;
-                    cl_int ky = ne10;
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
-                    CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
-                }
-            }
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
+
+            const int64_t i13 = i03%ne13;
+            const int64_t i12 = i02%ne12;
+            const int i1 = i13*ne12*ne11 + i12*ne11;
+
+            cl_int x_offset = 0;
+            cl_int y_offset = i1*ne10;
+            cl_int d_offset = 0;
+
+            size_t global = ne00 * ne01;
+            cl_int ky = ne10 * ne11;
+
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+            CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
 
             CL_CHECK(clReleaseEvent(ev));
             CL_CHECK(clFinish(queue));
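The restructured ggml_cl_mul_f32 deletes the per-row fallback entirely: one kernel launch now covers a whole ne00*ne01 plane of src0, and broadcasting over src1 is folded into the index arithmetic by passing ky = ne10*ne11. In effect the kernel computes something like this CPU sketch (an approximation of the behavior, not the actual OpenCL kernel source):

    // one launch per (i02, i03) plane; the src1 plane wraps every ky elements
    for (size_t i = 0; i < (size_t)(ne00*ne01); i++) {
        dst[i] = x[i] * y[y_offset + i % ky];
    }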
@@ -1566,7 +1539,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     ggml_cl_pool_free(d_D, d_size);
 }
 
-static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
+static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
     GGML_ASSERT(fp16_support);
 
     const int64_t ne00 = src0->ne[0];
@@ -1596,6 +1569,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     const int y_ne = ne11 * ne10;
     const int d_ne = ne11 * ne01;
 
+    GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne);
+    GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne);
+    ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata;
+
     size_t x_size;
     size_t y_size;
     size_t d_size;
@@ -1632,7 +1609,6 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 
             // convert src1 to fp16
             // TODO: use multiple threads
-            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
             char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
             if (src1_cont_rows) {
                 if (src1_cont_cols) {
@@ -1704,7 +1680,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     const ggml_type type = src0->type;
-    const bool mul_mat_vec = ne11 == 1;
+    const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;
 
     const int64_t r2 = ne12 / ne02;
     const int64_t r3 = ne13 / ne03;
@@ -1737,7 +1713,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     GGML_ASSERT(to_fp32_cl != nullptr);
 
     const size_t global_denom = ggml_cl_global_denom(type);
-    const size_t local = ggml_cl_local_size(type);
+    const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);
 
     size_t ev_idx = 0;
     std::vector<cl_event> events;
@@ -1770,8 +1746,8 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
 
             // compute
-            const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
-            const size_t local = CL_DMMV_BLOCK_SIZE;
+            const size_t global = ne01 * local;
+            const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
             const cl_int ncols = ne00;
             events.emplace_back();
             CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1779,7 +1755,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
             CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
             CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-            CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+            CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
         } else { // general dequantization kernel + CLBlast matrix matrix multiplication
             // convert src0 to fp32 on device
             const size_t global = x_ne / global_denom;
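This is the consumer of the new get_global_offset(0) in the kernels above: when src0 is GGML_BACKEND_GPU, every (i02, i03) plane of the quantized weights lives in one cl_mem, and the enqueue offset selects the plane. x_bps is presumably the number of quantized blocks per 2D slice; under that assumption the arithmetic works out to:

    const size_t x_bps  = (ne00 / QK_K) * ne01;       // blocks per i02/i03 slice (assumed)
    const size_t offset = (i03 * ne02 + i02) * x_bps; // first block of this slice
    // passed as clEnqueueNDRangeKernel's global_work_offset, so the kernel's
    // ib0 = row*num_blocks_per_row + get_global_offset(0) lands in the right plane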
@@ -1895,8 +1871,8 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor *
 }
 
 size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
-        return ggml_nelements(src1) * sizeof(ggml_fp16_t);
+    if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
+        return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]);
     }
     return 0;
 }
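The scratch-buffer contract changes to match: ggml_cl_mul_mat_f16 now stages one (i12, i13) slice at a time through the same region of wdata, so the caller sizes it once for the larger of the two fp16 uses rather than for all slices at once. With illustrative shapes, src1 = dst = 4096 x 512:

    // wsize = sizeof(ggml_fp16_t) * max(4096*512, 4096*512) = 4 MiB,
    // reused for every (i12, i13) slice instead of growing with ne12*ne13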
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -162,40 +162,16 @@ typedef void * thread_ret_t;
 
 #define GGML_PRINT(...) printf(__VA_ARGS__)
 
+//
+// end of logging block
+//
+
 #ifdef GGML_USE_ACCELERATE
 // uncomment to use vDSP for soft max computation
 // note: not sure if it is actually faster
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif
 
-//
-// logging
-//
-
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
-//
-// end of logging block
-//
-
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
@@ -4951,6 +4927,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     *result = (struct ggml_tensor) {
         /*.type         =*/ type,
         /*.backend      =*/ GGML_BACKEND_CPU,
+        /*.buffer       =*/ NULL,
         /*.n_dims       =*/ n_dims,
         /*.ne           =*/ { 1, 1, 1, 1 },
         /*.nb           =*/ { 0, 0, 0, 0 },
@@ -5517,6 +5494,39 @@ struct ggml_tensor * ggml_view_tensor(
     return result;
 }
 
+struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
+    struct ggml_object * obj = ctx->objects_begin;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        if (obj->type == GGML_OBJECT_TENSOR) {
+            return (struct ggml_tensor *)(mem_buffer + obj->offs);
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
+struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+    struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
+    obj = obj->next;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        if (obj->type == GGML_OBJECT_TENSOR) {
+            return (struct ggml_tensor *)(mem_buffer + obj->offs);
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
 struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
     struct ggml_object * obj = ctx->objects_begin;
 
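ggml_get_first_tensor and ggml_get_next_tensor let callers enumerate every tensor allocated in a context without knowing names in advance. A minimal usage sketch (my example, not from the gem):

    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL;
         t = ggml_get_next_tensor(ctx, t)) {
        printf("%-32s %8.2f KiB\n", t->name, ggml_nbytes(t)/1024.0);
    }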
@@ -8670,6 +8680,7 @@ void ggml_set_param(
 
     GGML_ASSERT(tensor->grad == NULL);
     tensor->grad = ggml_dup_tensor(ctx, tensor);
+    ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
 }
 
 // ggml_compute_forward_dup
@@ -11256,7 +11267,7 @@ static void ggml_compute_forward_silu_f32(
 
 #ifndef NDEBUG
         for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
             UNUSED(x);
             assert(!isnan(x));
             assert(!isinf(x));
@@ -13082,24 +13093,22 @@ static void ggml_compute_forward_alibi_f32(
         return;
     }
 
-    const int n_past = ((int32_t *) dst->op_params)[0];
+    //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
 
-    assert(n_past >= 0);
-
-    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int ne1 = src0->ne[1]; // seq_len_without_past
-    const int ne2 = src0->ne[2]; // n_head -> this is k
-    //const int ne3 = src0->ne[3]; // 1 -> bsz
+    const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
+    const int64_t ne1 = src0->ne[1]; // seq_len_without_past
+    const int64_t ne2 = src0->ne[2]; // n_head -> this is k
+    //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
 
-    const int n       = ggml_nrows(src0);
-    const int ne2_ne3 = n/ne1; // ne2*ne3
+    const int64_t n       = ggml_nrows(src0);
+    const int64_t ne2_ne3 = n/ne1; // ne2*ne3
 
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
+    const size_t nb0 = src0->nb[0];
+    const size_t nb1 = src0->nb[1];
+    const size_t nb2 = src0->nb[2];
     //const int nb3 = src0->nb[3];
 
     GGML_ASSERT(nb0 == sizeof(float));
@@ -13111,9 +13120,9 @@ static void ggml_compute_forward_alibi_f32(
     const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
 
-    for (int i = 0; i < ne0; i++) {
-        for (int j = 0; j < ne1; j++) {
-            for (int k = 0; k < ne2_ne3; k++) {
+    for (int64_t i = 0; i < ne0; i++) {
+        for (int64_t j = 0; j < ne1; j++) {
+            for (int64_t k = 0; k < ne2_ne3; k++) {
                 float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
                 float *      pdst = (float *)((char *) dst->data  + i*nb0 + j*nb1 + k*nb2);
 
@@ -13128,7 +13137,6 @@ static void ggml_compute_forward_alibi_f32(
                 }
 
                 pdst[0] = i * m_k + src[0];
-
             }
         }
     }
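For context, the per-head slope m_k that feeds pdst[0] = i * m_k + src[0] is the standard ALiBi recipe, computed between these two hunks from the m0/m1 bases above:

    // slope for head k (0-based), with n_heads_log2_floor = 2^floor(log2(n_head))
    float m_k;
    if (k < n_heads_log2_floor) {
        m_k = powf(m0, k + 1);
    } else {
        m_k = powf(m1, 2*(k - n_heads_log2_floor) + 1);
    }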
@@ -13529,7 +13537,7 @@ static void ggml_compute_forward_rope_f16(
                 dst_data[n_dims]     = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
                 dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
             }
-        } if (!is_neox) {
+        } else if (!is_neox) {
             for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                 const float cos_theta = cosf(theta);
                 const float sin_theta = sinf(theta);
@@ -14454,7 +14462,7 @@ static void ggml_compute_forward_conv_2d_f16_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    GGML_TENSOR_BINARY_OP_LOCALS
+    GGML_TENSOR_BINARY_OP_LOCALS;
 
     const int ith = params->ith;
     const int nth = params->nth;
@@ -19162,6 +19170,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
 
                 if (idx == -1) {
                     fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
+                    fclose(fout);
                     return;
                 }
 
@@ -20203,6 +20212,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         ggml_vec_cpy_f32(nx, xp, x);
         ggml_vec_cpy_f32(nx, gp, g);
 
+        // TODO: instead of passing &cancel here, use the return code of the linesearch
+        //       to determine if the optimization should be cancelled
+        //       this is a simple change, but not doing this atm, since I don't have a nice
+        //       way to test and don't want to break something with so many changes lined up
         ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
         if (cancel) {
             return GGML_OPT_CANCEL;
@@ -20832,7 +20845,7 @@ struct gguf_kv {
 };
 
 struct gguf_header {
-    uint32_t magic;
+    char magic[4];
     uint32_t version;
     uint64_t n_tensors; // GGUFv2
     uint64_t n_kv;      // GGUFv2
@@ -20902,7 +20915,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset)
 struct gguf_context * gguf_init_empty(void) {
     struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
 
-    ctx->header.magic = GGUF_MAGIC;
+    memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
     ctx->header.version   = GGUF_VERSION;
     ctx->header.n_tensors = 0;
     ctx->header.n_kv      = 0;
@@ -20928,16 +20941,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     // offset from start of file
     size_t offset = 0;
 
-    uint32_t magic = 0;
+    char magic[4];
 
     // check the magic before making allocations
     {
         gguf_fread_el(file, &magic, sizeof(magic), &offset);
 
-        if (magic != GGUF_MAGIC) {
-            fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
-            fclose(file);
-            return NULL;
+        for (uint32_t i = 0; i < sizeof(magic); i++) {
+            if (magic[i] != GGUF_MAGIC[i]) {
+                fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
+                fclose(file);
+                return NULL;
+            }
         }
     }
 
@@ -20947,7 +20962,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     // read the header
     {
-        ctx->header.magic = magic;
+        strncpy(ctx->header.magic, magic, 4);
+
 
         ctx->kv    = NULL;
         ctx->infos = NULL;
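Storing the magic as four raw bytes instead of a uint32_t makes the check byte-order independent. A standalone sketch of the same validation (assuming a FILE * already opened in binary mode; note that magic is not NUL-terminated, so the fprintf("%s", magic) in the hunk above can read past the array):

    char magic[4];
    if (fread(magic, 1, sizeof(magic), f) != sizeof(magic) ||
        memcmp(magic, GGUF_MAGIC, sizeof(magic)) != 0) {
        fclose(f);
        return NULL; // not a GGUF file
    }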
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -231,8 +231,9 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1
 
-#define GGUF_MAGIC   0x46554747 // "GGUF"
-#define GGUF_VERSION 2
+#define GGUF_MAGIC "GGUF"
+
+#define GGUF_VERSION 3
 
 #define GGUF_DEFAULT_ALIGNMENT 32
 
@@ -326,7 +327,7 @@ extern "C" {
         GGML_TYPE_COUNT,
     };
 
-    enum ggml_backend {
+    enum ggml_backend_type {
         GGML_BACKEND_CPU = 0,
         GGML_BACKEND_GPU = 10,
         GGML_BACKEND_GPU_SPLIT = 20,
@@ -479,8 +480,10 @@ extern "C" {
 
     // n-dimensional tensor
     struct ggml_tensor {
-        enum ggml_type    type;
-        enum ggml_backend backend;
+        enum ggml_type         type;
+        enum ggml_backend_type backend;
+
+        struct ggml_backend_buffer * buffer;
 
         int     n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -514,7 +517,7 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        char padding[4];
+        char padding[12];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
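The new buffer pointer is 8 bytes on 64-bit targets, and the padding bump from 4 to 12 appears intended to keep sizeof(struct ggml_tensor), captured by GGML_TENSOR_SIZE just above, a multiple of the 16-byte alignment (my reading of the change; the diff itself does not say so). A compile-time guard in that spirit:

    _Static_assert(sizeof(struct ggml_tensor) % 16 == 0,
                   "ggml_tensor size should stay a multiple of the alignment");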
@@ -702,6 +705,9 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
 
+    // Context tensor enumeration and lookup
+    GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
+    GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
 
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@@ -1358,7 +1364,7 @@ extern "C" {
 
     // alibi position embedding
     // in-place, returns view(a)
-    struct ggml_tensor * ggml_alibi(
+    GGML_API struct ggml_tensor * ggml_alibi(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   n_past,
|
|
1367
1373
|
|
1368
1374
|
// clamp
|
1369
1375
|
// in-place, returns view(a)
|
1370
|
-
struct ggml_tensor * ggml_clamp(
|
1376
|
+
GGML_API struct ggml_tensor * ggml_clamp(
|
1371
1377
|
struct ggml_context * ctx,
|
1372
1378
|
struct ggml_tensor * a,
|
1373
1379
|
float min,
|
@@ -2102,7 +2108,7 @@ extern "C" {
         enum ggml_type    vec_dot_type;
     } ggml_type_traits_t;
 
-    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
 
 #ifdef __cplusplus
 }
data/ext/llama_cpp/src/k_quants.c
CHANGED
@@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
-#if !defined(__riscv)
+#if !defined(__riscv) && !defined(__s390__)
 #include <immintrin.h>
 #endif
 #endif
@@ -462,12 +462,9 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
 }
 
 size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
-    const int nb = k / QK_K;
-
-    // TODO - collect histograms - although, at a second thought, I don't really care about them
-    (void)hist;
+    (void)hist; // TODO: collect histograms
 
-    for (int j = 0; j < nb; j += k) {
+    for (int j = 0; j < n; j += k) {
         block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
         quantize_row_q2_K_reference(src + j, y, k);
     }
@@ -678,12 +675,9 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
 }
 
 size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
-    const int nb = k / QK_K;
-
-    // TODO - collect histograms - although, at a second thought, I don't really care about them
-    (void)hist;
+    (void)hist; // TODO: collect histograms
 
-    for (int j = 0; j < nb; j += k) {
+    for (int j = 0; j < n; j += k) {
         block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
         quantize_row_q3_K_reference(src + j, y, k);
     }
@@ -846,9 +840,9 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
 
 size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
     assert(k % QK_K == 0);
-    const int nb = k / QK_K;
     (void)hist; // TODO: collect histograms
-    for (int j = 0; j < nb; j += k) {
+
+    for (int j = 0; j < n; j += k) {
         block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
         quantize_row_q4_K_reference(src + j, y, k);
     }
|
@@ -1052,9 +1046,9 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
|
|
1052
1046
|
|
1053
1047
|
size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
1054
1048
|
assert(k % QK_K == 0);
|
1055
|
-
|
1056
|
-
|
1057
|
-
for (int j = 0; j <
|
1049
|
+
(void)hist; // TODO: collect histograms
|
1050
|
+
|
1051
|
+
for (int j = 0; j < n; j += k) {
|
1058
1052
|
block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
|
1059
1053
|
quantize_row_q5_K_reference(src + j, y, k);
|
1060
1054
|
}
|
@@ -1200,11 +1194,9 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
|
|
1200
1194
|
|
1201
1195
|
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
|
1202
1196
|
assert(k % QK_K == 0);
|
1203
|
-
|
1204
|
-
|
1205
|
-
(void)hist; // TODO
|
1197
|
+
(void)hist; // TODO: collect histograms
|
1206
1198
|
|
1207
|
-
for (int j = 0; j <
|
1199
|
+
for (int j = 0; j < n; j += k) {
|
1208
1200
|
block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
|
1209
1201
|
quantize_row_q6_K_reference(src + j, y, k);
|
1210
1202
|
}
|
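The loop-bound fix repeated across all five ggml_quantize_q*_K helpers is easiest to see with concrete numbers. With n = 65536 input floats, chunk size k = 8192 and QK_K = 256, the deleted nb = k/QK_K is 32:

    // old: for (j = 0; j < 32;    j += 8192) -> body runs once; only the
    //      first 8192 values are quantized
    // new: for (j = 0; j < 65536; j += 8192) -> 8 iterations, whole buffer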