llama_cpp 0.12.7 → 0.13.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/llama_cpp.cpp +72 -262
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -25
- data/vendor/tmp/llama.cpp/Makefile +8 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -2
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +96 -15
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1049 -38
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +25 -25
- data/vendor/tmp/llama.cpp/ggml-quants.c +1873 -218
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +292 -221
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +64 -52
- data/vendor/tmp/llama.cpp/ggml.c +318 -195
- data/vendor/tmp/llama.cpp/ggml.h +35 -19
- data/vendor/tmp/llama.cpp/llama.cpp +806 -531
- data/vendor/tmp/llama.cpp/llama.h +53 -65
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
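All 25 changed lines in data/vendor/tmp/llama.cpp/ggml-opencl.cpp shown below come from one mechanical substitution in the vendored llama.cpp sources: the backend enum values now carry a TYPE segment (GGML_BACKEND_TYPE_CPU, GGML_BACKEND_TYPE_GPU), replacing the shorter spellings used by the previously vendored copy. A minimal C sketch of a caller-side check written against the updated headers; the helper name is hypothetical and not part of the gem, and it assumes the vendored ggml.h still exposes the backend field on ggml_tensor, as the hunks below do:

```c
#include <stdbool.h>
#include "ggml.h"  // vendored header under data/vendor/tmp/llama.cpp

// Hypothetical helper, not part of llama_cpp or llama.cpp: reports whether a
// tensor's data has been offloaded to the device (OpenCL) buffer, using the
// renamed enum value introduced by this update.
static bool tensor_is_offloaded(const struct ggml_tensor * tensor) {
    // The 0.12.7 vendored sources appear to have spelled this GGML_BACKEND_GPU.
    return tensor->backend == GGML_BACKEND_TYPE_GPU;
}
```

Any extension code compiled against the old spellings needs the same rename; the hunks below show every occurrence touched in this file.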
data/vendor/tmp/llama.cpp/ggml-opencl.cpp (+25 -25)

@@ -1354,7 +1354,7 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
 }
 
 void ggml_cl_free_data(const struct ggml_tensor* tensor) {
-    if (tensor->backend != GGML_BACKEND_GPU) {
+    if (tensor->backend != GGML_BACKEND_TYPE_GPU) {
         return;
     }
 
@@ -1412,7 +1412,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
 }
 
 static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
@@ -1476,7 +1476,7 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
 }
 
 static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
@@ -1566,13 +1566,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+    if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
         d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
-    cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
-    cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
+    cl_mem d_Y = src1->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = dst->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
     size_t x_offset = 0;
 
@@ -1580,7 +1580,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         // TODO: copy src0 here when r3>1
         for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
-                if (src0->backend == GGML_BACKEND_GPU) {
+                if (src0->backend == GGML_BACKEND_TYPE_GPU) {
                     x_offset = (i03 * ne02 + i02) * x_ne;
                 } else {
                     // copy src0 to device
@@ -1589,7 +1589,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
 
                 for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                     // copy src1 to device
-                    if (src1->backend == GGML_BACKEND_CPU) {
+                    if (src1->backend == GGML_BACKEND_TYPE_CPU) {
                         CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
                     }
 
@@ -1612,7 +1612,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                     }
 
                     // copy dst to host
-                    if (dst->backend == GGML_BACKEND_CPU) {
+                    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
                         float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
                         CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
                     }
@@ -1621,13 +1621,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }
 
-    if (src0->backend != GGML_BACKEND_GPU) {
+    if (src0->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
-    if (src1->backend != GGML_BACKEND_GPU) {
+    if (src1->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_Y, y_size);
     }
-    if (dst->backend != GGML_BACKEND_GPU) {
+    if (dst->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_D, d_size);
     }
 }
@@ -1670,7 +1670,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+    if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
         d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
@@ -1687,7 +1687,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
         // TODO: copy src0 here when r3>1
         for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
-                if (src0->backend == GGML_BACKEND_GPU) {
+                if (src0->backend == GGML_BACKEND_TYPE_GPU) {
                     x_offset = (i03 * ne02 + i02) * x_ne;
                 } else {
                     // copy src0 to device
@@ -1741,7 +1741,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                     }
 
                     // copy dst to host, then convert to float
-                    if (dst->backend == GGML_BACKEND_CPU) {
+                    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
                         CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
                         float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
                         ggml_fp16_to_fp32_row(tmp, d, d_ne);
@@ -1753,7 +1753,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }
 
-    if (src0->backend != GGML_BACKEND_GPU) {
+    if (src0->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
     ggml_cl_pool_free(d_Y, y_size);
@@ -1798,7 +1798,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
     cl_mem d_Q;
-    if (src0->backend == GGML_BACKEND_CPU) {
+    if (src0->backend == GGML_BACKEND_TYPE_CPU) {
         d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
     }
 
@@ -1817,10 +1817,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
         for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
                 // copy src0 to device if necessary
-                if (src0->backend == GGML_BACKEND_CPU) {
+                if (src0->backend == GGML_BACKEND_TYPE_CPU) {
                     events.emplace_back();
                     CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-                } else if (src0->backend == GGML_BACKEND_GPU) {
+                } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
                     d_Q = (cl_mem) src0->extra;
                 } else {
                     GGML_ASSERT(false);
@@ -1829,7 +1829,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                 if (!mul_mat_vec) {
                     // convert src0 to fp32 on device
                     const size_t global = x_ne / global_denom;
-                    const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                    const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                     CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
                     CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
                     CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
@@ -1843,7 +1843,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 
                     // compute
                     const size_t global = ne01 * local;
-                    const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                    const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                     const cl_int ncols = ne00;
                     events.emplace_back();
                     CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1895,7 +1895,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     }
     ggml_cl_pool_free(d_Y, y_size);
     ggml_cl_pool_free(d_D, d_size);
-    if (src0->backend == GGML_BACKEND_CPU) {
+    if (src0->backend == GGML_BACKEND_TYPE_CPU) {
         ggml_cl_pool_free(d_Q, q_size);
     }
 }
@@ -1911,7 +1911,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
         dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU)) {
         return true;
     }
 
@@ -1993,7 +1993,7 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
     CL_CHECK(clFinish(queue));
 
     tensor->extra = dst;
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 }
 
 // ggml-backend
@@ -2045,7 +2045,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
         ctx->sub_buffers.push_back(sub_buffer);
         tensor->extra = sub_buffer;
     }
-    tensor->backend = GGML_BACKEND_GPU;
+    tensor->backend = GGML_BACKEND_TYPE_GPU;
 }
 
 static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {