llama_cpp 0.12.7 → 0.13.0
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/llama_cpp.cpp +72 -262
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -25
- data/vendor/tmp/llama.cpp/Makefile +8 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -2
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +96 -15
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1049 -38
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +25 -25
- data/vendor/tmp/llama.cpp/ggml-quants.c +1873 -218
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +292 -221
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +64 -52
- data/vendor/tmp/llama.cpp/ggml.c +318 -195
- data/vendor/tmp/llama.cpp/ggml.h +35 -19
- data/vendor/tmp/llama.cpp/llama.cpp +806 -531
- data/vendor/tmp/llama.cpp/llama.h +53 -65
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
@@ -1354,7 +1354,7 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
 }

 void ggml_cl_free_data(const struct ggml_tensor* tensor) {
-    if (tensor->backend != GGML_BACKEND_GPU) {
+    if (tensor->backend != GGML_BACKEND_TYPE_GPU) {
         return;
     }

@@ -1412,7 +1412,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
 }

 static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
@@ -1476,7 +1476,7 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
 }

 static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
@@ -1566,13 +1566,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+    if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
         d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
-    cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
-    cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
+    cl_mem d_Y = src1->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = dst->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

     size_t x_offset = 0;

@@ -1580,7 +1580,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         // TODO: copy src0 here when r3>1
         for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
-                if (src0->backend == GGML_BACKEND_GPU) {
+                if (src0->backend == GGML_BACKEND_TYPE_GPU) {
                     x_offset = (i03 * ne02 + i02) * x_ne;
                 } else {
                     // copy src0 to device
@@ -1589,7 +1589,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr

                 for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                     // copy src1 to device
-                    if (src1->backend == GGML_BACKEND_CPU) {
+                    if (src1->backend == GGML_BACKEND_TYPE_CPU) {
                         CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
                     }

@@ -1612,7 +1612,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                     }

                     // copy dst to host
-                    if (dst->backend == GGML_BACKEND_CPU) {
+                    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
                         float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
                         CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
                     }
@@ -1621,13 +1621,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }

-    if (src0->backend != GGML_BACKEND_GPU) {
+    if (src0->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
-    if (src1->backend != GGML_BACKEND_GPU) {
+    if (src1->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_Y, y_size);
     }
-    if (dst->backend != GGML_BACKEND_GPU) {
+    if (dst->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_D, d_size);
     }
 }
@@ -1670,7 +1670,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+    if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
         d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
@@ -1687,7 +1687,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
         // TODO: copy src0 here when r3>1
         for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
-                if (src0->backend == GGML_BACKEND_GPU) {
+                if (src0->backend == GGML_BACKEND_TYPE_GPU) {
                     x_offset = (i03 * ne02 + i02) * x_ne;
                 } else {
                     // copy src0 to device
@@ -1741,7 +1741,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                     }

                     // copy dst to host, then convert to float
-                    if (dst->backend == GGML_BACKEND_CPU) {
+                    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
                         CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
                         float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
                         ggml_fp16_to_fp32_row(tmp, d, d_ne);
@@ -1753,7 +1753,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }

-    if (src0->backend != GGML_BACKEND_GPU) {
+    if (src0->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
     ggml_cl_pool_free(d_Y, y_size);
@@ -1798,7 +1798,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
     cl_mem d_Q;
-    if (src0->backend == GGML_BACKEND_CPU) {
+    if (src0->backend == GGML_BACKEND_TYPE_CPU) {
         d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
     }

@@ -1817,10 +1817,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
         for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
                 // copy src0 to device if necessary
-                if (src0->backend == GGML_BACKEND_CPU) {
+                if (src0->backend == GGML_BACKEND_TYPE_CPU) {
                     events.emplace_back();
                     CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-                } else if (src0->backend == GGML_BACKEND_GPU) {
+                } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
                     d_Q = (cl_mem) src0->extra;
                 } else {
                     GGML_ASSERT(false);
@@ -1829,7 +1829,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                 if (!mul_mat_vec) {
                     // convert src0 to fp32 on device
                     const size_t global = x_ne / global_denom;
-                    const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                    const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                     CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
                     CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
                     CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
@@ -1843,7 +1843,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *

                     // compute
                     const size_t global = ne01 * local;
-                    const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                    const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                     const cl_int ncols = ne00;
                     events.emplace_back();
                     CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1895,7 +1895,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     }
     ggml_cl_pool_free(d_Y, y_size);
     ggml_cl_pool_free(d_D, d_size);
-    if (src0->backend == GGML_BACKEND_CPU) {
+    if (src0->backend == GGML_BACKEND_TYPE_CPU) {
         ggml_cl_pool_free(d_Q, q_size);
     }
 }
@@ -1911,7 +1911,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
         dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU)) {
         return true;
     }

@@ -1993,7 +1993,7 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
     CL_CHECK(clFinish(queue));

     tensor->extra = dst;
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 }

 // ggml-backend
@@ -2045,7 +2045,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
         ctx->sub_buffers.push_back(sub_buffer);
         tensor->extra = sub_buffer;
     }
-    tensor->backend = GGML_BACKEND_GPU;
+    tensor->backend = GGML_BACKEND_TYPE_GPU;
 }

 static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
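The recurring pattern in these ggml-opencl.cpp hunks is a dispatch on the tensor's backend type: when a tensor is already GPU-resident, its extra field is reused directly as the device buffer, otherwise a temporary buffer is taken from the pool, the host data is copied to the device, and only the temporaries are freed afterwards. Below is a minimal standalone C++ sketch of that pattern for illustration only; the tensor struct, backend_type enum, pool_malloc, pool_free, upload, and acquire_device_buffer names are stand-ins invented for this example and are not ggml's actual types or API.

// Sketch of the backend-type dispatch pattern seen in the hunks above.
// Mock types stand in for ggml_tensor / cl_mem; this is not the ggml API itself.
#include <cstdio>
#include <cstdlib>
#include <vector>

enum backend_type { BACKEND_TYPE_CPU, BACKEND_TYPE_GPU }; // mirrors the GGML_BACKEND_TYPE_* naming

struct tensor {
    backend_type       backend;
    void *             extra;  // device buffer handle when backend == BACKEND_TYPE_GPU
    std::vector<float> data;   // host data when backend == BACKEND_TYPE_CPU
};

// Stand-ins for ggml_cl_pool_malloc / ggml_cl_pool_free and the host-to-device copy.
static void * pool_malloc(size_t size)              { return std::malloc(size); }
static void   pool_free(void * buf)                 { std::free(buf); }
static void   upload(void * dst, const tensor & src) { (void) dst; (void) src; /* h2d copy would go here */ }

// Pick a device buffer for `t`: reuse t.extra if the tensor already lives on the
// GPU, otherwise take a temporary from the pool and copy the host data over.
static void * acquire_device_buffer(const tensor & t, bool & temporary) {
    if (t.backend == BACKEND_TYPE_GPU) {  // buffer already resident on the device
        temporary = false;
        return t.extra;
    }
    temporary = true;
    void * buf = pool_malloc(t.data.size() * sizeof(float));
    upload(buf, t);                       // copy src to device
    return buf;
}

int main() {
    tensor cpu_t = { BACKEND_TYPE_CPU, nullptr, std::vector<float>(16, 1.0f) };

    bool temporary = false;
    void * d_buf = acquire_device_buffer(cpu_t, temporary);

    // ... kernel launches would use d_buf here ...

    if (temporary) {                      // only pool-allocated buffers are returned
        pool_free(d_buf);
    }
    std::printf("used %s buffer\n", temporary ? "temporary pool" : "resident device");
    return 0;
}

The diff itself only renames the enum values used in these checks (the GGML_BACKEND_TYPE_* spellings appear on the added lines); the buffer-reuse logic is unchanged.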