llama_cpp 0.8.0 → 0.9.0
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +81 -162
- data/ext/llama_cpp/src/ggml-cuda.cu +188 -20
- data/ext/llama_cpp/src/ggml-metal.m +13 -5
- data/ext/llama_cpp/src/ggml-metal.metal +9 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +161 -169
- data/ext/llama_cpp/src/ggml.c +362 -84
- data/ext/llama_cpp/src/ggml.h +8 -7
- data/ext/llama_cpp/src/llama.cpp +100 -95
- data/ext/llama_cpp/src/llama.h +16 -21
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +11 -12
- metadata +2 -2
data/ext/llama_cpp/src/ggml-opencl.cpp

@@ -1489,46 +1489,45 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
     size_t x_offset = 0;
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;
-
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
-
-            // copy data to device
-            if (src0->backend == GGML_BACKEND_GPU) {
-                x_offset = (i03 * ne02 + i02) * x_ne;
-            } else if (i02 != pi02 || i03 != pi03) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
-                pi02 = i02;
-                pi03 = i03;
-            }
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
-            CL_CHECK(clFinish(queue));
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                if (src0->backend == GGML_BACKEND_GPU) {
+                    x_offset = (i03 * ne02 + i02) * x_ne;
+                } else {
+                    // copy src0 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                }
 
-            // compute
-            cl_event ev_sgemm;
-            clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                                                       clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                       ne01, ne11, ne10,
-                                                       alpha,
-                                                       d_X, x_offset, ne00,
-                                                       d_Y, 0, ne10,
-                                                       beta,
-                                                       d_D, 0, ne01,
-                                                       &queue, &ev_sgemm);
-
-            if (status != clblast::StatusCode::kSuccess) {
-                GGML_ASSERT(false);
-            }
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    // copy src1 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+
+                    CL_CHECK(clFinish(queue));
+
+                    // compute
+                    cl_event ev_sgemm;
+                    clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
+                                                               clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                               ne01, ne11, ne10,
+                                                               alpha,
+                                                               d_X, x_offset, ne00,
+                                                               d_Y, 0, ne10,
+                                                               beta,
+                                                               d_D, 0, ne01,
+                                                               &queue, &ev_sgemm);
+
+                    if (status != clblast::StatusCode::kSuccess) {
+                        GGML_ASSERT(false);
+                    }
 
-            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                    // copy dst to host
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                }
+            }
         }
     }
 
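All three ggml-opencl.cpp hunks in this release make the same structural change: instead of walking src1's batch indices (i13, i12) and deriving the matching src0 slice with i03 = i13 / r3 and i02 = i12 / r2 (caching the last uploaded slice in pi02/pi03), the new code walks src0's slices (i03, i02) on the outside and the broadcast range of src1/dst indices on the inside. The sketch below is not part of the package; it is a minimal check, under assumed small dimensions, that both loop orders enumerate the same (i03, i02, i13, i12) combinations, which is what lets the src0 copy sit above the innermost i12 loop in the new version.

```cpp
// Illustrative sketch only (not gem code): compare the old and new iteration
// orders from the hunk above. ne02/ne03 are src0's batch dims, r2/r3 the
// broadcast factors, so ne12 = ne02*r2 and ne13 = ne03*r3.
#include <cstdint>
#include <cstdio>
#include <set>
#include <tuple>

int main() {
    const int64_t ne02 = 2, ne03 = 3, r2 = 2, r3 = 2;
    const int64_t ne12 = ne02 * r2, ne13 = ne03 * r3;

    using Key = std::tuple<int64_t, int64_t, int64_t, int64_t>; // (i03, i02, i13, i12)
    std::set<Key> old_order, new_order;

    // old order: iterate src1's indices and derive the src0 slice per iteration
    for (int64_t i13 = 0; i13 < ne13; i13++) {
        const int64_t i03 = i13 / r3;
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            const int64_t i02 = i12 / r2;
            old_order.insert({i03, i02, i13, i12});
        }
    }

    // new order: iterate src0's slices, then the src1/dst indices that broadcast onto them
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
            for (int64_t i02 = 0; i02 < ne02; i02++) {
                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                    new_order.insert({i03, i02, i13, i12});
                }
            }
        }
    }

    std::printf("tuples visited: %zu vs %zu, identical: %s\n",
                old_order.size(), new_order.size(),
                old_order == new_order ? "yes" : "no");
    return old_order == new_order ? 0 : 1;
}
```

Grouping the work by (i03, i02) is the point of the reordering: anything that depends only on the src0 slice, such as the host-to-device copy here or the dequantization in the quantized hunk further down, can be done once per slice visit instead of being re-checked inside the i12 loop.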
@@ -1589,73 +1588,70 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
 
     size_t x_offset = 0;
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;
-
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
 
-            // copy src0 to device
-            if (src0->backend == GGML_BACKEND_GPU) {
-                x_offset = (i03 * ne02 + i02) * x_ne;
-            } else if (i02 != pi02 || i03 != pi03) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
-                pi02 = i02;
-                pi03 = i03;
-            }
-
-            // convert src1 to fp16
-            // TODO: use multiple threads
-            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
-            if (src1_cont_rows) {
-                if (src1_cont_cols) {
-                    ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                if (src0->backend == GGML_BACKEND_GPU) {
+                    x_offset = (i03 * ne02 + i02) * x_ne;
+                } else {
+                    // copy src0 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                 }
-                else {
-                    for (int64_t i11 = 0; i11 < ne11; i11++) {
-                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
+
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    // convert src1 to fp16
+                    // TODO: use multiple threads
+                    char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
+                    if (src1_cont_rows) {
+                        if (src1_cont_cols) {
+                            ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+                        }
+                        else {
+                            for (int64_t i11 = 0; i11 < ne11; i11++) {
+                                ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
+                            }
+                        }
                     }
-                }
-            }
-            else {
-                for (int64_t i11 = 0; i11 < ne11; i11++) {
-                    for (int64_t i10 = 0; i10 < ne10; i10++) {
-                        // very slow due to no inlining
-                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
+                    else {
+                        for (int64_t i11 = 0; i11 < ne11; i11++) {
+                            for (int64_t i10 = 0; i10 < ne10; i10++) {
+                                // very slow due to no inlining
+                                tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
+                            }
+                        }
                     }
-                }
-            }
-
-            // copy src1 to device
-            CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
-
-            CL_CHECK(clFinish(queue));
 
-            // compute
-            cl_event ev_sgemm;
-            clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
-                                                       clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                       ne01, ne11, ne10,
-                                                       alpha,
-                                                       d_X, x_offset, ne00,
-                                                       d_Y, 0, ne10,
-                                                       beta,
-                                                       d_D, 0, ne01,
-                                                       &queue, &ev_sgemm);
-
-            if (status != clblast::StatusCode::kSuccess) {
-                GGML_ASSERT(false);
-            }
+                    // copy src1 to device
+                    CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
+
+                    CL_CHECK(clFinish(queue));
+
+                    // compute
+                    cl_event ev_sgemm;
+                    clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
+                                                               clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                               ne01, ne11, ne10,
+                                                               alpha,
+                                                               d_X, x_offset, ne00,
+                                                               d_Y, 0, ne10,
+                                                               beta,
+                                                               d_D, 0, ne01,
+                                                               &queue, &ev_sgemm);
+
+                    if (status != clblast::StatusCode::kSuccess) {
+                        GGML_ASSERT(false);
+                    }
 
-            // copy dst to host, then convert to float
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
+                    // copy dst to host, then convert to float
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
 
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
-            ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                    ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                }
+            }
         }
     }
 
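In the fp16 hunk above, how src1 is staged into the tmp buffer before upload depends on its memory layout: a single bulk ggml_fp32_to_fp16_row call when rows and columns are contiguous, a per-row call when only rows are contiguous, and a slow per-element fallback otherwise. The sketch below is illustrative only; pack_rows, the plain float staging, and the exact contiguity predicates are simplified stand-ins for the gem's fp32-to-fp16 conversion, not its code.

```cpp
// Illustrative sketch only: pick the cheapest way to pack a strided ne11 x ne10
// view into a contiguous staging buffer, mirroring the three branches above.
// A plain float copy stands in for the fp32 -> fp16 conversion.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

static void pack_rows(const char * src, int64_t ne10, int64_t ne11,
                      size_t nb10, size_t nb11, float * tmp) {
    const bool cont_rows = nb10 == sizeof(float);         // elements within a row are contiguous
    const bool cont_cols = nb11 == ne10 * sizeof(float);  // rows follow each other with no padding

    if (cont_rows && cont_cols) {
        // whole matrix in one shot
        std::memcpy(tmp, src, ne10 * ne11 * sizeof(float));
    } else if (cont_rows) {
        // row by row
        for (int64_t i11 = 0; i11 < ne11; i11++) {
            std::memcpy(tmp + i11 * ne10, src + i11 * nb11, ne10 * sizeof(float));
        }
    } else {
        // element by element (slowest path)
        for (int64_t i11 = 0; i11 < ne11; i11++) {
            for (int64_t i10 = 0; i10 < ne10; i10++) {
                tmp[i11 * ne10 + i10] = *(const float *) (src + i11 * nb11 + i10 * nb10);
            }
        }
    }
}

int main() {
    const int64_t ne10 = 4, ne11 = 3;
    std::vector<float> data(ne10 * ne11);
    for (size_t i = 0; i < data.size(); i++) data[i] = (float) i;

    std::vector<float> tmp(ne10 * ne11);
    pack_rows((const char *) data.data(), ne10, ne11,
              sizeof(float), ne10 * sizeof(float), tmp.data());
    return tmp == data ? 0 : 1;
}
```

After the GEMM, the same tmp buffer is reused in the opposite direction: the fp16 result is read back into it and ggml_fp16_to_fp32_row expands it into dst.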
@@ -1718,85 +1714,81 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t ev_idx = 0;
     std::vector<cl_event> events;
 
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;
-
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
-
-            // copy src0 to device if necessary
-            if (src0->backend == GGML_BACKEND_CPU) {
-                if (i02 != pi02 || i03 != pi03) {
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy and dequantize src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                // copy src0 to device if necessary
+                if (src0->backend == GGML_BACKEND_CPU) {
                     events.emplace_back();
                     CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-                    pi02 = i02;
-                    pi03 = i03;
-                }
-            } else if (src0->backend == GGML_BACKEND_GPU) {
-                d_Q = (cl_mem) src0->extra;
-            } else {
-                GGML_ASSERT(false);
-            }
-            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
-                // copy src1 to device
-                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
-
-                // compute
-                const size_t global = ne01 * local;
-                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                const cl_int ncols = ne00;
-                events.emplace_back();
-                CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
-                CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
-                CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
-                CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
-                CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
-            } else { // general dequantization kernel + CLBlast matrix matrix multiplication
-                // convert src0 to fp32 on device
-                const size_t global = x_ne / global_denom;
-                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
-                CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
-
-                // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
-
-                events.emplace_back();
-
-                // wait for conversion
-                CL_CHECK(clFinish(queue));
-
-                // compute
-                clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                                                           clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                           ne01, ne11, ne10,
-                                                           alpha,
-                                                           d_X, 0, ne00,
-                                                           d_Y, 0, ne10,
-                                                           beta,
-                                                           d_D, 0, ne01,
-                                                           &queue, events.data() + ev_idx++);
-
-                if (status != clblast::StatusCode::kSuccess) {
+                } else if (src0->backend == GGML_BACKEND_GPU) {
+                    d_Q = (cl_mem) src0->extra;
+                } else {
                     GGML_ASSERT(false);
                 }
-            }
 
-            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
-            for (auto *event : events) {
-                clReleaseEvent(event);
-            }
+                if (!mul_mat_vec) {
+                    // convert src0 to fp32 on device
+                    const size_t global = x_ne / global_denom;
+                    const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
+                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
+                    CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
+                }
 
-            ev_idx = 0;
-            events.clear();
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
+                        // copy src1 to device
+                        events.emplace_back();
+                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
+
+                        // compute
+                        const size_t global = ne01 * local;
+                        const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                        const cl_int ncols = ne00;
+                        events.emplace_back();
+                        CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
+                        CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
+                        CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
+                        CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
+                        CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
+                        CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+                    } else { // CLBlast matrix matrix multiplication
+                        // copy src1 to device
+                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+
+                        // wait for conversion
+                        CL_CHECK(clFinish(queue));
+
+                        // compute
+                        events.emplace_back();
+                        clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
+                                                                   clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                                   ne01, ne11, ne10,
+                                                                   alpha,
+                                                                   d_X, 0, ne00,
+                                                                   d_Y, 0, ne10,
+                                                                   beta,
+                                                                   d_D, 0, ne01,
+                                                                   &queue, events.data() + ev_idx++);
+
+                        if (status != clblast::StatusCode::kSuccess) {
+                            GGML_ASSERT(false);
+                        }
+                    }
+
+                    // copy dst to host
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+                    for (auto *event : events) {
+                        clReleaseEvent(event);
+                    }
+
+                    ev_idx = 0;
+                    events.clear();
+                }
+            }
         }
     }
 
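Besides the loop reordering, the quantized hunk above hoists the to_fp32_cl dequantization out of the innermost i12 loop: when the CLBlast branch is taken, src0 is now dequantized once per (i03, i13, i02) visit and the fp32 copy in d_X is reused for every i12 in the broadcast range, while the per-iteration event bookkeeping (clReleaseEvent, ev_idx = 0, events.clear()) stays inside that loop. The counter sketch below is illustrative, under assumed small dimensions, and models only the CLBlast branch, where the old placement ran the dequantization r2 times more often.

```cpp
// Illustrative sketch only: where the dequantize step sits relative to the
// broadcast loop, counted for both placements in the hunk above.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne02 = 2, ne03 = 2, r2 = 4, r3 = 1;
    const int64_t ne12 = ne02 * r2, ne13 = ne03 * r3;

    // old placement: dequantize src0 inside the i12 loop, once per output matrix
    int64_t old_dequant = 0, old_gemm = 0;
    for (int64_t i13 = 0; i13 < ne13; i13++) {
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            old_dequant++;
            old_gemm++;
        }
    }

    // new placement: dequantize once per (i03, i13, i02), reuse for the whole i12 range
    int64_t new_dequant = 0, new_gemm = 0;
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
            for (int64_t i02 = 0; i02 < ne02; i02++) {
                new_dequant++;
                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                    new_gemm++;
                }
            }
        }
    }

    std::printf("GEMMs: old=%lld new=%lld (unchanged)\n",
                (long long) old_gemm, (long long) new_gemm);
    std::printf("dequantize calls: old=%lld new=%lld (factor of r2 = %lld)\n",
                (long long) old_dequant, (long long) new_dequant, (long long) r2);
    return 0;
}
```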