llama_cpp 0.8.0 → 0.9.0

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -1489,46 +1489,45 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
     size_t x_offset = 0;
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;
-
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
-
-            // copy data to device
-            if (src0->backend == GGML_BACKEND_GPU) {
-                x_offset = (i03 * ne02 + i02) * x_ne;
-            } else if (i02 != pi02 || i03 != pi03) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
-                pi02 = i02;
-                pi03 = i03;
-            }
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
-            CL_CHECK(clFinish(queue));
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                if (src0->backend == GGML_BACKEND_GPU) {
+                    x_offset = (i03 * ne02 + i02) * x_ne;
+                } else {
+                    // copy src0 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                }
 
-            // compute
-            cl_event ev_sgemm;
-            clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                                                                 clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                                 ne01, ne11, ne10,
-                                                                 alpha,
-                                                                 d_X, x_offset, ne00,
-                                                                 d_Y, 0, ne10,
-                                                                 beta,
-                                                                 d_D, 0, ne01,
-                                                                 &queue, &ev_sgemm);
-
-            if (status != clblast::StatusCode::kSuccess) {
-                GGML_ASSERT(false);
-            }
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    // copy src1 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+
+                    CL_CHECK(clFinish(queue));
+
+                    // compute
+                    cl_event ev_sgemm;
+                    clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
+                                                                         clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                                         ne01, ne11, ne10,
+                                                                         alpha,
+                                                                         d_X, x_offset, ne00,
+                                                                         d_Y, 0, ne10,
+                                                                         beta,
+                                                                         d_D, 0, ne01,
+                                                                         &queue, &ev_sgemm);
+
+                    if (status != clblast::StatusCode::kSuccess) {
+                        GGML_ASSERT(false);
+                    }
 
-            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                    // copy dst to host
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                }
+            }
         }
     }
 
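Both sides of this hunk walk the same broadcast space; what changes is the loop nesting. Below is a minimal standalone sketch (not part of the package diff), assuming r2 = ne12/ne02 and r3 = ne13/ne03 as the broadcast ratios these functions use: it checks that the old order, which derives i02 and i03 by division, and the new order, which expands each i03 and i02 into their i13/i12 ranges, visit exactly the same (i03, i02, i13, i12) combinations. The new nesting drives the src0 slice upload directly from the (i13, i02) loops instead of tracking the previously uploaded slice through pi02/pi03.

// Standalone sketch, not from the diff. Assumes r2 = ne12/ne02 and
// r3 = ne13/ne03 and compares the two loop orders used in the hunk above.
#include <cstdint>
#include <cstdio>
#include <set>
#include <tuple>

int main() {
    const int64_t ne02 = 2, ne03 = 3;              // src0 batch dims (hypothetical sizes)
    const int64_t ne12 = 8, ne13 = 3;              // src1 batch dims
    const int64_t r2 = ne12 / ne02, r3 = ne13 / ne03;

    using Key = std::tuple<int64_t, int64_t, int64_t, int64_t>;
    std::set<Key> old_order, new_order;

    // old loops: iterate the src1 batches, derive the src0 batch by division
    for (int64_t i13 = 0; i13 < ne13; i13++) {
        const int64_t i03 = i13 / r3;
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            const int64_t i02 = i12 / r2;
            old_order.insert({i03, i02, i13, i12});
        }
    }

    // new loops: iterate the src0 batches, expand each into its broadcast range
    int64_t src0_uploads = 0;
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
            for (int64_t i02 = 0; i02 < ne02; i02++) {
                src0_uploads++;                    // one h2d copy of the src0 slice here
                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                    new_order.insert({i03, i02, i13, i12});
                }
            }
        }
    }

    printf("same index set: %s, src0 uploads: %lld\n",
           old_order == new_order ? "yes" : "no", (long long) src0_uploads);
    return 0;
}
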
@@ -1589,73 +1588,70 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
 
     size_t x_offset = 0;
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;
-
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
 
-            // copy src0 to device
-            if (src0->backend == GGML_BACKEND_GPU) {
-                x_offset = (i03 * ne02 + i02) * x_ne;
-            } else if (i02 != pi02 || i03 != pi03) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
-                pi02 = i02;
-                pi03 = i03;
-            }
-
-            // convert src1 to fp16
-            // TODO: use multiple threads
-            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
-            if (src1_cont_rows) {
-                if (src1_cont_cols) {
-                    ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                if (src0->backend == GGML_BACKEND_GPU) {
+                    x_offset = (i03 * ne02 + i02) * x_ne;
+                } else {
+                    // copy src0 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                 }
-                else {
-                    for (int64_t i11 = 0; i11 < ne11; i11++) {
-                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
+
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    // convert src1 to fp16
+                    // TODO: use multiple threads
+                    char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
+                    if (src1_cont_rows) {
+                        if (src1_cont_cols) {
+                            ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+                        }
+                        else {
+                            for (int64_t i11 = 0; i11 < ne11; i11++) {
+                                ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
+                            }
+                        }
                     }
-                }
-            }
-            else {
-                for (int64_t i11 = 0; i11 < ne11; i11++) {
-                    for (int64_t i10 = 0; i10 < ne10; i10++) {
-                        // very slow due to no inlining
-                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
+                    else {
+                        for (int64_t i11 = 0; i11 < ne11; i11++) {
+                            for (int64_t i10 = 0; i10 < ne10; i10++) {
+                                // very slow due to no inlining
+                                tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
+                            }
+                        }
                     }
-                }
-            }
-
-            // copy src1 to device
-            CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
-
-            CL_CHECK(clFinish(queue));
 
-            // compute
-            cl_event ev_sgemm;
-            clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
-                                                                clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                                ne01, ne11, ne10,
-                                                                alpha,
-                                                                d_X, x_offset, ne00,
-                                                                d_Y, 0, ne10,
-                                                                beta,
-                                                                d_D, 0, ne01,
-                                                                &queue, &ev_sgemm);
-
-            if (status != clblast::StatusCode::kSuccess) {
-                GGML_ASSERT(false);
-            }
+                    // copy src1 to device
+                    CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
+
+                    CL_CHECK(clFinish(queue));
+
+                    // compute
+                    cl_event ev_sgemm;
+                    clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
+                                                                        clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                                        ne01, ne11, ne10,
+                                                                        alpha,
+                                                                        d_X, x_offset, ne00,
+                                                                        d_Y, 0, ne10,
+                                                                        beta,
+                                                                        d_D, 0, ne01,
+                                                                        &queue, &ev_sgemm);
+
+                    if (status != clblast::StatusCode::kSuccess) {
+                        GGML_ASSERT(false);
+                    }
 
-            // copy dst to host, then convert to float
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
+                    // copy dst to host, then convert to float
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
 
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
-            ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                    ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                }
+            }
         }
     }
 
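The f16 path stages every src1 slice through the tmp buffer before clEnqueueWriteBuffer, picking a bulk conversion, a per-row conversion, or a per-element fallback depending on the src1_cont_rows / src1_cont_cols flags; the restructure does not change this, it still runs once per (i13, i12) pair. The sketch below is a standalone illustration of that staging logic, not code from the package: half_t and fp32_to_fp16 are crude stand-ins for ggml_fp16_t and ggml_fp32_to_fp16, while the shape and stride names (ne10, ne11, nb10, nb11) mirror the hunk.

// Standalone sketch, not from the diff; the conversion is a placeholder.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

using half_t = uint16_t;                        // stand-in for ggml_fp16_t

static half_t fp32_to_fp16(float f) {           // crude stand-in conversion:
    uint32_t bits; std::memcpy(&bits, &f, sizeof bits);
    return (half_t)(bits >> 16);                // bf16-style truncation, illustration only
}

static void fp32_to_fp16_row(const float * x, half_t * y, int64_t n) {
    for (int64_t i = 0; i < n; i++) y[i] = fp32_to_fp16(x[i]);
}

// Convert one ne11 x ne10 slice of src1 into tmp, honoring the byte strides.
static void stage_src1_slice(const char * src1i, half_t * tmp,
                             int64_t ne10, int64_t ne11,
                             size_t nb10, size_t nb11,
                             bool cont_rows, bool cont_cols) {
    if (cont_rows) {
        if (cont_cols) {
            // whole slice contiguous: one bulk conversion
            fp32_to_fp16_row((const float *) src1i, tmp, ne10*ne11);
        } else {
            // rows contiguous, slice not fully contiguous: convert row by row via nb11
            for (int64_t i11 = 0; i11 < ne11; i11++) {
                fp32_to_fp16_row((const float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
            }
        }
    } else {
        // fully strided: element by element (the "very slow" path in the hunk)
        for (int64_t i11 = 0; i11 < ne11; i11++) {
            for (int64_t i10 = 0; i10 < ne10; i10++) {
                tmp[i11*ne10 + i10] = fp32_to_fp16(*(const float *) (src1i + i11*nb11 + i10*nb10));
            }
        }
    }
}

int main() {
    const int64_t ne10 = 4, ne11 = 2;           // hypothetical slice shape
    std::vector<float>  src1(ne10*ne11, 1.0f);
    std::vector<half_t> tmp(ne10*ne11);
    stage_src1_slice((const char *) src1.data(), tmp.data(),
                     ne10, ne11, sizeof(float), ne10*sizeof(float),
                     /*cont_rows=*/true, /*cont_cols=*/true);
    printf("staged %lld values into tmp\n", (long long)(ne10*ne11));
    return 0;
}
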
@@ -1718,85 +1714,81 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t ev_idx = 0;
     std::vector<cl_event> events;
 
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;
-
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
-
-            // copy src0 to device if necessary
-            if (src0->backend == GGML_BACKEND_CPU) {
-                if (i02 != pi02 || i03 != pi03) {
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy and dequantize src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                // copy src0 to device if necessary
+                if (src0->backend == GGML_BACKEND_CPU) {
                     events.emplace_back();
                     CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-                    pi02 = i02;
-                    pi03 = i03;
-                }
-            } else if (src0->backend == GGML_BACKEND_GPU) {
-                d_Q = (cl_mem) src0->extra;
-            } else {
-                GGML_ASSERT(false);
-            }
-            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
-                // copy src1 to device
-                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
-
-                // compute
-                const size_t global = ne01 * local;
-                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                const cl_int ncols = ne00;
-                events.emplace_back();
-                CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
-                CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
-                CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
-                CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
-                CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
-            } else { // general dequantization kernel + CLBlast matrix matrix multiplication
-                // convert src0 to fp32 on device
-                const size_t global = x_ne / global_denom;
-                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
-                CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
-
-                // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
-
-                events.emplace_back();
-
-                // wait for conversion
-                CL_CHECK(clFinish(queue));
-
-                // compute
-                clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                                                                     clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                                     ne01, ne11, ne10,
-                                                                     alpha,
-                                                                     d_X, 0, ne00,
-                                                                     d_Y, 0, ne10,
-                                                                     beta,
-                                                                     d_D, 0, ne01,
-                                                                     &queue, events.data() + ev_idx++);
-
-                if (status != clblast::StatusCode::kSuccess) {
+                } else if (src0->backend == GGML_BACKEND_GPU) {
+                    d_Q = (cl_mem) src0->extra;
+                } else {
                     GGML_ASSERT(false);
                 }
-            }
 
-            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
-            for (auto *event : events) {
-                clReleaseEvent(event);
-            }
+                if (!mul_mat_vec) {
+                    // convert src0 to fp32 on device
+                    const size_t global = x_ne / global_denom;
+                    const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
+                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
+                    CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
+                }
 
-            ev_idx = 0;
-            events.clear();
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
+                        // copy src1 to device
+                        events.emplace_back();
+                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
+
+                        // compute
+                        const size_t global = ne01 * local;
+                        const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                        const cl_int ncols = ne00;
+                        events.emplace_back();
+                        CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
+                        CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
+                        CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
+                        CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
+                        CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
+                        CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+                    } else { // CLBlast matrix matrix multiplication
+                        // copy src1 to device
+                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+
+                        // wait for conversion
+                        CL_CHECK(clFinish(queue));
+
+                        // compute
+                        events.emplace_back();
+                        clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
+                                                                             clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                                             ne01, ne11, ne10,
+                                                                             alpha,
+                                                                             d_X, 0, ne00,
+                                                                             d_Y, 0, ne10,
+                                                                             beta,
+                                                                             d_D, 0, ne01,
+                                                                             &queue, events.data() + ev_idx++);
+
+                        if (status != clblast::StatusCode::kSuccess) {
+                            GGML_ASSERT(false);
+                        }
+                    }
+
+                    // copy dst to host
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+                    for (auto *event : events) {
+                        clReleaseEvent(event);
+                    }
+
+                    ev_idx = 0;
+                    events.clear();
+                }
+            }
         }
     }
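The practical change in ggml_cl_mul_mat_q_f32 is in the CLBlast branch: the to_fp32_cl dequantization of the src0 slice moves out of the per-src1 loop into the new if (!mul_mat_vec) block, so it runs once per (i03, i13, i02) while the GEMM and the dst read-back still run once per i12. The standalone sketch below (hypothetical sizes; r2 = ne12/ne02, r3 = ne13/ne03 as above) only counts kernel enqueues under the two structures to show the factor-of-r2 reduction; the dequantize_mul_mat_vec branch is unchanged, since it still does all of its work per (i13, i12).

// Standalone sketch, not from the diff: counts to_fp32_cl enqueues for the
// non-mul_mat_vec (CLBlast GEMM) path before and after the restructure.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne02 = 2, ne03 = 2;      // src0 batch dims (hypothetical)
    const int64_t ne12 = 8, ne13 = 2;      // src1 batch dims, broadcast over src0
    const int64_t r2 = ne12 / ne02, r3 = ne13 / ne03;

    // old structure: dequantize inside the (i13, i12) loops
    int64_t old_dequant = 0;
    for (int64_t i13 = 0; i13 < ne13; i13++) {
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            old_dequant++;                 // to_fp32_cl enqueued every iteration
        }
    }

    // new structure: dequantize once per (i03, i13, i02), before the i12 loop
    int64_t new_dequant = 0;
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
            for (int64_t i02 = 0; i02 < ne02; i02++) {
                new_dequant++;             // to_fp32_cl enqueued once here
                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                    // GEMM + dst read-back per i12, reusing the dequantized d_X
                }
            }
        }
    }

    printf("to_fp32_cl enqueues: old=%lld new=%lld (reduction factor r2=%lld)\n",
           (long long) old_dequant, (long long) new_dequant, (long long) r2);
    return 0;
}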