llama_cpp 0.8.0 → 0.9.0

@@ -1489,46 +1489,45 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
     size_t x_offset = 0;
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;
-
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
-
-            // copy data to device
-            if (src0->backend == GGML_BACKEND_GPU) {
-                x_offset = (i03 * ne02 + i02) * x_ne;
-            } else if (i02 != pi02 || i03 != pi03) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
-                pi02 = i02;
-                pi03 = i03;
-            }
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
-            CL_CHECK(clFinish(queue));
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                if (src0->backend == GGML_BACKEND_GPU) {
+                    x_offset = (i03 * ne02 + i02) * x_ne;
+                } else {
+                    // copy src0 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                }
 
-            // compute
-            cl_event ev_sgemm;
-            clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                                                                 clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                                 ne01, ne11, ne10,
-                                                                 alpha,
-                                                                 d_X, x_offset, ne00,
-                                                                 d_Y, 0, ne10,
-                                                                 beta,
-                                                                 d_D, 0, ne01,
-                                                                 &queue, &ev_sgemm);
-
-            if (status != clblast::StatusCode::kSuccess) {
-                GGML_ASSERT(false);
-            }
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    // copy src1 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+
+                    CL_CHECK(clFinish(queue));
+
+                    // compute
+                    cl_event ev_sgemm;
+                    clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
+                                                                         clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                                         ne01, ne11, ne10,
+                                                                         alpha,
+                                                                         d_X, x_offset, ne00,
+                                                                         d_Y, 0, ne10,
+                                                                         beta,
+                                                                         d_D, 0, ne01,
+                                                                         &queue, &ev_sgemm);
+
+                    if (status != clblast::StatusCode::kSuccess) {
+                        GGML_ASSERT(false);
+                    }
 
-            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                    // copy dst to host
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                }
+            }
         }
     }
 
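A note on the loop restructuring above: the old f32 path walked the src1 batch dimensions (i13, i12) and derived the matching src0 indices by division (i03 = i13 / r3, i02 = i12 / r2, with pi02/pi03 caching the last uploaded slice), while the new code walks the src0 batch dimensions and expands each one into its broadcast range. The standalone sketch below (not part of the gem; the ne*/r* values are invented for the check) verifies that both orderings cover exactly the same index tuples, just grouped so that each src0 slice is handled before all of the src1 slices broadcast against it.

// Standalone sketch: compares the old division-based broadcast loops with the
// new range-based loops used above. In ggml, r2 = ne12/ne02 and r3 = ne13/ne03;
// the dimensions here are made up for the check.
#include <cassert>
#include <cstdint>
#include <set>
#include <tuple>

int main() {
    const int64_t ne02 = 2, ne03 = 3;   // src0 batch dims (example values)
    const int64_t r2 = 4, r3 = 2;       // broadcast ratios (example values)
    const int64_t ne12 = ne02 * r2, ne13 = ne03 * r3;

    using Idx = std::tuple<int64_t, int64_t, int64_t, int64_t>; // (i03, i02, i13, i12)
    std::set<Idx> old_order, new_order;

    // old loops: iterate src1 batch dims, derive src0 indices by division
    for (int64_t i13 = 0; i13 < ne13; i13++) {
        const int64_t i03 = i13 / r3;
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            const int64_t i02 = i12 / r2;
            old_order.emplace(i03, i02, i13, i12);
        }
    }

    // new loops: iterate src0 batch dims, expand each into its broadcast range
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
            for (int64_t i02 = 0; i02 < ne02; i02++) {
                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                    new_order.emplace(i03, i02, i13, i12);
                }
            }
        }
    }

    assert(old_order == new_order); // same work, different grouping
    return 0;
}

Running it enumerates the same 48 (i03, i02, i13, i12) tuples from both nestings, so the rewrite changes only the grouping of the work, not the work itself.
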
@@ -1589,73 +1588,70 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
 
     size_t x_offset = 0;
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;
-
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
 
-            // copy src0 to device
-            if (src0->backend == GGML_BACKEND_GPU) {
-                x_offset = (i03 * ne02 + i02) * x_ne;
-            } else if (i02 != pi02 || i03 != pi03) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
-                pi02 = i02;
-                pi03 = i03;
-            }
-
-            // convert src1 to fp16
-            // TODO: use multiple threads
-            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
-            if (src1_cont_rows) {
-                if (src1_cont_cols) {
-                    ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                if (src0->backend == GGML_BACKEND_GPU) {
+                    x_offset = (i03 * ne02 + i02) * x_ne;
+                } else {
+                    // copy src0 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                 }
-                else {
-                    for (int64_t i11 = 0; i11 < ne11; i11++) {
-                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
+
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    // convert src1 to fp16
+                    // TODO: use multiple threads
+                    char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
+                    if (src1_cont_rows) {
+                        if (src1_cont_cols) {
+                            ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+                        }
+                        else {
+                            for (int64_t i11 = 0; i11 < ne11; i11++) {
+                                ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
+                            }
+                        }
                     }
-                }
-            }
-            else {
-                for (int64_t i11 = 0; i11 < ne11; i11++) {
-                    for (int64_t i10 = 0; i10 < ne10; i10++) {
-                        // very slow due to no inlining
-                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
+                    else {
+                        for (int64_t i11 = 0; i11 < ne11; i11++) {
+                            for (int64_t i10 = 0; i10 < ne10; i10++) {
+                                // very slow due to no inlining
+                                tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
+                            }
+                        }
                     }
-                }
-            }
-
-            // copy src1 to device
-            CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
-
-            CL_CHECK(clFinish(queue));
 
-            // compute
-            cl_event ev_sgemm;
-            clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
-                                                                clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                                ne01, ne11, ne10,
-                                                                alpha,
-                                                                d_X, x_offset, ne00,
-                                                                d_Y, 0, ne10,
-                                                                beta,
-                                                                d_D, 0, ne01,
-                                                                &queue, &ev_sgemm);
-
-            if (status != clblast::StatusCode::kSuccess) {
-                GGML_ASSERT(false);
-            }
+                    // copy src1 to device
+                    CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
+
+                    CL_CHECK(clFinish(queue));
+
+                    // compute
+                    cl_event ev_sgemm;
+                    clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
+                                                                        clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                                        ne01, ne11, ne10,
+                                                                        alpha,
+                                                                        d_X, x_offset, ne00,
+                                                                        d_Y, 0, ne10,
+                                                                        beta,
+                                                                        d_D, 0, ne01,
+                                                                        &queue, &ev_sgemm);
+
+                    if (status != clblast::StatusCode::kSuccess) {
+                        GGML_ASSERT(false);
+                    }
 
-            // copy dst to host, then convert to float
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
+                    // copy dst to host, then convert to float
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
 
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
-            ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                    ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                }
+            }
         }
     }
 
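The f16 path above packs each src1 slice into the contiguous tmp buffer before converting it to fp16 and uploading it, choosing between three gather strategies based on the nb10/nb11 byte strides. Below is a self-contained sketch of that branching (hypothetical helper, not from the gem): plain float copies stand in for ggml_fp32_to_fp16_row / ggml_fp32_to_fp16, and the contiguity checks are written out locally.

// Standalone sketch: the three src1 gather strategies of the f16 path above,
// with float copies in place of the fp16 conversions. src1i points at one
// (i12, i13) slice; nb10/nb11 are byte strides between elements / rows.
#include <cassert>
#include <cstdint>
#include <cstring>

static void gather_src1_slice(const char * src1i, float * tmp,
                              int64_t ne10, int64_t ne11,
                              size_t nb10, size_t nb11) {
    const bool cont_rows = nb10 == sizeof(float);                 // elements packed within a row
    const bool cont_cols = nb11 == (size_t) ne10 * sizeof(float); // rows packed back to back

    if (cont_rows) {
        if (cont_cols) {
            // fully contiguous: one bulk copy (one bulk fp16 conversion above)
            std::memcpy(tmp, src1i, (size_t) (ne10 * ne11) * sizeof(float));
        } else {
            // contiguous rows with padding between them: copy row by row
            for (int64_t i11 = 0; i11 < ne11; i11++) {
                std::memcpy(tmp + i11 * ne10, src1i + i11 * nb11, (size_t) ne10 * sizeof(float));
            }
        }
    } else {
        // arbitrary strides: element by element (the "very slow" branch above)
        for (int64_t i11 = 0; i11 < ne11; i11++) {
            for (int64_t i10 = 0; i10 < ne10; i10++) {
                tmp[i11 * ne10 + i10] = *(const float *) (src1i + i11 * nb11 + i10 * nb10);
            }
        }
    }
}

int main() {
    // toy 2x3 slice stored row-major with one float of padding per row (nb11 = 4 floats)
    float src[2][4] = {{1, 2, 3, -1}, {4, 5, 6, -1}};
    float tmp[6];
    gather_src1_slice((const char *) src, tmp, /*ne10=*/3, /*ne11=*/2,
                      /*nb10=*/sizeof(float), /*nb11=*/4 * sizeof(float));
    assert(tmp[3] == 4.0f && tmp[5] == 6.0f); // padding skipped, rows packed
    return 0;
}
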
@@ -1718,85 +1714,81 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t ev_idx = 0;
     std::vector<cl_event> events;
 
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;
-
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
-
-            // copy src0 to device if necessary
-            if (src0->backend == GGML_BACKEND_CPU) {
-                if (i02 != pi02 || i03 != pi03) {
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy and dequantize src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                // copy src0 to device if necessary
+                if (src0->backend == GGML_BACKEND_CPU) {
                     events.emplace_back();
                     CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-                    pi02 = i02;
-                    pi03 = i03;
-                }
-            } else if (src0->backend == GGML_BACKEND_GPU) {
-                d_Q = (cl_mem) src0->extra;
-            } else {
-                GGML_ASSERT(false);
-            }
-            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
-                // copy src1 to device
-                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
-
-                // compute
-                const size_t global = ne01 * local;
-                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                const cl_int ncols = ne00;
-                events.emplace_back();
-                CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
-                CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
-                CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
-                CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
-                CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
-            } else { // general dequantization kernel + CLBlast matrix matrix multiplication
-                // convert src0 to fp32 on device
-                const size_t global = x_ne / global_denom;
-                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
-                CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
-
-                // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
-
-                events.emplace_back();
-
-                // wait for conversion
-                CL_CHECK(clFinish(queue));
-
-                // compute
-                clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                                                                     clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                                     ne01, ne11, ne10,
-                                                                     alpha,
-                                                                     d_X, 0, ne00,
-                                                                     d_Y, 0, ne10,
-                                                                     beta,
-                                                                     d_D, 0, ne01,
-                                                                     &queue, events.data() + ev_idx++);
-
-                if (status != clblast::StatusCode::kSuccess) {
+                } else if (src0->backend == GGML_BACKEND_GPU) {
+                    d_Q = (cl_mem) src0->extra;
+                } else {
                     GGML_ASSERT(false);
                 }
-            }
 
-            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
-            for (auto *event : events) {
-                clReleaseEvent(event);
-            }
+                if (!mul_mat_vec) {
+                    // convert src0 to fp32 on device
+                    const size_t global = x_ne / global_denom;
+                    const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
+                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
+                    CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
+                }
 
-            ev_idx = 0;
-            events.clear();
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
+                        // copy src1 to device
+                        events.emplace_back();
+                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
+
+                        // compute
+                        const size_t global = ne01 * local;
+                        const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                        const cl_int ncols = ne00;
+                        events.emplace_back();
+                        CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
+                        CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
+                        CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
+                        CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
+                        CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
+                        CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+                    } else { // CLBlast matrix matrix multiplication
+                        // copy src1 to device
+                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+
+                        // wait for conversion
+                        CL_CHECK(clFinish(queue));
+
+                        // compute
+                        events.emplace_back();
+                        clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
+                                                                             clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                                             ne01, ne11, ne10,
+                                                                             alpha,
+                                                                             d_X, 0, ne00,
+                                                                             d_Y, 0, ne10,
+                                                                             beta,
+                                                                             d_D, 0, ne01,
+                                                                             &queue, events.data() + ev_idx++);
+
+                        if (status != clblast::StatusCode::kSuccess) {
+                            GGML_ASSERT(false);
+                        }
+                    }
+
+                    // copy dst to host
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+                    for (auto *event : events) {
+                        clReleaseEvent(event);
+                    }
+
+                    ev_idx = 0;
+                    events.clear();
+                }
+            }
         }
     }
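One practical effect of the new nesting in the quantized path above: on the non-mul_mat_vec branch, the to_fp32_cl dequantization launch now sits above the innermost i12 loop, so each src0 slice is dequantized once and reused across its r2 broadcast src1 slices instead of being re-dequantized on every (i13, i12) iteration as in the old loops. A small standalone counting sketch (invented example dimensions, not part of the gem) makes the difference visible:

// Standalone sketch: counts to_fp32_cl-style dequantization launches on the
// non-mul_mat_vec path under the old and new loop nestings shown above.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne02 = 2, ne03 = 3;           // src0 batch dims (example values)
    const int64_t r2 = 4, r3 = 2;               // broadcast ratios (example values)
    const int64_t ne12 = ne02 * r2, ne13 = ne03 * r3;

    int64_t old_launches = 0;
    for (int64_t i13 = 0; i13 < ne13; i13++) {
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            old_launches++;                     // old: dequantize ran on every (i13, i12)
        }
    }

    int64_t new_launches = 0;
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
            for (int64_t i02 = 0; i02 < ne02; i02++) {
                new_launches++;                 // new: dequantize hoisted above the i12 loop
            }
        }
    }

    std::printf("old: %lld launches, new: %lld launches\n",
                (long long) old_launches, (long long) new_launches);
    return 0;
}

With the example values this prints 48 launches for the old nesting and 12 for the new one, a reduction by the broadcast factor r2; the TODO comments in the new code point at hoisting the copy and dequantization one level further when r3 > 1.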