llama_cpp 0.7.1 → 0.9.0

@@ -1395,75 +1395,46 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];
  const int64_t ne03 = src0->ne[3];
- const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
  const int64_t ne12 = src1->ne[2];
  const int64_t ne13 = src1->ne[3];
- const int64_t nb10 = src1->nb[0];
  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];
  size_t x_size;
  size_t d_size;

- cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
+ cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
  cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
- cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+ cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst


  for (int64_t i03 = 0; i03 < ne03; i03++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
- const int i0 = i03*ne02 + i02;
-
  cl_event ev;

  // copy src0 to device
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
-
- if (nb10 == sizeof(float)) {
- // Contiguous, avoid overhead from queueing many kernel runs
- const int64_t i13 = i03%ne13;
- const int64_t i12 = i02%ne12;
- const int i1 = i13*ne12*ne11 + i12*ne11;
-
- cl_int x_offset = 0;
- cl_int y_offset = i1*ne10;
- cl_int d_offset = 0;
-
- size_t global = ne00 * ne01;
- cl_int ky = ne10;
- CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
- CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
- } else {
- for (int64_t i01 = 0; i01 < ne01; i01++) {
- const int64_t i13 = i03%ne13;
- const int64_t i12 = i02%ne12;
- const int64_t i11 = i01%ne11;
- const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
-
- cl_int x_offset = i01*ne00;
- cl_int y_offset = i1*ne10;
- cl_int d_offset = i01*ne00;
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));

- // compute
- size_t global = ne00;
- cl_int ky = ne10;
- CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
- CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
- CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
- }
- }
+ const int64_t i13 = i03%ne13;
+ const int64_t i12 = i02%ne12;
+ const int i1 = i13*ne12*ne11 + i12*ne11;
+
+ cl_int x_offset = 0;
+ cl_int y_offset = i1*ne10;
+ cl_int d_offset = 0;
+
+ size_t global = ne00 * ne01;
+ cl_int ky = ne10 * ne11;
+
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));

  CL_CHECK(clReleaseEvent(ev));
  CL_CHECK(clFinish(queue));
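For orientation, a minimal CPU sketch of what a single launch now computes per (i02, i03) slice, assuming the mul_f32 kernel indexes src1 as y[y_offset + gid % ky] (which is what the ky argument suggests): the whole ne00*ne01 plane of src0 is handled in one enqueue, and passing ky = ne10*ne11 wraps the indexing over an entire src1 plane, which is how src1 gets broadcast when it is smaller than src0.

#include <cstdint>

// Hedged CPU reference for one mul_f32_cl launch over a single (i02, i03) slice.
// Assumption (not shown in the diff): the OpenCL kernel computes
//   dst[d_offset + gid] = x[x_offset + gid] * y[y_offset + gid % ky].
static void mul_f32_slice_ref(const float * x, const float * y, float * d,
                              int64_t ne00, int64_t ne01, int64_t ky) {
    const int64_t n = ne00 * ne01;        // matches `global` in the host code above
    for (int64_t gid = 0; gid < n; gid++) {
        d[gid] = x[gid] * y[gid % ky];    // ky = ne10*ne11 broadcasts a whole src1 plane
    }
}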
@@ -1518,46 +1489,45 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

  size_t x_offset = 0;
- int64_t pi02 = -1;
- int64_t pi03 = -1;
-
- for (int64_t i13 = 0; i13 < ne13; i13++) {
- int64_t i03 = i13 / r3;
-
- for (int64_t i12 = 0; i12 < ne12; i12++) {
- int64_t i02 = i12 / r2;
-
- // copy data to device
- if (src0->backend == GGML_BACKEND_GPU) {
- x_offset = (i03 * ne02 + i02) * x_ne;
- } else if (i02 != pi02 || i03 != pi03) {
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
- pi02 = i02;
- pi03 = i03;
- }
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));

- CL_CHECK(clFinish(queue));
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ // TODO: copy src0 here when r3>1
+ for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ if (src0->backend == GGML_BACKEND_GPU) {
+ x_offset = (i03 * ne02 + i02) * x_ne;
+ } else {
+ // copy src0 to device
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+ }

- // compute
- cl_event ev_sgemm;
- clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
- clblast::Transpose::kYes, clblast::Transpose::kNo,
- ne01, ne11, ne10,
- alpha,
- d_X, x_offset, ne00,
- d_Y, 0, ne10,
- beta,
- d_D, 0, ne01,
- &queue, &ev_sgemm);
-
- if (status != clblast::StatusCode::kSuccess) {
- GGML_ASSERT(false);
- }
+ for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+ // copy src1 to device
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));

- // copy dst to host
- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
- CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+ CL_CHECK(clFinish(queue));
+
+ // compute
+ cl_event ev_sgemm;
+ clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
+ clblast::Transpose::kYes, clblast::Transpose::kNo,
+ ne01, ne11, ne10,
+ alpha,
+ d_X, x_offset, ne00,
+ d_Y, 0, ne10,
+ beta,
+ d_D, 0, ne01,
+ &queue, &ev_sgemm);
+
+ if (status != clblast::StatusCode::kSuccess) {
+ GGML_ASSERT(false);
+ }
+
+ // copy dst to host
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+ }
+ }
  }
  }
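The same restructuring recurs in the f16 and quantized paths below, so a compact skeleton of the new loop order may help. This is a hedged sketch, not code from the diff: upload_src0, upload_src1 and gemm_and_read_back are hypothetical stand-ins for the h2d copies and the CLBlast call, and r2 = ne12/ne02, r3 = ne13/ne03 are the broadcast factors. Each uploaded src0 slice is reused for its r2 repeats along dim 2, which is what the removed pi02/pi03 caching achieved implicitly; the TODO in the hunk notes the copy could also be hoisted above the i13 loop when r3 > 1.

#include <cstdint>

// Hypothetical helpers for illustration only; not real ggml_cl_* entry points.
void upload_src0(int64_t i03, int64_t i02);   // skipped in the diff when src0 already lives on the GPU
void upload_src1(int64_t i13, int64_t i12);
void gemm_and_read_back(int64_t i13, int64_t i12);

// Sketch of the new loop nest: src0 is copied once per (i13, i02) and reused
// for the r2 broadcast repeats of src1 along dim 2.
void broadcast_mul_mat(int64_t ne02, int64_t ne03, int64_t r2, int64_t r3) {
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
            for (int64_t i02 = 0; i02 < ne02; i02++) {
                upload_src0(i03, i02);
                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                    upload_src1(i13, i12);
                    gemm_and_read_back(i13, i12);
                }
            }
        }
    }
}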
 
@@ -1568,7 +1538,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  ggml_cl_pool_free(d_D, d_size);
  }

- static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
+ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
  GGML_ASSERT(fp16_support);

  const int64_t ne00 = src0->ne[0];
@@ -1598,6 +1568,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  const int y_ne = ne11 * ne10;
  const int d_ne = ne11 * ne01;

+ GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne);
+ GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne);
+ ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata;
+
  size_t x_size;
  size_t y_size;
  size_t d_size;
@@ -1614,74 +1588,70 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);

  size_t x_offset = 0;
- int64_t pi02 = -1;
- int64_t pi03 = -1;
-
- for (int64_t i13 = 0; i13 < ne13; i13++) {
- int64_t i03 = i13 / r3;

- for (int64_t i12 = 0; i12 < ne12; i12++) {
- int64_t i02 = i12 / r2;
-
- // copy src0 to device
- if (src0->backend == GGML_BACKEND_GPU) {
- x_offset = (i03 * ne02 + i02) * x_ne;
- } else if (i02 != pi02 || i03 != pi03) {
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
- pi02 = i02;
- pi03 = i03;
- }
-
- // convert src1 to fp16
- // TODO: use multiple threads
- ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
- char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
- if (src1_cont_rows) {
- if (src1_cont_cols) {
- ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ // TODO: copy src0 here when r3>1
+ for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ if (src0->backend == GGML_BACKEND_GPU) {
+ x_offset = (i03 * ne02 + i02) * x_ne;
+ } else {
+ // copy src0 to device
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
  }
- else {
- for (int64_t i11 = 0; i11 < ne11; i11++) {
- ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
+
+ for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+ // convert src1 to fp16
+ // TODO: use multiple threads
+ char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
+ if (src1_cont_rows) {
+ if (src1_cont_cols) {
+ ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+ }
+ else {
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
+ ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
+ }
+ }
  }
- }
- }
- else {
- for (int64_t i11 = 0; i11 < ne11; i11++) {
- for (int64_t i10 = 0; i10 < ne10; i10++) {
- // very slow due to no inlining
- tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
+ else {
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
+ // very slow due to no inlining
+ tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
+ }
+ }
  }
- }
- }

- // copy src1 to device
- CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
+ // copy src1 to device
+ CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));

- CL_CHECK(clFinish(queue));
+ CL_CHECK(clFinish(queue));

- // compute
- cl_event ev_sgemm;
- clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
- clblast::Transpose::kYes, clblast::Transpose::kNo,
- ne01, ne11, ne10,
- alpha,
- d_X, x_offset, ne00,
- d_Y, 0, ne10,
- beta,
- d_D, 0, ne01,
- &queue, &ev_sgemm);
-
- if (status != clblast::StatusCode::kSuccess) {
- GGML_ASSERT(false);
- }
+ // compute
+ cl_event ev_sgemm;
+ clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
+ clblast::Transpose::kYes, clblast::Transpose::kNo,
+ ne01, ne11, ne10,
+ alpha,
+ d_X, x_offset, ne00,
+ d_Y, 0, ne10,
+ beta,
+ d_D, 0, ne01,
+ &queue, &ev_sgemm);
+
+ if (status != clblast::StatusCode::kSuccess) {
+ GGML_ASSERT(false);
+ }

- // copy dst to host, then convert to float
- CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
+ // copy dst to host, then convert to float
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));

- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);

- ggml_fp16_to_fp32_row(tmp, d, d_ne);
+ ggml_fp16_to_fp32_row(tmp, d, d_ne);
+ }
+ }
  }
  }
 
@@ -1744,85 +1714,81 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  size_t ev_idx = 0;
  std::vector<cl_event> events;

- int64_t pi02 = -1;
- int64_t pi03 = -1;
-
- for (int64_t i13 = 0; i13 < ne13; i13++) {
- int64_t i03 = i13 / r3;
-
- for (int64_t i12 = 0; i12 < ne12; i12++) {
- int64_t i02 = i12 / r2;
-
- // copy src0 to device if necessary
- if (src0->backend == GGML_BACKEND_CPU) {
- if (i02 != pi02 || i03 != pi03) {
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ // TODO: copy and dequantize src0 here when r3>1
+ for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ // copy src0 to device if necessary
+ if (src0->backend == GGML_BACKEND_CPU) {
  events.emplace_back();
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
- pi02 = i02;
- pi03 = i03;
- }
- } else if (src0->backend == GGML_BACKEND_GPU) {
- d_Q = (cl_mem) src0->extra;
- } else {
- GGML_ASSERT(false);
- }
- if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
- // copy src1 to device
- events.emplace_back();
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
-
- // compute
- const size_t global = ne01 * local;
- const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
- const cl_int ncols = ne00;
- events.emplace_back();
- CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
- CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
- CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
- CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
- CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
- CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
- } else { // general dequantization kernel + CLBlast matrix matrix multiplication
- // convert src0 to fp32 on device
- const size_t global = x_ne / global_denom;
- const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
- CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
- CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
- CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
-
- // copy src1 to device
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
-
- events.emplace_back();
-
- // wait for conversion
- CL_CHECK(clFinish(queue));
-
- // compute
- clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
- clblast::Transpose::kYes, clblast::Transpose::kNo,
- ne01, ne11, ne10,
- alpha,
- d_X, 0, ne00,
- d_Y, 0, ne10,
- beta,
- d_D, 0, ne01,
- &queue, events.data() + ev_idx++);
-
- if (status != clblast::StatusCode::kSuccess) {
+ } else if (src0->backend == GGML_BACKEND_GPU) {
+ d_Q = (cl_mem) src0->extra;
+ } else {
  GGML_ASSERT(false);
  }
- }

- // copy dst to host
- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
- CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
- for (auto *event : events) {
- clReleaseEvent(event);
- }
+ if (!mul_mat_vec) {
+ // convert src0 to fp32 on device
+ const size_t global = x_ne / global_denom;
+ const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+ CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
+ CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
+ }

- ev_idx = 0;
- events.clear();
+ for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+ if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
+ // copy src1 to device
+ events.emplace_back();
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
+
+ // compute
+ const size_t global = ne01 * local;
+ const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+ const cl_int ncols = ne00;
+ events.emplace_back();
+ CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
+ CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
+ CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
+ CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
+ CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+ } else { // CLBlast matrix matrix multiplication
+ // copy src1 to device
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+
+ // wait for conversion
+ CL_CHECK(clFinish(queue));
+
+ // compute
+ events.emplace_back();
+ clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
+ clblast::Transpose::kYes, clblast::Transpose::kNo,
+ ne01, ne11, ne10,
+ alpha,
+ d_X, 0, ne00,
+ d_Y, 0, ne10,
+ beta,
+ d_D, 0, ne01,
+ &queue, events.data() + ev_idx++);
+
+ if (status != clblast::StatusCode::kSuccess) {
+ GGML_ASSERT(false);
+ }
+ }
+
+ // copy dst to host
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+ for (auto *event : events) {
+ clReleaseEvent(event);
+ }
+
+ ev_idx = 0;
+ events.clear();
+ }
+ }
  }
  }
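For the quantized path, the main point of the reshuffle is where the device-side dequantization happens. A hedged skeleton follows; the helpers are illustrative stand-ins, not real ggml_cl_* functions. The quantized src0 slice is uploaded once (only when it lives in host memory), dequantized to fp32 once when the CLBlast path is taken, and then reused for all r2 broadcast repeats of src1, instead of being dequantized once per (i12, i13) pair as before.

#include <cstdint>

// Hypothetical stand-ins for the device operations in the hunk above.
void upload_quantized_src0(int64_t i03, int64_t i02);   // only when src0->backend is CPU
void dequantize_src0_on_device();                       // runs *to_fp32_cl into d_X
void dmmv_or_gemm(int64_t i13, int64_t i12);            // dmmv kernel or CLBlast GEMM + readback

void q_f32_slice(int64_t i03, int64_t i13, int64_t ne02, int64_t r2, bool mul_mat_vec) {
    for (int64_t i02 = 0; i02 < ne02; i02++) {
        upload_quantized_src0(i03, i02);
        if (!mul_mat_vec) {
            dequantize_src0_on_device();                // hoisted out of the i12 loop
        }
        for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
            dmmv_or_gemm(i13, i12);
        }
    }
}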
 
@@ -1897,8 +1863,8 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor *
  }

  size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
- if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
- return ggml_nelements(src1) * sizeof(ggml_fp16_t);
+ if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
+ return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]);
  }
  return 0;
  }
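To make the size change concrete, a small worked example with assumed shapes (illustrative, not taken from the diff): because the f16 path now stages only one src1 slice and one dst slice in wdata at a time, the scratch requirement no longer scales with the batch dimensions.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Assumed shapes: src0 is F16 with ne = {4096, 4096, 32, 1}, src1 has
// ne = {4096, 8, 32, 1}, so dst has ne = {4096, 8, 32, 1}; sizeof(ggml_fp16_t) == 2.
int main() {
    const int64_t old_wsize = 2 * (4096LL * 8 * 32);                          // whole src1 as fp16: 2 MiB
    const int64_t new_wsize = 2 * std::max<int64_t>(4096LL * 8, 4096LL * 8);  // one slice: 64 KiB
    std::printf("old: %lld bytes, new: %lld bytes\n",
                (long long) old_wsize, (long long) new_wsize);
    return 0;
}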