llama_cpp 0.7.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1395,75 +1395,46 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-    const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
     const int64_t ne12 = src1->ne[2];
     const int64_t ne13 = src1->ne[3];
-    const int64_t nb10 = src1->nb[0];
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     size_t x_size;
     size_t d_size;

-    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
+    cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
     cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
-    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+    cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst


     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
-            const int i0 = i03*ne02 + i02;
-
             cl_event ev;

             // copy src0 to device
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
-
-            if (nb10 == sizeof(float)) {
-                // Contiguous, avoid overhead from queueing many kernel runs
-                const int64_t i13 = i03%ne13;
-                const int64_t i12 = i02%ne12;
-                const int i1 = i13*ne12*ne11 + i12*ne11;
-
-                cl_int x_offset = 0;
-                cl_int y_offset = i1*ne10;
-                cl_int d_offset = 0;
-
-                size_t global = ne00 * ne01;
-                cl_int ky = ne10;
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
-            } else {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const int64_t i13 = i03%ne13;
-                    const int64_t i12 = i02%ne12;
-                    const int64_t i11 = i01%ne11;
-                    const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
-
-                    cl_int x_offset = i01*ne00;
-                    cl_int y_offset = i1*ne10;
-                    cl_int d_offset = i01*ne00;
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));

-                    // compute
-                    size_t global = ne00;
-                    cl_int ky = ne10;
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
-                    CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
-                    CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
-                }
-            }
+            const int64_t i13 = i03%ne13;
+            const int64_t i12 = i02%ne12;
+            const int i1 = i13*ne12*ne11 + i12*ne11;
+
+            cl_int x_offset = 0;
+            cl_int y_offset = i1*ne10;
+            cl_int d_offset = 0;
+
+            size_t global = ne00 * ne01;
+            cl_int ky = ne10 * ne11;
+
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+            CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));

             CL_CHECK(clReleaseEvent(ev));
             CL_CHECK(clFinish(queue));
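
The hunk above simplifies `ggml_cl_mul_f32`: the old contiguous/strided split is collapsed into a single `mul_f32_cl` launch per `(i02, i03)` slice, with `global = ne00 * ne01` work items, `ky = ne10 * ne11`, and a `y_offset` that selects the broadcast src1 slice. The sketch below is a minimal CPU illustration (not part of the package) of that offset arithmetic; it assumes the kernel indexes src1 as `y[y_offset + gid % ky]`, and the slice shapes are made up for the example.

```cpp
// Hypothetical CPU mirror of the host-side indexing set up by the new code path.
// Assumption (not shown in the diff): the mul_f32 kernel computes
//   dst[d_offset + gid] = x[x_offset + gid] * y[y_offset + gid % ky]
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // example slice shapes: src0 slice is ne00 x ne01, src1 slice is ne10 x ne11
    const int64_t ne00 = 4, ne01 = 6;
    const int64_t ne10 = 4, ne11 = 3;               // ne10 == ne00, ne11 divides ne01 (row broadcast)

    std::vector<float> x(ne00 * ne01, 2.0f);        // stands in for the d_X slice
    std::vector<float> y(ne10 * ne11);              // stands in for the (i12, i13) slice of d_Y
    for (std::size_t i = 0; i < y.size(); ++i) y[i] = float(i);

    const int64_t ky         = ne10 * ne11;         // matches `cl_int ky = ne10 * ne11`
    const std::size_t global = ne00 * ne01;         // matches `size_t global = ne00 * ne01`

    for (std::size_t gid = 0; gid < global; ++gid) {
        const float d   = x[gid] * y[gid % ky];     // flat broadcast indexing
        const int64_t i01 = gid / ne00, i00 = gid % ne00;
        const float ref = x[gid] * y[(i01 % ne11) * ne10 + i00]; // explicit 2-D broadcast
        if (d != ref) { std::printf("mismatch at gid=%zu\n", gid); return 1; }
    }
    std::printf("flat gid %% ky indexing matches the explicit row broadcast\n");
    return 0;
}
```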
@@ -1518,46 +1489,45 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

     size_t x_offset = 0;
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;
-
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
-
-            // copy data to device
-            if (src0->backend == GGML_BACKEND_GPU) {
-                x_offset = (i03 * ne02 + i02) * x_ne;
-            } else if (i02 != pi02 || i03 != pi03) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
-                pi02 = i02;
-                pi03 = i03;
-            }
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));

-            CL_CHECK(clFinish(queue));
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                if (src0->backend == GGML_BACKEND_GPU) {
+                    x_offset = (i03 * ne02 + i02) * x_ne;
+                } else {
+                    // copy src0 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                }

-            // compute
-            cl_event ev_sgemm;
-            clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                    clblast::Transpose::kYes, clblast::Transpose::kNo,
-                    ne01, ne11, ne10,
-                    alpha,
-                    d_X, x_offset, ne00,
-                    d_Y, 0, ne10,
-                    beta,
-                    d_D, 0, ne01,
-                    &queue, &ev_sgemm);
-
-            if (status != clblast::StatusCode::kSuccess) {
-                GGML_ASSERT(false);
-            }
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    // copy src1 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));

-            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                    CL_CHECK(clFinish(queue));
+
+                    // compute
+                    cl_event ev_sgemm;
+                    clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
+                            clblast::Transpose::kYes, clblast::Transpose::kNo,
+                            ne01, ne11, ne10,
+                            alpha,
+                            d_X, x_offset, ne00,
+                            d_Y, 0, ne10,
+                            beta,
+                            d_D, 0, ne01,
+                            &queue, &ev_sgemm);
+
+                    if (status != clblast::StatusCode::kSuccess) {
+                        GGML_ASSERT(false);
+                    }
+
+                    // copy dst to host
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                }
+            }
         }
     }

@@ -1568,7 +1538,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     ggml_cl_pool_free(d_D, d_size);
 }

-static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
+static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
     GGML_ASSERT(fp16_support);

     const int64_t ne00 = src0->ne[0];
@@ -1598,6 +1568,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     const int y_ne = ne11 * ne10;
     const int d_ne = ne11 * ne01;

+    GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne);
+    GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne);
+    ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata;
+
     size_t x_size;
     size_t y_size;
     size_t d_size;
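
The mul_mat_f32 hunk above and the f16 and quantized hunks below all restructure the batch loops the same way: instead of walking the src1 batch indices (`i13`, `i12`) and deriving the src0 indices by division (with `pi02`/`pi03` caching the last uploaded slice), the new code walks the src0 batch indices (`i03`, `i02`) and expands each into its `r3`/`r2` broadcast repeats, so every src0 slice is copied (and, in the quantized path, dequantized) exactly once before all the src1 slices that reuse it. A standalone sanity sketch (example shapes and ratios, not taken from the package) showing that the two orders enumerate the same index combinations:

```cpp
// Illustrative check that the old division-based iteration and the new
// range-based iteration visit the same (i03, i02, i13, i12) tuples.
#include <cstdint>
#include <cstdio>
#include <set>
#include <tuple>

int main() {
    const int64_t ne03 = 2, ne02 = 3;    // src0 batch dims (example values)
    const int64_t r3 = 2, r2 = 4;        // broadcast ratios, so ne13 = ne03*r3, ne12 = ne02*r2
    const int64_t ne13 = ne03 * r3, ne12 = ne02 * r2;

    using Key = std::tuple<int64_t, int64_t, int64_t, int64_t>;
    std::set<Key> old_order, new_order;

    // old: drive by src1 batch indices, derive src0 indices by division
    for (int64_t i13 = 0; i13 < ne13; i13++)
        for (int64_t i12 = 0; i12 < ne12; i12++)
            old_order.insert({i13 / r3, i12 / r2, i13, i12});

    // new: drive by src0 batch indices, expand each into its r3/r2 repeats
    for (int64_t i03 = 0; i03 < ne03; i03++)
        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++)
            for (int64_t i02 = 0; i02 < ne02; i02++)
                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++)
                    new_order.insert({i03, i02, i13, i12});

    std::printf("same index set: %s\n", old_order == new_order ? "yes" : "no");
    return 0;
}
```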
@@ -1614,74 +1588,70 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);

     size_t x_offset = 0;
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;

-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
-
-            // copy src0 to device
-            if (src0->backend == GGML_BACKEND_GPU) {
-                x_offset = (i03 * ne02 + i02) * x_ne;
-            } else if (i02 != pi02 || i03 != pi03) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
-                pi02 = i02;
-                pi03 = i03;
-            }
-
-            // convert src1 to fp16
-            // TODO: use multiple threads
-            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
-            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
-            if (src1_cont_rows) {
-                if (src1_cont_cols) {
-                    ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                if (src0->backend == GGML_BACKEND_GPU) {
+                    x_offset = (i03 * ne02 + i02) * x_ne;
+                } else {
+                    // copy src0 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                 }
-                else {
-                    for (int64_t i11 = 0; i11 < ne11; i11++) {
-                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
+
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    // convert src1 to fp16
+                    // TODO: use multiple threads
+                    char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
+                    if (src1_cont_rows) {
+                        if (src1_cont_cols) {
+                            ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+                        }
+                        else {
+                            for (int64_t i11 = 0; i11 < ne11; i11++) {
+                                ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
+                            }
+                        }
                     }
-                }
-            }
-            else {
-                for (int64_t i11 = 0; i11 < ne11; i11++) {
-                    for (int64_t i10 = 0; i10 < ne10; i10++) {
-                        // very slow due to no inlining
-                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
+                    else {
+                        for (int64_t i11 = 0; i11 < ne11; i11++) {
+                            for (int64_t i10 = 0; i10 < ne10; i10++) {
+                                // very slow due to no inlining
+                                tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
+                            }
+                        }
                     }
-                }
-            }

-            // copy src1 to device
-            CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
+                    // copy src1 to device
+                    CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));

-            CL_CHECK(clFinish(queue));
+                    CL_CHECK(clFinish(queue));

-            // compute
-            cl_event ev_sgemm;
-            clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
-                    clblast::Transpose::kYes, clblast::Transpose::kNo,
-                    ne01, ne11, ne10,
-                    alpha,
-                    d_X, x_offset, ne00,
-                    d_Y, 0, ne10,
-                    beta,
-                    d_D, 0, ne01,
-                    &queue, &ev_sgemm);
-
-            if (status != clblast::StatusCode::kSuccess) {
-                GGML_ASSERT(false);
-            }
+                    // compute
+                    cl_event ev_sgemm;
+                    clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
+                            clblast::Transpose::kYes, clblast::Transpose::kNo,
+                            ne01, ne11, ne10,
+                            alpha,
+                            d_X, x_offset, ne00,
+                            d_Y, 0, ne10,
+                            beta,
+                            d_D, 0, ne01,
+                            &queue, &ev_sgemm);
+
+                    if (status != clblast::StatusCode::kSuccess) {
+                        GGML_ASSERT(false);
+                    }

-            // copy dst to host, then convert to float
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
+                    // copy dst to host, then convert to float
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));

-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);

-            ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                    ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                }
+            }
         }
     }

@@ -1744,85 +1714,81 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t ev_idx = 0;
     std::vector<cl_event> events;

-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;
-
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
-
-            // copy src0 to device if necessary
-            if (src0->backend == GGML_BACKEND_CPU) {
-                if (i02 != pi02 || i03 != pi03) {
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy and dequantize src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                // copy src0 to device if necessary
+                if (src0->backend == GGML_BACKEND_CPU) {
                     events.emplace_back();
                     CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-                    pi02 = i02;
-                    pi03 = i03;
-                }
-            } else if (src0->backend == GGML_BACKEND_GPU) {
-                d_Q = (cl_mem) src0->extra;
-            } else {
-                GGML_ASSERT(false);
-            }
-            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
-                // copy src1 to device
-                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
-
-                // compute
-                const size_t global = ne01 * local;
-                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                const cl_int ncols = ne00;
-                events.emplace_back();
-                CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
-                CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
-                CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
-                CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
-                CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
-            } else { // general dequantization kernel + CLBlast matrix matrix multiplication
-                // convert src0 to fp32 on device
-                const size_t global = x_ne / global_denom;
-                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
-                CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
-
-                // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
-
-                events.emplace_back();
-
-                // wait for conversion
-                CL_CHECK(clFinish(queue));
-
-                // compute
-                clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                        clblast::Transpose::kYes, clblast::Transpose::kNo,
-                        ne01, ne11, ne10,
-                        alpha,
-                        d_X, 0, ne00,
-                        d_Y, 0, ne10,
-                        beta,
-                        d_D, 0, ne01,
-                        &queue, events.data() + ev_idx++);
-
-                if (status != clblast::StatusCode::kSuccess) {
+                } else if (src0->backend == GGML_BACKEND_GPU) {
+                    d_Q = (cl_mem) src0->extra;
+                } else {
                     GGML_ASSERT(false);
                 }
-            }

-            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
-            for (auto *event : events) {
-                clReleaseEvent(event);
-            }
+                if (!mul_mat_vec) {
+                    // convert src0 to fp32 on device
+                    const size_t global = x_ne / global_denom;
+                    const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
+                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
+                    CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
+                }

-            ev_idx = 0;
-            events.clear();
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
+                        // copy src1 to device
+                        events.emplace_back();
+                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
+
+                        // compute
+                        const size_t global = ne01 * local;
+                        const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                        const cl_int ncols = ne00;
+                        events.emplace_back();
+                        CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
+                        CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
+                        CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
+                        CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
+                        CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
+                        CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+                    } else { // CLBlast matrix matrix multiplication
+                        // copy src1 to device
+                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+
+                        // wait for conversion
+                        CL_CHECK(clFinish(queue));
+
+                        // compute
+                        events.emplace_back();
+                        clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
+                                clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                ne01, ne11, ne10,
+                                alpha,
+                                d_X, 0, ne00,
+                                d_Y, 0, ne10,
+                                beta,
+                                d_D, 0, ne01,
+                                &queue, events.data() + ev_idx++);
+
+                        if (status != clblast::StatusCode::kSuccess) {
+                            GGML_ASSERT(false);
+                        }
+                    }
+
+                    // copy dst to host
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+                    for (auto *event : events) {
+                        clReleaseEvent(event);
+                    }
+
+                    ev_idx = 0;
+                    events.clear();
+                }
+            }
         }
     }

@@ -1897,8 +1863,8 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor *
 }

 size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
-        return ggml_nelements(src1) * sizeof(ggml_fp16_t);
+    if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
+        return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]);
     }
     return 0;
 }
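
The final hunk matches the f16 changes above: `ggml_cl_mul_mat_f16` now treats `wdata` as one scratch buffer (`tmp`) that is reused for both the fp16-converted src1 slice (`y_ne` elements) and the fp16 GEMM result (`d_ne` elements), which is what the two new `GGML_ASSERT(wsize >= ...)` checks guard. Accordingly, `ggml_cl_mul_mat_get_wsize` only requests a workspace when src0 is F16, and it shrinks from `ggml_nelements(src1) * sizeof(ggml_fp16_t)` to the larger of the two per-slice sizes. A small standalone illustration with made-up shapes (not from the package):

```cpp
// Hypothetical example of the old vs. new workspace size for one set of shapes.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>

using ggml_fp16_t = uint16_t;   // assumption: a 2-byte half-precision type, as in ggml

int main() {
    // example shapes: src0 rows give ne01, src1 is ne10 x ne11 x ne12 x ne13
    const int64_t ne01 = 4096;
    const int64_t ne10 = 4096, ne11 = 32, ne12 = 32, ne13 = 1;

    const int64_t y_ne = ne11 * ne10;   // fp16-converted src1 slice held in tmp
    const int64_t d_ne = ne11 * ne01;   // fp16 result slice read back into tmp

    const std::size_t old_wsize = sizeof(ggml_fp16_t) * (std::size_t)(ne10 * ne11 * ne12 * ne13); // ggml_nelements(src1)
    const std::size_t new_wsize = sizeof(ggml_fp16_t) * (std::size_t)std::max(y_ne, d_ne);

    std::printf("old wsize: %zu bytes, new wsize: %zu bytes\n", old_wsize, new_wsize);
    return 0;
}
```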