llama_cpp 0.7.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +122 -183
- data/ext/llama_cpp/src/ggml-cuda.cu +188 -20
- data/ext/llama_cpp/src/ggml-metal.m +57 -8
- data/ext/llama_cpp/src/ggml-metal.metal +171 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +188 -222
- data/ext/llama_cpp/src/ggml.c +375 -93
- data/ext/llama_cpp/src/ggml.h +11 -9
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/llama.cpp +459 -153
- data/ext/llama_cpp/src/llama.h +34 -33
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +15 -16
- metadata +3 -3
data/ext/llama_cpp/src/ggml-opencl.cpp

@@ -1395,75 +1395,46 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-    const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
     const int64_t ne12 = src1->ne[2];
     const int64_t ne13 = src1->ne[3];
-    const int64_t nb10 = src1->nb[0];
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     size_t x_size;
     size_t d_size;

-    cl_mem d_X = ggml_cl_pool_malloc(
+    cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
     cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
-    cl_mem d_D = ggml_cl_pool_malloc(
+    cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst


     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
-            const int i0 = i03*ne02 + i02;
-
             cl_event ev;

             // copy src0 to device
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X,
-
-            if (nb10 == sizeof(float)) {
-                // Contiguous, avoid overhead from queueing many kernel runs
-                const int64_t i13 = i03%ne13;
-                const int64_t i12 = i02%ne12;
-                const int i1 = i13*ne12*ne11 + i12*ne11;
-
-                cl_int x_offset = 0;
-                cl_int y_offset = i1*ne10;
-                cl_int d_offset = 0;
-
-                size_t global = ne00 * ne01;
-                cl_int ky = ne10;
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
-                CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
-            } else {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const int64_t i13 = i03%ne13;
-                    const int64_t i12 = i02%ne12;
-                    const int64_t i11 = i01%ne11;
-                    const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
-
-                    cl_int x_offset = i01*ne00;
-                    cl_int y_offset = i1*ne10;
-                    cl_int d_offset = i01*ne00;
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));

-            [old lines 1454-1466 not rendered in the source view]
+            const int64_t i13 = i03%ne13;
+            const int64_t i12 = i02%ne12;
+            const int i1 = i13*ne12*ne11 + i12*ne11;
+
+            cl_int x_offset = 0;
+            cl_int y_offset = i1*ne10;
+            cl_int d_offset = 0;
+
+            size_t global = ne00 * ne01;
+            cl_int ky = ne10 * ne11;
+
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+            CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+            CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));

             CL_CHECK(clReleaseEvent(ev));
             CL_CHECK(clFinish(queue));
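The rewritten ggml_cl_mul_f32 drops the old contiguous/strided split and now always launches one mul_f32_cl kernel per (i02, i03) slab of src0, broadcasting src1 across the batch dimensions with a modulo mapping and a ky stride of ne10 * ne11. Below is a minimal host-side sketch of that index arithmetic only; the tensor shapes are made-up values for illustration, and just the i13/i12/i1 formulas mirror the hunk above.

```cpp
// Sketch: which src1 rows a given src0 slab (i02, i03) is multiplied against
// under the modulo broadcast used by the rewritten ggml_cl_mul_f32.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne02 = 4, ne03 = 3;     // src0 batch dims (assumed)
    const int64_t ne10 = 16, ne11 = 8;    // src1 row length / row count (assumed)
    const int64_t ne12 = 1, ne13 = 3;     // src1 batch dims (assumed); ne12 broadcasts

    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
            const int64_t i13 = i03 % ne13;                // as in the diff
            const int64_t i12 = i02 % ne12;                // as in the diff
            const int64_t i1  = i13*ne12*ne11 + i12*ne11;  // first src1 row for this slab
            printf("src0 slab (i02=%lld, i03=%lld) -> src1 y_offset %lld\n",
                   (long long) i02, (long long) i03, (long long) (i1*ne10));
        }
    }
    return 0;
}
```

Since d_X and d_D are now pooled per slab (ne00 * ne01 floats each), x_offset and d_offset stay 0 and only y_offset changes between kernel launches.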
@@ -1518,46 +1489,45 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

     size_t x_offset = 0;
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;
-
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
-
-            // copy data to device
-            if (src0->backend == GGML_BACKEND_GPU) {
-                x_offset = (i03 * ne02 + i02) * x_ne;
-            } else if (i02 != pi02 || i03 != pi03) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
-                pi02 = i02;
-                pi03 = i03;
-            }
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));

-    [old line 1540 not rendered in the source view]
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                if (src0->backend == GGML_BACKEND_GPU) {
+                    x_offset = (i03 * ne02 + i02) * x_ne;
+                } else {
+                    // copy src0 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                }

-    [old lines 1542-1544 not rendered in the source view]
-                    clblast::Transpose::kYes, clblast::Transpose::kNo,
-                    ne01, ne11, ne10,
-                    alpha,
-                    d_X, x_offset, ne00,
-                    d_Y, 0, ne10,
-                    beta,
-                    d_D, 0, ne01,
-                    &queue, &ev_sgemm);
-    [old line 1553 not rendered in the source view]
-            if (status != clblast::StatusCode::kSuccess) {
-                GGML_ASSERT(false);
-            }
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    // copy src1 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));

-    [old lines 1558-1560 not rendered in the source view]
+                    CL_CHECK(clFinish(queue));
+
+                    // compute
+                    cl_event ev_sgemm;
+                    clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
+                                                     clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                     ne01, ne11, ne10,
+                                                     alpha,
+                                                     d_X, x_offset, ne00,
+                                                     d_Y, 0, ne10,
+                                                     beta,
+                                                     d_D, 0, ne01,
+                                                     &queue, &ev_sgemm);
+
+                    if (status != clblast::StatusCode::kSuccess) {
+                        GGML_ASSERT(false);
+                    }
+
+                    // copy dst to host
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                }
+            }
         }
     }

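In the batched f32 matmul, the pi02/pi03 bookkeeping that avoided re-uploading src0 is gone; the loops are reordered so that each src0 slab is visited once in the outer loops and the src1 slabs that broadcast onto it are handled in the inner loop, each with its own GEMM and read-back. A small standalone sketch of the new iteration order, assuming r2 = ne12 / ne02 and r3 = ne13 / ne03 (those definitions are not shown in this hunk, but the loop bounds imply them):

```cpp
// Sketch: the loop nesting introduced by the hunk above. Outer loops walk src0
// slabs once; inner loops walk the r3*r2 src1 slabs that broadcast onto each.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne02 = 2, ne03 = 2;   // src0 batch dims (assumed)
    const int64_t ne12 = 4, ne13 = 2;   // src1 batch dims (assumed)
    const int64_t r2 = ne12 / ne02;     // broadcast ratio along dim 2
    const int64_t r3 = ne13 / ne03;     // broadcast ratio along dim 3

    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i13 = i03*r3, e13 = i13 + r3; i13 < e13; i13++) {
            for (int64_t i02 = 0; i02 < ne02; i02++) {
                // the real code uploads (or offsets into) src0 slab (i02, i03) here ...
                for (int64_t i12 = i02*r2, e12 = i12 + r2; i12 < e12; i12++) {
                    // ... and runs one GEMM against src1 slab (i12, i13)
                    printf("GEMM: src0(%lld,%lld) x src1(%lld,%lld)\n",
                           (long long) i02, (long long) i03,
                           (long long) i12, (long long) i13);
                }
            }
        }
    }
    return 0;
}
```

The TODO comments in the diff note that when r3 > 1 the src0 copy could be hoisted one level further out; as written it is still repeated once per i13.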
@@ -1568,7 +1538,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     ggml_cl_pool_free(d_D, d_size);
 }

-static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t
+static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
     GGML_ASSERT(fp16_support);

     const int64_t ne00 = src0->ne[0];
@@ -1598,6 +1568,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     const int y_ne = ne11 * ne10;
     const int d_ne = ne11 * ne01;

+    GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne);
+    GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne);
+    ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata;
+
     size_t x_size;
     size_t y_size;
     size_t d_size;
@@ -1614,74 +1588,70 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);

     size_t x_offset = 0;
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;

-    [old lines 1623-1631 not rendered in the source view]
-                pi03 = i03;
-            }
-
-            // convert src1 to fp16
-            // TODO: use multiple threads
-            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
-            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
-            if (src1_cont_rows) {
-                if (src1_cont_cols) {
-                    ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                if (src0->backend == GGML_BACKEND_GPU) {
+                    x_offset = (i03 * ne02 + i02) * x_ne;
+                } else {
+                    // copy src0 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                 }
-    [old lines 1643-1645 not rendered in the source view]
+
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    // convert src1 to fp16
+                    // TODO: use multiple threads
+                    char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
+                    if (src1_cont_rows) {
+                        if (src1_cont_cols) {
+                            ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+                        }
+                        else {
+                            for (int64_t i11 = 0; i11 < ne11; i11++) {
+                                ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
+                            }
+                        }
                     }
-    [old lines 1647-1653 not rendered in the source view]
+                    else {
+                        for (int64_t i11 = 0; i11 < ne11; i11++) {
+                            for (int64_t i10 = 0; i10 < ne10; i10++) {
+                                // very slow due to no inlining
+                                tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
+                            }
+                        }
                     }
-            }
-        }

-    [old lines 1658-1659 not rendered in the source view]
+                    // copy src1 to device
+                    CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));

-    [old line 1661 not rendered in the source view]
+                    CL_CHECK(clFinish(queue));

-    [old lines 1663-1677 not rendered in the source view]
+                    // compute
+                    cl_event ev_sgemm;
+                    clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
+                                                     clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                     ne01, ne11, ne10,
+                                                     alpha,
+                                                     d_X, x_offset, ne00,
+                                                     d_Y, 0, ne10,
+                                                     beta,
+                                                     d_D, 0, ne01,
+                                                     &queue, &ev_sgemm);
+
+                    if (status != clblast::StatusCode::kSuccess) {
+                        GGML_ASSERT(false);
+                    }

-    [old lines 1679-1680 not rendered in the source view]
+                    // copy dst to host, then convert to float
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));

-    [old line 1682 not rendered in the source view]
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);

-    [old line 1684 not rendered in the source view]
+                    ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                }
+            }
         }
     }

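In the f16 path, tmp now points at the start of the caller-provided wdata buffer: each src1 slab is converted into it, written to the device, multiplied, and the fp16 result is read back into the same buffer before being expanded to float with ggml_fp16_to_fp32_row. The conversion still picks one of three strategies from src1's strides; here is a self-contained sketch of that selection, with trivial stand-ins for ggml_fp16_t and its helpers (the real ones perform an actual fp32 -> fp16 encode):

```cpp
// Sketch of the three src1 -> fp16 conversion paths chosen in the hunk above.
#include <cstdint>
#include <cstdio>
#include <vector>

using fp16_t = uint16_t;                                // stand-in for ggml_fp16_t
static fp16_t to_fp16(float x) { return (fp16_t) x; }   // stand-in encoder
static void to_fp16_row(const float * x, fp16_t * y, int64_t n) {
    for (int64_t i = 0; i < n; i++) y[i] = to_fp16(x[i]);
}

// src1i: first byte of the current src1 slab; nb10/nb11: element/row strides in bytes
static void convert_slab(const char * src1i, fp16_t * tmp,
                         int64_t ne10, int64_t ne11, size_t nb10, size_t nb11,
                         bool cont_rows, bool cont_cols) {
    if (cont_rows && cont_cols) {
        to_fp16_row((const float *) src1i, tmp, ne10*ne11);          // one bulk call
    } else if (cont_rows) {
        for (int64_t i11 = 0; i11 < ne11; i11++) {                   // one call per row
            to_fp16_row((const float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
        }
    } else {
        for (int64_t i11 = 0; i11 < ne11; i11++) {                   // element by element
            for (int64_t i10 = 0; i10 < ne10; i10++) {
                tmp[i11*ne10 + i10] = to_fp16(*(const float *) (src1i + i11*nb11 + i10*nb10));
            }
        }
    }
}

int main() {
    const int64_t ne10 = 4, ne11 = 2;
    std::vector<float>  src1(ne10*ne11, 1.0f);
    std::vector<fp16_t> tmp(ne10*ne11);
    convert_slab((const char *) src1.data(), tmp.data(),
                 ne10, ne11, sizeof(float), ne10*sizeof(float),
                 /*cont_rows=*/true, /*cont_cols=*/true);
    printf("converted %lld values\n", (long long) (ne10*ne11));
    return 0;
}
```

Splitting on contiguity lets the common fully contiguous case collapse into a single bulk conversion call, while the strided fallback matches the "very slow due to no inlining" branch in the hunk.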
@@ -1744,85 +1714,81 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t ev_idx = 0;
     std::vector<cl_event> events;

-    int64_t
-    [old lines 1748-1752 not rendered in the source view]
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
-
-            // copy src0 to device if necessary
-            if (src0->backend == GGML_BACKEND_CPU) {
-                if (i02 != pi02 || i03 != pi03) {
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy and dequantize src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                // copy src0 to device if necessary
+                if (src0->backend == GGML_BACKEND_CPU) {
                     events.emplace_back();
                     CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-    [old lines 1761-1762 not rendered in the source view]
-                }
-            } else if (src0->backend == GGML_BACKEND_GPU) {
-                d_Q = (cl_mem) src0->extra;
-            } else {
-                GGML_ASSERT(false);
-            }
-            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
-                // copy src1 to device
-                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
-
-                // compute
-                const size_t global = ne01 * local;
-                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                const cl_int ncols = ne00;
-                events.emplace_back();
-                CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
-                CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
-                CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
-                CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
-                CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
-            } else { // general dequantization kernel + CLBlast matrix matrix multiplication
-                // convert src0 to fp32 on device
-                const size_t global = x_ne / global_denom;
-                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
-                CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
-
-                // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
-
-                events.emplace_back();
-
-                // wait for conversion
-                CL_CHECK(clFinish(queue));
-
-                // compute
-                clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                                                 clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                 ne01, ne11, ne10,
-                                                 alpha,
-                                                 d_X, 0, ne00,
-                                                 d_Y, 0, ne10,
-                                                 beta,
-                                                 d_D, 0, ne01,
-                                                 &queue, events.data() + ev_idx++);
-
-                if (status != clblast::StatusCode::kSuccess) {
+                } else if (src0->backend == GGML_BACKEND_GPU) {
+                    d_Q = (cl_mem) src0->extra;
+                } else {
                     GGML_ASSERT(false);
                 }
-            }

-    [old lines 1817-1822 not rendered in the source view]
+                if (!mul_mat_vec) {
+                    // convert src0 to fp32 on device
+                    const size_t global = x_ne / global_denom;
+                    const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
+                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
+                    CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
+                }

-    [old lines 1824-1825 not rendered in the source view]
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
+                        // copy src1 to device
+                        events.emplace_back();
+                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
+
+                        // compute
+                        const size_t global = ne01 * local;
+                        const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                        const cl_int ncols = ne00;
+                        events.emplace_back();
+                        CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
+                        CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
+                        CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
+                        CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
+                        CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
+                        CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+                    } else { // CLBlast matrix matrix multiplication
+                        // copy src1 to device
+                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+
+                        // wait for conversion
+                        CL_CHECK(clFinish(queue));
+
+                        // compute
+                        events.emplace_back();
+                        clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
+                                                         clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                         ne01, ne11, ne10,
+                                                         alpha,
+                                                         d_X, 0, ne00,
+                                                         d_Y, 0, ne10,
+                                                         beta,
+                                                         d_D, 0, ne01,
+                                                         &queue, events.data() + ev_idx++);
+
+                        if (status != clblast::StatusCode::kSuccess) {
+                            GGML_ASSERT(false);
+                        }
+                    }
+
+                    // copy dst to host
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+                    for (auto *event : events) {
+                        clReleaseEvent(event);
+                    }
+
+                    ev_idx = 0;
+                    events.clear();
+                }
+            }
         }
     }

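The quantized path keeps its two strategies (the dequantize_mul_mat_vec kernel, or dequantize-to-fp32 followed by a CLBlast GEMM), but the dequantization now happens once per src0 slab before the inner src1 loop, and GPU-resident src0 continues to be addressed by a byte offset of (i03 * ne02 + i02) * x_bps. A sketch of that offset arithmetic; x_bps is presumably the byte size of one src0 slab (defined elsewhere in the file), and the block/type sizes below are illustrative assumptions only:

```cpp
// Sketch: byte offsets used to address individual quantized src0 slabs that
// already live in a single GPU buffer, as in the hunk above.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne00 = 4096, ne01 = 4096;          // slab shape (assumed)
    const int64_t ne02 = 2,    ne03 = 2;             // batch dims (assumed)
    const size_t  block_size = 32, type_size = 18;   // a Q4-style block layout (assumed)
    const size_t  x_bps = (size_t)(ne00/block_size*ne01) * type_size; // bytes per slab

    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
            const size_t offset = (size_t)(i03*ne02 + i02) * x_bps;
            printf("slab (i02=%lld, i03=%lld) starts at byte %zu\n",
                   (long long) i02, (long long) i03, offset);
        }
    }
    return 0;
}
```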
@@ -1897,8 +1863,8 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor *
 }

 size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
-        return
+    if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
+        return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]);
     }
     return 0;
 }
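ggml_cl_mul_mat_get_wsize now hands out a scratch buffer only when src0 is F16 and the f16 path will be used, sized to the larger of the fp16 copy of a src1 slab and the fp16 copy of a dst slab; these are exactly the two bounds the new GGML_ASSERTs in ggml_cl_mul_mat_f16 check. A compact sketch of that sizing with simplified stand-in types (not the gem's actual API):

```cpp
// Sketch: the f16 workspace is reused for the fp16 src1 slab (ne10*ne11 values)
// and the fp16 dst slab (ne0*ne1 values), so it must hold the larger of the two.
#include <algorithm>
#include <cstdint>
#include <cstdio>

struct dims { int64_t ne[4]; };   // only the extents matter for the sizing

static size_t f16_mul_mat_wsize(const dims & src1, const dims & dst) {
    const size_t y_ne = (size_t) (src1.ne[0] * src1.ne[1]); // fp16 copy of one src1 slab
    const size_t d_ne = (size_t) (dst.ne[0]  * dst.ne[1]);  // fp16 copy of one dst slab
    return sizeof(uint16_t) * std::max(y_ne, d_ne);         // uint16_t stands in for ggml_fp16_t
}

int main() {
    const dims src1 = {{4096, 8, 1, 1}};   // illustrative shapes
    const dims dst  = {{4096, 8, 1, 1}};
    printf("wsize = %zu bytes\n", f16_mul_mat_wsize(src1, dst));
    return 0;
}
```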