llama_cpp 0.7.1 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +122 -183
- data/ext/llama_cpp/src/ggml-cuda.cu +188 -20
- data/ext/llama_cpp/src/ggml-metal.m +57 -8
- data/ext/llama_cpp/src/ggml-metal.metal +171 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +188 -222
- data/ext/llama_cpp/src/ggml.c +375 -93
- data/ext/llama_cpp/src/ggml.h +11 -9
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/llama.cpp +459 -153
- data/ext/llama_cpp/src/llama.h +34 -33
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +15 -16
- metadata +3 -3
@@ -1395,75 +1395,46 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
|
|
1395
1395
|
const int64_t ne01 = src0->ne[1];
|
1396
1396
|
const int64_t ne02 = src0->ne[2];
|
1397
1397
|
const int64_t ne03 = src0->ne[3];
|
1398
|
-
const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
|
1399
1398
|
const int64_t ne10 = src1->ne[0];
|
1400
1399
|
const int64_t ne11 = src1->ne[1];
|
1401
1400
|
const int64_t ne12 = src1->ne[2];
|
1402
1401
|
const int64_t ne13 = src1->ne[3];
|
1403
|
-
const int64_t nb10 = src1->nb[0];
|
1404
1402
|
const int nb2 = dst->nb[2];
|
1405
1403
|
const int nb3 = dst->nb[3];
|
1406
1404
|
size_t x_size;
|
1407
1405
|
size_t d_size;
|
1408
1406
|
|
1409
|
-
cl_mem d_X = ggml_cl_pool_malloc(
|
1407
|
+
cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
|
1410
1408
|
cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
|
1411
|
-
cl_mem d_D = ggml_cl_pool_malloc(
|
1409
|
+
cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
|
1412
1410
|
|
1413
1411
|
|
1414
1412
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
1415
1413
|
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
1416
|
-
const int i0 = i03*ne02 + i02;
|
1417
|
-
|
1418
1414
|
cl_event ev;
|
1419
1415
|
|
1420
1416
|
// copy src0 to device
|
1421
|
-
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X,
|
1422
|
-
|
1423
|
-
if (nb10 == sizeof(float)) {
|
1424
|
-
// Contiguous, avoid overhead from queueing many kernel runs
|
1425
|
-
const int64_t i13 = i03%ne13;
|
1426
|
-
const int64_t i12 = i02%ne12;
|
1427
|
-
const int i1 = i13*ne12*ne11 + i12*ne11;
|
1428
|
-
|
1429
|
-
cl_int x_offset = 0;
|
1430
|
-
cl_int y_offset = i1*ne10;
|
1431
|
-
cl_int d_offset = 0;
|
1432
|
-
|
1433
|
-
size_t global = ne00 * ne01;
|
1434
|
-
cl_int ky = ne10;
|
1435
|
-
CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
|
1436
|
-
CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
|
1437
|
-
CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
|
1438
|
-
CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
|
1439
|
-
CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
|
1440
|
-
CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
|
1441
|
-
CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
|
1442
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
|
1443
|
-
} else {
|
1444
|
-
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
1445
|
-
const int64_t i13 = i03%ne13;
|
1446
|
-
const int64_t i12 = i02%ne12;
|
1447
|
-
const int64_t i11 = i01%ne11;
|
1448
|
-
const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
|
1449
|
-
|
1450
|
-
cl_int x_offset = i01*ne00;
|
1451
|
-
cl_int y_offset = i1*ne10;
|
1452
|
-
cl_int d_offset = i01*ne00;
|
1417
|
+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
|
1453
1418
|
|
1454
|
-
|
1455
|
-
|
1456
|
-
|
1457
|
-
|
1458
|
-
|
1459
|
-
|
1460
|
-
|
1461
|
-
|
1462
|
-
|
1463
|
-
|
1464
|
-
|
1465
|
-
|
1466
|
-
|
1419
|
+
const int64_t i13 = i03%ne13;
|
1420
|
+
const int64_t i12 = i02%ne12;
|
1421
|
+
const int i1 = i13*ne12*ne11 + i12*ne11;
|
1422
|
+
|
1423
|
+
cl_int x_offset = 0;
|
1424
|
+
cl_int y_offset = i1*ne10;
|
1425
|
+
cl_int d_offset = 0;
|
1426
|
+
|
1427
|
+
size_t global = ne00 * ne01;
|
1428
|
+
cl_int ky = ne10 * ne11;
|
1429
|
+
|
1430
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
|
1431
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
|
1432
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
|
1433
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
|
1434
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
|
1435
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
|
1436
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
|
1437
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
|
1467
1438
|
|
1468
1439
|
CL_CHECK(clReleaseEvent(ev));
|
1469
1440
|
CL_CHECK(clFinish(queue));
|
@@ -1518,46 +1489,45 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|
1518
1489
|
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
1519
1490
|
|
1520
1491
|
size_t x_offset = 0;
|
1521
|
-
int64_t pi02 = -1;
|
1522
|
-
int64_t pi03 = -1;
|
1523
|
-
|
1524
|
-
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
1525
|
-
int64_t i03 = i13 / r3;
|
1526
|
-
|
1527
|
-
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
1528
|
-
int64_t i02 = i12 / r2;
|
1529
|
-
|
1530
|
-
// copy data to device
|
1531
|
-
if (src0->backend == GGML_BACKEND_GPU) {
|
1532
|
-
x_offset = (i03 * ne02 + i02) * x_ne;
|
1533
|
-
} else if (i02 != pi02 || i03 != pi03) {
|
1534
|
-
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
1535
|
-
pi02 = i02;
|
1536
|
-
pi03 = i03;
|
1537
|
-
}
|
1538
|
-
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
1539
1492
|
|
1540
|
-
|
1493
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
1494
|
+
// TODO: copy src0 here when r3>1
|
1495
|
+
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
|
1496
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
1497
|
+
if (src0->backend == GGML_BACKEND_GPU) {
|
1498
|
+
x_offset = (i03 * ne02 + i02) * x_ne;
|
1499
|
+
} else {
|
1500
|
+
// copy src0 to device
|
1501
|
+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
1502
|
+
}
|
1541
1503
|
|
1542
|
-
|
1543
|
-
|
1544
|
-
|
1545
|
-
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
1546
|
-
ne01, ne11, ne10,
|
1547
|
-
alpha,
|
1548
|
-
d_X, x_offset, ne00,
|
1549
|
-
d_Y, 0, ne10,
|
1550
|
-
beta,
|
1551
|
-
d_D, 0, ne01,
|
1552
|
-
&queue, &ev_sgemm);
|
1553
|
-
|
1554
|
-
if (status != clblast::StatusCode::kSuccess) {
|
1555
|
-
GGML_ASSERT(false);
|
1556
|
-
}
|
1504
|
+
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
|
1505
|
+
// copy src1 to device
|
1506
|
+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
1557
1507
|
|
1558
|
-
|
1559
|
-
|
1560
|
-
|
1508
|
+
CL_CHECK(clFinish(queue));
|
1509
|
+
|
1510
|
+
// compute
|
1511
|
+
cl_event ev_sgemm;
|
1512
|
+
clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
|
1513
|
+
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
1514
|
+
ne01, ne11, ne10,
|
1515
|
+
alpha,
|
1516
|
+
d_X, x_offset, ne00,
|
1517
|
+
d_Y, 0, ne10,
|
1518
|
+
beta,
|
1519
|
+
d_D, 0, ne01,
|
1520
|
+
&queue, &ev_sgemm);
|
1521
|
+
|
1522
|
+
if (status != clblast::StatusCode::kSuccess) {
|
1523
|
+
GGML_ASSERT(false);
|
1524
|
+
}
|
1525
|
+
|
1526
|
+
// copy dst to host
|
1527
|
+
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
1528
|
+
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
|
1529
|
+
}
|
1530
|
+
}
|
1561
1531
|
}
|
1562
1532
|
}
|
1563
1533
|
|
@@ -1568,7 +1538,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|
1568
1538
|
ggml_cl_pool_free(d_D, d_size);
|
1569
1539
|
}
|
1570
1540
|
|
1571
|
-
static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t
|
1541
|
+
static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
|
1572
1542
|
GGML_ASSERT(fp16_support);
|
1573
1543
|
|
1574
1544
|
const int64_t ne00 = src0->ne[0];
|
@@ -1598,6 +1568,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|
1598
1568
|
const int y_ne = ne11 * ne10;
|
1599
1569
|
const int d_ne = ne11 * ne01;
|
1600
1570
|
|
1571
|
+
GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne);
|
1572
|
+
GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne);
|
1573
|
+
ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata;
|
1574
|
+
|
1601
1575
|
size_t x_size;
|
1602
1576
|
size_t y_size;
|
1603
1577
|
size_t d_size;
|
@@ -1614,74 +1588,70 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|
1614
1588
|
bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
|
1615
1589
|
|
1616
1590
|
size_t x_offset = 0;
|
1617
|
-
int64_t pi02 = -1;
|
1618
|
-
int64_t pi03 = -1;
|
1619
|
-
|
1620
|
-
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
1621
|
-
int64_t i03 = i13 / r3;
|
1622
1591
|
|
1623
|
-
|
1624
|
-
|
1625
|
-
|
1626
|
-
|
1627
|
-
|
1628
|
-
|
1629
|
-
|
1630
|
-
|
1631
|
-
|
1632
|
-
pi03 = i03;
|
1633
|
-
}
|
1634
|
-
|
1635
|
-
// convert src1 to fp16
|
1636
|
-
// TODO: use multiple threads
|
1637
|
-
ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
|
1638
|
-
char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
|
1639
|
-
if (src1_cont_rows) {
|
1640
|
-
if (src1_cont_cols) {
|
1641
|
-
ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
|
1592
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
1593
|
+
// TODO: copy src0 here when r3>1
|
1594
|
+
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
|
1595
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
1596
|
+
if (src0->backend == GGML_BACKEND_GPU) {
|
1597
|
+
x_offset = (i03 * ne02 + i02) * x_ne;
|
1598
|
+
} else {
|
1599
|
+
// copy src0 to device
|
1600
|
+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
1642
1601
|
}
|
1643
|
-
|
1644
|
-
|
1645
|
-
|
1602
|
+
|
1603
|
+
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
|
1604
|
+
// convert src1 to fp16
|
1605
|
+
// TODO: use multiple threads
|
1606
|
+
char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
|
1607
|
+
if (src1_cont_rows) {
|
1608
|
+
if (src1_cont_cols) {
|
1609
|
+
ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
|
1610
|
+
}
|
1611
|
+
else {
|
1612
|
+
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
1613
|
+
ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
|
1614
|
+
}
|
1615
|
+
}
|
1646
1616
|
}
|
1647
|
-
|
1648
|
-
|
1649
|
-
|
1650
|
-
|
1651
|
-
|
1652
|
-
|
1653
|
-
|
1617
|
+
else {
|
1618
|
+
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
1619
|
+
for (int64_t i10 = 0; i10 < ne10; i10++) {
|
1620
|
+
// very slow due to no inlining
|
1621
|
+
tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
|
1622
|
+
}
|
1623
|
+
}
|
1654
1624
|
}
|
1655
|
-
}
|
1656
|
-
}
|
1657
1625
|
|
1658
|
-
|
1659
|
-
|
1626
|
+
// copy src1 to device
|
1627
|
+
CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
|
1660
1628
|
|
1661
|
-
|
1629
|
+
CL_CHECK(clFinish(queue));
|
1662
1630
|
|
1663
|
-
|
1664
|
-
|
1665
|
-
|
1666
|
-
|
1667
|
-
|
1668
|
-
|
1669
|
-
|
1670
|
-
|
1671
|
-
|
1672
|
-
|
1673
|
-
|
1674
|
-
|
1675
|
-
|
1676
|
-
|
1677
|
-
|
1631
|
+
// compute
|
1632
|
+
cl_event ev_sgemm;
|
1633
|
+
clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
|
1634
|
+
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
1635
|
+
ne01, ne11, ne10,
|
1636
|
+
alpha,
|
1637
|
+
d_X, x_offset, ne00,
|
1638
|
+
d_Y, 0, ne10,
|
1639
|
+
beta,
|
1640
|
+
d_D, 0, ne01,
|
1641
|
+
&queue, &ev_sgemm);
|
1642
|
+
|
1643
|
+
if (status != clblast::StatusCode::kSuccess) {
|
1644
|
+
GGML_ASSERT(false);
|
1645
|
+
}
|
1678
1646
|
|
1679
|
-
|
1680
|
-
|
1647
|
+
// copy dst to host, then convert to float
|
1648
|
+
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
|
1681
1649
|
|
1682
|
-
|
1650
|
+
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
1683
1651
|
|
1684
|
-
|
1652
|
+
ggml_fp16_to_fp32_row(tmp, d, d_ne);
|
1653
|
+
}
|
1654
|
+
}
|
1685
1655
|
}
|
1686
1656
|
}
|
1687
1657
|
|
@@ -1744,85 +1714,81 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|
1744
1714
|
size_t ev_idx = 0;
|
1745
1715
|
std::vector<cl_event> events;
|
1746
1716
|
|
1747
|
-
int64_t
|
1748
|
-
|
1749
|
-
|
1750
|
-
|
1751
|
-
|
1752
|
-
|
1753
|
-
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
1754
|
-
int64_t i02 = i12 / r2;
|
1755
|
-
|
1756
|
-
// copy src0 to device if necessary
|
1757
|
-
if (src0->backend == GGML_BACKEND_CPU) {
|
1758
|
-
if (i02 != pi02 || i03 != pi03) {
|
1717
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
1718
|
+
// TODO: copy and dequantize src0 here when r3>1
|
1719
|
+
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
|
1720
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
1721
|
+
// copy src0 to device if necessary
|
1722
|
+
if (src0->backend == GGML_BACKEND_CPU) {
|
1759
1723
|
events.emplace_back();
|
1760
1724
|
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
|
1761
|
-
|
1762
|
-
|
1763
|
-
}
|
1764
|
-
} else if (src0->backend == GGML_BACKEND_GPU) {
|
1765
|
-
d_Q = (cl_mem) src0->extra;
|
1766
|
-
} else {
|
1767
|
-
GGML_ASSERT(false);
|
1768
|
-
}
|
1769
|
-
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
|
1770
|
-
// copy src1 to device
|
1771
|
-
events.emplace_back();
|
1772
|
-
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
|
1773
|
-
|
1774
|
-
// compute
|
1775
|
-
const size_t global = ne01 * local;
|
1776
|
-
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
1777
|
-
const cl_int ncols = ne00;
|
1778
|
-
events.emplace_back();
|
1779
|
-
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
|
1780
|
-
CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
|
1781
|
-
CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
|
1782
|
-
CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
|
1783
|
-
CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
|
1784
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
|
1785
|
-
} else { // general dequantization kernel + CLBlast matrix matrix multiplication
|
1786
|
-
// convert src0 to fp32 on device
|
1787
|
-
const size_t global = x_ne / global_denom;
|
1788
|
-
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
1789
|
-
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
|
1790
|
-
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
|
1791
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
1792
|
-
|
1793
|
-
// copy src1 to device
|
1794
|
-
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
1795
|
-
|
1796
|
-
events.emplace_back();
|
1797
|
-
|
1798
|
-
// wait for conversion
|
1799
|
-
CL_CHECK(clFinish(queue));
|
1800
|
-
|
1801
|
-
// compute
|
1802
|
-
clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
|
1803
|
-
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
1804
|
-
ne01, ne11, ne10,
|
1805
|
-
alpha,
|
1806
|
-
d_X, 0, ne00,
|
1807
|
-
d_Y, 0, ne10,
|
1808
|
-
beta,
|
1809
|
-
d_D, 0, ne01,
|
1810
|
-
&queue, events.data() + ev_idx++);
|
1811
|
-
|
1812
|
-
if (status != clblast::StatusCode::kSuccess) {
|
1725
|
+
} else if (src0->backend == GGML_BACKEND_GPU) {
|
1726
|
+
d_Q = (cl_mem) src0->extra;
|
1727
|
+
} else {
|
1813
1728
|
GGML_ASSERT(false);
|
1814
1729
|
}
|
1815
|
-
}
|
1816
1730
|
|
1817
|
-
|
1818
|
-
|
1819
|
-
|
1820
|
-
|
1821
|
-
|
1822
|
-
|
1731
|
+
if (!mul_mat_vec) {
|
1732
|
+
// convert src0 to fp32 on device
|
1733
|
+
const size_t global = x_ne / global_denom;
|
1734
|
+
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
1735
|
+
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
|
1736
|
+
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
|
1737
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
1738
|
+
}
|
1823
1739
|
|
1824
|
-
|
1825
|
-
|
1740
|
+
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
|
1741
|
+
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
|
1742
|
+
// copy src1 to device
|
1743
|
+
events.emplace_back();
|
1744
|
+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
|
1745
|
+
|
1746
|
+
// compute
|
1747
|
+
const size_t global = ne01 * local;
|
1748
|
+
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
1749
|
+
const cl_int ncols = ne00;
|
1750
|
+
events.emplace_back();
|
1751
|
+
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
|
1752
|
+
CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
|
1753
|
+
CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
|
1754
|
+
CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
|
1755
|
+
CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
|
1756
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
|
1757
|
+
} else { // CLBlast matrix matrix multiplication
|
1758
|
+
// copy src1 to device
|
1759
|
+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
1760
|
+
|
1761
|
+
// wait for conversion
|
1762
|
+
CL_CHECK(clFinish(queue));
|
1763
|
+
|
1764
|
+
// compute
|
1765
|
+
events.emplace_back();
|
1766
|
+
clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
|
1767
|
+
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
1768
|
+
ne01, ne11, ne10,
|
1769
|
+
alpha,
|
1770
|
+
d_X, 0, ne00,
|
1771
|
+
d_Y, 0, ne10,
|
1772
|
+
beta,
|
1773
|
+
d_D, 0, ne01,
|
1774
|
+
&queue, events.data() + ev_idx++);
|
1775
|
+
|
1776
|
+
if (status != clblast::StatusCode::kSuccess) {
|
1777
|
+
GGML_ASSERT(false);
|
1778
|
+
}
|
1779
|
+
}
|
1780
|
+
|
1781
|
+
// copy dst to host
|
1782
|
+
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
1783
|
+
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
|
1784
|
+
for (auto *event : events) {
|
1785
|
+
clReleaseEvent(event);
|
1786
|
+
}
|
1787
|
+
|
1788
|
+
ev_idx = 0;
|
1789
|
+
events.clear();
|
1790
|
+
}
|
1791
|
+
}
|
1826
1792
|
}
|
1827
1793
|
}
|
1828
1794
|
|
@@ -1897,8 +1863,8 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor *
|
|
1897
1863
|
}
|
1898
1864
|
|
1899
1865
|
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
1900
|
-
if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
|
1901
|
-
return
|
1866
|
+
if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
|
1867
|
+
return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]);
|
1902
1868
|
}
|
1903
1869
|
return 0;
|
1904
1870
|
}
|