llama_cpp 0.12.7 → 0.13.0

This diff shows the changes between the publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
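The hunks below track an upstream rename in the bundled ggml OpenCL sources: the backend enum values GGML_BACKEND_CPU and GGML_BACKEND_GPU become GGML_BACKEND_TYPE_CPU and GGML_BACKEND_TYPE_GPU. Within the hunks shown this is a mechanical identifier rename with no change in logic. As a minimal sketch of what the rename means for code that inspects a tensor's backend field (the helper name below is hypothetical and not part of the package):

    #include "ggml.h"

    // True when the tensor's data has been offloaded to an OpenCL device
    // buffer (its extra pointer then holds a cl_mem, as the diff shows).
    static bool tensor_is_offloaded(const struct ggml_tensor * tensor) {
        // 0.12.7 spelling: tensor->backend == GGML_BACKEND_GPU
        return tensor->backend == GGML_BACKEND_TYPE_GPU;   // 0.13.0 spelling
    }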
@@ -1354,7 +1354,7 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
  }

  void ggml_cl_free_data(const struct ggml_tensor* tensor) {
- if (tensor->backend != GGML_BACKEND_GPU) {
+ if (tensor->backend != GGML_BACKEND_TYPE_GPU) {
  return;
  }

@@ -1412,7 +1412,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
  }

  static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+ GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
  const int64_t ne00 = src0->ne[0];
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];
@@ -1476,7 +1476,7 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
  }

  static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+ GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
  const int64_t ne00 = src0->ne[0];
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];
@@ -1566,13 +1566,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  size_t y_size;
  size_t d_size;
  cl_mem d_X;
- if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+ if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
  d_X = (cl_mem) src0->extra;
  } else {
  d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
  }
- cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
- cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
+ cl_mem d_Y = src1->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+ cl_mem d_D = dst->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

  size_t x_offset = 0;

@@ -1580,7 +1580,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  // TODO: copy src0 here when r3>1
  for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
- if (src0->backend == GGML_BACKEND_GPU) {
+ if (src0->backend == GGML_BACKEND_TYPE_GPU) {
  x_offset = (i03 * ne02 + i02) * x_ne;
  } else {
  // copy src0 to device
@@ -1589,7 +1589,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr

  for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
  // copy src1 to device
- if (src1->backend == GGML_BACKEND_CPU) {
+ if (src1->backend == GGML_BACKEND_TYPE_CPU) {
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
  }

@@ -1612,7 +1612,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  }

  // copy dst to host
- if (dst->backend == GGML_BACKEND_CPU) {
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
  float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
  CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
  }
@@ -1621,13 +1621,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  }
  }

- if (src0->backend != GGML_BACKEND_GPU) {
+ if (src0->backend != GGML_BACKEND_TYPE_GPU) {
  ggml_cl_pool_free(d_X, x_size);
  }
- if (src1->backend != GGML_BACKEND_GPU) {
+ if (src1->backend != GGML_BACKEND_TYPE_GPU) {
  ggml_cl_pool_free(d_Y, y_size);
  }
- if (dst->backend != GGML_BACKEND_GPU) {
+ if (dst->backend != GGML_BACKEND_TYPE_GPU) {
  ggml_cl_pool_free(d_D, d_size);
  }
  }
@@ -1670,7 +1670,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  size_t y_size;
  size_t d_size;
  cl_mem d_X;
- if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+ if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
  d_X = (cl_mem) src0->extra;
  } else {
  d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
@@ -1687,7 +1687,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  // TODO: copy src0 here when r3>1
  for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
- if (src0->backend == GGML_BACKEND_GPU) {
+ if (src0->backend == GGML_BACKEND_TYPE_GPU) {
  x_offset = (i03 * ne02 + i02) * x_ne;
  } else {
  // copy src0 to device
@@ -1741,7 +1741,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  }

  // copy dst to host, then convert to float
- if (dst->backend == GGML_BACKEND_CPU) {
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
  CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
  float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
  ggml_fp16_to_fp32_row(tmp, d, d_ne);
@@ -1753,7 +1753,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  }
  }

- if (src0->backend != GGML_BACKEND_GPU) {
+ if (src0->backend != GGML_BACKEND_TYPE_GPU) {
  ggml_cl_pool_free(d_X, x_size);
  }
  ggml_cl_pool_free(d_Y, y_size);
@@ -1798,7 +1798,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
  cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
  cl_mem d_Q;
- if (src0->backend == GGML_BACKEND_CPU) {
+ if (src0->backend == GGML_BACKEND_TYPE_CPU) {
  d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
  }

@@ -1817,10 +1817,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
  // copy src0 to device if necessary
- if (src0->backend == GGML_BACKEND_CPU) {
+ if (src0->backend == GGML_BACKEND_TYPE_CPU) {
  events.emplace_back();
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
- } else if (src0->backend == GGML_BACKEND_GPU) {
+ } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
  d_Q = (cl_mem) src0->extra;
  } else {
  GGML_ASSERT(false);
@@ -1829,7 +1829,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  if (!mul_mat_vec) {
  // convert src0 to fp32 on device
  const size_t global = x_ne / global_denom;
- const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+ const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
  CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
  CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
  CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
@@ -1843,7 +1843,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *

  // compute
  const size_t global = ne01 * local;
- const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+ const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
  const cl_int ncols = ne00;
  events.emplace_back();
  CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1895,7 +1895,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  }
  ggml_cl_pool_free(d_Y, y_size);
  ggml_cl_pool_free(d_D, d_size);
- if (src0->backend == GGML_BACKEND_CPU) {
+ if (src0->backend == GGML_BACKEND_TYPE_CPU) {
  ggml_cl_pool_free(d_Q, q_size);
  }
  }
@@ -1911,7 +1911,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
  if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
  src1->type == GGML_TYPE_F32 &&
  dst->type == GGML_TYPE_F32 &&
- ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
+ ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU)) {
  return true;
  }

@@ -1993,7 +1993,7 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
  CL_CHECK(clFinish(queue));

  tensor->extra = dst;
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
  }

  // ggml-backend
@@ -2045,7 +2045,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
  ctx->sub_buffers.push_back(sub_buffer);
  tensor->extra = sub_buffer;
  }
- tensor->backend = GGML_BACKEND_GPU;
+ tensor->backend = GGML_BACKEND_TYPE_GPU;
  }

  static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {