llama_cpp 0.12.7 → 0.13.0
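Every hunk below is the same mechanical substitution: the bundled llama.cpp/ggml sources renamed the tensor-backend enum constants, so GGML_BACKEND_CPU and GGML_BACKEND_GPU become GGML_BACKEND_TYPE_CPU and GGML_BACKEND_TYPE_GPU. For orientation, a rough sketch of the renamed enum as it appears in ggml.h of this llama.cpp vintage follows; the exact numeric values are an assumption for illustration and are not taken from this diff.

// Sketch of the renamed backend enum (assumed values; only the names matter here).
enum ggml_backend_type {
    GGML_BACKEND_TYPE_CPU       = 0,
    GGML_BACKEND_TYPE_GPU       = 10,
    GGML_BACKEND_TYPE_GPU_SPLIT = 20,
};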

@@ -1354,7 +1354,7 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
 }

 void ggml_cl_free_data(const struct ggml_tensor* tensor) {
-    if (tensor->backend != GGML_BACKEND_GPU) {
+    if (tensor->backend != GGML_BACKEND_TYPE_GPU) {
         return;
     }

@@ -1412,7 +1412,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
 }

 static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
@@ -1476,7 +1476,7 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
 }

 static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
@@ -1566,13 +1566,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+    if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
         d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
-    cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
-    cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
+    cl_mem d_Y = src1->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = dst->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

     size_t x_offset = 0;

@@ -1580,7 +1580,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         // TODO: copy src0 here when r3>1
         for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
-                if (src0->backend == GGML_BACKEND_GPU) {
+                if (src0->backend == GGML_BACKEND_TYPE_GPU) {
                     x_offset = (i03 * ne02 + i02) * x_ne;
                 } else {
                     // copy src0 to device
@@ -1589,7 +1589,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr

                 for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                     // copy src1 to device
-                    if (src1->backend == GGML_BACKEND_CPU) {
+                    if (src1->backend == GGML_BACKEND_TYPE_CPU) {
                         CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
                     }

@@ -1612,7 +1612,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                     }

                     // copy dst to host
-                    if (dst->backend == GGML_BACKEND_CPU) {
+                    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
                         float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
                         CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
                     }
@@ -1621,13 +1621,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }

-    if (src0->backend != GGML_BACKEND_GPU) {
+    if (src0->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
-    if (src1->backend != GGML_BACKEND_GPU) {
+    if (src1->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_Y, y_size);
     }
-    if (dst->backend != GGML_BACKEND_GPU) {
+    if (dst->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_D, d_size);
     }
 }
@@ -1670,7 +1670,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+    if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
         d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
@@ -1687,7 +1687,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
         // TODO: copy src0 here when r3>1
         for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
-                if (src0->backend == GGML_BACKEND_GPU) {
+                if (src0->backend == GGML_BACKEND_TYPE_GPU) {
                     x_offset = (i03 * ne02 + i02) * x_ne;
                 } else {
                     // copy src0 to device
@@ -1741,7 +1741,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                     }

                     // copy dst to host, then convert to float
-                    if (dst->backend == GGML_BACKEND_CPU) {
+                    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
                         CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
                         float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
                         ggml_fp16_to_fp32_row(tmp, d, d_ne);
@@ -1753,7 +1753,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }

-    if (src0->backend != GGML_BACKEND_GPU) {
+    if (src0->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
     ggml_cl_pool_free(d_Y, y_size);
@@ -1798,7 +1798,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
     cl_mem d_Q;
-    if (src0->backend == GGML_BACKEND_CPU) {
+    if (src0->backend == GGML_BACKEND_TYPE_CPU) {
         d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
     }

@@ -1817,10 +1817,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
         for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
                 // copy src0 to device if necessary
-                if (src0->backend == GGML_BACKEND_CPU) {
+                if (src0->backend == GGML_BACKEND_TYPE_CPU) {
                     events.emplace_back();
                     CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-                } else if (src0->backend == GGML_BACKEND_GPU) {
+                } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
                     d_Q = (cl_mem) src0->extra;
                 } else {
                     GGML_ASSERT(false);
@@ -1829,7 +1829,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                 if (!mul_mat_vec) {
                     // convert src0 to fp32 on device
                     const size_t global = x_ne / global_denom;
-                    const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                    const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                     CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
                     CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
                     CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
@@ -1843,7 +1843,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *

                         // compute
                         const size_t global = ne01 * local;
-                        const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                        const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                         const cl_int ncols = ne00;
                         events.emplace_back();
                         CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1895,7 +1895,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     }
     ggml_cl_pool_free(d_Y, y_size);
     ggml_cl_pool_free(d_D, d_size);
-    if (src0->backend == GGML_BACKEND_CPU) {
+    if (src0->backend == GGML_BACKEND_TYPE_CPU) {
         ggml_cl_pool_free(d_Q, q_size);
     }
 }
@@ -1911,7 +1911,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
         dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU)) {
         return true;
     }

@@ -1993,7 +1993,7 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
     CL_CHECK(clFinish(queue));

     tensor->extra = dst;
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 }

 // ggml-backend
@@ -2045,7 +2045,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
         ctx->sub_buffers.push_back(sub_buffer);
         tensor->extra = sub_buffer;
     }
-    tensor->backend = GGML_BACKEND_GPU;
+    tensor->backend = GGML_BACKEND_TYPE_GPU;
 }

 static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
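Code outside the vendored sources that still compares against the old constant names is likely to stop compiling once it is built against 0.13.0's headers. A minimal bridging sketch, assuming the call sites cannot be updated immediately, is to alias the old names to the new ones; these macros are not part of ggml and are shown only as a stopgap.

// Hypothetical compatibility aliases; not provided by ggml itself.
// Prefer updating call sites to the GGML_BACKEND_TYPE_* names.
#define GGML_BACKEND_CPU       GGML_BACKEND_TYPE_CPU
#define GGML_BACKEND_GPU       GGML_BACKEND_TYPE_GPU
#define GGML_BACKEND_GPU_SPLIT GGML_BACKEND_TYPE_GPU_SPLIT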