llama_cpp 0.12.1 → 0.12.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "ggml-opencl.h"
+#include "ggml-backend-impl.h"

 #include <array>
 #include <atomic>
@@ -10,7 +11,7 @@
 #include <sstream>
 #include <vector>

-#define CL_TARGET_OPENCL_VERSION 110
+#define CL_TARGET_OPENCL_VERSION 120
 #include <clblast.h>

 #if defined(_MSC_VER)
@@ -929,6 +930,12 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
 }

 void ggml_cl_init(void) {
+    static bool initialized = false;
+    if (initialized) {
+        return;
+    }
+    initialized = true;
+
     cl_int err;

     struct cl_device;
@@ -1483,8 +1490,8 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
+    cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

     size_t x_offset = 0;

@@ -1501,7 +1508,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr

                 for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                     // copy src1 to device
-                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+                    if (src1->backend == GGML_BACKEND_CPU) {
+                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+                    }

                     CL_CHECK(clFinish(queue));

@@ -1522,8 +1531,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                     }

                     // copy dst to host
-                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                    if (dst->backend == GGML_BACKEND_CPU) {
+                        float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                        CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                    }
                 }
             }
         }
@@ -1532,8 +1543,12 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     if (src0->backend != GGML_BACKEND_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
-    ggml_cl_pool_free(d_Y, y_size);
-    ggml_cl_pool_free(d_D, d_size);
+    if (src1->backend != GGML_BACKEND_GPU) {
+        ggml_cl_pool_free(d_Y, y_size);
+    }
+    if (dst->backend != GGML_BACKEND_GPU) {
+        ggml_cl_pool_free(d_D, d_size);
+    }
 }

 static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
@@ -1598,6 +1613,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                     CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                 }

+                // FIXME: convert on device
+
                 for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                     // convert src1 to fp16
                     // TODO: use multiple threads
@@ -1643,11 +1660,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                     }

                     // copy dst to host, then convert to float
-                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
-
-                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-
-                    ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                    if (dst->backend == GGML_BACKEND_CPU) {
+                        CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
+                        float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                        ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                    } else {
+                        // FIXME: convert dst to fp32 on device
+                    }
                 }
             }
         }
@@ -1801,7 +1820,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 }


-bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) {
     const int64_t ne10 = src1->ne[0];

     const int64_t ne0 = dst->ne[0];
@@ -1895,3 +1914,291 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
     tensor->extra = dst;
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
+
+// ggml-backend
+
+// buffer
+
+struct ggml_backend_opencl_buffer_context {
+    ~ggml_backend_opencl_buffer_context() {
+        if (buffer) {
+            clReleaseMemObject(buffer);
+        }
+        for (auto * sub_buffer : sub_buffers) {
+            clReleaseMemObject(sub_buffer);
+        }
+    }
+
+    cl_mem buffer;
+    std::vector<cl_mem> sub_buffers;
+};
+
+static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
+
+static const char * ggml_backend_opencl_buffer_get_name(ggml_backend_buffer_t buffer) {
+    return "OpenCL";
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+    delete ctx;
+}
+
+static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return cl_ptr_base;
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+        tensor->extra = tensor->view_src->extra;
+    } else {
+        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+        cl_buffer_region region = {(size_t)((char *)tensor->data - (char *)cl_ptr_base), ggml_nbytes(tensor)};
+        cl_int err;
+        cl_mem sub_buffer = clCreateSubBuffer(ctx->buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+        ctx->sub_buffers.push_back(sub_buffer);
+        tensor->extra = sub_buffer;
+    }
+    tensor->backend = GGML_BACKEND_GPU;
+}
+
+static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    cl_mem tensor_buffer = (cl_mem) tensor->extra;
+    CL_CHECK(clEnqueueWriteBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
+    CL_CHECK(clFinish(queue));
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    cl_mem tensor_buffer = (cl_mem) tensor->extra;
+    CL_CHECK(clEnqueueReadBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
+    CL_CHECK(clFinish(queue));
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+    CL_CHECK(clEnqueueFillBuffer(queue, ctx->buffer, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL));
+    CL_CHECK(clFinish(queue));
+}
+
+static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
+    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+    for (auto * sub_buffer : ctx->sub_buffers) {
+        clReleaseMemObject(sub_buffer);
+    }
+    ctx->sub_buffers.clear();
+}
+
+static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
+    /* .get_name = */ ggml_backend_opencl_buffer_get_name,
+    /* .free_buffer = */ ggml_backend_opencl_buffer_free_buffer,
+    /* .get_base = */ ggml_backend_opencl_buffer_get_base,
+    /* .init_tensor = */ ggml_backend_opencl_buffer_init_tensor,
+    /* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor,
+    /* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor,
+    /* .cpy_tensor = */ NULL,
+    /* .clear = */ ggml_backend_opencl_buffer_clear,
+    /* .reset = */ ggml_backend_opencl_buffer_reset,
+};
+
+// buffer type
+
+static const char * ggml_backend_opencl_buffer_type_name(ggml_backend_buffer_type_t buffer_type) {
+    return "OpenCL";
+
+    GGML_UNUSED(buffer_type);
+}
+
+static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
+    ggml_cl_init();
+
+    cl_int err;
+    cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
+        return nullptr;
+    }
+
+    ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context{mem, {}};
+
+    return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
+    // FIXME: not thread safe, device may not be initialized yet
+    static cl_uint alignment = -1;
+    if (alignment == (cl_uint)-1) {
+        ggml_cl_init();
+        clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
+    }
+    return alignment;

+    GGML_UNUSED(buffer_type);
+}
+
+static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) {
+    //return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend
+    return ggml_backend_is_cpu(backend);
+
+    GGML_UNUSED(buffer_type);
+}
+
+static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
+    /* .get_name = */ ggml_backend_opencl_buffer_type_name,
+    /* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
+    /* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
+    /* .get_alloc_size = */ NULL,
+    /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
+    /* .is_host = */ NULL,
+};
+
+
+ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
+    static ggml_backend_buffer_type buffer_type = {
+        /* .iface = */ ggml_backend_opencl_buffer_type_interface,
+        /* .context = */ nullptr,
+    };
+
+    return &buffer_type;
+}
+
+#if 0
+// host buffer type
+
+static const char * ggml_backend_opencl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    return "CL_Host";
+
+    GGML_UNUSED(buft);
+}
+
+static const char * ggml_backend_opencl_host_buffer_name(ggml_backend_buffer_t buffer) {
+    return "CL_Host";
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_opencl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_cl_host_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_opencl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * ptr = ggml_cl_host_malloc(size);
+
+    if (ptr == nullptr) {
+        // fallback to cpu buffer
+        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+    }
+
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.get_name = ggml_backend_opencl_host_buffer_name;
+    buffer->iface.free_buffer = ggml_backend_opencl_host_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_opencl_buffer_type_host = {
+        /* .iface = */ {
+            /* .get_name = */ ggml_backend_opencl_host_buffer_type_name,
+            /* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
+            /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+            /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+        },
+        /* .context = */ nullptr,
+    };
+
+    return &ggml_backend_opencl_buffer_type_host;
+}
+
+// backend
+
+static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
+    return "OpenCL";
+
+    GGML_UNUSED(backend);
+}
+
+static void ggml_backend_opencl_free(ggml_backend_t backend) {
+    GGML_UNUSED(backend);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(ggml_backend_t backend) {
+    return ggml_backend_opencl_buffer_type();
+
+    GGML_UNUSED(backend);
+}
+
+static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
+    for (int i = 0; i < graph->n_nodes; ++i) {
+        ggml_tensor * node = graph->nodes[i];
+        switch (node->op) {
+            case GGML_OP_MUL_MAT:
+                ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0);
+                break;
+            case GGML_OP_MUL:
+                ggml_cl_mul(node->src[0], node->src[1], node);
+                break;
+            default:
+                GGML_ASSERT(false);
+        }
+    }
+
+    return true;
+
+    GGML_UNUSED(backend);
+}
+
+static bool ggml_backend_opencl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_MUL_MAT:
+            return ggml_cl_can_mul_mat(op->src[0], op->src[1], op);
+        case GGML_OP_MUL:
+            // return ggml_can_repeat_rows(op->src[1], op->src[0]);
+            return true;
+        default:
+            return false;
+    }
+
+    GGML_UNUSED(backend);
+}
+
+static ggml_backend_i opencl_backend_i = {
+    /* .get_name = */ ggml_backend_opencl_name,
+    /* .free = */ ggml_backend_opencl_free,
+    /* .get_default_buffer_type = */ ggml_backend_opencl_get_default_buffer_type,
+    /* .set_tensor_async = */ NULL,
+    /* .get_tensor_async = */ NULL,
+    /* .cpy_tensor_from_async = */ NULL,
+    /* .cpy_tensor_to_async = */ NULL,
+    /* .synchronize = */ NULL,
+    /* .graph_plan_create = */ NULL,
+    /* .graph_plan_free = */ NULL,
+    /* .graph_plan_compute = */ NULL,
+    /* .graph_compute = */ ggml_backend_opencl_graph_compute,
+    /* .supports_op = */ ggml_backend_opencl_supports_op,
+};
+
+ggml_backend_t ggml_backend_opencl_init() {
+    ggml_backend_t backend = new ggml_backend {
+        /* .interface = */ opencl_backend_i,
+        /* .context = */ nullptr
+    };
+
+    return backend;
+}
+
+bool ggml_backend_is_opencl(ggml_backend_t backend) {
+    return backend && backend->iface.get_name == ggml_backend_opencl_name;
+}
+#endif
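A note on the buffer implementation added above: a cl_mem handle is not a host pointer, so get_base hands ggml-alloc a fake base address (0x1000), and init_tensor later turns each tensor's data pointer back into a byte offset inside the one big cl_mem before carving out a sub-buffer for it. A minimal, self-contained sketch of that offset arithmetic (illustrative values only, not code from the package):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        // same fake base constant as cl_ptr_base in the diff
        void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;

        // pretend the allocator placed a tensor 4096 bytes into the buffer:
        // tensor->data then holds cl_ptr_base + 4096, not a real host address
        char * tensor_data = (char *) cl_ptr_base + 4096;

        // init_tensor recovers the region offset passed to clCreateSubBuffer
        size_t region_offset = (size_t) (tensor_data - (char *) cl_ptr_base);
        printf("sub-buffer region offset: %zu bytes\n", region_offset); // prints 4096

        return 0;
    }

The remaining hunks below are against the public OpenCL header.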
@@ -1,6 +1,7 @@
 #pragma once

 #include "ggml.h"
+#include "ggml-backend.h"

 #ifdef __cplusplus
 extern "C" {
@@ -9,17 +10,26 @@ extern "C" {
 GGML_API void ggml_cl_init(void);

 GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
 GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

-GGML_API void * ggml_cl_host_malloc(size_t size);
-GGML_API void ggml_cl_host_free(void * ptr);
+// GGML_API void * ggml_cl_host_malloc(size_t size);
+// GGML_API void ggml_cl_host_free(void * ptr);

 GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);

 GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);

+// backend API
+
+// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
+
+// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
+
+GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
+// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
+
 #ifdef __cplusplus
 }
 #endif
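The header now exposes ggml_backend_opencl_buffer_type(), while the full OpenCL backend object stays disabled under #if 0 in the implementation. A hedged sketch of how a caller might exercise the new buffer type with the generic ggml-backend helpers; the context setup and tensor are made up for illustration, and it assumes ggml_backend_alloc_ctx_tensors_from_buft from ggml-alloc.h is available in this vendored llama.cpp:

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"
    #include "ggml-opencl.h"

    // Illustrative only: put one fp32 tensor into an OpenCL device buffer and
    // upload its data through the buffer interface introduced in this release.
    static void opencl_buffer_example(const float * data, int64_t n) {
        struct ggml_init_params params = {
            /* .mem_size   = */ ggml_tensor_overhead(),
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ true, // tensor data will live in the backend buffer
        };
        struct ggml_context * ctx = ggml_init(params);
        struct ggml_tensor  * w   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n);

        // one OpenCL cl_mem for the whole context; each tensor becomes a sub-buffer
        ggml_backend_buffer_t buf =
            ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_opencl_buffer_type());

        // host -> device copy (dispatches to ggml_backend_opencl_buffer_set_tensor)
        ggml_backend_tensor_set(w, data, 0, n * sizeof(float));

        ggml_backend_buffer_free(buf);
        ggml_free(ctx);
    }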