llama_cpp 0.12.1 → 0.12.2

ggml-opencl.cpp:

@@ -1,5 +1,6 @@
  #include "ggml.h"
  #include "ggml-opencl.h"
+ #include "ggml-backend-impl.h"

  #include <array>
  #include <atomic>

@@ -10,7 +11,7 @@
  #include <sstream>
  #include <vector>

- #define CL_TARGET_OPENCL_VERSION 110
+ #define CL_TARGET_OPENCL_VERSION 120
  #include <clblast.h>

  #if defined(_MSC_VER)
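
Note: the target bumps from OpenCL 1.1 to 1.2. The likely reason is the new buffer interface further down, which calls clEnqueueFillBuffer; that API was introduced in OpenCL 1.2, while everything else used here (clCreateSubBuffer, clEnqueueReadBuffer/clEnqueueWriteBuffer) was already available in 1.1.
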

@@ -929,6 +930,12 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
  }

  void ggml_cl_init(void) {
+     static bool initialized = false;
+     if (initialized) {
+         return;
+     }
+     initialized = true;
+
      cl_int err;

      struct cl_device;
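
Note: the guard makes ggml_cl_init() idempotent, which matters because it is now also reached from the buffer-type hooks added below (alloc_buffer and get_alignment), not only from the existing CLBlast mul_mat path. As the FIXME in ggml_backend_opencl_buffer_type_get_alignment points out, a plain static flag is still not thread safe; a minimal sketch of an equivalent guard using std::call_once, shown only for illustration and not part of this change:

    #include <mutex>

    void ggml_cl_init(void) {
        static std::once_flag init_flag;
        std::call_once(init_flag, []() {
            // one-time platform/device/context/queue setup goes here,
            // i.e. the existing body of ggml_cl_init()
        });
    }
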

@@ -1483,8 +1490,8 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
      } else {
          d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
      }
-     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
-     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
+     cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+     cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

      size_t x_offset = 0;


@@ -1501,7 +1508,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr

                  for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                      // copy src1 to device
-                     CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+                     if (src1->backend == GGML_BACKEND_CPU) {
+                         CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+                     }

                      CL_CHECK(clFinish(queue));


@@ -1522,8 +1531,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                      }

                      // copy dst to host
-                     float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                     CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                     if (dst->backend == GGML_BACKEND_CPU) {
+                         float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                         CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                     }
                  }
              }
          }

@@ -1532,8 +1543,12 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
      if (src0->backend != GGML_BACKEND_GPU) {
          ggml_cl_pool_free(d_X, x_size);
      }
-     ggml_cl_pool_free(d_Y, y_size);
-     ggml_cl_pool_free(d_D, d_size);
+     if (src1->backend != GGML_BACKEND_GPU) {
+         ggml_cl_pool_free(d_Y, y_size);
+     }
+     if (dst->backend != GGML_BACKEND_GPU) {
+         ggml_cl_pool_free(d_D, d_size);
+     }
  }

  static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
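
Note: these hunks let src1 and dst stay resident on the device. When a tensor's backend is GGML_BACKEND_GPU, its extra field already holds a cl_mem (set up by ggml_backend_opencl_buffer_init_tensor below), so the host-to-device upload, the device-to-host readback, and the pooled staging buffer are all skipped for that tensor. A sketch of the convention the hunks repeat (the helper name is illustrative, it does not exist in the source):

    // Pick the device buffer for a tensor: reuse the resident cl_mem if the tensor
    // lives on the GPU, otherwise stage it through a pooled scratch buffer.
    static cl_mem ggml_cl_device_mem_for(const ggml_tensor * t, size_t bytes, size_t * pool_size) {
        return t->backend == GGML_BACKEND_GPU
            ? (cl_mem) t->extra
            : ggml_cl_pool_malloc(bytes, pool_size);
    }
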

@@ -1598,6 +1613,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                      CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                  }

+                 // FIXME: convert on device
+
                  for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                      // convert src1 to fp16
                      // TODO: use multiple threads

@@ -1643,11 +1660,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                      }

                      // copy dst to host, then convert to float
-                     CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
-
-                     float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-
-                     ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                     if (dst->backend == GGML_BACKEND_CPU) {
+                         CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
+                         float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                         ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                     } else {
+                         // FIXME: convert dst to fp32 on device
+                     }
                  }
              }
          }

@@ -1801,7 +1820,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  }


- bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) {
      const int64_t ne10 = src1->ne[0];

      const int64_t ne0 = dst->ne[0];

@@ -1895,3 +1914,291 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
      tensor->extra = dst;
      GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
  }
+
+ // ggml-backend
+
+ // buffer
+
+ struct ggml_backend_opencl_buffer_context {
+     ~ggml_backend_opencl_buffer_context() {
+         if (buffer) {
+             clReleaseMemObject(buffer);
+         }
+         for (auto * sub_buffer : sub_buffers) {
+             clReleaseMemObject(sub_buffer);
+         }
+     }
+
+     cl_mem buffer;
+     std::vector<cl_mem> sub_buffers;
+ };
+
+ static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
+
+ static const char * ggml_backend_opencl_buffer_get_name(ggml_backend_buffer_t buffer) {
+     return "OpenCL";
+
+     GGML_UNUSED(buffer);
+ }
+
+ static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+     ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+     delete ctx;
+ }
+
+ static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
+     return cl_ptr_base;
+
+     GGML_UNUSED(buffer);
+ }
+
+ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+     if (tensor->view_src != NULL && tensor->view_offs == 0) {
+         tensor->extra = tensor->view_src->extra;
+     } else {
+         ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+         cl_buffer_region region = {(size_t)((char *)tensor->data - (char *)cl_ptr_base), ggml_nbytes(tensor)};
+         cl_int err;
+         cl_mem sub_buffer = clCreateSubBuffer(ctx->buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+         CL_CHECK(err);
+         ctx->sub_buffers.push_back(sub_buffer);
+         tensor->extra = sub_buffer;
+     }
+     tensor->backend = GGML_BACKEND_GPU;
+ }
+
+ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+     cl_mem tensor_buffer = (cl_mem) tensor->extra;
+     CL_CHECK(clEnqueueWriteBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
+     CL_CHECK(clFinish(queue));
+
+     GGML_UNUSED(buffer);
+ }
+
+ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+     cl_mem tensor_buffer = (cl_mem) tensor->extra;
+     CL_CHECK(clEnqueueReadBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
+     CL_CHECK(clFinish(queue));
+
+     GGML_UNUSED(buffer);
+ }
+
+ static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+     ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+     CL_CHECK(clEnqueueFillBuffer(queue, ctx->buffer, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL));
+     CL_CHECK(clFinish(queue));
+ }
+
+ static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
+     ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+     for (auto * sub_buffer : ctx->sub_buffers) {
+         clReleaseMemObject(sub_buffer);
+     }
+     ctx->sub_buffers.clear();
+ }
+
+ static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
+     /* .get_name = */ ggml_backend_opencl_buffer_get_name,
+     /* .free_buffer = */ ggml_backend_opencl_buffer_free_buffer,
+     /* .get_base = */ ggml_backend_opencl_buffer_get_base,
+     /* .init_tensor = */ ggml_backend_opencl_buffer_init_tensor,
+     /* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor,
+     /* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor,
+     /* .cpy_tensor = */ NULL,
+     /* .clear = */ ggml_backend_opencl_buffer_clear,
+     /* .reset = */ ggml_backend_opencl_buffer_reset,
+ };
+
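
Note: the buffer reports a fake base address (cl_ptr_base, 0x1000) instead of a host pointer, so the values ggml-alloc writes into tensor->data are really byte offsets into the underlying cl_mem. init_tensor converts that offset back into a clCreateSubBuffer region and stores the resulting sub-buffer in tensor->extra, which is exactly what the mul_mat paths above consume. Spelled out as a small sketch (illustrative helper, not in the source):

    // Recover the sub-buffer region for a tensor placed inside this buffer.
    static cl_buffer_region ggml_cl_tensor_region(const ggml_tensor * tensor) {
        // tensor->data is cl_ptr_base + offset, not a dereferenceable pointer
        size_t offset = (size_t) ((char *) tensor->data - (char *) cl_ptr_base);
        return { offset, ggml_nbytes(tensor) };
    }

The offset has to satisfy the device's base-address alignment, which is presumably why get_alignment below reports CL_DEVICE_MEM_BASE_ADDR_ALIGN.
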
+ // buffer type
+
+ static const char * ggml_backend_opencl_buffer_type_name(ggml_backend_buffer_type_t buffer_type) {
+     return "OpenCL";
+
+     GGML_UNUSED(buffer_type);
+ }
+
+ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
+     ggml_cl_init();
+
+     cl_int err;
+     cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
+     if (err != CL_SUCCESS) {
+         fprintf(stderr, "%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
+         return nullptr;
+     }
+
+     ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context{mem, {}};
+
+     return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size);
+ }
+
+ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
+     // FIXME: not thread safe, device may not be initialized yet
+     static cl_uint alignment = -1;
+     if (alignment == (cl_uint)-1) {
+         ggml_cl_init();
+         clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
+     }
+     return alignment;
+
+     GGML_UNUSED(buffer_type);
+ }
+
+ static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) {
+     //return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend
+     return ggml_backend_is_cpu(backend);
+
+     GGML_UNUSED(buffer_type);
+ }
+
+ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
+     /* .get_name = */ ggml_backend_opencl_buffer_type_name,
+     /* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
+     /* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
+     /* .get_alloc_size = */ NULL,
+     /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
+     /* .is_host = */ NULL,
+ };
+
+
+ ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
+     static ggml_backend_buffer_type buffer_type = {
+         /* .iface = */ ggml_backend_opencl_buffer_type_interface,
+         /* .context = */ nullptr,
+     };
+
+     return &buffer_type;
+ }
+
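
Note: supports_backend only accepts the CPU backend, so the intended use of this buffer type is to place weights in device memory while the CPU backend drives the graph and offloads eligible ops. A sketch of allocating a context's tensors with it; ggml_backend_alloc_ctx_tensors_from_buft is assumed to be available in the bundled ggml-backend at this revision:

    #include "ggml.h"
    #include "ggml-backend.h"
    #include "ggml-opencl.h"

    // Allocate every tensor in ctx inside one OpenCL buffer. init_tensor then carves a
    // clCreateSubBuffer per tensor, stores it in tensor->extra and marks it GGML_BACKEND_GPU.
    static ggml_backend_buffer_t alloc_tensors_on_opencl(struct ggml_context * ctx) {
        return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_opencl_buffer_type());
    }
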
+ #if 0
+ // host buffer type
+
+ static const char * ggml_backend_opencl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+     return "CL_Host";
+
+     GGML_UNUSED(buft);
+ }
+
+ static const char * ggml_backend_opencl_host_buffer_name(ggml_backend_buffer_t buffer) {
+     return "CL_Host";
+
+     GGML_UNUSED(buffer);
+ }
+
+ static void ggml_backend_opencl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+     ggml_cl_host_free(buffer->context);
+ }
+
+ static ggml_backend_buffer_t ggml_backend_opencl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+     void * ptr = ggml_cl_host_malloc(size);
+
+     if (ptr == nullptr) {
+         // fallback to cpu buffer
+         return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+     }
+
+     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+     buffer->buft = buft;
+     buffer->iface.get_name = ggml_backend_opencl_host_buffer_name;
+     buffer->iface.free_buffer = ggml_backend_opencl_host_buffer_free_buffer;
+
+     return buffer;
+ }
+
+ ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
+     static struct ggml_backend_buffer_type ggml_backend_opencl_buffer_type_host = {
+         /* .iface = */ {
+             /* .get_name = */ ggml_backend_opencl_host_buffer_type_name,
+             /* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
+             /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+             /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+             /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+             /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+         },
+         /* .context = */ nullptr,
+     };
+
+     return &ggml_backend_opencl_buffer_type_host;
+ }
+
+ // backend
+
+ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
+     return "OpenCL";
+
+     GGML_UNUSED(backend);
+ }
+
+ static void ggml_backend_opencl_free(ggml_backend_t backend) {
+     GGML_UNUSED(backend);
+ }
+
+ static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(ggml_backend_t backend) {
+     return ggml_backend_opencl_buffer_type();
+
+     GGML_UNUSED(backend);
+ }
+
+ static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
+     for (int i = 0; i < graph->n_nodes; ++i) {
+         ggml_tensor * node = graph->nodes[i];
+         switch (node->op) {
+             case GGML_OP_MUL_MAT:
+                 ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0);
+                 break;
+             case GGML_OP_MUL:
+                 ggml_cl_mul(node->src[0], node->src[1], node);
+                 break;
+             default:
+                 GGML_ASSERT(false);
+         }
+     }
+
+     return true;
+
+     GGML_UNUSED(backend);
+ }
+
+ static bool ggml_backend_opencl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+     switch (op->op) {
+         case GGML_OP_MUL_MAT:
+             return ggml_cl_can_mul_mat(op->src[0], op->src[1], op);
+         case GGML_OP_MUL:
+             // return ggml_can_repeat_rows(op->src[1], op->src[0]);
+             return true;
+         default:
+             return false;
+     }
+
+     GGML_UNUSED(backend);
+ }
+
+ static ggml_backend_i opencl_backend_i = {
+     /* .get_name = */ ggml_backend_opencl_name,
+     /* .free = */ ggml_backend_opencl_free,
+     /* .get_default_buffer_type = */ ggml_backend_opencl_get_default_buffer_type,
+     /* .set_tensor_async = */ NULL,
+     /* .get_tensor_async = */ NULL,
+     /* .cpy_tensor_from_async = */ NULL,
+     /* .cpy_tensor_to_async = */ NULL,
+     /* .synchronize = */ NULL,
+     /* .graph_plan_create = */ NULL,
+     /* .graph_plan_free = */ NULL,
+     /* .graph_plan_compute = */ NULL,
+     /* .graph_compute = */ ggml_backend_opencl_graph_compute,
+     /* .supports_op = */ ggml_backend_opencl_supports_op,
+ };
+
+ ggml_backend_t ggml_backend_opencl_init() {
+     ggml_backend_t backend = new ggml_backend {
+         /* .interface = */ opencl_backend_i,
+         /* .context = */ nullptr
+     };
+
+     return backend;
+ }
+
+ bool ggml_backend_is_opencl(ggml_backend_t backend) {
+     return backend && backend->iface.get_name == ggml_backend_opencl_name;
+ }
+ #endif
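
Note: the complete backend implementation (host buffer type, graph_compute, ggml_backend_opencl_init) is compiled out with #if 0; as the comment in supports_backend says, OpenCL is used through the CPU backend for now. In that setup the CPU side checks ggml_cl_can_mul_mat per node and hands matching matmuls to ggml_cl_mul_mat. A simplified sketch of that dispatch, an approximation rather than the literal code in ggml.c:

    // Offload a mul_mat to OpenCL when possible, otherwise fall back to the CPU kernel.
    static bool maybe_offload_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1,
                                      ggml_tensor * dst, void * wdata, size_t wsize) {
        if (ggml_cl_can_mul_mat(src0, src1, dst)) {
            ggml_cl_mul_mat(src0, src1, dst, wdata, wsize);
            return true;   // handled on the device
        }
        return false;      // compute on the CPU
    }
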

ggml-opencl.h:

@@ -1,6 +1,7 @@
  #pragma once

  #include "ggml.h"
+ #include "ggml-backend.h"

  #ifdef __cplusplus
  extern "C" {

@@ -9,17 +10,26 @@ extern "C" {
  GGML_API void ggml_cl_init(void);

  GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
- GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+ GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
  GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
  GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

- GGML_API void * ggml_cl_host_malloc(size_t size);
- GGML_API void ggml_cl_host_free(void * ptr);
+ // GGML_API void * ggml_cl_host_malloc(size_t size);
+ // GGML_API void ggml_cl_host_free(void * ptr);

  GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);

  GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);

+ // backend API
+
+ // GGML_API ggml_backend_t ggml_backend_opencl_init(void);
+
+ // GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
+
+ GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
+ // GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
+
  #ifdef __cplusplus
  }
  #endif