llama_cpp 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c4058abcb7afa897554fc75bb368caeea0e77429e01fb5f3a1949191c50f4de5
4
- data.tar.gz: 9929e94c02b5d9c21379a9275f08668e835f91d3d7be3570a2da9ab4ecbe6ad1
3
+ metadata.gz: 33b146badd1bebdf9588e48c0adac1f9924a0653aa5ec806fdf5dd288ef665d8
4
+ data.tar.gz: 134606db2b9fb10b51fc82f410d6653a6481b828d9fd05390b1570d6e198526a
5
5
  SHA512:
6
- metadata.gz: ab267defd1769e7bf4599da199f50a7c5cc2355d2281ab7fd2ccd1a5ef196b716350cf8df9522a9185d02c8c3ad6a5d0f46f271fad0951440ab9b3fab4019932
7
- data.tar.gz: 16727a2ac2c68f7913749b656c26523e9eee0118b69ff06bbc0935f899eac1874f16395d9e72ed2caa853e9c61fb9f614ad5913fca623e356aa249308b2f3dda
6
+ metadata.gz: 462d9e00121408c7af3934b0a663b29f99d5ad28f60a3471155509463bf26a14792c484d1fdc6054460941ae011d39b510774e225ad4ec03d60ce20a1dfef667
7
+ data.tar.gz: 4bf447ac55bba2b62d204dc975528de6664fe53af89df8ba4aa4172d4dbff709ac5b14a944326be5c71d64baa2cde00b60f7ba5e916e1fb68123c595f74ce24f
data/CHANGELOG.md CHANGED
@@ -1,5 +1,15 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [[0.1.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.0...v0.1.1)] - 2023-05-21
4
+
5
+ - Add load_session_file method to Context
6
+ - Add save_session_file method to Context
7
+
8
+ **Breaking Changes**
9
+
10
+ - Bump bundled llama.cpp from master-173d0e6 to master-6986c78
11
+ - bump LLAMA_FILE_VERSION to 2
12
+
3
13
  ## [[0.1.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.7...v0.1.0)] - 2023-05-20
4
14
 
5
15
  **Breaking Changes**
@@ -292,8 +292,6 @@ public:
292
292
  // rb_define_method(rb_cLLaMAContextParams, "initialize", RUBY_METHOD_FUNC(_llama_context_params_init), 0);
293
293
  rb_define_method(rb_cLLaMAContextParams, "n_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ctx), 1);
294
294
  rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
295
- rb_define_method(rb_cLLaMAContextParams, "n_parts=", RUBY_METHOD_FUNC(_llama_context_params_set_n_parts), 1);
296
- rb_define_method(rb_cLLaMAContextParams, "n_parts", RUBY_METHOD_FUNC(_llama_context_params_get_n_parts), 0);
297
295
  rb_define_method(rb_cLLaMAContextParams, "seed=", RUBY_METHOD_FUNC(_llama_context_params_set_seed), 1);
298
296
  rb_define_method(rb_cLLaMAContextParams, "seed", RUBY_METHOD_FUNC(_llama_context_params_get_seed), 0);
299
297
  rb_define_method(rb_cLLaMAContextParams, "f16_kv=", RUBY_METHOD_FUNC(_llama_context_params_set_f16_kv), 1);
@@ -331,18 +329,6 @@ private:
331
329
  return INT2NUM(ptr->params.n_ctx);
332
330
  };
333
331
 
334
- // n_parts
335
- static VALUE _llama_context_params_set_n_parts(VALUE self, VALUE n_parts) {
336
- LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
337
- ptr->params.n_parts = NUM2INT(n_parts);
338
- return INT2NUM(ptr->params.n_parts);
339
- };
340
-
341
- static VALUE _llama_context_params_get_n_parts(VALUE self) {
342
- LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
343
- return INT2NUM(ptr->params.n_parts);
344
- };
345
-
346
332
  // seed
347
333
  static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
348
334
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -494,6 +480,8 @@ public:
494
480
  rb_define_method(rb_cLLaMAContext, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_context_apply_lora_from_file), -1);
495
481
  rb_define_method(rb_cLLaMAContext, "kv_cache_token_count", RUBY_METHOD_FUNC(_llama_context_kv_cache_token_count), 0);
496
482
  rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
483
+ rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
484
+ rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
497
485
  rb_define_method(rb_cLLaMAContext, "sample_repetition_penalty", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalty), -1);
498
486
  rb_define_method(rb_cLLaMAContext, "sample_frequency_and_presence_penalties", RUBY_METHOD_FUNC(_llama_context_sample_frequency_and_presence_penalties), -1);
499
487
  rb_define_method(rb_cLLaMAContext, "sample_softmax", RUBY_METHOD_FUNC(_llama_context_sample_softmax), 1);
@@ -870,6 +858,97 @@ private:
870
858
  return Qnil;
871
859
  };
872
860
 
861
+ static VALUE _llama_context_load_session_file(int argc, VALUE* argv, VALUE self) {
862
+ VALUE kw_args = Qnil;
863
+ ID kw_table[1] = { rb_intern("session_path") };
864
+ VALUE kw_values[1] = { Qundef };
865
+ VALUE candidates = Qnil;
866
+ VALUE last_n_tokens = Qnil;
867
+ rb_scan_args(argc, argv, ":", &kw_args);
868
+ rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
869
+
870
+ if (!RB_TYPE_P(kw_values[0], T_STRING)) {
871
+ rb_raise(rb_eArgError, "session_path must be a String");
872
+ return Qnil;
873
+ }
874
+
875
+ VALUE filename = kw_values[0];
876
+
877
+ LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
878
+ if (ctx_ptr->ctx == NULL) {
879
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
880
+ return Qnil;
881
+ }
882
+
883
+ LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(rb_iv_get(self, "@params"));
884
+ const int n_ctx = prms_ptr->params.n_ctx;
885
+
886
+ std::vector<llama_token> session_tokens(n_ctx);
887
+ size_t n_token_count_out = 0;
888
+
889
+ try {
890
+ bool res = llama_load_session_file(ctx_ptr->ctx, StringValueCStr(filename), session_tokens.data(), session_tokens.capacity(), &n_token_count_out);
891
+ if (!res) {
892
+ rb_raise(rb_eRuntimeError, "Failed to load session file");
893
+ return Qnil;
894
+ }
895
+ session_tokens.resize(n_token_count_out);
896
+ } catch (const std::runtime_error& e) {
897
+ rb_raise(rb_eRuntimeError, "%s", e.what());
898
+ return Qnil;
899
+ }
900
+
901
+ VALUE ary_session_tokens = rb_ary_new2(n_token_count_out);
902
+ for (size_t i = 0; i < n_token_count_out; i++) {
903
+ rb_ary_store(ary_session_tokens, i, INT2NUM(session_tokens[i]));
904
+ }
905
+
906
+ RB_GC_GUARD(filename);
907
+ return ary_session_tokens;
908
+ }
909
+
910
+ static VALUE _llama_context_save_session_file(int argc, VALUE* argv, VALUE self) {
911
+ VALUE kw_args = Qnil;
912
+ ID kw_table[2] = { rb_intern("session_path"), rb_intern("session_tokens") };
913
+ VALUE kw_values[2] = { Qundef, Qundef };
914
+ VALUE candidates = Qnil;
915
+ VALUE last_n_tokens = Qnil;
916
+ rb_scan_args(argc, argv, ":", &kw_args);
917
+ rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
918
+
919
+ if (!RB_TYPE_P(kw_values[0], T_STRING)) {
920
+ rb_raise(rb_eArgError, "session_path must be a String");
921
+ return Qnil;
922
+ }
923
+ if (!RB_TYPE_P(kw_values[1], T_ARRAY)) {
924
+ rb_raise(rb_eArgError, "session_tokens must be an Array");
925
+ return Qnil;
926
+ }
927
+
928
+ VALUE filename = kw_values[0];
929
+ const size_t sz_session_tokens = RARRAY_LEN(kw_values[1]);
930
+ std::vector<llama_token> session_tokens(sz_session_tokens);
931
+ for (size_t i = 0; i < sz_session_tokens; i++) {
932
+ session_tokens[i] = NUM2INT(rb_ary_entry(kw_values[1], i));
933
+ }
934
+
935
+ LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
936
+ if (ctx_ptr->ctx == NULL) {
937
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
938
+ return Qnil;
939
+ }
940
+
941
+ bool res = llama_save_session_file(ctx_ptr->ctx, StringValueCStr(filename), session_tokens.data(), sz_session_tokens);
942
+
943
+ if (!res) {
944
+ rb_raise(rb_eRuntimeError, "Failed to save session file");
945
+ return Qnil;
946
+ }
947
+
948
+ RB_GC_GUARD(filename);
949
+ return Qnil;
950
+ }
951
+
873
952
  static VALUE _llama_context_sample_repetition_penalty(int argc, VALUE* argv, VALUE self) {
874
953
  VALUE kw_args = Qnil;
875
954
  ID kw_table[1] = { rb_intern("penalty") };
@@ -1411,7 +1490,6 @@ extern "C" void Init_llama_cpp(void) {
1411
1490
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
1412
1491
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1));
1413
1492
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16));
1414
- rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_2", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_2));
1415
1493
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q8_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0));
1416
1494
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0));
1417
1495
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1));
@@ -14,6 +14,8 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
14
14
  void * ggml_cuda_host_malloc(size_t size);
15
15
  void ggml_cuda_host_free(void * ptr);
16
16
 
17
+ void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
18
+
17
19
  #ifdef __cplusplus
18
20
  }
19
21
  #endif
@@ -12,129 +12,129 @@
12
12
  #define MULTILINE_QUOTE(...) #__VA_ARGS__
13
13
  const char * clblast_dequant = MULTILINE_QUOTE(
14
14
 
15
+ typedef uchar uint8_t;
16
+ typedef int int32_t;
17
+ typedef uint uint32_t;
18
+
19
+ constant uint QK4_0 = 32;
15
20
  struct block_q4_0
16
21
  {
17
22
  float d;
18
- uchar qs[16];
23
+ uint8_t qs[QK4_0 / 2];
19
24
  };
20
25
 
21
- __kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
22
- const uint i = get_global_id(0) / 32;
23
- const uint l = get_local_id(0);
24
-
25
- const float d = blocks[i].d;
26
+ constant uint QK4_1 = 32;
27
+ struct block_q4_1
28
+ {
29
+ float d;
30
+ float m;
31
+ uint8_t qs[QK4_1 / 2];
32
+ };
26
33
 
27
- const uchar vi = blocks[i].qs[l];
34
+ constant uint QK5_0 = 32;
35
+ struct __attribute__ ((packed)) block_q5_0
36
+ {
37
+ half d;
38
+ uint32_t qh;
39
+ uint8_t qs[QK5_0 / 2];
40
+ };
28
41
 
29
- const uint index = i*32 + l*2;
30
- result[index + 0] = ((vi & 0xf) - 8)*d;
31
- result[index + 1] = ((vi >> 4) - 8)*d;
32
- }
42
+ constant uint QK5_1 = 32;
43
+ struct block_q5_1
44
+ {
45
+ half d;
46
+ half m;
47
+ uint32_t qh;
48
+ uint8_t qs[QK5_1 / 2];
49
+ };
33
50
 
34
- struct block_q4_1
51
+ constant uint QK8_0 = 32;
52
+ struct block_q8_0
35
53
  {
36
54
  float d;
37
- float m;
38
- uchar qs[16];
55
+ uint8_t qs[QK8_0];
39
56
  };
40
57
 
41
- __kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
42
- const uint i = get_global_id(0) / 32;
43
- const uint l = get_local_id(0);
44
58
 
45
- const float d = blocks[i].d;
46
- const float m = blocks[i].m;
59
+ __kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
60
+ constant uint qk = QK4_0;
61
+
62
+ const uint i = get_global_id(0) / qk;
63
+ const uint j = get_local_id(0);
64
+
65
+ const float d = x[i].d;
47
66
 
48
- const uchar vi = blocks[i].qs[l];
67
+ const int x0 = (x[i].qs[j] & 0xf) - 8;
68
+ const int x1 = (x[i].qs[j] >> 4) - 8;
49
69
 
50
- const uint index = i*32 + l*2;
51
- result[index + 0] = (vi & 0xf) * d + m;
52
- result[index + 1] = (vi >> 4) * d + m;
70
+ y[i*qk + j + 0 ] = x0*d;
71
+ y[i*qk + j + qk/2] = x1*d;
53
72
  }
54
73
 
55
- struct block_q4_2
56
- {
57
- ushort d;
58
- uchar qs[8];
59
- };
74
+ __kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
75
+ constant uint qk = QK4_1;
60
76
 
61
- __kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
62
- const uint i = get_global_id(0) / 16;
63
- const uint l = get_local_id(0);
77
+ const uint i = get_global_id(0) / qk;
78
+ const uint j = get_local_id(0);
64
79
 
65
- const float d = vload_half(0, (__global half*) &blocks[i].d);
80
+ const float d = x[i].d;
81
+ const float m = x[i].m;
66
82
 
67
- const uchar vi = blocks[i].qs[l];
83
+ const int x0 = (x[i].qs[j] & 0xf);
84
+ const int x1 = (x[i].qs[j] >> 4);
68
85
 
69
- const uint index = i*16 + l*2;
70
- result[index + 0] = ((vi & 0xf) - 8)*d;
71
- result[index + 1] = ((vi >> 4) - 8)*d;
86
+ y[i*qk + j + 0 ] = x0*d + m;
87
+ y[i*qk + j + qk/2] = x1*d + m;
72
88
  }
73
89
 
90
+ __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
91
+ constant uint qk = QK5_0;
74
92
 
75
- struct block_q5_0
76
- {
77
- float d;
78
- uint qh;
79
- uchar qs[16];
80
- };
93
+ const uint i = get_global_id(0) / qk;
94
+ const uint j = get_local_id(0);
81
95
 
82
- __kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) {
83
- const uint i = get_global_id(0) / 32;
84
- const uint l = get_local_id(0);
96
+ const float d = vload_half(0, (__global half*) &x[i].d);
85
97
 
86
- const float d = blocks[i].d;
98
+ uint32_t qh = x[i].qh;
87
99
 
88
- const uchar vi = blocks[i].qs[l];
100
+ const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
101
+ const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
89
102
 
90
- const uint l2 = l * 2;
103
+ const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
104
+ const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
91
105
 
92
- const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
93
- const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
94
-
95
- const uint index = i*32 + l2;
96
- result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
97
- result[index + 1] = (((vi >> 4) | vh1) - 16)*d;
106
+ y[i*qk + j + 0 ] = x0*d;
107
+ y[i*qk + j + qk/2] = x1*d;
98
108
  }
99
109
 
100
- struct block_q5_1
101
- {
102
- ushort d;
103
- ushort m;
104
- uint qh;
105
- uchar qs[16];
106
- };
110
+ __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
111
+ constant uint qk = QK5_1;
107
112
 
108
- __kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) {
109
- const uint i = get_global_id(0) / 32;
110
- const uint l = get_local_id(0);
113
+ const uint i = get_global_id(0) / qk;
114
+ const uint j = get_local_id(0);
111
115
 
112
- const float d = vload_half(0, (__global half*) &blocks[i].d);
113
- const float m = vload_half(0, (__global half*) &blocks[i].m);
116
+ const float d = vload_half(0, (__global half*) &x[i].d);
117
+ const float m = vload_half(0, (__global half*) &x[i].m);
114
118
 
115
- const uchar vi = blocks[i].qs[l];
119
+ uint32_t qh = x[i].qh;
116
120
 
117
- const uint l2 = l * 2;
121
+ const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
122
+ const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
118
123
 
119
- const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
120
- const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
124
+ const int x0 = (x[i].qs[j] & 0xf) | xh_0;
125
+ const int x1 = (x[i].qs[j] >> 4) | xh_1;
121
126
 
122
- const uint index = i*32 + l2;
123
- result[index + 0] = ((vi & 0xf) | vh0)*d + m;
124
- result[index + 1] = ((vi >> 4) | vh1)*d + m;
127
+ y[i*qk + j + 0 ] = x0*d + m;
128
+ y[i*qk + j + qk/2] = x1*d + m;
125
129
  }
126
130
 
127
- struct block_q8_0
128
- {
129
- float d;
130
- char qs[32];
131
- };
132
-
133
- __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
134
- const uint i = get_global_id(0) / 32;
135
- const uint l = get_local_id(0);
131
+ __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
132
+ constant uint qk = QK8_0;
133
+ const uint i = get_global_id(0) / qk;
134
+ const uint j = get_local_id(0);
136
135
 
137
- result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
136
+ const float d = x[i].d;
137
+ y[i*qk + j] = x[i].qs[j]*d;
138
138
  }
139
139
 
140
140
  );
@@ -148,26 +148,12 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f
148
148
  } \
149
149
  } while (0)
150
150
 
151
- #define QK5_0 32
152
- typedef struct {
153
- ggml_fp16_t d; // delta
154
- uint8_t qh[4]; // 5-th bit of quants
155
- uint8_t qs[QK5_0 / 2]; // nibbles / quants
156
- } block_q5_0;
157
-
158
-
159
- typedef struct {
160
- float d; // delta
161
- uint32_t qh; // 5-th bit of quants
162
- uint8_t qs[QK5_0 / 2]; // nibbles / quants
163
- } cl_block_q5_0;
164
-
165
151
  static cl_platform_id platform;
166
152
  static cl_device_id device;
167
153
  static cl_context context;
168
154
  static cl_command_queue queue;
169
155
  static cl_program program;
170
- static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0;
156
+ static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
171
157
  static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
172
158
  static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
173
159
 
@@ -238,8 +224,6 @@ void ggml_cl_init(void) {
238
224
  CL_CHECK(err, "clCreateKernel");
239
225
  kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
240
226
  CL_CHECK(err, "clCreateKernel");
241
- kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
242
- CL_CHECK(err, "clCreateKernel");
243
227
  kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
244
228
  CL_CHECK(err, "clCreateKernel");
245
229
  kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
@@ -274,7 +258,6 @@ void ggml_cl_sgemm_wrapper(
274
258
  cl_kernel kernel;
275
259
  size_t global = n * k, local, size_qb;
276
260
  bool dequant;
277
- cl_block_q5_0* cl_host_b;
278
261
 
279
262
  switch (btype) {
280
263
  case GGML_TYPE_F32:
@@ -292,28 +275,11 @@ void ggml_cl_sgemm_wrapper(
292
275
  local = 16;
293
276
  size_qb = global * (sizeof(float) * 2 + local) / 32;
294
277
  break;
295
- case GGML_TYPE_Q4_2:
296
- dequant = true;
297
- kernel = kernel_q4_2;
298
- local = 8;
299
- size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
300
- break;
301
278
  case GGML_TYPE_Q5_0:
302
279
  dequant = true;
303
280
  kernel = kernel_q5_0;
304
281
  local = 16;
305
- // For some reason OpenCL seems to be incapable of working with structs of size 22.
306
- // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
307
- // TODO Find the reason, fix and remove workaround.
308
- const block_q5_0* b = (const block_q5_0*) host_b;
309
- cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
310
- for (size_t i = 0; i < global / 32; i++) {
311
- cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
312
- memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
313
- memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
314
- }
315
- host_b = (const float*) cl_host_b;
316
- size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
282
+ size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
317
283
  break;
318
284
  case GGML_TYPE_Q5_1:
319
285
  dequant = true;
@@ -392,7 +358,4 @@ void ggml_cl_sgemm_wrapper(
392
358
  clWaitForEvents(1, &ev_c);
393
359
  clReleaseEvent(ev_sgemm);
394
360
  clReleaseEvent(ev_c);
395
- if (btype == GGML_TYPE_Q5_0) {
396
- free((void*) cl_host_b);
397
- }
398
361
  }