llama_cpp 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +13 -50
- data/ext/llama_cpp/src/ggml-cuda.cu +23 -11
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +130 -61
- data/ext/llama_cpp/src/ggml-metal.metal +44 -26
- data/ext/llama_cpp/src/ggml.c +637 -328
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +2 -2
- data/ext/llama_cpp/src/llama.cpp +426 -97
- data/ext/llama_cpp/src/llama.h +51 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 715eab98a76ed825d66da6e4fcc84154dca8eed76f6cf6625d210a1ffb702958
+  data.tar.gz: 3ceafc312354d245e485b664d71450cd9c27bcd89f5faec91af6cdf1221c251f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7ebe959d9380c9d981156606fdd8a6bcea9b88914923e693b400cfcd605b8c216bdfdcc807c0e72a21fe5fc6d7d623118fc7246524d7f59acdb8bc0064d736bc
+  data.tar.gz: c6d428234d866c09d227b5c308a573e9721454ded3f7fdd36880706e7c47c72c67e6fed119c75d6898c6a1149cde853e5dbb59e3a390ef3d370aab4f0d6be548
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,12 @@
+## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02
+
+**Breaking Changes**
+- Bump bundled llama.cpp from master-b1060 to master-b1140.
+- Rename `token_to_str` method on Context to `token_to_piece` method.
+- Rename `token_to_str` method on Model to `token_to_piece` method.
+- Rename `type` method on Model to `desc` method.
+- Add `size` and `n_params` methods to Model.
+
 ## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26
 
 **Breaking Changes**
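To make the renames concrete, here is a minimal usage sketch against the 0.5.0 API. The model path is a placeholder and the constructor keywords are assumed from the gem's README-style examples, so treat the setup lines as an assumption rather than part of this diff; the `desc`, `size`, `n_params`, and `token_to_piece` calls are the methods introduced or renamed above.

```ruby
require 'llama_cpp'

# Assumed setup (placeholder path, constructor style taken from the gem's examples).
params  = LLaMACpp::ContextParams.new
model   = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: params)
context = LLaMACpp::Context.new(model: model)

# 0.4.0 -> 0.5.0 renames and additions covered by this release:
puts model.desc                                  # was model.type in 0.4.0
puts model.size                                  # new in 0.5.0
puts model.n_params                              # new in 0.5.0
print context.token_to_piece(context.token_bos)  # was context.token_to_str
```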
data/examples/chat.rb
CHANGED
@@ -122,7 +122,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
       if input_echo
         output = []
-        embd.each { |token| output << context.token_to_str(token) }
+        embd.each { |token| output << context.token_to_piece(token) }
         output_str = output.join
         output_str.chomp!(antiprompt) if first_input
         print(output_str)
@@ -131,7 +131,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
       if embd_input.size <= n_consumed
         if antiprompt.size.positive?
           last_output = []
-          last_n_tokens.each { |token| last_output << context.token_to_str(token) }
+          last_n_tokens.each { |token| last_output << context.token_to_piece(token) }
           last_output_str = last_output.join
 
           search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -50,7 +50,7 @@ if with_config('accelerate')
 end
 
 if with_config('metal')
-  $CFLAGS << ' -DGGML_USE_METAL
+  $CFLAGS << ' -DGGML_USE_METAL'
   $CXXFLAGS << ' -DGGML_USE_METAL'
   $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
   $objs = %w[ggml.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -811,9 +811,11 @@ public:
     rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
     rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
     rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
-    rb_define_method(rb_cLLaMAModel, "
+    rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece_with_model), 1);
     rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
-    rb_define_method(rb_cLLaMAModel, "
+    rb_define_method(rb_cLLaMAModel, "desc", RUBY_METHOD_FUNC(_llama_model_get_model_desc), 0);
+    rb_define_method(rb_cLLaMAModel, "size", RUBY_METHOD_FUNC(_llama_model_get_model_size), 0);
+    rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
   }
 
 private:
@@ -974,7 +976,7 @@ private:
     return INT2NUM(llama_model_n_embd(ptr->model));
   }
 
-  static VALUE
+  static VALUE _llama_model_token_to_piece_with_model(VALUE self, VALUE token_) {
     if (!RB_INTEGER_TYPE_P(token_)) {
       rb_raise(rb_eArgError, "token must be an integer");
      return Qnil;
@@ -982,10 +984,10 @@ private:
     const llama_token token = NUM2INT(token_);
     LLaMAModelWrapper* ptr = get_llama_model(self);
     std::vector<char> result(8, 0);
-    const int n_tokens =
+    const int n_tokens = llama_token_to_piece_with_model(ptr->model, token, result.data(), result.size());
     if (n_tokens < 0) {
       result.resize(-n_tokens);
-      const int check =
+      const int check = llama_token_to_piece_with_model(ptr->model, token, result.data(), result.size());
       if (check != -n_tokens) {
         rb_raise(rb_eRuntimeError, "failed to convert");
         return Qnil;
@@ -1040,12 +1042,22 @@ private:
     return ret;
   }
 
-  static VALUE
+  static VALUE _llama_model_get_model_desc(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     char buf[128];
-
+    llama_model_desc(ptr->model, buf, sizeof(buf));
     return rb_str_new_cstr(buf);
   }
+
+  static VALUE _llama_model_get_model_size(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return UINT2NUM(llama_model_size(ptr->model));
+  }
+
+  static VALUE _llama_model_get_model_n_params(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return UINT2NUM(llama_model_n_params(ptr->model));
+  }
 };
 
 const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -1326,7 +1338,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "token_bos", RUBY_METHOD_FUNC(_llama_context_token_bos), 0);
     rb_define_method(rb_cLLaMAContext, "token_eos", RUBY_METHOD_FUNC(_llama_context_token_eos), 0);
     rb_define_method(rb_cLLaMAContext, "token_nl", RUBY_METHOD_FUNC(_llama_context_token_nl), 0);
-    rb_define_method(rb_cLLaMAContext, "
+    rb_define_method(rb_cLLaMAContext, "token_to_piece", RUBY_METHOD_FUNC(_llama_context_token_to_piece), 1);
     rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
     rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
@@ -1567,7 +1579,7 @@ private:
     return output;
   }
 
-  static VALUE
+  static VALUE _llama_context_token_to_piece(VALUE self, VALUE token_) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
       rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
@@ -1575,10 +1587,10 @@ private:
     }
     const llama_token token = NUM2INT(token_);
     std::vector<char> result(8, 0);
-    const int n_tokens =
+    const int n_tokens = llama_token_to_piece(ptr->ctx, token, result.data(), result.size());
     if (n_tokens < 0) {
       result.resize(-n_tokens);
-      const int check =
+      const int check = llama_token_to_piece(ptr->ctx, token, result.data(), result.size());
       if (check != -n_tokens) {
         rb_raise(rb_eRuntimeError, "failed to convert");
         return Qnil;
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -107,6 +107,10 @@ static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct g
 }
 
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+#ifdef GGML_ALLOCATOR_DEBUG
+    GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
+    GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
+#endif
     size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
@@ -268,7 +272,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
         /*.parse_seq     = */ {0},
         /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
-        /*.allocated_tensors = */
+        /*.allocated_tensors = */ {0},
 #endif
     };
 
@@ -297,7 +301,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
         /*.parse_seq     = */ {0},
         /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
-        /*.allocated_tensors = */
+        /*.allocated_tensors = */ {0},
 #endif
     };
 
@@ -317,8 +321,7 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
 //////////// compute graph allocator
 
 static bool ggml_is_view(struct ggml_tensor * t) {
-    return t->
-        t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
+    return t->view_src != NULL;
 }
 
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@@ -336,28 +339,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
     return true;
 }
 
-static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
-    switch (t->op) {
-        case GGML_OP_PERMUTE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_VIEW:
-            return t->src[0];
-        case GGML_OP_CPY:
-            return t->src[1];
-        default:
-            return NULL;
-    }
-}
-
-static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
-    struct ggml_tensor * parent = t;
-    do {
-        parent = get_view_parent(parent);
-    } while (ggml_is_view(parent));
-    return parent;
-}
-
 static bool ggml_op_can_inplace(enum ggml_op op) {
     switch (op) {
         case GGML_OP_SCALE:
@@ -365,7 +346,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_ADD:
         case GGML_OP_ADD1:
-        case GGML_OP_ACC:
         case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
@@ -375,7 +355,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_UNARY:
         case GGML_OP_ROPE:
         case GGML_OP_RMS_NORM:
-        case GGML_OP_SET:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_CONT:
             return true;
@@ -389,24 +368,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-
-
-                case GGML_OP_VIEW:
-                    memcpy(&offset, node->op_params, sizeof(size_t));
-                    node->data = (char *) node->src[0]->data + offset;
-                    break;
-                case GGML_OP_PERMUTE:
-                case GGML_OP_RESHAPE:
-                case GGML_OP_TRANSPOSE:
-                    node->data = node->src[0]->data;
-                    break;
-                case GGML_OP_CPY:
-                    node->data = node->src[1]->data;
-                    break;
-                default:
-                    GGML_ASSERT(!"unknown view op");
-                    break;
-            }
+            assert(node->view_src->data != NULL);
+            node->data = (char *)node->view_src->data + node->view_offs;
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
@@ -426,7 +389,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
             struct hash_node * p_hn = hash_get(ht, parent);
             if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
                 if (ggml_is_view(parent)) {
-                    struct ggml_tensor * view_src =
+                    struct ggml_tensor * view_src = parent->view_src;
                     struct hash_node * view_src_hn = hash_get(ht, view_src);
                     if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                         // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
@@ -468,7 +431,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
         struct ggml_tensor * node = gf->nodes[i];
 
         if (ggml_is_view(node)) {
-            struct ggml_tensor * view_src =
+            struct ggml_tensor * view_src = node->view_src;
             hash_get(ht, view_src)->n_views += 1;
         }
 
@@ -553,10 +516,10 @@
 
             if (p_hn->n_children == 0 && p_hn->n_views == 0) {
                 if (ggml_is_view(parent)) {
-                    struct ggml_tensor * view_src =
+                    struct ggml_tensor * view_src = parent->view_src;
                     struct hash_node * view_src_hn = hash_get(ht, view_src);
                     view_src_hn->n_views -= 1;
-                    AT_PRINTF("view_src %s\n", view_src->name);
+                    AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                     if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
                         ggml_allocator_free_tensor(alloc, view_src);
                     }
data/ext/llama_cpp/src/ggml-cuda.cu
CHANGED
@@ -306,11 +306,11 @@ typedef struct {
 #define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
-    half
+    half dm[2]; // super-block scales/mins
     uint8_t scales[2]; // 4-bit block scales/mins
     uint8_t qs[QK_K/2]; // 4--bit quants
 } block_q4_K;
-static_assert(sizeof(block_q4_K) ==
+static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
 #else
 typedef struct {
     half2 dm; // super-block scale for quantized scales/mins
@@ -737,8 +737,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
     const int tid = threadIdx.x;
     const uint8_t * q = x[i].qs;
     float * y = yy + i*QK_K;
-    const float d = (float)x[i].
-    const float m = (float)x[i].
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
     y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
     y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
 #endif
@@ -1155,8 +1155,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     const uint16_t * a = (const uint16_t *)x[i].scales;
     aux16[0] = a[0] & 0x0f0f;
     aux16[1] = (a[0] >> 4) & 0x0f0f;
-    const float d = (float)x[i].
-    const float m = (float)x[i].
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
     float sum = 0.f;
     for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
         sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
@@ -2845,8 +2845,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     aux16[0] = a[0] & 0x0f0f;
     aux16[1] = (a[0] >> 4) & 0x0f0f;
 
-    const float dall = bq4_K->
-    const float dmin = bq4_K->
+    const float dall = bq4_K->dm[0];
+    const float dmin = bq4_K->dm[1];
 
     const float d8_1 = __low2float(bq8_1[0].ds);
     const float d8_2 = __low2float(bq8_1[1].ds);
@@ -2929,7 +2929,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
+#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+#else
+        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
+#endif
     }
 
 #pragma unroll
@@ -3119,7 +3123,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
+#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+#endif
     }
 
 #pragma unroll
@@ -4709,6 +4715,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
+#if QK_K == 256
+
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
@@ -4740,6 +4748,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
         mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
             (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
+#endif
 }
 
 static void ggml_mul_mat_q4_K_q8_1_cuda(
@@ -4899,8 +4908,8 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
 
 static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                           const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-    GGML_ASSERT(
-    const dim3 block_dims(1,
+    GGML_ASSERT(ncols % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
@@ -4908,7 +4917,8 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
 
 static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                                const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-
+    GGML_ASSERT(ncols % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
@@ -6328,9 +6338,11 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
 
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
 
     const int mode = ((int32_t *) dst->op_params)[2];
     const bool is_glm = mode & 4;
+
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
 }
 