llama_cpp 0.4.0 → 0.5.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +13 -50
- data/ext/llama_cpp/src/ggml-cuda.cu +23 -11
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +130 -61
- data/ext/llama_cpp/src/ggml-metal.metal +44 -26
- data/ext/llama_cpp/src/ggml.c +637 -328
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +2 -2
- data/ext/llama_cpp/src/llama.cpp +426 -97
- data/ext/llama_cpp/src/llama.h +51 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 715eab98a76ed825d66da6e4fcc84154dca8eed76f6cf6625d210a1ffb702958
+  data.tar.gz: 3ceafc312354d245e485b664d71450cd9c27bcd89f5faec91af6cdf1221c251f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7ebe959d9380c9d981156606fdd8a6bcea9b88914923e693b400cfcd605b8c216bdfdcc807c0e72a21fe5fc6d7d623118fc7246524d7f59acdb8bc0064d736bc
+  data.tar.gz: c6d428234d866c09d227b5c308a573e9721454ded3f7fdd36880706e7c47c72c67e6fed119c75d6898c6a1149cde853e5dbb59e3a390ef3d370aab4f0d6be548
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,12 @@
+## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02
+
+**Breaking Changes**
+- Bump bundled llama.cpp from master-b1060 to master-b1140.
+- Rename `token_to_str` method on Context to `token_to_piece` method.
+- Rename `token_to_str` method on Model to `token_to_piece` method.
+- Rename `type` method on Model to `desc` method.
+- Add `size` and `n_params` methods to Model.
+
 ## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26
 
 **Breaking Changes**
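For gem users, the renames above are mechanical: `token_to_str` becomes `token_to_piece` on both Context and Model, and Model's `type` becomes `desc`. A minimal migration sketch in Ruby, assuming `context` is an already-initialized `LLaMACpp::Context` and `tokens` is an array of token ids from a previous generation step (both are placeholders, not part of this diff):

```ruby
# 0.4.0                              # 0.5.0
# context.token_to_str(id)      =>   context.token_to_piece(id)
# model.token_to_str(id)        =>   model.token_to_piece(id)
# model.type                    =>   model.desc

# Detokenizing generated output with the 0.5.0 API:
text = tokens.map { |token| context.token_to_piece(token) }.join
print text
```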
data/examples/chat.rb
CHANGED
@@ -122,7 +122,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
         if input_echo
           output = []
-          embd.each { |token| output << context.token_to_str(token) }
+          embd.each { |token| output << context.token_to_piece(token) }
           output_str = output.join
           output_str.chomp!(antiprompt) if first_input
           print(output_str)
@@ -131,7 +131,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
         if embd_input.size <= n_consumed
           if antiprompt.size.positive?
             last_output = []
-            last_n_tokens.each { |token| last_output << context.token_to_str(token) }
+            last_n_tokens.each { |token| last_output << context.token_to_piece(token) }
             last_output_str = last_output.join
 
             search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -50,7 +50,7 @@ if with_config('accelerate')
 end
 
 if with_config('metal')
-  $CFLAGS << ' -DGGML_USE_METAL
+  $CFLAGS << ' -DGGML_USE_METAL'
   $CXXFLAGS << ' -DGGML_USE_METAL'
   $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
   $objs = %w[ggml.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -811,9 +811,11 @@ public:
     rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
     rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
     rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
-    rb_define_method(rb_cLLaMAModel, "
+    rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece_with_model), 1);
     rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
-    rb_define_method(rb_cLLaMAModel, "
+    rb_define_method(rb_cLLaMAModel, "desc", RUBY_METHOD_FUNC(_llama_model_get_model_desc), 0);
+    rb_define_method(rb_cLLaMAModel, "size", RUBY_METHOD_FUNC(_llama_model_get_model_size), 0);
+    rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
   }
 
 private:
@@ -974,7 +976,7 @@ private:
     return INT2NUM(llama_model_n_embd(ptr->model));
   }
 
-  static VALUE
+  static VALUE _llama_model_token_to_piece_with_model(VALUE self, VALUE token_) {
     if (!RB_INTEGER_TYPE_P(token_)) {
       rb_raise(rb_eArgError, "token must be an integer");
       return Qnil;
@@ -982,10 +984,10 @@ private:
     const llama_token token = NUM2INT(token_);
     LLaMAModelWrapper* ptr = get_llama_model(self);
     std::vector<char> result(8, 0);
-    const int n_tokens =
+    const int n_tokens = llama_token_to_piece_with_model(ptr->model, token, result.data(), result.size());
     if (n_tokens < 0) {
       result.resize(-n_tokens);
-      const int check =
+      const int check = llama_token_to_piece_with_model(ptr->model, token, result.data(), result.size());
       if (check != -n_tokens) {
         rb_raise(rb_eRuntimeError, "failed to convert");
         return Qnil;
@@ -1040,12 +1042,22 @@ private:
     return ret;
   }
 
-  static VALUE
+  static VALUE _llama_model_get_model_desc(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     char buf[128];
-
+    llama_model_desc(ptr->model, buf, sizeof(buf));
     return rb_str_new_cstr(buf);
   }
+
+  static VALUE _llama_model_get_model_size(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return UINT2NUM(llama_model_size(ptr->model));
+  }
+
+  static VALUE _llama_model_get_model_n_params(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return UINT2NUM(llama_model_n_params(ptr->model));
+  }
 };
 
 const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -1326,7 +1338,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "token_bos", RUBY_METHOD_FUNC(_llama_context_token_bos), 0);
     rb_define_method(rb_cLLaMAContext, "token_eos", RUBY_METHOD_FUNC(_llama_context_token_eos), 0);
     rb_define_method(rb_cLLaMAContext, "token_nl", RUBY_METHOD_FUNC(_llama_context_token_nl), 0);
-    rb_define_method(rb_cLLaMAContext, "
+    rb_define_method(rb_cLLaMAContext, "token_to_piece", RUBY_METHOD_FUNC(_llama_context_token_to_piece), 1);
     rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
     rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
@@ -1567,7 +1579,7 @@ private:
     return output;
   }
 
-  static VALUE
+  static VALUE _llama_context_token_to_piece(VALUE self, VALUE token_) {
    LLaMAContextWrapper* ptr = get_llama_context(self);
    if (ptr->ctx == NULL) {
      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
@@ -1575,10 +1587,10 @@ private:
    }
    const llama_token token = NUM2INT(token_);
    std::vector<char> result(8, 0);
-   const int n_tokens =
+   const int n_tokens = llama_token_to_piece(ptr->ctx, token, result.data(), result.size());
    if (n_tokens < 0) {
      result.resize(-n_tokens);
-     const int check =
+     const int check = llama_token_to_piece(ptr->ctx, token, result.data(), result.size());
      if (check != -n_tokens) {
        rb_raise(rb_eRuntimeError, "failed to convert");
        return Qnil;
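The hunks above also show the two Model methods added in 0.5.0: `size` wraps `llama_model_size` and `n_params` wraps `llama_model_n_params`, while `desc` now fills a 128-byte buffer via `llama_model_desc`. A hedged usage sketch, assuming `model` is an already-loaded `LLaMACpp::Model` (the token id below is an arbitrary example, not taken from this diff):

```ruby
puts model.desc      # short textual description of the model (from llama_model_desc)
puts model.size      # model size in bytes (from llama_model_size)
puts model.n_params  # total parameter count (from llama_model_n_params)

# token_to_piece is available on Model as well, without constructing a Context:
piece = model.token_to_piece(42)
```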
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -107,6 +107,10 @@ static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct g
 }
 
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+#ifdef GGML_ALLOCATOR_DEBUG
+    GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
+    GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
+#endif
     size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
@@ -268,7 +272,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
         /*.parse_seq = */ {0},
         /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
-        /*.allocated_tensors = */
+        /*.allocated_tensors = */ {0},
 #endif
     };
 
@@ -297,7 +301,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
         /*.parse_seq = */ {0},
         /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
-        /*.allocated_tensors = */
+        /*.allocated_tensors = */ {0},
 #endif
     };
 
@@ -317,8 +321,7 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
 //////////// compute graph allocator
 
 static bool ggml_is_view(struct ggml_tensor * t) {
-    return t->
-        t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
+    return t->view_src != NULL;
 }
 
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@@ -336,28 +339,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
     return true;
 }
 
-static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
-    switch (t->op) {
-        case GGML_OP_PERMUTE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_VIEW:
-            return t->src[0];
-        case GGML_OP_CPY:
-            return t->src[1];
-        default:
-            return NULL;
-    }
-}
-
-static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
-    struct ggml_tensor * parent = t;
-    do {
-        parent = get_view_parent(parent);
-    } while (ggml_is_view(parent));
-    return parent;
-}
-
 static bool ggml_op_can_inplace(enum ggml_op op) {
     switch (op) {
         case GGML_OP_SCALE:
@@ -365,7 +346,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_ADD:
         case GGML_OP_ADD1:
-        case GGML_OP_ACC:
         case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
@@ -375,7 +355,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_UNARY:
         case GGML_OP_ROPE:
         case GGML_OP_RMS_NORM:
-        case GGML_OP_SET:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_CONT:
             return true;
@@ -389,24 +368,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-
-
-            case GGML_OP_VIEW:
-                memcpy(&offset, node->op_params, sizeof(size_t));
-                node->data = (char *) node->src[0]->data + offset;
-                break;
-            case GGML_OP_PERMUTE:
-            case GGML_OP_RESHAPE:
-            case GGML_OP_TRANSPOSE:
-                node->data = node->src[0]->data;
-                break;
-            case GGML_OP_CPY:
-                node->data = node->src[1]->data;
-                break;
-            default:
-                GGML_ASSERT(!"unknown view op");
-                break;
-            }
+            assert(node->view_src->data != NULL);
+            node->data = (char *)node->view_src->data + node->view_offs;
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
@@ -426,7 +389,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                 struct hash_node * p_hn = hash_get(ht, parent);
                 if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
                     if (ggml_is_view(parent)) {
-                        struct ggml_tensor * view_src =
+                        struct ggml_tensor * view_src = parent->view_src;
                         struct hash_node * view_src_hn = hash_get(ht, view_src);
                         if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                             // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
@@ -468,7 +431,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
         struct ggml_tensor * node = gf->nodes[i];
 
         if (ggml_is_view(node)) {
-            struct ggml_tensor * view_src =
+            struct ggml_tensor * view_src = node->view_src;
             hash_get(ht, view_src)->n_views += 1;
         }
 
@@ -553,10 +516,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 
             if (p_hn->n_children == 0 && p_hn->n_views == 0) {
                 if (ggml_is_view(parent)) {
-                    struct ggml_tensor * view_src =
+                    struct ggml_tensor * view_src = parent->view_src;
                     struct hash_node * view_src_hn = hash_get(ht, view_src);
                     view_src_hn->n_views -= 1;
-                    AT_PRINTF("view_src %s\n", view_src->name);
+                    AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                     if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
                         ggml_allocator_free_tensor(alloc, view_src);
                     }
data/ext/llama_cpp/src/ggml-cuda.cu
CHANGED
@@ -306,11 +306,11 @@ typedef struct {
 #define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
-    half
+    half dm[2]; // super-block scales/mins
     uint8_t scales[2]; // 4-bit block scales/mins
     uint8_t qs[QK_K/2]; // 4--bit quants
 } block_q4_K;
-static_assert(sizeof(block_q4_K) ==
+static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
 #else
 typedef struct {
     half2 dm; // super-block scale for quantized scales/mins
@@ -737,8 +737,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
     const int tid = threadIdx.x;
     const uint8_t * q = x[i].qs;
     float * y = yy + i*QK_K;
-    const float d = (float)x[i].
-    const float m = (float)x[i].
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
     y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
     y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
 #endif
@@ -1155,8 +1155,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     const uint16_t * a = (const uint16_t *)x[i].scales;
     aux16[0] = a[0] & 0x0f0f;
     aux16[1] = (a[0] >> 4) & 0x0f0f;
-    const float d = (float)x[i].
-    const float m = (float)x[i].
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
     float sum = 0.f;
     for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
         sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
@@ -2845,8 +2845,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     aux16[0] = a[0] & 0x0f0f;
     aux16[1] = (a[0] >> 4) & 0x0f0f;
 
-    const float dall = bq4_K->
-    const float dmin = bq4_K->
+    const float dall = bq4_K->dm[0];
+    const float dmin = bq4_K->dm[1];
 
     const float d8_1 = __low2float(bq8_1[0].ds);
     const float d8_2 = __low2float(bq8_1[1].ds);
@@ -2929,7 +2929,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
+#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+#else
+        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
+#endif
     }
 
 #pragma unroll
@@ -3119,7 +3123,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
+#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+#endif
     }
 
 #pragma unroll
@@ -4709,6 +4715,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
+#if QK_K == 256
+
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
@@ -4740,6 +4748,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
         mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
             (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
+#endif
 }
 
 static void ggml_mul_mat_q4_K_q8_1_cuda(
@@ -4899,8 +4908,8 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
 
 static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
     const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-    GGML_ASSERT(
-    const dim3 block_dims(1,
+    GGML_ASSERT(ncols % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
@@ -4908,7 +4917,8 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
 
 static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
     const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-
+    GGML_ASSERT(ncols % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
@@ -6328,9 +6338,11 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
 
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
 
     const int mode = ((int32_t *) dst->op_params)[2];
     const bool is_glm = mode & 4;
+
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
 }
 