llama_cpp 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: af3a0e01bc9f3cfad4cee3f21144dd354640e1d4558125be36d4b499fa3b4c24
- data.tar.gz: 042a3b0491d98fa6a093c684e6ab751152f37c8438a3b4a7b19cb2d8c7ab95a7
+ metadata.gz: 715eab98a76ed825d66da6e4fcc84154dca8eed76f6cf6625d210a1ffb702958
+ data.tar.gz: 3ceafc312354d245e485b664d71450cd9c27bcd89f5faec91af6cdf1221c251f
  SHA512:
- metadata.gz: 7ed85bd8438ee3b3adab884795c4aecb5b0d72ad57b7e02bc281b62c3b1d669efab62a020e03b09defe3084ecd8afacc4220303e99167d04d668650768c7392b
- data.tar.gz: b705a0ccd2c7c1e15aed6383acb9d5a3d79d0a0c882a74c42b9099df9a27aff88ba08a2f06aa4d195382e8f41c1b16c0014a2047d1923369f275ca481d52bb21
+ metadata.gz: 7ebe959d9380c9d981156606fdd8a6bcea9b88914923e693b400cfcd605b8c216bdfdcc807c0e72a21fe5fc6d7d623118fc7246524d7f59acdb8bc0064d736bc
+ data.tar.gz: c6d428234d866c09d227b5c308a573e9721454ded3f7fdd36880706e7c47c72c67e6fed119c75d6898c6a1149cde853e5dbb59e3a390ef3d370aab4f0d6be548
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
+ ## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02
+
+ **Breaking Changes**
+ - Bump bundled llama.cpp from master-b1060 to master-b1140.
+ - Rename `token_to_str` method on Context to `token_to_piece` method.
+ - Rename `token_to_str` method on Model to `token_to_piece` method.
+ - Rename `type` method on Model to `desc` method.
+ - Add `size` and `n_params` methods to Model.
+
  ## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26

  **Breaking Changes**
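For code written against 0.4.0, these renames are mechanical changes at the call sites. A minimal migration sketch in Ruby follows; the constructor keywords (`model_path:`, `params:`) and the model path are assumptions taken from the gem's README, not part of this diff:

```ruby
require 'llama_cpp'

# Hedged sketch: construct a model and context as in the gem's README (assumed API).
params  = LLaMACpp::ContextParams.new
model   = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: params)
context = LLaMACpp::Context.new(model: model)

token = context.token_bos

# 0.4.0:
#   context.token_to_str(token)
#   model.token_to_str(token)
#   model.type
# 0.5.0:
piece_from_context = context.token_to_piece(token)
piece_from_model   = model.token_to_piece(token)
description        = model.desc
```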
data/examples/chat.rb CHANGED
@@ -122,7 +122,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
  if input_echo
  output = []
- embd.each { |token| output << context.token_to_str(token) }
+ embd.each { |token| output << context.token_to_piece(token) }
  output_str = output.join
  output_str.chomp!(antiprompt) if first_input
  print(output_str)
@@ -131,7 +131,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  if embd_input.size <= n_consumed
  if antiprompt.size.positive?
  last_output = []
- last_n_tokens.each { |token| last_output << context.token_to_str(token) }
+ last_n_tokens.each { |token| last_output << context.token_to_piece(token) }
  last_output_str = last_output.join
 
  search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
@@ -50,7 +50,7 @@ if with_config('accelerate')
  end
 
  if with_config('metal')
- $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
+ $CFLAGS << ' -DGGML_USE_METAL'
  $CXXFLAGS << ' -DGGML_USE_METAL'
  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
  $objs = %w[ggml.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
@@ -811,9 +811,11 @@ public:
  rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
  rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
  rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
- rb_define_method(rb_cLLaMAModel, "token_to_str", RUBY_METHOD_FUNC(_llama_model_token_to_str_with_model), 1);
+ rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece_with_model), 1);
  rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
- rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_model_type), 0);
+ rb_define_method(rb_cLLaMAModel, "desc", RUBY_METHOD_FUNC(_llama_model_get_model_desc), 0);
+ rb_define_method(rb_cLLaMAModel, "size", RUBY_METHOD_FUNC(_llama_model_get_model_size), 0);
+ rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
  }
 
  private:
@@ -974,7 +976,7 @@ private:
  return INT2NUM(llama_model_n_embd(ptr->model));
  }
 
- static VALUE _llama_model_token_to_str_with_model(VALUE self, VALUE token_) {
+ static VALUE _llama_model_token_to_piece_with_model(VALUE self, VALUE token_) {
  if (!RB_INTEGER_TYPE_P(token_)) {
  rb_raise(rb_eArgError, "token must be an integer");
  return Qnil;
@@ -982,10 +984,10 @@ private:
  const llama_token token = NUM2INT(token_);
  LLaMAModelWrapper* ptr = get_llama_model(self);
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_str_with_model(ptr->model, token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece_with_model(ptr->model, token, result.data(), result.size());
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- const int check = llama_token_to_str_with_model(ptr->model, token, result.data(), result.size());
+ const int check = llama_token_to_piece_with_model(ptr->model, token, result.data(), result.size());
  if (check != -n_tokens) {
  rb_raise(rb_eRuntimeError, "failed to convert");
  return Qnil;
@@ -1040,12 +1042,22 @@ private:
  return ret;
  }
 
- static VALUE _llama_model_get_model_type(VALUE self) {
+ static VALUE _llama_model_get_model_desc(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
  char buf[128];
- ::llama_model_type(ptr->model, buf, sizeof(buf));
+ llama_model_desc(ptr->model, buf, sizeof(buf));
  return rb_str_new_cstr(buf);
  }
+
+ static VALUE _llama_model_get_model_size(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return UINT2NUM(llama_model_size(ptr->model));
+ }
+
+ static VALUE _llama_model_get_model_n_params(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return UINT2NUM(llama_model_n_params(ptr->model));
+ }
  };
 
  const rb_data_type_t RbLLaMAModel::llama_model_type = {
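The wrappers added in the hunk above expose llama_model_desc, llama_model_size, and llama_model_n_params as plain readers on LLaMACpp::Model. A hedged usage sketch, assuming `model` was constructed as in the earlier example:

```ruby
# New in 0.5.0 (sketch): metadata readers on an already-constructed LLaMACpp::Model.
puts model.desc      # short description string from llama_model_desc
puts model.size      # total size of the model's tensors, in bytes
puts model.n_params  # total number of parameters
```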
@@ -1326,7 +1338,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "token_bos", RUBY_METHOD_FUNC(_llama_context_token_bos), 0);
  rb_define_method(rb_cLLaMAContext, "token_eos", RUBY_METHOD_FUNC(_llama_context_token_eos), 0);
  rb_define_method(rb_cLLaMAContext, "token_nl", RUBY_METHOD_FUNC(_llama_context_token_nl), 0);
- rb_define_method(rb_cLLaMAContext, "token_to_str", RUBY_METHOD_FUNC(_llama_context_token_to_str), 1);
+ rb_define_method(rb_cLLaMAContext, "token_to_piece", RUBY_METHOD_FUNC(_llama_context_token_to_piece), 1);
  rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
  rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
@@ -1567,7 +1579,7 @@ private:
  return output;
  }
 
- static VALUE _llama_context_token_to_str(VALUE self, VALUE token_) {
+ static VALUE _llama_context_token_to_piece(VALUE self, VALUE token_) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
@@ -1575,10 +1587,10 @@ private:
  }
  const llama_token token = NUM2INT(token_);
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_str(ptr->ctx, token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(ptr->ctx, token, result.data(), result.size());
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- const int check = llama_token_to_str(ptr->ctx, token, result.data(), result.size());
+ const int check = llama_token_to_piece(ptr->ctx, token, result.data(), result.size());
  if (check != -n_tokens) {
  rb_raise(rb_eRuntimeError, "failed to convert");
  return Qnil;
@@ -107,6 +107,10 @@ static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct g
  }
 
  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+ #ifdef GGML_ALLOCATOR_DEBUG
+ GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
+ GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
+ #endif
  size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
  size = aligned_offset(NULL, size, alloc->alignment);
 
@@ -268,7 +272,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
  /*.parse_seq = */ {0},
  /*.parse_seq_len = */ 0,
  #ifdef GGML_ALLOCATOR_DEBUG
- /*.allocated_tensors = */ = {0},
+ /*.allocated_tensors = */ {0},
  #endif
  };
 
@@ -297,7 +301,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
  /*.parse_seq = */ {0},
  /*.parse_seq_len = */ 0,
  #ifdef GGML_ALLOCATOR_DEBUG
- /*.allocated_tensors = */ = {0},
+ /*.allocated_tensors = */ {0},
  #endif
  };
 
@@ -317,8 +321,7 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
  //////////// compute graph allocator
 
  static bool ggml_is_view(struct ggml_tensor * t) {
- return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
- t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
+ return t->view_src != NULL;
  }
 
  static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@@ -336,28 +339,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
  return true;
  }
 
- static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
- switch (t->op) {
- case GGML_OP_PERMUTE:
- case GGML_OP_RESHAPE:
- case GGML_OP_TRANSPOSE:
- case GGML_OP_VIEW:
- return t->src[0];
- case GGML_OP_CPY:
- return t->src[1];
- default:
- return NULL;
- }
- }
-
- static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
- struct ggml_tensor * parent = t;
- do {
- parent = get_view_parent(parent);
- } while (ggml_is_view(parent));
- return parent;
- }
-
  static bool ggml_op_can_inplace(enum ggml_op op) {
  switch (op) {
  case GGML_OP_SCALE:
@@ -365,7 +346,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
  case GGML_OP_DIAG_MASK_INF:
  case GGML_OP_ADD:
  case GGML_OP_ADD1:
- case GGML_OP_ACC:
  case GGML_OP_SUB:
  case GGML_OP_MUL:
  case GGML_OP_DIV:
@@ -375,7 +355,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
  case GGML_OP_UNARY:
  case GGML_OP_ROPE:
  case GGML_OP_RMS_NORM:
- case GGML_OP_SET:
  case GGML_OP_SOFT_MAX:
  case GGML_OP_CONT:
  return true;
@@ -389,24 +368,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  struct hash_node * ht = alloc->hash_table;
  if (node->data == NULL) {
  if (ggml_is_view(node)) {
- size_t offset;
- switch(node->op) {
- case GGML_OP_VIEW:
- memcpy(&offset, node->op_params, sizeof(size_t));
- node->data = (char *) node->src[0]->data + offset;
- break;
- case GGML_OP_PERMUTE:
- case GGML_OP_RESHAPE:
- case GGML_OP_TRANSPOSE:
- node->data = node->src[0]->data;
- break;
- case GGML_OP_CPY:
- node->data = node->src[1]->data;
- break;
- default:
- GGML_ASSERT(!"unknown view op");
- break;
- }
+ assert(node->view_src->data != NULL);
+ node->data = (char *)node->view_src->data + node->view_offs;
  } else {
  // see if we can reuse a parent's buffer (inplace)
  if (ggml_op_can_inplace(node->op)) {
@@ -426,7 +389,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  struct hash_node * p_hn = hash_get(ht, parent);
  if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
  if (ggml_is_view(parent)) {
- struct ggml_tensor * view_src = get_view_source(parent);
+ struct ggml_tensor * view_src = parent->view_src;
  struct hash_node * view_src_hn = hash_get(ht, view_src);
  if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
  // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
@@ -468,7 +431,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  struct ggml_tensor * node = gf->nodes[i];
 
  if (ggml_is_view(node)) {
- struct ggml_tensor * view_src = get_view_source(node);
+ struct ggml_tensor * view_src = node->view_src;
  hash_get(ht, view_src)->n_views += 1;
  }
 
@@ -553,10 +516,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 
  if (p_hn->n_children == 0 && p_hn->n_views == 0) {
  if (ggml_is_view(parent)) {
- struct ggml_tensor * view_src = get_view_source(parent);
+ struct ggml_tensor * view_src = parent->view_src;
  struct hash_node * view_src_hn = hash_get(ht, view_src);
  view_src_hn->n_views -= 1;
- AT_PRINTF("view_src %s\n", view_src->name);
+ AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
  if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
  ggml_allocator_free_tensor(alloc, view_src);
  }
@@ -306,11 +306,11 @@ typedef struct {
  #define QI4_K (QK_K / (4*QR4_K))
  #ifdef GGML_QKK_64
  typedef struct {
- half d[2]; // super-block scales/mins
+ half dm[2]; // super-block scales/mins
  uint8_t scales[2]; // 4-bit block scales/mins
  uint8_t qs[QK_K/2]; // 4--bit quants
  } block_q4_K;
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+ static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
  #else
  typedef struct {
  half2 dm; // super-block scale for quantized scales/mins
@@ -737,8 +737,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
  const int tid = threadIdx.x;
  const uint8_t * q = x[i].qs;
  float * y = yy + i*QK_K;
- const float d = (float)x[i].d[0];
- const float m = (float)x[i].d[1];
+ const float d = (float)x[i].dm[0];
+ const float m = (float)x[i].dm[1];
  y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
  y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
  #endif
@@ -1155,8 +1155,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
  const uint16_t * a = (const uint16_t *)x[i].scales;
  aux16[0] = a[0] & 0x0f0f;
  aux16[1] = (a[0] >> 4) & 0x0f0f;
- const float d = (float)x[i].d[0];
- const float m = (float)x[i].d[1];
+ const float d = (float)x[i].dm[0];
+ const float m = (float)x[i].dm[1];
  float sum = 0.f;
  for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
  sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
@@ -2845,8 +2845,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  aux16[0] = a[0] & 0x0f0f;
  aux16[1] = (a[0] >> 4) & 0x0f0f;
 
- const float dall = bq4_K->d[0];
- const float dmin = bq4_K->d[1];
+ const float dall = bq4_K->dm[0];
+ const float dmin = bq4_K->dm[1];
 
  const float d8_1 = __low2float(bq8_1[0].ds);
  const float d8_2 = __low2float(bq8_1[1].ds);
@@ -2929,7 +2929,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
  const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
+ #if QK_K == 256
  x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+ #else
+ x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
+ #endif
  }
 
  #pragma unroll
@@ -3119,7 +3123,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
  const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
+ #if QK_K == 256
  x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+ #endif
  }
 
  #pragma unroll
@@ -4709,6 +4715,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
+ #if QK_K == 256
+
  int id;
  CUDA_CHECK(cudaGetDevice(&id));
  const int compute_capability = g_compute_capabilities[id];
@@ -4740,6 +4748,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
  mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
+ #endif
  }
 
  static void ggml_mul_mat_q4_K_q8_1_cuda(
@@ -4899,8 +4908,8 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
 
  static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
  const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
- GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
- const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
+ GGML_ASSERT(ncols % 2 == 0);
+ const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
  rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
@@ -4908,7 +4917,8 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
 
  static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
  const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
- const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
+ GGML_ASSERT(ncols % 2 == 0);
+ const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
  rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
@@ -6328,9 +6338,11 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
 
  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
 
  const int mode = ((int32_t *) dst->op_params)[2];
  const bool is_glm = mode & 4;
+
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
  }
 
@@ -24,6 +24,7 @@
 
  // max memory buffers that can be mapped to the device
  #define GGML_METAL_MAX_BUFFERS 16
+ #define GGML_METAL_MAX_COMMAND_BUFFERS 32
 
  struct ggml_tensor;
  struct ggml_cgraph;