llama_cpp 0.4.0 → 0.5.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: af3a0e01bc9f3cfad4cee3f21144dd354640e1d4558125be36d4b499fa3b4c24
- data.tar.gz: 042a3b0491d98fa6a093c684e6ab751152f37c8438a3b4a7b19cb2d8c7ab95a7
+ metadata.gz: 715eab98a76ed825d66da6e4fcc84154dca8eed76f6cf6625d210a1ffb702958
+ data.tar.gz: 3ceafc312354d245e485b664d71450cd9c27bcd89f5faec91af6cdf1221c251f
  SHA512:
- metadata.gz: 7ed85bd8438ee3b3adab884795c4aecb5b0d72ad57b7e02bc281b62c3b1d669efab62a020e03b09defe3084ecd8afacc4220303e99167d04d668650768c7392b
- data.tar.gz: b705a0ccd2c7c1e15aed6383acb9d5a3d79d0a0c882a74c42b9099df9a27aff88ba08a2f06aa4d195382e8f41c1b16c0014a2047d1923369f275ca481d52bb21
+ metadata.gz: 7ebe959d9380c9d981156606fdd8a6bcea9b88914923e693b400cfcd605b8c216bdfdcc807c0e72a21fe5fc6d7d623118fc7246524d7f59acdb8bc0064d736bc
+ data.tar.gz: c6d428234d866c09d227b5c308a573e9721454ded3f7fdd36880706e7c47c72c67e6fed119c75d6898c6a1149cde853e5dbb59e3a390ef3d370aab4f0d6be548
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
+ ## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02
+
+ **Breaking Changes**
+ - Bump bundled llama.cpp from master-b1060 to master-b1140.
+ - Rename `token_to_str` method on Context to `token_to_piece` method.
+ - Rename `token_to_str` method on Model to `token_to_piece` method.
+ - Rename `type` method on Model to `desc` method.
+ - Add `size` and `n_params` methods to Model.
+
  ## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26

  **Breaking Changes**
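
The 0.5.0 entry above renames the token-conversion and model-description methods and adds two model accessors. A minimal Ruby sketch of the renamed API follows; the model path and the constructor keyword arguments are illustrative assumptions based on the gem's README conventions, not part of this diff:

```ruby
require 'llama_cpp'

# Hypothetical local model file; any GGUF model supported by the bundled llama.cpp works.
params  = LLaMACpp::ContextParams.new
model   = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: params)
context = LLaMACpp::Context.new(model: model)

# 0.4.0: context.token_to_str(id) / model.token_to_str(id) / model.type
# 0.5.0: context.token_to_piece(id) / model.token_to_piece(id) / model.desc
puts context.token_to_piece(context.token_bos)
puts model.desc      # human-readable model description (formerly `type`)
puts model.size      # model size in bytes (new in 0.5.0)
puts model.n_params  # number of parameters (new in 0.5.0)
```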
data/examples/chat.rb CHANGED
@@ -122,7 +122,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation

  if input_echo
  output = []
- embd.each { |token| output << context.token_to_str(token) }
+ embd.each { |token| output << context.token_to_piece(token) }
  output_str = output.join
  output_str.chomp!(antiprompt) if first_input
  print(output_str)
@@ -131,7 +131,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  if embd_input.size <= n_consumed
  if antiprompt.size.positive?
  last_output = []
- last_n_tokens.each { |token| last_output << context.token_to_str(token) }
+ last_n_tokens.each { |token| last_output << context.token_to_piece(token) }
  last_output_str = last_output.join

  search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
@@ -50,7 +50,7 @@ if with_config('accelerate')
  end

  if with_config('metal')
- $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
+ $CFLAGS << ' -DGGML_USE_METAL'
  $CXXFLAGS << ' -DGGML_USE_METAL'
  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
  $objs = %w[ggml.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
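For local builds, the `with_config('metal')` branch above is enabled by passing the corresponding mkmf build flag at install time, e.g. `gem install llama_cpp -- --with-metal`. The flag name is inferred from the `with_config('metal')` check in the extconf hunk; it is standard RubyGems build-argument syntax rather than something documented in this diff.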
@@ -811,9 +811,11 @@ public:
  rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
  rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
  rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
- rb_define_method(rb_cLLaMAModel, "token_to_str", RUBY_METHOD_FUNC(_llama_model_token_to_str_with_model), 1);
+ rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece_with_model), 1);
  rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
- rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_model_type), 0);
+ rb_define_method(rb_cLLaMAModel, "desc", RUBY_METHOD_FUNC(_llama_model_get_model_desc), 0);
+ rb_define_method(rb_cLLaMAModel, "size", RUBY_METHOD_FUNC(_llama_model_get_model_size), 0);
+ rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
  }

  private:
@@ -974,7 +976,7 @@ private:
  return INT2NUM(llama_model_n_embd(ptr->model));
  }

- static VALUE _llama_model_token_to_str_with_model(VALUE self, VALUE token_) {
+ static VALUE _llama_model_token_to_piece_with_model(VALUE self, VALUE token_) {
  if (!RB_INTEGER_TYPE_P(token_)) {
  rb_raise(rb_eArgError, "token must be an integer");
  return Qnil;
@@ -982,10 +984,10 @@ private:
  const llama_token token = NUM2INT(token_);
  LLaMAModelWrapper* ptr = get_llama_model(self);
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_str_with_model(ptr->model, token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece_with_model(ptr->model, token, result.data(), result.size());
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- const int check = llama_token_to_str_with_model(ptr->model, token, result.data(), result.size());
+ const int check = llama_token_to_piece_with_model(ptr->model, token, result.data(), result.size());
  if (check != -n_tokens) {
  rb_raise(rb_eRuntimeError, "failed to convert");
  return Qnil;
@@ -1040,12 +1042,22 @@ private:
  return ret;
  }

- static VALUE _llama_model_get_model_type(VALUE self) {
+ static VALUE _llama_model_get_model_desc(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
  char buf[128];
- ::llama_model_type(ptr->model, buf, sizeof(buf));
+ llama_model_desc(ptr->model, buf, sizeof(buf));
  return rb_str_new_cstr(buf);
  }
+
+ static VALUE _llama_model_get_model_size(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return UINT2NUM(llama_model_size(ptr->model));
+ }
+
+ static VALUE _llama_model_get_model_n_params(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return UINT2NUM(llama_model_n_params(ptr->model));
+ }
  };

  const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -1326,7 +1338,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "token_bos", RUBY_METHOD_FUNC(_llama_context_token_bos), 0);
  rb_define_method(rb_cLLaMAContext, "token_eos", RUBY_METHOD_FUNC(_llama_context_token_eos), 0);
  rb_define_method(rb_cLLaMAContext, "token_nl", RUBY_METHOD_FUNC(_llama_context_token_nl), 0);
- rb_define_method(rb_cLLaMAContext, "token_to_str", RUBY_METHOD_FUNC(_llama_context_token_to_str), 1);
+ rb_define_method(rb_cLLaMAContext, "token_to_piece", RUBY_METHOD_FUNC(_llama_context_token_to_piece), 1);
  rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
  rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
@@ -1567,7 +1579,7 @@ private:
  return output;
  }

- static VALUE _llama_context_token_to_str(VALUE self, VALUE token_) {
+ static VALUE _llama_context_token_to_piece(VALUE self, VALUE token_) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
@@ -1575,10 +1587,10 @@ private:
  }
  const llama_token token = NUM2INT(token_);
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_str(ptr->ctx, token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(ptr->ctx, token, result.data(), result.size());
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- const int check = llama_token_to_str(ptr->ctx, token, result.data(), result.size());
+ const int check = llama_token_to_piece(ptr->ctx, token, result.data(), result.size());
  if (check != -n_tokens) {
  rb_raise(rb_eRuntimeError, "failed to convert");
  return Qnil;
@@ -107,6 +107,10 @@ static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct g
  }

  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+ #ifdef GGML_ALLOCATOR_DEBUG
+ GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
+ GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
+ #endif
  size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
  size = aligned_offset(NULL, size, alloc->alignment);

@@ -268,7 +272,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
  /*.parse_seq = */ {0},
  /*.parse_seq_len = */ 0,
  #ifdef GGML_ALLOCATOR_DEBUG
- /*.allocated_tensors = */ = {0},
+ /*.allocated_tensors = */ {0},
  #endif
  };

@@ -297,7 +301,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
  /*.parse_seq = */ {0},
  /*.parse_seq_len = */ 0,
  #ifdef GGML_ALLOCATOR_DEBUG
- /*.allocated_tensors = */ = {0},
+ /*.allocated_tensors = */ {0},
  #endif
  };

@@ -317,8 +321,7 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
  //////////// compute graph allocator

  static bool ggml_is_view(struct ggml_tensor * t) {
- return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
- t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
+ return t->view_src != NULL;
  }

  static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@@ -336,28 +339,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
  return true;
  }

- static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
- switch (t->op) {
- case GGML_OP_PERMUTE:
- case GGML_OP_RESHAPE:
- case GGML_OP_TRANSPOSE:
- case GGML_OP_VIEW:
- return t->src[0];
- case GGML_OP_CPY:
- return t->src[1];
- default:
- return NULL;
- }
- }
-
- static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
- struct ggml_tensor * parent = t;
- do {
- parent = get_view_parent(parent);
- } while (ggml_is_view(parent));
- return parent;
- }
-
  static bool ggml_op_can_inplace(enum ggml_op op) {
  switch (op) {
  case GGML_OP_SCALE:
@@ -365,7 +346,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
  case GGML_OP_DIAG_MASK_INF:
  case GGML_OP_ADD:
  case GGML_OP_ADD1:
- case GGML_OP_ACC:
  case GGML_OP_SUB:
  case GGML_OP_MUL:
  case GGML_OP_DIV:
@@ -375,7 +355,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
  case GGML_OP_UNARY:
  case GGML_OP_ROPE:
  case GGML_OP_RMS_NORM:
- case GGML_OP_SET:
  case GGML_OP_SOFT_MAX:
  case GGML_OP_CONT:
  return true;
@@ -389,24 +368,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  struct hash_node * ht = alloc->hash_table;
  if (node->data == NULL) {
  if (ggml_is_view(node)) {
- size_t offset;
- switch(node->op) {
- case GGML_OP_VIEW:
- memcpy(&offset, node->op_params, sizeof(size_t));
- node->data = (char *) node->src[0]->data + offset;
- break;
- case GGML_OP_PERMUTE:
- case GGML_OP_RESHAPE:
- case GGML_OP_TRANSPOSE:
- node->data = node->src[0]->data;
- break;
- case GGML_OP_CPY:
- node->data = node->src[1]->data;
- break;
- default:
- GGML_ASSERT(!"unknown view op");
- break;
- }
+ assert(node->view_src->data != NULL);
+ node->data = (char *)node->view_src->data + node->view_offs;
  } else {
  // see if we can reuse a parent's buffer (inplace)
  if (ggml_op_can_inplace(node->op)) {
@@ -426,7 +389,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  struct hash_node * p_hn = hash_get(ht, parent);
  if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
  if (ggml_is_view(parent)) {
- struct ggml_tensor * view_src = get_view_source(parent);
+ struct ggml_tensor * view_src = parent->view_src;
  struct hash_node * view_src_hn = hash_get(ht, view_src);
  if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
  // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
@@ -468,7 +431,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  struct ggml_tensor * node = gf->nodes[i];

  if (ggml_is_view(node)) {
- struct ggml_tensor * view_src = get_view_source(node);
+ struct ggml_tensor * view_src = node->view_src;
  hash_get(ht, view_src)->n_views += 1;
  }

@@ -553,10 +516,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(

  if (p_hn->n_children == 0 && p_hn->n_views == 0) {
  if (ggml_is_view(parent)) {
- struct ggml_tensor * view_src = get_view_source(parent);
+ struct ggml_tensor * view_src = parent->view_src;
  struct hash_node * view_src_hn = hash_get(ht, view_src);
  view_src_hn->n_views -= 1;
- AT_PRINTF("view_src %s\n", view_src->name);
+ AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
  if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
  ggml_allocator_free_tensor(alloc, view_src);
  }
@@ -306,11 +306,11 @@ typedef struct {
  #define QI4_K (QK_K / (4*QR4_K))
  #ifdef GGML_QKK_64
  typedef struct {
- half d[2]; // super-block scales/mins
+ half dm[2]; // super-block scales/mins
  uint8_t scales[2]; // 4-bit block scales/mins
  uint8_t qs[QK_K/2]; // 4--bit quants
  } block_q4_K;
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+ static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
  #else
  typedef struct {
  half2 dm; // super-block scale for quantized scales/mins
@@ -737,8 +737,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
  const int tid = threadIdx.x;
  const uint8_t * q = x[i].qs;
  float * y = yy + i*QK_K;
- const float d = (float)x[i].d[0];
- const float m = (float)x[i].d[1];
+ const float d = (float)x[i].dm[0];
+ const float m = (float)x[i].dm[1];
  y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
  y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
  #endif
@@ -1155,8 +1155,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
  const uint16_t * a = (const uint16_t *)x[i].scales;
  aux16[0] = a[0] & 0x0f0f;
  aux16[1] = (a[0] >> 4) & 0x0f0f;
- const float d = (float)x[i].d[0];
- const float m = (float)x[i].d[1];
+ const float d = (float)x[i].dm[0];
+ const float m = (float)x[i].dm[1];
  float sum = 0.f;
  for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
  sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
@@ -2845,8 +2845,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  aux16[0] = a[0] & 0x0f0f;
  aux16[1] = (a[0] >> 4) & 0x0f0f;

- const float dall = bq4_K->d[0];
- const float dmin = bq4_K->d[1];
+ const float dall = bq4_K->dm[0];
+ const float dmin = bq4_K->dm[1];

  const float d8_1 = __low2float(bq8_1[0].ds);
  const float d8_2 = __low2float(bq8_1[1].ds);
@@ -2929,7 +2929,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

  const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;

+ #if QK_K == 256
  x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+ #else
+ x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
+ #endif
  }

  #pragma unroll
@@ -3119,7 +3123,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

  const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;

+ #if QK_K == 256
  x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+ #endif
  }

  #pragma unroll
@@ -4709,6 +4715,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

+ #if QK_K == 256
+
  int id;
  CUDA_CHECK(cudaGetDevice(&id));
  const int compute_capability = g_compute_capabilities[id];
@@ -4740,6 +4748,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
  mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
+ #endif
  }

  static void ggml_mul_mat_q4_K_q8_1_cuda(
@@ -4899,8 +4908,8 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons

  static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
  const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
- GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
- const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
+ GGML_ASSERT(ncols % 2 == 0);
+ const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
  rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
@@ -4908,7 +4917,8 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i

  static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
  const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
- const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
+ GGML_ASSERT(ncols % 2 == 0);
+ const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
  rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
@@ -6328,9 +6338,11 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml

  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented

  const int mode = ((int32_t *) dst->op_params)[2];
  const bool is_glm = mode & 4;
+
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
  }

@@ -24,6 +24,7 @@

  // max memory buffers that can be mapped to the device
  #define GGML_METAL_MAX_BUFFERS 16
+ #define GGML_METAL_MAX_COMMAND_BUFFERS 32

  struct ggml_tensor;
  struct ggml_cgraph;