llama_cpp 0.3.6 → 0.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -195,8 +195,8 @@ typedef void * thread_ret_t;
  #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
  #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
  #else
- inline static void* ggml_aligned_malloc(size_t size) {
- void* aligned_memory = NULL;
+ inline static void * ggml_aligned_malloc(size_t size) {
+ void * aligned_memory = NULL;
  #ifdef GGML_USE_METAL
  int result = posix_memalign(&aligned_memory, getpagesize(), size);
  #else
@@ -3811,7 +3811,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "CROSS_ENTROPY_LOSS_BACK",
  };

- static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
+ static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");

  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "none",
@@ -3883,7 +3883,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "cross_entropy_loss_back(x,y)",
  };

- static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
+ static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");

  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -4110,7 +4110,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
  //
  // is enough, but just in case, adding the second part

- return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+ return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
  }

  size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
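Note on the ggml_nbytes change above: GGML_PAD rounds the byte count up to a multiple of GGML_MEM_ALIGN. A minimal standalone sketch of that rounding, assuming the usual power-of-two round-up trick; the macro and constant below carry a _SKETCH suffix because they are illustrations, not the library's definitions:

#include <assert.h>
#include <stddef.h>

#define MEM_ALIGN_SKETCH 16
// round x up to a multiple of n (n assumed to be a power of two)
#define PAD_SKETCH(x, n) (((x) + (n) - 1) & ~((n) - 1))

int main(void) {
    assert(PAD_SKETCH((size_t) 34, MEM_ALIGN_SKETCH) == 48); // a 34-byte tensor now reports 48 bytes
    assert(PAD_SKETCH((size_t) 48, MEM_ALIGN_SKETCH) == 48); // already-aligned sizes are unchanged
    return 0;
}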
@@ -4253,7 +4253,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
  }

- static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

  return
@@ -4602,7 +4602,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  /*.ne =*/ { 1, 1, 1, 1 },
  /*.nb =*/ { 0, 0, 0, 0 },
  /*.op =*/ GGML_OP_NONE,
- /*.op_params =*/ {0},
+ /*.op_params =*/ { 0 },
  /*.is_param =*/ false,
  /*.grad =*/ NULL,
  /*.src =*/ { NULL },
@@ -4634,6 +4634,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  }

  static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+ GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
  assert(params_size <= GGML_MAX_OP_PARAMS);
  memcpy(tensor->op_params, params, params_size);
  }
@@ -6439,7 +6440,7 @@ struct ggml_tensor * ggml_permute(
  result->src[0] = a;

  int32_t params[] = { axis0, axis1, axis2, axis3 };
- ggml_set_op_params(result, &params, sizeof(params));
+ ggml_set_op_params(result, params, sizeof(params));

  return result;
  }
@@ -6565,7 +6566,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

  int32_t params[] = { n_past, inplace ? 1 : 0 };
- ggml_set_op_params(result, &params, sizeof(params));
+ ggml_set_op_params(result, params, sizeof(params));

  result->op = GGML_OP_DIAG_MASK_INF;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6605,7 +6606,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

  int32_t params[] = { n_past, inplace ? 1 : 0 };
- ggml_set_op_params(result, &params, sizeof(params));
+ ggml_set_op_params(result, params, sizeof(params));

  result->op = GGML_OP_DIAG_MASK_ZERO;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6721,9 +6722,9 @@ static struct ggml_tensor * ggml_rope_impl(
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

  int32_t params[6] = { n_past, n_dims, mode, n_ctx };
- memcpy(params + 4, &freq_base, sizeof(float));
+ memcpy(params + 4, &freq_base, sizeof(float));
  memcpy(params + 5, &freq_scale, sizeof(float));
- ggml_set_op_params(result, &params, sizeof(params));
+ ggml_set_op_params(result, params, sizeof(params));

  result->op = GGML_OP_ROPE;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6797,7 +6798,7 @@ struct ggml_tensor * ggml_rope_back(
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

  int32_t params[] = { n_past, n_dims, mode, n_ctx };
- ggml_set_op_params(result, &params, sizeof(params));
+ ggml_set_op_params(result, params, sizeof(params));

  result->op = GGML_OP_ROPE_BACK;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6828,7 +6829,7 @@ struct ggml_tensor * ggml_alibi(

  int32_t op_params[3] = { n_past, n_head };
  memcpy(op_params + 2, &bias_max, sizeof(float));
- ggml_set_op_params(result, &op_params, sizeof(op_params));
+ ggml_set_op_params(result, op_params, sizeof(op_params));

  result->op = GGML_OP_ALIBI;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6855,7 +6856,7 @@ struct ggml_tensor * ggml_clamp(
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);

  float params[] = { min, max };
- ggml_set_op_params(result, &params, sizeof(params));
+ ggml_set_op_params(result, params, sizeof(params));

  result->op = GGML_OP_CLAMP;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6890,10 +6891,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
  ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
  a->ne[2], 1, 1,
  };
- struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);

  int32_t params[] = { s0, p0, d0 };
- ggml_set_op_params(result, &params, sizeof(params));
+ ggml_set_op_params(result, params, sizeof(params));

  result->op = GGML_OP_CONV_1D;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6905,10 +6906,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(

  // ggml_conv_2d

- struct ggml_tensor* ggml_conv_2d(
- struct ggml_context* ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
+ struct ggml_tensor * ggml_conv_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
  int s0,
  int s1,
  int p0,
@@ -6929,10 +6930,10 @@ struct ggml_tensor* ggml_conv_2d(
  ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
  a->ne[3], b->ne[3],
  };
- struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

  int32_t params[] = { s0, s1, p0, p1, d0, d1 };
- ggml_set_op_params(result, &params, sizeof(params));
+ ggml_set_op_params(result, params, sizeof(params));

  result->op = GGML_OP_CONV_2D;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6945,7 +6946,7 @@ struct ggml_tensor* ggml_conv_2d(

  // ggml_conv_1d_ph

- struct ggml_tensor* ggml_conv_1d_ph(
+ struct ggml_tensor * ggml_conv_1d_ph(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
@@ -6963,7 +6964,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {

  // ggml_pool_1d

- struct ggml_tensor* ggml_pool_1d(
+ struct ggml_tensor * ggml_pool_1d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  enum ggml_op_pool op,
@@ -6982,10 +6983,10 @@ struct ggml_tensor* ggml_pool_1d(
  ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
  a->ne[1],
  };
- struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);

  int32_t params[] = { op, k0, s0, p0 };
- ggml_set_op_params(result, &params, sizeof(params));
+ ggml_set_op_params(result, params, sizeof(params));

  result->op = GGML_OP_POOL_1D;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6996,7 +6997,7 @@ struct ggml_tensor* ggml_pool_1d(

  // ggml_pool_2d

- struct ggml_tensor* ggml_pool_2d(
+ struct ggml_tensor * ggml_pool_2d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  enum ggml_op_pool op,
@@ -7019,10 +7020,10 @@ struct ggml_tensor* ggml_pool_2d(
  ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
  a->ne[2],
  };
- struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);

  int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
- ggml_set_op_params(result, &params, sizeof(params));
+ ggml_set_op_params(result, params, sizeof(params));

  result->op = GGML_OP_POOL_2D;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7190,7 +7191,7 @@ struct ggml_tensor * ggml_win_part(
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

  int32_t params[] = { npx, npy, w };
- ggml_set_op_params(result, &params, sizeof(params));
+ ggml_set_op_params(result, params, sizeof(params));

  result->op = GGML_OP_WIN_PART;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7220,7 +7221,7 @@ struct ggml_tensor * ggml_win_unpart(
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);

  int32_t params[] = { w };
- ggml_set_op_params(result, &params, sizeof(params));
+ ggml_set_op_params(result, params, sizeof(params));

  result->op = GGML_OP_WIN_UNPART;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7349,7 +7350,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
  return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
  }

- // ggml_map_custom1
+ // ggml_map_custom1_f32

  static struct ggml_tensor * ggml_map_custom1_impl_f32(
  struct ggml_context * ctx,
@@ -7366,7 +7367,7 @@ static struct ggml_tensor * ggml_map_custom1_impl_f32(

  ggml_set_op_params(result, (const void *) &fun, sizeof(fun));

- result->op = GGML_OP_MAP_CUSTOM1;
+ result->op = GGML_OP_MAP_CUSTOM1_F32;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src[0] = a;

@@ -7387,7 +7388,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32(
  return ggml_map_custom1_impl_f32(ctx, a, fun, true);
  }

- // ggml_map_custom2
+ // ggml_map_custom2_f32

  static struct ggml_tensor * ggml_map_custom2_impl_f32(
  struct ggml_context * ctx,
@@ -7405,7 +7406,7 @@ static struct ggml_tensor * ggml_map_custom2_impl_f32(

  ggml_set_op_params(result, (const void *) &fun, sizeof(fun));

- result->op = GGML_OP_MAP_CUSTOM2;
+ result->op = GGML_OP_MAP_CUSTOM2_F32;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src[0] = a;
  result->src[1] = b;
@@ -7429,7 +7430,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32(
  return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
  }

- // ggml_map_custom3
+ // ggml_map_custom3_f32

  static struct ggml_tensor * ggml_map_custom3_impl_f32(
  struct ggml_context * ctx,
@@ -7448,7 +7449,7 @@ static struct ggml_tensor * ggml_map_custom3_impl_f32(

  ggml_set_op_params(result, (const void *) &fun, sizeof(fun));

- result->op = GGML_OP_MAP_CUSTOM3;
+ result->op = GGML_OP_MAP_CUSTOM3_F32;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src[0] = a;
  result->src[1] = b;
@@ -7475,6 +7476,190 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32(
  return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
  }

+ // ggml_map_custom1
+ struct ggml_map_custom1_op_params {
+ ggml_custom1_op_t fun;
+ int n_tasks;
+ void * userdata;
+ };
+
+ static struct ggml_tensor * ggml_map_custom1_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ const ggml_custom1_op_t fun,
+ int n_tasks,
+ void * userdata,
+ bool inplace) {
+ GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+ bool is_node = false;
+
+ if (!inplace && a->grad) {
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+ struct ggml_map_custom1_op_params params = {
+ /*.fun =*/ fun,
+ /*.n_tasks =*/ n_tasks,
+ /*.userdata =*/ userdata
+ };
+ ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+ result->op = GGML_OP_MAP_CUSTOM1;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+
+ return result;
+ }
+
+ struct ggml_tensor * ggml_map_custom1(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ const ggml_custom1_op_t fun,
+ int n_tasks,
+ void * userdata) {
+ return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false);
+ }
+
+ struct ggml_tensor * ggml_map_custom1_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ const ggml_custom1_op_t fun,
+ int n_tasks,
+ void * userdata) {
+ return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true);
+ }
+
+ // ggml_map_custom2
+
+ struct ggml_map_custom2_op_params {
+ ggml_custom2_op_t fun;
+ int n_tasks;
+ void * userdata;
+ };
+
+ static struct ggml_tensor * ggml_map_custom2_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ const ggml_custom2_op_t fun,
+ int n_tasks,
+ void * userdata,
+ bool inplace) {
+ GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+ bool is_node = false;
+
+ if (!inplace && (a->grad || b->grad)) {
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+ struct ggml_map_custom2_op_params params = {
+ /*.fun =*/ fun,
+ /*.n_tasks =*/ n_tasks,
+ /*.userdata =*/ userdata
+ };
+ ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+ result->op = GGML_OP_MAP_CUSTOM2;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = b;
+
+ return result;
+ }
+
+ struct ggml_tensor * ggml_map_custom2(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ const ggml_custom2_op_t fun,
+ int n_tasks,
+ void * userdata) {
+ return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false);
+ }
+
+ struct ggml_tensor * ggml_map_custom2_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ const ggml_custom2_op_t fun,
+ int n_tasks,
+ void * userdata) {
+ return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true);
+ }
+
+ // ggml_map_custom3
+
+ struct ggml_map_custom3_op_params {
+ ggml_custom3_op_t fun;
+ int n_tasks;
+ void * userdata;
+ };
+
+ static struct ggml_tensor * ggml_map_custom3_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * c,
+ const ggml_custom3_op_t fun,
+ int n_tasks,
+ void * userdata,
+ bool inplace) {
+ GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+ bool is_node = false;
+
+ if (!inplace && (a->grad || b->grad || c->grad)) {
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+ struct ggml_map_custom3_op_params params = {
+ /*.fun =*/ fun,
+ /*.n_tasks =*/ n_tasks,
+ /*.userdata =*/ userdata
+ };
+ ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+ result->op = GGML_OP_MAP_CUSTOM3;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = b;
+ result->src[2] = c;
+
+ return result;
+ }
+
+ struct ggml_tensor * ggml_map_custom3(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * c,
+ const ggml_custom3_op_t fun,
+ int n_tasks,
+ void * userdata) {
+ return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false);
+ }
+
+ struct ggml_tensor * ggml_map_custom3_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * c,
+ const ggml_custom3_op_t fun,
+ int n_tasks,
+ void * userdata) {
+ return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
+ }
+
+
+
  // ggml_cross_entropy_loss

  struct ggml_tensor * ggml_cross_entropy_loss(
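The block above adds the new ggml_map_custom1/2/3 API, which stores the callback together with a task count and a userdata pointer in op_params (the older *_f32 variants kept only a function pointer). A minimal usage sketch, assuming the callback receives (dst, a, ith, nth, userdata) as the dispatch code later in this diff suggests; scale_rows and the scale parameter are illustrative names, not part of the package:

#include "ggml.h"

// Illustrative ggml_custom1_op_t callback: scales every row of `a` into `dst`,
// splitting rows across the nth threads the scheduler hands it.
static void scale_rows(struct ggml_tensor * dst, const struct ggml_tensor * a,
                       int ith, int nth, void * userdata) {
    const float   scale = *(const float *) userdata;
    const int64_t nr    = ggml_nrows(dst);
    const int64_t dr    = (nr + nth - 1)/nth;            // rows per thread
    const int64_t ir0   = dr*ith;
    const int64_t ir1   = ir0 + dr < nr ? ir0 + dr : nr; // this thread's row range
    for (int64_t ir = ir0; ir < ir1; ++ir) {
        const float * src = (const float *) ((const char *) a->data   + ir*a->nb[1]);
        float       * out = (float       *) ((char       *) dst->data + ir*dst->nb[1]);
        for (int64_t i = 0; i < dst->ne[0]; ++i) {
            out[i] = scale*src[i];
        }
    }
}

// In graph-building code (sketch):
//   float scale = 2.0f;
//   struct ggml_tensor * y = ggml_map_custom1(ctx, x, scale_rows, GGML_N_TASKS_MAX, &scale);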
@@ -9283,8 +9468,8 @@ static void ggml_compute_forward_sum_rows_f32(
  for (int64_t i3 = 0; i3 < ne03; i3++) {
  for (int64_t i2 = 0; i2 < ne02; i2++) {
  for (int64_t i1 = 0; i1 < ne01; i1++) {
- float* src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
- float* dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
+ float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
+ float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
  float row_sum = 0;
  ggml_vec_sum_f32(ne00, &row_sum, src_row);
  dst_row[0] = row_sum;
@@ -10546,71 +10731,95 @@ static void ggml_compute_forward_mul_mat(
  return;
  }

- // parallelize by src0 rows
- const int64_t dr = (ne01 + nth - 1)/nth;
+ const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+ const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+
+ const int64_t nr0 = ne01; // src0 rows
+ const int64_t nr1 = ne11*ne12*ne13; // src1 rows

- const int64_t ir10 = dr*ith;
- const int64_t ir11 = MIN(ir10 + dr, ne01);
+ //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);

- // src1 rows
- const int64_t nr1 = ne11*ne12*ne13;
+ // distribute the thread work across the inner or outer loop based on which one is larger

- const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
- const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+ const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+ const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows

- for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
- const int64_t i13 = (ir1/(ne12*ne11));
- const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
- const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
-
- const int64_t ir0 = (ir1/ne11)%(ne02*ne03);
- const int64_t i03 = (ir0/(ne02));
- // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2.
- // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470:
- // GG: this is likely the correct way to broadcast, though need some more thought
- // therefore leaving the comments to remind us for now
- const int64_t i02 = (i12 / (ne12 / ne02));
- // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon)
- // const int64_t i02 = (ir0 - i03*ne02);
-
- const int64_t i1 = i11;
- const int64_t i2 = i12;
- const int64_t i3 = i13;
-
- const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
-
- // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
- // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
- // the original src1 data pointer, so we should index using the indices directly
- // TODO: this is a bit of a hack, we should probably have a better way to handle this
- const char * src1_col = (const char *) wdata +
- (src1_cont || src1->type != vec_dot_type
- ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
- : (i11*nb11 + i12*nb12 + i13*nb13));
-
- float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
-
- for (int64_t ir = ir10; ir < ir11; ++ir) {
- vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
- }
+ const int64_t ith0 = ith % nth0;
+ const int64_t ith1 = ith / nth0;
+
+ const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+ const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
+
+ const int64_t ir010 = dr0*ith0;
+ const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+ const int64_t ir110 = dr1*ith1;
+ const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+ //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+
+ // threads with no work simply yield (not sure if it helps)
+ if (ir010 >= ir011 || ir110 >= ir111) {
+ sched_yield();
+ return;
  }

- //int64_t t1 = ggml_time_us();
- //static int64_t acc = 0;
- //acc += t1 - t0;
- //if (t1 - t0 > 10) {
- // printf("\n");
- // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
- // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
- // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
+ assert(ne12 % ne02 == 0);
+ assert(ne13 % ne03 == 0);

- // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
- //}
- }
+ // broadcast factors
+ const int64_t r2 = ne12/ne02;
+ const int64_t r3 = ne13/ne03;

+ // block-tiling attempt
+ const int64_t blck_0 = 16;
+ const int64_t blck_1 = 16;

- // ggml_compute_forward_out_prod
+ // attempt to reduce false-sharing (does not seem to make a difference)
+ float tmp[16];

+ for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+ for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+ for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+ const int64_t i13 = (ir1/(ne12*ne11));
+ const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
+ const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
+
+ // broadcast src0 into src1
+ const int64_t i03 = i13/r3;
+ const int64_t i02 = i12/r2;
+
+ const int64_t i1 = i11;
+ const int64_t i2 = i12;
+ const int64_t i3 = i13;
+
+ const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
+
+ // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+ // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+ // the original src1 data pointer, so we should index using the indices directly
+ // TODO: this is a bit of a hack, we should probably have a better way to handle this
+ const char * src1_col = (const char *) wdata +
+ (src1_cont || src1->type != vec_dot_type
+ ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
+ : (i11*nb11 + i12*nb12 + i13*nb13));
+
+ float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+ //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+ // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+ //}
+
+ for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+ vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+ }
+ memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+ }
+ }
+ }
+ }
+
+ // ggml_compute_forward_out_prod

  static void ggml_compute_forward_out_prod_f32(
  const struct ggml_compute_params * params,
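The rewritten ggml_compute_forward_mul_mat above replaces the old src0-row-only split with a 2D split: whichever of nr0 (src0 rows) or nr1 (src1 rows, including broadcast batches) is larger gets all nth threads, and each thread then owns one tile of the row grid. A standalone sketch of just that index math (not part of ggml; the MIN_SKETCH macro, the printf driver, and the 4096x32 example sizes are made up for illustration):

#include <stdio.h>

#define MIN_SKETCH(a, b) ((a) < (b) ? (a) : (b))

// Mirrors the thread-split arithmetic from the diff: an nth0 x nth1 grid of threads,
// each owning src0 rows [ir010, ir011) and src1 rows [ir110, ir111).
static void mul_mat_split(long nr0, long nr1, int nth, int ith) {
    const long nth0 = nr0 > nr1 ? nth : 1;   // parallelize by src0 rows
    const long nth1 = nr0 > nr1 ? 1 : nth;   // parallelize by src1 rows

    const long ith0 = ith % nth0;
    const long ith1 = ith / nth0;

    const long dr0 = (nr0 + nth0 - 1)/nth0;
    const long dr1 = (nr1 + nth1 - 1)/nth1;

    printf("thread %d: src0 rows [%ld, %ld), src1 rows [%ld, %ld)\n", ith,
           dr0*ith0, MIN_SKETCH(dr0*ith0 + dr0, nr0),
           dr1*ith1, MIN_SKETCH(dr1*ith1 + dr1, nr1));
}

int main(void) {
    for (int ith = 0; ith < 4; ++ith) {
        mul_mat_split(4096, 32, 4, ith); // weight rows dominate, so src0 rows are split across threads
    }
    return 0;
}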
@@ -12894,7 +13103,7 @@ static void ggml_compute_forward_pool_1d(
  const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {

- const int32_t* opts = (const int32_t*)dst->op_params;
+ const int32_t * opts = (const int32_t *)dst->op_params;
  enum ggml_op_pool op = opts[0];
  const int k0 = opts[1];
  const int s0 = opts[2];
@@ -14227,24 +14436,6 @@ static void ggml_compute_forward_map_custom1_f32(
  fun(dst, a);
  }

-
- static void ggml_compute_forward_map_custom1(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * a,
- struct ggml_tensor * dst,
- const ggml_custom1_op_f32_t fun) {
- switch (a->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
- } break;
- default:
- {
- GGML_ASSERT(false);
- } break;
- }
- }
-
  // ggml_compute_forward_map_custom2

  static void ggml_compute_forward_map_custom2_f32(
@@ -14263,24 +14454,6 @@ static void ggml_compute_forward_map_custom2_f32(
  }


- static void ggml_compute_forward_map_custom2(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * a,
- const struct ggml_tensor * b,
- struct ggml_tensor * dst,
- const ggml_custom2_op_f32_t fun) {
- switch (a->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
- } break;
- default:
- {
- GGML_ASSERT(false);
- } break;
- }
- }
-
  // ggml_compute_forward_map_custom3

  static void ggml_compute_forward_map_custom3_f32(
@@ -14299,24 +14472,52 @@ static void ggml_compute_forward_map_custom3_f32(
  fun(dst, a, b, c);
  }

+ // ggml_compute_forward_map_custom1
+
+ static void ggml_compute_forward_map_custom1(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * a,
+ struct ggml_tensor * dst) {
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params;
+
+ p->fun(dst, a, params->ith, params->nth, p->userdata);
+ }
+
+ // ggml_compute_forward_map_custom2
+
+ static void ggml_compute_forward_map_custom2(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * a,
+ const struct ggml_tensor * b,
+ struct ggml_tensor * dst) {
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params;
+
+ p->fun(dst, a, b, params->ith, params->nth, p->userdata);
+ }
+
+ // ggml_compute_forward_map_custom3

  static void ggml_compute_forward_map_custom3(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * a,
  const struct ggml_tensor * b,
  const struct ggml_tensor * c,
- struct ggml_tensor * dst,
- const ggml_custom3_op_f32_t fun) {
- switch (a->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
- } break;
- default:
- {
- GGML_ASSERT(false);
- } break;
+ struct ggml_tensor * dst) {
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
  }
+
+ struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params;
+
+ p->fun(dst, a, b, c, params->ith, params->nth, p->userdata);
  }

  // ggml_compute_forward_cross_entropy_loss
@@ -14838,25 +15039,40 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
  ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
  }
  break;
- case GGML_OP_MAP_CUSTOM1:
+ case GGML_OP_MAP_CUSTOM1_F32:
  {
  ggml_custom1_op_f32_t fun;
  memcpy(&fun, tensor->op_params, sizeof(fun));
- ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun);
+ ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
  }
  break;
- case GGML_OP_MAP_CUSTOM2:
+ case GGML_OP_MAP_CUSTOM2_F32:
  {
  ggml_custom2_op_f32_t fun;
  memcpy(&fun, tensor->op_params, sizeof(fun));
- ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun);
+ ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
  }
  break;
- case GGML_OP_MAP_CUSTOM3:
+ case GGML_OP_MAP_CUSTOM3_F32:
  {
  ggml_custom3_op_f32_t fun;
  memcpy(&fun, tensor->op_params, sizeof(fun));
- ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
+ ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
+ }
+ break;
+ case GGML_OP_MAP_CUSTOM1:
+ {
+ ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
+ }
+ break;
+ case GGML_OP_MAP_CUSTOM2:
+ {
+ ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
+ }
+ break;
+ case GGML_OP_MAP_CUSTOM3:
+ {
+ ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
  }
  break;
  case GGML_OP_CROSS_ENTROPY_LOSS:
@@ -15664,6 +15880,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  } break;
  case GGML_OP_MAP_UNARY:
  case GGML_OP_MAP_BINARY:
+ case GGML_OP_MAP_CUSTOM1_F32:
+ case GGML_OP_MAP_CUSTOM2_F32:
+ case GGML_OP_MAP_CUSTOM3_F32:
  case GGML_OP_MAP_CUSTOM1:
  case GGML_OP_MAP_CUSTOM2:
  case GGML_OP_MAP_CUSTOM3:
@@ -16449,11 +16668,38 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
  case GGML_OP_WIN_UNPART:
  case GGML_OP_MAP_UNARY:
  case GGML_OP_MAP_BINARY:
+ case GGML_OP_MAP_CUSTOM1_F32:
+ case GGML_OP_MAP_CUSTOM2_F32:
+ case GGML_OP_MAP_CUSTOM3_F32:
+ {
+ n_tasks = 1;
+ } break;
  case GGML_OP_MAP_CUSTOM1:
+ {
+ struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
+ n_tasks = n_threads;
+ } else {
+ n_tasks = MIN(p->n_tasks, n_threads);
+ }
+ } break;
  case GGML_OP_MAP_CUSTOM2:
+ {
+ struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
+ n_tasks = n_threads;
+ } else {
+ n_tasks = MIN(p->n_tasks, n_threads);
+ }
+ } break;
  case GGML_OP_MAP_CUSTOM3:
  {
- n_tasks = 1;
+ struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
+ n_tasks = n_threads;
+ } else {
+ n_tasks = MIN(p->n_tasks, n_threads);
+ }
  } break;
  case GGML_OP_CROSS_ENTROPY_LOSS:
  {