llama_cpp 0.3.6 → 0.3.7

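Unified diff of the bundled ggml.c between the two gem versions.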
@@ -195,8 +195,8 @@ typedef void * thread_ret_t;
 #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
 #else
-inline static void* ggml_aligned_malloc(size_t size) {
-    void* aligned_memory = NULL;
+inline static void * ggml_aligned_malloc(size_t size) {
+    void * aligned_memory = NULL;
 #ifdef GGML_USE_METAL
     int result = posix_memalign(&aligned_memory, getpagesize(), size);
 #else
@@ -3811,7 +3811,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
+static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3883,7 +3883,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
+static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
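The op-count bump from 59 to 62 is accounted for at the end of this diff: the old fixed-signature map ops survive under new GGML_OP_MAP_CUSTOM1_F32/2_F32/3_F32 names, and three general GGML_OP_MAP_CUSTOM1/2/3 ops are added in their place, a net gain of three enum values.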
@@ -4110,7 +4110,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     //
     // is enough, but just in case, adding the second part
 
-    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+    return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
 }
 
 size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
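GGML_PAD rounds its first argument up to a multiple of the second, so ggml_nbytes() now always reports a GGML_MEM_ALIGN-aligned size. A minimal sketch of the rounding arithmetic (the real macro lives in ggml.h; this stand-in assumes the alignment is a power of two):

// illustrative stand-in for GGML_PAD, not the library's definition
static size_t pad_to_align(size_t size, size_t align) {
    return (size + align - 1) & ~(align - 1); // requires power-of-two align
}
// pad_to_align(100, 16) == 112: 100 bytes round up to the next 16-byte multiple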
@@ -4253,7 +4253,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -4602,7 +4602,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.ne           =*/ { 1, 1, 1, 1 },
         /*.nb           =*/ { 0, 0, 0, 0 },
         /*.op           =*/ GGML_OP_NONE,
-        /*.op_params    =*/ {0},
+        /*.op_params    =*/ { 0 },
         /*.is_param     =*/ false,
         /*.grad         =*/ NULL,
         /*.src          =*/ { NULL },
@@ -4634,6 +4634,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 }
 
 static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
     assert(params_size <= GGML_MAX_OP_PARAMS);
     memcpy(tensor->op_params, params, params_size);
 }
@@ -6439,7 +6440,7 @@ struct ggml_tensor * ggml_permute(
     result->src[0] = a;
 
     int32_t params[] = { axis0, axis1, axis2, axis3 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     return result;
 }
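The recurring &params -> params change in ggml_permute and the functions below is a type cleanup rather than a behavior change: both expressions yield the same address, but &params has type int32_t (*)[4] (pointer to the whole array) while params decays to int32_t *. Since ggml_set_op_params() takes const void *, either form compiles; passing the decayed pointer is simply the idiomatic one. A standalone illustration:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    int32_t params[4] = { 0, 1, 2, 3 };
    int32_t * decayed    = params;  // array decays to pointer to first element
    int32_t (* whole)[4] = &params; // pointer to the whole array: same address, different type
    printf("%p %p\n", (void *) decayed, (void *) whole);
    return 0;
}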
@@ -6565,7 +6566,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past, inplace ? 1 : 0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_DIAG_MASK_INF;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6605,7 +6606,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past, inplace ? 1 : 0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_DIAG_MASK_ZERO;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6721,9 +6722,9 @@ static struct ggml_tensor * ggml_rope_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[6] = { n_past, n_dims, mode, n_ctx };
-    memcpy(params + 4, &freq_base, sizeof(float));
+    memcpy(params + 4, &freq_base,  sizeof(float));
     memcpy(params + 5, &freq_scale, sizeof(float));
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6797,7 +6798,7 @@ struct ggml_tensor * ggml_rope_back(
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past, n_dims, mode, n_ctx };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6828,7 +6829,7 @@ struct ggml_tensor * ggml_alibi(
 
     int32_t op_params[3] = { n_past, n_head };
     memcpy(op_params + 2, &bias_max, sizeof(float));
-    ggml_set_op_params(result, &op_params, sizeof(op_params));
+    ggml_set_op_params(result, op_params, sizeof(op_params));
 
     result->op = GGML_OP_ALIBI;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6855,7 +6856,7 @@ struct ggml_tensor * ggml_clamp(
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
 
     float params[] = { min, max };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_CLAMP;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6890,10 +6891,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
         a->ne[2], 1, 1,
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
     int32_t params[] = { s0, p0, d0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_CONV_1D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6905,10 +6906,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
 
 // ggml_conv_2d
 
-struct ggml_tensor* ggml_conv_2d(
-    struct ggml_context* ctx,
-    struct ggml_tensor * a,
-    struct ggml_tensor * b,
+struct ggml_tensor * ggml_conv_2d(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * b,
     int s0,
     int s1,
     int p0,
@@ -6929,10 +6930,10 @@ struct ggml_tensor* ggml_conv_2d(
         ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
         a->ne[3], b->ne[3],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     int32_t params[] = { s0, s1, p0, p1, d0, d1 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_CONV_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6945,7 +6946,7 @@ struct ggml_tensor* ggml_conv_2d(
 
 // ggml_conv_1d_ph
 
-struct ggml_tensor* ggml_conv_1d_ph(
+struct ggml_tensor * ggml_conv_1d_ph(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
     struct ggml_tensor * b,
@@ -6963,7 +6964,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
 
 // ggml_pool_1d
 
-struct ggml_tensor* ggml_pool_1d(
+struct ggml_tensor * ggml_pool_1d(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
     enum ggml_op_pool op,
@@ -6982,10 +6983,10 @@ struct ggml_tensor* ggml_pool_1d(
         ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
         a->ne[1],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
     int32_t params[] = { op, k0, s0, p0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_POOL_1D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6996,7 +6997,7 @@ struct ggml_tensor* ggml_pool_1d(
 
 // ggml_pool_2d
 
-struct ggml_tensor* ggml_pool_2d(
+struct ggml_tensor * ggml_pool_2d(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
     enum ggml_op_pool op,
@@ -7019,10 +7020,10 @@ struct ggml_tensor* ggml_pool_2d(
         ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
         a->ne[2],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
 
     int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_POOL_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7190,7 +7191,7 @@ struct ggml_tensor * ggml_win_part(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     int32_t params[] = { npx, npy, w };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_WIN_PART;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7220,7 +7221,7 @@ struct ggml_tensor * ggml_win_unpart(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
 
     int32_t params[] = { w };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_WIN_UNPART;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7349,7 +7350,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
     return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
 }
 
-// ggml_map_custom1
+// ggml_map_custom1_f32
 
 static struct ggml_tensor * ggml_map_custom1_impl_f32(
     struct ggml_context * ctx,
@@ -7366,7 +7367,7 @@ static struct ggml_tensor * ggml_map_custom1_impl_f32(
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op = GGML_OP_MAP_CUSTOM1;
+    result->op = GGML_OP_MAP_CUSTOM1_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
 
@@ -7387,7 +7388,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32(
     return ggml_map_custom1_impl_f32(ctx, a, fun, true);
 }
 
-// ggml_map_custom2
+// ggml_map_custom2_f32
 
 static struct ggml_tensor * ggml_map_custom2_impl_f32(
     struct ggml_context * ctx,
@@ -7405,7 +7406,7 @@ static struct ggml_tensor * ggml_map_custom2_impl_f32(
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op = GGML_OP_MAP_CUSTOM2;
+    result->op = GGML_OP_MAP_CUSTOM2_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
@@ -7429,7 +7430,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32(
     return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
 }
 
-// ggml_map_custom3
+// ggml_map_custom3_f32
 
 static struct ggml_tensor * ggml_map_custom3_impl_f32(
     struct ggml_context * ctx,
@@ -7448,7 +7449,7 @@ static struct ggml_tensor * ggml_map_custom3_impl_f32(
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op = GGML_OP_MAP_CUSTOM3;
+    result->op = GGML_OP_MAP_CUSTOM3_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
@@ -7475,6 +7476,190 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32(
     return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
 }
 
+// ggml_map_custom1
+struct ggml_map_custom1_op_params {
+    ggml_custom1_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom1_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_t fun,
+        int n_tasks,
+        void * userdata,
+        bool inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom1_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM1;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom1(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom1_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true);
+}
+
+// ggml_map_custom2
+
+struct ggml_map_custom2_op_params {
+    ggml_custom2_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom2_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_t fun,
+        int n_tasks,
+        void * userdata,
+        bool inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom2_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM2;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom2(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom2_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true);
+}
+
+// ggml_map_custom3
+
+struct ggml_map_custom3_op_params {
+    ggml_custom3_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom3_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_t fun,
+        int n_tasks,
+        void * userdata,
+        bool inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad || c->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom3_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM3;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom3(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom3_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
+}
+
+
 // ggml_cross_entropy_loss
 
 struct ggml_tensor * ggml_cross_entropy_loss(
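The block above is the centerpiece of this release: ggml_map_custom1/2/3 now take a typed callback plus an n_tasks thread count and an opaque userdata pointer, superseding the single-threaded f32-only variants (which survive under the *_f32 names). A hedged usage sketch follows; ctx and x are assumed to be an initialized ggml_context and a contiguous F32 tensor, and scale_op is an illustrative name, not part of the library:

// The callback matches ggml_custom1_op_t as introduced in this release:
//   void (*)(struct ggml_tensor * dst, const struct ggml_tensor * a,
//            int ith, int nth, void * userdata);
static void scale_op(struct ggml_tensor * dst, const struct ggml_tensor * a,
                     int ith, int nth, void * userdata) {
    const float factor = *(const float *) userdata;
    const int64_t n  = ggml_nelements(dst);
    const int64_t dr = (n + nth - 1)/nth; // split the elements across threads, ggml-style
    const int64_t i0 = dr*ith;
    const int64_t i1 = i0 + dr < n ? i0 + dr : n;
    for (int64_t i = i0; i < i1; ++i) {
        ((float *) dst->data)[i] = ((const float *) a->data)[i]*factor;
    }
}

// graph construction: run the op on as many threads as the plan provides
float factor = 2.0f;
struct ggml_tensor * y = ggml_map_custom1(ctx, x, scale_op, GGML_N_TASKS_MAX, &factor);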
@@ -9283,8 +9468,8 @@ static void ggml_compute_forward_sum_rows_f32(
     for (int64_t i3 = 0; i3 < ne03; i3++) {
         for (int64_t i2 = 0; i2 < ne02; i2++) {
             for (int64_t i1 = 0; i1 < ne01; i1++) {
-                float* src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
-                float* dst_row = (float *) ((char *) dst->data  + i1*nb1  + i2*nb2  + i3*nb3);
+                float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
+                float * dst_row = (float *) ((char *) dst->data  + i1*nb1  + i2*nb2  + i3*nb3);
                 float row_sum = 0;
                 ggml_vec_sum_f32(ne00, &row_sum, src_row);
                 dst_row[0] = row_sum;
@@ -10546,71 +10731,95 @@ static void ggml_compute_forward_mul_mat(
         return;
     }
 
-    // parallelize by src0 rows
-    const int64_t dr = (ne01 + nth - 1)/nth;
+    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+
+    const int64_t nr0 = ne01;           // src0 rows
+    const int64_t nr1 = ne11*ne12*ne13; // src1 rows
 
-    const int64_t ir10 = dr*ith;
-    const int64_t ir11 = MIN(ir10 + dr, ne01);
+    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
 
-    // src1 rows
-    const int64_t nr1 = ne11*ne12*ne13;
+    // distribute the thread work across the inner or outer loop based on which one is larger
 
-    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+    const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+    const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
 
-    for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
-        const int64_t i13 = (ir1/(ne12*ne11));
-        const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
-        const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
-
-        const int64_t ir0 = (ir1/ne11)%(ne02*ne03);
-        const int64_t i03 = (ir0/(ne02));
-        // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2.
-        // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470:
-        // GG: this is likely the correct way to broadcast, though need some more thought
-        //     therefore leaving the comments to remind us for now
-        const int64_t i02 = (i12 / (ne12 / ne02));
-        // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon)
-        // const int64_t i02 = (ir0 - i03*ne02);
-
-        const int64_t i1 = i11;
-        const int64_t i2 = i12;
-        const int64_t i3 = i13;
-
-        const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
-
-        // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-        //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-        //       the original src1 data pointer, so we should index using the indices directly
-        // TODO: this is a bit of a hack, we should probably have a better way to handle this
-        const char * src1_col = (const char *) wdata +
-            (src1_cont || src1->type != vec_dot_type
-             ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
-             : (i11*nb11 + i12*nb12 + i13*nb13));
-
-        float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
-
-        for (int64_t ir = ir10; ir < ir11; ++ir) {
-            vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
-        }
+    const int64_t ith0 = ith % nth0;
+    const int64_t ith1 = ith / nth0;
+
+    const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+    const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
+
+    const int64_t ir010 = dr0*ith0;
+    const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+    const int64_t ir110 = dr1*ith1;
+    const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+    //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+
+    // threads with no work simply yield (not sure if it helps)
+    if (ir010 >= ir011 || ir110 >= ir111) {
+        sched_yield();
+        return;
     }
 
-    //int64_t t1 = ggml_time_us();
-    //static int64_t acc = 0;
-    //acc += t1 - t0;
-    //if (t1 - t0 > 10) {
-    //    printf("\n");
-    //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
-    //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
-    //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);
 
-    //    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
-    //}
-}
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
 
+    // block-tiling attempt
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
 
-// ggml_compute_forward_out_prod
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    float tmp[16];
 
+    for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+        for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+                const int64_t i13 = (ir1/(ne12*ne11));
+                const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
+                const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
+
+                // broadcast src0 into src1
+                const int64_t i03 = i13/r3;
+                const int64_t i02 = i12/r2;
+
+                const int64_t i1 = i11;
+                const int64_t i2 = i12;
+                const int64_t i3 = i13;
+
+                const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                //       the original src1 data pointer, so we should index using the indices directly
+                // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                const char * src1_col = (const char *) wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                     ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
+                     : (i11*nb11 + i12*nb12 + i13*nb13));
+
+                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                //}
+
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+                }
+                memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+            }
+        }
+    }
+}
+
+// ggml_compute_forward_out_prod
 
 static void ggml_compute_forward_out_prod_f32(
     const struct ggml_compute_params * params,
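For orientation, the rewritten mul_mat no longer always splits work by src0 rows; it lays the nth threads out on an nth0 x nth1 grid and gives the whole thread pool to whichever dimension (src0 rows vs. src1 rows) is larger. A self-contained sketch of just the partitioning arithmetic, with names local to this example:

#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

static void split_2d(int ith, int nth, int64_t nr0, int64_t nr1,
                     int64_t * ir0_start, int64_t * ir0_end,
                     int64_t * ir1_start, int64_t * ir1_end) {
    const int64_t nth0 = nr0 > nr1 ? nth : 1;  // all threads go to the larger dimension
    const int64_t nth1 = nr0 > nr1 ? 1 : nth;

    const int64_t ith0 = ith % nth0;           // this thread's grid coordinates
    const int64_t ith1 = ith / nth0;

    const int64_t dr0 = (nr0 + nth0 - 1)/nth0; // rows per thread, rounded up
    const int64_t dr1 = (nr1 + nth1 - 1)/nth1;

    *ir0_start = dr0*ith0;
    *ir0_end   = MIN(*ir0_start + dr0, nr0);
    *ir1_start = dr1*ith1;
    *ir1_end   = MIN(*ir1_start + dr1, nr1);
}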
@@ -12894,7 +13103,7 @@ static void ggml_compute_forward_pool_1d(
     const struct ggml_tensor * src0,
     struct ggml_tensor * dst) {
 
-    const int32_t* opts = (const int32_t*)dst->op_params;
+    const int32_t * opts = (const int32_t *)dst->op_params;
     enum ggml_op_pool op = opts[0];
     const int k0 = opts[1];
     const int s0 = opts[2];
@@ -14227,24 +14436,6 @@ static void ggml_compute_forward_map_custom1_f32(
     fun(dst, a);
 }
 
-
-static void ggml_compute_forward_map_custom1(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        struct ggml_tensor * dst,
-        const ggml_custom1_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_map_custom2
 
 static void ggml_compute_forward_map_custom2_f32(
@@ -14263,24 +14454,6 @@ static void ggml_compute_forward_map_custom2_f32(
 }
 
 
-static void ggml_compute_forward_map_custom2(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        const struct ggml_tensor * b,
-        struct ggml_tensor * dst,
-        const ggml_custom2_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_map_custom3
 
 static void ggml_compute_forward_map_custom3_f32(
@@ -14299,24 +14472,52 @@ static void ggml_compute_forward_map_custom3_f32(
     fun(dst, a, b, c);
 }
 
+// ggml_compute_forward_map_custom1
+
+static void ggml_compute_forward_map_custom1(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params;
+
+    p->fun(dst, a, params->ith, params->nth, p->userdata);
+}
+
+// ggml_compute_forward_map_custom2
+
+static void ggml_compute_forward_map_custom2(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params;
+
+    p->fun(dst, a, b, params->ith, params->nth, p->userdata);
+}
+
+// ggml_compute_forward_map_custom3
 
 static void ggml_compute_forward_map_custom3(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * a,
         const struct ggml_tensor * b,
         const struct ggml_tensor * c,
-        struct ggml_tensor * dst,
-        const ggml_custom3_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
+        struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
     }
+
+    struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params;
+
+    p->fun(dst, a, b, c, params->ith, params->nth, p->userdata);
 }
 
 // ggml_compute_forward_cross_entropy_loss
@@ -14838,25 +15039,40 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
             }
             break;
-        case GGML_OP_MAP_CUSTOM1:
+        case GGML_OP_MAP_CUSTOM1_F32:
             {
                 ggml_custom1_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun);
+                ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
             }
             break;
-        case GGML_OP_MAP_CUSTOM2:
+        case GGML_OP_MAP_CUSTOM2_F32:
            {
                 ggml_custom2_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun);
+                ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
             }
             break;
-        case GGML_OP_MAP_CUSTOM3:
+        case GGML_OP_MAP_CUSTOM3_F32:
            {
                 ggml_custom3_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
+                ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM1:
+            {
+                ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM2:
+            {
+                ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM3:
+            {
+                ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
             }
             break;
         case GGML_OP_CROSS_ENTROPY_LOSS:
@@ -15664,6 +15880,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
+        case GGML_OP_MAP_CUSTOM1_F32:
+        case GGML_OP_MAP_CUSTOM2_F32:
+        case GGML_OP_MAP_CUSTOM3_F32:
         case GGML_OP_MAP_CUSTOM1:
        case GGML_OP_MAP_CUSTOM2:
         case GGML_OP_MAP_CUSTOM3:
@@ -16449,11 +16668,38 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_WIN_UNPART:
             case GGML_OP_MAP_UNARY:
             case GGML_OP_MAP_BINARY:
+            case GGML_OP_MAP_CUSTOM1_F32:
+            case GGML_OP_MAP_CUSTOM2_F32:
+            case GGML_OP_MAP_CUSTOM3_F32:
+                {
+                    n_tasks = 1;
+                } break;
             case GGML_OP_MAP_CUSTOM1:
+                {
+                    struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
+                } break;
             case GGML_OP_MAP_CUSTOM2:
+                {
+                    struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
+                } break;
             case GGML_OP_MAP_CUSTOM3:
                 {
-                    n_tasks = 1;
+                    struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
                 } break;
             case GGML_OP_CROSS_ENTROPY_LOSS:
                 {
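Scheduling note: ggml_graph_plan() now honors the per-op thread request stored in the op params. GGML_N_TASKS_MAX asks for every thread in the plan, while a positive n_tasks acts as a cap via MIN(p->n_tasks, n_threads). Continuing the earlier scale_op sketch (illustrative names, not library code):

// cap this op at 4 worker threads, even if the plan has more
struct ggml_tensor * y4 = ggml_map_custom1(ctx, x, scale_op, /*n_tasks =*/ 4, &factor);
// or take everything the scheduler offers
struct ggml_tensor * ym = ggml_map_custom1(ctx, x, scale_op, GGML_N_TASKS_MAX, &factor);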