llama_cpp 0.3.4 → 0.3.6

@@ -3440,7 +3440,9 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
3440
3440
 
3441
3441
  //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
3442
3442
  inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
3443
- #if defined(GGML_SIMD)
3443
+ #if defined(GGML_USE_ACCELERATE)
3444
+ vDSP_vsmul(y, 1, &v, y, 1, n);
3445
+ #elif defined(GGML_SIMD)
3444
3446
  const int np = (n & ~(GGML_F32_STEP - 1));
3445
3447
 
3446
3448
  GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
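Note: the new GGML_USE_ACCELERATE branch above delegates in-place scaling to vDSP_vsmul; it computes the same thing as the commented-out scalar loop. A minimal reference sketch of that semantics (illustrative, not part of the diff):

    // Reference for vDSP_vsmul(y, 1, &v, y, 1, n): multiply n floats in place,
    // unit stride, output aliased onto the input.
    static void vec_scale_f32_ref(const int n, float * y, const float v) {
        for (int i = 0; i < n; ++i) {
            y[i] *= v;
        }
    }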
@@ -3603,7 +3605,7 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
3603
3605
  #endif
3604
3606
  }
3605
3607
 
3606
- inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x) {
3608
+ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
3607
3609
  ggml_float sum = 0.0;
3608
3610
  for (int i = 0; i < n; ++i) {
3609
3611
  sum += (ggml_float)x[i];
@@ -3611,6 +3613,14 @@ inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x
3611
3613
  *s = sum;
3612
3614
  }
3613
3615
 
3616
+ inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
3617
+ float sum = 0.0f;
3618
+ for (int i = 0; i < n; ++i) {
3619
+ sum += GGML_FP16_TO_FP32(x[i]);
3620
+ }
3621
+ *s = sum;
3622
+ }
3623
+
3614
3624
  inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
3615
3625
  #ifndef GGML_USE_ACCELERATE
3616
3626
  float max = -INFINITY;
@@ -3750,16 +3760,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3750
3760
  "ARGMAX",
3751
3761
  "REPEAT",
3752
3762
  "REPEAT_BACK",
3753
- "ABS",
3754
- "SGN",
3755
- "NEG",
3756
- "STEP",
3757
- "TANH",
3758
- "ELU",
3759
- "RELU",
3760
- "GELU",
3761
- "GELU_QUICK",
3762
- "SILU",
3763
3763
  "SILU_BACK",
3764
3764
  "NORM",
3765
3765
  "RMS_NORM",
@@ -3798,6 +3798,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3798
3798
  "WIN_PART",
3799
3799
  "WIN_UNPART",
3800
3800
 
3801
+ "UNARY",
3802
+
3801
3803
  "MAP_UNARY",
3802
3804
  "MAP_BINARY",
3803
3805
 
@@ -3809,7 +3811,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3809
3811
  "CROSS_ENTROPY_LOSS_BACK",
3810
3812
  };
3811
3813
 
3812
- static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
3814
+ static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
3813
3815
 
3814
3816
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3815
3817
  "none",
@@ -3830,16 +3832,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3830
3832
  "argmax(x)",
3831
3833
  "repeat(x)",
3832
3834
  "repeat_back(x)",
3833
- "abs(x)",
3834
- "sgn(x)",
3835
- "-x",
3836
- "step(x)",
3837
- "tanh(x)",
3838
- "elu(x)",
3839
- "relu(x)",
3840
- "gelu(x)",
3841
- "gelu_quick(x)",
3842
- "silu(x)",
3843
3835
  "silu_back(x)",
3844
3836
  "norm(x)",
3845
3837
  "rms_norm(x)",
@@ -3878,6 +3870,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3878
3870
  "win_part(x)",
3879
3871
  "win_unpart(x)",
3880
3872
 
3873
+ "unary(x)",
3874
+
3881
3875
  "f(x)",
3882
3876
  "f(x,y)",
3883
3877
 
@@ -3889,7 +3883,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3889
3883
  "cross_entropy_loss_back(x,y)",
3890
3884
  };
3891
3885
 
3892
- static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
3886
+ static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
3893
3887
 
3894
3888
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
3895
3889
 
@@ -4077,8 +4071,8 @@ bool ggml_is_numa(void) {
4077
4071
  ////////////////////////////////////////////////////////////////////////////////
4078
4072
 
4079
4073
  void ggml_print_object(const struct ggml_object * obj) {
4080
- GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n",
4081
- obj->offs, obj->size, (const void *) obj->next);
4074
+ GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
4075
+ obj->type, obj->offs, obj->size, (const void *) obj->next);
4082
4076
  }
4083
4077
 
4084
4078
  void ggml_print_objects(const struct ggml_context * ctx) {
@@ -4145,6 +4139,10 @@ const char * ggml_op_name(enum ggml_op op) {
4145
4139
  return GGML_OP_NAME[op];
4146
4140
  }
4147
4141
 
4142
+ const char * ggml_op_symbol(enum ggml_op op) {
4143
+ return GGML_OP_SYMBOL[op];
4144
+ }
4145
+
4148
4146
  size_t ggml_element_size(const struct ggml_tensor * tensor) {
4149
4147
  return GGML_TYPE_SIZE[tensor->type];
4150
4148
  }
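ggml_op_symbol is a new public accessor alongside ggml_op_name. A small usage sketch (assumes an existing graph node; the helper below is illustrative):

    #include <stdio.h>
    #include "ggml.h"

    // Print e.g. "RMS_NORM: rms_norm(x)" for a node built elsewhere.
    static void print_node_op(const struct ggml_tensor * node) {
        printf("%s: %s\n", ggml_op_name(node->op), ggml_op_symbol(node->op));
    }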
@@ -4214,7 +4212,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
4214
4212
  }
4215
4213
 
4216
4214
  size_t ggml_tensor_overhead(void) {
4217
- return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
4215
+ return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
4218
4216
  }
4219
4217
 
4220
4218
  bool ggml_is_transposed(const struct ggml_tensor * tensor) {
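ggml_tensor_overhead() no longer adds a fixed 16-byte pad, because alignment is now handled inside ggml_new_object (see the allocation hunks below). The usual sizing pattern for a metadata-only context still applies; a sketch with an illustrative tensor count:

    #include "ggml.h"

    // Illustrative: room for up to 1024 tensor headers, no data allocation.
    struct ggml_context * make_meta_ctx(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 1024*ggml_tensor_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        return ggml_init(params);
    }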
@@ -4231,6 +4229,15 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
4231
4229
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
4232
4230
  }
4233
4231
 
4232
+ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) {
4233
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4234
+
4235
+ return
4236
+ tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
4237
+ tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
4238
+ tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
4239
+ }
4240
+
4234
4241
  bool ggml_is_permuted(const struct ggml_tensor * tensor) {
4235
4242
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4236
4243
 
@@ -4376,7 +4383,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4376
4383
  return NULL;
4377
4384
  }
4378
4385
 
4379
- const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1);
4386
+ const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
4380
4387
 
4381
4388
  *ctx = (struct ggml_context) {
4382
4389
  /*.mem_size =*/ mem_size,
@@ -4443,6 +4450,10 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
4443
4450
  return result;
4444
4451
  }
4445
4452
 
4453
+ bool ggml_get_no_alloc(struct ggml_context * ctx) {
4454
+ return ctx->no_alloc;
4455
+ }
4456
+
4446
4457
  void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
4447
4458
  ctx->no_alloc = no_alloc;
4448
4459
  }
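ggml_get_no_alloc complements ggml_set_no_alloc, so callers can save and restore the flag around a block that must allocate real data. A minimal sketch (the wrapper name is made up):

    #include <stdbool.h>
    #include <stdint.h>
    #include "ggml.h"

    static struct ggml_tensor * new_f32_allocated(struct ggml_context * ctx, int64_t n) {
        const bool prev = ggml_get_no_alloc(ctx);
        ggml_set_no_alloc(ctx, false);              // force data allocation for this tensor
        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n);
        ggml_set_no_alloc(ctx, prev);               // restore the caller's setting
        return t;
    }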
@@ -4461,12 +4472,14 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
4461
4472
  struct ggml_object * obj = ctx->objects_begin;
4462
4473
 
4463
4474
  while (obj != NULL) {
4464
- struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
4475
+ if (obj->type == GGML_OBJECT_TENSOR) {
4476
+ struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
4465
4477
 
4466
- const size_t size = ggml_nbytes(tensor);
4478
+ const size_t size = ggml_nbytes(tensor);
4467
4479
 
4468
- if (max_size < size) {
4469
- max_size = size;
4480
+ if (max_size < size) {
4481
+ max_size = size;
4482
+ }
4470
4483
  }
4471
4484
 
4472
4485
  obj = obj->next;
@@ -4480,7 +4493,7 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
4480
4493
  // this is an error prone process, but it is necessary to support inplace
4481
4494
  // operators when using scratch buffers
4482
4495
  // TODO: implement a better way
4483
- void ggml_scratch_save(struct ggml_context * ctx) {
4496
+ static void ggml_scratch_save(struct ggml_context * ctx) {
4484
4497
  // this is needed to allow opt tensors to store their data
4485
4498
  // TODO: again, need to find a better way
4486
4499
  ctx->no_alloc_save = ctx->no_alloc;
@@ -4490,7 +4503,7 @@ void ggml_scratch_save(struct ggml_context * ctx) {
4490
4503
  ctx->scratch.data = NULL;
4491
4504
  }
4492
4505
 
4493
- void ggml_scratch_load(struct ggml_context * ctx) {
4506
+ static void ggml_scratch_load(struct ggml_context * ctx) {
4494
4507
  ctx->no_alloc = ctx->no_alloc_save;
4495
4508
 
4496
4509
  ctx->scratch = ctx->scratch_save;
@@ -4498,12 +4511,7 @@ void ggml_scratch_load(struct ggml_context * ctx) {
4498
4511
 
4499
4512
  ////////////////////////////////////////////////////////////////////////////////
4500
4513
 
4501
- struct ggml_tensor * ggml_new_tensor_impl(
4502
- struct ggml_context * ctx,
4503
- enum ggml_type type,
4504
- int n_dims,
4505
- const int64_t* ne,
4506
- void* data) {
4514
+ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
4507
4515
  // always insert objects at the end of the context's memory pool
4508
4516
  struct ggml_object * obj_cur = ctx->objects_end;
4509
4517
 
@@ -4511,77 +4519,81 @@ struct ggml_tensor * ggml_new_tensor_impl(
4511
4519
  const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
4512
4520
  const size_t cur_end = cur_offs + cur_size;
4513
4521
 
4514
- size_t size_needed = 0;
4515
-
4516
- if (data == NULL && !ctx->no_alloc) {
4517
- size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
4518
- for (int i = 1; i < n_dims; i++) {
4519
- size_needed *= ne[i];
4520
- }
4521
- // align to GGML_MEM_ALIGN
4522
- size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN;
4523
- }
4522
+ // align to GGML_MEM_ALIGN
4523
+ size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
4524
4524
 
4525
4525
  char * const mem_buffer = ctx->mem_buffer;
4526
4526
  struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
4527
4527
 
4528
- if (ctx->scratch.data == NULL || data != NULL) {
4529
- size_needed += GGML_TENSOR_SIZE;
4528
+ if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
4529
+ GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4530
+ __func__, cur_end + size_needed, ctx->mem_size);
4531
+ assert(false);
4532
+ return NULL;
4533
+ }
4534
+
4535
+ *obj_new = (struct ggml_object) {
4536
+ .offs = cur_end + GGML_OBJECT_SIZE,
4537
+ .size = size_needed,
4538
+ .next = NULL,
4539
+ .type = type,
4540
+ };
4530
4541
 
4531
- if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
4532
- GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4533
- __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
4534
- assert(false);
4535
- return NULL;
4536
- }
4542
+ ggml_assert_aligned(mem_buffer + obj_new->offs);
4537
4543
 
4538
- *obj_new = (struct ggml_object) {
4539
- .offs = cur_end + GGML_OBJECT_SIZE,
4540
- .size = size_needed,
4541
- .next = NULL,
4542
- };
4544
+ if (obj_cur != NULL) {
4545
+ obj_cur->next = obj_new;
4543
4546
  } else {
4544
- if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
4545
- GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4546
- __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
4547
- assert(false);
4548
- return NULL;
4547
+ // this is the first object in this context
4548
+ ctx->objects_begin = obj_new;
4549
+ }
4550
+
4551
+ ctx->objects_end = obj_new;
4552
+
4553
+ //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
4554
+
4555
+ return obj_new;
4556
+ }
4557
+
4558
+ static struct ggml_tensor * ggml_new_tensor_impl(
4559
+ struct ggml_context * ctx,
4560
+ enum ggml_type type,
4561
+ int n_dims,
4562
+ const int64_t * ne,
4563
+ void * data) {
4564
+
4565
+ assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
4566
+
4567
+ size_t data_size = 0;
4568
+
4569
+ if (data == NULL && !ctx->no_alloc) {
4570
+ data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
4571
+ for (int i = 1; i < n_dims; i++) {
4572
+ data_size *= ne[i];
4549
4573
  }
4574
+ }
4550
4575
 
4551
- if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
4552
- GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4553
- __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
4576
+ if (ctx->scratch.data != NULL && data == NULL) {
4577
+ // allocate tensor data in the scratch buffer
4578
+ if (ctx->scratch.offs + data_size > ctx->scratch.size) {
4579
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4580
+ __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
4554
4581
  assert(false);
4555
4582
  return NULL;
4556
4583
  }
4557
4584
 
4558
4585
  data = (char * const) ctx->scratch.data + ctx->scratch.offs;
4559
4586
 
4560
- *obj_new = (struct ggml_object) {
4561
- .offs = cur_end + GGML_OBJECT_SIZE,
4562
- .size = GGML_TENSOR_SIZE,
4563
- .next = NULL,
4564
- };
4565
-
4566
- //printf("scratch offs = %zu, size_needed = %zu\n", ctx->scratch.offs, size_needed);
4587
+ ctx->scratch.offs += data_size;
4567
4588
 
4568
- ctx->scratch.offs += size_needed;
4589
+ data_size = 0;
4569
4590
  }
4570
4591
 
4571
- if (obj_cur != NULL) {
4572
- obj_cur->next = obj_new;
4573
- } else {
4574
- // this is the first object in this context
4575
- ctx->objects_begin = obj_new;
4576
- }
4577
-
4578
- ctx->objects_end = obj_new;
4592
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);
4579
4593
 
4580
- //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
4594
+ // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
4581
4595
 
4582
- struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs);
4583
-
4584
- ggml_assert_aligned(result);
4596
+ struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
4585
4597
 
4586
4598
  *result = (struct ggml_tensor) {
4587
4599
  /*.type =*/ type,
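The allocation path is now split: ggml_new_object places a typed object (padded to GGML_MEM_ALIGN) at the end of the pool, and ggml_new_tensor_impl requests a single object of GGML_TENSOR_SIZE + data_size, with data_size dropped to 0 when the data lives in a scratch buffer or is supplied by the caller. A rough sketch of the placement arithmetic, using illustrative constants rather than the real GGML_OBJECT_SIZE/GGML_MEM_ALIGN:

    #include <stddef.h>

    #define EX_ALIGN      16                              // stand-in for GGML_MEM_ALIGN
    #define EX_PAD(x, a)  (((x) + (a) - 1) & ~((a) - 1))  // same idea as GGML_PAD

    struct ex_object { size_t offs, size; };

    // Place a new object after the current end of the pool. `payload` is
    // GGML_TENSOR_SIZE + data_size for tensors; the payload is padded, and
    // offs points just past the object header, as in ggml_new_object above.
    static struct ex_object place_object(size_t cur_end, size_t obj_hdr_size, size_t payload) {
        struct ex_object o = { cur_end + obj_hdr_size, EX_PAD(payload, EX_ALIGN) };
        return o;
    }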
@@ -4590,6 +4602,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
4590
4602
  /*.ne =*/ { 1, 1, 1, 1 },
4591
4603
  /*.nb =*/ { 0, 0, 0, 0 },
4592
4604
  /*.op =*/ GGML_OP_NONE,
4605
+ /*.op_params =*/ {0},
4593
4606
  /*.is_param =*/ false,
4594
4607
  /*.grad =*/ NULL,
4595
4608
  /*.src =*/ { NULL },
@@ -4620,24 +4633,39 @@ struct ggml_tensor * ggml_new_tensor_impl(
4620
4633
  return result;
4621
4634
  }
4622
4635
 
4636
+ static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
4637
+ assert(params_size <= GGML_MAX_OP_PARAMS);
4638
+ memcpy(tensor->op_params, params, params_size);
4639
+ }
4640
+
4641
+ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
4642
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4643
+ return ((const int32_t *)(tensor->op_params))[i];
4644
+ }
4645
+
4646
+ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
4647
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4648
+ ((int32_t *)(tensor->op_params))[i] = value;
4649
+ }
4650
+
4623
4651
  struct ggml_tensor * ggml_new_tensor(
4624
4652
  struct ggml_context * ctx,
4625
- enum ggml_type type,
4626
- int n_dims,
4627
- const int64_t * ne) {
4653
+ enum ggml_type type,
4654
+ int n_dims,
4655
+ const int64_t * ne) {
4628
4656
  return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
4629
4657
  }
4630
4658
 
4631
4659
  struct ggml_tensor * ggml_new_tensor_1d(
4632
4660
  struct ggml_context * ctx,
4633
- enum ggml_type type,
4661
+ enum ggml_type type,
4634
4662
  int64_t ne0) {
4635
4663
  return ggml_new_tensor(ctx, type, 1, &ne0);
4636
4664
  }
4637
4665
 
4638
4666
  struct ggml_tensor * ggml_new_tensor_2d(
4639
4667
  struct ggml_context * ctx,
4640
- enum ggml_type type,
4668
+ enum ggml_type type,
4641
4669
  int64_t ne0,
4642
4670
  int64_t ne1) {
4643
4671
  const int64_t ne[2] = { ne0, ne1 };
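ggml_set_op_params and the _i32 accessors are internal (static) helpers: operator arguments are now written into the tensor's new op_params field instead of being packed into auxiliary I32/F32 tensors via ggml_scratch_save/ggml_scratch_load. A hedged sketch of the producer/consumer pattern, modeled on the ggml_acc/ggml_set hunks further down (the reader function is illustrative):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include "ggml.h"

    // Producer side (inside an op constructor such as ggml_acc_impl):
    //     int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
    //     ggml_set_op_params(result, params, sizeof(params));
    //
    // Consumer side (inside the matching compute function):
    static void read_acc_params(const struct ggml_tensor * dst,
                                size_t * nb1, size_t * nb2, size_t * nb3,
                                size_t * offset, bool * inplace) {
        const int32_t * p = (const int32_t *) dst->op_params;
        *nb1     = (size_t) p[0];
        *nb2     = (size_t) p[1];
        *nb3     = (size_t) p[2];
        *offset  = (size_t) p[3];
        *inplace = p[4] != 0;
    }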
@@ -4646,7 +4674,7 @@ struct ggml_tensor * ggml_new_tensor_2d(
4646
4674
 
4647
4675
  struct ggml_tensor * ggml_new_tensor_3d(
4648
4676
  struct ggml_context * ctx,
4649
- enum ggml_type type,
4677
+ enum ggml_type type,
4650
4678
  int64_t ne0,
4651
4679
  int64_t ne1,
4652
4680
  int64_t ne2) {
@@ -4951,6 +4979,11 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
4951
4979
  return (float *)(tensor->data);
4952
4980
  }
4953
4981
 
4982
+ enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
4983
+ GGML_ASSERT(tensor->op == GGML_OP_UNARY);
4984
+ return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
4985
+ }
4986
+
4954
4987
  const char * ggml_get_name(const struct ggml_tensor * tensor) {
4955
4988
  return tensor->name;
4956
4989
  }
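With the ten activation ops folded into a single GGML_OP_UNARY (68 - 10 + 1 = 59, matching the updated static_asserts), ggml_get_unary_op is how a backend recovers which activation a node actually is. A minimal dispatch sketch for a node already known to be GGML_OP_UNARY:

    #include "ggml.h"

    // Illustrative only; real backends switch into their own kernels here.
    static const char * unary_kind(const struct ggml_tensor * node) {
        switch (ggml_get_unary_op(node)) {
            case GGML_UNARY_OP_RELU:       return "relu";
            case GGML_UNARY_OP_GELU:       return "gelu";
            case GGML_UNARY_OP_GELU_QUICK: return "gelu_quick";
            case GGML_UNARY_OP_SILU:       return "silu";
            case GGML_UNARY_OP_TANH:       return "tanh";
            default:                       return "other";
        }
    }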
@@ -4989,9 +5022,11 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
4989
5022
  char * const mem_buffer = ctx->mem_buffer;
4990
5023
 
4991
5024
  while (obj != NULL) {
4992
- struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
4993
- if (strcmp(cur->name, name) == 0) {
4994
- return cur;
5025
+ if (obj->type == GGML_OBJECT_TENSOR) {
5026
+ struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
5027
+ if (strcmp(cur->name, name) == 0) {
5028
+ return cur;
5029
+ }
4995
5030
  }
4996
5031
 
4997
5032
  obj = obj->next;
@@ -5004,7 +5039,7 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
5004
5039
 
5005
5040
  // ggml_dup
5006
5041
 
5007
- struct ggml_tensor * ggml_dup_impl(
5042
+ static struct ggml_tensor * ggml_dup_impl(
5008
5043
  struct ggml_context * ctx,
5009
5044
  struct ggml_tensor * a,
5010
5045
  bool inplace) {
@@ -5019,7 +5054,6 @@ struct ggml_tensor * ggml_dup_impl(
5019
5054
  result->op = GGML_OP_DUP;
5020
5055
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5021
5056
  result->src[0] = a;
5022
- result->src[1] = NULL;
5023
5057
 
5024
5058
  return result;
5025
5059
  }
@@ -5038,7 +5072,7 @@ struct ggml_tensor * ggml_dup_inplace(
5038
5072
 
5039
5073
  // ggml_add
5040
5074
 
5041
- struct ggml_tensor * ggml_add_impl(
5075
+ static struct ggml_tensor * ggml_add_impl(
5042
5076
  struct ggml_context * ctx,
5043
5077
  struct ggml_tensor * a,
5044
5078
  struct ggml_tensor * b,
@@ -5081,7 +5115,7 @@ struct ggml_tensor * ggml_add_inplace(
5081
5115
 
5082
5116
  // ggml_add1
5083
5117
 
5084
- struct ggml_tensor * ggml_add1_impl(
5118
+ static struct ggml_tensor * ggml_add1_impl(
5085
5119
  struct ggml_context * ctx,
5086
5120
  struct ggml_tensor * a,
5087
5121
  struct ggml_tensor * b,
@@ -5121,7 +5155,7 @@ struct ggml_tensor * ggml_add1_inplace(
5121
5155
 
5122
5156
  // ggml_acc
5123
5157
 
5124
- struct ggml_tensor * ggml_acc_impl(
5158
+ static struct ggml_tensor * ggml_acc_impl(
5125
5159
  struct ggml_context * ctx,
5126
5160
  struct ggml_tensor * a,
5127
5161
  struct ggml_tensor * b,
@@ -5143,23 +5177,13 @@ struct ggml_tensor * ggml_acc_impl(
5143
5177
 
5144
5178
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5145
5179
 
5146
- ggml_scratch_save(ctx);
5147
-
5148
- struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5);
5149
-
5150
- ((int32_t *) c->data)[0] = nb1;
5151
- ((int32_t *) c->data)[1] = nb2;
5152
- ((int32_t *) c->data)[2] = nb3;
5153
- ((int32_t *) c->data)[3] = offset;
5154
- ((int32_t *) c->data)[4] = inplace ? 1 : 0;
5155
-
5156
- ggml_scratch_load(ctx);
5180
+ int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
5181
+ ggml_set_op_params(result, params, sizeof(params));
5157
5182
 
5158
5183
  result->op = GGML_OP_ACC;
5159
5184
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5160
5185
  result->src[0] = a;
5161
5186
  result->src[1] = b;
5162
- result->src[2] = c;
5163
5187
 
5164
5188
  return result;
5165
5189
  }
@@ -5188,7 +5212,7 @@ struct ggml_tensor * ggml_acc_inplace(
5188
5212
 
5189
5213
  // ggml_sub
5190
5214
 
5191
- struct ggml_tensor * ggml_sub_impl(
5215
+ static struct ggml_tensor * ggml_sub_impl(
5192
5216
  struct ggml_context * ctx,
5193
5217
  struct ggml_tensor * a,
5194
5218
  struct ggml_tensor * b,
@@ -5227,7 +5251,7 @@ struct ggml_tensor * ggml_sub_inplace(
5227
5251
 
5228
5252
  // ggml_mul
5229
5253
 
5230
- struct ggml_tensor * ggml_mul_impl(
5254
+ static struct ggml_tensor * ggml_mul_impl(
5231
5255
  struct ggml_context * ctx,
5232
5256
  struct ggml_tensor * a,
5233
5257
  struct ggml_tensor * b,
@@ -5274,7 +5298,7 @@ struct ggml_tensor * ggml_mul_inplace(
5274
5298
 
5275
5299
  // ggml_div
5276
5300
 
5277
- struct ggml_tensor * ggml_div_impl(
5301
+ static struct ggml_tensor * ggml_div_impl(
5278
5302
  struct ggml_context * ctx,
5279
5303
  struct ggml_tensor * a,
5280
5304
  struct ggml_tensor * b,
@@ -5317,7 +5341,7 @@ struct ggml_tensor * ggml_div_inplace(
5317
5341
 
5318
5342
  // ggml_sqr
5319
5343
 
5320
- struct ggml_tensor * ggml_sqr_impl(
5344
+ static struct ggml_tensor * ggml_sqr_impl(
5321
5345
  struct ggml_context * ctx,
5322
5346
  struct ggml_tensor * a,
5323
5347
  bool inplace) {
@@ -5332,7 +5356,6 @@ struct ggml_tensor * ggml_sqr_impl(
5332
5356
  result->op = GGML_OP_SQR;
5333
5357
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5334
5358
  result->src[0] = a;
5335
- result->src[1] = NULL;
5336
5359
 
5337
5360
  return result;
5338
5361
  }
@@ -5351,7 +5374,7 @@ struct ggml_tensor * ggml_sqr_inplace(
5351
5374
 
5352
5375
  // ggml_sqrt
5353
5376
 
5354
- struct ggml_tensor * ggml_sqrt_impl(
5377
+ static struct ggml_tensor * ggml_sqrt_impl(
5355
5378
  struct ggml_context * ctx,
5356
5379
  struct ggml_tensor * a,
5357
5380
  bool inplace) {
@@ -5366,7 +5389,6 @@ struct ggml_tensor * ggml_sqrt_impl(
5366
5389
  result->op = GGML_OP_SQRT;
5367
5390
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5368
5391
  result->src[0] = a;
5369
- result->src[1] = NULL;
5370
5392
 
5371
5393
  return result;
5372
5394
  }
@@ -5386,7 +5408,7 @@ struct ggml_tensor * ggml_sqrt_inplace(
5386
5408
 
5387
5409
  // ggml_log
5388
5410
 
5389
- struct ggml_tensor * ggml_log_impl(
5411
+ static struct ggml_tensor * ggml_log_impl(
5390
5412
  struct ggml_context * ctx,
5391
5413
  struct ggml_tensor * a,
5392
5414
  bool inplace) {
@@ -5401,7 +5423,6 @@ struct ggml_tensor * ggml_log_impl(
5401
5423
  result->op = GGML_OP_LOG;
5402
5424
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5403
5425
  result->src[0] = a;
5404
- result->src[1] = NULL;
5405
5426
 
5406
5427
  return result;
5407
5428
  }
@@ -5434,7 +5455,6 @@ struct ggml_tensor * ggml_sum(
5434
5455
  result->op = GGML_OP_SUM;
5435
5456
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5436
5457
  result->src[0] = a;
5437
- result->src[1] = NULL;
5438
5458
 
5439
5459
  return result;
5440
5460
  }
@@ -5461,7 +5481,6 @@ struct ggml_tensor * ggml_sum_rows(
5461
5481
  result->op = GGML_OP_SUM_ROWS;
5462
5482
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5463
5483
  result->src[0] = a;
5464
- result->src[1] = NULL;
5465
5484
 
5466
5485
  return result;
5467
5486
  }
@@ -5484,7 +5503,6 @@ struct ggml_tensor * ggml_mean(
5484
5503
  result->op = GGML_OP_MEAN;
5485
5504
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5486
5505
  result->src[0] = a;
5487
- result->src[1] = NULL;
5488
5506
 
5489
5507
  return result;
5490
5508
  }
@@ -5508,7 +5526,6 @@ struct ggml_tensor * ggml_argmax(
5508
5526
  result->op = GGML_OP_ARGMAX;
5509
5527
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5510
5528
  result->src[0] = a;
5511
- result->src[1] = NULL;
5512
5529
 
5513
5530
  return result;
5514
5531
  }
@@ -5571,343 +5588,142 @@ struct ggml_tensor * ggml_repeat_back(
5571
5588
 
5572
5589
  // ggml_abs
5573
5590
 
5574
- struct ggml_tensor * ggml_abs_impl(
5575
- struct ggml_context * ctx,
5576
- struct ggml_tensor * a,
5577
- bool inplace) {
5578
- bool is_node = false;
5579
-
5580
- if (!inplace && (a->grad)) {
5581
- is_node = true;
5582
- }
5583
-
5584
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5585
-
5586
- result->op = GGML_OP_ABS;
5587
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5588
- result->src[0] = a;
5589
- result->src[1] = NULL;
5590
-
5591
- return result;
5592
- }
5593
-
5594
5591
  struct ggml_tensor * ggml_abs(
5595
5592
  struct ggml_context * ctx,
5596
5593
  struct ggml_tensor * a) {
5597
- return ggml_abs_impl(ctx, a, false);
5594
+ return ggml_unary(ctx, a, GGML_UNARY_OP_ABS);
5598
5595
  }
5599
5596
 
5600
5597
  struct ggml_tensor * ggml_abs_inplace(
5601
5598
  struct ggml_context * ctx,
5602
5599
  struct ggml_tensor * a) {
5603
- return ggml_abs_impl(ctx, a, true);
5600
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ABS);
5604
5601
  }
5605
5602
 
5606
-
5607
5603
  // ggml_sgn
5608
5604
 
5609
- struct ggml_tensor * ggml_sgn_impl(
5610
- struct ggml_context * ctx,
5611
- struct ggml_tensor * a,
5612
- bool inplace) {
5613
- bool is_node = false;
5614
-
5615
- if (!inplace && (a->grad)) {
5616
- is_node = true;
5617
- }
5618
-
5619
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5620
-
5621
- result->op = GGML_OP_SGN;
5622
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5623
- result->src[0] = a;
5624
- result->src[1] = NULL;
5625
-
5626
- return result;
5627
- }
5628
-
5629
5605
  struct ggml_tensor * ggml_sgn(
5630
5606
  struct ggml_context * ctx,
5631
5607
  struct ggml_tensor * a) {
5632
- return ggml_sgn_impl(ctx, a, false);
5608
+ return ggml_unary(ctx, a, GGML_UNARY_OP_SGN);
5633
5609
  }
5634
5610
 
5635
5611
  struct ggml_tensor * ggml_sgn_inplace(
5636
5612
  struct ggml_context * ctx,
5637
5613
  struct ggml_tensor * a) {
5638
- return ggml_sgn_impl(ctx, a, true);
5614
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SGN);
5639
5615
  }
5640
5616
 
5641
5617
  // ggml_neg
5642
5618
 
5643
- struct ggml_tensor * ggml_neg_impl(
5644
- struct ggml_context * ctx,
5645
- struct ggml_tensor * a,
5646
- bool inplace) {
5647
- bool is_node = false;
5648
-
5649
- if (!inplace && (a->grad)) {
5650
- is_node = true;
5651
- }
5652
-
5653
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5654
-
5655
- result->op = GGML_OP_NEG;
5656
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5657
- result->src[0] = a;
5658
- result->src[1] = NULL;
5659
-
5660
- return result;
5661
- }
5662
-
5663
5619
  struct ggml_tensor * ggml_neg(
5664
5620
  struct ggml_context * ctx,
5665
5621
  struct ggml_tensor * a) {
5666
- return ggml_neg_impl(ctx, a, false);
5622
+ return ggml_unary(ctx, a, GGML_UNARY_OP_NEG);
5667
5623
  }
5668
5624
 
5669
5625
  struct ggml_tensor * ggml_neg_inplace(
5670
5626
  struct ggml_context * ctx,
5671
5627
  struct ggml_tensor * a) {
5672
- return ggml_neg_impl(ctx, a, true);
5628
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_NEG);
5673
5629
  }
5674
5630
 
5675
5631
  // ggml_step
5676
5632
 
5677
- struct ggml_tensor * ggml_step_impl(
5678
- struct ggml_context * ctx,
5679
- struct ggml_tensor * a,
5680
- bool inplace) {
5681
- bool is_node = false;
5682
-
5683
- if (!inplace && (a->grad)) {
5684
- is_node = true;
5685
- }
5686
-
5687
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5688
-
5689
- result->op = GGML_OP_STEP;
5690
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5691
- result->src[0] = a;
5692
- result->src[1] = NULL;
5693
-
5694
- return result;
5695
- }
5696
-
5697
5633
  struct ggml_tensor * ggml_step(
5698
5634
  struct ggml_context * ctx,
5699
5635
  struct ggml_tensor * a) {
5700
- return ggml_step_impl(ctx, a, false);
5636
+ return ggml_unary(ctx, a, GGML_UNARY_OP_STEP);
5701
5637
  }
5702
5638
 
5703
5639
  struct ggml_tensor * ggml_step_inplace(
5704
5640
  struct ggml_context * ctx,
5705
5641
  struct ggml_tensor * a) {
5706
- return ggml_step_impl(ctx, a, true);
5642
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_STEP);
5707
5643
  }
5708
5644
 
5709
5645
  // ggml_tanh
5710
5646
 
5711
- struct ggml_tensor * ggml_tanh_impl(
5712
- struct ggml_context * ctx,
5713
- struct ggml_tensor * a,
5714
- bool inplace) {
5715
- bool is_node = false;
5716
-
5717
- if (!inplace && (a->grad)) {
5718
- is_node = true;
5719
- }
5720
-
5721
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5722
-
5723
- result->op = GGML_OP_TANH;
5724
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5725
- result->src[0] = a;
5726
- result->src[1] = NULL;
5727
-
5728
- return result;
5729
- }
5730
-
5731
5647
  struct ggml_tensor * ggml_tanh(
5732
5648
  struct ggml_context * ctx,
5733
5649
  struct ggml_tensor * a) {
5734
- return ggml_tanh_impl(ctx, a, false);
5650
+ return ggml_unary(ctx, a, GGML_UNARY_OP_TANH);
5735
5651
  }
5736
5652
 
5737
5653
  struct ggml_tensor * ggml_tanh_inplace(
5738
5654
  struct ggml_context * ctx,
5739
5655
  struct ggml_tensor * a) {
5740
- return ggml_tanh_impl(ctx, a, true);
5656
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TANH);
5741
5657
  }
5742
5658
 
5743
5659
  // ggml_elu
5744
5660
 
5745
- struct ggml_tensor * ggml_elu_impl(
5746
- struct ggml_context * ctx,
5747
- struct ggml_tensor * a,
5748
- bool inplace) {
5749
- bool is_node = false;
5750
-
5751
- if (!inplace && (a->grad)) {
5752
- is_node = true;
5753
- }
5754
-
5755
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5756
-
5757
- result->op = GGML_OP_ELU;
5758
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5759
- result->src[0] = a;
5760
- result->src[1] = NULL;
5761
-
5762
- return result;
5763
- }
5764
-
5765
5661
  struct ggml_tensor * ggml_elu(
5766
5662
  struct ggml_context * ctx,
5767
5663
  struct ggml_tensor * a) {
5768
- return ggml_elu_impl(ctx, a, false);
5664
+ return ggml_unary(ctx, a, GGML_UNARY_OP_ELU);
5769
5665
  }
5770
5666
 
5771
5667
  struct ggml_tensor * ggml_elu_inplace(
5772
5668
  struct ggml_context * ctx,
5773
5669
  struct ggml_tensor * a) {
5774
- return ggml_elu_impl(ctx, a, true);
5670
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ELU);
5775
5671
  }
5776
5672
 
5777
5673
  // ggml_relu
5778
5674
 
5779
- struct ggml_tensor * ggml_relu_impl(
5780
- struct ggml_context * ctx,
5781
- struct ggml_tensor * a,
5782
- bool inplace) {
5783
- bool is_node = false;
5784
-
5785
- if (!inplace && (a->grad)) {
5786
- is_node = true;
5787
- }
5788
-
5789
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5790
-
5791
- result->op = GGML_OP_RELU;
5792
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5793
- result->src[0] = a;
5794
- result->src[1] = NULL;
5795
-
5796
- return result;
5797
- }
5798
-
5799
5675
  struct ggml_tensor * ggml_relu(
5800
5676
  struct ggml_context * ctx,
5801
5677
  struct ggml_tensor * a) {
5802
- return ggml_relu_impl(ctx, a, false);
5678
+ return ggml_unary(ctx, a, GGML_UNARY_OP_RELU);
5803
5679
  }
5804
5680
 
5805
5681
  struct ggml_tensor * ggml_relu_inplace(
5806
5682
  struct ggml_context * ctx,
5807
5683
  struct ggml_tensor * a) {
5808
- return ggml_relu_impl(ctx, a, true);
5684
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
5809
5685
  }
5810
5686
 
5811
5687
  // ggml_gelu
5812
5688
 
5813
- struct ggml_tensor * ggml_gelu_impl(
5814
- struct ggml_context * ctx,
5815
- struct ggml_tensor * a,
5816
- bool inplace) {
5817
- bool is_node = false;
5818
-
5819
- if (!inplace && (a->grad)) {
5820
- is_node = true;
5821
- }
5822
-
5823
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5824
-
5825
- result->op = GGML_OP_GELU;
5826
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5827
- result->src[0] = a;
5828
- result->src[1] = NULL;
5829
-
5830
- return result;
5831
- }
5832
-
5833
5689
  struct ggml_tensor * ggml_gelu(
5834
5690
  struct ggml_context * ctx,
5835
5691
  struct ggml_tensor * a) {
5836
- return ggml_gelu_impl(ctx, a, false);
5692
+ return ggml_unary(ctx, a, GGML_UNARY_OP_GELU);
5837
5693
  }
5838
5694
 
5839
5695
  struct ggml_tensor * ggml_gelu_inplace(
5840
5696
  struct ggml_context * ctx,
5841
5697
  struct ggml_tensor * a) {
5842
- return ggml_gelu_impl(ctx, a, true);
5698
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
5843
5699
  }
5844
5700
 
5845
5701
  // ggml_gelu_quick
5846
5702
 
5847
- struct ggml_tensor * ggml_gelu_quick_impl(
5848
- struct ggml_context * ctx,
5849
- struct ggml_tensor * a,
5850
- bool inplace) {
5851
- bool is_node = false;
5852
-
5853
- if (!inplace && (a->grad)) {
5854
- is_node = true;
5855
- }
5856
-
5857
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5858
-
5859
- result->op = GGML_OP_GELU_QUICK;
5860
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5861
- result->src[0] = a;
5862
- result->src[1] = NULL;
5863
-
5864
- return result;
5865
- }
5866
-
5867
5703
  struct ggml_tensor * ggml_gelu_quick(
5868
5704
  struct ggml_context * ctx,
5869
5705
  struct ggml_tensor * a) {
5870
- return ggml_gelu_quick_impl(ctx, a, false);
5706
+ return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_QUICK);
5871
5707
  }
5872
5708
 
5873
5709
  struct ggml_tensor * ggml_gelu_quick_inplace(
5874
5710
  struct ggml_context * ctx,
5875
5711
  struct ggml_tensor * a) {
5876
- return ggml_gelu_quick_impl(ctx, a, true);
5712
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_QUICK);
5877
5713
  }
5878
5714
 
5879
5715
  // ggml_silu
5880
5716
 
5881
- struct ggml_tensor * ggml_silu_impl(
5882
- struct ggml_context * ctx,
5883
- struct ggml_tensor * a,
5884
- bool inplace) {
5885
- bool is_node = false;
5886
-
5887
- if (!inplace && (a->grad)) {
5888
- is_node = true;
5889
- }
5890
-
5891
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5892
-
5893
- result->op = GGML_OP_SILU;
5894
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5895
- result->src[0] = a;
5896
- result->src[1] = NULL;
5897
-
5898
- return result;
5899
- }
5900
-
5901
5717
  struct ggml_tensor * ggml_silu(
5902
5718
  struct ggml_context * ctx,
5903
5719
  struct ggml_tensor * a) {
5904
- return ggml_silu_impl(ctx, a, false);
5720
+ return ggml_unary(ctx, a, GGML_UNARY_OP_SILU);
5905
5721
  }
5906
5722
 
5907
5723
  struct ggml_tensor * ggml_silu_inplace(
5908
5724
  struct ggml_context * ctx,
5909
5725
  struct ggml_tensor * a) {
5910
- return ggml_silu_impl(ctx, a, true);
5726
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SILU);
5911
5727
  }
5912
5728
 
5913
5729
  // ggml_silu_back
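The public wrappers (ggml_abs, ggml_sgn, ..., ggml_silu and their _inplace variants) keep their old signatures, so graph-building code does not change; only the resulting node is tagged GGML_OP_UNARY instead of a dedicated op. A short usage sketch:

    #include <assert.h>
    #include "ggml.h"

    // Building an activation the old way still works; only the node type changed.
    static struct ggml_tensor * apply_relu(struct ggml_context * ctx, struct ggml_tensor * x) {
        struct ggml_tensor * h = ggml_relu(ctx, x);  // same as ggml_unary(ctx, x, GGML_UNARY_OP_RELU)
        assert(h->op == GGML_OP_UNARY);
        assert(ggml_get_unary_op(h) == GGML_UNARY_OP_RELU);
        return h;
    }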
@@ -5935,7 +5751,7 @@ struct ggml_tensor * ggml_silu_back(
5935
5751
 
5936
5752
  // ggml_norm
5937
5753
 
5938
- struct ggml_tensor * ggml_norm_impl(
5754
+ static struct ggml_tensor * ggml_norm_impl(
5939
5755
  struct ggml_context * ctx,
5940
5756
  struct ggml_tensor * a,
5941
5757
  bool inplace) {
@@ -5948,10 +5764,11 @@ struct ggml_tensor * ggml_norm_impl(
5948
5764
 
5949
5765
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5950
5766
 
5767
+ // TODO: maybe store epsilon here?
5768
+
5951
5769
  result->op = GGML_OP_NORM;
5952
5770
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5953
5771
  result->src[0] = a;
5954
- result->src[1] = NULL; // TODO: maybe store epsilon here?
5955
5772
 
5956
5773
  return result;
5957
5774
  }
@@ -5968,9 +5785,10 @@ struct ggml_tensor * ggml_norm_inplace(
5968
5785
  return ggml_norm_impl(ctx, a, true);
5969
5786
  }
5970
5787
 
5971
- struct ggml_tensor * ggml_rms_norm_impl(
5788
+ static struct ggml_tensor * ggml_rms_norm_impl(
5972
5789
  struct ggml_context * ctx,
5973
5790
  struct ggml_tensor * a,
5791
+ float eps,
5974
5792
  bool inplace) {
5975
5793
  bool is_node = false;
5976
5794
 
@@ -5980,24 +5798,27 @@ struct ggml_tensor * ggml_rms_norm_impl(
5980
5798
 
5981
5799
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5982
5800
 
5801
+ ggml_set_op_params(result, &eps, sizeof(eps));
5802
+
5983
5803
  result->op = GGML_OP_RMS_NORM;
5984
5804
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5985
5805
  result->src[0] = a;
5986
- result->src[1] = NULL; // TODO: maybe store epsilon here?
5987
5806
 
5988
5807
  return result;
5989
5808
  }
5990
5809
 
5991
5810
  struct ggml_tensor * ggml_rms_norm(
5992
5811
  struct ggml_context * ctx,
5993
- struct ggml_tensor * a) {
5994
- return ggml_rms_norm_impl(ctx, a, false);
5812
+ struct ggml_tensor * a,
5813
+ float eps) {
5814
+ return ggml_rms_norm_impl(ctx, a, eps, false);
5995
5815
  }
5996
5816
 
5997
5817
  struct ggml_tensor * ggml_rms_norm_inplace(
5998
5818
  struct ggml_context * ctx,
5999
- struct ggml_tensor * a) {
6000
- return ggml_rms_norm_impl(ctx, a, true);
5819
+ struct ggml_tensor * a,
5820
+ float eps) {
5821
+ return ggml_rms_norm_impl(ctx, a, eps, true);
6001
5822
  }
6002
5823
 
6003
5824
  struct ggml_tensor * ggml_rms_norm_back(
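ggml_rms_norm and ggml_rms_norm_inplace now take the epsilon explicitly and store it via ggml_set_op_params, replacing the previous hard-coded value. A minimal call-site sketch (the 1e-5f epsilon and the weight multiply are illustrative, not prescribed by the diff):

    #include "ggml.h"

    static struct ggml_tensor * rms_norm_layer(struct ggml_context * ctx,
                                               struct ggml_tensor * x,
                                               struct ggml_tensor * weight) {
        struct ggml_tensor * cur = ggml_rms_norm(ctx, x, 1e-5f);  // eps now passed by the caller
        return ggml_mul(ctx, cur, weight);
    }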
@@ -6076,7 +5897,7 @@ struct ggml_tensor * ggml_out_prod(
6076
5897
 
6077
5898
  // ggml_scale
6078
5899
 
6079
- struct ggml_tensor * ggml_scale_impl(
5900
+ static struct ggml_tensor * ggml_scale_impl(
6080
5901
  struct ggml_context * ctx,
6081
5902
  struct ggml_tensor * a,
6082
5903
  struct ggml_tensor * b,
@@ -6116,7 +5937,7 @@ struct ggml_tensor * ggml_scale_inplace(
6116
5937
 
6117
5938
  // ggml_set
6118
5939
 
6119
- struct ggml_tensor * ggml_set_impl(
5940
+ static struct ggml_tensor * ggml_set_impl(
6120
5941
  struct ggml_context * ctx,
6121
5942
  struct ggml_tensor * a,
6122
5943
  struct ggml_tensor * b,
@@ -6136,23 +5957,13 @@ struct ggml_tensor * ggml_set_impl(
6136
5957
  // make a view of the destination
6137
5958
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6138
5959
 
6139
- ggml_scratch_save(ctx);
6140
-
6141
- struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5);
6142
-
6143
- (( int32_t * ) c->data)[0] = nb1;
6144
- (( int32_t * ) c->data)[1] = nb2;
6145
- (( int32_t * ) c->data)[2] = nb3;
6146
- (( int32_t * ) c->data)[3] = offset;
6147
- (( int32_t * ) c->data)[4] = inplace ? 1 : 0;
6148
-
6149
- ggml_scratch_load(ctx);
5960
+ int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
5961
+ ggml_set_op_params(result, params, sizeof(params));
6150
5962
 
6151
5963
  result->op = GGML_OP_SET;
6152
5964
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6153
5965
  result->src[0] = a;
6154
5966
  result->src[1] = b;
6155
- result->src[2] = c;
6156
5967
 
6157
5968
  return result;
6158
5969
  }
@@ -6216,7 +6027,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
6216
6027
 
6217
6028
  // ggml_cpy
6218
6029
 
6219
- struct ggml_tensor * ggml_cpy_impl(
6030
+ static struct ggml_tensor * ggml_cpy_impl(
6220
6031
  struct ggml_context * ctx,
6221
6032
  struct ggml_tensor * a,
6222
6033
  struct ggml_tensor * b,
@@ -6261,7 +6072,7 @@ struct ggml_tensor * ggml_cpy_inplace(
6261
6072
 
6262
6073
  // ggml_cont
6263
6074
 
6264
- struct ggml_tensor * ggml_cont_impl(
6075
+ static struct ggml_tensor * ggml_cont_impl(
6265
6076
  struct ggml_context * ctx,
6266
6077
  struct ggml_tensor * a,
6267
6078
  bool inplace) {
@@ -6277,7 +6088,6 @@ struct ggml_tensor * ggml_cont_impl(
6277
6088
  result->op = GGML_OP_CONT;
6278
6089
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6279
6090
  result->src[0] = a;
6280
- result->src[1] = NULL;
6281
6091
 
6282
6092
  return result;
6283
6093
  }
@@ -6321,7 +6131,6 @@ struct ggml_tensor * ggml_reshape(
6321
6131
  result->op = GGML_OP_RESHAPE;
6322
6132
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6323
6133
  result->src[0] = a;
6324
- result->src[1] = NULL;
6325
6134
 
6326
6135
  return result;
6327
6136
  }
@@ -6346,7 +6155,6 @@ struct ggml_tensor * ggml_reshape_1d(
6346
6155
  result->op = GGML_OP_RESHAPE;
6347
6156
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6348
6157
  result->src[0] = a;
6349
- result->src[1] = NULL;
6350
6158
 
6351
6159
  return result;
6352
6160
  }
@@ -6372,7 +6180,6 @@ struct ggml_tensor * ggml_reshape_2d(
6372
6180
  result->op = GGML_OP_RESHAPE;
6373
6181
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6374
6182
  result->src[0] = a;
6375
- result->src[1] = NULL;
6376
6183
 
6377
6184
  return result;
6378
6185
  }
@@ -6399,7 +6206,6 @@ struct ggml_tensor * ggml_reshape_3d(
6399
6206
  result->op = GGML_OP_RESHAPE;
6400
6207
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6401
6208
  result->src[0] = a;
6402
- result->src[1] = NULL;
6403
6209
 
6404
6210
  return result;
6405
6211
  }
@@ -6428,13 +6234,33 @@ struct ggml_tensor * ggml_reshape_4d(
6428
6234
  result->op = GGML_OP_RESHAPE;
6429
6235
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6430
6236
  result->src[0] = a;
6431
- result->src[1] = NULL;
6432
6237
 
6433
6238
  return result;
6434
6239
  }
6435
6240
 
6436
6241
  // ggml_view_1d
6437
6242
 
6243
+ static struct ggml_tensor * ggml_view_tensor_offset(
6244
+ struct ggml_context * ctx,
6245
+ struct ggml_tensor * a,
6246
+ int n_dims,
6247
+ const int64_t * ne,
6248
+ size_t offset) {
6249
+ // don't calculate an offset from an unallocated tensor
6250
+ void * data = NULL;
6251
+ if (a->data != NULL) {
6252
+ data = (char *) a->data + offset;
6253
+ }
6254
+
6255
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
6256
+
6257
+ ggml_format_name(result, "%s (view)", a->name);
6258
+
6259
+ ggml_set_op_params(result, &offset, sizeof(offset));
6260
+
6261
+ return result;
6262
+ }
6263
+
6438
6264
  struct ggml_tensor * ggml_view_1d(
6439
6265
  struct ggml_context * ctx,
6440
6266
  struct ggml_tensor * a,
@@ -6447,22 +6273,11 @@ struct ggml_tensor * ggml_view_1d(
6447
6273
  is_node = true;
6448
6274
  }
6449
6275
 
6450
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
6451
- ggml_format_name(result, "%s (view)", a->name);
6452
-
6453
- ggml_scratch_save(ctx);
6454
-
6455
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6456
- ggml_set_name(offs, "offset");
6457
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6458
-
6459
- ggml_scratch_load(ctx);
6276
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
6460
6277
 
6461
6278
  result->op = GGML_OP_VIEW;
6462
6279
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6463
6280
  result->src[0] = a;
6464
- result->src[1] = NULL;
6465
- result->src[2] = offs;
6466
6281
 
6467
6282
  return result;
6468
6283
  }
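All four ggml_view_* constructors now go through ggml_view_tensor_offset, which records the byte offset in op_params instead of in a separate "offset" helper tensor, and tolerates a->data == NULL so views can be set up before allocation. The public signatures are unchanged; a small caller-side sketch:

    #include <stdint.h>
    #include "ggml.h"

    // Illustrative: a 1-D view of row i of a 2-D tensor (offset is in bytes, as before).
    static struct ggml_tensor * row_view(struct ggml_context * ctx, struct ggml_tensor * a, int64_t i) {
        return ggml_view_1d(ctx, a, a->ne[0], i*a->nb[1]);
    }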
@@ -6485,16 +6300,7 @@ struct ggml_tensor * ggml_view_2d(
6485
6300
 
6486
6301
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
6487
6302
 
6488
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
6489
- ggml_format_name(result, "%s (view)", a->name);
6490
-
6491
- ggml_scratch_save(ctx);
6492
-
6493
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6494
- ggml_set_name(offs, "offset");
6495
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6496
-
6497
- ggml_scratch_load(ctx);
6303
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
6498
6304
 
6499
6305
  result->nb[1] = nb1;
6500
6306
  result->nb[2] = result->nb[1]*ne1;
@@ -6503,8 +6309,6 @@ struct ggml_tensor * ggml_view_2d(
6503
6309
  result->op = GGML_OP_VIEW;
6504
6310
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6505
6311
  result->src[0] = a;
6506
- result->src[1] = NULL;
6507
- result->src[2] = offs;
6508
6312
 
6509
6313
  return result;
6510
6314
  }
@@ -6529,16 +6333,7 @@ struct ggml_tensor * ggml_view_3d(
6529
6333
 
6530
6334
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
6531
6335
 
6532
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
6533
- ggml_format_name(result, "%s (view)", a->name);
6534
-
6535
- ggml_scratch_save(ctx);
6536
-
6537
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6538
- ggml_set_name(offs, "offset");
6539
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6540
-
6541
- ggml_scratch_load(ctx);
6336
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
6542
6337
 
6543
6338
  result->nb[1] = nb1;
6544
6339
  result->nb[2] = nb2;
@@ -6547,8 +6342,6 @@ struct ggml_tensor * ggml_view_3d(
6547
6342
  result->op = GGML_OP_VIEW;
6548
6343
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6549
6344
  result->src[0] = a;
6550
- result->src[1] = NULL;
6551
- result->src[2] = offs;
6552
6345
 
6553
6346
  return result;
6554
6347
  }
@@ -6575,16 +6368,7 @@ struct ggml_tensor * ggml_view_4d(
6575
6368
 
6576
6369
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
6577
6370
 
6578
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
6579
- ggml_format_name(result, "%s (view)", a->name);
6580
-
6581
- ggml_scratch_save(ctx);
6582
-
6583
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6584
- ggml_set_name(offs, "offset");
6585
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6586
-
6587
- ggml_scratch_load(ctx);
6371
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
6588
6372
 
6589
6373
  result->nb[1] = nb1;
6590
6374
  result->nb[2] = nb2;
@@ -6593,8 +6377,6 @@ struct ggml_tensor * ggml_view_4d(
6593
6377
  result->op = GGML_OP_VIEW;
6594
6378
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6595
6379
  result->src[0] = a;
6596
- result->src[1] = NULL;
6597
- result->src[2] = offs;
6598
6380
 
6599
6381
  return result;
6600
6382
  }
@@ -6655,22 +6437,9 @@ struct ggml_tensor * ggml_permute(
6655
6437
  result->op = GGML_OP_PERMUTE;
6656
6438
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6657
6439
  result->src[0] = a;
6658
- result->src[1] = NULL;
6659
6440
 
6660
- if (is_node) {
6661
- ggml_scratch_save(ctx);
6662
-
6663
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
6664
-
6665
- ((int32_t *) b->data)[0] = axis0;
6666
- ((int32_t *) b->data)[1] = axis1;
6667
- ((int32_t *) b->data)[2] = axis2;
6668
- ((int32_t *) b->data)[3] = axis3;
6669
-
6670
- ggml_scratch_load(ctx);
6671
-
6672
- result->src[2] = b;
6673
- }
6441
+ int32_t params[] = { axis0, axis1, axis2, axis3 };
6442
+ ggml_set_op_params(result, &params, sizeof(params));
6674
6443
 
6675
6444
  return result;
6676
6445
  }
@@ -6698,7 +6467,6 @@ struct ggml_tensor * ggml_transpose(
6698
6467
  result->op = GGML_OP_TRANSPOSE;
6699
6468
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6700
6469
  result->src[0] = a;
6701
- result->src[1] = NULL;
6702
6470
 
6703
6471
  return result;
6704
6472
  }
@@ -6776,7 +6544,6 @@ struct ggml_tensor * ggml_diag(
6776
6544
  result->op = GGML_OP_DIAG;
6777
6545
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6778
6546
  result->src[0] = a;
6779
- result->src[1] = NULL;
6780
6547
 
6781
6548
  return result;
6782
6549
  }
@@ -6784,7 +6551,7 @@ struct ggml_tensor * ggml_diag(
6784
6551
 
6785
6552
  // ggml_diag_mask_inf
6786
6553
 
6787
- struct ggml_tensor * ggml_diag_mask_inf_impl(
6554
+ static struct ggml_tensor * ggml_diag_mask_inf_impl(
6788
6555
  struct ggml_context * ctx,
6789
6556
  struct ggml_tensor * a,
6790
6557
  int n_past,
@@ -6797,19 +6564,12 @@ struct ggml_tensor * ggml_diag_mask_inf_impl(
6797
6564
 
6798
6565
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6799
6566
 
6800
- ggml_scratch_save(ctx);
6801
-
6802
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6803
-
6804
- ((int32_t *) b->data)[0] = n_past;
6805
- ((int32_t *) b->data)[1] = inplace ? 1 : 0;
6806
-
6807
- ggml_scratch_load(ctx);
6567
+ int32_t params[] = { n_past, inplace ? 1 : 0 };
6568
+ ggml_set_op_params(result, &params, sizeof(params));
6808
6569
 
6809
6570
  result->op = GGML_OP_DIAG_MASK_INF;
6810
6571
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6811
6572
  result->src[0] = a;
6812
- result->src[1] = b;
6813
6573
 
6814
6574
  return result;
6815
6575
  }
@@ -6831,7 +6591,7 @@ struct ggml_tensor * ggml_diag_mask_inf_inplace(
6831
6591
 
6832
6592
  // ggml_diag_mask_zero
6833
6593
 
6834
- struct ggml_tensor * ggml_diag_mask_zero_impl(
6594
+ static struct ggml_tensor * ggml_diag_mask_zero_impl(
6835
6595
  struct ggml_context * ctx,
6836
6596
  struct ggml_tensor * a,
6837
6597
  int n_past,
@@ -6844,20 +6604,12 @@ struct ggml_tensor * ggml_diag_mask_zero_impl(
6844
6604
 
6845
6605
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6846
6606
 
6847
- ggml_scratch_save(ctx);
6848
-
6849
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6850
- ggml_set_name(b, "n_past, inplace");
6851
-
6852
- ((int32_t *) b->data)[0] = n_past;
6853
- ((int32_t *) b->data)[1] = inplace ? 1 : 0;
6854
-
6855
- ggml_scratch_load(ctx);
6607
+ int32_t params[] = { n_past, inplace ? 1 : 0 };
6608
+ ggml_set_op_params(result, &params, sizeof(params));
6856
6609
 
6857
6610
  result->op = GGML_OP_DIAG_MASK_ZERO;
6858
6611
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6859
6612
  result->src[0] = a;
6860
- result->src[1] = b;
6861
6613
 
6862
6614
  return result;
6863
6615
  }
@@ -6878,7 +6630,7 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
6878
6630
 
6879
6631
  // ggml_soft_max
6880
6632
 
6881
- struct ggml_tensor * ggml_soft_max_impl(
6633
+ static struct ggml_tensor * ggml_soft_max_impl(
6882
6634
  struct ggml_context * ctx,
6883
6635
  struct ggml_tensor * a,
6884
6636
  bool inplace) {
@@ -6893,7 +6645,6 @@ struct ggml_tensor * ggml_soft_max_impl(
6893
6645
  result->op = GGML_OP_SOFT_MAX;
6894
6646
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6895
6647
  result->src[0] = a;
6896
- result->src[1] = NULL;
6897
6648
 
6898
6649
  return result;
6899
6650
  }
@@ -6913,7 +6664,7 @@ struct ggml_tensor * ggml_soft_max_inplace(
6913
6664
 
6914
6665
  // ggml_soft_max_back
6915
6666
 
6916
- struct ggml_tensor * ggml_soft_max_back_impl(
6667
+ static struct ggml_tensor * ggml_soft_max_back_impl(
6917
6668
  struct ggml_context * ctx,
6918
6669
  struct ggml_tensor * a,
6919
6670
  struct ggml_tensor * b,
@@ -6950,7 +6701,7 @@ struct ggml_tensor * ggml_soft_max_back_inplace(
6950
6701
 
6951
6702
  // ggml_rope
6952
6703
 
6953
- struct ggml_tensor * ggml_rope_impl(
6704
+ static struct ggml_tensor * ggml_rope_impl(
6954
6705
  struct ggml_context * ctx,
6955
6706
  struct ggml_tensor * a,
6956
6707
  int n_past,
@@ -6969,23 +6720,14 @@ struct ggml_tensor * ggml_rope_impl(
6969
6720
 
6970
6721
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6971
6722
 
6972
- ggml_scratch_save(ctx);
6973
-
6974
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
6975
-
6976
- ((int32_t *) b->data)[0] = n_past;
6977
- ((int32_t *) b->data)[1] = n_dims;
6978
- ((int32_t *) b->data)[2] = mode;
6979
- ((int32_t *) b->data)[3] = n_ctx;
6980
- memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float));
6981
- memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));
6982
-
6983
- ggml_scratch_load(ctx);
6723
+ int32_t params[6] = { n_past, n_dims, mode, n_ctx };
6724
+ memcpy(params + 4, &freq_base, sizeof(float));
6725
+ memcpy(params + 5, &freq_scale, sizeof(float));
6726
+ ggml_set_op_params(result, &params, sizeof(params));
6984
6727
 
6985
6728
  result->op = GGML_OP_ROPE;
6986
6729
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6987
6730
  result->src[0] = a;
6988
- result->src[1] = b;
6989
6731
 
6990
6732
  return result;
6991
6733
  }
@@ -7010,6 +6752,18 @@ struct ggml_tensor * ggml_rope_inplace(
7010
6752
  return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
7011
6753
  }
7012
6754
 
6755
+ struct ggml_tensor * ggml_rope_custom(
6756
+ struct ggml_context * ctx,
6757
+ struct ggml_tensor * a,
6758
+ int n_past,
6759
+ int n_dims,
6760
+ int mode,
6761
+ int n_ctx,
6762
+ float freq_base,
6763
+ float freq_scale) {
6764
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
6765
+ }
6766
+
7013
6767
  struct ggml_tensor * ggml_rope_custom_inplace(
7014
6768
  struct ggml_context * ctx,
7015
6769
  struct ggml_tensor * a,
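ggml_rope_custom is the new non-inplace counterpart of ggml_rope_custom_inplace, exposing freq_base and freq_scale directly (plain ggml_rope keeps the 10000.0f / 1.0f defaults). A minimal call-site sketch; the scaling values below are examples only:

    #include "ggml.h"

    static struct ggml_tensor * rope_scaled(struct ggml_context * ctx, struct ggml_tensor * q,
                                            int n_past, int n_dims, int n_ctx) {
        const float freq_base  = 10000.0f;  // default base
        const float freq_scale = 0.5f;      // e.g. linear position scaling for a longer context
        return ggml_rope_custom(ctx, q, n_past, n_dims, /*mode=*/0, n_ctx, freq_base, freq_scale);
    }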
@@ -7042,22 +6796,12 @@ struct ggml_tensor * ggml_rope_back(
7042
6796
 
7043
6797
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
7044
6798
 
7045
- ggml_scratch_save(ctx);
7046
-
7047
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
7048
- ggml_set_name(b, "n_past, n_dims, mode");
7049
-
7050
- ((int32_t *) b->data)[0] = n_past;
7051
- ((int32_t *) b->data)[1] = n_dims;
7052
- ((int32_t *) b->data)[2] = mode;
7053
- ((int32_t *) b->data)[3] = n_ctx;
7054
-
7055
- ggml_scratch_load(ctx);
6799
+ int32_t params[] = { n_past, n_dims, mode, n_ctx };
6800
+ ggml_set_op_params(result, &params, sizeof(params));
7056
6801
 
7057
6802
  result->op = GGML_OP_ROPE_BACK;
7058
6803
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7059
6804
  result->src[0] = a;
7060
- result->src[1] = b;
7061
6805
 
7062
6806
  return result;
7063
6807
  }
@@ -7082,21 +6826,13 @@ struct ggml_tensor * ggml_alibi(
7082
6826
  //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7083
6827
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
7084
6828
 
7085
- ggml_scratch_save(ctx);
7086
-
7087
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7088
-
7089
- ((int32_t *) b->data)[0] = n_past;
7090
- ((int32_t *) b->data)[1] = n_head;
7091
- GGML_ASSERT(sizeof(float) == sizeof(int32_t));
7092
- (((float *) b->data)[2]) = bias_max;
7093
-
7094
- ggml_scratch_load(ctx);
6829
+ int32_t op_params[3] = { n_past, n_head };
6830
+ memcpy(op_params + 2, &bias_max, sizeof(float));
6831
+ ggml_set_op_params(result, &op_params, sizeof(op_params));
7095
6832
 
7096
6833
  result->op = GGML_OP_ALIBI;
7097
6834
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7098
6835
  result->src[0] = a;
7099
- result->src[1] = b;
7100
6836
 
7101
6837
  return result;
7102
6838
  }
@@ -7118,19 +6854,12 @@ struct ggml_tensor * ggml_clamp(
7118
6854
  // TODO: when implement backward, fix this:
7119
6855
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
7120
6856
 
7121
- ggml_scratch_save(ctx);
7122
-
7123
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);
7124
-
7125
- ((float *) b->data)[0] = min;
7126
- ((float *) b->data)[1] = max;
7127
-
7128
- ggml_scratch_load(ctx);
6857
+ float params[] = { min, max };
6858
+ ggml_set_op_params(result, &params, sizeof(params));
7129
6859
 
7130
6860
  result->op = GGML_OP_CLAMP;
7131
6861
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7132
6862
  result->src[0] = a;
7133
- result->src[1] = b;
7134
6863
 
7135
6864
  return result;
7136
6865
  }
@@ -7163,18 +6892,13 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
7163
6892
  };
7164
6893
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
7165
6894
 
7166
- ggml_scratch_save(ctx);
7167
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7168
- ((int32_t*)c->data)[0] = s0;
7169
- ((int32_t*)c->data)[1] = p0;
7170
- ((int32_t*)c->data)[2] = d0;
7171
- ggml_scratch_load(ctx);
6895
+ int32_t params[] = { s0, p0, d0 };
6896
+ ggml_set_op_params(result, &params, sizeof(params));
7172
6897
 
7173
6898
  result->op = GGML_OP_CONV_1D;
7174
6899
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7175
6900
  result->src[0] = a;
7176
6901
  result->src[1] = b;
7177
- result->src[2] = c;
7178
6902
 
7179
6903
  return result;
7180
6904
  }
@@ -7207,21 +6931,13 @@ struct ggml_tensor* ggml_conv_2d(
7207
6931
  };
7208
6932
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7209
6933
 
7210
- ggml_scratch_save(ctx);
7211
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
7212
- ((int32_t*)c->data)[0] = s0;
7213
- ((int32_t*)c->data)[1] = s1;
7214
- ((int32_t*)c->data)[2] = p0;
7215
- ((int32_t*)c->data)[3] = p1;
7216
- ((int32_t*)c->data)[4] = d0;
7217
- ((int32_t*)c->data)[5] = d1;
7218
- ggml_scratch_load(ctx);
6934
+ int32_t params[] = { s0, s1, p0, p1, d0, d1 };
6935
+ ggml_set_op_params(result, &params, sizeof(params));
7219
6936
 
7220
6937
  result->op = GGML_OP_CONV_2D;
7221
6938
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7222
6939
  result->src[0] = a;
7223
6940
  result->src[1] = b;
7224
- result->src[2] = c;
7225
6941
 
7226
6942
  return result;
7227
6943
 
@@ -7245,7 +6961,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
7245
6961
  return (ins + 2 * p - ks) / s + 1;
7246
6962
  }
7247
6963
 
7248
- // ggml_pool_2d
6964
+ // ggml_pool_1d
7249
6965
 
7250
6966
  struct ggml_tensor* ggml_pool_1d(
7251
6967
  struct ggml_context * ctx,
@@ -7268,18 +6984,12 @@ struct ggml_tensor* ggml_pool_1d(
7268
6984
  };
7269
6985
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
7270
6986
 
7271
- ggml_scratch_save(ctx);
7272
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
7273
- ((int32_t*)c->data)[0] = op;
7274
- ((int32_t*)c->data)[1] = k0;
7275
- ((int32_t*)c->data)[2] = s0;
7276
- ((int32_t*)c->data)[3] = p0;
7277
- ggml_scratch_load(ctx);
6987
+ int32_t params[] = { op, k0, s0, p0 };
6988
+ ggml_set_op_params(result, &params, sizeof(params));
7278
6989
 
7279
6990
  result->op = GGML_OP_POOL_1D;
7280
6991
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7281
6992
  result->src[0] = a;
7282
- result->src[1] = c;
7283
6993
 
7284
6994
  return result;
7285
6995
  }
@@ -7311,21 +7021,12 @@ struct ggml_tensor* ggml_pool_2d(
7311
7021
  };
7312
7022
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7313
7023
 
7314
- ggml_scratch_save(ctx);
7315
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7);
7316
- ((int32_t*)c->data)[0] = op;
7317
- ((int32_t*)c->data)[1] = k0;
7318
- ((int32_t*)c->data)[2] = k1;
7319
- ((int32_t*)c->data)[3] = s0;
7320
- ((int32_t*)c->data)[4] = s1;
7321
- ((int32_t*)c->data)[5] = p0;
7322
- ((int32_t*)c->data)[6] = p1;
7323
- ggml_scratch_load(ctx);
7024
+ int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
7025
+ ggml_set_op_params(result, &params, sizeof(params));
7324
7026
 
7325
7027
  result->op = GGML_OP_POOL_2D;
7326
7028
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7327
7029
  result->src[0] = a;
7328
- result->src[1] = c;
7329
7030
 
7330
7031
  return result;
7331
7032
  }
@@ -7348,14 +7049,16 @@ struct ggml_tensor * ggml_flash_attn(
7348
7049
  }
7349
7050
 
7350
7051
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
7351
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, q->ne);
7052
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne);
7053
+
7054
+ int32_t t = masked ? 1 : 0;
7055
+ ggml_set_op_params(result, &t, sizeof(t));
7352
7056
 
7353
7057
  result->op = GGML_OP_FLASH_ATTN;
7354
7058
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7355
7059
  result->src[0] = q;
7356
7060
  result->src[1] = k;
7357
7061
  result->src[2] = v;
7358
- result->src[3] = ggml_new_i32(ctx, masked ? 1 : 0);
7359
7062
 
7360
7063
  return result;
7361
7064
  }
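Here the masked flag, previously carried as a one-element I32 tensor in src[3], becomes a single int32 in op_params; the forward pass later reads it back with ggml_get_op_params_i32. The indexed accessors themselves are not shown in this diff, so the sketch below only infers their likely shape from the call sites (signatures are an assumption):

    /* plausible shape of the indexed op-params accessors used elsewhere in  */
    /* this diff; exact signatures are inferred, not copied from the source  */
    static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
        ((int32_t *) tensor->op_params)[i] = value;
    }
    static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
        return ((const int32_t *) tensor->op_params)[i];
    }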
@@ -7379,7 +7082,7 @@ struct ggml_tensor * ggml_flash_ff(
7379
7082
  }
7380
7083
 
7381
7084
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
7382
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, a->ne);
7085
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne);
7383
7086
 
7384
7087
  result->op = GGML_OP_FLASH_FF;
7385
7088
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7445,13 +7148,15 @@ struct ggml_tensor * ggml_flash_attn_back(
7445
7148
 
7446
7149
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7447
7150
 
7151
+ int32_t masked_i = masked ? 1 : 0;
7152
+ ggml_set_op_params(result, &masked_i, sizeof(masked_i));
7153
+
7448
7154
  result->op = GGML_OP_FLASH_ATTN_BACK;
7449
7155
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7450
7156
  result->src[0] = q;
7451
7157
  result->src[1] = k;
7452
7158
  result->src[2] = v;
7453
7159
  result->src[3] = d;
7454
- result->src[4] = ggml_new_i32(ctx, masked ? 1 : 0);
7455
7160
 
7456
7161
  return result;
7457
7162
  }
@@ -7484,21 +7189,12 @@ struct ggml_tensor * ggml_win_part(
7484
7189
 
7485
7190
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7486
7191
 
7487
- ggml_scratch_save(ctx);
7488
-
7489
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7490
-
7491
- ((int32_t *) b->data)[0] = npx;
7492
- ((int32_t *) b->data)[1] = npy;
7493
- ((int32_t *) b->data)[2] = w;
7494
-
7495
- ggml_scratch_load(ctx);
7192
+ int32_t params[] = { npx, npy, w };
7193
+ ggml_set_op_params(result, &params, sizeof(params));
7496
7194
 
7497
7195
  result->op = GGML_OP_WIN_PART;
7498
7196
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7499
7197
  result->src[0] = a;
7500
- result->src[1] = NULL;
7501
- result->src[2] = b;
7502
7198
 
7503
7199
  return result;
7504
7200
  }
@@ -7523,26 +7219,57 @@ struct ggml_tensor * ggml_win_unpart(
7523
7219
  const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
7524
7220
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7525
7221
 
7526
- ggml_scratch_save(ctx);
7222
+ int32_t params[] = { w };
7223
+ ggml_set_op_params(result, &params, sizeof(params));
7224
+
7225
+ result->op = GGML_OP_WIN_UNPART;
7226
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7227
+ result->src[0] = a;
7527
7228
 
7528
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
7229
+ return result;
7230
+ }
7529
7231
 
7530
- ((int32_t *) b->data)[0] = w;
7232
+ // ggml_unary
7531
7233
 
7532
- ggml_scratch_load(ctx);
7234
+ static struct ggml_tensor * ggml_unary_impl(
7235
+ struct ggml_context * ctx,
7236
+ struct ggml_tensor * a,
7237
+ enum ggml_unary_op op,
7238
+ bool inplace) {
7239
+ bool is_node = false;
7533
7240
 
7534
- result->op = GGML_OP_WIN_UNPART;
7241
+ if (!inplace && (a->grad)) {
7242
+ is_node = true;
7243
+ }
7244
+
7245
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7246
+
7247
+ ggml_set_op_params_i32(result, 0, (int32_t) op);
7248
+
7249
+ result->op = GGML_OP_UNARY;
7535
7250
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7536
7251
  result->src[0] = a;
7537
- result->src[1] = NULL;
7538
- result->src[2] = b;
7539
7252
 
7540
7253
  return result;
7541
7254
  }
7542
7255
 
7256
+ struct ggml_tensor * ggml_unary(
7257
+ struct ggml_context * ctx,
7258
+ struct ggml_tensor * a,
7259
+ enum ggml_unary_op op) {
7260
+ return ggml_unary_impl(ctx, a, op, false);
7261
+ }
7262
+
7263
+ struct ggml_tensor * ggml_unary_inplace(
7264
+ struct ggml_context * ctx,
7265
+ struct ggml_tensor * a,
7266
+ enum ggml_unary_op op) {
7267
+ return ggml_unary_impl(ctx, a, op, true);
7268
+ }
7269
+
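With ggml_unary and ggml_unary_inplace above, the former per-activation ops (ABS through SILU) collapse into a single GGML_OP_UNARY node: the specific activation is stored as an int32 in op_params slot 0 and recovered with ggml_get_unary_op in the forward dispatch further down. A usage sketch under that reading (the enum names are the ones appearing in the new dispatch switch later in this diff):

    /* usage sketch: a dedicated op becomes a parameterized unary node       */
    struct ggml_tensor * y = ggml_unary        (ctx, x, GGML_UNARY_OP_RELU); /* new tensor */
    struct ggml_tensor * z = ggml_unary_inplace(ctx, x, GGML_UNARY_OP_GELU); /* view of x  */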
7543
7270
  // ggml_map_unary
7544
7271
 
7545
- struct ggml_tensor * ggml_map_unary_impl_f32(
7272
+ static struct ggml_tensor * ggml_map_unary_impl_f32(
7546
7273
  struct ggml_context * ctx,
7547
7274
  struct ggml_tensor * a,
7548
7275
  const ggml_unary_op_f32_t fun,
@@ -7553,19 +7280,13 @@ struct ggml_tensor * ggml_map_unary_impl_f32(
7553
7280
  is_node = true;
7554
7281
  }
7555
7282
 
7556
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7557
-
7558
- ggml_scratch_save(ctx);
7283
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7559
7284
 
7560
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7561
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7562
-
7563
- ggml_scratch_load(ctx);
7285
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7564
7286
 
7565
7287
  result->op = GGML_OP_MAP_UNARY;
7566
7288
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7567
7289
  result->src[0] = a;
7568
- result->src[2] = addr_tensor;
7569
7290
 
7570
7291
  return result;
7571
7292
  }
@@ -7586,7 +7307,7 @@ struct ggml_tensor * ggml_map_unary_inplace_f32(
7586
7307
 
7587
7308
  // ggml_map_binary
7588
7309
 
7589
- struct ggml_tensor * ggml_map_binary_impl_f32(
7310
+ static struct ggml_tensor * ggml_map_binary_impl_f32(
7590
7311
  struct ggml_context * ctx,
7591
7312
  struct ggml_tensor * a,
7592
7313
  struct ggml_tensor * b,
@@ -7600,20 +7321,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
7600
7321
  is_node = true;
7601
7322
  }
7602
7323
 
7603
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7604
-
7605
- ggml_scratch_save(ctx);
7606
-
7607
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7608
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7324
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7609
7325
 
7610
- ggml_scratch_load(ctx);
7326
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7611
7327
 
7612
7328
  result->op = GGML_OP_MAP_BINARY;
7613
7329
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7614
7330
  result->src[0] = a;
7615
7331
  result->src[1] = b;
7616
- result->src[2] = addr_tensor;
7617
7332
 
7618
7333
  return result;
7619
7334
  }
@@ -7636,7 +7351,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
7636
7351
 
7637
7352
  // ggml_map_custom1
7638
7353
 
7639
- struct ggml_tensor * ggml_map_custom1_impl_f32(
7354
+ static struct ggml_tensor * ggml_map_custom1_impl_f32(
7640
7355
  struct ggml_context * ctx,
7641
7356
  struct ggml_tensor * a,
7642
7357
  const ggml_custom1_op_f32_t fun,
@@ -7647,19 +7362,13 @@ struct ggml_tensor * ggml_map_custom1_impl_f32(
7647
7362
  is_node = true;
7648
7363
  }
7649
7364
 
7650
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7651
-
7652
- ggml_scratch_save(ctx);
7653
-
7654
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7655
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7365
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7656
7366
 
7657
- ggml_scratch_load(ctx);
7367
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7658
7368
 
7659
7369
  result->op = GGML_OP_MAP_CUSTOM1;
7660
7370
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7661
7371
  result->src[0] = a;
7662
- result->src[2] = addr_tensor;
7663
7372
 
7664
7373
  return result;
7665
7374
  }
@@ -7680,7 +7389,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32(
7680
7389
 
7681
7390
  // ggml_map_custom2
7682
7391
 
7683
- struct ggml_tensor * ggml_map_custom2_impl_f32(
7392
+ static struct ggml_tensor * ggml_map_custom2_impl_f32(
7684
7393
  struct ggml_context * ctx,
7685
7394
  struct ggml_tensor * a,
7686
7395
  struct ggml_tensor * b,
@@ -7692,20 +7401,14 @@ struct ggml_tensor * ggml_map_custom2_impl_f32(
7692
7401
  is_node = true;
7693
7402
  }
7694
7403
 
7695
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7696
-
7697
- ggml_scratch_save(ctx);
7404
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7698
7405
 
7699
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7700
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7701
-
7702
- ggml_scratch_load(ctx);
7406
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7703
7407
 
7704
7408
  result->op = GGML_OP_MAP_CUSTOM2;
7705
7409
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7706
7410
  result->src[0] = a;
7707
7411
  result->src[1] = b;
7708
- result->src[2] = addr_tensor;
7709
7412
 
7710
7413
  return result;
7711
7414
  }
@@ -7728,7 +7431,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32(
7728
7431
 
7729
7432
  // ggml_map_custom3
7730
7433
 
7731
- struct ggml_tensor * ggml_map_custom3_impl_f32(
7434
+ static struct ggml_tensor * ggml_map_custom3_impl_f32(
7732
7435
  struct ggml_context * ctx,
7733
7436
  struct ggml_tensor * a,
7734
7437
  struct ggml_tensor * b,
@@ -7741,21 +7444,15 @@ struct ggml_tensor * ggml_map_custom3_impl_f32(
7741
7444
  is_node = true;
7742
7445
  }
7743
7446
 
7744
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7745
-
7746
- ggml_scratch_save(ctx);
7747
-
7748
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7749
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7447
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7750
7448
 
7751
- ggml_scratch_load(ctx);
7449
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7752
7450
 
7753
7451
  result->op = GGML_OP_MAP_CUSTOM3;
7754
7452
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7755
7453
  result->src[0] = a;
7756
7454
  result->src[1] = b;
7757
- result->src[2] = addr_tensor;
7758
- result->src[3] = c;
7455
+ result->src[2] = c;
7759
7456
 
7760
7457
  return result;
7761
7458
  }
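All of the map_* constructors above drop the old trick of writing the raw callback pointer into an I32 tensor hung off src[2]; the pointer bytes now go into op_params and are memcpy'd back out at dispatch time (see the GGML_OP_MAP_* cases later in this diff). A minimal round-trip sketch; my_unary_kernel is a hypothetical user callback, not something defined here:

    /* store: raw function-pointer bytes into the op-params buffer           */
    ggml_unary_op_f32_t fun = my_unary_kernel;   /* my_unary_kernel: hypothetical callback */
    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
    /* load: recover the pointer before invoking it in ggml_compute_forward  */
    ggml_unary_op_f32_t stored;
    memcpy(&stored, tensor->op_params, sizeof(stored));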
@@ -8983,21 +8680,17 @@ static void ggml_compute_forward_acc_f32(
8983
8680
  const struct ggml_compute_params * params,
8984
8681
  const struct ggml_tensor * src0,
8985
8682
  const struct ggml_tensor * src1,
8986
- const struct ggml_tensor * opt0,
8987
8683
  struct ggml_tensor * dst) {
8988
8684
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
8989
8685
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
8990
8686
 
8991
- GGML_ASSERT(opt0->type == GGML_TYPE_I32);
8992
- GGML_ASSERT(ggml_nelements(opt0) == 5);
8993
-
8994
8687
  // view src0 and dst with these strides and data offset inbytes during acc
8995
8688
  // nb0 is implicitly element_size because src0 and dst are contiguous
8996
- size_t nb1 = ((int32_t *) opt0->data)[0];
8997
- size_t nb2 = ((int32_t *) opt0->data)[1];
8998
- size_t nb3 = ((int32_t *) opt0->data)[2];
8999
- size_t offset = ((int32_t *) opt0->data)[3];
9000
- bool inplace = (bool) ((int32_t *) opt0->data)[4];
8689
+ size_t nb1 = ((int32_t *) dst->op_params)[0];
8690
+ size_t nb2 = ((int32_t *) dst->op_params)[1];
8691
+ size_t nb3 = ((int32_t *) dst->op_params)[2];
8692
+ size_t offset = ((int32_t *) dst->op_params)[3];
8693
+ bool inplace = (bool) ((int32_t *) dst->op_params)[4];
9001
8694
 
9002
8695
  if (!inplace && (params->type == GGML_TASK_INIT)) {
9003
8696
  // memcpy needs to be synchronized across threads to avoid race conditions.
@@ -9066,13 +8759,12 @@ static void ggml_compute_forward_acc(
9066
8759
  const struct ggml_compute_params * params,
9067
8760
  const struct ggml_tensor * src0,
9068
8761
  const struct ggml_tensor * src1,
9069
- const struct ggml_tensor * opt0,
9070
8762
  struct ggml_tensor * dst) {
9071
8763
 
9072
8764
  switch (src0->type) {
9073
8765
  case GGML_TYPE_F32:
9074
8766
  {
9075
- ggml_compute_forward_acc_f32(params, src0, src1, opt0, dst);
8767
+ ggml_compute_forward_acc_f32(params, src0, src1, dst);
9076
8768
  } break;
9077
8769
  case GGML_TYPE_F16:
9078
8770
  case GGML_TYPE_Q4_0:
@@ -9504,7 +9196,7 @@ static void ggml_compute_forward_sum_f32(
9504
9196
  for (int64_t i03 = 0; i03 < ne03; i03++) {
9505
9197
  for (int64_t i02 = 0; i02 < ne02; i02++) {
9506
9198
  for (int64_t i01 = 0; i01 < ne01; i01++) {
9507
- ggml_vec_sum_ggf(ne00,
9199
+ ggml_vec_sum_f32_ggf(ne00,
9508
9200
  &row_sum,
9509
9201
  (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
9510
9202
  sum += row_sum;
@@ -9514,6 +9206,38 @@ static void ggml_compute_forward_sum_f32(
9514
9206
  ((float *) dst->data)[0] = sum;
9515
9207
  }
9516
9208
 
9209
+ static void ggml_compute_forward_sum_f16(
9210
+ const struct ggml_compute_params * params,
9211
+ const struct ggml_tensor * src0,
9212
+ struct ggml_tensor * dst) {
9213
+ assert(params->ith == 0);
9214
+ assert(ggml_is_scalar(dst));
9215
+
9216
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9217
+ return;
9218
+ }
9219
+
9220
+ assert(src0->nb[0] == sizeof(ggml_fp16_t));
9221
+
9222
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
9223
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb);
9224
+
9225
+ float sum = 0;
9226
+ float row_sum = 0;
9227
+
9228
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
9229
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
9230
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
9231
+ ggml_vec_sum_f16_ggf(ne00,
9232
+ &row_sum,
9233
+ (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
9234
+ sum += row_sum;
9235
+ }
9236
+ }
9237
+ }
9238
+ ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
9239
+ }
9240
+
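The new F16 reduction above keeps its running total in float: ggml_vec_sum_f16_ggf widens each element with GGML_FP16_TO_FP32, and only the final scalar is narrowed back to half precision. Accumulating directly in F16 would stall early; for example, adding 1.0 repeatedly to an F16 accumulator stops making progress at 2048, where the representable spacing becomes 2. A minimal sketch of the pattern used above:

    /* widening accumulation: the sum stays in f32, only the result narrows  */
    float sum = 0.0f;
    for (int64_t i = 0; i < n; ++i) {
        sum += GGML_FP16_TO_FP32(x[i]);   /* x: const ggml_fp16_t *           */
    }
    ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);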
9517
9241
  static void ggml_compute_forward_sum(
9518
9242
  const struct ggml_compute_params * params,
9519
9243
  const struct ggml_tensor * src0,
@@ -9523,6 +9247,10 @@ static void ggml_compute_forward_sum(
9523
9247
  {
9524
9248
  ggml_compute_forward_sum_f32(params, src0, dst);
9525
9249
  } break;
9250
+ case GGML_TYPE_F16:
9251
+ {
9252
+ ggml_compute_forward_sum_f16(params, src0, dst);
9253
+ } break;
9526
9254
  default:
9527
9255
  {
9528
9256
  GGML_ASSERT(false);
@@ -10118,8 +9846,8 @@ static void ggml_compute_forward_gelu_f32(
10118
9846
  const struct ggml_compute_params * params,
10119
9847
  const struct ggml_tensor * src0,
10120
9848
  struct ggml_tensor * dst) {
10121
- GGML_ASSERT(ggml_is_contiguous(src0));
10122
- GGML_ASSERT(ggml_is_contiguous(dst));
9849
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9850
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10123
9851
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10124
9852
 
10125
9853
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -10177,8 +9905,8 @@ static void ggml_compute_forward_gelu_quick_f32(
10177
9905
  const struct ggml_compute_params * params,
10178
9906
  const struct ggml_tensor * src0,
10179
9907
  struct ggml_tensor * dst) {
10180
- GGML_ASSERT(ggml_is_contiguous(src0));
10181
- GGML_ASSERT(ggml_is_contiguous(dst));
9908
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9909
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10182
9910
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10183
9911
 
10184
9912
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -10236,8 +9964,8 @@ static void ggml_compute_forward_silu_f32(
10236
9964
  const struct ggml_compute_params * params,
10237
9965
  const struct ggml_tensor * src0,
10238
9966
  struct ggml_tensor * dst) {
10239
- GGML_ASSERT(ggml_is_contiguous(src0));
10240
- GGML_ASSERT(ggml_is_contiguous(dst));
9967
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9968
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10241
9969
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10242
9970
 
10243
9971
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -10289,7 +10017,6 @@ static void ggml_compute_forward_silu(
10289
10017
  }
10290
10018
  }
10291
10019
 
10292
-
10293
10020
  // ggml_compute_forward_silu_back
10294
10021
 
10295
10022
  static void ggml_compute_forward_silu_back_f32(
@@ -10297,9 +10024,9 @@ static void ggml_compute_forward_silu_back_f32(
10297
10024
  const struct ggml_tensor * src0,
10298
10025
  const struct ggml_tensor * grad,
10299
10026
  struct ggml_tensor * dst) {
10300
- GGML_ASSERT(ggml_is_contiguous(grad));
10301
- GGML_ASSERT(ggml_is_contiguous(src0));
10302
- GGML_ASSERT(ggml_is_contiguous(dst));
10027
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
10028
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
10029
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10303
10030
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10304
10031
  GGML_ASSERT(ggml_are_same_shape(src0, grad));
10305
10032
 
@@ -10439,7 +10166,8 @@ static void ggml_compute_forward_rms_norm_f32(
10439
10166
 
10440
10167
  GGML_TENSOR_UNARY_OP_LOCALS;
10441
10168
 
10442
- const float eps = 1e-6f; // TODO: make this a parameter
10169
+ float eps;
10170
+ memcpy(&eps, dst->op_params, sizeof(float));
10443
10171
 
10444
10172
  // TODO: optimize
10445
10173
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -11092,21 +10820,17 @@ static void ggml_compute_forward_set_f32(
11092
10820
  const struct ggml_compute_params * params,
11093
10821
  const struct ggml_tensor * src0,
11094
10822
  const struct ggml_tensor * src1,
11095
- const struct ggml_tensor * opt0,
11096
10823
  struct ggml_tensor * dst) {
11097
10824
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
11098
10825
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
11099
10826
 
11100
- GGML_ASSERT(opt0->type == GGML_TYPE_I32);
11101
- GGML_ASSERT(ggml_nelements(opt0) == 5);
11102
-
11103
10827
  // view src0 and dst with these strides and data offset inbytes during set
11104
10828
  // nb0 is implicitly element_size because src0 and dst are contiguous
11105
- size_t nb1 = ((int32_t *) opt0->data)[0];
11106
- size_t nb2 = ((int32_t *) opt0->data)[1];
11107
- size_t nb3 = ((int32_t *) opt0->data)[2];
11108
- size_t offset = ((int32_t *) opt0->data)[3];
11109
- bool inplace = (bool) ((int32_t *) opt0->data)[4];
10829
+ size_t nb1 = ((int32_t *) dst->op_params)[0];
10830
+ size_t nb2 = ((int32_t *) dst->op_params)[1];
10831
+ size_t nb3 = ((int32_t *) dst->op_params)[2];
10832
+ size_t offset = ((int32_t *) dst->op_params)[3];
10833
+ bool inplace = (bool) ((int32_t *) dst->op_params)[4];
11110
10834
 
11111
10835
  if (!inplace && (params->type == GGML_TASK_INIT)) {
11112
10836
  // memcpy needs to be synchronized across threads to avoid race conditions.
@@ -11166,13 +10890,12 @@ static void ggml_compute_forward_set(
11166
10890
  const struct ggml_compute_params * params,
11167
10891
  const struct ggml_tensor * src0,
11168
10892
  const struct ggml_tensor * src1,
11169
- const struct ggml_tensor * opt0,
11170
10893
  struct ggml_tensor * dst) {
11171
10894
 
11172
10895
  switch (src0->type) {
11173
10896
  case GGML_TYPE_F32:
11174
10897
  {
11175
- ggml_compute_forward_set_f32(params, src0, src1, opt0, dst);
10898
+ ggml_compute_forward_set_f32(params, src0, src1, dst);
11176
10899
  } break;
11177
10900
  case GGML_TYPE_F16:
11178
10901
  case GGML_TYPE_Q4_0:
@@ -11568,17 +11291,14 @@ static void ggml_compute_forward_diag(
11568
11291
  static void ggml_compute_forward_diag_mask_f32(
11569
11292
  const struct ggml_compute_params * params,
11570
11293
  const struct ggml_tensor * src0,
11571
- const struct ggml_tensor * src1,
11572
11294
  struct ggml_tensor * dst,
11573
11295
  const float value) {
11574
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
11575
- GGML_ASSERT(ggml_nelements(src1) == 2);
11576
11296
 
11577
11297
  const int ith = params->ith;
11578
11298
  const int nth = params->nth;
11579
11299
 
11580
- const int n_past = ((int32_t *) src1->data)[0];
11581
- const bool inplace = (bool)((int32_t *) src1->data)[1];
11300
+ const int n_past = ((int32_t *) dst->op_params)[0];
11301
+ const bool inplace = (bool)((int32_t *) dst->op_params)[1];
11582
11302
 
11583
11303
  GGML_ASSERT(n_past >= 0);
11584
11304
 
@@ -11621,12 +11341,11 @@ static void ggml_compute_forward_diag_mask_f32(
11621
11341
  static void ggml_compute_forward_diag_mask_inf(
11622
11342
  const struct ggml_compute_params * params,
11623
11343
  const struct ggml_tensor * src0,
11624
- const struct ggml_tensor * src1,
11625
11344
  struct ggml_tensor * dst) {
11626
11345
  switch (src0->type) {
11627
11346
  case GGML_TYPE_F32:
11628
11347
  {
11629
- ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, -INFINITY);
11348
+ ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY);
11630
11349
  } break;
11631
11350
  default:
11632
11351
  {
@@ -11638,12 +11357,11 @@ static void ggml_compute_forward_diag_mask_inf(
11638
11357
  static void ggml_compute_forward_diag_mask_zero(
11639
11358
  const struct ggml_compute_params * params,
11640
11359
  const struct ggml_tensor * src0,
11641
- const struct ggml_tensor * src1,
11642
11360
  struct ggml_tensor * dst) {
11643
11361
  switch (src0->type) {
11644
11362
  case GGML_TYPE_F32:
11645
11363
  {
11646
- ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, 0);
11364
+ ggml_compute_forward_diag_mask_f32(params, src0, dst, 0);
11647
11365
  } break;
11648
11366
  default:
11649
11367
  {
@@ -11841,20 +11559,17 @@ static void ggml_compute_forward_soft_max_back(
11841
11559
  static void ggml_compute_forward_alibi_f32(
11842
11560
  const struct ggml_compute_params * params,
11843
11561
  const struct ggml_tensor * src0,
11844
- const struct ggml_tensor * src1,
11845
11562
  struct ggml_tensor * dst) {
11846
11563
  assert(params->ith == 0);
11847
11564
 
11848
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
11849
- GGML_ASSERT(ggml_nelements(src1) == 3);
11850
-
11851
11565
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11852
11566
  return;
11853
11567
  }
11854
11568
 
11855
- const int n_past = ((int32_t *) src1->data)[0];
11856
- const int n_head = ((int32_t *) src1->data)[1];
11857
- const float max_bias = ((float *) src1->data)[2];
11569
+ const int n_past = ((int32_t *) dst->op_params)[0];
11570
+ const int n_head = ((int32_t *) dst->op_params)[1];
11571
+ float max_bias;
11572
+ memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
11858
11573
 
11859
11574
  assert(n_past >= 0);
11860
11575
 
@@ -11907,20 +11622,17 @@ static void ggml_compute_forward_alibi_f32(
11907
11622
  static void ggml_compute_forward_alibi_f16(
11908
11623
  const struct ggml_compute_params * params,
11909
11624
  const struct ggml_tensor * src0,
11910
- const struct ggml_tensor * src1,
11911
11625
  struct ggml_tensor * dst) {
11912
11626
  assert(params->ith == 0);
11913
11627
 
11914
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
11915
- GGML_ASSERT(ggml_nelements(src1) == 3);
11916
-
11917
11628
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11918
11629
  return;
11919
11630
  }
11920
11631
 
11921
- const int n_past = ((int32_t *) src1->data)[0];
11922
- const int n_head = ((int32_t *) src1->data)[1];
11923
- const float max_bias = ((float *) src1->data)[2];
11632
+ const int n_past = ((int32_t *) dst->op_params)[0];
11633
+ const int n_head = ((int32_t *) dst->op_params)[1];
11634
+ float max_bias;
11635
+ memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
11924
11636
 
11925
11637
  assert(n_past >= 0);
11926
11638
 
@@ -11973,16 +11685,15 @@ static void ggml_compute_forward_alibi_f16(
11973
11685
  static void ggml_compute_forward_alibi(
11974
11686
  const struct ggml_compute_params * params,
11975
11687
  const struct ggml_tensor * src0,
11976
- const struct ggml_tensor * src1,
11977
11688
  struct ggml_tensor * dst) {
11978
11689
  switch (src0->type) {
11979
11690
  case GGML_TYPE_F16:
11980
11691
  {
11981
- ggml_compute_forward_alibi_f16(params, src0, src1, dst);
11692
+ ggml_compute_forward_alibi_f16(params, src0, dst);
11982
11693
  } break;
11983
11694
  case GGML_TYPE_F32:
11984
11695
  {
11985
- ggml_compute_forward_alibi_f32(params, src0, src1, dst);
11696
+ ggml_compute_forward_alibi_f32(params, src0, dst);
11986
11697
  } break;
11987
11698
  case GGML_TYPE_Q4_0:
11988
11699
  case GGML_TYPE_Q4_1:
@@ -12012,19 +11723,17 @@ static void ggml_compute_forward_alibi(
12012
11723
  static void ggml_compute_forward_clamp_f32(
12013
11724
  const struct ggml_compute_params * params,
12014
11725
  const struct ggml_tensor * src0,
12015
- const struct ggml_tensor * src1,
12016
11726
  struct ggml_tensor * dst) {
12017
11727
  assert(params->ith == 0);
12018
11728
 
12019
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
12020
- GGML_ASSERT(ggml_nelements(src1) == 2);
12021
-
12022
11729
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12023
11730
  return;
12024
11731
  }
12025
11732
 
12026
- const float min = ((float *) src1->data)[0];
12027
- const float max = ((float *) src1->data)[1];
11733
+ float min;
11734
+ float max;
11735
+ memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
11736
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
12028
11737
 
12029
11738
  const int ith = params->ith;
12030
11739
  const int nth = params->nth;
@@ -12054,12 +11763,11 @@ static void ggml_compute_forward_clamp_f32(
12054
11763
  static void ggml_compute_forward_clamp(
12055
11764
  const struct ggml_compute_params * params,
12056
11765
  const struct ggml_tensor * src0,
12057
- const struct ggml_tensor * src1,
12058
11766
  struct ggml_tensor * dst) {
12059
11767
  switch (src0->type) {
12060
11768
  case GGML_TYPE_F32:
12061
11769
  {
12062
- ggml_compute_forward_clamp_f32(params, src0, src1, dst);
11770
+ ggml_compute_forward_clamp_f32(params, src0, dst);
12063
11771
  } break;
12064
11772
  case GGML_TYPE_F16:
12065
11773
  case GGML_TYPE_Q4_0:
@@ -12089,10 +11797,7 @@ static void ggml_compute_forward_clamp(
12089
11797
  static void ggml_compute_forward_rope_f32(
12090
11798
  const struct ggml_compute_params * params,
12091
11799
  const struct ggml_tensor * src0,
12092
- const struct ggml_tensor * src1,
12093
11800
  struct ggml_tensor * dst) {
12094
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
12095
- GGML_ASSERT(ggml_nelements(src1) == 6);
12096
11801
 
12097
11802
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12098
11803
  return;
@@ -12101,12 +11806,12 @@ static void ggml_compute_forward_rope_f32(
12101
11806
  float freq_base;
12102
11807
  float freq_scale;
12103
11808
 
12104
- const int n_past = ((int32_t *) src1->data)[0];
12105
- const int n_dims = ((int32_t *) src1->data)[1];
12106
- const int mode = ((int32_t *) src1->data)[2];
12107
- const int n_ctx = ((int32_t *) src1->data)[3];
12108
- memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
12109
- memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
11809
+ const int n_past = ((int32_t *) dst->op_params)[0];
11810
+ const int n_dims = ((int32_t *) dst->op_params)[1];
11811
+ const int mode = ((int32_t *) dst->op_params)[2];
11812
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
11813
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
11814
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
12110
11815
 
12111
11816
  assert(n_past >= 0);
12112
11817
 
@@ -12221,10 +11926,7 @@ static void ggml_compute_forward_rope_f32(
12221
11926
  static void ggml_compute_forward_rope_f16(
12222
11927
  const struct ggml_compute_params * params,
12223
11928
  const struct ggml_tensor * src0,
12224
- const struct ggml_tensor * src1,
12225
11929
  struct ggml_tensor * dst) {
12226
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
12227
- GGML_ASSERT(ggml_nelements(src1) == 6);
12228
11930
 
12229
11931
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12230
11932
  return;
@@ -12233,12 +11935,12 @@ static void ggml_compute_forward_rope_f16(
12233
11935
  float freq_base;
12234
11936
  float freq_scale;
12235
11937
 
12236
- const int n_past = ((int32_t *) src1->data)[0];
12237
- const int n_dims = ((int32_t *) src1->data)[1];
12238
- const int mode = ((int32_t *) src1->data)[2];
12239
- const int n_ctx = ((int32_t *) src1->data)[3];
12240
- memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
12241
- memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
11938
+ const int n_past = ((int32_t *) dst->op_params)[0];
11939
+ const int n_dims = ((int32_t *) dst->op_params)[1];
11940
+ const int mode = ((int32_t *) dst->op_params)[2];
11941
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
11942
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
11943
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
12242
11944
 
12243
11945
  assert(n_past >= 0);
12244
11946
 
@@ -12353,16 +12055,15 @@ static void ggml_compute_forward_rope_f16(
12353
12055
  static void ggml_compute_forward_rope(
12354
12056
  const struct ggml_compute_params * params,
12355
12057
  const struct ggml_tensor * src0,
12356
- const struct ggml_tensor * src1,
12357
12058
  struct ggml_tensor * dst) {
12358
12059
  switch (src0->type) {
12359
12060
  case GGML_TYPE_F16:
12360
12061
  {
12361
- ggml_compute_forward_rope_f16(params, src0, src1, dst);
12062
+ ggml_compute_forward_rope_f16(params, src0, dst);
12362
12063
  } break;
12363
12064
  case GGML_TYPE_F32:
12364
12065
  {
12365
- ggml_compute_forward_rope_f32(params, src0, src1, dst);
12066
+ ggml_compute_forward_rope_f32(params, src0, dst);
12366
12067
  } break;
12367
12068
  default:
12368
12069
  {
@@ -12376,10 +12077,7 @@ static void ggml_compute_forward_rope(
12376
12077
  static void ggml_compute_forward_rope_back_f32(
12377
12078
  const struct ggml_compute_params * params,
12378
12079
  const struct ggml_tensor * src0,
12379
- const struct ggml_tensor * src1,
12380
12080
  struct ggml_tensor * dst) {
12381
- assert(src1->type == GGML_TYPE_I32);
12382
- assert(ggml_nelements(src1) == 4);
12383
12081
 
12384
12082
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12385
12083
  return;
@@ -12389,9 +12087,9 @@ static void ggml_compute_forward_rope_back_f32(
12389
12087
  // dx = rope_back(dy, src1)
12390
12088
  // src0 is dy, src1 contains options
12391
12089
 
12392
- const int n_past = ((int32_t *) src1->data)[0];
12393
- const int n_dims = ((int32_t *) src1->data)[1];
12394
- const int mode = ((int32_t *) src1->data)[2];
12090
+ const int n_past = ((int32_t *) dst->op_params)[0];
12091
+ const int n_dims = ((int32_t *) dst->op_params)[1];
12092
+ const int mode = ((int32_t *) dst->op_params)[2];
12395
12093
 
12396
12094
  assert(n_past >= 0);
12397
12095
 
@@ -12475,10 +12173,7 @@ static void ggml_compute_forward_rope_back_f32(
12475
12173
  static void ggml_compute_forward_rope_back_f16(
12476
12174
  const struct ggml_compute_params * params,
12477
12175
  const struct ggml_tensor * src0,
12478
- const struct ggml_tensor * src1,
12479
12176
  struct ggml_tensor * dst) {
12480
- assert(src1->type == GGML_TYPE_I32);
12481
- assert(ggml_nelements(src1) == 3);
12482
12177
 
12483
12178
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12484
12179
  return;
@@ -12488,9 +12183,9 @@ static void ggml_compute_forward_rope_back_f16(
12488
12183
  // dx = rope_back(dy, src1)
12489
12184
  // src0 is dy, src1 contains options
12490
12185
 
12491
- const int n_past = ((int32_t *) src1->data)[0];
12492
- const int n_dims = ((int32_t *) src1->data)[1];
12493
- const int mode = ((int32_t *) src1->data)[2];
12186
+ const int n_past = ((int32_t *) dst->op_params)[0];
12187
+ const int n_dims = ((int32_t *) dst->op_params)[1];
12188
+ const int mode = ((int32_t *) dst->op_params)[2];
12494
12189
 
12495
12190
  assert(n_past >= 0);
12496
12191
 
@@ -12574,16 +12269,15 @@ static void ggml_compute_forward_rope_back_f16(
12574
12269
  static void ggml_compute_forward_rope_back(
12575
12270
  const struct ggml_compute_params * params,
12576
12271
  const struct ggml_tensor * src0,
12577
- const struct ggml_tensor * src1,
12578
12272
  struct ggml_tensor * dst) {
12579
12273
  switch (src0->type) {
12580
12274
  case GGML_TYPE_F16:
12581
12275
  {
12582
- ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
12276
+ ggml_compute_forward_rope_back_f16(params, src0, dst);
12583
12277
  } break;
12584
12278
  case GGML_TYPE_F32:
12585
12279
  {
12586
- ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
12280
+ ggml_compute_forward_rope_back_f32(params, src0, dst);
12587
12281
  } break;
12588
12282
  default:
12589
12283
  {
@@ -12780,7 +12474,7 @@ static void ggml_compute_forward_conv_1d_s1_ph(
12780
12474
  const struct ggml_compute_params * params,
12781
12475
  const struct ggml_tensor * src0,
12782
12476
  const struct ggml_tensor * src1,
12783
- struct ggml_tensor * dst) {
12477
+ struct ggml_tensor * dst) {
12784
12478
  switch (src0->type) {
12785
12479
  case GGML_TYPE_F16:
12786
12480
  {
@@ -12983,7 +12677,7 @@ static void ggml_compute_forward_conv_1d_s2_ph(
12983
12677
  const struct ggml_compute_params * params,
12984
12678
  const struct ggml_tensor * src0,
12985
12679
  const struct ggml_tensor * src1,
12986
- struct ggml_tensor * dst) {
12680
+ struct ggml_tensor * dst) {
12987
12681
  switch (src0->type) {
12988
12682
  case GGML_TYPE_F16:
12989
12683
  {
@@ -13003,14 +12697,13 @@ static void ggml_compute_forward_conv_1d_s2_ph(
13003
12697
  // ggml_compute_forward_conv_1d
13004
12698
 
13005
12699
  static void ggml_compute_forward_conv_1d(
13006
- const struct ggml_compute_params * params,
13007
- const struct ggml_tensor * src0,
13008
- const struct ggml_tensor * src1,
13009
- const struct ggml_tensor * opt0,
13010
- struct ggml_tensor * dst) {
13011
- const int32_t s0 = ((const int32_t*)(opt0->data))[0];
13012
- const int32_t p0 = ((const int32_t*)(opt0->data))[1];
13013
- const int32_t d0 = ((const int32_t*)(opt0->data))[2];
12700
+ const struct ggml_compute_params * params,
12701
+ const struct ggml_tensor * src0,
12702
+ const struct ggml_tensor * src1,
12703
+ struct ggml_tensor * dst) {
12704
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12705
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
12706
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
13014
12707
  GGML_ASSERT(d0 == 1); // dilation not supported
13015
12708
  GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported
13016
12709
  if (s0 == 1) {
@@ -13028,7 +12721,6 @@ static void ggml_compute_forward_conv_2d_f16_f32(
13028
12721
  const struct ggml_compute_params * params,
13029
12722
  const struct ggml_tensor * src0,
13030
12723
  const struct ggml_tensor * src1,
13031
- const struct ggml_tensor * opt0,
13032
12724
  struct ggml_tensor * dst) {
13033
12725
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
13034
12726
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -13048,12 +12740,12 @@ static void ggml_compute_forward_conv_2d_f16_f32(
13048
12740
  // size of the convolution row - the kernel size unrolled across all channels
13049
12741
  const int ew0 = nk0*nk1*ne02;
13050
12742
 
13051
- const int32_t s0 = ((const int32_t*)(opt0->data))[0];
13052
- const int32_t s1 = ((const int32_t*)(opt0->data))[1];
13053
- const int32_t p0 = ((const int32_t*)(opt0->data))[2];
13054
- const int32_t p1 = ((const int32_t*)(opt0->data))[3];
13055
- const int32_t d0 = ((const int32_t*)(opt0->data))[4];
13056
- const int32_t d1 = ((const int32_t*)(opt0->data))[5];
12743
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12744
+ const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
12745
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
12746
+ const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
12747
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
12748
+ const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
13057
12749
 
13058
12750
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13059
12751
  GGML_ASSERT(nb10 == sizeof(float));
@@ -13125,17 +12817,15 @@ static void ggml_compute_forward_conv_2d(
13125
12817
  const struct ggml_compute_params * params,
13126
12818
  const struct ggml_tensor * src0,
13127
12819
  const struct ggml_tensor * src1,
13128
- const struct ggml_tensor * opt0,
13129
- struct ggml_tensor * dst
13130
- ) {
12820
+ struct ggml_tensor * dst) {
13131
12821
  switch (src0->type) {
13132
12822
  case GGML_TYPE_F16:
13133
12823
  {
13134
- ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst);
12824
+ ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst);
13135
12825
  } break;
13136
12826
  case GGML_TYPE_F32:
13137
12827
  {
13138
- //ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst);
12828
+ //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
13139
12829
  GGML_ASSERT(false);
13140
12830
  } break;
13141
12831
  default:
@@ -13200,12 +12890,11 @@ static void ggml_compute_forward_pool_1d_sk_p0(
13200
12890
  // ggml_compute_forward_pool_1d
13201
12891
 
13202
12892
  static void ggml_compute_forward_pool_1d(
13203
- const struct ggml_compute_params* params,
13204
- const struct ggml_tensor* src0,
13205
- const struct ggml_tensor* opt0,
13206
- struct ggml_tensor* dst) {
13207
- GGML_ASSERT(opt0->ne[0] == 4);
13208
- const int* opts = (const int*)opt0->data;
12893
+ const struct ggml_compute_params * params,
12894
+ const struct ggml_tensor * src0,
12895
+ struct ggml_tensor * dst) {
12896
+
12897
+ const int32_t* opts = (const int32_t*)dst->op_params;
13209
12898
  enum ggml_op_pool op = opts[0];
13210
12899
  const int k0 = opts[1];
13211
12900
  const int s0 = opts[2];
@@ -13219,12 +12908,12 @@ static void ggml_compute_forward_pool_1d(
13219
12908
  // ggml_compute_forward_pool_2d_sk_p0
13220
12909
 
13221
12910
  static void ggml_compute_forward_pool_2d_sk_p0(
13222
- const struct ggml_compute_params * params,
13223
- const enum ggml_op_pool op,
13224
- const struct ggml_tensor * src,
13225
- const int k0,
13226
- const int k1,
13227
- struct ggml_tensor * dst) {
12911
+ const struct ggml_compute_params * params,
12912
+ const enum ggml_op_pool op,
12913
+ const struct ggml_tensor * src,
12914
+ const int k0,
12915
+ const int k1,
12916
+ struct ggml_tensor * dst) {
13228
12917
  assert(src->type == GGML_TYPE_F32);
13229
12918
  assert(params->ith == 0);
13230
12919
 
@@ -13284,12 +12973,11 @@ static void ggml_compute_forward_pool_2d_sk_p0(
13284
12973
  // ggml_compute_forward_pool_2d
13285
12974
 
13286
12975
  static void ggml_compute_forward_pool_2d(
13287
- const struct ggml_compute_params * params,
13288
- const struct ggml_tensor * src0,
13289
- const struct ggml_tensor * opt0,
13290
- struct ggml_tensor * dst) {
13291
- GGML_ASSERT(opt0->ne[0] == 7);
13292
- const int* opts = (const int*)opt0->data;
12976
+ const struct ggml_compute_params * params,
12977
+ const struct ggml_tensor * src0,
12978
+ struct ggml_tensor * dst) {
12979
+
12980
+ const int32_t * opts = (const int32_t *)dst->op_params;
13293
12981
  enum ggml_op_pool op = opts[0];
13294
12982
  const int k0 = opts[1];
13295
12983
  const int k1 = opts[2];
@@ -13314,7 +13002,7 @@ static void ggml_compute_forward_flash_attn_f32(
13314
13002
  const struct ggml_tensor * k,
13315
13003
  const struct ggml_tensor * v,
13316
13004
  const bool masked,
13317
- struct ggml_tensor * dst) {
13005
+ struct ggml_tensor * dst) {
13318
13006
  int64_t t0 = ggml_perf_time_us();
13319
13007
  UNUSED(t0);
13320
13008
 
@@ -13492,7 +13180,7 @@ static void ggml_compute_forward_flash_attn_f16(
13492
13180
  const struct ggml_tensor * k,
13493
13181
  const struct ggml_tensor * v,
13494
13182
  const bool masked,
13495
- struct ggml_tensor * dst) {
13183
+ struct ggml_tensor * dst) {
13496
13184
  int64_t t0 = ggml_perf_time_us();
13497
13185
  UNUSED(t0);
13498
13186
 
@@ -14257,7 +13945,6 @@ static void ggml_compute_forward_flash_attn_back(
14257
13945
  static void ggml_compute_forward_win_part_f32(
14258
13946
  const struct ggml_compute_params * params,
14259
13947
  const struct ggml_tensor * src0,
14260
- const struct ggml_tensor * opt0,
14261
13948
  struct ggml_tensor * dst) {
14262
13949
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14263
13950
  return;
@@ -14266,9 +13953,9 @@ static void ggml_compute_forward_win_part_f32(
14266
13953
  GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
14267
13954
  GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
14268
13955
 
14269
- const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
14270
- const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
14271
- const int32_t w = ((const int32_t *)(opt0->data))[2];
13956
+ const int32_t nep0 = ((const int32_t *)(dst->op_params))[0];
13957
+ const int32_t nep1 = ((const int32_t *)(dst->op_params))[1];
13958
+ const int32_t w = ((const int32_t *)(dst->op_params))[2];
14272
13959
 
14273
13960
  assert(ne00 == ne0);
14274
13961
  assert(ne3 == nep0*nep1);
@@ -14302,12 +13989,11 @@ static void ggml_compute_forward_win_part_f32(
14302
13989
  static void ggml_compute_forward_win_part(
14303
13990
  const struct ggml_compute_params * params,
14304
13991
  const struct ggml_tensor * src0,
14305
- const struct ggml_tensor * opt0,
14306
13992
  struct ggml_tensor * dst) {
14307
13993
  switch (src0->type) {
14308
13994
  case GGML_TYPE_F32:
14309
13995
  {
14310
- ggml_compute_forward_win_part_f32(params, src0, opt0, dst);
13996
+ ggml_compute_forward_win_part_f32(params, src0, dst);
14311
13997
  } break;
14312
13998
  default:
14313
13999
  {
@@ -14321,7 +14007,6 @@ static void ggml_compute_forward_win_part(
14321
14007
  static void ggml_compute_forward_win_unpart_f32(
14322
14008
  const struct ggml_compute_params * params,
14323
14009
  const struct ggml_tensor * src0,
14324
- const struct ggml_tensor * opt0,
14325
14010
  struct ggml_tensor * dst) {
14326
14011
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14327
14012
  return;
@@ -14330,7 +14015,7 @@ static void ggml_compute_forward_win_unpart_f32(
14330
14015
  GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
14331
14016
  GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
14332
14017
 
14333
- const int32_t w = ((const int32_t *)(opt0->data))[0];
14018
+ const int32_t w = ((const int32_t *)(dst->op_params))[0];
14334
14019
 
14335
14020
  // padding
14336
14021
  const int px = (w - ne1%w)%w;
@@ -14364,12 +14049,67 @@ static void ggml_compute_forward_win_unpart_f32(
14364
14049
  static void ggml_compute_forward_win_unpart(
14365
14050
  const struct ggml_compute_params * params,
14366
14051
  const struct ggml_tensor * src0,
14367
- const struct ggml_tensor * opt0,
14368
14052
  struct ggml_tensor * dst) {
14369
14053
  switch (src0->type) {
14370
14054
  case GGML_TYPE_F32:
14371
14055
  {
14372
- ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst);
14056
+ ggml_compute_forward_win_unpart_f32(params, src0, dst);
14057
+ } break;
14058
+ default:
14059
+ {
14060
+ GGML_ASSERT(false);
14061
+ } break;
14062
+ }
14063
+ }
14064
+
14065
+ // ggml_compute_forward_unary
14066
+
14067
+ static void ggml_compute_forward_unary(
14068
+ const struct ggml_compute_params * params,
14069
+ const struct ggml_tensor * src0,
14070
+ struct ggml_tensor * dst) {
14071
+ const enum ggml_unary_op op = ggml_get_unary_op(dst);
14072
+
14073
+ switch (op) {
14074
+ case GGML_UNARY_OP_ABS:
14075
+ {
14076
+ ggml_compute_forward_abs(params, src0, dst);
14077
+ } break;
14078
+ case GGML_UNARY_OP_SGN:
14079
+ {
14080
+ ggml_compute_forward_sgn(params, src0, dst);
14081
+ } break;
14082
+ case GGML_UNARY_OP_NEG:
14083
+ {
14084
+ ggml_compute_forward_neg(params, src0, dst);
14085
+ } break;
14086
+ case GGML_UNARY_OP_STEP:
14087
+ {
14088
+ ggml_compute_forward_step(params, src0, dst);
14089
+ } break;
14090
+ case GGML_UNARY_OP_TANH:
14091
+ {
14092
+ ggml_compute_forward_tanh(params, src0, dst);
14093
+ } break;
14094
+ case GGML_UNARY_OP_ELU:
14095
+ {
14096
+ ggml_compute_forward_elu(params, src0, dst);
14097
+ } break;
14098
+ case GGML_UNARY_OP_RELU:
14099
+ {
14100
+ ggml_compute_forward_relu(params, src0, dst);
14101
+ } break;
14102
+ case GGML_UNARY_OP_GELU:
14103
+ {
14104
+ ggml_compute_forward_gelu(params, src0, dst);
14105
+ } break;
14106
+ case GGML_UNARY_OP_GELU_QUICK:
14107
+ {
14108
+ ggml_compute_forward_gelu_quick(params, src0, dst);
14109
+ } break;
14110
+ case GGML_UNARY_OP_SILU:
14111
+ {
14112
+ ggml_compute_forward_silu(params, src0, dst);
14373
14113
  } break;
14374
14114
  default:
14375
14115
  {
@@ -14888,7 +14628,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14888
14628
  } break;
14889
14629
  case GGML_OP_ACC:
14890
14630
  {
14891
- ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14631
+ ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor);
14892
14632
  } break;
14893
14633
  case GGML_OP_SUB:
14894
14634
  {
@@ -14938,46 +14678,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14938
14678
  {
14939
14679
  ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
14940
14680
  } break;
14941
- case GGML_OP_ABS:
14942
- {
14943
- ggml_compute_forward_abs(params, tensor->src[0], tensor);
14944
- } break;
14945
- case GGML_OP_SGN:
14946
- {
14947
- ggml_compute_forward_sgn(params, tensor->src[0], tensor);
14948
- } break;
14949
- case GGML_OP_NEG:
14950
- {
14951
- ggml_compute_forward_neg(params, tensor->src[0], tensor);
14952
- } break;
14953
- case GGML_OP_STEP:
14954
- {
14955
- ggml_compute_forward_step(params, tensor->src[0], tensor);
14956
- } break;
14957
- case GGML_OP_TANH:
14958
- {
14959
- ggml_compute_forward_tanh(params, tensor->src[0], tensor);
14960
- } break;
14961
- case GGML_OP_ELU:
14962
- {
14963
- ggml_compute_forward_elu(params, tensor->src[0], tensor);
14964
- } break;
14965
- case GGML_OP_RELU:
14966
- {
14967
- ggml_compute_forward_relu(params, tensor->src[0], tensor);
14968
- } break;
14969
- case GGML_OP_GELU:
14970
- {
14971
- ggml_compute_forward_gelu(params, tensor->src[0], tensor);
14972
- } break;
14973
- case GGML_OP_GELU_QUICK:
14974
- {
14975
- ggml_compute_forward_gelu_quick(params, tensor->src[0], tensor);
14976
- } break;
14977
- case GGML_OP_SILU:
14978
- {
14979
- ggml_compute_forward_silu(params, tensor->src[0], tensor);
14980
- } break;
14981
14681
  case GGML_OP_SILU_BACK:
14982
14682
  {
14983
14683
  ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
@@ -15008,7 +14708,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15008
14708
  } break;
15009
14709
  case GGML_OP_SET:
15010
14710
  {
15011
- ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14711
+ ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor);
15012
14712
  } break;
15013
14713
  case GGML_OP_CPY:
15014
14714
  {
@@ -15048,11 +14748,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15048
14748
  } break;
15049
14749
  case GGML_OP_DIAG_MASK_INF:
15050
14750
  {
15051
- ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor->src[1], tensor);
14751
+ ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor);
15052
14752
  } break;
15053
14753
  case GGML_OP_DIAG_MASK_ZERO:
15054
14754
  {
15055
- ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor->src[1], tensor);
14755
+ ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor);
15056
14756
  } break;
15057
14757
  case GGML_OP_SOFT_MAX:
15058
14758
  {
@@ -15064,39 +14764,39 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15064
14764
  } break;
15065
14765
  case GGML_OP_ROPE:
15066
14766
  {
15067
- ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor);
14767
+ ggml_compute_forward_rope(params, tensor->src[0], tensor);
15068
14768
  } break;
15069
14769
  case GGML_OP_ROPE_BACK:
15070
14770
  {
15071
- ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor);
14771
+ ggml_compute_forward_rope_back(params, tensor->src[0], tensor);
15072
14772
  } break;
15073
14773
  case GGML_OP_ALIBI:
15074
14774
  {
15075
- ggml_compute_forward_alibi(params, tensor->src[0], tensor->src[1], tensor);
14775
+ ggml_compute_forward_alibi(params, tensor->src[0], tensor);
15076
14776
  } break;
15077
14777
  case GGML_OP_CLAMP:
15078
14778
  {
15079
- ggml_compute_forward_clamp(params, tensor->src[0], tensor->src[1], tensor);
14779
+ ggml_compute_forward_clamp(params, tensor->src[0], tensor);
15080
14780
  } break;
15081
14781
  case GGML_OP_CONV_1D:
15082
14782
  {
15083
- ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14783
+ ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
15084
14784
  } break;
15085
14785
  case GGML_OP_CONV_2D:
15086
14786
  {
15087
- ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14787
+ ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
15088
14788
  } break;
15089
14789
  case GGML_OP_POOL_1D:
15090
14790
  {
15091
- ggml_compute_forward_pool_1d(params, tensor->src[0], tensor->src[1], tensor);
14791
+ ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
15092
14792
  } break;
15093
14793
  case GGML_OP_POOL_2D:
15094
14794
  {
15095
- ggml_compute_forward_pool_2d(params, tensor->src[0], tensor->src[1], tensor);
14795
+ ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
15096
14796
  } break;
15097
14797
  case GGML_OP_FLASH_ATTN:
15098
14798
  {
15099
- const int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
14799
+ const int32_t t = ggml_get_op_params_i32(tensor, 0);
15100
14800
  GGML_ASSERT(t == 0 || t == 1);
15101
14801
  const bool masked = t != 0;
15102
14802
  ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
@@ -15107,47 +14807,56 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15107
14807
  } break;
15108
14808
  case GGML_OP_FLASH_ATTN_BACK:
15109
14809
  {
15110
- int32_t t = ggml_get_i32_1d(tensor->src[4], 0);
14810
+ int32_t t = ggml_get_op_params_i32(tensor, 0);
15111
14811
  GGML_ASSERT(t == 0 || t == 1);
15112
14812
  bool masked = t != 0;
15113
14813
  ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
15114
14814
  } break;
15115
14815
  case GGML_OP_WIN_PART:
15116
14816
  {
15117
- ggml_compute_forward_win_part(params, tensor->src[0], tensor->src[2], tensor);
14817
+ ggml_compute_forward_win_part(params, tensor->src[0], tensor);
15118
14818
  } break;
15119
14819
  case GGML_OP_WIN_UNPART:
15120
14820
  {
15121
- ggml_compute_forward_win_unpart(params, tensor->src[0], tensor->src[2], tensor);
14821
+ ggml_compute_forward_win_unpart(params, tensor->src[0], tensor);
14822
+ } break;
14823
+ case GGML_OP_UNARY:
14824
+ {
14825
+ ggml_compute_forward_unary(params, tensor->src[0], tensor);
15122
14826
  } break;
15123
14827
  case GGML_OP_MAP_UNARY:
15124
14828
  {
15125
- const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->src[2]->data);
14829
+ ggml_unary_op_f32_t fun;
14830
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15126
14831
  ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun);
15127
14832
  }
15128
14833
  break;
15129
14834
  case GGML_OP_MAP_BINARY:
15130
14835
  {
15131
- const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->src[2]->data);
14836
+ ggml_binary_op_f32_t fun;
14837
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15132
14838
  ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
15133
14839
  }
15134
14840
  break;
15135
14841
  case GGML_OP_MAP_CUSTOM1:
15136
14842
  {
15137
- const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->src[2]->data);
14843
+ ggml_custom1_op_f32_t fun;
14844
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15138
14845
  ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun);
15139
14846
  }
15140
14847
  break;
15141
14848
  case GGML_OP_MAP_CUSTOM2:
15142
14849
  {
15143
- const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->src[2]->data);
14850
+ ggml_custom2_op_f32_t fun;
14851
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15144
14852
  ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun);
15145
14853
  }
15146
14854
  break;
15147
14855
  case GGML_OP_MAP_CUSTOM3:
15148
14856
  {
15149
- const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->src[2]->data);
15150
- ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[3], tensor, fun);
14857
+ ggml_custom3_op_f32_t fun;
14858
+ memcpy(&fun, tensor->op_params, sizeof(fun));
14859
+ ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
15151
14860
  }
15152
14861
  break;
15153
14862
  case GGML_OP_CROSS_ENTROPY_LOSS:
@@ -15211,12 +14920,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15211
14920
  src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
15212
14921
  }
15213
14922
  if (src1->grad) {
15214
- GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5);
15215
- GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32);
15216
- const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0];
15217
- const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1];
15218
- const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2];
15219
- const size_t offset = (( int32_t * ) tensor->src[2]->data)[3];
14923
+ const size_t nb1 = ((int32_t *) tensor->op_params)[0];
14924
+ const size_t nb2 = ((int32_t *) tensor->op_params)[1];
14925
+ const size_t nb3 = ((int32_t *) tensor->op_params)[2];
14926
+ const size_t offset = ((int32_t *) tensor->op_params)[3];
15220
14927
 
15221
14928
  struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
15222
14929
  tensor->grad,
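
The acc/set backward path uses the same mechanism for its view strides: the four values read back above are plain int32 slots of op_params rather than a separate five-element I32 tensor, which is why the old asserts on tensor->src[2] disappear. Producer-side sketch, with the layout taken from the reads above and variable names illustrative:

    const int32_t acc_params[4] = { (int32_t) nb1, (int32_t) nb2, (int32_t) nb3, (int32_t) offset };
    memcpy(tensor->op_params, acc_params, sizeof(acc_params));
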
@@ -15365,73 +15072,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15365
15072
  inplace);
15366
15073
  }
15367
15074
  } break;
15368
- case GGML_OP_ABS:
15369
- {
15370
- if (src0->grad) {
15371
- src0->grad =
15372
- ggml_add_impl(ctx,
15373
- src0->grad,
15374
- ggml_mul(ctx,
15375
- ggml_sgn(ctx, src0),
15376
- tensor->grad),
15377
- inplace);
15378
- }
15379
- } break;
15380
- case GGML_OP_SGN:
15381
- {
15382
- if (src0->grad) {
15383
- // noop
15384
- }
15385
- } break;
15386
- case GGML_OP_NEG:
15387
- {
15388
- if (src0->grad) {
15389
- src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace);
15390
- }
15391
- } break;
15392
- case GGML_OP_STEP:
15393
- {
15394
- if (src0->grad) {
15395
- // noop
15396
- }
15397
- } break;
15398
- case GGML_OP_TANH:
15399
- {
15400
- GGML_ASSERT(false); // TODO: not implemented
15401
- } break;
15402
- case GGML_OP_ELU:
15403
- {
15404
- GGML_ASSERT(false); // TODO: not implemented
15405
- } break;
15406
- case GGML_OP_RELU:
15407
- {
15408
- if (src0->grad) {
15409
- src0->grad = ggml_sub_impl(ctx,
15410
- src0->grad,
15411
- ggml_mul(ctx,
15412
- ggml_step(ctx, src0),
15413
- tensor->grad),
15414
- inplace);
15415
- }
15416
- } break;
15417
- case GGML_OP_GELU:
15418
- {
15419
- GGML_ASSERT(false); // TODO: not implemented
15420
- } break;
15421
- case GGML_OP_GELU_QUICK:
15422
- {
15423
- GGML_ASSERT(false); // TODO: not implemented
15424
- } break;
15425
- case GGML_OP_SILU:
15426
- {
15427
- // necessary for llama
15428
- if (src0->grad) {
15429
- src0->grad = ggml_add_impl(ctx,
15430
- src0->grad,
15431
- ggml_silu_back(ctx, src0, tensor->grad),
15432
- inplace);
15433
- }
15434
- } break;
15435
15075
  case GGML_OP_SILU_BACK:
15436
15076
  {
15437
15077
  GGML_ASSERT(false); // TODO: not implemented
@@ -15524,12 +15164,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15524
15164
  } break;
15525
15165
  case GGML_OP_SET:
15526
15166
  {
15527
- GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5);
15528
- GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32);
15529
- const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0];
15530
- const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1];
15531
- const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2];
15532
- const size_t offset = (( int32_t * ) tensor->src[2]->data)[3];
15167
+ const size_t nb1 = ((int32_t *) tensor->op_params)[0];
15168
+ const size_t nb2 = ((int32_t *) tensor->op_params)[1];
15169
+ const size_t nb3 = ((int32_t *) tensor->op_params)[2];
15170
+ const size_t offset = ((int32_t *) tensor->op_params)[3];
15533
15171
 
15534
15172
  struct ggml_tensor * tensor_grad_view = NULL;
15535
15173
 
@@ -15606,8 +15244,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15606
15244
  if (src0->grad) {
15607
15245
  size_t offset;
15608
15246
 
15609
- GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->src[2]));
15610
- memcpy(&offset, tensor->src[2]->data, sizeof(offset));
15247
+ memcpy(&offset, tensor->op_params, sizeof(offset));
15611
15248
 
15612
15249
  size_t nb1 = tensor->nb[1];
15613
15250
  size_t nb2 = tensor->nb[2];
@@ -15634,7 +15271,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15634
15271
  {
15635
15272
  // necessary for llama
15636
15273
  if (src0->grad) {
15637
- int32_t * axes = (int32_t *) tensor->src[2]->data;
15274
+ int32_t * axes = (int32_t *) tensor->op_params;
15638
15275
  int axis0 = axes[0] & 0x3;
15639
15276
  int axis1 = axes[1] & 0x3;
15640
15277
  int axis2 = axes[2] & 0x3;
@@ -15690,33 +15327,23 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15690
15327
  {
15691
15328
  // necessary for llama
15692
15329
  if (src0->grad) {
15693
- assert(src1->type == GGML_TYPE_I32);
15694
- assert(ggml_nelements(src1) == 2);
15695
- const int n_past = ((int32_t *) src1->data)[0];
15330
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15696
15331
  src0->grad =
15697
15332
  ggml_add_impl(ctx, src0->grad,
15698
15333
  ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
15699
15334
  inplace);
15700
15335
  }
15701
- if (src1->grad) {
15702
- // noop
15703
- }
15704
15336
  } break;
15705
15337
  case GGML_OP_DIAG_MASK_ZERO:
15706
15338
  {
15707
15339
  // necessary for llama
15708
15340
  if (src0->grad) {
15709
- assert(src1->type == GGML_TYPE_I32);
15710
- assert(ggml_nelements(src1) == 2);
15711
- const int n_past = ((int32_t *) src1->data)[0];
15341
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15712
15342
  src0->grad =
15713
15343
  ggml_add_impl(ctx, src0->grad,
15714
15344
  ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
15715
15345
  inplace);
15716
15346
  }
15717
- if (src1->grad) {
15718
- // noop
15719
- }
15720
15347
  } break;
15721
15348
  case GGML_OP_SOFT_MAX:
15722
15349
  {
@@ -15737,12 +15364,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15737
15364
  {
15738
15365
  // necessary for llama
15739
15366
  if (src0->grad) {
15740
- assert(src1->type == GGML_TYPE_I32);
15741
- assert(ggml_nelements(src1) == 6);
15742
- const int n_past = ((int32_t *) src1->data)[0];
15743
- const int n_dims = ((int32_t *) src1->data)[1];
15744
- const int mode = ((int32_t *) src1->data)[2];
15745
- const int n_ctx = ((int32_t *) src1->data)[3];
15367
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15368
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
15369
+ const int mode = ((int32_t *) tensor->op_params)[2];
15370
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
15746
15371
  src0->grad = ggml_add_impl(ctx,
15747
15372
  src0->grad,
15748
15373
  ggml_rope_back(ctx,
@@ -15753,19 +15378,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15753
15378
  n_ctx),
15754
15379
  inplace);
15755
15380
  }
15756
- if (src1->grad) {
15757
- // noop
15758
- }
15759
15381
  } break;
15760
15382
  case GGML_OP_ROPE_BACK:
15761
15383
  {
15762
15384
  if (src0->grad) {
15763
- assert(src1->type == GGML_TYPE_I32);
15764
- assert(ggml_nelements(src1) == 4);
15765
- const int n_past = ((int32_t *) src1->data)[0];
15766
- const int n_dims = ((int32_t *) src1->data)[1];
15767
- const int mode = ((int32_t *) src1->data)[2];
15768
- const int n_ctx = ((int32_t *) src1->data)[3];
15385
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15386
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
15387
+ const int mode = ((int32_t *) tensor->op_params)[2];
15388
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
15769
15389
  src0->grad = ggml_add_impl(ctx,
15770
15390
  src0->grad,
15771
15391
  ggml_rope(ctx,
@@ -15776,9 +15396,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15776
15396
  n_ctx),
15777
15397
  inplace);
15778
15398
  }
15779
- if (src1->grad) {
15780
- // noop
15781
- }
15782
15399
  } break;
15783
15400
  case GGML_OP_ALIBI:
15784
15401
  {
@@ -15808,7 +15425,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15808
15425
  {
15809
15426
  struct ggml_tensor * flash_grad = NULL;
15810
15427
  if (src0->grad || src1->grad || tensor->src[2]->grad) {
15811
- int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
15428
+ int32_t t = ggml_get_op_params_i32(tensor, 0);
15812
15429
  GGML_ASSERT(t == 0 || t == 1);
15813
15430
  bool masked = t != 0;
15814
15431
  flash_grad =
@@ -15971,6 +15588,80 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15971
15588
  } break;
15972
15589
  case GGML_OP_WIN_PART:
15973
15590
  case GGML_OP_WIN_UNPART:
15591
+ case GGML_OP_UNARY:
15592
+ {
15593
+ switch (ggml_get_unary_op(tensor)) {
15594
+ case GGML_UNARY_OP_ABS:
15595
+ {
15596
+ if (src0->grad) {
15597
+ src0->grad =
15598
+ ggml_add_impl(ctx,
15599
+ src0->grad,
15600
+ ggml_mul(ctx,
15601
+ ggml_sgn(ctx, src0),
15602
+ tensor->grad),
15603
+ inplace);
15604
+ }
15605
+ } break;
15606
+ case GGML_UNARY_OP_SGN:
15607
+ {
15608
+ if (src0->grad) {
15609
+ // noop
15610
+ }
15611
+ } break;
15612
+ case GGML_UNARY_OP_NEG:
15613
+ {
15614
+ if (src0->grad) {
15615
+ src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace);
15616
+ }
15617
+ } break;
15618
+ case GGML_UNARY_OP_STEP:
15619
+ {
15620
+ if (src0->grad) {
15621
+ // noop
15622
+ }
15623
+ } break;
15624
+ case GGML_UNARY_OP_TANH:
15625
+ {
15626
+ GGML_ASSERT(false); // TODO: not implemented
15627
+ } break;
15628
+ case GGML_UNARY_OP_ELU:
15629
+ {
15630
+ GGML_ASSERT(false); // TODO: not implemented
15631
+ } break;
15632
+ case GGML_UNARY_OP_RELU:
15633
+ {
15634
+ if (src0->grad) {
15635
+ src0->grad = ggml_add_impl(ctx,
15636
+ src0->grad,
15637
+ ggml_mul(ctx,
15638
+ ggml_step(ctx, src0),
15639
+ tensor->grad),
15640
+ inplace);
15641
+ }
15642
+ } break;
15643
+ case GGML_UNARY_OP_GELU:
15644
+ {
15645
+ GGML_ASSERT(false); // TODO: not implemented
15646
+ } break;
15647
+ case GGML_UNARY_OP_GELU_QUICK:
15648
+ {
15649
+ GGML_ASSERT(false); // TODO: not implemented
15650
+ } break;
15651
+ case GGML_UNARY_OP_SILU:
15652
+ {
15653
+ // necessary for llama
15654
+ if (src0->grad) {
15655
+ src0->grad = ggml_add_impl(ctx,
15656
+ src0->grad,
15657
+ ggml_silu_back(ctx, src0, tensor->grad),
15658
+ inplace);
15659
+ }
15660
+ } break;
15661
+ default:
15662
+ GGML_ASSERT(false);
15663
+ }
15664
+ } break;
15974
15665
  case GGML_OP_MAP_UNARY:
15975
15666
  case GGML_OP_MAP_BINARY:
15976
15667
  case GGML_OP_MAP_CUSTOM1:
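
The per-activation backward rules deleted earlier reappear here, keyed by ggml_get_unary_op() instead of dedicated ggml_op values. A plausible reading of that accessor, assuming the unary sub-op sits in the first i32 slot of op_params (the same slot ggml_get_op_params_i32(tensor, 0) reads elsewhere in this diff):

    static enum ggml_unary_op example_get_unary_op(const struct ggml_tensor * t) {
        return (enum ggml_unary_op) ((const int32_t *) t->op_params)[0];
    }
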
@@ -16006,6 +15697,34 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
16006
15697
  }
16007
15698
  }
16008
15699
 
15700
+ static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small");
15701
+
15702
+ static size_t hash(void * p) {
15703
+ return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
15704
+ }
15705
+
15706
+ static bool hash_insert(void * hash_table[], void * p) {
15707
+ size_t h = hash(p);
15708
+
15709
+ // linear probing
15710
+ size_t i = h;
15711
+ while (hash_table[i] != NULL && hash_table[i] != p) {
15712
+ i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
15713
+ if (i == h) {
15714
+ // hash table is full
15715
+ GGML_ASSERT(false);
15716
+ }
15717
+ }
15718
+
15719
+ if (hash_table[i] == p) {
15720
+ return true;
15721
+ }
15722
+
15723
+ // insert
15724
+ hash_table[i] = p;
15725
+ return false;
15726
+ }
15727
+
16009
15728
  static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
16010
15729
  if (node->grad == NULL) {
16011
15730
  // this usually happens when we generate intermediate nodes from constants in the backward pass
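
hash_insert() above returns true when the pointer is already present and false on first insertion, so its return value doubles as a "seen before" test; the static_assert keeps the table roughly half empty so the linear probe stays short. Usage sketch under those assumptions:

    static void * visited[GGML_GRAPH_HASHTABLE_SIZE];   // zero-initialized, as the probe expects

    if (hash_insert(visited, (void *) node)) {
        return;                                          // already recorded, nothing to do
    }
    // first time we see this tensor: recurse into its sources, append it to the graph, ...
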
@@ -16016,16 +15735,8 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
16016
15735
  }
16017
15736
 
16018
15737
  // check if already visited
16019
- for (int i = 0; i < cgraph->n_nodes; i++) {
16020
- if (cgraph->nodes[i] == node) {
16021
- return;
16022
- }
16023
- }
16024
-
16025
- for (int i = 0; i < cgraph->n_leafs; i++) {
16026
- if (cgraph->leafs[i] == node) {
16027
- return;
16028
- }
15738
+ if (hash_insert(cgraph->visited_hash_table, node)) {
15739
+ return;
16029
15740
  }
16030
15741
 
16031
15742
  for (int i = 0; i < GGML_MAX_SRC; ++i) {
@@ -16088,6 +15799,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
16088
15799
  /*.nodes =*/ { NULL },
16089
15800
  /*.grads =*/ { NULL },
16090
15801
  /*.leafs =*/ { NULL },
15802
+ /*.hash_table =*/ { NULL },
16091
15803
  /*.perf_runs =*/ 0,
16092
15804
  /*.perf_cycles =*/ 0,
16093
15805
  /*.perf_time_us =*/ 0,
@@ -16129,13 +15841,42 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
16129
15841
 
16130
15842
  if (node->is_param) {
16131
15843
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
16132
- ggml_build_forward_impl(&result, node->grad, true);
15844
+ ggml_build_forward_expand(&result, node->grad);
16133
15845
  }
16134
15846
  }
16135
15847
 
16136
15848
  return result;
16137
15849
  }
16138
15850
 
15851
+ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15852
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
15853
+ struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
15854
+
15855
+ *cgraph = (struct ggml_cgraph) {
15856
+ /*.n_nodes =*/ 0,
15857
+ /*.n_leafs =*/ 0,
15858
+ /*.nodes =*/ { NULL },
15859
+ /*.grads =*/ { NULL },
15860
+ /*.leafs =*/ { NULL },
15861
+ /*.hash_table =*/ { NULL },
15862
+ /*.perf_runs =*/ 0,
15863
+ /*.perf_cycles =*/ 0,
15864
+ /*.perf_time_us =*/ 0,
15865
+ };
15866
+
15867
+ return cgraph;
15868
+ }
15869
+
15870
+ struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
15871
+ struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
15872
+ ggml_build_forward_impl(cgraph, tensor, false);
15873
+ return cgraph;
15874
+ }
15875
+
15876
+ size_t ggml_graph_overhead(void) {
15877
+ return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
15878
+ }
15879
+
16139
15880
  //
16140
15881
  // thread data
16141
15882
  //
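
ggml_new_graph() places the graph object itself inside the context (as a GGML_OBJECT_GRAPH allocation), and ggml_graph_overhead() reports what that costs, so a caller can size the context up front. Usage sketch assuming the usual ggml_init()/ggml_init_params entry point; compute_buf_size and out are placeholders:

    struct ggml_init_params ip = {
        /*.mem_size   =*/ compute_buf_size + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx0 = ggml_init(ip);
    struct ggml_cgraph  * gf   = ggml_build_forward_ctx(ctx0, out);   // graph lives in ctx0's buffer
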
@@ -16201,7 +15942,7 @@ typedef pthread_t ggml_thread_t;
16201
15942
 
16202
15943
  // Android's libc implementation "bionic" does not support setting affinity
16203
15944
  #if defined(__linux__) && !defined(__BIONIC__)
16204
- void set_numa_thread_affinity(int thread_n, int n_threads) {
15945
+ static void set_numa_thread_affinity(int thread_n, int n_threads) {
16205
15946
  if (!ggml_is_numa()) {
16206
15947
  return;
16207
15948
  }
@@ -16226,7 +15967,7 @@ void set_numa_thread_affinity(int thread_n, int n_threads) {
16226
15967
  CPU_FREE(cpus);
16227
15968
  }
16228
15969
 
16229
- void clear_numa_thread_affinity(void) {
15970
+ static void clear_numa_thread_affinity(void) {
16230
15971
  if (!ggml_is_numa()) {
16231
15972
  return;
16232
15973
  }
@@ -16250,8 +15991,8 @@ void clear_numa_thread_affinity(void) {
16250
15991
  #else
16251
15992
  // TODO: Windows etc.
16252
15993
  // (the linux implementation may also work on BSD, someone should test)
16253
- void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
16254
- void clear_numa_thread_affinity(void) {}
15994
+ static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
15995
+ static void clear_numa_thread_affinity(void) {}
16255
15996
  #endif
16256
15997
 
16257
15998
  struct ggml_compute_state_shared {
@@ -16463,21 +16204,34 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16463
16204
  case GGML_OP_ARGMAX:
16464
16205
  case GGML_OP_REPEAT:
16465
16206
  case GGML_OP_REPEAT_BACK:
16466
- case GGML_OP_ABS:
16467
- case GGML_OP_SGN:
16468
- case GGML_OP_NEG:
16469
- case GGML_OP_STEP:
16470
- case GGML_OP_TANH:
16471
- case GGML_OP_ELU:
16472
- case GGML_OP_RELU:
16473
- {
16207
+ {
16474
16208
  n_tasks = 1;
16475
16209
  } break;
16476
- case GGML_OP_MUL:
16477
- case GGML_OP_GELU:
16478
- case GGML_OP_GELU_QUICK:
16479
- case GGML_OP_SILU:
16210
+
16211
+ case GGML_OP_UNARY:
16212
+ {
16213
+ switch (ggml_get_unary_op(node)) {
16214
+ case GGML_UNARY_OP_ABS:
16215
+ case GGML_UNARY_OP_SGN:
16216
+ case GGML_UNARY_OP_NEG:
16217
+ case GGML_UNARY_OP_STEP:
16218
+ case GGML_UNARY_OP_TANH:
16219
+ case GGML_UNARY_OP_ELU:
16220
+ case GGML_UNARY_OP_RELU:
16221
+ {
16222
+ n_tasks = 1;
16223
+ } break;
16224
+
16225
+ case GGML_UNARY_OP_GELU:
16226
+ case GGML_UNARY_OP_GELU_QUICK:
16227
+ case GGML_UNARY_OP_SILU:
16228
+ {
16229
+ n_tasks = n_threads;
16230
+ } break;
16231
+ }
16232
+ } break;
16480
16233
  case GGML_OP_SILU_BACK:
16234
+ case GGML_OP_MUL:
16481
16235
  case GGML_OP_NORM:
16482
16236
  case GGML_OP_RMS_NORM:
16483
16237
  case GGML_OP_RMS_NORM_BACK:
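
In the planner, the cheap element-wise unaries keep n_tasks = 1 while GELU, GELU_QUICK and SILU get n_tasks = n_threads. Inside the kernels, ggml's usual convention is for each of those tasks to take a contiguous slice of rows, roughly as below (a sketch of the typical pattern, not quoted from this diff; ith/nth are the task index and task count):

    const int nr  = ggml_nrows(src0);        // total rows to process
    const int dr  = (nr + nth - 1) / nth;    // rows per task, rounded up
    const int ir0 = dr * ith;                // this task's first row
    const int ir1 = MIN(ir0 + dr, nr);       // one past its last row
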
@@ -16542,10 +16296,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16542
16296
  case GGML_OP_GET_ROWS:
16543
16297
  case GGML_OP_GET_ROWS_BACK:
16544
16298
  case GGML_OP_DIAG:
16545
- case GGML_OP_DIAG_MASK_ZERO:
16546
16299
  {
16547
16300
  n_tasks = 1;
16548
16301
  } break;
16302
+ case GGML_OP_DIAG_MASK_ZERO:
16549
16303
  case GGML_OP_DIAG_MASK_INF:
16550
16304
  case GGML_OP_SOFT_MAX:
16551
16305
  case GGML_OP_SOFT_MAX_BACK:
@@ -16838,10 +16592,9 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
16838
16592
  void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
16839
16593
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
16840
16594
 
16841
- struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
16842
- GGML_ASSERT(buf);
16595
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
16843
16596
 
16844
- cplan.work_data = buf->data;
16597
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
16845
16598
 
16846
16599
  ggml_graph_compute(cgraph, &cplan);
16847
16600
  }
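
ggml_graph_compute_with_ctx() now carves its scratch space out of the context as a GGML_OBJECT_WORK_BUFFER object instead of allocating a throwaway I8 tensor. The caller-managed equivalent, using only the cplan API visible in this hunk (malloc stands in for any allocator):

    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);

    uint8_t * work = malloc(cplan.work_size);   // any buffer of cplan.work_size bytes
    cplan.work_data = work;

    ggml_graph_compute(cgraph, &cplan);
    free(work);
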
@@ -16992,7 +16745,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16992
16745
  fwrite(&nb, sizeof(uint64_t), 1, fout);
16993
16746
  }
16994
16747
 
16995
- fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16748
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16749
+ fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
16996
16750
 
16997
16751
  // dump the data
16998
16752
  // TODO: pad this to 32 byte boundary
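
With this change (and the matching one for node records below), every tensor record in the exported file carries its op_params immediately after the name, so readers written against the old layout will mis-align. As far as the fwrite calls around this hunk show, a record now ends like this (sketch):

    //   ne[i], nb[i] per dimension     (uint64_t each)
    //   name                           (GGML_MAX_NAME bytes)
    //   op_params                      (GGML_MAX_OP_PARAMS bytes)   <-- new field
    //   tensor data (leafs) / op argument indices (nodes)
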
@@ -17025,7 +16779,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
17025
16779
  fwrite(&nb, sizeof(uint64_t), 1, fout);
17026
16780
  }
17027
16781
 
17028
- fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16782
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16783
+ fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
17029
16784
 
17030
16785
  // output the op arguments
17031
16786
  {
@@ -17206,7 +16961,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17206
16961
 
17207
16962
  tensor->op = (enum ggml_op) op;
17208
16963
 
17209
- memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
16964
+ memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
16965
+ memcpy(tensor->op_params, ptr, GGML_MAX_OP_PARAMS); ptr += GGML_MAX_OP_PARAMS;
17210
16966
 
17211
16967
  tensor->data = (void *) ptr;
17212
16968
 
@@ -17251,7 +17007,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17251
17007
  nb[j] = nb_cur;
17252
17008
  }
17253
17009
 
17254
- const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
17010
+ const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
17011
+ const char * ptr_op_params = ptr; ptr += GGML_MAX_OP_PARAMS;
17255
17012
 
17256
17013
  const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t);
17257
17014
 
@@ -17288,8 +17045,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17288
17045
  {
17289
17046
  tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
17290
17047
 
17291
- uint64_t offs;
17292
- memcpy(&offs, args[2]->data, sizeof(offs));
17048
+ size_t offs;
17049
+ memcpy(&offs, ptr_op_params, sizeof(offs));
17293
17050
 
17294
17051
  tensor->data = ((char *) tensor->data) + offs;
17295
17052
  } break;
@@ -17309,7 +17066,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17309
17066
  } break;
17310
17067
  }
17311
17068
 
17312
- memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
17069
+ memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
17070
+ memcpy(tensor->op_params, ptr_op_params, GGML_MAX_OP_PARAMS);
17313
17071
 
17314
17072
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
17315
17073
  tensor->nb[j] = nb[j];
@@ -17343,7 +17101,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17343
17101
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
17344
17102
  i,
17345
17103
  node->ne[0], node->ne[1], node->ne[2],
17346
- GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
17104
+ ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
17347
17105
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
17348
17106
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
17349
17107
  (double) node->perf_time_us / 1000.0,
@@ -17357,7 +17115,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17357
17115
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
17358
17116
  i,
17359
17117
  node->ne[0], node->ne[1],
17360
- GGML_OP_NAME[node->op]);
17118
+ ggml_op_name(node->op));
17361
17119
  }
17362
17120
 
17363
17121
  for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -17365,7 +17123,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17365
17123
  continue;
17366
17124
  }
17367
17125
 
17368
- GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
17126
+ GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0);
17369
17127
  }
17370
17128
 
17371
17129
  GGML_PRINT("========================================\n");
@@ -17459,13 +17217,13 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17459
17217
  }
17460
17218
 
17461
17219
  if (node->n_dims == 2) {
17462
- fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], GGML_OP_SYMBOL[node->op]);
17220
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
17463
17221
  } else {
17464
- fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
17222
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
17465
17223
  }
17466
17224
 
17467
17225
  if (node->grad) {
17468
- fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
17226
+ fprintf(fp, " | <g>%s\"; ]\n", ggml_op_symbol(node->grad->op));
17469
17227
  } else {
17470
17228
  fprintf(fp, "\"; ]\n");
17471
17229
  }