llama_cpp 0.3.4 → 0.3.5

@@ -3440,7 +3440,9 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
3440
3440
 
3441
3441
  //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
3442
3442
  inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
3443
- #if defined(GGML_SIMD)
3443
+ #if defined(GGML_USE_ACCELERATE)
3444
+ vDSP_vsmul(y, 1, &v, y, 1, n);
3445
+ #elif defined(GGML_SIMD)
3444
3446
  const int np = (n & ~(GGML_F32_STEP - 1));
3445
3447
 
3446
3448
  GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
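Note: with unit strides, Accelerate's vDSP_vsmul computes y[i] = y[i] * v for i in [0, n), i.e. the same result as the commented-out scalar variant above. A minimal scalar equivalent (illustrative only, not part of the diff):

    // same effect as vDSP_vsmul(y, 1, &v, y, 1, n): scale y in place by v
    for (int i = 0; i < n; ++i) {
        y[i] *= v;
    }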
@@ -3603,7 +3605,7 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
3603
3605
  #endif
3604
3606
  }
3605
3607
 
3606
- inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x) {
3608
+ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
3607
3609
  ggml_float sum = 0.0;
3608
3610
  for (int i = 0; i < n; ++i) {
3609
3611
  sum += (ggml_float)x[i];
@@ -3611,6 +3613,14 @@ inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x
3611
3613
  *s = sum;
3612
3614
  }
3613
3615
 
3616
+ inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
3617
+ float sum = 0.0f;
3618
+ for (int i = 0; i < n; ++i) {
3619
+ sum += GGML_FP16_TO_FP32(x[i]);
3620
+ }
3621
+ *s = sum;
3622
+ }
3623
+
3614
3624
  inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
3615
3625
  #ifndef GGML_USE_ACCELERATE
3616
3626
  float max = -INFINITY;
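Note: the new fp16 sum converts each element with GGML_FP16_TO_FP32 before adding, so the running total lives in float precision; for example, summing 70 000 ones would overflow an fp16 accumulator (fp16 max is 65504) but stays exact in the float accumulator used here.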
@@ -3750,16 +3760,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3750
3760
  "ARGMAX",
3751
3761
  "REPEAT",
3752
3762
  "REPEAT_BACK",
3753
- "ABS",
3754
- "SGN",
3755
- "NEG",
3756
- "STEP",
3757
- "TANH",
3758
- "ELU",
3759
- "RELU",
3760
- "GELU",
3761
- "GELU_QUICK",
3762
- "SILU",
3763
3763
  "SILU_BACK",
3764
3764
  "NORM",
3765
3765
  "RMS_NORM",
@@ -3798,6 +3798,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3798
3798
  "WIN_PART",
3799
3799
  "WIN_UNPART",
3800
3800
 
3801
+ "UNARY",
3802
+
3801
3803
  "MAP_UNARY",
3802
3804
  "MAP_BINARY",
3803
3805
 
@@ -3809,7 +3811,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3809
3811
  "CROSS_ENTROPY_LOSS_BACK",
3810
3812
  };
3811
3813
 
3812
- static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
3814
+ static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
3813
3815
 
3814
3816
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3815
3817
  "none",
@@ -3830,16 +3832,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3830
3832
  "argmax(x)",
3831
3833
  "repeat(x)",
3832
3834
  "repeat_back(x)",
3833
- "abs(x)",
3834
- "sgn(x)",
3835
- "-x",
3836
- "step(x)",
3837
- "tanh(x)",
3838
- "elu(x)",
3839
- "relu(x)",
3840
- "gelu(x)",
3841
- "gelu_quick(x)",
3842
- "silu(x)",
3843
3835
  "silu_back(x)",
3844
3836
  "norm(x)",
3845
3837
  "rms_norm(x)",
@@ -3878,6 +3870,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3878
3870
  "win_part(x)",
3879
3871
  "win_unpart(x)",
3880
3872
 
3873
+ "unary(x)",
3874
+
3881
3875
  "f(x)",
3882
3876
  "f(x,y)",
3883
3877
 
@@ -3889,7 +3883,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3889
3883
  "cross_entropy_loss_back(x,y)",
3890
3884
  };
3891
3885
 
3892
- static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
3886
+ static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
3893
3887
 
3894
3888
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
3895
3889
 
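Note: the updated assert value follows from the table edits above: ten dedicated unary entries (ABS, SGN, NEG, STEP, TANH, ELU, RELU, GELU, GELU_QUICK, SILU) are removed and a single UNARY entry is added, so the op count drops from 68 to 68 - 10 + 1 = 59.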
@@ -4077,8 +4071,8 @@ bool ggml_is_numa(void) {
4077
4071
  ////////////////////////////////////////////////////////////////////////////////
4078
4072
 
4079
4073
  void ggml_print_object(const struct ggml_object * obj) {
4080
- GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n",
4081
- obj->offs, obj->size, (const void *) obj->next);
4074
+ GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
4075
+ obj->type, obj->offs, obj->size, (const void *) obj->next);
4082
4076
  }
4083
4077
 
4084
4078
  void ggml_print_objects(const struct ggml_context * ctx) {
@@ -4145,6 +4139,10 @@ const char * ggml_op_name(enum ggml_op op) {
4145
4139
  return GGML_OP_NAME[op];
4146
4140
  }
4147
4141
 
4142
+ const char * ggml_op_symbol(enum ggml_op op) {
4143
+ return GGML_OP_SYMBOL[op];
4144
+ }
4145
+
4148
4146
  size_t ggml_element_size(const struct ggml_tensor * tensor) {
4149
4147
  return GGML_TYPE_SIZE[tensor->type];
4150
4148
  }
@@ -4214,7 +4212,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
4214
4212
  }
4215
4213
 
4216
4214
  size_t ggml_tensor_overhead(void) {
4217
- return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
4215
+ return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
4218
4216
  }
4219
4217
 
4220
4218
  bool ggml_is_transposed(const struct ggml_tensor * tensor) {
@@ -4231,6 +4229,15 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
4231
4229
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
4232
4230
  }
4233
4231
 
4232
+ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) {
4233
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4234
+
4235
+ return
4236
+ tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
4237
+ tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
4238
+ tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
4239
+ }
4240
+
4234
4241
  bool ggml_is_permuted(const struct ggml_tensor * tensor) {
4235
4242
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4236
4243
 
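Note: the new helper checks the same strides as ggml_is_contiguous for dims 0, 2 and 3 but places no constraint on nb[1], so row-padded tensors still qualify. A hedged sketch of a tensor that satisfies only the relaxed predicate (the sizes are made up; assumes a is an F32 tensor with at least 40 elements):

    // 4 rows of 8 floats, but each row starts 10 floats after the previous one
    struct ggml_tensor * v = ggml_view_2d(ctx, a, 8, 4, 10*sizeof(float), 0);
    // v fails ggml_is_contiguous() (nb[1] is padded),
    // but passes ggml_is_contiguous_except_dim_1()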
@@ -4376,7 +4383,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4376
4383
  return NULL;
4377
4384
  }
4378
4385
 
4379
- const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1);
4386
+ const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
4380
4387
 
4381
4388
  *ctx = (struct ggml_context) {
4382
4389
  /*.mem_size =*/ mem_size,
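Note: GGML_PAD rounds its argument up to a multiple of the alignment. Assuming the usual definition in ggml.h (an assumption, since the header is not part of this excerpt), the new expression is identical to the old bit-mask form:

    // presumed definition; n must be a power of two
    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    // e.g. GGML_PAD(1000, 16) == 1008, the same as (1000 + 16 - 1) & ~(16 - 1)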
@@ -4443,6 +4450,10 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
4443
4450
  return result;
4444
4451
  }
4445
4452
 
4453
+ bool ggml_get_no_alloc(struct ggml_context * ctx) {
4454
+ return ctx->no_alloc;
4455
+ }
4456
+
4446
4457
  void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
4447
4458
  ctx->no_alloc = no_alloc;
4448
4459
  }
@@ -4461,12 +4472,14 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
4461
4472
  struct ggml_object * obj = ctx->objects_begin;
4462
4473
 
4463
4474
  while (obj != NULL) {
4464
- struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
4475
+ if (obj->type == GGML_OBJECT_TENSOR) {
4476
+ struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
4465
4477
 
4466
- const size_t size = ggml_nbytes(tensor);
4478
+ const size_t size = ggml_nbytes(tensor);
4467
4479
 
4468
- if (max_size < size) {
4469
- max_size = size;
4480
+ if (max_size < size) {
4481
+ max_size = size;
4482
+ }
4470
4483
  }
4471
4484
 
4472
4485
  obj = obj->next;
@@ -4480,7 +4493,7 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
4480
4493
  // this is an error prone process, but it is necessary to support inplace
4481
4494
  // operators when using scratch buffers
4482
4495
  // TODO: implement a better way
4483
- void ggml_scratch_save(struct ggml_context * ctx) {
4496
+ static void ggml_scratch_save(struct ggml_context * ctx) {
4484
4497
  // this is needed to allow opt tensors to store their data
4485
4498
  // TODO: again, need to find a better way
4486
4499
  ctx->no_alloc_save = ctx->no_alloc;
@@ -4490,7 +4503,7 @@ void ggml_scratch_save(struct ggml_context * ctx) {
4490
4503
  ctx->scratch.data = NULL;
4491
4504
  }
4492
4505
 
4493
- void ggml_scratch_load(struct ggml_context * ctx) {
4506
+ static void ggml_scratch_load(struct ggml_context * ctx) {
4494
4507
  ctx->no_alloc = ctx->no_alloc_save;
4495
4508
 
4496
4509
  ctx->scratch = ctx->scratch_save;
@@ -4498,12 +4511,7 @@ void ggml_scratch_load(struct ggml_context * ctx) {
4498
4511
 
4499
4512
  ////////////////////////////////////////////////////////////////////////////////
4500
4513
 
4501
- struct ggml_tensor * ggml_new_tensor_impl(
4502
- struct ggml_context * ctx,
4503
- enum ggml_type type,
4504
- int n_dims,
4505
- const int64_t* ne,
4506
- void* data) {
4514
+ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
4507
4515
  // always insert objects at the end of the context's memory pool
4508
4516
  struct ggml_object * obj_cur = ctx->objects_end;
4509
4517
 
@@ -4511,77 +4519,79 @@ struct ggml_tensor * ggml_new_tensor_impl(
4511
4519
  const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
4512
4520
  const size_t cur_end = cur_offs + cur_size;
4513
4521
 
4514
- size_t size_needed = 0;
4515
-
4516
- if (data == NULL && !ctx->no_alloc) {
4517
- size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
4518
- for (int i = 1; i < n_dims; i++) {
4519
- size_needed *= ne[i];
4520
- }
4521
- // align to GGML_MEM_ALIGN
4522
- size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN;
4523
- }
4522
+ // align to GGML_MEM_ALIGN
4523
+ size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
4524
4524
 
4525
4525
  char * const mem_buffer = ctx->mem_buffer;
4526
4526
  struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
4527
4527
 
4528
- if (ctx->scratch.data == NULL || data != NULL) {
4529
- size_needed += GGML_TENSOR_SIZE;
4528
+ if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
4529
+ GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4530
+ __func__, cur_end + size_needed, ctx->mem_size);
4531
+ assert(false);
4532
+ return NULL;
4533
+ }
4530
4534
 
4531
- if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
4532
- GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4533
- __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
4534
- assert(false);
4535
- return NULL;
4536
- }
4535
+ *obj_new = (struct ggml_object) {
4536
+ .offs = cur_end + GGML_OBJECT_SIZE,
4537
+ .size = size_needed,
4538
+ .next = NULL,
4539
+ .type = type,
4540
+ };
4537
4541
 
4538
- *obj_new = (struct ggml_object) {
4539
- .offs = cur_end + GGML_OBJECT_SIZE,
4540
- .size = size_needed,
4541
- .next = NULL,
4542
- };
4542
+ ggml_assert_aligned(mem_buffer + obj_new->offs);
4543
+
4544
+ if (obj_cur != NULL) {
4545
+ obj_cur->next = obj_new;
4543
4546
  } else {
4544
- if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
4545
- GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4546
- __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
4547
- assert(false);
4548
- return NULL;
4547
+ // this is the first object in this context
4548
+ ctx->objects_begin = obj_new;
4549
+ }
4550
+
4551
+ ctx->objects_end = obj_new;
4552
+
4553
+ //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
4554
+
4555
+ return obj_new;
4556
+ }
4557
+
4558
+ static struct ggml_tensor * ggml_new_tensor_impl(
4559
+ struct ggml_context * ctx,
4560
+ enum ggml_type type,
4561
+ int n_dims,
4562
+ const int64_t* ne,
4563
+ void* data) {
4564
+
4565
+ size_t data_size = 0;
4566
+
4567
+ if (data == NULL && !ctx->no_alloc) {
4568
+ data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
4569
+ for (int i = 1; i < n_dims; i++) {
4570
+ data_size *= ne[i];
4549
4571
  }
4572
+ }
4550
4573
 
4551
- if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
4552
- GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4553
- __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
4574
+ if (ctx->scratch.data != NULL && data == NULL) {
4575
+ // allocate tensor data in the scratch buffer
4576
+ if (ctx->scratch.offs + data_size > ctx->scratch.size) {
4577
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4578
+ __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
4554
4579
  assert(false);
4555
4580
  return NULL;
4556
4581
  }
4557
4582
 
4558
4583
  data = (char * const) ctx->scratch.data + ctx->scratch.offs;
4559
4584
 
4560
- *obj_new = (struct ggml_object) {
4561
- .offs = cur_end + GGML_OBJECT_SIZE,
4562
- .size = GGML_TENSOR_SIZE,
4563
- .next = NULL,
4564
- };
4565
-
4566
- //printf("scratch offs = %zu, size_needed = %zu\n", ctx->scratch.offs, size_needed);
4567
-
4568
- ctx->scratch.offs += size_needed;
4569
- }
4585
+ ctx->scratch.offs += data_size;
4570
4586
 
4571
- if (obj_cur != NULL) {
4572
- obj_cur->next = obj_new;
4573
- } else {
4574
- // this is the first object in this context
4575
- ctx->objects_begin = obj_new;
4587
+ data_size = 0;
4576
4588
  }
4577
4589
 
4578
- ctx->objects_end = obj_new;
4579
-
4580
- //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
4590
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);
4581
4591
 
4582
- struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs);
4592
+ // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
4583
4593
 
4584
- ggml_assert_aligned(result);
4594
+ struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
4585
4595
 
4586
4596
  *result = (struct ggml_tensor) {
4587
4597
  /*.type =*/ type,
@@ -4590,6 +4600,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
4590
4600
  /*.ne =*/ { 1, 1, 1, 1 },
4591
4601
  /*.nb =*/ { 0, 0, 0, 0 },
4592
4602
  /*.op =*/ GGML_OP_NONE,
4603
+ /*.op_params =*/ {0},
4593
4604
  /*.is_param =*/ false,
4594
4605
  /*.grad =*/ NULL,
4595
4606
  /*.src =*/ { NULL },
@@ -4620,6 +4631,21 @@ struct ggml_tensor * ggml_new_tensor_impl(
4620
4631
  return result;
4621
4632
  }
4622
4633
 
4634
+ static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
4635
+ assert(params_size <= GGML_MAX_OP_PARAMS);
4636
+ memcpy(tensor->op_params, params, params_size);
4637
+ }
4638
+
4639
+ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
4640
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4641
+ return ((const int32_t *)(tensor->op_params))[i];
4642
+ }
4643
+
4644
+ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
4645
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4646
+ ((int32_t *)(tensor->op_params))[i] = value;
4647
+ }
4648
+
4623
4649
  struct ggml_tensor * ggml_new_tensor(
4624
4650
  struct ggml_context * ctx,
4625
4651
  enum ggml_type type,
@@ -4951,6 +4977,11 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
4951
4977
  return (float *)(tensor->data);
4952
4978
  }
4953
4979
 
4980
+ enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
4981
+ GGML_ASSERT(tensor->op == GGML_OP_UNARY);
4982
+ return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
4983
+ }
4984
+
4954
4985
  const char * ggml_get_name(const struct ggml_tensor * tensor) {
4955
4986
  return tensor->name;
4956
4987
  }
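Note: the recurring pattern in the hunks below replaces the old auxiliary parameter tensors (built with ggml_scratch_save / ggml_new_tensor_1d / ggml_scratch_load and hung off src[1] or src[2]) with the fixed-size op_params array introduced above. A before/after sketch of the calling convention, with made-up parameter values:

    // old style (removed): parameters lived in a separate I32 tensor
    //   struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
    //   ((int32_t *) c->data)[0] = n_past;
    //   ((int32_t *) c->data)[1] = inplace ? 1 : 0;
    //   result->src[1] = c;

    // new style: parameters are copied into the tensor itself,
    // bounded by GGML_MAX_OP_PARAMS bytes (asserted in ggml_set_op_params)
    int32_t params[] = { n_past, inplace ? 1 : 0 };
    ggml_set_op_params(result, params, sizeof(params));

    // read back later, e.g. on the compute side
    const int32_t n_past_read = ggml_get_op_params_i32(result, 0);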
@@ -4989,9 +5020,11 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
4989
5020
  char * const mem_buffer = ctx->mem_buffer;
4990
5021
 
4991
5022
  while (obj != NULL) {
4992
- struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
4993
- if (strcmp(cur->name, name) == 0) {
4994
- return cur;
5023
+ if (obj->type == GGML_OBJECT_TENSOR) {
5024
+ struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
5025
+ if (strcmp(cur->name, name) == 0) {
5026
+ return cur;
5027
+ }
4995
5028
  }
4996
5029
 
4997
5030
  obj = obj->next;
@@ -5004,7 +5037,7 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
5004
5037
 
5005
5038
  // ggml_dup
5006
5039
 
5007
- struct ggml_tensor * ggml_dup_impl(
5040
+ static struct ggml_tensor * ggml_dup_impl(
5008
5041
  struct ggml_context * ctx,
5009
5042
  struct ggml_tensor * a,
5010
5043
  bool inplace) {
@@ -5019,7 +5052,6 @@ struct ggml_tensor * ggml_dup_impl(
5019
5052
  result->op = GGML_OP_DUP;
5020
5053
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5021
5054
  result->src[0] = a;
5022
- result->src[1] = NULL;
5023
5055
 
5024
5056
  return result;
5025
5057
  }
@@ -5038,7 +5070,7 @@ struct ggml_tensor * ggml_dup_inplace(
5038
5070
 
5039
5071
  // ggml_add
5040
5072
 
5041
- struct ggml_tensor * ggml_add_impl(
5073
+ static struct ggml_tensor * ggml_add_impl(
5042
5074
  struct ggml_context * ctx,
5043
5075
  struct ggml_tensor * a,
5044
5076
  struct ggml_tensor * b,
@@ -5081,7 +5113,7 @@ struct ggml_tensor * ggml_add_inplace(
5081
5113
 
5082
5114
  // ggml_add1
5083
5115
 
5084
- struct ggml_tensor * ggml_add1_impl(
5116
+ static struct ggml_tensor * ggml_add1_impl(
5085
5117
  struct ggml_context * ctx,
5086
5118
  struct ggml_tensor * a,
5087
5119
  struct ggml_tensor * b,
@@ -5121,7 +5153,7 @@ struct ggml_tensor * ggml_add1_inplace(
5121
5153
 
5122
5154
  // ggml_acc
5123
5155
 
5124
- struct ggml_tensor * ggml_acc_impl(
5156
+ static struct ggml_tensor * ggml_acc_impl(
5125
5157
  struct ggml_context * ctx,
5126
5158
  struct ggml_tensor * a,
5127
5159
  struct ggml_tensor * b,
@@ -5143,23 +5175,13 @@ struct ggml_tensor * ggml_acc_impl(
5143
5175
 
5144
5176
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5145
5177
 
5146
- ggml_scratch_save(ctx);
5147
-
5148
- struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5);
5149
-
5150
- ((int32_t *) c->data)[0] = nb1;
5151
- ((int32_t *) c->data)[1] = nb2;
5152
- ((int32_t *) c->data)[2] = nb3;
5153
- ((int32_t *) c->data)[3] = offset;
5154
- ((int32_t *) c->data)[4] = inplace ? 1 : 0;
5155
-
5156
- ggml_scratch_load(ctx);
5178
+ int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
5179
+ ggml_set_op_params(result, params, sizeof(params));
5157
5180
 
5158
5181
  result->op = GGML_OP_ACC;
5159
5182
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5160
5183
  result->src[0] = a;
5161
5184
  result->src[1] = b;
5162
- result->src[2] = c;
5163
5185
 
5164
5186
  return result;
5165
5187
  }
@@ -5188,7 +5210,7 @@ struct ggml_tensor * ggml_acc_inplace(
5188
5210
 
5189
5211
  // ggml_sub
5190
5212
 
5191
- struct ggml_tensor * ggml_sub_impl(
5213
+ static struct ggml_tensor * ggml_sub_impl(
5192
5214
  struct ggml_context * ctx,
5193
5215
  struct ggml_tensor * a,
5194
5216
  struct ggml_tensor * b,
@@ -5227,7 +5249,7 @@ struct ggml_tensor * ggml_sub_inplace(
5227
5249
 
5228
5250
  // ggml_mul
5229
5251
 
5230
- struct ggml_tensor * ggml_mul_impl(
5252
+ static struct ggml_tensor * ggml_mul_impl(
5231
5253
  struct ggml_context * ctx,
5232
5254
  struct ggml_tensor * a,
5233
5255
  struct ggml_tensor * b,
@@ -5274,7 +5296,7 @@ struct ggml_tensor * ggml_mul_inplace(
5274
5296
 
5275
5297
  // ggml_div
5276
5298
 
5277
- struct ggml_tensor * ggml_div_impl(
5299
+ static struct ggml_tensor * ggml_div_impl(
5278
5300
  struct ggml_context * ctx,
5279
5301
  struct ggml_tensor * a,
5280
5302
  struct ggml_tensor * b,
@@ -5317,7 +5339,7 @@ struct ggml_tensor * ggml_div_inplace(
5317
5339
 
5318
5340
  // ggml_sqr
5319
5341
 
5320
- struct ggml_tensor * ggml_sqr_impl(
5342
+ static struct ggml_tensor * ggml_sqr_impl(
5321
5343
  struct ggml_context * ctx,
5322
5344
  struct ggml_tensor * a,
5323
5345
  bool inplace) {
@@ -5332,7 +5354,6 @@ struct ggml_tensor * ggml_sqr_impl(
5332
5354
  result->op = GGML_OP_SQR;
5333
5355
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5334
5356
  result->src[0] = a;
5335
- result->src[1] = NULL;
5336
5357
 
5337
5358
  return result;
5338
5359
  }
@@ -5351,7 +5372,7 @@ struct ggml_tensor * ggml_sqr_inplace(
5351
5372
 
5352
5373
  // ggml_sqrt
5353
5374
 
5354
- struct ggml_tensor * ggml_sqrt_impl(
5375
+ static struct ggml_tensor * ggml_sqrt_impl(
5355
5376
  struct ggml_context * ctx,
5356
5377
  struct ggml_tensor * a,
5357
5378
  bool inplace) {
@@ -5366,7 +5387,6 @@ struct ggml_tensor * ggml_sqrt_impl(
5366
5387
  result->op = GGML_OP_SQRT;
5367
5388
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5368
5389
  result->src[0] = a;
5369
- result->src[1] = NULL;
5370
5390
 
5371
5391
  return result;
5372
5392
  }
@@ -5386,7 +5406,7 @@ struct ggml_tensor * ggml_sqrt_inplace(
5386
5406
 
5387
5407
  // ggml_log
5388
5408
 
5389
- struct ggml_tensor * ggml_log_impl(
5409
+ static struct ggml_tensor * ggml_log_impl(
5390
5410
  struct ggml_context * ctx,
5391
5411
  struct ggml_tensor * a,
5392
5412
  bool inplace) {
@@ -5401,7 +5421,6 @@ struct ggml_tensor * ggml_log_impl(
5401
5421
  result->op = GGML_OP_LOG;
5402
5422
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5403
5423
  result->src[0] = a;
5404
- result->src[1] = NULL;
5405
5424
 
5406
5425
  return result;
5407
5426
  }
@@ -5434,7 +5453,6 @@ struct ggml_tensor * ggml_sum(
5434
5453
  result->op = GGML_OP_SUM;
5435
5454
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5436
5455
  result->src[0] = a;
5437
- result->src[1] = NULL;
5438
5456
 
5439
5457
  return result;
5440
5458
  }
@@ -5461,7 +5479,6 @@ struct ggml_tensor * ggml_sum_rows(
5461
5479
  result->op = GGML_OP_SUM_ROWS;
5462
5480
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5463
5481
  result->src[0] = a;
5464
- result->src[1] = NULL;
5465
5482
 
5466
5483
  return result;
5467
5484
  }
@@ -5484,7 +5501,6 @@ struct ggml_tensor * ggml_mean(
5484
5501
  result->op = GGML_OP_MEAN;
5485
5502
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5486
5503
  result->src[0] = a;
5487
- result->src[1] = NULL;
5488
5504
 
5489
5505
  return result;
5490
5506
  }
@@ -5508,7 +5524,6 @@ struct ggml_tensor * ggml_argmax(
5508
5524
  result->op = GGML_OP_ARGMAX;
5509
5525
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5510
5526
  result->src[0] = a;
5511
- result->src[1] = NULL;
5512
5527
 
5513
5528
  return result;
5514
5529
  }
@@ -5571,343 +5586,142 @@ struct ggml_tensor * ggml_repeat_back(
5571
5586
 
5572
5587
  // ggml_abs
5573
5588
 
5574
- struct ggml_tensor * ggml_abs_impl(
5575
- struct ggml_context * ctx,
5576
- struct ggml_tensor * a,
5577
- bool inplace) {
5578
- bool is_node = false;
5579
-
5580
- if (!inplace && (a->grad)) {
5581
- is_node = true;
5582
- }
5583
-
5584
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5585
-
5586
- result->op = GGML_OP_ABS;
5587
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5588
- result->src[0] = a;
5589
- result->src[1] = NULL;
5590
-
5591
- return result;
5592
- }
5593
-
5594
5589
  struct ggml_tensor * ggml_abs(
5595
5590
  struct ggml_context * ctx,
5596
5591
  struct ggml_tensor * a) {
5597
- return ggml_abs_impl(ctx, a, false);
5592
+ return ggml_unary(ctx, a, GGML_UNARY_OP_ABS);
5598
5593
  }
5599
5594
 
5600
5595
  struct ggml_tensor * ggml_abs_inplace(
5601
5596
  struct ggml_context * ctx,
5602
5597
  struct ggml_tensor * a) {
5603
- return ggml_abs_impl(ctx, a, true);
5598
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ABS);
5604
5599
  }
5605
5600
 
5606
-
5607
5601
  // ggml_sgn
5608
5602
 
5609
- struct ggml_tensor * ggml_sgn_impl(
5610
- struct ggml_context * ctx,
5611
- struct ggml_tensor * a,
5612
- bool inplace) {
5613
- bool is_node = false;
5614
-
5615
- if (!inplace && (a->grad)) {
5616
- is_node = true;
5617
- }
5618
-
5619
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5620
-
5621
- result->op = GGML_OP_SGN;
5622
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5623
- result->src[0] = a;
5624
- result->src[1] = NULL;
5625
-
5626
- return result;
5627
- }
5628
-
5629
5603
  struct ggml_tensor * ggml_sgn(
5630
5604
  struct ggml_context * ctx,
5631
5605
  struct ggml_tensor * a) {
5632
- return ggml_sgn_impl(ctx, a, false);
5606
+ return ggml_unary(ctx, a, GGML_UNARY_OP_SGN);
5633
5607
  }
5634
5608
 
5635
5609
  struct ggml_tensor * ggml_sgn_inplace(
5636
5610
  struct ggml_context * ctx,
5637
5611
  struct ggml_tensor * a) {
5638
- return ggml_sgn_impl(ctx, a, true);
5612
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SGN);
5639
5613
  }
5640
5614
 
5641
5615
  // ggml_neg
5642
5616
 
5643
- struct ggml_tensor * ggml_neg_impl(
5644
- struct ggml_context * ctx,
5645
- struct ggml_tensor * a,
5646
- bool inplace) {
5647
- bool is_node = false;
5648
-
5649
- if (!inplace && (a->grad)) {
5650
- is_node = true;
5651
- }
5652
-
5653
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5654
-
5655
- result->op = GGML_OP_NEG;
5656
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5657
- result->src[0] = a;
5658
- result->src[1] = NULL;
5659
-
5660
- return result;
5661
- }
5662
-
5663
5617
  struct ggml_tensor * ggml_neg(
5664
5618
  struct ggml_context * ctx,
5665
5619
  struct ggml_tensor * a) {
5666
- return ggml_neg_impl(ctx, a, false);
5620
+ return ggml_unary(ctx, a, GGML_UNARY_OP_NEG);
5667
5621
  }
5668
5622
 
5669
5623
  struct ggml_tensor * ggml_neg_inplace(
5670
5624
  struct ggml_context * ctx,
5671
5625
  struct ggml_tensor * a) {
5672
- return ggml_neg_impl(ctx, a, true);
5626
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_NEG);
5673
5627
  }
5674
5628
 
5675
5629
  // ggml_step
5676
5630
 
5677
- struct ggml_tensor * ggml_step_impl(
5678
- struct ggml_context * ctx,
5679
- struct ggml_tensor * a,
5680
- bool inplace) {
5681
- bool is_node = false;
5682
-
5683
- if (!inplace && (a->grad)) {
5684
- is_node = true;
5685
- }
5686
-
5687
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5688
-
5689
- result->op = GGML_OP_STEP;
5690
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5691
- result->src[0] = a;
5692
- result->src[1] = NULL;
5693
-
5694
- return result;
5695
- }
5696
-
5697
5631
  struct ggml_tensor * ggml_step(
5698
5632
  struct ggml_context * ctx,
5699
5633
  struct ggml_tensor * a) {
5700
- return ggml_step_impl(ctx, a, false);
5634
+ return ggml_unary(ctx, a, GGML_UNARY_OP_STEP);
5701
5635
  }
5702
5636
 
5703
5637
  struct ggml_tensor * ggml_step_inplace(
5704
5638
  struct ggml_context * ctx,
5705
5639
  struct ggml_tensor * a) {
5706
- return ggml_step_impl(ctx, a, true);
5640
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_STEP);
5707
5641
  }
5708
5642
 
5709
5643
  // ggml_tanh
5710
5644
 
5711
- struct ggml_tensor * ggml_tanh_impl(
5712
- struct ggml_context * ctx,
5713
- struct ggml_tensor * a,
5714
- bool inplace) {
5715
- bool is_node = false;
5716
-
5717
- if (!inplace && (a->grad)) {
5718
- is_node = true;
5719
- }
5720
-
5721
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5722
-
5723
- result->op = GGML_OP_TANH;
5724
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5725
- result->src[0] = a;
5726
- result->src[1] = NULL;
5727
-
5728
- return result;
5729
- }
5730
-
5731
5645
  struct ggml_tensor * ggml_tanh(
5732
5646
  struct ggml_context * ctx,
5733
5647
  struct ggml_tensor * a) {
5734
- return ggml_tanh_impl(ctx, a, false);
5648
+ return ggml_unary(ctx, a, GGML_UNARY_OP_TANH);
5735
5649
  }
5736
5650
 
5737
5651
  struct ggml_tensor * ggml_tanh_inplace(
5738
5652
  struct ggml_context * ctx,
5739
5653
  struct ggml_tensor * a) {
5740
- return ggml_tanh_impl(ctx, a, true);
5654
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TANH);
5741
5655
  }
5742
5656
 
5743
5657
  // ggml_elu
5744
5658
 
5745
- struct ggml_tensor * ggml_elu_impl(
5746
- struct ggml_context * ctx,
5747
- struct ggml_tensor * a,
5748
- bool inplace) {
5749
- bool is_node = false;
5750
-
5751
- if (!inplace && (a->grad)) {
5752
- is_node = true;
5753
- }
5754
-
5755
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5756
-
5757
- result->op = GGML_OP_ELU;
5758
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5759
- result->src[0] = a;
5760
- result->src[1] = NULL;
5761
-
5762
- return result;
5763
- }
5764
-
5765
5659
  struct ggml_tensor * ggml_elu(
5766
5660
  struct ggml_context * ctx,
5767
5661
  struct ggml_tensor * a) {
5768
- return ggml_elu_impl(ctx, a, false);
5662
+ return ggml_unary(ctx, a, GGML_UNARY_OP_ELU);
5769
5663
  }
5770
5664
 
5771
5665
  struct ggml_tensor * ggml_elu_inplace(
5772
5666
  struct ggml_context * ctx,
5773
5667
  struct ggml_tensor * a) {
5774
- return ggml_elu_impl(ctx, a, true);
5668
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ELU);
5775
5669
  }
5776
5670
 
5777
5671
  // ggml_relu
5778
5672
 
5779
- struct ggml_tensor * ggml_relu_impl(
5780
- struct ggml_context * ctx,
5781
- struct ggml_tensor * a,
5782
- bool inplace) {
5783
- bool is_node = false;
5784
-
5785
- if (!inplace && (a->grad)) {
5786
- is_node = true;
5787
- }
5788
-
5789
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5790
-
5791
- result->op = GGML_OP_RELU;
5792
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5793
- result->src[0] = a;
5794
- result->src[1] = NULL;
5795
-
5796
- return result;
5797
- }
5798
-
5799
5673
  struct ggml_tensor * ggml_relu(
5800
5674
  struct ggml_context * ctx,
5801
5675
  struct ggml_tensor * a) {
5802
- return ggml_relu_impl(ctx, a, false);
5676
+ return ggml_unary(ctx, a, GGML_UNARY_OP_RELU);
5803
5677
  }
5804
5678
 
5805
5679
  struct ggml_tensor * ggml_relu_inplace(
5806
5680
  struct ggml_context * ctx,
5807
5681
  struct ggml_tensor * a) {
5808
- return ggml_relu_impl(ctx, a, true);
5682
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
5809
5683
  }
5810
5684
 
5811
5685
  // ggml_gelu
5812
5686
 
5813
- struct ggml_tensor * ggml_gelu_impl(
5814
- struct ggml_context * ctx,
5815
- struct ggml_tensor * a,
5816
- bool inplace) {
5817
- bool is_node = false;
5818
-
5819
- if (!inplace && (a->grad)) {
5820
- is_node = true;
5821
- }
5822
-
5823
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5824
-
5825
- result->op = GGML_OP_GELU;
5826
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5827
- result->src[0] = a;
5828
- result->src[1] = NULL;
5829
-
5830
- return result;
5831
- }
5832
-
5833
5687
  struct ggml_tensor * ggml_gelu(
5834
5688
  struct ggml_context * ctx,
5835
5689
  struct ggml_tensor * a) {
5836
- return ggml_gelu_impl(ctx, a, false);
5690
+ return ggml_unary(ctx, a, GGML_UNARY_OP_GELU);
5837
5691
  }
5838
5692
 
5839
5693
  struct ggml_tensor * ggml_gelu_inplace(
5840
5694
  struct ggml_context * ctx,
5841
5695
  struct ggml_tensor * a) {
5842
- return ggml_gelu_impl(ctx, a, true);
5696
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
5843
5697
  }
5844
5698
 
5845
5699
  // ggml_gelu_quick
5846
5700
 
5847
- struct ggml_tensor * ggml_gelu_quick_impl(
5848
- struct ggml_context * ctx,
5849
- struct ggml_tensor * a,
5850
- bool inplace) {
5851
- bool is_node = false;
5852
-
5853
- if (!inplace && (a->grad)) {
5854
- is_node = true;
5855
- }
5856
-
5857
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5858
-
5859
- result->op = GGML_OP_GELU_QUICK;
5860
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5861
- result->src[0] = a;
5862
- result->src[1] = NULL;
5863
-
5864
- return result;
5865
- }
5866
-
5867
5701
  struct ggml_tensor * ggml_gelu_quick(
5868
5702
  struct ggml_context * ctx,
5869
5703
  struct ggml_tensor * a) {
5870
- return ggml_gelu_quick_impl(ctx, a, false);
5704
+ return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_QUICK);
5871
5705
  }
5872
5706
 
5873
5707
  struct ggml_tensor * ggml_gelu_quick_inplace(
5874
5708
  struct ggml_context * ctx,
5875
5709
  struct ggml_tensor * a) {
5876
- return ggml_gelu_quick_impl(ctx, a, true);
5710
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_QUICK);
5877
5711
  }
5878
5712
 
5879
5713
  // ggml_silu
5880
5714
 
5881
- struct ggml_tensor * ggml_silu_impl(
5882
- struct ggml_context * ctx,
5883
- struct ggml_tensor * a,
5884
- bool inplace) {
5885
- bool is_node = false;
5886
-
5887
- if (!inplace && (a->grad)) {
5888
- is_node = true;
5889
- }
5890
-
5891
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5892
-
5893
- result->op = GGML_OP_SILU;
5894
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5895
- result->src[0] = a;
5896
- result->src[1] = NULL;
5897
-
5898
- return result;
5899
- }
5900
-
5901
5715
  struct ggml_tensor * ggml_silu(
5902
5716
  struct ggml_context * ctx,
5903
5717
  struct ggml_tensor * a) {
5904
- return ggml_silu_impl(ctx, a, false);
5718
+ return ggml_unary(ctx, a, GGML_UNARY_OP_SILU);
5905
5719
  }
5906
5720
 
5907
5721
  struct ggml_tensor * ggml_silu_inplace(
5908
5722
  struct ggml_context * ctx,
5909
5723
  struct ggml_tensor * a) {
5910
- return ggml_silu_impl(ctx, a, true);
5724
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SILU);
5911
5725
  }
5912
5726
 
5913
5727
  // ggml_silu_back
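Note: all of the per-activation wrappers above now build a single GGML_OP_UNARY node and record the concrete operator in op_params, where ggml_get_unary_op reads it back. A minimal usage sketch (the switch is illustrative; the real compute-side dispatch is not shown in this excerpt, and ctx/a stand for an existing context and F32 tensor):

    struct ggml_tensor * t = ggml_relu(ctx, a);  // same as ggml_unary(ctx, a, GGML_UNARY_OP_RELU)

    switch (ggml_get_unary_op(t)) {
        case GGML_UNARY_OP_RELU: /* max(0, x) element-wise */ break;
        case GGML_UNARY_OP_GELU: /* GELU element-wise      */ break;
        default: break;
    }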
@@ -5935,7 +5749,7 @@ struct ggml_tensor * ggml_silu_back(
5935
5749
 
5936
5750
  // ggml_norm
5937
5751
 
5938
- struct ggml_tensor * ggml_norm_impl(
5752
+ static struct ggml_tensor * ggml_norm_impl(
5939
5753
  struct ggml_context * ctx,
5940
5754
  struct ggml_tensor * a,
5941
5755
  bool inplace) {
@@ -5948,10 +5762,11 @@ struct ggml_tensor * ggml_norm_impl(
5948
5762
 
5949
5763
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5950
5764
 
5765
+ // TODO: maybe store epsilon here?
5766
+
5951
5767
  result->op = GGML_OP_NORM;
5952
5768
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5953
5769
  result->src[0] = a;
5954
- result->src[1] = NULL; // TODO: maybe store epsilon here?
5955
5770
 
5956
5771
  return result;
5957
5772
  }
@@ -5968,9 +5783,10 @@ struct ggml_tensor * ggml_norm_inplace(
5968
5783
  return ggml_norm_impl(ctx, a, true);
5969
5784
  }
5970
5785
 
5971
- struct ggml_tensor * ggml_rms_norm_impl(
5786
+ static struct ggml_tensor * ggml_rms_norm_impl(
5972
5787
  struct ggml_context * ctx,
5973
5788
  struct ggml_tensor * a,
5789
+ float eps,
5974
5790
  bool inplace) {
5975
5791
  bool is_node = false;
5976
5792
 
@@ -5980,24 +5796,27 @@ struct ggml_tensor * ggml_rms_norm_impl(
5980
5796
 
5981
5797
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5982
5798
 
5799
+ ggml_set_op_params(result, &eps, sizeof(eps));
5800
+
5983
5801
  result->op = GGML_OP_RMS_NORM;
5984
5802
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5985
5803
  result->src[0] = a;
5986
- result->src[1] = NULL; // TODO: maybe store epsilon here?
5987
5804
 
5988
5805
  return result;
5989
5806
  }
5990
5807
 
5991
5808
  struct ggml_tensor * ggml_rms_norm(
5992
5809
  struct ggml_context * ctx,
5993
- struct ggml_tensor * a) {
5994
- return ggml_rms_norm_impl(ctx, a, false);
5810
+ struct ggml_tensor * a,
5811
+ float eps) {
5812
+ return ggml_rms_norm_impl(ctx, a, eps, false);
5995
5813
  }
5996
5814
 
5997
5815
  struct ggml_tensor * ggml_rms_norm_inplace(
5998
5816
  struct ggml_context * ctx,
5999
- struct ggml_tensor * a) {
6000
- return ggml_rms_norm_impl(ctx, a, true);
5817
+ struct ggml_tensor * a,
5818
+ float eps) {
5819
+ return ggml_rms_norm_impl(ctx, a, eps, true);
6001
5820
  }
6002
5821
 
6003
5822
  struct ggml_tensor * ggml_rms_norm_back(
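Note the API change: ggml_rms_norm and ggml_rms_norm_inplace now take the epsilon explicitly and store it in op_params, so existing call sites must pass a value. A hedged example (the 1e-6f constant and the inp tensor are illustrative, not taken from this diff):

    // before: struct ggml_tensor * cur = ggml_rms_norm(ctx, inp);
    struct ggml_tensor * cur = ggml_rms_norm(ctx, inp, 1e-6f);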
@@ -6076,7 +5895,7 @@ struct ggml_tensor * ggml_out_prod(
6076
5895
 
6077
5896
  // ggml_scale
6078
5897
 
6079
- struct ggml_tensor * ggml_scale_impl(
5898
+ static struct ggml_tensor * ggml_scale_impl(
6080
5899
  struct ggml_context * ctx,
6081
5900
  struct ggml_tensor * a,
6082
5901
  struct ggml_tensor * b,
@@ -6116,7 +5935,7 @@ struct ggml_tensor * ggml_scale_inplace(
6116
5935
 
6117
5936
  // ggml_set
6118
5937
 
6119
- struct ggml_tensor * ggml_set_impl(
5938
+ static struct ggml_tensor * ggml_set_impl(
6120
5939
  struct ggml_context * ctx,
6121
5940
  struct ggml_tensor * a,
6122
5941
  struct ggml_tensor * b,
@@ -6136,23 +5955,13 @@ struct ggml_tensor * ggml_set_impl(
6136
5955
  // make a view of the destination
6137
5956
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6138
5957
 
6139
- ggml_scratch_save(ctx);
6140
-
6141
- struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5);
6142
-
6143
- (( int32_t * ) c->data)[0] = nb1;
6144
- (( int32_t * ) c->data)[1] = nb2;
6145
- (( int32_t * ) c->data)[2] = nb3;
6146
- (( int32_t * ) c->data)[3] = offset;
6147
- (( int32_t * ) c->data)[4] = inplace ? 1 : 0;
6148
-
6149
- ggml_scratch_load(ctx);
5958
+ int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
5959
+ ggml_set_op_params(result, params, sizeof(params));
6150
5960
 
6151
5961
  result->op = GGML_OP_SET;
6152
5962
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6153
5963
  result->src[0] = a;
6154
5964
  result->src[1] = b;
6155
- result->src[2] = c;
6156
5965
 
6157
5966
  return result;
6158
5967
  }
@@ -6216,7 +6025,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
6216
6025
 
6217
6026
  // ggml_cpy
6218
6027
 
6219
- struct ggml_tensor * ggml_cpy_impl(
6028
+ static struct ggml_tensor * ggml_cpy_impl(
6220
6029
  struct ggml_context * ctx,
6221
6030
  struct ggml_tensor * a,
6222
6031
  struct ggml_tensor * b,
@@ -6261,7 +6070,7 @@ struct ggml_tensor * ggml_cpy_inplace(
6261
6070
 
6262
6071
  // ggml_cont
6263
6072
 
6264
- struct ggml_tensor * ggml_cont_impl(
6073
+ static struct ggml_tensor * ggml_cont_impl(
6265
6074
  struct ggml_context * ctx,
6266
6075
  struct ggml_tensor * a,
6267
6076
  bool inplace) {
@@ -6277,7 +6086,6 @@ struct ggml_tensor * ggml_cont_impl(
6277
6086
  result->op = GGML_OP_CONT;
6278
6087
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6279
6088
  result->src[0] = a;
6280
- result->src[1] = NULL;
6281
6089
 
6282
6090
  return result;
6283
6091
  }
@@ -6321,7 +6129,6 @@ struct ggml_tensor * ggml_reshape(
6321
6129
  result->op = GGML_OP_RESHAPE;
6322
6130
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6323
6131
  result->src[0] = a;
6324
- result->src[1] = NULL;
6325
6132
 
6326
6133
  return result;
6327
6134
  }
@@ -6346,7 +6153,6 @@ struct ggml_tensor * ggml_reshape_1d(
6346
6153
  result->op = GGML_OP_RESHAPE;
6347
6154
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6348
6155
  result->src[0] = a;
6349
- result->src[1] = NULL;
6350
6156
 
6351
6157
  return result;
6352
6158
  }
@@ -6372,7 +6178,6 @@ struct ggml_tensor * ggml_reshape_2d(
6372
6178
  result->op = GGML_OP_RESHAPE;
6373
6179
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6374
6180
  result->src[0] = a;
6375
- result->src[1] = NULL;
6376
6181
 
6377
6182
  return result;
6378
6183
  }
@@ -6399,7 +6204,6 @@ struct ggml_tensor * ggml_reshape_3d(
6399
6204
  result->op = GGML_OP_RESHAPE;
6400
6205
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6401
6206
  result->src[0] = a;
6402
- result->src[1] = NULL;
6403
6207
 
6404
6208
  return result;
6405
6209
  }
@@ -6428,7 +6232,6 @@ struct ggml_tensor * ggml_reshape_4d(
6428
6232
  result->op = GGML_OP_RESHAPE;
6429
6233
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6430
6234
  result->src[0] = a;
6431
- result->src[1] = NULL;
6432
6235
 
6433
6236
  return result;
6434
6237
  }
@@ -6450,19 +6253,11 @@ struct ggml_tensor * ggml_view_1d(
6450
6253
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
6451
6254
  ggml_format_name(result, "%s (view)", a->name);
6452
6255
 
6453
- ggml_scratch_save(ctx);
6454
-
6455
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6456
- ggml_set_name(offs, "offset");
6457
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6458
-
6459
- ggml_scratch_load(ctx);
6256
+ ggml_set_op_params(result, &offset, sizeof(offset));
6460
6257
 
6461
6258
  result->op = GGML_OP_VIEW;
6462
6259
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6463
6260
  result->src[0] = a;
6464
- result->src[1] = NULL;
6465
- result->src[2] = offs;
6466
6261
 
6467
6262
  return result;
6468
6263
  }
@@ -6488,13 +6283,7 @@ struct ggml_tensor * ggml_view_2d(
6488
6283
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
6489
6284
  ggml_format_name(result, "%s (view)", a->name);
6490
6285
 
6491
- ggml_scratch_save(ctx);
6492
-
6493
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6494
- ggml_set_name(offs, "offset");
6495
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6496
-
6497
- ggml_scratch_load(ctx);
6286
+ ggml_set_op_params(result, &offset, sizeof(offset));
6498
6287
 
6499
6288
  result->nb[1] = nb1;
6500
6289
  result->nb[2] = result->nb[1]*ne1;
@@ -6503,8 +6292,6 @@ struct ggml_tensor * ggml_view_2d(
6503
6292
  result->op = GGML_OP_VIEW;
6504
6293
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6505
6294
  result->src[0] = a;
6506
- result->src[1] = NULL;
6507
- result->src[2] = offs;
6508
6295
 
6509
6296
  return result;
6510
6297
  }
@@ -6532,13 +6319,7 @@ struct ggml_tensor * ggml_view_3d(
6532
6319
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
6533
6320
  ggml_format_name(result, "%s (view)", a->name);
6534
6321
 
6535
- ggml_scratch_save(ctx);
6536
-
6537
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6538
- ggml_set_name(offs, "offset");
6539
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6540
-
6541
- ggml_scratch_load(ctx);
6322
+ ggml_set_op_params(result, &offset, sizeof(offset));
6542
6323
 
6543
6324
  result->nb[1] = nb1;
6544
6325
  result->nb[2] = nb2;
@@ -6547,8 +6328,6 @@ struct ggml_tensor * ggml_view_3d(
6547
6328
  result->op = GGML_OP_VIEW;
6548
6329
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6549
6330
  result->src[0] = a;
6550
- result->src[1] = NULL;
6551
- result->src[2] = offs;
6552
6331
 
6553
6332
  return result;
6554
6333
  }
@@ -6578,13 +6357,7 @@ struct ggml_tensor * ggml_view_4d(
6578
6357
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
6579
6358
  ggml_format_name(result, "%s (view)", a->name);
6580
6359
 
6581
- ggml_scratch_save(ctx);
6582
-
6583
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6584
- ggml_set_name(offs, "offset");
6585
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6586
-
6587
- ggml_scratch_load(ctx);
6360
+ ggml_set_op_params(result, &offset, sizeof(offset));
6588
6361
 
6589
6362
  result->nb[1] = nb1;
6590
6363
  result->nb[2] = nb2;
@@ -6593,8 +6366,6 @@ struct ggml_tensor * ggml_view_4d(
6593
6366
  result->op = GGML_OP_VIEW;
6594
6367
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6595
6368
  result->src[0] = a;
6596
- result->src[1] = NULL;
6597
- result->src[2] = offs;
6598
6369
 
6599
6370
  return result;
6600
6371
  }
@@ -6655,22 +6426,9 @@ struct ggml_tensor * ggml_permute(
6655
6426
  result->op = GGML_OP_PERMUTE;
6656
6427
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6657
6428
  result->src[0] = a;
6658
- result->src[1] = NULL;
6659
-
6660
- if (is_node) {
6661
- ggml_scratch_save(ctx);
6662
6429
 
6663
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
6664
-
6665
- ((int32_t *) b->data)[0] = axis0;
6666
- ((int32_t *) b->data)[1] = axis1;
6667
- ((int32_t *) b->data)[2] = axis2;
6668
- ((int32_t *) b->data)[3] = axis3;
6669
-
6670
- ggml_scratch_load(ctx);
6671
-
6672
- result->src[2] = b;
6673
- }
6430
+ int32_t params[] = { axis0, axis1, axis2, axis3 };
6431
+ ggml_set_op_params(result, &params, sizeof(params));
6674
6432
 
6675
6433
  return result;
6676
6434
  }
@@ -6698,7 +6456,6 @@ struct ggml_tensor * ggml_transpose(
6698
6456
  result->op = GGML_OP_TRANSPOSE;
6699
6457
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6700
6458
  result->src[0] = a;
6701
- result->src[1] = NULL;
6702
6459
 
6703
6460
  return result;
6704
6461
  }
@@ -6776,7 +6533,6 @@ struct ggml_tensor * ggml_diag(
6776
6533
  result->op = GGML_OP_DIAG;
6777
6534
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6778
6535
  result->src[0] = a;
6779
- result->src[1] = NULL;
6780
6536
 
6781
6537
  return result;
6782
6538
  }
@@ -6784,7 +6540,7 @@ struct ggml_tensor * ggml_diag(
6784
6540
 
6785
6541
  // ggml_diag_mask_inf
6786
6542
 
6787
- struct ggml_tensor * ggml_diag_mask_inf_impl(
6543
+ static struct ggml_tensor * ggml_diag_mask_inf_impl(
6788
6544
  struct ggml_context * ctx,
6789
6545
  struct ggml_tensor * a,
6790
6546
  int n_past,
@@ -6797,19 +6553,12 @@ struct ggml_tensor * ggml_diag_mask_inf_impl(
6797
6553
 
6798
6554
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6799
6555
 
6800
- ggml_scratch_save(ctx);
6801
-
6802
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6803
-
6804
- ((int32_t *) b->data)[0] = n_past;
6805
- ((int32_t *) b->data)[1] = inplace ? 1 : 0;
6806
-
6807
- ggml_scratch_load(ctx);
6556
+ int32_t params[] = { n_past, inplace ? 1 : 0 };
6557
+ ggml_set_op_params(result, &params, sizeof(params));
6808
6558
 
6809
6559
  result->op = GGML_OP_DIAG_MASK_INF;
6810
6560
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6811
6561
  result->src[0] = a;
6812
- result->src[1] = b;
6813
6562
 
6814
6563
  return result;
6815
6564
  }
@@ -6831,7 +6580,7 @@ struct ggml_tensor * ggml_diag_mask_inf_inplace(
6831
6580
 
6832
6581
  // ggml_diag_mask_zero
6833
6582
 
6834
- struct ggml_tensor * ggml_diag_mask_zero_impl(
6583
+ static struct ggml_tensor * ggml_diag_mask_zero_impl(
6835
6584
  struct ggml_context * ctx,
6836
6585
  struct ggml_tensor * a,
6837
6586
  int n_past,
@@ -6844,20 +6593,12 @@ struct ggml_tensor * ggml_diag_mask_zero_impl(
6844
6593
 
6845
6594
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6846
6595
 
6847
- ggml_scratch_save(ctx);
6848
-
6849
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6850
- ggml_set_name(b, "n_past, inplace");
6851
-
6852
- ((int32_t *) b->data)[0] = n_past;
6853
- ((int32_t *) b->data)[1] = inplace ? 1 : 0;
6854
-
6855
- ggml_scratch_load(ctx);
6596
+ int32_t params[] = { n_past, inplace ? 1 : 0 };
6597
+ ggml_set_op_params(result, &params, sizeof(params));
6856
6598
 
6857
6599
  result->op = GGML_OP_DIAG_MASK_ZERO;
6858
6600
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6859
6601
  result->src[0] = a;
6860
- result->src[1] = b;
6861
6602
 
6862
6603
  return result;
6863
6604
  }
@@ -6878,7 +6619,7 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
6878
6619
 
6879
6620
  // ggml_soft_max
6880
6621
 
6881
- struct ggml_tensor * ggml_soft_max_impl(
6622
+ static struct ggml_tensor * ggml_soft_max_impl(
6882
6623
  struct ggml_context * ctx,
6883
6624
  struct ggml_tensor * a,
6884
6625
  bool inplace) {
@@ -6893,7 +6634,6 @@ struct ggml_tensor * ggml_soft_max_impl(
6893
6634
  result->op = GGML_OP_SOFT_MAX;
6894
6635
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6895
6636
  result->src[0] = a;
6896
- result->src[1] = NULL;
6897
6637
 
6898
6638
  return result;
6899
6639
  }
@@ -6913,7 +6653,7 @@ struct ggml_tensor * ggml_soft_max_inplace(
6913
6653
 
6914
6654
  // ggml_soft_max_back
6915
6655
 
6916
- struct ggml_tensor * ggml_soft_max_back_impl(
6656
+ static struct ggml_tensor * ggml_soft_max_back_impl(
6917
6657
  struct ggml_context * ctx,
6918
6658
  struct ggml_tensor * a,
6919
6659
  struct ggml_tensor * b,
@@ -6950,7 +6690,7 @@ struct ggml_tensor * ggml_soft_max_back_inplace(
6950
6690
 
6951
6691
  // ggml_rope
6952
6692
 
6953
- struct ggml_tensor * ggml_rope_impl(
6693
+ static struct ggml_tensor * ggml_rope_impl(
6954
6694
  struct ggml_context * ctx,
6955
6695
  struct ggml_tensor * a,
6956
6696
  int n_past,
@@ -6969,23 +6709,14 @@ struct ggml_tensor * ggml_rope_impl(
6969
6709
 
6970
6710
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6971
6711
 
6972
- ggml_scratch_save(ctx);
6973
-
6974
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
6975
-
6976
- ((int32_t *) b->data)[0] = n_past;
6977
- ((int32_t *) b->data)[1] = n_dims;
6978
- ((int32_t *) b->data)[2] = mode;
6979
- ((int32_t *) b->data)[3] = n_ctx;
6980
- memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float));
6981
- memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));
6982
-
6983
- ggml_scratch_load(ctx);
6712
+ int32_t params[6] = { n_past, n_dims, mode, n_ctx };
6713
+ memcpy(params + 4, &freq_base, sizeof(float));
6714
+ memcpy(params + 5, &freq_scale, sizeof(float));
6715
+ ggml_set_op_params(result, &params, sizeof(params));
6984
6716
 
6985
6717
  result->op = GGML_OP_ROPE;
6986
6718
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6987
6719
  result->src[0] = a;
6988
- result->src[1] = b;
6989
6720
 
6990
6721
  return result;
6991
6722
  }
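Note: the ROPE parameters mix integers and floats in one int32 array: freq_base and freq_scale are copied in bit-for-bit with memcpy rather than cast, so they must be read back the same way. A small sketch of the read side (illustrative, assuming the op_params layout written just above):

    float freq_base;
    float freq_scale;
    memcpy(&freq_base,  (const int32_t *) result->op_params + 4, sizeof(float));
    memcpy(&freq_scale, (const int32_t *) result->op_params + 5, sizeof(float));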
@@ -7042,22 +6773,12 @@ struct ggml_tensor * ggml_rope_back(
7042
6773
 
7043
6774
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
7044
6775
 
7045
- ggml_scratch_save(ctx);
7046
-
7047
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
7048
- ggml_set_name(b, "n_past, n_dims, mode");
7049
-
7050
- ((int32_t *) b->data)[0] = n_past;
7051
- ((int32_t *) b->data)[1] = n_dims;
7052
- ((int32_t *) b->data)[2] = mode;
7053
- ((int32_t *) b->data)[3] = n_ctx;
7054
-
7055
- ggml_scratch_load(ctx);
6776
+ int32_t params[] = { n_past, n_dims, mode, n_ctx };
6777
+ ggml_set_op_params(result, &params, sizeof(params));
7056
6778
 
7057
6779
  result->op = GGML_OP_ROPE_BACK;
7058
6780
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7059
6781
  result->src[0] = a;
7060
- result->src[1] = b;
7061
6782
 
7062
6783
  return result;
7063
6784
  }
@@ -7082,21 +6803,13 @@ struct ggml_tensor * ggml_alibi(
7082
6803
  //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7083
6804
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
7084
6805
 
7085
- ggml_scratch_save(ctx);
7086
-
7087
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7088
-
7089
- ((int32_t *) b->data)[0] = n_past;
7090
- ((int32_t *) b->data)[1] = n_head;
7091
- GGML_ASSERT(sizeof(float) == sizeof(int32_t));
7092
- (((float *) b->data)[2]) = bias_max;
7093
-
7094
- ggml_scratch_load(ctx);
6806
+ int32_t op_params[3] = { n_past, n_head };
6807
+ memcpy(op_params + 2, &bias_max, sizeof(float));
6808
+ ggml_set_op_params(result, &op_params, sizeof(op_params));
7095
6809
 
7096
6810
  result->op = GGML_OP_ALIBI;
7097
6811
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7098
6812
  result->src[0] = a;
7099
- result->src[1] = b;
7100
6813
 
7101
6814
  return result;
7102
6815
  }
@@ -7118,19 +6831,12 @@ struct ggml_tensor * ggml_clamp(
7118
6831
  // TODO: when implement backward, fix this:
7119
6832
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
7120
6833
 
7121
- ggml_scratch_save(ctx);
7122
-
7123
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);
7124
-
7125
- ((float *) b->data)[0] = min;
7126
- ((float *) b->data)[1] = max;
7127
-
7128
- ggml_scratch_load(ctx);
6834
+ float params[] = { min, max };
6835
+ ggml_set_op_params(result, &params, sizeof(params));
7129
6836
 
7130
6837
  result->op = GGML_OP_CLAMP;
7131
6838
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7132
6839
  result->src[0] = a;
7133
- result->src[1] = b;
7134
6840
 
7135
6841
  return result;
7136
6842
  }
@@ -7163,18 +6869,13 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
7163
6869
  };
7164
6870
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
7165
6871
 
7166
- ggml_scratch_save(ctx);
7167
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7168
- ((int32_t*)c->data)[0] = s0;
7169
- ((int32_t*)c->data)[1] = p0;
7170
- ((int32_t*)c->data)[2] = d0;
7171
- ggml_scratch_load(ctx);
6872
+ int32_t params[] = { s0, p0, d0 };
6873
+ ggml_set_op_params(result, &params, sizeof(params));
7172
6874
 
7173
6875
  result->op = GGML_OP_CONV_1D;
7174
6876
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7175
6877
  result->src[0] = a;
7176
6878
  result->src[1] = b;
7177
- result->src[2] = c;
7178
6879
 
7179
6880
  return result;
7180
6881
  }
@@ -7207,21 +6908,13 @@ struct ggml_tensor* ggml_conv_2d(
7207
6908
  };
7208
6909
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7209
6910
 
7210
- ggml_scratch_save(ctx);
7211
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
7212
- ((int32_t*)c->data)[0] = s0;
7213
- ((int32_t*)c->data)[1] = s1;
7214
- ((int32_t*)c->data)[2] = p0;
7215
- ((int32_t*)c->data)[3] = p1;
7216
- ((int32_t*)c->data)[4] = d0;
7217
- ((int32_t*)c->data)[5] = d1;
7218
- ggml_scratch_load(ctx);
6911
+ int32_t params[] = { s0, s1, p0, p1, d0, d1 };
6912
+ ggml_set_op_params(result, &params, sizeof(params));
7219
6913
 
7220
6914
  result->op = GGML_OP_CONV_2D;
7221
6915
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7222
6916
  result->src[0] = a;
7223
6917
  result->src[1] = b;
7224
- result->src[2] = c;
7225
6918
 
7226
6919
  return result;
7227
6920
 
@@ -7245,7 +6938,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
7245
6938
  return (ins + 2 * p - ks) / s + 1;
7246
6939
  }
7247
6940
 
7248
- // ggml_pool_2d
6941
+ // ggml_pool_1d
7249
6942
 
7250
6943
  struct ggml_tensor* ggml_pool_1d(
7251
6944
  struct ggml_context * ctx,
@@ -7268,18 +6961,12 @@ struct ggml_tensor* ggml_pool_1d(
7268
6961
  };
7269
6962
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
7270
6963
 
7271
- ggml_scratch_save(ctx);
7272
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
7273
- ((int32_t*)c->data)[0] = op;
7274
- ((int32_t*)c->data)[1] = k0;
7275
- ((int32_t*)c->data)[2] = s0;
7276
- ((int32_t*)c->data)[3] = p0;
7277
- ggml_scratch_load(ctx);
6964
+ int32_t params[] = { op, k0, s0, p0 };
6965
+ ggml_set_op_params(result, &params, sizeof(params));
7278
6966
 
7279
6967
  result->op = GGML_OP_POOL_1D;
7280
6968
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7281
6969
  result->src[0] = a;
7282
- result->src[1] = c;
7283
6970
 
7284
6971
  return result;
7285
6972
  }
@@ -7311,21 +6998,12 @@ struct ggml_tensor* ggml_pool_2d(
7311
6998
  };
7312
6999
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7313
7000
 
7314
- ggml_scratch_save(ctx);
7315
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7);
7316
- ((int32_t*)c->data)[0] = op;
7317
- ((int32_t*)c->data)[1] = k0;
7318
- ((int32_t*)c->data)[2] = k1;
7319
- ((int32_t*)c->data)[3] = s0;
7320
- ((int32_t*)c->data)[4] = s1;
7321
- ((int32_t*)c->data)[5] = p0;
7322
- ((int32_t*)c->data)[6] = p1;
7323
- ggml_scratch_load(ctx);
7001
+ int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
7002
+ ggml_set_op_params(result, &params, sizeof(params));
7324
7003
 
7325
7004
  result->op = GGML_OP_POOL_2D;
7326
7005
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7327
7006
  result->src[0] = a;
7328
- result->src[1] = c;
7329
7007
 
7330
7008
  return result;
7331
7009
  }
@@ -7348,14 +7026,16 @@ struct ggml_tensor * ggml_flash_attn(
7348
7026
  }
7349
7027
 
7350
7028
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
7351
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, q->ne);
7029
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne);
7030
+
7031
+ int32_t t = masked ? 1 : 0;
7032
+ ggml_set_op_params(result, &t, sizeof(t));
7352
7033
 
7353
7034
  result->op = GGML_OP_FLASH_ATTN;
7354
7035
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7355
7036
  result->src[0] = q;
7356
7037
  result->src[1] = k;
7357
7038
  result->src[2] = v;
7358
- result->src[3] = ggml_new_i32(ctx, masked ? 1 : 0);
7359
7039
 
7360
7040
  return result;
7361
7041
  }
@@ -7379,7 +7059,7 @@ struct ggml_tensor * ggml_flash_ff(
7379
7059
  }
7380
7060
 
7381
7061
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
7382
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, a->ne);
7062
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne);
7383
7063
 
7384
7064
  result->op = GGML_OP_FLASH_FF;
7385
7065
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7445,13 +7125,15 @@ struct ggml_tensor * ggml_flash_attn_back(
7445
7125
 
7446
7126
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7447
7127
 
7128
+ int32_t masked_i = masked ? 1 : 0;
7129
+ ggml_set_op_params(result, &masked_i, sizeof(masked_i));
7130
+
7448
7131
  result->op = GGML_OP_FLASH_ATTN_BACK;
7449
7132
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7450
7133
  result->src[0] = q;
7451
7134
  result->src[1] = k;
7452
7135
  result->src[2] = v;
7453
7136
  result->src[3] = d;
7454
- result->src[4] = ggml_new_i32(ctx, masked ? 1 : 0);
7455
7137
 
7456
7138
  return result;
7457
7139
  }
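ggml_flash_attn (and ggml_flash_attn_back above) now record the masked flag as the first 32-bit slot of op_params instead of allocating a one-element I32 tensor for it; the forward dispatcher reads it back with ggml_get_op_params_i32 further down. A sketch of the index-based accessors this relies on, under the same assumption about the op_params buffer:

    static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
        ((int32_t *) tensor->op_params)[i] = value;      // write the i-th 32-bit parameter slot
    }

    static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
        return ((const int32_t *) tensor->op_params)[i]; // read it back in the kernel/dispatcher
    }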
@@ -7484,21 +7166,12 @@ struct ggml_tensor * ggml_win_part(
7484
7166
 
7485
7167
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7486
7168
 
7487
- ggml_scratch_save(ctx);
7488
-
7489
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7490
-
7491
- ((int32_t *) b->data)[0] = npx;
7492
- ((int32_t *) b->data)[1] = npy;
7493
- ((int32_t *) b->data)[2] = w;
7494
-
7495
- ggml_scratch_load(ctx);
7169
+ int32_t params[] = { npx, npy, w };
7170
+ ggml_set_op_params(result, &params, sizeof(params));
7496
7171
 
7497
7172
  result->op = GGML_OP_WIN_PART;
7498
7173
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7499
7174
  result->src[0] = a;
7500
- result->src[1] = NULL;
7501
- result->src[2] = b;
7502
7175
 
7503
7176
  return result;
7504
7177
  }
@@ -7523,26 +7196,57 @@ struct ggml_tensor * ggml_win_unpart(
7523
7196
  const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
7524
7197
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7525
7198
 
7526
- ggml_scratch_save(ctx);
7199
+ int32_t params[] = { w };
7200
+ ggml_set_op_params(result, &params, sizeof(params));
7527
7201
 
7528
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
7202
+ result->op = GGML_OP_WIN_UNPART;
7203
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7204
+ result->src[0] = a;
7529
7205
 
7530
- ((int32_t *) b->data)[0] = w;
7206
+ return result;
7207
+ }
7531
7208
 
7532
- ggml_scratch_load(ctx);
7209
+ // ggml_unary
7533
7210
 
7534
- result->op = GGML_OP_WIN_UNPART;
7211
+ static struct ggml_tensor * ggml_unary_impl(
7212
+ struct ggml_context * ctx,
7213
+ struct ggml_tensor * a,
7214
+ enum ggml_unary_op op,
7215
+ bool inplace) {
7216
+ bool is_node = false;
7217
+
7218
+ if (!inplace && (a->grad)) {
7219
+ is_node = true;
7220
+ }
7221
+
7222
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7223
+
7224
+ ggml_set_op_params_i32(result, 0, (int32_t) op);
7225
+
7226
+ result->op = GGML_OP_UNARY;
7535
7227
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7536
7228
  result->src[0] = a;
7537
- result->src[1] = NULL;
7538
- result->src[2] = b;
7539
7229
 
7540
7230
  return result;
7541
7231
  }
7542
7232
 
7233
+ struct ggml_tensor * ggml_unary(
7234
+ struct ggml_context * ctx,
7235
+ struct ggml_tensor * a,
7236
+ enum ggml_unary_op op) {
7237
+ return ggml_unary_impl(ctx, a, op, false);
7238
+ }
7239
+
7240
+ struct ggml_tensor * ggml_unary_inplace(
7241
+ struct ggml_context * ctx,
7242
+ struct ggml_tensor * a,
7243
+ enum ggml_unary_op op) {
7244
+ return ggml_unary_impl(ctx, a, op, true);
7245
+ }
7246
+
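With GGML_OP_UNARY in place, the individual element-wise builders no longer need op codes of their own; they can be reduced to thin wrappers that pick a ggml_unary_op value. A sketch of how the remaining entry points can be expressed on top of the new builders (only relu is shown; the assumption is that the other unary ops follow the same shape):

    struct ggml_tensor * ggml_relu(struct ggml_context * ctx, struct ggml_tensor * a) {
        return ggml_unary(ctx, a, GGML_UNARY_OP_RELU);
    }

    struct ggml_tensor * ggml_relu_inplace(struct ggml_context * ctx, struct ggml_tensor * a) {
        return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
    }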
7543
7247
  // ggml_map_unary
7544
7248
 
7545
- struct ggml_tensor * ggml_map_unary_impl_f32(
7249
+ static struct ggml_tensor * ggml_map_unary_impl_f32(
7546
7250
  struct ggml_context * ctx,
7547
7251
  struct ggml_tensor * a,
7548
7252
  const ggml_unary_op_f32_t fun,
@@ -7553,19 +7257,13 @@ struct ggml_tensor * ggml_map_unary_impl_f32(
7553
7257
  is_node = true;
7554
7258
  }
7555
7259
 
7556
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7557
-
7558
- ggml_scratch_save(ctx);
7559
-
7560
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7561
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7260
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7562
7261
 
7563
- ggml_scratch_load(ctx);
7262
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7564
7263
 
7565
7264
  result->op = GGML_OP_MAP_UNARY;
7566
7265
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7567
7266
  result->src[0] = a;
7568
- result->src[2] = addr_tensor;
7569
7267
 
7570
7268
  return result;
7571
7269
  }
@@ -7586,7 +7284,7 @@ struct ggml_tensor * ggml_map_unary_inplace_f32(
7586
7284
 
7587
7285
  // ggml_map_binary
7588
7286
 
7589
- struct ggml_tensor * ggml_map_binary_impl_f32(
7287
+ static struct ggml_tensor * ggml_map_binary_impl_f32(
7590
7288
  struct ggml_context * ctx,
7591
7289
  struct ggml_tensor * a,
7592
7290
  struct ggml_tensor * b,
@@ -7600,20 +7298,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
7600
7298
  is_node = true;
7601
7299
  }
7602
7300
 
7603
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7604
-
7605
- ggml_scratch_save(ctx);
7606
-
7607
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7608
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7301
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7609
7302
 
7610
- ggml_scratch_load(ctx);
7303
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7611
7304
 
7612
7305
  result->op = GGML_OP_MAP_BINARY;
7613
7306
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7614
7307
  result->src[0] = a;
7615
7308
  result->src[1] = b;
7616
- result->src[2] = addr_tensor;
7617
7309
 
7618
7310
  return result;
7619
7311
  }
@@ -7636,7 +7328,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
7636
7328
 
7637
7329
  // ggml_map_custom1
7638
7330
 
7639
- struct ggml_tensor * ggml_map_custom1_impl_f32(
7331
+ static struct ggml_tensor * ggml_map_custom1_impl_f32(
7640
7332
  struct ggml_context * ctx,
7641
7333
  struct ggml_tensor * a,
7642
7334
  const ggml_custom1_op_f32_t fun,
@@ -7647,19 +7339,13 @@ struct ggml_tensor * ggml_map_custom1_impl_f32(
7647
7339
  is_node = true;
7648
7340
  }
7649
7341
 
7650
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7651
-
7652
- ggml_scratch_save(ctx);
7653
-
7654
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7655
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7342
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7656
7343
 
7657
- ggml_scratch_load(ctx);
7344
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7658
7345
 
7659
7346
  result->op = GGML_OP_MAP_CUSTOM1;
7660
7347
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7661
7348
  result->src[0] = a;
7662
- result->src[2] = addr_tensor;
7663
7349
 
7664
7350
  return result;
7665
7351
  }
@@ -7680,7 +7366,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32(
7680
7366
 
7681
7367
  // ggml_map_custom2
7682
7368
 
7683
- struct ggml_tensor * ggml_map_custom2_impl_f32(
7369
+ static struct ggml_tensor * ggml_map_custom2_impl_f32(
7684
7370
  struct ggml_context * ctx,
7685
7371
  struct ggml_tensor * a,
7686
7372
  struct ggml_tensor * b,
@@ -7692,20 +7378,14 @@ struct ggml_tensor * ggml_map_custom2_impl_f32(
7692
7378
  is_node = true;
7693
7379
  }
7694
7380
 
7695
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7696
-
7697
- ggml_scratch_save(ctx);
7698
-
7699
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7700
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7381
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7701
7382
 
7702
- ggml_scratch_load(ctx);
7383
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7703
7384
 
7704
7385
  result->op = GGML_OP_MAP_CUSTOM2;
7705
7386
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7706
7387
  result->src[0] = a;
7707
7388
  result->src[1] = b;
7708
- result->src[2] = addr_tensor;
7709
7389
 
7710
7390
  return result;
7711
7391
  }
@@ -7728,7 +7408,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32(
7728
7408
 
7729
7409
  // ggml_map_custom3
7730
7410
 
7731
- struct ggml_tensor * ggml_map_custom3_impl_f32(
7411
+ static struct ggml_tensor * ggml_map_custom3_impl_f32(
7732
7412
  struct ggml_context * ctx,
7733
7413
  struct ggml_tensor * a,
7734
7414
  struct ggml_tensor * b,
@@ -7741,21 +7421,15 @@ struct ggml_tensor * ggml_map_custom3_impl_f32(
7741
7421
  is_node = true;
7742
7422
  }
7743
7423
 
7744
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7745
-
7746
- ggml_scratch_save(ctx);
7747
-
7748
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7749
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7424
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7750
7425
 
7751
- ggml_scratch_load(ctx);
7426
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7752
7427
 
7753
7428
  result->op = GGML_OP_MAP_CUSTOM3;
7754
7429
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7755
7430
  result->src[0] = a;
7756
7431
  result->src[1] = b;
7757
- result->src[2] = addr_tensor;
7758
- result->src[3] = c;
7432
+ result->src[2] = c;
7759
7433
 
7760
7434
  return result;
7761
7435
  }
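The map_* builders used to smuggle the user callback through an I32 tensor sized sizeof(void *) / sizeof(int32_t); the pointer value is now memcpy'd straight into op_params. Reading it back also goes through memcpy, since casting the byte buffer to a function-pointer type would run afoul of aliasing rules. A sketch of both directions for the unary case (n, dst_row and src_row are placeholders for the real row pointers used by the kernel):

    /* store, at graph-build time */
    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));

    /* load, at compute time (as in the GGML_OP_MAP_UNARY dispatch further down) */
    ggml_unary_op_f32_t cb;
    memcpy(&cb, tensor->op_params, sizeof(cb));
    cb(n, dst_row, src_row);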
@@ -8983,21 +8657,17 @@ static void ggml_compute_forward_acc_f32(
8983
8657
  const struct ggml_compute_params * params,
8984
8658
  const struct ggml_tensor * src0,
8985
8659
  const struct ggml_tensor * src1,
8986
- const struct ggml_tensor * opt0,
8987
8660
  struct ggml_tensor * dst) {
8988
8661
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
8989
8662
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
8990
8663
 
8991
- GGML_ASSERT(opt0->type == GGML_TYPE_I32);
8992
- GGML_ASSERT(ggml_nelements(opt0) == 5);
8993
-
8994
8664
  // view src0 and dst with these strides and data offset in bytes during acc
8995
8665
  // nb0 is implicitly the element size because src0 and dst are contiguous
8996
- size_t nb1 = ((int32_t *) opt0->data)[0];
8997
- size_t nb2 = ((int32_t *) opt0->data)[1];
8998
- size_t nb3 = ((int32_t *) opt0->data)[2];
8999
- size_t offset = ((int32_t *) opt0->data)[3];
9000
- bool inplace = (bool) ((int32_t *) opt0->data)[4];
8666
+ size_t nb1 = ((int32_t *) dst->op_params)[0];
8667
+ size_t nb2 = ((int32_t *) dst->op_params)[1];
8668
+ size_t nb3 = ((int32_t *) dst->op_params)[2];
8669
+ size_t offset = ((int32_t *) dst->op_params)[3];
8670
+ bool inplace = (bool) ((int32_t *) dst->op_params)[4];
9001
8671
 
9002
8672
  if (!inplace && (params->type == GGML_TASK_INIT)) {
9003
8673
  // memcpy needs to be synchronized across threads to avoid race conditions.
@@ -9066,13 +8736,12 @@ static void ggml_compute_forward_acc(
9066
8736
  const struct ggml_compute_params * params,
9067
8737
  const struct ggml_tensor * src0,
9068
8738
  const struct ggml_tensor * src1,
9069
- const struct ggml_tensor * opt0,
9070
8739
  struct ggml_tensor * dst) {
9071
8740
 
9072
8741
  switch (src0->type) {
9073
8742
  case GGML_TYPE_F32:
9074
8743
  {
9075
- ggml_compute_forward_acc_f32(params, src0, src1, opt0, dst);
8744
+ ggml_compute_forward_acc_f32(params, src0, src1, dst);
9076
8745
  } break;
9077
8746
  case GGML_TYPE_F16:
9078
8747
  case GGML_TYPE_Q4_0:
@@ -9504,7 +9173,7 @@ static void ggml_compute_forward_sum_f32(
9504
9173
  for (int64_t i03 = 0; i03 < ne03; i03++) {
9505
9174
  for (int64_t i02 = 0; i02 < ne02; i02++) {
9506
9175
  for (int64_t i01 = 0; i01 < ne01; i01++) {
9507
- ggml_vec_sum_ggf(ne00,
9176
+ ggml_vec_sum_f32_ggf(ne00,
9508
9177
  &row_sum,
9509
9178
  (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
9510
9179
  sum += row_sum;
@@ -9514,6 +9183,38 @@ static void ggml_compute_forward_sum_f32(
9514
9183
  ((float *) dst->data)[0] = sum;
9515
9184
  }
9516
9185
 
9186
+ static void ggml_compute_forward_sum_f16(
9187
+ const struct ggml_compute_params * params,
9188
+ const struct ggml_tensor * src0,
9189
+ struct ggml_tensor * dst) {
9190
+ assert(params->ith == 0);
9191
+ assert(ggml_is_scalar(dst));
9192
+
9193
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9194
+ return;
9195
+ }
9196
+
9197
+ assert(src0->nb[0] == sizeof(ggml_fp16_t));
9198
+
9199
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
9200
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb);
9201
+
9202
+ float sum = 0;
9203
+ float row_sum = 0;
9204
+
9205
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
9206
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
9207
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
9208
+ ggml_vec_sum_f16_ggf(ne00,
9209
+ &row_sum,
9210
+ (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
9211
+ sum += row_sum;
9212
+ }
9213
+ }
9214
+ }
9215
+ ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
9216
+ }
9217
+
9517
9218
  static void ggml_compute_forward_sum(
9518
9219
  const struct ggml_compute_params * params,
9519
9220
  const struct ggml_tensor * src0,
@@ -9523,6 +9224,10 @@ static void ggml_compute_forward_sum(
9523
9224
  {
9524
9225
  ggml_compute_forward_sum_f32(params, src0, dst);
9525
9226
  } break;
9227
+ case GGML_TYPE_F16:
9228
+ {
9229
+ ggml_compute_forward_sum_f16(params, src0, dst);
9230
+ } break;
9526
9231
  default:
9527
9232
  {
9528
9233
  GGML_ASSERT(false);
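GGML_OP_SUM now also handles F16 inputs: ggml_compute_forward_sum_f16 above accumulates each row in float via ggml_vec_sum_f16_ggf and only narrows the final result back to half precision. A standalone sketch of the same accumulate-in-float pattern (illustration only, not a function from this file):

    static float sum_rows_f16(const ggml_fp16_t * x, int64_t n) {
        float acc = 0.0f;                    // accumulate in fp32 to limit rounding error
        for (int64_t i = 0; i < n; ++i) {
            acc += GGML_FP16_TO_FP32(x[i]);  // widen each half-precision element
        }
        return acc;                          // narrow with GGML_FP32_TO_FP16(acc) when storing
    }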
@@ -10118,8 +9823,8 @@ static void ggml_compute_forward_gelu_f32(
10118
9823
  const struct ggml_compute_params * params,
10119
9824
  const struct ggml_tensor * src0,
10120
9825
  struct ggml_tensor * dst) {
10121
- GGML_ASSERT(ggml_is_contiguous(src0));
10122
- GGML_ASSERT(ggml_is_contiguous(dst));
9826
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9827
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10123
9828
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10124
9829
 
10125
9830
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -10177,8 +9882,8 @@ static void ggml_compute_forward_gelu_quick_f32(
10177
9882
  const struct ggml_compute_params * params,
10178
9883
  const struct ggml_tensor * src0,
10179
9884
  struct ggml_tensor * dst) {
10180
- GGML_ASSERT(ggml_is_contiguous(src0));
10181
- GGML_ASSERT(ggml_is_contiguous(dst));
9885
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9886
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10182
9887
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10183
9888
 
10184
9889
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -10236,8 +9941,8 @@ static void ggml_compute_forward_silu_f32(
10236
9941
  const struct ggml_compute_params * params,
10237
9942
  const struct ggml_tensor * src0,
10238
9943
  struct ggml_tensor * dst) {
10239
- GGML_ASSERT(ggml_is_contiguous(src0));
10240
- GGML_ASSERT(ggml_is_contiguous(dst));
9944
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9945
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10241
9946
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10242
9947
 
10243
9948
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -10289,7 +9994,6 @@ static void ggml_compute_forward_silu(
10289
9994
  }
10290
9995
  }
10291
9996
 
10292
-
10293
9997
  // ggml_compute_forward_silu_back
10294
9998
 
10295
9999
  static void ggml_compute_forward_silu_back_f32(
@@ -10297,9 +10001,9 @@ static void ggml_compute_forward_silu_back_f32(
10297
10001
  const struct ggml_tensor * src0,
10298
10002
  const struct ggml_tensor * grad,
10299
10003
  struct ggml_tensor * dst) {
10300
- GGML_ASSERT(ggml_is_contiguous(grad));
10301
- GGML_ASSERT(ggml_is_contiguous(src0));
10302
- GGML_ASSERT(ggml_is_contiguous(dst));
10004
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
10005
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
10006
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10303
10007
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10304
10008
  GGML_ASSERT(ggml_are_same_shape(src0, grad));
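The gelu/gelu_quick/silu kernels above and silu_back here relax their layout check from full contiguity to ggml_is_contiguous_except_dim_1: each row must still be dense and the outer dimensions packed, but the stride between rows (dimension 1) may be padded. A sketch of what such a predicate checks, assuming the usual ne/nb fields of struct ggml_tensor (illustrative, not necessarily the exact helper in this file):

    static bool is_contiguous_except_dim_1(const struct ggml_tensor * t) {
        return
            t->nb[0] == ggml_type_size(t->type) &&  // elements within a row are dense
            t->nb[2] == t->nb[1]*t->ne[1]       &&  // dims 2 and 3 are packed on top of dim 1,
            t->nb[3] == t->nb[2]*t->ne[2];          // while nb[1] itself is left unconstrained
    }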
10305
10009
 
@@ -10439,7 +10143,8 @@ static void ggml_compute_forward_rms_norm_f32(
10439
10143
 
10440
10144
  GGML_TENSOR_UNARY_OP_LOCALS;
10441
10145
 
10442
- const float eps = 1e-6f; // TODO: make this a parameter
10146
+ float eps;
10147
+ memcpy(&eps, dst->op_params, sizeof(float));
10443
10148
 
10444
10149
  // TODO: optimize
10445
10150
  for (int64_t i03 = 0; i03 < ne03; i03++) {
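ggml_rms_norm's epsilon is no longer a hard-coded 1e-6f: the graph builder is expected to store it in op_params and the kernel recovers it with memcpy. memcpy rather than a pointer cast is used throughout this refactor whenever a float sits in the int32-aligned op_params area, to stay clear of strict-aliasing pitfalls; the alibi, clamp and rope kernels below read their float parameters the same way. A minimal sketch of the producer side (the eps value and its placement are illustrative):

    /* inside an rms_norm-style builder, after `result` has been created */
    const float eps = 1e-6f;                       /* value chosen by the caller */
    ggml_set_op_params(result, &eps, sizeof(eps)); /* the kernel reads it back with memcpy */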
@@ -11092,21 +10797,17 @@ static void ggml_compute_forward_set_f32(
11092
10797
  const struct ggml_compute_params * params,
11093
10798
  const struct ggml_tensor * src0,
11094
10799
  const struct ggml_tensor * src1,
11095
- const struct ggml_tensor * opt0,
11096
10800
  struct ggml_tensor * dst) {
11097
10801
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
11098
10802
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
11099
10803
 
11100
- GGML_ASSERT(opt0->type == GGML_TYPE_I32);
11101
- GGML_ASSERT(ggml_nelements(opt0) == 5);
11102
-
11103
10804
  // view src0 and dst with these strides and data offset in bytes during set
11104
10805
  // nb0 is implicitly the element size because src0 and dst are contiguous
11105
- size_t nb1 = ((int32_t *) opt0->data)[0];
11106
- size_t nb2 = ((int32_t *) opt0->data)[1];
11107
- size_t nb3 = ((int32_t *) opt0->data)[2];
11108
- size_t offset = ((int32_t *) opt0->data)[3];
11109
- bool inplace = (bool) ((int32_t *) opt0->data)[4];
10806
+ size_t nb1 = ((int32_t *) dst->op_params)[0];
10807
+ size_t nb2 = ((int32_t *) dst->op_params)[1];
10808
+ size_t nb3 = ((int32_t *) dst->op_params)[2];
10809
+ size_t offset = ((int32_t *) dst->op_params)[3];
10810
+ bool inplace = (bool) ((int32_t *) dst->op_params)[4];
11110
10811
 
11111
10812
  if (!inplace && (params->type == GGML_TASK_INIT)) {
11112
10813
  // memcpy needs to be synchronized across threads to avoid race conditions.
@@ -11166,13 +10867,12 @@ static void ggml_compute_forward_set(
11166
10867
  const struct ggml_compute_params * params,
11167
10868
  const struct ggml_tensor * src0,
11168
10869
  const struct ggml_tensor * src1,
11169
- const struct ggml_tensor * opt0,
11170
10870
  struct ggml_tensor * dst) {
11171
10871
 
11172
10872
  switch (src0->type) {
11173
10873
  case GGML_TYPE_F32:
11174
10874
  {
11175
- ggml_compute_forward_set_f32(params, src0, src1, opt0, dst);
10875
+ ggml_compute_forward_set_f32(params, src0, src1, dst);
11176
10876
  } break;
11177
10877
  case GGML_TYPE_F16:
11178
10878
  case GGML_TYPE_Q4_0:
@@ -11568,17 +11268,14 @@ static void ggml_compute_forward_diag(
11568
11268
  static void ggml_compute_forward_diag_mask_f32(
11569
11269
  const struct ggml_compute_params * params,
11570
11270
  const struct ggml_tensor * src0,
11571
- const struct ggml_tensor * src1,
11572
11271
  struct ggml_tensor * dst,
11573
11272
  const float value) {
11574
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
11575
- GGML_ASSERT(ggml_nelements(src1) == 2);
11576
11273
 
11577
11274
  const int ith = params->ith;
11578
11275
  const int nth = params->nth;
11579
11276
 
11580
- const int n_past = ((int32_t *) src1->data)[0];
11581
- const bool inplace = (bool)((int32_t *) src1->data)[1];
11277
+ const int n_past = ((int32_t *) dst->op_params)[0];
11278
+ const bool inplace = (bool)((int32_t *) dst->op_params)[1];
11582
11279
 
11583
11280
  GGML_ASSERT(n_past >= 0);
11584
11281
 
@@ -11621,12 +11318,11 @@ static void ggml_compute_forward_diag_mask_f32(
11621
11318
  static void ggml_compute_forward_diag_mask_inf(
11622
11319
  const struct ggml_compute_params * params,
11623
11320
  const struct ggml_tensor * src0,
11624
- const struct ggml_tensor * src1,
11625
11321
  struct ggml_tensor * dst) {
11626
11322
  switch (src0->type) {
11627
11323
  case GGML_TYPE_F32:
11628
11324
  {
11629
- ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, -INFINITY);
11325
+ ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY);
11630
11326
  } break;
11631
11327
  default:
11632
11328
  {
@@ -11638,12 +11334,11 @@ static void ggml_compute_forward_diag_mask_inf(
11638
11334
  static void ggml_compute_forward_diag_mask_zero(
11639
11335
  const struct ggml_compute_params * params,
11640
11336
  const struct ggml_tensor * src0,
11641
- const struct ggml_tensor * src1,
11642
11337
  struct ggml_tensor * dst) {
11643
11338
  switch (src0->type) {
11644
11339
  case GGML_TYPE_F32:
11645
11340
  {
11646
- ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, 0);
11341
+ ggml_compute_forward_diag_mask_f32(params, src0, dst, 0);
11647
11342
  } break;
11648
11343
  default:
11649
11344
  {
@@ -11841,20 +11536,17 @@ static void ggml_compute_forward_soft_max_back(
11841
11536
  static void ggml_compute_forward_alibi_f32(
11842
11537
  const struct ggml_compute_params * params,
11843
11538
  const struct ggml_tensor * src0,
11844
- const struct ggml_tensor * src1,
11845
11539
  struct ggml_tensor * dst) {
11846
11540
  assert(params->ith == 0);
11847
11541
 
11848
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
11849
- GGML_ASSERT(ggml_nelements(src1) == 3);
11850
-
11851
11542
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11852
11543
  return;
11853
11544
  }
11854
11545
 
11855
- const int n_past = ((int32_t *) src1->data)[0];
11856
- const int n_head = ((int32_t *) src1->data)[1];
11857
- const float max_bias = ((float *) src1->data)[2];
11546
+ const int n_past = ((int32_t *) dst->op_params)[0];
11547
+ const int n_head = ((int32_t *) dst->op_params)[1];
11548
+ float max_bias;
11549
+ memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
11858
11550
 
11859
11551
  assert(n_past >= 0);
11860
11552
 
@@ -11907,20 +11599,17 @@ static void ggml_compute_forward_alibi_f32(
11907
11599
  static void ggml_compute_forward_alibi_f16(
11908
11600
  const struct ggml_compute_params * params,
11909
11601
  const struct ggml_tensor * src0,
11910
- const struct ggml_tensor * src1,
11911
11602
  struct ggml_tensor * dst) {
11912
11603
  assert(params->ith == 0);
11913
11604
 
11914
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
11915
- GGML_ASSERT(ggml_nelements(src1) == 3);
11916
-
11917
11605
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11918
11606
  return;
11919
11607
  }
11920
11608
 
11921
- const int n_past = ((int32_t *) src1->data)[0];
11922
- const int n_head = ((int32_t *) src1->data)[1];
11923
- const float max_bias = ((float *) src1->data)[2];
11609
+ const int n_past = ((int32_t *) dst->op_params)[0];
11610
+ const int n_head = ((int32_t *) dst->op_params)[1];
11611
+ float max_bias;
11612
+ memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
11924
11613
 
11925
11614
  assert(n_past >= 0);
11926
11615
 
@@ -11973,16 +11662,15 @@ static void ggml_compute_forward_alibi_f16(
11973
11662
  static void ggml_compute_forward_alibi(
11974
11663
  const struct ggml_compute_params * params,
11975
11664
  const struct ggml_tensor * src0,
11976
- const struct ggml_tensor * src1,
11977
11665
  struct ggml_tensor * dst) {
11978
11666
  switch (src0->type) {
11979
11667
  case GGML_TYPE_F16:
11980
11668
  {
11981
- ggml_compute_forward_alibi_f16(params, src0, src1, dst);
11669
+ ggml_compute_forward_alibi_f16(params, src0, dst);
11982
11670
  } break;
11983
11671
  case GGML_TYPE_F32:
11984
11672
  {
11985
- ggml_compute_forward_alibi_f32(params, src0, src1, dst);
11673
+ ggml_compute_forward_alibi_f32(params, src0, dst);
11986
11674
  } break;
11987
11675
  case GGML_TYPE_Q4_0:
11988
11676
  case GGML_TYPE_Q4_1:
@@ -12012,19 +11700,17 @@ static void ggml_compute_forward_alibi(
12012
11700
  static void ggml_compute_forward_clamp_f32(
12013
11701
  const struct ggml_compute_params * params,
12014
11702
  const struct ggml_tensor * src0,
12015
- const struct ggml_tensor * src1,
12016
11703
  struct ggml_tensor * dst) {
12017
11704
  assert(params->ith == 0);
12018
11705
 
12019
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
12020
- GGML_ASSERT(ggml_nelements(src1) == 2);
12021
-
12022
11706
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12023
11707
  return;
12024
11708
  }
12025
11709
 
12026
- const float min = ((float *) src1->data)[0];
12027
- const float max = ((float *) src1->data)[1];
11710
+ float min;
11711
+ float max;
11712
+ memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
11713
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
12028
11714
 
12029
11715
  const int ith = params->ith;
12030
11716
  const int nth = params->nth;
@@ -12054,12 +11740,11 @@ static void ggml_compute_forward_clamp_f32(
12054
11740
  static void ggml_compute_forward_clamp(
12055
11741
  const struct ggml_compute_params * params,
12056
11742
  const struct ggml_tensor * src0,
12057
- const struct ggml_tensor * src1,
12058
11743
  struct ggml_tensor * dst) {
12059
11744
  switch (src0->type) {
12060
11745
  case GGML_TYPE_F32:
12061
11746
  {
12062
- ggml_compute_forward_clamp_f32(params, src0, src1, dst);
11747
+ ggml_compute_forward_clamp_f32(params, src0, dst);
12063
11748
  } break;
12064
11749
  case GGML_TYPE_F16:
12065
11750
  case GGML_TYPE_Q4_0:
@@ -12089,10 +11774,7 @@ static void ggml_compute_forward_clamp(
12089
11774
  static void ggml_compute_forward_rope_f32(
12090
11775
  const struct ggml_compute_params * params,
12091
11776
  const struct ggml_tensor * src0,
12092
- const struct ggml_tensor * src1,
12093
11777
  struct ggml_tensor * dst) {
12094
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
12095
- GGML_ASSERT(ggml_nelements(src1) == 6);
12096
11778
 
12097
11779
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12098
11780
  return;
@@ -12101,12 +11783,12 @@ static void ggml_compute_forward_rope_f32(
12101
11783
  float freq_base;
12102
11784
  float freq_scale;
12103
11785
 
12104
- const int n_past = ((int32_t *) src1->data)[0];
12105
- const int n_dims = ((int32_t *) src1->data)[1];
12106
- const int mode = ((int32_t *) src1->data)[2];
12107
- const int n_ctx = ((int32_t *) src1->data)[3];
12108
- memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
12109
- memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
11786
+ const int n_past = ((int32_t *) dst->op_params)[0];
11787
+ const int n_dims = ((int32_t *) dst->op_params)[1];
11788
+ const int mode = ((int32_t *) dst->op_params)[2];
11789
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
11790
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
11791
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
12110
11792
 
12111
11793
  assert(n_past >= 0);
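The rope kernels read six values from one op_params buffer: four int32 fields followed by two floats that are fetched with memcpy at 32-bit offsets 4 and 5, so builder and kernel have to agree on this implicit layout. Purely as an illustration of that layout (ggml packs the values individually, not through a struct), it corresponds to:

    struct rope_op_params {  /* hypothetical view of dst->op_params for GGML_OP_ROPE */
        int32_t n_past;      /* [0] */
        int32_t n_dims;      /* [1] */
        int32_t mode;        /* [2] */
        int32_t n_ctx;       /* [3] */
        float   freq_base;   /* [4] read via memcpy(&freq_base,  (int32_t *) op_params + 4, sizeof(float)) */
        float   freq_scale;  /* [5] read via memcpy(&freq_scale, (int32_t *) op_params + 5, sizeof(float)) */
    };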
12112
11794
 
@@ -12221,10 +11903,7 @@ static void ggml_compute_forward_rope_f32(
12221
11903
  static void ggml_compute_forward_rope_f16(
12222
11904
  const struct ggml_compute_params * params,
12223
11905
  const struct ggml_tensor * src0,
12224
- const struct ggml_tensor * src1,
12225
11906
  struct ggml_tensor * dst) {
12226
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
12227
- GGML_ASSERT(ggml_nelements(src1) == 6);
12228
11907
 
12229
11908
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12230
11909
  return;
@@ -12233,12 +11912,12 @@ static void ggml_compute_forward_rope_f16(
12233
11912
  float freq_base;
12234
11913
  float freq_scale;
12235
11914
 
12236
- const int n_past = ((int32_t *) src1->data)[0];
12237
- const int n_dims = ((int32_t *) src1->data)[1];
12238
- const int mode = ((int32_t *) src1->data)[2];
12239
- const int n_ctx = ((int32_t *) src1->data)[3];
12240
- memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
12241
- memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
11915
+ const int n_past = ((int32_t *) dst->op_params)[0];
11916
+ const int n_dims = ((int32_t *) dst->op_params)[1];
11917
+ const int mode = ((int32_t *) dst->op_params)[2];
11918
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
11919
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
11920
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
12242
11921
 
12243
11922
  assert(n_past >= 0);
12244
11923
 
@@ -12353,16 +12032,15 @@ static void ggml_compute_forward_rope_f16(
12353
12032
  static void ggml_compute_forward_rope(
12354
12033
  const struct ggml_compute_params * params,
12355
12034
  const struct ggml_tensor * src0,
12356
- const struct ggml_tensor * src1,
12357
12035
  struct ggml_tensor * dst) {
12358
12036
  switch (src0->type) {
12359
12037
  case GGML_TYPE_F16:
12360
12038
  {
12361
- ggml_compute_forward_rope_f16(params, src0, src1, dst);
12039
+ ggml_compute_forward_rope_f16(params, src0, dst);
12362
12040
  } break;
12363
12041
  case GGML_TYPE_F32:
12364
12042
  {
12365
- ggml_compute_forward_rope_f32(params, src0, src1, dst);
12043
+ ggml_compute_forward_rope_f32(params, src0, dst);
12366
12044
  } break;
12367
12045
  default:
12368
12046
  {
@@ -12376,10 +12054,7 @@ static void ggml_compute_forward_rope(
12376
12054
  static void ggml_compute_forward_rope_back_f32(
12377
12055
  const struct ggml_compute_params * params,
12378
12056
  const struct ggml_tensor * src0,
12379
- const struct ggml_tensor * src1,
12380
12057
  struct ggml_tensor * dst) {
12381
- assert(src1->type == GGML_TYPE_I32);
12382
- assert(ggml_nelements(src1) == 4);
12383
12058
 
12384
12059
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12385
12060
  return;
@@ -12389,9 +12064,9 @@ static void ggml_compute_forward_rope_back_f32(
12389
12064
  // dx = rope_back(dy)
12390
12065
  // src0 is dy; the rope parameters are read from dst->op_params
12391
12066
 
12392
- const int n_past = ((int32_t *) src1->data)[0];
12393
- const int n_dims = ((int32_t *) src1->data)[1];
12394
- const int mode = ((int32_t *) src1->data)[2];
12067
+ const int n_past = ((int32_t *) dst->op_params)[0];
12068
+ const int n_dims = ((int32_t *) dst->op_params)[1];
12069
+ const int mode = ((int32_t *) dst->op_params)[2];
12395
12070
 
12396
12071
  assert(n_past >= 0);
12397
12072
 
@@ -12475,10 +12150,7 @@ static void ggml_compute_forward_rope_back_f32(
12475
12150
  static void ggml_compute_forward_rope_back_f16(
12476
12151
  const struct ggml_compute_params * params,
12477
12152
  const struct ggml_tensor * src0,
12478
- const struct ggml_tensor * src1,
12479
12153
  struct ggml_tensor * dst) {
12480
- assert(src1->type == GGML_TYPE_I32);
12481
- assert(ggml_nelements(src1) == 3);
12482
12154
 
12483
12155
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12484
12156
  return;
@@ -12488,9 +12160,9 @@ static void ggml_compute_forward_rope_back_f16(
12488
12160
  // dx = rope_back(dy)
12489
12161
  // src0 is dy; the rope parameters are read from dst->op_params
12490
12162
 
12491
- const int n_past = ((int32_t *) src1->data)[0];
12492
- const int n_dims = ((int32_t *) src1->data)[1];
12493
- const int mode = ((int32_t *) src1->data)[2];
12163
+ const int n_past = ((int32_t *) dst->op_params)[0];
12164
+ const int n_dims = ((int32_t *) dst->op_params)[1];
12165
+ const int mode = ((int32_t *) dst->op_params)[2];
12494
12166
 
12495
12167
  assert(n_past >= 0);
12496
12168
 
@@ -12574,16 +12246,15 @@ static void ggml_compute_forward_rope_back_f16(
12574
12246
  static void ggml_compute_forward_rope_back(
12575
12247
  const struct ggml_compute_params * params,
12576
12248
  const struct ggml_tensor * src0,
12577
- const struct ggml_tensor * src1,
12578
12249
  struct ggml_tensor * dst) {
12579
12250
  switch (src0->type) {
12580
12251
  case GGML_TYPE_F16:
12581
12252
  {
12582
- ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
12253
+ ggml_compute_forward_rope_back_f16(params, src0, dst);
12583
12254
  } break;
12584
12255
  case GGML_TYPE_F32:
12585
12256
  {
12586
- ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
12257
+ ggml_compute_forward_rope_back_f32(params, src0, dst);
12587
12258
  } break;
12588
12259
  default:
12589
12260
  {
@@ -12780,7 +12451,7 @@ static void ggml_compute_forward_conv_1d_s1_ph(
12780
12451
  const struct ggml_compute_params * params,
12781
12452
  const struct ggml_tensor * src0,
12782
12453
  const struct ggml_tensor * src1,
12783
- struct ggml_tensor * dst) {
12454
+ struct ggml_tensor * dst) {
12784
12455
  switch (src0->type) {
12785
12456
  case GGML_TYPE_F16:
12786
12457
  {
@@ -12983,7 +12654,7 @@ static void ggml_compute_forward_conv_1d_s2_ph(
12983
12654
  const struct ggml_compute_params * params,
12984
12655
  const struct ggml_tensor * src0,
12985
12656
  const struct ggml_tensor * src1,
12986
- struct ggml_tensor * dst) {
12657
+ struct ggml_tensor * dst) {
12987
12658
  switch (src0->type) {
12988
12659
  case GGML_TYPE_F16:
12989
12660
  {
@@ -13003,14 +12674,13 @@ static void ggml_compute_forward_conv_1d_s2_ph(
13003
12674
  // ggml_compute_forward_conv_1d
13004
12675
 
13005
12676
  static void ggml_compute_forward_conv_1d(
13006
- const struct ggml_compute_params * params,
13007
- const struct ggml_tensor * src0,
13008
- const struct ggml_tensor * src1,
13009
- const struct ggml_tensor * opt0,
13010
- struct ggml_tensor * dst) {
13011
- const int32_t s0 = ((const int32_t*)(opt0->data))[0];
13012
- const int32_t p0 = ((const int32_t*)(opt0->data))[1];
13013
- const int32_t d0 = ((const int32_t*)(opt0->data))[2];
12677
+ const struct ggml_compute_params * params,
12678
+ const struct ggml_tensor * src0,
12679
+ const struct ggml_tensor * src1,
12680
+ struct ggml_tensor * dst) {
12681
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12682
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
12683
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
13014
12684
  GGML_ASSERT(d0 == 1); // dilation not supported
13015
12685
  GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported
13016
12686
  if (s0 == 1) {
@@ -13028,7 +12698,6 @@ static void ggml_compute_forward_conv_2d_f16_f32(
13028
12698
  const struct ggml_compute_params * params,
13029
12699
  const struct ggml_tensor * src0,
13030
12700
  const struct ggml_tensor * src1,
13031
- const struct ggml_tensor * opt0,
13032
12701
  struct ggml_tensor * dst) {
13033
12702
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
13034
12703
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -13048,12 +12717,12 @@ static void ggml_compute_forward_conv_2d_f16_f32(
13048
12717
  // size of the convolution row - the kernel size unrolled across all channels
13049
12718
  const int ew0 = nk0*nk1*ne02;
13050
12719
 
13051
- const int32_t s0 = ((const int32_t*)(opt0->data))[0];
13052
- const int32_t s1 = ((const int32_t*)(opt0->data))[1];
13053
- const int32_t p0 = ((const int32_t*)(opt0->data))[2];
13054
- const int32_t p1 = ((const int32_t*)(opt0->data))[3];
13055
- const int32_t d0 = ((const int32_t*)(opt0->data))[4];
13056
- const int32_t d1 = ((const int32_t*)(opt0->data))[5];
12720
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12721
+ const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
12722
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
12723
+ const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
12724
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
12725
+ const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
13057
12726
 
13058
12727
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13059
12728
  GGML_ASSERT(nb10 == sizeof(float));
@@ -13125,17 +12794,15 @@ static void ggml_compute_forward_conv_2d(
13125
12794
  const struct ggml_compute_params * params,
13126
12795
  const struct ggml_tensor * src0,
13127
12796
  const struct ggml_tensor * src1,
13128
- const struct ggml_tensor * opt0,
13129
- struct ggml_tensor * dst
13130
- ) {
12797
+ struct ggml_tensor * dst) {
13131
12798
  switch (src0->type) {
13132
12799
  case GGML_TYPE_F16:
13133
12800
  {
13134
- ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst);
12801
+ ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst);
13135
12802
  } break;
13136
12803
  case GGML_TYPE_F32:
13137
12804
  {
13138
- //ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst);
12805
+ //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
13139
12806
  GGML_ASSERT(false);
13140
12807
  } break;
13141
12808
  default:
@@ -13200,12 +12867,11 @@ static void ggml_compute_forward_pool_1d_sk_p0(
13200
12867
  // ggml_compute_forward_pool_1d
13201
12868
 
13202
12869
  static void ggml_compute_forward_pool_1d(
13203
- const struct ggml_compute_params* params,
13204
- const struct ggml_tensor* src0,
13205
- const struct ggml_tensor* opt0,
13206
- struct ggml_tensor* dst) {
13207
- GGML_ASSERT(opt0->ne[0] == 4);
13208
- const int* opts = (const int*)opt0->data;
12870
+ const struct ggml_compute_params * params,
12871
+ const struct ggml_tensor * src0,
12872
+ struct ggml_tensor * dst) {
12873
+
12874
+ const int32_t* opts = (const int32_t*)dst->op_params;
13209
12875
  enum ggml_op_pool op = opts[0];
13210
12876
  const int k0 = opts[1];
13211
12877
  const int s0 = opts[2];
@@ -13219,12 +12885,12 @@ static void ggml_compute_forward_pool_1d(
13219
12885
  // ggml_compute_forward_pool_2d_sk_p0
13220
12886
 
13221
12887
  static void ggml_compute_forward_pool_2d_sk_p0(
13222
- const struct ggml_compute_params * params,
13223
- const enum ggml_op_pool op,
13224
- const struct ggml_tensor * src,
13225
- const int k0,
13226
- const int k1,
13227
- struct ggml_tensor * dst) {
12888
+ const struct ggml_compute_params * params,
12889
+ const enum ggml_op_pool op,
12890
+ const struct ggml_tensor * src,
12891
+ const int k0,
12892
+ const int k1,
12893
+ struct ggml_tensor * dst) {
13228
12894
  assert(src->type == GGML_TYPE_F32);
13229
12895
  assert(params->ith == 0);
13230
12896
 
@@ -13284,12 +12950,11 @@ static void ggml_compute_forward_pool_2d_sk_p0(
13284
12950
  // ggml_compute_forward_pool_2d
13285
12951
 
13286
12952
  static void ggml_compute_forward_pool_2d(
13287
- const struct ggml_compute_params * params,
13288
- const struct ggml_tensor * src0,
13289
- const struct ggml_tensor * opt0,
13290
- struct ggml_tensor * dst) {
13291
- GGML_ASSERT(opt0->ne[0] == 7);
13292
- const int* opts = (const int*)opt0->data;
12953
+ const struct ggml_compute_params * params,
12954
+ const struct ggml_tensor * src0,
12955
+ struct ggml_tensor * dst) {
12956
+
12957
+ const int32_t * opts = (const int32_t *)dst->op_params;
13293
12958
  enum ggml_op_pool op = opts[0];
13294
12959
  const int k0 = opts[1];
13295
12960
  const int k1 = opts[2];
@@ -13314,7 +12979,7 @@ static void ggml_compute_forward_flash_attn_f32(
13314
12979
  const struct ggml_tensor * k,
13315
12980
  const struct ggml_tensor * v,
13316
12981
  const bool masked,
13317
- struct ggml_tensor * dst) {
12982
+ struct ggml_tensor * dst) {
13318
12983
  int64_t t0 = ggml_perf_time_us();
13319
12984
  UNUSED(t0);
13320
12985
 
@@ -13492,7 +13157,7 @@ static void ggml_compute_forward_flash_attn_f16(
13492
13157
  const struct ggml_tensor * k,
13493
13158
  const struct ggml_tensor * v,
13494
13159
  const bool masked,
13495
- struct ggml_tensor * dst) {
13160
+ struct ggml_tensor * dst) {
13496
13161
  int64_t t0 = ggml_perf_time_us();
13497
13162
  UNUSED(t0);
13498
13163
 
@@ -14257,7 +13922,6 @@ static void ggml_compute_forward_flash_attn_back(
14257
13922
  static void ggml_compute_forward_win_part_f32(
14258
13923
  const struct ggml_compute_params * params,
14259
13924
  const struct ggml_tensor * src0,
14260
- const struct ggml_tensor * opt0,
14261
13925
  struct ggml_tensor * dst) {
14262
13926
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14263
13927
  return;
@@ -14266,9 +13930,9 @@ static void ggml_compute_forward_win_part_f32(
14266
13930
  GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
14267
13931
  GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
14268
13932
 
14269
- const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
14270
- const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
14271
- const int32_t w = ((const int32_t *)(opt0->data))[2];
13933
+ const int32_t nep0 = ((const int32_t *)(dst->op_params))[0];
13934
+ const int32_t nep1 = ((const int32_t *)(dst->op_params))[1];
13935
+ const int32_t w = ((const int32_t *)(dst->op_params))[2];
14272
13936
 
14273
13937
  assert(ne00 == ne0);
14274
13938
  assert(ne3 == nep0*nep1);
@@ -14302,12 +13966,11 @@ static void ggml_compute_forward_win_part_f32(
14302
13966
  static void ggml_compute_forward_win_part(
14303
13967
  const struct ggml_compute_params * params,
14304
13968
  const struct ggml_tensor * src0,
14305
- const struct ggml_tensor * opt0,
14306
13969
  struct ggml_tensor * dst) {
14307
13970
  switch (src0->type) {
14308
13971
  case GGML_TYPE_F32:
14309
13972
  {
14310
- ggml_compute_forward_win_part_f32(params, src0, opt0, dst);
13973
+ ggml_compute_forward_win_part_f32(params, src0, dst);
14311
13974
  } break;
14312
13975
  default:
14313
13976
  {
@@ -14321,7 +13984,6 @@ static void ggml_compute_forward_win_part(
14321
13984
  static void ggml_compute_forward_win_unpart_f32(
14322
13985
  const struct ggml_compute_params * params,
14323
13986
  const struct ggml_tensor * src0,
14324
- const struct ggml_tensor * opt0,
14325
13987
  struct ggml_tensor * dst) {
14326
13988
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14327
13989
  return;
@@ -14330,7 +13992,7 @@ static void ggml_compute_forward_win_unpart_f32(
14330
13992
  GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
14331
13993
  GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
14332
13994
 
14333
- const int32_t w = ((const int32_t *)(opt0->data))[0];
13995
+ const int32_t w = ((const int32_t *)(dst->op_params))[0];
14334
13996
 
14335
13997
  // padding
14336
13998
  const int px = (w - ne1%w)%w;
@@ -14364,12 +14026,67 @@ static void ggml_compute_forward_win_unpart_f32(
14364
14026
  static void ggml_compute_forward_win_unpart(
14365
14027
  const struct ggml_compute_params * params,
14366
14028
  const struct ggml_tensor * src0,
14367
- const struct ggml_tensor * opt0,
14368
14029
  struct ggml_tensor * dst) {
14369
14030
  switch (src0->type) {
14370
14031
  case GGML_TYPE_F32:
14371
14032
  {
14372
- ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst);
14033
+ ggml_compute_forward_win_unpart_f32(params, src0, dst);
14034
+ } break;
14035
+ default:
14036
+ {
14037
+ GGML_ASSERT(false);
14038
+ } break;
14039
+ }
14040
+ }
14041
+
14042
+ // ggml_compute_forward_unary
14043
+
14044
+ static void ggml_compute_forward_unary(
14045
+ const struct ggml_compute_params * params,
14046
+ const struct ggml_tensor * src0,
14047
+ struct ggml_tensor * dst) {
14048
+ const enum ggml_unary_op op = ggml_get_unary_op(dst);
14049
+
14050
+ switch (op) {
14051
+ case GGML_UNARY_OP_ABS:
14052
+ {
14053
+ ggml_compute_forward_abs(params, src0, dst);
14054
+ } break;
14055
+ case GGML_UNARY_OP_SGN:
14056
+ {
14057
+ ggml_compute_forward_sgn(params, src0, dst);
14058
+ } break;
14059
+ case GGML_UNARY_OP_NEG:
14060
+ {
14061
+ ggml_compute_forward_neg(params, src0, dst);
14062
+ } break;
14063
+ case GGML_UNARY_OP_STEP:
14064
+ {
14065
+ ggml_compute_forward_step(params, src0, dst);
14066
+ } break;
14067
+ case GGML_UNARY_OP_TANH:
14068
+ {
14069
+ ggml_compute_forward_tanh(params, src0, dst);
14070
+ } break;
14071
+ case GGML_UNARY_OP_ELU:
14072
+ {
14073
+ ggml_compute_forward_elu(params, src0, dst);
14074
+ } break;
14075
+ case GGML_UNARY_OP_RELU:
14076
+ {
14077
+ ggml_compute_forward_relu(params, src0, dst);
14078
+ } break;
14079
+ case GGML_UNARY_OP_GELU:
14080
+ {
14081
+ ggml_compute_forward_gelu(params, src0, dst);
14082
+ } break;
14083
+ case GGML_UNARY_OP_GELU_QUICK:
14084
+ {
14085
+ ggml_compute_forward_gelu_quick(params, src0, dst);
14086
+ } break;
14087
+ case GGML_UNARY_OP_SILU:
14088
+ {
14089
+ ggml_compute_forward_silu(params, src0, dst);
14373
14090
  } break;
14374
14091
  default:
14375
14092
  {
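ggml_compute_forward_unary recovers the concrete operation from the tensor itself and then dispatches to the existing per-op kernels, which is why the dedicated GGML_OP_ABS/.../GGML_OP_SILU cases disappear from the big switch further down. A sketch of the accessor, assuming the unary op id sits in the first 32-bit op_params slot (as written by ggml_unary_impl above):

    enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
        GGML_ASSERT(tensor->op == GGML_OP_UNARY);
        return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
    }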
@@ -14888,7 +14605,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14888
14605
  } break;
14889
14606
  case GGML_OP_ACC:
14890
14607
  {
14891
- ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14608
+ ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor);
14892
14609
  } break;
14893
14610
  case GGML_OP_SUB:
14894
14611
  {
@@ -14938,46 +14655,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14938
14655
  {
14939
14656
  ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
14940
14657
  } break;
14941
- case GGML_OP_ABS:
14942
- {
14943
- ggml_compute_forward_abs(params, tensor->src[0], tensor);
14944
- } break;
14945
- case GGML_OP_SGN:
14946
- {
14947
- ggml_compute_forward_sgn(params, tensor->src[0], tensor);
14948
- } break;
14949
- case GGML_OP_NEG:
14950
- {
14951
- ggml_compute_forward_neg(params, tensor->src[0], tensor);
14952
- } break;
14953
- case GGML_OP_STEP:
14954
- {
14955
- ggml_compute_forward_step(params, tensor->src[0], tensor);
14956
- } break;
14957
- case GGML_OP_TANH:
14958
- {
14959
- ggml_compute_forward_tanh(params, tensor->src[0], tensor);
14960
- } break;
14961
- case GGML_OP_ELU:
14962
- {
14963
- ggml_compute_forward_elu(params, tensor->src[0], tensor);
14964
- } break;
14965
- case GGML_OP_RELU:
14966
- {
14967
- ggml_compute_forward_relu(params, tensor->src[0], tensor);
14968
- } break;
14969
- case GGML_OP_GELU:
14970
- {
14971
- ggml_compute_forward_gelu(params, tensor->src[0], tensor);
14972
- } break;
14973
- case GGML_OP_GELU_QUICK:
14974
- {
14975
- ggml_compute_forward_gelu_quick(params, tensor->src[0], tensor);
14976
- } break;
14977
- case GGML_OP_SILU:
14978
- {
14979
- ggml_compute_forward_silu(params, tensor->src[0], tensor);
14980
- } break;
14981
14658
  case GGML_OP_SILU_BACK:
14982
14659
  {
14983
14660
  ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
@@ -15008,7 +14685,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15008
14685
  } break;
15009
14686
  case GGML_OP_SET:
15010
14687
  {
15011
- ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14688
+ ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor);
15012
14689
  } break;
15013
14690
  case GGML_OP_CPY:
15014
14691
  {
@@ -15048,11 +14725,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15048
14725
  } break;
15049
14726
  case GGML_OP_DIAG_MASK_INF:
15050
14727
  {
15051
- ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor->src[1], tensor);
14728
+ ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor);
15052
14729
  } break;
15053
14730
  case GGML_OP_DIAG_MASK_ZERO:
15054
14731
  {
15055
- ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor->src[1], tensor);
14732
+ ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor);
15056
14733
  } break;
15057
14734
  case GGML_OP_SOFT_MAX:
15058
14735
  {
@@ -15064,39 +14741,39 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15064
14741
  } break;
15065
14742
  case GGML_OP_ROPE:
15066
14743
  {
15067
- ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor);
14744
+ ggml_compute_forward_rope(params, tensor->src[0], tensor);
15068
14745
  } break;
15069
14746
  case GGML_OP_ROPE_BACK:
15070
14747
  {
15071
- ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor);
14748
+ ggml_compute_forward_rope_back(params, tensor->src[0], tensor);
15072
14749
  } break;
15073
14750
  case GGML_OP_ALIBI:
15074
14751
  {
15075
- ggml_compute_forward_alibi(params, tensor->src[0], tensor->src[1], tensor);
14752
+ ggml_compute_forward_alibi(params, tensor->src[0], tensor);
15076
14753
  } break;
15077
14754
  case GGML_OP_CLAMP:
15078
14755
  {
15079
- ggml_compute_forward_clamp(params, tensor->src[0], tensor->src[1], tensor);
14756
+ ggml_compute_forward_clamp(params, tensor->src[0], tensor);
15080
14757
  } break;
15081
14758
  case GGML_OP_CONV_1D:
15082
14759
  {
15083
- ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14760
+ ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
15084
14761
  } break;
15085
14762
  case GGML_OP_CONV_2D:
15086
14763
  {
15087
- ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14764
+ ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
15088
14765
  } break;
15089
14766
  case GGML_OP_POOL_1D:
15090
14767
  {
15091
- ggml_compute_forward_pool_1d(params, tensor->src[0], tensor->src[1], tensor);
14768
+ ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
15092
14769
  } break;
15093
14770
  case GGML_OP_POOL_2D:
15094
14771
  {
15095
- ggml_compute_forward_pool_2d(params, tensor->src[0], tensor->src[1], tensor);
14772
+ ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
15096
14773
  } break;
15097
14774
  case GGML_OP_FLASH_ATTN:
15098
14775
  {
15099
- const int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
14776
+ const int32_t t = ggml_get_op_params_i32(tensor, 0);
15100
14777
  GGML_ASSERT(t == 0 || t == 1);
15101
14778
  const bool masked = t != 0;
15102
14779
  ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
@@ -15107,47 +14784,56 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15107
14784
  } break;
15108
14785
  case GGML_OP_FLASH_ATTN_BACK:
15109
14786
  {
15110
- int32_t t = ggml_get_i32_1d(tensor->src[4], 0);
14787
+ int32_t t = ggml_get_op_params_i32(tensor, 0);
15111
14788
  GGML_ASSERT(t == 0 || t == 1);
15112
14789
  bool masked = t != 0;
15113
14790
  ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
15114
14791
  } break;
15115
14792
  case GGML_OP_WIN_PART:
15116
14793
  {
15117
- ggml_compute_forward_win_part(params, tensor->src[0], tensor->src[2], tensor);
14794
+ ggml_compute_forward_win_part(params, tensor->src[0], tensor);
15118
14795
  } break;
15119
14796
  case GGML_OP_WIN_UNPART:
15120
14797
  {
15121
- ggml_compute_forward_win_unpart(params, tensor->src[0], tensor->src[2], tensor);
14798
+ ggml_compute_forward_win_unpart(params, tensor->src[0], tensor);
14799
+ } break;
14800
+ case GGML_OP_UNARY:
14801
+ {
14802
+ ggml_compute_forward_unary(params, tensor->src[0], tensor);
15122
14803
  } break;
15123
14804
  case GGML_OP_MAP_UNARY:
15124
14805
  {
15125
- const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->src[2]->data);
14806
+ ggml_unary_op_f32_t fun;
14807
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15126
14808
  ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun);
15127
14809
  }
15128
14810
  break;
15129
14811
  case GGML_OP_MAP_BINARY:
15130
14812
  {
15131
- const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->src[2]->data);
14813
+ ggml_binary_op_f32_t fun;
14814
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15132
14815
  ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
15133
14816
  }
15134
14817
  break;
15135
14818
  case GGML_OP_MAP_CUSTOM1:
15136
14819
  {
15137
- const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->src[2]->data);
14820
+ ggml_custom1_op_f32_t fun;
14821
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15138
14822
  ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun);
15139
14823
  }
15140
14824
  break;
15141
14825
  case GGML_OP_MAP_CUSTOM2:
15142
14826
  {
15143
- const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->src[2]->data);
14827
+ ggml_custom2_op_f32_t fun;
14828
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15144
14829
  ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun);
15145
14830
  }
15146
14831
  break;
15147
14832
  case GGML_OP_MAP_CUSTOM3:
15148
14833
  {
15149
- const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->src[2]->data);
15150
- ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[3], tensor, fun);
14834
+ ggml_custom3_op_f32_t fun;
14835
+ memcpy(&fun, tensor->op_params, sizeof(fun));
14836
+ ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
15151
14837
  }
15152
14838
  break;
15153
14839
  case GGML_OP_CROSS_ENTROPY_LOSS:
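The MAP_* cases above now rebuild the user callback from op_params with memcpy before calling the generic forward helpers; from the caller's side nothing changes and custom element-wise ops are still registered through ggml_map_unary_f32 and friends. A hedged usage sketch (the squaring callback and the tensor a are illustrative):

    static void square_f32(const int n, float * dst, const float * src) {
        for (int i = 0; i < n; ++i) {
            dst[i] = src[i] * src[i];  /* user-defined element-wise operation */
        }
    }

    /* given a struct ggml_context * ctx and an F32 tensor a: */
    /* struct ggml_tensor * sq = ggml_map_unary_f32(ctx, a, square_f32); */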
@@ -15211,12 +14897,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15211
14897
  src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
15212
14898
  }
15213
14899
  if (src1->grad) {
15214
- GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5);
15215
- GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32);
15216
- const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0];
15217
- const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1];
15218
- const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2];
15219
- const size_t offset = (( int32_t * ) tensor->src[2]->data)[3];
14900
+ const size_t nb1 = ((int32_t *) tensor->op_params)[0];
14901
+ const size_t nb2 = ((int32_t *) tensor->op_params)[1];
14902
+ const size_t nb3 = ((int32_t *) tensor->op_params)[2];
14903
+ const size_t offset = ((int32_t *) tensor->op_params)[3];
15220
14904
 
15221
14905
  struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
15222
14906
  tensor->grad,
@@ -15365,73 +15049,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15365
15049
  inplace);
15366
15050
  }
15367
15051
  } break;
15368
- case GGML_OP_ABS:
15369
- {
15370
- if (src0->grad) {
15371
- src0->grad =
15372
- ggml_add_impl(ctx,
15373
- src0->grad,
15374
- ggml_mul(ctx,
15375
- ggml_sgn(ctx, src0),
15376
- tensor->grad),
15377
- inplace);
15378
- }
15379
- } break;
15380
- case GGML_OP_SGN:
15381
- {
15382
- if (src0->grad) {
15383
- // noop
15384
- }
15385
- } break;
15386
- case GGML_OP_NEG:
15387
- {
15388
- if (src0->grad) {
15389
- src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace);
15390
- }
15391
- } break;
15392
- case GGML_OP_STEP:
15393
- {
15394
- if (src0->grad) {
15395
- // noop
15396
- }
15397
- } break;
15398
- case GGML_OP_TANH:
15399
- {
15400
- GGML_ASSERT(false); // TODO: not implemented
15401
- } break;
15402
- case GGML_OP_ELU:
15403
- {
15404
- GGML_ASSERT(false); // TODO: not implemented
15405
- } break;
15406
- case GGML_OP_RELU:
15407
- {
15408
- if (src0->grad) {
15409
- src0->grad = ggml_sub_impl(ctx,
15410
- src0->grad,
15411
- ggml_mul(ctx,
15412
- ggml_step(ctx, src0),
15413
- tensor->grad),
15414
- inplace);
15415
- }
15416
- } break;
15417
- case GGML_OP_GELU:
15418
- {
15419
- GGML_ASSERT(false); // TODO: not implemented
15420
- } break;
15421
- case GGML_OP_GELU_QUICK:
15422
- {
15423
- GGML_ASSERT(false); // TODO: not implemented
15424
- } break;
15425
- case GGML_OP_SILU:
15426
- {
15427
- // necessary for llama
15428
- if (src0->grad) {
15429
- src0->grad = ggml_add_impl(ctx,
15430
- src0->grad,
15431
- ggml_silu_back(ctx, src0, tensor->grad),
15432
- inplace);
15433
- }
15434
- } break;
15435
15052
  case GGML_OP_SILU_BACK:
15436
15053
  {
15437
15054
  GGML_ASSERT(false); // TODO: not implemented
@@ -15524,12 +15141,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15524
15141
  } break;
15525
15142
  case GGML_OP_SET:
15526
15143
  {
15527
- GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5);
15528
- GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32);
15529
- const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0];
15530
- const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1];
15531
- const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2];
15532
- const size_t offset = (( int32_t * ) tensor->src[2]->data)[3];
15144
+ const size_t nb1 = ((int32_t *) tensor->op_params)[0];
15145
+ const size_t nb2 = ((int32_t *) tensor->op_params)[1];
15146
+ const size_t nb3 = ((int32_t *) tensor->op_params)[2];
15147
+ const size_t offset = ((int32_t *) tensor->op_params)[3];
15533
15148
 
15534
15149
  struct ggml_tensor * tensor_grad_view = NULL;
15535
15150
 
@@ -15606,8 +15221,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15606
15221
  if (src0->grad) {
15607
15222
  size_t offset;
15608
15223
 
15609
- GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->src[2]));
15610
- memcpy(&offset, tensor->src[2]->data, sizeof(offset));
15224
+ memcpy(&offset, tensor->op_params, sizeof(offset));
15611
15225
 
15612
15226
  size_t nb1 = tensor->nb[1];
15613
15227
  size_t nb2 = tensor->nb[2];
@@ -15634,7 +15248,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15634
15248
  {
15635
15249
  // necessary for llama
15636
15250
  if (src0->grad) {
15637
- int32_t * axes = (int32_t *) tensor->src[2]->data;
15251
+ int32_t * axes = (int32_t *) tensor->op_params;
15638
15252
  int axis0 = axes[0] & 0x3;
15639
15253
  int axis1 = axes[1] & 0x3;
15640
15254
  int axis2 = axes[2] & 0x3;
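The SET, VIEW and PERMUTE backward hunks above all follow the same pattern: small integer parameters (strides, offsets, axes) are read from tensor->op_params instead of from a separate GGML_TYPE_I32 source tensor, which is why the nelements/type asserts disappear. A minimal sketch of reading the packed SET parameters back, assuming the four-int32 layout shown above; the helper name is ours:

    #include <stdint.h>
    #include "ggml.h"

    /* sketch: recover nb1/nb2/nb3/offset from the int32 view of op_params */
    static void read_set_params(const struct ggml_tensor * tensor,
                                size_t * nb1, size_t * nb2, size_t * nb3, size_t * offset) {
        const int32_t * p = (const int32_t *) tensor->op_params;
        *nb1    = (size_t) p[0];
        *nb2    = (size_t) p[1];
        *nb3    = (size_t) p[2];
        *offset = (size_t) p[3];
    }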
@@ -15690,33 +15304,23 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15690
15304
  {
15691
15305
  // necessary for llama
15692
15306
  if (src0->grad) {
15693
- assert(src1->type == GGML_TYPE_I32);
15694
- assert(ggml_nelements(src1) == 2);
15695
- const int n_past = ((int32_t *) src1->data)[0];
15307
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15696
15308
  src0->grad =
15697
15309
  ggml_add_impl(ctx, src0->grad,
15698
15310
  ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
15699
15311
  inplace);
15700
15312
  }
15701
- if (src1->grad) {
15702
- // noop
15703
- }
15704
15313
  } break;
15705
15314
  case GGML_OP_DIAG_MASK_ZERO:
15706
15315
  {
15707
15316
  // necessary for llama
15708
15317
  if (src0->grad) {
15709
- assert(src1->type == GGML_TYPE_I32);
15710
- assert(ggml_nelements(src1) == 2);
15711
- const int n_past = ((int32_t *) src1->data)[0];
15318
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15712
15319
  src0->grad =
15713
15320
  ggml_add_impl(ctx, src0->grad,
15714
15321
  ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
15715
15322
  inplace);
15716
15323
  }
15717
- if (src1->grad) {
15718
- // noop
15719
- }
15720
15324
  } break;
15721
15325
  case GGML_OP_SOFT_MAX:
15722
15326
  {
@@ -15737,12 +15341,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15737
15341
  {
15738
15342
  // necessary for llama
15739
15343
  if (src0->grad) {
15740
- assert(src1->type == GGML_TYPE_I32);
15741
- assert(ggml_nelements(src1) == 6);
15742
- const int n_past = ((int32_t *) src1->data)[0];
15743
- const int n_dims = ((int32_t *) src1->data)[1];
15744
- const int mode = ((int32_t *) src1->data)[2];
15745
- const int n_ctx = ((int32_t *) src1->data)[3];
15344
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15345
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
15346
+ const int mode = ((int32_t *) tensor->op_params)[2];
15347
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
15746
15348
  src0->grad = ggml_add_impl(ctx,
15747
15349
  src0->grad,
15748
15350
  ggml_rope_back(ctx,
@@ -15753,19 +15355,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15753
15355
  n_ctx),
15754
15356
  inplace);
15755
15357
  }
15756
- if (src1->grad) {
15757
- // noop
15758
- }
15759
15358
  } break;
15760
15359
  case GGML_OP_ROPE_BACK:
15761
15360
  {
15762
15361
  if (src0->grad) {
15763
- assert(src1->type == GGML_TYPE_I32);
15764
- assert(ggml_nelements(src1) == 4);
15765
- const int n_past = ((int32_t *) src1->data)[0];
15766
- const int n_dims = ((int32_t *) src1->data)[1];
15767
- const int mode = ((int32_t *) src1->data)[2];
15768
- const int n_ctx = ((int32_t *) src1->data)[3];
15362
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15363
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
15364
+ const int mode = ((int32_t *) tensor->op_params)[2];
15365
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
15769
15366
  src0->grad = ggml_add_impl(ctx,
15770
15367
  src0->grad,
15771
15368
  ggml_rope(ctx,
@@ -15776,9 +15373,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15776
15373
  n_ctx),
15777
15374
  inplace);
15778
15375
  }
15779
- if (src1->grad) {
15780
- // noop
15781
- }
15782
15376
  } break;
15783
15377
  case GGML_OP_ALIBI:
15784
15378
  {
@@ -15808,7 +15402,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15808
15402
  {
15809
15403
  struct ggml_tensor * flash_grad = NULL;
15810
15404
  if (src0->grad || src1->grad || tensor->src[2]->grad) {
15811
- int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
15405
+ int32_t t = ggml_get_op_params_i32(tensor, 0);
15812
15406
  GGML_ASSERT(t == 0 || t == 1);
15813
15407
  bool masked = t != 0;
15814
15408
  flash_grad =
@@ -15971,6 +15565,80 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15971
15565
  } break;
15972
15566
  case GGML_OP_WIN_PART:
15973
15567
  case GGML_OP_WIN_UNPART:
15568
+ case GGML_OP_UNARY:
15569
+ {
15570
+ switch (ggml_get_unary_op(tensor)) {
15571
+ case GGML_UNARY_OP_ABS:
15572
+ {
15573
+ if (src0->grad) {
15574
+ src0->grad =
15575
+ ggml_add_impl(ctx,
15576
+ src0->grad,
15577
+ ggml_mul(ctx,
15578
+ ggml_sgn(ctx, src0),
15579
+ tensor->grad),
15580
+ inplace);
15581
+ }
15582
+ } break;
15583
+ case GGML_UNARY_OP_SGN:
15584
+ {
15585
+ if (src0->grad) {
15586
+ // noop
15587
+ }
15588
+ } break;
15589
+ case GGML_UNARY_OP_NEG:
15590
+ {
15591
+ if (src0->grad) {
15592
+ src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace);
15593
+ }
15594
+ } break;
15595
+ case GGML_UNARY_OP_STEP:
15596
+ {
15597
+ if (src0->grad) {
15598
+ // noop
15599
+ }
15600
+ } break;
15601
+ case GGML_UNARY_OP_TANH:
15602
+ {
15603
+ GGML_ASSERT(false); // TODO: not implemented
15604
+ } break;
15605
+ case GGML_UNARY_OP_ELU:
15606
+ {
15607
+ GGML_ASSERT(false); // TODO: not implemented
15608
+ } break;
15609
+ case GGML_UNARY_OP_RELU:
15610
+ {
15611
+ if (src0->grad) {
15612
+ src0->grad = ggml_add_impl(ctx,
15613
+ src0->grad,
15614
+ ggml_mul(ctx,
15615
+ ggml_step(ctx, src0),
15616
+ tensor->grad),
15617
+ inplace);
15618
+ }
15619
+ } break;
15620
+ case GGML_UNARY_OP_GELU:
15621
+ {
15622
+ GGML_ASSERT(false); // TODO: not implemented
15623
+ } break;
15624
+ case GGML_UNARY_OP_GELU_QUICK:
15625
+ {
15626
+ GGML_ASSERT(false); // TODO: not implemented
15627
+ } break;
15628
+ case GGML_UNARY_OP_SILU:
15629
+ {
15630
+ // necessary for llama
15631
+ if (src0->grad) {
15632
+ src0->grad = ggml_add_impl(ctx,
15633
+ src0->grad,
15634
+ ggml_silu_back(ctx, src0, tensor->grad),
15635
+ inplace);
15636
+ }
15637
+ } break;
15638
+ default:
15639
+ GGML_ASSERT(false);
15640
+ }
15641
+ } break;
15974
15642
  case GGML_OP_MAP_UNARY:
15975
15643
  case GGML_OP_MAP_BINARY:
15976
15644
  case GGML_OP_MAP_CUSTOM1:
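With this hunk the former per-op cases (ABS, SGN, ..., SILU) collapse into a single GGML_OP_UNARY whose concrete flavor is recovered with ggml_get_unary_op() and dispatched in a nested switch, as above. A minimal sketch of what that means for a graph node; the ggml_unary() builder is assumed from this refactor, and ctx/x are assumed to exist:

    #include <assert.h>
    #include "ggml.h"

    /* sketch: ReLU is now a GGML_OP_UNARY node tagged with GGML_UNARY_OP_RELU */
    static void relu_node_sketch(struct ggml_context * ctx, struct ggml_tensor * x) {
        struct ggml_tensor * y = ggml_unary(ctx, x, GGML_UNARY_OP_RELU);  /* builder assumed */

        assert(y->op == GGML_OP_UNARY);
        assert(ggml_get_unary_op(y) == GGML_UNARY_OP_RELU);
    }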
@@ -16006,6 +15674,34 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
16006
15674
  }
16007
15675
  }
16008
15676
 
15677
+ static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small");
15678
+
15679
+ static size_t hash(void * p) {
15680
+ return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
15681
+ }
15682
+
15683
+ static bool hash_insert(void * hash_table[], void * p) {
15684
+ size_t h = hash(p);
15685
+
15686
+ // linear probing
15687
+ size_t i = h;
15688
+ while (hash_table[i] != NULL && hash_table[i] != p) {
15689
+ i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
15690
+ if (i == h) {
15691
+ // hash table is full
15692
+ GGML_ASSERT(false);
15693
+ }
15694
+ }
15695
+
15696
+ if (hash_table[i] == p) {
15697
+ return true;
15698
+ }
15699
+
15700
+ // insert
15701
+ hash_table[i] = p;
15702
+ return false;
15703
+ }
15704
+
16009
15705
  static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
16010
15706
  if (node->grad == NULL) {
16011
15707
  // this usually happens when we generate intermediate nodes from constants in the backward pass
@@ -16016,16 +15712,8 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
16016
15712
  }
16017
15713
 
16018
15714
  // check if already visited
16019
- for (int i = 0; i < cgraph->n_nodes; i++) {
16020
- if (cgraph->nodes[i] == node) {
16021
- return;
16022
- }
16023
- }
16024
-
16025
- for (int i = 0; i < cgraph->n_leafs; i++) {
16026
- if (cgraph->leafs[i] == node) {
16027
- return;
16028
- }
15715
+ if (hash_insert(cgraph->visited_hash_table, node)) {
15716
+ return;
16029
15717
  }
16030
15718
 
16031
15719
  for (int i = 0; i < GGML_MAX_SRC; ++i) {
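The visited check above is the payoff of the hash()/hash_insert() helpers added earlier: instead of scanning cgraph->nodes and cgraph->leafs linearly for every candidate, ggml_visit_parents() probes the graph's visited_hash_table once. A small sketch of the insert-or-detect semantics, assuming it lives inside ggml.c since hash_insert is static there:

    #include <assert.h>

    /* sketch: hash_insert() returns true only when the pointer was already in the table */
    static void * table[GGML_GRAPH_HASHTABLE_SIZE];   /* zero-initialized, like cgraph->visited_hash_table */

    static void visited_check_sketch(void) {
        int a = 0, b = 0;
        assert(hash_insert(table, &a) == false);   /* first visit: inserted          */
        assert(hash_insert(table, &a) == true);    /* revisit: detected in one probe */
        assert(hash_insert(table, &b) == false);   /* different node: inserted       */
    }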
@@ -16088,6 +15776,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
16088
15776
  /*.nodes =*/ { NULL },
16089
15777
  /*.grads =*/ { NULL },
16090
15778
  /*.leafs =*/ { NULL },
15779
+ /*.hash_table =*/ { NULL },
16091
15780
  /*.perf_runs =*/ 0,
16092
15781
  /*.perf_cycles =*/ 0,
16093
15782
  /*.perf_time_us =*/ 0,
@@ -16129,13 +15818,42 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
16129
15818
 
16130
15819
  if (node->is_param) {
16131
15820
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
16132
- ggml_build_forward_impl(&result, node->grad, true);
15821
+ ggml_build_forward_expand(&result, node->grad);
16133
15822
  }
16134
15823
  }
16135
15824
 
16136
15825
  return result;
16137
15826
  }
16138
15827
 
15828
+ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15829
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
15830
+ struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
15831
+
15832
+ *cgraph = (struct ggml_cgraph) {
15833
+ /*.n_nodes =*/ 0,
15834
+ /*.n_leafs =*/ 0,
15835
+ /*.nodes =*/ { NULL },
15836
+ /*.grads =*/ { NULL },
15837
+ /*.leafs =*/ { NULL },
15838
+ /*.hash_table =*/ { NULL },
15839
+ /*.perf_runs =*/ 0,
15840
+ /*.perf_cycles =*/ 0,
15841
+ /*.perf_time_us =*/ 0,
15842
+ };
15843
+
15844
+ return cgraph;
15845
+ }
15846
+
15847
+ struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
15848
+ struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
15849
+ ggml_build_forward_impl(cgraph, tensor, false);
15850
+ return cgraph;
15851
+ }
15852
+
15853
+ size_t ggml_graph_overhead(void) {
15854
+ return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
15855
+ }
15856
+
16139
15857
  //
16140
15858
  // thread data
16141
15859
  //
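The new ggml_new_graph()/ggml_build_forward_ctx() above allocate the ggml_cgraph as an object inside the ggml_context instead of as a large value on the caller's stack, and ggml_graph_overhead() reports how many extra context bytes that costs. A minimal usage sketch; the scratch size and the ggml_sqr example op are ours:

    #include "ggml.h"

    static struct ggml_cgraph * build_graph_sketch(void) {
        struct ggml_init_params ip = {
            /*.mem_size   =*/ 16*1024*1024 + ggml_graph_overhead(),  /* tensor scratch + graph object */
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        return ggml_build_forward_ctx(ctx, ggml_sqr(ctx, x));        /* graph lives inside ctx */
    }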
@@ -16201,7 +15919,7 @@ typedef pthread_t ggml_thread_t;
16201
15919
 
16202
15920
  // Android's libc implementation "bionic" does not support setting affinity
16203
15921
  #if defined(__linux__) && !defined(__BIONIC__)
16204
- void set_numa_thread_affinity(int thread_n, int n_threads) {
15922
+ static void set_numa_thread_affinity(int thread_n, int n_threads) {
16205
15923
  if (!ggml_is_numa()) {
16206
15924
  return;
16207
15925
  }
@@ -16226,7 +15944,7 @@ void set_numa_thread_affinity(int thread_n, int n_threads) {
16226
15944
  CPU_FREE(cpus);
16227
15945
  }
16228
15946
 
16229
- void clear_numa_thread_affinity(void) {
15947
+ static void clear_numa_thread_affinity(void) {
16230
15948
  if (!ggml_is_numa()) {
16231
15949
  return;
16232
15950
  }
@@ -16250,8 +15968,8 @@ void clear_numa_thread_affinity(void) {
16250
15968
  #else
16251
15969
  // TODO: Windows etc.
16252
15970
  // (the linux implementation may also work on BSD, someone should test)
16253
- void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
16254
- void clear_numa_thread_affinity(void) {}
15971
+ static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
15972
+ static void clear_numa_thread_affinity(void) {}
16255
15973
  #endif
16256
15974
 
16257
15975
  struct ggml_compute_state_shared {
@@ -16463,21 +16181,34 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16463
16181
  case GGML_OP_ARGMAX:
16464
16182
  case GGML_OP_REPEAT:
16465
16183
  case GGML_OP_REPEAT_BACK:
16466
- case GGML_OP_ABS:
16467
- case GGML_OP_SGN:
16468
- case GGML_OP_NEG:
16469
- case GGML_OP_STEP:
16470
- case GGML_OP_TANH:
16471
- case GGML_OP_ELU:
16472
- case GGML_OP_RELU:
16473
- {
16184
+ {
16474
16185
  n_tasks = 1;
16475
16186
  } break;
16476
- case GGML_OP_MUL:
16477
- case GGML_OP_GELU:
16478
- case GGML_OP_GELU_QUICK:
16479
- case GGML_OP_SILU:
16187
+
16188
+ case GGML_OP_UNARY:
16189
+ {
16190
+ switch (ggml_get_unary_op(node)) {
16191
+ case GGML_UNARY_OP_ABS:
16192
+ case GGML_UNARY_OP_SGN:
16193
+ case GGML_UNARY_OP_NEG:
16194
+ case GGML_UNARY_OP_STEP:
16195
+ case GGML_UNARY_OP_TANH:
16196
+ case GGML_UNARY_OP_ELU:
16197
+ case GGML_UNARY_OP_RELU:
16198
+ {
16199
+ n_tasks = 1;
16200
+ } break;
16201
+
16202
+ case GGML_UNARY_OP_GELU:
16203
+ case GGML_UNARY_OP_GELU_QUICK:
16204
+ case GGML_UNARY_OP_SILU:
16205
+ {
16206
+ n_tasks = n_threads;
16207
+ } break;
16208
+ }
16209
+ } break;
16480
16210
  case GGML_OP_SILU_BACK:
16211
+ case GGML_OP_MUL:
16481
16212
  case GGML_OP_NORM:
16482
16213
  case GGML_OP_RMS_NORM:
16483
16214
  case GGML_OP_RMS_NORM_BACK:
@@ -16542,10 +16273,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16542
16273
  case GGML_OP_GET_ROWS:
16543
16274
  case GGML_OP_GET_ROWS_BACK:
16544
16275
  case GGML_OP_DIAG:
16545
- case GGML_OP_DIAG_MASK_ZERO:
16546
16276
  {
16547
16277
  n_tasks = 1;
16548
16278
  } break;
16279
+ case GGML_OP_DIAG_MASK_ZERO:
16549
16280
  case GGML_OP_DIAG_MASK_INF:
16550
16281
  case GGML_OP_SOFT_MAX:
16551
16282
  case GGML_OP_SOFT_MAX_BACK:
@@ -16838,10 +16569,9 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
16838
16569
  void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
16839
16570
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
16840
16571
 
16841
- struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
16842
- GGML_ASSERT(buf);
16572
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
16843
16573
 
16844
- cplan.work_data = buf->data;
16574
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
16845
16575
 
16846
16576
  ggml_graph_compute(cgraph, &cplan);
16847
16577
  }
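Matching change in ggml_graph_compute_with_ctx() above: the scratch space for ggml_graph_compute() is now a plain GGML_OBJECT_WORK_BUFFER object carved out of the context rather than an I8 tensor. Callers that manage memory themselves keep using the plan API; a minimal sketch, assuming gf was built earlier:

    #include <stdint.h>
    #include <stdlib.h>
    #include "ggml.h"

    static void compute_sketch(struct ggml_cgraph * gf, int n_threads) {
        struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);

        uint8_t * work = NULL;
        if (cplan.work_size > 0) {
            work = (uint8_t *) malloc(cplan.work_size);   /* caller-owned work buffer */
            cplan.work_data = work;
        }

        ggml_graph_compute(gf, &cplan);
        free(work);
    }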
@@ -16992,7 +16722,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16992
16722
  fwrite(&nb, sizeof(uint64_t), 1, fout);
16993
16723
  }
16994
16724
 
16995
- fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16725
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16726
+ fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
16996
16727
 
16997
16728
  // dump the data
16998
16729
  // TODO: pad this to 32 byte boundary
@@ -17025,7 +16756,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
17025
16756
  fwrite(&nb, sizeof(uint64_t), 1, fout);
17026
16757
  }
17027
16758
 
17028
- fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16759
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16760
+ fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
17029
16761
 
17030
16762
  // output the op arguments
17031
16763
  {
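Both export hunks (leaf tensors above, op nodes here) now write the GGML_MAX_OP_PARAMS raw bytes of op_params immediately after the GGML_MAX_NAME name field, and the import code further down advances its read pointer by the same two constants. A sketch of how a reader walks that part of a record; the helper name is ours:

    #include "ggml.h"

    /* sketch: step over the name and the new op_params field of one exported tensor record */
    static const char * skip_name_and_params(const char * p,
                                             const char ** name, const char ** op_params) {
        *name      = p; p += GGML_MAX_NAME;        /* fixed-size, NUL-padded name            */
        *op_params = p; p += GGML_MAX_OP_PARAMS;   /* opaque per-op parameter bytes          */
        return p;                                  /* data or argument indices follow        */
    }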
@@ -17206,7 +16938,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17206
16938
 
17207
16939
  tensor->op = (enum ggml_op) op;
17208
16940
 
17209
- memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
16941
+ memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
16942
+ memcpy(tensor->op_params, ptr, GGML_MAX_OP_PARAMS); ptr += GGML_MAX_OP_PARAMS;
17210
16943
 
17211
16944
  tensor->data = (void *) ptr;
17212
16945
 
@@ -17251,7 +16984,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17251
16984
  nb[j] = nb_cur;
17252
16985
  }
17253
16986
 
17254
- const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
16987
+ const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
16988
+ const char * ptr_op_params = ptr; ptr += GGML_MAX_OP_PARAMS;
17255
16989
 
17256
16990
  const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t);
17257
16991
 
@@ -17288,8 +17022,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17288
17022
  {
17289
17023
  tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
17290
17024
 
17291
- uint64_t offs;
17292
- memcpy(&offs, args[2]->data, sizeof(offs));
17025
+ size_t offs;
17026
+ memcpy(&offs, ptr_op_params, sizeof(offs));
17293
17027
 
17294
17028
  tensor->data = ((char *) tensor->data) + offs;
17295
17029
  } break;
@@ -17309,7 +17043,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17309
17043
  } break;
17310
17044
  }
17311
17045
 
17312
- memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
17046
+ memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
17047
+ memcpy(tensor->op_params, ptr_op_params, GGML_MAX_OP_PARAMS);
17313
17048
 
17314
17049
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
17315
17050
  tensor->nb[j] = nb[j];
@@ -17343,7 +17078,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17343
17078
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
17344
17079
  i,
17345
17080
  node->ne[0], node->ne[1], node->ne[2],
17346
- GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
17081
+ ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
17347
17082
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
17348
17083
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
17349
17084
  (double) node->perf_time_us / 1000.0,
@@ -17357,7 +17092,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17357
17092
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
17358
17093
  i,
17359
17094
  node->ne[0], node->ne[1],
17360
- GGML_OP_NAME[node->op]);
17095
+ ggml_op_name(node->op));
17361
17096
  }
17362
17097
 
17363
17098
  for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -17365,7 +17100,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17365
17100
  continue;
17366
17101
  }
17367
17102
 
17368
- GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
17103
+ GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0);
17369
17104
  }
17370
17105
 
17371
17106
  GGML_PRINT("========================================\n");
@@ -17459,13 +17194,13 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17459
17194
  }
17460
17195
 
17461
17196
  if (node->n_dims == 2) {
17462
- fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], GGML_OP_SYMBOL[node->op]);
17197
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
17463
17198
  } else {
17464
- fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
17199
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
17465
17200
  }
17466
17201
 
17467
17202
  if (node->grad) {
17468
- fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
17203
+ fprintf(fp, " | <g>%s\"; ]\n", ggml_op_symbol(node->grad->op));
17469
17204
  } else {
17470
17205
  fprintf(fp, "\"; ]\n");
17471
17206
  }