llama_cpp 0.3.4 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3440,7 +3440,9 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
 
 //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
 inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
-#if defined(GGML_SIMD)
+#if defined(GGML_USE_ACCELERATE)
+    vDSP_vsmul(y, 1, &v, y, 1, n);
+#elif defined(GGML_SIMD)
     const int np = (n & ~(GGML_F32_STEP - 1));
 
     GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
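Context for the hunk above: `vDSP_vsmul` from Apple's Accelerate framework computes `C[i] = A[i] * scalar` over strided vectors, so the new `GGML_USE_ACCELERATE` branch is a drop-in replacement for the scalar loop shown in the commented-out reference line. A minimal sketch of the semantics in plain C (no Accelerate dependency assumed):

    #include <stdio.h>

    // Reference semantics of vDSP_vsmul(y, 1, &v, y, 1, n):
    // multiply n contiguous floats in place by the scalar v.
    static void vec_scale_f32_ref(const int n, float * y, const float v) {
        for (int i = 0; i < n; ++i) {
            y[i] *= v;
        }
    }

    int main(void) {
        float y[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        vec_scale_f32_ref(4, y, 0.5f);
        for (int i = 0; i < 4; ++i) {
            printf("%.2f ", y[i]);  // prints: 0.50 1.00 1.50 2.00
        }
        printf("\n");
        return 0;
    }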
@@ -3603,7 +3605,7 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #endif
 }
 
-inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x) {
+inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
     ggml_float sum = 0.0;
     for (int i = 0; i < n; ++i) {
         sum += (ggml_float)x[i];
@@ -3611,6 +3613,14 @@ inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x
     *s = sum;
 }
 
+inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
+    float sum = 0.0f;
+    for (int i = 0; i < n; ++i) {
+        sum += GGML_FP16_TO_FP32(x[i]);
+    }
+    *s = sum;
+}
+
 inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
     float max = -INFINITY;
@@ -3750,16 +3760,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "ARGMAX",
     "REPEAT",
     "REPEAT_BACK",
-    "ABS",
-    "SGN",
-    "NEG",
-    "STEP",
-    "TANH",
-    "ELU",
-    "RELU",
-    "GELU",
-    "GELU_QUICK",
-    "SILU",
     "SILU_BACK",
     "NORM",
     "RMS_NORM",
@@ -3798,6 +3798,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "WIN_PART",
     "WIN_UNPART",
 
+    "UNARY",
+
     "MAP_UNARY",
     "MAP_BINARY",
 
@@ -3809,7 +3811,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3830,16 +3832,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "argmax(x)",
     "repeat(x)",
     "repeat_back(x)",
-    "abs(x)",
-    "sgn(x)",
-    "-x",
-    "step(x)",
-    "tanh(x)",
-    "elu(x)",
-    "relu(x)",
-    "gelu(x)",
-    "gelu_quick(x)",
-    "silu(x)",
     "silu_back(x)",
     "norm(x)",
     "rms_norm(x)",
@@ -3878,6 +3870,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "win_part(x)",
     "win_unpart(x)",
 
+    "unary(x)",
+
     "f(x)",
     "f(x,y)",
 
@@ -3889,7 +3883,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -4077,8 +4071,8 @@ bool ggml_is_numa(void) {
 ////////////////////////////////////////////////////////////////////////////////
 
 void ggml_print_object(const struct ggml_object * obj) {
-    GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n",
-            obj->offs, obj->size, (const void *) obj->next);
+    GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
+            obj->type, obj->offs, obj->size, (const void *) obj->next);
 }
 
 void ggml_print_objects(const struct ggml_context * ctx) {
@@ -4145,6 +4139,10 @@ const char * ggml_op_name(enum ggml_op op) {
     return GGML_OP_NAME[op];
 }
 
+const char * ggml_op_symbol(enum ggml_op op) {
+    return GGML_OP_SYMBOL[op];
+}
+
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return GGML_TYPE_SIZE[tensor->type];
 }
@@ -4214,7 +4212,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
 }
 
 size_t ggml_tensor_overhead(void) {
-    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
 }
 
 bool ggml_is_transposed(const struct ggml_tensor * tensor) {
@@ -4231,6 +4229,15 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
+static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
+        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
 bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
@@ -4376,7 +4383,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         return NULL;
     }
 
-    const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1);
+    const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
 
     *ctx = (struct ggml_context) {
         /*.mem_size =*/ mem_size,
@@ -4443,6 +4450,10 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }
 
+bool ggml_get_no_alloc(struct ggml_context * ctx) {
+    return ctx->no_alloc;
+}
+
 void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
     ctx->no_alloc = no_alloc;
 }
@@ -4461,12 +4472,14 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
     struct ggml_object * obj = ctx->objects_begin;
 
     while (obj != NULL) {
-        struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
+        if (obj->type == GGML_OBJECT_TENSOR) {
+            struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
 
-        const size_t size = ggml_nbytes(tensor);
+            const size_t size = ggml_nbytes(tensor);
 
-        if (max_size < size) {
-            max_size = size;
+            if (max_size < size) {
+                max_size = size;
+            }
         }
 
         obj = obj->next;
@@ -4480,7 +4493,7 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
 // this is an error prone process, but it is necessary to support inplace
 // operators when using scratch buffers
 // TODO: implement a better way
-void ggml_scratch_save(struct ggml_context * ctx) {
+static void ggml_scratch_save(struct ggml_context * ctx) {
     // this is needed to allow opt tensors to store their data
     // TODO: again, need to find a better way
     ctx->no_alloc_save = ctx->no_alloc;
@@ -4490,7 +4503,7 @@ void ggml_scratch_save(struct ggml_context * ctx) {
     ctx->scratch.data = NULL;
 }
 
-void ggml_scratch_load(struct ggml_context * ctx) {
+static void ggml_scratch_load(struct ggml_context * ctx) {
    ctx->no_alloc = ctx->no_alloc_save;
 
    ctx->scratch = ctx->scratch_save;
@@ -4498,12 +4511,7 @@ void ggml_scratch_load(struct ggml_context * ctx) {
 
 ////////////////////////////////////////////////////////////////////////////////
 
-struct ggml_tensor * ggml_new_tensor_impl(
-        struct ggml_context * ctx,
-        enum ggml_type type,
-        int n_dims,
-        const int64_t* ne,
-        void* data) {
+static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
     // always insert objects at the end of the context's memory pool
     struct ggml_object * obj_cur = ctx->objects_end;
 
@@ -4511,77 +4519,79 @@ struct ggml_tensor * ggml_new_tensor_impl(
     const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
     const size_t cur_end = cur_offs + cur_size;
 
-    size_t size_needed = 0;
-
-    if (data == NULL && !ctx->no_alloc) {
-        size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
-        for (int i = 1; i < n_dims; i++) {
-            size_needed *= ne[i];
-        }
-        // align to GGML_MEM_ALIGN
-        size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN;
-    }
+    // align to GGML_MEM_ALIGN
+    size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
 
     char * const mem_buffer = ctx->mem_buffer;
     struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
 
-    if (ctx->scratch.data == NULL || data != NULL) {
-        size_needed += GGML_TENSOR_SIZE;
+    if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
+        GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
+                __func__, cur_end + size_needed, ctx->mem_size);
+        assert(false);
+        return NULL;
+    }
 
-        if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
-            GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                    __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
-            assert(false);
-            return NULL;
-        }
+    *obj_new = (struct ggml_object) {
+        .offs = cur_end + GGML_OBJECT_SIZE,
+        .size = size_needed,
+        .next = NULL,
+        .type = type,
+    };
 
-        *obj_new = (struct ggml_object) {
-            .offs = cur_end + GGML_OBJECT_SIZE,
-            .size = size_needed,
-            .next = NULL,
-        };
+    ggml_assert_aligned(mem_buffer + obj_new->offs);
+
+    if (obj_cur != NULL) {
+        obj_cur->next = obj_new;
     } else {
-        if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
-            GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
-                    __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
-            assert(false);
-            return NULL;
+        // this is the first object in this context
+        ctx->objects_begin = obj_new;
+    }
+
+    ctx->objects_end = obj_new;
+
+    //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
+
+    return obj_new;
+}
+
+static struct ggml_tensor * ggml_new_tensor_impl(
+        struct ggml_context * ctx,
+        enum ggml_type type,
+        int n_dims,
+        const int64_t* ne,
+        void* data) {
+
+    size_t data_size = 0;
+
+    if (data == NULL && !ctx->no_alloc) {
+        data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
+        for (int i = 1; i < n_dims; i++) {
+            data_size *= ne[i];
         }
+    }
 
-        if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
-            GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                    __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
+    if (ctx->scratch.data != NULL && data == NULL) {
+        // allocate tensor data in the scratch buffer
+        if (ctx->scratch.offs + data_size > ctx->scratch.size) {
+            GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+                    __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
            assert(false);
            return NULL;
        }
 
        data = (char * const) ctx->scratch.data + ctx->scratch.offs;
 
-        *obj_new = (struct ggml_object) {
-            .offs = cur_end + GGML_OBJECT_SIZE,
-            .size = GGML_TENSOR_SIZE,
-            .next = NULL,
-        };
-
-        //printf("scratch offs = %zu, size_needed = %zu\n", ctx->scratch.offs, size_needed);
-
-        ctx->scratch.offs += size_needed;
-    }
+        ctx->scratch.offs += data_size;
 
-    if (obj_cur != NULL) {
-        obj_cur->next = obj_new;
-    } else {
-        // this is the first object in this context
-        ctx->objects_begin = obj_new;
+        data_size = 0;
    }
 
-    ctx->objects_end = obj_new;
-
-    //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
+    struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);
 
-    struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs);
+    // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
 
-    ggml_assert_aligned(result);
+    struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
 
     *result = (struct ggml_tensor) {
         /*.type =*/ type,
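A note on the allocator refactor above: object headers and payloads are now carved out by a single `ggml_new_object`, with sizes rounded up via `GGML_PAD`. The macro itself is outside this diff; assuming its usual round-up-to-multiple definition, a standalone sketch of the padding arithmetic:

    #include <assert.h>
    #include <stdio.h>

    // Assumed definition; GGML_PAD is not shown in this diff.
    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    int main(void) {
        // e.g. a 100-byte payload padded to a 16-byte boundary occupies 112 bytes
        size_t size_needed = GGML_PAD((size_t)100, 16);
        assert(size_needed == 112);
        printf("padded size = %zu\n", size_needed);
        return 0;
    }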
@@ -4590,6 +4600,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.ne =*/ { 1, 1, 1, 1 },
         /*.nb =*/ { 0, 0, 0, 0 },
         /*.op =*/ GGML_OP_NONE,
+        /*.op_params =*/ {0},
         /*.is_param =*/ false,
         /*.grad =*/ NULL,
         /*.src =*/ { NULL },
@@ -4620,6 +4631,21 @@ struct ggml_tensor * ggml_new_tensor_impl(
     return result;
 }
 
+static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+    assert(params_size <= GGML_MAX_OP_PARAMS);
+    memcpy(tensor->op_params, params, params_size);
+}
+
+static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    return ((const int32_t *)(tensor->op_params))[i];
+}
+
+static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    ((int32_t *)(tensor->op_params))[i] = value;
+}
+
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
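These three helpers carry the recurring pattern of this release: operator hyper-parameters that 0.3.4 stored in a small auxiliary `GGML_TYPE_I32` tensor (allocated under `ggml_scratch_save`/`ggml_scratch_load` and wired into `src[1]`/`src[2]`) now live inline in the node's fixed-size `op_params` buffer. A minimal sketch of the idea, using a stand-in constant since the value of `GGML_MAX_OP_PARAMS` is not shown in this diff:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define MAX_OP_PARAMS 32  // stand-in for GGML_MAX_OP_PARAMS

    struct toy_tensor {
        char op_params[MAX_OP_PARAMS];  // inline storage, replaces the aux i32 tensor
    };

    static void set_op_params(struct toy_tensor * t, const void * params, size_t params_size) {
        assert(params_size <= MAX_OP_PARAMS);
        memcpy(t->op_params, params, params_size);
    }

    int main(void) {
        struct toy_tensor t = {0};
        // e.g. what ggml_acc_impl now does: pack { nb1, nb2, nb3, offset, inplace }
        int32_t params[] = { 16, 64, 256, 0, 1 };
        set_op_params(&t, params, sizeof(params));
        printf("nb1 = %d\n", ((int32_t *)t.op_params)[0]);  // prints: nb1 = 16
        return 0;
    }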
@@ -4951,6 +4977,11 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
     return (float *)(tensor->data);
 }
 
+enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor->op == GGML_OP_UNARY);
+    return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
+}
+
 const char * ggml_get_name(const struct ggml_tensor * tensor) {
     return tensor->name;
 }
@@ -4989,9 +5020,11 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
     char * const mem_buffer = ctx->mem_buffer;
 
     while (obj != NULL) {
-        struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
-        if (strcmp(cur->name, name) == 0) {
-            return cur;
+        if (obj->type == GGML_OBJECT_TENSOR) {
+            struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
+            if (strcmp(cur->name, name) == 0) {
+                return cur;
+            }
         }
 
         obj = obj->next;
@@ -5004,7 +5037,7 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
 
 // ggml_dup
 
-struct ggml_tensor * ggml_dup_impl(
+static struct ggml_tensor * ggml_dup_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         bool inplace) {
@@ -5019,7 +5052,6 @@ struct ggml_tensor * ggml_dup_impl(
     result->op = GGML_OP_DUP;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
 
     return result;
 }
@@ -5038,7 +5070,7 @@ struct ggml_tensor * ggml_dup_inplace(
 
 // ggml_add
 
-struct ggml_tensor * ggml_add_impl(
+static struct ggml_tensor * ggml_add_impl(
         struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b,
@@ -5081,7 +5113,7 @@ struct ggml_tensor * ggml_add_inplace(
 
 // ggml_add1
 
-struct ggml_tensor * ggml_add1_impl(
+static struct ggml_tensor * ggml_add1_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b,
@@ -5121,7 +5153,7 @@ struct ggml_tensor * ggml_add1_inplace(
 
 // ggml_acc
 
-struct ggml_tensor * ggml_acc_impl(
+static struct ggml_tensor * ggml_acc_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b,
@@ -5143,23 +5175,13 @@ struct ggml_tensor * ggml_acc_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    ggml_scratch_save(ctx);
-
-    struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5);
-
-    ((int32_t *) c->data)[0] = nb1;
-    ((int32_t *) c->data)[1] = nb2;
-    ((int32_t *) c->data)[2] = nb3;
-    ((int32_t *) c->data)[3] = offset;
-    ((int32_t *) c->data)[4] = inplace ? 1 : 0;
-
-    ggml_scratch_load(ctx);
+    int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ACC;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
-    result->src[2] = c;
 
     return result;
 }
@@ -5188,7 +5210,7 @@ struct ggml_tensor * ggml_acc_inplace(
 
 // ggml_sub
 
-struct ggml_tensor * ggml_sub_impl(
+static struct ggml_tensor * ggml_sub_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b,
@@ -5227,7 +5249,7 @@ struct ggml_tensor * ggml_sub_inplace(
 
 // ggml_mul
 
-struct ggml_tensor * ggml_mul_impl(
+static struct ggml_tensor * ggml_mul_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b,
@@ -5274,7 +5296,7 @@ struct ggml_tensor * ggml_mul_inplace(
 
 // ggml_div
 
-struct ggml_tensor * ggml_div_impl(
+static struct ggml_tensor * ggml_div_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b,
@@ -5317,7 +5339,7 @@ struct ggml_tensor * ggml_div_inplace(
 
 // ggml_sqr
 
-struct ggml_tensor * ggml_sqr_impl(
+static struct ggml_tensor * ggml_sqr_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        bool inplace) {
@@ -5332,7 +5354,6 @@ struct ggml_tensor * ggml_sqr_impl(
     result->op = GGML_OP_SQR;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
 
     return result;
 }
@@ -5351,7 +5372,7 @@
 
 // ggml_sqrt
 
-struct ggml_tensor * ggml_sqrt_impl(
+static struct ggml_tensor * ggml_sqrt_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        bool inplace) {
@@ -5366,7 +5387,6 @@ struct ggml_tensor * ggml_sqrt_impl(
     result->op = GGML_OP_SQRT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
 
     return result;
 }
@@ -5386,7 +5406,7 @@
 
 // ggml_log
 
-struct ggml_tensor * ggml_log_impl(
+static struct ggml_tensor * ggml_log_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        bool inplace) {
@@ -5401,7 +5421,6 @@ struct ggml_tensor * ggml_log_impl(
     result->op = GGML_OP_LOG;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
 
     return result;
 }
@@ -5434,7 +5453,6 @@ struct ggml_tensor * ggml_sum(
     result->op = GGML_OP_SUM;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
 
     return result;
 }
@@ -5461,7 +5479,6 @@ struct ggml_tensor * ggml_sum_rows(
     result->op = GGML_OP_SUM_ROWS;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
 
     return result;
 }
@@ -5484,7 +5501,6 @@ struct ggml_tensor * ggml_mean(
     result->op = GGML_OP_MEAN;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
 
     return result;
 }
@@ -5508,7 +5524,6 @@ struct ggml_tensor * ggml_argmax(
     result->op = GGML_OP_ARGMAX;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
 
     return result;
 }
@@ -5571,343 +5586,142 @@ struct ggml_tensor * ggml_repeat_back(
 
 // ggml_abs
 
-struct ggml_tensor * ggml_abs_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op = GGML_OP_ABS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = NULL;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_abs(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_abs_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_ABS);
 }
 
 struct ggml_tensor * ggml_abs_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_abs_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ABS);
 }
 
-
 // ggml_sgn
 
-struct ggml_tensor * ggml_sgn_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op = GGML_OP_SGN;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = NULL;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_sgn(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_sgn_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_SGN);
 }
 
 struct ggml_tensor * ggml_sgn_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_sgn_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SGN);
 }
 
 // ggml_neg
 
-struct ggml_tensor * ggml_neg_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op = GGML_OP_NEG;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = NULL;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_neg(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_neg_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_NEG);
 }
 
 struct ggml_tensor * ggml_neg_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_neg_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_NEG);
 }
 
 // ggml_step
 
-struct ggml_tensor * ggml_step_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op = GGML_OP_STEP;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = NULL;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_step(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_step_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_STEP);
 }
 
 struct ggml_tensor * ggml_step_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_step_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_STEP);
 }
 
 // ggml_tanh
 
-struct ggml_tensor * ggml_tanh_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op = GGML_OP_TANH;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = NULL;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_tanh(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_tanh_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_TANH);
 }
 
 struct ggml_tensor * ggml_tanh_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_tanh_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TANH);
 }
 
 // ggml_elu
 
-struct ggml_tensor * ggml_elu_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op = GGML_OP_ELU;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = NULL;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_elu(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_elu_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_ELU);
 }
 
 struct ggml_tensor * ggml_elu_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_elu_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ELU);
 }
 
 // ggml_relu
 
-struct ggml_tensor * ggml_relu_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op = GGML_OP_RELU;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = NULL;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_relu(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_relu_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_RELU);
 }
 
 struct ggml_tensor * ggml_relu_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_relu_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
 }
 
 // ggml_gelu
 
-struct ggml_tensor * ggml_gelu_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op = GGML_OP_GELU;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = NULL;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_gelu(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_gelu_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_GELU);
 }
 
 struct ggml_tensor * ggml_gelu_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_gelu_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
 }
 
 // ggml_gelu_quick
 
-struct ggml_tensor * ggml_gelu_quick_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op = GGML_OP_GELU_QUICK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = NULL;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_gelu_quick(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_gelu_quick_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_QUICK);
 }
 
 struct ggml_tensor * ggml_gelu_quick_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_gelu_quick_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_QUICK);
 }
 
 // ggml_silu
 
-struct ggml_tensor * ggml_silu_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op = GGML_OP_SILU;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = NULL;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_silu(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_silu_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_SILU);
 }
 
 struct ggml_tensor * ggml_silu_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_silu_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SILU);
 }
 
 // ggml_silu_back
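The hunk above collapses ten near-identical `*_impl` functions into dispatch through a single `GGML_OP_UNARY` node whose variant is stored in `op_params[0]`; the public wrappers keep their old signatures. A sketch of the equivalence, assuming `ggml_unary` and `ggml_get_unary_op` are exposed in this release's `ggml.h` (as the wrappers above suggest; the tensor `x` is illustrative):

    #include <assert.h>
    #include "ggml.h"

    // x is an illustrative F32 tensor created elsewhere in the same context.
    static void demo_unary(struct ggml_context * ctx, struct ggml_tensor * x) {
        struct ggml_tensor * a = ggml_relu(ctx, x);                      // wrapper, unchanged API
        struct ggml_tensor * b = ggml_unary(ctx, x, GGML_UNARY_OP_RELU); // what it expands to now

        assert(a->op == GGML_OP_UNARY && b->op == GGML_OP_UNARY);
        assert(ggml_get_unary_op(a) == GGML_UNARY_OP_RELU);
    }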
@@ -5935,7 +5749,7 @@ struct ggml_tensor * ggml_silu_back(
 
 // ggml_norm
 
-struct ggml_tensor * ggml_norm_impl(
+static struct ggml_tensor * ggml_norm_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        bool inplace) {
@@ -5948,10 +5762,11 @@ struct ggml_tensor * ggml_norm_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    // TODO: maybe store epsilon here?
+
     result->op = GGML_OP_NORM;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL; // TODO: maybe store epsilon here?
 
     return result;
 }
@@ -5968,9 +5783,10 @@
     return ggml_norm_impl(ctx, a, true);
 }
 
-struct ggml_tensor * ggml_rms_norm_impl(
+static struct ggml_tensor * ggml_rms_norm_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
+        float eps,
        bool inplace) {
    bool is_node = false;
 
@@ -5980,24 +5796,27 @@ struct ggml_tensor * ggml_rms_norm_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    ggml_set_op_params(result, &eps, sizeof(eps));
+
     result->op = GGML_OP_RMS_NORM;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL; // TODO: maybe store epsilon here?
 
     return result;
 }
 
 struct ggml_tensor * ggml_rms_norm(
        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_rms_norm_impl(ctx, a, false);
+        struct ggml_tensor * a,
+        float eps) {
+    return ggml_rms_norm_impl(ctx, a, eps, false);
 }
 
 struct ggml_tensor * ggml_rms_norm_inplace(
        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_rms_norm_impl(ctx, a, true);
+        struct ggml_tensor * a,
+        float eps) {
+    return ggml_rms_norm_impl(ctx, a, eps, true);
 }
 
 struct ggml_tensor * ggml_rms_norm_back(
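Note the API break above: `ggml_rms_norm` and `ggml_rms_norm_inplace` now take the epsilon explicitly, and it is stored in `op_params` rather than deferred to the compute kernel. A sketch of the required call-site update (hypothetical call site; `1e-6f` is an illustrative value, not a recommendation from this diff):

    #include "ggml.h"

    // Assumes ggml.h from 0.3.5; x is an illustrative tensor.
    static struct ggml_tensor * make_rms_norm(struct ggml_context * ctx, struct ggml_tensor * x) {
        // 0.3.4: ggml_rms_norm(ctx, x);
        // 0.3.5: the epsilon is an explicit argument, copied into result->op_params
        return ggml_rms_norm(ctx, x, 1e-6f);
    }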
@@ -6076,7 +5895,7 @@ struct ggml_tensor * ggml_out_prod(
 
 // ggml_scale
 
-struct ggml_tensor * ggml_scale_impl(
+static struct ggml_tensor * ggml_scale_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b,
@@ -6116,7 +5935,7 @@ struct ggml_tensor * ggml_scale_inplace(
 
 // ggml_set
 
-struct ggml_tensor * ggml_set_impl(
+static struct ggml_tensor * ggml_set_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b,
@@ -6136,23 +5955,13 @@ struct ggml_tensor * ggml_set_impl(
     // make a view of the destination
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    ggml_scratch_save(ctx);
-
-    struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5);
-
-    (( int32_t * ) c->data)[0] = nb1;
-    (( int32_t * ) c->data)[1] = nb2;
-    (( int32_t * ) c->data)[2] = nb3;
-    (( int32_t * ) c->data)[3] = offset;
-    (( int32_t * ) c->data)[4] = inplace ? 1 : 0;
-
-    ggml_scratch_load(ctx);
+    int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_SET;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
-    result->src[2] = c;
 
     return result;
 }
@@ -6216,7 +6025,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
 
 // ggml_cpy
 
-struct ggml_tensor * ggml_cpy_impl(
+static struct ggml_tensor * ggml_cpy_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b,
@@ -6261,7 +6070,7 @@ struct ggml_tensor * ggml_cpy_inplace(
 
 // ggml_cont
 
-struct ggml_tensor * ggml_cont_impl(
+static struct ggml_tensor * ggml_cont_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        bool inplace) {
@@ -6277,7 +6086,6 @@ struct ggml_tensor * ggml_cont_impl(
     result->op = GGML_OP_CONT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
 
     return result;
 }
@@ -6321,7 +6129,6 @@ struct ggml_tensor * ggml_reshape(
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
 
     return result;
 }
@@ -6346,7 +6153,6 @@ struct ggml_tensor * ggml_reshape_1d(
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
 
     return result;
 }
@@ -6372,7 +6178,6 @@ struct ggml_tensor * ggml_reshape_2d(
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
 
     return result;
 }
@@ -6399,7 +6204,6 @@ struct ggml_tensor * ggml_reshape_3d(
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
 
     return result;
 }
@@ -6428,7 +6232,6 @@ struct ggml_tensor * ggml_reshape_4d(
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
 
     return result;
 }
@@ -6450,19 +6253,11 @@ struct ggml_tensor * ggml_view_1d(
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
     ggml_format_name(result, "%s (view)", a->name);
 
-    ggml_scratch_save(ctx);
-
-    struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
-    ggml_set_name(offs, "offset");
-    memcpy(offs->data, &offset, 2*sizeof(int32_t));
-
-    ggml_scratch_load(ctx);
+    ggml_set_op_params(result, &offset, sizeof(offset));
 
     result->op = GGML_OP_VIEW;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
-    result->src[2] = offs;
 
     return result;
 }
@@ -6488,13 +6283,7 @@ struct ggml_tensor * ggml_view_2d(
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
     ggml_format_name(result, "%s (view)", a->name);
 
-    ggml_scratch_save(ctx);
-
-    struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
-    ggml_set_name(offs, "offset");
-    memcpy(offs->data, &offset, 2*sizeof(int32_t));
-
-    ggml_scratch_load(ctx);
+    ggml_set_op_params(result, &offset, sizeof(offset));
 
     result->nb[1] = nb1;
     result->nb[2] = result->nb[1]*ne1;
@@ -6503,8 +6292,6 @@ struct ggml_tensor * ggml_view_2d(
     result->op = GGML_OP_VIEW;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
-    result->src[2] = offs;
 
     return result;
 }
@@ -6532,13 +6319,7 @@ struct ggml_tensor * ggml_view_3d(
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
     ggml_format_name(result, "%s (view)", a->name);
 
-    ggml_scratch_save(ctx);
-
-    struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
-    ggml_set_name(offs, "offset");
-    memcpy(offs->data, &offset, 2*sizeof(int32_t));
-
-    ggml_scratch_load(ctx);
+    ggml_set_op_params(result, &offset, sizeof(offset));
 
     result->nb[1] = nb1;
     result->nb[2] = nb2;
@@ -6547,8 +6328,6 @@ struct ggml_tensor * ggml_view_3d(
     result->op = GGML_OP_VIEW;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
-    result->src[2] = offs;
 
     return result;
 }
@@ -6578,13 +6357,7 @@ struct ggml_tensor * ggml_view_4d(
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
     ggml_format_name(result, "%s (view)", a->name);
 
-    ggml_scratch_save(ctx);
-
-    struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
-    ggml_set_name(offs, "offset");
-    memcpy(offs->data, &offset, 2*sizeof(int32_t));
-
-    ggml_scratch_load(ctx);
+    ggml_set_op_params(result, &offset, sizeof(offset));
 
     result->nb[1] = nb1;
     result->nb[2] = nb2;
@@ -6593,8 +6366,6 @@ struct ggml_tensor * ggml_view_4d(
     result->op = GGML_OP_VIEW;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
-    result->src[2] = offs;
 
     return result;
 }
@@ -6655,22 +6426,9 @@ struct ggml_tensor * ggml_permute(
     result->op = GGML_OP_PERMUTE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
-
-    if (is_node) {
-        ggml_scratch_save(ctx);
 
-        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
-
-        ((int32_t *) b->data)[0] = axis0;
-        ((int32_t *) b->data)[1] = axis1;
-        ((int32_t *) b->data)[2] = axis2;
-        ((int32_t *) b->data)[3] = axis3;
-
-        ggml_scratch_load(ctx);
-
-        result->src[2] = b;
-    }
+    int32_t params[] = { axis0, axis1, axis2, axis3 };
+    ggml_set_op_params(result, &params, sizeof(params));
 
     return result;
 }
@@ -6698,7 +6456,6 @@ struct ggml_tensor * ggml_transpose(
     result->op = GGML_OP_TRANSPOSE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
 
     return result;
 }
@@ -6776,7 +6533,6 @@ struct ggml_tensor * ggml_diag(
     result->op = GGML_OP_DIAG;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
 
     return result;
 }
@@ -6784,7 +6540,7 @@ struct ggml_tensor * ggml_diag(
 
 // ggml_diag_mask_inf
 
-struct ggml_tensor * ggml_diag_mask_inf_impl(
+static struct ggml_tensor * ggml_diag_mask_inf_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int n_past,
@@ -6797,19 +6553,12 @@ struct ggml_tensor * ggml_diag_mask_inf_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    ggml_scratch_save(ctx);
-
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
-
-    ((int32_t *) b->data)[0] = n_past;
-    ((int32_t *) b->data)[1] = inplace ? 1 : 0;
-
-    ggml_scratch_load(ctx);
+    int32_t params[] = { n_past, inplace ? 1 : 0 };
+    ggml_set_op_params(result, &params, sizeof(params));
 
     result->op = GGML_OP_DIAG_MASK_INF;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = b;
 
     return result;
 }
@@ -6831,7 +6580,7 @@ struct ggml_tensor * ggml_diag_mask_inf_inplace(
 
 // ggml_diag_mask_zero
 
-struct ggml_tensor * ggml_diag_mask_zero_impl(
+static struct ggml_tensor * ggml_diag_mask_zero_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int n_past,
@@ -6844,20 +6593,12 @@ struct ggml_tensor * ggml_diag_mask_zero_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    ggml_scratch_save(ctx);
-
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
-    ggml_set_name(b, "n_past, inplace");
-
-    ((int32_t *) b->data)[0] = n_past;
-    ((int32_t *) b->data)[1] = inplace ? 1 : 0;
-
-    ggml_scratch_load(ctx);
+    int32_t params[] = { n_past, inplace ? 1 : 0 };
+    ggml_set_op_params(result, &params, sizeof(params));
 
     result->op = GGML_OP_DIAG_MASK_ZERO;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = b;
 
     return result;
 }
@@ -6878,7 +6619,7 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
 
 // ggml_soft_max
 
-struct ggml_tensor * ggml_soft_max_impl(
+static struct ggml_tensor * ggml_soft_max_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        bool inplace) {
@@ -6893,7 +6634,6 @@ struct ggml_tensor * ggml_soft_max_impl(
     result->op = GGML_OP_SOFT_MAX;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = NULL;
 
     return result;
 }
@@ -6913,7 +6653,7 @@ struct ggml_tensor * ggml_soft_max_inplace(
 
 // ggml_soft_max_back
 
-struct ggml_tensor * ggml_soft_max_back_impl(
+static struct ggml_tensor * ggml_soft_max_back_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b,
@@ -6950,7 +6690,7 @@ struct ggml_tensor * ggml_soft_max_back_inplace(
 
 // ggml_rope
 
-struct ggml_tensor * ggml_rope_impl(
+static struct ggml_tensor * ggml_rope_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int n_past,
@@ -6969,23 +6709,14 @@ struct ggml_tensor * ggml_rope_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    ggml_scratch_save(ctx);
-
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
-
-    ((int32_t *) b->data)[0] = n_past;
-    ((int32_t *) b->data)[1] = n_dims;
-    ((int32_t *) b->data)[2] = mode;
-    ((int32_t *) b->data)[3] = n_ctx;
-    memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float));
-    memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));
-
-    ggml_scratch_load(ctx);
+    int32_t params[6] = { n_past, n_dims, mode, n_ctx };
+    memcpy(params + 4, &freq_base, sizeof(float));
+    memcpy(params + 5, &freq_scale, sizeof(float));
+    ggml_set_op_params(result, &params, sizeof(params));
 
     result->op = GGML_OP_ROPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = b;
 
     return result;
 }
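The rope packing above is worth a closer look: it mixes integer and float parameters in one `int32_t` array by byte-copying the floats into the trailing slots, exactly as the old aux tensor did. A standalone sketch of the round trip (values are illustrative):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        // four ints followed by two floats, all carried in the same i32 array
        int32_t params[6] = { /*n_past=*/0, /*n_dims=*/128, /*mode=*/0, /*n_ctx=*/2048 };
        float freq_base = 10000.0f, freq_scale = 1.0f;
        memcpy(params + 4, &freq_base,  sizeof(float));
        memcpy(params + 5, &freq_scale, sizeof(float));

        float out;
        memcpy(&out, params + 4, sizeof(float));
        printf("freq_base round-trips as %.1f\n", out);  // prints: 10000.0
        return 0;
    }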
@@ -7042,22 +6773,12 @@ struct ggml_tensor * ggml_rope_back(
7042
6773
 
7043
6774
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
7044
6775
 
7045
- ggml_scratch_save(ctx);
7046
-
7047
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
7048
- ggml_set_name(b, "n_past, n_dims, mode");
7049
-
7050
- ((int32_t *) b->data)[0] = n_past;
7051
- ((int32_t *) b->data)[1] = n_dims;
7052
- ((int32_t *) b->data)[2] = mode;
7053
- ((int32_t *) b->data)[3] = n_ctx;
7054
-
7055
- ggml_scratch_load(ctx);
6776
+ int32_t params[] = { n_past, n_dims, mode, n_ctx };
6777
+ ggml_set_op_params(result, &params, sizeof(params));
7056
6778
 
7057
6779
  result->op = GGML_OP_ROPE_BACK;
7058
6780
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7059
6781
  result->src[0] = a;
7060
- result->src[1] = b;
7061
6782
 
7062
6783
  return result;
7063
6784
  }
@@ -7082,21 +6803,13 @@ struct ggml_tensor * ggml_alibi(
7082
6803
  //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7083
6804
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
7084
6805
 
7085
- ggml_scratch_save(ctx);
7086
-
7087
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7088
-
7089
- ((int32_t *) b->data)[0] = n_past;
7090
- ((int32_t *) b->data)[1] = n_head;
7091
- GGML_ASSERT(sizeof(float) == sizeof(int32_t));
7092
- (((float *) b->data)[2]) = bias_max;
7093
-
7094
- ggml_scratch_load(ctx);
6806
+ int32_t op_params[3] = { n_past, n_head };
6807
+ memcpy(op_params + 2, &bias_max, sizeof(float));
6808
+ ggml_set_op_params(result, &op_params, sizeof(op_params));
7095
6809
 
7096
6810
  result->op = GGML_OP_ALIBI;
7097
6811
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7098
6812
  result->src[0] = a;
7099
- result->src[1] = b;
7100
6813
 
7101
6814
  return result;
7102
6815
  }
@@ -7118,19 +6831,12 @@ struct ggml_tensor * ggml_clamp(
7118
6831
  // TODO: when implement backward, fix this:
7119
6832
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
7120
6833
 
7121
- ggml_scratch_save(ctx);
7122
-
7123
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);
7124
-
7125
- ((float *) b->data)[0] = min;
7126
- ((float *) b->data)[1] = max;
7127
-
7128
- ggml_scratch_load(ctx);
6834
+ float params[] = { min, max };
6835
+ ggml_set_op_params(result, &params, sizeof(params));
7129
6836
 
7130
6837
  result->op = GGML_OP_CLAMP;
7131
6838
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7132
6839
  result->src[0] = a;
7133
- result->src[1] = b;
7134
6840
 
7135
6841
  return result;
7136
6842
  }
@@ -7163,18 +6869,13 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
7163
6869
  };
7164
6870
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
7165
6871
 
7166
- ggml_scratch_save(ctx);
7167
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7168
- ((int32_t*)c->data)[0] = s0;
7169
- ((int32_t*)c->data)[1] = p0;
7170
- ((int32_t*)c->data)[2] = d0;
7171
- ggml_scratch_load(ctx);
6872
+ int32_t params[] = { s0, p0, d0 };
6873
+ ggml_set_op_params(result, &params, sizeof(params));
7172
6874
 
7173
6875
  result->op = GGML_OP_CONV_1D;
7174
6876
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7175
6877
  result->src[0] = a;
7176
6878
  result->src[1] = b;
7177
- result->src[2] = c;
7178
6879
 
7179
6880
  return result;
7180
6881
  }
@@ -7207,21 +6908,13 @@ struct ggml_tensor* ggml_conv_2d(
7207
6908
  };
7208
6909
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7209
6910
 
7210
- ggml_scratch_save(ctx);
7211
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
7212
- ((int32_t*)c->data)[0] = s0;
7213
- ((int32_t*)c->data)[1] = s1;
7214
- ((int32_t*)c->data)[2] = p0;
7215
- ((int32_t*)c->data)[3] = p1;
7216
- ((int32_t*)c->data)[4] = d0;
7217
- ((int32_t*)c->data)[5] = d1;
7218
- ggml_scratch_load(ctx);
6911
+ int32_t params[] = { s0, s1, p0, p1, d0, d1 };
6912
+ ggml_set_op_params(result, &params, sizeof(params));
7219
6913
 
7220
6914
  result->op = GGML_OP_CONV_2D;
7221
6915
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7222
6916
  result->src[0] = a;
7223
6917
  result->src[1] = b;
7224
- result->src[2] = c;
7225
6918
 
7226
6919
  return result;
7227
6920
 
@@ -7245,7 +6938,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
7245
6938
  return (ins + 2 * p - ks) / s + 1;
7246
6939
  }
7247
6940
 
7248
- // ggml_pool_2d
6941
+ // ggml_pool_1d
7249
6942
 
7250
6943
  struct ggml_tensor* ggml_pool_1d(
7251
6944
  struct ggml_context * ctx,
@@ -7268,18 +6961,12 @@ struct ggml_tensor* ggml_pool_1d(
7268
6961
  };
7269
6962
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
7270
6963
 
7271
- ggml_scratch_save(ctx);
7272
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
7273
- ((int32_t*)c->data)[0] = op;
7274
- ((int32_t*)c->data)[1] = k0;
7275
- ((int32_t*)c->data)[2] = s0;
7276
- ((int32_t*)c->data)[3] = p0;
7277
- ggml_scratch_load(ctx);
6964
+ int32_t params[] = { op, k0, s0, p0 };
6965
+ ggml_set_op_params(result, &params, sizeof(params));
7278
6966
 
7279
6967
  result->op = GGML_OP_POOL_1D;
7280
6968
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7281
6969
  result->src[0] = a;
7282
- result->src[1] = c;
7283
6970
 
7284
6971
  return result;
7285
6972
  }
@@ -7311,21 +6998,12 @@ struct ggml_tensor* ggml_pool_2d(
7311
6998
  };
7312
6999
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7313
7000
 
7314
- ggml_scratch_save(ctx);
7315
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7);
7316
- ((int32_t*)c->data)[0] = op;
7317
- ((int32_t*)c->data)[1] = k0;
7318
- ((int32_t*)c->data)[2] = k1;
7319
- ((int32_t*)c->data)[3] = s0;
7320
- ((int32_t*)c->data)[4] = s1;
7321
- ((int32_t*)c->data)[5] = p0;
7322
- ((int32_t*)c->data)[6] = p1;
7323
- ggml_scratch_load(ctx);
7001
+ int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
7002
+ ggml_set_op_params(result, &params, sizeof(params));
7324
7003
 
7325
7004
  result->op = GGML_OP_POOL_2D;
7326
7005
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7327
7006
  result->src[0] = a;
7328
- result->src[1] = c;
7329
7007
 
7330
7008
  return result;
7331
7009
  }
@@ -7348,14 +7026,16 @@ struct ggml_tensor * ggml_flash_attn(
7348
7026
  }
7349
7027
 
7350
7028
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
7351
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, q->ne);
7029
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne);
7030
+
7031
+ int32_t t = masked ? 1 : 0;
7032
+ ggml_set_op_params(result, &t, sizeof(t));
7352
7033
 
7353
7034
  result->op = GGML_OP_FLASH_ATTN;
7354
7035
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7355
7036
  result->src[0] = q;
7356
7037
  result->src[1] = k;
7357
7038
  result->src[2] = v;
7358
- result->src[3] = ggml_new_i32(ctx, masked ? 1 : 0);
7359
7039
 
7360
7040
  return result;
7361
7041
  }
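
Aside: the flash-attention masked flag follows the same pattern — formerly a one-element I32 tensor at src[3], now the first int32 of op_params. A sketch, assuming the ggml_get_op_params_i32 accessor (the one the forward dispatch later in this diff uses) is visible at the call site:

    #include <stdbool.h>
    #include "ggml.h"

    /* illustrative helper: recover the masked flag from a
       GGML_OP_FLASH_ATTN node after this change */
    static bool flash_attn_is_masked(const struct ggml_tensor * t) {
        return ggml_get_op_params_i32(t, 0) != 0;
    }
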
@@ -7379,7 +7059,7 @@ struct ggml_tensor * ggml_flash_ff(
7379
7059
  }
7380
7060
 
7381
7061
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
7382
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, a->ne);
7062
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne);
7383
7063
 
7384
7064
  result->op = GGML_OP_FLASH_FF;
7385
7065
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7445,13 +7125,15 @@ struct ggml_tensor * ggml_flash_attn_back(
7445
7125
 
7446
7126
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7447
7127
 
7128
+ int32_t masked_i = masked ? 1 : 0;
7129
+ ggml_set_op_params(result, &masked_i, sizeof(masked_i));
7130
+
7448
7131
  result->op = GGML_OP_FLASH_ATTN_BACK;
7449
7132
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7450
7133
  result->src[0] = q;
7451
7134
  result->src[1] = k;
7452
7135
  result->src[2] = v;
7453
7136
  result->src[3] = d;
7454
- result->src[4] = ggml_new_i32(ctx, masked ? 1 : 0);
7455
7137
 
7456
7138
  return result;
7457
7139
  }
@@ -7484,21 +7166,12 @@ struct ggml_tensor * ggml_win_part(
7484
7166
 
7485
7167
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7486
7168
 
7487
- ggml_scratch_save(ctx);
7488
-
7489
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7490
-
7491
- ((int32_t *) b->data)[0] = npx;
7492
- ((int32_t *) b->data)[1] = npy;
7493
- ((int32_t *) b->data)[2] = w;
7494
-
7495
- ggml_scratch_load(ctx);
7169
+ int32_t params[] = { npx, npy, w };
7170
+ ggml_set_op_params(result, &params, sizeof(params));
7496
7171
 
7497
7172
  result->op = GGML_OP_WIN_PART;
7498
7173
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7499
7174
  result->src[0] = a;
7500
- result->src[1] = NULL;
7501
- result->src[2] = b;
7502
7175
 
7503
7176
  return result;
7504
7177
  }
@@ -7523,26 +7196,57 @@ struct ggml_tensor * ggml_win_unpart(
7523
7196
  const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
7524
7197
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7525
7198
 
7526
- ggml_scratch_save(ctx);
7199
+ int32_t params[] = { w };
7200
+ ggml_set_op_params(result, &params, sizeof(params));
7527
7201
 
7528
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
7202
+ result->op = GGML_OP_WIN_UNPART;
7203
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7204
+ result->src[0] = a;
7529
7205
 
7530
- ((int32_t *) b->data)[0] = w;
7206
+ return result;
7207
+ }
7531
7208
 
7532
- ggml_scratch_load(ctx);
7209
+ // ggml_unary
7533
7210
 
7534
- result->op = GGML_OP_WIN_UNPART;
7211
+ static struct ggml_tensor * ggml_unary_impl(
7212
+ struct ggml_context * ctx,
7213
+ struct ggml_tensor * a,
7214
+ enum ggml_unary_op op,
7215
+ bool inplace) {
7216
+ bool is_node = false;
7217
+
7218
+ if (!inplace && (a->grad)) {
7219
+ is_node = true;
7220
+ }
7221
+
7222
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7223
+
7224
+ ggml_set_op_params_i32(result, 0, (int32_t) op);
7225
+
7226
+ result->op = GGML_OP_UNARY;
7535
7227
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7536
7228
  result->src[0] = a;
7537
- result->src[1] = NULL;
7538
- result->src[2] = b;
7539
7229
 
7540
7230
  return result;
7541
7231
  }
7542
7232
 
7233
+ struct ggml_tensor * ggml_unary(
7234
+ struct ggml_context * ctx,
7235
+ struct ggml_tensor * a,
7236
+ enum ggml_unary_op op) {
7237
+ return ggml_unary_impl(ctx, a, op, false);
7238
+ }
7239
+
7240
+ struct ggml_tensor * ggml_unary_inplace(
7241
+ struct ggml_context * ctx,
7242
+ struct ggml_tensor * a,
7243
+ enum ggml_unary_op op) {
7244
+ return ggml_unary_impl(ctx, a, op, true);
7245
+ }
7246
+
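
Aside: the ten dedicated activation ops removed from GGML_OP_NAME earlier in this diff reappear here as a single GGML_OP_UNARY whose concrete kind is stored in op_params slot 0 and recovered with ggml_get_unary_op() at dispatch time. A sketch of building a RELU under the new scheme, assuming the GGML_UNARY_OP_* enumerators added by the matching ggml.h change:

    #include "ggml.h"

    /* illustrative: one GGML_OP_UNARY node replaces the old per-op
       builders; the op kind travels in op_params, not in the op enum */
    static struct ggml_tensor * build_relu(struct ggml_context * ctx,
                                           struct ggml_tensor  * x) {
        return ggml_unary(ctx, x, GGML_UNARY_OP_RELU);  // was: ggml_relu(ctx, x)
    }
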
7543
7247
  // ggml_map_unary
7544
7248
 
7545
- struct ggml_tensor * ggml_map_unary_impl_f32(
7249
+ static struct ggml_tensor * ggml_map_unary_impl_f32(
7546
7250
  struct ggml_context * ctx,
7547
7251
  struct ggml_tensor * a,
7548
7252
  const ggml_unary_op_f32_t fun,
@@ -7553,19 +7257,13 @@ struct ggml_tensor * ggml_map_unary_impl_f32(
7553
7257
  is_node = true;
7554
7258
  }
7555
7259
 
7556
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7557
-
7558
- ggml_scratch_save(ctx);
7559
-
7560
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7561
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7260
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7562
7261
 
7563
- ggml_scratch_load(ctx);
7262
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7564
7263
 
7565
7264
  result->op = GGML_OP_MAP_UNARY;
7566
7265
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7567
7266
  result->src[0] = a;
7568
- result->src[2] = addr_tensor;
7569
7267
 
7570
7268
  return result;
7571
7269
  }
@@ -7586,7 +7284,7 @@ struct ggml_tensor * ggml_map_unary_inplace_f32(
7586
7284
 
7587
7285
  // ggml_map_binary
7588
7286
 
7589
- struct ggml_tensor * ggml_map_binary_impl_f32(
7287
+ static struct ggml_tensor * ggml_map_binary_impl_f32(
7590
7288
  struct ggml_context * ctx,
7591
7289
  struct ggml_tensor * a,
7592
7290
  struct ggml_tensor * b,
@@ -7600,20 +7298,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
7600
7298
  is_node = true;
7601
7299
  }
7602
7300
 
7603
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7604
-
7605
- ggml_scratch_save(ctx);
7606
-
7607
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7608
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7301
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7609
7302
 
7610
- ggml_scratch_load(ctx);
7303
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7611
7304
 
7612
7305
  result->op = GGML_OP_MAP_BINARY;
7613
7306
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7614
7307
  result->src[0] = a;
7615
7308
  result->src[1] = b;
7616
- result->src[2] = addr_tensor;
7617
7309
 
7618
7310
  return result;
7619
7311
  }
@@ -7636,7 +7328,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
7636
7328
 
7637
7329
  // ggml_map_custom1
7638
7330
 
7639
- struct ggml_tensor * ggml_map_custom1_impl_f32(
7331
+ static struct ggml_tensor * ggml_map_custom1_impl_f32(
7640
7332
  struct ggml_context * ctx,
7641
7333
  struct ggml_tensor * a,
7642
7334
  const ggml_custom1_op_f32_t fun,
@@ -7647,19 +7339,13 @@ struct ggml_tensor * ggml_map_custom1_impl_f32(
7647
7339
  is_node = true;
7648
7340
  }
7649
7341
 
7650
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7651
-
7652
- ggml_scratch_save(ctx);
7653
-
7654
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7655
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7342
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7656
7343
 
7657
- ggml_scratch_load(ctx);
7344
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7658
7345
 
7659
7346
  result->op = GGML_OP_MAP_CUSTOM1;
7660
7347
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7661
7348
  result->src[0] = a;
7662
- result->src[2] = addr_tensor;
7663
7349
 
7664
7350
  return result;
7665
7351
  }
@@ -7680,7 +7366,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32(
7680
7366
 
7681
7367
  // ggml_map_custom2
7682
7368
 
7683
- struct ggml_tensor * ggml_map_custom2_impl_f32(
7369
+ static struct ggml_tensor * ggml_map_custom2_impl_f32(
7684
7370
  struct ggml_context * ctx,
7685
7371
  struct ggml_tensor * a,
7686
7372
  struct ggml_tensor * b,
@@ -7692,20 +7378,14 @@ struct ggml_tensor * ggml_map_custom2_impl_f32(
7692
7378
  is_node = true;
7693
7379
  }
7694
7380
 
7695
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7696
-
7697
- ggml_scratch_save(ctx);
7698
-
7699
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7700
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7381
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7701
7382
 
7702
- ggml_scratch_load(ctx);
7383
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7703
7384
 
7704
7385
  result->op = GGML_OP_MAP_CUSTOM2;
7705
7386
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7706
7387
  result->src[0] = a;
7707
7388
  result->src[1] = b;
7708
- result->src[2] = addr_tensor;
7709
7389
 
7710
7390
  return result;
7711
7391
  }
@@ -7728,7 +7408,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32(
7728
7408
 
7729
7409
  // ggml_map_custom3
7730
7410
 
7731
- struct ggml_tensor * ggml_map_custom3_impl_f32(
7411
+ static struct ggml_tensor * ggml_map_custom3_impl_f32(
7732
7412
  struct ggml_context * ctx,
7733
7413
  struct ggml_tensor * a,
7734
7414
  struct ggml_tensor * b,
@@ -7741,21 +7421,15 @@ struct ggml_tensor * ggml_map_custom3_impl_f32(
7741
7421
  is_node = true;
7742
7422
  }
7743
7423
 
7744
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7745
-
7746
- ggml_scratch_save(ctx);
7747
-
7748
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7749
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7424
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7750
7425
 
7751
- ggml_scratch_load(ctx);
7426
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7752
7427
 
7753
7428
  result->op = GGML_OP_MAP_CUSTOM3;
7754
7429
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7755
7430
  result->src[0] = a;
7756
7431
  result->src[1] = b;
7757
- result->src[2] = addr_tensor;
7758
- result->src[3] = c;
7432
+ result->src[2] = c;
7759
7433
 
7760
7434
  return result;
7761
7435
  }
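
Aside: the map_* builders now byte-copy the callback pointer into op_params instead of storing it behind an I32 scratch tensor. Reading it back is the mirror memcpy, as in this sketch (ggml_unary_op_f32_t is the callback typedef these ops already take; the helper name is illustrative):

    #include <string.h>
    #include "ggml.h"

    /* illustrative: the GGML_OP_MAP_UNARY dispatch later in this diff
       does exactly this copy to recover the function pointer */
    static ggml_unary_op_f32_t map_unary_callback(const struct ggml_tensor * t) {
        ggml_unary_op_f32_t fun;
        memcpy(&fun, t->op_params, sizeof(fun));
        return fun;
    }
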
@@ -8983,21 +8657,17 @@ static void ggml_compute_forward_acc_f32(
8983
8657
  const struct ggml_compute_params * params,
8984
8658
  const struct ggml_tensor * src0,
8985
8659
  const struct ggml_tensor * src1,
8986
- const struct ggml_tensor * opt0,
8987
8660
  struct ggml_tensor * dst) {
8988
8661
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
8989
8662
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
8990
8663
 
8991
- GGML_ASSERT(opt0->type == GGML_TYPE_I32);
8992
- GGML_ASSERT(ggml_nelements(opt0) == 5);
8993
-
8994
8664
  // view src0 and dst with these strides and data offset inbytes during acc
8995
8665
  // nb0 is implicitly element_size because src0 and dst are contiguous
8996
- size_t nb1 = ((int32_t *) opt0->data)[0];
8997
- size_t nb2 = ((int32_t *) opt0->data)[1];
8998
- size_t nb3 = ((int32_t *) opt0->data)[2];
8999
- size_t offset = ((int32_t *) opt0->data)[3];
9000
- bool inplace = (bool) ((int32_t *) opt0->data)[4];
8666
+ size_t nb1 = ((int32_t *) dst->op_params)[0];
8667
+ size_t nb2 = ((int32_t *) dst->op_params)[1];
8668
+ size_t nb3 = ((int32_t *) dst->op_params)[2];
8669
+ size_t offset = ((int32_t *) dst->op_params)[3];
8670
+ bool inplace = (bool) ((int32_t *) dst->op_params)[4];
9001
8671
 
9002
8672
  if (!inplace && (params->type == GGML_TASK_INIT)) {
9003
8673
  // memcpy needs to be synchronized across threads to avoid race conditions.
@@ -9066,13 +8736,12 @@ static void ggml_compute_forward_acc(
9066
8736
  const struct ggml_compute_params * params,
9067
8737
  const struct ggml_tensor * src0,
9068
8738
  const struct ggml_tensor * src1,
9069
- const struct ggml_tensor * opt0,
9070
8739
  struct ggml_tensor * dst) {
9071
8740
 
9072
8741
  switch (src0->type) {
9073
8742
  case GGML_TYPE_F32:
9074
8743
  {
9075
- ggml_compute_forward_acc_f32(params, src0, src1, opt0, dst);
8744
+ ggml_compute_forward_acc_f32(params, src0, src1, dst);
9076
8745
  } break;
9077
8746
  case GGML_TYPE_F16:
9078
8747
  case GGML_TYPE_Q4_0:
@@ -9504,7 +9173,7 @@ static void ggml_compute_forward_sum_f32(
9504
9173
  for (int64_t i03 = 0; i03 < ne03; i03++) {
9505
9174
  for (int64_t i02 = 0; i02 < ne02; i02++) {
9506
9175
  for (int64_t i01 = 0; i01 < ne01; i01++) {
9507
- ggml_vec_sum_ggf(ne00,
9176
+ ggml_vec_sum_f32_ggf(ne00,
9508
9177
  &row_sum,
9509
9178
  (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
9510
9179
  sum += row_sum;
@@ -9514,6 +9183,38 @@ static void ggml_compute_forward_sum_f32(
9514
9183
  ((float *) dst->data)[0] = sum;
9515
9184
  }
9516
9185
 
9186
+ static void ggml_compute_forward_sum_f16(
9187
+ const struct ggml_compute_params * params,
9188
+ const struct ggml_tensor * src0,
9189
+ struct ggml_tensor * dst) {
9190
+ assert(params->ith == 0);
9191
+ assert(ggml_is_scalar(dst));
9192
+
9193
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9194
+ return;
9195
+ }
9196
+
9197
+ assert(src0->nb[0] == sizeof(ggml_fp16_t));
9198
+
9199
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
9200
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb);
9201
+
9202
+ float sum = 0;
9203
+ float row_sum = 0;
9204
+
9205
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
9206
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
9207
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
9208
+ ggml_vec_sum_f16_ggf(ne00,
9209
+ &row_sum,
9210
+ (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
9211
+ sum += row_sum;
9212
+ }
9213
+ }
9214
+ }
9215
+ ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
9216
+ }
9217
+
9517
9218
  static void ggml_compute_forward_sum(
9518
9219
  const struct ggml_compute_params * params,
9519
9220
  const struct ggml_tensor * src0,
@@ -9523,6 +9224,10 @@ static void ggml_compute_forward_sum(
9523
9224
  {
9524
9225
  ggml_compute_forward_sum_f32(params, src0, dst);
9525
9226
  } break;
9227
+ case GGML_TYPE_F16:
9228
+ {
9229
+ ggml_compute_forward_sum_f16(params, src0, dst);
9230
+ } break;
9526
9231
  default:
9527
9232
  {
9528
9233
  GGML_ASSERT(false);
@@ -10118,8 +9823,8 @@ static void ggml_compute_forward_gelu_f32(
10118
9823
  const struct ggml_compute_params * params,
10119
9824
  const struct ggml_tensor * src0,
10120
9825
  struct ggml_tensor * dst) {
10121
- GGML_ASSERT(ggml_is_contiguous(src0));
10122
- GGML_ASSERT(ggml_is_contiguous(dst));
9826
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9827
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10123
9828
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10124
9829
 
10125
9830
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -10177,8 +9882,8 @@ static void ggml_compute_forward_gelu_quick_f32(
10177
9882
  const struct ggml_compute_params * params,
10178
9883
  const struct ggml_tensor * src0,
10179
9884
  struct ggml_tensor * dst) {
10180
- GGML_ASSERT(ggml_is_contiguous(src0));
10181
- GGML_ASSERT(ggml_is_contiguous(dst));
9885
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9886
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10182
9887
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10183
9888
 
10184
9889
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -10236,8 +9941,8 @@ static void ggml_compute_forward_silu_f32(
10236
9941
  const struct ggml_compute_params * params,
10237
9942
  const struct ggml_tensor * src0,
10238
9943
  struct ggml_tensor * dst) {
10239
- GGML_ASSERT(ggml_is_contiguous(src0));
10240
- GGML_ASSERT(ggml_is_contiguous(dst));
9944
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9945
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10241
9946
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10242
9947
 
10243
9948
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -10289,7 +9994,6 @@ static void ggml_compute_forward_silu(
10289
9994
  }
10290
9995
  }
10291
9996
 
10292
-
10293
9997
  // ggml_compute_forward_silu_back
10294
9998
 
10295
9999
  static void ggml_compute_forward_silu_back_f32(
@@ -10297,9 +10001,9 @@ static void ggml_compute_forward_silu_back_f32(
10297
10001
  const struct ggml_tensor * src0,
10298
10002
  const struct ggml_tensor * grad,
10299
10003
  struct ggml_tensor * dst) {
10300
- GGML_ASSERT(ggml_is_contiguous(grad));
10301
- GGML_ASSERT(ggml_is_contiguous(src0));
10302
- GGML_ASSERT(ggml_is_contiguous(dst));
10004
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
10005
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
10006
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10303
10007
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10304
10008
  GGML_ASSERT(ggml_are_same_shape(src0, grad));
10305
10009
 
@@ -10439,7 +10143,8 @@ static void ggml_compute_forward_rms_norm_f32(
10439
10143
 
10440
10144
  GGML_TENSOR_UNARY_OP_LOCALS;
10441
10145
 
10442
- const float eps = 1e-6f; // TODO: make this a parameter
10146
+ float eps;
10147
+ memcpy(&eps, dst->op_params, sizeof(float));
10443
10148
 
10444
10149
  // TODO: optimize
10445
10150
  for (int64_t i03 = 0; i03 < ne03; i03++) {
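
Aside: rms_norm's epsilon stops being a hard-coded 1e-6f and is carried in op_params (the builder-side write is implied by the matching ggml_rms_norm change, not shown here). Because op_params is an int32-typed buffer, float values are copied out bytewise rather than dereferenced through a cast, which would invite strict-aliasing trouble. A sketch:

    #include <string.h>
    #include "ggml.h"

    /* illustrative: read a float parameter out of op_params safely */
    static float rms_norm_eps(const struct ggml_tensor * t) {
        float eps;
        memcpy(&eps, t->op_params, sizeof(eps));
        return eps;
    }
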
@@ -11092,21 +10797,17 @@ static void ggml_compute_forward_set_f32(
11092
10797
  const struct ggml_compute_params * params,
11093
10798
  const struct ggml_tensor * src0,
11094
10799
  const struct ggml_tensor * src1,
11095
- const struct ggml_tensor * opt0,
11096
10800
  struct ggml_tensor * dst) {
11097
10801
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
11098
10802
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
11099
10803
 
11100
- GGML_ASSERT(opt0->type == GGML_TYPE_I32);
11101
- GGML_ASSERT(ggml_nelements(opt0) == 5);
11102
-
11103
10804
  // view src0 and dst with these strides and data offset inbytes during set
11104
10805
  // nb0 is implicitely element_size because src0 and dst are contiguous
11105
- size_t nb1 = ((int32_t *) opt0->data)[0];
11106
- size_t nb2 = ((int32_t *) opt0->data)[1];
11107
- size_t nb3 = ((int32_t *) opt0->data)[2];
11108
- size_t offset = ((int32_t *) opt0->data)[3];
11109
- bool inplace = (bool) ((int32_t *) opt0->data)[4];
10806
+ size_t nb1 = ((int32_t *) dst->op_params)[0];
10807
+ size_t nb2 = ((int32_t *) dst->op_params)[1];
10808
+ size_t nb3 = ((int32_t *) dst->op_params)[2];
10809
+ size_t offset = ((int32_t *) dst->op_params)[3];
10810
+ bool inplace = (bool) ((int32_t *) dst->op_params)[4];
11110
10811
 
11111
10812
  if (!inplace && (params->type == GGML_TASK_INIT)) {
11112
10813
  // memcpy needs to be synchronized across threads to avoid race conditions.
@@ -11166,13 +10867,12 @@ static void ggml_compute_forward_set(
11166
10867
  const struct ggml_compute_params * params,
11167
10868
  const struct ggml_tensor * src0,
11168
10869
  const struct ggml_tensor * src1,
11169
- const struct ggml_tensor * opt0,
11170
10870
  struct ggml_tensor * dst) {
11171
10871
 
11172
10872
  switch (src0->type) {
11173
10873
  case GGML_TYPE_F32:
11174
10874
  {
11175
- ggml_compute_forward_set_f32(params, src0, src1, opt0, dst);
10875
+ ggml_compute_forward_set_f32(params, src0, src1, dst);
11176
10876
  } break;
11177
10877
  case GGML_TYPE_F16:
11178
10878
  case GGML_TYPE_Q4_0:
@@ -11568,17 +11268,14 @@ static void ggml_compute_forward_diag(
11568
11268
  static void ggml_compute_forward_diag_mask_f32(
11569
11269
  const struct ggml_compute_params * params,
11570
11270
  const struct ggml_tensor * src0,
11571
- const struct ggml_tensor * src1,
11572
11271
  struct ggml_tensor * dst,
11573
11272
  const float value) {
11574
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
11575
- GGML_ASSERT(ggml_nelements(src1) == 2);
11576
11273
 
11577
11274
  const int ith = params->ith;
11578
11275
  const int nth = params->nth;
11579
11276
 
11580
- const int n_past = ((int32_t *) src1->data)[0];
11581
- const bool inplace = (bool)((int32_t *) src1->data)[1];
11277
+ const int n_past = ((int32_t *) dst->op_params)[0];
11278
+ const bool inplace = (bool)((int32_t *) dst->op_params)[1];
11582
11279
 
11583
11280
  GGML_ASSERT(n_past >= 0);
11584
11281
 
@@ -11621,12 +11318,11 @@ static void ggml_compute_forward_diag_mask_f32(
11621
11318
  static void ggml_compute_forward_diag_mask_inf(
11622
11319
  const struct ggml_compute_params * params,
11623
11320
  const struct ggml_tensor * src0,
11624
- const struct ggml_tensor * src1,
11625
11321
  struct ggml_tensor * dst) {
11626
11322
  switch (src0->type) {
11627
11323
  case GGML_TYPE_F32:
11628
11324
  {
11629
- ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, -INFINITY);
11325
+ ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY);
11630
11326
  } break;
11631
11327
  default:
11632
11328
  {
@@ -11638,12 +11334,11 @@ static void ggml_compute_forward_diag_mask_inf(
11638
11334
  static void ggml_compute_forward_diag_mask_zero(
11639
11335
  const struct ggml_compute_params * params,
11640
11336
  const struct ggml_tensor * src0,
11641
- const struct ggml_tensor * src1,
11642
11337
  struct ggml_tensor * dst) {
11643
11338
  switch (src0->type) {
11644
11339
  case GGML_TYPE_F32:
11645
11340
  {
11646
- ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, 0);
11341
+ ggml_compute_forward_diag_mask_f32(params, src0, dst, 0);
11647
11342
  } break;
11648
11343
  default:
11649
11344
  {
@@ -11841,20 +11536,17 @@ static void ggml_compute_forward_soft_max_back(
11841
11536
  static void ggml_compute_forward_alibi_f32(
11842
11537
  const struct ggml_compute_params * params,
11843
11538
  const struct ggml_tensor * src0,
11844
- const struct ggml_tensor * src1,
11845
11539
  struct ggml_tensor * dst) {
11846
11540
  assert(params->ith == 0);
11847
11541
 
11848
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
11849
- GGML_ASSERT(ggml_nelements(src1) == 3);
11850
-
11851
11542
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11852
11543
  return;
11853
11544
  }
11854
11545
 
11855
- const int n_past = ((int32_t *) src1->data)[0];
11856
- const int n_head = ((int32_t *) src1->data)[1];
11857
- const float max_bias = ((float *) src1->data)[2];
11546
+ const int n_past = ((int32_t *) dst->op_params)[0];
11547
+ const int n_head = ((int32_t *) dst->op_params)[1];
11548
+ float max_bias;
11549
+ memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
11858
11550
 
11859
11551
  assert(n_past >= 0);
11860
11552
 
@@ -11907,20 +11599,17 @@ static void ggml_compute_forward_alibi_f32(
11907
11599
  static void ggml_compute_forward_alibi_f16(
11908
11600
  const struct ggml_compute_params * params,
11909
11601
  const struct ggml_tensor * src0,
11910
- const struct ggml_tensor * src1,
11911
11602
  struct ggml_tensor * dst) {
11912
11603
  assert(params->ith == 0);
11913
11604
 
11914
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
11915
- GGML_ASSERT(ggml_nelements(src1) == 3);
11916
-
11917
11605
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11918
11606
  return;
11919
11607
  }
11920
11608
 
11921
- const int n_past = ((int32_t *) src1->data)[0];
11922
- const int n_head = ((int32_t *) src1->data)[1];
11923
- const float max_bias = ((float *) src1->data)[2];
11609
+ const int n_past = ((int32_t *) dst->op_params)[0];
11610
+ const int n_head = ((int32_t *) dst->op_params)[1];
11611
+ float max_bias;
11612
+ memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
11924
11613
 
11925
11614
  assert(n_past >= 0);
11926
11615
 
@@ -11973,16 +11662,15 @@ static void ggml_compute_forward_alibi_f16(
11973
11662
  static void ggml_compute_forward_alibi(
11974
11663
  const struct ggml_compute_params * params,
11975
11664
  const struct ggml_tensor * src0,
11976
- const struct ggml_tensor * src1,
11977
11665
  struct ggml_tensor * dst) {
11978
11666
  switch (src0->type) {
11979
11667
  case GGML_TYPE_F16:
11980
11668
  {
11981
- ggml_compute_forward_alibi_f16(params, src0, src1, dst);
11669
+ ggml_compute_forward_alibi_f16(params, src0, dst);
11982
11670
  } break;
11983
11671
  case GGML_TYPE_F32:
11984
11672
  {
11985
- ggml_compute_forward_alibi_f32(params, src0, src1, dst);
11673
+ ggml_compute_forward_alibi_f32(params, src0, dst);
11986
11674
  } break;
11987
11675
  case GGML_TYPE_Q4_0:
11988
11676
  case GGML_TYPE_Q4_1:
@@ -12012,19 +11700,17 @@ static void ggml_compute_forward_alibi(
12012
11700
  static void ggml_compute_forward_clamp_f32(
12013
11701
  const struct ggml_compute_params * params,
12014
11702
  const struct ggml_tensor * src0,
12015
- const struct ggml_tensor * src1,
12016
11703
  struct ggml_tensor * dst) {
12017
11704
  assert(params->ith == 0);
12018
11705
 
12019
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
12020
- GGML_ASSERT(ggml_nelements(src1) == 2);
12021
-
12022
11706
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12023
11707
  return;
12024
11708
  }
12025
11709
 
12026
- const float min = ((float *) src1->data)[0];
12027
- const float max = ((float *) src1->data)[1];
11710
+ float min;
11711
+ float max;
11712
+ memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
11713
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
12028
11714
 
12029
11715
  const int ith = params->ith;
12030
11716
  const int nth = params->nth;
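
Aside: clamp keeps both bounds as floats in the first two 4-byte op_params slots, while alibi above mixes two int32 values with a trailing float, so readers address slots explicitly and memcpy the bytes out. A sketch of the clamp read (helper name illustrative):

    #include <string.h>
    #include "ggml.h"

    /* illustrative: min/max occupy float slots 0 and 1 of op_params */
    static void clamp_bounds(const struct ggml_tensor * t,
                             float * min, float * max) {
        memcpy(min, (const float *) t->op_params + 0, sizeof(float));
        memcpy(max, (const float *) t->op_params + 1, sizeof(float));
    }
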
@@ -12054,12 +11740,11 @@ static void ggml_compute_forward_clamp_f32(
12054
11740
  static void ggml_compute_forward_clamp(
12055
11741
  const struct ggml_compute_params * params,
12056
11742
  const struct ggml_tensor * src0,
12057
- const struct ggml_tensor * src1,
12058
11743
  struct ggml_tensor * dst) {
12059
11744
  switch (src0->type) {
12060
11745
  case GGML_TYPE_F32:
12061
11746
  {
12062
- ggml_compute_forward_clamp_f32(params, src0, src1, dst);
11747
+ ggml_compute_forward_clamp_f32(params, src0, dst);
12063
11748
  } break;
12064
11749
  case GGML_TYPE_F16:
12065
11750
  case GGML_TYPE_Q4_0:
@@ -12089,10 +11774,7 @@ static void ggml_compute_forward_clamp(
12089
11774
  static void ggml_compute_forward_rope_f32(
12090
11775
  const struct ggml_compute_params * params,
12091
11776
  const struct ggml_tensor * src0,
12092
- const struct ggml_tensor * src1,
12093
11777
  struct ggml_tensor * dst) {
12094
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
12095
- GGML_ASSERT(ggml_nelements(src1) == 6);
12096
11778
 
12097
11779
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12098
11780
  return;
@@ -12101,12 +11783,12 @@ static void ggml_compute_forward_rope_f32(
12101
11783
  float freq_base;
12102
11784
  float freq_scale;
12103
11785
 
12104
- const int n_past = ((int32_t *) src1->data)[0];
12105
- const int n_dims = ((int32_t *) src1->data)[1];
12106
- const int mode = ((int32_t *) src1->data)[2];
12107
- const int n_ctx = ((int32_t *) src1->data)[3];
12108
- memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
12109
- memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
11786
+ const int n_past = ((int32_t *) dst->op_params)[0];
11787
+ const int n_dims = ((int32_t *) dst->op_params)[1];
11788
+ const int mode = ((int32_t *) dst->op_params)[2];
11789
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
11790
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
11791
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
12110
11792
 
12111
11793
  assert(n_past >= 0);
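
Aside: rope packs six options into op_params — four int32 values, then two floats stored byte-for-byte in slots 4 and 5, exactly as the reads above expect. A builder-side sketch of that packing (the actual ggml_rope write is assumed from elsewhere in this release's diff):

    #include <stdint.h>
    #include <string.h>

    /* illustrative: the six-slot layout consumed by the rope kernels */
    static void pack_rope_params(int32_t params[6],
                                 int n_past, int n_dims, int mode, int n_ctx,
                                 float freq_base, float freq_scale) {
        params[0] = n_past;
        params[1] = n_dims;
        params[2] = mode;
        params[3] = n_ctx;
        memcpy(&params[4], &freq_base,  sizeof(float));
        memcpy(&params[5], &freq_scale, sizeof(float));
    }
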
12112
11794
 
@@ -12221,10 +11903,7 @@ static void ggml_compute_forward_rope_f32(
12221
11903
  static void ggml_compute_forward_rope_f16(
12222
11904
  const struct ggml_compute_params * params,
12223
11905
  const struct ggml_tensor * src0,
12224
- const struct ggml_tensor * src1,
12225
11906
  struct ggml_tensor * dst) {
12226
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
12227
- GGML_ASSERT(ggml_nelements(src1) == 6);
12228
11907
 
12229
11908
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12230
11909
  return;
@@ -12233,12 +11912,12 @@ static void ggml_compute_forward_rope_f16(
12233
11912
  float freq_base;
12234
11913
  float freq_scale;
12235
11914
 
12236
- const int n_past = ((int32_t *) src1->data)[0];
12237
- const int n_dims = ((int32_t *) src1->data)[1];
12238
- const int mode = ((int32_t *) src1->data)[2];
12239
- const int n_ctx = ((int32_t *) src1->data)[3];
12240
- memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
12241
- memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
11915
+ const int n_past = ((int32_t *) dst->op_params)[0];
11916
+ const int n_dims = ((int32_t *) dst->op_params)[1];
11917
+ const int mode = ((int32_t *) dst->op_params)[2];
11918
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
11919
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
11920
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
12242
11921
 
12243
11922
  assert(n_past >= 0);
12244
11923
 
@@ -12353,16 +12032,15 @@ static void ggml_compute_forward_rope_f16(
12353
12032
  static void ggml_compute_forward_rope(
12354
12033
  const struct ggml_compute_params * params,
12355
12034
  const struct ggml_tensor * src0,
12356
- const struct ggml_tensor * src1,
12357
12035
  struct ggml_tensor * dst) {
12358
12036
  switch (src0->type) {
12359
12037
  case GGML_TYPE_F16:
12360
12038
  {
12361
- ggml_compute_forward_rope_f16(params, src0, src1, dst);
12039
+ ggml_compute_forward_rope_f16(params, src0, dst);
12362
12040
  } break;
12363
12041
  case GGML_TYPE_F32:
12364
12042
  {
12365
- ggml_compute_forward_rope_f32(params, src0, src1, dst);
12043
+ ggml_compute_forward_rope_f32(params, src0, dst);
12366
12044
  } break;
12367
12045
  default:
12368
12046
  {
@@ -12376,10 +12054,7 @@ static void ggml_compute_forward_rope(
12376
12054
  static void ggml_compute_forward_rope_back_f32(
12377
12055
  const struct ggml_compute_params * params,
12378
12056
  const struct ggml_tensor * src0,
12379
- const struct ggml_tensor * src1,
12380
12057
  struct ggml_tensor * dst) {
12381
- assert(src1->type == GGML_TYPE_I32);
12382
- assert(ggml_nelements(src1) == 4);
12383
12058
 
12384
12059
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12385
12060
  return;
@@ -12389,9 +12064,9 @@ static void ggml_compute_forward_rope_back_f32(
12389
12064
  // dx = rope_back(dy, src1)
12390
12065
  // src0 is dy, src1 contains options
12391
12066
 
12392
- const int n_past = ((int32_t *) src1->data)[0];
12393
- const int n_dims = ((int32_t *) src1->data)[1];
12394
- const int mode = ((int32_t *) src1->data)[2];
12067
+ const int n_past = ((int32_t *) dst->op_params)[0];
12068
+ const int n_dims = ((int32_t *) dst->op_params)[1];
12069
+ const int mode = ((int32_t *) dst->op_params)[2];
12395
12070
 
12396
12071
  assert(n_past >= 0);
12397
12072
 
@@ -12475,10 +12150,7 @@ static void ggml_compute_forward_rope_back_f32(
12475
12150
  static void ggml_compute_forward_rope_back_f16(
12476
12151
  const struct ggml_compute_params * params,
12477
12152
  const struct ggml_tensor * src0,
12478
- const struct ggml_tensor * src1,
12479
12153
  struct ggml_tensor * dst) {
12480
- assert(src1->type == GGML_TYPE_I32);
12481
- assert(ggml_nelements(src1) == 3);
12482
12154
 
12483
12155
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12484
12156
  return;
@@ -12488,9 +12160,9 @@ static void ggml_compute_forward_rope_back_f16(
12488
12160
  // dx = rope_back(dy, src1)
12489
12161
  // src0 is dy, src1 contains options
12490
12162
 
12491
- const int n_past = ((int32_t *) src1->data)[0];
12492
- const int n_dims = ((int32_t *) src1->data)[1];
12493
- const int mode = ((int32_t *) src1->data)[2];
12163
+ const int n_past = ((int32_t *) dst->op_params)[0];
12164
+ const int n_dims = ((int32_t *) dst->op_params)[1];
12165
+ const int mode = ((int32_t *) dst->op_params)[2];
12494
12166
 
12495
12167
  assert(n_past >= 0);
12496
12168
 
@@ -12574,16 +12246,15 @@ static void ggml_compute_forward_rope_back_f16(
12574
12246
  static void ggml_compute_forward_rope_back(
12575
12247
  const struct ggml_compute_params * params,
12576
12248
  const struct ggml_tensor * src0,
12577
- const struct ggml_tensor * src1,
12578
12249
  struct ggml_tensor * dst) {
12579
12250
  switch (src0->type) {
12580
12251
  case GGML_TYPE_F16:
12581
12252
  {
12582
- ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
12253
+ ggml_compute_forward_rope_back_f16(params, src0, dst);
12583
12254
  } break;
12584
12255
  case GGML_TYPE_F32:
12585
12256
  {
12586
- ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
12257
+ ggml_compute_forward_rope_back_f32(params, src0, dst);
12587
12258
  } break;
12588
12259
  default:
12589
12260
  {
@@ -12780,7 +12451,7 @@ static void ggml_compute_forward_conv_1d_s1_ph(
12780
12451
  const struct ggml_compute_params * params,
12781
12452
  const struct ggml_tensor * src0,
12782
12453
  const struct ggml_tensor * src1,
12783
- struct ggml_tensor * dst) {
12454
+ struct ggml_tensor * dst) {
12784
12455
  switch (src0->type) {
12785
12456
  case GGML_TYPE_F16:
12786
12457
  {
@@ -12983,7 +12654,7 @@ static void ggml_compute_forward_conv_1d_s2_ph(
12983
12654
  const struct ggml_compute_params * params,
12984
12655
  const struct ggml_tensor * src0,
12985
12656
  const struct ggml_tensor * src1,
12986
- struct ggml_tensor * dst) {
12657
+ struct ggml_tensor * dst) {
12987
12658
  switch (src0->type) {
12988
12659
  case GGML_TYPE_F16:
12989
12660
  {
@@ -13003,14 +12674,13 @@ static void ggml_compute_forward_conv_1d_s2_ph(
13003
12674
  // ggml_compute_forward_conv_1d
13004
12675
 
13005
12676
  static void ggml_compute_forward_conv_1d(
13006
- const struct ggml_compute_params * params,
13007
- const struct ggml_tensor * src0,
13008
- const struct ggml_tensor * src1,
13009
- const struct ggml_tensor * opt0,
13010
- struct ggml_tensor * dst) {
13011
- const int32_t s0 = ((const int32_t*)(opt0->data))[0];
13012
- const int32_t p0 = ((const int32_t*)(opt0->data))[1];
13013
- const int32_t d0 = ((const int32_t*)(opt0->data))[2];
12677
+ const struct ggml_compute_params * params,
12678
+ const struct ggml_tensor * src0,
12679
+ const struct ggml_tensor * src1,
12680
+ struct ggml_tensor * dst) {
12681
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12682
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
12683
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
13014
12684
  GGML_ASSERT(d0 == 1); // dilation not supported
13015
12685
  GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported
13016
12686
  if (s0 == 1) {
@@ -13028,7 +12698,6 @@ static void ggml_compute_forward_conv_2d_f16_f32(
13028
12698
  const struct ggml_compute_params * params,
13029
12699
  const struct ggml_tensor * src0,
13030
12700
  const struct ggml_tensor * src1,
13031
- const struct ggml_tensor * opt0,
13032
12701
  struct ggml_tensor * dst) {
13033
12702
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
13034
12703
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -13048,12 +12717,12 @@ static void ggml_compute_forward_conv_2d_f16_f32(
13048
12717
  // size of the convolution row - the kernel size unrolled across all channels
13049
12718
  const int ew0 = nk0*nk1*ne02;
13050
12719
 
13051
- const int32_t s0 = ((const int32_t*)(opt0->data))[0];
13052
- const int32_t s1 = ((const int32_t*)(opt0->data))[1];
13053
- const int32_t p0 = ((const int32_t*)(opt0->data))[2];
13054
- const int32_t p1 = ((const int32_t*)(opt0->data))[3];
13055
- const int32_t d0 = ((const int32_t*)(opt0->data))[4];
13056
- const int32_t d1 = ((const int32_t*)(opt0->data))[5];
12720
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12721
+ const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
12722
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
12723
+ const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
12724
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
12725
+ const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
13057
12726
 
13058
12727
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13059
12728
  GGML_ASSERT(nb10 == sizeof(float));
@@ -13125,17 +12794,15 @@ static void ggml_compute_forward_conv_2d(
13125
12794
  const struct ggml_compute_params * params,
13126
12795
  const struct ggml_tensor * src0,
13127
12796
  const struct ggml_tensor * src1,
13128
- const struct ggml_tensor * opt0,
13129
- struct ggml_tensor * dst
13130
- ) {
12797
+ struct ggml_tensor * dst) {
13131
12798
  switch (src0->type) {
13132
12799
  case GGML_TYPE_F16:
13133
12800
  {
13134
- ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst);
12801
+ ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst);
13135
12802
  } break;
13136
12803
  case GGML_TYPE_F32:
13137
12804
  {
13138
- //ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst);
12805
+ //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
13139
12806
  GGML_ASSERT(false);
13140
12807
  } break;
13141
12808
  default:
@@ -13200,12 +12867,11 @@ static void ggml_compute_forward_pool_1d_sk_p0(
13200
12867
  // ggml_compute_forward_pool_1d
13201
12868
 
13202
12869
  static void ggml_compute_forward_pool_1d(
13203
- const struct ggml_compute_params* params,
13204
- const struct ggml_tensor* src0,
13205
- const struct ggml_tensor* opt0,
13206
- struct ggml_tensor* dst) {
13207
- GGML_ASSERT(opt0->ne[0] == 4);
13208
- const int* opts = (const int*)opt0->data;
12870
+ const struct ggml_compute_params * params,
12871
+ const struct ggml_tensor * src0,
12872
+ struct ggml_tensor * dst) {
12873
+
12874
+ const int32_t* opts = (const int32_t*)dst->op_params;
13209
12875
  enum ggml_op_pool op = opts[0];
13210
12876
  const int k0 = opts[1];
13211
12877
  const int s0 = opts[2];
@@ -13219,12 +12885,12 @@ static void ggml_compute_forward_pool_1d(
13219
12885
  // ggml_compute_forward_pool_2d_sk_p0
13220
12886
 
13221
12887
  static void ggml_compute_forward_pool_2d_sk_p0(
13222
- const struct ggml_compute_params * params,
13223
- const enum ggml_op_pool op,
13224
- const struct ggml_tensor * src,
13225
- const int k0,
13226
- const int k1,
13227
- struct ggml_tensor * dst) {
12888
+ const struct ggml_compute_params * params,
12889
+ const enum ggml_op_pool op,
12890
+ const struct ggml_tensor * src,
12891
+ const int k0,
12892
+ const int k1,
12893
+ struct ggml_tensor * dst) {
13228
12894
  assert(src->type == GGML_TYPE_F32);
13229
12895
  assert(params->ith == 0);
13230
12896
 
@@ -13284,12 +12950,11 @@ static void ggml_compute_forward_pool_2d_sk_p0(
13284
12950
  // ggml_compute_forward_pool_2d
13285
12951
 
13286
12952
  static void ggml_compute_forward_pool_2d(
13287
- const struct ggml_compute_params * params,
13288
- const struct ggml_tensor * src0,
13289
- const struct ggml_tensor * opt0,
13290
- struct ggml_tensor * dst) {
13291
- GGML_ASSERT(opt0->ne[0] == 7);
13292
- const int* opts = (const int*)opt0->data;
12953
+ const struct ggml_compute_params * params,
12954
+ const struct ggml_tensor * src0,
12955
+ struct ggml_tensor * dst) {
12956
+
12957
+ const int32_t * opts = (const int32_t *)dst->op_params;
13293
12958
  enum ggml_op_pool op = opts[0];
13294
12959
  const int k0 = opts[1];
13295
12960
  const int k1 = opts[2];
@@ -13314,7 +12979,7 @@ static void ggml_compute_forward_flash_attn_f32(
13314
12979
  const struct ggml_tensor * k,
13315
12980
  const struct ggml_tensor * v,
13316
12981
  const bool masked,
13317
- struct ggml_tensor * dst) {
12982
+ struct ggml_tensor * dst) {
13318
12983
  int64_t t0 = ggml_perf_time_us();
13319
12984
  UNUSED(t0);
13320
12985
 
@@ -13492,7 +13157,7 @@ static void ggml_compute_forward_flash_attn_f16(
13492
13157
  const struct ggml_tensor * k,
13493
13158
  const struct ggml_tensor * v,
13494
13159
  const bool masked,
13495
- struct ggml_tensor * dst) {
13160
+ struct ggml_tensor * dst) {
13496
13161
  int64_t t0 = ggml_perf_time_us();
13497
13162
  UNUSED(t0);
13498
13163
 
@@ -14257,7 +13922,6 @@ static void ggml_compute_forward_flash_attn_back(
14257
13922
  static void ggml_compute_forward_win_part_f32(
14258
13923
  const struct ggml_compute_params * params,
14259
13924
  const struct ggml_tensor * src0,
14260
- const struct ggml_tensor * opt0,
14261
13925
  struct ggml_tensor * dst) {
14262
13926
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14263
13927
  return;
@@ -14266,9 +13930,9 @@ static void ggml_compute_forward_win_part_f32(
14266
13930
  GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
14267
13931
  GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
14268
13932
 
14269
- const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
14270
- const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
14271
- const int32_t w = ((const int32_t *)(opt0->data))[2];
13933
+ const int32_t nep0 = ((const int32_t *)(dst->op_params))[0];
13934
+ const int32_t nep1 = ((const int32_t *)(dst->op_params))[1];
13935
+ const int32_t w = ((const int32_t *)(dst->op_params))[2];
14272
13936
 
14273
13937
  assert(ne00 == ne0);
14274
13938
  assert(ne3 == nep0*nep1);
@@ -14302,12 +13966,11 @@ static void ggml_compute_forward_win_part_f32(
14302
13966
  static void ggml_compute_forward_win_part(
14303
13967
  const struct ggml_compute_params * params,
14304
13968
  const struct ggml_tensor * src0,
14305
- const struct ggml_tensor * opt0,
14306
13969
  struct ggml_tensor * dst) {
14307
13970
  switch (src0->type) {
14308
13971
  case GGML_TYPE_F32:
14309
13972
  {
14310
- ggml_compute_forward_win_part_f32(params, src0, opt0, dst);
13973
+ ggml_compute_forward_win_part_f32(params, src0, dst);
14311
13974
  } break;
14312
13975
  default:
14313
13976
  {
@@ -14321,7 +13984,6 @@ static void ggml_compute_forward_win_part(
14321
13984
  static void ggml_compute_forward_win_unpart_f32(
14322
13985
  const struct ggml_compute_params * params,
14323
13986
  const struct ggml_tensor * src0,
14324
- const struct ggml_tensor * opt0,
14325
13987
  struct ggml_tensor * dst) {
14326
13988
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14327
13989
  return;
@@ -14330,7 +13992,7 @@ static void ggml_compute_forward_win_unpart_f32(
14330
13992
  GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
14331
13993
  GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
14332
13994
 
14333
- const int32_t w = ((const int32_t *)(opt0->data))[0];
13995
+ const int32_t w = ((const int32_t *)(dst->op_params))[0];
14334
13996
 
14335
13997
  // padding
14336
13998
  const int px = (w - ne1%w)%w;
@@ -14364,12 +14026,67 @@ static void ggml_compute_forward_win_unpart_f32(
14364
14026
  static void ggml_compute_forward_win_unpart(
14365
14027
  const struct ggml_compute_params * params,
14366
14028
  const struct ggml_tensor * src0,
14367
- const struct ggml_tensor * opt0,
14368
14029
  struct ggml_tensor * dst) {
14369
14030
  switch (src0->type) {
14370
14031
  case GGML_TYPE_F32:
14371
14032
  {
14372
- ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst);
14033
+ ggml_compute_forward_win_unpart_f32(params, src0, dst);
14034
+ } break;
14035
+ default:
14036
+ {
14037
+ GGML_ASSERT(false);
14038
+ } break;
14039
+ }
14040
+ }
14041
+
14042
+ // ggml_compute_forward_unary
14043
+
14044
+ static void ggml_compute_forward_unary(
14045
+ const struct ggml_compute_params * params,
14046
+ const struct ggml_tensor * src0,
14047
+ struct ggml_tensor * dst) {
14048
+ const enum ggml_unary_op op = ggml_get_unary_op(dst);
14049
+
14050
+ switch (op) {
14051
+ case GGML_UNARY_OP_ABS:
14052
+ {
14053
+ ggml_compute_forward_abs(params, src0, dst);
14054
+ } break;
14055
+ case GGML_UNARY_OP_SGN:
14056
+ {
14057
+ ggml_compute_forward_sgn(params, src0, dst);
14058
+ } break;
14059
+ case GGML_UNARY_OP_NEG:
14060
+ {
14061
+ ggml_compute_forward_neg(params, src0, dst);
14062
+ } break;
14063
+ case GGML_UNARY_OP_STEP:
14064
+ {
14065
+ ggml_compute_forward_step(params, src0, dst);
14066
+ } break;
14067
+ case GGML_UNARY_OP_TANH:
14068
+ {
14069
+ ggml_compute_forward_tanh(params, src0, dst);
14070
+ } break;
14071
+ case GGML_UNARY_OP_ELU:
14072
+ {
14073
+ ggml_compute_forward_elu(params, src0, dst);
14074
+ } break;
14075
+ case GGML_UNARY_OP_RELU:
14076
+ {
14077
+ ggml_compute_forward_relu(params, src0, dst);
14078
+ } break;
14079
+ case GGML_UNARY_OP_GELU:
14080
+ {
14081
+ ggml_compute_forward_gelu(params, src0, dst);
14082
+ } break;
14083
+ case GGML_UNARY_OP_GELU_QUICK:
14084
+ {
14085
+ ggml_compute_forward_gelu_quick(params, src0, dst);
14086
+ } break;
14087
+ case GGML_UNARY_OP_SILU:
14088
+ {
14089
+ ggml_compute_forward_silu(params, src0, dst);
14373
14090
  } break;
14374
14091
  default:
14375
14092
  {
@@ -14888,7 +14605,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14888
14605
  } break;
14889
14606
  case GGML_OP_ACC:
14890
14607
  {
14891
- ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14608
+ ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor);
14892
14609
  } break;
14893
14610
  case GGML_OP_SUB:
14894
14611
  {
@@ -14938,46 +14655,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14938
14655
  {
14939
14656
  ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
14940
14657
  } break;
14941
- case GGML_OP_ABS:
14942
- {
14943
- ggml_compute_forward_abs(params, tensor->src[0], tensor);
14944
- } break;
14945
- case GGML_OP_SGN:
14946
- {
14947
- ggml_compute_forward_sgn(params, tensor->src[0], tensor);
14948
- } break;
14949
- case GGML_OP_NEG:
14950
- {
14951
- ggml_compute_forward_neg(params, tensor->src[0], tensor);
14952
- } break;
14953
- case GGML_OP_STEP:
14954
- {
14955
- ggml_compute_forward_step(params, tensor->src[0], tensor);
14956
- } break;
14957
- case GGML_OP_TANH:
14958
- {
14959
- ggml_compute_forward_tanh(params, tensor->src[0], tensor);
14960
- } break;
14961
- case GGML_OP_ELU:
14962
- {
14963
- ggml_compute_forward_elu(params, tensor->src[0], tensor);
14964
- } break;
14965
- case GGML_OP_RELU:
14966
- {
14967
- ggml_compute_forward_relu(params, tensor->src[0], tensor);
14968
- } break;
14969
- case GGML_OP_GELU:
14970
- {
14971
- ggml_compute_forward_gelu(params, tensor->src[0], tensor);
14972
- } break;
14973
- case GGML_OP_GELU_QUICK:
14974
- {
14975
- ggml_compute_forward_gelu_quick(params, tensor->src[0], tensor);
14976
- } break;
14977
- case GGML_OP_SILU:
14978
- {
14979
- ggml_compute_forward_silu(params, tensor->src[0], tensor);
14980
- } break;
14981
14658
  case GGML_OP_SILU_BACK:
14982
14659
  {
14983
14660
  ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
@@ -15008,7 +14685,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15008
14685
  } break;
15009
14686
  case GGML_OP_SET:
15010
14687
  {
15011
- ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14688
+ ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor);
15012
14689
  } break;
15013
14690
  case GGML_OP_CPY:
15014
14691
  {
@@ -15048,11 +14725,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15048
14725
  } break;
15049
14726
  case GGML_OP_DIAG_MASK_INF:
15050
14727
  {
15051
- ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor->src[1], tensor);
14728
+ ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor);
15052
14729
  } break;
15053
14730
  case GGML_OP_DIAG_MASK_ZERO:
15054
14731
  {
15055
- ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor->src[1], tensor);
14732
+ ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor);
15056
14733
  } break;
15057
14734
  case GGML_OP_SOFT_MAX:
15058
14735
  {
@@ -15064,39 +14741,39 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15064
14741
  } break;
15065
14742
  case GGML_OP_ROPE:
15066
14743
  {
15067
- ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor);
14744
+ ggml_compute_forward_rope(params, tensor->src[0], tensor);
15068
14745
  } break;
15069
14746
  case GGML_OP_ROPE_BACK:
15070
14747
  {
15071
- ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor);
14748
+ ggml_compute_forward_rope_back(params, tensor->src[0], tensor);
15072
14749
  } break;
15073
14750
  case GGML_OP_ALIBI:
15074
14751
  {
15075
- ggml_compute_forward_alibi(params, tensor->src[0], tensor->src[1], tensor);
14752
+ ggml_compute_forward_alibi(params, tensor->src[0], tensor);
15076
14753
  } break;
15077
14754
  case GGML_OP_CLAMP:
15078
14755
  {
15079
- ggml_compute_forward_clamp(params, tensor->src[0], tensor->src[1], tensor);
14756
+ ggml_compute_forward_clamp(params, tensor->src[0], tensor);
15080
14757
  } break;
15081
14758
  case GGML_OP_CONV_1D:
15082
14759
  {
15083
- ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14760
+ ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
15084
14761
  } break;
15085
14762
  case GGML_OP_CONV_2D:
15086
14763
  {
15087
- ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14764
+ ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
15088
14765
  } break;
15089
14766
  case GGML_OP_POOL_1D:
15090
14767
  {
15091
- ggml_compute_forward_pool_1d(params, tensor->src[0], tensor->src[1], tensor);
14768
+ ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
15092
14769
  } break;
15093
14770
  case GGML_OP_POOL_2D:
15094
14771
  {
15095
- ggml_compute_forward_pool_2d(params, tensor->src[0], tensor->src[1], tensor);
14772
+ ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
15096
14773
  } break;
15097
14774
  case GGML_OP_FLASH_ATTN:
15098
14775
  {
15099
- const int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
14776
+ const int32_t t = ggml_get_op_params_i32(tensor, 0);
15100
14777
  GGML_ASSERT(t == 0 || t == 1);
15101
14778
  const bool masked = t != 0;
15102
14779
  ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
@@ -15107,47 +14784,56 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15107
14784
  } break;
15108
14785
  case GGML_OP_FLASH_ATTN_BACK:
15109
14786
  {
15110
- int32_t t = ggml_get_i32_1d(tensor->src[4], 0);
14787
+ int32_t t = ggml_get_op_params_i32(tensor, 0);
15111
14788
  GGML_ASSERT(t == 0 || t == 1);
15112
14789
  bool masked = t != 0;
15113
14790
  ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
15114
14791
  } break;
15115
14792
  case GGML_OP_WIN_PART:
15116
14793
  {
15117
- ggml_compute_forward_win_part(params, tensor->src[0], tensor->src[2], tensor);
14794
+ ggml_compute_forward_win_part(params, tensor->src[0], tensor);
15118
14795
  } break;
15119
14796
  case GGML_OP_WIN_UNPART:
15120
14797
  {
15121
- ggml_compute_forward_win_unpart(params, tensor->src[0], tensor->src[2], tensor);
14798
+ ggml_compute_forward_win_unpart(params, tensor->src[0], tensor);
14799
+ } break;
14800
+ case GGML_OP_UNARY:
14801
+ {
14802
+ ggml_compute_forward_unary(params, tensor->src[0], tensor);
15122
14803
  } break;
15123
14804
  case GGML_OP_MAP_UNARY:
15124
14805
  {
15125
- const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->src[2]->data);
14806
+ ggml_unary_op_f32_t fun;
14807
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15126
14808
  ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun);
15127
14809
  }
15128
14810
  break;
15129
14811
  case GGML_OP_MAP_BINARY:
15130
14812
  {
15131
- const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->src[2]->data);
14813
+ ggml_binary_op_f32_t fun;
14814
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15132
14815
  ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
15133
14816
  }
15134
14817
  break;
15135
14818
  case GGML_OP_MAP_CUSTOM1:
15136
14819
  {
15137
- const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->src[2]->data);
14820
+ ggml_custom1_op_f32_t fun;
14821
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15138
14822
  ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun);
15139
14823
  }
15140
14824
  break;
15141
14825
  case GGML_OP_MAP_CUSTOM2:
15142
14826
  {
15143
- const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->src[2]->data);
14827
+ ggml_custom2_op_f32_t fun;
14828
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15144
14829
  ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun);
15145
14830
  }
15146
14831
  break;
15147
14832
  case GGML_OP_MAP_CUSTOM3:
15148
14833
  {
15149
- const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->src[2]->data);
15150
- ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[3], tensor, fun);
14834
+ ggml_custom3_op_f32_t fun;
14835
+ memcpy(&fun, tensor->op_params, sizeof(fun));
14836
+ ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
15151
14837
  }
15152
14838
  break;
15153
14839
  case GGML_OP_CROSS_ENTROPY_LOSS:
@@ -15211,12 +14897,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15211
14897
  src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
15212
14898
  }
15213
14899
  if (src1->grad) {
15214
- GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5);
15215
- GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32);
15216
- const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0];
15217
- const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1];
15218
- const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2];
15219
- const size_t offset = (( int32_t * ) tensor->src[2]->data)[3];
14900
+ const size_t nb1 = ((int32_t *) tensor->op_params)[0];
14901
+ const size_t nb2 = ((int32_t *) tensor->op_params)[1];
14902
+ const size_t nb3 = ((int32_t *) tensor->op_params)[2];
14903
+ const size_t offset = ((int32_t *) tensor->op_params)[3];
15220
14904
 
15221
14905
  struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
15222
14906
  tensor->grad,
@@ -15365,73 +15049,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15365
15049
  inplace);
15366
15050
  }
15367
15051
  } break;
15368
- case GGML_OP_ABS:
15369
- {
15370
- if (src0->grad) {
15371
- src0->grad =
15372
- ggml_add_impl(ctx,
15373
- src0->grad,
15374
- ggml_mul(ctx,
15375
- ggml_sgn(ctx, src0),
15376
- tensor->grad),
15377
- inplace);
15378
- }
15379
- } break;
15380
- case GGML_OP_SGN:
15381
- {
15382
- if (src0->grad) {
15383
- // noop
15384
- }
15385
- } break;
15386
- case GGML_OP_NEG:
15387
- {
15388
- if (src0->grad) {
15389
- src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace);
15390
- }
15391
- } break;
15392
- case GGML_OP_STEP:
15393
- {
15394
- if (src0->grad) {
15395
- // noop
15396
- }
15397
- } break;
15398
- case GGML_OP_TANH:
15399
- {
15400
- GGML_ASSERT(false); // TODO: not implemented
15401
- } break;
15402
- case GGML_OP_ELU:
15403
- {
15404
- GGML_ASSERT(false); // TODO: not implemented
15405
- } break;
15406
- case GGML_OP_RELU:
15407
- {
15408
- if (src0->grad) {
15409
- src0->grad = ggml_sub_impl(ctx,
15410
- src0->grad,
15411
- ggml_mul(ctx,
15412
- ggml_step(ctx, src0),
15413
- tensor->grad),
15414
- inplace);
15415
- }
15416
- } break;
15417
- case GGML_OP_GELU:
15418
- {
15419
- GGML_ASSERT(false); // TODO: not implemented
15420
- } break;
15421
- case GGML_OP_GELU_QUICK:
15422
- {
15423
- GGML_ASSERT(false); // TODO: not implemented
15424
- } break;
15425
- case GGML_OP_SILU:
15426
- {
15427
- // necessary for llama
15428
- if (src0->grad) {
15429
- src0->grad = ggml_add_impl(ctx,
15430
- src0->grad,
15431
- ggml_silu_back(ctx, src0, tensor->grad),
15432
- inplace);
15433
- }
15434
- } break;
15435
15052
  case GGML_OP_SILU_BACK:
15436
15053
  {
15437
15054
  GGML_ASSERT(false); // TODO: not implemented
@@ -15524,12 +15141,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  } break;
  case GGML_OP_SET:
  {
- GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5);
- GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32);
- const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0];
- const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1];
- const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2];
- const size_t offset = (( int32_t * ) tensor->src[2]->data)[3];
+ const size_t nb1 = ((int32_t *) tensor->op_params)[0];
+ const size_t nb2 = ((int32_t *) tensor->op_params)[1];
+ const size_t nb3 = ((int32_t *) tensor->op_params)[2];
+ const size_t offset = ((int32_t *) tensor->op_params)[3];

  struct ggml_tensor * tensor_grad_view = NULL;

@@ -15606,8 +15221,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  if (src0->grad) {
  size_t offset;

- GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->src[2]));
- memcpy(&offset, tensor->src[2]->data, sizeof(offset));
+ memcpy(&offset, tensor->op_params, sizeof(offset));

  size_t nb1 = tensor->nb[1];
  size_t nb2 = tensor->nb[2];
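The SET and VIEW hunks above establish the pattern that repeats through the rest of this backward pass: small integer operator arguments (strides, offsets, and later n_past and the rope settings) are no longer smuggled in as extra GGML_TYPE_I32 source tensors but are read straight out of the tensor's fixed-size op_params array, which also makes the shape and type asserts on those helper tensors unnecessary. A minimal sketch of the pack/unpack convention, assuming only a byte buffer large enough for four int32_t (the struct here is invented for illustration, not ggml's):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct fake_tensor {
        char op_params[4 * sizeof(int32_t)];  // stands in for ggml_tensor::op_params
    };

    int main(void) {
        struct fake_tensor t;

        // producer side: pack the arguments when the op node is created
        const int32_t params[4] = { /*nb1=*/16, /*nb2=*/64, /*nb3=*/256, /*offset=*/0 };
        memcpy(t.op_params, params, sizeof(params));

        // consumer side: the backward pass reads them back by index
        const size_t nb1    = ((const int32_t *) t.op_params)[0];
        const size_t nb2    = ((const int32_t *) t.op_params)[1];
        const size_t nb3    = ((const int32_t *) t.op_params)[2];
        const size_t offset = ((const int32_t *) t.op_params)[3];

        printf("nb1=%zu nb2=%zu nb3=%zu offset=%zu\n", nb1, nb2, nb3, offset);
        return 0;
    }

The FLASH_ATTN hunk further down performs the same read through the ggml_get_op_params_i32 accessor instead of casting by hand.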
@@ -15634,7 +15248,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  {
  // necessary for llama
  if (src0->grad) {
- int32_t * axes = (int32_t *) tensor->src[2]->data;
+ int32_t * axes = (int32_t *) tensor->op_params;
  int axis0 = axes[0] & 0x3;
  int axis1 = axes[1] & 0x3;
  int axis2 = axes[2] & 0x3;
@@ -15690,33 +15304,23 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  {
  // necessary for llama
  if (src0->grad) {
- assert(src1->type == GGML_TYPE_I32);
- assert(ggml_nelements(src1) == 2);
- const int n_past = ((int32_t *) src1->data)[0];
+ const int n_past = ((int32_t *) tensor->op_params)[0];
  src0->grad =
  ggml_add_impl(ctx, src0->grad,
  ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
  inplace);
  }
- if (src1->grad) {
- // noop
- }
  } break;
  case GGML_OP_DIAG_MASK_ZERO:
  {
  // necessary for llama
  if (src0->grad) {
- assert(src1->type == GGML_TYPE_I32);
- assert(ggml_nelements(src1) == 2);
- const int n_past = ((int32_t *) src1->data)[0];
+ const int n_past = ((int32_t *) tensor->op_params)[0];
  src0->grad =
  ggml_add_impl(ctx, src0->grad,
  ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
  inplace);
  }
- if (src1->grad) {
- // noop
- }
  } break;
  case GGML_OP_SOFT_MAX:
  {
@@ -15737,12 +15341,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  {
  // necessary for llama
  if (src0->grad) {
- assert(src1->type == GGML_TYPE_I32);
- assert(ggml_nelements(src1) == 6);
- const int n_past = ((int32_t *) src1->data)[0];
- const int n_dims = ((int32_t *) src1->data)[1];
- const int mode = ((int32_t *) src1->data)[2];
- const int n_ctx = ((int32_t *) src1->data)[3];
+ const int n_past = ((int32_t *) tensor->op_params)[0];
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
+ const int mode = ((int32_t *) tensor->op_params)[2];
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
  src0->grad = ggml_add_impl(ctx,
  src0->grad,
  ggml_rope_back(ctx,
@@ -15753,19 +15355,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  n_ctx),
  inplace);
  }
- if (src1->grad) {
- // noop
- }
  } break;
  case GGML_OP_ROPE_BACK:
  {
  if (src0->grad) {
- assert(src1->type == GGML_TYPE_I32);
- assert(ggml_nelements(src1) == 4);
- const int n_past = ((int32_t *) src1->data)[0];
- const int n_dims = ((int32_t *) src1->data)[1];
- const int mode = ((int32_t *) src1->data)[2];
- const int n_ctx = ((int32_t *) src1->data)[3];
+ const int n_past = ((int32_t *) tensor->op_params)[0];
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
+ const int mode = ((int32_t *) tensor->op_params)[2];
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
  src0->grad = ggml_add_impl(ctx,
  src0->grad,
  ggml_rope(ctx,
@@ -15776,9 +15373,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  n_ctx),
  inplace);
  }
- if (src1->grad) {
- // noop
- }
  } break;
  case GGML_OP_ALIBI:
  {
@@ -15808,7 +15402,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  {
  struct ggml_tensor * flash_grad = NULL;
  if (src0->grad || src1->grad || tensor->src[2]->grad) {
- int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
+ int32_t t = ggml_get_op_params_i32(tensor, 0);
  GGML_ASSERT(t == 0 || t == 1);
  bool masked = t != 0;
  flash_grad =
@@ -15971,6 +15565,80 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  } break;
  case GGML_OP_WIN_PART:
  case GGML_OP_WIN_UNPART:
+ case GGML_OP_UNARY:
+ {
+ switch (ggml_get_unary_op(tensor)) {
+ case GGML_UNARY_OP_ABS:
+ {
+ if (src0->grad) {
+ src0->grad =
+ ggml_add_impl(ctx,
+ src0->grad,
+ ggml_mul(ctx,
+ ggml_sgn(ctx, src0),
+ tensor->grad),
+ inplace);
+ }
+ } break;
+ case GGML_UNARY_OP_SGN:
+ {
+ if (src0->grad) {
+ // noop
+ }
+ } break;
+ case GGML_UNARY_OP_NEG:
+ {
+ if (src0->grad) {
+ src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace);
+ }
+ } break;
+ case GGML_UNARY_OP_STEP:
+ {
+ if (src0->grad) {
+ // noop
+ }
+ } break;
+ case GGML_UNARY_OP_TANH:
+ {
+ GGML_ASSERT(false); // TODO: not implemented
+ } break;
+ case GGML_UNARY_OP_ELU:
+ {
+ GGML_ASSERT(false); // TODO: not implemented
+ } break;
+ case GGML_UNARY_OP_RELU:
+ {
+ if (src0->grad) {
+ src0->grad = ggml_add_impl(ctx,
+ src0->grad,
+ ggml_mul(ctx,
+ ggml_step(ctx, src0),
+ tensor->grad),
+ inplace);
+ }
+ } break;
+ case GGML_UNARY_OP_GELU:
+ {
+ GGML_ASSERT(false); // TODO: not implemented
+ } break;
+ case GGML_UNARY_OP_GELU_QUICK:
+ {
+ GGML_ASSERT(false); // TODO: not implemented
+ } break;
+ case GGML_UNARY_OP_SILU:
+ {
+ // necessary for llama
+ if (src0->grad) {
+ src0->grad = ggml_add_impl(ctx,
+ src0->grad,
+ ggml_silu_back(ctx, src0, tensor->grad),
+ inplace);
+ }
+ } break;
+ default:
+ GGML_ASSERT(false);
+ }
+ } break;
  case GGML_OP_MAP_UNARY:
  case GGML_OP_MAP_BINARY:
  case GGML_OP_MAP_CUSTOM1:
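This new case is the other half of the consolidation already visible in the shrunken GGML_OP_NAME table: ABS through SILU are no longer ten separate ops but a single GGML_OP_UNARY whose sub-operation is stored in op_params and recovered with ggml_get_unary_op, so each switch over ops gains one nested switch in place of ten top-level cases. A toy restatement of that dispatch shape (not ggml's gradient code; node and the two-entry enum are invented for the sketch):

    #include <stdio.h>

    enum unary_op { OP_NEG, OP_RELU };  // in ggml, this tag lives in tensor->op_params

    struct node {
        enum unary_op unary;
        float x, grad_out, grad_in;
    };

    static void backward(struct node * n) {
        switch (n->unary) {             // one op; the sub-op picks the gradient rule
            case OP_NEG:  n->grad_in = -n->grad_out;                  break;
            case OP_RELU: n->grad_in = n->x > 0 ? n->grad_out : 0.0f; break;
        }
    }

    int main(void) {
        struct node a = { OP_RELU, /*x=*/-1.5f, /*grad_out=*/1.0f, 0.0f };
        backward(&a);
        printf("relu grad at x=-1.5: %g\n", a.grad_in);  // prints: 0
        return 0;
    }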
@@ -16006,6 +15674,34 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  }
  }

+ static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small");
+
+ static size_t hash(void * p) {
+ return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
+ }
+
+ static bool hash_insert(void * hash_table[], void * p) {
+ size_t h = hash(p);
+
+ // linear probing
+ size_t i = h;
+ while (hash_table[i] != NULL && hash_table[i] != p) {
+ i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
+ if (i == h) {
+ // hash table is full
+ GGML_ASSERT(false);
+ }
+ }
+
+ if (hash_table[i] == p) {
+ return true;
+ }
+
+ // insert
+ hash_table[i] = p;
+ return false;
+ }
+
  static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
  if (node->grad == NULL) {
  // this usually happens when we generate intermediate nodes from constants in the backward pass
@@ -16016,16 +15712,8 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
  }

  // check if already visited
- for (int i = 0; i < cgraph->n_nodes; i++) {
- if (cgraph->nodes[i] == node) {
- return;
- }
- }
-
- for (int i = 0; i < cgraph->n_leafs; i++) {
- if (cgraph->leafs[i] == node) {
- return;
- }
+ if (hash_insert(cgraph->visited_hash_table, node)) {
+ return;
  }

  for (int i = 0; i < GGML_MAX_SRC; ++i) {
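Together with the hash and hash_insert helpers above, this turns the visited check in ggml_visit_parents from two linear scans per node (quadratic over a whole graph build) into an amortized O(1) probe; the static_assert keeps the table more than half empty so probe chains stay short. A compacted, runnable restatement with a toy table size; the full-table condition from the real routine becomes a plain assert here:

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    #define TABLE_SIZE 8  // toy size; ggml uses GGML_GRAPH_HASHTABLE_SIZE

    static size_t hash(void * p) { return (size_t) p % TABLE_SIZE; }

    // returns 1 if p was already present, 0 if it was just inserted
    static int hash_insert(void * table[], void * p) {
        size_t h = hash(p), i = h;
        while (table[i] != NULL && table[i] != p) {
            i = (i + 1) % TABLE_SIZE;  // linear probing on collision
            assert(i != h);            // wrapped around: table is full
        }
        if (table[i] == p) return 1;
        table[i] = p;
        return 0;
    }

    int main(void) {
        void * table[TABLE_SIZE] = { NULL };
        int x, y;
        assert(hash_insert(table, &x) == 0);  // first visit: inserted
        assert(hash_insert(table, &y) == 0);
        assert(hash_insert(table, &x) == 1);  // revisit: detected without a scan
        puts("ok");
        return 0;
    }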
@@ -16088,6 +15776,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
  /*.nodes =*/ { NULL },
  /*.grads =*/ { NULL },
  /*.leafs =*/ { NULL },
+ /*.hash_table =*/ { NULL },
  /*.perf_runs =*/ 0,
  /*.perf_cycles =*/ 0,
  /*.perf_time_us =*/ 0,
@@ -16129,13 +15818,42 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg

  if (node->is_param) {
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
- ggml_build_forward_impl(&result, node->grad, true);
+ ggml_build_forward_expand(&result, node->grad);
  }
  }

  return result;
  }

+ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
+ struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
+
+ *cgraph = (struct ggml_cgraph) {
+ /*.n_nodes =*/ 0,
+ /*.n_leafs =*/ 0,
+ /*.nodes =*/ { NULL },
+ /*.grads =*/ { NULL },
+ /*.leafs =*/ { NULL },
+ /*.hash_table =*/ { NULL },
+ /*.perf_runs =*/ 0,
+ /*.perf_cycles =*/ 0,
+ /*.perf_time_us =*/ 0,
+ };
+
+ return cgraph;
+ }
+
+ struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+ struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
+ ggml_build_forward_impl(cgraph, tensor, false);
+ return cgraph;
+ }
+
+ size_t ggml_graph_overhead(void) {
+ return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
+ }
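These three additions make graphs first-class context objects: ggml_new_graph carves a cgraph out of the context's memory buffer, ggml_build_forward_ctx builds a forward graph in place there, and ggml_graph_overhead reports how much context memory one graph costs so buffers can be sized up front. A plausible usage sketch under the new API; the 16 MB figure and the tensor shapes are illustrative, not taken from the diff:

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024 + ggml_graph_overhead(),  // tensors + one graph
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor * c = ggml_add(ctx, a, b);

        // the graph lives inside ctx's buffer; no large struct on the stack
        struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx, c);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);

        ggml_free(ctx);
        return 0;
    }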
+

  //
  // thread data
  //
@@ -16201,7 +15919,7 @@ typedef pthread_t ggml_thread_t;

  // Android's libc implementation "bionic" does not support setting affinity
  #if defined(__linux__) && !defined(__BIONIC__)
- void set_numa_thread_affinity(int thread_n, int n_threads) {
+ static void set_numa_thread_affinity(int thread_n, int n_threads) {
  if (!ggml_is_numa()) {
  return;
  }
@@ -16226,7 +15944,7 @@ void set_numa_thread_affinity(int thread_n, int n_threads) {
  CPU_FREE(cpus);
  }

- void clear_numa_thread_affinity(void) {
+ static void clear_numa_thread_affinity(void) {
  if (!ggml_is_numa()) {
  return;
  }
@@ -16250,8 +15968,8 @@ void clear_numa_thread_affinity(void) {
  #else
  // TODO: Windows etc.
  // (the linux implementation may also work on BSD, someone should test)
- void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
- void clear_numa_thread_affinity(void) {}
+ static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
+ static void clear_numa_thread_affinity(void) {}
  #endif

  struct ggml_compute_state_shared {
@@ -16463,21 +16181,34 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
  case GGML_OP_ARGMAX:
  case GGML_OP_REPEAT:
  case GGML_OP_REPEAT_BACK:
- case GGML_OP_ABS:
- case GGML_OP_SGN:
- case GGML_OP_NEG:
- case GGML_OP_STEP:
- case GGML_OP_TANH:
- case GGML_OP_ELU:
- case GGML_OP_RELU:
- {
+ {
  n_tasks = 1;
  } break;
- case GGML_OP_MUL:
- case GGML_OP_GELU:
- case GGML_OP_GELU_QUICK:
- case GGML_OP_SILU:
+
+ case GGML_OP_UNARY:
+ {
+ switch (ggml_get_unary_op(node)) {
+ case GGML_UNARY_OP_ABS:
+ case GGML_UNARY_OP_SGN:
+ case GGML_UNARY_OP_NEG:
+ case GGML_UNARY_OP_STEP:
+ case GGML_UNARY_OP_TANH:
+ case GGML_UNARY_OP_ELU:
+ case GGML_UNARY_OP_RELU:
+ {
+ n_tasks = 1;
+ } break;
+
+ case GGML_UNARY_OP_GELU:
+ case GGML_UNARY_OP_GELU_QUICK:
+ case GGML_UNARY_OP_SILU:
+ {
+ n_tasks = n_threads;
+ } break;
+ }
+ } break;
  case GGML_OP_SILU_BACK:
+ case GGML_OP_MUL:
  case GGML_OP_NORM:
  case GGML_OP_RMS_NORM:
  case GGML_OP_RMS_NORM_BACK:
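The planner mirrors the op consolidation: the deleted top-level unary cases reappear as a nested switch in which the cheap elementwise unaries are pinned to a single task while the transcendental-heavy GELU, GELU_QUICK and SILU fan out across all threads; MUL, previously grouped with those three, is regrouped with SILU_BACK and NORM below. A toy restatement of the rule, not the ggml code itself:

    // one task for cheap unaries, all threads for the expensive activations
    static int unary_n_tasks(enum ggml_unary_op u, int n_threads) {
        switch (u) {
            case GGML_UNARY_OP_GELU:
            case GGML_UNARY_OP_GELU_QUICK:
            case GGML_UNARY_OP_SILU:
                return n_threads;
            default:
                return 1;
        }
    }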
@@ -16542,10 +16273,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
  case GGML_OP_GET_ROWS:
  case GGML_OP_GET_ROWS_BACK:
  case GGML_OP_DIAG:
- case GGML_OP_DIAG_MASK_ZERO:
  {
  n_tasks = 1;
  } break;
+ case GGML_OP_DIAG_MASK_ZERO:
  case GGML_OP_DIAG_MASK_INF:
  case GGML_OP_SOFT_MAX:
  case GGML_OP_SOFT_MAX_BACK:
@@ -16838,10 +16569,9 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
  void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);

- struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
- GGML_ASSERT(buf);
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);

- cplan.work_data = buf->data;
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;

  ggml_graph_compute(cgraph, &cplan);
  }
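Here the scratch buffer for a compute pass becomes a plain GGML_OBJECT_WORK_BUFFER object carved from the context, rather than a fake one-dimensional I8 tensor that polluted the tensor list. Callers that manage memory themselves can still skip the context and fill in the cplan by hand; a sketch continuing the previous example's gf (needs <stdint.h> and <stdlib.h>; malloc is purely illustrative):

    struct ggml_cplan cplan = ggml_graph_plan(gf, /*n_threads=*/4);

    uint8_t * work = NULL;
    if (cplan.work_size > 0) {
        work = malloc(cplan.work_size);  // any buffer of work_size bytes will do
        cplan.work_data = work;
    }

    ggml_graph_compute(gf, &cplan);
    free(work);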
@@ -16992,7 +16722,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
  fwrite(&nb, sizeof(uint64_t), 1, fout);
  }

- fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+ fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);

  // dump the data
  // TODO: pad this to 32 byte boundary
@@ -17025,7 +16756,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
  fwrite(&nb, sizeof(uint64_t), 1, fout);
  }

- fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+ fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);

  // output the op arguments
  {
@@ -17206,7 +16938,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **

  tensor->op = (enum ggml_op) op;

- memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+ memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+ memcpy(tensor->op_params, ptr, GGML_MAX_OP_PARAMS); ptr += GGML_MAX_OP_PARAMS;

  tensor->data = (void *) ptr;

@@ -17251,7 +16984,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
  nb[j] = nb_cur;
  }

- const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
+ const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
+ const char * ptr_op_params = ptr; ptr += GGML_MAX_OP_PARAMS;

  const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t);

@@ -17288,8 +17022,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
  {
  tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);

- uint64_t offs;
- memcpy(&offs, args[2]->data, sizeof(offs));
+ size_t offs;
+ memcpy(&offs, ptr_op_params, sizeof(offs));

  tensor->data = ((char *) tensor->data) + offs;
  } break;
@@ -17309,7 +17043,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
  } break;
  }

- memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
+ memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
+ memcpy(tensor->op_params, ptr_op_params, GGML_MAX_OP_PARAMS);

  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
  tensor->nb[j] = nb[j];
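Taken together, the serialization hunks mean every tensor record in an exported graph now carries GGML_MAX_OP_PARAMS bytes of op_params immediately after the GGML_MAX_NAME name field, for leafs and op nodes alike, and the importer consumes and restores them at the same position (the view offset read also changed from uint64_t to size_t). The record layout therefore differs between the two package versions, so exported graph files are not interchangeable across them. The affected slice of the per-tensor record, reconstructed from the fwrite/memcpy calls above; the fields before and after this slice are not shown in the diff and are omitted:

    // ... type / op / dimension fields, then ne[] and nb[] as uint64_t ...
    // char name[GGML_MAX_NAME];            (as before)
    // char op_params[GGML_MAX_OP_PARAMS];  (new in this version)
    // ... tensor data, or the op argument indices for graph nodes ...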
@@ -17343,7 +17078,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
  i,
  node->ne[0], node->ne[1], node->ne[2],
- GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
+ ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
  (double) node->perf_time_us / 1000.0,
@@ -17357,7 +17092,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
  i,
  node->ne[0], node->ne[1],
- GGML_OP_NAME[node->op]);
+ ggml_op_name(node->op));
  }

  for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -17365,7 +17100,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
  continue;
  }

- GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
+ GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0);
  }

  GGML_PRINT("========================================\n");
@@ -17459,13 +17194,13 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
  }

  if (node->n_dims == 2) {
- fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], GGML_OP_SYMBOL[node->op]);
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
  } else {
- fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
  }

  if (node->grad) {
- fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
+ fprintf(fp, " | <g>%s\"; ]\n", ggml_op_symbol(node->grad->op));
  } else {
  fprintf(fp, "\"; ]\n");
  }