llama_cpp 0.3.4 → 0.3.6

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -3440,7 +3442,9 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float

  //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
  inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
- #if defined(GGML_SIMD)
+ #if defined(GGML_USE_ACCELERATE)
+ vDSP_vsmul(y, 1, &v, y, 1, n);
+ #elif defined(GGML_SIMD)
  const int np = (n & ~(GGML_F32_STEP - 1));

  GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
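This first hunk, in the vendored ggml source, adds an Apple Accelerate fast path to the in-place vector scale. A minimal sketch of how the function reads after the patch (the GGML_SIMD branch is elided; the scalar fallback mirrors the commented-out reference line above):

inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
#if defined(GGML_USE_ACCELERATE)
    // vDSP: multiply the n floats in y by the scalar v, writing back into y
    vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD)
    // ... GGML_F32_VEC loop, unchanged by this patch ...
#else
    for (int i = 0; i < n; ++i) y[i] *= v;   // scalar fallback
#endif
}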
@@ -3603,7 +3605,7 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
  #endif
  }

- inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x) {
+ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
  ggml_float sum = 0.0;
  for (int i = 0; i < n; ++i) {
  sum += (ggml_float)x[i];
@@ -3611,6 +3613,14 @@ inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x
  *s = sum;
  }

+ inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
+ float sum = 0.0f;
+ for (int i = 0; i < n; ++i) {
+ sum += GGML_FP16_TO_FP32(x[i]);
+ }
+ *s = sum;
+ }
+
  inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
  #ifndef GGML_USE_ACCELERATE
  float max = -INFINITY;
@@ -3750,16 +3760,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "ARGMAX",
  "REPEAT",
  "REPEAT_BACK",
- "ABS",
- "SGN",
- "NEG",
- "STEP",
- "TANH",
- "ELU",
- "RELU",
- "GELU",
- "GELU_QUICK",
- "SILU",
  "SILU_BACK",
  "NORM",
  "RMS_NORM",
@@ -3798,6 +3798,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "WIN_PART",
  "WIN_UNPART",

+ "UNARY",
+
  "MAP_UNARY",
  "MAP_BINARY",

@@ -3809,7 +3811,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "CROSS_ENTROPY_LOSS_BACK",
  };

- static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+ static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
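The ten element-wise activations removed above (ABS through SILU) are no longer distinct ggml_op values; they are folded into the single GGML_OP_UNARY entry added to this table, which is why the asserted op count drops from 68 to 59. A hedged sketch of what this looks like on the graph-building side (the helpers shown here appear later in this diff; the tensor names are illustrative):

// Public helpers keep their old signatures and now route through the generic unary op.
struct ggml_tensor * r = ggml_relu(ctx, inp);                          // builds a GGML_OP_UNARY node
struct ggml_tensor * g = ggml_unary(ctx, inp, GGML_UNARY_OP_GELU);     // equivalent generic form
// The concrete variant is recorded in op_params and read back with ggml_get_unary_op().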

  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "none",
@@ -3830,16 +3832,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "argmax(x)",
  "repeat(x)",
  "repeat_back(x)",
- "abs(x)",
- "sgn(x)",
- "-x",
- "step(x)",
- "tanh(x)",
- "elu(x)",
- "relu(x)",
- "gelu(x)",
- "gelu_quick(x)",
- "silu(x)",
  "silu_back(x)",
  "norm(x)",
  "rms_norm(x)",
@@ -3878,6 +3870,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "win_part(x)",
  "win_unpart(x)",

+ "unary(x)",
+
  "f(x)",
  "f(x,y)",

@@ -3889,7 +3883,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "cross_entropy_loss_back(x,y)",
  };

- static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+ static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");

  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -4077,8 +4071,8 @@ bool ggml_is_numa(void) {
  ////////////////////////////////////////////////////////////////////////////////

  void ggml_print_object(const struct ggml_object * obj) {
- GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n",
- obj->offs, obj->size, (const void *) obj->next);
+ GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
+ obj->type, obj->offs, obj->size, (const void *) obj->next);
  }

  void ggml_print_objects(const struct ggml_context * ctx) {
@@ -4145,6 +4139,10 @@ const char * ggml_op_name(enum ggml_op op) {
  return GGML_OP_NAME[op];
  }

+ const char * ggml_op_symbol(enum ggml_op op) {
+ return GGML_OP_SYMBOL[op];
+ }
+
  size_t ggml_element_size(const struct ggml_tensor * tensor) {
  return GGML_TYPE_SIZE[tensor->type];
  }
@@ -4214,7 +4212,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
  }

  size_t ggml_tensor_overhead(void) {
- return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+ return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
  }

  bool ggml_is_transposed(const struct ggml_tensor * tensor) {
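ggml_tensor_overhead() drops the extra 16-byte slack because ggml_new_object (introduced further down in this diff) now pads every allocation to GGML_MEM_ALIGN itself. A sizing sketch under that assumption; the tensor count is illustrative:

// Reserve a context that only holds object headers and tensor structs
// (no_alloc = true means the tensor data is managed elsewhere).
const size_t n_tensors = 1024; // illustrative
struct ggml_init_params params = {
    /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
    /*.mem_buffer =*/ NULL,
    /*.no_alloc   =*/ true,
};
struct ggml_context * ctx = ggml_init(params);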
@@ -4231,6 +4229,15 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
  }

+ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) {
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+ return
+ tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+ tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
+ tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+ }
+
  bool ggml_is_permuted(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

@@ -4376,7 +4383,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
  return NULL;
  }

- const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1);
+ const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);

  *ctx = (struct ggml_context) {
  /*.mem_size =*/ mem_size,
@@ -4443,6 +4450,10 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
  return result;
  }

+ bool ggml_get_no_alloc(struct ggml_context * ctx) {
+ return ctx->no_alloc;
+ }
+
  void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
  ctx->no_alloc = no_alloc;
  }
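With the getter added here, code that temporarily disables allocation can restore the caller's previous mode instead of assuming it; a small sketch:

const bool no_alloc_prev = ggml_get_no_alloc(ctx);
ggml_set_no_alloc(ctx, true);            // build metadata-only tensors / views
// ... create graph nodes ...
ggml_set_no_alloc(ctx, no_alloc_prev);   // restore whatever the caller had set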
@@ -4461,12 +4472,14 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
  struct ggml_object * obj = ctx->objects_begin;

  while (obj != NULL) {
- struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
+ if (obj->type == GGML_OBJECT_TENSOR) {
+ struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);

- const size_t size = ggml_nbytes(tensor);
+ const size_t size = ggml_nbytes(tensor);

- if (max_size < size) {
- max_size = size;
+ if (max_size < size) {
+ max_size = size;
+ }
  }

  obj = obj->next;
@@ -4480,7 +4493,7 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
  // this is an error prone process, but it is necessary to support inplace
  // operators when using scratch buffers
  // TODO: implement a better way
- void ggml_scratch_save(struct ggml_context * ctx) {
+ static void ggml_scratch_save(struct ggml_context * ctx) {
  // this is needed to allow opt tensors to store their data
  // TODO: again, need to find a better way
  ctx->no_alloc_save = ctx->no_alloc;
@@ -4490,7 +4503,7 @@ void ggml_scratch_save(struct ggml_context * ctx) {
  ctx->scratch.data = NULL;
  }

- void ggml_scratch_load(struct ggml_context * ctx) {
+ static void ggml_scratch_load(struct ggml_context * ctx) {
  ctx->no_alloc = ctx->no_alloc_save;

  ctx->scratch = ctx->scratch_save;
@@ -4498,12 +4511,7 @@ void ggml_scratch_load(struct ggml_context * ctx) {

  ////////////////////////////////////////////////////////////////////////////////

- struct ggml_tensor * ggml_new_tensor_impl(
- struct ggml_context * ctx,
- enum ggml_type type,
- int n_dims,
- const int64_t* ne,
- void* data) {
+ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
  // always insert objects at the end of the context's memory pool
  struct ggml_object * obj_cur = ctx->objects_end;

@@ -4511,77 +4519,81 @@ struct ggml_tensor * ggml_new_tensor_impl(
  const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
  const size_t cur_end = cur_offs + cur_size;

- size_t size_needed = 0;
-
- if (data == NULL && !ctx->no_alloc) {
- size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
- for (int i = 1; i < n_dims; i++) {
- size_needed *= ne[i];
- }
- // align to GGML_MEM_ALIGN
- size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN;
- }
+ // align to GGML_MEM_ALIGN
+ size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);

  char * const mem_buffer = ctx->mem_buffer;
  struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);

- if (ctx->scratch.data == NULL || data != NULL) {
- size_needed += GGML_TENSOR_SIZE;
+ if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
+ GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
+ __func__, cur_end + size_needed, ctx->mem_size);
+ assert(false);
+ return NULL;
+ }
+
+ *obj_new = (struct ggml_object) {
+ .offs = cur_end + GGML_OBJECT_SIZE,
+ .size = size_needed,
+ .next = NULL,
+ .type = type,
+ };

- if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
- GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
- __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
- assert(false);
- return NULL;
- }
+ ggml_assert_aligned(mem_buffer + obj_new->offs);

- *obj_new = (struct ggml_object) {
- .offs = cur_end + GGML_OBJECT_SIZE,
- .size = size_needed,
- .next = NULL,
- };
+ if (obj_cur != NULL) {
+ obj_cur->next = obj_new;
  } else {
- if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
- GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
- __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
- assert(false);
- return NULL;
+ // this is the first object in this context
+ ctx->objects_begin = obj_new;
+ }
+
+ ctx->objects_end = obj_new;
+
+ //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
+
+ return obj_new;
+ }
+
+ static struct ggml_tensor * ggml_new_tensor_impl(
+ struct ggml_context * ctx,
+ enum ggml_type type,
+ int n_dims,
+ const int64_t * ne,
+ void * data) {
+
+ assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
+
+ size_t data_size = 0;
+
+ if (data == NULL && !ctx->no_alloc) {
+ data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
+ for (int i = 1; i < n_dims; i++) {
+ data_size *= ne[i];
  }
+ }

- if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
- GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
- __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
+ if (ctx->scratch.data != NULL && data == NULL) {
+ // allocate tensor data in the scratch buffer
+ if (ctx->scratch.offs + data_size > ctx->scratch.size) {
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+ __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
  assert(false);
  return NULL;
  }

  data = (char * const) ctx->scratch.data + ctx->scratch.offs;

- *obj_new = (struct ggml_object) {
- .offs = cur_end + GGML_OBJECT_SIZE,
- .size = GGML_TENSOR_SIZE,
- .next = NULL,
- };
-
- //printf("scratch offs = %zu, size_needed = %zu\n", ctx->scratch.offs, size_needed);
+ ctx->scratch.offs += data_size;

- ctx->scratch.offs += size_needed;
+ data_size = 0;
  }

- if (obj_cur != NULL) {
- obj_cur->next = obj_new;
- } else {
- // this is the first object in this context
- ctx->objects_begin = obj_new;
- }
-
- ctx->objects_end = obj_new;
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);

- //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
+ // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here

- struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs);
-
- ggml_assert_aligned(result);
+ struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);

  *result = (struct ggml_tensor) {
  /*.type =*/ type,
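Every allocation in the context pool now goes through ggml_new_object, and a tensor is just one object type. The per-allocation layout implied by the hunk above (shown schematically, not exact byte counts):

// [ struct ggml_object ][ struct ggml_tensor ][ optional tensor data, padded to GGML_MEM_ALIGN ]
//        header             obj->offs points       present only when the context itself allocates
//   (.type/.offs/.size)     here                   the data (no scratch buffer, no caller-supplied data)
// obj->size == GGML_PAD(GGML_TENSOR_SIZE + data_size, GGML_MEM_ALIGN)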
@@ -4590,6 +4602,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
  /*.ne =*/ { 1, 1, 1, 1 },
  /*.nb =*/ { 0, 0, 0, 0 },
  /*.op =*/ GGML_OP_NONE,
+ /*.op_params =*/ {0},
  /*.is_param =*/ false,
  /*.grad =*/ NULL,
  /*.src =*/ { NULL },
@@ -4620,24 +4633,39 @@ struct ggml_tensor * ggml_new_tensor_impl(
  return result;
  }

+ static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+ assert(params_size <= GGML_MAX_OP_PARAMS);
+ memcpy(tensor->op_params, params, params_size);
+ }
+
+ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+ return ((const int32_t *)(tensor->op_params))[i];
+ }
+
+ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+ ((int32_t *)(tensor->op_params))[i] = value;
+ }
+
  struct ggml_tensor * ggml_new_tensor(
  struct ggml_context * ctx,
- enum ggml_type type,
- int n_dims,
- const int64_t * ne) {
+ enum ggml_type type,
+ int n_dims,
+ const int64_t * ne) {
  return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
  }

  struct ggml_tensor * ggml_new_tensor_1d(
  struct ggml_context * ctx,
- enum ggml_type type,
+ enum ggml_type type,
  int64_t ne0) {
  return ggml_new_tensor(ctx, type, 1, &ne0);
  }

  struct ggml_tensor * ggml_new_tensor_2d(
  struct ggml_context * ctx,
- enum ggml_type type,
+ enum ggml_type type,
  int64_t ne0,
  int64_t ne1) {
  const int64_t ne[2] = { ne0, ne1 };
@@ -4646,7 +4674,7 @@ struct ggml_tensor * ggml_new_tensor_2d(

  struct ggml_tensor * ggml_new_tensor_3d(
  struct ggml_context * ctx,
- enum ggml_type type,
+ enum ggml_type type,
  int64_t ne0,
  int64_t ne1,
  int64_t ne2) {
@@ -4951,6 +4979,11 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
  return (float *)(tensor->data);
  }

+ enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
+ GGML_ASSERT(tensor->op == GGML_OP_UNARY);
+ return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
+ }
+
  const char * ggml_get_name(const struct ggml_tensor * tensor) {
  return tensor->name;
  }
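Operator parameters now live inline in tensor->op_params (written with ggml_set_op_params above) rather than in auxiliary GGML_TYPE_I32 tensors hung off src[1]/src[2], and the unary variant is read back with ggml_get_unary_op. A hedged dispatch sketch; the function name and kernel comments are illustrative, not the library's actual compute path:

static void compute_forward_unary_sketch(struct ggml_tensor * dst) {
    switch (ggml_get_unary_op(dst)) {        // asserts dst->op == GGML_OP_UNARY, reads op_params[0]
        case GGML_UNARY_OP_RELU: /* run the relu kernel on dst->src[0] */ break;
        case GGML_UNARY_OP_GELU: /* run the gelu kernel on dst->src[0] */ break;
        default: break;
    }
}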
@@ -4989,9 +5022,11 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
  char * const mem_buffer = ctx->mem_buffer;

  while (obj != NULL) {
- struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
- if (strcmp(cur->name, name) == 0) {
- return cur;
+ if (obj->type == GGML_OBJECT_TENSOR) {
+ struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
+ if (strcmp(cur->name, name) == 0) {
+ return cur;
+ }
  }

  obj = obj->next;
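Because the pool can now hold non-tensor objects, name lookups skip entries whose type is not GGML_OBJECT_TENSOR; callers are unaffected. Illustrative fragment (the tensor name is hypothetical):

struct ggml_tensor * embd = ggml_get_tensor(ctx, "token_embd"); // NULL if no tensor has this name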
@@ -5004,7 +5039,7 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
5004
5039
 
5005
5040
  // ggml_dup
5006
5041
 
5007
- struct ggml_tensor * ggml_dup_impl(
5042
+ static struct ggml_tensor * ggml_dup_impl(
5008
5043
  struct ggml_context * ctx,
5009
5044
  struct ggml_tensor * a,
5010
5045
  bool inplace) {
@@ -5019,7 +5054,6 @@ struct ggml_tensor * ggml_dup_impl(
5019
5054
  result->op = GGML_OP_DUP;
5020
5055
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5021
5056
  result->src[0] = a;
5022
- result->src[1] = NULL;
5023
5057
 
5024
5058
  return result;
5025
5059
  }
@@ -5038,7 +5072,7 @@ struct ggml_tensor * ggml_dup_inplace(
5038
5072
 
5039
5073
  // ggml_add
5040
5074
 
5041
- struct ggml_tensor * ggml_add_impl(
5075
+ static struct ggml_tensor * ggml_add_impl(
5042
5076
  struct ggml_context * ctx,
5043
5077
  struct ggml_tensor * a,
5044
5078
  struct ggml_tensor * b,
@@ -5081,7 +5115,7 @@ struct ggml_tensor * ggml_add_inplace(
5081
5115
 
5082
5116
  // ggml_add1
5083
5117
 
5084
- struct ggml_tensor * ggml_add1_impl(
5118
+ static struct ggml_tensor * ggml_add1_impl(
5085
5119
  struct ggml_context * ctx,
5086
5120
  struct ggml_tensor * a,
5087
5121
  struct ggml_tensor * b,
@@ -5121,7 +5155,7 @@ struct ggml_tensor * ggml_add1_inplace(
5121
5155
 
5122
5156
  // ggml_acc
5123
5157
 
5124
- struct ggml_tensor * ggml_acc_impl(
5158
+ static struct ggml_tensor * ggml_acc_impl(
5125
5159
  struct ggml_context * ctx,
5126
5160
  struct ggml_tensor * a,
5127
5161
  struct ggml_tensor * b,
@@ -5143,23 +5177,13 @@ struct ggml_tensor * ggml_acc_impl(
5143
5177
 
5144
5178
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5145
5179
 
5146
- ggml_scratch_save(ctx);
5147
-
5148
- struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5);
5149
-
5150
- ((int32_t *) c->data)[0] = nb1;
5151
- ((int32_t *) c->data)[1] = nb2;
5152
- ((int32_t *) c->data)[2] = nb3;
5153
- ((int32_t *) c->data)[3] = offset;
5154
- ((int32_t *) c->data)[4] = inplace ? 1 : 0;
5155
-
5156
- ggml_scratch_load(ctx);
5180
+ int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
5181
+ ggml_set_op_params(result, params, sizeof(params));
5157
5182
 
5158
5183
  result->op = GGML_OP_ACC;
5159
5184
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5160
5185
  result->src[0] = a;
5161
5186
  result->src[1] = b;
5162
- result->src[2] = c;
5163
5187
 
5164
5188
  return result;
5165
5189
  }
@@ -5188,7 +5212,7 @@ struct ggml_tensor * ggml_acc_inplace(
5188
5212
 
5189
5213
  // ggml_sub
5190
5214
 
5191
- struct ggml_tensor * ggml_sub_impl(
5215
+ static struct ggml_tensor * ggml_sub_impl(
5192
5216
  struct ggml_context * ctx,
5193
5217
  struct ggml_tensor * a,
5194
5218
  struct ggml_tensor * b,
@@ -5227,7 +5251,7 @@ struct ggml_tensor * ggml_sub_inplace(
5227
5251
 
5228
5252
  // ggml_mul
5229
5253
 
5230
- struct ggml_tensor * ggml_mul_impl(
5254
+ static struct ggml_tensor * ggml_mul_impl(
5231
5255
  struct ggml_context * ctx,
5232
5256
  struct ggml_tensor * a,
5233
5257
  struct ggml_tensor * b,
@@ -5274,7 +5298,7 @@ struct ggml_tensor * ggml_mul_inplace(
5274
5298
 
5275
5299
  // ggml_div
5276
5300
 
5277
- struct ggml_tensor * ggml_div_impl(
5301
+ static struct ggml_tensor * ggml_div_impl(
5278
5302
  struct ggml_context * ctx,
5279
5303
  struct ggml_tensor * a,
5280
5304
  struct ggml_tensor * b,
@@ -5317,7 +5341,7 @@ struct ggml_tensor * ggml_div_inplace(
5317
5341
 
5318
5342
  // ggml_sqr
5319
5343
 
5320
- struct ggml_tensor * ggml_sqr_impl(
5344
+ static struct ggml_tensor * ggml_sqr_impl(
5321
5345
  struct ggml_context * ctx,
5322
5346
  struct ggml_tensor * a,
5323
5347
  bool inplace) {
@@ -5332,7 +5356,6 @@ struct ggml_tensor * ggml_sqr_impl(
5332
5356
  result->op = GGML_OP_SQR;
5333
5357
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5334
5358
  result->src[0] = a;
5335
- result->src[1] = NULL;
5336
5359
 
5337
5360
  return result;
5338
5361
  }
@@ -5351,7 +5374,7 @@ struct ggml_tensor * ggml_sqr_inplace(
5351
5374
 
5352
5375
  // ggml_sqrt
5353
5376
 
5354
- struct ggml_tensor * ggml_sqrt_impl(
5377
+ static struct ggml_tensor * ggml_sqrt_impl(
5355
5378
  struct ggml_context * ctx,
5356
5379
  struct ggml_tensor * a,
5357
5380
  bool inplace) {
@@ -5366,7 +5389,6 @@ struct ggml_tensor * ggml_sqrt_impl(
5366
5389
  result->op = GGML_OP_SQRT;
5367
5390
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5368
5391
  result->src[0] = a;
5369
- result->src[1] = NULL;
5370
5392
 
5371
5393
  return result;
5372
5394
  }
@@ -5386,7 +5408,7 @@ struct ggml_tensor * ggml_sqrt_inplace(
5386
5408
 
5387
5409
  // ggml_log
5388
5410
 
5389
- struct ggml_tensor * ggml_log_impl(
5411
+ static struct ggml_tensor * ggml_log_impl(
5390
5412
  struct ggml_context * ctx,
5391
5413
  struct ggml_tensor * a,
5392
5414
  bool inplace) {
@@ -5401,7 +5423,6 @@ struct ggml_tensor * ggml_log_impl(
5401
5423
  result->op = GGML_OP_LOG;
5402
5424
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5403
5425
  result->src[0] = a;
5404
- result->src[1] = NULL;
5405
5426
 
5406
5427
  return result;
5407
5428
  }
@@ -5434,7 +5455,6 @@ struct ggml_tensor * ggml_sum(
5434
5455
  result->op = GGML_OP_SUM;
5435
5456
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5436
5457
  result->src[0] = a;
5437
- result->src[1] = NULL;
5438
5458
 
5439
5459
  return result;
5440
5460
  }
@@ -5461,7 +5481,6 @@ struct ggml_tensor * ggml_sum_rows(
5461
5481
  result->op = GGML_OP_SUM_ROWS;
5462
5482
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5463
5483
  result->src[0] = a;
5464
- result->src[1] = NULL;
5465
5484
 
5466
5485
  return result;
5467
5486
  }
@@ -5484,7 +5503,6 @@ struct ggml_tensor * ggml_mean(
5484
5503
  result->op = GGML_OP_MEAN;
5485
5504
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5486
5505
  result->src[0] = a;
5487
- result->src[1] = NULL;
5488
5506
 
5489
5507
  return result;
5490
5508
  }
@@ -5508,7 +5526,6 @@ struct ggml_tensor * ggml_argmax(
5508
5526
  result->op = GGML_OP_ARGMAX;
5509
5527
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5510
5528
  result->src[0] = a;
5511
- result->src[1] = NULL;
5512
5529
 
5513
5530
  return result;
5514
5531
  }
@@ -5571,343 +5588,142 @@ struct ggml_tensor * ggml_repeat_back(
5571
5588
 
5572
5589
  // ggml_abs
5573
5590
 
5574
- struct ggml_tensor * ggml_abs_impl(
5575
- struct ggml_context * ctx,
5576
- struct ggml_tensor * a,
5577
- bool inplace) {
5578
- bool is_node = false;
5579
-
5580
- if (!inplace && (a->grad)) {
5581
- is_node = true;
5582
- }
5583
-
5584
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5585
-
5586
- result->op = GGML_OP_ABS;
5587
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5588
- result->src[0] = a;
5589
- result->src[1] = NULL;
5590
-
5591
- return result;
5592
- }
5593
-
5594
5591
  struct ggml_tensor * ggml_abs(
5595
5592
  struct ggml_context * ctx,
5596
5593
  struct ggml_tensor * a) {
5597
- return ggml_abs_impl(ctx, a, false);
5594
+ return ggml_unary(ctx, a, GGML_UNARY_OP_ABS);
5598
5595
  }
5599
5596
 
5600
5597
  struct ggml_tensor * ggml_abs_inplace(
5601
5598
  struct ggml_context * ctx,
5602
5599
  struct ggml_tensor * a) {
5603
- return ggml_abs_impl(ctx, a, true);
5600
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ABS);
5604
5601
  }
5605
5602
 
5606
-
5607
5603
  // ggml_sgn
5608
5604
 
5609
- struct ggml_tensor * ggml_sgn_impl(
5610
- struct ggml_context * ctx,
5611
- struct ggml_tensor * a,
5612
- bool inplace) {
5613
- bool is_node = false;
5614
-
5615
- if (!inplace && (a->grad)) {
5616
- is_node = true;
5617
- }
5618
-
5619
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5620
-
5621
- result->op = GGML_OP_SGN;
5622
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5623
- result->src[0] = a;
5624
- result->src[1] = NULL;
5625
-
5626
- return result;
5627
- }
5628
-
5629
5605
  struct ggml_tensor * ggml_sgn(
5630
5606
  struct ggml_context * ctx,
5631
5607
  struct ggml_tensor * a) {
5632
- return ggml_sgn_impl(ctx, a, false);
5608
+ return ggml_unary(ctx, a, GGML_UNARY_OP_SGN);
5633
5609
  }
5634
5610
 
5635
5611
  struct ggml_tensor * ggml_sgn_inplace(
5636
5612
  struct ggml_context * ctx,
5637
5613
  struct ggml_tensor * a) {
5638
- return ggml_sgn_impl(ctx, a, true);
5614
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SGN);
5639
5615
  }
5640
5616
 
5641
5617
  // ggml_neg
5642
5618
 
5643
- struct ggml_tensor * ggml_neg_impl(
5644
- struct ggml_context * ctx,
5645
- struct ggml_tensor * a,
5646
- bool inplace) {
5647
- bool is_node = false;
5648
-
5649
- if (!inplace && (a->grad)) {
5650
- is_node = true;
5651
- }
5652
-
5653
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5654
-
5655
- result->op = GGML_OP_NEG;
5656
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5657
- result->src[0] = a;
5658
- result->src[1] = NULL;
5659
-
5660
- return result;
5661
- }
5662
-
5663
5619
  struct ggml_tensor * ggml_neg(
5664
5620
  struct ggml_context * ctx,
5665
5621
  struct ggml_tensor * a) {
5666
- return ggml_neg_impl(ctx, a, false);
5622
+ return ggml_unary(ctx, a, GGML_UNARY_OP_NEG);
5667
5623
  }
5668
5624
 
5669
5625
  struct ggml_tensor * ggml_neg_inplace(
5670
5626
  struct ggml_context * ctx,
5671
5627
  struct ggml_tensor * a) {
5672
- return ggml_neg_impl(ctx, a, true);
5628
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_NEG);
5673
5629
  }
5674
5630
 
5675
5631
  // ggml_step
5676
5632
 
5677
- struct ggml_tensor * ggml_step_impl(
5678
- struct ggml_context * ctx,
5679
- struct ggml_tensor * a,
5680
- bool inplace) {
5681
- bool is_node = false;
5682
-
5683
- if (!inplace && (a->grad)) {
5684
- is_node = true;
5685
- }
5686
-
5687
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5688
-
5689
- result->op = GGML_OP_STEP;
5690
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5691
- result->src[0] = a;
5692
- result->src[1] = NULL;
5693
-
5694
- return result;
5695
- }
5696
-
5697
5633
  struct ggml_tensor * ggml_step(
5698
5634
  struct ggml_context * ctx,
5699
5635
  struct ggml_tensor * a) {
5700
- return ggml_step_impl(ctx, a, false);
5636
+ return ggml_unary(ctx, a, GGML_UNARY_OP_STEP);
5701
5637
  }
5702
5638
 
5703
5639
  struct ggml_tensor * ggml_step_inplace(
5704
5640
  struct ggml_context * ctx,
5705
5641
  struct ggml_tensor * a) {
5706
- return ggml_step_impl(ctx, a, true);
5642
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_STEP);
5707
5643
  }
5708
5644
 
5709
5645
  // ggml_tanh
5710
5646
 
5711
- struct ggml_tensor * ggml_tanh_impl(
5712
- struct ggml_context * ctx,
5713
- struct ggml_tensor * a,
5714
- bool inplace) {
5715
- bool is_node = false;
5716
-
5717
- if (!inplace && (a->grad)) {
5718
- is_node = true;
5719
- }
5720
-
5721
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5722
-
5723
- result->op = GGML_OP_TANH;
5724
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5725
- result->src[0] = a;
5726
- result->src[1] = NULL;
5727
-
5728
- return result;
5729
- }
5730
-
5731
5647
  struct ggml_tensor * ggml_tanh(
5732
5648
  struct ggml_context * ctx,
5733
5649
  struct ggml_tensor * a) {
5734
- return ggml_tanh_impl(ctx, a, false);
5650
+ return ggml_unary(ctx, a, GGML_UNARY_OP_TANH);
5735
5651
  }
5736
5652
 
5737
5653
  struct ggml_tensor * ggml_tanh_inplace(
5738
5654
  struct ggml_context * ctx,
5739
5655
  struct ggml_tensor * a) {
5740
- return ggml_tanh_impl(ctx, a, true);
5656
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TANH);
5741
5657
  }
5742
5658
 
5743
5659
  // ggml_elu
5744
5660
 
5745
- struct ggml_tensor * ggml_elu_impl(
5746
- struct ggml_context * ctx,
5747
- struct ggml_tensor * a,
5748
- bool inplace) {
5749
- bool is_node = false;
5750
-
5751
- if (!inplace && (a->grad)) {
5752
- is_node = true;
5753
- }
5754
-
5755
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5756
-
5757
- result->op = GGML_OP_ELU;
5758
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5759
- result->src[0] = a;
5760
- result->src[1] = NULL;
5761
-
5762
- return result;
5763
- }
5764
-
5765
5661
  struct ggml_tensor * ggml_elu(
5766
5662
  struct ggml_context * ctx,
5767
5663
  struct ggml_tensor * a) {
5768
- return ggml_elu_impl(ctx, a, false);
5664
+ return ggml_unary(ctx, a, GGML_UNARY_OP_ELU);
5769
5665
  }
5770
5666
 
5771
5667
  struct ggml_tensor * ggml_elu_inplace(
5772
5668
  struct ggml_context * ctx,
5773
5669
  struct ggml_tensor * a) {
5774
- return ggml_elu_impl(ctx, a, true);
5670
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ELU);
5775
5671
  }
5776
5672
 
5777
5673
  // ggml_relu
5778
5674
 
5779
- struct ggml_tensor * ggml_relu_impl(
5780
- struct ggml_context * ctx,
5781
- struct ggml_tensor * a,
5782
- bool inplace) {
5783
- bool is_node = false;
5784
-
5785
- if (!inplace && (a->grad)) {
5786
- is_node = true;
5787
- }
5788
-
5789
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5790
-
5791
- result->op = GGML_OP_RELU;
5792
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5793
- result->src[0] = a;
5794
- result->src[1] = NULL;
5795
-
5796
- return result;
5797
- }
5798
-
5799
5675
  struct ggml_tensor * ggml_relu(
5800
5676
  struct ggml_context * ctx,
5801
5677
  struct ggml_tensor * a) {
5802
- return ggml_relu_impl(ctx, a, false);
5678
+ return ggml_unary(ctx, a, GGML_UNARY_OP_RELU);
5803
5679
  }
5804
5680
 
5805
5681
  struct ggml_tensor * ggml_relu_inplace(
5806
5682
  struct ggml_context * ctx,
5807
5683
  struct ggml_tensor * a) {
5808
- return ggml_relu_impl(ctx, a, true);
5684
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
5809
5685
  }
5810
5686
 
5811
5687
  // ggml_gelu
5812
5688
 
5813
- struct ggml_tensor * ggml_gelu_impl(
5814
- struct ggml_context * ctx,
5815
- struct ggml_tensor * a,
5816
- bool inplace) {
5817
- bool is_node = false;
5818
-
5819
- if (!inplace && (a->grad)) {
5820
- is_node = true;
5821
- }
5822
-
5823
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5824
-
5825
- result->op = GGML_OP_GELU;
5826
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5827
- result->src[0] = a;
5828
- result->src[1] = NULL;
5829
-
5830
- return result;
5831
- }
5832
-
5833
5689
  struct ggml_tensor * ggml_gelu(
5834
5690
  struct ggml_context * ctx,
5835
5691
  struct ggml_tensor * a) {
5836
- return ggml_gelu_impl(ctx, a, false);
5692
+ return ggml_unary(ctx, a, GGML_UNARY_OP_GELU);
5837
5693
  }
5838
5694
 
5839
5695
  struct ggml_tensor * ggml_gelu_inplace(
5840
5696
  struct ggml_context * ctx,
5841
5697
  struct ggml_tensor * a) {
5842
- return ggml_gelu_impl(ctx, a, true);
5698
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
5843
5699
  }
5844
5700
 
5845
5701
  // ggml_gelu_quick
5846
5702
 
5847
- struct ggml_tensor * ggml_gelu_quick_impl(
5848
- struct ggml_context * ctx,
5849
- struct ggml_tensor * a,
5850
- bool inplace) {
5851
- bool is_node = false;
5852
-
5853
- if (!inplace && (a->grad)) {
5854
- is_node = true;
5855
- }
5856
-
5857
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5858
-
5859
- result->op = GGML_OP_GELU_QUICK;
5860
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5861
- result->src[0] = a;
5862
- result->src[1] = NULL;
5863
-
5864
- return result;
5865
- }
5866
-
5867
5703
  struct ggml_tensor * ggml_gelu_quick(
5868
5704
  struct ggml_context * ctx,
5869
5705
  struct ggml_tensor * a) {
5870
- return ggml_gelu_quick_impl(ctx, a, false);
5706
+ return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_QUICK);
5871
5707
  }
5872
5708
 
5873
5709
  struct ggml_tensor * ggml_gelu_quick_inplace(
5874
5710
  struct ggml_context * ctx,
5875
5711
  struct ggml_tensor * a) {
5876
- return ggml_gelu_quick_impl(ctx, a, true);
5712
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_QUICK);
5877
5713
  }
5878
5714
 
5879
5715
  // ggml_silu
5880
5716
 
5881
- struct ggml_tensor * ggml_silu_impl(
5882
- struct ggml_context * ctx,
5883
- struct ggml_tensor * a,
5884
- bool inplace) {
5885
- bool is_node = false;
5886
-
5887
- if (!inplace && (a->grad)) {
5888
- is_node = true;
5889
- }
5890
-
5891
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5892
-
5893
- result->op = GGML_OP_SILU;
5894
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5895
- result->src[0] = a;
5896
- result->src[1] = NULL;
5897
-
5898
- return result;
5899
- }
5900
-
5901
5717
  struct ggml_tensor * ggml_silu(
5902
5718
  struct ggml_context * ctx,
5903
5719
  struct ggml_tensor * a) {
5904
- return ggml_silu_impl(ctx, a, false);
5720
+ return ggml_unary(ctx, a, GGML_UNARY_OP_SILU);
5905
5721
  }
5906
5722
 
5907
5723
  struct ggml_tensor * ggml_silu_inplace(
5908
5724
  struct ggml_context * ctx,
5909
5725
  struct ggml_tensor * a) {
5910
- return ggml_silu_impl(ctx, a, true);
5726
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SILU);
5911
5727
  }
5912
5728
 
5913
5729
  // ggml_silu_back
@@ -5935,7 +5751,7 @@ struct ggml_tensor * ggml_silu_back(
5935
5751
 
5936
5752
  // ggml_norm
5937
5753
 
5938
- struct ggml_tensor * ggml_norm_impl(
5754
+ static struct ggml_tensor * ggml_norm_impl(
5939
5755
  struct ggml_context * ctx,
5940
5756
  struct ggml_tensor * a,
5941
5757
  bool inplace) {
@@ -5948,10 +5764,11 @@ struct ggml_tensor * ggml_norm_impl(
5948
5764
 
5949
5765
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5950
5766
 
5767
+ // TODO: maybe store epsilon here?
5768
+
5951
5769
  result->op = GGML_OP_NORM;
5952
5770
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5953
5771
  result->src[0] = a;
5954
- result->src[1] = NULL; // TODO: maybe store epsilon here?
5955
5772
 
5956
5773
  return result;
5957
5774
  }
@@ -5968,9 +5785,10 @@ struct ggml_tensor * ggml_norm_inplace(
5968
5785
  return ggml_norm_impl(ctx, a, true);
5969
5786
  }
5970
5787
 
5971
- struct ggml_tensor * ggml_rms_norm_impl(
5788
+ static struct ggml_tensor * ggml_rms_norm_impl(
5972
5789
  struct ggml_context * ctx,
5973
5790
  struct ggml_tensor * a,
5791
+ float eps,
5974
5792
  bool inplace) {
5975
5793
  bool is_node = false;
5976
5794
 
@@ -5980,24 +5798,27 @@ struct ggml_tensor * ggml_rms_norm_impl(
5980
5798
 
5981
5799
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5982
5800
 
5801
+ ggml_set_op_params(result, &eps, sizeof(eps));
5802
+
5983
5803
  result->op = GGML_OP_RMS_NORM;
5984
5804
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5985
5805
  result->src[0] = a;
5986
- result->src[1] = NULL; // TODO: maybe store epsilon here?
5987
5806
 
5988
5807
  return result;
5989
5808
  }
5990
5809
 
5991
5810
  struct ggml_tensor * ggml_rms_norm(
5992
5811
  struct ggml_context * ctx,
5993
- struct ggml_tensor * a) {
5994
- return ggml_rms_norm_impl(ctx, a, false);
5812
+ struct ggml_tensor * a,
5813
+ float eps) {
5814
+ return ggml_rms_norm_impl(ctx, a, eps, false);
5995
5815
  }
5996
5816
 
5997
5817
  struct ggml_tensor * ggml_rms_norm_inplace(
5998
5818
  struct ggml_context * ctx,
5999
- struct ggml_tensor * a) {
6000
- return ggml_rms_norm_impl(ctx, a, true);
5819
+ struct ggml_tensor * a,
5820
+ float eps) {
5821
+ return ggml_rms_norm_impl(ctx, a, eps, true);
6001
5822
  }
6002
5823
 
6003
5824
  struct ggml_tensor * ggml_rms_norm_back(
@@ -6076,7 +5897,7 @@ struct ggml_tensor * ggml_out_prod(
6076
5897
 
6077
5898
  // ggml_scale
6078
5899
 
6079
- struct ggml_tensor * ggml_scale_impl(
5900
+ static struct ggml_tensor * ggml_scale_impl(
6080
5901
  struct ggml_context * ctx,
6081
5902
  struct ggml_tensor * a,
6082
5903
  struct ggml_tensor * b,
@@ -6116,7 +5937,7 @@ struct ggml_tensor * ggml_scale_inplace(
6116
5937
 
6117
5938
  // ggml_set
6118
5939
 
6119
- struct ggml_tensor * ggml_set_impl(
5940
+ static struct ggml_tensor * ggml_set_impl(
6120
5941
  struct ggml_context * ctx,
6121
5942
  struct ggml_tensor * a,
6122
5943
  struct ggml_tensor * b,
@@ -6136,23 +5957,13 @@ struct ggml_tensor * ggml_set_impl(
6136
5957
  // make a view of the destination
6137
5958
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6138
5959
 
6139
- ggml_scratch_save(ctx);
6140
-
6141
- struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5);
6142
-
6143
- (( int32_t * ) c->data)[0] = nb1;
6144
- (( int32_t * ) c->data)[1] = nb2;
6145
- (( int32_t * ) c->data)[2] = nb3;
6146
- (( int32_t * ) c->data)[3] = offset;
6147
- (( int32_t * ) c->data)[4] = inplace ? 1 : 0;
6148
-
6149
- ggml_scratch_load(ctx);
5960
+ int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
5961
+ ggml_set_op_params(result, params, sizeof(params));
6150
5962
 
6151
5963
  result->op = GGML_OP_SET;
6152
5964
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6153
5965
  result->src[0] = a;
6154
5966
  result->src[1] = b;
6155
- result->src[2] = c;
6156
5967
 
6157
5968
  return result;
6158
5969
  }
@@ -6216,7 +6027,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
6216
6027
 
6217
6028
  // ggml_cpy
6218
6029
 
6219
- struct ggml_tensor * ggml_cpy_impl(
6030
+ static struct ggml_tensor * ggml_cpy_impl(
6220
6031
  struct ggml_context * ctx,
6221
6032
  struct ggml_tensor * a,
6222
6033
  struct ggml_tensor * b,
@@ -6261,7 +6072,7 @@ struct ggml_tensor * ggml_cpy_inplace(
6261
6072
 
6262
6073
  // ggml_cont
6263
6074
 
6264
- struct ggml_tensor * ggml_cont_impl(
6075
+ static struct ggml_tensor * ggml_cont_impl(
6265
6076
  struct ggml_context * ctx,
6266
6077
  struct ggml_tensor * a,
6267
6078
  bool inplace) {
@@ -6277,7 +6088,6 @@ struct ggml_tensor * ggml_cont_impl(
6277
6088
  result->op = GGML_OP_CONT;
6278
6089
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6279
6090
  result->src[0] = a;
6280
- result->src[1] = NULL;
6281
6091
 
6282
6092
  return result;
6283
6093
  }
@@ -6321,7 +6131,6 @@ struct ggml_tensor * ggml_reshape(
6321
6131
  result->op = GGML_OP_RESHAPE;
6322
6132
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6323
6133
  result->src[0] = a;
6324
- result->src[1] = NULL;
6325
6134
 
6326
6135
  return result;
6327
6136
  }
@@ -6346,7 +6155,6 @@ struct ggml_tensor * ggml_reshape_1d(
6346
6155
  result->op = GGML_OP_RESHAPE;
6347
6156
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6348
6157
  result->src[0] = a;
6349
- result->src[1] = NULL;
6350
6158
 
6351
6159
  return result;
6352
6160
  }
@@ -6372,7 +6180,6 @@ struct ggml_tensor * ggml_reshape_2d(
6372
6180
  result->op = GGML_OP_RESHAPE;
6373
6181
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6374
6182
  result->src[0] = a;
6375
- result->src[1] = NULL;
6376
6183
 
6377
6184
  return result;
6378
6185
  }
@@ -6399,7 +6206,6 @@ struct ggml_tensor * ggml_reshape_3d(
6399
6206
  result->op = GGML_OP_RESHAPE;
6400
6207
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6401
6208
  result->src[0] = a;
6402
- result->src[1] = NULL;
6403
6209
 
6404
6210
  return result;
6405
6211
  }
@@ -6428,13 +6234,33 @@ struct ggml_tensor * ggml_reshape_4d(
6428
6234
  result->op = GGML_OP_RESHAPE;
6429
6235
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6430
6236
  result->src[0] = a;
6431
- result->src[1] = NULL;
6432
6237
 
6433
6238
  return result;
6434
6239
  }
6435
6240
 
6436
6241
  // ggml_view_1d
6437
6242
 
6243
+ static struct ggml_tensor * ggml_view_tensor_offset(
6244
+ struct ggml_context * ctx,
6245
+ struct ggml_tensor * a,
6246
+ int n_dims,
6247
+ const int64_t * ne,
6248
+ size_t offset) {
6249
+ // don't calculate an offset from an unallocated tensor
6250
+ void * data = NULL;
6251
+ if (a->data != NULL) {
6252
+ data = (char *) a->data + offset;
6253
+ }
6254
+
6255
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
6256
+
6257
+ ggml_format_name(result, "%s (view)", a->name);
6258
+
6259
+ ggml_set_op_params(result, &offset, sizeof(offset));
6260
+
6261
+ return result;
6262
+ }
6263
+
6438
6264
  struct ggml_tensor * ggml_view_1d(
6439
6265
  struct ggml_context * ctx,
6440
6266
  struct ggml_tensor * a,
@@ -6447,22 +6273,11 @@ struct ggml_tensor * ggml_view_1d(
6447
6273
  is_node = true;
6448
6274
  }
6449
6275
 
6450
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
6451
- ggml_format_name(result, "%s (view)", a->name);
6452
-
6453
- ggml_scratch_save(ctx);
6454
-
6455
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6456
- ggml_set_name(offs, "offset");
6457
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6458
-
6459
- ggml_scratch_load(ctx);
6276
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
6460
6277
 
6461
6278
  result->op = GGML_OP_VIEW;
6462
6279
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6463
6280
  result->src[0] = a;
6464
- result->src[1] = NULL;
6465
- result->src[2] = offs;
6466
6281
 
6467
6282
  return result;
6468
6283
  }
@@ -6485,16 +6300,7 @@ struct ggml_tensor * ggml_view_2d(
6485
6300
 
6486
6301
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
6487
6302
 
6488
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
6489
- ggml_format_name(result, "%s (view)", a->name);
6490
-
6491
- ggml_scratch_save(ctx);
6492
-
6493
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6494
- ggml_set_name(offs, "offset");
6495
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6496
-
6497
- ggml_scratch_load(ctx);
6303
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
6498
6304
 
6499
6305
  result->nb[1] = nb1;
6500
6306
  result->nb[2] = result->nb[1]*ne1;
@@ -6503,8 +6309,6 @@ struct ggml_tensor * ggml_view_2d(
6503
6309
  result->op = GGML_OP_VIEW;
6504
6310
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6505
6311
  result->src[0] = a;
6506
- result->src[1] = NULL;
6507
- result->src[2] = offs;
6508
6312
 
6509
6313
  return result;
6510
6314
  }
@@ -6529,16 +6333,7 @@ struct ggml_tensor * ggml_view_3d(
6529
6333
 
6530
6334
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
6531
6335
 
6532
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
6533
- ggml_format_name(result, "%s (view)", a->name);
6534
-
6535
- ggml_scratch_save(ctx);
6536
-
6537
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6538
- ggml_set_name(offs, "offset");
6539
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6540
-
6541
- ggml_scratch_load(ctx);
6336
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
6542
6337
 
6543
6338
  result->nb[1] = nb1;
6544
6339
  result->nb[2] = nb2;
@@ -6547,8 +6342,6 @@ struct ggml_tensor * ggml_view_3d(
6547
6342
  result->op = GGML_OP_VIEW;
6548
6343
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6549
6344
  result->src[0] = a;
6550
- result->src[1] = NULL;
6551
- result->src[2] = offs;
6552
6345
 
6553
6346
  return result;
6554
6347
  }
@@ -6575,16 +6368,7 @@ struct ggml_tensor * ggml_view_4d(
6575
6368
 
6576
6369
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
6577
6370
 
6578
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
6579
- ggml_format_name(result, "%s (view)", a->name);
6580
-
6581
- ggml_scratch_save(ctx);
6582
-
6583
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6584
- ggml_set_name(offs, "offset");
6585
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6586
-
6587
- ggml_scratch_load(ctx);
6371
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
6588
6372
 
6589
6373
  result->nb[1] = nb1;
6590
6374
  result->nb[2] = nb2;
@@ -6593,8 +6377,6 @@ struct ggml_tensor * ggml_view_4d(
6593
6377
  result->op = GGML_OP_VIEW;
6594
6378
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6595
6379
  result->src[0] = a;
6596
- result->src[1] = NULL;
6597
- result->src[2] = offs;
6598
6380
 
6599
6381
  return result;
6600
6382
  }
@@ -6655,22 +6437,9 @@ struct ggml_tensor * ggml_permute(
6655
6437
  result->op = GGML_OP_PERMUTE;
6656
6438
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6657
6439
  result->src[0] = a;
6658
- result->src[1] = NULL;
6659
6440
 
6660
- if (is_node) {
6661
- ggml_scratch_save(ctx);
6662
-
6663
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
6664
-
6665
- ((int32_t *) b->data)[0] = axis0;
6666
- ((int32_t *) b->data)[1] = axis1;
6667
- ((int32_t *) b->data)[2] = axis2;
6668
- ((int32_t *) b->data)[3] = axis3;
6669
-
6670
- ggml_scratch_load(ctx);
6671
-
6672
- result->src[2] = b;
6673
- }
6441
+ int32_t params[] = { axis0, axis1, axis2, axis3 };
6442
+ ggml_set_op_params(result, &params, sizeof(params));
6674
6443
 
6675
6444
  return result;
6676
6445
  }
@@ -6698,7 +6467,6 @@ struct ggml_tensor * ggml_transpose(
6698
6467
  result->op = GGML_OP_TRANSPOSE;
6699
6468
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6700
6469
  result->src[0] = a;
6701
- result->src[1] = NULL;
6702
6470
 
6703
6471
  return result;
6704
6472
  }
@@ -6776,7 +6544,6 @@ struct ggml_tensor * ggml_diag(
6776
6544
  result->op = GGML_OP_DIAG;
6777
6545
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6778
6546
  result->src[0] = a;
6779
- result->src[1] = NULL;
6780
6547
 
6781
6548
  return result;
6782
6549
  }
@@ -6784,7 +6551,7 @@ struct ggml_tensor * ggml_diag(
6784
6551
 
6785
6552
  // ggml_diag_mask_inf
6786
6553
 
6787
- struct ggml_tensor * ggml_diag_mask_inf_impl(
6554
+ static struct ggml_tensor * ggml_diag_mask_inf_impl(
6788
6555
  struct ggml_context * ctx,
6789
6556
  struct ggml_tensor * a,
6790
6557
  int n_past,
@@ -6797,19 +6564,12 @@ struct ggml_tensor * ggml_diag_mask_inf_impl(
6797
6564
 
6798
6565
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6799
6566
 
6800
- ggml_scratch_save(ctx);
6801
-
6802
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6803
-
6804
- ((int32_t *) b->data)[0] = n_past;
6805
- ((int32_t *) b->data)[1] = inplace ? 1 : 0;
6806
-
6807
- ggml_scratch_load(ctx);
6567
+ int32_t params[] = { n_past, inplace ? 1 : 0 };
6568
+ ggml_set_op_params(result, &params, sizeof(params));
6808
6569
 
6809
6570
  result->op = GGML_OP_DIAG_MASK_INF;
6810
6571
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6811
6572
  result->src[0] = a;
6812
- result->src[1] = b;
6813
6573
 
6814
6574
  return result;
6815
6575
  }
@@ -6831,7 +6591,7 @@ struct ggml_tensor * ggml_diag_mask_inf_inplace(
6831
6591
 
6832
6592
  // ggml_diag_mask_zero
6833
6593
 
6834
- struct ggml_tensor * ggml_diag_mask_zero_impl(
6594
+ static struct ggml_tensor * ggml_diag_mask_zero_impl(
6835
6595
  struct ggml_context * ctx,
6836
6596
  struct ggml_tensor * a,
6837
6597
  int n_past,
@@ -6844,20 +6604,12 @@ struct ggml_tensor * ggml_diag_mask_zero_impl(
6844
6604
 
6845
6605
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6846
6606
 
6847
- ggml_scratch_save(ctx);
6848
-
6849
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6850
- ggml_set_name(b, "n_past, inplace");
6851
-
6852
- ((int32_t *) b->data)[0] = n_past;
6853
- ((int32_t *) b->data)[1] = inplace ? 1 : 0;
6854
-
6855
- ggml_scratch_load(ctx);
6607
+ int32_t params[] = { n_past, inplace ? 1 : 0 };
6608
+ ggml_set_op_params(result, &params, sizeof(params));
6856
6609
 
6857
6610
  result->op = GGML_OP_DIAG_MASK_ZERO;
6858
6611
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6859
6612
  result->src[0] = a;
6860
- result->src[1] = b;
6861
6613
 
6862
6614
  return result;
6863
6615
  }
@@ -6878,7 +6630,7 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
6878
6630
 
6879
6631
  // ggml_soft_max
6880
6632
 
6881
- struct ggml_tensor * ggml_soft_max_impl(
6633
+ static struct ggml_tensor * ggml_soft_max_impl(
6882
6634
  struct ggml_context * ctx,
6883
6635
  struct ggml_tensor * a,
6884
6636
  bool inplace) {
@@ -6893,7 +6645,6 @@ struct ggml_tensor * ggml_soft_max_impl(
6893
6645
  result->op = GGML_OP_SOFT_MAX;
6894
6646
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6895
6647
  result->src[0] = a;
6896
- result->src[1] = NULL;
6897
6648
 
6898
6649
  return result;
6899
6650
  }
@@ -6913,7 +6664,7 @@ struct ggml_tensor * ggml_soft_max_inplace(
6913
6664
 
6914
6665
  // ggml_soft_max_back
6915
6666
 
6916
- struct ggml_tensor * ggml_soft_max_back_impl(
6667
+ static struct ggml_tensor * ggml_soft_max_back_impl(
6917
6668
  struct ggml_context * ctx,
6918
6669
  struct ggml_tensor * a,
6919
6670
  struct ggml_tensor * b,
@@ -6950,7 +6701,7 @@ struct ggml_tensor * ggml_soft_max_back_inplace(
6950
6701
 
6951
6702
  // ggml_rope
6952
6703
 
6953
- struct ggml_tensor * ggml_rope_impl(
6704
+ static struct ggml_tensor * ggml_rope_impl(
6954
6705
  struct ggml_context * ctx,
6955
6706
  struct ggml_tensor * a,
6956
6707
  int n_past,
@@ -6969,23 +6720,14 @@ struct ggml_tensor * ggml_rope_impl(
6969
6720
 
6970
6721
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6971
6722
 
6972
- ggml_scratch_save(ctx);
6973
-
6974
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
6975
-
6976
- ((int32_t *) b->data)[0] = n_past;
6977
- ((int32_t *) b->data)[1] = n_dims;
6978
- ((int32_t *) b->data)[2] = mode;
6979
- ((int32_t *) b->data)[3] = n_ctx;
6980
- memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float));
6981
- memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));
6982
-
6983
- ggml_scratch_load(ctx);
6723
+ int32_t params[6] = { n_past, n_dims, mode, n_ctx };
6724
+ memcpy(params + 4, &freq_base, sizeof(float));
6725
+ memcpy(params + 5, &freq_scale, sizeof(float));
6726
+ ggml_set_op_params(result, &params, sizeof(params));
6984
6727
 
6985
6728
  result->op = GGML_OP_ROPE;
6986
6729
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6987
6730
  result->src[0] = a;
6988
- result->src[1] = b;
6989
6731
 
6990
6732
  return result;
6991
6733
  }
@@ -7010,6 +6752,18 @@ struct ggml_tensor * ggml_rope_inplace(
7010
6752
  return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
7011
6753
  }
7012
6754
 
6755
+ struct ggml_tensor * ggml_rope_custom(
6756
+ struct ggml_context * ctx,
6757
+ struct ggml_tensor * a,
6758
+ int n_past,
6759
+ int n_dims,
6760
+ int mode,
6761
+ int n_ctx,
6762
+ float freq_base,
6763
+ float freq_scale) {
6764
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
6765
+ }
6766
+
7013
6767
  struct ggml_tensor * ggml_rope_custom_inplace(
7014
6768
  struct ggml_context * ctx,
7015
6769
  struct ggml_tensor * a,
@@ -7042,22 +6796,12 @@ struct ggml_tensor * ggml_rope_back(
7042
6796
 
7043
6797
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
7044
6798
 
7045
- ggml_scratch_save(ctx);
7046
-
7047
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
7048
- ggml_set_name(b, "n_past, n_dims, mode");
7049
-
7050
- ((int32_t *) b->data)[0] = n_past;
7051
- ((int32_t *) b->data)[1] = n_dims;
7052
- ((int32_t *) b->data)[2] = mode;
7053
- ((int32_t *) b->data)[3] = n_ctx;
7054
-
7055
- ggml_scratch_load(ctx);
6799
+ int32_t params[] = { n_past, n_dims, mode, n_ctx };
6800
+ ggml_set_op_params(result, &params, sizeof(params));
7056
6801
 
7057
6802
  result->op = GGML_OP_ROPE_BACK;
7058
6803
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7059
6804
  result->src[0] = a;
7060
- result->src[1] = b;
7061
6805
 
7062
6806
  return result;
7063
6807
  }
@@ -7082,21 +6826,13 @@ struct ggml_tensor * ggml_alibi(
7082
6826
  //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7083
6827
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
7084
6828
 
7085
- ggml_scratch_save(ctx);
7086
-
7087
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7088
-
7089
- ((int32_t *) b->data)[0] = n_past;
7090
- ((int32_t *) b->data)[1] = n_head;
7091
- GGML_ASSERT(sizeof(float) == sizeof(int32_t));
7092
- (((float *) b->data)[2]) = bias_max;
7093
-
7094
- ggml_scratch_load(ctx);
6829
+ int32_t op_params[3] = { n_past, n_head };
6830
+ memcpy(op_params + 2, &bias_max, sizeof(float));
6831
+ ggml_set_op_params(result, &op_params, sizeof(op_params));
7095
6832
 
7096
6833
  result->op = GGML_OP_ALIBI;
7097
6834
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7098
6835
  result->src[0] = a;
7099
- result->src[1] = b;
7100
6836
 
7101
6837
  return result;
7102
6838
  }
@@ -7118,19 +6854,12 @@ struct ggml_tensor * ggml_clamp(
7118
6854
  // TODO: when implement backward, fix this:
7119
6855
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
7120
6856
 
7121
- ggml_scratch_save(ctx);
7122
-
7123
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);
7124
-
7125
- ((float *) b->data)[0] = min;
7126
- ((float *) b->data)[1] = max;
7127
-
7128
- ggml_scratch_load(ctx);
6857
+ float params[] = { min, max };
6858
+ ggml_set_op_params(result, &params, sizeof(params));
7129
6859
 
7130
6860
  result->op = GGML_OP_CLAMP;
7131
6861
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7132
6862
  result->src[0] = a;
7133
- result->src[1] = b;
7134
6863
 
7135
6864
  return result;
7136
6865
  }
@@ -7163,18 +6892,13 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
7163
6892
  };
7164
6893
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
7165
6894
 
7166
- ggml_scratch_save(ctx);
7167
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7168
- ((int32_t*)c->data)[0] = s0;
7169
- ((int32_t*)c->data)[1] = p0;
7170
- ((int32_t*)c->data)[2] = d0;
7171
- ggml_scratch_load(ctx);
6895
+ int32_t params[] = { s0, p0, d0 };
6896
+ ggml_set_op_params(result, &params, sizeof(params));
7172
6897
 
7173
6898
  result->op = GGML_OP_CONV_1D;
7174
6899
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7175
6900
  result->src[0] = a;
7176
6901
  result->src[1] = b;
7177
- result->src[2] = c;
7178
6902
 
7179
6903
  return result;
7180
6904
  }
@@ -7207,21 +6931,13 @@ struct ggml_tensor* ggml_conv_2d(
7207
6931
  };
7208
6932
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7209
6933
 
7210
- ggml_scratch_save(ctx);
7211
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
7212
- ((int32_t*)c->data)[0] = s0;
7213
- ((int32_t*)c->data)[1] = s1;
7214
- ((int32_t*)c->data)[2] = p0;
7215
- ((int32_t*)c->data)[3] = p1;
7216
- ((int32_t*)c->data)[4] = d0;
7217
- ((int32_t*)c->data)[5] = d1;
7218
- ggml_scratch_load(ctx);
6934
+ int32_t params[] = { s0, s1, p0, p1, d0, d1 };
6935
+ ggml_set_op_params(result, &params, sizeof(params));
7219
6936
 
7220
6937
  result->op = GGML_OP_CONV_2D;
7221
6938
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7222
6939
  result->src[0] = a;
7223
6940
  result->src[1] = b;
7224
- result->src[2] = c;
7225
6941
 
7226
6942
  return result;
7227
6943
 
@@ -7245,7 +6961,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
7245
6961
  return (ins + 2 * p - ks) / s + 1;
7246
6962
  }
7247
6963
 
7248
- // ggml_pool_2d
6964
+ // ggml_pool_1d
7249
6965
 
7250
6966
  struct ggml_tensor* ggml_pool_1d(
7251
6967
  struct ggml_context * ctx,
@@ -7268,18 +6984,12 @@ struct ggml_tensor* ggml_pool_1d(
7268
6984
  };
7269
6985
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
7270
6986
 
7271
- ggml_scratch_save(ctx);
7272
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
7273
- ((int32_t*)c->data)[0] = op;
7274
- ((int32_t*)c->data)[1] = k0;
7275
- ((int32_t*)c->data)[2] = s0;
7276
- ((int32_t*)c->data)[3] = p0;
7277
- ggml_scratch_load(ctx);
6987
+ int32_t params[] = { op, k0, s0, p0 };
6988
+ ggml_set_op_params(result, &params, sizeof(params));
7278
6989
 
7279
6990
  result->op = GGML_OP_POOL_1D;
7280
6991
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7281
6992
  result->src[0] = a;
7282
- result->src[1] = c;
7283
6993
 
7284
6994
  return result;
7285
6995
  }
@@ -7311,21 +7021,12 @@ struct ggml_tensor* ggml_pool_2d(
7311
7021
  };
7312
7022
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7313
7023
 
7314
- ggml_scratch_save(ctx);
7315
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7);
7316
- ((int32_t*)c->data)[0] = op;
7317
- ((int32_t*)c->data)[1] = k0;
7318
- ((int32_t*)c->data)[2] = k1;
7319
- ((int32_t*)c->data)[3] = s0;
7320
- ((int32_t*)c->data)[4] = s1;
7321
- ((int32_t*)c->data)[5] = p0;
7322
- ((int32_t*)c->data)[6] = p1;
7323
- ggml_scratch_load(ctx);
7024
+ int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
7025
+ ggml_set_op_params(result, &params, sizeof(params));
7324
7026
 
7325
7027
  result->op = GGML_OP_POOL_2D;
7326
7028
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7327
7029
  result->src[0] = a;
7328
- result->src[1] = c;
7329
7030
 
7330
7031
  return result;
7331
7032
  }
@@ -7348,14 +7049,16 @@ struct ggml_tensor * ggml_flash_attn(
7348
7049
  }
7349
7050
 
7350
7051
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
7351
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, q->ne);
7052
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne);
7053
+
7054
+ int32_t t = masked ? 1 : 0;
7055
+ ggml_set_op_params(result, &t, sizeof(t));
7352
7056
 
7353
7057
  result->op = GGML_OP_FLASH_ATTN;
7354
7058
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7355
7059
  result->src[0] = q;
7356
7060
  result->src[1] = k;
7357
7061
  result->src[2] = v;
7358
- result->src[3] = ggml_new_i32(ctx, masked ? 1 : 0);
7359
7062
 
7360
7063
  return result;
7361
7064
  }
@@ -7379,7 +7082,7 @@ struct ggml_tensor * ggml_flash_ff(
7379
7082
  }
7380
7083
 
7381
7084
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
7382
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, a->ne);
7085
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne);
7383
7086
 
7384
7087
  result->op = GGML_OP_FLASH_FF;
7385
7088
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7445,13 +7148,15 @@ struct ggml_tensor * ggml_flash_attn_back(
7445
7148
 
7446
7149
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7447
7150
 
7151
+ int32_t masked_i = masked ? 1 : 0;
7152
+ ggml_set_op_params(result, &masked_i, sizeof(masked_i));
7153
+
7448
7154
  result->op = GGML_OP_FLASH_ATTN_BACK;
7449
7155
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7450
7156
  result->src[0] = q;
7451
7157
  result->src[1] = k;
7452
7158
  result->src[2] = v;
7453
7159
  result->src[3] = d;
7454
- result->src[4] = ggml_new_i32(ctx, masked ? 1 : 0);
7455
7160
 
7456
7161
  return result;
7457
7162
  }
@@ -7484,21 +7189,12 @@ struct ggml_tensor * ggml_win_part(
7484
7189
 
7485
7190
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7486
7191
 
7487
- ggml_scratch_save(ctx);
7488
-
7489
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7490
-
7491
- ((int32_t *) b->data)[0] = npx;
7492
- ((int32_t *) b->data)[1] = npy;
7493
- ((int32_t *) b->data)[2] = w;
7494
-
7495
- ggml_scratch_load(ctx);
7192
+ int32_t params[] = { npx, npy, w };
7193
+ ggml_set_op_params(result, &params, sizeof(params));
7496
7194
 
7497
7195
  result->op = GGML_OP_WIN_PART;
7498
7196
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7499
7197
  result->src[0] = a;
7500
- result->src[1] = NULL;
7501
- result->src[2] = b;
7502
7198
 
7503
7199
  return result;
7504
7200
  }
@@ -7523,26 +7219,57 @@ struct ggml_tensor * ggml_win_unpart(
7523
7219
  const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
7524
7220
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7525
7221
 
7526
- ggml_scratch_save(ctx);
7222
+ int32_t params[] = { w };
7223
+ ggml_set_op_params(result, &params, sizeof(params));
7224
+
7225
+ result->op = GGML_OP_WIN_UNPART;
7226
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7227
+ result->src[0] = a;
7527
7228
 
7528
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
7229
+ return result;
7230
+ }
7529
7231
 
7530
- ((int32_t *) b->data)[0] = w;
7232
+ // ggml_unary
7531
7233
 
7532
- ggml_scratch_load(ctx);
7234
+ static struct ggml_tensor * ggml_unary_impl(
7235
+ struct ggml_context * ctx,
7236
+ struct ggml_tensor * a,
7237
+ enum ggml_unary_op op,
7238
+ bool inplace) {
7239
+ bool is_node = false;
7533
7240
 
7534
- result->op = GGML_OP_WIN_UNPART;
7241
+ if (!inplace && (a->grad)) {
7242
+ is_node = true;
7243
+ }
7244
+
7245
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7246
+
7247
+ ggml_set_op_params_i32(result, 0, (int32_t) op);
7248
+
7249
+ result->op = GGML_OP_UNARY;
7535
7250
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7536
7251
  result->src[0] = a;
7537
- result->src[1] = NULL;
7538
- result->src[2] = b;
7539
7252
 
7540
7253
  return result;
7541
7254
  }
7542
7255
 
7256
+ struct ggml_tensor * ggml_unary(
7257
+ struct ggml_context * ctx,
7258
+ struct ggml_tensor * a,
7259
+ enum ggml_unary_op op) {
7260
+ return ggml_unary_impl(ctx, a, op, false);
7261
+ }
7262
+
7263
+ struct ggml_tensor * ggml_unary_inplace(
7264
+ struct ggml_context * ctx,
7265
+ struct ggml_tensor * a,
7266
+ enum ggml_unary_op op) {
7267
+ return ggml_unary_impl(ctx, a, op, true);
7268
+ }
7269
+
7543
7270
  // ggml_map_unary
7544
7271
 
7545
- struct ggml_tensor * ggml_map_unary_impl_f32(
7272
+ static struct ggml_tensor * ggml_map_unary_impl_f32(
7546
7273
  struct ggml_context * ctx,
7547
7274
  struct ggml_tensor * a,
7548
7275
  const ggml_unary_op_f32_t fun,
@@ -7553,19 +7280,13 @@ struct ggml_tensor * ggml_map_unary_impl_f32(
7553
7280
  is_node = true;
7554
7281
  }
7555
7282
 
7556
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7557
-
7558
- ggml_scratch_save(ctx);
7283
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7559
7284
 
7560
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7561
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7562
-
7563
- ggml_scratch_load(ctx);
7285
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7564
7286
 
7565
7287
  result->op = GGML_OP_MAP_UNARY;
7566
7288
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7567
7289
  result->src[0] = a;
7568
- result->src[2] = addr_tensor;
7569
7290
 
7570
7291
  return result;
7571
7292
  }
@@ -7586,7 +7307,7 @@ struct ggml_tensor * ggml_map_unary_inplace_f32(
7586
7307
 
7587
7308
  // ggml_map_binary
7588
7309
 
7589
- struct ggml_tensor * ggml_map_binary_impl_f32(
7310
+ static struct ggml_tensor * ggml_map_binary_impl_f32(
7590
7311
  struct ggml_context * ctx,
7591
7312
  struct ggml_tensor * a,
7592
7313
  struct ggml_tensor * b,
@@ -7600,20 +7321,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
7600
7321
  is_node = true;
7601
7322
  }
7602
7323
 
7603
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7604
-
7605
- ggml_scratch_save(ctx);
7606
-
7607
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7608
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7324
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7609
7325
 
7610
- ggml_scratch_load(ctx);
7326
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7611
7327
 
7612
7328
  result->op = GGML_OP_MAP_BINARY;
7613
7329
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7614
7330
  result->src[0] = a;
7615
7331
  result->src[1] = b;
7616
- result->src[2] = addr_tensor;
7617
7332
 
7618
7333
  return result;
7619
7334
  }
@@ -7636,7 +7351,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
7636
7351
 
7637
7352
  // ggml_map_custom1
7638
7353
 
7639
- struct ggml_tensor * ggml_map_custom1_impl_f32(
7354
+ static struct ggml_tensor * ggml_map_custom1_impl_f32(
7640
7355
  struct ggml_context * ctx,
7641
7356
  struct ggml_tensor * a,
7642
7357
  const ggml_custom1_op_f32_t fun,
@@ -7647,19 +7362,13 @@ struct ggml_tensor * ggml_map_custom1_impl_f32(
7647
7362
  is_node = true;
7648
7363
  }
7649
7364
 
7650
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7651
-
7652
- ggml_scratch_save(ctx);
7653
-
7654
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7655
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7365
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7656
7366
 
7657
- ggml_scratch_load(ctx);
7367
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7658
7368
 
7659
7369
  result->op = GGML_OP_MAP_CUSTOM1;
7660
7370
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7661
7371
  result->src[0] = a;
7662
- result->src[2] = addr_tensor;
7663
7372
 
7664
7373
  return result;
7665
7374
  }
@@ -7680,7 +7389,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32(
7680
7389
 
7681
7390
  // ggml_map_custom2
7682
7391
 
7683
- struct ggml_tensor * ggml_map_custom2_impl_f32(
7392
+ static struct ggml_tensor * ggml_map_custom2_impl_f32(
7684
7393
  struct ggml_context * ctx,
7685
7394
  struct ggml_tensor * a,
7686
7395
  struct ggml_tensor * b,
@@ -7692,20 +7401,14 @@ struct ggml_tensor * ggml_map_custom2_impl_f32(
7692
7401
  is_node = true;
7693
7402
  }
7694
7403
 
7695
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7696
-
7697
- ggml_scratch_save(ctx);
7404
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7698
7405
 
7699
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7700
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7701
-
7702
- ggml_scratch_load(ctx);
7406
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7703
7407
 
7704
7408
  result->op = GGML_OP_MAP_CUSTOM2;
7705
7409
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7706
7410
  result->src[0] = a;
7707
7411
  result->src[1] = b;
7708
- result->src[2] = addr_tensor;
7709
7412
 
7710
7413
  return result;
7711
7414
  }
@@ -7728,7 +7431,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32(
7728
7431
 
7729
7432
  // ggml_map_custom3
7730
7433
 
7731
- struct ggml_tensor * ggml_map_custom3_impl_f32(
7434
+ static struct ggml_tensor * ggml_map_custom3_impl_f32(
7732
7435
  struct ggml_context * ctx,
7733
7436
  struct ggml_tensor * a,
7734
7437
  struct ggml_tensor * b,
@@ -7741,21 +7444,15 @@ struct ggml_tensor * ggml_map_custom3_impl_f32(
7741
7444
  is_node = true;
7742
7445
  }
7743
7446
 
7744
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7745
-
7746
- ggml_scratch_save(ctx);
7747
-
7748
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7749
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7447
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7750
7448
 
7751
- ggml_scratch_load(ctx);
7449
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7752
7450
 
7753
7451
  result->op = GGML_OP_MAP_CUSTOM3;
7754
7452
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7755
7453
  result->src[0] = a;
7756
7454
  result->src[1] = b;
7757
- result->src[2] = addr_tensor;
7758
- result->src[3] = c;
7455
+ result->src[2] = c;
7759
7456
 
7760
7457
  return result;
7761
7458
  }
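
The map_unary/map_binary/map_custom* builders above treat the user callback the same way: the function pointer's bytes are memcpy'd into op_params instead of being stored in an I32 tensor sized sizeof(void *) / sizeof(int32_t), and the compute pass memcpy's them back out. A hedged, self-contained sketch of that round trip, with toy names rather than the ggml API:

    #include <stdio.h>
    #include <string.h>

    typedef void (*toy_unary_op_f32_t)(int n, float * dst, const float * src);

    struct toy_tensor {
        char op_params[32];                       /* stand-in for tensor->op_params; assumed
                                                     large enough to hold a function pointer */
    };

    static void toy_negate(int n, float * dst, const float * src) {
        for (int i = 0; i < n; ++i) dst[i] = -src[i];
    }

    int main(void) {
        struct toy_tensor node = {0};

        /* build step: store the callback's bytes in the node's params */
        toy_unary_op_f32_t fun = toy_negate;
        memcpy(node.op_params, &fun, sizeof(fun));

        /* compute step: recover the callback and invoke it */
        toy_unary_op_f32_t fun2;
        memcpy(&fun2, node.op_params, sizeof(fun2));

        float src[3] = { 1.0f, -2.0f, 3.0f };
        float dst[3];
        fun2(3, dst, src);
        printf("%f %f %f\n", dst[0], dst[1], dst[2]);
        return 0;
    }
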
@@ -8983,21 +8680,17 @@ static void ggml_compute_forward_acc_f32(
8983
8680
  const struct ggml_compute_params * params,
8984
8681
  const struct ggml_tensor * src0,
8985
8682
  const struct ggml_tensor * src1,
8986
- const struct ggml_tensor * opt0,
8987
8683
  struct ggml_tensor * dst) {
8988
8684
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
8989
8685
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
8990
8686
 
8991
- GGML_ASSERT(opt0->type == GGML_TYPE_I32);
8992
- GGML_ASSERT(ggml_nelements(opt0) == 5);
8993
-
8994
8687
  // view src0 and dst with these strides and data offset in bytes during acc
8995
8688
  // nb0 is implicitly element_size because src0 and dst are contiguous
8996
- size_t nb1 = ((int32_t *) opt0->data)[0];
8997
- size_t nb2 = ((int32_t *) opt0->data)[1];
8998
- size_t nb3 = ((int32_t *) opt0->data)[2];
8999
- size_t offset = ((int32_t *) opt0->data)[3];
9000
- bool inplace = (bool) ((int32_t *) opt0->data)[4];
8689
+ size_t nb1 = ((int32_t *) dst->op_params)[0];
8690
+ size_t nb2 = ((int32_t *) dst->op_params)[1];
8691
+ size_t nb3 = ((int32_t *) dst->op_params)[2];
8692
+ size_t offset = ((int32_t *) dst->op_params)[3];
8693
+ bool inplace = (bool) ((int32_t *) dst->op_params)[4];
9001
8694
 
9002
8695
  if (!inplace && (params->type == GGML_TASK_INIT)) {
9003
8696
  // memcpy needs to be synchronized across threads to avoid race conditions.
@@ -9066,13 +8759,12 @@ static void ggml_compute_forward_acc(
9066
8759
  const struct ggml_compute_params * params,
9067
8760
  const struct ggml_tensor * src0,
9068
8761
  const struct ggml_tensor * src1,
9069
- const struct ggml_tensor * opt0,
9070
8762
  struct ggml_tensor * dst) {
9071
8763
 
9072
8764
  switch (src0->type) {
9073
8765
  case GGML_TYPE_F32:
9074
8766
  {
9075
- ggml_compute_forward_acc_f32(params, src0, src1, opt0, dst);
8767
+ ggml_compute_forward_acc_f32(params, src0, src1, dst);
9076
8768
  } break;
9077
8769
  case GGML_TYPE_F16:
9078
8770
  case GGML_TYPE_Q4_0:
@@ -9504,7 +9196,7 @@ static void ggml_compute_forward_sum_f32(
9504
9196
  for (int64_t i03 = 0; i03 < ne03; i03++) {
9505
9197
  for (int64_t i02 = 0; i02 < ne02; i02++) {
9506
9198
  for (int64_t i01 = 0; i01 < ne01; i01++) {
9507
- ggml_vec_sum_ggf(ne00,
9199
+ ggml_vec_sum_f32_ggf(ne00,
9508
9200
  &row_sum,
9509
9201
  (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
9510
9202
  sum += row_sum;
@@ -9514,6 +9206,38 @@ static void ggml_compute_forward_sum_f32(
9514
9206
  ((float *) dst->data)[0] = sum;
9515
9207
  }
9516
9208
 
9209
+ static void ggml_compute_forward_sum_f16(
9210
+ const struct ggml_compute_params * params,
9211
+ const struct ggml_tensor * src0,
9212
+ struct ggml_tensor * dst) {
9213
+ assert(params->ith == 0);
9214
+ assert(ggml_is_scalar(dst));
9215
+
9216
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9217
+ return;
9218
+ }
9219
+
9220
+ assert(src0->nb[0] == sizeof(ggml_fp16_t));
9221
+
9222
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
9223
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb);
9224
+
9225
+ float sum = 0;
9226
+ float row_sum = 0;
9227
+
9228
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
9229
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
9230
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
9231
+ ggml_vec_sum_f16_ggf(ne00,
9232
+ &row_sum,
9233
+ (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
9234
+ sum += row_sum;
9235
+ }
9236
+ }
9237
+ }
9238
+ ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
9239
+ }
9240
+
9517
9241
  static void ggml_compute_forward_sum(
9518
9242
  const struct ggml_compute_params * params,
9519
9243
  const struct ggml_tensor * src0,
@@ -9523,6 +9247,10 @@ static void ggml_compute_forward_sum(
9523
9247
  {
9524
9248
  ggml_compute_forward_sum_f32(params, src0, dst);
9525
9249
  } break;
9250
+ case GGML_TYPE_F16:
9251
+ {
9252
+ ggml_compute_forward_sum_f16(params, src0, dst);
9253
+ } break;
9526
9254
  default:
9527
9255
  {
9528
9256
  GGML_ASSERT(false);
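
The new ggml_compute_forward_sum_f16 added above follows the same structure as the F32 path: walk the rows through the nb* byte strides, reduce each row to a scalar with the matching ggml_vec_sum_*_ggf helper, and accumulate the per-row sums (for F16, each element is converted with GGML_FP16_TO_FP32 inside the row reduction). A small illustrative sketch of that row-wise reduction with a wider accumulator, using plain float data and element indexing instead of tensor strides:

    #include <stdio.h>

    /* reduce one row into a wider accumulator, in the spirit of ggml_vec_sum_f32_ggf */
    static void toy_vec_sum_wide(int n, double * s, const float * x) {
        double sum = 0.0;
        for (int i = 0; i < n; ++i) {
            sum += (double) x[i];
        }
        *s = sum;
    }

    int main(void) {
        /* a 2x3 "tensor" stored row-major; ggml walks rows via the nb01/nb02/nb03 strides */
        float data[2][3] = { { 1.0f, 2.0f, 3.0f }, { 4.0f, 5.0f, 6.0f } };

        double sum = 0.0;
        for (int row = 0; row < 2; ++row) {
            double row_sum = 0.0;
            toy_vec_sum_wide(3, &row_sum, data[row]);
            sum += row_sum;
        }
        printf("sum = %.1f\n", sum);              /* prints 21.0 */
        return 0;
    }
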
@@ -10118,8 +9846,8 @@ static void ggml_compute_forward_gelu_f32(
10118
9846
  const struct ggml_compute_params * params,
10119
9847
  const struct ggml_tensor * src0,
10120
9848
  struct ggml_tensor * dst) {
10121
- GGML_ASSERT(ggml_is_contiguous(src0));
10122
- GGML_ASSERT(ggml_is_contiguous(dst));
9849
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9850
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10123
9851
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10124
9852
 
10125
9853
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -10177,8 +9905,8 @@ static void ggml_compute_forward_gelu_quick_f32(
10177
9905
  const struct ggml_compute_params * params,
10178
9906
  const struct ggml_tensor * src0,
10179
9907
  struct ggml_tensor * dst) {
10180
- GGML_ASSERT(ggml_is_contiguous(src0));
10181
- GGML_ASSERT(ggml_is_contiguous(dst));
9908
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9909
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10182
9910
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10183
9911
 
10184
9912
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -10236,8 +9964,8 @@ static void ggml_compute_forward_silu_f32(
10236
9964
  const struct ggml_compute_params * params,
10237
9965
  const struct ggml_tensor * src0,
10238
9966
  struct ggml_tensor * dst) {
10239
- GGML_ASSERT(ggml_is_contiguous(src0));
10240
- GGML_ASSERT(ggml_is_contiguous(dst));
9967
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9968
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10241
9969
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10242
9970
 
10243
9971
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -10289,7 +10017,6 @@ static void ggml_compute_forward_silu(
10289
10017
  }
10290
10018
  }
10291
10019
 
10292
-
10293
10020
  // ggml_compute_forward_silu_back
10294
10021
 
10295
10022
  static void ggml_compute_forward_silu_back_f32(
@@ -10297,9 +10024,9 @@ static void ggml_compute_forward_silu_back_f32(
10297
10024
  const struct ggml_tensor * src0,
10298
10025
  const struct ggml_tensor * grad,
10299
10026
  struct ggml_tensor * dst) {
10300
- GGML_ASSERT(ggml_is_contiguous(grad));
10301
- GGML_ASSERT(ggml_is_contiguous(src0));
10302
- GGML_ASSERT(ggml_is_contiguous(dst));
10027
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
10028
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
10029
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10303
10030
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10304
10031
  GGML_ASSERT(ggml_are_same_shape(src0, grad));
10305
10032
 
@@ -10439,7 +10166,8 @@ static void ggml_compute_forward_rms_norm_f32(
10439
10166
 
10440
10167
  GGML_TENSOR_UNARY_OP_LOCALS;
10441
10168
 
10442
- const float eps = 1e-6f; // TODO: make this a parameter
10169
+ float eps;
10170
+ memcpy(&eps, dst->op_params, sizeof(float));
10443
10171
 
10444
10172
  // TODO: optimize
10445
10173
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -11092,21 +10820,17 @@ static void ggml_compute_forward_set_f32(
11092
10820
  const struct ggml_compute_params * params,
11093
10821
  const struct ggml_tensor * src0,
11094
10822
  const struct ggml_tensor * src1,
11095
- const struct ggml_tensor * opt0,
11096
10823
  struct ggml_tensor * dst) {
11097
10824
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
11098
10825
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
11099
10826
 
11100
- GGML_ASSERT(opt0->type == GGML_TYPE_I32);
11101
- GGML_ASSERT(ggml_nelements(opt0) == 5);
11102
-
11103
10827
  // view src0 and dst with these strides and data offset in bytes during set
11104
10828
  // nb0 is implicitly element_size because src0 and dst are contiguous
11105
- size_t nb1 = ((int32_t *) opt0->data)[0];
11106
- size_t nb2 = ((int32_t *) opt0->data)[1];
11107
- size_t nb3 = ((int32_t *) opt0->data)[2];
11108
- size_t offset = ((int32_t *) opt0->data)[3];
11109
- bool inplace = (bool) ((int32_t *) opt0->data)[4];
10829
+ size_t nb1 = ((int32_t *) dst->op_params)[0];
10830
+ size_t nb2 = ((int32_t *) dst->op_params)[1];
10831
+ size_t nb3 = ((int32_t *) dst->op_params)[2];
10832
+ size_t offset = ((int32_t *) dst->op_params)[3];
10833
+ bool inplace = (bool) ((int32_t *) dst->op_params)[4];
11110
10834
 
11111
10835
  if (!inplace && (params->type == GGML_TASK_INIT)) {
11112
10836
  // memcpy needs to be synchronized across threads to avoid race conditions.
@@ -11166,13 +10890,12 @@ static void ggml_compute_forward_set(
11166
10890
  const struct ggml_compute_params * params,
11167
10891
  const struct ggml_tensor * src0,
11168
10892
  const struct ggml_tensor * src1,
11169
- const struct ggml_tensor * opt0,
11170
10893
  struct ggml_tensor * dst) {
11171
10894
 
11172
10895
  switch (src0->type) {
11173
10896
  case GGML_TYPE_F32:
11174
10897
  {
11175
- ggml_compute_forward_set_f32(params, src0, src1, opt0, dst);
10898
+ ggml_compute_forward_set_f32(params, src0, src1, dst);
11176
10899
  } break;
11177
10900
  case GGML_TYPE_F16:
11178
10901
  case GGML_TYPE_Q4_0:
@@ -11568,17 +11291,14 @@ static void ggml_compute_forward_diag(
11568
11291
  static void ggml_compute_forward_diag_mask_f32(
11569
11292
  const struct ggml_compute_params * params,
11570
11293
  const struct ggml_tensor * src0,
11571
- const struct ggml_tensor * src1,
11572
11294
  struct ggml_tensor * dst,
11573
11295
  const float value) {
11574
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
11575
- GGML_ASSERT(ggml_nelements(src1) == 2);
11576
11296
 
11577
11297
  const int ith = params->ith;
11578
11298
  const int nth = params->nth;
11579
11299
 
11580
- const int n_past = ((int32_t *) src1->data)[0];
11581
- const bool inplace = (bool)((int32_t *) src1->data)[1];
11300
+ const int n_past = ((int32_t *) dst->op_params)[0];
11301
+ const bool inplace = (bool)((int32_t *) dst->op_params)[1];
11582
11302
 
11583
11303
  GGML_ASSERT(n_past >= 0);
11584
11304
 
@@ -11621,12 +11341,11 @@ static void ggml_compute_forward_diag_mask_f32(
11621
11341
  static void ggml_compute_forward_diag_mask_inf(
11622
11342
  const struct ggml_compute_params * params,
11623
11343
  const struct ggml_tensor * src0,
11624
- const struct ggml_tensor * src1,
11625
11344
  struct ggml_tensor * dst) {
11626
11345
  switch (src0->type) {
11627
11346
  case GGML_TYPE_F32:
11628
11347
  {
11629
- ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, -INFINITY);
11348
+ ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY);
11630
11349
  } break;
11631
11350
  default:
11632
11351
  {
@@ -11638,12 +11357,11 @@ static void ggml_compute_forward_diag_mask_inf(
11638
11357
  static void ggml_compute_forward_diag_mask_zero(
11639
11358
  const struct ggml_compute_params * params,
11640
11359
  const struct ggml_tensor * src0,
11641
- const struct ggml_tensor * src1,
11642
11360
  struct ggml_tensor * dst) {
11643
11361
  switch (src0->type) {
11644
11362
  case GGML_TYPE_F32:
11645
11363
  {
11646
- ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, 0);
11364
+ ggml_compute_forward_diag_mask_f32(params, src0, dst, 0);
11647
11365
  } break;
11648
11366
  default:
11649
11367
  {
@@ -11841,20 +11559,17 @@ static void ggml_compute_forward_soft_max_back(
11841
11559
  static void ggml_compute_forward_alibi_f32(
11842
11560
  const struct ggml_compute_params * params,
11843
11561
  const struct ggml_tensor * src0,
11844
- const struct ggml_tensor * src1,
11845
11562
  struct ggml_tensor * dst) {
11846
11563
  assert(params->ith == 0);
11847
11564
 
11848
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
11849
- GGML_ASSERT(ggml_nelements(src1) == 3);
11850
-
11851
11565
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11852
11566
  return;
11853
11567
  }
11854
11568
 
11855
- const int n_past = ((int32_t *) src1->data)[0];
11856
- const int n_head = ((int32_t *) src1->data)[1];
11857
- const float max_bias = ((float *) src1->data)[2];
11569
+ const int n_past = ((int32_t *) dst->op_params)[0];
11570
+ const int n_head = ((int32_t *) dst->op_params)[1];
11571
+ float max_bias;
11572
+ memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
11858
11573
 
11859
11574
  assert(n_past >= 0);
11860
11575
 
@@ -11907,20 +11622,17 @@ static void ggml_compute_forward_alibi_f32(
11907
11622
  static void ggml_compute_forward_alibi_f16(
11908
11623
  const struct ggml_compute_params * params,
11909
11624
  const struct ggml_tensor * src0,
11910
- const struct ggml_tensor * src1,
11911
11625
  struct ggml_tensor * dst) {
11912
11626
  assert(params->ith == 0);
11913
11627
 
11914
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
11915
- GGML_ASSERT(ggml_nelements(src1) == 3);
11916
-
11917
11628
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11918
11629
  return;
11919
11630
  }
11920
11631
 
11921
- const int n_past = ((int32_t *) src1->data)[0];
11922
- const int n_head = ((int32_t *) src1->data)[1];
11923
- const float max_bias = ((float *) src1->data)[2];
11632
+ const int n_past = ((int32_t *) dst->op_params)[0];
11633
+ const int n_head = ((int32_t *) dst->op_params)[1];
11634
+ float max_bias;
11635
+ memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
11924
11636
 
11925
11637
  assert(n_past >= 0);
11926
11638
 
@@ -11973,16 +11685,15 @@ static void ggml_compute_forward_alibi_f16(
11973
11685
  static void ggml_compute_forward_alibi(
11974
11686
  const struct ggml_compute_params * params,
11975
11687
  const struct ggml_tensor * src0,
11976
- const struct ggml_tensor * src1,
11977
11688
  struct ggml_tensor * dst) {
11978
11689
  switch (src0->type) {
11979
11690
  case GGML_TYPE_F16:
11980
11691
  {
11981
- ggml_compute_forward_alibi_f16(params, src0, src1, dst);
11692
+ ggml_compute_forward_alibi_f16(params, src0, dst);
11982
11693
  } break;
11983
11694
  case GGML_TYPE_F32:
11984
11695
  {
11985
- ggml_compute_forward_alibi_f32(params, src0, src1, dst);
11696
+ ggml_compute_forward_alibi_f32(params, src0, dst);
11986
11697
  } break;
11987
11698
  case GGML_TYPE_Q4_0:
11988
11699
  case GGML_TYPE_Q4_1:
@@ -12012,19 +11723,17 @@ static void ggml_compute_forward_alibi(
12012
11723
  static void ggml_compute_forward_clamp_f32(
12013
11724
  const struct ggml_compute_params * params,
12014
11725
  const struct ggml_tensor * src0,
12015
- const struct ggml_tensor * src1,
12016
11726
  struct ggml_tensor * dst) {
12017
11727
  assert(params->ith == 0);
12018
11728
 
12019
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
12020
- GGML_ASSERT(ggml_nelements(src1) == 2);
12021
-
12022
11729
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12023
11730
  return;
12024
11731
  }
12025
11732
 
12026
- const float min = ((float *) src1->data)[0];
12027
- const float max = ((float *) src1->data)[1];
11733
+ float min;
11734
+ float max;
11735
+ memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
11736
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
12028
11737
 
12029
11738
  const int ith = params->ith;
12030
11739
  const int nth = params->nth;
@@ -12054,12 +11763,11 @@ static void ggml_compute_forward_clamp_f32(
12054
11763
  static void ggml_compute_forward_clamp(
12055
11764
  const struct ggml_compute_params * params,
12056
11765
  const struct ggml_tensor * src0,
12057
- const struct ggml_tensor * src1,
12058
11766
  struct ggml_tensor * dst) {
12059
11767
  switch (src0->type) {
12060
11768
  case GGML_TYPE_F32:
12061
11769
  {
12062
- ggml_compute_forward_clamp_f32(params, src0, src1, dst);
11770
+ ggml_compute_forward_clamp_f32(params, src0, dst);
12063
11771
  } break;
12064
11772
  case GGML_TYPE_F16:
12065
11773
  case GGML_TYPE_Q4_0:
@@ -12089,10 +11797,7 @@ static void ggml_compute_forward_clamp(
12089
11797
  static void ggml_compute_forward_rope_f32(
12090
11798
  const struct ggml_compute_params * params,
12091
11799
  const struct ggml_tensor * src0,
12092
- const struct ggml_tensor * src1,
12093
11800
  struct ggml_tensor * dst) {
12094
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
12095
- GGML_ASSERT(ggml_nelements(src1) == 6);
12096
11801
 
12097
11802
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12098
11803
  return;
@@ -12101,12 +11806,12 @@ static void ggml_compute_forward_rope_f32(
12101
11806
  float freq_base;
12102
11807
  float freq_scale;
12103
11808
 
12104
- const int n_past = ((int32_t *) src1->data)[0];
12105
- const int n_dims = ((int32_t *) src1->data)[1];
12106
- const int mode = ((int32_t *) src1->data)[2];
12107
- const int n_ctx = ((int32_t *) src1->data)[3];
12108
- memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
12109
- memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
11809
+ const int n_past = ((int32_t *) dst->op_params)[0];
11810
+ const int n_dims = ((int32_t *) dst->op_params)[1];
11811
+ const int mode = ((int32_t *) dst->op_params)[2];
11812
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
11813
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
11814
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
12110
11815
 
12111
11816
  assert(n_past >= 0);
12112
11817
 
@@ -12221,10 +11926,7 @@ static void ggml_compute_forward_rope_f32(
12221
11926
  static void ggml_compute_forward_rope_f16(
12222
11927
  const struct ggml_compute_params * params,
12223
11928
  const struct ggml_tensor * src0,
12224
- const struct ggml_tensor * src1,
12225
11929
  struct ggml_tensor * dst) {
12226
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
12227
- GGML_ASSERT(ggml_nelements(src1) == 6);
12228
11930
 
12229
11931
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12230
11932
  return;
@@ -12233,12 +11935,12 @@ static void ggml_compute_forward_rope_f16(
12233
11935
  float freq_base;
12234
11936
  float freq_scale;
12235
11937
 
12236
- const int n_past = ((int32_t *) src1->data)[0];
12237
- const int n_dims = ((int32_t *) src1->data)[1];
12238
- const int mode = ((int32_t *) src1->data)[2];
12239
- const int n_ctx = ((int32_t *) src1->data)[3];
12240
- memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
12241
- memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
11938
+ const int n_past = ((int32_t *) dst->op_params)[0];
11939
+ const int n_dims = ((int32_t *) dst->op_params)[1];
11940
+ const int mode = ((int32_t *) dst->op_params)[2];
11941
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
11942
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
11943
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
12242
11944
 
12243
11945
  assert(n_past >= 0);
12244
11946
 
@@ -12353,16 +12055,15 @@ static void ggml_compute_forward_rope_f16(
12353
12055
  static void ggml_compute_forward_rope(
12354
12056
  const struct ggml_compute_params * params,
12355
12057
  const struct ggml_tensor * src0,
12356
- const struct ggml_tensor * src1,
12357
12058
  struct ggml_tensor * dst) {
12358
12059
  switch (src0->type) {
12359
12060
  case GGML_TYPE_F16:
12360
12061
  {
12361
- ggml_compute_forward_rope_f16(params, src0, src1, dst);
12062
+ ggml_compute_forward_rope_f16(params, src0, dst);
12362
12063
  } break;
12363
12064
  case GGML_TYPE_F32:
12364
12065
  {
12365
- ggml_compute_forward_rope_f32(params, src0, src1, dst);
12066
+ ggml_compute_forward_rope_f32(params, src0, dst);
12366
12067
  } break;
12367
12068
  default:
12368
12069
  {
@@ -12376,10 +12077,7 @@ static void ggml_compute_forward_rope(
12376
12077
  static void ggml_compute_forward_rope_back_f32(
12377
12078
  const struct ggml_compute_params * params,
12378
12079
  const struct ggml_tensor * src0,
12379
- const struct ggml_tensor * src1,
12380
12080
  struct ggml_tensor * dst) {
12381
- assert(src1->type == GGML_TYPE_I32);
12382
- assert(ggml_nelements(src1) == 4);
12383
12081
 
12384
12082
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12385
12083
  return;
@@ -12389,9 +12087,9 @@ static void ggml_compute_forward_rope_back_f32(
12389
12087
  // dx = rope_back(dy, src1)
12390
12088
  // src0 is dy, src1 contains options
12391
12089
 
12392
- const int n_past = ((int32_t *) src1->data)[0];
12393
- const int n_dims = ((int32_t *) src1->data)[1];
12394
- const int mode = ((int32_t *) src1->data)[2];
12090
+ const int n_past = ((int32_t *) dst->op_params)[0];
12091
+ const int n_dims = ((int32_t *) dst->op_params)[1];
12092
+ const int mode = ((int32_t *) dst->op_params)[2];
12395
12093
 
12396
12094
  assert(n_past >= 0);
12397
12095
 
@@ -12475,10 +12173,7 @@ static void ggml_compute_forward_rope_back_f32(
12475
12173
  static void ggml_compute_forward_rope_back_f16(
12476
12174
  const struct ggml_compute_params * params,
12477
12175
  const struct ggml_tensor * src0,
12478
- const struct ggml_tensor * src1,
12479
12176
  struct ggml_tensor * dst) {
12480
- assert(src1->type == GGML_TYPE_I32);
12481
- assert(ggml_nelements(src1) == 3);
12482
12177
 
12483
12178
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12484
12179
  return;
@@ -12488,9 +12183,9 @@ static void ggml_compute_forward_rope_back_f16(
12488
12183
  // dx = rope_back(dy, src1)
12489
12184
  // src0 is dy, src1 contains options
12490
12185
 
12491
- const int n_past = ((int32_t *) src1->data)[0];
12492
- const int n_dims = ((int32_t *) src1->data)[1];
12493
- const int mode = ((int32_t *) src1->data)[2];
12186
+ const int n_past = ((int32_t *) dst->op_params)[0];
12187
+ const int n_dims = ((int32_t *) dst->op_params)[1];
12188
+ const int mode = ((int32_t *) dst->op_params)[2];
12494
12189
 
12495
12190
  assert(n_past >= 0);
12496
12191
 
@@ -12574,16 +12269,15 @@ static void ggml_compute_forward_rope_back_f16(
12574
12269
  static void ggml_compute_forward_rope_back(
12575
12270
  const struct ggml_compute_params * params,
12576
12271
  const struct ggml_tensor * src0,
12577
- const struct ggml_tensor * src1,
12578
12272
  struct ggml_tensor * dst) {
12579
12273
  switch (src0->type) {
12580
12274
  case GGML_TYPE_F16:
12581
12275
  {
12582
- ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
12276
+ ggml_compute_forward_rope_back_f16(params, src0, dst);
12583
12277
  } break;
12584
12278
  case GGML_TYPE_F32:
12585
12279
  {
12586
- ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
12280
+ ggml_compute_forward_rope_back_f32(params, src0, dst);
12587
12281
  } break;
12588
12282
  default:
12589
12283
  {
@@ -12780,7 +12474,7 @@ static void ggml_compute_forward_conv_1d_s1_ph(
12780
12474
  const struct ggml_compute_params * params,
12781
12475
  const struct ggml_tensor * src0,
12782
12476
  const struct ggml_tensor * src1,
12783
- struct ggml_tensor * dst) {
12477
+ struct ggml_tensor * dst) {
12784
12478
  switch (src0->type) {
12785
12479
  case GGML_TYPE_F16:
12786
12480
  {
@@ -12983,7 +12677,7 @@ static void ggml_compute_forward_conv_1d_s2_ph(
12983
12677
  const struct ggml_compute_params * params,
12984
12678
  const struct ggml_tensor * src0,
12985
12679
  const struct ggml_tensor * src1,
12986
- struct ggml_tensor * dst) {
12680
+ struct ggml_tensor * dst) {
12987
12681
  switch (src0->type) {
12988
12682
  case GGML_TYPE_F16:
12989
12683
  {
@@ -13003,14 +12697,13 @@ static void ggml_compute_forward_conv_1d_s2_ph(
13003
12697
  // ggml_compute_forward_conv_1d
13004
12698
 
13005
12699
  static void ggml_compute_forward_conv_1d(
13006
- const struct ggml_compute_params * params,
13007
- const struct ggml_tensor * src0,
13008
- const struct ggml_tensor * src1,
13009
- const struct ggml_tensor * opt0,
13010
- struct ggml_tensor * dst) {
13011
- const int32_t s0 = ((const int32_t*)(opt0->data))[0];
13012
- const int32_t p0 = ((const int32_t*)(opt0->data))[1];
13013
- const int32_t d0 = ((const int32_t*)(opt0->data))[2];
12700
+ const struct ggml_compute_params * params,
12701
+ const struct ggml_tensor * src0,
12702
+ const struct ggml_tensor * src1,
12703
+ struct ggml_tensor * dst) {
12704
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12705
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
12706
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
13014
12707
  GGML_ASSERT(d0 == 1); // dilation not supported
13015
12708
  GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported
13016
12709
  if (s0 == 1) {
@@ -13028,7 +12721,6 @@ static void ggml_compute_forward_conv_2d_f16_f32(
13028
12721
  const struct ggml_compute_params * params,
13029
12722
  const struct ggml_tensor * src0,
13030
12723
  const struct ggml_tensor * src1,
13031
- const struct ggml_tensor * opt0,
13032
12724
  struct ggml_tensor * dst) {
13033
12725
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
13034
12726
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -13048,12 +12740,12 @@ static void ggml_compute_forward_conv_2d_f16_f32(
13048
12740
  // size of the convolution row - the kernel size unrolled across all channels
13049
12741
  const int ew0 = nk0*nk1*ne02;
13050
12742
 
13051
- const int32_t s0 = ((const int32_t*)(opt0->data))[0];
13052
- const int32_t s1 = ((const int32_t*)(opt0->data))[1];
13053
- const int32_t p0 = ((const int32_t*)(opt0->data))[2];
13054
- const int32_t p1 = ((const int32_t*)(opt0->data))[3];
13055
- const int32_t d0 = ((const int32_t*)(opt0->data))[4];
13056
- const int32_t d1 = ((const int32_t*)(opt0->data))[5];
12743
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12744
+ const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
12745
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
12746
+ const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
12747
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
12748
+ const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
13057
12749
 
13058
12750
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13059
12751
  GGML_ASSERT(nb10 == sizeof(float));
@@ -13125,17 +12817,15 @@ static void ggml_compute_forward_conv_2d(
13125
12817
  const struct ggml_compute_params * params,
13126
12818
  const struct ggml_tensor * src0,
13127
12819
  const struct ggml_tensor * src1,
13128
- const struct ggml_tensor * opt0,
13129
- struct ggml_tensor * dst
13130
- ) {
12820
+ struct ggml_tensor * dst) {
13131
12821
  switch (src0->type) {
13132
12822
  case GGML_TYPE_F16:
13133
12823
  {
13134
- ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst);
12824
+ ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst);
13135
12825
  } break;
13136
12826
  case GGML_TYPE_F32:
13137
12827
  {
13138
- //ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst);
12828
+ //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
13139
12829
  GGML_ASSERT(false);
13140
12830
  } break;
13141
12831
  default:
@@ -13200,12 +12890,11 @@ static void ggml_compute_forward_pool_1d_sk_p0(
13200
12890
  // ggml_compute_forward_pool_1d
13201
12891
 
13202
12892
  static void ggml_compute_forward_pool_1d(
13203
- const struct ggml_compute_params* params,
13204
- const struct ggml_tensor* src0,
13205
- const struct ggml_tensor* opt0,
13206
- struct ggml_tensor* dst) {
13207
- GGML_ASSERT(opt0->ne[0] == 4);
13208
- const int* opts = (const int*)opt0->data;
12893
+ const struct ggml_compute_params * params,
12894
+ const struct ggml_tensor * src0,
12895
+ struct ggml_tensor * dst) {
12896
+
12897
+ const int32_t* opts = (const int32_t*)dst->op_params;
13209
12898
  enum ggml_op_pool op = opts[0];
13210
12899
  const int k0 = opts[1];
13211
12900
  const int s0 = opts[2];
@@ -13219,12 +12908,12 @@ static void ggml_compute_forward_pool_1d(
13219
12908
  // ggml_compute_forward_pool_2d_sk_p0
13220
12909
 
13221
12910
  static void ggml_compute_forward_pool_2d_sk_p0(
13222
- const struct ggml_compute_params * params,
13223
- const enum ggml_op_pool op,
13224
- const struct ggml_tensor * src,
13225
- const int k0,
13226
- const int k1,
13227
- struct ggml_tensor * dst) {
12911
+ const struct ggml_compute_params * params,
12912
+ const enum ggml_op_pool op,
12913
+ const struct ggml_tensor * src,
12914
+ const int k0,
12915
+ const int k1,
12916
+ struct ggml_tensor * dst) {
13228
12917
  assert(src->type == GGML_TYPE_F32);
13229
12918
  assert(params->ith == 0);
13230
12919
 
@@ -13284,12 +12973,11 @@ static void ggml_compute_forward_pool_2d_sk_p0(
13284
12973
  // ggml_compute_forward_pool_2d
13285
12974
 
13286
12975
  static void ggml_compute_forward_pool_2d(
13287
- const struct ggml_compute_params * params,
13288
- const struct ggml_tensor * src0,
13289
- const struct ggml_tensor * opt0,
13290
- struct ggml_tensor * dst) {
13291
- GGML_ASSERT(opt0->ne[0] == 7);
13292
- const int* opts = (const int*)opt0->data;
12976
+ const struct ggml_compute_params * params,
12977
+ const struct ggml_tensor * src0,
12978
+ struct ggml_tensor * dst) {
12979
+
12980
+ const int32_t * opts = (const int32_t *)dst->op_params;
13293
12981
  enum ggml_op_pool op = opts[0];
13294
12982
  const int k0 = opts[1];
13295
12983
  const int k1 = opts[2];
@@ -13314,7 +13002,7 @@ static void ggml_compute_forward_flash_attn_f32(
13314
13002
  const struct ggml_tensor * k,
13315
13003
  const struct ggml_tensor * v,
13316
13004
  const bool masked,
13317
- struct ggml_tensor * dst) {
13005
+ struct ggml_tensor * dst) {
13318
13006
  int64_t t0 = ggml_perf_time_us();
13319
13007
  UNUSED(t0);
13320
13008
 
@@ -13492,7 +13180,7 @@ static void ggml_compute_forward_flash_attn_f16(
13492
13180
  const struct ggml_tensor * k,
13493
13181
  const struct ggml_tensor * v,
13494
13182
  const bool masked,
13495
- struct ggml_tensor * dst) {
13183
+ struct ggml_tensor * dst) {
13496
13184
  int64_t t0 = ggml_perf_time_us();
13497
13185
  UNUSED(t0);
13498
13186
 
@@ -14257,7 +13945,6 @@ static void ggml_compute_forward_flash_attn_back(
14257
13945
  static void ggml_compute_forward_win_part_f32(
14258
13946
  const struct ggml_compute_params * params,
14259
13947
  const struct ggml_tensor * src0,
14260
- const struct ggml_tensor * opt0,
14261
13948
  struct ggml_tensor * dst) {
14262
13949
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14263
13950
  return;
@@ -14266,9 +13953,9 @@ static void ggml_compute_forward_win_part_f32(
14266
13953
  GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
14267
13954
  GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
14268
13955
 
14269
- const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
14270
- const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
14271
- const int32_t w = ((const int32_t *)(opt0->data))[2];
13956
+ const int32_t nep0 = ((const int32_t *)(dst->op_params))[0];
13957
+ const int32_t nep1 = ((const int32_t *)(dst->op_params))[1];
13958
+ const int32_t w = ((const int32_t *)(dst->op_params))[2];
14272
13959
 
14273
13960
  assert(ne00 == ne0);
14274
13961
  assert(ne3 == nep0*nep1);
@@ -14302,12 +13989,11 @@ static void ggml_compute_forward_win_part_f32(
14302
13989
  static void ggml_compute_forward_win_part(
14303
13990
  const struct ggml_compute_params * params,
14304
13991
  const struct ggml_tensor * src0,
14305
- const struct ggml_tensor * opt0,
14306
13992
  struct ggml_tensor * dst) {
14307
13993
  switch (src0->type) {
14308
13994
  case GGML_TYPE_F32:
14309
13995
  {
14310
- ggml_compute_forward_win_part_f32(params, src0, opt0, dst);
13996
+ ggml_compute_forward_win_part_f32(params, src0, dst);
14311
13997
  } break;
14312
13998
  default:
14313
13999
  {
@@ -14321,7 +14007,6 @@ static void ggml_compute_forward_win_part(
14321
14007
  static void ggml_compute_forward_win_unpart_f32(
14322
14008
  const struct ggml_compute_params * params,
14323
14009
  const struct ggml_tensor * src0,
14324
- const struct ggml_tensor * opt0,
14325
14010
  struct ggml_tensor * dst) {
14326
14011
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14327
14012
  return;
@@ -14330,7 +14015,7 @@ static void ggml_compute_forward_win_unpart_f32(
14330
14015
  GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
14331
14016
  GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
14332
14017
 
14333
- const int32_t w = ((const int32_t *)(opt0->data))[0];
14018
+ const int32_t w = ((const int32_t *)(dst->op_params))[0];
14334
14019
 
14335
14020
  // padding
14336
14021
  const int px = (w - ne1%w)%w;
@@ -14364,12 +14049,67 @@ static void ggml_compute_forward_win_unpart_f32(
14364
14049
  static void ggml_compute_forward_win_unpart(
14365
14050
  const struct ggml_compute_params * params,
14366
14051
  const struct ggml_tensor * src0,
14367
- const struct ggml_tensor * opt0,
14368
14052
  struct ggml_tensor * dst) {
14369
14053
  switch (src0->type) {
14370
14054
  case GGML_TYPE_F32:
14371
14055
  {
14372
- ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst);
14056
+ ggml_compute_forward_win_unpart_f32(params, src0, dst);
14057
+ } break;
14058
+ default:
14059
+ {
14060
+ GGML_ASSERT(false);
14061
+ } break;
14062
+ }
14063
+ }
14064
+
14065
+ // ggml_compute_forward_unary
14066
+
14067
+ static void ggml_compute_forward_unary(
14068
+ const struct ggml_compute_params * params,
14069
+ const struct ggml_tensor * src0,
14070
+ struct ggml_tensor * dst) {
14071
+ const enum ggml_unary_op op = ggml_get_unary_op(dst);
14072
+
14073
+ switch (op) {
14074
+ case GGML_UNARY_OP_ABS:
14075
+ {
14076
+ ggml_compute_forward_abs(params, src0, dst);
14077
+ } break;
14078
+ case GGML_UNARY_OP_SGN:
14079
+ {
14080
+ ggml_compute_forward_sgn(params, src0, dst);
14081
+ } break;
14082
+ case GGML_UNARY_OP_NEG:
14083
+ {
14084
+ ggml_compute_forward_neg(params, src0, dst);
14085
+ } break;
14086
+ case GGML_UNARY_OP_STEP:
14087
+ {
14088
+ ggml_compute_forward_step(params, src0, dst);
14089
+ } break;
14090
+ case GGML_UNARY_OP_TANH:
14091
+ {
14092
+ ggml_compute_forward_tanh(params, src0, dst);
14093
+ } break;
14094
+ case GGML_UNARY_OP_ELU:
14095
+ {
14096
+ ggml_compute_forward_elu(params, src0, dst);
14097
+ } break;
14098
+ case GGML_UNARY_OP_RELU:
14099
+ {
14100
+ ggml_compute_forward_relu(params, src0, dst);
14101
+ } break;
14102
+ case GGML_UNARY_OP_GELU:
14103
+ {
14104
+ ggml_compute_forward_gelu(params, src0, dst);
14105
+ } break;
14106
+ case GGML_UNARY_OP_GELU_QUICK:
14107
+ {
14108
+ ggml_compute_forward_gelu_quick(params, src0, dst);
14109
+ } break;
14110
+ case GGML_UNARY_OP_SILU:
14111
+ {
14112
+ ggml_compute_forward_silu(params, src0, dst);
14373
14113
  } break;
14374
14114
  default:
14375
14115
  {
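
Together with ggml_unary_impl() earlier in the diff, the switch above folds the former dedicated ABS/SGN/NEG/STEP/TANH/ELU/RELU/GELU/GELU_QUICK/SILU ops into a single GGML_OP_UNARY node whose concrete activation travels as an int32 in op_params[0] (ggml_set_op_params_i32() on the build side, ggml_get_unary_op() on the compute side). A hedged build/dispatch sketch with toy names standing in for those helpers:

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    enum toy_unary_op { TOY_UNARY_OP_NEG, TOY_UNARY_OP_RELU };

    struct toy_tensor {
        int32_t op_params[8];                     /* stand-in for tensor->op_params */
        float   data[4];
    };

    /* build step: record which activation this node performs */
    static void toy_set_unary_op(struct toy_tensor * t, enum toy_unary_op op) {
        t->op_params[0] = (int32_t) op;
    }

    /* compute step: read the op back and dispatch on it */
    static void toy_compute_unary(const struct toy_tensor * src, struct toy_tensor * dst) {
        switch ((enum toy_unary_op) dst->op_params[0]) {
            case TOY_UNARY_OP_NEG:
                for (int i = 0; i < 4; ++i) dst->data[i] = -src->data[i];
                break;
            case TOY_UNARY_OP_RELU:
                for (int i = 0; i < 4; ++i) dst->data[i] = fmaxf(src->data[i], 0.0f);
                break;
        }
    }

    int main(void) {
        struct toy_tensor a = { .data = { -2.0f, -0.5f, 0.5f, 2.0f } };
        struct toy_tensor r = { 0 };
        toy_set_unary_op(&r, TOY_UNARY_OP_RELU);
        toy_compute_unary(&a, &r);
        for (int i = 0; i < 4; ++i) {
            printf("%g ", r.data[i]);             /* prints 0 0 0.5 2 */
        }
        printf("\n");
        return 0;
    }
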
@@ -14888,7 +14628,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14888
14628
  } break;
14889
14629
  case GGML_OP_ACC:
14890
14630
  {
14891
- ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14631
+ ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor);
14892
14632
  } break;
14893
14633
  case GGML_OP_SUB:
14894
14634
  {
@@ -14938,46 +14678,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14938
14678
  {
14939
14679
  ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
14940
14680
  } break;
14941
- case GGML_OP_ABS:
14942
- {
14943
- ggml_compute_forward_abs(params, tensor->src[0], tensor);
14944
- } break;
14945
- case GGML_OP_SGN:
14946
- {
14947
- ggml_compute_forward_sgn(params, tensor->src[0], tensor);
14948
- } break;
14949
- case GGML_OP_NEG:
14950
- {
14951
- ggml_compute_forward_neg(params, tensor->src[0], tensor);
14952
- } break;
14953
- case GGML_OP_STEP:
14954
- {
14955
- ggml_compute_forward_step(params, tensor->src[0], tensor);
14956
- } break;
14957
- case GGML_OP_TANH:
14958
- {
14959
- ggml_compute_forward_tanh(params, tensor->src[0], tensor);
14960
- } break;
14961
- case GGML_OP_ELU:
14962
- {
14963
- ggml_compute_forward_elu(params, tensor->src[0], tensor);
14964
- } break;
14965
- case GGML_OP_RELU:
14966
- {
14967
- ggml_compute_forward_relu(params, tensor->src[0], tensor);
14968
- } break;
14969
- case GGML_OP_GELU:
14970
- {
14971
- ggml_compute_forward_gelu(params, tensor->src[0], tensor);
14972
- } break;
14973
- case GGML_OP_GELU_QUICK:
14974
- {
14975
- ggml_compute_forward_gelu_quick(params, tensor->src[0], tensor);
14976
- } break;
14977
- case GGML_OP_SILU:
14978
- {
14979
- ggml_compute_forward_silu(params, tensor->src[0], tensor);
14980
- } break;
14981
14681
  case GGML_OP_SILU_BACK:
14982
14682
  {
14983
14683
  ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
@@ -15008,7 +14708,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15008
14708
  } break;
15009
14709
  case GGML_OP_SET:
15010
14710
  {
15011
- ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14711
+ ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor);
15012
14712
  } break;
15013
14713
  case GGML_OP_CPY:
15014
14714
  {
@@ -15048,11 +14748,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15048
14748
  } break;
15049
14749
  case GGML_OP_DIAG_MASK_INF:
15050
14750
  {
15051
- ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor->src[1], tensor);
14751
+ ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor);
15052
14752
  } break;
15053
14753
  case GGML_OP_DIAG_MASK_ZERO:
15054
14754
  {
15055
- ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor->src[1], tensor);
14755
+ ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor);
15056
14756
  } break;
15057
14757
  case GGML_OP_SOFT_MAX:
15058
14758
  {
@@ -15064,39 +14764,39 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15064
14764
  } break;
15065
14765
  case GGML_OP_ROPE:
15066
14766
  {
15067
- ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor);
14767
+ ggml_compute_forward_rope(params, tensor->src[0], tensor);
15068
14768
  } break;
15069
14769
  case GGML_OP_ROPE_BACK:
15070
14770
  {
15071
- ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor);
14771
+ ggml_compute_forward_rope_back(params, tensor->src[0], tensor);
15072
14772
  } break;
15073
14773
  case GGML_OP_ALIBI:
15074
14774
  {
15075
- ggml_compute_forward_alibi(params, tensor->src[0], tensor->src[1], tensor);
14775
+ ggml_compute_forward_alibi(params, tensor->src[0], tensor);
15076
14776
  } break;
15077
14777
  case GGML_OP_CLAMP:
15078
14778
  {
15079
- ggml_compute_forward_clamp(params, tensor->src[0], tensor->src[1], tensor);
14779
+ ggml_compute_forward_clamp(params, tensor->src[0], tensor);
15080
14780
  } break;
15081
14781
  case GGML_OP_CONV_1D:
15082
14782
  {
15083
- ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14783
+ ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
15084
14784
  } break;
15085
14785
  case GGML_OP_CONV_2D:
15086
14786
  {
15087
- ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14787
+ ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
15088
14788
  } break;
15089
14789
  case GGML_OP_POOL_1D:
15090
14790
  {
15091
- ggml_compute_forward_pool_1d(params, tensor->src[0], tensor->src[1], tensor);
14791
+ ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
15092
14792
  } break;
15093
14793
  case GGML_OP_POOL_2D:
15094
14794
  {
15095
- ggml_compute_forward_pool_2d(params, tensor->src[0], tensor->src[1], tensor);
14795
+ ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
15096
14796
  } break;
15097
14797
  case GGML_OP_FLASH_ATTN:
15098
14798
  {
15099
- const int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
14799
+ const int32_t t = ggml_get_op_params_i32(tensor, 0);
15100
14800
  GGML_ASSERT(t == 0 || t == 1);
15101
14801
  const bool masked = t != 0;
15102
14802
  ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
@@ -15107,47 +14807,56 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15107
14807
  } break;
15108
14808
  case GGML_OP_FLASH_ATTN_BACK:
15109
14809
  {
15110
- int32_t t = ggml_get_i32_1d(tensor->src[4], 0);
14810
+ int32_t t = ggml_get_op_params_i32(tensor, 0);
15111
14811
  GGML_ASSERT(t == 0 || t == 1);
15112
14812
  bool masked = t != 0;
15113
14813
  ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
15114
14814
  } break;
15115
14815
  case GGML_OP_WIN_PART:
15116
14816
  {
15117
- ggml_compute_forward_win_part(params, tensor->src[0], tensor->src[2], tensor);
14817
+ ggml_compute_forward_win_part(params, tensor->src[0], tensor);
15118
14818
  } break;
15119
14819
  case GGML_OP_WIN_UNPART:
15120
14820
  {
15121
- ggml_compute_forward_win_unpart(params, tensor->src[0], tensor->src[2], tensor);
14821
+ ggml_compute_forward_win_unpart(params, tensor->src[0], tensor);
14822
+ } break;
14823
+ case GGML_OP_UNARY:
14824
+ {
14825
+ ggml_compute_forward_unary(params, tensor->src[0], tensor);
15122
14826
  } break;
15123
14827
  case GGML_OP_MAP_UNARY:
15124
14828
  {
15125
- const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->src[2]->data);
14829
+ ggml_unary_op_f32_t fun;
14830
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15126
14831
  ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun);
15127
14832
  }
15128
14833
  break;
15129
14834
  case GGML_OP_MAP_BINARY:
15130
14835
  {
15131
- const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->src[2]->data);
14836
+ ggml_binary_op_f32_t fun;
14837
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15132
14838
  ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
15133
14839
  }
15134
14840
  break;
15135
14841
  case GGML_OP_MAP_CUSTOM1:
15136
14842
  {
15137
- const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->src[2]->data);
14843
+ ggml_custom1_op_f32_t fun;
14844
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15138
14845
  ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun);
15139
14846
  }
15140
14847
  break;
15141
14848
  case GGML_OP_MAP_CUSTOM2:
15142
14849
  {
15143
- const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->src[2]->data);
14850
+ ggml_custom2_op_f32_t fun;
14851
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15144
14852
  ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun);
15145
14853
  }
15146
14854
  break;
15147
14855
  case GGML_OP_MAP_CUSTOM3:
15148
14856
  {
15149
- const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->src[2]->data);
15150
- ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[3], tensor, fun);
14857
+ ggml_custom3_op_f32_t fun;
14858
+ memcpy(&fun, tensor->op_params, sizeof(fun));
14859
+ ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
15151
14860
  }
15152
14861
  break;
15153
14862
  case GGML_OP_CROSS_ENTROPY_LOSS:
@@ -15211,12 +14920,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15211
14920
  src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
15212
14921
  }
15213
14922
  if (src1->grad) {
15214
- GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5);
15215
- GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32);
15216
- const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0];
15217
- const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1];
15218
- const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2];
15219
- const size_t offset = (( int32_t * ) tensor->src[2]->data)[3];
14923
+ const size_t nb1 = ((int32_t *) tensor->op_params)[0];
14924
+ const size_t nb2 = ((int32_t *) tensor->op_params)[1];
14925
+ const size_t nb3 = ((int32_t *) tensor->op_params)[2];
14926
+ const size_t offset = ((int32_t *) tensor->op_params)[3];
15220
14927
 
15221
14928
  struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
15222
14929
  tensor->grad,
@@ -15365,73 +15072,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15365
15072
  inplace);
15366
15073
  }
15367
15074
  } break;
15368
- case GGML_OP_ABS:
15369
- {
15370
- if (src0->grad) {
15371
- src0->grad =
15372
- ggml_add_impl(ctx,
15373
- src0->grad,
15374
- ggml_mul(ctx,
15375
- ggml_sgn(ctx, src0),
15376
- tensor->grad),
15377
- inplace);
15378
- }
15379
- } break;
15380
- case GGML_OP_SGN:
15381
- {
15382
- if (src0->grad) {
15383
- // noop
15384
- }
15385
- } break;
15386
- case GGML_OP_NEG:
15387
- {
15388
- if (src0->grad) {
15389
- src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace);
15390
- }
15391
- } break;
15392
- case GGML_OP_STEP:
15393
- {
15394
- if (src0->grad) {
15395
- // noop
15396
- }
15397
- } break;
15398
- case GGML_OP_TANH:
15399
- {
15400
- GGML_ASSERT(false); // TODO: not implemented
15401
- } break;
15402
- case GGML_OP_ELU:
15403
- {
15404
- GGML_ASSERT(false); // TODO: not implemented
15405
- } break;
15406
- case GGML_OP_RELU:
15407
- {
15408
- if (src0->grad) {
15409
- src0->grad = ggml_sub_impl(ctx,
15410
- src0->grad,
15411
- ggml_mul(ctx,
15412
- ggml_step(ctx, src0),
15413
- tensor->grad),
15414
- inplace);
15415
- }
15416
- } break;
15417
- case GGML_OP_GELU:
15418
- {
15419
- GGML_ASSERT(false); // TODO: not implemented
15420
- } break;
15421
- case GGML_OP_GELU_QUICK:
15422
- {
15423
- GGML_ASSERT(false); // TODO: not implemented
15424
- } break;
15425
- case GGML_OP_SILU:
15426
- {
15427
- // necessary for llama
15428
- if (src0->grad) {
15429
- src0->grad = ggml_add_impl(ctx,
15430
- src0->grad,
15431
- ggml_silu_back(ctx, src0, tensor->grad),
15432
- inplace);
15433
- }
15434
- } break;
15435
15075
  case GGML_OP_SILU_BACK:
15436
15076
  {
15437
15077
  GGML_ASSERT(false); // TODO: not implemented
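
The ABS/SGN/NEG/STEP/TANH/ELU/RELU/GELU/GELU_QUICK/SILU cases removed above are not gone: their backward rules reappear further down, nested under the single GGML_OP_UNARY case and keyed on ggml_get_unary_op(tensor). As a reminder of what one of those relocated rules computes, here is a standalone worked example of the RELU gradient, d/dx relu(x) = step(x), which the removed GGML_OP_RELU block expressed with ggml_mul(ctx, ggml_step(ctx, src0), tensor->grad):

    #include <stdio.h>

    int main(void) {
        const float x[4]  = { -1.0f, 0.5f, 2.0f, -3.0f };   /* forward input */
        const float dy[4] = {  1.0f, 1.0f, 1.0f,  1.0f };   /* upstream grad */
        float dx[4];

        for (int i = 0; i < 4; ++i) {
            dx[i] = (x[i] > 0.0f ? 1.0f : 0.0f) * dy[i];     /* step(x) * dy  */
        }

        for (int i = 0; i < 4; ++i) {
            printf("x=% .1f  drelu/dx=%.1f\n", x[i], dx[i]);
        }
        return 0;
    }
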
@@ -15524,12 +15164,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15524
15164
  } break;
15525
15165
  case GGML_OP_SET:
15526
15166
  {
15527
- GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5);
15528
- GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32);
15529
- const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0];
15530
- const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1];
15531
- const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2];
15532
- const size_t offset = (( int32_t * ) tensor->src[2]->data)[3];
15167
+ const size_t nb1 = ((int32_t *) tensor->op_params)[0];
15168
+ const size_t nb2 = ((int32_t *) tensor->op_params)[1];
15169
+ const size_t nb3 = ((int32_t *) tensor->op_params)[2];
15170
+ const size_t offset = ((int32_t *) tensor->op_params)[3];
15533
15171
 
15534
15172
  struct ggml_tensor * tensor_grad_view = NULL;
15535
15173
 
@@ -15606,8 +15244,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15606
15244
  if (src0->grad) {
15607
15245
  size_t offset;
15608
15246
 
15609
- GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->src[2]));
15610
- memcpy(&offset, tensor->src[2]->data, sizeof(offset));
15247
+ memcpy(&offset, tensor->op_params, sizeof(offset));
15611
15248
 
15612
15249
  size_t nb1 = tensor->nb[1];
15613
15250
  size_t nb2 = tensor->nb[2];
@@ -15634,7 +15271,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15634
15271
  {
15635
15272
  // necessary for llama
15636
15273
  if (src0->grad) {
15637
- int32_t * axes = (int32_t *) tensor->src[2]->data;
15274
+ int32_t * axes = (int32_t *) tensor->op_params;
15638
15275
  int axis0 = axes[0] & 0x3;
15639
15276
  int axis1 = axes[1] & 0x3;
15640
15277
  int axis2 = axes[2] & 0x3;
@@ -15690,33 +15327,23 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15690
15327
  {
15691
15328
  // necessary for llama
15692
15329
  if (src0->grad) {
15693
- assert(src1->type == GGML_TYPE_I32);
15694
- assert(ggml_nelements(src1) == 2);
15695
- const int n_past = ((int32_t *) src1->data)[0];
15330
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15696
15331
  src0->grad =
15697
15332
  ggml_add_impl(ctx, src0->grad,
15698
15333
  ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
15699
15334
  inplace);
15700
15335
  }
15701
- if (src1->grad) {
15702
- // noop
15703
- }
15704
15336
  } break;
15705
15337
  case GGML_OP_DIAG_MASK_ZERO:
15706
15338
  {
15707
15339
  // necessary for llama
15708
15340
  if (src0->grad) {
15709
- assert(src1->type == GGML_TYPE_I32);
15710
- assert(ggml_nelements(src1) == 2);
15711
- const int n_past = ((int32_t *) src1->data)[0];
15341
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15712
15342
  src0->grad =
15713
15343
  ggml_add_impl(ctx, src0->grad,
15714
15344
  ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
15715
15345
  inplace);
15716
15346
  }
15717
- if (src1->grad) {
15718
- // noop
15719
- }
15720
15347
  } break;
15721
15348
  case GGML_OP_SOFT_MAX:
15722
15349
  {
@@ -15737,12 +15364,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15737
15364
  {
15738
15365
  // necessary for llama
15739
15366
  if (src0->grad) {
15740
- assert(src1->type == GGML_TYPE_I32);
15741
- assert(ggml_nelements(src1) == 6);
15742
- const int n_past = ((int32_t *) src1->data)[0];
15743
- const int n_dims = ((int32_t *) src1->data)[1];
15744
- const int mode = ((int32_t *) src1->data)[2];
15745
- const int n_ctx = ((int32_t *) src1->data)[3];
15367
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15368
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
15369
+ const int mode = ((int32_t *) tensor->op_params)[2];
15370
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
15746
15371
  src0->grad = ggml_add_impl(ctx,
15747
15372
  src0->grad,
15748
15373
  ggml_rope_back(ctx,
@@ -15753,19 +15378,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15753
15378
  n_ctx),
15754
15379
  inplace);
15755
15380
  }
15756
- if (src1->grad) {
15757
- // noop
15758
- }
15759
15381
  } break;
15760
15382
  case GGML_OP_ROPE_BACK:
15761
15383
  {
15762
15384
  if (src0->grad) {
15763
- assert(src1->type == GGML_TYPE_I32);
15764
- assert(ggml_nelements(src1) == 4);
15765
- const int n_past = ((int32_t *) src1->data)[0];
15766
- const int n_dims = ((int32_t *) src1->data)[1];
15767
- const int mode = ((int32_t *) src1->data)[2];
15768
- const int n_ctx = ((int32_t *) src1->data)[3];
15385
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15386
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
15387
+ const int mode = ((int32_t *) tensor->op_params)[2];
15388
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
15769
15389
  src0->grad = ggml_add_impl(ctx,
15770
15390
  src0->grad,
15771
15391
  ggml_rope(ctx,
@@ -15776,9 +15396,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15776
15396
  n_ctx),
15777
15397
  inplace);
15778
15398
  }
15779
- if (src1->grad) {
15780
- // noop
15781
- }
15782
15399
  } break;
15783
15400
  case GGML_OP_ALIBI:
15784
15401
  {
@@ -15808,7 +15425,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15808
15425
  {
15809
15426
  struct ggml_tensor * flash_grad = NULL;
15810
15427
  if (src0->grad || src1->grad || tensor->src[2]->grad) {
15811
- int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
15428
+ int32_t t = ggml_get_op_params_i32(tensor, 0);
15812
15429
  GGML_ASSERT(t == 0 || t == 1);
15813
15430
  bool masked = t != 0;
15814
15431
  flash_grad =
@@ -15971,6 +15588,80 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15971
15588
  } break;
15972
15589
  case GGML_OP_WIN_PART:
15973
15590
  case GGML_OP_WIN_UNPART:
15591
+ case GGML_OP_UNARY:
15592
+ {
15593
+ switch (ggml_get_unary_op(tensor)) {
15594
+ case GGML_UNARY_OP_ABS:
15595
+ {
15596
+ if (src0->grad) {
15597
+ src0->grad =
15598
+ ggml_add_impl(ctx,
15599
+ src0->grad,
15600
+ ggml_mul(ctx,
15601
+ ggml_sgn(ctx, src0),
15602
+ tensor->grad),
15603
+ inplace);
15604
+ }
15605
+ } break;
15606
+ case GGML_UNARY_OP_SGN:
15607
+ {
15608
+ if (src0->grad) {
15609
+ // noop
15610
+ }
15611
+ } break;
15612
+ case GGML_UNARY_OP_NEG:
15613
+ {
15614
+ if (src0->grad) {
15615
+ src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace);
15616
+ }
15617
+ } break;
15618
+ case GGML_UNARY_OP_STEP:
15619
+ {
15620
+ if (src0->grad) {
15621
+ // noop
15622
+ }
15623
+ } break;
15624
+ case GGML_UNARY_OP_TANH:
15625
+ {
15626
+ GGML_ASSERT(false); // TODO: not implemented
15627
+ } break;
15628
+ case GGML_UNARY_OP_ELU:
15629
+ {
15630
+ GGML_ASSERT(false); // TODO: not implemented
15631
+ } break;
15632
+ case GGML_UNARY_OP_RELU:
15633
+ {
15634
+ if (src0->grad) {
15635
+ src0->grad = ggml_add_impl(ctx,
15636
+ src0->grad,
15637
+ ggml_mul(ctx,
15638
+ ggml_step(ctx, src0),
15639
+ tensor->grad),
15640
+ inplace);
15641
+ }
15642
+ } break;
15643
+ case GGML_UNARY_OP_GELU:
15644
+ {
15645
+ GGML_ASSERT(false); // TODO: not implemented
15646
+ } break;
15647
+ case GGML_UNARY_OP_GELU_QUICK:
15648
+ {
15649
+ GGML_ASSERT(false); // TODO: not implemented
15650
+ } break;
15651
+ case GGML_UNARY_OP_SILU:
15652
+ {
15653
+ // necessary for llama
15654
+ if (src0->grad) {
15655
+ src0->grad = ggml_add_impl(ctx,
15656
+ src0->grad,
15657
+ ggml_silu_back(ctx, src0, tensor->grad),
15658
+ inplace);
15659
+ }
15660
+ } break;
15661
+ default:
15662
+ GGML_ASSERT(false);
15663
+ }
15664
+ } break;
15974
15665
  case GGML_OP_MAP_UNARY:
15975
15666
  case GGML_OP_MAP_BINARY:
15976
15667
  case GGML_OP_MAP_CUSTOM1:
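
The block added above restores every activation gradient under one GGML_OP_UNARY case, re-dispatching on ggml_get_unary_op(tensor). The only non-trivial rule among them is SILU, which defers to ggml_silu_back; a standalone sketch of the element-wise derivative it propagates, d/dx [x * sigmoid(x)] = sigmoid(x) * (1 + x * (1 - sigmoid(x))):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const float x[3]  = { -2.0f, 0.0f, 3.0f };
        const float dy[3] = {  1.0f, 1.0f, 1.0f };            /* upstream gradient */

        for (int i = 0; i < 3; ++i) {
            const float s  = 1.0f / (1.0f + expf(-x[i]));     /* sigmoid(x)        */
            const float dx = s * (1.0f + x[i] * (1.0f - s)) * dy[i];
            printf("x=% .1f  dsilu/dx=% .4f\n", x[i], dx);
        }
        return 0;                                             /* link with -lm     */
    }
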
@@ -16006,6 +15697,34 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
16006
15697
  }
16007
15698
  }
16008
15699
 
15700
+ static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small");
15701
+
15702
+ static size_t hash(void * p) {
15703
+ return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
15704
+ }
15705
+
15706
+ static bool hash_insert(void * hash_table[], void * p) {
15707
+ size_t h = hash(p);
15708
+
15709
+ // linear probing
15710
+ size_t i = h;
15711
+ while (hash_table[i] != NULL && hash_table[i] != p) {
15712
+ i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
15713
+ if (i == h) {
15714
+ // hash table is full
15715
+ GGML_ASSERT(false);
15716
+ }
15717
+ }
15718
+
15719
+ if (hash_table[i] == p) {
15720
+ return true;
15721
+ }
15722
+
15723
+ // insert
15724
+ hash_table[i] = p;
15725
+ return false;
15726
+ }
15727
+
16009
15728
  static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
16010
15729
  if (node->grad == NULL) {
16011
15730
  // this usually happens when we generate intermediate nodes from constants in the backward pass
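
The hunk above replaces the graph's linear "already visited?" scans with a fixed-size open-addressing hash table over tensor pointers (the next hunk switches ggml_visit_parents over to it). A standalone toy illustration of the same linear-probing insert, with a deliberately tiny table; the real table has GGML_GRAPH_HASHTABLE_SIZE entries and, unlike this sketch, asserts when it is full:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    #define TABLE_SIZE 8                          /* toy size, for illustration only */

    static size_t hash_ptr(void * p) {
        return (size_t) p % TABLE_SIZE;
    }

    /* returns true if p was already in the table, false if it was just inserted */
    static bool insert(void * table[TABLE_SIZE], void * p) {
        size_t i = hash_ptr(p);
        while (table[i] != NULL && table[i] != p) {
            i = (i + 1) % TABLE_SIZE;             /* linear probing */
        }
        if (table[i] == p) {
            return true;                          /* already visited */
        }
        table[i] = p;                             /* first visit */
        return false;
    }

    int main(void) {
        void * table[TABLE_SIZE] = { NULL };
        int a, b;

        printf("%d\n", insert(table, &a));        /* 0: first time &a is seen      */
        printf("%d\n", insert(table, &b));        /* 0: first time &b is seen      */
        printf("%d\n", insert(table, &a));        /* 1: &a is already in the table */
        return 0;
    }
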
@@ -16016,16 +15735,8 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
16016
15735
  }
16017
15736
 
16018
15737
  // check if already visited
16019
- for (int i = 0; i < cgraph->n_nodes; i++) {
16020
- if (cgraph->nodes[i] == node) {
16021
- return;
16022
- }
16023
- }
16024
-
16025
- for (int i = 0; i < cgraph->n_leafs; i++) {
16026
- if (cgraph->leafs[i] == node) {
16027
- return;
16028
- }
15738
+ if (hash_insert(cgraph->visited_hash_table, node)) {
15739
+ return;
16029
15740
  }
16030
15741
 
16031
15742
  for (int i = 0; i < GGML_MAX_SRC; ++i) {
@@ -16088,6 +15799,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
16088
15799
  /*.nodes =*/ { NULL },
16089
15800
  /*.grads =*/ { NULL },
16090
15801
  /*.leafs =*/ { NULL },
15802
+ /*.hash_table =*/ { NULL },
16091
15803
  /*.perf_runs =*/ 0,
16092
15804
  /*.perf_cycles =*/ 0,
16093
15805
  /*.perf_time_us =*/ 0,
@@ -16129,13 +15841,42 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
16129
15841
 
16130
15842
  if (node->is_param) {
16131
15843
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
16132
- ggml_build_forward_impl(&result, node->grad, true);
15844
+ ggml_build_forward_expand(&result, node->grad);
16133
15845
  }
16134
15846
  }
16135
15847
 
16136
15848
  return result;
16137
15849
  }
16138
15850
 
15851
+ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15852
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
15853
+ struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
15854
+
15855
+ *cgraph = (struct ggml_cgraph) {
15856
+ /*.n_nodes =*/ 0,
15857
+ /*.n_leafs =*/ 0,
15858
+ /*.nodes =*/ { NULL },
15859
+ /*.grads =*/ { NULL },
15860
+ /*.leafs =*/ { NULL },
15861
+ /*.hash_table =*/ { NULL },
15862
+ /*.perf_runs =*/ 0,
15863
+ /*.perf_cycles =*/ 0,
15864
+ /*.perf_time_us =*/ 0,
15865
+ };
15866
+
15867
+ return cgraph;
15868
+ }
15869
+
15870
+ struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
15871
+ struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
15872
+ ggml_build_forward_impl(cgraph, tensor, false);
15873
+ return cgraph;
15874
+ }
15875
+
15876
+ size_t ggml_graph_overhead(void) {
15877
+ return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
15878
+ }
15879
+
16139
15880
  //
16140
15881
  // thread data
16141
15882
  //
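
With ggml_new_graph(), ggml_build_forward_ctx(), and ggml_graph_overhead() added above, a compute graph can live inside the same ggml_context as its tensors instead of on the caller's stack. A hedged usage sketch, assuming the usual ggml_init()/ggml_new_tensor_1d()/ggml_set_f32() helpers from ggml.h; the extra head-room in mem_size also covers the work buffer that ggml_graph_compute_with_ctx() carves out of the context (see the later hunk):

    #include "ggml.h"

    void example(void) {
        struct ggml_init_params ip = {
            /*.mem_size   =*/ ggml_graph_overhead() + 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * a = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8), 1.0f);
        struct ggml_tensor * b = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8), 2.0f);
        struct ggml_tensor * c = ggml_add(ctx, a, b);

        /* the cgraph is allocated from ctx as a GGML_OBJECT_GRAPH object */
        struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx, c);

        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 4);

        ggml_free(ctx);
    }
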
@@ -16201,7 +15942,7 @@ typedef pthread_t ggml_thread_t;
16201
15942
 
16202
15943
  // Android's libc implementation "bionic" does not support setting affinity
16203
15944
  #if defined(__linux__) && !defined(__BIONIC__)
16204
- void set_numa_thread_affinity(int thread_n, int n_threads) {
15945
+ static void set_numa_thread_affinity(int thread_n, int n_threads) {
16205
15946
  if (!ggml_is_numa()) {
16206
15947
  return;
16207
15948
  }
@@ -16226,7 +15967,7 @@ void set_numa_thread_affinity(int thread_n, int n_threads) {
16226
15967
  CPU_FREE(cpus);
16227
15968
  }
16228
15969
 
16229
- void clear_numa_thread_affinity(void) {
15970
+ static void clear_numa_thread_affinity(void) {
16230
15971
  if (!ggml_is_numa()) {
16231
15972
  return;
16232
15973
  }
@@ -16250,8 +15991,8 @@ void clear_numa_thread_affinity(void) {
16250
15991
  #else
16251
15992
  // TODO: Windows etc.
16252
15993
  // (the linux implementation may also work on BSD, someone should test)
16253
- void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
16254
- void clear_numa_thread_affinity(void) {}
15994
+ static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
15995
+ static void clear_numa_thread_affinity(void) {}
16255
15996
  #endif
16256
15997
 
16257
15998
  struct ggml_compute_state_shared {
@@ -16463,21 +16204,34 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16463
16204
  case GGML_OP_ARGMAX:
16464
16205
  case GGML_OP_REPEAT:
16465
16206
  case GGML_OP_REPEAT_BACK:
16466
- case GGML_OP_ABS:
16467
- case GGML_OP_SGN:
16468
- case GGML_OP_NEG:
16469
- case GGML_OP_STEP:
16470
- case GGML_OP_TANH:
16471
- case GGML_OP_ELU:
16472
- case GGML_OP_RELU:
16473
- {
16207
+ {
16474
16208
  n_tasks = 1;
16475
16209
  } break;
16476
- case GGML_OP_MUL:
16477
- case GGML_OP_GELU:
16478
- case GGML_OP_GELU_QUICK:
16479
- case GGML_OP_SILU:
16210
+
16211
+ case GGML_OP_UNARY:
16212
+ {
16213
+ switch (ggml_get_unary_op(node)) {
16214
+ case GGML_UNARY_OP_ABS:
16215
+ case GGML_UNARY_OP_SGN:
16216
+ case GGML_UNARY_OP_NEG:
16217
+ case GGML_UNARY_OP_STEP:
16218
+ case GGML_UNARY_OP_TANH:
16219
+ case GGML_UNARY_OP_ELU:
16220
+ case GGML_UNARY_OP_RELU:
16221
+ {
16222
+ n_tasks = 1;
16223
+ } break;
16224
+
16225
+ case GGML_UNARY_OP_GELU:
16226
+ case GGML_UNARY_OP_GELU_QUICK:
16227
+ case GGML_UNARY_OP_SILU:
16228
+ {
16229
+ n_tasks = n_threads;
16230
+ } break;
16231
+ }
16232
+ } break;
16480
16233
  case GGML_OP_SILU_BACK:
16234
+ case GGML_OP_MUL:
16481
16235
  case GGML_OP_NORM:
16482
16236
  case GGML_OP_RMS_NORM:
16483
16237
  case GGML_OP_RMS_NORM_BACK:
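
In the planner hunk above, the cheap element-wise activations under GGML_OP_UNARY keep n_tasks = 1, while GELU, GELU_QUICK and SILU fan out across all n_threads. For the multi-threaded cases, work is split by giving each task a contiguous chunk of rows; a standalone sketch of that partitioning arithmetic (the rounded-up row split commonly used inside the ggml kernels, shown here only as an illustration):

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
        const int nr  = 10;                       /* rows in the tensor      */
        const int nth = 4;                        /* n_tasks for this node   */

        const int dr = (nr + nth - 1) / nth;      /* rows per task, rounded up */
        for (int ith = 0; ith < nth; ++ith) {
            const int ir0 = dr * ith;
            const int ir1 = MIN(ir0 + dr, nr);
            printf("task %d: rows [%d, %d)\n", ith, ir0, ir1);
        }
        return 0;
    }
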
@@ -16542,10 +16296,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16542
16296
  case GGML_OP_GET_ROWS:
16543
16297
  case GGML_OP_GET_ROWS_BACK:
16544
16298
  case GGML_OP_DIAG:
16545
- case GGML_OP_DIAG_MASK_ZERO:
16546
16299
  {
16547
16300
  n_tasks = 1;
16548
16301
  } break;
16302
+ case GGML_OP_DIAG_MASK_ZERO:
16549
16303
  case GGML_OP_DIAG_MASK_INF:
16550
16304
  case GGML_OP_SOFT_MAX:
16551
16305
  case GGML_OP_SOFT_MAX_BACK:
@@ -16838,10 +16592,9 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
16838
16592
  void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
16839
16593
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
16840
16594
 
16841
- struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
16842
- GGML_ASSERT(buf);
16595
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
16843
16596
 
16844
- cplan.work_data = buf->data;
16597
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
16845
16598
 
16846
16599
  ggml_graph_compute(cgraph, &cplan);
16847
16600
  }
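
ggml_graph_compute_with_ctx() above now takes its scratch space from the context as a dedicated GGML_OBJECT_WORK_BUFFER object rather than a throwaway I8 tensor. Callers that prefer to own that buffer themselves can still plan explicitly; a hedged sketch of the manual path using ggml_graph_plan()/ggml_graph_compute() as they appear in this file:

    #include "ggml.h"
    #include <stdint.h>
    #include <stdlib.h>

    void compute_manually(struct ggml_cgraph * gf, int n_threads) {
        struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);

        uint8_t * work = NULL;
        if (cplan.work_size > 0) {
            work = malloc(cplan.work_size);       /* lifetime owned by the caller */
            cplan.work_data = work;
        }

        ggml_graph_compute(gf, &cplan);

        free(work);
    }
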
@@ -16992,7 +16745,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16992
16745
  fwrite(&nb, sizeof(uint64_t), 1, fout);
16993
16746
  }
16994
16747
 
16995
- fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16748
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16749
+ fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
16996
16750
 
16997
16751
  // dump the data
16998
16752
  // TODO: pad this to 32 byte boundary
@@ -17025,7 +16779,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
17025
16779
  fwrite(&nb, sizeof(uint64_t), 1, fout);
17026
16780
  }
17027
16781
 
17028
- fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16782
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16783
+ fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
17029
16784
 
17030
16785
  // output the op arguments
17031
16786
  {
@@ -17206,7 +16961,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17206
16961
 
17207
16962
  tensor->op = (enum ggml_op) op;
17208
16963
 
17209
- memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
16964
+ memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
16965
+ memcpy(tensor->op_params, ptr, GGML_MAX_OP_PARAMS); ptr += GGML_MAX_OP_PARAMS;
17210
16966
 
17211
16967
  tensor->data = (void *) ptr;
17212
16968
 
@@ -17251,7 +17007,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17251
17007
  nb[j] = nb_cur;
17252
17008
  }
17253
17009
 
17254
- const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
17010
+ const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
17011
+ const char * ptr_op_params = ptr; ptr += GGML_MAX_OP_PARAMS;
17255
17012
 
17256
17013
  const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t);
17257
17014
 
@@ -17288,8 +17045,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17288
17045
  {
17289
17046
  tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
17290
17047
 
17291
- uint64_t offs;
17292
- memcpy(&offs, args[2]->data, sizeof(offs));
17048
+ size_t offs;
17049
+ memcpy(&offs, ptr_op_params, sizeof(offs));
17293
17050
 
17294
17051
  tensor->data = ((char *) tensor->data) + offs;
17295
17052
  } break;
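
The export/import hunks above make tensor->op_params part of the serialized graph format: ggml_graph_export() writes the GGML_MAX_OP_PARAMS bytes immediately after the GGML_MAX_NAME name field, and ggml_graph_import() reads them back, which is how a re-imported VIEW recovers its byte offset without the old args[2] side tensor. A standalone sketch of that fixed-width record round-trip (field sizes below are illustrative stand-ins, not the real constants):

    #include <stdio.h>
    #include <string.h>

    #define MAX_NAME      64                      /* stand-in for GGML_MAX_NAME      */
    #define MAX_OP_PARAMS 64                      /* stand-in for GGML_MAX_OP_PARAMS */

    int main(void) {
        char name[MAX_NAME] = "node_0 (view)";
        char op_params[MAX_OP_PARAMS] = {0};
        const size_t offs = 128;                  /* e.g. a view byte offset */
        memcpy(op_params, &offs, sizeof(offs));

        FILE * f = fopen("record.bin", "wb");
        fwrite(name,      sizeof(char), MAX_NAME,      f);   /* name ...           */
        fwrite(op_params, sizeof(char), MAX_OP_PARAMS, f);   /* ... then op_params */
        fclose(f);

        char name_in[MAX_NAME];
        char params_in[MAX_OP_PARAMS];
        size_t offs_in = 0;

        f = fopen("record.bin", "rb");
        fread(name_in,   sizeof(char), MAX_NAME,      f);
        fread(params_in, sizeof(char), MAX_OP_PARAMS, f);
        fclose(f);
        memcpy(&offs_in, params_in, sizeof(offs_in));

        printf("%s offs=%zu\n", name_in, offs_in);
        return 0;
    }
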
@@ -17309,7 +17066,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17309
17066
  } break;
17310
17067
  }
17311
17068
 
17312
- memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
17069
+ memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
17070
+ memcpy(tensor->op_params, ptr_op_params, GGML_MAX_OP_PARAMS);
17313
17071
 
17314
17072
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
17315
17073
  tensor->nb[j] = nb[j];
@@ -17343,7 +17101,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17343
17101
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
17344
17102
  i,
17345
17103
  node->ne[0], node->ne[1], node->ne[2],
17346
- GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
17104
+ ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
17347
17105
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
17348
17106
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
17349
17107
  (double) node->perf_time_us / 1000.0,
@@ -17357,7 +17115,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17357
17115
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
17358
17116
  i,
17359
17117
  node->ne[0], node->ne[1],
17360
- GGML_OP_NAME[node->op]);
17118
+ ggml_op_name(node->op));
17361
17119
  }
17362
17120
 
17363
17121
  for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -17365,7 +17123,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17365
17123
  continue;
17366
17124
  }
17367
17125
 
17368
- GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
17126
+ GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0);
17369
17127
  }
17370
17128
 
17371
17129
  GGML_PRINT("========================================\n");
@@ -17459,13 +17217,13 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17459
17217
  }
17460
17218
 
17461
17219
  if (node->n_dims == 2) {
17462
- fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], GGML_OP_SYMBOL[node->op]);
17220
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
17463
17221
  } else {
17464
- fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
17222
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
17465
17223
  }
17466
17224
 
17467
17225
  if (node->grad) {
17468
- fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
17226
+ fprintf(fp, " | <g>%s\"; ]\n", ggml_op_symbol(node->grad->op));
17469
17227
  } else {
17470
17228
  fprintf(fp, "\"; ]\n");
17471
17229
  }
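
Finally, the printing and DOT-export hunks above stop indexing the file-scope GGML_OP_NAME/GGML_OP_SYMBOL tables directly and go through the ggml_op_name()/ggml_op_symbol() accessors instead, which keeps callers consistent with the shrunken GGML_OP_COUNT. A minimal sketch of accessor-based usage (ggml_op_name() is part of the public ggml.h API; whether ggml_op_symbol() is exported may depend on the ggml revision bundled here):

    #include "ggml.h"
    #include <stdio.h>

    void print_node_op(const struct ggml_tensor * t) {
        printf("%s: op=%s\n", t->name, ggml_op_name(t->op));
    }
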