llama_cpp 0.3.3 → 0.3.5

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
@@ -31,11 +31,17 @@
31
31
  #include <unistd.h>
32
32
  #endif
33
33
 
34
+ // static_assert should be a #define, but if it's not,
35
+ // fall back to the _Static_assert C11 keyword.
34
36
  // if C99 - static_assert is noop
35
37
  // ref: https://stackoverflow.com/a/53923785/4039976
36
38
  #ifndef static_assert
39
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
40
+ #define static_assert(cond, msg) _Static_assert(cond, msg)
41
+ #else
37
42
  #define static_assert(cond, msg) struct global_scope_noop_trick
38
43
  #endif
44
+ #endif
39
45
 
40
46
  #if defined(_MSC_VER)
41
47
  // disable "possible loss of data" to avoid hundreds of casts
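
Note: the new guard prefers the C11 _Static_assert keyword whenever the compiler advertises C11 support, and only otherwise falls back to the no-op trick. An illustration (not part of the diff) of what the pre-C11 fallback expands to:

    static_assert(sizeof(float) == 4, "float must be 4 bytes");
    // on a C99 compiler this expands to:  struct global_scope_noop_trick;
    // a harmless, repeatable forward declaration, so the check is silently skipped
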
@@ -112,10 +118,6 @@ typedef void * thread_ret_t;
112
118
  #endif
113
119
  #endif
114
120
 
115
- #ifdef __HAIKU__
116
- #define static_assert(cond, msg) _Static_assert(cond, msg)
117
- #endif
118
-
119
121
  /*#define GGML_PERF*/
120
122
  #define GGML_DEBUG 0
121
123
  #define GGML_GELU_FP16
@@ -3438,7 +3440,9 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
3438
3440
 
3439
3441
  //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
3440
3442
  inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
3441
- #if defined(GGML_SIMD)
3443
+ #if defined(GGML_USE_ACCELERATE)
3444
+ vDSP_vsmul(y, 1, &v, y, 1, n);
3445
+ #elif defined(GGML_SIMD)
3442
3446
  const int np = (n & ~(GGML_F32_STEP - 1));
3443
3447
 
3444
3448
  GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
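
Note: on Apple platforms the scale kernel now goes through Accelerate. The call vDSP_vsmul(y, 1, &v, y, 1, n) performs the same in-place update as the scalar loop in the commented-out reference above:

    // scalar reference for the Accelerate path (mirrors the commented-out line)
    for (int i = 0; i < n; ++i) {
        y[i] *= v;
    }
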
@@ -3601,7 +3605,7 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
3601
3605
  #endif
3602
3606
  }
3603
3607
 
3604
- inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x) {
3608
+ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
3605
3609
  ggml_float sum = 0.0;
3606
3610
  for (int i = 0; i < n; ++i) {
3607
3611
  sum += (ggml_float)x[i];
@@ -3609,6 +3613,14 @@ inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x
3609
3613
  *s = sum;
3610
3614
  }
3611
3615
 
3616
+ inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
3617
+ float sum = 0.0f;
3618
+ for (int i = 0; i < n; ++i) {
3619
+ sum += GGML_FP16_TO_FP32(x[i]);
3620
+ }
3621
+ *s = sum;
3622
+ }
3623
+
3612
3624
  inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
3613
3625
  #ifndef GGML_USE_ACCELERATE
3614
3626
  float max = -INFINITY;
@@ -3748,16 +3760,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3748
3760
  "ARGMAX",
3749
3761
  "REPEAT",
3750
3762
  "REPEAT_BACK",
3751
- "ABS",
3752
- "SGN",
3753
- "NEG",
3754
- "STEP",
3755
- "TANH",
3756
- "ELU",
3757
- "RELU",
3758
- "GELU",
3759
- "GELU_QUICK",
3760
- "SILU",
3761
3763
  "SILU_BACK",
3762
3764
  "NORM",
3763
3765
  "RMS_NORM",
@@ -3796,6 +3798,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3796
3798
  "WIN_PART",
3797
3799
  "WIN_UNPART",
3798
3800
 
3801
+ "UNARY",
3802
+
3799
3803
  "MAP_UNARY",
3800
3804
  "MAP_BINARY",
3801
3805
 
@@ -3807,7 +3811,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3807
3811
  "CROSS_ENTROPY_LOSS_BACK",
3808
3812
  };
3809
3813
 
3810
- static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
3814
+ static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
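
Note: GGML_OP_COUNT drops from 68 to 59 because the ten dedicated unary ops (ABS, SGN, NEG, STEP, TANH, ELU, RELU, GELU, GELU_QUICK, SILU) are removed from the tables and a single UNARY entry is added: 68 - 10 + 1 = 59.
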
3811
3815
 
3812
3816
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3813
3817
  "none",
@@ -3828,16 +3832,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3828
3832
  "argmax(x)",
3829
3833
  "repeat(x)",
3830
3834
  "repeat_back(x)",
3831
- "abs(x)",
3832
- "sgn(x)",
3833
- "-x",
3834
- "step(x)",
3835
- "tanh(x)",
3836
- "elu(x)",
3837
- "relu(x)",
3838
- "gelu(x)",
3839
- "gelu_quick(x)",
3840
- "silu(x)",
3841
3835
  "silu_back(x)",
3842
3836
  "norm(x)",
3843
3837
  "rms_norm(x)",
@@ -3876,6 +3870,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3876
3870
  "win_part(x)",
3877
3871
  "win_unpart(x)",
3878
3872
 
3873
+ "unary(x)",
3874
+
3879
3875
  "f(x)",
3880
3876
  "f(x,y)",
3881
3877
 
@@ -3887,7 +3883,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3887
3883
  "cross_entropy_loss_back(x,y)",
3888
3884
  };
3889
3885
 
3890
- static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
3886
+ static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
3891
3887
 
3892
3888
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
3893
3889
 
@@ -4075,8 +4071,8 @@ bool ggml_is_numa(void) {
4075
4071
  ////////////////////////////////////////////////////////////////////////////////
4076
4072
 
4077
4073
  void ggml_print_object(const struct ggml_object * obj) {
4078
- GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n",
4079
- obj->offs, obj->size, (const void *) obj->next);
4074
+ GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
4075
+ obj->type, obj->offs, obj->size, (const void *) obj->next);
4080
4076
  }
4081
4077
 
4082
4078
  void ggml_print_objects(const struct ggml_context * ctx) {
@@ -4143,6 +4139,10 @@ const char * ggml_op_name(enum ggml_op op) {
4143
4139
  return GGML_OP_NAME[op];
4144
4140
  }
4145
4141
 
4142
+ const char * ggml_op_symbol(enum ggml_op op) {
4143
+ return GGML_OP_SYMBOL[op];
4144
+ }
4145
+
4146
4146
  size_t ggml_element_size(const struct ggml_tensor * tensor) {
4147
4147
  return GGML_TYPE_SIZE[tensor->type];
4148
4148
  }
@@ -4212,7 +4212,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
4212
4212
  }
4213
4213
 
4214
4214
  size_t ggml_tensor_overhead(void) {
4215
- return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
4215
+ return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
4216
4216
  }
4217
4217
 
4218
4218
  bool ggml_is_transposed(const struct ggml_tensor * tensor) {
@@ -4229,6 +4229,15 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
4229
4229
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
4230
4230
  }
4231
4231
 
4232
+ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) {
4233
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4234
+
4235
+ return
4236
+ tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
4237
+ tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
4238
+ tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
4239
+ }
4240
+
4232
4241
  bool ggml_is_permuted(const struct ggml_tensor * tensor) {
4233
4242
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4234
4243
 
@@ -4374,7 +4383,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4374
4383
  return NULL;
4375
4384
  }
4376
4385
 
4377
- const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1);
4386
+ const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
4378
4387
 
4379
4388
  *ctx = (struct ggml_context) {
4380
4389
  /*.mem_size =*/ mem_size,
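
Note: GGML_PAD(x, GGML_MEM_ALIGN) rounds x up to the next multiple of the alignment, which is exactly what the removed bit-twiddling expression computed inline; the padding is now skipped when the caller supplies its own mem_buffer. A sketch, assuming the usual power-of-two alignment:

    // (x + align - 1) & ~(align - 1)  ==  smallest multiple of align that is >= x
    // e.g. with a 16-byte alignment: padding 1000 bytes yields 1008
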
@@ -4410,8 +4419,8 @@ void ggml_free(struct ggml_context * ctx) {
4410
4419
  if (&g_state.contexts[i].context == ctx) {
4411
4420
  g_state.contexts[i].used = false;
4412
4421
 
4413
- GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
4414
- __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
4422
+ GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
4423
+ __func__, i, ggml_used_mem(ctx));
4415
4424
 
4416
4425
  if (ctx->mem_buffer_owned) {
4417
4426
  GGML_ALIGNED_FREE(ctx->mem_buffer);
@@ -4441,6 +4450,10 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
4441
4450
  return result;
4442
4451
  }
4443
4452
 
4453
+ bool ggml_get_no_alloc(struct ggml_context * ctx) {
4454
+ return ctx->no_alloc;
4455
+ }
4456
+
4444
4457
  void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
4445
4458
  ctx->no_alloc = no_alloc;
4446
4459
  }
@@ -4459,12 +4472,14 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
4459
4472
  struct ggml_object * obj = ctx->objects_begin;
4460
4473
 
4461
4474
  while (obj != NULL) {
4462
- struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
4475
+ if (obj->type == GGML_OBJECT_TENSOR) {
4476
+ struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
4463
4477
 
4464
- const size_t size = ggml_nbytes(tensor);
4478
+ const size_t size = ggml_nbytes(tensor);
4465
4479
 
4466
- if (max_size < size) {
4467
- max_size = size;
4480
+ if (max_size < size) {
4481
+ max_size = size;
4482
+ }
4468
4483
  }
4469
4484
 
4470
4485
  obj = obj->next;
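
Note: ggml_object now carries a type, so walks over the context's object list have to skip entries that do not wrap a tensor (ggml_get_tensor gains the same check further down). A minimal sketch of the new pattern, with internal fields shown for illustration only:

    for (struct ggml_object * obj = ctx->objects_begin; obj != NULL; obj = obj->next) {
        if (obj->type != GGML_OBJECT_TENSOR) {
            continue; // skip objects that are not tensors
        }
        struct ggml_tensor * t = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
        // ... inspect t ...
    }
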
@@ -4478,7 +4493,7 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
4478
4493
  // this is an error prone process, but it is necessary to support inplace
4479
4494
  // operators when using scratch buffers
4480
4495
  // TODO: implement a better way
4481
- void ggml_scratch_save(struct ggml_context * ctx) {
4496
+ static void ggml_scratch_save(struct ggml_context * ctx) {
4482
4497
  // this is needed to allow opt tensors to store their data
4483
4498
  // TODO: again, need to find a better way
4484
4499
  ctx->no_alloc_save = ctx->no_alloc;
@@ -4488,7 +4503,7 @@ void ggml_scratch_save(struct ggml_context * ctx) {
4488
4503
  ctx->scratch.data = NULL;
4489
4504
  }
4490
4505
 
4491
- void ggml_scratch_load(struct ggml_context * ctx) {
4506
+ static void ggml_scratch_load(struct ggml_context * ctx) {
4492
4507
  ctx->no_alloc = ctx->no_alloc_save;
4493
4508
 
4494
4509
  ctx->scratch = ctx->scratch_save;
@@ -4496,12 +4511,7 @@ void ggml_scratch_load(struct ggml_context * ctx) {
4496
4511
 
4497
4512
  ////////////////////////////////////////////////////////////////////////////////
4498
4513
 
4499
- struct ggml_tensor * ggml_new_tensor_impl(
4500
- struct ggml_context * ctx,
4501
- enum ggml_type type,
4502
- int n_dims,
4503
- const int64_t* ne,
4504
- void* data) {
4514
+ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
4505
4515
  // always insert objects at the end of the context's memory pool
4506
4516
  struct ggml_object * obj_cur = ctx->objects_end;
4507
4517
 
@@ -4509,77 +4519,79 @@ struct ggml_tensor * ggml_new_tensor_impl(
4509
4519
  const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
4510
4520
  const size_t cur_end = cur_offs + cur_size;
4511
4521
 
4512
- size_t size_needed = 0;
4513
-
4514
- if (data == NULL && !ctx->no_alloc) {
4515
- size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
4516
- for (int i = 1; i < n_dims; i++) {
4517
- size_needed *= ne[i];
4518
- }
4519
- // align to GGML_MEM_ALIGN
4520
- size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN;
4521
- }
4522
+ // align to GGML_MEM_ALIGN
4523
+ size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
4522
4524
 
4523
4525
  char * const mem_buffer = ctx->mem_buffer;
4524
4526
  struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
4525
4527
 
4526
- if (ctx->scratch.data == NULL || data != NULL) {
4527
- size_needed += GGML_TENSOR_SIZE;
4528
+ if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
4529
+ GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4530
+ __func__, cur_end + size_needed, ctx->mem_size);
4531
+ assert(false);
4532
+ return NULL;
4533
+ }
4534
+
4535
+ *obj_new = (struct ggml_object) {
4536
+ .offs = cur_end + GGML_OBJECT_SIZE,
4537
+ .size = size_needed,
4538
+ .next = NULL,
4539
+ .type = type,
4540
+ };
4528
4541
 
4529
- if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
4530
- GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4531
- __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
4532
- assert(false);
4533
- return NULL;
4534
- }
4542
+ ggml_assert_aligned(mem_buffer + obj_new->offs);
4535
4543
 
4536
- *obj_new = (struct ggml_object) {
4537
- .offs = cur_end + GGML_OBJECT_SIZE,
4538
- .size = size_needed,
4539
- .next = NULL,
4540
- };
4544
+ if (obj_cur != NULL) {
4545
+ obj_cur->next = obj_new;
4541
4546
  } else {
4542
- if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
4543
- GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4544
- __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
4545
- assert(false);
4546
- return NULL;
4547
+ // this is the first object in this context
4548
+ ctx->objects_begin = obj_new;
4549
+ }
4550
+
4551
+ ctx->objects_end = obj_new;
4552
+
4553
+ //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
4554
+
4555
+ return obj_new;
4556
+ }
4557
+
4558
+ static struct ggml_tensor * ggml_new_tensor_impl(
4559
+ struct ggml_context * ctx,
4560
+ enum ggml_type type,
4561
+ int n_dims,
4562
+ const int64_t* ne,
4563
+ void* data) {
4564
+
4565
+ size_t data_size = 0;
4566
+
4567
+ if (data == NULL && !ctx->no_alloc) {
4568
+ data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
4569
+ for (int i = 1; i < n_dims; i++) {
4570
+ data_size *= ne[i];
4547
4571
  }
4572
+ }
4548
4573
 
4549
- if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
4550
- GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4551
- __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
4574
+ if (ctx->scratch.data != NULL && data == NULL) {
4575
+ // allocate tensor data in the scratch buffer
4576
+ if (ctx->scratch.offs + data_size > ctx->scratch.size) {
4577
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4578
+ __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
4552
4579
  assert(false);
4553
4580
  return NULL;
4554
4581
  }
4555
4582
 
4556
4583
  data = (char * const) ctx->scratch.data + ctx->scratch.offs;
4557
4584
 
4558
- *obj_new = (struct ggml_object) {
4559
- .offs = cur_end + GGML_OBJECT_SIZE,
4560
- .size = GGML_TENSOR_SIZE,
4561
- .next = NULL,
4562
- };
4563
-
4564
- //printf("scratch offs = %zu, size_needed = %zu\n", ctx->scratch.offs, size_needed);
4565
-
4566
- ctx->scratch.offs += size_needed;
4567
- }
4585
+ ctx->scratch.offs += data_size;
4568
4586
 
4569
- if (obj_cur != NULL) {
4570
- obj_cur->next = obj_new;
4571
- } else {
4572
- // this is the first object in this context
4573
- ctx->objects_begin = obj_new;
4587
+ data_size = 0;
4574
4588
  }
4575
4589
 
4576
- ctx->objects_end = obj_new;
4577
-
4578
- //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
4590
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);
4579
4591
 
4580
- struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs);
4592
+ // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
4581
4593
 
4582
- ggml_assert_aligned(result);
4594
+ struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
4583
4595
 
4584
4596
  *result = (struct ggml_tensor) {
4585
4597
  /*.type =*/ type,
@@ -4588,6 +4600,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
4588
4600
  /*.ne =*/ { 1, 1, 1, 1 },
4589
4601
  /*.nb =*/ { 0, 0, 0, 0 },
4590
4602
  /*.op =*/ GGML_OP_NONE,
4603
+ /*.op_params =*/ {0},
4591
4604
  /*.is_param =*/ false,
4592
4605
  /*.grad =*/ NULL,
4593
4606
  /*.src =*/ { NULL },
@@ -4618,6 +4631,21 @@ struct ggml_tensor * ggml_new_tensor_impl(
4618
4631
  return result;
4619
4632
  }
4620
4633
 
4634
+ static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
4635
+ assert(params_size <= GGML_MAX_OP_PARAMS);
4636
+ memcpy(tensor->op_params, params, params_size);
4637
+ }
4638
+
4639
+ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
4640
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4641
+ return ((const int32_t *)(tensor->op_params))[i];
4642
+ }
4643
+
4644
+ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
4645
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
4646
+ ((int32_t *)(tensor->op_params))[i] = value;
4647
+ }
4648
+
4621
4649
  struct ggml_tensor * ggml_new_tensor(
4622
4650
  struct ggml_context * ctx,
4623
4651
  enum ggml_type type,
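
Note: these helpers are the core of this release's refactor. Small per-operator settings (strides, offsets, flags) are now copied into the fixed-size tensor->op_params array instead of being stored in auxiliary GGML_TYPE_I32 tensors allocated from the scratch buffer. A sketch of how an op packs and later reads its parameters (mirrors the ggml_acc/ggml_set changes further down):

    // at graph-build time
    int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
    ggml_set_op_params(result, params, sizeof(params));

    // in the compute kernel
    const int32_t nb1_saved = ggml_get_op_params_i32(result, 0);
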
@@ -4949,6 +4977,11 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
4949
4977
  return (float *)(tensor->data);
4950
4978
  }
4951
4979
 
4980
+ enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
4981
+ GGML_ASSERT(tensor->op == GGML_OP_UNARY);
4982
+ return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
4983
+ }
4984
+
4952
4985
  const char * ggml_get_name(const struct ggml_tensor * tensor) {
4953
4986
  return tensor->name;
4954
4987
  }
@@ -4987,9 +5020,11 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
4987
5020
  char * const mem_buffer = ctx->mem_buffer;
4988
5021
 
4989
5022
  while (obj != NULL) {
4990
- struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
4991
- if (strcmp(cur->name, name) == 0) {
4992
- return cur;
5023
+ if (obj->type == GGML_OBJECT_TENSOR) {
5024
+ struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
5025
+ if (strcmp(cur->name, name) == 0) {
5026
+ return cur;
5027
+ }
4993
5028
  }
4994
5029
 
4995
5030
  obj = obj->next;
@@ -5002,7 +5037,7 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
5002
5037
 
5003
5038
  // ggml_dup
5004
5039
 
5005
- struct ggml_tensor * ggml_dup_impl(
5040
+ static struct ggml_tensor * ggml_dup_impl(
5006
5041
  struct ggml_context * ctx,
5007
5042
  struct ggml_tensor * a,
5008
5043
  bool inplace) {
@@ -5017,7 +5052,6 @@ struct ggml_tensor * ggml_dup_impl(
5017
5052
  result->op = GGML_OP_DUP;
5018
5053
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5019
5054
  result->src[0] = a;
5020
- result->src[1] = NULL;
5021
5055
 
5022
5056
  return result;
5023
5057
  }
@@ -5036,7 +5070,7 @@ struct ggml_tensor * ggml_dup_inplace(
5036
5070
 
5037
5071
  // ggml_add
5038
5072
 
5039
- struct ggml_tensor * ggml_add_impl(
5073
+ static struct ggml_tensor * ggml_add_impl(
5040
5074
  struct ggml_context * ctx,
5041
5075
  struct ggml_tensor * a,
5042
5076
  struct ggml_tensor * b,
@@ -5079,7 +5113,7 @@ struct ggml_tensor * ggml_add_inplace(
5079
5113
 
5080
5114
  // ggml_add1
5081
5115
 
5082
- struct ggml_tensor * ggml_add1_impl(
5116
+ static struct ggml_tensor * ggml_add1_impl(
5083
5117
  struct ggml_context * ctx,
5084
5118
  struct ggml_tensor * a,
5085
5119
  struct ggml_tensor * b,
@@ -5119,7 +5153,7 @@ struct ggml_tensor * ggml_add1_inplace(
5119
5153
 
5120
5154
  // ggml_acc
5121
5155
 
5122
- struct ggml_tensor * ggml_acc_impl(
5156
+ static struct ggml_tensor * ggml_acc_impl(
5123
5157
  struct ggml_context * ctx,
5124
5158
  struct ggml_tensor * a,
5125
5159
  struct ggml_tensor * b,
@@ -5141,23 +5175,13 @@ struct ggml_tensor * ggml_acc_impl(
5141
5175
 
5142
5176
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5143
5177
 
5144
- ggml_scratch_save(ctx);
5145
-
5146
- struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5);
5147
-
5148
- ((int32_t *) c->data)[0] = nb1;
5149
- ((int32_t *) c->data)[1] = nb2;
5150
- ((int32_t *) c->data)[2] = nb3;
5151
- ((int32_t *) c->data)[3] = offset;
5152
- ((int32_t *) c->data)[4] = inplace ? 1 : 0;
5153
-
5154
- ggml_scratch_load(ctx);
5178
+ int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
5179
+ ggml_set_op_params(result, params, sizeof(params));
5155
5180
 
5156
5181
  result->op = GGML_OP_ACC;
5157
5182
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5158
5183
  result->src[0] = a;
5159
5184
  result->src[1] = b;
5160
- result->src[2] = c;
5161
5185
 
5162
5186
  return result;
5163
5187
  }
@@ -5186,7 +5210,7 @@ struct ggml_tensor * ggml_acc_inplace(
5186
5210
 
5187
5211
  // ggml_sub
5188
5212
 
5189
- struct ggml_tensor * ggml_sub_impl(
5213
+ static struct ggml_tensor * ggml_sub_impl(
5190
5214
  struct ggml_context * ctx,
5191
5215
  struct ggml_tensor * a,
5192
5216
  struct ggml_tensor * b,
@@ -5225,7 +5249,7 @@ struct ggml_tensor * ggml_sub_inplace(
5225
5249
 
5226
5250
  // ggml_mul
5227
5251
 
5228
- struct ggml_tensor * ggml_mul_impl(
5252
+ static struct ggml_tensor * ggml_mul_impl(
5229
5253
  struct ggml_context * ctx,
5230
5254
  struct ggml_tensor * a,
5231
5255
  struct ggml_tensor * b,
@@ -5272,7 +5296,7 @@ struct ggml_tensor * ggml_mul_inplace(
5272
5296
 
5273
5297
  // ggml_div
5274
5298
 
5275
- struct ggml_tensor * ggml_div_impl(
5299
+ static struct ggml_tensor * ggml_div_impl(
5276
5300
  struct ggml_context * ctx,
5277
5301
  struct ggml_tensor * a,
5278
5302
  struct ggml_tensor * b,
@@ -5315,7 +5339,7 @@ struct ggml_tensor * ggml_div_inplace(
5315
5339
 
5316
5340
  // ggml_sqr
5317
5341
 
5318
- struct ggml_tensor * ggml_sqr_impl(
5342
+ static struct ggml_tensor * ggml_sqr_impl(
5319
5343
  struct ggml_context * ctx,
5320
5344
  struct ggml_tensor * a,
5321
5345
  bool inplace) {
@@ -5330,7 +5354,6 @@ struct ggml_tensor * ggml_sqr_impl(
5330
5354
  result->op = GGML_OP_SQR;
5331
5355
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5332
5356
  result->src[0] = a;
5333
- result->src[1] = NULL;
5334
5357
 
5335
5358
  return result;
5336
5359
  }
@@ -5349,7 +5372,7 @@ struct ggml_tensor * ggml_sqr_inplace(
5349
5372
 
5350
5373
  // ggml_sqrt
5351
5374
 
5352
- struct ggml_tensor * ggml_sqrt_impl(
5375
+ static struct ggml_tensor * ggml_sqrt_impl(
5353
5376
  struct ggml_context * ctx,
5354
5377
  struct ggml_tensor * a,
5355
5378
  bool inplace) {
@@ -5364,7 +5387,6 @@ struct ggml_tensor * ggml_sqrt_impl(
5364
5387
  result->op = GGML_OP_SQRT;
5365
5388
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5366
5389
  result->src[0] = a;
5367
- result->src[1] = NULL;
5368
5390
 
5369
5391
  return result;
5370
5392
  }
@@ -5384,7 +5406,7 @@ struct ggml_tensor * ggml_sqrt_inplace(
5384
5406
 
5385
5407
  // ggml_log
5386
5408
 
5387
- struct ggml_tensor * ggml_log_impl(
5409
+ static struct ggml_tensor * ggml_log_impl(
5388
5410
  struct ggml_context * ctx,
5389
5411
  struct ggml_tensor * a,
5390
5412
  bool inplace) {
@@ -5399,7 +5421,6 @@ struct ggml_tensor * ggml_log_impl(
5399
5421
  result->op = GGML_OP_LOG;
5400
5422
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5401
5423
  result->src[0] = a;
5402
- result->src[1] = NULL;
5403
5424
 
5404
5425
  return result;
5405
5426
  }
@@ -5432,7 +5453,6 @@ struct ggml_tensor * ggml_sum(
5432
5453
  result->op = GGML_OP_SUM;
5433
5454
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5434
5455
  result->src[0] = a;
5435
- result->src[1] = NULL;
5436
5456
 
5437
5457
  return result;
5438
5458
  }
@@ -5459,7 +5479,6 @@ struct ggml_tensor * ggml_sum_rows(
5459
5479
  result->op = GGML_OP_SUM_ROWS;
5460
5480
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5461
5481
  result->src[0] = a;
5462
- result->src[1] = NULL;
5463
5482
 
5464
5483
  return result;
5465
5484
  }
@@ -5482,7 +5501,6 @@ struct ggml_tensor * ggml_mean(
5482
5501
  result->op = GGML_OP_MEAN;
5483
5502
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5484
5503
  result->src[0] = a;
5485
- result->src[1] = NULL;
5486
5504
 
5487
5505
  return result;
5488
5506
  }
@@ -5506,7 +5524,6 @@ struct ggml_tensor * ggml_argmax(
5506
5524
  result->op = GGML_OP_ARGMAX;
5507
5525
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5508
5526
  result->src[0] = a;
5509
- result->src[1] = NULL;
5510
5527
 
5511
5528
  return result;
5512
5529
  }
@@ -5569,343 +5586,142 @@ struct ggml_tensor * ggml_repeat_back(
5569
5586
 
5570
5587
  // ggml_abs
5571
5588
 
5572
- struct ggml_tensor * ggml_abs_impl(
5573
- struct ggml_context * ctx,
5574
- struct ggml_tensor * a,
5575
- bool inplace) {
5576
- bool is_node = false;
5577
-
5578
- if (!inplace && (a->grad)) {
5579
- is_node = true;
5580
- }
5581
-
5582
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5583
-
5584
- result->op = GGML_OP_ABS;
5585
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5586
- result->src[0] = a;
5587
- result->src[1] = NULL;
5588
-
5589
- return result;
5590
- }
5591
-
5592
5589
  struct ggml_tensor * ggml_abs(
5593
5590
  struct ggml_context * ctx,
5594
5591
  struct ggml_tensor * a) {
5595
- return ggml_abs_impl(ctx, a, false);
5592
+ return ggml_unary(ctx, a, GGML_UNARY_OP_ABS);
5596
5593
  }
5597
5594
 
5598
5595
  struct ggml_tensor * ggml_abs_inplace(
5599
5596
  struct ggml_context * ctx,
5600
5597
  struct ggml_tensor * a) {
5601
- return ggml_abs_impl(ctx, a, true);
5598
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ABS);
5602
5599
  }
5603
5600
 
5604
-
5605
5601
  // ggml_sgn
5606
5602
 
5607
- struct ggml_tensor * ggml_sgn_impl(
5608
- struct ggml_context * ctx,
5609
- struct ggml_tensor * a,
5610
- bool inplace) {
5611
- bool is_node = false;
5612
-
5613
- if (!inplace && (a->grad)) {
5614
- is_node = true;
5615
- }
5616
-
5617
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5618
-
5619
- result->op = GGML_OP_SGN;
5620
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5621
- result->src[0] = a;
5622
- result->src[1] = NULL;
5623
-
5624
- return result;
5625
- }
5626
-
5627
5603
  struct ggml_tensor * ggml_sgn(
5628
5604
  struct ggml_context * ctx,
5629
5605
  struct ggml_tensor * a) {
5630
- return ggml_sgn_impl(ctx, a, false);
5606
+ return ggml_unary(ctx, a, GGML_UNARY_OP_SGN);
5631
5607
  }
5632
5608
 
5633
5609
  struct ggml_tensor * ggml_sgn_inplace(
5634
5610
  struct ggml_context * ctx,
5635
5611
  struct ggml_tensor * a) {
5636
- return ggml_sgn_impl(ctx, a, true);
5612
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SGN);
5637
5613
  }
5638
5614
 
5639
5615
  // ggml_neg
5640
5616
 
5641
- struct ggml_tensor * ggml_neg_impl(
5642
- struct ggml_context * ctx,
5643
- struct ggml_tensor * a,
5644
- bool inplace) {
5645
- bool is_node = false;
5646
-
5647
- if (!inplace && (a->grad)) {
5648
- is_node = true;
5649
- }
5650
-
5651
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5652
-
5653
- result->op = GGML_OP_NEG;
5654
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5655
- result->src[0] = a;
5656
- result->src[1] = NULL;
5657
-
5658
- return result;
5659
- }
5660
-
5661
5617
  struct ggml_tensor * ggml_neg(
5662
5618
  struct ggml_context * ctx,
5663
5619
  struct ggml_tensor * a) {
5664
- return ggml_neg_impl(ctx, a, false);
5620
+ return ggml_unary(ctx, a, GGML_UNARY_OP_NEG);
5665
5621
  }
5666
5622
 
5667
5623
  struct ggml_tensor * ggml_neg_inplace(
5668
5624
  struct ggml_context * ctx,
5669
5625
  struct ggml_tensor * a) {
5670
- return ggml_neg_impl(ctx, a, true);
5626
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_NEG);
5671
5627
  }
5672
5628
 
5673
5629
  // ggml_step
5674
5630
 
5675
- struct ggml_tensor * ggml_step_impl(
5676
- struct ggml_context * ctx,
5677
- struct ggml_tensor * a,
5678
- bool inplace) {
5679
- bool is_node = false;
5680
-
5681
- if (!inplace && (a->grad)) {
5682
- is_node = true;
5683
- }
5684
-
5685
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5686
-
5687
- result->op = GGML_OP_STEP;
5688
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5689
- result->src[0] = a;
5690
- result->src[1] = NULL;
5691
-
5692
- return result;
5693
- }
5694
-
5695
5631
  struct ggml_tensor * ggml_step(
5696
5632
  struct ggml_context * ctx,
5697
5633
  struct ggml_tensor * a) {
5698
- return ggml_step_impl(ctx, a, false);
5634
+ return ggml_unary(ctx, a, GGML_UNARY_OP_STEP);
5699
5635
  }
5700
5636
 
5701
5637
  struct ggml_tensor * ggml_step_inplace(
5702
5638
  struct ggml_context * ctx,
5703
5639
  struct ggml_tensor * a) {
5704
- return ggml_step_impl(ctx, a, true);
5640
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_STEP);
5705
5641
  }
5706
5642
 
5707
5643
  // ggml_tanh
5708
5644
 
5709
- struct ggml_tensor * ggml_tanh_impl(
5710
- struct ggml_context * ctx,
5711
- struct ggml_tensor * a,
5712
- bool inplace) {
5713
- bool is_node = false;
5714
-
5715
- if (!inplace && (a->grad)) {
5716
- is_node = true;
5717
- }
5718
-
5719
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5720
-
5721
- result->op = GGML_OP_TANH;
5722
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5723
- result->src[0] = a;
5724
- result->src[1] = NULL;
5725
-
5726
- return result;
5727
- }
5728
-
5729
5645
  struct ggml_tensor * ggml_tanh(
5730
5646
  struct ggml_context * ctx,
5731
5647
  struct ggml_tensor * a) {
5732
- return ggml_tanh_impl(ctx, a, false);
5648
+ return ggml_unary(ctx, a, GGML_UNARY_OP_TANH);
5733
5649
  }
5734
5650
 
5735
5651
  struct ggml_tensor * ggml_tanh_inplace(
5736
5652
  struct ggml_context * ctx,
5737
5653
  struct ggml_tensor * a) {
5738
- return ggml_tanh_impl(ctx, a, true);
5654
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TANH);
5739
5655
  }
5740
5656
 
5741
5657
  // ggml_elu
5742
5658
 
5743
- struct ggml_tensor * ggml_elu_impl(
5744
- struct ggml_context * ctx,
5745
- struct ggml_tensor * a,
5746
- bool inplace) {
5747
- bool is_node = false;
5748
-
5749
- if (!inplace && (a->grad)) {
5750
- is_node = true;
5751
- }
5752
-
5753
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5754
-
5755
- result->op = GGML_OP_ELU;
5756
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5757
- result->src[0] = a;
5758
- result->src[1] = NULL;
5759
-
5760
- return result;
5761
- }
5762
-
5763
5659
  struct ggml_tensor * ggml_elu(
5764
5660
  struct ggml_context * ctx,
5765
5661
  struct ggml_tensor * a) {
5766
- return ggml_elu_impl(ctx, a, false);
5662
+ return ggml_unary(ctx, a, GGML_UNARY_OP_ELU);
5767
5663
  }
5768
5664
 
5769
5665
  struct ggml_tensor * ggml_elu_inplace(
5770
5666
  struct ggml_context * ctx,
5771
5667
  struct ggml_tensor * a) {
5772
- return ggml_elu_impl(ctx, a, true);
5668
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ELU);
5773
5669
  }
5774
5670
 
5775
5671
  // ggml_relu
5776
5672
 
5777
- struct ggml_tensor * ggml_relu_impl(
5778
- struct ggml_context * ctx,
5779
- struct ggml_tensor * a,
5780
- bool inplace) {
5781
- bool is_node = false;
5782
-
5783
- if (!inplace && (a->grad)) {
5784
- is_node = true;
5785
- }
5786
-
5787
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5788
-
5789
- result->op = GGML_OP_RELU;
5790
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5791
- result->src[0] = a;
5792
- result->src[1] = NULL;
5793
-
5794
- return result;
5795
- }
5796
-
5797
5673
  struct ggml_tensor * ggml_relu(
5798
5674
  struct ggml_context * ctx,
5799
5675
  struct ggml_tensor * a) {
5800
- return ggml_relu_impl(ctx, a, false);
5676
+ return ggml_unary(ctx, a, GGML_UNARY_OP_RELU);
5801
5677
  }
5802
5678
 
5803
5679
  struct ggml_tensor * ggml_relu_inplace(
5804
5680
  struct ggml_context * ctx,
5805
5681
  struct ggml_tensor * a) {
5806
- return ggml_relu_impl(ctx, a, true);
5682
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
5807
5683
  }
5808
5684
 
5809
5685
  // ggml_gelu
5810
5686
 
5811
- struct ggml_tensor * ggml_gelu_impl(
5812
- struct ggml_context * ctx,
5813
- struct ggml_tensor * a,
5814
- bool inplace) {
5815
- bool is_node = false;
5816
-
5817
- if (!inplace && (a->grad)) {
5818
- is_node = true;
5819
- }
5820
-
5821
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5822
-
5823
- result->op = GGML_OP_GELU;
5824
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5825
- result->src[0] = a;
5826
- result->src[1] = NULL;
5827
-
5828
- return result;
5829
- }
5830
-
5831
5687
  struct ggml_tensor * ggml_gelu(
5832
5688
  struct ggml_context * ctx,
5833
5689
  struct ggml_tensor * a) {
5834
- return ggml_gelu_impl(ctx, a, false);
5690
+ return ggml_unary(ctx, a, GGML_UNARY_OP_GELU);
5835
5691
  }
5836
5692
 
5837
5693
  struct ggml_tensor * ggml_gelu_inplace(
5838
5694
  struct ggml_context * ctx,
5839
5695
  struct ggml_tensor * a) {
5840
- return ggml_gelu_impl(ctx, a, true);
5696
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
5841
5697
  }
5842
5698
 
5843
5699
  // ggml_gelu_quick
5844
5700
 
5845
- struct ggml_tensor * ggml_gelu_quick_impl(
5846
- struct ggml_context * ctx,
5847
- struct ggml_tensor * a,
5848
- bool inplace) {
5849
- bool is_node = false;
5850
-
5851
- if (!inplace && (a->grad)) {
5852
- is_node = true;
5853
- }
5854
-
5855
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5856
-
5857
- result->op = GGML_OP_GELU_QUICK;
5858
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5859
- result->src[0] = a;
5860
- result->src[1] = NULL;
5861
-
5862
- return result;
5863
- }
5864
-
5865
5701
  struct ggml_tensor * ggml_gelu_quick(
5866
5702
  struct ggml_context * ctx,
5867
5703
  struct ggml_tensor * a) {
5868
- return ggml_gelu_quick_impl(ctx, a, false);
5704
+ return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_QUICK);
5869
5705
  }
5870
5706
 
5871
5707
  struct ggml_tensor * ggml_gelu_quick_inplace(
5872
5708
  struct ggml_context * ctx,
5873
5709
  struct ggml_tensor * a) {
5874
- return ggml_gelu_quick_impl(ctx, a, true);
5710
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_QUICK);
5875
5711
  }
5876
5712
 
5877
5713
  // ggml_silu
5878
5714
 
5879
- struct ggml_tensor * ggml_silu_impl(
5880
- struct ggml_context * ctx,
5881
- struct ggml_tensor * a,
5882
- bool inplace) {
5883
- bool is_node = false;
5884
-
5885
- if (!inplace && (a->grad)) {
5886
- is_node = true;
5887
- }
5888
-
5889
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5890
-
5891
- result->op = GGML_OP_SILU;
5892
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5893
- result->src[0] = a;
5894
- result->src[1] = NULL;
5895
-
5896
- return result;
5897
- }
5898
-
5899
- struct ggml_tensor * ggml_silu(
5715
+ struct ggml_tensor * ggml_silu(
5900
5716
  struct ggml_context * ctx,
5901
5717
  struct ggml_tensor * a) {
5902
- return ggml_silu_impl(ctx, a, false);
5718
+ return ggml_unary(ctx, a, GGML_UNARY_OP_SILU);
5903
5719
  }
5904
5720
 
5905
5721
  struct ggml_tensor * ggml_silu_inplace(
5906
5722
  struct ggml_context * ctx,
5907
5723
  struct ggml_tensor * a) {
5908
- return ggml_silu_impl(ctx, a, true);
5724
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SILU);
5909
5725
  }
5910
5726
 
5911
5727
  // ggml_silu_back
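
Note: the per-activation ops (GGML_OP_ABS through GGML_OP_SILU) are folded into a single GGML_OP_UNARY, with the concrete activation stored in op_params and read back via ggml_get_unary_op(); the old entry points remain as thin wrappers, so existing call sites keep working. A minimal sketch (assuming ggml_unary records the enum as its first op_param, as ggml_get_unary_op implies):

    struct ggml_tensor * y = ggml_unary(ctx, x, GGML_UNARY_OP_RELU);
    // y->op == GGML_OP_UNARY  and  ggml_get_unary_op(y) == GGML_UNARY_OP_RELU
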
@@ -5933,7 +5749,7 @@ struct ggml_tensor * ggml_silu_back(
5933
5749
 
5934
5750
  // ggml_norm
5935
5751
 
5936
- struct ggml_tensor * ggml_norm_impl(
5752
+ static struct ggml_tensor * ggml_norm_impl(
5937
5753
  struct ggml_context * ctx,
5938
5754
  struct ggml_tensor * a,
5939
5755
  bool inplace) {
@@ -5946,10 +5762,11 @@ struct ggml_tensor * ggml_norm_impl(
5946
5762
 
5947
5763
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5948
5764
 
5765
+ // TODO: maybe store epsilon here?
5766
+
5949
5767
  result->op = GGML_OP_NORM;
5950
5768
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5951
5769
  result->src[0] = a;
5952
- result->src[1] = NULL; // TODO: maybe store epsilon here?
5953
5770
 
5954
5771
  return result;
5955
5772
  }
@@ -5966,9 +5783,10 @@ struct ggml_tensor * ggml_norm_inplace(
5966
5783
  return ggml_norm_impl(ctx, a, true);
5967
5784
  }
5968
5785
 
5969
- struct ggml_tensor * ggml_rms_norm_impl(
5786
+ static struct ggml_tensor * ggml_rms_norm_impl(
5970
5787
  struct ggml_context * ctx,
5971
5788
  struct ggml_tensor * a,
5789
+ float eps,
5972
5790
  bool inplace) {
5973
5791
  bool is_node = false;
5974
5792
 
@@ -5978,24 +5796,27 @@ struct ggml_tensor * ggml_rms_norm_impl(
5978
5796
 
5979
5797
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5980
5798
 
5799
+ ggml_set_op_params(result, &eps, sizeof(eps));
5800
+
5981
5801
  result->op = GGML_OP_RMS_NORM;
5982
5802
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5983
5803
  result->src[0] = a;
5984
- result->src[1] = NULL; // TODO: maybe store epsilon here?
5985
5804
 
5986
5805
  return result;
5987
5806
  }
5988
5807
 
5989
5808
  struct ggml_tensor * ggml_rms_norm(
5990
5809
  struct ggml_context * ctx,
5991
- struct ggml_tensor * a) {
5992
- return ggml_rms_norm_impl(ctx, a, false);
5810
+ struct ggml_tensor * a,
5811
+ float eps) {
5812
+ return ggml_rms_norm_impl(ctx, a, eps, false);
5993
5813
  }
5994
5814
 
5995
5815
  struct ggml_tensor * ggml_rms_norm_inplace(
5996
5816
  struct ggml_context * ctx,
5997
- struct ggml_tensor * a) {
5998
- return ggml_rms_norm_impl(ctx, a, true);
5817
+ struct ggml_tensor * a,
5818
+ float eps) {
5819
+ return ggml_rms_norm_impl(ctx, a, eps, true);
5999
5820
  }
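
Note: ggml_rms_norm() and ggml_rms_norm_inplace() now take the epsilon explicitly and stash it in op_params rather than hard-coding it in the kernel, so callers must be updated. A sketch (the 1e-6f value is illustrative, not taken from this diff):

    struct ggml_tensor * cur = ggml_rms_norm(ctx, inp, 1e-6f);
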
6000
5821
 
6001
5822
  struct ggml_tensor * ggml_rms_norm_back(
@@ -6074,7 +5895,7 @@ struct ggml_tensor * ggml_out_prod(
6074
5895
 
6075
5896
  // ggml_scale
6076
5897
 
6077
- struct ggml_tensor * ggml_scale_impl(
5898
+ static struct ggml_tensor * ggml_scale_impl(
6078
5899
  struct ggml_context * ctx,
6079
5900
  struct ggml_tensor * a,
6080
5901
  struct ggml_tensor * b,
@@ -6114,7 +5935,7 @@ struct ggml_tensor * ggml_scale_inplace(
6114
5935
 
6115
5936
  // ggml_set
6116
5937
 
6117
- struct ggml_tensor * ggml_set_impl(
5938
+ static struct ggml_tensor * ggml_set_impl(
6118
5939
  struct ggml_context * ctx,
6119
5940
  struct ggml_tensor * a,
6120
5941
  struct ggml_tensor * b,
@@ -6134,23 +5955,13 @@ struct ggml_tensor * ggml_set_impl(
6134
5955
  // make a view of the destination
6135
5956
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6136
5957
 
6137
- ggml_scratch_save(ctx);
6138
-
6139
- struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5);
6140
-
6141
- (( int32_t * ) c->data)[0] = nb1;
6142
- (( int32_t * ) c->data)[1] = nb2;
6143
- (( int32_t * ) c->data)[2] = nb3;
6144
- (( int32_t * ) c->data)[3] = offset;
6145
- (( int32_t * ) c->data)[4] = inplace ? 1 : 0;
6146
-
6147
- ggml_scratch_load(ctx);
5958
+ int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
5959
+ ggml_set_op_params(result, params, sizeof(params));
6148
5960
 
6149
5961
  result->op = GGML_OP_SET;
6150
5962
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6151
5963
  result->src[0] = a;
6152
5964
  result->src[1] = b;
6153
- result->src[2] = c;
6154
5965
 
6155
5966
  return result;
6156
5967
  }
@@ -6214,7 +6025,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
6214
6025
 
6215
6026
  // ggml_cpy
6216
6027
 
6217
- struct ggml_tensor * ggml_cpy_impl(
6028
+ static struct ggml_tensor * ggml_cpy_impl(
6218
6029
  struct ggml_context * ctx,
6219
6030
  struct ggml_tensor * a,
6220
6031
  struct ggml_tensor * b,
@@ -6259,7 +6070,7 @@ struct ggml_tensor * ggml_cpy_inplace(
6259
6070
 
6260
6071
  // ggml_cont
6261
6072
 
6262
- struct ggml_tensor * ggml_cont_impl(
6073
+ static struct ggml_tensor * ggml_cont_impl(
6263
6074
  struct ggml_context * ctx,
6264
6075
  struct ggml_tensor * a,
6265
6076
  bool inplace) {
@@ -6275,7 +6086,6 @@ struct ggml_tensor * ggml_cont_impl(
6275
6086
  result->op = GGML_OP_CONT;
6276
6087
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6277
6088
  result->src[0] = a;
6278
- result->src[1] = NULL;
6279
6089
 
6280
6090
  return result;
6281
6091
  }
@@ -6319,7 +6129,6 @@ struct ggml_tensor * ggml_reshape(
6319
6129
  result->op = GGML_OP_RESHAPE;
6320
6130
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6321
6131
  result->src[0] = a;
6322
- result->src[1] = NULL;
6323
6132
 
6324
6133
  return result;
6325
6134
  }
@@ -6344,7 +6153,6 @@ struct ggml_tensor * ggml_reshape_1d(
6344
6153
  result->op = GGML_OP_RESHAPE;
6345
6154
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6346
6155
  result->src[0] = a;
6347
- result->src[1] = NULL;
6348
6156
 
6349
6157
  return result;
6350
6158
  }
@@ -6370,7 +6178,6 @@ struct ggml_tensor * ggml_reshape_2d(
6370
6178
  result->op = GGML_OP_RESHAPE;
6371
6179
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6372
6180
  result->src[0] = a;
6373
- result->src[1] = NULL;
6374
6181
 
6375
6182
  return result;
6376
6183
  }
@@ -6397,7 +6204,6 @@ struct ggml_tensor * ggml_reshape_3d(
6397
6204
  result->op = GGML_OP_RESHAPE;
6398
6205
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6399
6206
  result->src[0] = a;
6400
- result->src[1] = NULL;
6401
6207
 
6402
6208
  return result;
6403
6209
  }
@@ -6426,7 +6232,6 @@ struct ggml_tensor * ggml_reshape_4d(
6426
6232
  result->op = GGML_OP_RESHAPE;
6427
6233
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6428
6234
  result->src[0] = a;
6429
- result->src[1] = NULL;
6430
6235
 
6431
6236
  return result;
6432
6237
  }
@@ -6448,19 +6253,11 @@ struct ggml_tensor * ggml_view_1d(
6448
6253
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
6449
6254
  ggml_format_name(result, "%s (view)", a->name);
6450
6255
 
6451
- ggml_scratch_save(ctx);
6452
-
6453
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6454
- ggml_set_name(offs, "offset");
6455
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6456
-
6457
- ggml_scratch_load(ctx);
6256
+ ggml_set_op_params(result, &offset, sizeof(offset));
6458
6257
 
6459
6258
  result->op = GGML_OP_VIEW;
6460
6259
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6461
6260
  result->src[0] = a;
6462
- result->src[1] = NULL;
6463
- result->src[2] = offs;
6464
6261
 
6465
6262
  return result;
6466
6263
  }
@@ -6486,13 +6283,7 @@ struct ggml_tensor * ggml_view_2d(
6486
6283
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
6487
6284
  ggml_format_name(result, "%s (view)", a->name);
6488
6285
 
6489
- ggml_scratch_save(ctx);
6490
-
6491
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6492
- ggml_set_name(offs, "offset");
6493
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6494
-
6495
- ggml_scratch_load(ctx);
6286
+ ggml_set_op_params(result, &offset, sizeof(offset));
6496
6287
 
6497
6288
  result->nb[1] = nb1;
6498
6289
  result->nb[2] = result->nb[1]*ne1;
@@ -6501,8 +6292,6 @@ struct ggml_tensor * ggml_view_2d(
6501
6292
  result->op = GGML_OP_VIEW;
6502
6293
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6503
6294
  result->src[0] = a;
6504
- result->src[1] = NULL;
6505
- result->src[2] = offs;
6506
6295
 
6507
6296
  return result;
6508
6297
  }
@@ -6530,13 +6319,7 @@ struct ggml_tensor * ggml_view_3d(
6530
6319
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
6531
6320
  ggml_format_name(result, "%s (view)", a->name);
6532
6321
 
6533
- ggml_scratch_save(ctx);
6534
-
6535
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6536
- ggml_set_name(offs, "offset");
6537
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6538
-
6539
- ggml_scratch_load(ctx);
6322
+ ggml_set_op_params(result, &offset, sizeof(offset));
6540
6323
 
6541
6324
  result->nb[1] = nb1;
6542
6325
  result->nb[2] = nb2;
@@ -6545,8 +6328,6 @@ struct ggml_tensor * ggml_view_3d(
6545
6328
  result->op = GGML_OP_VIEW;
6546
6329
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6547
6330
  result->src[0] = a;
6548
- result->src[1] = NULL;
6549
- result->src[2] = offs;
6550
6331
 
6551
6332
  return result;
6552
6333
  }
@@ -6576,13 +6357,7 @@ struct ggml_tensor * ggml_view_4d(
6576
6357
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
6577
6358
  ggml_format_name(result, "%s (view)", a->name);
6578
6359
 
6579
- ggml_scratch_save(ctx);
6580
-
6581
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6582
- ggml_set_name(offs, "offset");
6583
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6584
-
6585
- ggml_scratch_load(ctx);
6360
+ ggml_set_op_params(result, &offset, sizeof(offset));
6586
6361
 
6587
6362
  result->nb[1] = nb1;
6588
6363
  result->nb[2] = nb2;
@@ -6591,8 +6366,6 @@ struct ggml_tensor * ggml_view_4d(
6591
6366
  result->op = GGML_OP_VIEW;
6592
6367
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6593
6368
  result->src[0] = a;
6594
- result->src[1] = NULL;
6595
- result->src[2] = offs;
6596
6369
 
6597
6370
  return result;
6598
6371
  }
@@ -6653,22 +6426,9 @@ struct ggml_tensor * ggml_permute(
6653
6426
  result->op = GGML_OP_PERMUTE;
6654
6427
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6655
6428
  result->src[0] = a;
6656
- result->src[1] = NULL;
6657
-
6658
- if (is_node) {
6659
- ggml_scratch_save(ctx);
6660
-
6661
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
6662
-
6663
- ((int32_t *) b->data)[0] = axis0;
6664
- ((int32_t *) b->data)[1] = axis1;
6665
- ((int32_t *) b->data)[2] = axis2;
6666
- ((int32_t *) b->data)[3] = axis3;
6667
6429
 
6668
- ggml_scratch_load(ctx);
6669
-
6670
- result->src[2] = b;
6671
- }
6430
+ int32_t params[] = { axis0, axis1, axis2, axis3 };
6431
+ ggml_set_op_params(result, &params, sizeof(params));
6672
6432
 
6673
6433
  return result;
6674
6434
  }
@@ -6696,7 +6456,6 @@ struct ggml_tensor * ggml_transpose(
6696
6456
  result->op = GGML_OP_TRANSPOSE;
6697
6457
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6698
6458
  result->src[0] = a;
6699
- result->src[1] = NULL;
6700
6459
 
6701
6460
  return result;
6702
6461
  }
@@ -6774,7 +6533,6 @@ struct ggml_tensor * ggml_diag(
6774
6533
  result->op = GGML_OP_DIAG;
6775
6534
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6776
6535
  result->src[0] = a;
6777
- result->src[1] = NULL;
6778
6536
 
6779
6537
  return result;
6780
6538
  }
@@ -6782,7 +6540,7 @@ struct ggml_tensor * ggml_diag(
6782
6540
 
6783
6541
  // ggml_diag_mask_inf
6784
6542
 
6785
- struct ggml_tensor * ggml_diag_mask_inf_impl(
6543
+ static struct ggml_tensor * ggml_diag_mask_inf_impl(
6786
6544
  struct ggml_context * ctx,
6787
6545
  struct ggml_tensor * a,
6788
6546
  int n_past,
@@ -6795,19 +6553,12 @@ struct ggml_tensor * ggml_diag_mask_inf_impl(
6795
6553
 
6796
6554
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6797
6555
 
6798
- ggml_scratch_save(ctx);
6799
-
6800
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6801
-
6802
- ((int32_t *) b->data)[0] = n_past;
6803
- ((int32_t *) b->data)[1] = inplace ? 1 : 0;
6804
-
6805
- ggml_scratch_load(ctx);
6556
+ int32_t params[] = { n_past, inplace ? 1 : 0 };
6557
+ ggml_set_op_params(result, &params, sizeof(params));
6806
6558
 
6807
6559
  result->op = GGML_OP_DIAG_MASK_INF;
6808
6560
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6809
6561
  result->src[0] = a;
6810
- result->src[1] = b;
6811
6562
 
6812
6563
  return result;
6813
6564
  }
@@ -6829,7 +6580,7 @@ struct ggml_tensor * ggml_diag_mask_inf_inplace(
6829
6580
 
6830
6581
  // ggml_diag_mask_zero
6831
6582
 
6832
- struct ggml_tensor * ggml_diag_mask_zero_impl(
6583
+ static struct ggml_tensor * ggml_diag_mask_zero_impl(
6833
6584
  struct ggml_context * ctx,
6834
6585
  struct ggml_tensor * a,
6835
6586
  int n_past,
@@ -6842,20 +6593,12 @@ struct ggml_tensor * ggml_diag_mask_zero_impl(
6842
6593
 
6843
6594
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6844
6595
 
6845
- ggml_scratch_save(ctx);
6846
-
6847
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6848
- ggml_set_name(b, "n_past, inplace");
6849
-
6850
- ((int32_t *) b->data)[0] = n_past;
6851
- ((int32_t *) b->data)[1] = inplace ? 1 : 0;
6852
-
6853
- ggml_scratch_load(ctx);
6596
+ int32_t params[] = { n_past, inplace ? 1 : 0 };
6597
+ ggml_set_op_params(result, &params, sizeof(params));
6854
6598
 
6855
6599
  result->op = GGML_OP_DIAG_MASK_ZERO;
6856
6600
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6857
6601
  result->src[0] = a;
6858
- result->src[1] = b;
6859
6602
 
6860
6603
  return result;
6861
6604
  }
@@ -6876,7 +6619,7 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
6876
6619
 
6877
6620
  // ggml_soft_max
6878
6621
 
6879
- struct ggml_tensor * ggml_soft_max_impl(
6622
+ static struct ggml_tensor * ggml_soft_max_impl(
6880
6623
  struct ggml_context * ctx,
6881
6624
  struct ggml_tensor * a,
6882
6625
  bool inplace) {
@@ -6891,7 +6634,6 @@ struct ggml_tensor * ggml_soft_max_impl(
6891
6634
  result->op = GGML_OP_SOFT_MAX;
6892
6635
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6893
6636
  result->src[0] = a;
6894
- result->src[1] = NULL;
6895
6637
 
6896
6638
  return result;
6897
6639
  }
@@ -6911,7 +6653,7 @@ struct ggml_tensor * ggml_soft_max_inplace(
6911
6653
 
6912
6654
  // ggml_soft_max_back
6913
6655
 
6914
- struct ggml_tensor * ggml_soft_max_back_impl(
6656
+ static struct ggml_tensor * ggml_soft_max_back_impl(
6915
6657
  struct ggml_context * ctx,
6916
6658
  struct ggml_tensor * a,
6917
6659
  struct ggml_tensor * b,
@@ -6948,13 +6690,15 @@ struct ggml_tensor * ggml_soft_max_back_inplace(
6948
6690
 
6949
6691
  // ggml_rope
6950
6692
 
6951
- struct ggml_tensor * ggml_rope_impl(
6693
+ static struct ggml_tensor * ggml_rope_impl(
6952
6694
  struct ggml_context * ctx,
6953
6695
  struct ggml_tensor * a,
6954
6696
  int n_past,
6955
6697
  int n_dims,
6956
6698
  int mode,
6957
6699
  int n_ctx,
6700
+ float freq_base,
6701
+ float freq_scale,
6958
6702
  bool inplace) {
6959
6703
  GGML_ASSERT(n_past >= 0);
6960
6704
  bool is_node = false;
@@ -6965,21 +6709,14 @@ struct ggml_tensor * ggml_rope_impl(
6965
6709
 
6966
6710
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6967
6711
 
6968
- ggml_scratch_save(ctx);
6969
-
6970
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
6971
-
6972
- ((int32_t *) b->data)[0] = n_past;
6973
- ((int32_t *) b->data)[1] = n_dims;
6974
- ((int32_t *) b->data)[2] = mode;
6975
- ((int32_t *) b->data)[3] = n_ctx;
6976
-
6977
- ggml_scratch_load(ctx);
6712
+ int32_t params[6] = { n_past, n_dims, mode, n_ctx };
6713
+ memcpy(params + 4, &freq_base, sizeof(float));
6714
+ memcpy(params + 5, &freq_scale, sizeof(float));
6715
+ ggml_set_op_params(result, &params, sizeof(params));
6978
6716
 
6979
6717
  result->op = GGML_OP_ROPE;
6980
6718
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6981
6719
  result->src[0] = a;
6982
- result->src[1] = b;
6983
6720
 
6984
6721
  return result;
6985
6722
  }
@@ -6991,7 +6728,7 @@ struct ggml_tensor * ggml_rope(
6991
6728
  int n_dims,
6992
6729
  int mode,
6993
6730
  int n_ctx) {
6994
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
6731
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false);
6995
6732
  }
6996
6733
 
6997
6734
  struct ggml_tensor * ggml_rope_inplace(
@@ -7001,7 +6738,19 @@ struct ggml_tensor * ggml_rope_inplace(
7001
6738
  int n_dims,
7002
6739
  int mode,
7003
6740
  int n_ctx) {
7004
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
6741
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
6742
+ }
6743
+
6744
+ struct ggml_tensor * ggml_rope_custom_inplace(
6745
+ struct ggml_context * ctx,
6746
+ struct ggml_tensor * a,
6747
+ int n_past,
6748
+ int n_dims,
6749
+ int mode,
6750
+ int n_ctx,
6751
+ float freq_base,
6752
+ float freq_scale) {
6753
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true);
7005
6754
  }
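
Note: ggml_rope_custom_inplace() exposes the RoPE frequency base and scale that ggml_rope_impl() now threads through op_params; the plain ggml_rope()/ggml_rope_inplace() wrappers keep the previous defaults (base 10000.0f, scale 1.0f). A sketch of a call with a scaled context (parameter values are illustrative only):

    struct ggml_tensor * q_rot = ggml_rope_custom_inplace(
        ctx, q, n_past, n_rot, /*mode =*/ 0, /*n_ctx =*/ 0,
        /*freq_base  =*/ 10000.0f,
        /*freq_scale =*/ 0.5f);  // e.g. linear position interpolation for a longer context
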
7006
6755
 
7007
6756
  // ggml_rope_back
@@ -7011,7 +6760,8 @@ struct ggml_tensor * ggml_rope_back(
7011
6760
  struct ggml_tensor * a,
7012
6761
  int n_past,
7013
6762
  int n_dims,
7014
- int mode) {
6763
+ int mode,
6764
+ int n_ctx) {
7015
6765
  GGML_ASSERT(n_past >= 0);
7016
6766
  GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
7017
6767
 
@@ -7023,21 +6773,12 @@ struct ggml_tensor * ggml_rope_back(
7023
6773
 
7024
6774
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
7025
6775
 
7026
- ggml_scratch_save(ctx);
7027
-
7028
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7029
- ggml_set_name(b, "n_past, n_dims, mode");
7030
-
7031
- ((int32_t *) b->data)[0] = n_past;
7032
- ((int32_t *) b->data)[1] = n_dims;
7033
- ((int32_t *) b->data)[2] = mode;
7034
-
7035
- ggml_scratch_load(ctx);
6776
+ int32_t params[] = { n_past, n_dims, mode, n_ctx };
6777
+ ggml_set_op_params(result, &params, sizeof(params));
7036
6778
 
7037
6779
  result->op = GGML_OP_ROPE_BACK;
7038
6780
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7039
6781
  result->src[0] = a;
7040
- result->src[1] = b;
7041
6782
 
7042
6783
  return result;
7043
6784
  }
@@ -7062,21 +6803,13 @@ struct ggml_tensor * ggml_alibi(
7062
6803
  //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7063
6804
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
7064
6805
 
7065
- ggml_scratch_save(ctx);
7066
-
7067
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7068
-
7069
- ((int32_t *) b->data)[0] = n_past;
7070
- ((int32_t *) b->data)[1] = n_head;
7071
- GGML_ASSERT(sizeof(float) == sizeof(int32_t));
7072
- (((float *) b->data)[2]) = bias_max;
7073
-
7074
- ggml_scratch_load(ctx);
6806
+ int32_t op_params[3] = { n_past, n_head };
6807
+ memcpy(op_params + 2, &bias_max, sizeof(float));
6808
+ ggml_set_op_params(result, &op_params, sizeof(op_params));
7075
6809
 
7076
6810
  result->op = GGML_OP_ALIBI;
7077
6811
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7078
6812
  result->src[0] = a;
7079
- result->src[1] = b;
7080
6813
 
7081
6814
  return result;
7082
6815
  }
@@ -7098,19 +6831,12 @@ struct ggml_tensor * ggml_clamp(
7098
6831
  // TODO: when implement backward, fix this:
7099
6832
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
7100
6833
 
7101
- ggml_scratch_save(ctx);
7102
-
7103
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);
7104
-
7105
- ((float *) b->data)[0] = min;
7106
- ((float *) b->data)[1] = max;
7107
-
7108
- ggml_scratch_load(ctx);
6834
+ float params[] = { min, max };
6835
+ ggml_set_op_params(result, &params, sizeof(params));
7109
6836
 
7110
6837
  result->op = GGML_OP_CLAMP;
7111
6838
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7112
6839
  result->src[0] = a;
7113
- result->src[1] = b;
7114
6840
 
7115
6841
  return result;
7116
6842
  }
@@ -7143,18 +6869,13 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
7143
6869
  };
7144
6870
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
7145
6871
 
7146
- ggml_scratch_save(ctx);
7147
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7148
- ((int32_t*)c->data)[0] = s0;
7149
- ((int32_t*)c->data)[1] = p0;
7150
- ((int32_t*)c->data)[2] = d0;
7151
- ggml_scratch_load(ctx);
6872
+ int32_t params[] = { s0, p0, d0 };
6873
+ ggml_set_op_params(result, &params, sizeof(params));
7152
6874
 
7153
6875
  result->op = GGML_OP_CONV_1D;
7154
6876
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7155
6877
  result->src[0] = a;
7156
6878
  result->src[1] = b;
7157
- result->src[2] = c;
7158
6879
 
7159
6880
  return result;
7160
6881
  }
@@ -7187,21 +6908,13 @@ struct ggml_tensor* ggml_conv_2d(
7187
6908
  };
7188
6909
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7189
6910
 
7190
- ggml_scratch_save(ctx);
7191
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
7192
- ((int32_t*)c->data)[0] = s0;
7193
- ((int32_t*)c->data)[1] = s1;
7194
- ((int32_t*)c->data)[2] = p0;
7195
- ((int32_t*)c->data)[3] = p1;
7196
- ((int32_t*)c->data)[4] = d0;
7197
- ((int32_t*)c->data)[5] = d1;
7198
- ggml_scratch_load(ctx);
6911
+ int32_t params[] = { s0, s1, p0, p1, d0, d1 };
6912
+ ggml_set_op_params(result, &params, sizeof(params));
7199
6913
 
7200
6914
  result->op = GGML_OP_CONV_2D;
7201
6915
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7202
6916
  result->src[0] = a;
7203
6917
  result->src[1] = b;
7204
- result->src[2] = c;
7205
6918
 
7206
6919
  return result;
7207
6920
 
@@ -7225,7 +6938,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
7225
6938
  return (ins + 2 * p - ks) / s + 1;
7226
6939
  }
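For reference, the helper above computes the standard pooled length (ins + 2*p - ks)/s + 1; for example, an input of 10 elements with kernel 3, stride 2 and no padding yields (10 + 0 - 3)/2 + 1 = 4 output elements. A quick self-contained check of that arithmetic:

  #include <stdint.h>
  #include <stdio.h>

  static int64_t pool_out(int64_t ins, int ks, int s, int p) {
      return (ins + 2*p - ks)/s + 1;   /* same formula as ggml_calc_pool_output_size */
  }

  int main(void) {
      printf("%lld\n", (long long) pool_out(10, 3, 2, 0));  /* prints 4 */
      return 0;
  }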
7227
6940
 
7228
- // ggml_pool_2d
6941
+ // ggml_pool_1d
7229
6942
 
7230
6943
  struct ggml_tensor* ggml_pool_1d(
7231
6944
  struct ggml_context * ctx,
@@ -7248,18 +6961,12 @@ struct ggml_tensor* ggml_pool_1d(
7248
6961
  };
7249
6962
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
7250
6963
 
7251
- ggml_scratch_save(ctx);
7252
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
7253
- ((int32_t*)c->data)[0] = op;
7254
- ((int32_t*)c->data)[1] = k0;
7255
- ((int32_t*)c->data)[2] = s0;
7256
- ((int32_t*)c->data)[3] = p0;
7257
- ggml_scratch_load(ctx);
6964
+ int32_t params[] = { op, k0, s0, p0 };
6965
+ ggml_set_op_params(result, &params, sizeof(params));
7258
6966
 
7259
6967
  result->op = GGML_OP_POOL_1D;
7260
6968
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7261
6969
  result->src[0] = a;
7262
- result->src[1] = c;
7263
6970
 
7264
6971
  return result;
7265
6972
  }
@@ -7291,21 +6998,12 @@ struct ggml_tensor* ggml_pool_2d(
7291
6998
  };
7292
6999
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7293
7000
 
7294
- ggml_scratch_save(ctx);
7295
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7);
7296
- ((int32_t*)c->data)[0] = op;
7297
- ((int32_t*)c->data)[1] = k0;
7298
- ((int32_t*)c->data)[2] = k1;
7299
- ((int32_t*)c->data)[3] = s0;
7300
- ((int32_t*)c->data)[4] = s1;
7301
- ((int32_t*)c->data)[5] = p0;
7302
- ((int32_t*)c->data)[6] = p1;
7303
- ggml_scratch_load(ctx);
7001
+ int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
7002
+ ggml_set_op_params(result, &params, sizeof(params));
7304
7003
 
7305
7004
  result->op = GGML_OP_POOL_2D;
7306
7005
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7307
7006
  result->src[0] = a;
7308
- result->src[1] = c;
7309
7007
 
7310
7008
  return result;
7311
7009
  }
@@ -7328,14 +7026,16 @@ struct ggml_tensor * ggml_flash_attn(
7328
7026
  }
7329
7027
 
7330
7028
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
7331
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, q->ne);
7029
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne);
7030
+
7031
+ int32_t t = masked ? 1 : 0;
7032
+ ggml_set_op_params(result, &t, sizeof(t));
7332
7033
 
7333
7034
  result->op = GGML_OP_FLASH_ATTN;
7334
7035
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7335
7036
  result->src[0] = q;
7336
7037
  result->src[1] = k;
7337
7038
  result->src[2] = v;
7338
- result->src[3] = ggml_new_i32(ctx, masked ? 1 : 0);
7339
7039
 
7340
7040
  return result;
7341
7041
  }
@@ -7359,7 +7059,7 @@ struct ggml_tensor * ggml_flash_ff(
7359
7059
  }
7360
7060
 
7361
7061
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
7362
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, a->ne);
7062
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne);
7363
7063
 
7364
7064
  result->op = GGML_OP_FLASH_FF;
7365
7065
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7425,13 +7125,15 @@ struct ggml_tensor * ggml_flash_attn_back(
7425
7125
 
7426
7126
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7427
7127
 
7128
+ int32_t masked_i = masked ? 1 : 0;
7129
+ ggml_set_op_params(result, &masked_i, sizeof(masked_i));
7130
+
7428
7131
  result->op = GGML_OP_FLASH_ATTN_BACK;
7429
7132
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7430
7133
  result->src[0] = q;
7431
7134
  result->src[1] = k;
7432
7135
  result->src[2] = v;
7433
7136
  result->src[3] = d;
7434
- result->src[4] = ggml_new_i32(ctx, masked ? 1 : 0);
7435
7137
 
7436
7138
  return result;
7437
7139
  }
@@ -7464,21 +7166,12 @@ struct ggml_tensor * ggml_win_part(
7464
7166
 
7465
7167
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7466
7168
 
7467
- ggml_scratch_save(ctx);
7468
-
7469
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7470
-
7471
- ((int32_t *) b->data)[0] = npx;
7472
- ((int32_t *) b->data)[1] = npy;
7473
- ((int32_t *) b->data)[2] = w;
7474
-
7475
- ggml_scratch_load(ctx);
7169
+ int32_t params[] = { npx, npy, w };
7170
+ ggml_set_op_params(result, &params, sizeof(params));
7476
7171
 
7477
7172
  result->op = GGML_OP_WIN_PART;
7478
7173
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7479
7174
  result->src[0] = a;
7480
- result->src[1] = NULL;
7481
- result->src[2] = b;
7482
7175
 
7483
7176
  return result;
7484
7177
  }
@@ -7503,26 +7196,57 @@ struct ggml_tensor * ggml_win_unpart(
7503
7196
  const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
7504
7197
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7505
7198
 
7506
- ggml_scratch_save(ctx);
7199
+ int32_t params[] = { w };
7200
+ ggml_set_op_params(result, &params, sizeof(params));
7507
7201
 
7508
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
7202
+ result->op = GGML_OP_WIN_UNPART;
7203
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7204
+ result->src[0] = a;
7509
7205
 
7510
- ((int32_t *) b->data)[0] = w;
7206
+ return result;
7207
+ }
7511
7208
 
7512
- ggml_scratch_load(ctx);
7209
+ // ggml_unary
7513
7210
 
7514
- result->op = GGML_OP_WIN_UNPART;
7211
+ static struct ggml_tensor * ggml_unary_impl(
7212
+ struct ggml_context * ctx,
7213
+ struct ggml_tensor * a,
7214
+ enum ggml_unary_op op,
7215
+ bool inplace) {
7216
+ bool is_node = false;
7217
+
7218
+ if (!inplace && (a->grad)) {
7219
+ is_node = true;
7220
+ }
7221
+
7222
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7223
+
7224
+ ggml_set_op_params_i32(result, 0, (int32_t) op);
7225
+
7226
+ result->op = GGML_OP_UNARY;
7515
7227
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7516
7228
  result->src[0] = a;
7517
- result->src[1] = NULL;
7518
- result->src[2] = b;
7519
7229
 
7520
7230
  return result;
7521
7231
  }
7522
7232
 
7233
+ struct ggml_tensor * ggml_unary(
7234
+ struct ggml_context * ctx,
7235
+ struct ggml_tensor * a,
7236
+ enum ggml_unary_op op) {
7237
+ return ggml_unary_impl(ctx, a, op, false);
7238
+ }
7239
+
7240
+ struct ggml_tensor * ggml_unary_inplace(
7241
+ struct ggml_context * ctx,
7242
+ struct ggml_tensor * a,
7243
+ enum ggml_unary_op op) {
7244
+ return ggml_unary_impl(ctx, a, op, true);
7245
+ }
7246
+
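With the new GGML_OP_UNARY wrapper, the former per-op constructors funnel through a single entry point; callers select the element-wise function via the enum. A hedged usage sketch (assumes the usual ggml context setup and that GGML_UNARY_OP_RELU and GGML_UNARY_OP_GELU are among the enum values, as the dispatch later in this diff suggests):

  /* sketch only: requires ggml.h and an initialized ggml context `ctx` */
  struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
  struct ggml_tensor * y = ggml_unary(ctx, x, GGML_UNARY_OP_RELU);          /* out-of-place */
  struct ggml_tensor * z = ggml_unary_inplace(ctx, x, GGML_UNARY_OP_GELU);  /* reuses x's buffer */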
7523
7247
  // ggml_map_unary
7524
7248
 
7525
- struct ggml_tensor * ggml_map_unary_impl_f32(
7249
+ static struct ggml_tensor * ggml_map_unary_impl_f32(
7526
7250
  struct ggml_context * ctx,
7527
7251
  struct ggml_tensor * a,
7528
7252
  const ggml_unary_op_f32_t fun,
@@ -7533,19 +7257,13 @@ struct ggml_tensor * ggml_map_unary_impl_f32(
7533
7257
  is_node = true;
7534
7258
  }
7535
7259
 
7536
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7537
-
7538
- ggml_scratch_save(ctx);
7539
-
7540
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7541
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7260
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7542
7261
 
7543
- ggml_scratch_load(ctx);
7262
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7544
7263
 
7545
7264
  result->op = GGML_OP_MAP_UNARY;
7546
7265
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7547
7266
  result->src[0] = a;
7548
- result->src[2] = addr_tensor;
7549
7267
 
7550
7268
  return result;
7551
7269
  }
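Storing the callback this way means op_params holds the raw function-pointer bytes, so the compute side has to memcpy them back out before calling. A small self-contained sketch of that round trip (illustrative names, not the ggml API):

  #include <stdio.h>
  #include <string.h>

  typedef void (*unary_f32_t)(int n, float * dst, const float * src);

  static void negate(int n, float * dst, const float * src) {
      for (int i = 0; i < n; ++i) dst[i] = -src[i];
  }

  int main(void) {
      char op_params[16] = {0};

      unary_f32_t fun = negate;
      memcpy(op_params, &fun, sizeof(fun));            /* store the pointer bytes, as ggml_set_op_params does */

      unary_f32_t fun_back;
      memcpy(&fun_back, op_params, sizeof(fun_back));  /* recover it where the kernel needs it */

      float in[3] = { 1.0f, -2.0f, 3.0f }, out[3];
      fun_back(3, out, in);
      printf("%g %g %g\n", out[0], out[1], out[2]);    /* prints -1 2 -3 */
      return 0;
  }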
@@ -7566,7 +7284,7 @@ struct ggml_tensor * ggml_map_unary_inplace_f32(
7566
7284
 
7567
7285
  // ggml_map_binary
7568
7286
 
7569
- struct ggml_tensor * ggml_map_binary_impl_f32(
7287
+ static struct ggml_tensor * ggml_map_binary_impl_f32(
7570
7288
  struct ggml_context * ctx,
7571
7289
  struct ggml_tensor * a,
7572
7290
  struct ggml_tensor * b,
@@ -7580,20 +7298,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
7580
7298
  is_node = true;
7581
7299
  }
7582
7300
 
7583
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7584
-
7585
- ggml_scratch_save(ctx);
7586
-
7587
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7588
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7301
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7589
7302
 
7590
- ggml_scratch_load(ctx);
7303
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7591
7304
 
7592
7305
  result->op = GGML_OP_MAP_BINARY;
7593
7306
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7594
7307
  result->src[0] = a;
7595
7308
  result->src[1] = b;
7596
- result->src[2] = addr_tensor;
7597
7309
 
7598
7310
  return result;
7599
7311
  }
@@ -7616,7 +7328,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
7616
7328
 
7617
7329
  // ggml_map_custom1
7618
7330
 
7619
- struct ggml_tensor * ggml_map_custom1_impl_f32(
7331
+ static struct ggml_tensor * ggml_map_custom1_impl_f32(
7620
7332
  struct ggml_context * ctx,
7621
7333
  struct ggml_tensor * a,
7622
7334
  const ggml_custom1_op_f32_t fun,
@@ -7627,19 +7339,13 @@ struct ggml_tensor * ggml_map_custom1_impl_f32(
7627
7339
  is_node = true;
7628
7340
  }
7629
7341
 
7630
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7631
-
7632
- ggml_scratch_save(ctx);
7633
-
7634
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7635
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7342
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7636
7343
 
7637
- ggml_scratch_load(ctx);
7344
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7638
7345
 
7639
7346
  result->op = GGML_OP_MAP_CUSTOM1;
7640
7347
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7641
7348
  result->src[0] = a;
7642
- result->src[2] = addr_tensor;
7643
7349
 
7644
7350
  return result;
7645
7351
  }
@@ -7660,7 +7366,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32(
7660
7366
 
7661
7367
  // ggml_map_custom2
7662
7368
 
7663
- struct ggml_tensor * ggml_map_custom2_impl_f32(
7369
+ static struct ggml_tensor * ggml_map_custom2_impl_f32(
7664
7370
  struct ggml_context * ctx,
7665
7371
  struct ggml_tensor * a,
7666
7372
  struct ggml_tensor * b,
@@ -7672,20 +7378,14 @@ struct ggml_tensor * ggml_map_custom2_impl_f32(
7672
7378
  is_node = true;
7673
7379
  }
7674
7380
 
7675
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7676
-
7677
- ggml_scratch_save(ctx);
7678
-
7679
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7680
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7381
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7681
7382
 
7682
- ggml_scratch_load(ctx);
7383
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7683
7384
 
7684
7385
  result->op = GGML_OP_MAP_CUSTOM2;
7685
7386
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7686
7387
  result->src[0] = a;
7687
7388
  result->src[1] = b;
7688
- result->src[2] = addr_tensor;
7689
7389
 
7690
7390
  return result;
7691
7391
  }
@@ -7708,7 +7408,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32(
7708
7408
 
7709
7409
  // ggml_map_custom3
7710
7410
 
7711
- struct ggml_tensor * ggml_map_custom3_impl_f32(
7411
+ static struct ggml_tensor * ggml_map_custom3_impl_f32(
7712
7412
  struct ggml_context * ctx,
7713
7413
  struct ggml_tensor * a,
7714
7414
  struct ggml_tensor * b,
@@ -7721,21 +7421,15 @@ struct ggml_tensor * ggml_map_custom3_impl_f32(
7721
7421
  is_node = true;
7722
7422
  }
7723
7423
 
7724
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7725
-
7726
- ggml_scratch_save(ctx);
7727
-
7728
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7729
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7424
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7730
7425
 
7731
- ggml_scratch_load(ctx);
7426
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7732
7427
 
7733
7428
  result->op = GGML_OP_MAP_CUSTOM3;
7734
7429
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7735
7430
  result->src[0] = a;
7736
7431
  result->src[1] = b;
7737
- result->src[2] = addr_tensor;
7738
- result->src[3] = c;
7432
+ result->src[2] = c;
7739
7433
 
7740
7434
  return result;
7741
7435
  }
@@ -8963,21 +8657,17 @@ static void ggml_compute_forward_acc_f32(
8963
8657
  const struct ggml_compute_params * params,
8964
8658
  const struct ggml_tensor * src0,
8965
8659
  const struct ggml_tensor * src1,
8966
- const struct ggml_tensor * opt0,
8967
8660
  struct ggml_tensor * dst) {
8968
8661
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
8969
8662
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
8970
8663
 
8971
- GGML_ASSERT(opt0->type == GGML_TYPE_I32);
8972
- GGML_ASSERT(ggml_nelements(opt0) == 5);
8973
-
8974
8664
  // view src0 and dst with these strides and data offset in bytes during acc
8975
8665
  // nb0 is implicitly element_size because src0 and dst are contiguous
8976
- size_t nb1 = ((int32_t *) opt0->data)[0];
8977
- size_t nb2 = ((int32_t *) opt0->data)[1];
8978
- size_t nb3 = ((int32_t *) opt0->data)[2];
8979
- size_t offset = ((int32_t *) opt0->data)[3];
8980
- bool inplace = (bool) ((int32_t *) opt0->data)[4];
8666
+ size_t nb1 = ((int32_t *) dst->op_params)[0];
8667
+ size_t nb2 = ((int32_t *) dst->op_params)[1];
8668
+ size_t nb3 = ((int32_t *) dst->op_params)[2];
8669
+ size_t offset = ((int32_t *) dst->op_params)[3];
8670
+ bool inplace = (bool) ((int32_t *) dst->op_params)[4];
8981
8671
 
8982
8672
  if (!inplace && (params->type == GGML_TASK_INIT)) {
8983
8673
  // memcpy needs to be synchronized across threads to avoid race conditions.
@@ -9046,13 +8736,12 @@ static void ggml_compute_forward_acc(
9046
8736
  const struct ggml_compute_params * params,
9047
8737
  const struct ggml_tensor * src0,
9048
8738
  const struct ggml_tensor * src1,
9049
- const struct ggml_tensor * opt0,
9050
8739
  struct ggml_tensor * dst) {
9051
8740
 
9052
8741
  switch (src0->type) {
9053
8742
  case GGML_TYPE_F32:
9054
8743
  {
9055
- ggml_compute_forward_acc_f32(params, src0, src1, opt0, dst);
8744
+ ggml_compute_forward_acc_f32(params, src0, src1, dst);
9056
8745
  } break;
9057
8746
  case GGML_TYPE_F16:
9058
8747
  case GGML_TYPE_Q4_0:
@@ -9484,7 +9173,7 @@ static void ggml_compute_forward_sum_f32(
9484
9173
  for (int64_t i03 = 0; i03 < ne03; i03++) {
9485
9174
  for (int64_t i02 = 0; i02 < ne02; i02++) {
9486
9175
  for (int64_t i01 = 0; i01 < ne01; i01++) {
9487
- ggml_vec_sum_ggf(ne00,
9176
+ ggml_vec_sum_f32_ggf(ne00,
9488
9177
  &row_sum,
9489
9178
  (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
9490
9179
  sum += row_sum;
@@ -9494,6 +9183,38 @@ static void ggml_compute_forward_sum_f32(
9494
9183
  ((float *) dst->data)[0] = sum;
9495
9184
  }
9496
9185
 
9186
+ static void ggml_compute_forward_sum_f16(
9187
+ const struct ggml_compute_params * params,
9188
+ const struct ggml_tensor * src0,
9189
+ struct ggml_tensor * dst) {
9190
+ assert(params->ith == 0);
9191
+ assert(ggml_is_scalar(dst));
9192
+
9193
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9194
+ return;
9195
+ }
9196
+
9197
+ assert(src0->nb[0] == sizeof(ggml_fp16_t));
9198
+
9199
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
9200
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb);
9201
+
9202
+ float sum = 0;
9203
+ float row_sum = 0;
9204
+
9205
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
9206
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
9207
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
9208
+ ggml_vec_sum_f16_ggf(ne00,
9209
+ &row_sum,
9210
+ (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
9211
+ sum += row_sum;
9212
+ }
9213
+ }
9214
+ }
9215
+ ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
9216
+ }
9217
+
9497
9218
  static void ggml_compute_forward_sum(
9498
9219
  const struct ggml_compute_params * params,
9499
9220
  const struct ggml_tensor * src0,
@@ -9503,6 +9224,10 @@ static void ggml_compute_forward_sum(
9503
9224
  {
9504
9225
  ggml_compute_forward_sum_f32(params, src0, dst);
9505
9226
  } break;
9227
+ case GGML_TYPE_F16:
9228
+ {
9229
+ ggml_compute_forward_sum_f16(params, src0, dst);
9230
+ } break;
9506
9231
  default:
9507
9232
  {
9508
9233
  GGML_ASSERT(false);
@@ -10098,8 +9823,8 @@ static void ggml_compute_forward_gelu_f32(
10098
9823
  const struct ggml_compute_params * params,
10099
9824
  const struct ggml_tensor * src0,
10100
9825
  struct ggml_tensor * dst) {
10101
- GGML_ASSERT(ggml_is_contiguous(src0));
10102
- GGML_ASSERT(ggml_is_contiguous(dst));
9826
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9827
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10103
9828
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10104
9829
 
10105
9830
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -10157,8 +9882,8 @@ static void ggml_compute_forward_gelu_quick_f32(
10157
9882
  const struct ggml_compute_params * params,
10158
9883
  const struct ggml_tensor * src0,
10159
9884
  struct ggml_tensor * dst) {
10160
- GGML_ASSERT(ggml_is_contiguous(src0));
10161
- GGML_ASSERT(ggml_is_contiguous(dst));
9885
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9886
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10162
9887
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10163
9888
 
10164
9889
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -10216,8 +9941,8 @@ static void ggml_compute_forward_silu_f32(
10216
9941
  const struct ggml_compute_params * params,
10217
9942
  const struct ggml_tensor * src0,
10218
9943
  struct ggml_tensor * dst) {
10219
- GGML_ASSERT(ggml_is_contiguous(src0));
10220
- GGML_ASSERT(ggml_is_contiguous(dst));
9944
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9945
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10221
9946
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10222
9947
 
10223
9948
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -10269,7 +9994,6 @@ static void ggml_compute_forward_silu(
10269
9994
  }
10270
9995
  }
10271
9996
 
10272
-
10273
9997
  // ggml_compute_forward_silu_back
10274
9998
 
10275
9999
  static void ggml_compute_forward_silu_back_f32(
@@ -10277,9 +10001,9 @@ static void ggml_compute_forward_silu_back_f32(
10277
10001
  const struct ggml_tensor * src0,
10278
10002
  const struct ggml_tensor * grad,
10279
10003
  struct ggml_tensor * dst) {
10280
- GGML_ASSERT(ggml_is_contiguous(grad));
10281
- GGML_ASSERT(ggml_is_contiguous(src0));
10282
- GGML_ASSERT(ggml_is_contiguous(dst));
10004
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
10005
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
10006
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10283
10007
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10284
10008
  GGML_ASSERT(ggml_are_same_shape(src0, grad));
10285
10009
 
@@ -10419,7 +10143,8 @@ static void ggml_compute_forward_rms_norm_f32(
10419
10143
 
10420
10144
  GGML_TENSOR_UNARY_OP_LOCALS;
10421
10145
 
10422
- const float eps = 1e-6f; // TODO: make this a parameter
10146
+ float eps;
10147
+ memcpy(&eps, dst->op_params, sizeof(float));
10423
10148
 
10424
10149
  // TODO: optimize
10425
10150
  for (int64_t i03 = 0; i03 < ne03; i03++) {
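eps now travels through op_params instead of being hard-coded. For context, it is the stabilizer added to the mean of squares before taking the reciprocal square root; a simplified single-row version of the computation it feeds into (assumption: untiled, single-threaded sketch, not the actual kernel):

  #include <math.h>

  static void rms_norm_row(const float * x, float * y, int n, float eps) {
      double mean_sq = 0.0;
      for (int i = 0; i < n; ++i) {
          mean_sq += (double) x[i] * x[i];
      }
      mean_sq /= n;

      const float scale = 1.0f / sqrtf((float) mean_sq + eps);  /* eps keeps this finite for all-zero rows */
      for (int i = 0; i < n; ++i) {
          y[i] = x[i] * scale;
      }
  }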
@@ -10684,6 +10409,8 @@ static void ggml_compute_forward_mul_mat(
10684
10409
 
10685
10410
  const enum ggml_type type = src0->type;
10686
10411
 
10412
+ const bool src1_cont = ggml_is_contiguous(src1);
10413
+
10687
10414
  ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
10688
10415
  enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
10689
10416
  ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
@@ -10747,7 +10474,7 @@ static void ggml_compute_forward_mul_mat(
10747
10474
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
10748
10475
 
10749
10476
  if (type != GGML_TYPE_F32) {
10750
- float * const wdata = params->wdata;
10477
+ float * const wdata = params->wdata;
10751
10478
  ggml_to_float_t const to_float = type_traits[type].to_float;
10752
10479
 
10753
10480
  size_t id = 0;
@@ -10805,7 +10532,7 @@ static void ggml_compute_forward_mul_mat(
10805
10532
  // src1 rows
10806
10533
  const int64_t nr1 = ne11*ne12*ne13;
10807
10534
 
10808
- void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
10535
+ const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
10809
10536
  const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
10810
10537
 
10811
10538
  for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
@@ -10828,7 +10555,15 @@ static void ggml_compute_forward_mul_mat(
10828
10555
  const int64_t i3 = i13;
10829
10556
 
10830
10557
  const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
10831
- const char * src1_col = (const char *) wdata + (i11 + i12*ne11 + i13*ne12*ne11)*row_size;
10558
+
10559
+ // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
10560
+ // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
10561
+ // the original src1 data pointer, so we should index using the indices directly
10562
+ // TODO: this is a bit of a hack, we should probably have a better way to handle this
10563
+ const char * src1_col = (const char *) wdata +
10564
+ (src1_cont || src1->type != vec_dot_type
10565
+ ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
10566
+ : (i11*nb11 + i12*nb12 + i13*nb13));
10832
10567
 
10833
10568
  float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
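The comment above is the heart of the fix: rows packed into wdata are addressed densely by row index, while an unconverted, non-contiguous src1 has to be addressed through its own byte strides. A compact restatement of the two cases (illustrative helper, not part of ggml):

  #include <stddef.h>
  #include <stdint.h>

  /* returns the byte offset of row (i11, i12, i13) of src1 within the chosen base pointer */
  static size_t src1_row_offset(int64_t i11, int64_t i12, int64_t i13,
                                int64_t ne11, int64_t ne12, size_t row_size,
                                size_t nb11, size_t nb12, size_t nb13,
                                int packed /* data copied to wdata, or already in the dot-product type */) {
      if (packed) {
          /* dense layout: consecutive rows of row_size bytes, i11 fastest */
          return (size_t)(i11 + i12*ne11 + i13*ne12*ne11) * row_size;
      }
      /* original tensor layout: honor its byte strides */
      return (size_t)i11*nb11 + (size_t)i12*nb12 + (size_t)i13*nb13;
  }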
10834
10569
 
@@ -11062,21 +10797,17 @@ static void ggml_compute_forward_set_f32(
11062
10797
  const struct ggml_compute_params * params,
11063
10798
  const struct ggml_tensor * src0,
11064
10799
  const struct ggml_tensor * src1,
11065
- const struct ggml_tensor * opt0,
11066
10800
  struct ggml_tensor * dst) {
11067
10801
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
11068
10802
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
11069
10803
 
11070
- GGML_ASSERT(opt0->type == GGML_TYPE_I32);
11071
- GGML_ASSERT(ggml_nelements(opt0) == 5);
11072
-
11073
10804
  // view src0 and dst with these strides and data offset in bytes during set
11074
10805
  // nb0 is implicitly element_size because src0 and dst are contiguous
11075
- size_t nb1 = ((int32_t *) opt0->data)[0];
11076
- size_t nb2 = ((int32_t *) opt0->data)[1];
11077
- size_t nb3 = ((int32_t *) opt0->data)[2];
11078
- size_t offset = ((int32_t *) opt0->data)[3];
11079
- bool inplace = (bool) ((int32_t *) opt0->data)[4];
10806
+ size_t nb1 = ((int32_t *) dst->op_params)[0];
10807
+ size_t nb2 = ((int32_t *) dst->op_params)[1];
10808
+ size_t nb3 = ((int32_t *) dst->op_params)[2];
10809
+ size_t offset = ((int32_t *) dst->op_params)[3];
10810
+ bool inplace = (bool) ((int32_t *) dst->op_params)[4];
11080
10811
 
11081
10812
  if (!inplace && (params->type == GGML_TASK_INIT)) {
11082
10813
  // memcpy needs to be synchronized across threads to avoid race conditions.
@@ -11136,13 +10867,12 @@ static void ggml_compute_forward_set(
11136
10867
  const struct ggml_compute_params * params,
11137
10868
  const struct ggml_tensor * src0,
11138
10869
  const struct ggml_tensor * src1,
11139
- const struct ggml_tensor * opt0,
11140
10870
  struct ggml_tensor * dst) {
11141
10871
 
11142
10872
  switch (src0->type) {
11143
10873
  case GGML_TYPE_F32:
11144
10874
  {
11145
- ggml_compute_forward_set_f32(params, src0, src1, opt0, dst);
10875
+ ggml_compute_forward_set_f32(params, src0, src1, dst);
11146
10876
  } break;
11147
10877
  case GGML_TYPE_F16:
11148
10878
  case GGML_TYPE_Q4_0:
@@ -11538,17 +11268,14 @@ static void ggml_compute_forward_diag(
11538
11268
  static void ggml_compute_forward_diag_mask_f32(
11539
11269
  const struct ggml_compute_params * params,
11540
11270
  const struct ggml_tensor * src0,
11541
- const struct ggml_tensor * src1,
11542
11271
  struct ggml_tensor * dst,
11543
11272
  const float value) {
11544
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
11545
- GGML_ASSERT(ggml_nelements(src1) == 2);
11546
11273
 
11547
11274
  const int ith = params->ith;
11548
11275
  const int nth = params->nth;
11549
11276
 
11550
- const int n_past = ((int32_t *) src1->data)[0];
11551
- const bool inplace = (bool)((int32_t *) src1->data)[1];
11277
+ const int n_past = ((int32_t *) dst->op_params)[0];
11278
+ const bool inplace = (bool)((int32_t *) dst->op_params)[1];
11552
11279
 
11553
11280
  GGML_ASSERT(n_past >= 0);
11554
11281
 
@@ -11591,12 +11318,11 @@ static void ggml_compute_forward_diag_mask_f32(
11591
11318
  static void ggml_compute_forward_diag_mask_inf(
11592
11319
  const struct ggml_compute_params * params,
11593
11320
  const struct ggml_tensor * src0,
11594
- const struct ggml_tensor * src1,
11595
11321
  struct ggml_tensor * dst) {
11596
11322
  switch (src0->type) {
11597
11323
  case GGML_TYPE_F32:
11598
11324
  {
11599
- ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, -INFINITY);
11325
+ ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY);
11600
11326
  } break;
11601
11327
  default:
11602
11328
  {
@@ -11608,12 +11334,11 @@ static void ggml_compute_forward_diag_mask_inf(
11608
11334
  static void ggml_compute_forward_diag_mask_zero(
11609
11335
  const struct ggml_compute_params * params,
11610
11336
  const struct ggml_tensor * src0,
11611
- const struct ggml_tensor * src1,
11612
11337
  struct ggml_tensor * dst) {
11613
11338
  switch (src0->type) {
11614
11339
  case GGML_TYPE_F32:
11615
11340
  {
11616
- ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, 0);
11341
+ ggml_compute_forward_diag_mask_f32(params, src0, dst, 0);
11617
11342
  } break;
11618
11343
  default:
11619
11344
  {
@@ -11811,20 +11536,17 @@ static void ggml_compute_forward_soft_max_back(
11811
11536
  static void ggml_compute_forward_alibi_f32(
11812
11537
  const struct ggml_compute_params * params,
11813
11538
  const struct ggml_tensor * src0,
11814
- const struct ggml_tensor * src1,
11815
11539
  struct ggml_tensor * dst) {
11816
11540
  assert(params->ith == 0);
11817
11541
 
11818
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
11819
- GGML_ASSERT(ggml_nelements(src1) == 3);
11820
-
11821
11542
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11822
11543
  return;
11823
11544
  }
11824
11545
 
11825
- const int n_past = ((int32_t *) src1->data)[0];
11826
- const int n_head = ((int32_t *) src1->data)[1];
11827
- const float max_bias = ((float *) src1->data)[2];
11546
+ const int n_past = ((int32_t *) dst->op_params)[0];
11547
+ const int n_head = ((int32_t *) dst->op_params)[1];
11548
+ float max_bias;
11549
+ memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
11828
11550
 
11829
11551
  assert(n_past >= 0);
11830
11552
 
@@ -11877,20 +11599,17 @@ static void ggml_compute_forward_alibi_f32(
11877
11599
  static void ggml_compute_forward_alibi_f16(
11878
11600
  const struct ggml_compute_params * params,
11879
11601
  const struct ggml_tensor * src0,
11880
- const struct ggml_tensor * src1,
11881
11602
  struct ggml_tensor * dst) {
11882
11603
  assert(params->ith == 0);
11883
11604
 
11884
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
11885
- GGML_ASSERT(ggml_nelements(src1) == 3);
11886
-
11887
11605
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11888
11606
  return;
11889
11607
  }
11890
11608
 
11891
- const int n_past = ((int32_t *) src1->data)[0];
11892
- const int n_head = ((int32_t *) src1->data)[1];
11893
- const float max_bias = ((float *) src1->data)[2];
11609
+ const int n_past = ((int32_t *) dst->op_params)[0];
11610
+ const int n_head = ((int32_t *) dst->op_params)[1];
11611
+ float max_bias;
11612
+ memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
11894
11613
 
11895
11614
  assert(n_past >= 0);
11896
11615
 
@@ -11943,16 +11662,15 @@ static void ggml_compute_forward_alibi_f16(
11943
11662
  static void ggml_compute_forward_alibi(
11944
11663
  const struct ggml_compute_params * params,
11945
11664
  const struct ggml_tensor * src0,
11946
- const struct ggml_tensor * src1,
11947
11665
  struct ggml_tensor * dst) {
11948
11666
  switch (src0->type) {
11949
11667
  case GGML_TYPE_F16:
11950
11668
  {
11951
- ggml_compute_forward_alibi_f16(params, src0, src1, dst);
11669
+ ggml_compute_forward_alibi_f16(params, src0, dst);
11952
11670
  } break;
11953
11671
  case GGML_TYPE_F32:
11954
11672
  {
11955
- ggml_compute_forward_alibi_f32(params, src0, src1, dst);
11673
+ ggml_compute_forward_alibi_f32(params, src0, dst);
11956
11674
  } break;
11957
11675
  case GGML_TYPE_Q4_0:
11958
11676
  case GGML_TYPE_Q4_1:
@@ -11982,19 +11700,17 @@ static void ggml_compute_forward_alibi(
11982
11700
  static void ggml_compute_forward_clamp_f32(
11983
11701
  const struct ggml_compute_params * params,
11984
11702
  const struct ggml_tensor * src0,
11985
- const struct ggml_tensor * src1,
11986
11703
  struct ggml_tensor * dst) {
11987
11704
  assert(params->ith == 0);
11988
11705
 
11989
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11990
- GGML_ASSERT(ggml_nelements(src1) == 2);
11991
-
11992
11706
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11993
11707
  return;
11994
11708
  }
11995
11709
 
11996
- const float min = ((float *) src1->data)[0];
11997
- const float max = ((float *) src1->data)[1];
11710
+ float min;
11711
+ float max;
11712
+ memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
11713
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
11998
11714
 
11999
11715
  const int ith = params->ith;
12000
11716
  const int nth = params->nth;
@@ -12024,12 +11740,11 @@ static void ggml_compute_forward_clamp_f32(
12024
11740
  static void ggml_compute_forward_clamp(
12025
11741
  const struct ggml_compute_params * params,
12026
11742
  const struct ggml_tensor * src0,
12027
- const struct ggml_tensor * src1,
12028
11743
  struct ggml_tensor * dst) {
12029
11744
  switch (src0->type) {
12030
11745
  case GGML_TYPE_F32:
12031
11746
  {
12032
- ggml_compute_forward_clamp_f32(params, src0, src1, dst);
11747
+ ggml_compute_forward_clamp_f32(params, src0, dst);
12033
11748
  } break;
12034
11749
  case GGML_TYPE_F16:
12035
11750
  case GGML_TYPE_Q4_0:
@@ -12059,19 +11774,21 @@ static void ggml_compute_forward_clamp(
12059
11774
  static void ggml_compute_forward_rope_f32(
12060
11775
  const struct ggml_compute_params * params,
12061
11776
  const struct ggml_tensor * src0,
12062
- const struct ggml_tensor * src1,
12063
11777
  struct ggml_tensor * dst) {
12064
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
12065
- GGML_ASSERT(ggml_nelements(src1) == 4);
12066
11778
 
12067
11779
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12068
11780
  return;
12069
11781
  }
12070
11782
 
12071
- const int n_past = ((int32_t *) src1->data)[0];
12072
- const int n_dims = ((int32_t *) src1->data)[1];
12073
- const int mode = ((int32_t *) src1->data)[2];
12074
- const int n_ctx = ((int32_t *) src1->data)[3];
11783
+ float freq_base;
11784
+ float freq_scale;
11785
+
11786
+ const int n_past = ((int32_t *) dst->op_params)[0];
11787
+ const int n_dims = ((int32_t *) dst->op_params)[1];
11788
+ const int mode = ((int32_t *) dst->op_params)[2];
11789
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
11790
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
11791
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
12075
11792
 
12076
11793
  assert(n_past >= 0);
12077
11794
 
@@ -12100,7 +11817,7 @@ static void ggml_compute_forward_rope_f32(
12100
11817
  // row index used to determine which thread to use
12101
11818
  int ir = 0;
12102
11819
 
12103
- const float theta_scale = powf(10000.0, -2.0f/n_dims);
11820
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
12104
11821
 
12105
11822
  const bool is_neox = mode & 2;
12106
11823
  const bool is_glm = mode & 4;
@@ -12112,7 +11829,7 @@ static void ggml_compute_forward_rope_f32(
12112
11829
  if (ir++ < ir0) continue;
12113
11830
  if (ir > ir1) break;
12114
11831
 
12115
- float theta = (float)p;
11832
+ float theta = freq_scale * (float)p;
12116
11833
 
12117
11834
  if (is_glm) {
12118
11835
  theta = MIN(p, n_ctx - 2);
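freq_base and freq_scale are the two new floats read from op_params above: the base of the geometric frequency ladder (previously the literal 10000.0) and a linear scaling of the position. A simplified sketch of how the per-pair rotation angles are derived from them (non-NeoX, non-GLM path; illustrative only):

  #include <math.h>

  /* writes one angle per (x0, x1) pair of a head of size n_dims */
  static void rope_angles(float * theta_out, int n_dims, int p, float freq_base, float freq_scale) {
      const float theta_scale = powf(freq_base, -2.0f/n_dims);
      float theta = freq_scale * (float) p;   /* scaled position */
      for (int i0 = 0; i0 < n_dims; i0 += 2) {
          theta_out[i0/2] = theta;
          theta *= theta_scale;               /* lower the frequency for the next pair */
      }
  }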
@@ -12186,19 +11903,21 @@ static void ggml_compute_forward_rope_f32(
12186
11903
  static void ggml_compute_forward_rope_f16(
12187
11904
  const struct ggml_compute_params * params,
12188
11905
  const struct ggml_tensor * src0,
12189
- const struct ggml_tensor * src1,
12190
11906
  struct ggml_tensor * dst) {
12191
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
12192
- GGML_ASSERT(ggml_nelements(src1) == 4);
12193
11907
 
12194
11908
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12195
11909
  return;
12196
11910
  }
12197
11911
 
12198
- const int n_past = ((int32_t *) src1->data)[0];
12199
- const int n_dims = ((int32_t *) src1->data)[1];
12200
- const int mode = ((int32_t *) src1->data)[2];
12201
- const int n_ctx = ((int32_t *) src1->data)[3];
11912
+ float freq_base;
11913
+ float freq_scale;
11914
+
11915
+ const int n_past = ((int32_t *) dst->op_params)[0];
11916
+ const int n_dims = ((int32_t *) dst->op_params)[1];
11917
+ const int mode = ((int32_t *) dst->op_params)[2];
11918
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
11919
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
11920
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
12202
11921
 
12203
11922
  assert(n_past >= 0);
12204
11923
 
@@ -12227,7 +11946,7 @@ static void ggml_compute_forward_rope_f16(
12227
11946
  // row index used to determine which thread to use
12228
11947
  int ir = 0;
12229
11948
 
12230
- const float theta_scale = powf(10000.0, -2.0f/n_dims);
11949
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
12231
11950
 
12232
11951
  const bool is_neox = mode & 2;
12233
11952
  const bool is_glm = mode & 4;
@@ -12239,7 +11958,7 @@ static void ggml_compute_forward_rope_f16(
12239
11958
  if (ir++ < ir0) continue;
12240
11959
  if (ir > ir1) break;
12241
11960
 
12242
- float theta = (float)p;
11961
+ float theta = freq_scale * (float)p;
12243
11962
 
12244
11963
  if (is_glm) {
12245
11964
  theta = MIN(p, n_ctx - 2);
@@ -12300,7 +12019,7 @@ static void ggml_compute_forward_rope_f16(
12300
12019
  const float x0 = GGML_FP16_TO_FP32(src[0]);
12301
12020
  const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
12302
12021
 
12303
- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
12022
+ dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
12304
12023
  dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
12305
12024
  }
12306
12025
  }
@@ -12313,16 +12032,15 @@ static void ggml_compute_forward_rope_f16(
12313
12032
  static void ggml_compute_forward_rope(
12314
12033
  const struct ggml_compute_params * params,
12315
12034
  const struct ggml_tensor * src0,
12316
- const struct ggml_tensor * src1,
12317
12035
  struct ggml_tensor * dst) {
12318
12036
  switch (src0->type) {
12319
12037
  case GGML_TYPE_F16:
12320
12038
  {
12321
- ggml_compute_forward_rope_f16(params, src0, src1, dst);
12039
+ ggml_compute_forward_rope_f16(params, src0, dst);
12322
12040
  } break;
12323
12041
  case GGML_TYPE_F32:
12324
12042
  {
12325
- ggml_compute_forward_rope_f32(params, src0, src1, dst);
12043
+ ggml_compute_forward_rope_f32(params, src0, dst);
12326
12044
  } break;
12327
12045
  default:
12328
12046
  {
@@ -12336,10 +12054,7 @@ static void ggml_compute_forward_rope(
12336
12054
  static void ggml_compute_forward_rope_back_f32(
12337
12055
  const struct ggml_compute_params * params,
12338
12056
  const struct ggml_tensor * src0,
12339
- const struct ggml_tensor * src1,
12340
12057
  struct ggml_tensor * dst) {
12341
- assert(src1->type == GGML_TYPE_I32);
12342
- assert(ggml_nelements(src1) == 3);
12343
12058
 
12344
12059
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12345
12060
  return;
@@ -12349,9 +12064,9 @@ static void ggml_compute_forward_rope_back_f32(
12349
12064
  // dx = rope_back(dy)
12350
12065
  // src0 is dy; the rope options are now read from dst->op_params
12351
12066
 
12352
- const int n_past = ((int32_t *) src1->data)[0];
12353
- const int n_dims = ((int32_t *) src1->data)[1];
12354
- const int mode = ((int32_t *) src1->data)[2];
12067
+ const int n_past = ((int32_t *) dst->op_params)[0];
12068
+ const int n_dims = ((int32_t *) dst->op_params)[1];
12069
+ const int mode = ((int32_t *) dst->op_params)[2];
12355
12070
 
12356
12071
  assert(n_past >= 0);
12357
12072
 
@@ -12435,10 +12150,7 @@ static void ggml_compute_forward_rope_back_f32(
12435
12150
  static void ggml_compute_forward_rope_back_f16(
12436
12151
  const struct ggml_compute_params * params,
12437
12152
  const struct ggml_tensor * src0,
12438
- const struct ggml_tensor * src1,
12439
12153
  struct ggml_tensor * dst) {
12440
- assert(src1->type == GGML_TYPE_I32);
12441
- assert(ggml_nelements(src1) == 3);
12442
12154
 
12443
12155
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12444
12156
  return;
@@ -12448,9 +12160,9 @@ static void ggml_compute_forward_rope_back_f16(
12448
12160
  // dx = rope_back(dy, src1)
12449
12161
  // src0 is dy, src1 contains options
12450
12162
 
12451
- const int n_past = ((int32_t *) src1->data)[0];
12452
- const int n_dims = ((int32_t *) src1->data)[1];
12453
- const int mode = ((int32_t *) src1->data)[2];
12163
+ const int n_past = ((int32_t *) dst->op_params)[0];
12164
+ const int n_dims = ((int32_t *) dst->op_params)[1];
12165
+ const int mode = ((int32_t *) dst->op_params)[2];
12454
12166
 
12455
12167
  assert(n_past >= 0);
12456
12168
 
@@ -12534,16 +12246,15 @@ static void ggml_compute_forward_rope_back_f16(
12534
12246
  static void ggml_compute_forward_rope_back(
12535
12247
  const struct ggml_compute_params * params,
12536
12248
  const struct ggml_tensor * src0,
12537
- const struct ggml_tensor * src1,
12538
12249
  struct ggml_tensor * dst) {
12539
12250
  switch (src0->type) {
12540
12251
  case GGML_TYPE_F16:
12541
12252
  {
12542
- ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
12253
+ ggml_compute_forward_rope_back_f16(params, src0, dst);
12543
12254
  } break;
12544
12255
  case GGML_TYPE_F32:
12545
12256
  {
12546
- ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
12257
+ ggml_compute_forward_rope_back_f32(params, src0, dst);
12547
12258
  } break;
12548
12259
  default:
12549
12260
  {
@@ -12740,7 +12451,7 @@ static void ggml_compute_forward_conv_1d_s1_ph(
12740
12451
  const struct ggml_compute_params * params,
12741
12452
  const struct ggml_tensor * src0,
12742
12453
  const struct ggml_tensor * src1,
12743
- struct ggml_tensor * dst) {
12454
+ struct ggml_tensor * dst) {
12744
12455
  switch (src0->type) {
12745
12456
  case GGML_TYPE_F16:
12746
12457
  {
@@ -12943,7 +12654,7 @@ static void ggml_compute_forward_conv_1d_s2_ph(
12943
12654
  const struct ggml_compute_params * params,
12944
12655
  const struct ggml_tensor * src0,
12945
12656
  const struct ggml_tensor * src1,
12946
- struct ggml_tensor * dst) {
12657
+ struct ggml_tensor * dst) {
12947
12658
  switch (src0->type) {
12948
12659
  case GGML_TYPE_F16:
12949
12660
  {
@@ -12963,14 +12674,13 @@ static void ggml_compute_forward_conv_1d_s2_ph(
12963
12674
  // ggml_compute_forward_conv_1d
12964
12675
 
12965
12676
  static void ggml_compute_forward_conv_1d(
12966
- const struct ggml_compute_params * params,
12967
- const struct ggml_tensor * src0,
12968
- const struct ggml_tensor * src1,
12969
- const struct ggml_tensor * opt0,
12970
- struct ggml_tensor * dst) {
12971
- const int32_t s0 = ((const int32_t*)(opt0->data))[0];
12972
- const int32_t p0 = ((const int32_t*)(opt0->data))[1];
12973
- const int32_t d0 = ((const int32_t*)(opt0->data))[2];
12677
+ const struct ggml_compute_params * params,
12678
+ const struct ggml_tensor * src0,
12679
+ const struct ggml_tensor * src1,
12680
+ struct ggml_tensor * dst) {
12681
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12682
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
12683
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
12974
12684
  GGML_ASSERT(d0 == 1); // dilation not supported
12975
12685
  GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported
12976
12686
  if (s0 == 1) {
@@ -12982,9 +12692,9 @@ static void ggml_compute_forward_conv_1d(
12982
12692
  };
12983
12693
  }
12984
12694
 
12985
- // ggml_compute_forward_conv_2d_sk_p0
12695
+ // ggml_compute_forward_conv_2d
12986
12696
 
12987
- static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
12697
+ static void ggml_compute_forward_conv_2d_f16_f32(
12988
12698
  const struct ggml_compute_params * params,
12989
12699
  const struct ggml_tensor * src0,
12990
12700
  const struct ggml_tensor * src1,
@@ -13007,28 +12717,37 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
13007
12717
  // size of the convolution row - the kernel size unrolled across all channels
13008
12718
  const int ew0 = nk0*nk1*ne02;
13009
12719
 
12720
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12721
+ const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
12722
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
12723
+ const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
12724
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
12725
+ const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
12726
+
13010
12727
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13011
12728
  GGML_ASSERT(nb10 == sizeof(float));
13012
12729
 
13013
12730
  if (params->type == GGML_TASK_INIT) {
13014
- // TODO: fix this memset (wsize is overestimated)
13015
12731
  memset(params->wdata, 0, params->wsize);
13016
12732
 
13017
12733
  // prepare source data (src1)
13018
12734
  {
13019
12735
  ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13020
12736
 
13021
- for (int i13 = 0; i13 < ne13; i13++) {
13022
- for (int i12 = 0; i12 < ne12; i12++) {
13023
- const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12);
13024
- ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0);
12737
+ for (int i12 = 0; i12 < ne12; i12++) {
12738
+ const float * const src = (float *)((char *) src1->data + i12*nb12);
12739
+ ggml_fp16_t * dst_data = wdata;
12740
+
12741
+ for (int i1 = 0; i1 < ne1; i1++) {
12742
+ for (int i0 = 0; i0 < ne0; i0++) {
12743
+ for (int ik1 = 0; ik1 < nk1; ik1++) {
12744
+ for (int ik0 = 0; ik0 < nk0; ik0++) {
12745
+ const int idx0 = i0*s0 + ik0*d0 - p0;
12746
+ const int idx1 = i1*s1 + ik1*d1 - p1;
13025
12747
 
13026
- for (int i1 = 0; i1 < ne1; i1++) {
13027
- for (int i0 = 0; i0 < ne0; i0++) {
13028
- for (int ik1 = 0; ik1 < nk1; ik1++) {
13029
- for (int ik0 = 0; ik0 < nk0; ik0++) {
12748
+ if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
13030
12749
  dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
13031
- GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
12750
+ GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
13032
12751
  }
13033
12752
  }
13034
12753
  }
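The rewritten unrolling above generalizes the old stride-equals-kernel case: each output coordinate now samples the input at i*stride + k*dilation - padding, and out-of-range taps are simply left as the zeros the wdata buffer was memset to. A one-dimensional illustration of that index mapping (hypothetical helper, not ggml code):

  /* returns the padded sample for output position i and kernel tap k */
  static float sample_padded(const float * src, int n,
                             int i, int k, int s /*stride*/, int p /*pad*/, int d /*dilation*/) {
      const int idx = i*s + k*d - p;
      if (idx < 0 || idx >= n) {
          return 0.0f;   /* outside the input: contributes zero, like the zeroed wdata */
      }
      return src[idx];
  }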
@@ -13071,19 +12790,19 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
13071
12790
  }
13072
12791
  }
13073
12792
 
13074
- static void ggml_compute_forward_conv_2d_sk_p0(
12793
+ static void ggml_compute_forward_conv_2d(
13075
12794
  const struct ggml_compute_params * params,
13076
12795
  const struct ggml_tensor * src0,
13077
12796
  const struct ggml_tensor * src1,
13078
- struct ggml_tensor * dst) {
12797
+ struct ggml_tensor * dst) {
13079
12798
  switch (src0->type) {
13080
12799
  case GGML_TYPE_F16:
13081
12800
  {
13082
- ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
12801
+ ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst);
13083
12802
  } break;
13084
12803
  case GGML_TYPE_F32:
13085
12804
  {
13086
- //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
12805
+ //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
13087
12806
  GGML_ASSERT(false);
13088
12807
  } break;
13089
12808
  default:
@@ -13093,32 +12812,6 @@ static void ggml_compute_forward_conv_2d_sk_p0(
13093
12812
  }
13094
12813
  }
13095
12814
 
13096
- // ggml_compute_forward_conv_2d
13097
-
13098
- static void ggml_compute_forward_conv_2d(
13099
- const struct ggml_compute_params* params,
13100
- const struct ggml_tensor* src0,
13101
- const struct ggml_tensor* src1,
13102
- const struct ggml_tensor* opt0,
13103
- struct ggml_tensor* dst) {
13104
- const int32_t s0 = ((const int32_t*)(opt0->data))[0];
13105
- const int32_t s1 = ((const int32_t*)(opt0->data))[1];
13106
- const int32_t p0 = ((const int32_t*)(opt0->data))[2];
13107
- const int32_t p1 = ((const int32_t*)(opt0->data))[3];
13108
- const int32_t d0 = ((const int32_t*)(opt0->data))[4];
13109
- const int32_t d1 = ((const int32_t*)(opt0->data))[5];
13110
- GGML_ASSERT(d0 == 1); // dilation not supported
13111
- GGML_ASSERT(d1 == 1);
13112
- GGML_ASSERT(p0 == 0); // padding not supported
13113
- GGML_ASSERT(p1 == 0);
13114
-
13115
- if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
13116
- ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
13117
- } else {
13118
- GGML_ASSERT(false); // only stride equal to kernel size is supported
13119
- }
13120
- }
13121
-
13122
12815
  // ggml_compute_forward_pool_1d_sk_p0
13123
12816
 
13124
12817
  static void ggml_compute_forward_pool_1d_sk_p0(
@@ -13174,12 +12867,11 @@ static void ggml_compute_forward_pool_1d_sk_p0(
13174
12867
  // ggml_compute_forward_pool_1d
13175
12868
 
13176
12869
  static void ggml_compute_forward_pool_1d(
13177
- const struct ggml_compute_params* params,
13178
- const struct ggml_tensor* src0,
13179
- const struct ggml_tensor* opt0,
13180
- struct ggml_tensor* dst) {
13181
- GGML_ASSERT(opt0->ne[0] == 4);
13182
- const int* opts = (const int*)opt0->data;
12870
+ const struct ggml_compute_params * params,
12871
+ const struct ggml_tensor * src0,
12872
+ struct ggml_tensor * dst) {
12873
+
12874
+ const int32_t* opts = (const int32_t*)dst->op_params;
13183
12875
  enum ggml_op_pool op = opts[0];
13184
12876
  const int k0 = opts[1];
13185
12877
  const int s0 = opts[2];
@@ -13193,12 +12885,12 @@ static void ggml_compute_forward_pool_1d(
13193
12885
  // ggml_compute_forward_pool_2d_sk_p0
13194
12886
 
13195
12887
  static void ggml_compute_forward_pool_2d_sk_p0(
13196
- const struct ggml_compute_params * params,
13197
- const enum ggml_op_pool op,
13198
- const struct ggml_tensor * src,
13199
- const int k0,
13200
- const int k1,
13201
- struct ggml_tensor * dst) {
12888
+ const struct ggml_compute_params * params,
12889
+ const enum ggml_op_pool op,
12890
+ const struct ggml_tensor * src,
12891
+ const int k0,
12892
+ const int k1,
12893
+ struct ggml_tensor * dst) {
13202
12894
  assert(src->type == GGML_TYPE_F32);
13203
12895
  assert(params->ith == 0);
13204
12896
 
@@ -13258,12 +12950,11 @@ static void ggml_compute_forward_pool_2d_sk_p0(
13258
12950
  // ggml_compute_forward_pool_2d
13259
12951
 
13260
12952
  static void ggml_compute_forward_pool_2d(
13261
- const struct ggml_compute_params * params,
13262
- const struct ggml_tensor * src0,
13263
- const struct ggml_tensor * opt0,
13264
- struct ggml_tensor * dst) {
13265
- GGML_ASSERT(opt0->ne[0] == 7);
13266
- const int* opts = (const int*)opt0->data;
12953
+ const struct ggml_compute_params * params,
12954
+ const struct ggml_tensor * src0,
12955
+ struct ggml_tensor * dst) {
12956
+
12957
+ const int32_t * opts = (const int32_t *)dst->op_params;
13267
12958
  enum ggml_op_pool op = opts[0];
13268
12959
  const int k0 = opts[1];
13269
12960
  const int k1 = opts[2];
@@ -13288,7 +12979,7 @@ static void ggml_compute_forward_flash_attn_f32(
13288
12979
  const struct ggml_tensor * k,
13289
12980
  const struct ggml_tensor * v,
13290
12981
  const bool masked,
13291
- struct ggml_tensor * dst) {
12982
+ struct ggml_tensor * dst) {
13292
12983
  int64_t t0 = ggml_perf_time_us();
13293
12984
  UNUSED(t0);
13294
12985
 
@@ -13466,7 +13157,7 @@ static void ggml_compute_forward_flash_attn_f16(
13466
13157
  const struct ggml_tensor * k,
13467
13158
  const struct ggml_tensor * v,
13468
13159
  const bool masked,
13469
- struct ggml_tensor * dst) {
13160
+ struct ggml_tensor * dst) {
13470
13161
  int64_t t0 = ggml_perf_time_us();
13471
13162
  UNUSED(t0);
13472
13163
 
@@ -14231,7 +13922,6 @@ static void ggml_compute_forward_flash_attn_back(
14231
13922
  static void ggml_compute_forward_win_part_f32(
14232
13923
  const struct ggml_compute_params * params,
14233
13924
  const struct ggml_tensor * src0,
14234
- const struct ggml_tensor * opt0,
14235
13925
  struct ggml_tensor * dst) {
14236
13926
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14237
13927
  return;
@@ -14240,9 +13930,9 @@ static void ggml_compute_forward_win_part_f32(
14240
13930
  GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
14241
13931
  GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
14242
13932
 
14243
- const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
14244
- const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
14245
- const int32_t w = ((const int32_t *)(opt0->data))[2];
13933
+ const int32_t nep0 = ((const int32_t *)(dst->op_params))[0];
13934
+ const int32_t nep1 = ((const int32_t *)(dst->op_params))[1];
13935
+ const int32_t w = ((const int32_t *)(dst->op_params))[2];
14246
13936
 
14247
13937
  assert(ne00 == ne0);
14248
13938
  assert(ne3 == nep0*nep1);
@@ -14276,12 +13966,11 @@ static void ggml_compute_forward_win_part_f32(
14276
13966
  static void ggml_compute_forward_win_part(
14277
13967
  const struct ggml_compute_params * params,
14278
13968
  const struct ggml_tensor * src0,
14279
- const struct ggml_tensor * opt0,
14280
13969
  struct ggml_tensor * dst) {
14281
13970
  switch (src0->type) {
14282
13971
  case GGML_TYPE_F32:
14283
13972
  {
14284
- ggml_compute_forward_win_part_f32(params, src0, opt0, dst);
13973
+ ggml_compute_forward_win_part_f32(params, src0, dst);
14285
13974
  } break;
14286
13975
  default:
14287
13976
  {
@@ -14295,7 +13984,6 @@ static void ggml_compute_forward_win_part(
14295
13984
  static void ggml_compute_forward_win_unpart_f32(
14296
13985
  const struct ggml_compute_params * params,
14297
13986
  const struct ggml_tensor * src0,
14298
- const struct ggml_tensor * opt0,
14299
13987
  struct ggml_tensor * dst) {
14300
13988
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14301
13989
  return;
@@ -14304,7 +13992,7 @@ static void ggml_compute_forward_win_unpart_f32(
14304
13992
  GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
14305
13993
  GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
14306
13994
 
14307
- const int32_t w = ((const int32_t *)(opt0->data))[0];
13995
+ const int32_t w = ((const int32_t *)(dst->op_params))[0];
14308
13996
 
14309
13997
  // padding
14310
13998
  const int px = (w - ne1%w)%w;
@@ -14338,12 +14026,67 @@ static void ggml_compute_forward_win_unpart_f32(
14338
14026
  static void ggml_compute_forward_win_unpart(
14339
14027
  const struct ggml_compute_params * params,
14340
14028
  const struct ggml_tensor * src0,
14341
- const struct ggml_tensor * opt0,
14342
14029
  struct ggml_tensor * dst) {
14343
14030
  switch (src0->type) {
14344
14031
  case GGML_TYPE_F32:
14345
14032
  {
14346
- ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst);
14033
+ ggml_compute_forward_win_unpart_f32(params, src0, dst);
14034
+ } break;
14035
+ default:
14036
+ {
14037
+ GGML_ASSERT(false);
14038
+ } break;
14039
+ }
14040
+ }
14041
+
14042
+ // ggml_compute_forward_unary
14043
+
14044
+ static void ggml_compute_forward_unary(
14045
+ const struct ggml_compute_params * params,
14046
+ const struct ggml_tensor * src0,
14047
+ struct ggml_tensor * dst) {
14048
+ const enum ggml_unary_op op = ggml_get_unary_op(dst);
14049
+
14050
+ switch (op) {
14051
+ case GGML_UNARY_OP_ABS:
14052
+ {
14053
+ ggml_compute_forward_abs(params, src0, dst);
14054
+ } break;
14055
+ case GGML_UNARY_OP_SGN:
14056
+ {
14057
+ ggml_compute_forward_sgn(params, src0, dst);
14058
+ } break;
14059
+ case GGML_UNARY_OP_NEG:
14060
+ {
14061
+ ggml_compute_forward_neg(params, src0, dst);
14062
+ } break;
14063
+ case GGML_UNARY_OP_STEP:
14064
+ {
14065
+ ggml_compute_forward_step(params, src0, dst);
14066
+ } break;
14067
+ case GGML_UNARY_OP_TANH:
14068
+ {
14069
+ ggml_compute_forward_tanh(params, src0, dst);
14070
+ } break;
14071
+ case GGML_UNARY_OP_ELU:
14072
+ {
14073
+ ggml_compute_forward_elu(params, src0, dst);
14074
+ } break;
14075
+ case GGML_UNARY_OP_RELU:
14076
+ {
14077
+ ggml_compute_forward_relu(params, src0, dst);
14078
+ } break;
14079
+ case GGML_UNARY_OP_GELU:
14080
+ {
14081
+ ggml_compute_forward_gelu(params, src0, dst);
14082
+ } break;
14083
+ case GGML_UNARY_OP_GELU_QUICK:
14084
+ {
14085
+ ggml_compute_forward_gelu_quick(params, src0, dst);
14086
+ } break;
14087
+ case GGML_UNARY_OP_SILU:
14088
+ {
14089
+ ggml_compute_forward_silu(params, src0, dst);
14347
14090
  } break;
14348
14091
  default:
14349
14092
  {
@@ -14862,7 +14605,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14862
14605
  } break;
14863
14606
  case GGML_OP_ACC:
14864
14607
  {
14865
- ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14608
+ ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor);
14866
14609
  } break;
14867
14610
  case GGML_OP_SUB:
14868
14611
  {
@@ -14912,46 +14655,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14912
14655
  {
14913
14656
  ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
14914
14657
  } break;
14915
- case GGML_OP_ABS:
14916
- {
14917
- ggml_compute_forward_abs(params, tensor->src[0], tensor);
14918
- } break;
14919
- case GGML_OP_SGN:
14920
- {
14921
- ggml_compute_forward_sgn(params, tensor->src[0], tensor);
14922
- } break;
14923
- case GGML_OP_NEG:
14924
- {
14925
- ggml_compute_forward_neg(params, tensor->src[0], tensor);
14926
- } break;
14927
- case GGML_OP_STEP:
14928
- {
14929
- ggml_compute_forward_step(params, tensor->src[0], tensor);
14930
- } break;
14931
- case GGML_OP_TANH:
14932
- {
14933
- ggml_compute_forward_tanh(params, tensor->src[0], tensor);
14934
- } break;
14935
- case GGML_OP_ELU:
14936
- {
14937
- ggml_compute_forward_elu(params, tensor->src[0], tensor);
14938
- } break;
14939
- case GGML_OP_RELU:
14940
- {
14941
- ggml_compute_forward_relu(params, tensor->src[0], tensor);
14942
- } break;
14943
- case GGML_OP_GELU:
14944
- {
14945
- ggml_compute_forward_gelu(params, tensor->src[0], tensor);
14946
- } break;
14947
- case GGML_OP_GELU_QUICK:
14948
- {
14949
- ggml_compute_forward_gelu_quick(params, tensor->src[0], tensor);
14950
- } break;
14951
- case GGML_OP_SILU:
14952
- {
14953
- ggml_compute_forward_silu(params, tensor->src[0], tensor);
14954
- } break;
14955
14658
  case GGML_OP_SILU_BACK:
14956
14659
  {
14957
14660
  ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
@@ -14982,7 +14685,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14982
14685
  } break;
14983
14686
  case GGML_OP_SET:
14984
14687
  {
14985
- ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14688
+ ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor);
14986
14689
  } break;
14987
14690
  case GGML_OP_CPY:
14988
14691
  {
@@ -15022,11 +14725,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15022
14725
  } break;
15023
14726
  case GGML_OP_DIAG_MASK_INF:
15024
14727
  {
15025
- ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor->src[1], tensor);
14728
+ ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor);
15026
14729
  } break;
15027
14730
  case GGML_OP_DIAG_MASK_ZERO:
15028
14731
  {
15029
- ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor->src[1], tensor);
14732
+ ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor);
15030
14733
  } break;
15031
14734
  case GGML_OP_SOFT_MAX:
15032
14735
  {
@@ -15038,39 +14741,39 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15038
14741
  } break;
15039
14742
  case GGML_OP_ROPE:
15040
14743
  {
15041
- ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor);
14744
+ ggml_compute_forward_rope(params, tensor->src[0], tensor);
15042
14745
  } break;
15043
14746
  case GGML_OP_ROPE_BACK:
15044
14747
  {
15045
- ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor);
14748
+ ggml_compute_forward_rope_back(params, tensor->src[0], tensor);
15046
14749
  } break;
15047
14750
  case GGML_OP_ALIBI:
15048
14751
  {
15049
- ggml_compute_forward_alibi(params, tensor->src[0], tensor->src[1], tensor);
14752
+ ggml_compute_forward_alibi(params, tensor->src[0], tensor);
15050
14753
  } break;
15051
14754
  case GGML_OP_CLAMP:
15052
14755
  {
15053
- ggml_compute_forward_clamp(params, tensor->src[0], tensor->src[1], tensor);
14756
+ ggml_compute_forward_clamp(params, tensor->src[0], tensor);
15054
14757
  } break;
15055
14758
  case GGML_OP_CONV_1D:
15056
14759
  {
15057
- ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14760
+ ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
15058
14761
  } break;
15059
14762
  case GGML_OP_CONV_2D:
15060
14763
  {
15061
- ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14764
+ ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
15062
14765
  } break;
15063
14766
  case GGML_OP_POOL_1D:
15064
14767
  {
15065
- ggml_compute_forward_pool_1d(params, tensor->src[0], tensor->src[1], tensor);
14768
+ ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
15066
14769
  } break;
15067
14770
  case GGML_OP_POOL_2D:
15068
14771
  {
15069
- ggml_compute_forward_pool_2d(params, tensor->src[0], tensor->src[1], tensor);
14772
+ ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
15070
14773
  } break;
15071
14774
  case GGML_OP_FLASH_ATTN:
15072
14775
  {
15073
- const int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
14776
+ const int32_t t = ggml_get_op_params_i32(tensor, 0);
15074
14777
  GGML_ASSERT(t == 0 || t == 1);
15075
14778
  const bool masked = t != 0;
15076
14779
  ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
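
Throughout this switch, scalar arguments that previously travelled in extra integer source tensors (src[1]..src[3]) are now read from the node's own op_params, e.g. the flash-attention mask flag via ggml_get_op_params_i32(tensor, 0). A hedged sketch of the underlying idea, packing int32 values into a fixed per-node byte buffer (the helper names and the 32-byte size are assumptions for illustration, not ggml's definitions):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    #define MAX_OP_PARAMS 32               // assumed size, in bytes

    struct node { char op_params[MAX_OP_PARAMS]; };

    static void set_op_params_i32(struct node * n, int idx, int32_t v) {
        assert((idx + 1) * sizeof(int32_t) <= MAX_OP_PARAMS);
        memcpy(n->op_params + idx * sizeof(int32_t), &v, sizeof(v));
    }

    static int32_t get_op_params_i32(const struct node * n, int idx) {
        int32_t v;
        assert((idx + 1) * sizeof(int32_t) <= MAX_OP_PARAMS);
        memcpy(&v, n->op_params + idx * sizeof(int32_t), sizeof(v));
        return v;
    }

The practical upshot in this hunk is that ops such as ROPE, ALIBI, CLAMP, CONV and POOL take one source tensor fewer, since their hyper-parameters no longer need a dedicated I32 tensor.
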
@@ -15081,47 +14784,56 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15081
14784
  } break;
15082
14785
  case GGML_OP_FLASH_ATTN_BACK:
15083
14786
  {
15084
- int32_t t = ggml_get_i32_1d(tensor->src[4], 0);
14787
+ int32_t t = ggml_get_op_params_i32(tensor, 0);
15085
14788
  GGML_ASSERT(t == 0 || t == 1);
15086
14789
  bool masked = t != 0;
15087
14790
  ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
15088
14791
  } break;
15089
14792
  case GGML_OP_WIN_PART:
15090
14793
  {
15091
- ggml_compute_forward_win_part(params, tensor->src[0], tensor->src[2], tensor);
14794
+ ggml_compute_forward_win_part(params, tensor->src[0], tensor);
15092
14795
  } break;
15093
14796
  case GGML_OP_WIN_UNPART:
15094
14797
  {
15095
- ggml_compute_forward_win_unpart(params, tensor->src[0], tensor->src[2], tensor);
14798
+ ggml_compute_forward_win_unpart(params, tensor->src[0], tensor);
14799
+ } break;
14800
+ case GGML_OP_UNARY:
14801
+ {
14802
+ ggml_compute_forward_unary(params, tensor->src[0], tensor);
15096
14803
  } break;
15097
14804
  case GGML_OP_MAP_UNARY:
15098
14805
  {
15099
- const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->src[2]->data);
14806
+ ggml_unary_op_f32_t fun;
14807
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15100
14808
  ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun);
15101
14809
  }
15102
14810
  break;
15103
14811
  case GGML_OP_MAP_BINARY:
15104
14812
  {
15105
- const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->src[2]->data);
14813
+ ggml_binary_op_f32_t fun;
14814
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15106
14815
  ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
15107
14816
  }
15108
14817
  break;
15109
14818
  case GGML_OP_MAP_CUSTOM1:
15110
14819
  {
15111
- const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->src[2]->data);
14820
+ ggml_custom1_op_f32_t fun;
14821
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15112
14822
  ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun);
15113
14823
  }
15114
14824
  break;
15115
14825
  case GGML_OP_MAP_CUSTOM2:
15116
14826
  {
15117
- const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->src[2]->data);
14827
+ ggml_custom2_op_f32_t fun;
14828
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15118
14829
  ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun);
15119
14830
  }
15120
14831
  break;
15121
14832
  case GGML_OP_MAP_CUSTOM3:
15122
14833
  {
15123
- const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->src[2]->data);
15124
- ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[3], tensor, fun);
14834
+ ggml_custom3_op_f32_t fun;
14835
+ memcpy(&fun, tensor->op_params, sizeof(fun));
14836
+ ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
15125
14837
  }
15126
14838
  break;
15127
14839
  case GGML_OP_CROSS_ENTROPY_LOSS:
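
The MAP_* cases make the same move for user callbacks: the function pointer used to be read out of an extra tensor's data, and is now copied byte-wise out of op_params with memcpy, which is the portable way to round-trip a function pointer through a byte buffer (a plain cast through an object pointer is not guaranteed to be valid C). A small illustrative sketch:

    #include <stdio.h>
    #include <string.h>

    typedef float (*unary_fn)(float);

    static float square(float x) { return x * x; }

    int main(void) {
        char op_params[16] = {0};

        unary_fn in = square;
        memcpy(op_params, &in, sizeof(in));     // store the pointer's bytes

        unary_fn out;
        memcpy(&out, op_params, sizeof(out));   // recover it the same way

        printf("%f\n", out(3.0f));              // prints 9.000000
        return 0;
    }
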
@@ -15185,12 +14897,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15185
14897
  src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
15186
14898
  }
15187
14899
  if (src1->grad) {
15188
- GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5);
15189
- GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32);
15190
- const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0];
15191
- const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1];
15192
- const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2];
15193
- const size_t offset = (( int32_t * ) tensor->src[2]->data)[3];
14900
+ const size_t nb1 = ((int32_t *) tensor->op_params)[0];
14901
+ const size_t nb2 = ((int32_t *) tensor->op_params)[1];
14902
+ const size_t nb3 = ((int32_t *) tensor->op_params)[2];
14903
+ const size_t offset = ((int32_t *) tensor->op_params)[3];
15194
14904
 
15195
14905
  struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
15196
14906
  tensor->grad,
@@ -15339,73 +15049,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15339
15049
  inplace);
15340
15050
  }
15341
15051
  } break;
15342
- case GGML_OP_ABS:
15343
- {
15344
- if (src0->grad) {
15345
- src0->grad =
15346
- ggml_add_impl(ctx,
15347
- src0->grad,
15348
- ggml_mul(ctx,
15349
- ggml_sgn(ctx, src0),
15350
- tensor->grad),
15351
- inplace);
15352
- }
15353
- } break;
15354
- case GGML_OP_SGN:
15355
- {
15356
- if (src0->grad) {
15357
- // noop
15358
- }
15359
- } break;
15360
- case GGML_OP_NEG:
15361
- {
15362
- if (src0->grad) {
15363
- src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace);
15364
- }
15365
- } break;
15366
- case GGML_OP_STEP:
15367
- {
15368
- if (src0->grad) {
15369
- // noop
15370
- }
15371
- } break;
15372
- case GGML_OP_TANH:
15373
- {
15374
- GGML_ASSERT(false); // TODO: not implemented
15375
- } break;
15376
- case GGML_OP_ELU:
15377
- {
15378
- GGML_ASSERT(false); // TODO: not implemented
15379
- } break;
15380
- case GGML_OP_RELU:
15381
- {
15382
- if (src0->grad) {
15383
- src0->grad = ggml_sub_impl(ctx,
15384
- src0->grad,
15385
- ggml_mul(ctx,
15386
- ggml_step(ctx, src0),
15387
- tensor->grad),
15388
- inplace);
15389
- }
15390
- } break;
15391
- case GGML_OP_GELU:
15392
- {
15393
- GGML_ASSERT(false); // TODO: not implemented
15394
- } break;
15395
- case GGML_OP_GELU_QUICK:
15396
- {
15397
- GGML_ASSERT(false); // TODO: not implemented
15398
- } break;
15399
- case GGML_OP_SILU:
15400
- {
15401
- // necessary for llama
15402
- if (src0->grad) {
15403
- src0->grad = ggml_add_impl(ctx,
15404
- src0->grad,
15405
- ggml_silu_back(ctx, src0, tensor->grad),
15406
- inplace);
15407
- }
15408
- } break;
15409
15052
  case GGML_OP_SILU_BACK:
15410
15053
  {
15411
15054
  GGML_ASSERT(false); // TODO: not implemented
@@ -15498,12 +15141,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15498
15141
  } break;
15499
15142
  case GGML_OP_SET:
15500
15143
  {
15501
- GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5);
15502
- GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32);
15503
- const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0];
15504
- const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1];
15505
- const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2];
15506
- const size_t offset = (( int32_t * ) tensor->src[2]->data)[3];
15144
+ const size_t nb1 = ((int32_t *) tensor->op_params)[0];
15145
+ const size_t nb2 = ((int32_t *) tensor->op_params)[1];
15146
+ const size_t nb3 = ((int32_t *) tensor->op_params)[2];
15147
+ const size_t offset = ((int32_t *) tensor->op_params)[3];
15507
15148
 
15508
15149
  struct ggml_tensor * tensor_grad_view = NULL;
15509
15150
 
@@ -15580,8 +15221,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15580
15221
  if (src0->grad) {
15581
15222
  size_t offset;
15582
15223
 
15583
- GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->src[2]));
15584
- memcpy(&offset, tensor->src[2]->data, sizeof(offset));
15224
+ memcpy(&offset, tensor->op_params, sizeof(offset));
15585
15225
 
15586
15226
  size_t nb1 = tensor->nb[1];
15587
15227
  size_t nb2 = tensor->nb[2];
@@ -15608,7 +15248,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15608
15248
  {
15609
15249
  // necessary for llama
15610
15250
  if (src0->grad) {
15611
- int32_t * axes = (int32_t *) tensor->src[2]->data;
15251
+ int32_t * axes = (int32_t *) tensor->op_params;
15612
15252
  int axis0 = axes[0] & 0x3;
15613
15253
  int axis1 = axes[1] & 0x3;
15614
15254
  int axis2 = axes[2] & 0x3;
@@ -15664,33 +15304,23 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15664
15304
  {
15665
15305
  // necessary for llama
15666
15306
  if (src0->grad) {
15667
- assert(src1->type == GGML_TYPE_I32);
15668
- assert(ggml_nelements(src1) == 2);
15669
- const int n_past = ((int32_t *) src1->data)[0];
15307
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15670
15308
  src0->grad =
15671
15309
  ggml_add_impl(ctx, src0->grad,
15672
15310
  ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
15673
15311
  inplace);
15674
15312
  }
15675
- if (src1->grad) {
15676
- // noop
15677
- }
15678
15313
  } break;
15679
15314
  case GGML_OP_DIAG_MASK_ZERO:
15680
15315
  {
15681
15316
  // necessary for llama
15682
15317
  if (src0->grad) {
15683
- assert(src1->type == GGML_TYPE_I32);
15684
- assert(ggml_nelements(src1) == 2);
15685
- const int n_past = ((int32_t *) src1->data)[0];
15318
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15686
15319
  src0->grad =
15687
15320
  ggml_add_impl(ctx, src0->grad,
15688
15321
  ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
15689
15322
  inplace);
15690
15323
  }
15691
- if (src1->grad) {
15692
- // noop
15693
- }
15694
15324
  } break;
15695
15325
  case GGML_OP_SOFT_MAX:
15696
15326
  {
@@ -15711,33 +15341,28 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15711
15341
  {
15712
15342
  // necessary for llama
15713
15343
  if (src0->grad) {
15714
- assert(src1->type == GGML_TYPE_I32);
15715
- assert(ggml_nelements(src1) == 4);
15716
- const int n_past = ((int32_t *) src1->data)[0];
15717
- const int n_dims = ((int32_t *) src1->data)[1];
15718
- const int mode = ((int32_t *) src1->data)[2];
15344
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15345
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
15346
+ const int mode = ((int32_t *) tensor->op_params)[2];
15347
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
15719
15348
  src0->grad = ggml_add_impl(ctx,
15720
15349
  src0->grad,
15721
15350
  ggml_rope_back(ctx,
15722
15351
  tensor->grad,
15723
15352
  n_past,
15724
15353
  n_dims,
15725
- mode),
15354
+ mode,
15355
+ n_ctx),
15726
15356
  inplace);
15727
15357
  }
15728
- if (src1->grad) {
15729
- // noop
15730
- }
15731
15358
  } break;
15732
15359
  case GGML_OP_ROPE_BACK:
15733
15360
  {
15734
15361
  if (src0->grad) {
15735
- assert(src1->type == GGML_TYPE_I32);
15736
- assert(ggml_nelements(src1) == 4);
15737
- const int n_past = ((int32_t *) src1->data)[0];
15738
- const int n_dims = ((int32_t *) src1->data)[1];
15739
- const int mode = ((int32_t *) src1->data)[2];
15740
- const int n_ctx = ((int32_t *) src1->data)[3];
15362
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15363
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
15364
+ const int mode = ((int32_t *) tensor->op_params)[2];
15365
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
15741
15366
  src0->grad = ggml_add_impl(ctx,
15742
15367
  src0->grad,
15743
15368
  ggml_rope(ctx,
@@ -15748,9 +15373,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15748
15373
  n_ctx),
15749
15374
  inplace);
15750
15375
  }
15751
- if (src1->grad) {
15752
- // noop
15753
- }
15754
15376
  } break;
15755
15377
  case GGML_OP_ALIBI:
15756
15378
  {
@@ -15780,7 +15402,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15780
15402
  {
15781
15403
  struct ggml_tensor * flash_grad = NULL;
15782
15404
  if (src0->grad || src1->grad || tensor->src[2]->grad) {
15783
- int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
15405
+ int32_t t = ggml_get_op_params_i32(tensor, 0);
15784
15406
  GGML_ASSERT(t == 0 || t == 1);
15785
15407
  bool masked = t != 0;
15786
15408
  flash_grad =
@@ -15943,6 +15565,80 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15943
15565
  } break;
15944
15566
  case GGML_OP_WIN_PART:
15945
15567
  case GGML_OP_WIN_UNPART:
15568
+ case GGML_OP_UNARY:
15569
+ {
15570
+ switch (ggml_get_unary_op(tensor)) {
15571
+ case GGML_UNARY_OP_ABS:
15572
+ {
15573
+ if (src0->grad) {
15574
+ src0->grad =
15575
+ ggml_add_impl(ctx,
15576
+ src0->grad,
15577
+ ggml_mul(ctx,
15578
+ ggml_sgn(ctx, src0),
15579
+ tensor->grad),
15580
+ inplace);
15581
+ }
15582
+ } break;
15583
+ case GGML_UNARY_OP_SGN:
15584
+ {
15585
+ if (src0->grad) {
15586
+ // noop
15587
+ }
15588
+ } break;
15589
+ case GGML_UNARY_OP_NEG:
15590
+ {
15591
+ if (src0->grad) {
15592
+ src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace);
15593
+ }
15594
+ } break;
15595
+ case GGML_UNARY_OP_STEP:
15596
+ {
15597
+ if (src0->grad) {
15598
+ // noop
15599
+ }
15600
+ } break;
15601
+ case GGML_UNARY_OP_TANH:
15602
+ {
15603
+ GGML_ASSERT(false); // TODO: not implemented
15604
+ } break;
15605
+ case GGML_UNARY_OP_ELU:
15606
+ {
15607
+ GGML_ASSERT(false); // TODO: not implemented
15608
+ } break;
15609
+ case GGML_UNARY_OP_RELU:
15610
+ {
15611
+ if (src0->grad) {
15612
+ src0->grad = ggml_add_impl(ctx,
15613
+ src0->grad,
15614
+ ggml_mul(ctx,
15615
+ ggml_step(ctx, src0),
15616
+ tensor->grad),
15617
+ inplace);
15618
+ }
15619
+ } break;
15620
+ case GGML_UNARY_OP_GELU:
15621
+ {
15622
+ GGML_ASSERT(false); // TODO: not implemented
15623
+ } break;
15624
+ case GGML_UNARY_OP_GELU_QUICK:
15625
+ {
15626
+ GGML_ASSERT(false); // TODO: not implemented
15627
+ } break;
15628
+ case GGML_UNARY_OP_SILU:
15629
+ {
15630
+ // necessary for llama
15631
+ if (src0->grad) {
15632
+ src0->grad = ggml_add_impl(ctx,
15633
+ src0->grad,
15634
+ ggml_silu_back(ctx, src0, tensor->grad),
15635
+ inplace);
15636
+ }
15637
+ } break;
15638
+ default:
15639
+ GGML_ASSERT(false);
15640
+ }
15641
+ } break;
15946
15642
  case GGML_OP_MAP_UNARY:
15947
15643
  case GGML_OP_MAP_BINARY:
15948
15644
  case GGML_OP_MAP_CUSTOM1:
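
The backward rules themselves are unchanged by the consolidation under GGML_OP_UNARY; the ABS and RELU cases above accumulate sgn(x)*dy and step(x)*dy into the input gradient, while NEG subtracts dy. An element-wise sketch of the first two (illustrative helper, not ggml code):

    // elementwise form of the ABS and RELU gradient accumulation above (sketch)
    static void grad_abs_relu(int n, const float * x, const float * dy,
                              float * dx_abs, float * dx_relu) {
        for (int i = 0; i < n; ++i) {
            const float sgn  = (float)((x[i] > 0.0f) - (x[i] < 0.0f)); // sgn(x)
            const float step = x[i] > 0.0f ? 1.0f : 0.0f;              // step(x)
            dx_abs[i]  += sgn  * dy[i];   // ABS:  dx += sgn(x)  * dy
            dx_relu[i] += step * dy[i];   // RELU: dx += step(x) * dy
        }
    }

SILU still defers to ggml_silu_back, and the TANH/ELU/GELU backward paths remain unimplemented asserts, exactly as before the refactor.
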
@@ -15978,6 +15674,34 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15978
15674
  }
15979
15675
  }
15980
15676
 
15677
+ static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small");
15678
+
15679
+ static size_t hash(void * p) {
15680
+ return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
15681
+ }
15682
+
15683
+ static bool hash_insert(void * hash_table[], void * p) {
15684
+ size_t h = hash(p);
15685
+
15686
+ // linear probing
15687
+ size_t i = h;
15688
+ while (hash_table[i] != NULL && hash_table[i] != p) {
15689
+ i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
15690
+ if (i == h) {
15691
+ // hash table is full
15692
+ GGML_ASSERT(false);
15693
+ }
15694
+ }
15695
+
15696
+ if (hash_table[i] == p) {
15697
+ return true;
15698
+ }
15699
+
15700
+ // insert
15701
+ hash_table[i] = p;
15702
+ return false;
15703
+ }
15704
+
15981
15705
  static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
15982
15706
  if (node->grad == NULL) {
15983
15707
  // this usually happens when we generate intermediate nodes from constants in the backward pass
@@ -15988,16 +15712,8 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15988
15712
  }
15989
15713
 
15990
15714
  // check if already visited
15991
- for (int i = 0; i < cgraph->n_nodes; i++) {
15992
- if (cgraph->nodes[i] == node) {
15993
- return;
15994
- }
15995
- }
15996
-
15997
- for (int i = 0; i < cgraph->n_leafs; i++) {
15998
- if (cgraph->leafs[i] == node) {
15999
- return;
16000
- }
15715
+ if (hash_insert(cgraph->visited_hash_table, node)) {
15716
+ return;
16001
15717
  }
16002
15718
 
16003
15719
  for (int i = 0; i < GGML_MAX_SRC; ++i) {
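
ggml_visit_parents previously scanned cgraph->nodes and cgraph->leafs linearly on every visit, making graph construction O(n^2) in the number of nodes; the new visited_hash_table is a fixed-size open-addressing set with linear probing, so the duplicate check becomes amortized O(1) (the static_assert above ties the table size to GGML_MAX_NODES so it cannot fill up in normal use). A self-contained sketch of the same pointer-set idea, with an illustrative size:

    #include <assert.h>
    #include <stddef.h>

    #define HT_SIZE 1024   // illustrative; must exceed the number of inserted pointers

    static size_t ptr_hash(const void * p) {
        return (size_t) p % HT_SIZE;
    }

    // returns 1 if p was already present, 0 if it was inserted now
    static int ptr_set_insert(const void * table[], const void * p) {
        size_t h = ptr_hash(p);
        size_t i = h;
        while (table[i] != NULL && table[i] != p) {
            i = (i + 1) % HT_SIZE;   // linear probing
            assert(i != h);          // table full
        }
        if (table[i] == p) {
            return 1;                // already visited
        }
        table[i] = p;
        return 0;
    }
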
@@ -16060,6 +15776,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
16060
15776
  /*.nodes =*/ { NULL },
16061
15777
  /*.grads =*/ { NULL },
16062
15778
  /*.leafs =*/ { NULL },
15779
+ /*.hash_table =*/ { NULL },
16063
15780
  /*.perf_runs =*/ 0,
16064
15781
  /*.perf_cycles =*/ 0,
16065
15782
  /*.perf_time_us =*/ 0,
@@ -16101,13 +15818,42 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
16101
15818
 
16102
15819
  if (node->is_param) {
16103
15820
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
16104
- ggml_build_forward_impl(&result, node->grad, true);
15821
+ ggml_build_forward_expand(&result, node->grad);
16105
15822
  }
16106
15823
  }
16107
15824
 
16108
15825
  return result;
16109
15826
  }
16110
15827
 
15828
+ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15829
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
15830
+ struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
15831
+
15832
+ *cgraph = (struct ggml_cgraph) {
15833
+ /*.n_nodes =*/ 0,
15834
+ /*.n_leafs =*/ 0,
15835
+ /*.nodes =*/ { NULL },
15836
+ /*.grads =*/ { NULL },
15837
+ /*.leafs =*/ { NULL },
15838
+ /*.hash_table =*/ { NULL },
15839
+ /*.perf_runs =*/ 0,
15840
+ /*.perf_cycles =*/ 0,
15841
+ /*.perf_time_us =*/ 0,
15842
+ };
15843
+
15844
+ return cgraph;
15845
+ }
15846
+
15847
+ struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
15848
+ struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
15849
+ ggml_build_forward_impl(cgraph, tensor, false);
15850
+ return cgraph;
15851
+ }
15852
+
15853
+ size_t ggml_graph_overhead(void) {
15854
+ return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
15855
+ }
15856
+
16111
15857
  //
16112
15858
  // thread data
16113
15859
  //
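
With ggml_new_graph, ggml_build_forward_ctx and ggml_graph_overhead added above, a graph can live inside a ggml_context like any other object instead of as a large struct on the caller's stack. A hedged usage sketch against the API introduced in this hunk (it assumes the ggml.h shipped with this version; the memory size and tensor shapes are arbitrary):

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024 + ggml_graph_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8), 1.0f);
        struct ggml_tensor * b = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8), 2.0f);
        struct ggml_tensor * c = ggml_add(ctx, a, b);

        // the graph is allocated inside ctx, sized by ggml_graph_overhead()
        struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx, c);

        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

        ggml_free(ctx);
        return 0;
    }
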
@@ -16173,7 +15919,7 @@ typedef pthread_t ggml_thread_t;
16173
15919
 
16174
15920
  // Android's libc implementation "bionic" does not support setting affinity
16175
15921
  #if defined(__linux__) && !defined(__BIONIC__)
16176
- void set_numa_thread_affinity(int thread_n, int n_threads) {
15922
+ static void set_numa_thread_affinity(int thread_n, int n_threads) {
16177
15923
  if (!ggml_is_numa()) {
16178
15924
  return;
16179
15925
  }
@@ -16198,7 +15944,7 @@ void set_numa_thread_affinity(int thread_n, int n_threads) {
16198
15944
  CPU_FREE(cpus);
16199
15945
  }
16200
15946
 
16201
- void clear_numa_thread_affinity(void) {
15947
+ static void clear_numa_thread_affinity(void) {
16202
15948
  if (!ggml_is_numa()) {
16203
15949
  return;
16204
15950
  }
@@ -16222,8 +15968,8 @@ void clear_numa_thread_affinity(void) {
16222
15968
  #else
16223
15969
  // TODO: Windows etc.
16224
15970
  // (the linux implementation may also work on BSD, someone should test)
16225
- void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
16226
- void clear_numa_thread_affinity(void) {}
15971
+ static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
15972
+ static void clear_numa_thread_affinity(void) {}
16227
15973
  #endif
16228
15974
 
16229
15975
  struct ggml_compute_state_shared {
@@ -16293,8 +16039,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16293
16039
  if (GGML_OP_HAS_FINALIZE[node->op]) {
16294
16040
  params.nth = n_tasks_arr[node_n];
16295
16041
  ggml_compute_forward(&params, node);
16296
- ggml_graph_compute_perf_stats_node(node, state->shared);
16297
16042
  }
16043
+ ggml_graph_compute_perf_stats_node(node, state->shared);
16298
16044
  }
16299
16045
 
16300
16046
  // distribute new work or execute it direct if 1T
@@ -16324,8 +16070,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16324
16070
  if (GGML_OP_HAS_FINALIZE[node->op]) {
16325
16071
  params.type = GGML_TASK_FINALIZE;
16326
16072
  ggml_compute_forward(&params, node);
16327
- ggml_graph_compute_perf_stats_node(node, state->shared);
16328
16073
  }
16074
+
16075
+ ggml_graph_compute_perf_stats_node(node, state->shared);
16329
16076
  } else {
16330
16077
  break;
16331
16078
  }
@@ -16434,21 +16181,34 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16434
16181
  case GGML_OP_ARGMAX:
16435
16182
  case GGML_OP_REPEAT:
16436
16183
  case GGML_OP_REPEAT_BACK:
16437
- case GGML_OP_ABS:
16438
- case GGML_OP_SGN:
16439
- case GGML_OP_NEG:
16440
- case GGML_OP_STEP:
16441
- case GGML_OP_TANH:
16442
- case GGML_OP_ELU:
16443
- case GGML_OP_RELU:
16444
- {
16184
+ {
16445
16185
  n_tasks = 1;
16446
16186
  } break;
16447
- case GGML_OP_MUL:
16448
- case GGML_OP_GELU:
16449
- case GGML_OP_GELU_QUICK:
16450
- case GGML_OP_SILU:
16187
+
16188
+ case GGML_OP_UNARY:
16189
+ {
16190
+ switch (ggml_get_unary_op(node)) {
16191
+ case GGML_UNARY_OP_ABS:
16192
+ case GGML_UNARY_OP_SGN:
16193
+ case GGML_UNARY_OP_NEG:
16194
+ case GGML_UNARY_OP_STEP:
16195
+ case GGML_UNARY_OP_TANH:
16196
+ case GGML_UNARY_OP_ELU:
16197
+ case GGML_UNARY_OP_RELU:
16198
+ {
16199
+ n_tasks = 1;
16200
+ } break;
16201
+
16202
+ case GGML_UNARY_OP_GELU:
16203
+ case GGML_UNARY_OP_GELU_QUICK:
16204
+ case GGML_UNARY_OP_SILU:
16205
+ {
16206
+ n_tasks = n_threads;
16207
+ } break;
16208
+ }
16209
+ } break;
16451
16210
  case GGML_OP_SILU_BACK:
16211
+ case GGML_OP_MUL:
16452
16212
  case GGML_OP_NORM:
16453
16213
  case GGML_OP_RMS_NORM:
16454
16214
  case GGML_OP_RMS_NORM_BACK:
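
Task planning now keys the thread count off the unary sub-op: the cheap element-wise activations get a single task, while GELU, GELU_QUICK and SILU keep using n_threads because they are comparatively expensive per element. When n_tasks equals n_threads, the worker threads split the rows between them in the usual ggml fashion; a sketch of that partitioning (illustrative helper, not ggml code):

    // ith = this thread's index, nth = number of threads, nr = total rows
    static void thread_row_range(int ith, int nth, int nr, int * ir0, int * ir1) {
        const int dr = (nr + nth - 1) / nth;          // rows per thread, rounded up
        *ir0 = dr * ith;                              // first row for this thread
        *ir1 = (*ir0 + dr) < nr ? (*ir0 + dr) : nr;   // one past the last row, clamped
    }
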
@@ -16513,10 +16273,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16513
16273
  case GGML_OP_GET_ROWS:
16514
16274
  case GGML_OP_GET_ROWS_BACK:
16515
16275
  case GGML_OP_DIAG:
16516
- case GGML_OP_DIAG_MASK_ZERO:
16517
16276
  {
16518
16277
  n_tasks = 1;
16519
16278
  } break;
16279
+ case GGML_OP_DIAG_MASK_ZERO:
16520
16280
  case GGML_OP_DIAG_MASK_INF:
16521
16281
  case GGML_OP_SOFT_MAX:
16522
16282
  case GGML_OP_SOFT_MAX_BACK:
@@ -16575,19 +16335,22 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16575
16335
  const int64_t ne11 = node->src[1]->ne[1]; // H
16576
16336
  const int64_t ne12 = node->src[1]->ne[2]; // C
16577
16337
 
16338
+ const int64_t ne0 = node->ne[0];
16339
+ const int64_t ne1 = node->ne[1];
16340
+ const int64_t ne2 = node->ne[2];
16578
16341
  const int64_t nk = ne00*ne01;
16342
+ const int64_t ew0 = nk * ne02;
16579
16343
 
16580
- UNUSED(ne02);
16581
16344
  UNUSED(ne03);
16582
- UNUSED(nk);
16345
+ UNUSED(ne2);
16583
16346
 
16584
16347
  size_t cur = 0;
16585
16348
 
16586
16349
  if (node->src[0]->type == GGML_TYPE_F16 &&
16587
- node->src[1]->type == GGML_TYPE_F32) {
16588
- cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
16350
+ node->src[1]->type == GGML_TYPE_F32) {
16351
+ cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
16589
16352
  } else if (node->src[0]->type == GGML_TYPE_F32 &&
16590
- node->src[1]->type == GGML_TYPE_F32) {
16353
+ node->src[1]->type == GGML_TYPE_F32) {
16591
16354
  cur = sizeof(float)* (ne10*ne11*ne12);
16592
16355
  } else {
16593
16356
  GGML_ASSERT(false);
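
For the F16 CONV path the scratch is now sized from the output extents and the unrolled kernel width ew0 = ne00*ne01*ne02 rather than from the input extents. A rough size check with made-up dimensions (all numbers below are assumptions for illustration only):

    #include <stdio.h>
    #include <stdint.h>

    int main(void) {
        const int64_t ne00 = 3, ne01 = 16, ne02 = 16;   // kernel dims (assumed)
        const int64_t ne0  = 64, ne1  = 16;             // output dims (assumed)
        const int64_t ew0  = ne00*ne01*ne02;            // nk * ne02, as in the hunk
        const int64_t cur  = 2*(ne0*ne1*ew0);           // ggml_fp16_t is 2 bytes
        printf("%lld bytes\n", (long long) cur);        // 1572864 bytes, about 1.5 MiB
        return 0;
    }
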
@@ -16806,10 +16569,9 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
16806
16569
  void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
16807
16570
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
16808
16571
 
16809
- struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
16810
- GGML_ASSERT(buf);
16572
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
16811
16573
 
16812
- cplan.work_data = buf->data;
16574
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
16813
16575
 
16814
16576
  ggml_graph_compute(cgraph, &cplan);
16815
16577
  }
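
ggml_graph_compute_with_ctx now registers the scratch as a dedicated GGML_OBJECT_WORK_BUFFER object inside the context, rather than reusing a GGML_TYPE_I8 tensor for it. Callers that do not want the buffer to live in the context can still run the plan manually; a hedged sketch using only the public calls visible in this diff (the malloc-based buffer is this sketch's choice, not ggml's):

    #include "ggml.h"
    #include <stdint.h>
    #include <stdlib.h>

    static void compute_with_own_buffer(struct ggml_cgraph * gf, int n_threads) {
        struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);

        uint8_t * work = NULL;
        if (cplan.work_size > 0) {
            work = malloc(cplan.work_size);  // caller-owned scratch
            cplan.work_data = work;
        }

        ggml_graph_compute(gf, &cplan);

        free(work);
    }
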
@@ -16864,9 +16626,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
16864
16626
  }
16865
16627
 
16866
16628
  void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16867
- //assert(cgraph->work == NULL);
16868
- //assert(cgraph->work_size == 0);
16869
-
16870
16629
  uint64_t size_eval = 0;
16871
16630
 
16872
16631
  // compute size of intermediate results
@@ -16963,7 +16722,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16963
16722
  fwrite(&nb, sizeof(uint64_t), 1, fout);
16964
16723
  }
16965
16724
 
16966
- fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16725
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16726
+ fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
16967
16727
 
16968
16728
  // dump the data
16969
16729
  // TODO: pad this to 32 byte boundary
@@ -16996,7 +16756,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16996
16756
  fwrite(&nb, sizeof(uint64_t), 1, fout);
16997
16757
  }
16998
16758
 
16999
- fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16759
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16760
+ fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
17000
16761
 
17001
16762
  // output the op arguments
17002
16763
  {
@@ -17177,7 +16938,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17177
16938
 
17178
16939
  tensor->op = (enum ggml_op) op;
17179
16940
 
17180
- memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
16941
+ memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
16942
+ memcpy(tensor->op_params, ptr, GGML_MAX_OP_PARAMS); ptr += GGML_MAX_OP_PARAMS;
17181
16943
 
17182
16944
  tensor->data = (void *) ptr;
17183
16945
 
@@ -17222,7 +16984,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17222
16984
  nb[j] = nb_cur;
17223
16985
  }
17224
16986
 
17225
- const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
16987
+ const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
16988
+ const char * ptr_op_params = ptr; ptr += GGML_MAX_OP_PARAMS;
17226
16989
 
17227
16990
  const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t);
17228
16991
 
@@ -17259,8 +17022,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17259
17022
  {
17260
17023
  tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
17261
17024
 
17262
- uint64_t offs;
17263
- memcpy(&offs, args[2]->data, sizeof(offs));
17025
+ size_t offs;
17026
+ memcpy(&offs, ptr_op_params, sizeof(offs));
17264
17027
 
17265
17028
  tensor->data = ((char *) tensor->data) + offs;
17266
17029
  } break;
@@ -17280,7 +17043,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17280
17043
  } break;
17281
17044
  }
17282
17045
 
17283
- memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
17046
+ memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
17047
+ memcpy(tensor->op_params, ptr_op_params, GGML_MAX_OP_PARAMS);
17284
17048
 
17285
17049
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
17286
17050
  tensor->nb[j] = nb[j];
@@ -17305,9 +17069,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17305
17069
 
17306
17070
  GGML_PRINT("=== GRAPH ===\n");
17307
17071
 
17308
- GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
17309
- GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
17310
-
17311
17072
  GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
17312
17073
  for (int i = 0; i < cgraph->n_nodes; i++) {
17313
17074
  struct ggml_tensor * node = cgraph->nodes[i];
@@ -17317,7 +17078,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17317
17078
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
17318
17079
  i,
17319
17080
  node->ne[0], node->ne[1], node->ne[2],
17320
- GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
17081
+ ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
17321
17082
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
17322
17083
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
17323
17084
  (double) node->perf_time_us / 1000.0,
@@ -17331,7 +17092,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17331
17092
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
17332
17093
  i,
17333
17094
  node->ne[0], node->ne[1],
17334
- GGML_OP_NAME[node->op]);
17095
+ ggml_op_name(node->op));
17335
17096
  }
17336
17097
 
17337
17098
  for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -17339,7 +17100,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17339
17100
  continue;
17340
17101
  }
17341
17102
 
17342
- GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
17103
+ GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0);
17343
17104
  }
17344
17105
 
17345
17106
  GGML_PRINT("========================================\n");
@@ -17433,13 +17194,13 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17433
17194
  }
17434
17195
 
17435
17196
  if (node->n_dims == 2) {
17436
- fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], GGML_OP_SYMBOL[node->op]);
17197
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
17437
17198
  } else {
17438
- fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
17199
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
17439
17200
  }
17440
17201
 
17441
17202
  if (node->grad) {
17442
- fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
17203
+ fprintf(fp, " | <g>%s\"; ]\n", ggml_op_symbol(node->grad->op));
17443
17204
  } else {
17444
17205
  fprintf(fp, "\"; ]\n");
17445
17206
  }