llama_cpp 0.3.3 → 0.3.5

@@ -31,11 +31,17 @@
  #include <unistd.h>
  #endif
 
+ // static_assert should be a #define, but if it's not,
+ // fall back to the _Static_assert C11 keyword.
  // if C99 - static_assert is noop
  // ref: https://stackoverflow.com/a/53923785/4039976
  #ifndef static_assert
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+ #define static_assert(cond, msg) _Static_assert(cond, msg)
+ #else
  #define static_assert(cond, msg) struct global_scope_noop_trick
  #endif
+ #endif
 
  #if defined(_MSC_VER)
  // disable "possible loss of data" to avoid hundreds of casts
@@ -112,10 +118,6 @@ typedef void * thread_ret_t;
  #endif
  #endif
 
- #ifdef __HAIKU__
- #define static_assert(cond, msg) _Static_assert(cond, msg)
- #endif
-
  /*#define GGML_PERF*/
  #define GGML_DEBUG 0
  #define GGML_GELU_FP16
@@ -3438,7 +3440,9 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
 
  //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
  inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
- #if defined(GGML_SIMD)
+ #if defined(GGML_USE_ACCELERATE)
+ vDSP_vsmul(y, 1, &v, y, 1, n);
+ #elif defined(GGML_SIMD)
  const int np = (n & ~(GGML_F32_STEP - 1));
 
  GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
@@ -3601,7 +3605,7 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
  #endif
  }
 
- inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x) {
+ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
  ggml_float sum = 0.0;
  for (int i = 0; i < n; ++i) {
  sum += (ggml_float)x[i];
@@ -3609,6 +3613,14 @@ inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x
  *s = sum;
  }
 
+ inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
+ float sum = 0.0f;
+ for (int i = 0; i < n; ++i) {
+ sum += GGML_FP16_TO_FP32(x[i]);
+ }
+ *s = sum;
+ }
+
  inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
  #ifndef GGML_USE_ACCELERATE
  float max = -INFINITY;
@@ -3748,16 +3760,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3748
3760
  "ARGMAX",
3749
3761
  "REPEAT",
3750
3762
  "REPEAT_BACK",
3751
- "ABS",
3752
- "SGN",
3753
- "NEG",
3754
- "STEP",
3755
- "TANH",
3756
- "ELU",
3757
- "RELU",
3758
- "GELU",
3759
- "GELU_QUICK",
3760
- "SILU",
3761
3763
  "SILU_BACK",
3762
3764
  "NORM",
3763
3765
  "RMS_NORM",
@@ -3796,6 +3798,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3796
3798
  "WIN_PART",
3797
3799
  "WIN_UNPART",
3798
3800
 
3801
+ "UNARY",
3802
+
3799
3803
  "MAP_UNARY",
3800
3804
  "MAP_BINARY",
3801
3805
 
@@ -3807,7 +3811,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3807
3811
  "CROSS_ENTROPY_LOSS_BACK",
3808
3812
  };
3809
3813
 
3810
- static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
3814
+ static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
3811
3815
 
3812
3816
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3813
3817
  "none",
@@ -3828,16 +3832,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3828
3832
  "argmax(x)",
3829
3833
  "repeat(x)",
3830
3834
  "repeat_back(x)",
3831
- "abs(x)",
3832
- "sgn(x)",
3833
- "-x",
3834
- "step(x)",
3835
- "tanh(x)",
3836
- "elu(x)",
3837
- "relu(x)",
3838
- "gelu(x)",
3839
- "gelu_quick(x)",
3840
- "silu(x)",
3841
3835
  "silu_back(x)",
3842
3836
  "norm(x)",
3843
3837
  "rms_norm(x)",
@@ -3876,6 +3870,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3876
3870
  "win_part(x)",
3877
3871
  "win_unpart(x)",
3878
3872
 
3873
+ "unary(x)",
3874
+
3879
3875
  "f(x)",
3880
3876
  "f(x,y)",
3881
3877
 
@@ -3887,7 +3883,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3887
3883
  "cross_entropy_loss_back(x,y)",
3888
3884
  };
3889
3885
 
3890
- static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
3886
+ static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
3891
3887
 
3892
3888
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
3893
3889
 
@@ -4075,8 +4071,8 @@ bool ggml_is_numa(void) {
4075
4071
  ////////////////////////////////////////////////////////////////////////////////
4076
4072
 
4077
4073
  void ggml_print_object(const struct ggml_object * obj) {
4078
- GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n",
4079
- obj->offs, obj->size, (const void *) obj->next);
4074
+ GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
4075
+ obj->type, obj->offs, obj->size, (const void *) obj->next);
4080
4076
  }
4081
4077
 
4082
4078
  void ggml_print_objects(const struct ggml_context * ctx) {
@@ -4143,6 +4139,10 @@ const char * ggml_op_name(enum ggml_op op) {
4143
4139
  return GGML_OP_NAME[op];
4144
4140
  }
4145
4141
 
4142
+ const char * ggml_op_symbol(enum ggml_op op) {
4143
+ return GGML_OP_SYMBOL[op];
4144
+ }
4145
+
4146
4146
  size_t ggml_element_size(const struct ggml_tensor * tensor) {
4147
4147
  return GGML_TYPE_SIZE[tensor->type];
4148
4148
  }
@@ -4212,7 +4212,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
4212
4212
  }
4213
4213
 
4214
4214
  size_t ggml_tensor_overhead(void) {
4215
- return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
4215
+ return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
4216
4216
  }
4217
4217
 
4218
4218
  bool ggml_is_transposed(const struct ggml_tensor * tensor) {
@@ -4229,6 +4229,15 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
4229
4229
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
4230
4230
  }
4231
4231
 
4232
+ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) {
4233
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4234
+
4235
+ return
4236
+ tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
4237
+ tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
4238
+ tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
4239
+ }
4240
+
4232
4241
  bool ggml_is_permuted(const struct ggml_tensor * tensor) {
4233
4242
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4234
4243
 
@@ -4374,7 +4383,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4374
4383
  return NULL;
4375
4384
  }
4376
4385
 
4377
- const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1);
4386
+ const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
4378
4387
 
4379
4388
  *ctx = (struct ggml_context) {
4380
4389
  /*.mem_size =*/ mem_size,
@@ -4410,8 +4419,8 @@ void ggml_free(struct ggml_context * ctx) {
4410
4419
  if (&g_state.contexts[i].context == ctx) {
4411
4420
  g_state.contexts[i].used = false;
4412
4421
 
4413
- GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
4414
- __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
4422
+ GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
4423
+ __func__, i, ggml_used_mem(ctx));
4415
4424
 
4416
4425
  if (ctx->mem_buffer_owned) {
4417
4426
  GGML_ALIGNED_FREE(ctx->mem_buffer);
@@ -4441,6 +4450,10 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
4441
4450
  return result;
4442
4451
  }
4443
4452
 
4453
+ bool ggml_get_no_alloc(struct ggml_context * ctx) {
4454
+ return ctx->no_alloc;
4455
+ }
4456
+
4444
4457
  void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
4445
4458
  ctx->no_alloc = no_alloc;
4446
4459
  }
@@ -4459,12 +4472,14 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
4459
4472
  struct ggml_object * obj = ctx->objects_begin;
4460
4473
 
4461
4474
  while (obj != NULL) {
4462
- struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
4475
+ if (obj->type == GGML_OBJECT_TENSOR) {
4476
+ struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
4463
4477
 
4464
- const size_t size = ggml_nbytes(tensor);
4478
+ const size_t size = ggml_nbytes(tensor);
4465
4479
 
4466
- if (max_size < size) {
4467
- max_size = size;
4480
+ if (max_size < size) {
4481
+ max_size = size;
4482
+ }
4468
4483
  }
4469
4484
 
4470
4485
  obj = obj->next;
@@ -4478,7 +4493,7 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
4478
4493
  // this is an error prone process, but it is necessary to support inplace
4479
4494
  // operators when using scratch buffers
4480
4495
  // TODO: implement a better way
4481
- void ggml_scratch_save(struct ggml_context * ctx) {
4496
+ static void ggml_scratch_save(struct ggml_context * ctx) {
4482
4497
  // this is needed to allow opt tensors to store their data
4483
4498
  // TODO: again, need to find a better way
4484
4499
  ctx->no_alloc_save = ctx->no_alloc;
@@ -4488,7 +4503,7 @@ void ggml_scratch_save(struct ggml_context * ctx) {
4488
4503
  ctx->scratch.data = NULL;
4489
4504
  }
4490
4505
 
4491
- void ggml_scratch_load(struct ggml_context * ctx) {
4506
+ static void ggml_scratch_load(struct ggml_context * ctx) {
4492
4507
  ctx->no_alloc = ctx->no_alloc_save;
4493
4508
 
4494
4509
  ctx->scratch = ctx->scratch_save;
@@ -4496,12 +4511,7 @@ void ggml_scratch_load(struct ggml_context * ctx) {
4496
4511
 
4497
4512
  ////////////////////////////////////////////////////////////////////////////////
4498
4513
 
4499
- struct ggml_tensor * ggml_new_tensor_impl(
4500
- struct ggml_context * ctx,
4501
- enum ggml_type type,
4502
- int n_dims,
4503
- const int64_t* ne,
4504
- void* data) {
4514
+ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
4505
4515
  // always insert objects at the end of the context's memory pool
4506
4516
  struct ggml_object * obj_cur = ctx->objects_end;
4507
4517
 
@@ -4509,77 +4519,79 @@ struct ggml_tensor * ggml_new_tensor_impl(
4509
4519
  const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
4510
4520
  const size_t cur_end = cur_offs + cur_size;
4511
4521
 
4512
- size_t size_needed = 0;
4513
-
4514
- if (data == NULL && !ctx->no_alloc) {
4515
- size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
4516
- for (int i = 1; i < n_dims; i++) {
4517
- size_needed *= ne[i];
4518
- }
4519
- // align to GGML_MEM_ALIGN
4520
- size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN;
4521
- }
4522
+ // align to GGML_MEM_ALIGN
4523
+ size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
4522
4524
 
4523
4525
  char * const mem_buffer = ctx->mem_buffer;
4524
4526
  struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
4525
4527
 
4526
- if (ctx->scratch.data == NULL || data != NULL) {
4527
- size_needed += GGML_TENSOR_SIZE;
4528
+ if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
4529
+ GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4530
+ __func__, cur_end + size_needed, ctx->mem_size);
4531
+ assert(false);
4532
+ return NULL;
4533
+ }
4534
+
4535
+ *obj_new = (struct ggml_object) {
4536
+ .offs = cur_end + GGML_OBJECT_SIZE,
4537
+ .size = size_needed,
4538
+ .next = NULL,
4539
+ .type = type,
4540
+ };
4528
4541
 
4529
- if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
4530
- GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4531
- __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
4532
- assert(false);
4533
- return NULL;
4534
- }
4542
+ ggml_assert_aligned(mem_buffer + obj_new->offs);
4535
4543
 
4536
- *obj_new = (struct ggml_object) {
4537
- .offs = cur_end + GGML_OBJECT_SIZE,
4538
- .size = size_needed,
4539
- .next = NULL,
4540
- };
4544
+ if (obj_cur != NULL) {
4545
+ obj_cur->next = obj_new;
4541
4546
  } else {
4542
- if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
4543
- GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4544
- __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
4545
- assert(false);
4546
- return NULL;
4547
+ // this is the first object in this context
4548
+ ctx->objects_begin = obj_new;
4549
+ }
4550
+
4551
+ ctx->objects_end = obj_new;
4552
+
4553
+ //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
4554
+
4555
+ return obj_new;
4556
+ }
4557
+
4558
+ static struct ggml_tensor * ggml_new_tensor_impl(
4559
+ struct ggml_context * ctx,
4560
+ enum ggml_type type,
4561
+ int n_dims,
4562
+ const int64_t* ne,
4563
+ void* data) {
4564
+
4565
+ size_t data_size = 0;
4566
+
4567
+ if (data == NULL && !ctx->no_alloc) {
4568
+ data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
4569
+ for (int i = 1; i < n_dims; i++) {
4570
+ data_size *= ne[i];
4547
4571
  }
4572
+ }
4548
4573
 
4549
- if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
4550
- GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4551
- __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
4574
+ if (ctx->scratch.data != NULL && data == NULL) {
4575
+ // allocate tensor data in the scratch buffer
4576
+ if (ctx->scratch.offs + data_size > ctx->scratch.size) {
4577
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4578
+ __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
4552
4579
  assert(false);
4553
4580
  return NULL;
4554
4581
  }
4555
4582
 
4556
4583
  data = (char * const) ctx->scratch.data + ctx->scratch.offs;
4557
4584
 
4558
- *obj_new = (struct ggml_object) {
4559
- .offs = cur_end + GGML_OBJECT_SIZE,
4560
- .size = GGML_TENSOR_SIZE,
4561
- .next = NULL,
4562
- };
4563
-
4564
- //printf("scratch offs = %zu, size_needed = %zu\n", ctx->scratch.offs, size_needed);
4565
-
4566
- ctx->scratch.offs += size_needed;
4567
- }
4585
+ ctx->scratch.offs += data_size;
4568
4586
 
4569
- if (obj_cur != NULL) {
4570
- obj_cur->next = obj_new;
4571
- } else {
4572
- // this is the first object in this context
4573
- ctx->objects_begin = obj_new;
4587
+ data_size = 0;
4574
4588
  }
4575
4589
 
4576
- ctx->objects_end = obj_new;
4577
-
4578
- //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
4590
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);
4579
4591
 
4580
- struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs);
4592
+ // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
4581
4593
 
4582
- ggml_assert_aligned(result);
4594
+ struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
4583
4595
 
4584
4596
  *result = (struct ggml_tensor) {
4585
4597
  /*.type =*/ type,
@@ -4588,6 +4600,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
4588
4600
  /*.ne =*/ { 1, 1, 1, 1 },
4589
4601
  /*.nb =*/ { 0, 0, 0, 0 },
4590
4602
  /*.op =*/ GGML_OP_NONE,
4603
+ /*.op_params =*/ {0},
4591
4604
  /*.is_param =*/ false,
4592
4605
  /*.grad =*/ NULL,
4593
4606
  /*.src =*/ { NULL },
@@ -4618,6 +4631,21 @@ struct ggml_tensor * ggml_new_tensor_impl(
  return result;
  }
 
+ static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+ assert(params_size <= GGML_MAX_OP_PARAMS);
+ memcpy(tensor->op_params, params, params_size);
+ }
+
+ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+ return ((const int32_t *)(tensor->op_params))[i];
+ }
+
+ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+ ((int32_t *)(tensor->op_params))[i] = value;
+ }
+
  struct ggml_tensor * ggml_new_tensor(
  struct ggml_context * ctx,
  enum ggml_type type,
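
The three static helpers added above replace the old pattern of stashing small operator arguments in auxiliary GGML_TYPE_I32 tensors allocated through ggml_scratch_save/ggml_scratch_load; the values now live inline in tensor->op_params. A minimal sketch of the pattern as the op constructors later in this diff use it (the example_* functions and their parameter names are illustrative only, not part of the diff):

    // Illustrative only: how an op constructor inside ggml.c now records its
    // parameters instead of allocating a separate GGML_TYPE_I32 tensor.
    static void example_store_acc_params(struct ggml_tensor * result,
                                         int32_t nb1, int32_t nb2, int32_t nb3,
                                         int32_t offset, bool inplace) {
        int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
        ggml_set_op_params(result, params, sizeof(params));
    }

    // Reading one value back, e.g. during the compute or backward pass:
    static int32_t example_load_offset(const struct ggml_tensor * result) {
        return ggml_get_op_params_i32(result, 3); // params[3] == offset
    }
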
@@ -4949,6 +4977,11 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
4949
4977
  return (float *)(tensor->data);
4950
4978
  }
4951
4979
 
4980
+ enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
4981
+ GGML_ASSERT(tensor->op == GGML_OP_UNARY);
4982
+ return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
4983
+ }
4984
+
4952
4985
  const char * ggml_get_name(const struct ggml_tensor * tensor) {
4953
4986
  return tensor->name;
4954
4987
  }
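
With this accessor in place, the individual activation functions further down in the diff (ggml_abs, ggml_relu, ggml_gelu, ...) no longer get their own ggml_op entry; they are all recorded as a single GGML_OP_UNARY node whose flavour sits in op_params. A minimal sketch, assuming a valid ggml_context * ctx and an F32 tensor x already exist:

    struct ggml_tensor * y = ggml_relu(ctx, x);   // now dispatches through ggml_unary()

    GGML_ASSERT(y->op == GGML_OP_UNARY);
    GGML_ASSERT(ggml_get_unary_op(y) == GGML_UNARY_OP_RELU);
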
@@ -4987,9 +5020,11 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
4987
5020
  char * const mem_buffer = ctx->mem_buffer;
4988
5021
 
4989
5022
  while (obj != NULL) {
4990
- struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
4991
- if (strcmp(cur->name, name) == 0) {
4992
- return cur;
5023
+ if (obj->type == GGML_OBJECT_TENSOR) {
5024
+ struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
5025
+ if (strcmp(cur->name, name) == 0) {
5026
+ return cur;
5027
+ }
4993
5028
  }
4994
5029
 
4995
5030
  obj = obj->next;
@@ -5002,7 +5037,7 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
5002
5037
 
5003
5038
  // ggml_dup
5004
5039
 
5005
- struct ggml_tensor * ggml_dup_impl(
5040
+ static struct ggml_tensor * ggml_dup_impl(
5006
5041
  struct ggml_context * ctx,
5007
5042
  struct ggml_tensor * a,
5008
5043
  bool inplace) {
@@ -5017,7 +5052,6 @@ struct ggml_tensor * ggml_dup_impl(
5017
5052
  result->op = GGML_OP_DUP;
5018
5053
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5019
5054
  result->src[0] = a;
5020
- result->src[1] = NULL;
5021
5055
 
5022
5056
  return result;
5023
5057
  }
@@ -5036,7 +5070,7 @@ struct ggml_tensor * ggml_dup_inplace(
5036
5070
 
5037
5071
  // ggml_add
5038
5072
 
5039
- struct ggml_tensor * ggml_add_impl(
5073
+ static struct ggml_tensor * ggml_add_impl(
5040
5074
  struct ggml_context * ctx,
5041
5075
  struct ggml_tensor * a,
5042
5076
  struct ggml_tensor * b,
@@ -5079,7 +5113,7 @@ struct ggml_tensor * ggml_add_inplace(
5079
5113
 
5080
5114
  // ggml_add1
5081
5115
 
5082
- struct ggml_tensor * ggml_add1_impl(
5116
+ static struct ggml_tensor * ggml_add1_impl(
5083
5117
  struct ggml_context * ctx,
5084
5118
  struct ggml_tensor * a,
5085
5119
  struct ggml_tensor * b,
@@ -5119,7 +5153,7 @@ struct ggml_tensor * ggml_add1_inplace(
5119
5153
 
5120
5154
  // ggml_acc
5121
5155
 
5122
- struct ggml_tensor * ggml_acc_impl(
5156
+ static struct ggml_tensor * ggml_acc_impl(
5123
5157
  struct ggml_context * ctx,
5124
5158
  struct ggml_tensor * a,
5125
5159
  struct ggml_tensor * b,
@@ -5141,23 +5175,13 @@ struct ggml_tensor * ggml_acc_impl(
5141
5175
 
5142
5176
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5143
5177
 
5144
- ggml_scratch_save(ctx);
5145
-
5146
- struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5);
5147
-
5148
- ((int32_t *) c->data)[0] = nb1;
5149
- ((int32_t *) c->data)[1] = nb2;
5150
- ((int32_t *) c->data)[2] = nb3;
5151
- ((int32_t *) c->data)[3] = offset;
5152
- ((int32_t *) c->data)[4] = inplace ? 1 : 0;
5153
-
5154
- ggml_scratch_load(ctx);
5178
+ int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
5179
+ ggml_set_op_params(result, params, sizeof(params));
5155
5180
 
5156
5181
  result->op = GGML_OP_ACC;
5157
5182
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5158
5183
  result->src[0] = a;
5159
5184
  result->src[1] = b;
5160
- result->src[2] = c;
5161
5185
 
5162
5186
  return result;
5163
5187
  }
@@ -5186,7 +5210,7 @@ struct ggml_tensor * ggml_acc_inplace(
5186
5210
 
5187
5211
  // ggml_sub
5188
5212
 
5189
- struct ggml_tensor * ggml_sub_impl(
5213
+ static struct ggml_tensor * ggml_sub_impl(
5190
5214
  struct ggml_context * ctx,
5191
5215
  struct ggml_tensor * a,
5192
5216
  struct ggml_tensor * b,
@@ -5225,7 +5249,7 @@ struct ggml_tensor * ggml_sub_inplace(
5225
5249
 
5226
5250
  // ggml_mul
5227
5251
 
5228
- struct ggml_tensor * ggml_mul_impl(
5252
+ static struct ggml_tensor * ggml_mul_impl(
5229
5253
  struct ggml_context * ctx,
5230
5254
  struct ggml_tensor * a,
5231
5255
  struct ggml_tensor * b,
@@ -5272,7 +5296,7 @@ struct ggml_tensor * ggml_mul_inplace(
5272
5296
 
5273
5297
  // ggml_div
5274
5298
 
5275
- struct ggml_tensor * ggml_div_impl(
5299
+ static struct ggml_tensor * ggml_div_impl(
5276
5300
  struct ggml_context * ctx,
5277
5301
  struct ggml_tensor * a,
5278
5302
  struct ggml_tensor * b,
@@ -5315,7 +5339,7 @@ struct ggml_tensor * ggml_div_inplace(
5315
5339
 
5316
5340
  // ggml_sqr
5317
5341
 
5318
- struct ggml_tensor * ggml_sqr_impl(
5342
+ static struct ggml_tensor * ggml_sqr_impl(
5319
5343
  struct ggml_context * ctx,
5320
5344
  struct ggml_tensor * a,
5321
5345
  bool inplace) {
@@ -5330,7 +5354,6 @@ struct ggml_tensor * ggml_sqr_impl(
5330
5354
  result->op = GGML_OP_SQR;
5331
5355
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5332
5356
  result->src[0] = a;
5333
- result->src[1] = NULL;
5334
5357
 
5335
5358
  return result;
5336
5359
  }
@@ -5349,7 +5372,7 @@ struct ggml_tensor * ggml_sqr_inplace(
5349
5372
 
5350
5373
  // ggml_sqrt
5351
5374
 
5352
- struct ggml_tensor * ggml_sqrt_impl(
5375
+ static struct ggml_tensor * ggml_sqrt_impl(
5353
5376
  struct ggml_context * ctx,
5354
5377
  struct ggml_tensor * a,
5355
5378
  bool inplace) {
@@ -5364,7 +5387,6 @@ struct ggml_tensor * ggml_sqrt_impl(
5364
5387
  result->op = GGML_OP_SQRT;
5365
5388
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5366
5389
  result->src[0] = a;
5367
- result->src[1] = NULL;
5368
5390
 
5369
5391
  return result;
5370
5392
  }
@@ -5384,7 +5406,7 @@ struct ggml_tensor * ggml_sqrt_inplace(
5384
5406
 
5385
5407
  // ggml_log
5386
5408
 
5387
- struct ggml_tensor * ggml_log_impl(
5409
+ static struct ggml_tensor * ggml_log_impl(
5388
5410
  struct ggml_context * ctx,
5389
5411
  struct ggml_tensor * a,
5390
5412
  bool inplace) {
@@ -5399,7 +5421,6 @@ struct ggml_tensor * ggml_log_impl(
5399
5421
  result->op = GGML_OP_LOG;
5400
5422
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5401
5423
  result->src[0] = a;
5402
- result->src[1] = NULL;
5403
5424
 
5404
5425
  return result;
5405
5426
  }
@@ -5432,7 +5453,6 @@ struct ggml_tensor * ggml_sum(
5432
5453
  result->op = GGML_OP_SUM;
5433
5454
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5434
5455
  result->src[0] = a;
5435
- result->src[1] = NULL;
5436
5456
 
5437
5457
  return result;
5438
5458
  }
@@ -5459,7 +5479,6 @@ struct ggml_tensor * ggml_sum_rows(
5459
5479
  result->op = GGML_OP_SUM_ROWS;
5460
5480
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5461
5481
  result->src[0] = a;
5462
- result->src[1] = NULL;
5463
5482
 
5464
5483
  return result;
5465
5484
  }
@@ -5482,7 +5501,6 @@ struct ggml_tensor * ggml_mean(
5482
5501
  result->op = GGML_OP_MEAN;
5483
5502
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5484
5503
  result->src[0] = a;
5485
- result->src[1] = NULL;
5486
5504
 
5487
5505
  return result;
5488
5506
  }
@@ -5506,7 +5524,6 @@ struct ggml_tensor * ggml_argmax(
5506
5524
  result->op = GGML_OP_ARGMAX;
5507
5525
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5508
5526
  result->src[0] = a;
5509
- result->src[1] = NULL;
5510
5527
 
5511
5528
  return result;
5512
5529
  }
@@ -5569,343 +5586,142 @@ struct ggml_tensor * ggml_repeat_back(
5569
5586
 
5570
5587
  // ggml_abs
5571
5588
 
5572
- struct ggml_tensor * ggml_abs_impl(
5573
- struct ggml_context * ctx,
5574
- struct ggml_tensor * a,
5575
- bool inplace) {
5576
- bool is_node = false;
5577
-
5578
- if (!inplace && (a->grad)) {
5579
- is_node = true;
5580
- }
5581
-
5582
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5583
-
5584
- result->op = GGML_OP_ABS;
5585
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5586
- result->src[0] = a;
5587
- result->src[1] = NULL;
5588
-
5589
- return result;
5590
- }
5591
-
5592
5589
  struct ggml_tensor * ggml_abs(
5593
5590
  struct ggml_context * ctx,
5594
5591
  struct ggml_tensor * a) {
5595
- return ggml_abs_impl(ctx, a, false);
5592
+ return ggml_unary(ctx, a, GGML_UNARY_OP_ABS);
5596
5593
  }
5597
5594
 
5598
5595
  struct ggml_tensor * ggml_abs_inplace(
5599
5596
  struct ggml_context * ctx,
5600
5597
  struct ggml_tensor * a) {
5601
- return ggml_abs_impl(ctx, a, true);
5598
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ABS);
5602
5599
  }
5603
5600
 
5604
-
5605
5601
  // ggml_sgn
5606
5602
 
5607
- struct ggml_tensor * ggml_sgn_impl(
5608
- struct ggml_context * ctx,
5609
- struct ggml_tensor * a,
5610
- bool inplace) {
5611
- bool is_node = false;
5612
-
5613
- if (!inplace && (a->grad)) {
5614
- is_node = true;
5615
- }
5616
-
5617
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5618
-
5619
- result->op = GGML_OP_SGN;
5620
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5621
- result->src[0] = a;
5622
- result->src[1] = NULL;
5623
-
5624
- return result;
5625
- }
5626
-
5627
5603
  struct ggml_tensor * ggml_sgn(
5628
5604
  struct ggml_context * ctx,
5629
5605
  struct ggml_tensor * a) {
5630
- return ggml_sgn_impl(ctx, a, false);
5606
+ return ggml_unary(ctx, a, GGML_UNARY_OP_SGN);
5631
5607
  }
5632
5608
 
5633
5609
  struct ggml_tensor * ggml_sgn_inplace(
5634
5610
  struct ggml_context * ctx,
5635
5611
  struct ggml_tensor * a) {
5636
- return ggml_sgn_impl(ctx, a, true);
5612
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SGN);
5637
5613
  }
5638
5614
 
5639
5615
  // ggml_neg
5640
5616
 
5641
- struct ggml_tensor * ggml_neg_impl(
5642
- struct ggml_context * ctx,
5643
- struct ggml_tensor * a,
5644
- bool inplace) {
5645
- bool is_node = false;
5646
-
5647
- if (!inplace && (a->grad)) {
5648
- is_node = true;
5649
- }
5650
-
5651
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5652
-
5653
- result->op = GGML_OP_NEG;
5654
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5655
- result->src[0] = a;
5656
- result->src[1] = NULL;
5657
-
5658
- return result;
5659
- }
5660
-
5661
5617
  struct ggml_tensor * ggml_neg(
5662
5618
  struct ggml_context * ctx,
5663
5619
  struct ggml_tensor * a) {
5664
- return ggml_neg_impl(ctx, a, false);
5620
+ return ggml_unary(ctx, a, GGML_UNARY_OP_NEG);
5665
5621
  }
5666
5622
 
5667
5623
  struct ggml_tensor * ggml_neg_inplace(
5668
5624
  struct ggml_context * ctx,
5669
5625
  struct ggml_tensor * a) {
5670
- return ggml_neg_impl(ctx, a, true);
5626
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_NEG);
5671
5627
  }
5672
5628
 
5673
5629
  // ggml_step
5674
5630
 
5675
- struct ggml_tensor * ggml_step_impl(
5676
- struct ggml_context * ctx,
5677
- struct ggml_tensor * a,
5678
- bool inplace) {
5679
- bool is_node = false;
5680
-
5681
- if (!inplace && (a->grad)) {
5682
- is_node = true;
5683
- }
5684
-
5685
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5686
-
5687
- result->op = GGML_OP_STEP;
5688
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5689
- result->src[0] = a;
5690
- result->src[1] = NULL;
5691
-
5692
- return result;
5693
- }
5694
-
5695
5631
  struct ggml_tensor * ggml_step(
5696
5632
  struct ggml_context * ctx,
5697
5633
  struct ggml_tensor * a) {
5698
- return ggml_step_impl(ctx, a, false);
5634
+ return ggml_unary(ctx, a, GGML_UNARY_OP_STEP);
5699
5635
  }
5700
5636
 
5701
5637
  struct ggml_tensor * ggml_step_inplace(
5702
5638
  struct ggml_context * ctx,
5703
5639
  struct ggml_tensor * a) {
5704
- return ggml_step_impl(ctx, a, true);
5640
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_STEP);
5705
5641
  }
5706
5642
 
5707
5643
  // ggml_tanh
5708
5644
 
5709
- struct ggml_tensor * ggml_tanh_impl(
5710
- struct ggml_context * ctx,
5711
- struct ggml_tensor * a,
5712
- bool inplace) {
5713
- bool is_node = false;
5714
-
5715
- if (!inplace && (a->grad)) {
5716
- is_node = true;
5717
- }
5718
-
5719
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5720
-
5721
- result->op = GGML_OP_TANH;
5722
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5723
- result->src[0] = a;
5724
- result->src[1] = NULL;
5725
-
5726
- return result;
5727
- }
5728
-
5729
5645
  struct ggml_tensor * ggml_tanh(
5730
5646
  struct ggml_context * ctx,
5731
5647
  struct ggml_tensor * a) {
5732
- return ggml_tanh_impl(ctx, a, false);
5648
+ return ggml_unary(ctx, a, GGML_UNARY_OP_TANH);
5733
5649
  }
5734
5650
 
5735
5651
  struct ggml_tensor * ggml_tanh_inplace(
5736
5652
  struct ggml_context * ctx,
5737
5653
  struct ggml_tensor * a) {
5738
- return ggml_tanh_impl(ctx, a, true);
5654
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TANH);
5739
5655
  }
5740
5656
 
5741
5657
  // ggml_elu
5742
5658
 
5743
- struct ggml_tensor * ggml_elu_impl(
5744
- struct ggml_context * ctx,
5745
- struct ggml_tensor * a,
5746
- bool inplace) {
5747
- bool is_node = false;
5748
-
5749
- if (!inplace && (a->grad)) {
5750
- is_node = true;
5751
- }
5752
-
5753
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5754
-
5755
- result->op = GGML_OP_ELU;
5756
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5757
- result->src[0] = a;
5758
- result->src[1] = NULL;
5759
-
5760
- return result;
5761
- }
5762
-
5763
5659
  struct ggml_tensor * ggml_elu(
5764
5660
  struct ggml_context * ctx,
5765
5661
  struct ggml_tensor * a) {
5766
- return ggml_elu_impl(ctx, a, false);
5662
+ return ggml_unary(ctx, a, GGML_UNARY_OP_ELU);
5767
5663
  }
5768
5664
 
5769
5665
  struct ggml_tensor * ggml_elu_inplace(
5770
5666
  struct ggml_context * ctx,
5771
5667
  struct ggml_tensor * a) {
5772
- return ggml_elu_impl(ctx, a, true);
5668
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ELU);
5773
5669
  }
5774
5670
 
5775
5671
  // ggml_relu
5776
5672
 
5777
- struct ggml_tensor * ggml_relu_impl(
5778
- struct ggml_context * ctx,
5779
- struct ggml_tensor * a,
5780
- bool inplace) {
5781
- bool is_node = false;
5782
-
5783
- if (!inplace && (a->grad)) {
5784
- is_node = true;
5785
- }
5786
-
5787
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5788
-
5789
- result->op = GGML_OP_RELU;
5790
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5791
- result->src[0] = a;
5792
- result->src[1] = NULL;
5793
-
5794
- return result;
5795
- }
5796
-
5797
5673
  struct ggml_tensor * ggml_relu(
5798
5674
  struct ggml_context * ctx,
5799
5675
  struct ggml_tensor * a) {
5800
- return ggml_relu_impl(ctx, a, false);
5676
+ return ggml_unary(ctx, a, GGML_UNARY_OP_RELU);
5801
5677
  }
5802
5678
 
5803
5679
  struct ggml_tensor * ggml_relu_inplace(
5804
5680
  struct ggml_context * ctx,
5805
5681
  struct ggml_tensor * a) {
5806
- return ggml_relu_impl(ctx, a, true);
5682
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
5807
5683
  }
5808
5684
 
5809
5685
  // ggml_gelu
5810
5686
 
5811
- struct ggml_tensor * ggml_gelu_impl(
5812
- struct ggml_context * ctx,
5813
- struct ggml_tensor * a,
5814
- bool inplace) {
5815
- bool is_node = false;
5816
-
5817
- if (!inplace && (a->grad)) {
5818
- is_node = true;
5819
- }
5820
-
5821
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5822
-
5823
- result->op = GGML_OP_GELU;
5824
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5825
- result->src[0] = a;
5826
- result->src[1] = NULL;
5827
-
5828
- return result;
5829
- }
5830
-
5831
5687
  struct ggml_tensor * ggml_gelu(
5832
5688
  struct ggml_context * ctx,
5833
5689
  struct ggml_tensor * a) {
5834
- return ggml_gelu_impl(ctx, a, false);
5690
+ return ggml_unary(ctx, a, GGML_UNARY_OP_GELU);
5835
5691
  }
5836
5692
 
5837
5693
  struct ggml_tensor * ggml_gelu_inplace(
5838
5694
  struct ggml_context * ctx,
5839
5695
  struct ggml_tensor * a) {
5840
- return ggml_gelu_impl(ctx, a, true);
5696
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
5841
5697
  }
5842
5698
 
5843
5699
  // ggml_gelu_quick
5844
5700
 
5845
- struct ggml_tensor * ggml_gelu_quick_impl(
5846
- struct ggml_context * ctx,
5847
- struct ggml_tensor * a,
5848
- bool inplace) {
5849
- bool is_node = false;
5850
-
5851
- if (!inplace && (a->grad)) {
5852
- is_node = true;
5853
- }
5854
-
5855
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5856
-
5857
- result->op = GGML_OP_GELU_QUICK;
5858
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5859
- result->src[0] = a;
5860
- result->src[1] = NULL;
5861
-
5862
- return result;
5863
- }
5864
-
5865
5701
  struct ggml_tensor * ggml_gelu_quick(
5866
5702
  struct ggml_context * ctx,
5867
5703
  struct ggml_tensor * a) {
5868
- return ggml_gelu_quick_impl(ctx, a, false);
5704
+ return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_QUICK);
5869
5705
  }
5870
5706
 
5871
5707
  struct ggml_tensor * ggml_gelu_quick_inplace(
5872
5708
  struct ggml_context * ctx,
5873
5709
  struct ggml_tensor * a) {
5874
- return ggml_gelu_quick_impl(ctx, a, true);
5710
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_QUICK);
5875
5711
  }
5876
5712
 
5877
5713
  // ggml_silu
5878
5714
 
5879
- struct ggml_tensor * ggml_silu_impl(
5880
- struct ggml_context * ctx,
5881
- struct ggml_tensor * a,
5882
- bool inplace) {
5883
- bool is_node = false;
5884
-
5885
- if (!inplace && (a->grad)) {
5886
- is_node = true;
5887
- }
5888
-
5889
- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5890
-
5891
- result->op = GGML_OP_SILU;
5892
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5893
- result->src[0] = a;
5894
- result->src[1] = NULL;
5895
-
5896
- return result;
5897
- }
5898
-
5899
- struct ggml_tensor * ggml_silu(
5715
+ struct ggml_tensor * ggml_silu(
5900
5716
  struct ggml_context * ctx,
5901
5717
  struct ggml_tensor * a) {
5902
- return ggml_silu_impl(ctx, a, false);
5718
+ return ggml_unary(ctx, a, GGML_UNARY_OP_SILU);
5903
5719
  }
5904
5720
 
5905
5721
  struct ggml_tensor * ggml_silu_inplace(
5906
5722
  struct ggml_context * ctx,
5907
5723
  struct ggml_tensor * a) {
5908
- return ggml_silu_impl(ctx, a, true);
5724
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SILU);
5909
5725
  }
5910
5726
 
5911
5727
  // ggml_silu_back
@@ -5933,7 +5749,7 @@ struct ggml_tensor * ggml_silu_back(
5933
5749
 
5934
5750
  // ggml_norm
5935
5751
 
5936
- struct ggml_tensor * ggml_norm_impl(
5752
+ static struct ggml_tensor * ggml_norm_impl(
5937
5753
  struct ggml_context * ctx,
5938
5754
  struct ggml_tensor * a,
5939
5755
  bool inplace) {
@@ -5946,10 +5762,11 @@ struct ggml_tensor * ggml_norm_impl(
5946
5762
 
5947
5763
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5948
5764
 
5765
+ // TODO: maybe store epsilon here?
5766
+
5949
5767
  result->op = GGML_OP_NORM;
5950
5768
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5951
5769
  result->src[0] = a;
5952
- result->src[1] = NULL; // TODO: maybe store epsilon here?
5953
5770
 
5954
5771
  return result;
5955
5772
  }
@@ -5966,9 +5783,10 @@ struct ggml_tensor * ggml_norm_inplace(
  return ggml_norm_impl(ctx, a, true);
  }
 
- struct ggml_tensor * ggml_rms_norm_impl(
+ static struct ggml_tensor * ggml_rms_norm_impl(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
+ float eps,
  bool inplace) {
  bool is_node = false;
 
@@ -5978,24 +5796,27 @@ struct ggml_tensor * ggml_rms_norm_impl(
 
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+ ggml_set_op_params(result, &eps, sizeof(eps));
+
  result->op = GGML_OP_RMS_NORM;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src[0] = a;
- result->src[1] = NULL; // TODO: maybe store epsilon here?
 
  return result;
  }
 
  struct ggml_tensor * ggml_rms_norm(
  struct ggml_context * ctx,
- struct ggml_tensor * a) {
- return ggml_rms_norm_impl(ctx, a, false);
+ struct ggml_tensor * a,
+ float eps) {
+ return ggml_rms_norm_impl(ctx, a, eps, false);
  }
 
  struct ggml_tensor * ggml_rms_norm_inplace(
  struct ggml_context * ctx,
- struct ggml_tensor * a) {
- return ggml_rms_norm_impl(ctx, a, true);
+ struct ggml_tensor * a,
+ float eps) {
+ return ggml_rms_norm_impl(ctx, a, eps, true);
  }
 
  struct ggml_tensor * ggml_rms_norm_back(
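
ggml_rms_norm and ggml_rms_norm_inplace now take an explicit epsilon argument, carried on the node via ggml_set_op_params as shown in the hunk above. A minimal call-site sketch (ctx and cur are assumed to exist; 1e-6f is only an illustrative epsilon, not a value prescribed by this diff):

    struct ggml_tensor * normed = ggml_rms_norm(ctx, cur, 1e-6f);

    // or, reusing the input buffer:
    cur = ggml_rms_norm_inplace(ctx, cur, 1e-6f);
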
@@ -6074,7 +5895,7 @@ struct ggml_tensor * ggml_out_prod(
6074
5895
 
6075
5896
  // ggml_scale
6076
5897
 
6077
- struct ggml_tensor * ggml_scale_impl(
5898
+ static struct ggml_tensor * ggml_scale_impl(
6078
5899
  struct ggml_context * ctx,
6079
5900
  struct ggml_tensor * a,
6080
5901
  struct ggml_tensor * b,
@@ -6114,7 +5935,7 @@ struct ggml_tensor * ggml_scale_inplace(
6114
5935
 
6115
5936
  // ggml_set
6116
5937
 
6117
- struct ggml_tensor * ggml_set_impl(
5938
+ static struct ggml_tensor * ggml_set_impl(
6118
5939
  struct ggml_context * ctx,
6119
5940
  struct ggml_tensor * a,
6120
5941
  struct ggml_tensor * b,
@@ -6134,23 +5955,13 @@ struct ggml_tensor * ggml_set_impl(
6134
5955
  // make a view of the destination
6135
5956
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6136
5957
 
6137
- ggml_scratch_save(ctx);
6138
-
6139
- struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5);
6140
-
6141
- (( int32_t * ) c->data)[0] = nb1;
6142
- (( int32_t * ) c->data)[1] = nb2;
6143
- (( int32_t * ) c->data)[2] = nb3;
6144
- (( int32_t * ) c->data)[3] = offset;
6145
- (( int32_t * ) c->data)[4] = inplace ? 1 : 0;
6146
-
6147
- ggml_scratch_load(ctx);
5958
+ int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
5959
+ ggml_set_op_params(result, params, sizeof(params));
6148
5960
 
6149
5961
  result->op = GGML_OP_SET;
6150
5962
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6151
5963
  result->src[0] = a;
6152
5964
  result->src[1] = b;
6153
- result->src[2] = c;
6154
5965
 
6155
5966
  return result;
6156
5967
  }
@@ -6214,7 +6025,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
6214
6025
 
6215
6026
  // ggml_cpy
6216
6027
 
6217
- struct ggml_tensor * ggml_cpy_impl(
6028
+ static struct ggml_tensor * ggml_cpy_impl(
6218
6029
  struct ggml_context * ctx,
6219
6030
  struct ggml_tensor * a,
6220
6031
  struct ggml_tensor * b,
@@ -6259,7 +6070,7 @@ struct ggml_tensor * ggml_cpy_inplace(
6259
6070
 
6260
6071
  // ggml_cont
6261
6072
 
6262
- struct ggml_tensor * ggml_cont_impl(
6073
+ static struct ggml_tensor * ggml_cont_impl(
6263
6074
  struct ggml_context * ctx,
6264
6075
  struct ggml_tensor * a,
6265
6076
  bool inplace) {
@@ -6275,7 +6086,6 @@ struct ggml_tensor * ggml_cont_impl(
6275
6086
  result->op = GGML_OP_CONT;
6276
6087
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6277
6088
  result->src[0] = a;
6278
- result->src[1] = NULL;
6279
6089
 
6280
6090
  return result;
6281
6091
  }
@@ -6319,7 +6129,6 @@ struct ggml_tensor * ggml_reshape(
6319
6129
  result->op = GGML_OP_RESHAPE;
6320
6130
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6321
6131
  result->src[0] = a;
6322
- result->src[1] = NULL;
6323
6132
 
6324
6133
  return result;
6325
6134
  }
@@ -6344,7 +6153,6 @@ struct ggml_tensor * ggml_reshape_1d(
6344
6153
  result->op = GGML_OP_RESHAPE;
6345
6154
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6346
6155
  result->src[0] = a;
6347
- result->src[1] = NULL;
6348
6156
 
6349
6157
  return result;
6350
6158
  }
@@ -6370,7 +6178,6 @@ struct ggml_tensor * ggml_reshape_2d(
6370
6178
  result->op = GGML_OP_RESHAPE;
6371
6179
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6372
6180
  result->src[0] = a;
6373
- result->src[1] = NULL;
6374
6181
 
6375
6182
  return result;
6376
6183
  }
@@ -6397,7 +6204,6 @@ struct ggml_tensor * ggml_reshape_3d(
6397
6204
  result->op = GGML_OP_RESHAPE;
6398
6205
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6399
6206
  result->src[0] = a;
6400
- result->src[1] = NULL;
6401
6207
 
6402
6208
  return result;
6403
6209
  }
@@ -6426,7 +6232,6 @@ struct ggml_tensor * ggml_reshape_4d(
6426
6232
  result->op = GGML_OP_RESHAPE;
6427
6233
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6428
6234
  result->src[0] = a;
6429
- result->src[1] = NULL;
6430
6235
 
6431
6236
  return result;
6432
6237
  }
@@ -6448,19 +6253,11 @@ struct ggml_tensor * ggml_view_1d(
6448
6253
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
6449
6254
  ggml_format_name(result, "%s (view)", a->name);
6450
6255
 
6451
- ggml_scratch_save(ctx);
6452
-
6453
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6454
- ggml_set_name(offs, "offset");
6455
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6456
-
6457
- ggml_scratch_load(ctx);
6256
+ ggml_set_op_params(result, &offset, sizeof(offset));
6458
6257
 
6459
6258
  result->op = GGML_OP_VIEW;
6460
6259
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6461
6260
  result->src[0] = a;
6462
- result->src[1] = NULL;
6463
- result->src[2] = offs;
6464
6261
 
6465
6262
  return result;
6466
6263
  }
@@ -6486,13 +6283,7 @@ struct ggml_tensor * ggml_view_2d(
6486
6283
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
6487
6284
  ggml_format_name(result, "%s (view)", a->name);
6488
6285
 
6489
- ggml_scratch_save(ctx);
6490
-
6491
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6492
- ggml_set_name(offs, "offset");
6493
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6494
-
6495
- ggml_scratch_load(ctx);
6286
+ ggml_set_op_params(result, &offset, sizeof(offset));
6496
6287
 
6497
6288
  result->nb[1] = nb1;
6498
6289
  result->nb[2] = result->nb[1]*ne1;
@@ -6501,8 +6292,6 @@ struct ggml_tensor * ggml_view_2d(
6501
6292
  result->op = GGML_OP_VIEW;
6502
6293
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6503
6294
  result->src[0] = a;
6504
- result->src[1] = NULL;
6505
- result->src[2] = offs;
6506
6295
 
6507
6296
  return result;
6508
6297
  }
@@ -6530,13 +6319,7 @@ struct ggml_tensor * ggml_view_3d(
6530
6319
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
6531
6320
  ggml_format_name(result, "%s (view)", a->name);
6532
6321
 
6533
- ggml_scratch_save(ctx);
6534
-
6535
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6536
- ggml_set_name(offs, "offset");
6537
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6538
-
6539
- ggml_scratch_load(ctx);
6322
+ ggml_set_op_params(result, &offset, sizeof(offset));
6540
6323
 
6541
6324
  result->nb[1] = nb1;
6542
6325
  result->nb[2] = nb2;
@@ -6545,8 +6328,6 @@ struct ggml_tensor * ggml_view_3d(
6545
6328
  result->op = GGML_OP_VIEW;
6546
6329
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6547
6330
  result->src[0] = a;
6548
- result->src[1] = NULL;
6549
- result->src[2] = offs;
6550
6331
 
6551
6332
  return result;
6552
6333
  }
@@ -6576,13 +6357,7 @@ struct ggml_tensor * ggml_view_4d(
6576
6357
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
6577
6358
  ggml_format_name(result, "%s (view)", a->name);
6578
6359
 
6579
- ggml_scratch_save(ctx);
6580
-
6581
- struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6582
- ggml_set_name(offs, "offset");
6583
- memcpy(offs->data, &offset, 2*sizeof(int32_t));
6584
-
6585
- ggml_scratch_load(ctx);
6360
+ ggml_set_op_params(result, &offset, sizeof(offset));
6586
6361
 
6587
6362
  result->nb[1] = nb1;
6588
6363
  result->nb[2] = nb2;
@@ -6591,8 +6366,6 @@ struct ggml_tensor * ggml_view_4d(
6591
6366
  result->op = GGML_OP_VIEW;
6592
6367
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6593
6368
  result->src[0] = a;
6594
- result->src[1] = NULL;
6595
- result->src[2] = offs;
6596
6369
 
6597
6370
  return result;
6598
6371
  }
@@ -6653,22 +6426,9 @@ struct ggml_tensor * ggml_permute(
6653
6426
  result->op = GGML_OP_PERMUTE;
6654
6427
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6655
6428
  result->src[0] = a;
6656
- result->src[1] = NULL;
6657
-
6658
- if (is_node) {
6659
- ggml_scratch_save(ctx);
6660
-
6661
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
6662
-
6663
- ((int32_t *) b->data)[0] = axis0;
6664
- ((int32_t *) b->data)[1] = axis1;
6665
- ((int32_t *) b->data)[2] = axis2;
6666
- ((int32_t *) b->data)[3] = axis3;
6667
6429
 
6668
- ggml_scratch_load(ctx);
6669
-
6670
- result->src[2] = b;
6671
- }
6430
+ int32_t params[] = { axis0, axis1, axis2, axis3 };
6431
+ ggml_set_op_params(result, &params, sizeof(params));
6672
6432
 
6673
6433
  return result;
6674
6434
  }
@@ -6696,7 +6456,6 @@ struct ggml_tensor * ggml_transpose(
6696
6456
  result->op = GGML_OP_TRANSPOSE;
6697
6457
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6698
6458
  result->src[0] = a;
6699
- result->src[1] = NULL;
6700
6459
 
6701
6460
  return result;
6702
6461
  }
@@ -6774,7 +6533,6 @@ struct ggml_tensor * ggml_diag(
6774
6533
  result->op = GGML_OP_DIAG;
6775
6534
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6776
6535
  result->src[0] = a;
6777
- result->src[1] = NULL;
6778
6536
 
6779
6537
  return result;
6780
6538
  }
@@ -6782,7 +6540,7 @@ struct ggml_tensor * ggml_diag(
6782
6540
 
6783
6541
  // ggml_diag_mask_inf
6784
6542
 
6785
- struct ggml_tensor * ggml_diag_mask_inf_impl(
6543
+ static struct ggml_tensor * ggml_diag_mask_inf_impl(
6786
6544
  struct ggml_context * ctx,
6787
6545
  struct ggml_tensor * a,
6788
6546
  int n_past,
@@ -6795,19 +6553,12 @@ struct ggml_tensor * ggml_diag_mask_inf_impl(
6795
6553
 
6796
6554
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6797
6555
 
6798
- ggml_scratch_save(ctx);
6799
-
6800
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6801
-
6802
- ((int32_t *) b->data)[0] = n_past;
6803
- ((int32_t *) b->data)[1] = inplace ? 1 : 0;
6804
-
6805
- ggml_scratch_load(ctx);
6556
+ int32_t params[] = { n_past, inplace ? 1 : 0 };
6557
+ ggml_set_op_params(result, &params, sizeof(params));
6806
6558
 
6807
6559
  result->op = GGML_OP_DIAG_MASK_INF;
6808
6560
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6809
6561
  result->src[0] = a;
6810
- result->src[1] = b;
6811
6562
 
6812
6563
  return result;
6813
6564
  }
@@ -6829,7 +6580,7 @@ struct ggml_tensor * ggml_diag_mask_inf_inplace(
6829
6580
 
6830
6581
  // ggml_diag_mask_zero
6831
6582
 
6832
- struct ggml_tensor * ggml_diag_mask_zero_impl(
6583
+ static struct ggml_tensor * ggml_diag_mask_zero_impl(
6833
6584
  struct ggml_context * ctx,
6834
6585
  struct ggml_tensor * a,
6835
6586
  int n_past,
@@ -6842,20 +6593,12 @@ struct ggml_tensor * ggml_diag_mask_zero_impl(
6842
6593
 
6843
6594
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6844
6595
 
6845
- ggml_scratch_save(ctx);
6846
-
6847
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6848
- ggml_set_name(b, "n_past, inplace");
6849
-
6850
- ((int32_t *) b->data)[0] = n_past;
6851
- ((int32_t *) b->data)[1] = inplace ? 1 : 0;
6852
-
6853
- ggml_scratch_load(ctx);
6596
+ int32_t params[] = { n_past, inplace ? 1 : 0 };
6597
+ ggml_set_op_params(result, &params, sizeof(params));
6854
6598
 
6855
6599
  result->op = GGML_OP_DIAG_MASK_ZERO;
6856
6600
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6857
6601
  result->src[0] = a;
6858
- result->src[1] = b;
6859
6602
 
6860
6603
  return result;
6861
6604
  }
@@ -6876,7 +6619,7 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
6876
6619
 
6877
6620
  // ggml_soft_max
6878
6621
 
6879
- struct ggml_tensor * ggml_soft_max_impl(
6622
+ static struct ggml_tensor * ggml_soft_max_impl(
6880
6623
  struct ggml_context * ctx,
6881
6624
  struct ggml_tensor * a,
6882
6625
  bool inplace) {
@@ -6891,7 +6634,6 @@ struct ggml_tensor * ggml_soft_max_impl(
6891
6634
  result->op = GGML_OP_SOFT_MAX;
6892
6635
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6893
6636
  result->src[0] = a;
6894
- result->src[1] = NULL;
6895
6637
 
6896
6638
  return result;
6897
6639
  }
@@ -6911,7 +6653,7 @@ struct ggml_tensor * ggml_soft_max_inplace(
6911
6653
 
6912
6654
  // ggml_soft_max_back
6913
6655
 
6914
- struct ggml_tensor * ggml_soft_max_back_impl(
6656
+ static struct ggml_tensor * ggml_soft_max_back_impl(
6915
6657
  struct ggml_context * ctx,
6916
6658
  struct ggml_tensor * a,
6917
6659
  struct ggml_tensor * b,
@@ -6948,13 +6690,15 @@ struct ggml_tensor * ggml_soft_max_back_inplace(
6948
6690
 
6949
6691
  // ggml_rope
6950
6692
 
6951
- struct ggml_tensor * ggml_rope_impl(
6693
+ static struct ggml_tensor * ggml_rope_impl(
6952
6694
  struct ggml_context * ctx,
6953
6695
  struct ggml_tensor * a,
6954
6696
  int n_past,
6955
6697
  int n_dims,
6956
6698
  int mode,
6957
6699
  int n_ctx,
6700
+ float freq_base,
6701
+ float freq_scale,
6958
6702
  bool inplace) {
6959
6703
  GGML_ASSERT(n_past >= 0);
6960
6704
  bool is_node = false;
@@ -6965,21 +6709,14 @@ struct ggml_tensor * ggml_rope_impl(
6965
6709
 
6966
6710
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6967
6711
 
6968
- ggml_scratch_save(ctx);
6969
-
6970
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
6971
-
6972
- ((int32_t *) b->data)[0] = n_past;
6973
- ((int32_t *) b->data)[1] = n_dims;
6974
- ((int32_t *) b->data)[2] = mode;
6975
- ((int32_t *) b->data)[3] = n_ctx;
6976
-
6977
- ggml_scratch_load(ctx);
6712
+ int32_t params[6] = { n_past, n_dims, mode, n_ctx };
6713
+ memcpy(params + 4, &freq_base, sizeof(float));
6714
+ memcpy(params + 5, &freq_scale, sizeof(float));
6715
+ ggml_set_op_params(result, &params, sizeof(params));
6978
6716
 
6979
6717
  result->op = GGML_OP_ROPE;
6980
6718
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6981
6719
  result->src[0] = a;
6982
- result->src[1] = b;
6983
6720
 
6984
6721
  return result;
6985
6722
  }
@@ -6991,7 +6728,7 @@ struct ggml_tensor * ggml_rope(
6991
6728
  int n_dims,
6992
6729
  int mode,
6993
6730
  int n_ctx) {
6994
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
6731
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false);
6995
6732
  }
6996
6733
 
6997
6734
  struct ggml_tensor * ggml_rope_inplace(
@@ -7001,7 +6738,19 @@ struct ggml_tensor * ggml_rope_inplace(
  int n_dims,
  int mode,
  int n_ctx) {
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
+ }
+
+ struct ggml_tensor * ggml_rope_custom_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode,
+ int n_ctx,
+ float freq_base,
+ float freq_scale) {
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true);
  }
 
  // ggml_rope_back
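
The existing ggml_rope/ggml_rope_inplace wrappers keep the previous behaviour by passing freq_base = 10000.0f and freq_scale = 1.0f, while the new ggml_rope_custom_inplace exposes both knobs. A call-site sketch (the variable names and the 0.5f scale are illustrative, not taken from this diff):

    cur = ggml_rope_custom_inplace(ctx, cur, n_past, n_rot,
                                   /*mode       =*/ 0,
                                   /*n_ctx      =*/ 0,
                                   /*freq_base  =*/ 10000.0f,
                                   /*freq_scale =*/ 0.5f);  // e.g. compressing positions for a longer context
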
@@ -7011,7 +6760,8 @@ struct ggml_tensor * ggml_rope_back(
7011
6760
  struct ggml_tensor * a,
7012
6761
  int n_past,
7013
6762
  int n_dims,
7014
- int mode) {
6763
+ int mode,
6764
+ int n_ctx) {
7015
6765
  GGML_ASSERT(n_past >= 0);
7016
6766
  GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
7017
6767
 
@@ -7023,21 +6773,12 @@ struct ggml_tensor * ggml_rope_back(
7023
6773
 
7024
6774
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
7025
6775
 
7026
- ggml_scratch_save(ctx);
7027
-
7028
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7029
- ggml_set_name(b, "n_past, n_dims, mode");
7030
-
7031
- ((int32_t *) b->data)[0] = n_past;
7032
- ((int32_t *) b->data)[1] = n_dims;
7033
- ((int32_t *) b->data)[2] = mode;
7034
-
7035
- ggml_scratch_load(ctx);
6776
+ int32_t params[] = { n_past, n_dims, mode, n_ctx };
6777
+ ggml_set_op_params(result, &params, sizeof(params));
7036
6778
 
7037
6779
  result->op = GGML_OP_ROPE_BACK;
7038
6780
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7039
6781
  result->src[0] = a;
7040
- result->src[1] = b;
7041
6782
 
7042
6783
  return result;
7043
6784
  }
@@ -7062,21 +6803,13 @@ struct ggml_tensor * ggml_alibi(
7062
6803
  //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7063
6804
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
7064
6805
 
7065
- ggml_scratch_save(ctx);
7066
-
7067
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7068
-
7069
- ((int32_t *) b->data)[0] = n_past;
7070
- ((int32_t *) b->data)[1] = n_head;
7071
- GGML_ASSERT(sizeof(float) == sizeof(int32_t));
7072
- (((float *) b->data)[2]) = bias_max;
7073
-
7074
- ggml_scratch_load(ctx);
6806
+ int32_t op_params[3] = { n_past, n_head };
6807
+ memcpy(op_params + 2, &bias_max, sizeof(float));
6808
+ ggml_set_op_params(result, &op_params, sizeof(op_params));
7075
6809
 
7076
6810
  result->op = GGML_OP_ALIBI;
7077
6811
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7078
6812
  result->src[0] = a;
7079
- result->src[1] = b;
7080
6813
 
7081
6814
  return result;
7082
6815
  }
@@ -7098,19 +6831,12 @@ struct ggml_tensor * ggml_clamp(
7098
6831
  // TODO: when implement backward, fix this:
7099
6832
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
7100
6833
 
7101
- ggml_scratch_save(ctx);
7102
-
7103
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);
7104
-
7105
- ((float *) b->data)[0] = min;
7106
- ((float *) b->data)[1] = max;
7107
-
7108
- ggml_scratch_load(ctx);
6834
+ float params[] = { min, max };
6835
+ ggml_set_op_params(result, &params, sizeof(params));
7109
6836
 
7110
6837
  result->op = GGML_OP_CLAMP;
7111
6838
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7112
6839
  result->src[0] = a;
7113
- result->src[1] = b;
7114
6840
 
7115
6841
  return result;
7116
6842
  }
@@ -7143,18 +6869,13 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
7143
6869
  };
7144
6870
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
7145
6871
 
7146
- ggml_scratch_save(ctx);
7147
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7148
- ((int32_t*)c->data)[0] = s0;
7149
- ((int32_t*)c->data)[1] = p0;
7150
- ((int32_t*)c->data)[2] = d0;
7151
- ggml_scratch_load(ctx);
6872
+ int32_t params[] = { s0, p0, d0 };
6873
+ ggml_set_op_params(result, &params, sizeof(params));
7152
6874
 
7153
6875
  result->op = GGML_OP_CONV_1D;
7154
6876
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7155
6877
  result->src[0] = a;
7156
6878
  result->src[1] = b;
7157
- result->src[2] = c;
7158
6879
 
7159
6880
  return result;
7160
6881
  }
@@ -7187,21 +6908,13 @@ struct ggml_tensor* ggml_conv_2d(
7187
6908
  };
7188
6909
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7189
6910
 
7190
- ggml_scratch_save(ctx);
7191
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
7192
- ((int32_t*)c->data)[0] = s0;
7193
- ((int32_t*)c->data)[1] = s1;
7194
- ((int32_t*)c->data)[2] = p0;
7195
- ((int32_t*)c->data)[3] = p1;
7196
- ((int32_t*)c->data)[4] = d0;
7197
- ((int32_t*)c->data)[5] = d1;
7198
- ggml_scratch_load(ctx);
6911
+ int32_t params[] = { s0, s1, p0, p1, d0, d1 };
6912
+ ggml_set_op_params(result, &params, sizeof(params));
7199
6913
 
7200
6914
  result->op = GGML_OP_CONV_2D;
7201
6915
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7202
6916
  result->src[0] = a;
7203
6917
  result->src[1] = b;
7204
- result->src[2] = c;
7205
6918
 
7206
6919
  return result;
7207
6920
 
@@ -7225,7 +6938,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
7225
6938
  return (ins + 2 * p - ks) / s + 1;
7226
6939
  }
7227
6940
 
7228
- // ggml_pool_2d
6941
+ // ggml_pool_1d
7229
6942
 
7230
6943
  struct ggml_tensor* ggml_pool_1d(
7231
6944
  struct ggml_context * ctx,
@@ -7248,18 +6961,12 @@ struct ggml_tensor* ggml_pool_1d(
7248
6961
  };
7249
6962
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
7250
6963
 
7251
- ggml_scratch_save(ctx);
7252
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
7253
- ((int32_t*)c->data)[0] = op;
7254
- ((int32_t*)c->data)[1] = k0;
7255
- ((int32_t*)c->data)[2] = s0;
7256
- ((int32_t*)c->data)[3] = p0;
7257
- ggml_scratch_load(ctx);
6964
+ int32_t params[] = { op, k0, s0, p0 };
6965
+ ggml_set_op_params(result, &params, sizeof(params));
7258
6966
 
7259
6967
  result->op = GGML_OP_POOL_1D;
7260
6968
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7261
6969
  result->src[0] = a;
7262
- result->src[1] = c;
7263
6970
 
7264
6971
  return result;
7265
6972
  }
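
ggml_pool_1d and ggml_pool_2d size their outputs with ggml_calc_pool_output_size, i.e. the usual sliding-window formula out = (in + 2*p - k)/s + 1. A quick standalone check of that arithmetic on two made-up configurations:

    #include <stdio.h>

    /* same formula as ggml_calc_pool_output_size */
    static long pool_out(long in, int k, int s, int p) {
        return (in + 2 * p - k) / s + 1;
    }

    int main(void) {
        /* k == s, p == 0: non-overlapping windows halve the length, 64 -> 32 */
        printf("in=64 k=2 s=2 p=0 -> %ld\n", pool_out(64, 2, 2, 0));
        /* k = 3, s = 1, p = 1: 'same' pooling keeps the length, 64 -> 64 */
        printf("in=64 k=3 s=1 p=1 -> %ld\n", pool_out(64, 3, 1, 1));
        return 0;
    }
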
@@ -7291,21 +6998,12 @@ struct ggml_tensor* ggml_pool_2d(
7291
6998
  };
7292
6999
  struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7293
7000
 
7294
- ggml_scratch_save(ctx);
7295
- struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7);
7296
- ((int32_t*)c->data)[0] = op;
7297
- ((int32_t*)c->data)[1] = k0;
7298
- ((int32_t*)c->data)[2] = k1;
7299
- ((int32_t*)c->data)[3] = s0;
7300
- ((int32_t*)c->data)[4] = s1;
7301
- ((int32_t*)c->data)[5] = p0;
7302
- ((int32_t*)c->data)[6] = p1;
7303
- ggml_scratch_load(ctx);
7001
+ int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
7002
+ ggml_set_op_params(result, &params, sizeof(params));
7304
7003
 
7305
7004
  result->op = GGML_OP_POOL_2D;
7306
7005
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7307
7006
  result->src[0] = a;
7308
- result->src[1] = c;
7309
7007
 
7310
7008
  return result;
7311
7009
  }
@@ -7328,14 +7026,16 @@ struct ggml_tensor * ggml_flash_attn(
7328
7026
  }
7329
7027
 
7330
7028
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
7331
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, q->ne);
7029
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne);
7030
+
7031
+ int32_t t = masked ? 1 : 0;
7032
+ ggml_set_op_params(result, &t, sizeof(t));
7332
7033
 
7333
7034
  result->op = GGML_OP_FLASH_ATTN;
7334
7035
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7335
7036
  result->src[0] = q;
7336
7037
  result->src[1] = k;
7337
7038
  result->src[2] = v;
7338
- result->src[3] = ggml_new_i32(ctx, masked ? 1 : 0);
7339
7039
 
7340
7040
  return result;
7341
7041
  }
@@ -7359,7 +7059,7 @@ struct ggml_tensor * ggml_flash_ff(
7359
7059
  }
7360
7060
 
7361
7061
  //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
7362
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, a->ne);
7062
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne);
7363
7063
 
7364
7064
  result->op = GGML_OP_FLASH_FF;
7365
7065
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7425,13 +7125,15 @@ struct ggml_tensor * ggml_flash_attn_back(
7425
7125
 
7426
7126
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7427
7127
 
7128
+ int32_t masked_i = masked ? 1 : 0;
7129
+ ggml_set_op_params(result, &masked_i, sizeof(masked_i));
7130
+
7428
7131
  result->op = GGML_OP_FLASH_ATTN_BACK;
7429
7132
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7430
7133
  result->src[0] = q;
7431
7134
  result->src[1] = k;
7432
7135
  result->src[2] = v;
7433
7136
  result->src[3] = d;
7434
- result->src[4] = ggml_new_i32(ctx, masked ? 1 : 0);
7435
7137
 
7436
7138
  return result;
7437
7139
  }
@@ -7464,21 +7166,12 @@ struct ggml_tensor * ggml_win_part(
7464
7166
 
7465
7167
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7466
7168
 
7467
- ggml_scratch_save(ctx);
7468
-
7469
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7470
-
7471
- ((int32_t *) b->data)[0] = npx;
7472
- ((int32_t *) b->data)[1] = npy;
7473
- ((int32_t *) b->data)[2] = w;
7474
-
7475
- ggml_scratch_load(ctx);
7169
+ int32_t params[] = { npx, npy, w };
7170
+ ggml_set_op_params(result, &params, sizeof(params));
7476
7171
 
7477
7172
  result->op = GGML_OP_WIN_PART;
7478
7173
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7479
7174
  result->src[0] = a;
7480
- result->src[1] = NULL;
7481
- result->src[2] = b;
7482
7175
 
7483
7176
  return result;
7484
7177
  }
@@ -7503,26 +7196,57 @@ struct ggml_tensor * ggml_win_unpart(
7503
7196
  const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
7504
7197
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7505
7198
 
7506
- ggml_scratch_save(ctx);
7199
+ int32_t params[] = { w };
7200
+ ggml_set_op_params(result, &params, sizeof(params));
7507
7201
 
7508
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
7202
+ result->op = GGML_OP_WIN_UNPART;
7203
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7204
+ result->src[0] = a;
7509
7205
 
7510
- ((int32_t *) b->data)[0] = w;
7206
+ return result;
7207
+ }
7511
7208
 
7512
- ggml_scratch_load(ctx);
7209
+ // ggml_unary
7513
7210
 
7514
- result->op = GGML_OP_WIN_UNPART;
7211
+ static struct ggml_tensor * ggml_unary_impl(
7212
+ struct ggml_context * ctx,
7213
+ struct ggml_tensor * a,
7214
+ enum ggml_unary_op op,
7215
+ bool inplace) {
7216
+ bool is_node = false;
7217
+
7218
+ if (!inplace && (a->grad)) {
7219
+ is_node = true;
7220
+ }
7221
+
7222
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7223
+
7224
+ ggml_set_op_params_i32(result, 0, (int32_t) op);
7225
+
7226
+ result->op = GGML_OP_UNARY;
7515
7227
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7516
7228
  result->src[0] = a;
7517
- result->src[1] = NULL;
7518
- result->src[2] = b;
7519
7229
 
7520
7230
  return result;
7521
7231
  }
7522
7232
 
7233
+ struct ggml_tensor * ggml_unary(
7234
+ struct ggml_context * ctx,
7235
+ struct ggml_tensor * a,
7236
+ enum ggml_unary_op op) {
7237
+ return ggml_unary_impl(ctx, a, op, false);
7238
+ }
7239
+
7240
+ struct ggml_tensor * ggml_unary_inplace(
7241
+ struct ggml_context * ctx,
7242
+ struct ggml_tensor * a,
7243
+ enum ggml_unary_op op) {
7244
+ return ggml_unary_impl(ctx, a, op, true);
7245
+ }
7246
+
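
ggml_unary_impl above records which activation to apply as an int32 in op_params (ggml_set_op_params_i32) and tags the node with the single GGML_OP_UNARY op; the compute side later recovers it with ggml_get_unary_op and dispatches. A self-contained miniature of that enum-in-params design — the node struct, helper names and the two activations are placeholders chosen for the sketch, not ggml's definitions:

    #include <stdint.h>
    #include <stdio.h>

    enum unary_op { UNARY_RELU, UNARY_NEG };      /* stand-ins for ggml_unary_op values */

    struct node {
        int32_t op_params[8];                     /* illustrative parameter block */
    };

    static void set_unary_op(struct node * n, enum unary_op op) { n->op_params[0] = (int32_t) op; }
    static enum unary_op get_unary_op(const struct node * n)    { return (enum unary_op) n->op_params[0]; }

    /* one forward entry point that switches on the stored sub-op */
    static float forward_unary(const struct node * n, float x) {
        switch (get_unary_op(n)) {
            case UNARY_RELU: return x > 0.0f ? x : 0.0f;
            case UNARY_NEG:  return -x;
        }
        return x;
    }

    int main(void) {
        struct node relu = {0}, neg = {0};
        set_unary_op(&relu, UNARY_RELU);
        set_unary_op(&neg,  UNARY_NEG);
        printf("relu(-2.5) = %.2f\n", forward_unary(&relu, -2.5f));
        printf("neg(-2.5)  = %.2f\n", forward_unary(&neg,  -2.5f));
        return 0;
    }
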
7523
7247
  // ggml_map_unary
7524
7248
 
7525
- struct ggml_tensor * ggml_map_unary_impl_f32(
7249
+ static struct ggml_tensor * ggml_map_unary_impl_f32(
7526
7250
  struct ggml_context * ctx,
7527
7251
  struct ggml_tensor * a,
7528
7252
  const ggml_unary_op_f32_t fun,
@@ -7533,19 +7257,13 @@ struct ggml_tensor * ggml_map_unary_impl_f32(
7533
7257
  is_node = true;
7534
7258
  }
7535
7259
 
7536
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7537
-
7538
- ggml_scratch_save(ctx);
7539
-
7540
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7541
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7260
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7542
7261
 
7543
- ggml_scratch_load(ctx);
7262
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7544
7263
 
7545
7264
  result->op = GGML_OP_MAP_UNARY;
7546
7265
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7547
7266
  result->src[0] = a;
7548
- result->src[2] = addr_tensor;
7549
7267
 
7550
7268
  return result;
7551
7269
  }
@@ -7566,7 +7284,7 @@ struct ggml_tensor * ggml_map_unary_inplace_f32(
7566
7284
 
7567
7285
  // ggml_map_binary
7568
7286
 
7569
- struct ggml_tensor * ggml_map_binary_impl_f32(
7287
+ static struct ggml_tensor * ggml_map_binary_impl_f32(
7570
7288
  struct ggml_context * ctx,
7571
7289
  struct ggml_tensor * a,
7572
7290
  struct ggml_tensor * b,
@@ -7580,20 +7298,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
7580
7298
  is_node = true;
7581
7299
  }
7582
7300
 
7583
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7584
-
7585
- ggml_scratch_save(ctx);
7586
-
7587
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7588
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7301
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7589
7302
 
7590
- ggml_scratch_load(ctx);
7303
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7591
7304
 
7592
7305
  result->op = GGML_OP_MAP_BINARY;
7593
7306
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7594
7307
  result->src[0] = a;
7595
7308
  result->src[1] = b;
7596
- result->src[2] = addr_tensor;
7597
7309
 
7598
7310
  return result;
7599
7311
  }
@@ -7616,7 +7328,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
7616
7328
 
7617
7329
  // ggml_map_custom1
7618
7330
 
7619
- struct ggml_tensor * ggml_map_custom1_impl_f32(
7331
+ static struct ggml_tensor * ggml_map_custom1_impl_f32(
7620
7332
  struct ggml_context * ctx,
7621
7333
  struct ggml_tensor * a,
7622
7334
  const ggml_custom1_op_f32_t fun,
@@ -7627,19 +7339,13 @@ struct ggml_tensor * ggml_map_custom1_impl_f32(
7627
7339
  is_node = true;
7628
7340
  }
7629
7341
 
7630
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7631
-
7632
- ggml_scratch_save(ctx);
7633
-
7634
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7635
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7342
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7636
7343
 
7637
- ggml_scratch_load(ctx);
7344
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7638
7345
 
7639
7346
  result->op = GGML_OP_MAP_CUSTOM1;
7640
7347
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7641
7348
  result->src[0] = a;
7642
- result->src[2] = addr_tensor;
7643
7349
 
7644
7350
  return result;
7645
7351
  }
@@ -7660,7 +7366,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32(
7660
7366
 
7661
7367
  // ggml_map_custom2
7662
7368
 
7663
- struct ggml_tensor * ggml_map_custom2_impl_f32(
7369
+ static struct ggml_tensor * ggml_map_custom2_impl_f32(
7664
7370
  struct ggml_context * ctx,
7665
7371
  struct ggml_tensor * a,
7666
7372
  struct ggml_tensor * b,
@@ -7672,20 +7378,14 @@ struct ggml_tensor * ggml_map_custom2_impl_f32(
7672
7378
  is_node = true;
7673
7379
  }
7674
7380
 
7675
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7676
-
7677
- ggml_scratch_save(ctx);
7678
-
7679
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7680
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7381
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7681
7382
 
7682
- ggml_scratch_load(ctx);
7383
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7683
7384
 
7684
7385
  result->op = GGML_OP_MAP_CUSTOM2;
7685
7386
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7686
7387
  result->src[0] = a;
7687
7388
  result->src[1] = b;
7688
- result->src[2] = addr_tensor;
7689
7389
 
7690
7390
  return result;
7691
7391
  }
@@ -7708,7 +7408,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32(
7708
7408
 
7709
7409
  // ggml_map_custom3
7710
7410
 
7711
- struct ggml_tensor * ggml_map_custom3_impl_f32(
7411
+ static struct ggml_tensor * ggml_map_custom3_impl_f32(
7712
7412
  struct ggml_context * ctx,
7713
7413
  struct ggml_tensor * a,
7714
7414
  struct ggml_tensor * b,
@@ -7721,21 +7421,15 @@ struct ggml_tensor * ggml_map_custom3_impl_f32(
7721
7421
  is_node = true;
7722
7422
  }
7723
7423
 
7724
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7725
-
7726
- ggml_scratch_save(ctx);
7727
-
7728
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7729
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7424
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7730
7425
 
7731
- ggml_scratch_load(ctx);
7426
+ ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7732
7427
 
7733
7428
  result->op = GGML_OP_MAP_CUSTOM3;
7734
7429
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7735
7430
  result->src[0] = a;
7736
7431
  result->src[1] = b;
7737
- result->src[2] = addr_tensor;
7738
- result->src[3] = c;
7432
+ result->src[2] = c;
7739
7433
 
7740
7434
  return result;
7741
7435
  }
@@ -8963,21 +8657,17 @@ static void ggml_compute_forward_acc_f32(
8963
8657
  const struct ggml_compute_params * params,
8964
8658
  const struct ggml_tensor * src0,
8965
8659
  const struct ggml_tensor * src1,
8966
- const struct ggml_tensor * opt0,
8967
8660
  struct ggml_tensor * dst) {
8968
8661
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
8969
8662
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
8970
8663
 
8971
- GGML_ASSERT(opt0->type == GGML_TYPE_I32);
8972
- GGML_ASSERT(ggml_nelements(opt0) == 5);
8973
-
8974
8664
  // view src0 and dst with these strides and data offset in bytes during acc
8975
8665
  // nb0 is implicitly element_size because src0 and dst are contiguous
8976
- size_t nb1 = ((int32_t *) opt0->data)[0];
8977
- size_t nb2 = ((int32_t *) opt0->data)[1];
8978
- size_t nb3 = ((int32_t *) opt0->data)[2];
8979
- size_t offset = ((int32_t *) opt0->data)[3];
8980
- bool inplace = (bool) ((int32_t *) opt0->data)[4];
8666
+ size_t nb1 = ((int32_t *) dst->op_params)[0];
8667
+ size_t nb2 = ((int32_t *) dst->op_params)[1];
8668
+ size_t nb3 = ((int32_t *) dst->op_params)[2];
8669
+ size_t offset = ((int32_t *) dst->op_params)[3];
8670
+ bool inplace = (bool) ((int32_t *) dst->op_params)[4];
8981
8671
 
8982
8672
  if (!inplace && (params->type == GGML_TASK_INIT)) {
8983
8673
  // memcpy needs to be synchronized across threads to avoid race conditions.
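
Both acc and set now read their view description straight from dst->op_params: nb1/nb2/nb3 are byte strides, offset is a byte offset, and an element of the view lives at base + offset + i1*nb1 + i2*nb2 + i3*nb3 (nb0 is just the element size because the tensors are contiguous). A tiny self-contained sketch of that address arithmetic on a flat float buffer, with invented sizes and strides:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* a contiguous 3x4 float 'tensor': ne0 = 4 columns, ne1 = 3 rows */
        float data[12];
        for (int i = 0; i < 12; ++i) data[i] = (float) i;

        /* view parameters as they would come out of op_params (all in bytes) */
        const size_t nb0    = sizeof(float);   /* element size, implicit in the real code */
        const size_t nb1    = 4 * nb0;         /* one row = 4 elements                    */
        const size_t offset = 1 * nb0;         /* view starts one element in              */

        /* address element (i0 = 2, i1 = 1) of the view */
        const int i0 = 2, i1 = 1;
        const float * p = (const float *)((const char *) data + offset + i1*nb1 + i0*nb0);
        printf("view[%d][%d] = %.1f\n", i1, i0, *p);   /* prints 7.0 */
        return 0;
    }
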
@@ -9046,13 +8736,12 @@ static void ggml_compute_forward_acc(
9046
8736
  const struct ggml_compute_params * params,
9047
8737
  const struct ggml_tensor * src0,
9048
8738
  const struct ggml_tensor * src1,
9049
- const struct ggml_tensor * opt0,
9050
8739
  struct ggml_tensor * dst) {
9051
8740
 
9052
8741
  switch (src0->type) {
9053
8742
  case GGML_TYPE_F32:
9054
8743
  {
9055
- ggml_compute_forward_acc_f32(params, src0, src1, opt0, dst);
8744
+ ggml_compute_forward_acc_f32(params, src0, src1, dst);
9056
8745
  } break;
9057
8746
  case GGML_TYPE_F16:
9058
8747
  case GGML_TYPE_Q4_0:
@@ -9484,7 +9173,7 @@ static void ggml_compute_forward_sum_f32(
9484
9173
  for (int64_t i03 = 0; i03 < ne03; i03++) {
9485
9174
  for (int64_t i02 = 0; i02 < ne02; i02++) {
9486
9175
  for (int64_t i01 = 0; i01 < ne01; i01++) {
9487
- ggml_vec_sum_ggf(ne00,
9176
+ ggml_vec_sum_f32_ggf(ne00,
9488
9177
  &row_sum,
9489
9178
  (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
9490
9179
  sum += row_sum;
@@ -9494,6 +9183,38 @@ static void ggml_compute_forward_sum_f32(
9494
9183
  ((float *) dst->data)[0] = sum;
9495
9184
  }
9496
9185
 
9186
+ static void ggml_compute_forward_sum_f16(
9187
+ const struct ggml_compute_params * params,
9188
+ const struct ggml_tensor * src0,
9189
+ struct ggml_tensor * dst) {
9190
+ assert(params->ith == 0);
9191
+ assert(ggml_is_scalar(dst));
9192
+
9193
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9194
+ return;
9195
+ }
9196
+
9197
+ assert(src0->nb[0] == sizeof(ggml_fp16_t));
9198
+
9199
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
9200
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb);
9201
+
9202
+ float sum = 0;
9203
+ float row_sum = 0;
9204
+
9205
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
9206
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
9207
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
9208
+ ggml_vec_sum_f16_ggf(ne00,
9209
+ &row_sum,
9210
+ (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
9211
+ sum += row_sum;
9212
+ }
9213
+ }
9214
+ }
9215
+ ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
9216
+ }
9217
+
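
The new F16 path converts every element to F32 and accumulates in float, so precision is only lost once when the total is rounded back to F16 at the end. Plain C has no portable half type, so the sketch below shows the same accumulator-promotion idea one level up — float elements summed into a double — purely to illustrate why the accumulator is wider than the storage type:

    #include <stdio.h>

    int main(void) {
        const int n = 1000000;

        float  acc_f = 0.0f;   /* accumulator as narrow as the data           */
        double acc_d = 0.0;    /* wider accumulator, like f16 summed into f32 */

        for (int i = 0; i < n; ++i) {
            const float x = 0.1f;
            acc_f += x;            /* rounding error compounds on every add    */
            acc_d += (double) x;   /* error stays at the single final rounding */
        }

        printf("narrow accumulator: %f\n", acc_f);   /* drifts well above 100000          */
        printf("wide accumulator:   %f\n", acc_d);   /* ~100000.0015, the exact float sum */
        return 0;
    }
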
9497
9218
  static void ggml_compute_forward_sum(
9498
9219
  const struct ggml_compute_params * params,
9499
9220
  const struct ggml_tensor * src0,
@@ -9503,6 +9224,10 @@ static void ggml_compute_forward_sum(
9503
9224
  {
9504
9225
  ggml_compute_forward_sum_f32(params, src0, dst);
9505
9226
  } break;
9227
+ case GGML_TYPE_F16:
9228
+ {
9229
+ ggml_compute_forward_sum_f16(params, src0, dst);
9230
+ } break;
9506
9231
  default:
9507
9232
  {
9508
9233
  GGML_ASSERT(false);
@@ -10098,8 +9823,8 @@ static void ggml_compute_forward_gelu_f32(
10098
9823
  const struct ggml_compute_params * params,
10099
9824
  const struct ggml_tensor * src0,
10100
9825
  struct ggml_tensor * dst) {
10101
- GGML_ASSERT(ggml_is_contiguous(src0));
10102
- GGML_ASSERT(ggml_is_contiguous(dst));
9826
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9827
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10103
9828
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10104
9829
 
10105
9830
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -10157,8 +9882,8 @@ static void ggml_compute_forward_gelu_quick_f32(
10157
9882
  const struct ggml_compute_params * params,
10158
9883
  const struct ggml_tensor * src0,
10159
9884
  struct ggml_tensor * dst) {
10160
- GGML_ASSERT(ggml_is_contiguous(src0));
10161
- GGML_ASSERT(ggml_is_contiguous(dst));
9885
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9886
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10162
9887
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10163
9888
 
10164
9889
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -10216,8 +9941,8 @@ static void ggml_compute_forward_silu_f32(
10216
9941
  const struct ggml_compute_params * params,
10217
9942
  const struct ggml_tensor * src0,
10218
9943
  struct ggml_tensor * dst) {
10219
- GGML_ASSERT(ggml_is_contiguous(src0));
10220
- GGML_ASSERT(ggml_is_contiguous(dst));
9944
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9945
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10221
9946
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10222
9947
 
10223
9948
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -10269,7 +9994,6 @@ static void ggml_compute_forward_silu(
10269
9994
  }
10270
9995
  }
10271
9996
 
10272
-
10273
9997
  // ggml_compute_forward_silu_back
10274
9998
 
10275
9999
  static void ggml_compute_forward_silu_back_f32(
@@ -10277,9 +10001,9 @@ static void ggml_compute_forward_silu_back_f32(
10277
10001
  const struct ggml_tensor * src0,
10278
10002
  const struct ggml_tensor * grad,
10279
10003
  struct ggml_tensor * dst) {
10280
- GGML_ASSERT(ggml_is_contiguous(grad));
10281
- GGML_ASSERT(ggml_is_contiguous(src0));
10282
- GGML_ASSERT(ggml_is_contiguous(dst));
10004
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
10005
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
10006
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
10283
10007
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10284
10008
  GGML_ASSERT(ggml_are_same_shape(src0, grad));
10285
10009
 
@@ -10419,7 +10143,8 @@ static void ggml_compute_forward_rms_norm_f32(
10419
10143
 
10420
10144
  GGML_TENSOR_UNARY_OP_LOCALS;
10421
10145
 
10422
- const float eps = 1e-6f; // TODO: make this a parameter
10146
+ float eps;
10147
+ memcpy(&eps, dst->op_params, sizeof(float));
10423
10148
 
10424
10149
  // TODO: optimize
10425
10150
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -10684,6 +10409,8 @@ static void ggml_compute_forward_mul_mat(
10684
10409
 
10685
10410
  const enum ggml_type type = src0->type;
10686
10411
 
10412
+ const bool src1_cont = ggml_is_contiguous(src1);
10413
+
10687
10414
  ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
10688
10415
  enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
10689
10416
  ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
@@ -10747,7 +10474,7 @@ static void ggml_compute_forward_mul_mat(
10747
10474
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
10748
10475
 
10749
10476
  if (type != GGML_TYPE_F32) {
10750
- float * const wdata = params->wdata;
10477
+ float * const wdata = params->wdata;
10751
10478
  ggml_to_float_t const to_float = type_traits[type].to_float;
10752
10479
 
10753
10480
  size_t id = 0;
@@ -10805,7 +10532,7 @@ static void ggml_compute_forward_mul_mat(
10805
10532
  // src1 rows
10806
10533
  const int64_t nr1 = ne11*ne12*ne13;
10807
10534
 
10808
- void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
10535
+ const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
10809
10536
  const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
10810
10537
 
10811
10538
  for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
@@ -10828,7 +10555,15 @@ static void ggml_compute_forward_mul_mat(
10828
10555
  const int64_t i3 = i13;
10829
10556
 
10830
10557
  const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
10831
- const char * src1_col = (const char *) wdata + (i11 + i12*ne11 + i13*ne12*ne11)*row_size;
10558
+
10559
+ // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
10560
+ // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
10561
+ // the original src1 data pointer, so we should index using the indices directly
10562
+ // TODO: this is a bit of a hack, we should probably have a better way to handle this
10563
+ const char * src1_col = (const char *) wdata +
10564
+ (src1_cont || src1->type != vec_dot_type
10565
+ ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
10566
+ : (i11*nb11 + i12*nb12 + i13*nb13));
10832
10567
 
10833
10568
  float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
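
The src1_col comment above reduces to: when the row data is contiguous (either originally, or because it was converted into params->wdata), a row can be found as linear-row-index * row_size; otherwise the original byte strides nb11/nb12/nb13 have to be used. A standalone sketch showing the two addressings agree on a genuinely contiguous buffer (shapes and strides below are invented for the example):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* a contiguous float tensor with ne10 = 4 elements per row,
           ne11 = 3 rows per matrix and ne12 = 2 matrices */
        enum { ne10 = 4, ne11 = 3, ne12 = 2 };
        float data[ne12 * ne11 * ne10];
        for (int i = 0; i < ne12*ne11*ne10; ++i) data[i] = (float) i;

        const size_t row_size = ne10 * sizeof(float);
        const size_t nb11     = row_size;        /* byte stride between rows     */
        const size_t nb12     = ne11 * nb11;     /* byte stride between matrices */

        const int i11 = 2, i12 = 1;              /* pick one row */

        /* contiguous addressing: linear row index times the row size */
        const char * a = (const char *) data + (i11 + i12*ne11) * row_size;
        /* strided addressing: explicit byte strides */
        const char * b = (const char *) data + i11*nb11 + i12*nb12;

        printf("same row? %s, first element %.1f\n",
               a == b ? "yes" : "no", *(const float *) a);   /* yes, 20.0 */
        return 0;
    }
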
10834
10569
 
@@ -11062,21 +10797,17 @@ static void ggml_compute_forward_set_f32(
11062
10797
  const struct ggml_compute_params * params,
11063
10798
  const struct ggml_tensor * src0,
11064
10799
  const struct ggml_tensor * src1,
11065
- const struct ggml_tensor * opt0,
11066
10800
  struct ggml_tensor * dst) {
11067
10801
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
11068
10802
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
11069
10803
 
11070
- GGML_ASSERT(opt0->type == GGML_TYPE_I32);
11071
- GGML_ASSERT(ggml_nelements(opt0) == 5);
11072
-
11073
10804
  // view src0 and dst with these strides and data offset in bytes during set
11074
10805
  // nb0 is implicitly element_size because src0 and dst are contiguous
11075
- size_t nb1 = ((int32_t *) opt0->data)[0];
11076
- size_t nb2 = ((int32_t *) opt0->data)[1];
11077
- size_t nb3 = ((int32_t *) opt0->data)[2];
11078
- size_t offset = ((int32_t *) opt0->data)[3];
11079
- bool inplace = (bool) ((int32_t *) opt0->data)[4];
10806
+ size_t nb1 = ((int32_t *) dst->op_params)[0];
10807
+ size_t nb2 = ((int32_t *) dst->op_params)[1];
10808
+ size_t nb3 = ((int32_t *) dst->op_params)[2];
10809
+ size_t offset = ((int32_t *) dst->op_params)[3];
10810
+ bool inplace = (bool) ((int32_t *) dst->op_params)[4];
11080
10811
 
11081
10812
  if (!inplace && (params->type == GGML_TASK_INIT)) {
11082
10813
  // memcpy needs to be synchronized across threads to avoid race conditions.
@@ -11136,13 +10867,12 @@ static void ggml_compute_forward_set(
11136
10867
  const struct ggml_compute_params * params,
11137
10868
  const struct ggml_tensor * src0,
11138
10869
  const struct ggml_tensor * src1,
11139
- const struct ggml_tensor * opt0,
11140
10870
  struct ggml_tensor * dst) {
11141
10871
 
11142
10872
  switch (src0->type) {
11143
10873
  case GGML_TYPE_F32:
11144
10874
  {
11145
- ggml_compute_forward_set_f32(params, src0, src1, opt0, dst);
10875
+ ggml_compute_forward_set_f32(params, src0, src1, dst);
11146
10876
  } break;
11147
10877
  case GGML_TYPE_F16:
11148
10878
  case GGML_TYPE_Q4_0:
@@ -11538,17 +11268,14 @@ static void ggml_compute_forward_diag(
11538
11268
  static void ggml_compute_forward_diag_mask_f32(
11539
11269
  const struct ggml_compute_params * params,
11540
11270
  const struct ggml_tensor * src0,
11541
- const struct ggml_tensor * src1,
11542
11271
  struct ggml_tensor * dst,
11543
11272
  const float value) {
11544
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
11545
- GGML_ASSERT(ggml_nelements(src1) == 2);
11546
11273
 
11547
11274
  const int ith = params->ith;
11548
11275
  const int nth = params->nth;
11549
11276
 
11550
- const int n_past = ((int32_t *) src1->data)[0];
11551
- const bool inplace = (bool)((int32_t *) src1->data)[1];
11277
+ const int n_past = ((int32_t *) dst->op_params)[0];
11278
+ const bool inplace = (bool)((int32_t *) dst->op_params)[1];
11552
11279
 
11553
11280
  GGML_ASSERT(n_past >= 0);
11554
11281
 
@@ -11591,12 +11318,11 @@ static void ggml_compute_forward_diag_mask_f32(
11591
11318
  static void ggml_compute_forward_diag_mask_inf(
11592
11319
  const struct ggml_compute_params * params,
11593
11320
  const struct ggml_tensor * src0,
11594
- const struct ggml_tensor * src1,
11595
11321
  struct ggml_tensor * dst) {
11596
11322
  switch (src0->type) {
11597
11323
  case GGML_TYPE_F32:
11598
11324
  {
11599
- ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, -INFINITY);
11325
+ ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY);
11600
11326
  } break;
11601
11327
  default:
11602
11328
  {
@@ -11608,12 +11334,11 @@ static void ggml_compute_forward_diag_mask_inf(
11608
11334
  static void ggml_compute_forward_diag_mask_zero(
11609
11335
  const struct ggml_compute_params * params,
11610
11336
  const struct ggml_tensor * src0,
11611
- const struct ggml_tensor * src1,
11612
11337
  struct ggml_tensor * dst) {
11613
11338
  switch (src0->type) {
11614
11339
  case GGML_TYPE_F32:
11615
11340
  {
11616
- ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, 0);
11341
+ ggml_compute_forward_diag_mask_f32(params, src0, dst, 0);
11617
11342
  } break;
11618
11343
  default:
11619
11344
  {
@@ -11811,20 +11536,17 @@ static void ggml_compute_forward_soft_max_back(
11811
11536
  static void ggml_compute_forward_alibi_f32(
11812
11537
  const struct ggml_compute_params * params,
11813
11538
  const struct ggml_tensor * src0,
11814
- const struct ggml_tensor * src1,
11815
11539
  struct ggml_tensor * dst) {
11816
11540
  assert(params->ith == 0);
11817
11541
 
11818
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
11819
- GGML_ASSERT(ggml_nelements(src1) == 3);
11820
-
11821
11542
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11822
11543
  return;
11823
11544
  }
11824
11545
 
11825
- const int n_past = ((int32_t *) src1->data)[0];
11826
- const int n_head = ((int32_t *) src1->data)[1];
11827
- const float max_bias = ((float *) src1->data)[2];
11546
+ const int n_past = ((int32_t *) dst->op_params)[0];
11547
+ const int n_head = ((int32_t *) dst->op_params)[1];
11548
+ float max_bias;
11549
+ memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
11828
11550
 
11829
11551
  assert(n_past >= 0);
11830
11552
 
@@ -11877,20 +11599,17 @@ static void ggml_compute_forward_alibi_f32(
11877
11599
  static void ggml_compute_forward_alibi_f16(
11878
11600
  const struct ggml_compute_params * params,
11879
11601
  const struct ggml_tensor * src0,
11880
- const struct ggml_tensor * src1,
11881
11602
  struct ggml_tensor * dst) {
11882
11603
  assert(params->ith == 0);
11883
11604
 
11884
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
11885
- GGML_ASSERT(ggml_nelements(src1) == 3);
11886
-
11887
11605
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11888
11606
  return;
11889
11607
  }
11890
11608
 
11891
- const int n_past = ((int32_t *) src1->data)[0];
11892
- const int n_head = ((int32_t *) src1->data)[1];
11893
- const float max_bias = ((float *) src1->data)[2];
11609
+ const int n_past = ((int32_t *) dst->op_params)[0];
11610
+ const int n_head = ((int32_t *) dst->op_params)[1];
11611
+ float max_bias;
11612
+ memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
11894
11613
 
11895
11614
  assert(n_past >= 0);
11896
11615
 
@@ -11943,16 +11662,15 @@ static void ggml_compute_forward_alibi_f16(
11943
11662
  static void ggml_compute_forward_alibi(
11944
11663
  const struct ggml_compute_params * params,
11945
11664
  const struct ggml_tensor * src0,
11946
- const struct ggml_tensor * src1,
11947
11665
  struct ggml_tensor * dst) {
11948
11666
  switch (src0->type) {
11949
11667
  case GGML_TYPE_F16:
11950
11668
  {
11951
- ggml_compute_forward_alibi_f16(params, src0, src1, dst);
11669
+ ggml_compute_forward_alibi_f16(params, src0, dst);
11952
11670
  } break;
11953
11671
  case GGML_TYPE_F32:
11954
11672
  {
11955
- ggml_compute_forward_alibi_f32(params, src0, src1, dst);
11673
+ ggml_compute_forward_alibi_f32(params, src0, dst);
11956
11674
  } break;
11957
11675
  case GGML_TYPE_Q4_0:
11958
11676
  case GGML_TYPE_Q4_1:
@@ -11982,19 +11700,17 @@ static void ggml_compute_forward_alibi(
11982
11700
  static void ggml_compute_forward_clamp_f32(
11983
11701
  const struct ggml_compute_params * params,
11984
11702
  const struct ggml_tensor * src0,
11985
- const struct ggml_tensor * src1,
11986
11703
  struct ggml_tensor * dst) {
11987
11704
  assert(params->ith == 0);
11988
11705
 
11989
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11990
- GGML_ASSERT(ggml_nelements(src1) == 2);
11991
-
11992
11706
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11993
11707
  return;
11994
11708
  }
11995
11709
 
11996
- const float min = ((float *) src1->data)[0];
11997
- const float max = ((float *) src1->data)[1];
11710
+ float min;
11711
+ float max;
11712
+ memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
11713
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
11998
11714
 
11999
11715
  const int ith = params->ith;
12000
11716
  const int nth = params->nth;
@@ -12024,12 +11740,11 @@ static void ggml_compute_forward_clamp_f32(
12024
11740
  static void ggml_compute_forward_clamp(
12025
11741
  const struct ggml_compute_params * params,
12026
11742
  const struct ggml_tensor * src0,
12027
- const struct ggml_tensor * src1,
12028
11743
  struct ggml_tensor * dst) {
12029
11744
  switch (src0->type) {
12030
11745
  case GGML_TYPE_F32:
12031
11746
  {
12032
- ggml_compute_forward_clamp_f32(params, src0, src1, dst);
11747
+ ggml_compute_forward_clamp_f32(params, src0, dst);
12033
11748
  } break;
12034
11749
  case GGML_TYPE_F16:
12035
11750
  case GGML_TYPE_Q4_0:
@@ -12059,19 +11774,21 @@ static void ggml_compute_forward_clamp(
12059
11774
  static void ggml_compute_forward_rope_f32(
12060
11775
  const struct ggml_compute_params * params,
12061
11776
  const struct ggml_tensor * src0,
12062
- const struct ggml_tensor * src1,
12063
11777
  struct ggml_tensor * dst) {
12064
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
12065
- GGML_ASSERT(ggml_nelements(src1) == 4);
12066
11778
 
12067
11779
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12068
11780
  return;
12069
11781
  }
12070
11782
 
12071
- const int n_past = ((int32_t *) src1->data)[0];
12072
- const int n_dims = ((int32_t *) src1->data)[1];
12073
- const int mode = ((int32_t *) src1->data)[2];
12074
- const int n_ctx = ((int32_t *) src1->data)[3];
11783
+ float freq_base;
11784
+ float freq_scale;
11785
+
11786
+ const int n_past = ((int32_t *) dst->op_params)[0];
11787
+ const int n_dims = ((int32_t *) dst->op_params)[1];
11788
+ const int mode = ((int32_t *) dst->op_params)[2];
11789
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
11790
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
11791
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
12075
11792
 
12076
11793
  assert(n_past >= 0);
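
With freq_base and freq_scale now carried in op_params, the angle used for the i-th rotated pair at position p is theta_i = freq_scale * p * freq_base^(-2i/n_dims), which is exactly what the theta_scale / theta *= theta_scale loop below computes. A small standalone sketch rotating the first couple of (x0, x1) pairs with made-up inputs (build with -lm):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const int   n_dims     = 128;
        const float freq_base  = 10000.0f;   /* the previously hard-coded constant */
        const float freq_scale = 1.0f;
        const int   p          = 42;         /* token position (n_past + row)      */

        const float theta_scale = powf(freq_base, -2.0f/n_dims);

        float theta = freq_scale * (float) p;
        for (int pair = 0; pair < 2; ++pair) {            /* first two pairs only */
            const float cos_theta = cosf(theta);
            const float sin_theta = sinf(theta);

            const float x0 = 1.0f, x1 = 0.0f;             /* placeholder inputs   */
            const float y0 = x0*cos_theta - x1*sin_theta;
            const float y1 = x0*sin_theta + x1*cos_theta;

            printf("pair %d: theta=%.4f -> (%.4f, %.4f)\n", pair, theta, y0, y1);
            theta *= theta_scale;                         /* next dimension pair  */
        }
        return 0;
    }
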
12077
11794
 
@@ -12100,7 +11817,7 @@ static void ggml_compute_forward_rope_f32(
12100
11817
  // row index used to determine which thread to use
12101
11818
  int ir = 0;
12102
11819
 
12103
- const float theta_scale = powf(10000.0, -2.0f/n_dims);
11820
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
12104
11821
 
12105
11822
  const bool is_neox = mode & 2;
12106
11823
  const bool is_glm = mode & 4;
@@ -12112,7 +11829,7 @@ static void ggml_compute_forward_rope_f32(
12112
11829
  if (ir++ < ir0) continue;
12113
11830
  if (ir > ir1) break;
12114
11831
 
12115
- float theta = (float)p;
11832
+ float theta = freq_scale * (float)p;
12116
11833
 
12117
11834
  if (is_glm) {
12118
11835
  theta = MIN(p, n_ctx - 2);
@@ -12186,19 +11903,21 @@ static void ggml_compute_forward_rope_f32(
12186
11903
  static void ggml_compute_forward_rope_f16(
12187
11904
  const struct ggml_compute_params * params,
12188
11905
  const struct ggml_tensor * src0,
12189
- const struct ggml_tensor * src1,
12190
11906
  struct ggml_tensor * dst) {
12191
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
12192
- GGML_ASSERT(ggml_nelements(src1) == 4);
12193
11907
 
12194
11908
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12195
11909
  return;
12196
11910
  }
12197
11911
 
12198
- const int n_past = ((int32_t *) src1->data)[0];
12199
- const int n_dims = ((int32_t *) src1->data)[1];
12200
- const int mode = ((int32_t *) src1->data)[2];
12201
- const int n_ctx = ((int32_t *) src1->data)[3];
11912
+ float freq_base;
11913
+ float freq_scale;
11914
+
11915
+ const int n_past = ((int32_t *) dst->op_params)[0];
11916
+ const int n_dims = ((int32_t *) dst->op_params)[1];
11917
+ const int mode = ((int32_t *) dst->op_params)[2];
11918
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
11919
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
11920
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
12202
11921
 
12203
11922
  assert(n_past >= 0);
12204
11923
 
@@ -12227,7 +11946,7 @@ static void ggml_compute_forward_rope_f16(
12227
11946
  // row index used to determine which thread to use
12228
11947
  int ir = 0;
12229
11948
 
12230
- const float theta_scale = powf(10000.0, -2.0f/n_dims);
11949
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
12231
11950
 
12232
11951
  const bool is_neox = mode & 2;
12233
11952
  const bool is_glm = mode & 4;
@@ -12239,7 +11958,7 @@ static void ggml_compute_forward_rope_f16(
12239
11958
  if (ir++ < ir0) continue;
12240
11959
  if (ir > ir1) break;
12241
11960
 
12242
- float theta = (float)p;
11961
+ float theta = freq_scale * (float)p;
12243
11962
 
12244
11963
  if (is_glm) {
12245
11964
  theta = MIN(p, n_ctx - 2);
@@ -12300,7 +12019,7 @@ static void ggml_compute_forward_rope_f16(
12300
12019
  const float x0 = GGML_FP16_TO_FP32(src[0]);
12301
12020
  const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
12302
12021
 
12303
- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
12022
+ dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
12304
12023
  dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
12305
12024
  }
12306
12025
  }
@@ -12313,16 +12032,15 @@ static void ggml_compute_forward_rope_f16(
12313
12032
  static void ggml_compute_forward_rope(
12314
12033
  const struct ggml_compute_params * params,
12315
12034
  const struct ggml_tensor * src0,
12316
- const struct ggml_tensor * src1,
12317
12035
  struct ggml_tensor * dst) {
12318
12036
  switch (src0->type) {
12319
12037
  case GGML_TYPE_F16:
12320
12038
  {
12321
- ggml_compute_forward_rope_f16(params, src0, src1, dst);
12039
+ ggml_compute_forward_rope_f16(params, src0, dst);
12322
12040
  } break;
12323
12041
  case GGML_TYPE_F32:
12324
12042
  {
12325
- ggml_compute_forward_rope_f32(params, src0, src1, dst);
12043
+ ggml_compute_forward_rope_f32(params, src0, dst);
12326
12044
  } break;
12327
12045
  default:
12328
12046
  {
@@ -12336,10 +12054,7 @@ static void ggml_compute_forward_rope(
12336
12054
  static void ggml_compute_forward_rope_back_f32(
12337
12055
  const struct ggml_compute_params * params,
12338
12056
  const struct ggml_tensor * src0,
12339
- const struct ggml_tensor * src1,
12340
12057
  struct ggml_tensor * dst) {
12341
- assert(src1->type == GGML_TYPE_I32);
12342
- assert(ggml_nelements(src1) == 3);
12343
12058
 
12344
12059
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12345
12060
  return;
@@ -12349,9 +12064,9 @@ static void ggml_compute_forward_rope_back_f32(
12349
12064
  // dx = rope_back(dy, src1)
12350
12065
  // src0 is dy, src1 contains options
12351
12066
 
12352
- const int n_past = ((int32_t *) src1->data)[0];
12353
- const int n_dims = ((int32_t *) src1->data)[1];
12354
- const int mode = ((int32_t *) src1->data)[2];
12067
+ const int n_past = ((int32_t *) dst->op_params)[0];
12068
+ const int n_dims = ((int32_t *) dst->op_params)[1];
12069
+ const int mode = ((int32_t *) dst->op_params)[2];
12355
12070
 
12356
12071
  assert(n_past >= 0);
12357
12072
 
@@ -12435,10 +12150,7 @@ static void ggml_compute_forward_rope_back_f32(
12435
12150
  static void ggml_compute_forward_rope_back_f16(
12436
12151
  const struct ggml_compute_params * params,
12437
12152
  const struct ggml_tensor * src0,
12438
- const struct ggml_tensor * src1,
12439
12153
  struct ggml_tensor * dst) {
12440
- assert(src1->type == GGML_TYPE_I32);
12441
- assert(ggml_nelements(src1) == 3);
12442
12154
 
12443
12155
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12444
12156
  return;
@@ -12448,9 +12160,9 @@ static void ggml_compute_forward_rope_back_f16(
12448
12160
  // dx = rope_back(dy, src1)
12449
12161
  // src0 is dy, src1 contains options
12450
12162
 
12451
- const int n_past = ((int32_t *) src1->data)[0];
12452
- const int n_dims = ((int32_t *) src1->data)[1];
12453
- const int mode = ((int32_t *) src1->data)[2];
12163
+ const int n_past = ((int32_t *) dst->op_params)[0];
12164
+ const int n_dims = ((int32_t *) dst->op_params)[1];
12165
+ const int mode = ((int32_t *) dst->op_params)[2];
12454
12166
 
12455
12167
  assert(n_past >= 0);
12456
12168
 
@@ -12534,16 +12246,15 @@ static void ggml_compute_forward_rope_back_f16(
12534
12246
  static void ggml_compute_forward_rope_back(
12535
12247
  const struct ggml_compute_params * params,
12536
12248
  const struct ggml_tensor * src0,
12537
- const struct ggml_tensor * src1,
12538
12249
  struct ggml_tensor * dst) {
12539
12250
  switch (src0->type) {
12540
12251
  case GGML_TYPE_F16:
12541
12252
  {
12542
- ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
12253
+ ggml_compute_forward_rope_back_f16(params, src0, dst);
12543
12254
  } break;
12544
12255
  case GGML_TYPE_F32:
12545
12256
  {
12546
- ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
12257
+ ggml_compute_forward_rope_back_f32(params, src0, dst);
12547
12258
  } break;
12548
12259
  default:
12549
12260
  {
@@ -12740,7 +12451,7 @@ static void ggml_compute_forward_conv_1d_s1_ph(
12740
12451
  const struct ggml_compute_params * params,
12741
12452
  const struct ggml_tensor * src0,
12742
12453
  const struct ggml_tensor * src1,
12743
- struct ggml_tensor * dst) {
12454
+ struct ggml_tensor * dst) {
12744
12455
  switch (src0->type) {
12745
12456
  case GGML_TYPE_F16:
12746
12457
  {
@@ -12943,7 +12654,7 @@ static void ggml_compute_forward_conv_1d_s2_ph(
12943
12654
  const struct ggml_compute_params * params,
12944
12655
  const struct ggml_tensor * src0,
12945
12656
  const struct ggml_tensor * src1,
12946
- struct ggml_tensor * dst) {
12657
+ struct ggml_tensor * dst) {
12947
12658
  switch (src0->type) {
12948
12659
  case GGML_TYPE_F16:
12949
12660
  {
@@ -12963,14 +12674,13 @@ static void ggml_compute_forward_conv_1d_s2_ph(
12963
12674
  // ggml_compute_forward_conv_1d
12964
12675
 
12965
12676
  static void ggml_compute_forward_conv_1d(
12966
- const struct ggml_compute_params * params,
12967
- const struct ggml_tensor * src0,
12968
- const struct ggml_tensor * src1,
12969
- const struct ggml_tensor * opt0,
12970
- struct ggml_tensor * dst) {
12971
- const int32_t s0 = ((const int32_t*)(opt0->data))[0];
12972
- const int32_t p0 = ((const int32_t*)(opt0->data))[1];
12973
- const int32_t d0 = ((const int32_t*)(opt0->data))[2];
12677
+ const struct ggml_compute_params * params,
12678
+ const struct ggml_tensor * src0,
12679
+ const struct ggml_tensor * src1,
12680
+ struct ggml_tensor * dst) {
12681
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12682
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
12683
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
12974
12684
  GGML_ASSERT(d0 == 1); // dilation not supported
12975
12685
  GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported
12976
12686
  if (s0 == 1) {
@@ -12982,9 +12692,9 @@ static void ggml_compute_forward_conv_1d(
12982
12692
  };
12983
12693
  }
12984
12694
 
12985
- // ggml_compute_forward_conv_2d_sk_p0
12695
+ // ggml_compute_forward_conv_2d
12986
12696
 
12987
- static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
12697
+ static void ggml_compute_forward_conv_2d_f16_f32(
12988
12698
  const struct ggml_compute_params * params,
12989
12699
  const struct ggml_tensor * src0,
12990
12700
  const struct ggml_tensor * src1,
@@ -13007,28 +12717,37 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
13007
12717
  // size of the convolution row - the kernel size unrolled across all channels
13008
12718
  const int ew0 = nk0*nk1*ne02;
13009
12719
 
12720
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12721
+ const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
12722
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
12723
+ const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
12724
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
12725
+ const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
12726
+
13010
12727
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13011
12728
  GGML_ASSERT(nb10 == sizeof(float));
13012
12729
 
13013
12730
  if (params->type == GGML_TASK_INIT) {
13014
- // TODO: fix this memset (wsize is overestimated)
13015
12731
  memset(params->wdata, 0, params->wsize);
13016
12732
 
13017
12733
  // prepare source data (src1)
13018
12734
  {
13019
12735
  ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13020
12736
 
13021
- for (int i13 = 0; i13 < ne13; i13++) {
13022
- for (int i12 = 0; i12 < ne12; i12++) {
13023
- const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12);
13024
- ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0);
12737
+ for (int i12 = 0; i12 < ne12; i12++) {
12738
+ const float * const src = (float *)((char *) src1->data + i12*nb12);
12739
+ ggml_fp16_t * dst_data = wdata;
12740
+
12741
+ for (int i1 = 0; i1 < ne1; i1++) {
12742
+ for (int i0 = 0; i0 < ne0; i0++) {
12743
+ for (int ik1 = 0; ik1 < nk1; ik1++) {
12744
+ for (int ik0 = 0; ik0 < nk0; ik0++) {
12745
+ const int idx0 = i0*s0 + ik0*d0 - p0;
12746
+ const int idx1 = i1*s1 + ik1*d1 - p1;
13025
12747
 
13026
- for (int i1 = 0; i1 < ne1; i1++) {
13027
- for (int i0 = 0; i0 < ne0; i0++) {
13028
- for (int ik1 = 0; ik1 < nk1; ik1++) {
13029
- for (int ik0 = 0; ik0 < nk0; ik0++) {
12748
+ if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
13030
12749
  dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
13031
- GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
12750
+ GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
13032
12751
  }
13033
12752
  }
13034
12753
  }
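
The rewritten kernel gathers source pixels with idx = i*s + ik*d - p and simply leaves the memset-initialised wdata at zero whenever the index falls outside the input, which is what generalises the old stride-equals-kernel special case to arbitrary stride, padding and dilation. A one-dimensional, single-channel standalone sketch of that gather, with invented sizes:

    #include <stdio.h>
    #include <string.h>

    int main(void) {
        /* input of length 5, kernel of 3 taps, stride 1, padding 1, dilation 1 */
        enum { ne10 = 5, nk0 = 3 };
        const int s0 = 1, p0 = 1, d0 = 1;
        const float src[ne10] = { 1, 2, 3, 4, 5 };

        /* standard convolution output length: (in + 2p - d*(k-1) - 1)/s + 1 = 5 here */
        const int ne0 = (ne10 + 2*p0 - d0*(nk0 - 1) - 1)/s0 + 1;

        /* im2col-style buffer: one unrolled window per output position, zero-initialised */
        float wdata[5 * nk0];                       /* sized for ne0 = 5 with these params */
        memset(wdata, 0, sizeof(wdata));

        for (int i0 = 0; i0 < ne0; i0++) {
            for (int ik0 = 0; ik0 < nk0; ik0++) {
                const int idx0 = i0*s0 + ik0*d0 - p0;
                if (!(idx0 < 0 || idx0 >= ne10)) {  /* out-of-range taps stay zero (padding) */
                    wdata[i0*nk0 + ik0] = src[idx0];
                }
            }
        }

        for (int i0 = 0; i0 < ne0; i0++) {
            printf("window %d: %g %g %g\n", i0,
                   wdata[i0*nk0 + 0], wdata[i0*nk0 + 1], wdata[i0*nk0 + 2]);
        }
        return 0;
    }
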
@@ -13071,19 +12790,19 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
13071
12790
  }
13072
12791
  }
13073
12792
 
13074
- static void ggml_compute_forward_conv_2d_sk_p0(
12793
+ static void ggml_compute_forward_conv_2d(
13075
12794
  const struct ggml_compute_params * params,
13076
12795
  const struct ggml_tensor * src0,
13077
12796
  const struct ggml_tensor * src1,
13078
- struct ggml_tensor * dst) {
12797
+ struct ggml_tensor * dst) {
13079
12798
  switch (src0->type) {
13080
12799
  case GGML_TYPE_F16:
13081
12800
  {
13082
- ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
12801
+ ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst);
13083
12802
  } break;
13084
12803
  case GGML_TYPE_F32:
13085
12804
  {
13086
- //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
12805
+ //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
13087
12806
  GGML_ASSERT(false);
13088
12807
  } break;
13089
12808
  default:
@@ -13093,32 +12812,6 @@ static void ggml_compute_forward_conv_2d_sk_p0(
13093
12812
  }
13094
12813
  }
13095
12814
 
13096
- // ggml_compute_forward_conv_2d
13097
-
13098
- static void ggml_compute_forward_conv_2d(
13099
- const struct ggml_compute_params* params,
13100
- const struct ggml_tensor* src0,
13101
- const struct ggml_tensor* src1,
13102
- const struct ggml_tensor* opt0,
13103
- struct ggml_tensor* dst) {
13104
- const int32_t s0 = ((const int32_t*)(opt0->data))[0];
13105
- const int32_t s1 = ((const int32_t*)(opt0->data))[1];
13106
- const int32_t p0 = ((const int32_t*)(opt0->data))[2];
13107
- const int32_t p1 = ((const int32_t*)(opt0->data))[3];
13108
- const int32_t d0 = ((const int32_t*)(opt0->data))[4];
13109
- const int32_t d1 = ((const int32_t*)(opt0->data))[5];
13110
- GGML_ASSERT(d0 == 1); // dilation not supported
13111
- GGML_ASSERT(d1 == 1);
13112
- GGML_ASSERT(p0 == 0); // padding not supported
13113
- GGML_ASSERT(p1 == 0);
13114
-
13115
- if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
13116
- ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
13117
- } else {
13118
- GGML_ASSERT(false); // only stride equal to kernel size is supported
13119
- }
13120
- }
13121
-
13122
12815
  // ggml_compute_forward_pool_1d_sk_p0
13123
12816
 
13124
12817
  static void ggml_compute_forward_pool_1d_sk_p0(
@@ -13174,12 +12867,11 @@ static void ggml_compute_forward_pool_1d_sk_p0(
13174
12867
  // ggml_compute_forward_pool_1d
13175
12868
 
13176
12869
  static void ggml_compute_forward_pool_1d(
13177
- const struct ggml_compute_params* params,
13178
- const struct ggml_tensor* src0,
13179
- const struct ggml_tensor* opt0,
13180
- struct ggml_tensor* dst) {
13181
- GGML_ASSERT(opt0->ne[0] == 4);
13182
- const int* opts = (const int*)opt0->data;
12870
+ const struct ggml_compute_params * params,
12871
+ const struct ggml_tensor * src0,
12872
+ struct ggml_tensor * dst) {
12873
+
12874
+ const int32_t* opts = (const int32_t*)dst->op_params;
13183
12875
  enum ggml_op_pool op = opts[0];
13184
12876
  const int k0 = opts[1];
13185
12877
  const int s0 = opts[2];
@@ -13193,12 +12885,12 @@ static void ggml_compute_forward_pool_1d(
13193
12885
  // ggml_compute_forward_pool_2d_sk_p0
13194
12886
 
13195
12887
  static void ggml_compute_forward_pool_2d_sk_p0(
13196
- const struct ggml_compute_params * params,
13197
- const enum ggml_op_pool op,
13198
- const struct ggml_tensor * src,
13199
- const int k0,
13200
- const int k1,
13201
- struct ggml_tensor * dst) {
12888
+ const struct ggml_compute_params * params,
12889
+ const enum ggml_op_pool op,
12890
+ const struct ggml_tensor * src,
12891
+ const int k0,
12892
+ const int k1,
12893
+ struct ggml_tensor * dst) {
13202
12894
  assert(src->type == GGML_TYPE_F32);
13203
12895
  assert(params->ith == 0);
13204
12896
 
@@ -13258,12 +12950,11 @@ static void ggml_compute_forward_pool_2d_sk_p0(
13258
12950
  // ggml_compute_forward_pool_2d
13259
12951
 
13260
12952
  static void ggml_compute_forward_pool_2d(
13261
- const struct ggml_compute_params * params,
13262
- const struct ggml_tensor * src0,
13263
- const struct ggml_tensor * opt0,
13264
- struct ggml_tensor * dst) {
13265
- GGML_ASSERT(opt0->ne[0] == 7);
13266
- const int* opts = (const int*)opt0->data;
12953
+ const struct ggml_compute_params * params,
12954
+ const struct ggml_tensor * src0,
12955
+ struct ggml_tensor * dst) {
12956
+
12957
+ const int32_t * opts = (const int32_t *)dst->op_params;
13267
12958
  enum ggml_op_pool op = opts[0];
13268
12959
  const int k0 = opts[1];
13269
12960
  const int k1 = opts[2];
@@ -13288,7 +12979,7 @@ static void ggml_compute_forward_flash_attn_f32(
13288
12979
  const struct ggml_tensor * k,
13289
12980
  const struct ggml_tensor * v,
13290
12981
  const bool masked,
13291
- struct ggml_tensor * dst) {
12982
+ struct ggml_tensor * dst) {
13292
12983
  int64_t t0 = ggml_perf_time_us();
13293
12984
  UNUSED(t0);
13294
12985
 
@@ -13466,7 +13157,7 @@ static void ggml_compute_forward_flash_attn_f16(
13466
13157
  const struct ggml_tensor * k,
13467
13158
  const struct ggml_tensor * v,
13468
13159
  const bool masked,
13469
- struct ggml_tensor * dst) {
13160
+ struct ggml_tensor * dst) {
13470
13161
  int64_t t0 = ggml_perf_time_us();
13471
13162
  UNUSED(t0);
13472
13163
 
@@ -14231,7 +13922,6 @@ static void ggml_compute_forward_flash_attn_back(
14231
13922
  static void ggml_compute_forward_win_part_f32(
14232
13923
  const struct ggml_compute_params * params,
14233
13924
  const struct ggml_tensor * src0,
14234
- const struct ggml_tensor * opt0,
14235
13925
  struct ggml_tensor * dst) {
14236
13926
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14237
13927
  return;
@@ -14240,9 +13930,9 @@ static void ggml_compute_forward_win_part_f32(
14240
13930
  GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
14241
13931
  GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
14242
13932
 
14243
- const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
14244
- const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
14245
- const int32_t w = ((const int32_t *)(opt0->data))[2];
13933
+ const int32_t nep0 = ((const int32_t *)(dst->op_params))[0];
13934
+ const int32_t nep1 = ((const int32_t *)(dst->op_params))[1];
13935
+ const int32_t w = ((const int32_t *)(dst->op_params))[2];
14246
13936
 
14247
13937
  assert(ne00 == ne0);
14248
13938
  assert(ne3 == nep0*nep1);
@@ -14276,12 +13966,11 @@ static void ggml_compute_forward_win_part_f32(
14276
13966
  static void ggml_compute_forward_win_part(
14277
13967
  const struct ggml_compute_params * params,
14278
13968
  const struct ggml_tensor * src0,
14279
- const struct ggml_tensor * opt0,
14280
13969
  struct ggml_tensor * dst) {
14281
13970
  switch (src0->type) {
14282
13971
  case GGML_TYPE_F32:
14283
13972
  {
14284
- ggml_compute_forward_win_part_f32(params, src0, opt0, dst);
13973
+ ggml_compute_forward_win_part_f32(params, src0, dst);
14285
13974
  } break;
14286
13975
  default:
14287
13976
  {
@@ -14295,7 +13984,6 @@ static void ggml_compute_forward_win_part(
14295
13984
  static void ggml_compute_forward_win_unpart_f32(
14296
13985
  const struct ggml_compute_params * params,
14297
13986
  const struct ggml_tensor * src0,
14298
- const struct ggml_tensor * opt0,
14299
13987
  struct ggml_tensor * dst) {
14300
13988
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14301
13989
  return;
@@ -14304,7 +13992,7 @@ static void ggml_compute_forward_win_unpart_f32(
14304
13992
  GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
14305
13993
  GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
14306
13994
 
14307
- const int32_t w = ((const int32_t *)(opt0->data))[0];
13995
+ const int32_t w = ((const int32_t *)(dst->op_params))[0];
14308
13996
 
14309
13997
  // padding
14310
13998
  const int px = (w - ne1%w)%w;
@@ -14338,12 +14026,67 @@ static void ggml_compute_forward_win_unpart_f32(
14338
14026
  static void ggml_compute_forward_win_unpart(
14339
14027
  const struct ggml_compute_params * params,
14340
14028
  const struct ggml_tensor * src0,
14341
- const struct ggml_tensor * opt0,
14342
14029
  struct ggml_tensor * dst) {
14343
14030
  switch (src0->type) {
14344
14031
  case GGML_TYPE_F32:
14345
14032
  {
14346
- ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst);
14033
+ ggml_compute_forward_win_unpart_f32(params, src0, dst);
14034
+ } break;
14035
+ default:
14036
+ {
14037
+ GGML_ASSERT(false);
14038
+ } break;
14039
+ }
14040
+ }
14041
+
14042
+ // ggml_compute_forward_unary
14043
+
14044
+ static void ggml_compute_forward_unary(
14045
+ const struct ggml_compute_params * params,
14046
+ const struct ggml_tensor * src0,
14047
+ struct ggml_tensor * dst) {
14048
+ const enum ggml_unary_op op = ggml_get_unary_op(dst);
14049
+
14050
+ switch (op) {
14051
+ case GGML_UNARY_OP_ABS:
14052
+ {
14053
+ ggml_compute_forward_abs(params, src0, dst);
14054
+ } break;
14055
+ case GGML_UNARY_OP_SGN:
14056
+ {
14057
+ ggml_compute_forward_sgn(params, src0, dst);
14058
+ } break;
14059
+ case GGML_UNARY_OP_NEG:
14060
+ {
14061
+ ggml_compute_forward_neg(params, src0, dst);
14062
+ } break;
14063
+ case GGML_UNARY_OP_STEP:
14064
+ {
14065
+ ggml_compute_forward_step(params, src0, dst);
14066
+ } break;
14067
+ case GGML_UNARY_OP_TANH:
14068
+ {
14069
+ ggml_compute_forward_tanh(params, src0, dst);
14070
+ } break;
14071
+ case GGML_UNARY_OP_ELU:
14072
+ {
14073
+ ggml_compute_forward_elu(params, src0, dst);
14074
+ } break;
14075
+ case GGML_UNARY_OP_RELU:
14076
+ {
14077
+ ggml_compute_forward_relu(params, src0, dst);
14078
+ } break;
14079
+ case GGML_UNARY_OP_GELU:
14080
+ {
14081
+ ggml_compute_forward_gelu(params, src0, dst);
14082
+ } break;
14083
+ case GGML_UNARY_OP_GELU_QUICK:
14084
+ {
14085
+ ggml_compute_forward_gelu_quick(params, src0, dst);
14086
+ } break;
14087
+ case GGML_UNARY_OP_SILU:
14088
+ {
14089
+ ggml_compute_forward_silu(params, src0, dst);
14347
14090
  } break;
14348
14091
  default:
14349
14092
  {
@@ -14862,7 +14605,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14862
14605
  } break;
14863
14606
  case GGML_OP_ACC:
14864
14607
  {
14865
- ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14608
+ ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor);
14866
14609
  } break;
14867
14610
  case GGML_OP_SUB:
14868
14611
  {
@@ -14912,46 +14655,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14912
14655
  {
14913
14656
  ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
14914
14657
  } break;
14915
- case GGML_OP_ABS:
14916
- {
14917
- ggml_compute_forward_abs(params, tensor->src[0], tensor);
14918
- } break;
14919
- case GGML_OP_SGN:
14920
- {
14921
- ggml_compute_forward_sgn(params, tensor->src[0], tensor);
14922
- } break;
14923
- case GGML_OP_NEG:
14924
- {
14925
- ggml_compute_forward_neg(params, tensor->src[0], tensor);
14926
- } break;
14927
- case GGML_OP_STEP:
14928
- {
14929
- ggml_compute_forward_step(params, tensor->src[0], tensor);
14930
- } break;
14931
- case GGML_OP_TANH:
14932
- {
14933
- ggml_compute_forward_tanh(params, tensor->src[0], tensor);
14934
- } break;
14935
- case GGML_OP_ELU:
14936
- {
14937
- ggml_compute_forward_elu(params, tensor->src[0], tensor);
14938
- } break;
14939
- case GGML_OP_RELU:
14940
- {
14941
- ggml_compute_forward_relu(params, tensor->src[0], tensor);
14942
- } break;
14943
- case GGML_OP_GELU:
14944
- {
14945
- ggml_compute_forward_gelu(params, tensor->src[0], tensor);
14946
- } break;
14947
- case GGML_OP_GELU_QUICK:
14948
- {
14949
- ggml_compute_forward_gelu_quick(params, tensor->src[0], tensor);
14950
- } break;
14951
- case GGML_OP_SILU:
14952
- {
14953
- ggml_compute_forward_silu(params, tensor->src[0], tensor);
14954
- } break;
14955
14658
  case GGML_OP_SILU_BACK:
14956
14659
  {
14957
14660
  ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
@@ -14982,7 +14685,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14982
14685
  } break;
14983
14686
  case GGML_OP_SET:
14984
14687
  {
14985
- ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14688
+ ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor);
14986
14689
  } break;
14987
14690
  case GGML_OP_CPY:
14988
14691
  {
@@ -15022,11 +14725,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15022
14725
  } break;
15023
14726
  case GGML_OP_DIAG_MASK_INF:
15024
14727
  {
15025
- ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor->src[1], tensor);
14728
+ ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor);
15026
14729
  } break;
15027
14730
  case GGML_OP_DIAG_MASK_ZERO:
15028
14731
  {
15029
- ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor->src[1], tensor);
14732
+ ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor);
15030
14733
  } break;
15031
14734
  case GGML_OP_SOFT_MAX:
15032
14735
  {
@@ -15038,39 +14741,39 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15038
14741
  } break;
15039
14742
  case GGML_OP_ROPE:
15040
14743
  {
15041
- ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor);
14744
+ ggml_compute_forward_rope(params, tensor->src[0], tensor);
15042
14745
  } break;
15043
14746
  case GGML_OP_ROPE_BACK:
15044
14747
  {
15045
- ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor);
14748
+ ggml_compute_forward_rope_back(params, tensor->src[0], tensor);
15046
14749
  } break;
15047
14750
  case GGML_OP_ALIBI:
15048
14751
  {
15049
- ggml_compute_forward_alibi(params, tensor->src[0], tensor->src[1], tensor);
14752
+ ggml_compute_forward_alibi(params, tensor->src[0], tensor);
15050
14753
  } break;
15051
14754
  case GGML_OP_CLAMP:
15052
14755
  {
15053
- ggml_compute_forward_clamp(params, tensor->src[0], tensor->src[1], tensor);
14756
+ ggml_compute_forward_clamp(params, tensor->src[0], tensor);
15054
14757
  } break;
15055
14758
  case GGML_OP_CONV_1D:
15056
14759
  {
15057
- ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14760
+ ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
15058
14761
  } break;
15059
14762
  case GGML_OP_CONV_2D:
15060
14763
  {
15061
- ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14764
+ ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
15062
14765
  } break;
15063
14766
  case GGML_OP_POOL_1D:
15064
14767
  {
15065
- ggml_compute_forward_pool_1d(params, tensor->src[0], tensor->src[1], tensor);
14768
+ ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
15066
14769
  } break;
15067
14770
  case GGML_OP_POOL_2D:
15068
14771
  {
15069
- ggml_compute_forward_pool_2d(params, tensor->src[0], tensor->src[1], tensor);
14772
+ ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
15070
14773
  } break;
15071
14774
  case GGML_OP_FLASH_ATTN:
15072
14775
  {
15073
- const int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
14776
+ const int32_t t = ggml_get_op_params_i32(tensor, 0);
15074
14777
  GGML_ASSERT(t == 0 || t == 1);
15075
14778
  const bool masked = t != 0;
15076
14779
  ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
@@ -15081,47 +14784,56 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15081
14784
  } break;
15082
14785
  case GGML_OP_FLASH_ATTN_BACK:
15083
14786
  {
15084
- int32_t t = ggml_get_i32_1d(tensor->src[4], 0);
14787
+ int32_t t = ggml_get_op_params_i32(tensor, 0);
15085
14788
  GGML_ASSERT(t == 0 || t == 1);
15086
14789
  bool masked = t != 0;
15087
14790
  ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
15088
14791
  } break;
15089
14792
  case GGML_OP_WIN_PART:
15090
14793
  {
15091
- ggml_compute_forward_win_part(params, tensor->src[0], tensor->src[2], tensor);
14794
+ ggml_compute_forward_win_part(params, tensor->src[0], tensor);
15092
14795
  } break;
15093
14796
  case GGML_OP_WIN_UNPART:
15094
14797
  {
15095
- ggml_compute_forward_win_unpart(params, tensor->src[0], tensor->src[2], tensor);
14798
+ ggml_compute_forward_win_unpart(params, tensor->src[0], tensor);
14799
+ } break;
14800
+ case GGML_OP_UNARY:
14801
+ {
14802
+ ggml_compute_forward_unary(params, tensor->src[0], tensor);
15096
14803
  } break;
15097
14804
  case GGML_OP_MAP_UNARY:
15098
14805
  {
15099
- const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->src[2]->data);
14806
+ ggml_unary_op_f32_t fun;
14807
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15100
14808
  ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun);
15101
14809
  }
15102
14810
  break;
15103
14811
  case GGML_OP_MAP_BINARY:
15104
14812
  {
15105
- const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->src[2]->data);
14813
+ ggml_binary_op_f32_t fun;
14814
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15106
14815
  ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
15107
14816
  }
15108
14817
  break;
15109
14818
  case GGML_OP_MAP_CUSTOM1:
15110
14819
  {
15111
- const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->src[2]->data);
14820
+ ggml_custom1_op_f32_t fun;
14821
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15112
14822
  ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun);
15113
14823
  }
15114
14824
  break;
15115
14825
  case GGML_OP_MAP_CUSTOM2:
15116
14826
  {
15117
- const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->src[2]->data);
14827
+ ggml_custom2_op_f32_t fun;
14828
+ memcpy(&fun, tensor->op_params, sizeof(fun));
15118
14829
  ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun);
15119
14830
  }
15120
14831
  break;
15121
14832
  case GGML_OP_MAP_CUSTOM3:
15122
14833
  {
15123
- const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->src[2]->data);
15124
- ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[3], tensor, fun);
14834
+ ggml_custom3_op_f32_t fun;
14835
+ memcpy(&fun, tensor->op_params, sizeof(fun));
14836
+ ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
15125
14837
  }
15126
14838
  break;
15127
14839
  case GGML_OP_CROSS_ENTROPY_LOSS:
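
The MAP_UNARY/MAP_BINARY/MAP_CUSTOM* cases above now recover their callback by memcpy-ing a function pointer out of tensor->op_params instead of dereferencing an extra source tensor's data. A self-contained sketch of the same store/recover round trip (names are illustrative, not ggml's):

    #include <stdio.h>
    #include <string.h>

    /* the callback is stored by value in a small per-node byte buffer and
     * recovered with memcpy at compute time */
    typedef float (*map_unary_fn)(float);

    static float square(float x) { return x * x; }

    int main(void) {
        unsigned char op_params[32];          /* stand-in for tensor->op_params */

        map_unary_fn fun = square;
        memcpy(op_params, &fun, sizeof(fun)); /* store the pointer bytes when the node is built */

        map_unary_fn recovered;
        memcpy(&recovered, op_params, sizeof(recovered)); /* recover it at compute time */

        printf("%f\n", recovered(3.0f));      /* prints 9.000000 */
        return 0;
    }
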
@@ -15185,12 +14897,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15185
14897
  src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
15186
14898
  }
15187
14899
  if (src1->grad) {
15188
- GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5);
15189
- GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32);
15190
- const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0];
15191
- const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1];
15192
- const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2];
15193
- const size_t offset = (( int32_t * ) tensor->src[2]->data)[3];
14900
+ const size_t nb1 = ((int32_t *) tensor->op_params)[0];
14901
+ const size_t nb2 = ((int32_t *) tensor->op_params)[1];
14902
+ const size_t nb3 = ((int32_t *) tensor->op_params)[2];
14903
+ const size_t offset = ((int32_t *) tensor->op_params)[3];
15194
14904
 
15195
14905
  struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
15196
14906
  tensor->grad,
@@ -15339,73 +15049,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15339
15049
  inplace);
15340
15050
  }
15341
15051
  } break;
15342
- case GGML_OP_ABS:
15343
- {
15344
- if (src0->grad) {
15345
- src0->grad =
15346
- ggml_add_impl(ctx,
15347
- src0->grad,
15348
- ggml_mul(ctx,
15349
- ggml_sgn(ctx, src0),
15350
- tensor->grad),
15351
- inplace);
15352
- }
15353
- } break;
15354
- case GGML_OP_SGN:
15355
- {
15356
- if (src0->grad) {
15357
- // noop
15358
- }
15359
- } break;
15360
- case GGML_OP_NEG:
15361
- {
15362
- if (src0->grad) {
15363
- src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace);
15364
- }
15365
- } break;
15366
- case GGML_OP_STEP:
15367
- {
15368
- if (src0->grad) {
15369
- // noop
15370
- }
15371
- } break;
15372
- case GGML_OP_TANH:
15373
- {
15374
- GGML_ASSERT(false); // TODO: not implemented
15375
- } break;
15376
- case GGML_OP_ELU:
15377
- {
15378
- GGML_ASSERT(false); // TODO: not implemented
15379
- } break;
15380
- case GGML_OP_RELU:
15381
- {
15382
- if (src0->grad) {
15383
- src0->grad = ggml_sub_impl(ctx,
15384
- src0->grad,
15385
- ggml_mul(ctx,
15386
- ggml_step(ctx, src0),
15387
- tensor->grad),
15388
- inplace);
15389
- }
15390
- } break;
15391
- case GGML_OP_GELU:
15392
- {
15393
- GGML_ASSERT(false); // TODO: not implemented
15394
- } break;
15395
- case GGML_OP_GELU_QUICK:
15396
- {
15397
- GGML_ASSERT(false); // TODO: not implemented
15398
- } break;
15399
- case GGML_OP_SILU:
15400
- {
15401
- // necessary for llama
15402
- if (src0->grad) {
15403
- src0->grad = ggml_add_impl(ctx,
15404
- src0->grad,
15405
- ggml_silu_back(ctx, src0, tensor->grad),
15406
- inplace);
15407
- }
15408
- } break;
15409
15052
  case GGML_OP_SILU_BACK:
15410
15053
  {
15411
15054
  GGML_ASSERT(false); // TODO: not implemented
@@ -15498,12 +15141,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15498
15141
  } break;
15499
15142
  case GGML_OP_SET:
15500
15143
  {
15501
- GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5);
15502
- GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32);
15503
- const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0];
15504
- const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1];
15505
- const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2];
15506
- const size_t offset = (( int32_t * ) tensor->src[2]->data)[3];
15144
+ const size_t nb1 = ((int32_t *) tensor->op_params)[0];
15145
+ const size_t nb2 = ((int32_t *) tensor->op_params)[1];
15146
+ const size_t nb3 = ((int32_t *) tensor->op_params)[2];
15147
+ const size_t offset = ((int32_t *) tensor->op_params)[3];
15507
15148
 
15508
15149
  struct ggml_tensor * tensor_grad_view = NULL;
15509
15150
 
@@ -15580,8 +15221,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15580
15221
  if (src0->grad) {
15581
15222
  size_t offset;
15582
15223
 
15583
- GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->src[2]));
15584
- memcpy(&offset, tensor->src[2]->data, sizeof(offset));
15224
+ memcpy(&offset, tensor->op_params, sizeof(offset));
15585
15225
 
15586
15226
  size_t nb1 = tensor->nb[1];
15587
15227
  size_t nb2 = tensor->nb[2];
@@ -15608,7 +15248,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15608
15248
  {
15609
15249
  // necessary for llama
15610
15250
  if (src0->grad) {
15611
- int32_t * axes = (int32_t *) tensor->src[2]->data;
15251
+ int32_t * axes = (int32_t *) tensor->op_params;
15612
15252
  int axis0 = axes[0] & 0x3;
15613
15253
  int axis1 = axes[1] & 0x3;
15614
15254
  int axis2 = axes[2] & 0x3;
@@ -15664,33 +15304,23 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15664
15304
  {
15665
15305
  // necessary for llama
15666
15306
  if (src0->grad) {
15667
- assert(src1->type == GGML_TYPE_I32);
15668
- assert(ggml_nelements(src1) == 2);
15669
- const int n_past = ((int32_t *) src1->data)[0];
15307
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15670
15308
  src0->grad =
15671
15309
  ggml_add_impl(ctx, src0->grad,
15672
15310
  ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
15673
15311
  inplace);
15674
15312
  }
15675
- if (src1->grad) {
15676
- // noop
15677
- }
15678
15313
  } break;
15679
15314
  case GGML_OP_DIAG_MASK_ZERO:
15680
15315
  {
15681
15316
  // necessary for llama
15682
15317
  if (src0->grad) {
15683
- assert(src1->type == GGML_TYPE_I32);
15684
- assert(ggml_nelements(src1) == 2);
15685
- const int n_past = ((int32_t *) src1->data)[0];
15318
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15686
15319
  src0->grad =
15687
15320
  ggml_add_impl(ctx, src0->grad,
15688
15321
  ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
15689
15322
  inplace);
15690
15323
  }
15691
- if (src1->grad) {
15692
- // noop
15693
- }
15694
15324
  } break;
15695
15325
  case GGML_OP_SOFT_MAX:
15696
15326
  {
@@ -15711,33 +15341,28 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15711
15341
  {
15712
15342
  // necessary for llama
15713
15343
  if (src0->grad) {
15714
- assert(src1->type == GGML_TYPE_I32);
15715
- assert(ggml_nelements(src1) == 4);
15716
- const int n_past = ((int32_t *) src1->data)[0];
15717
- const int n_dims = ((int32_t *) src1->data)[1];
15718
- const int mode = ((int32_t *) src1->data)[2];
15344
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15345
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
15346
+ const int mode = ((int32_t *) tensor->op_params)[2];
15347
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
15719
15348
  src0->grad = ggml_add_impl(ctx,
15720
15349
  src0->grad,
15721
15350
  ggml_rope_back(ctx,
15722
15351
  tensor->grad,
15723
15352
  n_past,
15724
15353
  n_dims,
15725
- mode),
15354
+ mode,
15355
+ n_ctx),
15726
15356
  inplace);
15727
15357
  }
15728
- if (src1->grad) {
15729
- // noop
15730
- }
15731
15358
  } break;
15732
15359
  case GGML_OP_ROPE_BACK:
15733
15360
  {
15734
15361
  if (src0->grad) {
15735
- assert(src1->type == GGML_TYPE_I32);
15736
- assert(ggml_nelements(src1) == 4);
15737
- const int n_past = ((int32_t *) src1->data)[0];
15738
- const int n_dims = ((int32_t *) src1->data)[1];
15739
- const int mode = ((int32_t *) src1->data)[2];
15740
- const int n_ctx = ((int32_t *) src1->data)[3];
15362
+ const int n_past = ((int32_t *) tensor->op_params)[0];
15363
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
15364
+ const int mode = ((int32_t *) tensor->op_params)[2];
15365
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
15741
15366
  src0->grad = ggml_add_impl(ctx,
15742
15367
  src0->grad,
15743
15368
  ggml_rope(ctx,
@@ -15748,9 +15373,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15748
15373
  n_ctx),
15749
15374
  inplace);
15750
15375
  }
15751
- if (src1->grad) {
15752
- // noop
15753
- }
15754
15376
  } break;
15755
15377
  case GGML_OP_ALIBI:
15756
15378
  {
@@ -15780,7 +15402,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15780
15402
  {
15781
15403
  struct ggml_tensor * flash_grad = NULL;
15782
15404
  if (src0->grad || src1->grad || tensor->src[2]->grad) {
15783
- int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
15405
+ int32_t t = ggml_get_op_params_i32(tensor, 0);
15784
15406
  GGML_ASSERT(t == 0 || t == 1);
15785
15407
  bool masked = t != 0;
15786
15408
  flash_grad =
@@ -15943,6 +15565,80 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15943
15565
  } break;
15944
15566
  case GGML_OP_WIN_PART:
15945
15567
  case GGML_OP_WIN_UNPART:
15568
+ case GGML_OP_UNARY:
15569
+ {
15570
+ switch (ggml_get_unary_op(tensor)) {
15571
+ case GGML_UNARY_OP_ABS:
15572
+ {
15573
+ if (src0->grad) {
15574
+ src0->grad =
15575
+ ggml_add_impl(ctx,
15576
+ src0->grad,
15577
+ ggml_mul(ctx,
15578
+ ggml_sgn(ctx, src0),
15579
+ tensor->grad),
15580
+ inplace);
15581
+ }
15582
+ } break;
15583
+ case GGML_UNARY_OP_SGN:
15584
+ {
15585
+ if (src0->grad) {
15586
+ // noop
15587
+ }
15588
+ } break;
15589
+ case GGML_UNARY_OP_NEG:
15590
+ {
15591
+ if (src0->grad) {
15592
+ src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace);
15593
+ }
15594
+ } break;
15595
+ case GGML_UNARY_OP_STEP:
15596
+ {
15597
+ if (src0->grad) {
15598
+ // noop
15599
+ }
15600
+ } break;
15601
+ case GGML_UNARY_OP_TANH:
15602
+ {
15603
+ GGML_ASSERT(false); // TODO: not implemented
15604
+ } break;
15605
+ case GGML_UNARY_OP_ELU:
15606
+ {
15607
+ GGML_ASSERT(false); // TODO: not implemented
15608
+ } break;
15609
+ case GGML_UNARY_OP_RELU:
15610
+ {
15611
+ if (src0->grad) {
15612
+ src0->grad = ggml_add_impl(ctx,
15613
+ src0->grad,
15614
+ ggml_mul(ctx,
15615
+ ggml_step(ctx, src0),
15616
+ tensor->grad),
15617
+ inplace);
15618
+ }
15619
+ } break;
15620
+ case GGML_UNARY_OP_GELU:
15621
+ {
15622
+ GGML_ASSERT(false); // TODO: not implemented
15623
+ } break;
15624
+ case GGML_UNARY_OP_GELU_QUICK:
15625
+ {
15626
+ GGML_ASSERT(false); // TODO: not implemented
15627
+ } break;
15628
+ case GGML_UNARY_OP_SILU:
15629
+ {
15630
+ // necessary for llama
15631
+ if (src0->grad) {
15632
+ src0->grad = ggml_add_impl(ctx,
15633
+ src0->grad,
15634
+ ggml_silu_back(ctx, src0, tensor->grad),
15635
+ inplace);
15636
+ }
15637
+ } break;
15638
+ default:
15639
+ GGML_ASSERT(false);
15640
+ }
15641
+ } break;
15946
15642
  case GGML_OP_MAP_UNARY:
15947
15643
  case GGML_OP_MAP_BINARY:
15948
15644
  case GGML_OP_MAP_CUSTOM1:
@@ -15978,6 +15674,34 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15978
15674
  }
15979
15675
  }
15980
15676
 
15677
+ static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small");
15678
+
15679
+ static size_t hash(void * p) {
15680
+ return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
15681
+ }
15682
+
15683
+ static bool hash_insert(void * hash_table[], void * p) {
15684
+ size_t h = hash(p);
15685
+
15686
+ // linear probing
15687
+ size_t i = h;
15688
+ while (hash_table[i] != NULL && hash_table[i] != p) {
15689
+ i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
15690
+ if (i == h) {
15691
+ // hash table is full
15692
+ GGML_ASSERT(false);
15693
+ }
15694
+ }
15695
+
15696
+ if (hash_table[i] == p) {
15697
+ return true;
15698
+ }
15699
+
15700
+ // insert
15701
+ hash_table[i] = p;
15702
+ return false;
15703
+ }
15704
+
15981
15705
  static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
15982
15706
  if (node->grad == NULL) {
15983
15707
  // this usually happens when we generate intermediate nodes from constants in the backward pass
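
The visited-node check below replaces the old linear scans over cgraph->nodes and cgraph->leafs with a fixed-size open-addressing table using linear probing, as added in the hunk above. A tiny standalone version of the same probing logic (table size and names are illustrative):

    #include <stdio.h>

    #define TABLE_SIZE 8   /* illustrative; the real table is sized against GGML_MAX_NODES */

    /* returns 1 if p was already present, 0 if it was just inserted,
     * -1 if the table is full (the real code asserts instead) */
    static int table_insert(void * table[TABLE_SIZE], void * p) {
        const size_t h = (size_t) p % TABLE_SIZE;

        size_t i = h;
        while (table[i] != NULL && table[i] != p) {
            i = (i + 1) % TABLE_SIZE;     /* linear probing */
            if (i == h) {
                return -1;                /* wrapped around: table full */
            }
        }
        if (table[i] == p) {
            return 1;
        }
        table[i] = p;
        return 0;
    }

    int main(void) {
        void * table[TABLE_SIZE] = {0};
        int node_a, node_b;

        printf("%d\n", table_insert(table, &node_a)); /* 0: first visit, inserted */
        printf("%d\n", table_insert(table, &node_a)); /* 1: already visited, skip  */
        printf("%d\n", table_insert(table, &node_b)); /* 0: different node         */
        return 0;
    }
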
@@ -15988,16 +15712,8 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15988
15712
  }
15989
15713
 
15990
15714
  // check if already visited
15991
- for (int i = 0; i < cgraph->n_nodes; i++) {
15992
- if (cgraph->nodes[i] == node) {
15993
- return;
15994
- }
15995
- }
15996
-
15997
- for (int i = 0; i < cgraph->n_leafs; i++) {
15998
- if (cgraph->leafs[i] == node) {
15999
- return;
16000
- }
15715
+ if (hash_insert(cgraph->visited_hash_table, node)) {
15716
+ return;
16001
15717
  }
16002
15718
 
16003
15719
  for (int i = 0; i < GGML_MAX_SRC; ++i) {
@@ -16060,6 +15776,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
16060
15776
  /*.nodes =*/ { NULL },
16061
15777
  /*.grads =*/ { NULL },
16062
15778
  /*.leafs =*/ { NULL },
15779
+ /*.hash_table =*/ { NULL },
16063
15780
  /*.perf_runs =*/ 0,
16064
15781
  /*.perf_cycles =*/ 0,
16065
15782
  /*.perf_time_us =*/ 0,
@@ -16101,13 +15818,42 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
16101
15818
 
16102
15819
  if (node->is_param) {
16103
15820
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
16104
- ggml_build_forward_impl(&result, node->grad, true);
15821
+ ggml_build_forward_expand(&result, node->grad);
16105
15822
  }
16106
15823
  }
16107
15824
 
16108
15825
  return result;
16109
15826
  }
16110
15827
 
15828
+ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15829
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
15830
+ struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
15831
+
15832
+ *cgraph = (struct ggml_cgraph) {
15833
+ /*.n_nodes =*/ 0,
15834
+ /*.n_leafs =*/ 0,
15835
+ /*.nodes =*/ { NULL },
15836
+ /*.grads =*/ { NULL },
15837
+ /*.leafs =*/ { NULL },
15838
+ /*.hash_table =*/ { NULL },
15839
+ /*.perf_runs =*/ 0,
15840
+ /*.perf_cycles =*/ 0,
15841
+ /*.perf_time_us =*/ 0,
15842
+ };
15843
+
15844
+ return cgraph;
15845
+ }
15846
+
15847
+ struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
15848
+ struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
15849
+ ggml_build_forward_impl(cgraph, tensor, false);
15850
+ return cgraph;
15851
+ }
15852
+
15853
+ size_t ggml_graph_overhead(void) {
15854
+ return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
15855
+ }
15856
+
16111
15857
  //
16112
15858
  // thread data
16113
15859
  //
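
A hedged usage sketch of the context-allocated graph API introduced in the hunk above (ggml_new_graph, ggml_build_forward_ctx, ggml_graph_overhead). It assumes the public ggml.h that matches this diff and omits error handling:

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024 + ggml_graph_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        struct ggml_tensor * c = ggml_add(ctx, a, b);

        // the cgraph is carved out of ctx as a GGML_OBJECT_GRAPH object,
        // so no separate allocation of struct ggml_cgraph is needed
        struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx, c);

        // the work buffer is likewise taken from ctx (see GGML_OBJECT_WORK_BUFFER below)
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

        ggml_free(ctx);
        return 0;
    }
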
@@ -16173,7 +15919,7 @@ typedef pthread_t ggml_thread_t;
16173
15919
 
16174
15920
  // Android's libc implementation "bionic" does not support setting affinity
16175
15921
  #if defined(__linux__) && !defined(__BIONIC__)
16176
- void set_numa_thread_affinity(int thread_n, int n_threads) {
15922
+ static void set_numa_thread_affinity(int thread_n, int n_threads) {
16177
15923
  if (!ggml_is_numa()) {
16178
15924
  return;
16179
15925
  }
@@ -16198,7 +15944,7 @@ void set_numa_thread_affinity(int thread_n, int n_threads) {
16198
15944
  CPU_FREE(cpus);
16199
15945
  }
16200
15946
 
16201
- void clear_numa_thread_affinity(void) {
15947
+ static void clear_numa_thread_affinity(void) {
16202
15948
  if (!ggml_is_numa()) {
16203
15949
  return;
16204
15950
  }
@@ -16222,8 +15968,8 @@ void clear_numa_thread_affinity(void) {
16222
15968
  #else
16223
15969
  // TODO: Windows etc.
16224
15970
  // (the linux implementation may also work on BSD, someone should test)
16225
- void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
16226
- void clear_numa_thread_affinity(void) {}
15971
+ static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
15972
+ static void clear_numa_thread_affinity(void) {}
16227
15973
  #endif
16228
15974
 
16229
15975
  struct ggml_compute_state_shared {
@@ -16293,8 +16039,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16293
16039
  if (GGML_OP_HAS_FINALIZE[node->op]) {
16294
16040
  params.nth = n_tasks_arr[node_n];
16295
16041
  ggml_compute_forward(&params, node);
16296
- ggml_graph_compute_perf_stats_node(node, state->shared);
16297
16042
  }
16043
+ ggml_graph_compute_perf_stats_node(node, state->shared);
16298
16044
  }
16299
16045
 
16300
16046
  // distribute new work or execute it direct if 1T
@@ -16324,8 +16070,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16324
16070
  if (GGML_OP_HAS_FINALIZE[node->op]) {
16325
16071
  params.type = GGML_TASK_FINALIZE;
16326
16072
  ggml_compute_forward(&params, node);
16327
- ggml_graph_compute_perf_stats_node(node, state->shared);
16328
16073
  }
16074
+
16075
+ ggml_graph_compute_perf_stats_node(node, state->shared);
16329
16076
  } else {
16330
16077
  break;
16331
16078
  }
@@ -16434,21 +16181,34 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16434
16181
  case GGML_OP_ARGMAX:
16435
16182
  case GGML_OP_REPEAT:
16436
16183
  case GGML_OP_REPEAT_BACK:
16437
- case GGML_OP_ABS:
16438
- case GGML_OP_SGN:
16439
- case GGML_OP_NEG:
16440
- case GGML_OP_STEP:
16441
- case GGML_OP_TANH:
16442
- case GGML_OP_ELU:
16443
- case GGML_OP_RELU:
16444
- {
16184
+ {
16445
16185
  n_tasks = 1;
16446
16186
  } break;
16447
- case GGML_OP_MUL:
16448
- case GGML_OP_GELU:
16449
- case GGML_OP_GELU_QUICK:
16450
- case GGML_OP_SILU:
16187
+
16188
+ case GGML_OP_UNARY:
16189
+ {
16190
+ switch (ggml_get_unary_op(node)) {
16191
+ case GGML_UNARY_OP_ABS:
16192
+ case GGML_UNARY_OP_SGN:
16193
+ case GGML_UNARY_OP_NEG:
16194
+ case GGML_UNARY_OP_STEP:
16195
+ case GGML_UNARY_OP_TANH:
16196
+ case GGML_UNARY_OP_ELU:
16197
+ case GGML_UNARY_OP_RELU:
16198
+ {
16199
+ n_tasks = 1;
16200
+ } break;
16201
+
16202
+ case GGML_UNARY_OP_GELU:
16203
+ case GGML_UNARY_OP_GELU_QUICK:
16204
+ case GGML_UNARY_OP_SILU:
16205
+ {
16206
+ n_tasks = n_threads;
16207
+ } break;
16208
+ }
16209
+ } break;
16451
16210
  case GGML_OP_SILU_BACK:
16211
+ case GGML_OP_MUL:
16452
16212
  case GGML_OP_NORM:
16453
16213
  case GGML_OP_RMS_NORM:
16454
16214
  case GGML_OP_RMS_NORM_BACK:
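
In the planner hunk above, the cheap elementwise activations keep n_tasks = 1 while GELU, GELU_QUICK and SILU get n_tasks = n_threads. When an op is multi-threaded, each worker usually takes a contiguous slice of rows; an illustrative row-split calculation following the common "rows per thread, rounded up" scheme (not copied from ggml):

    #include <stdio.h>

    int main(void) {
        const int nr  = 10;  /* total rows in the tensor */
        const int nth = 4;   /* number of threads (n_tasks) */

        for (int ith = 0; ith < nth; ++ith) {
            const int dr  = (nr + nth - 1) / nth;            /* rows per thread, rounded up */
            const int ir0 = dr * ith;                        /* first row owned by this thread */
            const int ir1 = (ir0 + dr < nr) ? ir0 + dr : nr; /* one past the last row */
            printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
        }
        return 0;
    }
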
@@ -16513,10 +16273,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16513
16273
  case GGML_OP_GET_ROWS:
16514
16274
  case GGML_OP_GET_ROWS_BACK:
16515
16275
  case GGML_OP_DIAG:
16516
- case GGML_OP_DIAG_MASK_ZERO:
16517
16276
  {
16518
16277
  n_tasks = 1;
16519
16278
  } break;
16279
+ case GGML_OP_DIAG_MASK_ZERO:
16520
16280
  case GGML_OP_DIAG_MASK_INF:
16521
16281
  case GGML_OP_SOFT_MAX:
16522
16282
  case GGML_OP_SOFT_MAX_BACK:
@@ -16575,19 +16335,22 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16575
16335
  const int64_t ne11 = node->src[1]->ne[1]; // H
16576
16336
  const int64_t ne12 = node->src[1]->ne[2]; // C
16577
16337
 
16338
+ const int64_t ne0 = node->ne[0];
16339
+ const int64_t ne1 = node->ne[1];
16340
+ const int64_t ne2 = node->ne[2];
16578
16341
  const int64_t nk = ne00*ne01;
16342
+ const int64_t ew0 = nk * ne02;
16579
16343
 
16580
- UNUSED(ne02);
16581
16344
  UNUSED(ne03);
16582
- UNUSED(nk);
16345
+ UNUSED(ne2);
16583
16346
 
16584
16347
  size_t cur = 0;
16585
16348
 
16586
16349
  if (node->src[0]->type == GGML_TYPE_F16 &&
16587
- node->src[1]->type == GGML_TYPE_F32) {
16588
- cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
16350
+ node->src[1]->type == GGML_TYPE_F32) {
16351
+ cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
16589
16352
  } else if (node->src[0]->type == GGML_TYPE_F32 &&
16590
- node->src[1]->type == GGML_TYPE_F32) {
16353
+ node->src[1]->type == GGML_TYPE_F32) {
16591
16354
  cur = sizeof(float)* (ne10*ne11*ne12);
16592
16355
  } else {
16593
16356
  GGML_ASSERT(false);
@@ -16806,10 +16569,9 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
16806
16569
  void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
16807
16570
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
16808
16571
 
16809
- struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
16810
- GGML_ASSERT(buf);
16572
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
16811
16573
 
16812
- cplan.work_data = buf->data;
16574
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
16813
16575
 
16814
16576
  ggml_graph_compute(cgraph, &cplan);
16815
16577
  }
@@ -16864,9 +16626,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
16864
16626
  }
16865
16627
 
16866
16628
  void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16867
- //assert(cgraph->work == NULL);
16868
- //assert(cgraph->work_size == 0);
16869
-
16870
16629
  uint64_t size_eval = 0;
16871
16630
 
16872
16631
  // compute size of intermediate results
@@ -16963,7 +16722,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16963
16722
  fwrite(&nb, sizeof(uint64_t), 1, fout);
16964
16723
  }
16965
16724
 
16966
- fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16725
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16726
+ fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
16967
16727
 
16968
16728
  // dump the data
16969
16729
  // TODO: pad this to 32 byte boundary
@@ -16996,7 +16756,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16996
16756
  fwrite(&nb, sizeof(uint64_t), 1, fout);
16997
16757
  }
16998
16758
 
16999
- fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16759
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
16760
+ fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
17000
16761
 
17001
16762
  // output the op arguments
17002
16763
  {
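
Both export loops above now write tensor->op_params immediately after the fixed-size name field, and the importer below reads the two fields back in the same order. A minimal sketch of that fixed-width record layout, with illustrative sizes rather than the real GGML_MAX_NAME / GGML_MAX_OP_PARAMS values:

    #include <stdio.h>
    #include <string.h>

    #define MAX_NAME      8
    #define MAX_OP_PARAMS 16

    int main(void) {
        char name[MAX_NAME]           = "rope";
        char op_params[MAX_OP_PARAMS] = {0};
        op_params[0] = 3;   /* a small parameter byte at offset 0 (the real code packs int32 values here) */

        FILE * fout = fopen("node.bin", "wb");                  /* error handling omitted for brevity */
        fwrite(name,      sizeof(char), MAX_NAME,      fout);   /* fixed-size name field ...          */
        fwrite(op_params, sizeof(char), MAX_OP_PARAMS, fout);   /* ... op_params written right after  */
        fclose(fout);

        char name_in[MAX_NAME];
        char op_params_in[MAX_OP_PARAMS];
        FILE * fin = fopen("node.bin", "rb");
        if (fread(name_in,      sizeof(char), MAX_NAME,      fin) != MAX_NAME)      return 1;
        if (fread(op_params_in, sizeof(char), MAX_OP_PARAMS, fin) != MAX_OP_PARAMS) return 1;
        fclose(fin);

        printf("%s %d\n", name_in, op_params_in[0]); /* prints: rope 3 */
        return 0;
    }
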
@@ -17177,7 +16938,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17177
16938
 
17178
16939
  tensor->op = (enum ggml_op) op;
17179
16940
 
17180
- memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
16941
+ memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
16942
+ memcpy(tensor->op_params, ptr, GGML_MAX_OP_PARAMS); ptr += GGML_MAX_OP_PARAMS;
17181
16943
 
17182
16944
  tensor->data = (void *) ptr;
17183
16945
 
@@ -17222,7 +16984,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17222
16984
  nb[j] = nb_cur;
17223
16985
  }
17224
16986
 
17225
- const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
16987
+ const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
16988
+ const char * ptr_op_params = ptr; ptr += GGML_MAX_OP_PARAMS;
17226
16989
 
17227
16990
  const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t);
17228
16991
 
@@ -17259,8 +17022,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17259
17022
  {
17260
17023
  tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
17261
17024
 
17262
- uint64_t offs;
17263
- memcpy(&offs, args[2]->data, sizeof(offs));
17025
+ size_t offs;
17026
+ memcpy(&offs, ptr_op_params, sizeof(offs));
17264
17027
 
17265
17028
  tensor->data = ((char *) tensor->data) + offs;
17266
17029
  } break;
@@ -17280,7 +17043,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17280
17043
  } break;
17281
17044
  }
17282
17045
 
17283
- memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
17046
+ memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
17047
+ memcpy(tensor->op_params, ptr_op_params, GGML_MAX_OP_PARAMS);
17284
17048
 
17285
17049
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
17286
17050
  tensor->nb[j] = nb[j];
@@ -17305,9 +17069,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17305
17069
 
17306
17070
  GGML_PRINT("=== GRAPH ===\n");
17307
17071
 
17308
- GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
17309
- GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
17310
-
17311
17072
  GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
17312
17073
  for (int i = 0; i < cgraph->n_nodes; i++) {
17313
17074
  struct ggml_tensor * node = cgraph->nodes[i];
@@ -17317,7 +17078,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17317
17078
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
17318
17079
  i,
17319
17080
  node->ne[0], node->ne[1], node->ne[2],
17320
- GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
17081
+ ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
17321
17082
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
17322
17083
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
17323
17084
  (double) node->perf_time_us / 1000.0,
@@ -17331,7 +17092,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17331
17092
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
17332
17093
  i,
17333
17094
  node->ne[0], node->ne[1],
17334
- GGML_OP_NAME[node->op]);
17095
+ ggml_op_name(node->op));
17335
17096
  }
17336
17097
 
17337
17098
  for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -17339,7 +17100,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
17339
17100
  continue;
17340
17101
  }
17341
17102
 
17342
- GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
17103
+ GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0);
17343
17104
  }
17344
17105
 
17345
17106
  GGML_PRINT("========================================\n");
@@ -17433,13 +17194,13 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17433
17194
  }
17434
17195
 
17435
17196
  if (node->n_dims == 2) {
17436
- fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], GGML_OP_SYMBOL[node->op]);
17197
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
17437
17198
  } else {
17438
- fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
17199
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
17439
17200
  }
17440
17201
 
17441
17202
  if (node->grad) {
17442
- fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
17203
+ fprintf(fp, " | <g>%s\"; ]\n", ggml_op_symbol(node->grad->op));
17443
17204
  } else {
17444
17205
  fprintf(fp, "\"; ]\n");
17445
17206
  }