llama_cpp 0.3.2 → 0.3.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +165 -112
- data/ext/llama_cpp/src/ggml-cuda.cu +217 -76
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +16 -5
- data/ext/llama_cpp/src/ggml-metal.metal +56 -47
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml.c +1082 -774
- data/ext/llama_cpp/src/ggml.h +64 -18
- data/ext/llama_cpp/src/llama.cpp +179 -51
- data/ext/llama_cpp/src/llama.h +15 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +3 -1
- metadata +4 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -25,6 +25,7 @@
 #include <float.h>
 #include <limits.h>
 #include <stdarg.h>
+#include <signal.h>
 
 #ifdef GGML_USE_METAL
 #include <unistd.h>
@@ -49,23 +50,23 @@
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
 
-static void atomic_store(atomic_int* ptr, LONG val) {
+static void atomic_store(atomic_int * ptr, LONG val) {
     InterlockedExchange(ptr, val);
 }
-static LONG atomic_load(atomic_int* ptr) {
+static LONG atomic_load(atomic_int * ptr) {
     return InterlockedCompareExchange(ptr, 0, 0);
 }
-static LONG atomic_fetch_add(atomic_int* ptr, LONG inc) {
+static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
     return InterlockedExchangeAdd(ptr, inc);
 }
-static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) {
+static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
     return atomic_fetch_add(ptr, -(dec));
 }
 
 typedef HANDLE pthread_t;
 
 typedef DWORD thread_ret_t;
-static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {
+static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) {
     (void) unused;
     HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
     if (handle == NULL)
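The hunk above only reflows pointer spacing, but the code it touches is the interesting part: ggml's Windows shim maps C11-style atomics onto Win32 Interlocked intrinsics. A minimal standalone sketch of that pattern (the demo main is illustrative, not part of the gem):

#if defined(_WIN32)
#include <windows.h>
typedef volatile LONG atomic_int;
static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
    // InterlockedExchangeAdd returns the previous value, matching C11 semantics
    return InterlockedExchangeAdd(ptr, inc);
}
#else
#include <stdatomic.h>
#endif

int main(void) {
    atomic_int counter = 0;
    atomic_fetch_add(&counter, 1); // counter is now 1 in either branch
    return (int) counter - 1;      // returns 0
}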
@@ -77,7 +78,7 @@ static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void
     return 0;
 }
 
-static int pthread_join(pthread_t thread, void* unused) {
+static int pthread_join(pthread_t thread, void * unused) {
     (void) unused;
     return (int) WaitForSingleObject(thread, INFINITE);
 }
@@ -90,7 +91,7 @@ static int sched_yield (void) {
 #include <pthread.h>
 #include <stdatomic.h>
 
-typedef void* thread_ret_t;
+typedef void * thread_ret_t;
 
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -247,7 +248,11 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #include "ggml-opencl.h"
 #endif
 #elif defined(GGML_USE_OPENBLAS)
+#if defined(GGML_BLAS_USE_MKL)
+#include <mkl.h>
+#else
 #include <cblas.h>
+#endif
 #elif defined(GGML_USE_CUBLAS)
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
@@ -3782,6 +3787,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CLAMP",
     "CONV_1D",
     "CONV_2D",
+    "POOL_1D",
+    "POOL_2D",
 
     "FLASH_ATTN",
     "FLASH_FF",
@@ -3800,7 +3807,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3860,6 +3867,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "clamp(x)",
     "conv_1d(x)",
     "conv_2d(x)",
+    "pool_1d(x)",
+    "pool_2d(x)",
 
     "flash_attn(x)",
     "flash_ff(x)",
@@ -3878,7 +3887,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+
+static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -4157,10 +4168,9 @@ static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
 static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
-    return
-        (t0->ne[0] == t1->ne[0]) &&
-        (t0->ne[2] == t1->ne[2]) &&
-        (t0->ne[3] == t1->ne[3]);
+    return (t0->ne[0] == t1->ne[0]) &&
+           (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
+           (t1->ne[3]%t0->ne[3] == 0);
 }
 
 static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
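The relaxed predicate above is what enables broadcasted matrix multiplication in this release: the shared inner dimension must match exactly, while src1's higher dimensions only need to be whole multiples of src0's. A self-contained sketch on plain shape arrays (function and variable names are illustrative):

#include <stdbool.h>
#include <stdint.h>

// Sketch of the relaxed mul_mat shape check: ne[0] must match, and t1's
// batch dims must be multiples of t0's so that t0 can be repeated.
static bool can_mul_mat(const int64_t t0_ne[4], const int64_t t1_ne[4]) {
    return (t0_ne[0] == t1_ne[0]) &&
           (t1_ne[2] % t0_ne[2] == 0) &&
           (t1_ne[3] % t0_ne[3] == 0);
}
// Example: a weight of shape {4096, 4096, 1, 1} now multiplies a batched
// input {4096, 32, 8, 1}, since 8 % 1 == 0 and 1 % 1 == 0.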
@@ -4580,17 +4590,14 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.op           =*/ GGML_OP_NONE,
         /*.is_param     =*/ false,
         /*.grad         =*/ NULL,
-        /*.src0         =*/ NULL,
-        /*.src1         =*/ NULL,
-        /*.opt          =*/ { NULL },
-        /*.n_tasks      =*/ 0,
+        /*.src          =*/ { NULL },
         /*.perf_runs    =*/ 0,
         /*.perf_cycles  =*/ 0,
         /*.perf_time_us =*/ 0,
         /*.data         =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
         /*.name         =*/ { 0 },
         /*.extra        =*/ NULL,
-        /*.pad          =*/ { 0 },
+        /*.padding      =*/ { 0 },
         };
 
     // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
@@ -4722,7 +4729,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
         {
             assert(tensor->nb[0] == sizeof(ggml_fp16_t));
             for (int i = 0; i < n; i++) {
-                ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value);
+                ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
             }
         } break;
     case GGML_TYPE_F32:
@@ -4774,7 +4781,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
         {
             assert(tensor->nb[0] == sizeof(ggml_fp16_t));
             for (int i = 0; i < n; i++) {
-                ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value);
+                ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
             }
         } break;
     case GGML_TYPE_F32:
@@ -5009,8 +5016,8 @@ struct ggml_tensor * ggml_dup_impl(
 
     result->op = GGML_OP_DUP;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -5034,11 +5041,15 @@ struct ggml_tensor * ggml_add_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
+    // TODO: support less-strict constraint
+    //       GGML_ASSERT(ggml_can_repeat(b, a));
+    GGML_ASSERT(ggml_can_repeat_rows(b, a));
 
     bool is_node = false;
 
-    if (a->grad || b->grad) {
+    if (!inplace && (a->grad || b->grad)) {
+        // TODO: support backward pass for broadcasting
+        GGML_ASSERT(ggml_are_same_shape(a, b));
         is_node = true;
     }
 
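ggml_can_repeat_rows, which the new assert relies on, accepts b when its row length matches a's exactly and a's higher dimensions are whole multiples of b's. A hedged sketch of that condition on raw shapes (this is a reading of the assert, not the gem's own helper):

#include <stdbool.h>
#include <stdint.h>

// Sketch of the row-broadcast condition implied by the new assert:
// b's row length must equal a's, and b may then be tiled across a's
// remaining dimensions when each divides evenly.
static bool can_repeat_rows(const int64_t b_ne[4], const int64_t a_ne[4]) {
    if (a_ne[0] != b_ne[0]) {
        return false;
    }
    for (int i = 1; i < 4; ++i) {
        if (a_ne[i] % b_ne[i] != 0) {
            return false;
        }
    }
    return true;
}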
@@ -5046,8 +5057,8 @@ struct ggml_tensor * ggml_add_impl(
 
     result->op = GGML_OP_ADD;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -5086,8 +5097,8 @@ struct ggml_tensor * ggml_add1_impl(
 
     result->op = GGML_OP_ADD1;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -5144,9 +5155,9 @@ struct ggml_tensor * ggml_acc_impl(
 
     result->op = GGML_OP_ACC;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-    result->opt[0] = c;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
 
     return result;
 }
@@ -5192,8 +5203,8 @@ struct ggml_tensor * ggml_sub_impl(
 
     result->op = GGML_OP_SUB;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -5239,8 +5250,8 @@ struct ggml_tensor * ggml_mul_impl(
 
     result->op = GGML_OP_MUL;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -5282,8 +5293,8 @@ struct ggml_tensor * ggml_div_impl(
 
     result->op = GGML_OP_DIV;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -5318,8 +5329,8 @@ struct ggml_tensor * ggml_sqr_impl(
 
     result->op = GGML_OP_SQR;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -5352,8 +5363,8 @@ struct ggml_tensor * ggml_sqrt_impl(
 
     result->op = GGML_OP_SQRT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -5387,8 +5398,8 @@ struct ggml_tensor * ggml_log_impl(
 
     result->op = GGML_OP_LOG;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -5420,8 +5431,8 @@ struct ggml_tensor * ggml_sum(
 
     result->op = GGML_OP_SUM;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -5447,8 +5458,8 @@ struct ggml_tensor * ggml_sum_rows(
 
     result->op = GGML_OP_SUM_ROWS;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -5470,8 +5481,8 @@ struct ggml_tensor * ggml_mean(
 
     result->op = GGML_OP_MEAN;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -5494,8 +5505,8 @@ struct ggml_tensor * ggml_argmax(
 
     result->op = GGML_OP_ARGMAX;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
    return result;
 }
@@ -5522,8 +5533,8 @@ struct ggml_tensor * ggml_repeat(
 
     result->op = GGML_OP_REPEAT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -5550,8 +5561,8 @@ struct ggml_tensor * ggml_repeat_back(
 
     result->op = GGML_OP_REPEAT_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -5572,8 +5583,8 @@ struct ggml_tensor * ggml_abs_impl(
 
     result->op = GGML_OP_ABS;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -5607,8 +5618,8 @@ struct ggml_tensor * ggml_sgn_impl(
 
     result->op = GGML_OP_SGN;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -5641,8 +5652,8 @@ struct ggml_tensor * ggml_neg_impl(
 
     result->op = GGML_OP_NEG;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -5675,8 +5686,8 @@ struct ggml_tensor * ggml_step_impl(
 
     result->op = GGML_OP_STEP;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -5709,8 +5720,8 @@ struct ggml_tensor * ggml_tanh_impl(
 
     result->op = GGML_OP_TANH;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -5743,8 +5754,8 @@ struct ggml_tensor * ggml_elu_impl(
 
     result->op = GGML_OP_ELU;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -5777,8 +5788,8 @@ struct ggml_tensor * ggml_relu_impl(
 
     result->op = GGML_OP_RELU;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -5811,8 +5822,8 @@ struct ggml_tensor * ggml_gelu_impl(
 
     result->op = GGML_OP_GELU;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -5845,8 +5856,8 @@ struct ggml_tensor * ggml_gelu_quick_impl(
 
     result->op = GGML_OP_GELU_QUICK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -5879,8 +5890,8 @@ struct ggml_tensor * ggml_silu_impl(
 
     result->op = GGML_OP_SILU;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -5914,8 +5925,8 @@ struct ggml_tensor * ggml_silu_back(
 
     result->op = GGML_OP_SILU_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -5937,8 +5948,8 @@ struct ggml_tensor * ggml_norm_impl(
 
     result->op = GGML_OP_NORM;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL; // TODO: maybe store epsilon here?
+    result->src[0] = a;
+    result->src[1] = NULL; // TODO: maybe store epsilon here?
 
     return result;
 }
@@ -5969,8 +5980,8 @@ struct ggml_tensor * ggml_rms_norm_impl(
 
     result->op = GGML_OP_RMS_NORM;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL; // TODO: maybe store epsilon here?
+    result->src[0] = a;
+    result->src[1] = NULL; // TODO: maybe store epsilon here?
 
     return result;
 }
@@ -6002,8 +6013,8 @@ struct ggml_tensor * ggml_rms_norm_back(
 
     result->op = GGML_OP_RMS_NORM_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -6024,13 +6035,13 @@ struct ggml_tensor * ggml_mul_mat(
         is_node = true;
     }
 
-    const int64_t ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne);
+    const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
 
     result->op = GGML_OP_MUL_MAT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
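With broadcasting, the result shape now takes its batch dimensions from b rather than assuming they match a's. A small standalone check of the shape computation above (values are illustrative):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    // a: weight {4096, 4096, 1, 1}; b: batched input {4096, 32, 8, 1}
    const int64_t a_ne[4] = { 4096, 4096, 1, 1 };
    const int64_t b_ne[4] = { 4096,   32, 8, 1 };

    // result dims per the new ggml_mul_mat: { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }
    const int64_t ne[4] = { a_ne[1], b_ne[1], b_ne[2], b_ne[3] };

    printf("%lld x %lld x %lld x %lld\n",
           (long long) ne[0], (long long) ne[1],
           (long long) ne[2], (long long) ne[3]); // 4096 x 32 x 8 x 1
    return 0;
}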
@@ -6055,8 +6066,8 @@ struct ggml_tensor * ggml_out_prod(
 
     result->op = GGML_OP_OUT_PROD;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -6081,8 +6092,8 @@ struct ggml_tensor * ggml_scale_impl(
 
     result->op = GGML_OP_SCALE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -6137,9 +6148,9 @@ struct ggml_tensor * ggml_set_impl(
 
     result->op = GGML_OP_SET;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-    result->opt[0] = c;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
 
     return result;
 }
@@ -6226,8 +6237,8 @@ struct ggml_tensor * ggml_cpy_impl(
 
     result->op = GGML_OP_CPY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -6263,8 +6274,8 @@ struct ggml_tensor * ggml_cont_impl(
 
     result->op = GGML_OP_CONT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -6307,8 +6318,8 @@ struct ggml_tensor * ggml_reshape(
 
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -6332,8 +6343,8 @@ struct ggml_tensor * ggml_reshape_1d(
 
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -6358,8 +6369,8 @@ struct ggml_tensor * ggml_reshape_2d(
 
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -6385,8 +6396,8 @@ struct ggml_tensor * ggml_reshape_3d(
 
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -6414,8 +6425,8 @@ struct ggml_tensor * ggml_reshape_4d(
 
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -6447,9 +6458,9 @@ struct ggml_tensor * ggml_view_1d(
 
     result->op = GGML_OP_VIEW;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-    result->opt[0] = offs;
+    result->src[0] = a;
+    result->src[1] = NULL;
+    result->src[2] = offs;
 
     return result;
 }
@@ -6489,9 +6500,9 @@ struct ggml_tensor * ggml_view_2d(
 
     result->op = GGML_OP_VIEW;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-    result->opt[0] = offs;
+    result->src[0] = a;
+    result->src[1] = NULL;
+    result->src[2] = offs;
 
     return result;
 }
@@ -6533,9 +6544,9 @@ struct ggml_tensor * ggml_view_3d(
 
     result->op = GGML_OP_VIEW;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-    result->opt[0] = offs;
+    result->src[0] = a;
+    result->src[1] = NULL;
+    result->src[2] = offs;
 
     return result;
 }
@@ -6579,9 +6590,9 @@ struct ggml_tensor * ggml_view_4d(
 
     result->op = GGML_OP_VIEW;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-    result->opt[0] = offs;
+    result->src[0] = a;
+    result->src[1] = NULL;
+    result->src[2] = offs;
 
     return result;
 }
@@ -6641,8 +6652,8 @@ struct ggml_tensor * ggml_permute(
 
     result->op = GGML_OP_PERMUTE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     if (is_node) {
         ggml_scratch_save(ctx);
@@ -6656,7 +6667,7 @@ struct ggml_tensor * ggml_permute(
 
         ggml_scratch_load(ctx);
 
-        result->opt[0] = b;
+        result->src[2] = b;
     }
 
     return result;
@@ -6684,8 +6695,8 @@ struct ggml_tensor * ggml_transpose(
 
     result->op = GGML_OP_TRANSPOSE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -6710,8 +6721,8 @@ struct ggml_tensor * ggml_get_rows(
 
     result->op = GGML_OP_GET_ROWS;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -6738,9 +6749,9 @@ struct ggml_tensor * ggml_get_rows_back(
 
     result->op = GGML_OP_GET_ROWS_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-    result->opt[0] = c;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
 
     return result;
 }
@@ -6762,8 +6773,8 @@ struct ggml_tensor * ggml_diag(
 
     result->op = GGML_OP_DIAG;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -6795,8 +6806,8 @@ struct ggml_tensor * ggml_diag_mask_inf_impl(
 
     result->op = GGML_OP_DIAG_MASK_INF;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -6843,8 +6854,8 @@ struct ggml_tensor * ggml_diag_mask_zero_impl(
 
     result->op = GGML_OP_DIAG_MASK_ZERO;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -6879,8 +6890,8 @@ struct ggml_tensor * ggml_soft_max_impl(
 
     result->op = GGML_OP_SOFT_MAX;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
 
     return result;
 }
@@ -6915,8 +6926,8 @@ struct ggml_tensor * ggml_soft_max_back_impl(
 
     result->op = GGML_OP_SOFT_MAX_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -6967,8 +6978,8 @@ struct ggml_tensor * ggml_rope_impl(
 
     result->op = GGML_OP_ROPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -7025,8 +7036,8 @@ struct ggml_tensor * ggml_rope_back(
 
     result->op = GGML_OP_ROPE_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -7064,8 +7075,8 @@ struct ggml_tensor * ggml_alibi(
 
     result->op = GGML_OP_ALIBI;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -7098,8 +7109,8 @@ struct ggml_tensor * ggml_clamp(
 
     result->op = GGML_OP_CLAMP;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -7141,9 +7152,9 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
 
     result->op = GGML_OP_CONV_1D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-    result->opt[0] = c;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
 
     return result;
 }
@@ -7161,7 +7172,6 @@ struct ggml_tensor* ggml_conv_2d(
     int d0,
     int d1) {
 
-    GGML_ASSERT(b->ne[3] == 1);
     GGML_ASSERT(a->ne[2] == b->ne[2]);
     bool is_node = false;
 
@@ -7173,7 +7183,7 @@ struct ggml_tensor* ggml_conv_2d(
     const int64_t ne[4] = {
         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
         ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
-        a->ne[3], 1,
+        a->ne[3], b->ne[3],
     };
     struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
@@ -7189,9 +7199,9 @@ struct ggml_tensor* ggml_conv_2d(
 
     result->op = GGML_OP_CONV_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-    result->opt[0] = c;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
 
     return result;
 
@@ -7208,6 +7218,98 @@ struct ggml_tensor* ggml_conv_1d_ph(
     return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
 }
 
+
+// ggml_pool_*
+
+static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
+    return (ins + 2 * p - ks) / s + 1;
+}
+
+// ggml_pool_2d
+
+struct ggml_tensor* ggml_pool_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        enum ggml_op_pool op,
+        int k0,
+        int s0,
+        int p0) {
+
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    const int64_t ne[3] = {
+        ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
+        a->ne[1],
+    };
+    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+
+    ggml_scratch_save(ctx);
+    struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
+    ((int32_t*)c->data)[0] = op;
+    ((int32_t*)c->data)[1] = k0;
+    ((int32_t*)c->data)[2] = s0;
+    ((int32_t*)c->data)[3] = p0;
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_POOL_1D;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = c;
+
+    return result;
+}
+
+// ggml_pool_2d
+
+struct ggml_tensor* ggml_pool_2d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        enum ggml_op_pool op,
+        int k0,
+        int k1,
+        int s0,
+        int s1,
+        int p0,
+        int p1) {
+
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    const int64_t ne[3] = {
+        ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
+        ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
+        a->ne[2],
+    };
+    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+
+    ggml_scratch_save(ctx);
+    struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7);
+    ((int32_t*)c->data)[0] = op;
+    ((int32_t*)c->data)[1] = k0;
+    ((int32_t*)c->data)[2] = k1;
+    ((int32_t*)c->data)[3] = s0;
+    ((int32_t*)c->data)[4] = s1;
+    ((int32_t*)c->data)[5] = p0;
+    ((int32_t*)c->data)[6] = p1;
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_POOL_2D;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = c;
+
+    return result;
+}
+
 // ggml_flash_attn
 
 struct ggml_tensor * ggml_flash_attn(
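ggml_calc_pool_output_size above is the standard pooling arithmetic. A quick standalone check of the formula (the asserted values are illustrative):

#include <assert.h>
#include <stdint.h>

// Same arithmetic as ggml_calc_pool_output_size in the hunk above:
// floor((ins + 2*p - ks)/s) + 1 output positions per row.
static int64_t pool_output_size(int64_t ins, int ks, int s, int p) {
    return (ins + 2 * p - ks) / s + 1;
}

int main(void) {
    assert(pool_output_size(224, 2, 2, 0) == 112); // 2x2 pool, stride 2
    assert(pool_output_size(10,  3, 3, 0) ==   3); // remainder is dropped
    return 0;
}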
@@ -7230,10 +7332,10 @@ struct ggml_tensor * ggml_flash_attn(
 
     result->op = GGML_OP_FLASH_ATTN;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = q;
-    result->src1 = k;
-    result->opt[0] = v;
-    result->opt[1] = ggml_new_i32(ctx, masked ? 1 : 0);
+    result->src[0] = q;
+    result->src[1] = k;
+    result->src[2] = v;
+    result->src[3] = ggml_new_i32(ctx, masked ? 1 : 0);
 
     return result;
 }
@@ -7261,11 +7363,11 @@ struct ggml_tensor * ggml_flash_ff(
 
     result->op = GGML_OP_FLASH_FF;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b0;
-    result->opt[0] = b1;
-    result->opt[1] = c0;
-    result->opt[2] = c1;
+    result->src[0] = a;
+    result->src[1] = b0;
+    result->src[2] = b1;
+    result->src[3] = c0;
+    result->src[4] = c1;
 
     return result;
 }
@@ -7325,11 +7427,11 @@ struct ggml_tensor * ggml_flash_attn_back(
 
     result->op = GGML_OP_FLASH_ATTN_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = q;
-    result->src1 = k;
-    result->opt[0] = v;
-    result->opt[1] = d;
-    result->opt[2] = ggml_new_i32(ctx, masked ? 1 : 0);
+    result->src[0] = q;
+    result->src[1] = k;
+    result->src[2] = v;
+    result->src[3] = d;
+    result->src[4] = ggml_new_i32(ctx, masked ? 1 : 0);
 
     return result;
 }
@@ -7374,9 +7476,9 @@ struct ggml_tensor * ggml_win_part(
 
     result->op = GGML_OP_WIN_PART;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-    result->opt[0] = b;
+    result->src[0] = a;
+    result->src[1] = NULL;
+    result->src[2] = b;
 
     return result;
 }
@@ -7411,9 +7513,9 @@ struct ggml_tensor * ggml_win_unpart(
 
     result->op = GGML_OP_WIN_UNPART;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-    result->opt[0] = b;
+    result->src[0] = a;
+    result->src[1] = NULL;
+    result->src[2] = b;
 
     return result;
 }
@@ -7442,8 +7544,8 @@ struct ggml_tensor * ggml_map_unary_impl_f32(
 
     result->op = GGML_OP_MAP_UNARY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->opt[0] = addr_tensor;
+    result->src[0] = a;
+    result->src[2] = addr_tensor;
 
     return result;
 }
@@ -7489,9 +7591,9 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
 
     result->op = GGML_OP_MAP_BINARY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-    result->opt[0] = addr_tensor;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = addr_tensor;
 
     return result;
 }
@@ -7536,8 +7638,8 @@ struct ggml_tensor * ggml_map_custom1_impl_f32(
 
     result->op = GGML_OP_MAP_CUSTOM1;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->opt[0] = addr_tensor;
+    result->src[0] = a;
+    result->src[2] = addr_tensor;
 
     return result;
 }
@@ -7581,9 +7683,9 @@ struct ggml_tensor * ggml_map_custom2_impl_f32(
 
     result->op = GGML_OP_MAP_CUSTOM2;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-    result->opt[0] = addr_tensor;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = addr_tensor;
 
     return result;
 }
@@ -7630,10 +7732,10 @@ struct ggml_tensor * ggml_map_custom3_impl_f32(
 
     result->op = GGML_OP_MAP_CUSTOM3;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-    result->opt[0] = addr_tensor;
-    result->opt[1] = c;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = addr_tensor;
+    result->src[3] = c;
 
     return result;
 }
@@ -7673,8 +7775,8 @@ struct ggml_tensor * ggml_cross_entropy_loss(
 
     result->op = GGML_OP_CROSS_ENTROPY_LOSS;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -7693,9 +7795,9 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
 
     result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK;
     result->grad = NULL;
-    result->src0 = a;
-    result->src1 = b;
-    result->opt[0] = c;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
 
     return result;
 }
@@ -8296,7 +8398,7 @@ static void ggml_compute_forward_add_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -8321,23 +8423,23 @@ static void ggml_compute_forward_add_f32(
 
     if (nb10 == sizeof(float)) {
         for (int ir = ir0; ir < ir1; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
 
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
 
 #ifdef GGML_USE_ACCELERATE
-            vDSP_vadd(
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
-                    ne0);
+            vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
 #else
-            ggml_vec_add_f32(ne0,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+            ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
 #endif
             // }
             // }
@@ -8345,15 +8447,20 @@ static void ggml_compute_forward_add_f32(
     } else {
         // src1 is not contiguous
         for (int ir = ir0; ir < ir1; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
 
-            float * dst_ptr  = (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
             for (int i0 = 0; i0 < ne0; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
 
                 dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
             }
|
|
10532
10639
|
}
|
10533
10640
|
}
|
10534
10641
|
|
10535
|
-
|
10536
10642
|
// ggml_compute_forward_mul_mat
|
10537
10643
|
|
10538
10644
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
@@ -10576,17 +10682,17 @@ static void ggml_compute_forward_mul_mat(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne03 == ne13);
-    GGML_ASSERT(ne2  == ne12);
-    GGML_ASSERT(ne3  == ne13);
-
     const enum ggml_type type = src0->type;
 
     ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
     enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
     ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
 
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
     GGML_ASSERT(nb10 == sizeof(float));
@@ -10597,16 +10703,16 @@ static void ggml_compute_forward_mul_mat(
     GGML_ASSERT(nb1 <= nb2);
     GGML_ASSERT(nb2 <= nb3);
 
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
-
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
 #if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+        // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
+        //       ref: https://github.com/ggerganov/ggml/pull/224
+        GGML_ASSERT(ne02 == ne12);
+        GGML_ASSERT(ne03 == ne13);
+
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
         }
@@ -10616,6 +10722,11 @@ static void ggml_compute_forward_mul_mat(
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+        // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
+        //       ref: https://github.com/ggerganov/ggml/pull/224
+        GGML_ASSERT(ne02 == ne12);
+        GGML_ASSERT(ne03 == ne13);
+
         if (params->ith != 0) {
             return;
         }
@@ -10685,43 +10796,44 @@ static void ggml_compute_forward_mul_mat(
         return;
     }
 
-    // parallelize by src0 rows
-
-    // total rows in src0
-    const int nr = ne01*ne02*ne03;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
+    // parallelize by src0 rows
+    const int64_t dr = (ne01 + nth - 1)/nth;
+
+    const int64_t ir10 = dr*ith;
+    const int64_t ir11 = MIN(ir10 + dr, ne01);
+
+    // src1 rows
+    const int64_t nr1 = ne11*ne12*ne13;
 
     void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 indices
-        const int i03 = ir/(ne02*ne01);
-        const int i02 = (ir - i03*ne02*ne01)/ne01;
-        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-        const int i13 = i03;
-        const int i12 = i02;
-
-        const int i0 = i01;
-        const int i2 = i02;
-        const int i3 = i03;
-
-        void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
-        char * src1_col = ((char *) wdata + ((0 + i12*ne11 + i13*ne12*ne11)*row_size));
-
-        float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
-
-        for (int64_t ic = 0; ic < ne11; ++ic) {
-            vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
-        }
+    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+
+    for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
+        const int64_t i13 = (ir1/(ne12*ne11));
+        const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
+        const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
+
+        const int64_t ir0 = (ir1/ne11)%(ne02*ne03);
+        const int64_t i03 = (ir0/(ne02));
+        // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2.
+        // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470:
+        // GG: this is likely the correct way to broadcast, though need some more thought
+        //     therefore leaving the comments to remind us for now
+        const int64_t i02 = (i12 / (ne12 / ne02));
+        // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon)
+        // const int64_t i02 = (ir0 - i03*ne02);
+
+        const int64_t i1 = i11;
+        const int64_t i2 = i12;
+        const int64_t i3 = i13;
+
+        const char * src0_row = (const char *) src0->data + (  0 + i02*nb02 + i03*nb03);
+        const char * src1_col = (const char *) wdata + (i11 + i12*ne11 + i13*ne12*ne11)*row_size;
+
+        float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+        for (int64_t ir = ir10; ir < ir11; ++ir) {
+            vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
+        }
     }
 }
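The rewritten kernel splits src0's rows evenly across threads with a ceiling division, then walks every src1 row per thread. A sketch of the partitioning (illustrative helper, not the gem's API):

#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

// Sketch of the per-thread row partitioning above: ne01 rows split into
// nth near-equal chunks; thread ith gets the half-open range [ir10, ir11).
static void thread_row_range(int64_t ne01, int nth, int ith,
                             int64_t * ir10, int64_t * ir11) {
    const int64_t dr = (ne01 + nth - 1)/nth; // ceiling division
    *ir10 = dr*ith;
    *ir11 = MIN(*ir10 + dr, ne01);
}
// Example: 10 rows on 4 threads -> dr = 3, ranges [0,3) [3,6) [6,9) [9,10).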
@@ -11718,7 +11830,7 @@ static void ggml_compute_forward_alibi_f32(
 
     const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
     const int ne1 = src0->ne[1]; // seq_len_without_past
-    //const int ne2 = src0->ne[2]; // n_head -> this is k
+    const int ne2 = src0->ne[2]; // n_head -> this is k
     //const int ne3 = src0->ne[3]; // 1 -> bsz
 
     const int n = ggml_nrows(src0);
@@ -11729,8 +11841,9 @@ static void ggml_compute_forward_alibi_f32(
     const int nb2 = src0->nb[2];
     //const int nb3 = src0->nb[3];
 
-    assert(nb0 == sizeof(float));
-    assert(ne1 + n_past == ne0); (void) n_past;
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(ne1 + n_past == ne0);
+    GGML_ASSERT(n_head == ne2);
 
     // add alibi to src0 (KQ_scaled)
     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -11754,7 +11867,7 @@ static void ggml_compute_forward_alibi_f32(
                 m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
             }
 
-            pdst[0] = (i-ne0+1) * m_k + src[0];
+            pdst[0] = i * m_k + src[0];
 
         }
     }
@@ -11783,7 +11896,7 @@ static void ggml_compute_forward_alibi_f16(
 
     const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
     const int ne1 = src0->ne[1]; // seq_len_without_past
-    //const int ne2 = src0->ne[2]; // n_head -> this is k
+    const int ne2 = src0->ne[2]; // n_head -> this is k
     //const int ne3 = src0->ne[3]; // 1 -> bsz
 
     const int n = ggml_nrows(src0);
@@ -11794,8 +11907,9 @@ static void ggml_compute_forward_alibi_f16(
     const int nb2 = src0->nb[2];
     //const int nb3 = src0->nb[3];
 
-    assert(nb0 == sizeof(ggml_fp16_t));
-    assert(ne1 + n_past == ne0); (void) n_past;
+    GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
+    GGML_ASSERT(n_head == ne2);
 
     // add alibi to src0 (KQ_scaled)
     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -11820,7 +11934,7 @@ static void ggml_compute_forward_alibi_f16(
             }
 
             // we return F32
-            pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]);
+            pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
         }
     }
 }
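The asserts and the pdst fix above concern ALiBi, where head k receives a fixed slope m_k; a second geometric series covers head counts that are not powers of two. A standalone sketch of the slope computation used by this kernel (the value printed in main is illustrative):

#include <math.h>
#include <stdio.h>

// Sketch of the ALiBi slope for head k, matching the m0/m1 series in the
// kernel above: heads below the power-of-2 floor use m0^(k+1), the rest
// interleave on m1.
static float alibi_slope(int k, int n_head) {
    const int   n_heads_log2_floor = 1 << (int) floor(log2(n_head));
    const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
    const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
    if (k < n_heads_log2_floor) {
        return powf(m0, k + 1);
    }
    return powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}

int main(void) {
    printf("%f\n", alibi_slope(0, 8)); // 0.5 for the first of 8 heads
    return 0;
}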
@@ -12904,16 +13018,18 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
     {
         ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
 
-        for (int i12 = 0; i12 < ne12; i12++) {
-            const float * const src = (float *)((char *) src1->data + i12*nb12);
-            ggml_fp16_t * dst_data = wdata;
-
-            for (int i1 = 0; i1 < ne1; i1++) {
-                for (int i0 = 0; i0 < ne0; i0++) {
-                    for (int ik1 = 0; ik1 < nk1; ik1++) {
-                        for (int ik0 = 0; ik0 < nk0; ik0++) {
-                            dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
-                                GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
+        for (int i13 = 0; i13 < ne13; i13++) {
+            for (int i12 = 0; i12 < ne12; i12++) {
+                const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12);
+                ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0);
+
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        for (int ik1 = 0; ik1 < nk1; ik1++) {
+                            for (int ik0 = 0; ik0 < nk0; ik0++) {
+                                dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
+                                    GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
+                            }
                         }
                     }
                 }
@@ -12940,14 +13056,16 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
 
     ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
 
-    for (int i2 = ip0; i2 < ip1; i2++) {
-        float * dst_data = (float *)((char *) dst->data + i2*nb2);
-
-        for (int i1 = 0; i1 < ne1; ++i1) {
-            for (int i0 = 0; i0 < ne0; ++i0) {
-                ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
-                        (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
-                        (ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0);
+    for (int i3 = 0; i3 < ne3; i3++) {
+        for (int i2 = ip0; i2 < ip1; i2++) {
+            float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2);
+
+            for (int i1 = 0; i1 < ne1; ++i1) {
+                for (int i0 = 0; i0 < ne0; ++i0) {
+                    ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
+                            (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
+                            (ggml_fp16_t *)                wdata + i3*nb3 + (i1*ne0 + i0)*ew0);
+                }
             }
         }
     }
@@ -12996,10 +13114,169 @@ static void ggml_compute_forward_conv_2d(
|
|
12996
13114
|
|
12997
+    if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
         ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
-    }
-    else {
+    } else {
         GGML_ASSERT(false); // only stride equal to kernel size is supported
-    }
+    }
+}
+
+// ggml_compute_forward_pool_1d_sk_p0
+
+static void ggml_compute_forward_pool_1d_sk_p0(
+        const struct ggml_compute_params * params,
+        const enum ggml_op_pool op,
+        const struct ggml_tensor * src,
+        const int k,
+        struct ggml_tensor * dst) {
+    assert(src->type == GGML_TYPE_F32);
+    assert(params->ith == 0);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const char * cdata = (const char *)src->data;
+    const char * const data_end = cdata + ggml_nbytes(src);
+    float * drow = (float *)dst->data;
+
+    const int64_t rs = dst->ne[0];
+
+    while (cdata < data_end) {
+        const float * const srow = (const float *)cdata;
+
+        int j = 0;
+
+        for (int64_t i = 0; i < rs; ++i) {
+            switch (op) {
+                case GGML_OP_POOL_AVG:   drow[i] = 0;        break;
+                case GGML_OP_POOL_MAX:   drow[i] = -FLT_MAX; break;
+                case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
+            }
+            for (int ki = 0; ki < k; ++ki) {
+                switch (op) {
+                    case GGML_OP_POOL_AVG:   drow[i] += srow[j]; break;
+                    case GGML_OP_POOL_MAX:   if (srow[j] > drow[i]) drow[i] = srow[j]; break;
+                    case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
+                }
+                ++j;
+            }
+            switch (op) {
+                case GGML_OP_POOL_AVG:   drow[i] /= k;       break;
+                case GGML_OP_POOL_MAX:                       break;
+                case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
+            }
+        }
+
+        cdata += src->nb[1];
+        drow  += rs;
+    }
+}
+
+// ggml_compute_forward_pool_1d
+
+static void ggml_compute_forward_pool_1d(
+        const struct ggml_compute_params* params,
+        const struct ggml_tensor* src0,
+        const struct ggml_tensor* opt0,
+        struct ggml_tensor* dst) {
+    GGML_ASSERT(opt0->ne[0] == 4);
+    const int* opts = (const int*)opt0->data;
+    enum ggml_op_pool op = opts[0];
+    const int k0 = opts[1];
+    const int s0 = opts[2];
+    const int p0 = opts[3];
+    GGML_ASSERT(p0 == 0); // padding not supported
+    GGML_ASSERT(k0 == s0); // only s = k supported
+
+    ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
+}
+
+// ggml_compute_forward_pool_2d_sk_p0
+
+static void ggml_compute_forward_pool_2d_sk_p0(
+        const struct ggml_compute_params * params,
+        const enum ggml_op_pool op,
+        const struct ggml_tensor * src,
+        const int k0,
+        const int k1,
+        struct ggml_tensor * dst) {
+    assert(src->type == GGML_TYPE_F32);
+    assert(params->ith == 0);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const char * cdata = (const char*)src->data;
+    const char * const data_end = cdata + ggml_nbytes(src);
+
+    const int64_t px = dst->ne[0];
+    const int64_t py = dst->ne[1];
+    const int64_t pa = px * py;
+
+    float * dplane = (float *)dst->data;
+
+    const int ka = k0 * k1;
+
+    while (cdata < data_end) {
+        for (int oy = 0; oy < py; ++oy) {
+            float * const drow = dplane + oy * px;
+            for (int ox = 0; ox < px; ++ox) {
+                float * const out = drow + ox;
+                switch (op) {
+                    case GGML_OP_POOL_AVG:   *out = 0;           break;
+                    case GGML_OP_POOL_MAX:   *out = -FLT_MAX;    break;
+                    case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
+                }
+
+                const int ix = ox * k0;
+                const int iy = oy * k1;
+
+                for (int ky = 0; ky < k1; ++ky) {
+                    const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
+                    for (int kx = 0; kx < k0; ++kx) {
+                        int j = ix + kx;
+                        switch (op) {
+                            case GGML_OP_POOL_AVG:   *out += srow[j]; break;
+                            case GGML_OP_POOL_MAX:   if (srow[j] > *out) *out = srow[j]; break;
+                            case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
+                        }
+                    }
+                }
+                switch (op) {
+                    case GGML_OP_POOL_AVG:   *out /= ka;         break;
+                    case GGML_OP_POOL_MAX:                       break;
+                    case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
+                }
+            }
+        }
+
+        cdata  += src->nb[2];
+        dplane += pa;
+    }
+}
+
+// ggml_compute_forward_pool_2d
+
+static void ggml_compute_forward_pool_2d(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * opt0,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(opt0->ne[0] == 7);
+    const int* opts = (const int*)opt0->data;
+    enum ggml_op_pool op = opts[0];
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
+    GGML_ASSERT(p0 == 0);
+    GGML_ASSERT(p1 == 0); // padding not supported
+    GGML_ASSERT(k0 == s0);
+    GGML_ASSERT(k1 == s1); // only s = k supported
+
+    ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
 }
 
 
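The two `*_sk_p0` kernels above only handle the case stride == kernel with zero padding, so every output element reduces a disjoint window of the input. A minimal standalone sketch of the same 1D average-pooling arithmetic (hypothetical `avg_pool_1d_sk_p0` helper written for illustration, not the ggml API):

    #include <stdio.h>

    // Average-pool n values with kernel == stride == k, no padding.
    // Writes n/k outputs; mirrors the GGML_OP_POOL_AVG branches above.
    static void avg_pool_1d_sk_p0(const float * src, int n, int k, float * dst) {
        for (int i = 0; i < n / k; ++i) {
            float acc = 0.0f;
            for (int ki = 0; ki < k; ++ki) {
                acc += src[i * k + ki]; // disjoint window because stride == kernel
            }
            dst[i] = acc / k;
        }
    }

    int main(void) {
        const float x[6] = { 1, 2, 3, 4, 5, 6 };
        float y[3];
        avg_pool_1d_sk_p0(x, 6, 2, y);
        printf("%g %g %g\n", y[0], y[1], y[2]); // prints: 1.5 3.5 5.5
        return 0;
    }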
@@ -14566,287 +14843,295 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     if (skip_cpu) {
         return;
     }
-    GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU);
-    GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
+    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
+    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
 #endif // GGML_USE_CUBLAS
 
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
-                ggml_compute_forward_dup(params, tensor->src0, tensor);
+                ggml_compute_forward_dup(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_ADD:
             {
-                ggml_compute_forward_add(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_ADD1:
             {
-                ggml_compute_forward_add1(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_ACC:
             {
-                ggml_compute_forward_acc(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
+                ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
             } break;
         case GGML_OP_SUB:
             {
-                ggml_compute_forward_sub(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_MUL:
             {
-                ggml_compute_forward_mul(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_DIV:
             {
-                ggml_compute_forward_div(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_SQR:
             {
-                ggml_compute_forward_sqr(params, tensor->src0, tensor);
+                ggml_compute_forward_sqr(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_SQRT:
             {
-                ggml_compute_forward_sqrt(params, tensor->src0, tensor);
+                ggml_compute_forward_sqrt(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_LOG:
             {
-                ggml_compute_forward_log(params, tensor->src0, tensor);
+                ggml_compute_forward_log(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_SUM:
             {
-                ggml_compute_forward_sum(params, tensor->src0, tensor);
+                ggml_compute_forward_sum(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_SUM_ROWS:
             {
-                ggml_compute_forward_sum_rows(params, tensor->src0, tensor);
+                ggml_compute_forward_sum_rows(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_MEAN:
             {
-                ggml_compute_forward_mean(params, tensor->src0, tensor);
+                ggml_compute_forward_mean(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_ARGMAX:
             {
-                ggml_compute_forward_argmax(params, tensor->src0, tensor);
+                ggml_compute_forward_argmax(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_REPEAT:
             {
-                ggml_compute_forward_repeat(params, tensor->src0, tensor);
+                ggml_compute_forward_repeat(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_REPEAT_BACK:
             {
-                ggml_compute_forward_repeat_back(params, tensor->src0, tensor);
+                ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_ABS:
             {
-                ggml_compute_forward_abs(params, tensor->src0, tensor);
+                ggml_compute_forward_abs(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_SGN:
             {
-                ggml_compute_forward_sgn(params, tensor->src0, tensor);
+                ggml_compute_forward_sgn(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_NEG:
             {
-                ggml_compute_forward_neg(params, tensor->src0, tensor);
+                ggml_compute_forward_neg(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_STEP:
             {
-                ggml_compute_forward_step(params, tensor->src0, tensor);
+                ggml_compute_forward_step(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_TANH:
             {
-                ggml_compute_forward_tanh(params, tensor->src0, tensor);
+                ggml_compute_forward_tanh(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_ELU:
             {
-                ggml_compute_forward_elu(params, tensor->src0, tensor);
+                ggml_compute_forward_elu(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_RELU:
             {
-                ggml_compute_forward_relu(params, tensor->src0, tensor);
+                ggml_compute_forward_relu(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_GELU:
             {
-                ggml_compute_forward_gelu(params, tensor->src0, tensor);
+                ggml_compute_forward_gelu(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_GELU_QUICK:
             {
-                ggml_compute_forward_gelu_quick(params, tensor->src0, tensor);
+                ggml_compute_forward_gelu_quick(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_SILU:
             {
-                ggml_compute_forward_silu(params, tensor->src0, tensor);
+                ggml_compute_forward_silu(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_SILU_BACK:
             {
-                ggml_compute_forward_silu_back(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_NORM:
             {
-                ggml_compute_forward_norm(params, tensor->src0, tensor);
+                ggml_compute_forward_norm(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_RMS_NORM:
             {
-                ggml_compute_forward_rms_norm(params, tensor->src0, tensor);
+                ggml_compute_forward_rms_norm(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_RMS_NORM_BACK:
             {
-                ggml_compute_forward_rms_norm_back(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_MUL_MAT:
             {
-                ggml_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_OUT_PROD:
             {
-                ggml_compute_forward_out_prod(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_SCALE:
             {
-                ggml_compute_forward_scale(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_SET:
             {
-                ggml_compute_forward_set(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
+                ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
             } break;
         case GGML_OP_CPY:
             {
-                ggml_compute_forward_cpy(params, tensor->src0, tensor);
+                ggml_compute_forward_cpy(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_CONT:
             {
-                ggml_compute_forward_cont(params, tensor->src0, tensor);
+                ggml_compute_forward_cont(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_RESHAPE:
             {
-                ggml_compute_forward_reshape(params, tensor->src0, tensor);
+                ggml_compute_forward_reshape(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_VIEW:
             {
-                ggml_compute_forward_view(params, tensor->src0);
+                ggml_compute_forward_view(params, tensor->src[0]);
             } break;
         case GGML_OP_PERMUTE:
             {
-                ggml_compute_forward_permute(params, tensor->src0);
+                ggml_compute_forward_permute(params, tensor->src[0]);
             } break;
         case GGML_OP_TRANSPOSE:
             {
-                ggml_compute_forward_transpose(params, tensor->src0);
+                ggml_compute_forward_transpose(params, tensor->src[0]);
             } break;
         case GGML_OP_GET_ROWS:
             {
-                ggml_compute_forward_get_rows(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_GET_ROWS_BACK:
             {
-                ggml_compute_forward_get_rows_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
+                ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
             } break;
         case GGML_OP_DIAG:
             {
-                ggml_compute_forward_diag(params, tensor->src0, tensor);
+                ggml_compute_forward_diag(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_DIAG_MASK_INF:
             {
-                ggml_compute_forward_diag_mask_inf(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_DIAG_MASK_ZERO:
             {
-                ggml_compute_forward_diag_mask_zero(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_SOFT_MAX:
             {
-                ggml_compute_forward_soft_max(params, tensor->src0, tensor);
+                ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_SOFT_MAX_BACK:
             {
-                ggml_compute_forward_soft_max_back(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_ROPE:
             {
-                ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_ROPE_BACK:
             {
-                ggml_compute_forward_rope_back(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_ALIBI:
             {
-                ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_alibi(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_CLAMP:
             {
-                ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_clamp(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_CONV_1D:
             {
-                ggml_compute_forward_conv_1d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
+                ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
             } break;
         case GGML_OP_CONV_2D:
             {
-                ggml_compute_forward_conv_2d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
+                ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+            } break;
+        case GGML_OP_POOL_1D:
+            {
+                ggml_compute_forward_pool_1d(params, tensor->src[0], tensor->src[1], tensor);
+            } break;
+        case GGML_OP_POOL_2D:
+            {
+                ggml_compute_forward_pool_2d(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_FLASH_ATTN:
             {
-                const int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
+                const int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
                 GGML_ASSERT(t == 0 || t == 1);
                 const bool masked = t != 0;
-                ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor);
+                ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
             } break;
         case GGML_OP_FLASH_FF:
             {
-                ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor);
+                ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor);
             } break;
         case GGML_OP_FLASH_ATTN_BACK:
             {
-                int32_t t = ggml_get_i32_1d(tensor->opt[2], 0);
+                int32_t t = ggml_get_i32_1d(tensor->src[4], 0);
                 GGML_ASSERT(t == 0 || t == 1);
                 bool masked = t != 0;
-                ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor);
+                ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
             } break;
         case GGML_OP_WIN_PART:
             {
-                ggml_compute_forward_win_part(params, tensor->src0, tensor->opt[0], tensor);
+                ggml_compute_forward_win_part(params, tensor->src[0], tensor->src[2], tensor);
             } break;
         case GGML_OP_WIN_UNPART:
             {
-                ggml_compute_forward_win_unpart(params, tensor->src0, tensor->opt[0], tensor);
+                ggml_compute_forward_win_unpart(params, tensor->src[0], tensor->src[2], tensor);
             } break;
         case GGML_OP_MAP_UNARY:
             {
-                const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
-                ggml_compute_forward_map_unary(params, tensor->src0, tensor, fun);
+                const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->src[2]->data);
+                ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun);
             }
             break;
         case GGML_OP_MAP_BINARY:
             {
-                const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->opt[0]->data);
-                ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
+                const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->src[2]->data);
+                ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
             }
             break;
        case GGML_OP_MAP_CUSTOM1:
            {
-                const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->opt[0]->data);
-                ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun);
+                const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->src[2]->data);
+                ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun);
            }
            break;
        case GGML_OP_MAP_CUSTOM2:
            {
-                const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->opt[0]->data);
-                ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun);
+                const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->src[2]->data);
+                ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun);
            }
            break;
        case GGML_OP_MAP_CUSTOM3:
            {
-                const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->opt[0]->data);
-                ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun);
+                const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->src[2]->data);
+                ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[3], tensor, fun);
            }
            break;
        case GGML_OP_CROSS_ENTROPY_LOSS:
            {
-                ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor);
+                ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor);
            }
            break;
        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
            {
-                ggml_compute_forward_cross_entropy_loss_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
+                ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
            }
            break;
        case GGML_OP_NONE:
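Every change in this dispatch is the same mechanical substitution: the per-tensor fields src0, src1 and opt[] become a single source array, so extra operands such as the conv and flash-attn parameter tensors move from opt[i] to src[2 + i]. A sketch of the shape of the refactor (field lists abridged, and GGML_MAX_SRC assumed to be 6, i.e. the old 2 + GGML_MAX_OPT; see ggml.h in this release for the real struct):

    #define GGML_MAX_SRC 6

    struct tensor_before {                        // 0.3.2 layout (abridged)
        struct tensor_before * src0;
        struct tensor_before * src1;
        struct tensor_before * opt[4];            // GGML_MAX_OPT
    };

    struct tensor_after {                         // 0.3.3 layout (abridged)
        // src0 -> src[0], src1 -> src[1], opt[i] -> src[2 + i]
        struct tensor_after * src[GGML_MAX_SRC];
    };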
@@ -14863,8 +15148,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
 ////////////////////////////////////////////////////////////////////////////////
 
 static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {
-    struct ggml_tensor * src0 = tensor->src0;
-    struct ggml_tensor * src1 = tensor->src1;
+    struct ggml_tensor * src0 = tensor->src[0];
+    struct ggml_tensor * src1 = tensor->src[1];
 
     switch (tensor->op) {
         case GGML_OP_DUP:
@@ -14900,12 +15185,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
                 }
                 if (src1->grad) {
-                    GGML_ASSERT(ggml_nelements(tensor->opt[0]) == 5);
-                    GGML_ASSERT(tensor->opt[0]->type == GGML_TYPE_I32);
-                    const size_t nb1     = (( int32_t * ) tensor->opt[0]->data)[0];
-                    const size_t nb2     = (( int32_t * ) tensor->opt[0]->data)[1];
-                    const size_t nb3     = (( int32_t * ) tensor->opt[0]->data)[2];
-                    const size_t offset  = (( int32_t * ) tensor->opt[0]->data)[3];
+                    GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5);
+                    GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32);
+                    const size_t nb1     = (( int32_t * ) tensor->src[2]->data)[0];
+                    const size_t nb2     = (( int32_t * ) tensor->src[2]->data)[1];
+                    const size_t nb3     = (( int32_t * ) tensor->src[2]->data)[2];
+                    const size_t offset  = (( int32_t * ) tensor->src[2]->data)[3];
 
                     struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
                         tensor->grad,
@@ -15213,12 +15498,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_SET:
             {
-                GGML_ASSERT(ggml_nelements(tensor->opt[0]) == 5);
-                GGML_ASSERT(tensor->opt[0]->type == GGML_TYPE_I32);
-                const size_t nb1     = (( int32_t * ) tensor->opt[0]->data)[0];
-                const size_t nb2     = (( int32_t * ) tensor->opt[0]->data)[1];
-                const size_t nb3     = (( int32_t * ) tensor->opt[0]->data)[2];
-                const size_t offset  = (( int32_t * ) tensor->opt[0]->data)[3];
+                GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5);
+                GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32);
+                const size_t nb1     = (( int32_t * ) tensor->src[2]->data)[0];
+                const size_t nb2     = (( int32_t * ) tensor->src[2]->data)[1];
+                const size_t nb3     = (( int32_t * ) tensor->src[2]->data)[2];
+                const size_t offset  = (( int32_t * ) tensor->src[2]->data)[3];
 
                 struct ggml_tensor * tensor_grad_view = NULL;
 
@@ -15295,8 +15580,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             if (src0->grad) {
                 size_t offset;
 
-                GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->opt[0]));
-                memcpy(&offset, tensor->opt[0]->data, sizeof(offset));
+                GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->src[2]));
+                memcpy(&offset, tensor->src[2]->data, sizeof(offset));
 
                 size_t nb1 = tensor->nb[1];
                 size_t nb2 = tensor->nb[2];
@@ -15323,7 +15608,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 // necessary for llama
                 if (src0->grad) {
-                    int32_t * axes = (int32_t *) tensor->opt[0]->data;
+                    int32_t * axes = (int32_t *) tensor->src[2]->data;
                     int axis0 = axes[0] & 0x3;
                     int axis1 = axes[1] & 0x3;
                     int axis2 = axes[2] & 0x3;
@@ -15483,18 +15768,26 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_POOL_1D:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
+        case GGML_OP_POOL_2D:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 struct ggml_tensor * flash_grad = NULL;
-                if (src0->grad || src1->grad || tensor->opt[0]->grad) {
-                    int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
+                if (src0->grad || src1->grad || tensor->src[2]->grad) {
+                    int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
                     GGML_ASSERT(t == 0 || t == 1);
                     bool masked = t != 0;
                     flash_grad =
                         ggml_flash_attn_back(ctx,
                             src0,
                             src1,
-                            tensor->opt[0],
+                            tensor->src[2],
                             tensor->grad,
                             masked);
                 }
@@ -15591,7 +15884,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             inplace);
                 }
 
-                struct ggml_tensor * opt0 = tensor->opt[0];
+                struct ggml_tensor * opt0 = tensor->src[2];
 
                 if (opt0->grad) {
                     struct ggml_tensor * grad_v = NULL;
@@ -15707,17 +16000,9 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         }
     }
 
-    if (node->src0) {
-        ggml_visit_parents(cgraph, node->src0);
-    }
-
-    if (node->src1) {
-        ggml_visit_parents(cgraph, node->src1);
-    }
-
-    for (int i = 0; i < GGML_MAX_OPT; ++i) {
-        if (node->opt[i]) {
-            ggml_visit_parents(cgraph, node->opt[i]);
+    for (int i = 0; i < GGML_MAX_SRC; ++i) {
+        if (node->src[i]) {
+            ggml_visit_parents(cgraph, node->src[i]);
         }
     }
 
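With the sources unified, the graph builder visits every parent through one loop instead of three special cases. A standalone sketch of the traversal pattern (hypothetical node type written for illustration, not the ggml structs):

    #define MAX_SRC 6

    struct node {
        struct node * src[MAX_SRC];
        int visited;
    };

    static void visit_parents(struct node * n) {
        if (n->visited) {
            return;
        }
        n->visited = 1;
        for (int i = 0; i < MAX_SRC; ++i) {
            if (n->src[i]) {
                visit_parents(n->src[i]); // recurse into every populated source slot
            }
        }
    }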
@@ -15772,9 +16057,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
     struct ggml_cgraph result = {
         /*.n_nodes      =*/ 0,
         /*.n_leafs      =*/ 0,
-        /*.n_threads    =*/ GGML_DEFAULT_N_THREADS,
-        /*.work_size    =*/ 0,
-        /*.work         =*/ NULL,
         /*.nodes        =*/ { NULL },
         /*.grads        =*/ { NULL },
         /*.leafs        =*/ { NULL },
@@ -15945,16 +16227,20 @@ void clear_numa_thread_affinity(void) {}
 #endif
 
 struct ggml_compute_state_shared {
-    struct ggml_cgraph * cgraph;
+    const struct ggml_cgraph * cgraph;
+    const struct ggml_cplan  * cplan;
 
     int64_t perf_node_start_cycles;
     int64_t perf_node_start_time_us;
 
-    int n_threads;
+    const int n_threads;
 
     // synchronization primitives
     atomic_int n_active; // num active threads
     atomic_int node_n;   // active graph node
+
+    bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
+    void * abort_callback_data;
 };
 
 struct ggml_compute_state {
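The new abort hook gives callers a way to bail out of a long graph computation; the worker threads poll it at node boundaries (see the checks added to ggml_graph_compute_thread below). A minimal sketch of wiring it up, assuming only the cplan fields shown in this diff (abort_callback and abort_callback_data):

    #include <stdbool.h>

    // Returns true when computation should stop; ggml passes back the
    // opaque pointer stored in abort_callback_data.
    static bool should_abort(void * data) {
        const volatile bool * stop = data; // e.g. flipped by a signal handler
        return *stop;
    }

    // ... after building a plan (see ggml_graph_plan below):
    //   static volatile bool g_stop = false;
    //   cplan.abort_callback      = should_abort;
    //   cplan.abort_callback_data = (void *) &g_stop;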
@@ -15974,14 +16260,22 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
 
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
-    struct ggml_cgraph * cgraph = state->shared->cgraph;
 
-    const int n_threads = state->shared->n_threads;
+    const struct ggml_cgraph * cgraph = state->shared->cgraph;
+    const struct ggml_cplan  * cplan  = state->shared->cplan;
+
+    const int * n_tasks_arr = cplan->n_tasks;
+    const int   n_threads   = state->shared->n_threads;
+
     set_numa_thread_affinity(state->ith, n_threads);
 
     int node_n = -1;
 
     while (true) {
+        if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
+            state->shared->node_n += 1;
+            return (thread_ret_t) GGML_EXIT_ABORTED;
+        }
         if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
             // all other threads are finished and spinning
             // do finalize and init here so we don't have synchronize again
@@ -15989,15 +16283,15 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 /*.type  =*/ GGML_TASK_FINALIZE,
                 /*.ith   =*/ 0,
                 /*.nth   =*/ 0,
-                /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+                /*.wsize =*/ cplan->work_size,
+                /*.wdata =*/ cplan->work_data,
             };
 
             if (node_n != -1) {
                 /* FINALIZE */
                 struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
                 if (GGML_OP_HAS_FINALIZE[node->op]) {
-                    params.nth = node->n_tasks;
+                    params.nth = n_tasks_arr[node_n];
                     ggml_compute_forward(&params, node);
                     ggml_graph_compute_perf_stats_node(node, state->shared);
                 }
@@ -16008,11 +16302,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
 
             struct ggml_tensor * node = cgraph->nodes[node_n];
+            const int n_tasks = n_tasks_arr[node_n];
 
             state->shared->perf_node_start_cycles  = ggml_perf_cycles();
             state->shared->perf_node_start_time_us = ggml_perf_time_us();
 
-            params.nth = node->n_tasks;
+            params.nth = n_tasks;
 
             /* INIT */
             if (GGML_OP_HAS_INIT[node->op]) {
@@ -16020,7 +16315,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 ggml_compute_forward(&params, node);
             }
 
-            if (node->n_tasks == 1) {
+            if (n_tasks == 1) {
                 // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
                 // they do something more efficient than spinning (?)
                 params.type = GGML_TASK_COMPUTE;
@@ -16034,6 +16329,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             } else {
                 break;
             }
+
+            if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
+                break;
+            }
         }
 
         atomic_store(&state->shared->n_active, n_threads);
|
@@ -16042,7 +16341,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
16042
16341
|
// wait for other threads to finish
|
16043
16342
|
const int last = node_n;
|
16044
16343
|
do {
|
16045
|
-
sched_yield();
|
16344
|
+
//sched_yield();
|
16046
16345
|
node_n = atomic_load(&state->shared->node_n);
|
16047
16346
|
} while (node_n == last);
|
16048
16347
|
}
|
@@ -16052,366 +16351,395 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         /* COMPUTE */
         struct ggml_tensor * node = cgraph->nodes[node_n];
+        const int n_tasks = n_tasks_arr[node_n];
 
         struct ggml_compute_params params = {
             /*.type  =*/ GGML_TASK_COMPUTE,
             /*.ith   =*/ state->ith,
-            /*.nth   =*/ node->n_tasks,
-            /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-            /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+            /*.nth   =*/ n_tasks,
+            /*.wsize =*/ cplan->work_size,
+            /*.wdata =*/ cplan->work_data,
         };
 
-        if (state->ith < node->n_tasks) {
+        if (state->ith < n_tasks) {
             ggml_compute_forward(&params, node);
         }
     }
 
-    return 0;
+    return GGML_EXIT_SUCCESS;
 }
 
-void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    const int n_threads = cgraph->n_threads;
-
-    struct ggml_compute_state_shared state_shared = {
-        /*.cgraph                  =*/ cgraph,
-        /*.perf_node_start_cycles  =*/ 0,
-        /*.perf_node_start_time_us =*/ 0,
-        /*.n_threads               =*/ n_threads,
-        /*.n_active                =*/ n_threads,
-        /*.node_n                  =*/ -1,
-    };
-    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
-
-    // initialize tasks + work buffer
-    {
-        size_t work_size = 0;
-
-        // thread scheduling for the different operations
-        for (int i = 0; i < cgraph->n_nodes; i++) {
-            struct ggml_tensor * node = cgraph->nodes[i];
-
-            switch (node->op) {
-                case GGML_OP_CPY:
-                case GGML_OP_DUP:
-                    {
-                        node->n_tasks = n_threads;
-
-                        size_t cur = 0;
-                        if (ggml_is_quantized(node->type)) {
-                            cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads;
-                        }
-
-                        work_size = MAX(work_size, cur);
-                    } break;
-                case GGML_OP_ADD:
-                case GGML_OP_ADD1:
-                    {
-                        node->n_tasks = n_threads;
-
-                        size_t cur = 0;
-                        if (ggml_is_quantized(node->src0->type)) {
-                            cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads;
-                        }
-
-                        work_size = MAX(work_size, cur);
-                    } break;
-                case GGML_OP_ACC:
-                    {
-                        node->n_tasks = n_threads;
-
-                        size_t cur = 0;
-                        if (ggml_is_quantized(node->src0->type)) {
-                            cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads;
-                        }
-
-                        work_size = MAX(work_size, cur);
-                    } break;
-                case GGML_OP_SUB:
-                case GGML_OP_DIV:
-                case GGML_OP_SQR:
-                case GGML_OP_SQRT:
-                case GGML_OP_LOG:
-                case GGML_OP_SUM:
-                case GGML_OP_SUM_ROWS:
-                case GGML_OP_MEAN:
-                case GGML_OP_ARGMAX:
-                case GGML_OP_REPEAT:
-                case GGML_OP_REPEAT_BACK:
-                case GGML_OP_ABS:
-                case GGML_OP_SGN:
-                case GGML_OP_NEG:
-                case GGML_OP_STEP:
-                case GGML_OP_TANH:
-                case GGML_OP_ELU:
-                case GGML_OP_RELU:
-                    {
-                        node->n_tasks = 1;
-                    } break;
-                case GGML_OP_MUL:
-                case GGML_OP_GELU:
-                case GGML_OP_GELU_QUICK:
-                case GGML_OP_SILU:
-                case GGML_OP_SILU_BACK:
-                case GGML_OP_NORM:
-                case GGML_OP_RMS_NORM:
-                case GGML_OP_RMS_NORM_BACK:
-                    {
-                        node->n_tasks = n_threads;
-                    } break;
-                case GGML_OP_MUL_MAT:
-                case GGML_OP_OUT_PROD:
-                    {
-                        node->n_tasks = n_threads;
-
-                        size_t cur = 0;
-                        const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type;
-
-#if defined(GGML_USE_CUBLAS)
-                        if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
-                            node->n_tasks = 1; // TODO: this actually is doing nothing
-                                               //       the threads are still spinning
-                        }
-                        else
-#elif defined(GGML_USE_CLBLAST)
-                        if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
-                            node->n_tasks = 1; // TODO: this actually is doing nothing
-                                               //       the threads are still spinning
-                            cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
-                        }
-                        else
-#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                        if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
-                            node->n_tasks = 1; // TODO: this actually is doing nothing
-                                               //       the threads are still spinning
-                            if (node->src0->type != GGML_TYPE_F32) {
-                                // here we need memory just for single 2D matrix from src0
-                                cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
-                            }
-                        } else
-#endif
-                        if (node->src1->type != vec_dot_type) {
-                            cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type];
-                        } else {
-                            cur = 0;
-                        }
-
-                        work_size = MAX(work_size, cur);
-                    } break;
-                case GGML_OP_SCALE:
-                    {
-                        node->n_tasks = 1;
-                    } break;
-                case GGML_OP_SET:
-                case GGML_OP_CONT:
-                case GGML_OP_RESHAPE:
-                case GGML_OP_VIEW:
-                case GGML_OP_PERMUTE:
-                case GGML_OP_TRANSPOSE:
-                case GGML_OP_GET_ROWS:
-                case GGML_OP_GET_ROWS_BACK:
-                case GGML_OP_DIAG:
-                case GGML_OP_DIAG_MASK_ZERO:
-                    {
-                        node->n_tasks = 1;
-                    } break;
-                case GGML_OP_DIAG_MASK_INF:
-                case GGML_OP_SOFT_MAX:
-                case GGML_OP_SOFT_MAX_BACK:
-                case GGML_OP_ROPE:
-                case GGML_OP_ROPE_BACK:
-                    {
-                        node->n_tasks = n_threads;
-                    } break;
-                case GGML_OP_ALIBI:
-                    {
-                        node->n_tasks = 1; //TODO
-                    } break;
-                case GGML_OP_CLAMP:
-                    {
-                        node->n_tasks = 1; //TODO
-                    } break;
-                case GGML_OP_CONV_1D:
-                    {
-                        node->n_tasks = n_threads;
-
-                        GGML_ASSERT(node->src0->ne[3] == 1);
-                        GGML_ASSERT(node->src1->ne[2] == 1);
-                        GGML_ASSERT(node->src1->ne[3] == 1);
-
-                        size_t cur = 0;
-                        const int nk = node->src0->ne[0];
-
-                        if (node->src0->type == GGML_TYPE_F16 &&
-                            node->src1->type == GGML_TYPE_F32) {
-                            cur = sizeof(ggml_fp16_t)*(
-                                nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] +
-                                ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1]
-                            );
-                        } else if (node->src0->type == GGML_TYPE_F32 &&
-                                   node->src1->type == GGML_TYPE_F32) {
-                            cur = sizeof(float)*(
-                                nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] +
-                                ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1]
-                            );
-                        } else {
-                            GGML_ASSERT(false);
-                        }
-
-                        work_size = MAX(work_size, cur);
-                    } break;
-                case GGML_OP_CONV_2D:
-                    {
-                        node->n_tasks = n_threads;
-
-                        const int64_t ne00 = node->src0->ne[0]; // W
-                        const int64_t ne01 = node->src0->ne[1]; // H
-                        const int64_t ne02 = node->src0->ne[2]; // C
-                        const int64_t ne03 = node->src0->ne[3]; // N
-
-                        const int64_t ne10 = node->src1->ne[0]; // W
-                        const int64_t ne11 = node->src1->ne[1]; // H
-                        const int64_t ne12 = node->src1->ne[2]; // C
-
-                        const int64_t nk = ne00*ne01;
-
-                        UNUSED(ne02);
-                        UNUSED(ne03);
-                        UNUSED(nk);
-
-                        size_t cur = 0;
-
-                        if (node->src0->type == GGML_TYPE_F16 &&
-                            node->src1->type == GGML_TYPE_F32) {
-                            cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
-                        } else if (node->src0->type == GGML_TYPE_F32 &&
-                                   node->src1->type == GGML_TYPE_F32) {
-                            cur = sizeof(float)* (ne10*ne11*ne12);
-                        } else {
-                            GGML_ASSERT(false);
-                        }
-
-                        work_size = MAX(work_size, cur);
-                    } break;
-                case GGML_OP_FLASH_ATTN:
-                    {
-                        node->n_tasks = n_threads;
-
-                        size_t cur = 0;
-
-                        const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
-
-                        if (node->src1->type == GGML_TYPE_F32) {
-                            cur  = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
-                            cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
-                        }
-
-                        if (node->src1->type == GGML_TYPE_F16) {
-                            cur  = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
-                            cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
-                        }
-
-                        work_size = MAX(work_size, cur);
-                    } break;
-                case GGML_OP_FLASH_FF:
-                    {
-                        node->n_tasks = n_threads;
-
-                        size_t cur = 0;
-
-                        if (node->src1->type == GGML_TYPE_F32) {
-                            cur  = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
-                            cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
-                        }
-
-                        if (node->src1->type == GGML_TYPE_F16) {
-                            cur  = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
-                            cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
-                        }
-
-                        work_size = MAX(work_size, cur);
-                    } break;
-                case GGML_OP_FLASH_ATTN_BACK:
-                    {
-                        node->n_tasks = n_threads;
-
-                        size_t cur = 0;
-
-                        const int64_t    D = node->src0->ne[0];
-                        const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
-                        const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
-                        if (node->src1->type == GGML_TYPE_F32) {
-                            cur  = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1)
-                            cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2
-                        }
-
-                        if (node->src1->type == GGML_TYPE_F16) {
-                            cur  = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1)
-                            cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2
-                        }
-
-                        work_size = MAX(work_size, cur);
-                    } break;
-                case GGML_OP_WIN_PART:
-                case GGML_OP_WIN_UNPART:
-                case GGML_OP_MAP_UNARY:
-                case GGML_OP_MAP_BINARY:
-                case GGML_OP_MAP_CUSTOM1:
-                case GGML_OP_MAP_CUSTOM2:
-                case GGML_OP_MAP_CUSTOM3:
-                    {
-                        node->n_tasks = 1;
-                    } break;
-                case GGML_OP_CROSS_ENTROPY_LOSS:
-                    {
-                        node->n_tasks = n_threads;
-
-                        size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks);
-
-                        work_size = MAX(work_size, cur);
-                    } break;
-                case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
-                    {
-                        node->n_tasks = n_threads;
-
-                        size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks;
-
-                        work_size = MAX(work_size, cur);
-                    } break;
-                case GGML_OP_NONE:
-                    {
-                        node->n_tasks = 1;
-                    } break;
-                case GGML_OP_COUNT:
-                    {
-                        GGML_ASSERT(false);
-                    } break;
-            }
-        }
-
-        if (cgraph->work != NULL && work_size > cgraph->work_size) {
-            GGML_ASSERT(false); // TODO: better handling
-        }
-
-        if (work_size > 0 && cgraph->work == NULL) {
-            cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1);
-
-            GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size);
-            cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size);
-        }
-    }
+struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
+    if (n_threads <= 0) {
+        n_threads = GGML_DEFAULT_N_THREADS;
+    }
+
+    size_t work_size = 0;
+
+    struct ggml_cplan cplan;
+    memset(&cplan, 0, sizeof(struct ggml_cplan));
+
+    // thread scheduling for the different operations + work buffer size estimation
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        int n_tasks = 1;
+
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        switch (node->op) {
+            case GGML_OP_CPY:
+            case GGML_OP_DUP:
+                {
+                    n_tasks = n_threads;
+
+                    size_t cur = 0;
+                    if (ggml_is_quantized(node->type)) {
+                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks;
+                    }
+
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_ADD:
+            case GGML_OP_ADD1:
+                {
+                    n_tasks = n_threads;
+
+                    size_t cur = 0;
+
+                    if (ggml_is_quantized(node->src[0]->type)) {
+                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks;
+                    }
+
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_ACC:
+                {
+                    n_tasks = n_threads;
+
+                    size_t cur = 0;
+
+                    if (ggml_is_quantized(node->src[0]->type)) {
+                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks;
+                    }
+
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_SUB:
+            case GGML_OP_DIV:
+            case GGML_OP_SQR:
+            case GGML_OP_SQRT:
+            case GGML_OP_LOG:
+            case GGML_OP_SUM:
+            case GGML_OP_SUM_ROWS:
+            case GGML_OP_MEAN:
+            case GGML_OP_ARGMAX:
+            case GGML_OP_REPEAT:
+            case GGML_OP_REPEAT_BACK:
+            case GGML_OP_ABS:
+            case GGML_OP_SGN:
+            case GGML_OP_NEG:
+            case GGML_OP_STEP:
+            case GGML_OP_TANH:
+            case GGML_OP_ELU:
+            case GGML_OP_RELU:
+                {
+                    n_tasks = 1;
+                } break;
+            case GGML_OP_MUL:
+            case GGML_OP_GELU:
+            case GGML_OP_GELU_QUICK:
+            case GGML_OP_SILU:
+            case GGML_OP_SILU_BACK:
+            case GGML_OP_NORM:
+            case GGML_OP_RMS_NORM:
+            case GGML_OP_RMS_NORM_BACK:
+                {
+                    n_tasks = n_threads;
+                } break;
+            case GGML_OP_MUL_MAT:
+            case GGML_OP_OUT_PROD:
+                {
+                    n_tasks = n_threads;
+
+                    // TODO: use different scheduling for different matrix sizes
+                    //const int nr0 = ggml_nrows(node->src[0]);
+                    //const int nr1 = ggml_nrows(node->src[1]);
+
+                    //n_tasks = MIN(n_threads, MAX(1, nr0/128));
+                    //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
+
+                    size_t cur = 0;
+                    const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
+
+#if defined(GGML_USE_CUBLAS)
+                    if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
+                        n_tasks = 1; // TODO: this actually is doing nothing
+                                     //       the threads are still spinning
+                    } else
+#elif defined(GGML_USE_CLBLAST)
+                    if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
+                        n_tasks = 1; // TODO: this actually is doing nothing
+                                     //       the threads are still spinning
+                        cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
+                    } else
+#endif
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+                    if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
+                        n_tasks = 1; // TODO: this actually is doing nothing
+                                     //       the threads are still spinning
+                        if (node->src[0]->type != GGML_TYPE_F32) {
+                            // here we need memory just for single 2D matrix from src0
+                            cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]);
+                        }
+                    } else
+#endif
+                    if (node->src[1]->type != vec_dot_type) {
+                        cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type];
+                    } else {
+                        cur = 0;
+                    }
+
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_SCALE:
+                {
+                    n_tasks = 1;
+                } break;
+            case GGML_OP_SET:
+            case GGML_OP_CONT:
+            case GGML_OP_RESHAPE:
+            case GGML_OP_VIEW:
+            case GGML_OP_PERMUTE:
+            case GGML_OP_TRANSPOSE:
+            case GGML_OP_GET_ROWS:
+            case GGML_OP_GET_ROWS_BACK:
+            case GGML_OP_DIAG:
+            case GGML_OP_DIAG_MASK_ZERO:
+                {
+                    n_tasks = 1;
+                } break;
+            case GGML_OP_DIAG_MASK_INF:
+            case GGML_OP_SOFT_MAX:
+            case GGML_OP_SOFT_MAX_BACK:
+            case GGML_OP_ROPE:
+            case GGML_OP_ROPE_BACK:
+                {
+                    n_tasks = n_threads;
+                } break;
+            case GGML_OP_ALIBI:
+                {
+                    n_tasks = 1; //TODO
+                } break;
+            case GGML_OP_CLAMP:
+                {
+                    n_tasks = 1; //TODO
+                } break;
+            case GGML_OP_CONV_1D:
+                {
+                    n_tasks = n_threads;
+
+                    GGML_ASSERT(node->src[0]->ne[3] == 1);
+                    GGML_ASSERT(node->src[1]->ne[2] == 1);
+                    GGML_ASSERT(node->src[1]->ne[3] == 1);
+
+                    size_t cur = 0;
+                    const int nk = node->src[0]->ne[0];
+
+                    if (node->src[0]->type == GGML_TYPE_F16 &&
+                        node->src[1]->type == GGML_TYPE_F32) {
+                        cur = sizeof(ggml_fp16_t)*(
+                            nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] +
+                            ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1]
+                        );
+                    } else if (node->src[0]->type == GGML_TYPE_F32 &&
+                               node->src[1]->type == GGML_TYPE_F32) {
+                        cur = sizeof(float)*(
+                            nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] +
+                            ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1]
+                        );
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_CONV_2D:
+                {
+                    n_tasks = n_threads;
+
+                    const int64_t ne00 = node->src[0]->ne[0]; // W
+                    const int64_t ne01 = node->src[0]->ne[1]; // H
+                    const int64_t ne02 = node->src[0]->ne[2]; // C
+                    const int64_t ne03 = node->src[0]->ne[3]; // N
+
+                    const int64_t ne10 = node->src[1]->ne[0]; // W
+                    const int64_t ne11 = node->src[1]->ne[1]; // H
+                    const int64_t ne12 = node->src[1]->ne[2]; // C
+
+                    const int64_t nk = ne00*ne01;
+
+                    UNUSED(ne02);
+                    UNUSED(ne03);
+                    UNUSED(nk);
+
+                    size_t cur = 0;
+
+                    if (node->src[0]->type == GGML_TYPE_F16 &&
+                        node->src[1]->type == GGML_TYPE_F32) {
+                        cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
+                    } else if (node->src[0]->type == GGML_TYPE_F32 &&
+                               node->src[1]->type == GGML_TYPE_F32) {
+                        cur = sizeof(float)* (ne10*ne11*ne12);
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_POOL_1D:
+            case GGML_OP_POOL_2D:
+                {
+                    n_tasks = 1;
+                } break;
+            case GGML_OP_FLASH_ATTN:
+                {
+                    n_tasks = n_threads;
+
+                    size_t cur = 0;
+
+                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
+
+                    if (node->src[1]->type == GGML_TYPE_F32) {
+                        cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
+                    }
+
+                    if (node->src[1]->type == GGML_TYPE_F16) {
+                        cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
+                    }
+
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_FLASH_FF:
+                {
+                    n_tasks = n_threads;
+
+                    size_t cur = 0;
+
+                    if (node->src[1]->type == GGML_TYPE_F32) {
+                        cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
+                    }
+
+                    if (node->src[1]->type == GGML_TYPE_F16) {
+                        cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
+                    }
+
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_FLASH_ATTN_BACK:
+                {
+                    n_tasks = n_threads;
+
+                    size_t cur = 0;
+
+                    const int64_t    D = node->src[0]->ne[0];
+                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
+                    const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
+                    if (node->src[1]->type == GGML_TYPE_F32) {
+                        cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
+                    }
+
+                    if (node->src[1]->type == GGML_TYPE_F16) {
+                        cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
+                    }
+
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_WIN_PART:
+            case GGML_OP_WIN_UNPART:
+            case GGML_OP_MAP_UNARY:
+            case GGML_OP_MAP_BINARY:
+            case GGML_OP_MAP_CUSTOM1:
+            case GGML_OP_MAP_CUSTOM2:
+            case GGML_OP_MAP_CUSTOM3:
+                {
+                    n_tasks = 1;
+                } break;
+            case GGML_OP_CROSS_ENTROPY_LOSS:
+                {
+                    n_tasks = n_threads;
+
+                    size_t cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
+
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
+                {
+                    n_tasks = n_threads;
+
+                    size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks;
+
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_NONE:
+                {
+                    n_tasks = 1;
+                } break;
+            case GGML_OP_COUNT:
+                {
+                    GGML_ASSERT(false);
+                } break;
+        }
+
+        cplan.n_tasks[i] = n_tasks;
+    }
+
+    if (work_size > 0) {
+        work_size += CACHE_LINE_SIZE*(n_threads - 1);
+    }
+
+    cplan.n_threads = n_threads;
+    cplan.work_size = work_size;
+    cplan.work_data = NULL;
+
+    return cplan;
+}
+
+int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
+    {
+        GGML_ASSERT(cplan);
+        GGML_ASSERT(cplan->n_threads > 0);
+
+        if (cplan->work_size > 0) {
+            GGML_ASSERT(cplan->work_data);
+        }
+
+        for (int i = 0; i < cgraph->n_nodes; ++i) {
+            if (cgraph->nodes[i]->op != GGML_OP_NONE) {
+                GGML_ASSERT(cplan->n_tasks[i] > 0);
+            }
+        }
+    }
+
+    const int n_threads = cplan->n_threads;
+
+    struct ggml_compute_state_shared state_shared = {
+        /*.cgraph                  =*/ cgraph,
+        /*.cgraph_plan             =*/ cplan,
+        /*.perf_node_start_cycles  =*/ 0,
+        /*.perf_node_start_time_us =*/ 0,
+        /*.n_threads               =*/ n_threads,
+        /*.n_active                =*/ n_threads,
+        /*.node_n                  =*/ -1,
+        /*.abort_callback          =*/ NULL,
+        /*.abort_callback_data     =*/ NULL,
+    };
+    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
+
     // create thread pool
     if (n_threads > 1) {
         for (int j = 1; j < n_threads; ++j) {
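The old single-call API (graph-owned n_threads and work tensor) is now split in two: ggml_graph_plan() sizes the work buffer and per-node task counts, the caller supplies work_data, and ggml_graph_compute() runs the plan. A sketch of the resulting call sequence (buffer management and error handling reduced to the minimum; gf is an already built ggml_cgraph):

    #include <stdlib.h>

    struct ggml_cplan cplan = ggml_graph_plan(&gf, /*n_threads=*/4);

    uint8_t * work = NULL;
    if (cplan.work_size > 0) {
        work = malloc(cplan.work_size); // the caller owns the work buffer now
        cplan.work_data = work;
    }

    // returns GGML_EXIT_SUCCESS, or GGML_EXIT_ABORTED if the abort callback fired
    const int status = ggml_graph_compute(&gf, &cplan);

    free(work);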
@@ -16432,12 +16760,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
     const int64_t perf_start_time_us = ggml_perf_time_us();
 
     // this is a work thread too
-    ggml_graph_compute_thread(&workers[0]);
+    int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]);
 
     // don't leave affinity set on the main thread
     clear_numa_thread_affinity();
 
-    // join thread pool
+    // join or kill thread pool
     if (n_threads > 1) {
         for (int j = 1; j < n_threads; j++) {
             const int rc = ggml_thread_join(workers[j].thrd, NULL);
@@ -16461,6 +16789,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 (double) perf_time_us_cur     / 1000.0,
                 (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs);
     }
+
+    return compute_status;
 }
 
 void ggml_graph_reset(struct ggml_cgraph * cgraph) {
@@ -16473,6 +16803,17 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
         }
     }
 }
 
+void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
+
+    struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
+    GGML_ASSERT(buf);
+
+    cplan.work_data = buf->data;
+
+    ggml_graph_compute(cgraph, &cplan);
+}
+
 struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
     for (int i = 0; i < cgraph->n_leafs; i++) {
         struct ggml_tensor * leaf = cgraph->leafs[i];
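For callers that do not want to manage the buffer themselves, the wrapper above keeps the one-call ergonomics of the old API by allocating the work buffer as an I8 tensor inside the given context, e.g.:

    // ctx must have enough free memory left for cplan.work_size bytes
    ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads=*/4);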
@@ -16511,14 +16852,13 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
     const int64_t * ne = tensor->ne;
     const size_t  * nb = tensor->nb;
 
-    fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n",
+    fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
             arg,
             ggml_type_name(tensor->type),
             ggml_op_name  (tensor->op),
             tensor->n_dims,
             ne[0], ne[1], ne[2], ne[3],
             nb[0], nb[1], nb[2], nb[3],
-            tensor->n_tasks,
             tensor->data,
             tensor->name);
 }
@@ -16555,8 +16895,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
         ggml_graph_export_leaf(cgraph->leafs[i], fout);
 
         GGML_ASSERT(cgraph->leafs[i]->op   == GGML_OP_NONE);
-        GGML_ASSERT(cgraph->leafs[i]->src0 == NULL);
-        GGML_ASSERT(cgraph->leafs[i]->src1 == NULL);
+        GGML_ASSERT(cgraph->leafs[i]->src[0] == NULL);
+        GGML_ASSERT(cgraph->leafs[i]->src[1] == NULL);
     }
 
     // header
@@ -16567,17 +16907,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
     for (int i = 0; i < cgraph->n_nodes; ++i) {
         ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
 
-        if (cgraph->nodes[i]->src0) {
-            ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout);
-        }
-
-        if (cgraph->nodes[i]->src1) {
-            ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout);
-        }
-
-        for (int j = 0; j < GGML_MAX_OPT; ++j) {
-            if (cgraph->nodes[i]->opt[j]) {
-                ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout);
+        for (int j = 0; j < GGML_MAX_SRC; ++j) {
+            if (cgraph->nodes[i]->src[j]) {
+                ggml_graph_export_node(cgraph->nodes[i]->src[j], "SRC", fout);
             }
         }
 
@@ -16668,16 +17000,13 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
 
         // output the op arguments
         {
-            struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
-
-            args[0] = tensor->src0;
-            args[1] = tensor->src1;
+            struct ggml_tensor * args[GGML_MAX_SRC] = { NULL };
 
-            for (int j = 0; j < GGML_MAX_OPT; ++j) {
-                args[2 + j] = tensor->opt[j];
+            for (int j = 0; j < GGML_MAX_SRC; ++j) {
+                args[j] = tensor->src[j];
             }
 
-            for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+            for (int j = 0; j < GGML_MAX_SRC; ++j) {
                 if (args[j]) {
                     int32_t idx = -1;
 
|
|
16895
17224
|
|
16896
17225
|
const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
|
16897
17226
|
|
16898
|
-
const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr +=
|
17227
|
+
const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t);
|
16899
17228
|
|
16900
|
-
struct ggml_tensor * args[
|
17229
|
+
struct ggml_tensor * args[GGML_MAX_SRC] = { NULL };
|
16901
17230
|
|
16902
17231
|
// parse args
|
16903
|
-
for (int j = 0; j <
|
17232
|
+
for (int j = 0; j < GGML_MAX_SRC; ++j) {
|
16904
17233
|
const int32_t arg_idx = ptr_arg_idx[j];
|
16905
17234
|
|
16906
17235
|
if (arg_idx == -1) {
|
@@ -16957,11 +17286,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
|
|
16957
17286
|
tensor->nb[j] = nb[j];
|
16958
17287
|
}
|
16959
17288
|
|
16960
|
-
|
16961
|
-
|
16962
|
-
|
16963
|
-
for (int j = 0; j < GGML_MAX_OPT; ++j) {
|
16964
|
-
tensor->opt[j] = args[2 + j];
|
17289
|
+
for (int j = 0; j < GGML_MAX_SRC; ++j) {
|
17290
|
+
tensor->src[j] = args[j];
|
16965
17291
|
}
|
16966
17292
|
|
16967
17293
|
result.nodes[i] = tensor;
|
@@ -17160,19 +17486,11 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
     for (int i = 0; i < gb->n_nodes; i++) {
         struct ggml_tensor * node = gb->nodes[i];
 
-        if (node->src0) {
-            ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x");
-        }
-
-        if (node->src1) {
-            ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y");
-        }
-
-        for (int j = 0; j < GGML_MAX_OPT; j++) {
-            if (node->opt[j]) {
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j]) {
                 char label[16];
-                snprintf(label, sizeof(label), "opt %d", j);
-                ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label);
+                snprintf(label, sizeof(label), "src %d", j);
+                ggml_graph_dump_dot_node_edge(fp, gb, node, node->src[j], label);
             }
         }
     }
@@ -17180,19 +17498,11 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
     for (int i = 0; i < gb->n_leafs; i++) {
         struct ggml_tensor * node = gb->leafs[i];

-        if (node->src0) {
-            ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x");
-        }
-
-        if (node->src1) {
-            ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y");
-        }
-
-        for (int j = 0; j < GGML_MAX_OPT; j++) {
-            if (node->opt[j]) {
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j]) {
                 char label[16];
-                snprintf(label, sizeof(label), "opt %d", j);
-                ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label);
+                snprintf(label, sizeof(label), "src %d", j);
+                ggml_graph_dump_dot_leaf_edge(fp, node, node->src[j], label);
             }
         }
     }
@@ -17254,9 +17564,6 @@ static enum ggml_opt_result ggml_opt_adam(
         struct ggml_cgraph * gb) {
     GGML_ASSERT(ggml_is_scalar(f));

-    gf->n_threads = params.n_threads;
-    gb->n_threads = params.n_threads;
-
     // these will store the parameters we want to optimize
     struct ggml_tensor * ps[GGML_MAX_PARAMS];

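
This hunk has an API consequence beyond the optimizer: ggml_cgraph no longer carries an n_threads field, so the thread count must be supplied at compute time. A minimal migration sketch (the "before" lines are the ones removed here; the replacement call appears in the hunks that follow):

    // before (0.3.2-era API): thread count stored on the graph itself
    //     gb->n_threads = params.n_threads;
    //     ggml_graph_compute(ctx, gb);

    // after (0.3.3-era API): thread count passed per call
    ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);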
@@ -17303,7 +17610,8 @@ static enum ggml_opt_result ggml_opt_adam(
     // compute the function value
     ggml_graph_reset (gf);
     ggml_set_f32 (f->grad, 1.0f);
-    ggml_graph_compute(ctx, gb);
+
+    ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);

     opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
     opt->adam.fx_best = opt->adam.fx_prev;
@@ -17383,7 +17691,8 @@ static enum ggml_opt_result ggml_opt_adam(

         ggml_graph_reset (gf);
         ggml_set_f32 (f->grad, 1.0f);
-        ggml_graph_compute(ctx, gb);
+
+        ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);

         const float fx = ggml_get_f32_1d(f, 0);

@@ -17505,7 +17814,8 @@ static enum ggml_opt_result linesearch_backtracking(

         ggml_graph_reset (gf);
         ggml_set_f32 (f->grad, 1.0f);
-        ggml_graph_compute(ctx, gb);
+
+        ggml_graph_compute_with_ctx(ctx, gb, params->n_threads);

         ggml_opt_get_grad(np, ps, g);

@@ -17573,9 +17883,6 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         }
     }

-    gf->n_threads = params.n_threads;
-    gb->n_threads = params.n_threads;
-
     const int m = params.lbfgs.m;

     // these will store the parameters we want to optimize
@@ -17627,7 +17934,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(

     ggml_graph_reset (gf);
     ggml_set_f32 (f->grad, 1.0f);
-    ggml_graph_compute(ctx, gb);
+
+    ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);

     ggml_opt_get_grad(np, ps, g);

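
For completeness: in the upstream ggml change this release syncs, ggml_graph_compute_with_ctx is a convenience wrapper over a plan-based API in which the caller owns the scratch buffer. A sketch assuming the upstream ggml_graph_plan/ggml_graph_compute signatures of this period; treat the details as unverified against the headers bundled with this gem:

    #include <stdlib.h>

    // build a plan for the desired thread count, provide the work buffer,
    // then run the graph with it
    struct ggml_cplan plan = ggml_graph_plan(gb, /*n_threads =*/ 4);

    if (plan.work_size > 0) {
        plan.work_data = malloc(plan.work_size);   // caller-owned scratch
    }

    ggml_graph_compute(gb, &plan);

    free(plan.work_data);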