llama_cpp 0.3.2 → 0.3.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -0
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +302 -112
- data/ext/llama_cpp/src/ggml-cuda.cu +677 -118
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +65 -45
- data/ext/llama_cpp/src/ggml-metal.metal +610 -484
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml.c +1146 -812
- data/ext/llama_cpp/src/ggml.h +77 -19
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +289 -104
- data/ext/llama_cpp/src/llama.h +46 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -1
- data/sig/llama_cpp.rbs +14 -1
- metadata +4 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -25,16 +25,23 @@
 #include <float.h>
 #include <limits.h>
 #include <stdarg.h>
+#include <signal.h>
 
 #ifdef GGML_USE_METAL
 #include <unistd.h>
 #endif
 
+// static_assert should be a #define, but if it's not,
+// fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
 // ref: https://stackoverflow.com/a/53923785/4039976
 #ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
 #define static_assert(cond, msg) struct global_scope_noop_trick
 #endif
+#endif
 
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
@@ -49,23 +56,23 @@
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
 
-static void atomic_store(atomic_int* ptr, LONG val) {
+static void atomic_store(atomic_int * ptr, LONG val) {
     InterlockedExchange(ptr, val);
 }
-static LONG atomic_load(atomic_int* ptr) {
+static LONG atomic_load(atomic_int * ptr) {
     return InterlockedCompareExchange(ptr, 0, 0);
 }
-static LONG atomic_fetch_add(atomic_int* ptr, LONG inc) {
+static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
     return InterlockedExchangeAdd(ptr, inc);
 }
-static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) {
+static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
     return atomic_fetch_add(ptr, -(dec));
 }
 
 typedef HANDLE pthread_t;
 
 typedef DWORD thread_ret_t;
-static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {
+static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) {
     (void) unused;
     HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
     if (handle == NULL)

@@ -77,7 +84,7 @@ static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void
     return 0;
 }
 
-static int pthread_join(pthread_t thread, void* unused) {
+static int pthread_join(pthread_t thread, void * unused) {
     (void) unused;
     return (int) WaitForSingleObject(thread, INFINITE);
 }

@@ -90,7 +97,7 @@ static int sched_yield (void) {
 #include <pthread.h>
 #include <stdatomic.h>
 
-typedef void* thread_ret_t;
+typedef void * thread_ret_t;
 
 #include <sys/types.h>
 #include <sys/stat.h>

@@ -111,10 +118,6 @@ typedef void* thread_ret_t;
 #endif
 #endif
 
-#ifdef __HAIKU__
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#endif
-
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
@@ -247,7 +250,11 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #include "ggml-opencl.h"
 #endif
 #elif defined(GGML_USE_OPENBLAS)
+#if defined(GGML_BLAS_USE_MKL)
+#include <mkl.h>
+#else
 #include <cblas.h>
+#endif
 #elif defined(GGML_USE_CUBLAS)
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
@@ -3782,6 +3789,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CLAMP",
     "CONV_1D",
     "CONV_2D",
+    "POOL_1D",
+    "POOL_2D",
 
     "FLASH_ATTN",
     "FLASH_FF",

@@ -3800,7 +3809,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",

@@ -3860,6 +3869,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "clamp(x)",
     "conv_1d(x)",
     "conv_2d(x)",
+    "pool_1d(x)",
+    "pool_2d(x)",
 
     "flash_attn(x)",
     "flash_ff(x)",

@@ -3878,7 +3889,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+
+static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -4157,10 +4170,9 @@ static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
 static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
-    return
-
-
-           (t0->ne[3] == t1->ne[3]);
+    return (t0->ne[0] == t1->ne[0])  &&
+           (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
+           (t1->ne[3]%t0->ne[3] == 0);
 }
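The relaxed check above lets mul_mat accept a src0 that is broadcast over the 2nd and 3rd dimensions of src1. A minimal standalone sketch of the same compatibility rule (a hypothetical helper, not part of ggml.c):

    #include <stdbool.h>
    #include <stdint.h>

    // Sketch of the broadcast rule used above: the row length must match,
    // and src1's higher dimensions must be whole multiples of src0's.
    static bool can_mul_mat_broadcast(const int64_t t0_ne[4], const int64_t t1_ne[4]) {
        return (t0_ne[0] == t1_ne[0]) &&
               (t1_ne[2] % t0_ne[2] == 0) &&
               (t1_ne[3] % t0_ne[3] == 0);
    }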
@@ -4400,8 +4412,8 @@ void ggml_free(struct ggml_context * ctx) {
         if (&g_state.contexts[i].context == ctx) {
             g_state.contexts[i].used = false;
 
-            GGML_PRINT_DEBUG("%s: context %d
-                    __func__, i, ctx
+            GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
+                    __func__, i, ggml_used_mem(ctx));
 
             if (ctx->mem_buffer_owned) {
                 GGML_ALIGNED_FREE(ctx->mem_buffer);
@@ -4580,17 +4592,14 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.op           =*/ GGML_OP_NONE,
         /*.is_param     =*/ false,
         /*.grad         =*/ NULL,
-        /*.
-        /*.src1         =*/ NULL,
-        /*.opt          =*/ { NULL },
-        /*.n_tasks      =*/ 0,
+        /*.src          =*/ { NULL },
         /*.perf_runs    =*/ 0,
         /*.perf_cycles  =*/ 0,
         /*.perf_time_us =*/ 0,
         /*.data         =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
         /*.name         =*/ { 0 },
         /*.extra        =*/ NULL,
-        /*.
+        /*.padding      =*/ { 0 },
     };
 
     // TODO: this should not be needed as long as we don't rely on aligned SIMD loads

@@ -4722,7 +4731,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
         {
             assert(tensor->nb[0] == sizeof(ggml_fp16_t));
             for (int i = 0; i < n; i++) {
-                ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value);
+                ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
             }
         } break;
     case GGML_TYPE_F32:

@@ -4774,7 +4783,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
         {
             assert(tensor->nb[0] == sizeof(ggml_fp16_t));
             for (int i = 0; i < n; i++) {
-                ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value);
+                ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
             }
         } break;
     case GGML_TYPE_F32:
@@ -5009,8 +5018,8 @@ struct ggml_tensor * ggml_dup_impl(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -5034,11 +5043,15 @@ struct ggml_tensor * ggml_add_impl(
     struct ggml_tensor * a,
     struct ggml_tensor * b,
     bool inplace) {
+    // TODO: support less-strict constraint
+    //       GGML_ASSERT(ggml_can_repeat(b, a));
+    GGML_ASSERT(ggml_can_repeat_rows(b, a));
 
     bool is_node = false;
 
-    if (a->grad || b->grad) {
+    if (!inplace && (a->grad || b->grad)) {
+        // TODO: support backward pass for broadcasting
+        GGML_ASSERT(ggml_are_same_shape(a, b));
         is_node = true;
     }

@@ -5046,8 +5059,8 @@ struct ggml_tensor * ggml_add_impl(
+    result->src[0] = a;
+    result->src[1] = b;

@@ -5086,8 +5099,8 @@ struct ggml_tensor * ggml_add1_impl(
+    result->src[0] = a;
+    result->src[1] = b;

@@ -5144,9 +5157,9 @@ struct ggml_tensor * ggml_acc_impl(
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;

@@ -5192,8 +5205,8 @@ struct ggml_tensor * ggml_sub_impl(
+    result->src[0] = a;
+    result->src[1] = b;

@@ -5239,8 +5252,8 @@ struct ggml_tensor * ggml_mul_impl(
+    result->src[0] = a;
+    result->src[1] = b;

@@ -5282,8 +5295,8 @@ struct ggml_tensor * ggml_div_impl(
+    result->src[0] = a;
+    result->src[1] = b;

@@ -5318,8 +5331,8 @@ struct ggml_tensor * ggml_sqr_impl(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -5352,8 +5365,8 @@ struct ggml_tensor * ggml_sqrt_impl(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -5387,8 +5400,8 @@ struct ggml_tensor * ggml_log_impl(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -5420,8 +5433,8 @@ struct ggml_tensor * ggml_sum(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -5447,8 +5460,8 @@ struct ggml_tensor * ggml_sum_rows(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -5470,8 +5483,8 @@ struct ggml_tensor * ggml_mean(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -5494,8 +5507,8 @@ struct ggml_tensor * ggml_argmax(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -5522,8 +5535,8 @@ struct ggml_tensor * ggml_repeat(
+    result->src[0] = a;
+    result->src[1] = b;

@@ -5550,8 +5563,8 @@ struct ggml_tensor * ggml_repeat_back(
+    result->src[0] = a;
+    result->src[1] = b;

@@ -5572,8 +5585,8 @@ struct ggml_tensor * ggml_abs_impl(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -5607,8 +5620,8 @@ struct ggml_tensor * ggml_sgn_impl(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -5641,8 +5654,8 @@ struct ggml_tensor * ggml_neg_impl(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -5675,8 +5688,8 @@ struct ggml_tensor * ggml_step_impl(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -5709,8 +5722,8 @@ struct ggml_tensor * ggml_tanh_impl(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -5743,8 +5756,8 @@ struct ggml_tensor * ggml_elu_impl(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -5777,8 +5790,8 @@ struct ggml_tensor * ggml_relu_impl(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -5811,8 +5824,8 @@ struct ggml_tensor * ggml_gelu_impl(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -5845,8 +5858,8 @@ struct ggml_tensor * ggml_gelu_quick_impl(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -5879,8 +5892,8 @@ struct ggml_tensor * ggml_silu_impl(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -5914,8 +5927,8 @@ struct ggml_tensor * ggml_silu_back(
+    result->src[0] = a;
+    result->src[1] = b;

@@ -5937,8 +5950,8 @@ struct ggml_tensor * ggml_norm_impl(
+    result->src[0] = a;
+    result->src[1] = NULL; // TODO: maybe store epsilon here?

@@ -5969,8 +5982,8 @@ struct ggml_tensor * ggml_rms_norm_impl(
+    result->src[0] = a;
+    result->src[1] = NULL; // TODO: maybe store epsilon here?

@@ -6002,8 +6015,8 @@ struct ggml_tensor * ggml_rms_norm_back(
+    result->src[0] = a;
+    result->src[1] = b;
@@ -6024,13 +6037,13 @@ struct ggml_tensor * ggml_mul_mat(
         is_node = true;
     }
 
-    const int64_t ne[4] = { a->ne[1], b->ne[1],
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+    const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
 
     result->op   = GGML_OP_MUL_MAT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->
-    result->
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
@@ -6055,8 +6068,8 @@ struct ggml_tensor * ggml_out_prod(
+    result->src[0] = a;
+    result->src[1] = b;

@@ -6081,8 +6094,8 @@ struct ggml_tensor * ggml_scale_impl(
+    result->src[0] = a;
+    result->src[1] = b;

@@ -6137,9 +6150,9 @@ struct ggml_tensor * ggml_set_impl(
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;

@@ -6226,8 +6239,8 @@ struct ggml_tensor * ggml_cpy_impl(
+    result->src[0] = a;
+    result->src[1] = b;

@@ -6263,8 +6276,8 @@ struct ggml_tensor * ggml_cont_impl(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -6307,8 +6320,8 @@ struct ggml_tensor * ggml_reshape(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -6332,8 +6345,8 @@ struct ggml_tensor * ggml_reshape_1d(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -6358,8 +6371,8 @@ struct ggml_tensor * ggml_reshape_2d(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -6385,8 +6398,8 @@ struct ggml_tensor * ggml_reshape_3d(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -6414,8 +6427,8 @@ struct ggml_tensor * ggml_reshape_4d(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -6447,9 +6460,9 @@ struct ggml_tensor * ggml_view_1d(
+    result->src[0] = a;
+    result->src[1] = NULL;
+    result->src[2] = offs;

@@ -6489,9 +6502,9 @@ struct ggml_tensor * ggml_view_2d(
+    result->src[0] = a;
+    result->src[1] = NULL;
+    result->src[2] = offs;

@@ -6533,9 +6546,9 @@ struct ggml_tensor * ggml_view_3d(
+    result->src[0] = a;
+    result->src[1] = NULL;
+    result->src[2] = offs;

@@ -6579,9 +6592,9 @@ struct ggml_tensor * ggml_view_4d(
+    result->src[0] = a;
+    result->src[1] = NULL;
+    result->src[2] = offs;

@@ -6641,8 +6654,8 @@ struct ggml_tensor * ggml_permute(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -6656,7 +6669,7 @@ struct ggml_tensor * ggml_permute(
     ggml_scratch_load(ctx);
 
-    result->
+    result->src[2] = b;
 }

@@ -6684,8 +6697,8 @@ struct ggml_tensor * ggml_transpose(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -6710,8 +6723,8 @@ struct ggml_tensor * ggml_get_rows(
+    result->src[0] = a;
+    result->src[1] = b;

@@ -6738,9 +6751,9 @@ struct ggml_tensor * ggml_get_rows_back(
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;

@@ -6762,8 +6775,8 @@ struct ggml_tensor * ggml_diag(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -6795,8 +6808,8 @@ struct ggml_tensor * ggml_diag_mask_inf_impl(
+    result->src[0] = a;
+    result->src[1] = b;

@@ -6843,8 +6856,8 @@ struct ggml_tensor * ggml_diag_mask_zero_impl(
+    result->src[0] = a;
+    result->src[1] = b;

@@ -6879,8 +6892,8 @@ struct ggml_tensor * ggml_soft_max_impl(
+    result->src[0] = a;
+    result->src[1] = NULL;

@@ -6915,8 +6928,8 @@ struct ggml_tensor * ggml_soft_max_back_impl(
+    result->src[0] = a;
+    result->src[1] = b;
@@ -6944,6 +6957,8 @@ struct ggml_tensor * ggml_rope_impl(
     int n_dims,
     int mode,
     int n_ctx,
+    float freq_base,
+    float freq_scale,
     bool inplace) {
     GGML_ASSERT(n_past >= 0);
     bool is_node = false;

@@ -6956,19 +6971,21 @@ struct ggml_tensor * ggml_rope_impl(
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32,
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
     ((int32_t *) b->data)[3] = n_ctx;
+    memcpy((int32_t *) b->data + 4, &freq_base,  sizeof(float));
+    memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));
 
     ggml_scratch_load(ctx);
 
     result->op   = GGML_OP_ROPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->
-    result->
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }

@@ -6980,7 +6997,7 @@ struct ggml_tensor * ggml_rope(
     int n_dims,
     int mode,
     int n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false);
 }
 
 struct ggml_tensor * ggml_rope_inplace(

@@ -6990,7 +7007,19 @@ struct ggml_tensor * ggml_rope_inplace(
     int n_dims,
     int mode,
     int n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
+}
+
+struct ggml_tensor * ggml_rope_custom_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_dims,
+        int                   mode,
+        int                   n_ctx,
+        float                 freq_base,
+        float                 freq_scale) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true);
 }
 
 // ggml_rope_back

@@ -7000,7 +7029,8 @@ struct ggml_tensor * ggml_rope_back(
     struct ggml_tensor * a,
     int n_past,
     int n_dims,
-    int mode
+    int mode,
+    int n_ctx) {
     GGML_ASSERT(n_past >= 0);
     GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");

@@ -7014,19 +7044,20 @@ struct ggml_tensor * ggml_rope_back(
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32,
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
     ggml_set_name(b, "n_past, n_dims, mode");
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
+    ((int32_t *) b->data)[3] = n_ctx;
 
     ggml_scratch_load(ctx);
 
     result->op   = GGML_OP_ROPE_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->
-    result->
+    result->src[0] = a;
+    result->src[1] = b;
 
     return result;
 }
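The new freq_base/freq_scale arguments make the RoPE base frequency and linear position scaling configurable per call; ggml_rope() keeps the old behaviour by passing 10000.0f and 1.0f. A minimal usage sketch of the new entry point (the context and the activation tensor are assumed to come from the caller; the helper name is hypothetical):

    #include "ggml.h"

    // Hypothetical helper: apply RoPE with a custom base frequency and linear scale.
    static struct ggml_tensor * apply_scaled_rope(struct ggml_context * ctx, struct ggml_tensor * cur) {
        return ggml_rope_custom_inplace(
            ctx, cur,
            /*n_past     =*/ 0,
            /*n_dims     =*/ 128,
            /*mode       =*/ 0,
            /*n_ctx      =*/ 2048,
            /*freq_base  =*/ 10000.0f,  // same default that ggml_rope() now passes internally
            /*freq_scale =*/ 0.5f);     // compress positions, e.g. for extended-context models
    }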
@@ -7064,8 +7095,8 @@ struct ggml_tensor * ggml_alibi(
+    result->src[0] = a;
+    result->src[1] = b;

@@ -7098,8 +7129,8 @@ struct ggml_tensor * ggml_clamp(
+    result->src[0] = a;
+    result->src[1] = b;

@@ -7141,9 +7172,9 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;

@@ -7161,7 +7192,6 @@ struct ggml_tensor* ggml_conv_2d(
     int d0,
     int d1) {
 
-    GGML_ASSERT(b->ne[3] == 1);
     GGML_ASSERT(a->ne[2] == b->ne[2]);
     bool is_node = false;

@@ -7173,7 +7203,7 @@ struct ggml_tensor* ggml_conv_2d(
     const int64_t ne[4] = {
         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
         ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
-        a->ne[3],
+        a->ne[3], b->ne[3],
     };
     struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

@@ -7189,9 +7219,9 @@ struct ggml_tensor* ggml_conv_2d(
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
@@ -7208,6 +7238,98 @@ struct ggml_tensor* ggml_conv_1d_ph(
     return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
 }
 
+
+// ggml_pool_*
+
+static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
+    return (ins + 2 * p - ks) / s + 1;
+}
+
+// ggml_pool_2d
+
+struct ggml_tensor* ggml_pool_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        enum ggml_op_pool     op,
+        int                   k0,
+        int                   s0,
+        int                   p0) {
+
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    const int64_t ne[3] = {
+        ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
+        a->ne[1],
+    };
+    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+
+    ggml_scratch_save(ctx);
+    struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
+    ((int32_t*)c->data)[0] = op;
+    ((int32_t*)c->data)[1] = k0;
+    ((int32_t*)c->data)[2] = s0;
+    ((int32_t*)c->data)[3] = p0;
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_POOL_1D;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = c;
+
+    return result;
+}
+
+// ggml_pool_2d
+
+struct ggml_tensor* ggml_pool_2d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        enum ggml_op_pool     op,
+        int                   k0,
+        int                   k1,
+        int                   s0,
+        int                   s1,
+        int                   p0,
+        int                   p1) {
+
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    const int64_t ne[3] = {
+        ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
+        ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
+        a->ne[2],
+    };
+    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+
+    ggml_scratch_save(ctx);
+    struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7);
+    ((int32_t*)c->data)[0] = op;
+    ((int32_t*)c->data)[1] = k0;
+    ((int32_t*)c->data)[2] = k1;
+    ((int32_t*)c->data)[3] = s0;
+    ((int32_t*)c->data)[4] = s1;
+    ((int32_t*)c->data)[5] = p0;
+    ((int32_t*)c->data)[6] = p1;
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_POOL_2D;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = c;
+
+    return result;
+}
+
@@ -7230,10 +7352,10 @@ struct ggml_tensor * ggml_flash_attn(
+    result->src[0] = q;
+    result->src[1] = k;
+    result->src[2] = v;
+    result->src[3] = ggml_new_i32(ctx, masked ? 1 : 0);

@@ -7261,11 +7383,11 @@ struct ggml_tensor * ggml_flash_ff(
+    result->src[0] = a;
+    result->src[1] = b0;
+    result->src[2] = b1;
+    result->src[3] = c0;
+    result->src[4] = c1;

@@ -7325,11 +7447,11 @@ struct ggml_tensor * ggml_flash_attn_back(
+    result->src[0] = q;
+    result->src[1] = k;
+    result->src[2] = v;
+    result->src[3] = d;
+    result->src[4] = ggml_new_i32(ctx, masked ? 1 : 0);

@@ -7374,9 +7496,9 @@ struct ggml_tensor * ggml_win_part(
+    result->src[0] = a;
+    result->src[1] = NULL;
+    result->src[2] = b;

@@ -7411,9 +7533,9 @@ struct ggml_tensor * ggml_win_unpart(
+    result->src[0] = a;
+    result->src[1] = NULL;
+    result->src[2] = b;

@@ -7442,8 +7564,8 @@ struct ggml_tensor * ggml_map_unary_impl_f32(
+    result->src[0] = a;
+    result->src[2] = addr_tensor;

@@ -7489,9 +7611,9 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = addr_tensor;

@@ -7536,8 +7658,8 @@ struct ggml_tensor * ggml_map_custom1_impl_f32(
+    result->src[0] = a;
+    result->src[2] = addr_tensor;

@@ -7581,9 +7703,9 @@ struct ggml_tensor * ggml_map_custom2_impl_f32(
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = addr_tensor;

@@ -7630,10 +7752,10 @@ struct ggml_tensor * ggml_map_custom3_impl_f32(
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = addr_tensor;
+    result->src[3] = c;

@@ -7673,8 +7795,8 @@ struct ggml_tensor * ggml_cross_entropy_loss(
+    result->src[0] = a;
+    result->src[1] = b;

@@ -7693,9 +7815,9 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
@@ -8296,7 +8418,7 @@ static void ggml_compute_forward_add_f32(
     const struct ggml_tensor * src0,
     const struct ggml_tensor * src1,
     struct ggml_tensor * dst) {
-    GGML_ASSERT(
+    GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;

@@ -8321,23 +8443,23 @@ static void ggml_compute_forward_add_f32(
     if (nb10 == sizeof(float)) {
         for (int ir = ir0; ir < ir1; ++ir) {
-            // src0
-            const
-            const
-            const
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
 
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
 
 #ifdef GGML_USE_ACCELERATE
-            vDSP_vadd(
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
-                    ne0);
+            vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
 #else
-            ggml_vec_add_f32(
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+            ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
 #endif
             // }
             // }

@@ -8345,15 +8467,20 @@ static void ggml_compute_forward_add_f32(
     } else {
         // src1 is not contiguous
         for (int ir = ir0; ir < ir1; ++ir) {
-            // src0
-            const
-            const
-            const
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
 
-            float * dst_ptr  = (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
             for (int i0 = 0; i0 < ne0; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data +
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
 
                 dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
             }
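The rewritten add kernel maps every output row back to a src1 row with a plain modulo, so src1 only needs to repeat along the higher dimensions. A small self-contained sketch of that index mapping, using assumed toy shapes rather than ggml tensors:

    #include <stdio.h>

    int main(void) {
        // Assumed shapes: src0/dst have ne01=4 rows per matrix, ne02=2, ne03=1;
        // src1 has ne11=4, ne12=1, ne13=1 and is therefore repeated across i2.
        const int ne01 = 4, ne02 = 2, ne11 = 4, ne12 = 1, ne13 = 1;
        for (int ir = 0; ir < ne01*ne02; ++ir) {
            const int i03 = ir/(ne02*ne01);
            const int i02 = (ir - i03*ne02*ne01)/ne01;
            const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
            // same modulo rule as the kernel above
            const int i13 = i03 % ne13;
            const int i12 = i02 % ne12;
            const int i11 = i01 % ne11;
            printf("dst row (%d,%d,%d) reads src1 row (%d,%d,%d)\n", i01, i02, i03, i11, i12, i13);
        }
        return 0;
    }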
@@ -10532,7 +10659,6 @@ static void ggml_compute_forward_rms_norm_back(
     }
 }
 
-
 // ggml_compute_forward_mul_mat
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)

@@ -10576,17 +10702,19 @@ static void ggml_compute_forward_mul_mat(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne03 == ne13);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
     const enum ggml_type type = src0->type;
 
+    const bool src1_cont = ggml_is_contiguous(src1);
+
     ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
     enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
     ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
 
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
     GGML_ASSERT(nb10 == sizeof(float));

@@ -10597,16 +10725,16 @@ static void ggml_compute_forward_mul_mat(
     GGML_ASSERT(nb1 <= nb2);
     GGML_ASSERT(nb2 <= nb3);
 
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
-
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
 #if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+        // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
+        //       ref: https://github.com/ggerganov/ggml/pull/224
+        GGML_ASSERT(ne02 == ne12);
+        GGML_ASSERT(ne03 == ne13);
+
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
         }

@@ -10616,6 +10744,11 @@ static void ggml_compute_forward_mul_mat(
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+        // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
+        //       ref: https://github.com/ggerganov/ggml/pull/224
+        GGML_ASSERT(ne02 == ne12);
+        GGML_ASSERT(ne03 == ne13);
+
         if (params->ith != 0) {
             return;
         }

@@ -10636,7 +10769,7 @@ static void ggml_compute_forward_mul_mat(
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
             if (type != GGML_TYPE_F32) {
-
+                float * const wdata = params->wdata;
                 ggml_to_float_t const to_float = type_traits[type].to_float;
 
                 size_t id = 0;

@@ -10685,43 +10818,52 @@ static void ggml_compute_forward_mul_mat(
         return;
     }
 
-    // parallelize by src0 rows
+    // parallelize by src0 rows
+    const int64_t dr = (ne01 + nth - 1)/nth;
 
-    const
+    const int64_t ir10 = dr*ith;
+    const int64_t ir11 = MIN(ir10 + dr, ne01);
 
-    // rows
-    const
+    // src1 rows
+    const int64_t nr1 = ne11*ne12*ne13;
 
-    const
-    const int ir1 = MIN(ir0 + dr, nr);
+    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
 
+    for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
+        const int64_t i13 = (ir1/(ne12*ne11));
+        const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
+        const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
 
+        const int64_t ir0 = (ir1/ne11)%(ne02*ne03);
+        const int64_t i03 = (ir0/(ne02));
+        // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2.
+        // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470:
+        // GG: this is likely the correct way to broadcast, though need some more thought
+        //     therefore leaving the comments to remind us for now
+        const int64_t i02 = (i12 / (ne12 / ne02));
+        // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon)
+        // const int64_t i02 = (ir0 - i03*ne02);
 
-        const
-        const
+        const int64_t i1 = i11;
+        const int64_t i2 = i12;
+        const int64_t i3 = i13;
 
-        const
-        const int i2 = i02;
-        const int i3 = i03;
+        const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
 
+        // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+        //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+        //       the original src1 data pointer, so we should index using the indices directly
+        // TODO: this is a bit of a hack, we should probably have a better way to handle this
+        const char * src1_col = (const char *) wdata +
+            (src1_cont || src1->type != vec_dot_type
+                ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
+                : (i11*nb11 + i12*nb12 + i13*nb13));
 
-        float * dst_col = (float *) ((char *) dst->data + (
+        float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
 
-        for (int64_t ic = 0; ic < ne11; ++ic) {
-            vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
+        for (int64_t ir = ir10; ir < ir11; ++ir) {
+            vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
         }
     }
 
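The rewritten mat-mul loop iterates over all src1 rows and lets each thread handle the src0 row range [ir10, ir11). A small sketch of that partitioning, independent of ggml's tensors and using assumed counts:

    #include <stdint.h>
    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
        // Assumed: ne01 src0 rows split across nth threads, as in the kernel above.
        const int64_t ne01 = 10, nth = 4;
        const int64_t dr = (ne01 + nth - 1)/nth;           // rows per thread, rounded up
        for (int64_t ith = 0; ith < nth; ++ith) {
            const int64_t ir10 = dr*ith;                   // first row for this thread
            const int64_t ir11 = MIN(ir10 + dr, ne01);     // one past the last row
            printf("thread %lld: rows [%lld, %lld)\n",
                   (long long) ith, (long long) ir10, (long long) ir11);
        }
        return 0;
    }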
@@ -11718,7 +11860,7 @@ static void ggml_compute_forward_alibi_f32(
     const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
     const int ne1 = src0->ne[1]; // seq_len_without_past
-
+    const int ne2 = src0->ne[2]; // n_head -> this is k
     //const int ne3 = src0->ne[3]; // 1 -> bsz
 
     const int n = ggml_nrows(src0);

@@ -11729,8 +11871,9 @@ static void ggml_compute_forward_alibi_f32(
     const int nb2 = src0->nb[2];
     //const int nb3 = src0->nb[3];
 
-
-
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(ne1 + n_past == ne0);
+    GGML_ASSERT(n_head == ne2);
 
     // add alibi to src0 (KQ_scaled)
     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));

@@ -11754,7 +11897,7 @@ static void ggml_compute_forward_alibi_f32(
                 m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
             }
 
-            pdst[0] =
+            pdst[0] = i * m_k + src[0];
 
         }
     }

@@ -11783,7 +11926,7 @@ static void ggml_compute_forward_alibi_f16(
     const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
     const int ne1 = src0->ne[1]; // seq_len_without_past
-
+    const int ne2 = src0->ne[2]; // n_head -> this is k
     //const int ne3 = src0->ne[3]; // 1 -> bsz
 
     const int n = ggml_nrows(src0);

@@ -11794,8 +11937,9 @@ static void ggml_compute_forward_alibi_f16(
     const int nb2 = src0->nb[2];
     //const int nb3 = src0->nb[3];
 
-
-
+    GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
+    GGML_ASSERT(n_head == ne2);
 
     // add alibi to src0 (KQ_scaled)
     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));

@@ -11820,7 +11964,7 @@ static void ggml_compute_forward_alibi_f16(
             }
 
             // we return F32
-            pdst[0] =
+            pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
         }
     }
 }
@@ -11948,16 +12092,21 @@ static void ggml_compute_forward_rope_f32(
|
|
11948
12092
|
const struct ggml_tensor * src1,
|
11949
12093
|
struct ggml_tensor * dst) {
|
11950
12094
|
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
11951
|
-
GGML_ASSERT(ggml_nelements(src1) ==
|
12095
|
+
GGML_ASSERT(ggml_nelements(src1) == 6);
|
11952
12096
|
|
11953
12097
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
11954
12098
|
return;
|
11955
12099
|
}
|
11956
12100
|
|
12101
|
+
float freq_base;
|
12102
|
+
float freq_scale;
|
12103
|
+
|
11957
12104
|
const int n_past = ((int32_t *) src1->data)[0];
|
11958
12105
|
const int n_dims = ((int32_t *) src1->data)[1];
|
11959
12106
|
const int mode = ((int32_t *) src1->data)[2];
|
11960
12107
|
const int n_ctx = ((int32_t *) src1->data)[3];
|
12108
|
+
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
|
12109
|
+
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
|
11961
12110
|
|
11962
12111
|
assert(n_past >= 0);
|
11963
12112
|
|
@@ -11986,7 +12135,7 @@ static void ggml_compute_forward_rope_f32(
|
|
11986
12135
|
// row index used to determine which thread to use
|
11987
12136
|
int ir = 0;
|
11988
12137
|
|
11989
|
-
const float theta_scale = powf(
|
12138
|
+
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
11990
12139
|
|
11991
12140
|
const bool is_neox = mode & 2;
|
11992
12141
|
const bool is_glm = mode & 4;
|
@@ -11998,7 +12147,7 @@ static void ggml_compute_forward_rope_f32(
|
|
11998
12147
|
if (ir++ < ir0) continue;
|
11999
12148
|
if (ir > ir1) break;
|
12000
12149
|
|
12001
|
-
float theta = (float)p;
|
12150
|
+
float theta = freq_scale * (float)p;
|
12002
12151
|
|
12003
12152
|
if (is_glm) {
|
12004
12153
|
theta = MIN(p, n_ctx - 2);
|
@@ -12075,16 +12224,21 @@ static void ggml_compute_forward_rope_f16(
|
|
12075
12224
|
const struct ggml_tensor * src1,
|
12076
12225
|
struct ggml_tensor * dst) {
|
12077
12226
|
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
12078
|
-
GGML_ASSERT(ggml_nelements(src1) ==
|
12227
|
+
GGML_ASSERT(ggml_nelements(src1) == 6);
|
12079
12228
|
|
12080
12229
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
12081
12230
|
return;
|
12082
12231
|
}
|
12083
12232
|
|
12233
|
+
float freq_base;
|
12234
|
+
float freq_scale;
|
12235
|
+
|
12084
12236
|
const int n_past = ((int32_t *) src1->data)[0];
|
12085
12237
|
const int n_dims = ((int32_t *) src1->data)[1];
|
12086
12238
|
const int mode = ((int32_t *) src1->data)[2];
|
12087
12239
|
const int n_ctx = ((int32_t *) src1->data)[3];
|
12240
|
+
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
|
12241
|
+
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
|
12088
12242
|
|
12089
12243
|
assert(n_past >= 0);
|
12090
12244
|
|
@@ -12113,7 +12267,7 @@ static void ggml_compute_forward_rope_f16(
|
|
12113
12267
|
// row index used to determine which thread to use
|
12114
12268
|
int ir = 0;
|
12115
12269
|
|
12116
|
-
const float theta_scale = powf(
|
12270
|
+
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
12117
12271
|
|
12118
12272
|
const bool is_neox = mode & 2;
|
12119
12273
|
const bool is_glm = mode & 4;
|
@@ -12125,7 +12279,7 @@ static void ggml_compute_forward_rope_f16(
|
|
12125
12279
|
if (ir++ < ir0) continue;
|
12126
12280
|
if (ir > ir1) break;
|
12127
12281
|
|
12128
|
-
float theta = (float)p;
|
12282
|
+
float theta = freq_scale * (float)p;
|
12129
12283
|
|
12130
12284
|
if (is_glm) {
|
12131
12285
|
theta = MIN(p, n_ctx - 2);
|
@@ -12186,7 +12340,7 @@ static void ggml_compute_forward_rope_f16(
|
|
12186
12340
|
const float x0 = GGML_FP16_TO_FP32(src[0]);
|
12187
12341
|
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
|
12188
12342
|
|
12189
|
-
dst_data[0]
|
12343
|
+
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
12190
12344
|
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
12191
12345
|
}
|
12192
12346
|
}
|
@@ -12225,7 +12379,7 @@ static void ggml_compute_forward_rope_back_f32(
|
|
12225
12379
|
const struct ggml_tensor * src1,
|
12226
12380
|
struct ggml_tensor * dst) {
|
12227
12381
|
assert(src1->type == GGML_TYPE_I32);
|
12228
|
-
assert(ggml_nelements(src1) ==
|
12382
|
+
assert(ggml_nelements(src1) == 4);
|
12229
12383
|
|
12230
12384
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
12231
12385
|
return;
|
@@ -12868,12 +13022,13 @@ static void ggml_compute_forward_conv_1d(
|
|
12868
13022
|
};
|
12869
13023
|
}
|
12870
13024
|
|
12871
|
-
//
|
13025
|
+
// ggml_compute_forward_conv_2d
|
12872
13026
|
|
12873
|
-
static void
|
13027
|
+
static void ggml_compute_forward_conv_2d_f16_f32(
|
12874
13028
|
const struct ggml_compute_params * params,
|
12875
13029
|
const struct ggml_tensor * src0,
|
12876
13030
|
const struct ggml_tensor * src1,
|
13031
|
+
const struct ggml_tensor * opt0,
|
12877
13032
|
struct ggml_tensor * dst) {
|
12878
13033
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
12879
13034
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
@@ -12893,11 +13048,17 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
|
|
12893
13048
|
// size of the convolution row - the kernel size unrolled across all channels
|
12894
13049
|
const int ew0 = nk0*nk1*ne02;
|
12895
13050
|
|
13051
|
+
const int32_t s0 = ((const int32_t*)(opt0->data))[0];
|
13052
|
+
const int32_t s1 = ((const int32_t*)(opt0->data))[1];
|
13053
|
+
const int32_t p0 = ((const int32_t*)(opt0->data))[2];
|
13054
|
+
const int32_t p1 = ((const int32_t*)(opt0->data))[3];
|
13055
|
+
const int32_t d0 = ((const int32_t*)(opt0->data))[4];
|
13056
|
+
const int32_t d1 = ((const int32_t*)(opt0->data))[5];
|
13057
|
+
|
12896
13058
|
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
12897
13059
|
GGML_ASSERT(nb10 == sizeof(float));
|
12898
13060
|
|
12899
13061
|
if (params->type == GGML_TASK_INIT) {
|
12900
|
-
// TODO: fix this memset (wsize is overestimated)
|
12901
13062
|
memset(params->wdata, 0, params->wsize);
|
12902
13063
|
|
12903
13064
|
// prepare source data (src1)
|
@@ -12912,8 +13073,13 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
|
|
12912
13073
|
for (int i0 = 0; i0 < ne0; i0++) {
|
12913
13074
|
for (int ik1 = 0; ik1 < nk1; ik1++) {
|
12914
13075
|
for (int ik0 = 0; ik0 < nk0; ik0++) {
|
12915
|
-
|
12916
|
-
|
13076
|
+
const int idx0 = i0*s0 + ik0*d0 - p0;
|
13077
|
+
const int idx1 = i1*s1 + ik1*d1 - p1;
|
13078
|
+
|
13079
|
+
if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
|
13080
|
+
dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
|
13081
|
+
GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
|
13082
|
+
}
|
12917
13083
|
}
|
12918
13084
|
}
|
12919
13085
|
}
|
@@ -12940,32 +13106,36 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
|
|
12940
13106
|
|
12941
13107
|
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
12942
13108
|
|
12943
|
-
for (int
|
12944
|
-
|
12945
|
-
|
12946
|
-
|
12947
|
-
for (int
|
12948
|
-
|
12949
|
-
|
12950
|
-
|
13109
|
+
for (int i3 = 0; i3 < ne3; i3++) {
|
13110
|
+
for (int i2 = ip0; i2 < ip1; i2++) {
|
13111
|
+
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2);
|
13112
|
+
|
13113
|
+
for (int i1 = 0; i1 < ne1; ++i1) {
|
13114
|
+
for (int i0 = 0; i0 < ne0; ++i0) {
|
13115
|
+
ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
|
13116
|
+
(ggml_fp16_t *) ((char *) src0->data + i2*nb03),
|
13117
|
+
(ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0);
|
13118
|
+
}
|
12951
13119
|
}
|
12952
13120
|
}
|
12953
13121
|
}
|
12954
13122
|
}
|
12955
13123
|
|
12956
|
-
static void
|
13124
|
+
static void ggml_compute_forward_conv_2d(
|
12957
13125
|
const struct ggml_compute_params * params,
|
12958
13126
|
const struct ggml_tensor * src0,
|
12959
13127
|
const struct ggml_tensor * src1,
|
12960
|
-
struct ggml_tensor *
|
13128
|
+
const struct ggml_tensor * opt0,
|
13129
|
+
struct ggml_tensor * dst
|
13130
|
+
) {
|
12961
13131
|
switch (src0->type) {
|
12962
13132
|
case GGML_TYPE_F16:
|
12963
13133
|
{
|
12964
|
-
|
13134
|
+
ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst);
|
12965
13135
|
} break;
|
12966
13136
|
case GGML_TYPE_F32:
|
12967
13137
|
{
|
12968
|
-
//
|
13138
|
+
//ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst);
|
12969
13139
|
GGML_ASSERT(false);
|
12970
13140
|
} break;
|
12971
13141
|
default:
|
@@ -12975,31 +13145,164 @@ static void ggml_compute_forward_conv_2d_sk_p0(
|
|
12975
13145
|
}
|
12976
13146
|
}
|
12977
13147
|
|
12978
|
-
//
|
13148
|
+
// ggml_compute_forward_pool_1d_sk_p0
|
12979
13149
|
|
12980
|
-
static void
|
13150
|
+
static void ggml_compute_forward_pool_1d_sk_p0(
|
13151
|
+
const struct ggml_compute_params * params,
|
13152
|
+
const enum ggml_op_pool op,
|
13153
|
+
const struct ggml_tensor * src,
|
13154
|
+
const int k,
|
13155
|
+
struct ggml_tensor * dst) {
|
13156
|
+
assert(src->type == GGML_TYPE_F32);
|
13157
|
+
assert(params->ith == 0);
|
13158
|
+
|
13159
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
13160
|
+
return;
|
13161
|
+
}
|
13162
|
+
|
13163
|
+
const char * cdata = (const char *)src->data;
|
13164
|
+
const char * const data_end = cdata + ggml_nbytes(src);
|
13165
|
+
float * drow = (float *)dst->data;
|
13166
|
+
|
13167
|
+
const int64_t rs = dst->ne[0];
|
13168
|
+
|
13169
|
+
while (cdata < data_end) {
|
13170
|
+
const float * const srow = (const float *)cdata;
|
13171
|
+
|
13172
|
+
int j = 0;
|
13173
|
+
|
13174
|
+
for (int64_t i = 0; i < rs; ++i) {
|
13175
|
+
switch (op) {
|
13176
|
+
case GGML_OP_POOL_AVG: drow[i] = 0; break;
|
13177
|
+
case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break;
|
13178
|
+
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
|
13179
|
+
}
|
13180
|
+
for (int ki = 0; ki < k; ++ki) {
|
13181
|
+
switch (op) {
|
13182
|
+
case GGML_OP_POOL_AVG: drow[i] += srow[j]; break;
|
13183
|
+
case GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break;
|
13184
|
+
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
|
13185
|
+
}
|
13186
|
+
++j;
|
13187
|
+
}
|
13188
|
+
switch (op) {
|
13189
|
+
case GGML_OP_POOL_AVG: drow[i] /= k; break;
|
13190
|
+
case GGML_OP_POOL_MAX: break;
|
13191
|
+
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
|
13192
|
+
}
|
13193
|
+
}
|
13194
|
+
|
13195
|
+
cdata += src->nb[1];
|
13196
|
+
drow += rs;
|
13197
|
+
}
|
13198
|
+
}
|
13199
|
+
|
13200
|
+
// ggml_compute_forward_pool_1d
|
13201
|
+
|
13202
|
+
static void ggml_compute_forward_pool_1d(
|
12981
13203
|
const struct ggml_compute_params* params,
|
12982
13204
|
const struct ggml_tensor* src0,
|
12983
|
-
const struct ggml_tensor* src1,
|
12984
13205
|
const struct ggml_tensor* opt0,
|
12985
13206
|
struct ggml_tensor* dst) {
|
12986
|
-
|
12987
|
-
const
|
12988
|
-
|
12989
|
-
const
|
12990
|
-
const
|
12991
|
-
const
|
12992
|
-
GGML_ASSERT(d0 == 1); // dilation not supported
|
12993
|
-
GGML_ASSERT(d1 == 1);
|
13207
|
+
GGML_ASSERT(opt0->ne[0] == 4);
|
13208
|
+
const int* opts = (const int*)opt0->data;
|
13209
|
+
enum ggml_op_pool op = opts[0];
|
13210
|
+
const int k0 = opts[1];
|
13211
|
+
const int s0 = opts[2];
|
13212
|
+
const int p0 = opts[3];
|
12994
13213
|
GGML_ASSERT(p0 == 0); // padding not supported
|
12995
|
-
GGML_ASSERT(
|
13214
|
+
GGML_ASSERT(k0 == s0); // only s = k supported
|
13215
|
+
|
13216
|
+
ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
|
13217
|
+
}
|
12996
13218
|
|
12997
|
-
|
12998
|
-
|
13219
|
+
// ggml_compute_forward_pool_2d_sk_p0
|
13220
|
+
|
13221
|
+
static void ggml_compute_forward_pool_2d_sk_p0(
|
13222
|
+
const struct ggml_compute_params * params,
|
13223
|
+
const enum ggml_op_pool op,
|
13224
|
+
const struct ggml_tensor * src,
|
13225
|
+
const int k0,
|
13226
|
+
const int k1,
|
13227
|
+
struct ggml_tensor * dst) {
|
13228
|
+
assert(src->type == GGML_TYPE_F32);
|
13229
|
+
assert(params->ith == 0);
|
13230
|
+
|
13231
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
13232
|
+
return;
|
13233
|
+
}
|
13234
|
+
|
13235
|
+
const char * cdata = (const char*)src->data;
|
13236
|
+
const char * const data_end = cdata + ggml_nbytes(src);
|
13237
|
+
|
13238
|
+
const int64_t px = dst->ne[0];
|
13239
|
+
const int64_t py = dst->ne[1];
|
13240
|
+
const int64_t pa = px * py;
|
13241
|
+
|
13242
|
+
float * dplane = (float *)dst->data;
|
13243
|
+
|
13244
|
+
const int ka = k0 * k1;
|
13245
|
+
|
13246
|
+
while (cdata < data_end) {
|
13247
|
+
for (int oy = 0; oy < py; ++oy) {
|
13248
|
+
float * const drow = dplane + oy * px;
|
13249
|
+
for (int ox = 0; ox < px; ++ox) {
|
13250
|
+
float * const out = drow + ox;
|
13251
|
+
switch (op) {
|
13252
|
+
case GGML_OP_POOL_AVG: *out = 0; break;
|
13253
|
+
case GGML_OP_POOL_MAX: *out = -FLT_MAX; break;
|
13254
|
+
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
|
13255
|
+
}
|
13256
|
+
|
13257
|
+
const int ix = ox * k0;
|
13258
|
+
const int iy = oy * k1;
|
13259
|
+
|
13260
|
+
for (int ky = 0; ky < k1; ++ky) {
|
13261
|
+
const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
|
13262
|
+
for (int kx = 0; kx < k0; ++kx) {
|
13263
|
+
int j = ix + kx;
|
13264
|
+
switch (op) {
|
13265
|
+
case GGML_OP_POOL_AVG: *out += srow[j]; break;
|
13266
|
+
case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break;
|
13267
|
+
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
|
13268
|
+
}
|
13269
|
+
}
|
13270
|
+
}
|
13271
|
+
switch (op) {
|
13272
|
+
case GGML_OP_POOL_AVG: *out /= ka; break;
|
13273
|
+
case GGML_OP_POOL_MAX: break;
|
13274
|
+
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
|
13275
|
+
}
|
13276
|
+
}
|
13277
|
+
}
|
13278
|
+
|
13279
|
+
cdata += src->nb[2];
|
13280
|
+
dplane += pa;
|
12999
13281
|
}
|
13000
|
-
|
13001
|
-
|
13002
|
-
|
13282
|
+
}
|
13283
|
+
|
13284
|
+
// ggml_compute_forward_pool_2d
|
13285
|
+
|
13286
|
+
static void ggml_compute_forward_pool_2d(
|
13287
|
+
const struct ggml_compute_params * params,
|
13288
|
+
const struct ggml_tensor * src0,
|
13289
|
+
const struct ggml_tensor * opt0,
|
13290
|
+
struct ggml_tensor * dst) {
|
13291
|
+
GGML_ASSERT(opt0->ne[0] == 7);
|
13292
|
+
const int* opts = (const int*)opt0->data;
|
13293
|
+
enum ggml_op_pool op = opts[0];
|
13294
|
+
const int k0 = opts[1];
|
13295
|
+
const int k1 = opts[2];
|
13296
|
+
const int s0 = opts[3];
|
13297
|
+
const int s1 = opts[4];
|
13298
|
+
const int p0 = opts[5];
|
13299
|
+
const int p1 = opts[6];
|
13300
|
+
GGML_ASSERT(p0 == 0);
|
13301
|
+
GGML_ASSERT(p1 == 0); // padding not supported
|
13302
|
+
GGML_ASSERT(k0 == s0);
|
13303
|
+
GGML_ASSERT(k1 == s1); // only s = k supported
|
13304
|
+
|
13305
|
+
ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
|
13003
13306
|
}
|
13004
13307
|
|
13005
13308
|
|
@@ -14566,287 +14869,295 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
14566
14869
|
if (skip_cpu) {
|
14567
14870
|
return;
|
14568
14871
|
}
|
14569
|
-
GGML_ASSERT(tensor->
|
14570
|
-
GGML_ASSERT(tensor->
|
14872
|
+
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
|
14873
|
+
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
|
14571
14874
|
#endif // GGML_USE_CUBLAS
|
14572
14875
|
|
14573
14876
|
switch (tensor->op) {
|
14574
14877
|
case GGML_OP_DUP:
|
14575
14878
|
{
|
14576
|
-
ggml_compute_forward_dup(params, tensor->
|
14879
|
+
ggml_compute_forward_dup(params, tensor->src[0], tensor);
|
14577
14880
|
} break;
|
14578
14881
|
case GGML_OP_ADD:
|
14579
14882
|
{
|
14580
|
-
ggml_compute_forward_add(params, tensor->
|
14883
|
+
ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor);
|
14581
14884
|
} break;
|
14582
14885
|
case GGML_OP_ADD1:
|
14583
14886
|
{
|
14584
|
-
ggml_compute_forward_add1(params, tensor->
|
14887
|
+
ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor);
|
14585
14888
|
} break;
|
14586
14889
|
case GGML_OP_ACC:
|
14587
14890
|
{
|
14588
|
-
ggml_compute_forward_acc(params, tensor->
|
14891
|
+
ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
|
14589
14892
|
} break;
|
14590
14893
|
case GGML_OP_SUB:
|
14591
14894
|
{
|
14592
|
-
ggml_compute_forward_sub(params, tensor->
|
14895
|
+
ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor);
|
14593
14896
|
} break;
|
14594
14897
|
case GGML_OP_MUL:
|
14595
14898
|
{
|
14596
|
-
ggml_compute_forward_mul(params, tensor->
|
14899
|
+
ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor);
|
14597
14900
|
} break;
|
14598
14901
|
case GGML_OP_DIV:
|
14599
14902
|
{
|
14600
|
-
ggml_compute_forward_div(params, tensor->
|
14903
|
+
ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor);
|
14601
14904
|
} break;
|
14602
14905
|
case GGML_OP_SQR:
|
14603
14906
|
{
|
14604
|
-
ggml_compute_forward_sqr(params, tensor->
|
14907
|
+
ggml_compute_forward_sqr(params, tensor->src[0], tensor);
|
14605
14908
|
} break;
|
14606
14909
|
case GGML_OP_SQRT:
|
14607
14910
|
{
|
14608
|
-
ggml_compute_forward_sqrt(params, tensor->
|
14911
|
+
ggml_compute_forward_sqrt(params, tensor->src[0], tensor);
|
14609
14912
|
} break;
|
14610
14913
|
case GGML_OP_LOG:
|
14611
14914
|
{
|
14612
|
-
ggml_compute_forward_log(params, tensor->
|
14915
|
+
ggml_compute_forward_log(params, tensor->src[0], tensor);
|
14613
14916
|
} break;
|
14614
14917
|
case GGML_OP_SUM:
|
14615
14918
|
{
|
14616
|
-
ggml_compute_forward_sum(params, tensor->
|
14919
|
+
ggml_compute_forward_sum(params, tensor->src[0], tensor);
|
14617
14920
|
} break;
|
14618
14921
|
case GGML_OP_SUM_ROWS:
|
14619
14922
|
{
|
14620
|
-
ggml_compute_forward_sum_rows(params, tensor->
|
14923
|
+
ggml_compute_forward_sum_rows(params, tensor->src[0], tensor);
|
14621
14924
|
} break;
|
14622
14925
|
case GGML_OP_MEAN:
|
14623
14926
|
{
|
14624
|
-
ggml_compute_forward_mean(params, tensor->
|
14927
|
+
ggml_compute_forward_mean(params, tensor->src[0], tensor);
|
14625
14928
|
} break;
|
14626
14929
|
case GGML_OP_ARGMAX:
|
14627
14930
|
{
|
14628
|
-
ggml_compute_forward_argmax(params, tensor->
|
14931
|
+
ggml_compute_forward_argmax(params, tensor->src[0], tensor);
|
14629
14932
|
} break;
|
14630
14933
|
case GGML_OP_REPEAT:
|
14631
14934
|
{
|
14632
|
-
ggml_compute_forward_repeat(params, tensor->
|
14935
|
+
ggml_compute_forward_repeat(params, tensor->src[0], tensor);
|
14633
14936
|
} break;
|
14634
14937
|
case GGML_OP_REPEAT_BACK:
|
14635
14938
|
{
|
14636
|
-
ggml_compute_forward_repeat_back(params, tensor->
|
14939
|
+
ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
|
14637
14940
|
} break;
|
14638
14941
|
case GGML_OP_ABS:
|
14639
14942
|
{
|
14640
|
-
ggml_compute_forward_abs(params, tensor->
|
14943
|
+
ggml_compute_forward_abs(params, tensor->src[0], tensor);
|
14641
14944
|
} break;
|
14642
14945
|
case GGML_OP_SGN:
|
14643
14946
|
{
|
14644
|
-
ggml_compute_forward_sgn(params, tensor->
|
14947
|
+
ggml_compute_forward_sgn(params, tensor->src[0], tensor);
|
14645
14948
|
} break;
|
14646
14949
|
case GGML_OP_NEG:
|
14647
14950
|
{
|
14648
|
-
ggml_compute_forward_neg(params, tensor->
|
14951
|
+
ggml_compute_forward_neg(params, tensor->src[0], tensor);
|
14649
14952
|
} break;
|
14650
14953
|
case GGML_OP_STEP:
|
14651
14954
|
{
|
14652
|
-
ggml_compute_forward_step(params, tensor->
|
14955
|
+
ggml_compute_forward_step(params, tensor->src[0], tensor);
|
14653
14956
|
} break;
|
14654
14957
|
case GGML_OP_TANH:
|
14655
14958
|
{
|
14656
|
-
ggml_compute_forward_tanh(params, tensor->
|
14959
|
+
ggml_compute_forward_tanh(params, tensor->src[0], tensor);
|
14657
14960
|
} break;
|
14658
14961
|
case GGML_OP_ELU:
|
14659
14962
|
{
|
14660
|
-
ggml_compute_forward_elu(params, tensor->
|
14963
|
+
ggml_compute_forward_elu(params, tensor->src[0], tensor);
|
14661
14964
|
} break;
|
14662
14965
|
case GGML_OP_RELU:
|
14663
14966
|
{
|
14664
|
-
ggml_compute_forward_relu(params, tensor->
|
14967
|
+
ggml_compute_forward_relu(params, tensor->src[0], tensor);
|
14665
14968
|
} break;
|
14666
14969
|
case GGML_OP_GELU:
|
14667
14970
|
{
|
14668
|
-
ggml_compute_forward_gelu(params, tensor->
|
14971
|
+
ggml_compute_forward_gelu(params, tensor->src[0], tensor);
|
14669
14972
|
} break;
|
14670
14973
|
case GGML_OP_GELU_QUICK:
|
14671
14974
|
{
|
14672
|
-
ggml_compute_forward_gelu_quick(params, tensor->
|
14975
|
+
ggml_compute_forward_gelu_quick(params, tensor->src[0], tensor);
|
14673
14976
|
} break;
|
14674
14977
|
case GGML_OP_SILU:
|
14675
14978
|
{
|
14676
|
-
ggml_compute_forward_silu(params, tensor->
|
14979
|
+
ggml_compute_forward_silu(params, tensor->src[0], tensor);
|
14677
14980
|
} break;
|
14678
14981
|
case GGML_OP_SILU_BACK:
|
14679
14982
|
{
|
14680
|
-
ggml_compute_forward_silu_back(params, tensor->
|
14983
|
+
ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
|
14681
14984
|
} break;
|
14682
14985
|
case GGML_OP_NORM:
|
14683
14986
|
{
|
14684
|
-
ggml_compute_forward_norm(params, tensor->
|
14987
|
+
ggml_compute_forward_norm(params, tensor->src[0], tensor);
|
14685
14988
|
} break;
|
14686
14989
|
case GGML_OP_RMS_NORM:
|
14687
14990
|
{
|
14688
|
-
ggml_compute_forward_rms_norm(params, tensor->
|
14991
|
+
ggml_compute_forward_rms_norm(params, tensor->src[0], tensor);
|
14689
14992
|
} break;
|
14690
14993
|
case GGML_OP_RMS_NORM_BACK:
|
14691
14994
|
{
|
14692
|
-
ggml_compute_forward_rms_norm_back(params, tensor->
|
14995
|
+
ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
|
14693
14996
|
} break;
|
14694
14997
|
case GGML_OP_MUL_MAT:
|
14695
14998
|
{
|
14696
|
-
ggml_compute_forward_mul_mat(params, tensor->
|
14999
|
+
ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
|
14697
15000
|
} break;
|
14698
15001
|
case GGML_OP_OUT_PROD:
|
14699
15002
|
{
|
14700
|
-
ggml_compute_forward_out_prod(params, tensor->
|
15003
|
+
ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
|
14701
15004
|
} break;
|
14702
15005
|
case GGML_OP_SCALE:
|
14703
15006
|
{
|
14704
|
-
ggml_compute_forward_scale(params, tensor->
|
15007
|
+
ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor);
|
14705
15008
|
} break;
|
14706
15009
|
case GGML_OP_SET:
|
14707
15010
|
{
|
14708
|
-
ggml_compute_forward_set(params, tensor->
|
15011
|
+
ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
|
14709
15012
|
} break;
|
14710
15013
|
case GGML_OP_CPY:
|
14711
15014
|
{
|
14712
|
-
ggml_compute_forward_cpy(params, tensor->
|
15015
|
+
ggml_compute_forward_cpy(params, tensor->src[0], tensor);
|
14713
15016
|
} break;
|
14714
15017
|
case GGML_OP_CONT:
|
14715
15018
|
{
|
14716
|
-
ggml_compute_forward_cont(params, tensor->
|
15019
|
+
ggml_compute_forward_cont(params, tensor->src[0], tensor);
|
14717
15020
|
} break;
|
14718
15021
|
case GGML_OP_RESHAPE:
|
14719
15022
|
{
|
14720
|
-
ggml_compute_forward_reshape(params, tensor->
|
15023
|
+
ggml_compute_forward_reshape(params, tensor->src[0], tensor);
|
14721
15024
|
} break;
|
14722
15025
|
case GGML_OP_VIEW:
|
14723
15026
|
{
|
14724
|
-
ggml_compute_forward_view(params, tensor->
|
15027
|
+
ggml_compute_forward_view(params, tensor->src[0]);
|
14725
15028
|
} break;
|
14726
15029
|
case GGML_OP_PERMUTE:
|
14727
15030
|
{
|
14728
|
-
ggml_compute_forward_permute(params, tensor->
|
15031
|
+
ggml_compute_forward_permute(params, tensor->src[0]);
|
14729
15032
|
} break;
|
14730
15033
|
case GGML_OP_TRANSPOSE:
|
14731
15034
|
{
|
14732
|
-
ggml_compute_forward_transpose(params, tensor->
|
15035
|
+
ggml_compute_forward_transpose(params, tensor->src[0]);
|
14733
15036
|
} break;
|
14734
15037
|
case GGML_OP_GET_ROWS:
|
14735
15038
|
{
|
14736
|
-
ggml_compute_forward_get_rows(params, tensor->
|
15039
|
+
ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor);
|
14737
15040
|
} break;
|
14738
15041
|
case GGML_OP_GET_ROWS_BACK:
|
14739
15042
|
{
|
14740
|
-
ggml_compute_forward_get_rows_back(params, tensor->
|
15043
|
+
ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
|
14741
15044
|
} break;
|
14742
15045
|
case GGML_OP_DIAG:
|
14743
15046
|
{
|
14744
|
-
ggml_compute_forward_diag(params, tensor->
|
15047
|
+
ggml_compute_forward_diag(params, tensor->src[0], tensor);
|
14745
15048
|
} break;
|
14746
15049
|
case GGML_OP_DIAG_MASK_INF:
|
14747
15050
|
{
|
14748
|
-
ggml_compute_forward_diag_mask_inf(params, tensor->
|
15051
|
+
ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor->src[1], tensor);
|
14749
15052
|
} break;
|
14750
15053
|
case GGML_OP_DIAG_MASK_ZERO:
|
14751
15054
|
{
|
14752
|
-
ggml_compute_forward_diag_mask_zero(params, tensor->
|
15055
|
+
ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor->src[1], tensor);
|
14753
15056
|
} break;
|
14754
15057
|
case GGML_OP_SOFT_MAX:
|
14755
15058
|
{
|
14756
|
-
ggml_compute_forward_soft_max(params, tensor->
|
15059
|
+
ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
|
14757
15060
|
} break;
|
14758
15061
|
case GGML_OP_SOFT_MAX_BACK:
|
14759
15062
|
{
|
14760
|
-
ggml_compute_forward_soft_max_back(params, tensor->
|
15063
|
+
ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor);
|
14761
15064
|
} break;
|
14762
15065
|
case GGML_OP_ROPE:
|
14763
15066
|
{
|
14764
|
-
ggml_compute_forward_rope(params, tensor->
|
15067
|
+
ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor);
|
14765
15068
|
} break;
|
14766
15069
|
case GGML_OP_ROPE_BACK:
|
14767
15070
|
{
|
14768
|
-
ggml_compute_forward_rope_back(params, tensor->
|
15071
|
+
ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor);
|
14769
15072
|
} break;
|
14770
15073
|
case GGML_OP_ALIBI:
|
14771
15074
|
{
|
14772
|
-
ggml_compute_forward_alibi(params, tensor->
|
15075
|
+
ggml_compute_forward_alibi(params, tensor->src[0], tensor->src[1], tensor);
|
14773
15076
|
} break;
|
14774
15077
|
case GGML_OP_CLAMP:
|
14775
15078
|
{
|
14776
|
-
ggml_compute_forward_clamp(params, tensor->
|
15079
|
+
ggml_compute_forward_clamp(params, tensor->src[0], tensor->src[1], tensor);
|
14777
15080
|
} break;
|
14778
15081
|
case GGML_OP_CONV_1D:
|
14779
15082
|
{
|
14780
|
-
ggml_compute_forward_conv_1d(params, tensor->
|
15083
|
+
ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
|
14781
15084
|
} break;
|
14782
15085
|
case GGML_OP_CONV_2D:
|
14783
15086
|
{
|
14784
|
-
ggml_compute_forward_conv_2d(params, tensor->
|
15087
|
+
ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
|
15088
|
+
} break;
|
15089
|
+
case GGML_OP_POOL_1D:
|
15090
|
+
{
|
15091
|
+
ggml_compute_forward_pool_1d(params, tensor->src[0], tensor->src[1], tensor);
|
15092
|
+
} break;
|
15093
|
+
case GGML_OP_POOL_2D:
|
15094
|
+
{
|
15095
|
+
ggml_compute_forward_pool_2d(params, tensor->src[0], tensor->src[1], tensor);
|
14785
15096
|
} break;
|
14786
15097
|
case GGML_OP_FLASH_ATTN:
|
14787
15098
|
{
|
14788
|
-
const int32_t t = ggml_get_i32_1d(tensor->
|
15099
|
+
const int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
|
14789
15100
|
GGML_ASSERT(t == 0 || t == 1);
|
14790
15101
|
const bool masked = t != 0;
|
14791
|
-
ggml_compute_forward_flash_attn(params, tensor->
|
15102
|
+
ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
|
14792
15103
|
} break;
|
14793
15104
|
case GGML_OP_FLASH_FF:
|
14794
15105
|
{
|
14795
|
-
ggml_compute_forward_flash_ff(params, tensor->
|
15106
|
+
ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor);
|
14796
15107
|
} break;
|
14797
15108
|
case GGML_OP_FLASH_ATTN_BACK:
|
14798
15109
|
{
|
14799
|
-
int32_t t = ggml_get_i32_1d(tensor->
|
15110
|
+
int32_t t = ggml_get_i32_1d(tensor->src[4], 0);
|
14800
15111
|
GGML_ASSERT(t == 0 || t == 1);
|
14801
15112
|
bool masked = t != 0;
|
14802
|
-
ggml_compute_forward_flash_attn_back(params, tensor->
|
15113
|
+
ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
|
14803
15114
|
} break;
|
14804
15115
|
case GGML_OP_WIN_PART:
|
14805
15116
|
{
|
14806
|
-
ggml_compute_forward_win_part(params, tensor->
|
15117
|
+
ggml_compute_forward_win_part(params, tensor->src[0], tensor->src[2], tensor);
|
14807
15118
|
} break;
|
14808
15119
|
case GGML_OP_WIN_UNPART:
|
14809
15120
|
{
|
14810
|
-
ggml_compute_forward_win_unpart(params, tensor->
|
15121
|
+
ggml_compute_forward_win_unpart(params, tensor->src[0], tensor->src[2], tensor);
|
14811
15122
|
} break;
|
14812
15123
|
case GGML_OP_MAP_UNARY:
|
14813
15124
|
{
|
14814
|
-
const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->
|
14815
|
-
ggml_compute_forward_map_unary(params, tensor->
|
15125
|
+
const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->src[2]->data);
|
15126
|
+
ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun);
|
14816
15127
|
}
|
14817
15128
|
break;
|
14818
15129
|
case GGML_OP_MAP_BINARY:
|
14819
15130
|
{
|
14820
|
-
const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->
|
14821
|
-
ggml_compute_forward_map_binary(params, tensor->
|
15131
|
+
const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->src[2]->data);
|
15132
|
+
ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
|
14822
15133
|
}
|
14823
15134
|
break;
|
14824
15135
|
case GGML_OP_MAP_CUSTOM1:
|
14825
15136
|
{
|
14826
|
-
const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->
|
14827
|
-
ggml_compute_forward_map_custom1(params, tensor->
|
15137
|
+
const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->src[2]->data);
|
15138
|
+
ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun);
|
14828
15139
|
}
|
14829
15140
|
break;
|
14830
15141
|
case GGML_OP_MAP_CUSTOM2:
|
14831
15142
|
{
|
14832
|
-
const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->
|
14833
|
-
ggml_compute_forward_map_custom2(params, tensor->
|
15143
|
+
const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->src[2]->data);
|
15144
|
+
ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun);
|
14834
15145
|
}
|
14835
15146
|
break;
|
14836
15147
|
case GGML_OP_MAP_CUSTOM3:
|
14837
15148
|
{
|
14838
|
-
const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->
|
14839
|
-
ggml_compute_forward_map_custom3(params, tensor->
|
15149
|
+
const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->src[2]->data);
|
15150
|
+
ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[3], tensor, fun);
|
14840
15151
|
}
|
14841
15152
|
break;
|
14842
15153
|
case GGML_OP_CROSS_ENTROPY_LOSS:
|
14843
15154
|
{
|
14844
|
-
ggml_compute_forward_cross_entropy_loss(params, tensor->
|
15155
|
+
ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor);
|
14845
15156
|
}
|
14846
15157
|
break;
|
14847
15158
|
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
14848
15159
|
{
|
14849
|
-
ggml_compute_forward_cross_entropy_loss_back(params, tensor->
|
15160
|
+
ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
|
14850
15161
|
}
|
14851
15162
|
break;
|
14852
15163
|
case GGML_OP_NONE:
|
@@ -14863,8 +15174,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
14863
15174
|
////////////////////////////////////////////////////////////////////////////////
|
14864
15175
|
|
14865
15176
|
static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {
|
14866
|
-
struct ggml_tensor * src0 = tensor->
|
14867
|
-
struct ggml_tensor * src1 = tensor->
|
15177
|
+
struct ggml_tensor * src0 = tensor->src[0];
|
15178
|
+
struct ggml_tensor * src1 = tensor->src[1];
|
14868
15179
|
|
14869
15180
|
switch (tensor->op) {
|
14870
15181
|
case GGML_OP_DUP:
|
@@ -14900,12 +15211,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
14900
15211
|
src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
|
14901
15212
|
}
|
14902
15213
|
if (src1->grad) {
|
14903
|
-
GGML_ASSERT(ggml_nelements(tensor->
|
14904
|
-
GGML_ASSERT(tensor->
|
14905
|
-
const size_t nb1 = (( int32_t * ) tensor->
|
14906
|
-
const size_t nb2 = (( int32_t * ) tensor->
|
14907
|
-
const size_t nb3 = (( int32_t * ) tensor->
|
14908
|
-
const size_t offset = (( int32_t * ) tensor->
|
15214
|
+
GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5);
|
15215
|
+
GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32);
|
15216
|
+
const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0];
|
15217
|
+
const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1];
|
15218
|
+
const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2];
|
15219
|
+
const size_t offset = (( int32_t * ) tensor->src[2]->data)[3];
|
14909
15220
|
|
14910
15221
|
struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
|
14911
15222
|
tensor->grad,
|
@@ -15213,12 +15524,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
15213
15524
|
} break;
|
15214
15525
|
case GGML_OP_SET:
|
15215
15526
|
{
|
15216
|
-
GGML_ASSERT(ggml_nelements(tensor->
|
15217
|
-
GGML_ASSERT(tensor->
|
15218
|
-
const size_t nb1 = (( int32_t * ) tensor->
|
15219
|
-
const size_t nb2 = (( int32_t * ) tensor->
|
15220
|
-
const size_t nb3 = (( int32_t * ) tensor->
|
15221
|
-
const size_t offset = (( int32_t * ) tensor->
|
15527
|
+
GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5);
|
15528
|
+
GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32);
|
15529
|
+
const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0];
|
15530
|
+
const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1];
|
15531
|
+
const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2];
|
15532
|
+
const size_t offset = (( int32_t * ) tensor->src[2]->data)[3];
|
15222
15533
|
|
15223
15534
|
struct ggml_tensor * tensor_grad_view = NULL;
|
15224
15535
|
|
@@ -15295,8 +15606,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
15295
15606
|
if (src0->grad) {
|
15296
15607
|
size_t offset;
|
15297
15608
|
|
15298
|
-
GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->
|
15299
|
-
memcpy(&offset, tensor->
|
15609
|
+
GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->src[2]));
|
15610
|
+
memcpy(&offset, tensor->src[2]->data, sizeof(offset));
|
15300
15611
|
|
15301
15612
|
size_t nb1 = tensor->nb[1];
|
15302
15613
|
size_t nb2 = tensor->nb[2];
|
@@ -15323,7 +15634,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
15323
15634
|
{
|
15324
15635
|
// necessary for llama
|
15325
15636
|
if (src0->grad) {
|
15326
|
-
int32_t * axes = (int32_t *) tensor->
|
15637
|
+
int32_t * axes = (int32_t *) tensor->src[2]->data;
|
15327
15638
|
int axis0 = axes[0] & 0x3;
|
15328
15639
|
int axis1 = axes[1] & 0x3;
|
15329
15640
|
int axis2 = axes[2] & 0x3;
|
@@ -15427,17 +15738,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
15427
15738
|
// necessary for llama
|
15428
15739
|
if (src0->grad) {
|
15429
15740
|
assert(src1->type == GGML_TYPE_I32);
|
15430
|
-
assert(ggml_nelements(src1) ==
|
15741
|
+
assert(ggml_nelements(src1) == 6);
|
15431
15742
|
const int n_past = ((int32_t *) src1->data)[0];
|
15432
15743
|
const int n_dims = ((int32_t *) src1->data)[1];
|
15433
15744
|
const int mode = ((int32_t *) src1->data)[2];
|
15745
|
+
const int n_ctx = ((int32_t *) src1->data)[3];
|
15434
15746
|
src0->grad = ggml_add_impl(ctx,
|
15435
15747
|
src0->grad,
|
15436
15748
|
ggml_rope_back(ctx,
|
15437
15749
|
tensor->grad,
|
15438
15750
|
n_past,
|
15439
15751
|
n_dims,
|
15440
|
-
mode
|
15752
|
+
mode,
|
15753
|
+
n_ctx),
|
15441
15754
|
inplace);
|
15442
15755
|
}
|
15443
15756
|
if (src1->grad) {
|
@@ -15483,18 +15796,26 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
15483
15796
|
{
|
15484
15797
|
GGML_ASSERT(false); // TODO: not implemented
|
15485
15798
|
} break;
|
15799
|
+
case GGML_OP_POOL_1D:
|
15800
|
+
{
|
15801
|
+
GGML_ASSERT(false); // TODO: not implemented
|
15802
|
+
} break;
|
15803
|
+
case GGML_OP_POOL_2D:
|
15804
|
+
{
|
15805
|
+
GGML_ASSERT(false); // TODO: not implemented
|
15806
|
+
} break;
|
15486
15807
|
case GGML_OP_FLASH_ATTN:
|
15487
15808
|
{
|
15488
15809
|
struct ggml_tensor * flash_grad = NULL;
|
15489
|
-
if (src0->grad || src1->grad || tensor->
|
15490
|
-
int32_t t = ggml_get_i32_1d(tensor->
|
15810
|
+
if (src0->grad || src1->grad || tensor->src[2]->grad) {
|
15811
|
+
int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
|
15491
15812
|
GGML_ASSERT(t == 0 || t == 1);
|
15492
15813
|
bool masked = t != 0;
|
15493
15814
|
flash_grad =
|
15494
15815
|
ggml_flash_attn_back(ctx,
|
15495
15816
|
src0,
|
15496
15817
|
src1,
|
15497
|
-
tensor->
|
15818
|
+
tensor->src[2],
|
15498
15819
|
tensor->grad,
|
15499
15820
|
masked);
|
15500
15821
|
}
|
@@ -15591,7 +15912,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
15591
15912
|
inplace);
|
15592
15913
|
}
|
15593
15914
|
|
15594
|
-
struct ggml_tensor * opt0 = tensor->
|
15915
|
+
struct ggml_tensor * opt0 = tensor->src[2];
|
15595
15916
|
|
15596
15917
|
if (opt0->grad) {
|
15597
15918
|
struct ggml_tensor * grad_v = NULL;
|
@@ -15707,17 +16028,9 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
|
|
15707
16028
|
}
|
15708
16029
|
}
|
15709
16030
|
|
15710
|
-
|
15711
|
-
|
15712
|
-
|
15713
|
-
|
15714
|
-
if (node->src1) {
|
15715
|
-
ggml_visit_parents(cgraph, node->src1);
|
15716
|
-
}
|
15717
|
-
|
15718
|
-
for (int i = 0; i < GGML_MAX_OPT; ++i) {
|
15719
|
-
if (node->opt[i]) {
|
15720
|
-
ggml_visit_parents(cgraph, node->opt[i]);
|
16031
|
+
for (int i = 0; i < GGML_MAX_SRC; ++i) {
|
16032
|
+
if (node->src[i]) {
|
16033
|
+
ggml_visit_parents(cgraph, node->src[i]);
|
15721
16034
|
}
|
15722
16035
|
}
|
15723
16036
|
|
@@ -15772,9 +16085,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
|
|
15772
16085
|
struct ggml_cgraph result = {
|
15773
16086
|
/*.n_nodes =*/ 0,
|
15774
16087
|
/*.n_leafs =*/ 0,
|
15775
|
-
/*.n_threads =*/ GGML_DEFAULT_N_THREADS,
|
15776
|
-
/*.work_size =*/ 0,
|
15777
|
-
/*.work =*/ NULL,
|
15778
16088
|
/*.nodes =*/ { NULL },
|
15779
16089
|
/*.grads =*/ { NULL },
|
15780
16090
|
/*.leafs =*/ { NULL },
|
@@ -15945,16 +16255,20 @@ void clear_numa_thread_affinity(void) {}
|
|
15945
16255
|
#endif
|
15946
16256
|
|
15947
16257
|
struct ggml_compute_state_shared {
|
15948
|
-
struct ggml_cgraph * cgraph;
|
16258
|
+
const struct ggml_cgraph * cgraph;
|
16259
|
+
const struct ggml_cplan * cplan;
|
15949
16260
|
|
15950
16261
|
int64_t perf_node_start_cycles;
|
15951
16262
|
int64_t perf_node_start_time_us;
|
15952
16263
|
|
15953
|
-
int n_threads;
|
16264
|
+
const int n_threads;
|
15954
16265
|
|
15955
16266
|
// synchronization primitives
|
15956
16267
|
atomic_int n_active; // num active threads
|
15957
16268
|
atomic_int node_n; // active graph node
|
16269
|
+
|
16270
|
+
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
|
16271
|
+
void * abort_callback_data;
|
15958
16272
|
};
|
15959
16273
|
|
15960
16274
|
struct ggml_compute_state {
|
@@ -15974,14 +16288,22 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
|
|
15974
16288
|
|
15975
16289
|
static thread_ret_t ggml_graph_compute_thread(void * data) {
|
15976
16290
|
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
|
15977
|
-
struct ggml_cgraph * cgraph = state->shared->cgraph;
|
15978
16291
|
|
15979
|
-
const
|
16292
|
+
const struct ggml_cgraph * cgraph = state->shared->cgraph;
|
16293
|
+
const struct ggml_cplan * cplan = state->shared->cplan;
|
16294
|
+
|
16295
|
+
const int * n_tasks_arr = cplan->n_tasks;
|
16296
|
+
const int n_threads = state->shared->n_threads;
|
16297
|
+
|
15980
16298
|
set_numa_thread_affinity(state->ith, n_threads);
|
15981
16299
|
|
15982
16300
|
int node_n = -1;
|
15983
16301
|
|
15984
16302
|
while (true) {
|
16303
|
+
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
|
16304
|
+
state->shared->node_n += 1;
|
16305
|
+
return (thread_ret_t) GGML_EXIT_ABORTED;
|
16306
|
+
}
|
15985
16307
|
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
15986
16308
|
// all other threads are finished and spinning
|
15987
16309
|
// do finalize and init here so we don't have synchronize again
|
@@ -15989,18 +16311,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
15989
16311
|
/*.type =*/ GGML_TASK_FINALIZE,
|
15990
16312
|
/*.ith =*/ 0,
|
15991
16313
|
/*.nth =*/ 0,
|
15992
|
-
/*.wsize =*/
|
15993
|
-
/*.wdata =*/
|
16314
|
+
/*.wsize =*/ cplan->work_size,
|
16315
|
+
/*.wdata =*/ cplan->work_data,
|
15994
16316
|
};
|
15995
16317
|
|
15996
16318
|
if (node_n != -1) {
|
15997
16319
|
/* FINALIZE */
|
15998
16320
|
struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
|
15999
16321
|
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
16000
|
-
params.nth =
|
16322
|
+
params.nth = n_tasks_arr[node_n];
|
16001
16323
|
ggml_compute_forward(&params, node);
|
16002
|
-
ggml_graph_compute_perf_stats_node(node, state->shared);
|
16003
16324
|
}
|
16325
|
+
ggml_graph_compute_perf_stats_node(node, state->shared);
|
16004
16326
|
}
|
16005
16327
|
|
16006
16328
|
// distribute new work or execute it direct if 1T
|
@@ -16008,11 +16330,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
16008
16330
|
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
|
16009
16331
|
|
16010
16332
|
struct ggml_tensor * node = cgraph->nodes[node_n];
|
16333
|
+
const int n_tasks = n_tasks_arr[node_n];
|
16011
16334
|
|
16012
16335
|
state->shared->perf_node_start_cycles = ggml_perf_cycles();
|
16013
16336
|
state->shared->perf_node_start_time_us = ggml_perf_time_us();
|
16014
16337
|
|
16015
|
-
params.nth =
|
16338
|
+
params.nth = n_tasks;
|
16016
16339
|
|
16017
16340
|
/* INIT */
|
16018
16341
|
if (GGML_OP_HAS_INIT[node->op]) {
|
@@ -16020,7 +16343,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
16020
16343
|
ggml_compute_forward(&params, node);
|
16021
16344
|
}
|
16022
16345
|
|
16023
|
-
if (
|
16346
|
+
if (n_tasks == 1) {
|
16024
16347
|
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
|
16025
16348
|
// they do something more efficient than spinning (?)
|
16026
16349
|
params.type = GGML_TASK_COMPUTE;
|
@@ -16029,11 +16352,16 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
16029
16352
|
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
16030
16353
|
params.type = GGML_TASK_FINALIZE;
|
16031
16354
|
ggml_compute_forward(&params, node);
|
16032
|
-
ggml_graph_compute_perf_stats_node(node, state->shared);
|
16033
16355
|
}
|
16356
|
+
|
16357
|
+
ggml_graph_compute_perf_stats_node(node, state->shared);
|
16034
16358
|
} else {
|
16035
16359
|
break;
|
16036
16360
|
}
|
16361
|
+
|
16362
|
+
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
|
16363
|
+
break;
|
16364
|
+
}
|
16037
16365
|
}
|
16038
16366
|
|
16039
16367
|
atomic_store(&state->shared->n_active, n_threads);
|
@@ -16042,7 +16370,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
16042
16370
|
// wait for other threads to finish
|
16043
16371
|
const int last = node_n;
|
16044
16372
|
do {
|
16045
|
-
sched_yield();
|
16373
|
+
//sched_yield();
|
16046
16374
|
node_n = atomic_load(&state->shared->node_n);
|
16047
16375
|
} while (node_n == last);
|
16048
16376
|
}
|
@@ -16052,366 +16380,398 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
16052
16380
|
|
16053
16381
|
/* COMPUTE */
|
16054
16382
|
struct ggml_tensor * node = cgraph->nodes[node_n];
|
16383
|
+
const int n_tasks = n_tasks_arr[node_n];
|
16055
16384
|
|
16056
16385
|
struct ggml_compute_params params = {
|
16057
16386
|
/*.type =*/ GGML_TASK_COMPUTE,
|
16058
16387
|
/*.ith =*/ state->ith,
|
16059
|
-
/*.nth =*/
|
16060
|
-
/*.wsize =*/
|
16061
|
-
/*.wdata =*/
|
16388
|
+
/*.nth =*/ n_tasks,
|
16389
|
+
/*.wsize =*/ cplan->work_size,
|
16390
|
+
/*.wdata =*/ cplan->work_data,
|
16062
16391
|
};
|
16063
16392
|
|
16064
|
-
if (state->ith <
|
16393
|
+
if (state->ith < n_tasks) {
|
16065
16394
|
ggml_compute_forward(&params, node);
|
16066
16395
|
}
|
16067
16396
|
}
|
16068
16397
|
|
16069
|
-
return
|
16398
|
+
return GGML_EXIT_SUCCESS;
|
16070
16399
|
}
|
16071
16400
|
|
16072
|
-
|
16073
|
-
|
16401
|
+
struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
16402
|
+
if (n_threads <= 0) {
|
16403
|
+
n_threads = GGML_DEFAULT_N_THREADS;
|
16404
|
+
}
|
16074
16405
|
|
16075
|
-
|
16076
|
-
/*.cgraph =*/ cgraph,
|
16077
|
-
/*.perf_node_start_cycles =*/ 0,
|
16078
|
-
/*.perf_node_start_time_us =*/ 0,
|
16079
|
-
/*.n_threads =*/ n_threads,
|
16080
|
-
/*.n_active =*/ n_threads,
|
16081
|
-
/*.node_n =*/ -1,
|
16082
|
-
};
|
16083
|
-
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
|
16406
|
+
size_t work_size = 0;
|
16084
16407
|
|
16085
|
-
|
16086
|
-
|
16087
|
-
size_t work_size = 0;
|
16408
|
+
struct ggml_cplan cplan;
|
16409
|
+
memset(&cplan, 0, sizeof(struct ggml_cplan));
|
16088
16410
|
|
16089
|
-
|
16090
|
-
|
16091
|
-
|
16411
|
+
// thread scheduling for the different operations + work buffer size estimation
|
16412
|
+
for (int i = 0; i < cgraph->n_nodes; i++) {
|
16413
|
+
int n_tasks = 1;
|
16092
16414
|
|
16093
|
-
|
16094
|
-
case GGML_OP_CPY:
|
16095
|
-
case GGML_OP_DUP:
|
16096
|
-
{
|
16097
|
-
node->n_tasks = n_threads;
|
16415
|
+
struct ggml_tensor * node = cgraph->nodes[i];
|
16098
16416
|
|
16099
|
-
|
16100
|
-
|
16101
|
-
|
16102
|
-
|
16417
|
+
switch (node->op) {
|
16418
|
+
case GGML_OP_CPY:
|
16419
|
+
case GGML_OP_DUP:
|
16420
|
+
{
|
16421
|
+
n_tasks = n_threads;
|
16422
|
+
|
16423
|
+
size_t cur = 0;
|
16424
|
+
if (ggml_is_quantized(node->type)) {
|
16425
|
+
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks;
|
16426
|
+
}
|
16103
16427
|
|
16104
|
-
|
16105
|
-
|
16106
|
-
|
16107
|
-
|
16108
|
-
|
16109
|
-
|
16428
|
+
work_size = MAX(work_size, cur);
|
16429
|
+
} break;
|
16430
|
+
case GGML_OP_ADD:
|
16431
|
+
case GGML_OP_ADD1:
|
16432
|
+
{
|
16433
|
+
n_tasks = n_threads;
|
16110
16434
|
|
16111
|
-
|
16435
|
+
size_t cur = 0;
|
16112
16436
|
|
16113
|
-
|
16114
|
-
|
16115
|
-
|
16437
|
+
if (ggml_is_quantized(node->src[0]->type)) {
|
16438
|
+
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks;
|
16439
|
+
}
|
16116
16440
|
|
16117
|
-
|
16118
|
-
|
16119
|
-
|
16120
|
-
|
16121
|
-
|
16441
|
+
work_size = MAX(work_size, cur);
|
16442
|
+
} break;
|
16443
|
+
case GGML_OP_ACC:
|
16444
|
+
{
|
16445
|
+
n_tasks = n_threads;
|
16122
16446
|
|
16123
|
-
|
16447
|
+
size_t cur = 0;
|
16124
16448
|
|
16125
|
-
|
16126
|
-
|
16127
|
-
|
16449
|
+
if (ggml_is_quantized(node->src[0]->type)) {
|
16450
|
+
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks;
|
16451
|
+
}
|
16452
|
+
|
16453
|
+
work_size = MAX(work_size, cur);
|
16454
|
+
} break;
|
16455
|
+
case GGML_OP_SUB:
|
16456
|
+
case GGML_OP_DIV:
|
16457
|
+
case GGML_OP_SQR:
|
16458
|
+
case GGML_OP_SQRT:
|
16459
|
+
case GGML_OP_LOG:
|
16460
|
+
case GGML_OP_SUM:
|
16461
|
+
case GGML_OP_SUM_ROWS:
|
16462
|
+
case GGML_OP_MEAN:
|
16463
|
+
case GGML_OP_ARGMAX:
|
16464
|
+
case GGML_OP_REPEAT:
|
16465
|
+
case GGML_OP_REPEAT_BACK:
|
16466
|
+
case GGML_OP_ABS:
|
16467
|
+
case GGML_OP_SGN:
|
16468
|
+
case GGML_OP_NEG:
|
16469
|
+
case GGML_OP_STEP:
|
16470
|
+
case GGML_OP_TANH:
|
16471
|
+
case GGML_OP_ELU:
|
16472
|
+
case GGML_OP_RELU:
|
16473
|
+
{
|
16474
|
+
n_tasks = 1;
|
16475
|
+
} break;
|
16476
|
+
case GGML_OP_MUL:
|
16477
|
+
case GGML_OP_GELU:
|
16478
|
+
case GGML_OP_GELU_QUICK:
|
16479
|
+
case GGML_OP_SILU:
|
16480
|
+
case GGML_OP_SILU_BACK:
|
16481
|
+
case GGML_OP_NORM:
|
16482
|
+
case GGML_OP_RMS_NORM:
|
16483
|
+
case GGML_OP_RMS_NORM_BACK:
|
16484
|
+
{
|
16485
|
+
n_tasks = n_threads;
|
16486
|
+
} break;
|
16487
|
+
case GGML_OP_MUL_MAT:
|
16488
|
+
case GGML_OP_OUT_PROD:
|
16489
|
+
{
|
16490
|
+
n_tasks = n_threads;
|
16491
|
+
|
16492
|
+
// TODO: use different scheduling for different matrix sizes
|
16493
|
+
//const int nr0 = ggml_nrows(node->src[0]);
|
16494
|
+
//const int nr1 = ggml_nrows(node->src[1]);
|
16495
|
+
|
16496
|
+
//n_tasks = MIN(n_threads, MAX(1, nr0/128));
|
16497
|
+
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
|
16128
16498
|
|
16129
|
-
|
16130
|
-
|
16131
|
-
case GGML_OP_SUB:
|
16132
|
-
case GGML_OP_DIV:
|
16133
|
-
case GGML_OP_SQR:
|
16134
|
-
case GGML_OP_SQRT:
|
16135
|
-
case GGML_OP_LOG:
|
16136
|
-
case GGML_OP_SUM:
|
16137
|
-
case GGML_OP_SUM_ROWS:
|
16138
|
-
case GGML_OP_MEAN:
|
16139
|
-
case GGML_OP_ARGMAX:
|
16140
|
-
case GGML_OP_REPEAT:
|
16141
|
-
case GGML_OP_REPEAT_BACK:
|
16142
|
-
case GGML_OP_ABS:
|
16143
|
-
case GGML_OP_SGN:
|
16144
|
-
case GGML_OP_NEG:
|
16145
|
-
case GGML_OP_STEP:
|
16146
|
-
case GGML_OP_TANH:
|
16147
|
-
case GGML_OP_ELU:
|
16148
|
-
case GGML_OP_RELU:
|
16149
|
-
{
|
16150
|
-
node->n_tasks = 1;
|
16151
|
-
} break;
|
16152
|
-
case GGML_OP_MUL:
|
16153
|
-
case GGML_OP_GELU:
|
16154
|
-
case GGML_OP_GELU_QUICK:
|
16155
|
-
case GGML_OP_SILU:
|
16156
|
-
case GGML_OP_SILU_BACK:
|
16157
|
-
case GGML_OP_NORM:
|
16158
|
-
case GGML_OP_RMS_NORM:
|
16159
|
-
case GGML_OP_RMS_NORM_BACK:
|
16160
|
-
{
|
16161
|
-
node->n_tasks = n_threads;
|
16162
|
-
} break;
|
16163
|
-
case GGML_OP_MUL_MAT:
|
16164
|
-
case GGML_OP_OUT_PROD:
|
16165
|
-
{
|
16166
|
-
node->n_tasks = n_threads;
|
16167
|
-
|
16168
|
-
// TODO: use different scheduling for different matrix sizes
|
16169
|
-
//const int nr0 = ggml_nrows(node->src0);
|
16170
|
-
//const int nr1 = ggml_nrows(node->src1);
|
16171
|
-
|
16172
|
-
//node->n_tasks = MIN(n_threads, MAX(1, nr0/128));
|
16173
|
-
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks);
|
16174
|
-
|
16175
|
-
size_t cur = 0;
|
16176
|
-
const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type;
|
16499
|
+
size_t cur = 0;
|
16500
|
+
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
|
16177
16501
|
|
16178
16502
|
#if defined(GGML_USE_CUBLAS)
|
16179
|
-
|
16180
|
-
|
16181
|
-
|
16182
|
-
|
16183
|
-
else
|
16503
|
+
if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
|
16504
|
+
n_tasks = 1; // TODO: this actually is doing nothing
|
16505
|
+
// the threads are still spinning
|
16506
|
+
} else
|
16184
16507
|
#elif defined(GGML_USE_CLBLAST)
|
16185
|
-
|
16186
|
-
|
16187
|
-
|
16188
|
-
|
16189
|
-
|
16190
|
-
else
|
16508
|
+
if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
|
16509
|
+
n_tasks = 1; // TODO: this actually is doing nothing
|
16510
|
+
// the threads are still spinning
|
16511
|
+
cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
|
16512
|
+
} else
|
16191
16513
|
#endif
|
16192
16514
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
16193
|
-
|
16194
|
-
|
16195
|
-
|
16196
|
-
|
16197
|
-
|
16198
|
-
|
16199
|
-
}
|
16200
|
-
} else
|
16201
|
-
#endif
|
16202
|
-
if (node->src1->type != vec_dot_type) {
|
16203
|
-
cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type];
|
16204
|
-
} else {
|
16205
|
-
cur = 0;
|
16515
|
+
if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
|
16516
|
+
n_tasks = 1; // TODO: this actually is doing nothing
|
16517
|
+
// the threads are still spinning
|
16518
|
+
if (node->src[0]->type != GGML_TYPE_F32) {
|
16519
|
+
// here we need memory just for single 2D matrix from src0
|
16520
|
+
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]);
|
16206
16521
|
}
|
16522
|
+
} else
|
16523
|
+
#endif
|
16524
|
+
if (node->src[1]->type != vec_dot_type) {
|
16525
|
+
cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type];
|
16526
|
+
} else {
|
16527
|
+
cur = 0;
|
16528
|
+
}
|
16207
16529
|
|
16208
|
-
|
16209
|
-
|
16210
|
-
|
16211
|
-
|
16212
|
-
|
16213
|
-
|
16214
|
-
|
16215
|
-
|
16216
|
-
|
16217
|
-
|
16218
|
-
|
16219
|
-
|
16220
|
-
|
16221
|
-
|
16222
|
-
|
16223
|
-
|
16224
|
-
|
16225
|
-
|
16226
|
-
|
16227
|
-
|
16228
|
-
|
16229
|
-
|
16230
|
-
|
16231
|
-
|
16232
|
-
|
16233
|
-  ... (old lines 16234-16254 not shown)
-                    node-> ...
-  ... (old lines 16256-16268 not shown)
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_SCALE:
+                {
+                    n_tasks = 1;
+                } break;
+            case GGML_OP_SET:
+            case GGML_OP_CONT:
+            case GGML_OP_RESHAPE:
+            case GGML_OP_VIEW:
+            case GGML_OP_PERMUTE:
+            case GGML_OP_TRANSPOSE:
+            case GGML_OP_GET_ROWS:
+            case GGML_OP_GET_ROWS_BACK:
+            case GGML_OP_DIAG:
+            case GGML_OP_DIAG_MASK_ZERO:
+                {
+                    n_tasks = 1;
+                } break;
+            case GGML_OP_DIAG_MASK_INF:
+            case GGML_OP_SOFT_MAX:
+            case GGML_OP_SOFT_MAX_BACK:
+            case GGML_OP_ROPE:
+            case GGML_OP_ROPE_BACK:
+                {
+                    n_tasks = n_threads;
+                } break;
+            case GGML_OP_ALIBI:
+                {
+                    n_tasks = 1; //TODO
+                } break;
+            case GGML_OP_CLAMP:
+                {
+                    n_tasks = 1; //TODO
+                } break;
+            case GGML_OP_CONV_1D:
+                {
+                    n_tasks = n_threads;
+
+                    GGML_ASSERT(node->src[0]->ne[3] == 1);
+                    GGML_ASSERT(node->src[1]->ne[2] == 1);
+                    GGML_ASSERT(node->src[1]->ne[3] == 1);
+
+                    size_t cur = 0;
+                    const int nk = node->src[0]->ne[0];
+
+                    if (node->src[0]->type == GGML_TYPE_F16 &&
+                        node->src[1]->type == GGML_TYPE_F32) {
+                        cur = sizeof(ggml_fp16_t)*(
+                            nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] +
+                            ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1]
+                        );
+                    } else if (node->src[0]->type == GGML_TYPE_F32 &&
+                               node->src[1]->type == GGML_TYPE_F32) {
+                        cur = sizeof(float)*(
+                            nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] +
+                            ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1]
+                        );
+                    } else {
+                        GGML_ASSERT(false);
+                    }

-  ... (old lines 16270-16274 not shown)
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_CONV_2D:
+                {
+                    n_tasks = n_threads;
+
+                    const int64_t ne00 = node->src[0]->ne[0]; // W
+                    const int64_t ne01 = node->src[0]->ne[1]; // H
+                    const int64_t ne02 = node->src[0]->ne[2]; // C
+                    const int64_t ne03 = node->src[0]->ne[3]; // N
+
+                    const int64_t ne10 = node->src[1]->ne[0]; // W
+                    const int64_t ne11 = node->src[1]->ne[1]; // H
+                    const int64_t ne12 = node->src[1]->ne[2]; // C
+
+                    const int64_t ne0 = node->ne[0];
+                    const int64_t ne1 = node->ne[1];
+                    const int64_t ne2 = node->ne[2];
+                    const int64_t nk = ne00*ne01;
+                    const int64_t ew0 = nk * ne02;
+
+                    UNUSED(ne03);
+                    UNUSED(ne2);
+
+                    size_t cur = 0;
+
+                    if (node->src[0]->type == GGML_TYPE_F16 &&
+                        node->src[1]->type == GGML_TYPE_F32) {
+                        cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
+                    } else if (node->src[0]->type == GGML_TYPE_F32 &&
+                               node->src[1]->type == GGML_TYPE_F32) {
+                        cur = sizeof(float)* (ne10*ne11*ne12);
+                    } else {
+                        GGML_ASSERT(false);
+                    }

-  ...
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_POOL_1D:
+            case GGML_OP_POOL_2D:
+                {
+                    n_tasks = 1;
+                } break;
+            case GGML_OP_FLASH_ATTN:
+                {
+                    n_tasks = n_threads;

-  ...
-                    const int64_t ne01 = node->src0->ne[1]; // H
-                    const int64_t ne02 = node->src0->ne[2]; // C
-                    const int64_t ne03 = node->src0->ne[3]; // N
+                    size_t cur = 0;

-  ...
-                    const int64_t ne11 = node->src1->ne[1]; // H
-                    const int64_t ne12 = node->src1->ne[2]; // C
+                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);

-  ...
+                    if (node->src[1]->type == GGML_TYPE_F32) {
+                        cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
+                    }

-  ... (old lines 16289-16291 not shown)
+                    if (node->src[1]->type == GGML_TYPE_F16) {
+                        cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
+                    }

-  ...
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_FLASH_FF:
+                {
+                    n_tasks = n_threads;

-  ...
-                        node->src1->type == GGML_TYPE_F32) {
-                        cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
-                    } else if (node->src0->type == GGML_TYPE_F32 &&
-                               node->src1->type == GGML_TYPE_F32) {
-                        cur = sizeof(float)* (ne10*ne11*ne12);
-                    } else {
-                        GGML_ASSERT(false);
-                    }
+                    size_t cur = 0;

-  ... (old lines 16305-16308 not shown)
-                    node->n_tasks = n_threads;
+                    if (node->src[1]->type == GGML_TYPE_F32) {
+                        cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
+                    }

-  ...
+                    if (node->src[1]->type == GGML_TYPE_F16) {
+                        cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
+                    }

-  ...
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_FLASH_ATTN_BACK:
+                {
+                    n_tasks = n_threads;

-  ...
-                        cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
-                    }
+                    size_t cur = 0;

-  ... (old lines 16320-16323 not shown)
+                    const int64_t D = node->src[0]->ne[0];
+                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
+                    const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
+                    if (node->src[1]->type == GGML_TYPE_F32) {
+                        cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
+                    }

-  ... (old lines 16325-16328 not shown)
-                    node->n_tasks = n_threads;
+                    if (node->src[1]->type == GGML_TYPE_F16) {
+                        cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
+                    }

-  ...
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_WIN_PART:
+            case GGML_OP_WIN_UNPART:
+            case GGML_OP_MAP_UNARY:
+            case GGML_OP_MAP_BINARY:
+            case GGML_OP_MAP_CUSTOM1:
+            case GGML_OP_MAP_CUSTOM2:
+            case GGML_OP_MAP_CUSTOM3:
+                {
+                    n_tasks = 1;
+                } break;
+            case GGML_OP_CROSS_ENTROPY_LOSS:
+                {
+                    n_tasks = n_threads;

-  ...
-                    cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
-                    cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
-                    }
+                    size_t cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);

-  ... (old lines 16338-16341 not shown)
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
+                {
+                    n_tasks = n_threads;

-  ...
-                } break;
-            case GGML_OP_FLASH_ATTN_BACK:
-                {
-                    node->n_tasks = n_threads;
+                    size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks;

-  ...
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_NONE:
+                {
+                    n_tasks = 1;
+                } break;
+            case GGML_OP_COUNT:
+                {
+                    GGML_ASSERT(false);
+                } break;
+        }

-  ... (old lines 16351-16352 not shown)
-                    const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
-                    if (node->src1->type == GGML_TYPE_F32) {
-                        cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2
-                    }
+        cplan.n_tasks[i] = n_tasks;
+    }

-  ... (old lines 16359-16361 not shown)
-                    }
+    if (work_size > 0) {
+        work_size += CACHE_LINE_SIZE*(n_threads - 1);
+    }

-  ... (old lines 16364-16366 not shown)
-            case GGML_OP_WIN_UNPART:
-            case GGML_OP_MAP_UNARY:
-            case GGML_OP_MAP_BINARY:
-            case GGML_OP_MAP_CUSTOM1:
-            case GGML_OP_MAP_CUSTOM2:
-            case GGML_OP_MAP_CUSTOM3:
-                {
-                    node->n_tasks = 1;
-                } break;
-            case GGML_OP_CROSS_ENTROPY_LOSS:
-                {
-                    node->n_tasks = n_threads;
-
-                    size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks);
-
-                    work_size = MAX(work_size, cur);
-                } break;
-            case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
-                {
-                    node->n_tasks = n_threads;
-
-                    size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks;
-
-                    work_size = MAX(work_size, cur);
-                } break;
-            case GGML_OP_NONE:
-                {
-                    node->n_tasks = 1;
-                } break;
-            case GGML_OP_COUNT:
-                {
-                    GGML_ASSERT(false);
-                } break;
-        }
-    }
+    cplan.n_threads = n_threads;
+    cplan.work_size = work_size;
+    cplan.work_data = NULL;

-  ... (old lines 16403-16404 not shown)
-}
+    return cplan;
+}

-  ... (old lines 16407-16408 not shown)
+int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
+    {
+        GGML_ASSERT(cplan);
+        GGML_ASSERT(cplan->n_threads > 0);
+
+        if (cplan->work_size > 0) {
+            GGML_ASSERT(cplan->work_data);
+        }

-  ...
-        cgraph-> ...
+        for (int i = 0; i < cgraph->n_nodes; ++i) {
+            if (cgraph->nodes[i]->op != GGML_OP_NONE) {
+                GGML_ASSERT(cplan->n_tasks[i] > 0);
+            }
         }
     }

+    const int n_threads = cplan->n_threads;
+
+    struct ggml_compute_state_shared state_shared = {
+        /*.cgraph                  =*/ cgraph,
+        /*.cgraph_plan             =*/ cplan,
+        /*.perf_node_start_cycles  =*/ 0,
+        /*.perf_node_start_time_us =*/ 0,
+        /*.n_threads               =*/ n_threads,
+        /*.n_active                =*/ n_threads,
+        /*.node_n                  =*/ -1,
+        /*.abort_callback          =*/ NULL,
+        /*.abort_callback_data     =*/ NULL,
+    };
+    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
+
     // create thread pool
     if (n_threads > 1) {
         for (int j = 1; j < n_threads; ++j) {

@@ -16432,12 +16792,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
     const int64_t perf_start_time_us = ggml_perf_time_us();

     // this is a work thread too
-    ggml_graph_compute_thread(&workers[0]);
+    int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]);

     // don't leave affinity set on the main thread
     clear_numa_thread_affinity();

-    // join thread pool
+    // join or kill thread pool
     if (n_threads > 1) {
         for (int j = 1; j < n_threads; j++) {
             const int rc = ggml_thread_join(workers[j].thrd, NULL);

@@ -16461,6 +16821,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             (double) perf_time_us_cur / 1000.0,
             (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs);
     }
+
+    return compute_status;
 }

 void ggml_graph_reset(struct ggml_cgraph * cgraph) {

@@ -16473,6 +16835,17 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
     }
 }

+void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
+
+    struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
+    GGML_ASSERT(buf);
+
+    cplan.work_data = buf->data;
+
+    ggml_graph_compute(cgraph, &cplan);
+}
+
 struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
     for (int i = 0; i < cgraph->n_leafs; i++) {
         struct ggml_tensor * leaf = cgraph->leafs[i];

@@ -16511,22 +16884,18 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
     const int64_t * ne = tensor->ne;
     const size_t * nb = tensor->nb;

-    fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu % ...
+    fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
             arg,
             ggml_type_name(tensor->type),
             ggml_op_name (tensor->op),
             tensor->n_dims,
             ne[0], ne[1], ne[2], ne[3],
             nb[0], nb[1], nb[2], nb[3],
-            tensor->n_tasks,
             tensor->data,
             tensor->name);
 }

 void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
-    //assert(cgraph->work == NULL);
-    //assert(cgraph->work_size == 0);
-
     uint64_t size_eval = 0;

     // compute size of intermediate results

@@ -16555,8 +16924,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
         ggml_graph_export_leaf(cgraph->leafs[i], fout);

         GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE);
-        GGML_ASSERT(cgraph->leafs[i]-> ...
-        GGML_ASSERT(cgraph->leafs[i]-> ...
+        GGML_ASSERT(cgraph->leafs[i]->src[0] == NULL);
+        GGML_ASSERT(cgraph->leafs[i]->src[1] == NULL);
     }

     // header

@@ -16567,17 +16936,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
     for (int i = 0; i < cgraph->n_nodes; ++i) {
         ggml_graph_export_node(cgraph->nodes[i], "DST", fout);

-  ... (old lines 16570-16573 not shown)
-        if (cgraph->nodes[i]->src1) {
-            ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout);
-        }
-
-        for (int j = 0; j < GGML_MAX_OPT; ++j) {
-            if (cgraph->nodes[i]->opt[j]) {
-                ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout);
+        for (int j = 0; j < GGML_MAX_SRC; ++j) {
+            if (cgraph->nodes[i]->src[j]) {
+                ggml_graph_export_node(cgraph->nodes[i]->src[j], "SRC", fout);
             }
         }

@@ -16668,16 +17029,13 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {

     // output the op arguments
     {
-        struct ggml_tensor * args[ ...
-
-        args[0] = tensor->src0;
-        args[1] = tensor->src1;
+        struct ggml_tensor * args[GGML_MAX_SRC] = { NULL };

-        for (int j = 0; j < ...
-            args[ ...
+        for (int j = 0; j < GGML_MAX_SRC; ++j) {
+            args[j] = tensor->src[j];
         }

-        for (int j = 0; j < ...
+        for (int j = 0; j < GGML_MAX_SRC; ++j) {
             if (args[j]) {
                 int32_t idx = -1;

@@ -16895,12 +17253,12 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **

         const char * ptr_name = ptr; ptr += GGML_MAX_NAME;

-        const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += ...
+        const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t);

-        struct ggml_tensor * args[ ...
+        struct ggml_tensor * args[GGML_MAX_SRC] = { NULL };

         // parse args
-        for (int j = 0; j < ...
+        for (int j = 0; j < GGML_MAX_SRC; ++j) {
             const int32_t arg_idx = ptr_arg_idx[j];

             if (arg_idx == -1) {

@@ -16957,11 +17315,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
             tensor->nb[j] = nb[j];
         }

-  ... (old lines 16960-16962 not shown)
-        for (int j = 0; j < GGML_MAX_OPT; ++j) {
-            tensor->opt[j] = args[2 + j];
+        for (int j = 0; j < GGML_MAX_SRC; ++j) {
+            tensor->src[j] = args[j];
         }

         result.nodes[i] = tensor;

@@ -16979,9 +17334,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {

     GGML_PRINT("=== GRAPH ===\n");

-    GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
-    GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
-
     GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];

@@ -17160,19 +17512,11 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
     for (int i = 0; i < gb->n_nodes; i++) {
         struct ggml_tensor * node = gb->nodes[i];

-  ... (old lines 17163-17164 not shown)
-        }
-
-        if (node->src1) {
-            ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y");
-        }
-
-        for (int j = 0; j < GGML_MAX_OPT; j++) {
-            if (node->opt[j]) {
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j]) {
                 char label[16];
-                snprintf(label, sizeof(label), " ...
-                ggml_graph_dump_dot_node_edge(fp, gb, node, node-> ...
+                snprintf(label, sizeof(label), "src %d", j);
+                ggml_graph_dump_dot_node_edge(fp, gb, node, node->src[j], label);
             }
         }
     }

@@ -17180,19 +17524,11 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
     for (int i = 0; i < gb->n_leafs; i++) {
         struct ggml_tensor * node = gb->leafs[i];

-  ... (old lines 17183-17184 not shown)
-        }
-
-        if (node->src1) {
-            ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y");
-        }
-
-        for (int j = 0; j < GGML_MAX_OPT; j++) {
-            if (node->opt[j]) {
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j]) {
                 char label[16];
-                snprintf(label, sizeof(label), " ...
-                ggml_graph_dump_dot_leaf_edge(fp, node, node-> ...
+                snprintf(label, sizeof(label), "src %d", j);
+                ggml_graph_dump_dot_leaf_edge(fp, node, node->src[j], label);
             }
         }
     }

@@ -17254,9 +17590,6 @@ static enum ggml_opt_result ggml_opt_adam(
         struct ggml_cgraph * gb) {
     GGML_ASSERT(ggml_is_scalar(f));

-    gf->n_threads = params.n_threads;
-    gb->n_threads = params.n_threads;
-
     // these will store the parameters we want to optimize
     struct ggml_tensor * ps[GGML_MAX_PARAMS];

@@ -17303,7 +17636,8 @@ static enum ggml_opt_result ggml_opt_adam(
     // compute the function value
     ggml_graph_reset (gf);
     ggml_set_f32 (f->grad, 1.0f);
-  ...
+
+    ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);

     opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
     opt->adam.fx_best = opt->adam.fx_prev;

@@ -17383,7 +17717,8 @@ static enum ggml_opt_result ggml_opt_adam(

     ggml_graph_reset (gf);
     ggml_set_f32 (f->grad, 1.0f);
-  ...
+
+    ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);

     const float fx = ggml_get_f32_1d(f, 0);

@@ -17505,7 +17840,8 @@ static enum ggml_opt_result linesearch_backtracking(

     ggml_graph_reset (gf);
     ggml_set_f32 (f->grad, 1.0f);
-  ...
+
+    ggml_graph_compute_with_ctx(ctx, gb, params->n_threads);

     ggml_opt_get_grad(np, ps, g);

@@ -17573,9 +17909,6 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         }
     }

-    gf->n_threads = params.n_threads;
-    gb->n_threads = params.n_threads;
-
     const int m = params.lbfgs.m;

     // these will store the parameters we want to optimize

@@ -17627,7 +17960,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(

     ggml_graph_reset (gf);
     ggml_set_f32 (f->grad, 1.0f);
-  ...
+
+    ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);

     ggml_opt_get_grad(np, ps, g);
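The diff above splits graph execution into an explicit plan step and a compute step: ggml_graph_plan() fills a ggml_cplan with the per-node n_tasks[] values and the required work_size, the caller supplies work_data, and ggml_graph_compute() only runs the graph (it asserts that work_data is set whenever work_size > 0). A minimal sketch of the new calling sequence, assuming a graph gf built elsewhere; the helper name and thread count below are illustrative, not part of this diff:

#include <stdint.h>
#include <stdlib.h>
#include "ggml.h"

// Sketch only: same steps as ggml_graph_compute_with_ctx() from this diff,
// but with the scratch buffer owned by the caller instead of a ggml tensor.
static int compute_graph(struct ggml_cgraph * gf, int n_threads) {
    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads); // fills n_tasks[] and work_size

    uint8_t * work = NULL;
    if (cplan.work_size > 0) {
        // ggml_graph_compute() asserts work_data whenever work_size > 0
        work = malloc(cplan.work_size);
        cplan.work_data = work;
    }

    const int status = ggml_graph_compute(gf, &cplan);

    free(work);
    return status;
}

The ggml_graph_compute_with_ctx() helper added in this diff is the convenience path that does the same thing with a GGML_TYPE_I8 tensor allocated from a ggml_context, and it is what ggml_opt_adam(), ggml_opt_lbfgs() and linesearch_backtracking() now call instead of setting the removed cgraph->n_threads field.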
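The other recurring pattern in this diff is that every traversal that used to inspect a node's src0, src1 and opt[GGML_MAX_OPT] separately (graph export and import, ggml_graph_print, ggml_graph_dump_dot) now walks a single src[GGML_MAX_SRC] array. A small sketch of that traversal against the 0.3.4 headers; the printing helper is illustrative and not part of ggml:

#include <stdio.h>
#include "ggml.h"

// Sketch only: visit every input of a node the way the export/dump code in
// this diff does, using the unified src[] array instead of src0/src1/opt[].
static void print_node_inputs(const struct ggml_tensor * node) {
    for (int j = 0; j < GGML_MAX_SRC; ++j) {
        if (node->src[j] == NULL) {
            continue;
        }
        printf("src %d: %s (%s)\n", j, node->src[j]->name, ggml_op_name(node->src[j]->op));
    }
}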