llama_cpp 0.0.2 → 0.0.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -2
- data/README.md +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +39 -1
- data/ext/llama_cpp/src/ggml.c +587 -485
- data/ext/llama_cpp/src/ggml.h +36 -26
- data/ext/llama_cpp/src/llama.cpp +85 -46
- data/ext/llama_cpp/src/llama.h +17 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -1
- data/sig/llama_cpp.rbs +52 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -16,6 +16,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
+#include <inttypes.h>
 #include <stdio.h>
 #include <float.h>
 
@@ -1961,42 +1962,71 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
 
-
-
-
-#pragma GCC unroll 16
-#endif
-    for (int i = 0; i < nb; ++i) {
-        // Compute combined scale for the block
-        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
-
-        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
-        __m256i bx = bytesFromNibbles( x[i].qs );
-        __m256i by = bytesFromNibbles( y[i].qs );
-
-        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
-        const __m256i off = _mm256_set1_epi8( 8 );
-        bx = _mm256_sub_epi8( bx, off );
-        by = _mm256_sub_epi8( by, off );
-
-        // Get absolute values of x vectors
-        const __m256i ax = _mm256_sign_epi8(bx, bx);
-
-        // Sign the values of the y vectors
-        const __m256i sy = _mm256_sign_epi8(by, bx);
-
-        // Perform multiplication and create 16-bit values
-        const __m256i dot = _mm256_maddubs_epi16(ax, sy);
-
-        const __m256i ones = _mm256_set1_epi16(1);
-        const __m256i i32 = _mm256_madd_epi16(ones, dot);
+    /* Prepare the constants we will need during execution */
+    const __m256i lowMask = _mm256_set1_epi8( 0xF );
+    const __m256i offset_8 = _mm256_set1_epi16( 8 );
 
-
-
+#define UNROLL_COUNT 8
+    // make sure we only unroll multiples of the block count
+    assert(nb % UNROLL_COUNT == 0);
 
-
-
-
+    // Main loop
+    for (int i = 0; i < nb; i+=UNROLL_COUNT) {
+
+        // This loop will be unrolled by the compiler
+        for (int u=0;u<UNROLL_COUNT;u++) {
+            /* Compute combined scale for the block */
+            const __m256 scale = _mm256_mul_ps(
+                    _mm256_broadcast_ss( &x[i+u].d ),
+                    _mm256_broadcast_ss( &y[i+u].d ) );
+
+            /* get input from x
+               Input: 32 Nibbles (16 bytes) at *x[i+u]
+               Output: 2 vectors with 16 values of type int16_t (x_high_q, x_low_q) */
+
+            /* Load 16 bytes from memory */
+            const __m128i tmp_x = _mm_loadu_si128( ( const __m128i* ) x[i+u].qs);
+            /* Expand bytes into uint16_t values */
+            const __m256i bytes_x = _mm256_cvtepu8_epi16(tmp_x);
+            /* Unpack values into individual bytes */
+            __m256i x_low_q = _mm256_and_si256( lowMask, bytes_x );
+            const __m256i pre_shift_x_high_q = _mm256_andnot_si256( lowMask, bytes_x );
+            __m256i x_high_q = _mm256_srli_epi16( pre_shift_x_high_q, 4 );
+            /* Now we have two vectors with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */
+            x_high_q = _mm256_sub_epi16( x_high_q, offset_8 );
+            x_low_q = _mm256_sub_epi16( x_low_q, offset_8 );
+
+            /* get input from y
+               Input: 32 Nibbles (16 bytes) at *y[i+u]
+               Output: 2 vectors with 16 values of type int16_t (y_high_q, y_low_q) */
+
+            /* Load 16 bytes from memory */
+            const __m128i tmp_y = _mm_loadu_si128( (const __m128i* ) y[i+u].qs);
+            /* Expand bytes into uint16_t values */
+            const __m256i bytes_y = _mm256_cvtepu8_epi16(tmp_y);
+            /* Unpack values into individual bytes */
+            const __m256i pre_shift_y_high_q = _mm256_andnot_si256( lowMask, bytes_y );
+            __m256i y_high_q = _mm256_srli_epi16( pre_shift_y_high_q, 4 );
+            __m256i y_low_q = _mm256_and_si256( lowMask, bytes_y );
+            /* Now we have two vectors with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */
+            y_high_q = _mm256_sub_epi16( y_high_q, offset_8 );
+            y_low_q = _mm256_sub_epi16( y_low_q, offset_8 );
+
+            /* Compute products of int16_t integers, add pairwise, store as int32_t */
+            __m256i xy_high_q = _mm256_madd_epi16( x_high_q, y_high_q );
+            __m256i xy_low_q = _mm256_madd_epi16( x_low_q, y_low_q );
+
+            /* Accumulate the products of int32_t integers -> we now have a vector of 8 int_32t */
+            __m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q );
+
+            /* Convert to vectore of 8 int32_t to 8 floats */
+            __m256 q = _mm256_cvtepi32_ps( xy_q );
+
+            /* Multiply q with scale and accumulate */
+            acc = _mm256_fmadd_ps( scale, q, acc );
+        }
+
+    }
 
     // Return horizontal sum of the acc vector
     __m128 res = _mm256_extractf128_ps( acc, 1 );
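For orientation (not part of the gem diff): per q4_0 block, the unrolled AVX2 loop above computes the combined scale x.d*y.d times the integer dot product of the packed 4-bit values after offsetting them from [0..15] into [-8..+7]. A scalar sketch of the same computation, assuming the block layout visible in the code (one float scale plus 16 bytes of packed nibbles per block):

// Scalar sketch only; the names x, y, nb, s and the block layout are assumptions
// taken from the surrounding kernel, not a definitive reimplementation.
float sumf = 0.0f;
for (int i = 0; i < nb; ++i) {
    const float d = x[i].d * y[i].d;                 // combined per-block scale
    for (int j = 0; j < 16; ++j) {                   // 16 bytes -> 32 packed nibbles
        const int x_lo = (x[i].qs[j] & 0x0F) - 8;    // offset [0..15] -> [-8..+7]
        const int x_hi = (x[i].qs[j] >> 4)   - 8;
        const int y_lo = (y[i].qs[j] & 0x0F) - 8;
        const int y_hi = (y[i].qs[j] >> 4)   - 8;
        sumf += d * (float)(x_lo*y_lo + x_hi*y_hi);
    }
}
*s = sumf;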
@@ -2025,7 +2055,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     bx = _mm_sub_epi8( bx, off );
     by = _mm_sub_epi8( by, off );
 
-
+    // Get absolute values of x vectors
     const __m128i ax = _mm_sign_epi8(bx, bx);
 
     // Sign the values of the y vectors
@@ -2774,7 +2804,7 @@ void ggml_print_objects(const struct ggml_context * ctx) {
     GGML_PRINT("%s: --- end ---\n", __func__);
 }
 
-
+int64_t ggml_nelements(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
@@ -3090,7 +3120,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         struct ggml_context * ctx,
         enum ggml_type type,
         int n_dims,
-        const
+        const int64_t* ne,
         void* data) {
     // always insert objects at the end of the context's memory pool
     struct ggml_object * obj_cur = ctx->objects_end;
@@ -3189,7 +3219,8 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.pad =*/ { 0 },
     };
 
-
+    // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
+    //ggml_assert_aligned(result->data);
 
     for (int i = 0; i < n_dims; i++) {
         result->ne[i] = ne[i];
@@ -3210,44 +3241,44 @@ struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
         int n_dims,
-        const
+        const int64_t * ne) {
     return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
 }
 
 struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
         enum ggml_type type,
-
+        int64_t ne0) {
     return ggml_new_tensor(ctx, type, 1, &ne0);
 }
 
 struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
         enum ggml_type type,
-
-
-        const
+        int64_t ne0,
+        int64_t ne1) {
+    const int64_t ne[2] = { ne0, ne1 };
     return ggml_new_tensor(ctx, type, 2, ne);
 }
 
 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum ggml_type type,
-
-
-
-        const
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2) {
+    const int64_t ne[3] = { ne0, ne1, ne2 };
     return ggml_new_tensor(ctx, type, 3, ne);
 }
 
 struct ggml_tensor * ggml_new_tensor_4d(
         struct ggml_context * ctx,
         enum ggml_type type,
-
-
-
-
-        const
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3) {
+    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
     return ggml_new_tensor(ctx, type, 4, ne);
 }
 
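Not part of the diff: a minimal usage sketch of the changed constructors, showing that tensor dimensions are now passed as int64_t. The context size and shapes below are illustrative assumptions.

#include "ggml.h"

// Illustrative only; assumes the ggml context API as used elsewhere in this source tree.
static void example_new_tensor(void) {
    struct ggml_init_params params = { .mem_size = 16*1024*1024, .mem_buffer = NULL };
    struct ggml_context * ctx = ggml_init(params);

    const int64_t ne[2] = { 1024, 1024 };
    struct ggml_tensor * a = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);         // generic form
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1024, 1024); // convenience form
    (void) a; (void) b;

    ggml_free(ctx);
}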
@@ -3590,7 +3621,14 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
         const struct ggml_tensor * src) {
-
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+
+    result->nb[0] = src->nb[0];
+    result->nb[1] = src->nb[1];
+    result->nb[2] = src->nb[2];
+    result->nb[3] = src->nb[3];
+
+    return result;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -3894,7 +3932,7 @@ struct ggml_tensor * ggml_mean(
         is_node = true;
     }
 
-
+    int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne);
 
     result->op = GGML_OP_MEAN;
@@ -4255,7 +4293,7 @@ struct ggml_tensor * ggml_mul_mat(
         is_node = true;
     }
 
-    const
+    const int64_t ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne);
 
     result->op = GGML_OP_MUL_MAT;
@@ -4380,8 +4418,8 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-
-
+        int64_t ne0,
+        int64_t ne1) {
     GGML_ASSERT(ggml_is_contiguous(a));
     GGML_ASSERT(ggml_nelements(a) == ne0*ne1);
 
@@ -4392,7 +4430,7 @@ struct ggml_tensor * ggml_reshape_2d(
         is_node = true;
     }
 
-    const
+    const int64_t ne[2] = { ne0, ne1 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
 
     result->op = GGML_OP_RESHAPE;
@@ -4406,9 +4444,9 @@ struct ggml_tensor * ggml_reshape_2d(
 struct ggml_tensor * ggml_reshape_3d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-
-
-
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2) {
     GGML_ASSERT(ggml_is_contiguous(a));
     GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);
 
@@ -4419,7 +4457,7 @@ struct ggml_tensor * ggml_reshape_3d(
         is_node = true;
     }
 
-    const
+    const int64_t ne[3] = { ne0, ne1, ne2 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
 
     result->op = GGML_OP_RESHAPE;
@@ -4435,7 +4473,7 @@ struct ggml_tensor * ggml_reshape_3d(
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-
+        int64_t ne0,
         size_t offset) {
     if (a->grad) {
         GGML_ASSERT(false); // gradient propagation is not supported
@@ -4456,15 +4494,15 @@ struct ggml_tensor * ggml_view_1d(
 struct ggml_tensor * ggml_view_2d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-
-
+        int64_t ne0,
+        int64_t ne1,
         size_t nb1,
         size_t offset) {
     if (a->grad) {
         GGML_ASSERT(false); // gradient propagation is not supported
     }
 
-    const
+    const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
 
@@ -4480,6 +4518,37 @@ struct ggml_tensor * ggml_view_2d(
     return result;
 }
 
+// ggml_view_3d
+
+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        size_t nb1,
+        size_t nb2,
+        size_t offset) {
+    if (a->grad) {
+        GGML_ASSERT(false); // gradient propagation is not supported
+    }
+
+    const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
+
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
+
+    result->nb[1] = nb1;
+    result->nb[2] = nb2;
+    result->nb[3] = result->nb[2]*ne2;
+
+    result->op = GGML_OP_VIEW;
+    result->grad = NULL;
+    result->src0 = a;
+    result->src1 = NULL; // TODO: maybe store the offset here?
+
+    return result;
+}
+
 // ggml_permute
 
 struct ggml_tensor * ggml_permute(
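Not part of the diff: the newly added ggml_view_3d takes a strided 3-D view over an existing tensor without copying data. In this sketch, `ctx`, `kv`, and the sizes are illustrative assumptions; only the parameter order follows the signature added above.

// Assumes kv is an existing tensor large enough for a 64 x 32 x 8 view.
struct ggml_tensor * v = ggml_view_3d(ctx, kv,
        /*ne0=*/ 64, /*ne1=*/ 32, /*ne2=*/ 8,
        /*nb1=*/ kv->nb[1], /*nb2=*/ kv->nb[2],
        /*offset=*/ 0);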
@@ -4695,7 +4764,7 @@ struct ggml_tensor * ggml_conv_1d_1s(
         is_node = true;
     }
 
-    const
+    const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
     result->op = GGML_OP_CONV_1D_1S;
@@ -4722,7 +4791,7 @@ struct ggml_tensor * ggml_conv_1d_2s(
         is_node = true;
     }
 
-    const
+    const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
     result->op = GGML_OP_CONV_1D_2S;
@@ -4815,102 +4884,112 @@ static void ggml_compute_forward_dup_f16(
|
|
4815
4884
|
const struct ggml_tensor * src0,
|
4816
4885
|
struct ggml_tensor * dst) {
|
4817
4886
|
GGML_ASSERT(params->ith == 0);
|
4818
|
-
GGML_ASSERT(ggml_is_contiguous(dst));
|
4819
4887
|
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
|
4820
4888
|
|
4821
4889
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
4822
4890
|
return;
|
4823
4891
|
}
|
4824
4892
|
|
4825
|
-
const
|
4826
|
-
const
|
4827
|
-
const
|
4828
|
-
const
|
4893
|
+
const int64_t ne00 = src0->ne[0];
|
4894
|
+
const int64_t ne01 = src0->ne[1];
|
4895
|
+
const int64_t ne02 = src0->ne[2];
|
4896
|
+
const int64_t ne03 = src0->ne[3];
|
4829
4897
|
|
4830
4898
|
const size_t nb00 = src0->nb[0];
|
4831
4899
|
const size_t nb01 = src0->nb[1];
|
4832
4900
|
const size_t nb02 = src0->nb[2];
|
4833
4901
|
const size_t nb03 = src0->nb[3];
|
4834
4902
|
|
4835
|
-
|
4903
|
+
const size_t nb0 = dst->nb[0];
|
4904
|
+
const size_t nb1 = dst->nb[1];
|
4905
|
+
const size_t nb2 = dst->nb[2];
|
4906
|
+
const size_t nb3 = dst->nb[3];
|
4907
|
+
|
4908
|
+
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
|
4836
4909
|
memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
|
4837
4910
|
return;
|
4838
4911
|
}
|
4839
4912
|
|
4840
|
-
if (src0->
|
4841
|
-
|
4842
|
-
|
4843
|
-
|
4844
|
-
|
4845
|
-
|
4846
|
-
|
4847
|
-
|
4848
|
-
|
4849
|
-
|
4850
|
-
|
4851
|
-
|
4852
|
-
|
4853
|
-
id++;
|
4854
|
-
}
|
4855
|
-
}
|
4856
|
-
}
|
4857
|
-
} else if (dst->type == GGML_TYPE_F32) {
|
4858
|
-
size_t id = 0;
|
4859
|
-
float * dst_ptr = (float *) dst->data;
|
4860
|
-
|
4861
|
-
for (int i03 = 0; i03 < ne03; i03++) {
|
4862
|
-
for (int i02 = 0; i02 < ne02; i02++) {
|
4863
|
-
for (int i01 = 0; i01 < ne01; i01++) {
|
4864
|
-
for (int i00 = 0; i00 < ne00; i00++) {
|
4865
|
-
const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
4866
|
-
|
4867
|
-
dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
|
4868
|
-
id++;
|
4869
|
-
}
|
4870
|
-
}
|
4913
|
+
if (src0->type == dst->type &&
|
4914
|
+
src0->ne[0] == dst->ne[0] &&
|
4915
|
+
src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) {
|
4916
|
+
// copy by rows
|
4917
|
+
const size_t rs = ne00*nb00;
|
4918
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
4919
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
4920
|
+
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
4921
|
+
memcpy(
|
4922
|
+
((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
|
4923
|
+
((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
|
4924
|
+
rs);
|
4871
4925
|
}
|
4872
4926
|
}
|
4873
|
-
} else {
|
4874
|
-
GGML_ASSERT(false); // TODO: implement
|
4875
4927
|
}
|
4876
|
-
|
4877
|
-
|
4878
|
-
|
4879
|
-
if (dst->type == GGML_TYPE_F32) {
|
4880
|
-
size_t id = 0;
|
4881
|
-
float * dst_ptr = (float *) dst->data;
|
4882
|
-
|
4883
|
-
for (int i03 = 0; i03 < ne03; i03++) {
|
4884
|
-
for (int i02 = 0; i02 < ne02; i02++) {
|
4885
|
-
for (int i01 = 0; i01 < ne01; i01++) {
|
4886
|
-
for (int i00 = 0; i00 < ne00; i00++) {
|
4887
|
-
const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
4928
|
+
return;
|
4929
|
+
}
|
4888
4930
|
|
4889
|
-
|
4890
|
-
|
4931
|
+
// TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
|
4932
|
+
|
4933
|
+
// dst counters
|
4934
|
+
int64_t i10 = 0;
|
4935
|
+
int64_t i11 = 0;
|
4936
|
+
int64_t i12 = 0;
|
4937
|
+
int64_t i13 = 0;
|
4938
|
+
|
4939
|
+
if (dst->type == GGML_TYPE_F16) {
|
4940
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
4941
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
4942
|
+
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
4943
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
4944
|
+
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
4945
|
+
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
|
4946
|
+
|
4947
|
+
memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
|
4948
|
+
|
4949
|
+
if (++i10 == ne00) {
|
4950
|
+
i10 = 0;
|
4951
|
+
if (++i11 == ne01) {
|
4952
|
+
i11 = 0;
|
4953
|
+
if (++i12 == ne02) {
|
4954
|
+
i12 = 0;
|
4955
|
+
if (++i13 == ne03) {
|
4956
|
+
i13 = 0;
|
4957
|
+
}
|
4958
|
+
}
|
4959
|
+
}
|
4891
4960
|
}
|
4892
4961
|
}
|
4893
4962
|
}
|
4894
4963
|
}
|
4895
|
-
}
|
4896
|
-
|
4897
|
-
|
4898
|
-
|
4899
|
-
|
4900
|
-
|
4901
|
-
|
4902
|
-
|
4903
|
-
|
4904
|
-
|
4905
|
-
|
4906
|
-
|
4964
|
+
}
|
4965
|
+
} else if (dst->type == GGML_TYPE_F32) {
|
4966
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
4967
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
4968
|
+
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
4969
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
4970
|
+
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
4971
|
+
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
|
4972
|
+
|
4973
|
+
*(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
|
4974
|
+
|
4975
|
+
if (++i10 == ne00) {
|
4976
|
+
i10 = 0;
|
4977
|
+
if (++i11 == ne01) {
|
4978
|
+
i11 = 0;
|
4979
|
+
if (++i12 == ne02) {
|
4980
|
+
i12 = 0;
|
4981
|
+
if (++i13 == ne03) {
|
4982
|
+
i13 = 0;
|
4983
|
+
}
|
4984
|
+
}
|
4985
|
+
}
|
4907
4986
|
}
|
4908
4987
|
}
|
4909
4988
|
}
|
4910
4989
|
}
|
4911
|
-
} else {
|
4912
|
-
GGML_ASSERT(false); // TODO: implement
|
4913
4990
|
}
|
4991
|
+
} else {
|
4992
|
+
GGML_ASSERT(false); // TODO: implement
|
4914
4993
|
}
|
4915
4994
|
}
|
4916
4995
|
|
@@ -4919,102 +4998,92 @@ static void ggml_compute_forward_dup_f32(
|
|
4919
4998
|
const struct ggml_tensor * src0,
|
4920
4999
|
struct ggml_tensor * dst) {
|
4921
5000
|
GGML_ASSERT(params->ith == 0);
|
4922
|
-
GGML_ASSERT(ggml_is_contiguous(dst));
|
4923
5001
|
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
|
4924
5002
|
|
4925
5003
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
4926
5004
|
return;
|
4927
5005
|
}
|
4928
5006
|
|
4929
|
-
const
|
4930
|
-
const
|
4931
|
-
const
|
4932
|
-
const
|
5007
|
+
const int64_t ne00 = src0->ne[0];
|
5008
|
+
const int64_t ne01 = src0->ne[1];
|
5009
|
+
const int64_t ne02 = src0->ne[2];
|
5010
|
+
const int64_t ne03 = src0->ne[3];
|
4933
5011
|
|
4934
5012
|
const size_t nb00 = src0->nb[0];
|
4935
5013
|
const size_t nb01 = src0->nb[1];
|
4936
5014
|
const size_t nb02 = src0->nb[2];
|
4937
5015
|
const size_t nb03 = src0->nb[3];
|
4938
5016
|
|
4939
|
-
|
5017
|
+
const size_t nb0 = dst->nb[0];
|
5018
|
+
const size_t nb1 = dst->nb[1];
|
5019
|
+
const size_t nb2 = dst->nb[2];
|
5020
|
+
const size_t nb3 = dst->nb[3];
|
5021
|
+
|
5022
|
+
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
|
4940
5023
|
memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
|
4941
5024
|
return;
|
4942
5025
|
}
|
4943
5026
|
|
4944
|
-
|
4945
|
-
|
4946
|
-
|
4947
|
-
|
4948
|
-
|
4949
|
-
|
4950
|
-
|
4951
|
-
|
4952
|
-
|
4953
|
-
|
4954
|
-
|
4955
|
-
|
4956
|
-
|
4957
|
-
|
4958
|
-
|
4959
|
-
|
4960
|
-
|
4961
|
-
|
4962
|
-
|
4963
|
-
|
4964
|
-
|
4965
|
-
|
4966
|
-
|
4967
|
-
|
4968
|
-
|
4969
|
-
|
4970
|
-
|
4971
|
-
dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
|
4972
|
-
id++;
|
5027
|
+
// dst counters
|
5028
|
+
int64_t i10 = 0;
|
5029
|
+
int64_t i11 = 0;
|
5030
|
+
int64_t i12 = 0;
|
5031
|
+
int64_t i13 = 0;
|
5032
|
+
|
5033
|
+
if (dst->type == GGML_TYPE_F32) {
|
5034
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
5035
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
5036
|
+
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
5037
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
5038
|
+
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
5039
|
+
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
|
5040
|
+
|
5041
|
+
memcpy(dst_ptr, src0_ptr, sizeof(float));
|
5042
|
+
|
5043
|
+
if (++i10 == dst->ne[0]) {
|
5044
|
+
i10 = 0;
|
5045
|
+
if (++i11 == dst->ne[1]) {
|
5046
|
+
i11 = 0;
|
5047
|
+
if (++i12 == dst->ne[2]) {
|
5048
|
+
i12 = 0;
|
5049
|
+
if (++i13 == dst->ne[3]) {
|
5050
|
+
i13 = 0;
|
5051
|
+
}
|
5052
|
+
}
|
5053
|
+
}
|
4973
5054
|
}
|
4974
5055
|
}
|
4975
5056
|
}
|
4976
5057
|
}
|
4977
|
-
} else {
|
4978
|
-
GGML_ASSERT(false); // TODO: implement
|
4979
5058
|
}
|
4980
|
-
} else {
|
4981
|
-
|
4982
|
-
|
4983
|
-
|
4984
|
-
|
4985
|
-
|
4986
|
-
|
4987
|
-
|
4988
|
-
|
4989
|
-
|
4990
|
-
|
4991
|
-
|
4992
|
-
|
4993
|
-
|
4994
|
-
|
4995
|
-
|
4996
|
-
|
4997
|
-
|
4998
|
-
|
4999
|
-
|
5000
|
-
|
5001
|
-
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
|
5002
|
-
|
5003
|
-
for (int i03 = 0; i03 < ne03; i03++) {
|
5004
|
-
for (int i02 = 0; i02 < ne02; i02++) {
|
5005
|
-
for (int i01 = 0; i01 < ne01; i01++) {
|
5006
|
-
for (int i00 = 0; i00 < ne00; i00++) {
|
5007
|
-
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
5008
|
-
|
5009
|
-
dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
|
5010
|
-
id++;
|
5059
|
+
} else if (dst->type == GGML_TYPE_F16) {
|
5060
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
5061
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
5062
|
+
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
5063
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
5064
|
+
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
5065
|
+
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
|
5066
|
+
|
5067
|
+
*(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
|
5068
|
+
|
5069
|
+
if (++i10 == dst->ne[0]) {
|
5070
|
+
i10 = 0;
|
5071
|
+
if (++i11 == dst->ne[1]) {
|
5072
|
+
i11 = 0;
|
5073
|
+
if (++i12 == dst->ne[2]) {
|
5074
|
+
i12 = 0;
|
5075
|
+
if (++i13 == dst->ne[3]) {
|
5076
|
+
i13 = 0;
|
5077
|
+
}
|
5078
|
+
}
|
5079
|
+
}
|
5011
5080
|
}
|
5012
5081
|
}
|
5013
5082
|
}
|
5014
5083
|
}
|
5015
|
-
} else {
|
5016
|
-
GGML_ASSERT(false); // TODO: implement
|
5017
5084
|
}
|
5085
|
+
} else {
|
5086
|
+
GGML_ASSERT(false); // TODO: implement
|
5018
5087
|
}
|
5019
5088
|
}
|
5020
5089
|
|
@@ -5389,18 +5458,18 @@ static void ggml_compute_forward_sum_f32(
|
|
5389
5458
|
assert(ggml_is_scalar(dst));
|
5390
5459
|
assert(src0->nb[0] == sizeof(float));
|
5391
5460
|
|
5392
|
-
const
|
5393
|
-
const
|
5394
|
-
const
|
5395
|
-
const
|
5461
|
+
const int64_t ne00 = src0->ne[0];
|
5462
|
+
const int64_t ne01 = src0->ne[1];
|
5463
|
+
const int64_t ne02 = src0->ne[2];
|
5464
|
+
const int64_t ne03 = src0->ne[3];
|
5396
5465
|
|
5397
5466
|
const size_t nb01 = src0->nb[1];
|
5398
5467
|
const size_t nb02 = src0->nb[2];
|
5399
5468
|
const size_t nb03 = src0->nb[3];
|
5400
5469
|
|
5401
|
-
for (
|
5402
|
-
for (
|
5403
|
-
for (
|
5470
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
5471
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
5472
|
+
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
5404
5473
|
ggml_vec_sum_f32(ne00,
|
5405
5474
|
(float *) (dst->data),
|
5406
5475
|
(float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
|
@@ -5445,19 +5514,19 @@ static void ggml_compute_forward_mean_f32(
|
|
5445
5514
|
|
5446
5515
|
assert(src0->nb[0] == sizeof(float));
|
5447
5516
|
|
5448
|
-
const
|
5449
|
-
const
|
5450
|
-
const
|
5451
|
-
const
|
5517
|
+
const int64_t ne00 = src0->ne[0];
|
5518
|
+
const int64_t ne01 = src0->ne[1];
|
5519
|
+
const int64_t ne02 = src0->ne[2];
|
5520
|
+
const int64_t ne03 = src0->ne[3];
|
5452
5521
|
|
5453
5522
|
const size_t nb01 = src0->nb[1];
|
5454
5523
|
const size_t nb02 = src0->nb[2];
|
5455
5524
|
const size_t nb03 = src0->nb[3];
|
5456
5525
|
|
5457
|
-
const
|
5458
|
-
const
|
5459
|
-
const
|
5460
|
-
const
|
5526
|
+
const int64_t ne0 = dst->ne[0];
|
5527
|
+
const int64_t ne1 = dst->ne[1];
|
5528
|
+
const int64_t ne2 = dst->ne[2];
|
5529
|
+
const int64_t ne3 = dst->ne[3];
|
5461
5530
|
|
5462
5531
|
assert(ne0 == 1);
|
5463
5532
|
assert(ne1 == ne01);
|
@@ -5473,9 +5542,9 @@ static void ggml_compute_forward_mean_f32(
|
|
5473
5542
|
const size_t nb2 = dst->nb[2];
|
5474
5543
|
const size_t nb3 = dst->nb[3];
|
5475
5544
|
|
5476
|
-
for (
|
5477
|
-
for (
|
5478
|
-
for (
|
5545
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
5546
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
5547
|
+
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
5479
5548
|
ggml_vec_sum_f32(ne00,
|
5480
5549
|
(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
|
5481
5550
|
(float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
|
@@ -5962,10 +6031,10 @@ static void ggml_compute_forward_norm_f32(
|
|
5962
6031
|
const int ith = params->ith;
|
5963
6032
|
const int nth = params->nth;
|
5964
6033
|
|
5965
|
-
const
|
5966
|
-
const
|
5967
|
-
const
|
5968
|
-
const
|
6034
|
+
const int64_t ne00 = src0->ne[0];
|
6035
|
+
const int64_t ne01 = src0->ne[1];
|
6036
|
+
const int64_t ne02 = src0->ne[2];
|
6037
|
+
const int64_t ne03 = src0->ne[3];
|
5969
6038
|
|
5970
6039
|
const size_t nb01 = src0->nb[1];
|
5971
6040
|
const size_t nb02 = src0->nb[2];
|
@@ -5978,13 +6047,13 @@ static void ggml_compute_forward_norm_f32(
|
|
5978
6047
|
const float eps = 1e-5f; // TODO: make this a parameter
|
5979
6048
|
|
5980
6049
|
// TODO: optimize
|
5981
|
-
for (
|
5982
|
-
for (
|
5983
|
-
for (
|
6050
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
6051
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
6052
|
+
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
|
5984
6053
|
const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
|
5985
6054
|
|
5986
6055
|
ggml_float sum = 0.0;
|
5987
|
-
for (
|
6056
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
5988
6057
|
sum += (ggml_float)x[i00];
|
5989
6058
|
}
|
5990
6059
|
|
@@ -5993,7 +6062,7 @@ static void ggml_compute_forward_norm_f32(
|
|
5993
6062
|
float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
|
5994
6063
|
|
5995
6064
|
ggml_float sum2 = 0.0;
|
5996
|
-
for (
|
6065
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
5997
6066
|
float v = x[i00] - mean;
|
5998
6067
|
y[i00] = v;
|
5999
6068
|
sum2 += (ggml_float)(v*v);
|
@@ -6045,10 +6114,10 @@ static void ggml_compute_forward_rms_norm_f32(
|
|
6045
6114
|
const int ith = params->ith;
|
6046
6115
|
const int nth = params->nth;
|
6047
6116
|
|
6048
|
-
const
|
6049
|
-
const
|
6050
|
-
const
|
6051
|
-
const
|
6117
|
+
const int64_t ne00 = src0->ne[0];
|
6118
|
+
const int64_t ne01 = src0->ne[1];
|
6119
|
+
const int64_t ne02 = src0->ne[2];
|
6120
|
+
const int64_t ne03 = src0->ne[3];
|
6052
6121
|
|
6053
6122
|
const size_t nb01 = src0->nb[1];
|
6054
6123
|
const size_t nb02 = src0->nb[2];
|
@@ -6061,13 +6130,13 @@ static void ggml_compute_forward_rms_norm_f32(
|
|
6061
6130
|
const float eps = 1e-6f; // TODO: make this a parameter
|
6062
6131
|
|
6063
6132
|
// TODO: optimize
|
6064
|
-
for (
|
6065
|
-
for (
|
6066
|
-
for (
|
6133
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
6134
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
6135
|
+
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
|
6067
6136
|
const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
|
6068
6137
|
|
6069
6138
|
ggml_float sum = 0.0;
|
6070
|
-
for (
|
6139
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
6071
6140
|
sum += (ggml_float)(x[i00] * x[i00]);
|
6072
6141
|
}
|
6073
6142
|
|
@@ -6120,13 +6189,13 @@ static bool ggml_compute_forward_mul_mat_use_blas(
|
|
6120
6189
|
const struct ggml_tensor * src0,
|
6121
6190
|
const struct ggml_tensor * src1,
|
6122
6191
|
struct ggml_tensor * dst) {
|
6123
|
-
//const
|
6124
|
-
//const
|
6192
|
+
//const int64_t ne00 = src0->ne[0];
|
6193
|
+
//const int64_t ne01 = src0->ne[1];
|
6125
6194
|
|
6126
|
-
const
|
6195
|
+
const int64_t ne10 = src1->ne[0];
|
6127
6196
|
|
6128
|
-
const
|
6129
|
-
const
|
6197
|
+
const int64_t ne0 = dst->ne[0];
|
6198
|
+
const int64_t ne1 = dst->ne[1];
|
6130
6199
|
|
6131
6200
|
// TODO: find the optimal values for these
|
6132
6201
|
if (ggml_is_contiguous(src0) &&
|
@@ -6148,23 +6217,23 @@ static void ggml_compute_forward_mul_mat_f32(
|
|
6148
6217
|
int64_t t0 = ggml_perf_time_us();
|
6149
6218
|
UNUSED(t0);
|
6150
6219
|
|
6151
|
-
const
|
6152
|
-
const
|
6153
|
-
const
|
6154
|
-
const
|
6220
|
+
const int64_t ne00 = src0->ne[0];
|
6221
|
+
const int64_t ne01 = src0->ne[1];
|
6222
|
+
const int64_t ne02 = src0->ne[2];
|
6223
|
+
const int64_t ne03 = src0->ne[3];
|
6155
6224
|
|
6156
6225
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
6157
|
-
const
|
6226
|
+
const int64_t ne10 = src1->ne[0];
|
6158
6227
|
#endif
|
6159
|
-
const
|
6228
|
+
const int64_t ne11 = src1->ne[1];
|
6160
6229
|
#ifndef NDEBUG
|
6161
|
-
const
|
6162
|
-
const
|
6230
|
+
const int64_t ne12 = src1->ne[2];
|
6231
|
+
const int64_t ne13 = src1->ne[3];
|
6163
6232
|
|
6164
|
-
const
|
6165
|
-
const
|
6166
|
-
const
|
6167
|
-
const
|
6233
|
+
const int64_t ne0 = dst->ne[0];
|
6234
|
+
const int64_t ne1 = dst->ne[1];
|
6235
|
+
const int64_t ne2 = dst->ne[2];
|
6236
|
+
const int64_t ne3 = dst->ne[3];
|
6168
6237
|
|
6169
6238
|
const int nb00 = src0->nb[0];
|
6170
6239
|
#endif
|
@@ -6224,8 +6293,8 @@ static void ggml_compute_forward_mul_mat_f32(
|
|
6224
6293
|
return;
|
6225
6294
|
}
|
6226
6295
|
|
6227
|
-
for (
|
6228
|
-
for (
|
6296
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
6297
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
6229
6298
|
const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
|
6230
6299
|
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
6231
6300
|
|
@@ -6272,7 +6341,7 @@ static void ggml_compute_forward_mul_mat_f32(
|
|
6272
6341
|
const int i02 = (ir - i03*ne02*ne01)/ne01;
|
6273
6342
|
const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
6274
6343
|
|
6275
|
-
for (
|
6344
|
+
for (int64_t ic = 0; ic < ne11; ++ic) {
|
6276
6345
|
// src1 indices
|
6277
6346
|
const int i13 = i03;
|
6278
6347
|
const int i12 = i02;
|
@@ -6313,21 +6382,21 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
6313
6382
|
int64_t t0 = ggml_perf_time_us();
|
6314
6383
|
UNUSED(t0);
|
6315
6384
|
|
6316
|
-
const
|
6317
|
-
const
|
6318
|
-
const
|
6319
|
-
const
|
6385
|
+
const int64_t ne00 = src0->ne[0];
|
6386
|
+
const int64_t ne01 = src0->ne[1];
|
6387
|
+
const int64_t ne02 = src0->ne[2];
|
6388
|
+
const int64_t ne03 = src0->ne[3];
|
6320
6389
|
|
6321
|
-
const
|
6322
|
-
const
|
6323
|
-
const
|
6324
|
-
const
|
6390
|
+
const int64_t ne10 = src1->ne[0];
|
6391
|
+
const int64_t ne11 = src1->ne[1];
|
6392
|
+
const int64_t ne12 = src1->ne[2];
|
6393
|
+
const int64_t ne13 = src1->ne[3];
|
6325
6394
|
|
6326
|
-
const
|
6327
|
-
const
|
6328
|
-
const
|
6329
|
-
const
|
6330
|
-
//const
|
6395
|
+
const int64_t ne0 = dst->ne[0];
|
6396
|
+
const int64_t ne1 = dst->ne[1];
|
6397
|
+
const int64_t ne2 = dst->ne[2];
|
6398
|
+
const int64_t ne3 = dst->ne[3];
|
6399
|
+
//const int64_t ne = ne0*ne1*ne2*ne3;
|
6331
6400
|
|
6332
6401
|
const int nb00 = src0->nb[0];
|
6333
6402
|
const int nb01 = src0->nb[1];
|
@@ -6387,12 +6456,12 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
6387
6456
|
|
6388
6457
|
float * const wdata = params->wdata;
|
6389
6458
|
|
6390
|
-
for (
|
6391
|
-
for (
|
6459
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
6460
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
6392
6461
|
{
|
6393
6462
|
size_t id = 0;
|
6394
|
-
for (
|
6395
|
-
for (
|
6463
|
+
for (int64_t i01 = 0; i01 < ne01; ++i01) {
|
6464
|
+
for (int64_t i00 = 0; i00 < ne00; ++i00) {
|
6396
6465
|
wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
|
6397
6466
|
}
|
6398
6467
|
}
|
@@ -6422,10 +6491,10 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
6422
6491
|
ggml_fp16_t * const wdata = params->wdata;
|
6423
6492
|
|
6424
6493
|
size_t id = 0;
|
6425
|
-
for (
|
6426
|
-
for (
|
6427
|
-
for (
|
6428
|
-
for (
|
6494
|
+
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
6495
|
+
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
6496
|
+
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
6497
|
+
for (int64_t i10 = 0; i10 < ne10; ++i10) {
|
6429
6498
|
wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
|
6430
6499
|
}
|
6431
6500
|
}
|
@@ -6477,7 +6546,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
6477
6546
|
|
6478
6547
|
float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
|
6479
6548
|
|
6480
|
-
for (
|
6549
|
+
for (int64_t ic = 0; ic < ne11; ++ic) {
|
6481
6550
|
ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00);
|
6482
6551
|
}
|
6483
6552
|
}
|
@@ -6526,20 +6595,20 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
6526
6595
|
int64_t t0 = ggml_perf_time_us();
|
6527
6596
|
UNUSED(t0);
|
6528
6597
|
|
6529
|
-
const
|
6530
|
-
const
|
6531
|
-
const
|
6532
|
-
const
|
6598
|
+
const int64_t ne00 = src0->ne[0];
|
6599
|
+
const int64_t ne01 = src0->ne[1];
|
6600
|
+
const int64_t ne02 = src0->ne[2];
|
6601
|
+
const int64_t ne03 = src0->ne[3];
|
6533
6602
|
|
6534
|
-
const
|
6535
|
-
const
|
6536
|
-
const
|
6537
|
-
const
|
6603
|
+
const int64_t ne10 = src1->ne[0];
|
6604
|
+
const int64_t ne11 = src1->ne[1];
|
6605
|
+
const int64_t ne12 = src1->ne[2];
|
6606
|
+
const int64_t ne13 = src1->ne[3];
|
6538
6607
|
|
6539
|
-
const
|
6540
|
-
const
|
6541
|
-
const
|
6542
|
-
const
|
6608
|
+
const int64_t ne0 = dst->ne[0];
|
6609
|
+
const int64_t ne1 = dst->ne[1];
|
6610
|
+
const int64_t ne2 = dst->ne[2];
|
6611
|
+
const int64_t ne3 = dst->ne[3];
|
6543
6612
|
|
6544
6613
|
const int nb00 = src0->nb[0];
|
6545
6614
|
const int nb01 = src0->nb[1];
|
@@ -6603,11 +6672,11 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
6603
6672
|
float * const wdata = params->wdata;
|
6604
6673
|
dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
|
6605
6674
|
|
6606
|
-
for (
|
6607
|
-
for (
|
6675
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
6676
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
6608
6677
|
{
|
6609
6678
|
size_t id = 0;
|
6610
|
-
for (
|
6679
|
+
for (int64_t i01 = 0; i01 < ne01; ++i01) {
|
6611
6680
|
dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
|
6612
6681
|
id += ne00;
|
6613
6682
|
}
|
@@ -6637,9 +6706,9 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
6637
6706
|
char * wdata = params->wdata;
|
6638
6707
|
const size_t row_size = ne10*GGML_TYPE_SIZE[type]/GGML_BLCK_SIZE[type];
|
6639
6708
|
|
6640
|
-
for (
|
6641
|
-
for (
|
6642
|
-
for (
|
6709
|
+
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
6710
|
+
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
6711
|
+
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
6643
6712
|
quantize_row_q((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
|
6644
6713
|
wdata += row_size;
|
6645
6714
|
}
|
@@ -6688,7 +6757,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
6688
6757
|
|
6689
6758
|
assert(ne00 % 32 == 0);
|
6690
6759
|
|
6691
|
-
for (
|
6760
|
+
for (int64_t ic = 0; ic < ne11; ++ic) {
|
6692
6761
|
vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
|
6693
6762
|
}
|
6694
6763
|
}
|
@@ -7169,7 +7238,6 @@ static void ggml_compute_forward_rope_f32(
|
|
7169
7238
|
const struct ggml_tensor * src0,
|
7170
7239
|
const struct ggml_tensor * src1,
|
7171
7240
|
struct ggml_tensor * dst) {
|
7172
|
-
assert(params->ith == 0);
|
7173
7241
|
assert(src1->type == GGML_TYPE_I32);
|
7174
7242
|
assert(ggml_nelements(src1) == 3);
|
7175
7243
|
|
@@ -7181,10 +7249,10 @@ static void ggml_compute_forward_rope_f32(
|
|
7181
7249
|
const int n_dims = ((int32_t *) src1->data)[1];
|
7182
7250
|
const int mode = ((int32_t *) src1->data)[2];
|
7183
7251
|
|
7184
|
-
//const
|
7185
|
-
const
|
7186
|
-
const
|
7187
|
-
const
|
7252
|
+
//const int64_t ne0 = src0->ne[0];
|
7253
|
+
const int64_t ne1 = src0->ne[1];
|
7254
|
+
const int64_t ne2 = src0->ne[2];
|
7255
|
+
const int64_t ne3 = src0->ne[3];
|
7188
7256
|
|
7189
7257
|
const int nb0 = src0->nb[0];
|
7190
7258
|
const int nb1 = src0->nb[1];
|
@@ -7196,11 +7264,28 @@ static void ggml_compute_forward_rope_f32(
|
|
7196
7264
|
|
7197
7265
|
assert(nb0 == sizeof(float));
|
7198
7266
|
|
7199
|
-
|
7200
|
-
|
7201
|
-
|
7267
|
+
const int ith = params->ith;
|
7268
|
+
const int nth = params->nth;
|
7269
|
+
|
7270
|
+
const int nr = ggml_nrows(src0);
|
7271
|
+
|
7272
|
+
// rows per thread
|
7273
|
+
const int dr = (nr + nth - 1)/nth;
|
7274
|
+
|
7275
|
+
// row range for this thread
|
7276
|
+
const int ir0 = dr*ith;
|
7277
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
7278
|
+
|
7279
|
+
// row index used to determine which thread to use
|
7280
|
+
int ir = 0;
|
7281
|
+
|
7282
|
+
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
7283
|
+
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
7202
7284
|
const int p = (mode == 0 ? n_past + i2 : i2);
|
7203
|
-
for (
|
7285
|
+
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
7286
|
+
if (ir++ < ir0) continue;
|
7287
|
+
if (ir > ir1) break;
|
7288
|
+
|
7204
7289
|
for (int i0 = 0; i0 < n_dims; i0 += 2) {
|
7205
7290
|
const float theta = powf(10000.0, ((float)-i0)/n_dims);
|
7206
7291
|
|
@@ -7226,7 +7311,6 @@ static void ggml_compute_forward_rope_f16(
|
|
7226
7311
|
const struct ggml_tensor * src0,
|
7227
7312
|
const struct ggml_tensor * src1,
|
7228
7313
|
struct ggml_tensor * dst) {
|
7229
|
-
assert(params->ith == 0);
|
7230
7314
|
assert(src1->type == GGML_TYPE_I32);
|
7231
7315
|
assert(ggml_nelements(src1) == 3);
|
7232
7316
|
|
@@ -7238,10 +7322,10 @@ static void ggml_compute_forward_rope_f16(
|
|
7238
7322
|
const int n_dims = ((int32_t *) src1->data)[1];
|
7239
7323
|
const int mode = ((int32_t *) src1->data)[2];
|
7240
7324
|
|
7241
|
-
//const
|
7242
|
-
const
|
7243
|
-
const
|
7244
|
-
const
|
7325
|
+
//const int64_t ne0 = src0->ne[0];
|
7326
|
+
const int64_t ne1 = src0->ne[1];
|
7327
|
+
const int64_t ne2 = src0->ne[2];
|
7328
|
+
const int64_t ne3 = src0->ne[3];
|
7245
7329
|
|
7246
7330
|
const int nb0 = src0->nb[0];
|
7247
7331
|
const int nb1 = src0->nb[1];
|
@@ -7253,10 +7337,28 @@ static void ggml_compute_forward_rope_f16(
|
|
7253
7337
|
|
7254
7338
|
assert(nb0 == sizeof(ggml_fp16_t));
|
7255
7339
|
|
7256
|
-
|
7257
|
-
|
7340
|
+
const int ith = params->ith;
|
7341
|
+
const int nth = params->nth;
|
7342
|
+
|
7343
|
+
const int nr = ggml_nrows(src0);
|
7344
|
+
|
7345
|
+
// rows per thread
|
7346
|
+
const int dr = (nr + nth - 1)/nth;
|
7347
|
+
|
7348
|
+
// row range for this thread
|
7349
|
+
const int ir0 = dr*ith;
|
7350
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
7351
|
+
|
7352
|
+
// row index used to determine which thread to use
|
7353
|
+
int ir = 0;
|
7354
|
+
|
7355
|
+
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
7356
|
+
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
7258
7357
|
const int p = (mode == 0 ? n_past + i2 : i2);
|
7259
|
-
for (
|
7358
|
+
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
7359
|
+
if (ir++ < ir0) continue;
|
7360
|
+
if (ir > ir1) break;
|
7361
|
+
|
7260
7362
|
for (int i0 = 0; i0 < n_dims; i0 += 2) {
|
7261
7363
|
const float theta = powf(10000.0, ((float)-i0)/n_dims);
|
7262
7364
|
|
@@ -7317,21 +7419,21 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
|
|
7317
7419
|
int64_t t0 = ggml_perf_time_us();
|
7318
7420
|
UNUSED(t0);
|
7319
7421
|
|
7320
|
-
const
|
7321
|
-
const
|
7322
|
-
const
|
7323
|
-
//const
|
7422
|
+
const int64_t ne00 = src0->ne[0];
|
7423
|
+
const int64_t ne01 = src0->ne[1];
|
7424
|
+
const int64_t ne02 = src0->ne[2];
|
7425
|
+
//const int64_t ne03 = src0->ne[3];
|
7324
7426
|
|
7325
|
-
const
|
7326
|
-
const
|
7327
|
-
//const
|
7328
|
-
//const
|
7427
|
+
const int64_t ne10 = src1->ne[0];
|
7428
|
+
const int64_t ne11 = src1->ne[1];
|
7429
|
+
//const int64_t ne12 = src1->ne[2];
|
7430
|
+
//const int64_t ne13 = src1->ne[3];
|
7329
7431
|
|
7330
|
-
//const
|
7331
|
-
//const
|
7332
|
-
//const
|
7333
|
-
//const
|
7334
|
-
//const
|
7432
|
+
//const int64_t ne0 = dst->ne[0];
|
7433
|
+
//const int64_t ne1 = dst->ne[1];
|
7434
|
+
//const int64_t ne2 = dst->ne[2];
|
7435
|
+
//const int64_t ne3 = dst->ne[3];
|
7436
|
+
//const int64_t ne = ne0*ne1*ne2*ne3;
|
7335
7437
|
|
7336
7438
|
const int nb00 = src0->nb[0];
|
7337
7439
|
const int nb01 = src0->nb[1];
|
@@ -7368,11 +7470,11 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
|
|
7368
7470
|
{
|
7369
7471
|
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
7370
7472
|
|
7371
|
-
for (
|
7372
|
-
for (
|
7473
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
7474
|
+
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
7373
7475
|
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
|
7374
7476
|
ggml_fp16_t * dst_data = wdata + i02*ew0*ne00;
|
7375
|
-
for (
|
7477
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
7376
7478
|
dst_data[i00*ew0 + i01] = src[i00];
|
7377
7479
|
}
|
7378
7480
|
}
|
@@ -7383,10 +7485,10 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
|
|
7383
7485
|
{
|
7384
7486
|
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00;
|
7385
7487
|
|
7386
|
-
for (
|
7488
|
+
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
7387
7489
|
const float * const src = (float *)((char *) src1->data + i11*nb11);
|
7388
7490
|
ggml_fp16_t * dst_data = wdata;
|
7389
|
-
for (
|
7491
|
+
for (int64_t i10 = 0; i10 < ne10; i10++) {
|
7390
7492
|
dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
|
7391
7493
|
}
|
7392
7494
|
}
|
@@ -7411,7 +7513,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
|
|
7411
7513
|
|
7412
7514
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
7413
7515
|
float * dst_data = (float *)((char *) dst->data + i1*nb1);
|
7414
|
-
for (
|
7516
|
+
for (int64_t i0 = 0; i0 < ne10; ++i0) {
|
7415
7517
|
dst_data[i0] = 0;
|
7416
7518
|
for (int k = -nh; k <= nh; k++) {
|
7417
7519
|
float v = 0.0f;
|
@@ -7437,21 +7539,21 @@ static void ggml_compute_forward_conv_1d_1s_f32(
|
|
7437
7539
|
int64_t t0 = ggml_perf_time_us();
|
7438
7540
|
UNUSED(t0);
|
7439
7541
|
|
7440
|
-
const
|
7441
|
-
const
|
7442
|
-
const
|
7443
|
-
//const
|
7542
|
+
const int64_t ne00 = src0->ne[0];
|
7543
|
+
const int64_t ne01 = src0->ne[1];
|
7544
|
+
const int64_t ne02 = src0->ne[2];
|
7545
|
+
//const int64_t ne03 = src0->ne[3];
|
7444
7546
|
|
7445
|
-
const
|
7446
|
-
const
|
7447
|
-
//const
|
7448
|
-
//const
|
7547
|
+
const int64_t ne10 = src1->ne[0];
|
7548
|
+
const int64_t ne11 = src1->ne[1];
|
7549
|
+
//const int64_t ne12 = src1->ne[2];
|
7550
|
+
//const int64_t ne13 = src1->ne[3];
|
7449
7551
|
|
7450
|
-
//const
|
7451
|
-
//const
|
7452
|
-
//const
|
7453
|
-
//const
|
7454
|
-
//const
|
7552
|
+
//const int64_t ne0 = dst->ne[0];
|
7553
|
+
//const int64_t ne1 = dst->ne[1];
|
7554
|
+
//const int64_t ne2 = dst->ne[2];
|
7555
|
+
//const int64_t ne3 = dst->ne[3];
|
7556
|
+
//const int64_t ne = ne0*ne1*ne2*ne3;
|
7455
7557
|
|
7456
7558
|
const int nb00 = src0->nb[0];
|
7457
7559
|
const int nb01 = src0->nb[1];
|
@@ -7488,11 +7590,11 @@ static void ggml_compute_forward_conv_1d_1s_f32(
|
|
7488
7590
|
{
|
7489
7591
|
float * const wdata = (float *) params->wdata + 0;
|
7490
7592
|
|
7491
|
-
for (
|
7492
|
-
for (
|
7593
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
7594
|
+
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
7493
7595
|
const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
|
7494
7596
|
float * dst_data = wdata + i02*ew0*ne00;
|
7495
|
-
for (
|
7597
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
7496
7598
|
dst_data[i00*ew0 + i01] = src[i00];
|
7497
7599
|
}
|
7498
7600
|
}
|
@@ -7503,10 +7605,10 @@ static void ggml_compute_forward_conv_1d_1s_f32(
|
|
7503
7605
|
{
|
7504
7606
|
float * const wdata = (float *) params->wdata + ne02*ew0*ne00;
|
7505
7607
|
|
7506
|
-
for (
|
7608
|
+
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
7507
7609
|
const float * const src = (float *)((char *) src1->data + i11*nb11);
|
7508
7610
|
float * dst_data = wdata;
|
7509
|
-
for (
|
7611
|
+
for (int64_t i10 = 0; i10 < ne10; i10++) {
|
7510
7612
|
dst_data[(i10 + nh)*ew0 + i11] = src[i10];
|
7511
7613
|
}
|
7512
7614
|
}
|
@@ -7531,7 +7633,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
|
|
7531
7633
|
|
7532
7634
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
7533
7635
|
float * dst_data = (float *)((char *) dst->data + i1*nb1);
|
7534
|
-
for (
|
7636
|
+
for (int64_t i0 = 0; i0 < ne10; ++i0) {
|
7535
7637
|
dst_data[i0] = 0;
|
7536
7638
|
for (int k = -nh; k <= nh; k++) {
|
7537
7639
|
float v = 0.0f;
|
@@ -7585,21 +7687,21 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
|
|
7585
7687
|
int64_t t0 = ggml_perf_time_us();
|
7586
7688
|
UNUSED(t0);
|
7587
7689
|
|
7588
|
-
const
|
7589
|
-
const
|
7590
|
-
const
|
7591
|
-
//const
|
7690
|
+
const int64_t ne00 = src0->ne[0];
|
7691
|
+
const int64_t ne01 = src0->ne[1];
|
7692
|
+
const int64_t ne02 = src0->ne[2];
|
7693
|
+
//const int64_t ne03 = src0->ne[3];
|
7592
7694
|
|
7593
|
-
const
|
7594
|
-
const
|
7595
|
-
//const
|
7596
|
-
//const
|
7695
|
+
const int64_t ne10 = src1->ne[0];
|
7696
|
+
const int64_t ne11 = src1->ne[1];
|
7697
|
+
//const int64_t ne12 = src1->ne[2];
|
7698
|
+
//const int64_t ne13 = src1->ne[3];
|
7597
7699
|
|
7598
|
-
//const
|
7599
|
-
//const
|
7600
|
-
//const
|
7601
|
-
//const
|
7602
|
-
//const
|
7700
|
+
//const int64_t ne0 = dst->ne[0];
|
7701
|
+
//const int64_t ne1 = dst->ne[1];
|
7702
|
+
//const int64_t ne2 = dst->ne[2];
|
7703
|
+
//const int64_t ne3 = dst->ne[3];
|
7704
|
+
//const int64_t ne = ne0*ne1*ne2*ne3;
|
7603
7705
|
|
7604
7706
|
const int nb00 = src0->nb[0];
|
7605
7707
|
const int nb01 = src0->nb[1];
|
@@ -7636,11 +7738,11 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
|
|
7636
7738
|
{
|
7637
7739
|
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
7638
7740
|
|
7639
|
-
for (
|
7640
|
-
for (
|
7741
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
7742
|
+
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
7641
7743
|
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
|
7642
7744
|
ggml_fp16_t * dst_data = wdata + i02*ew0*ne00;
|
7643
|
-
for (
|
7745
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
7644
7746
|
dst_data[i00*ew0 + i01] = src[i00];
|
7645
7747
|
}
|
7646
7748
|
}
|
@@ -7651,10 +7753,10 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
|
|
7651
7753
|
{
|
7652
7754
|
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00;
|
7653
7755
|
|
7654
|
-
for (
|
7756
|
+
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
7655
7757
|
const float * const src = (float *)((char *) src1->data + i11*nb11);
|
7656
7758
|
ggml_fp16_t * dst_data = wdata;
|
7657
|
-
for (
|
7759
|
+
for (int64_t i10 = 0; i10 < ne10; i10++) {
|
7658
7760
|
dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
|
7659
7761
|
}
|
7660
7762
|
}
|
@@ -7679,7 +7781,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
|
|
7679
7781
|
|
7680
7782
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
7681
7783
|
float * dst_data = (float *)((char *) dst->data + i1*nb1);
|
7682
|
-
for (
|
7784
|
+
for (int64_t i0 = 0; i0 < ne10; i0 += 2) {
|
7683
7785
|
dst_data[i0/2] = 0;
|
7684
7786
|
for (int k = -nh; k <= nh; k++) {
|
7685
7787
|
float v = 0.0f;
|
@@ -7705,21 +7807,21 @@ static void ggml_compute_forward_conv_1d_2s_f32(
|
|
7705
7807
|
int64_t t0 = ggml_perf_time_us();
|
7706
7808
|
UNUSED(t0);
|
7707
7809
|
|
7708
|
-
const
|
7709
|
-
const
|
7710
|
-
const
|
7711
|
-
//const
|
7810
|
+
const int64_t ne00 = src0->ne[0];
|
7811
|
+
const int64_t ne01 = src0->ne[1];
|
7812
|
+
const int64_t ne02 = src0->ne[2];
|
7813
|
+
//const int64_t ne03 = src0->ne[3];
|
7712
7814
|
|
7713
|
-
const
|
7714
|
-
const
|
7715
|
-
//const
|
7716
|
-
//const
|
7815
|
+
const int64_t ne10 = src1->ne[0];
|
7816
|
+
const int64_t ne11 = src1->ne[1];
|
7817
|
+
//const int64_t ne12 = src1->ne[2];
|
7818
|
+
//const int64_t ne13 = src1->ne[3];
|
7717
7819
|
|
7718
|
-
//const
|
7719
|
-
//const
|
7720
|
-
//const
|
7721
|
-
//const
|
7722
|
-
//const
|
7820
|
+
//const int64_t ne0 = dst->ne[0];
|
7821
|
+
//const int64_t ne1 = dst->ne[1];
|
7822
|
+
//const int64_t ne2 = dst->ne[2];
|
7823
|
+
//const int64_t ne3 = dst->ne[3];
|
7824
|
+
//const int64_t ne = ne0*ne1*ne2*ne3;
|
7723
7825
|
|
7724
7826
|
const int nb00 = src0->nb[0];
|
7725
7827
|
const int nb01 = src0->nb[1];
|
@@ -7756,11 +7858,11 @@ static void ggml_compute_forward_conv_1d_2s_f32(
|
|
7756
7858
|
{
|
7757
7859
|
float * const wdata = (float *) params->wdata + 0;
|
7758
7860
|
|
7759
|
-
for (
|
7760
|
-
for (
|
7861
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
7862
|
+
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
7761
7863
|
const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
|
7762
7864
|
float * dst_data = wdata + i02*ew0*ne00;
|
7763
|
-
for (
|
7865
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
7764
7866
|
dst_data[i00*ew0 + i01] = src[i00];
|
7765
7867
|
}
|
7766
7868
|
}
|
@@ -7771,10 +7873,10 @@ static void ggml_compute_forward_conv_1d_2s_f32(
|
|
7771
7873
|
{
|
7772
7874
|
float * const wdata = (float *) params->wdata + ne02*ew0*ne00;
|
7773
7875
|
|
7774
|
-
for (
|
7876
|
+
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
7775
7877
|
const float * const src = (float *)((char *) src1->data + i11*nb11);
|
7776
7878
|
float * dst_data = wdata;
|
7777
|
-
for (
|
7879
|
+
for (int64_t i10 = 0; i10 < ne10; i10++) {
|
7778
7880
|
dst_data[(i10 + nh)*ew0 + i11] = src[i10];
|
7779
7881
|
}
|
7780
7882
|
}
|
@@ -7799,7 +7901,7 @@ static void ggml_compute_forward_conv_1d_2s_f32(
|
|
7799
7901
|
|
7800
7902
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
7801
7903
|
float * dst_data = (float *)((char *) dst->data + i1*nb1);
|
7802
|
-
for (
|
7904
|
+
for (int64_t i0 = 0; i0 < ne10; i0 += 2) {
|
7803
7905
|
dst_data[i0/2] = 0;
|
7804
7906
|
for (int k = -nh; k <= nh; k++) {
|
7805
7907
|
float v = 0.0f;
|
@@ -7851,25 +7953,25 @@ static void ggml_compute_forward_flash_attn_f32(
|
|
7851
7953
|
int64_t t0 = ggml_perf_time_us();
|
7852
7954
|
UNUSED(t0);
|
7853
7955
|
|
7854
|
-
const
|
7855
|
-
const
|
7856
|
-
const
|
7857
|
-
const
|
7956
|
+
const int64_t neq0 = q->ne[0];
|
7957
|
+
const int64_t neq1 = q->ne[1];
|
7958
|
+
const int64_t neq2 = q->ne[2];
|
7959
|
+
const int64_t neq3 = q->ne[3];
|
7858
7960
|
|
7859
|
-
const
|
7860
|
-
const
|
7861
|
-
//const
|
7862
|
-
//const
|
7961
|
+
const int64_t nek0 = k->ne[0];
|
7962
|
+
const int64_t nek1 = k->ne[1];
|
7963
|
+
//const int64_t nek2 = k->ne[2];
|
7964
|
+
//const int64_t nek3 = k->ne[3];
|
7863
7965
|
|
7864
|
-
//const
|
7865
|
-
const
|
7866
|
-
//const
|
7867
|
-
//const
|
7966
|
+
//const int64_t nev0 = v->ne[0];
|
7967
|
+
const int64_t nev1 = v->ne[1];
|
7968
|
+
//const int64_t nev2 = v->ne[2];
|
7969
|
+
//const int64_t nev3 = v->ne[3];
|
7868
7970
|
|
7869
|
-
const
|
7870
|
-
const
|
7871
|
-
//const
|
7872
|
-
//const
|
7971
|
+
const int64_t ne0 = dst->ne[0];
|
7972
|
+
const int64_t ne1 = dst->ne[1];
|
7973
|
+
//const int64_t ne2 = dst->ne[2];
|
7974
|
+
//const int64_t ne3 = dst->ne[3];
|
7873
7975
|
|
7874
7976
|
const int nbk0 = k->nb[0];
|
7875
7977
|
const int nbk1 = k->nb[1];
|
@@ -7894,10 +7996,10 @@ static void ggml_compute_forward_flash_attn_f32(
|
|
7894
7996
|
const int ith = params->ith;
|
7895
7997
|
const int nth = params->nth;
|
7896
7998
|
|
7897
|
-
const
|
7898
|
-
const
|
7899
|
-
const
|
7900
|
-
const
|
7999
|
+
const int64_t D = neq0;
|
8000
|
+
const int64_t N = neq1;
|
8001
|
+
const int64_t P = nek1 - N;
|
8002
|
+
const int64_t M = P + N;
|
7901
8003
|
|
7902
8004
|
const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
|
7903
8005
|
|
@@ -7959,7 +8061,7 @@ static void ggml_compute_forward_flash_attn_f32(
|
|
7959
8061
|
S[i] = -INFINITY;
|
7960
8062
|
}
|
7961
8063
|
|
7962
|
-
for (
|
8064
|
+
for (int64_t ic = 0; ic < nek1; ++ic) {
|
7963
8065
|
// k indices
|
7964
8066
|
const int ik3 = iq3;
|
7965
8067
|
const int ik2 = iq2;
|
@@ -7978,7 +8080,7 @@ static void ggml_compute_forward_flash_attn_f32(
|
|
7978
8080
|
ggml_vec_scale_f32(nek1, S, scale);
|
7979
8081
|
|
7980
8082
|
if (masked) {
|
7981
|
-
for (
|
8083
|
+
for (int64_t i = P; i < M; i++) {
|
7982
8084
|
if (i > P + iq1) {
|
7983
8085
|
S[i] = -INFINITY;
|
7984
8086
|
}
|
@@ -8036,7 +8138,7 @@ static void ggml_compute_forward_flash_attn_f32(
|
|
8036
8138
|
#endif
|
8037
8139
|
}
|
8038
8140
|
|
8039
|
-
for (
|
8141
|
+
for (int64_t ic = 0; ic < nev1; ++ic) {
|
8040
8142
|
// dst indices
|
8041
8143
|
const int i1 = iq1;
|
8042
8144
|
const int i2 = iq2;
|
@@ -8060,25 +8162,25 @@ static void ggml_compute_forward_flash_attn_f16(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    const int neq0 = q->ne[0];
-    const int neq1 = q->ne[1];
-    const int neq2 = q->ne[2];
-    const int neq3 = q->ne[3];
+    const int64_t neq0 = q->ne[0];
+    const int64_t neq1 = q->ne[1];
+    const int64_t neq2 = q->ne[2];
+    const int64_t neq3 = q->ne[3];
 
-    const int nek0 = k->ne[0];
-    const int nek1 = k->ne[1];
-    //const int nek2 = k->ne[2];
-    //const int nek3 = k->ne[3];
+    const int64_t nek0 = k->ne[0];
+    const int64_t nek1 = k->ne[1];
+    //const int64_t nek2 = k->ne[2];
+    //const int64_t nek3 = k->ne[3];
 
-    //const int nev0 = v->ne[0];
-    const int nev1 = v->ne[1];
-    //const int nev2 = v->ne[2];
-    //const int nev3 = v->ne[3];
+    //const int64_t nev0 = v->ne[0];
+    const int64_t nev1 = v->ne[1];
+    //const int64_t nev2 = v->ne[2];
+    //const int64_t nev3 = v->ne[3];
 
-    const int ne0 = dst->ne[0];
-    const int ne1 = dst->ne[1];
-    //const int ne2 = dst->ne[2];
-    //const int ne3 = dst->ne[3];
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    //const int64_t ne2 = dst->ne[2];
+    //const int64_t ne3 = dst->ne[3];
 
     const int nbk0 = k->nb[0];
     const int nbk1 = k->nb[1];
@@ -8103,10 +8205,10 @@ static void ggml_compute_forward_flash_attn_f16(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int D = neq0;
-    const int N = neq1;
-    const int P = nek1 - N;
-    const int M = P + N;
+    const int64_t D = neq0;
+    const int64_t N = neq1;
+    const int64_t P = nek1 - N;
+    const int64_t M = P + N;
 
     const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
 
@@ -8169,7 +8271,7 @@ static void ggml_compute_forward_flash_attn_f16(
         }
 
         if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) {
-            for (int ic = 0; ic < nek1; ++ic) {
+            for (int64_t ic = 0; ic < nek1; ++ic) {
                 // k indices
                 const int ik3 = iq3;
                 const int ik2 = iq2;
@@ -8184,7 +8286,7 @@ static void ggml_compute_forward_flash_attn_f16(
                         (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
             }
         } else {
-            for (int ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
+            for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
                 // k indices
                 const int ik3 = iq3;
                 const int ik2 = iq2;
@@ -8204,7 +8306,7 @@ static void ggml_compute_forward_flash_attn_f16(
         ggml_vec_scale_f32(nek1, S, scale);
 
         if (masked) {
-            for (int i = P; i < M; i++) {
+            for (int64_t i = P; i < M; i++) {
                 if (i > P + iq1) {
                     S[i] = -INFINITY;
                 }
@@ -8264,12 +8366,12 @@ static void ggml_compute_forward_flash_attn_f16(
 
         ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup);
 
-        for (int i = 0; i < M; i++) {
+        for (int64_t i = 0; i < M; i++) {
             S16[i] = GGML_FP32_TO_FP16(S[i]);
         }
 
         if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) {
-            for (int ic = 0; ic < nev1; ++ic) {
+            for (int64_t ic = 0; ic < nev1; ++ic) {
                 // dst indices
                 const int i1 = iq1;
                 const int i2 = iq2;
@@ -8281,7 +8383,7 @@ static void ggml_compute_forward_flash_attn_f16(
                     S16);
             }
         } else {
-            for (int ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
+            for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
                 // dst indices
                 const int i1 = iq1;
                 const int i2 = iq2;
@@ -8337,35 +8439,35 @@ static void ggml_compute_forward_flash_ff_f16(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    const int nea0 = a->ne[0];
-    const int nea1 = a->ne[1];
-    const int nea2 = a->ne[2];
-    const int nea3 = a->ne[3];
+    const int64_t nea0 = a->ne[0];
+    const int64_t nea1 = a->ne[1];
+    const int64_t nea2 = a->ne[2];
+    const int64_t nea3 = a->ne[3];
 
-    const int neb00 = b0->ne[0];
-    const int neb01 = b0->ne[1];
-    //const int neb02 = b0->ne[2];
-    //const int neb03 = b0->ne[3];
+    const int64_t neb00 = b0->ne[0];
+    const int64_t neb01 = b0->ne[1];
+    //const int64_t neb02 = b0->ne[2];
+    //const int64_t neb03 = b0->ne[3];
 
-    const int neb10 = b1->ne[0];
-    const int neb11 = b1->ne[1];
-    //const int neb12 = b1->ne[2];
-    //const int neb13 = b1->ne[3];
+    const int64_t neb10 = b1->ne[0];
+    const int64_t neb11 = b1->ne[1];
+    //const int64_t neb12 = b1->ne[2];
+    //const int64_t neb13 = b1->ne[3];
 
-    const int nec00 = c0->ne[0];
-    const int nec01 = c0->ne[1];
-    //const int nec02 = c0->ne[2];
-    //const int nec03 = c0->ne[3];
+    const int64_t nec00 = c0->ne[0];
+    const int64_t nec01 = c0->ne[1];
+    //const int64_t nec02 = c0->ne[2];
+    //const int64_t nec03 = c0->ne[3];
 
-    const int nec10 = c1->ne[0];
-    const int nec11 = c1->ne[1];
-    //const int nec12 = c1->ne[2];
-    //const int nec13 = c1->ne[3];
+    const int64_t nec10 = c1->ne[0];
+    const int64_t nec11 = c1->ne[1];
+    //const int64_t nec12 = c1->ne[2];
+    //const int64_t nec13 = c1->ne[3];
 
-    const int ne0 = dst->ne[0];
-    const int ne1 = dst->ne[1];
-    const int ne2 = dst->ne[2];
-    //const int ne3 = dst->ne[3];
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    //const int64_t ne3 = dst->ne[3];
 
     const int nba0 = a->nb[0];
     const int nba1 = a->nb[1];
@@ -8400,9 +8502,9 @@ static void ggml_compute_forward_flash_ff_f16(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int D = nea0;
-    //const int N = nea1;
-    const int M = neb01;
+    const int64_t D = nea0;
+    //const int64_t N = nea1;
+    const int64_t M = neb01;
 
     GGML_ASSERT(ne0 == nea0);
     GGML_ASSERT(ne1 == nea1);
@@ -8458,7 +8560,7 @@ static void ggml_compute_forward_flash_ff_f16(
 
         float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
 
-        for (int ic = 0; ic < neb01; ++ic) {
+        for (int64_t ic = 0; ic < neb01; ++ic) {
             // b0 indices
             const int ib03 = ia3;
             const int ib02 = ia2;
@@ -8478,7 +8580,7 @@ static void ggml_compute_forward_flash_ff_f16(
 
         ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
 
-        for (int i = 0; i < M; i++) {
+        for (int64_t i = 0; i < M; i++) {
             S16[i] = GGML_FP32_TO_FP16(S[i]);
         }
 
@@ -8490,7 +8592,7 @@ static void ggml_compute_forward_flash_ff_f16(
         const int i2 = ia2;
         const int i3 = ia3;
 
-        for (int ic = 0; ic < nec01; ++ic) {
+        for (int64_t ic = 0; ic < nec01; ++ic) {
 
             ggml_vec_dot_f16(neb01,
                 (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
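The dst pointer arithmetic above indexes elements through ggml-style byte strides (nb0..nb3). A generic sketch of that addressing scheme, written here for illustration rather than copied from the library:

    #include <stddef.h>
    #include <stdint.h>

    // Address of element (i0, i1, i2, i3) in a tensor whose strides are given in bytes.
    static float * elem_f32(void * data,
                            size_t nb0, size_t nb1, size_t nb2, size_t nb3,
                            int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
        return (float *) ((char *) data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
    }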
@@ -9355,7 +9457,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             case GGML_OP_ROPE:
                 {
-                    node->n_tasks = 1;
+                    node->n_tasks = n_threads;
                 } break;
             case GGML_OP_CONV_1D_1S:
             case GGML_OP_CONV_1D_2S:
@@ -9393,7 +9495,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
                     size_t cur = 0;
 
-                    const int ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
+                    const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
 
                     if (node->src1->type == GGML_TYPE_F32) {
                         cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
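ggml_up rounds its first argument up to a multiple of the second (GGML_SOFT_MAX_UNROLL here). The usual idiom for that when the step is a power of two, shown as a sketch and not necessarily ggml's exact definition:

    #include <assert.h>
    #include <stdint.h>

    static int64_t round_up_pow2(int64_t x, int64_t n) {
        assert(n > 0 && (n & (n - 1)) == 0);  // n must be a power of two
        return (x + n - 1) & ~(n - 1);
    }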
@@ -9652,7 +9754,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
 
         perf_total_per_op_us[node->op] += node->perf_time_us;
 
-        GGML_PRINT(" - %3d: [ %d, %d, %d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
+        GGML_PRINT(" - %3d: [ %" PRId64 ", %" PRId64 ", %" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
                 GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
@@ -9666,7 +9768,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_leafs; i++) {
         struct ggml_tensor * node = cgraph->leafs[i];
 
-        GGML_PRINT(" - %3d: [ %d, %d] %8s\n",
+        GGML_PRINT(" - %3d: [ %" PRId64 ", %" PRId64 "] %8s\n",
                 i,
                 node->ne[0], node->ne[1],
                 GGML_OP_LABEL[node->op]);
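The two print hunks above switch the format specifiers to PRId64 from <inttypes.h>, which expands to the correct conversion for int64_t on each platform; a minimal standalone example:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void) {
        const int64_t ne0 = 4096, ne1 = 11008;  // example tensor dimensions
        printf("[ %" PRId64 ", %" PRId64 " ]\n", ne0, ne1);
        return 0;
    }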
@@ -9737,7 +9839,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
 
         fprintf(fp, "  \"%p\" [ \
 style = filled; fillcolor = %s; shape = record; \
-label=\"%d [%d, %d] | <x>%s",
+label=\"%d [%" PRId64 ", %" PRId64 "] | <x>%s",
                 (void *) node, color,
                 i, node->ne[0], node->ne[1],
                 GGML_OP_SYMBOL[node->op]);
@@ -9762,7 +9864,7 @@ label=\"<x>%.1e\"; ]\n",
         } else {
             fprintf(fp, "  \"%p\" [ \
 style = filled; fillcolor = %s; shape = record; \
-label=\"<x>CONST %d [%d, %d]\"; ]\n",
+label=\"<x>CONST %d [%" PRId64 ", %" PRId64 "]\"; ]\n",
                 (void *) node, color,
                 i, node->ne[0], node->ne[1]);
         }
@@ -9826,9 +9928,9 @@ label=\"<x>CONST %d [%d, %d]\"; ]\n",
 static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) {
     int i = 0;
     for (int p = 0; p < np; ++p) {
-        const int ne = ggml_nelements(ps[p]) ;
+        const int64_t ne = ggml_nelements(ps[p]) ;
         // TODO: add function to set tensor from array
-        for (int j = 0; j < ne; ++j) {
+        for (int64_t j = 0; j < ne; ++j) {
             ggml_set_f32_1d(ps[p], j, x[i++]);
         }
     }
@@ -9837,9 +9939,9 @@ static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const f
 static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) {
     int i = 0;
     for (int p = 0; p < np; ++p) {
-        const int ne = ggml_nelements(ps[p]) ;
+        const int64_t ne = ggml_nelements(ps[p]) ;
         // TODO: add function to get all elements at once
-        for (int j = 0; j < ne; ++j) {
+        for (int64_t j = 0; j < ne; ++j) {
             x[i++] = ggml_get_f32_1d(ps[p], j);
         }
     }
@@ -9848,9 +9950,9 @@ static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float *
 static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) {
     int i = 0;
     for (int p = 0; p < np; ++p) {
-        const int ne = ggml_nelements(ps[p]) ;
+        const int64_t ne = ggml_nelements(ps[p]) ;
         // TODO: add function to get all elements at once
-        for (int j = 0; j < ne; ++j) {
+        for (int64_t j = 0; j < ne; ++j) {
             g[i++] = ggml_get_f32_1d(ps[p]->grad, j);
         }
     }
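The three optimizer helpers above widen their loop counters along with the element count: a 32-bit counter compared against an int64_t bound could wrap before the loop terminates on very large tensors. A reduced sketch of the pattern, using a hypothetical scale_all helper instead of the ggml accessors:

    #include <stdint.h>

    // Scale every element of a flat float buffer. ne may exceed what a 32-bit
    // int can represent, so the counter is int64_t as well.
    static void scale_all(float * x, int64_t ne, float s) {
        for (int64_t j = 0; j < ne; ++j) {
            x[j] *= s;
        }
    }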