llama_cpp 0.1.3 → 0.2.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +39 -8
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +242 -52
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +835 -82
- data/ext/llama_cpp/src/ggml.h +64 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +489 -134
- data/ext/llama_cpp/src/llama.h +43 -7
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -3,6 +3,10 @@
 
 #include "ggml.h"
 
+#ifdef GGML_USE_K_QUANTS
+#include "k_quants.h"
+#endif
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -21,6 +25,10 @@
 #include <float.h>
 #include <limits.h>
 
+#ifdef GGML_USE_METAL
+#include <unistd.h>
+#endif
+
 // if C99 - static_assert is noop
 // ref: https://stackoverflow.com/a/53923785/4039976
 #ifndef static_assert
@@ -121,7 +129,11 @@ typedef void* thread_ret_t;
 #else
 inline static void* ggml_aligned_malloc(size_t size) {
     void* aligned_memory = NULL;
+#ifdef GGML_USE_METAL
+    int result = posix_memalign(&aligned_memory, getpagesize(), size);
+#else
     int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+#endif
     if (result != 0) {
         // Handle allocation failure
         return NULL;
@@ -186,10 +198,12 @@ typedef double ggml_float;
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
+#if !defined(__riscv)
 #include <immintrin.h>
 #endif
 #endif
 #endif
+#endif
 
 #ifdef __F16C__
 
@@ -401,21 +415,27 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
 //
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
-static int64_t timer_freq;
+static int64_t timer_freq, timer_start;
 void ggml_time_init(void) {
-    LARGE_INTEGER frequency;
-    QueryPerformanceFrequency(&frequency);
-    timer_freq = frequency.QuadPart;
+    LARGE_INTEGER t;
+    QueryPerformanceFrequency(&t);
+    timer_freq = t.QuadPart;
+
+    // The multiplication by 1000 or 1000000 below can cause an overflow if timer_freq
+    // and the uptime is high enough.
+    // We subtract the program start time to reduce the likelihood of that happening.
+    QueryPerformanceCounter(&t);
+    timer_start = t.QuadPart;
 }
 int64_t ggml_time_ms(void) {
     LARGE_INTEGER t;
     QueryPerformanceCounter(&t);
-    return (t.QuadPart * 1000) / timer_freq;
+    return ((t.QuadPart-timer_start) * 1000) / timer_freq;
 }
 int64_t ggml_time_us(void) {
     LARGE_INTEGER t;
     QueryPerformanceCounter(&t);
-    return (t.QuadPart * 1000000) / timer_freq;
+    return ((t.QuadPart-timer_start) * 1000000) / timer_freq;
 }
 #else
 void ggml_time_init(void) {}
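
The Windows timer change deserves a second look: `QueryPerformanceCounter` values grow with system uptime, so the naive `t.QuadPart * 1000000` can exceed `INT64_MAX` once the counter has run long enough (with a 10 MHz counter, after roughly 10.7 days of uptime). Rebasing on the program start time keeps the multiplicand small. A minimal standalone sketch of the arithmetic (not part of ggml; the 10 MHz rate is an assumption for illustration):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int64_t freq  = 10000000;                // assumed 10 MHz counter rate
        const int64_t days  = 11;
        const int64_t ticks = freq * 86400 * days;     // counter value after ~11 days of uptime
        // naive scaling would overflow: ticks * 1000000 > INT64_MAX
        printf("ticks*1e6 overflows: %d\n", ticks > INT64_MAX / 1000000);
        // rebased scaling: the delta since program start stays small
        const int64_t start = ticks - freq;            // pretend the program started 1 s ago
        printf("rebased us: %lld\n", (long long)(((ticks - start) * 1000000) / freq));
        return 0;
    }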
@@ -472,6 +492,8 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 // quantization
 //
 
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 // multiply int8_t, add results pairwise twice
 static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
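
A plausible reading of the new `MM256_SET_M128I` macro (used throughout the hunks below) is compiler portability: it reproduces the effect of `_mm256_set_m128i(hi, lo)` using the `_mm256_castsi128_si256`/`_mm256_insertf128_si256` pair, which more toolchains ship. A small self-check sketch (assumes an AVX2-capable compiler, e.g. `gcc -mavx2`):

    #include <immintrin.h>
    #include <stdio.h>
    #include <string.h>

    #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

    int main(void) {
        const __m128i lo = _mm_set1_epi8(1);
        const __m128i hi = _mm_set1_epi8(2);
        const __m256i v  = MM256_SET_M128I(hi, lo); // lanes end up as [lo | hi]
        unsigned char out[32];
        memcpy(out, &v, 32);
        printf("%d %d\n", out[0], out[16]);         // expected: 1 2
        return 0;
    }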
@@ -531,7 +553,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
 static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
 {
     const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
-    const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp);
+    const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
     const __m256i lowMask = _mm256_set1_epi8( 0xF );
     return _mm256_and_si256(lowMask, bytes);
 }
@@ -604,7 +626,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
     bytesh = _mm_or_si128(bytesh, bit_mask);
     bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
     bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
-    return _mm256_set_m128i(bytesh, bytesl);
+    return MM256_SET_M128I(bytesh, bytesl);
 }
 
 // Unpack 32 4-bit fields into 32 bytes
@@ -617,7 +639,7 @@ static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
     const __m128i lowMask = _mm_set1_epi8(0xF);
     tmpl = _mm_and_si128(lowMask, tmpl);
     tmph = _mm_and_si128(lowMask, tmph);
-    return _mm256_set_m128i(tmph, tmpl);
+    return MM256_SET_M128I(tmph, tmpl);
 }
 
 // add int16_t pairwise and return as float vector
@@ -625,7 +647,7 @@ static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
     const __m128i ones = _mm_set1_epi16(1);
     const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
     const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
-    const __m256i summed_pairs = _mm256_set_m128i(summed_pairsh, summed_pairsl);
+    const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
     return _mm256_cvtepi32_ps(summed_pairs);
 }
 
@@ -1563,6 +1585,48 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
         .vec_dot_q                = NULL,   // TODO
         .vec_dot_type             = GGML_TYPE_Q8_1,
     },
+#ifdef GGML_USE_K_QUANTS
+    [GGML_TYPE_Q2_K] = {
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q2_K,
+        .quantize_row_q           = quantize_row_q2_K,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference,
+        .quantize_row_q_dot       = quantize_row_q8_K,
+        .vec_dot_q                = ggml_vec_dot_q2_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
+    [GGML_TYPE_Q3_K] = {
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q3_K,
+        .quantize_row_q           = quantize_row_q3_K,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference,
+        .quantize_row_q_dot       = quantize_row_q8_K,
+        .vec_dot_q                = ggml_vec_dot_q3_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
+    [GGML_TYPE_Q4_K] = {
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q4_K,
+        .quantize_row_q           = quantize_row_q4_K,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference,
+        .quantize_row_q_dot       = quantize_row_q8_K,
+        .vec_dot_q                = ggml_vec_dot_q4_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
+    [GGML_TYPE_Q5_K] = {
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q5_K,
+        .quantize_row_q           = quantize_row_q5_K,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference,
+        .quantize_row_q_dot       = quantize_row_q8_K,
+        .vec_dot_q                = ggml_vec_dot_q5_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
+    [GGML_TYPE_Q6_K] = {
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q6_K,
+        .quantize_row_q           = quantize_row_q6_K,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference,
+        .quantize_row_q_dot       = quantize_row_q8_K,
+        .vec_dot_q                = ggml_vec_dot_q6_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
+#endif
 };
 
 // For internal test use
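
The k-quant types slot into the existing `quantize_fns` dispatch table: each entry pairs the row (de)quantizers with a fused dot-product kernel, and all five weight types share `GGML_TYPE_Q8_K` as the quantization applied to the activation operand. The pattern in miniature, with a trivial f32 kernel standing in for the real ones (names here are illustrative, not ggml's):

    #include <stdio.h>

    typedef void (*vec_dot_fn)(int n, float *s, const void *x, const void *y);

    static void dot_f32(int n, float *s, const void *x, const void *y) {
        const float *a = x, *b = y;
        float sum = 0.0f;
        for (int i = 0; i < n; i++) sum += a[i] * b[i];
        *s = sum;
    }

    struct type_traits { vec_dot_fn vec_dot_q; };

    int main(void) {
        struct type_traits traits[1] = { { .vec_dot_q = dot_f32 } };
        float a[4] = {1, 2, 3, 4}, b[4] = {1, 1, 1, 1}, s;
        traits[0].vec_dot_q(4, &s, a, b); // one indirect call per row pair
        printf("%g\n", s);                // 10
        return 0;
    }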
@@ -2288,7 +2352,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
 
         // Convert int32_t to float
-        __m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1));
+        __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
 
         // Apply the scale, and accumulate
         acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
@@ -2764,7 +2828,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         __m128i bxh = _mm256_extractf128_si256(bx, 1);
         bxl = _mm_or_si128(bxl, bxhil);
         bxh = _mm_or_si128(bxh, bxhih);
-        bx = _mm256_set_m128i(bxh, bxl);
+        bx = MM256_SET_M128I(bxh, bxl);
 
         const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
@@ -3020,7 +3084,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         __m128i bxh = _mm256_extractf128_si256(bx, 1);
         bxl = _mm_or_si128(bxl, bxhil);
         bxh = _mm_or_si128(bxh, bxhih);
-        bx = _mm256_set_m128i(bxh, bxl);
+        bx = MM256_SET_M128I(bxh, bxl);
 
         const __m256 dy = _mm256_set1_ps(y[i].d);
         const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
@@ -3442,11 +3506,19 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q5_1] = QK5_1,
     [GGML_TYPE_Q8_0] = QK8_0,
     [GGML_TYPE_Q8_1] = QK8_1,
+#ifdef GGML_USE_K_QUANTS
+    [GGML_TYPE_Q2_K] = QK_K,
+    [GGML_TYPE_Q3_K] = QK_K,
+    [GGML_TYPE_Q4_K] = QK_K,
+    [GGML_TYPE_Q5_K] = QK_K,
+    [GGML_TYPE_Q6_K] = QK_K,
+    [GGML_TYPE_Q8_K] = QK_K,
+#endif
     [GGML_TYPE_I8]   = 1,
     [GGML_TYPE_I16]  = 1,
     [GGML_TYPE_I32]  = 1,
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_BLCK_SIZE is outdated");
+static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
 
 static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32] = sizeof(float),
@@ -3457,11 +3529,19 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
     [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
     [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
+#ifdef GGML_USE_K_QUANTS
+    [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
+    [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
+    [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
+    [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
+    [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
+    [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
+#endif
     [GGML_TYPE_I8]  = sizeof(int8_t),
     [GGML_TYPE_I16] = sizeof(int16_t),
     [GGML_TYPE_I32] = sizeof(int32_t),
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_SIZE is outdated");
+static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
 
 
 static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
@@ -3473,11 +3553,17 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q5_1] = "q5_1",
     [GGML_TYPE_Q8_0] = "q8_0",
     [GGML_TYPE_Q8_1] = "q8_1",
+    [GGML_TYPE_Q2_K] = "q2_K",
+    [GGML_TYPE_Q3_K] = "q3_K",
+    [GGML_TYPE_Q4_K] = "q4_K",
+    [GGML_TYPE_Q5_K] = "q5_K",
+    [GGML_TYPE_Q6_K] = "q6_K",
+    [GGML_TYPE_Q8_K] = "q8_K",
     [GGML_TYPE_I8]  = "i8",
     [GGML_TYPE_I16] = "i16",
     [GGML_TYPE_I32] = "i32",
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_NAME is outdated");
+static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
 
 static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32] = false,
@@ -3488,13 +3574,19 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q5_1] = true,
     [GGML_TYPE_Q8_0] = true,
     [GGML_TYPE_Q8_1] = true,
+    [GGML_TYPE_Q2_K] = true,
+    [GGML_TYPE_Q3_K] = true,
+    [GGML_TYPE_Q4_K] = true,
+    [GGML_TYPE_Q5_K] = true,
+    [GGML_TYPE_Q6_K] = true,
+    [GGML_TYPE_Q8_K] = true,
     [GGML_TYPE_I8]  = false,
     [GGML_TYPE_I16] = false,
     [GGML_TYPE_I32] = false,
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
+static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
 
-static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
+static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "NONE",
 
     "DUP",
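
Each per-type table bumps its `static_assert` to 19 in step with `GGML_TYPE_COUNT`, so forgetting to extend any one table becomes a compile error rather than a silently zeroed entry. The guard pattern in miniature (C11; names below are illustrative):

    #include <assert.h>

    enum demo_type { T_A, T_B, T_COUNT };

    static const int BLCK_SIZE[T_COUNT] = { [T_A] = 32, [T_B] = 256 };

    // adding a member to demo_type without extending BLCK_SIZE fails here
    static_assert(T_COUNT == 2, "BLCK_SIZE is outdated");

    int main(void) { return BLCK_SIZE[T_A] == 32 ? 0 : 1; }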
@@ -3629,6 +3721,7 @@ struct ggml_context {
     void * mem_buffer;
     bool   mem_buffer_owned;
     bool   no_alloc;
+    bool   no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
 
     int    n_objects;
 
@@ -3645,26 +3738,6 @@ struct ggml_context_container {
     struct ggml_context context;
 };
 
-//
-// compute types
-//
-
-enum ggml_task_type {
-    GGML_TASK_INIT = 0,
-    GGML_TASK_COMPUTE,
-    GGML_TASK_FINALIZE,
-};
-
-struct ggml_compute_params {
-    enum ggml_task_type type;
-
-    int ith, nth;
-
-    // work buffer for all threads
-    size_t wsize;
-    void * wdata;
-};
-
 //
 // ggml state
 //
@@ -3721,7 +3794,7 @@ int64_t ggml_nelements(const struct ggml_tensor * tensor) {
     return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
 }
 
-int ggml_nrows(const struct ggml_tensor * tensor) {
+int64_t ggml_nrows(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
@@ -3730,7 +3803,20 @@ int ggml_nrows(const struct ggml_tensor * tensor) {
 size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
-    return (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+    // this should handle cases where the tensor is not contiguous in memory
+    // probaby just:
+    //
+    //     return tensor->ne[3]*tensor->nb[3]
+    //
+    // is enough, but just in case, adding the second part
+
+    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+}
+
+size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
 }
 
 int ggml_blck_size(enum ggml_type type) {
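
The reworked `ggml_nbytes` takes the larger of two estimates: the stride-based size `ne[3]*nb[3]`, which is authoritative for non-contiguous tensors, and the element-count size, kept as a safety net. For a contiguous tensor the two coincide, e.g. in this standalone sketch with hand-computed strides:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    int main(void) {
        // contiguous f32 tensor of shape {8, 4, 1, 1}; byte strides per dim
        int64_t ne[4] = {8, 4, 1, 1};
        size_t  nb[4] = {4, 32, 128, 128};
        size_t  by_stride = ne[3] * nb[3];          // 128
        size_t  by_count  = 8 * 4 * sizeof(float);  // 128 (block size 1 for f32)
        printf("%zu\n", MAX(by_stride, by_count));  // both operands agree here
        return 0;
    }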
@@ -3749,6 +3835,9 @@ const char * ggml_type_name(enum ggml_type type) {
     return GGML_TYPE_NAME[type];
 }
 
+const char * ggml_op_name(enum ggml_op op) {
+    return GGML_OP_NAME[op];
+}
 
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return GGML_TYPE_SIZE[tensor->type];
@@ -3796,6 +3885,11 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_Q5_0:          wtype = GGML_TYPE_Q5_0;  break;
         case GGML_FTYPE_MOSTLY_Q5_1:          wtype = GGML_TYPE_Q5_1;  break;
         case GGML_FTYPE_MOSTLY_Q8_0:          wtype = GGML_TYPE_Q8_0;  break;
+        case GGML_FTYPE_MOSTLY_Q2_K:          wtype = GGML_TYPE_Q2_K;  break;
+        case GGML_FTYPE_MOSTLY_Q3_K:          wtype = GGML_TYPE_Q3_K;  break;
+        case GGML_FTYPE_MOSTLY_Q4_K:          wtype = GGML_TYPE_Q4_K;  break;
+        case GGML_FTYPE_MOSTLY_Q5_K:          wtype = GGML_TYPE_Q5_K;  break;
+        case GGML_FTYPE_MOSTLY_Q6_K:          wtype = GGML_TYPE_Q6_K;  break;
        case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT; break;
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
     }
@@ -3805,11 +3899,15 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     return wtype;
 }
 
-static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+size_t ggml_tensor_overhead(void) {
+    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+}
+
+bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
 
-static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
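
`ggml_tensor_overhead()` exposes the per-tensor bookkeeping cost (object header plus tensor struct, with what appears to be 16 bytes of alignment slack), which makes sizing a context's memory pool straightforward. A hedged usage sketch, assuming the gem's bundled `ggml.h` under `ext/llama_cpp/src` is on the include path:

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        // pool sized for one 64-element f32 tensor plus its bookkeeping
        const size_t mem_size = ggml_tensor_overhead() + 64*sizeof(float);
        struct ggml_init_params params = { .mem_size = mem_size, .mem_buffer = NULL, .no_alloc = false };

        struct ggml_context * ctx = ggml_init(params);
        struct ggml_tensor  * a   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);

        printf("used %zu of %zu bytes\n", ggml_used_mem(ctx), mem_size);
        ggml_free(ctx);
        return a == NULL;
    }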
@@ -3958,6 +4056,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
         /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
         /*.no_alloc         =*/ params.no_alloc,
+        /*.no_alloc_save    =*/ params.no_alloc,
         /*.n_objects        =*/ 0,
         /*.objects_begin    =*/ NULL,
         /*.objects_end      =*/ NULL,
@@ -4017,17 +4116,36 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }
 
+void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
+    ctx->no_alloc = no_alloc;
+}
+
+void * ggml_get_mem_buffer(struct ggml_context * ctx) {
+    return ctx->mem_buffer;
+}
+
+size_t ggml_get_mem_size(struct ggml_context * ctx) {
+    return ctx->mem_size;
+}
+
 // IMPORTANT:
 // when creating "opt" tensors, always save and load the scratch buffer
 // this is an error prone process, but it is necessary to support inplace
 // operators when using scratch buffers
 // TODO: implement a better way
 void ggml_scratch_save(struct ggml_context * ctx) {
+    // this is needed to allow opt tensors to store their data
+    // TODO: again, need to find a better way
+    ctx->no_alloc_save = ctx->no_alloc;
+    ctx->no_alloc      = false;
+
     ctx->scratch_save = ctx->scratch;
     ctx->scratch.data = NULL;
 }
 
 void ggml_scratch_load(struct ggml_context * ctx) {
+    ctx->no_alloc = ctx->no_alloc_save;
+
     ctx->scratch = ctx->scratch_save;
 }
 
@@ -4061,7 +4179,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
     struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
 
     if (ctx->scratch.data == NULL || data != NULL) {
-        size_needed += sizeof(struct ggml_tensor);
+        size_needed += GGML_TENSOR_SIZE;
 
         if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
             GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
@@ -4077,14 +4195,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
         };
     } else {
         if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
-            GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
+            GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+                    __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
             assert(false);
             return NULL;
         }
 
-        if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
+        if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
             GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                    __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
+                    __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
             assert(false);
             return NULL;
         }
@@ -4093,7 +4212,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
 
     *obj_new = (struct ggml_object) {
         .offs = cur_end + GGML_OBJECT_SIZE,
-        .size = sizeof(struct ggml_tensor),
+        .size = GGML_TENSOR_SIZE,
         .next = NULL,
     };
 
@@ -4135,6 +4254,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_time_us =*/ 0,
         /*.data         =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
         /*.name         =*/ { 0 },
+        /*.extra        =*/ NULL,
         /*.pad          =*/ { 0 },
     };
 
@@ -4509,6 +4629,23 @@ struct ggml_tensor * ggml_view_tensor(
     return result;
 }
 
+struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
+    struct ggml_object * obj = ctx->objects_begin;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
+        if (strcmp(cur->name, name) == 0) {
+            return cur;
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 // ggml_dup
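
`ggml_get_tensor` walks the context's object list comparing names, so it pairs naturally with `ggml_set_name`. A hedged usage sketch (assumes the bundled `ggml.h`):

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = { .mem_size = 16*1024, .mem_buffer = NULL, .no_alloc = false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_set_name(w, "w");

        printf("found: %d\n", ggml_get_tensor(ctx, "w") == w); // 1
        ggml_free(ctx);
        return 0;
    }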
@@ -5763,10 +5900,18 @@ struct ggml_tensor * ggml_view_1d(
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
 
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+    ggml_scratch_load(ctx);
+
     result->op   = GGML_OP_VIEW;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
+    result->opt[0] = offs;
 
     if (is_node) {
         memcpy(result->padding, &offset, sizeof(offset));
@@ -5795,6 +5940,13 @@ struct ggml_tensor * ggml_view_2d(
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
 
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+    ggml_scratch_load(ctx);
+
     result->nb[1] = nb1;
     result->nb[2] = result->nb[1]*ne1;
     result->nb[3] = result->nb[2];
@@ -5803,6 +5955,7 @@ struct ggml_tensor * ggml_view_2d(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
+    result->opt[0] = offs;
 
     if (is_node) {
         memcpy(result->padding, &offset, sizeof(offset));
@@ -5833,6 +5986,13 @@ struct ggml_tensor * ggml_view_3d(
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
 
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+    ggml_scratch_load(ctx);
+
     result->nb[1] = nb1;
     result->nb[2] = nb2;
     result->nb[3] = result->nb[2]*ne2;
@@ -5841,6 +6001,7 @@ struct ggml_tensor * ggml_view_3d(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
+    result->opt[0] = offs;
 
     if (is_node) {
         memcpy(result->padding, &offset, sizeof(offset));
@@ -5873,6 +6034,13 @@ struct ggml_tensor * ggml_view_4d(
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
 
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+    ggml_scratch_load(ctx);
+
     result->nb[1] = nb1;
     result->nb[2] = nb2;
     result->nb[3] = nb3;
@@ -5881,6 +6049,7 @@ struct ggml_tensor * ggml_view_4d(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
+    result->opt[0] = offs;
 
     if (is_node) {
         memcpy(result->padding, &offset, sizeof(offset));
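
All four `ggml_view_*` constructors now stash the view's byte offset in a two-element I32 tensor hung off `result->opt[0]`; `ggml_graph_import` later reassembles it (see the `GGML_OP_VIEW` case further down in this diff) to relocate the view inside the imported buffer. The round trip in miniature (standalone sketch; assumes a 64-bit little-endian host, which is what the two-int32 packing implies):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        size_t  offset = 0x12345678;             // what ggml_view_* memcpy'd in
        int32_t stored[2];
        memcpy(stored, &offset, 2*sizeof(int32_t));

        uint64_t offs;                           // what ggml_graph_import reads back
        memcpy(&offs, stored, sizeof(offs));
        printf("0x%llx\n", (unsigned long long) offs);
        return 0;
    }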
@@ -6303,7 +6472,7 @@ struct ggml_tensor * ggml_alibi(
 
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_head;
@@ -7545,6 +7714,11 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
             {
                 ggml_compute_forward_add_q_f32(params, src0, src1, dst);
             } break;
@@ -7848,6 +8022,11 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
             {
                 ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
             } break;
@@ -7970,6 +8149,11 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
         default:
             {
                 GGML_ASSERT(false);
@@ -8088,10 +8272,10 @@ static void ggml_compute_forward_mul_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-#ifdef GGML_USE_CUBLAS
-    if (src1->backend == GGML_BACKEND_CUDA) {
+#ifdef GGML_USE_CLBLAST
+    if (src1->backend == GGML_BACKEND_GPU) {
         if (ith == 0) {
-            ggml_cuda_mul(src0, src1, dst);
+            ggml_cl_mul(src0, src1, dst);
         }
         return;
     }
@@ -9206,7 +9390,7 @@ static void ggml_compute_forward_rms_norm_f32(
                     sum += (ggml_float)(x[i00] * x[i00]);
                 }
 
-                float mean = sum/ne00;
+                const float mean = sum/ne00;
 
                 float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
 
@@ -9529,14 +9713,7 @@ static void ggml_compute_forward_mul_mat_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS)
-    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9701,14 +9878,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS)
-    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9913,14 +10083,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS)
-    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -10063,6 +10226,11 @@ static void ggml_compute_forward_mul_mat(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
             {
                 ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
             } break;
@@ -10246,6 +10414,11 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
         default:
             {
                 GGML_ASSERT(false);
@@ -10411,6 +10584,11 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
             {
                 ggml_compute_forward_get_rows_q(params, src0, src1, dst);
             } break;
@@ -10957,6 +11135,12 @@ static void ggml_compute_forward_alibi(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_Q8_K:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
@@ -11028,6 +11212,12 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_Q8_K:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
@@ -11117,7 +11307,7 @@ static void ggml_compute_forward_rope_f32(
                         theta *= theta_scale;
 
                         const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              float * dst_data  = (float *)((char *)  dst->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                              float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
 
                         const float x0 = src[0];
                         const float x1 = src[1];
@@ -11138,7 +11328,7 @@ static void ggml_compute_forward_rope_f32(
                         const int64_t i0 = ib*n_dims + ic/2;
 
                         const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              float * dst_data  = (float *)((char *)  dst->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                              float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
 
                         const float x0 = src[0];
                         const float x1 = src[n_dims/2];
@@ -12846,6 +13036,15 @@ static void ggml_compute_forward_map_binary(
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(params);
 
+#ifdef GGML_USE_CUBLAS
+    bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
+    if (skip_cpu) {
+        return;
+    }
+    GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
+    GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
+#endif // GGML_USE_CUBLAS
+
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
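
This is the structural change behind the CUDA branches deleted from the individual kernels above: `ggml_cuda_compute_forward` (declared in `ggml-cuda.h` in this release) now gets first refusal on every op, and the CPU switch only runs when the backend declines. The dispatch shape in miniature (standalone sketch; the claim function is a stand-in, not the real CUDA entry point):

    #include <stdbool.h>
    #include <stdio.h>

    /* stand-in for ggml_cuda_compute_forward(): claims only op 1 */
    static bool backend_compute_forward(int op) { return op == 1; }

    static void compute_forward(int op) {
        if (backend_compute_forward(op)) {
            printf("op %d: offloaded\n", op);
            return;                       // CPU path skipped entirely
        }
        printf("op %d: cpu\n", op);
    }

    int main(void) {
        compute_forward(0); // falls through to the CPU switch
        compute_forward(1); // handled by the backend, as in the hunk above
        return 0;
    }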
@@ -13792,11 +13991,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         // reached a leaf node, not part of the gradient graph (e.g. a constant)
         GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
 
+        if (strlen(node->name) == 0) {
+            snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
+        }
+
         cgraph->leafs[cgraph->n_leafs] = node;
         cgraph->n_leafs++;
     } else {
         GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
 
+        if (strlen(node->name) == 0) {
+            snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
+        }
+
         cgraph->nodes[cgraph->n_nodes] = node;
         cgraph->grads[cgraph->n_nodes] = node->grad;
         cgraph->n_nodes++;
@@ -14144,7 +14351,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
                             node->n_tasks = 1; // TODO: this actually is doing nothing
                                                //       the threads are still spinning
-                            cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
                         }
                         else
 #elif defined(GGML_USE_CLBLAST)
@@ -14510,6 +14716,521 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
     }
 }
 
+struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
+    for (int i = 0; i < cgraph->n_leafs; i++) {
+        struct ggml_tensor * leaf = cgraph->leafs[i];
+
+        if (strcmp(leaf->name, name) == 0) {
+            return leaf;
+        }
+    }
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        if (strcmp(node->name, name) == 0) {
+            return node;
+        }
+    }
+
+    return NULL;
+}
+
+static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) {
+    const int64_t * ne = tensor->ne;
+    const size_t  * nb = tensor->nb;
+
+    fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
+            ggml_type_name(tensor->type),
+            ggml_op_name  (tensor->op),
+            tensor->n_dims,
+            ne[0], ne[1], ne[2], ne[3],
+            nb[0], nb[1], nb[2], nb[3],
+            tensor->data,
+            tensor->name);
+}
+
+static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
+    const int64_t * ne = tensor->ne;
+    const size_t  * nb = tensor->nb;
+
+    fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n",
+            arg,
+            ggml_type_name(tensor->type),
+            ggml_op_name  (tensor->op),
+            tensor->n_dims,
+            ne[0], ne[1], ne[2], ne[3],
+            nb[0], nb[1], nb[2], nb[3],
+            tensor->n_tasks,
+            tensor->data,
+            tensor->name);
+}
+
+void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
+    //assert(cgraph->work      == NULL);
+    //assert(cgraph->work_size == 0);
+
+    uint64_t size_eval = 0;
+
+    // compute size of intermediate results
+    // TODO: does not take into account scratch buffers !!!!
+    for (int i = 0; i < cgraph->n_nodes; ++i) {
+        size_eval += ggml_nbytes(cgraph->nodes[i]);
+    }
+
+    // print
+    {
+        FILE * fout = stdout;
+
+        fprintf(fout, "\n");
+        fprintf(fout, "%-16s %8x\n",         "magic",   GGML_FILE_MAGIC);
+        fprintf(fout, "%-16s %8d\n",         "version", GGML_FILE_VERSION);
+        fprintf(fout, "%-16s %8d\n",         "leafs",   cgraph->n_leafs);
+        fprintf(fout, "%-16s %8d\n",         "nodes",   cgraph->n_nodes);
+        fprintf(fout, "%-16s %" PRIu64 "\n", "eval",    size_eval);
+
+        // header
+        fprintf(fout, "\n");
+        fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n",
+                "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME");
+
+        for (int i = 0; i < cgraph->n_leafs; ++i) {
+            ggml_graph_export_leaf(cgraph->leafs[i], fout);
+
+            GGML_ASSERT(cgraph->leafs[i]->op   == GGML_OP_NONE);
+            GGML_ASSERT(cgraph->leafs[i]->src0 == NULL);
+            GGML_ASSERT(cgraph->leafs[i]->src1 == NULL);
+        }
+
+        // header
+        fprintf(fout, "\n");
+        fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n",
+                "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME");
+
+        for (int i = 0; i < cgraph->n_nodes; ++i) {
+            ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
+
+            if (cgraph->nodes[i]->src0) {
+                ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout);
+            }
+
+            if (cgraph->nodes[i]->src1) {
+                ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout);
+            }
+
+            for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                if (cgraph->nodes[i]->opt[j]) {
+                    ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout);
+                }
+            }
+
+            fprintf(fout, "\n");
+        }
+
+        fprintf(fout, "\n");
+    }
+
+    // write binary data
+    {
+        FILE * fout = fopen(fname, "wb");
+
+        if (!fout) {
+            fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+            return;
+        }
+
+        // header
+        {
+            const uint32_t magic   = GGML_FILE_MAGIC;
+            const uint32_t version = GGML_FILE_VERSION;
+            const uint32_t n_leafs = cgraph->n_leafs;
+            const uint32_t nodes   = cgraph->n_nodes;
+
+            fwrite(&magic,     sizeof(uint32_t), 1, fout);
+            fwrite(&version,   sizeof(uint32_t), 1, fout);
+            fwrite(&n_leafs,   sizeof(uint32_t), 1, fout);
+            fwrite(&nodes,     sizeof(uint32_t), 1, fout);
+            fwrite(&size_eval, sizeof(uint64_t), 1, fout);
+        }
+
+        // leafs
+        {
+            for (int i = 0; i < cgraph->n_leafs; ++i) {
+                const struct ggml_tensor * tensor = cgraph->leafs[i];
+
+                const uint32_t type   = tensor->type;
+                const uint32_t op     = tensor->op;
+                const uint32_t n_dims = tensor->n_dims;
+
+                fwrite(&type,   sizeof(uint32_t), 1, fout);
+                fwrite(&op,     sizeof(uint32_t), 1, fout);
+                fwrite(&n_dims, sizeof(uint32_t), 1, fout);
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    const uint64_t ne = tensor->ne[j];
+                    const uint64_t nb = tensor->nb[j];
+
+                    fwrite(&ne, sizeof(uint64_t), 1, fout);
+                    fwrite(&nb, sizeof(uint64_t), 1, fout);
+                }
+
+                // store the pointer address
+                {
+                    const uint64_t ptr = (uint64_t) tensor->data;
+
+                    fwrite(&ptr, sizeof(uint64_t), 1, fout);
+                }
+
+                fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+
+                // dump the data
+                // TODO: pad this to 32 byte boundary
+                {
+                    const size_t size = ggml_nbytes(tensor);
+
+                    fwrite(tensor->data, sizeof(char), size, fout);
+                }
+            }
+        }
+
+        // nodes
+        {
+            for (int i = 0; i < cgraph->n_nodes; ++i) {
+                const struct ggml_tensor * tensor = cgraph->nodes[i];
+
+                const uint32_t type   = tensor->type;
+                const uint32_t op     = tensor->op;
+                const uint32_t n_dims = tensor->n_dims;
+
+                fwrite(&type,   sizeof(uint32_t), 1, fout);
+                fwrite(&op,     sizeof(uint32_t), 1, fout);
+                fwrite(&n_dims, sizeof(uint32_t), 1, fout);
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    const uint64_t ne = tensor->ne[j];
+                    const uint64_t nb = tensor->nb[j];
+
+                    fwrite(&ne, sizeof(uint64_t), 1, fout);
+                    fwrite(&nb, sizeof(uint64_t), 1, fout);
+                }
+
+                // store the pointer address
+                {
+                    const uint64_t ptr = (uint64_t) tensor->data;
+
+                    fwrite(&ptr, sizeof(uint64_t), 1, fout);
+                }
+
+                fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+
+                // output the op arguments
+                {
+                    struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
+
+                    args[0] = tensor->src0;
+                    args[1] = tensor->src1;
+
+                    for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                        args[2 + j] = tensor->opt[j];
+                    }
+
+                    for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+                        if (args[j]) {
+                            int32_t idx = -1;
+
+                            // check if leaf
+                            {
+                                for (int k = 0; k < cgraph->n_leafs; ++k) {
+                                    if (args[j] == cgraph->leafs[k]) {
+                                        idx = k;
+                                        break;
+                                    }
+                                }
+                            }
+
+                            // check if node
+                            if (idx == -1) {
+                                for (int k = 0; k < cgraph->n_nodes; ++k) {
+                                    if (args[j] == cgraph->nodes[k]) {
+                                        idx = GGML_MAX_NODES + k;
+                                        break;
+                                    }
+                                }
+                            }
+
+                            if (idx == -1) {
+                                fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
+                                return;
+                            }
+
+                            fwrite(&idx, sizeof(int32_t), 1, fout);
+                        } else {
+                            const int32_t nul = -1;
+
+                            fwrite(&nul, sizeof(int32_t), 1, fout);
+                        }
+                    }
+                }
+            }
+        }
+
+        fclose(fout);
+    }
+}
+
+struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
+    assert(*ctx_data == NULL);
+    assert(*ctx_eval == NULL);
+
+    struct ggml_cgraph result = { 0 };
+
+    struct ggml_tensor * data = NULL;
+
+    // read file into data
+    {
+        FILE * fin = fopen(fname, "rb");
+        if (!fin) {
+            fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+            return result;
+        }
+
+        size_t fsize = 0;
+
+        fseek(fin, 0, SEEK_END);
+        fsize = ftell(fin);
+        fseek(fin, 0, SEEK_SET);
+
+        // create the data context
+        {
+            const size_t overhead = 1*ggml_tensor_overhead();
+
+            struct ggml_init_params params = {
+                .mem_size   = fsize + overhead,
+                .mem_buffer = NULL,
+                .no_alloc   = false,
+            };
+
+            *ctx_data = ggml_init(params);
+
+            if (!*ctx_data) {
+                fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+                return result;
+            }
+        }
+
+        data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
+
+        const size_t ret = fread(data->data, sizeof(char), fsize, fin);
+        if (ret != fsize) {
+            fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
+            return result;
+        }
+
+        fclose(fin);
+    }
+
+    // populate result
+    {
+        char * ptr = (char *) data->data;
+
+        const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic);
+
+        if (magic != GGML_FILE_MAGIC) {
+            fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic);
+            return result;
+        }
+
+        const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version);
+
+        if (version != GGML_FILE_VERSION) {
+            fprintf(stderr, "%s: invalid version number\n", __func__);
+            return result;
+        }
+
+        const uint32_t n_leafs   = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
+        const uint32_t n_nodes   = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
+        const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
+
+        result.n_leafs = n_leafs;
+        result.n_nodes = n_nodes;
+
+        // create the data context
+        {
+            const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
+
+            struct ggml_init_params params = {
+                .mem_size   = size_eval + overhead,
+                .mem_buffer = NULL,
+                .no_alloc   = true,
+            };
+
+            *ctx_eval = ggml_init(params);
+
+            if (!*ctx_eval) {
+                fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+                return result;
+            }
+        }
+
+        // leafs
+        {
+            uint32_t type;
+            uint32_t op;
+            uint32_t n_dims;
+
+            for (uint32_t i = 0; i < n_leafs; ++i) {
+                type   = *(const uint32_t *) ptr; ptr += sizeof(type);
+                op     = *(const uint32_t *) ptr; ptr += sizeof(op);
+                n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+
+                int64_t ne[GGML_MAX_DIMS];
+                size_t  nb[GGML_MAX_DIMS];
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    uint64_t ne_cur;
+                    uint64_t nb_cur;
+
+                    ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
+                    nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
+
+                    ne[j] = ne_cur;
+                    nb[j] = nb_cur;
+                }
+
+                struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+                tensor->op = (enum ggml_op) op;
+
+                uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+
+                memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+
+                tensor->data = (void *) ptr;
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    tensor->nb[j] = nb[j];
+                }
+
+                result.leafs[i] = tensor;
+
+                ptr += ggml_nbytes(tensor);
+
+                fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+            }
+        }
+
+        ggml_set_no_alloc(*ctx_eval, false);
+
+        // nodes
+        {
+            uint32_t type;
+            uint32_t op;
+            uint32_t n_dims;
+
+            for (uint32_t i = 0; i < n_nodes; ++i) {
+                type   = *(const uint32_t *) ptr; ptr += sizeof(type);
+                op     = *(const uint32_t *) ptr; ptr += sizeof(op);
+                n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+
+                enum ggml_op eop = (enum ggml_op) op;
+
+                int64_t ne[GGML_MAX_DIMS];
+                size_t  nb[GGML_MAX_DIMS];
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    uint64_t ne_cur;
+                    uint64_t nb_cur;
+
+                    ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
+                    nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
+
+                    ne[j] = ne_cur;
+                    nb[j] = nb_cur;
+                }
+
+                uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used
+
+                const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
+
+                const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t);
+
+                struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
+
+                // parse args
+                for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+                    const int32_t arg_idx = ptr_arg_idx[j];
+
+                    if (arg_idx == -1) {
+                        continue;
+                    }
+
+                    if (arg_idx < GGML_MAX_NODES) {
+                        args[j] = result.leafs[arg_idx];
+                    } else {
+                        args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
+                    }
+                }
+
+                // create the tensor
+                // "view" operations are handled differently
+                // TODO: handle inplace ops - currently a copy is always made
+
+                struct ggml_tensor * tensor = NULL;
+
+                switch (eop) {
+                    // TODO: implement other view ops
+                    case GGML_OP_RESHAPE:
+                        {
+                            tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]);
+                        } break;
+                    case GGML_OP_VIEW:
+                        {
+                            tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
+
+                            uint64_t offs;
+                            memcpy(&offs, args[2]->data, sizeof(offs));
+
+                            tensor->data = ((char *) tensor->data) + offs;
+                        } break;
+                    case GGML_OP_TRANSPOSE:
+                        {
+                            tensor = ggml_transpose(*ctx_eval, args[0]);
+                        } break;
+                    case GGML_OP_PERMUTE:
+                        {
+                            tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
+                        } break;
+                    default:
+                        {
+                            tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+                            tensor->op = eop;
+                        } break;
+                }
+
+                memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    tensor->nb[j] = nb[j];
+                }
+
+                tensor->src0 = args[0];
+                tensor->src1 = args[1];
+
+                for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                    tensor->opt[j] = args[2 + j];
+                }
+
+                result.nodes[i] = tensor;
+
+                fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+            }
+        }
+    }
+
+    return result;
+}
+
 void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
 
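
Taken together, `ggml_graph_export`/`ggml_graph_import` give a file-level round trip for a computation graph, with `ggml_graph_get_tensor` recovering tensors by the names assigned in `ggml_visit_parents`. A hedged end-to-end sketch (assumes the bundled `ggml.h` and a generous memory pool; error handling elided):

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = { .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_set_name(a, "a");
        ggml_set_name(b, "b");
        struct ggml_tensor * c = ggml_add(ctx, a, b);
        ggml_set_name(c, "c");

        struct ggml_cgraph gf = ggml_build_forward(c); // auto-names any unnamed nodes
        ggml_graph_export(&gf, "demo.ggml");

        struct ggml_context * ctx_data = NULL;
        struct ggml_context * ctx_eval = NULL;
        struct ggml_cgraph gi = ggml_graph_import("demo.ggml", &ctx_data, &ctx_eval);

        struct ggml_tensor * c2 = ggml_graph_get_tensor(&gi, "c");
        printf("imported %d nodes, found 'c': %d\n", gi.n_nodes, c2 != NULL);

        ggml_free(ctx_data);
        ggml_free(ctx_eval);
        ggml_free(ctx);
        return 0;
    }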
@@ -14527,7 +15248,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
-                GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
+                GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
                 (double) node->perf_cycles  / (double) ggml_cycles_per_ms(),
                 (double) node->perf_cycles  / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
                 (double) node->perf_time_us / 1000.0,
@@ -14541,7 +15262,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
                 i,
                 node->ne[0], node->ne[1],
-                GGML_OP_LABEL[node->op]);
+                GGML_OP_NAME[node->op]);
     }
 
     for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -14549,7 +15270,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
             continue;
         }
 
-        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);
+        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
     }
 
     GGML_PRINT("========================================\n");
@@ -15548,6 +16269,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                 block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
                 result = ggml_quantize_q8_0(src + start, block, n, n, hist);
             } break;
+#ifdef GGML_USE_K_QUANTS
+        case GGML_TYPE_Q2_K:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_q2_K * block = (block_q2_K*)dst + start / QK_K;
+                result = ggml_quantize_q2_K(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q3_K:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_q3_K * block = (block_q3_K*)dst + start / QK_K;
+                result = ggml_quantize_q3_K(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q4_K:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_q4_K * block = (block_q4_K*)dst + start / QK_K;
+                result = ggml_quantize_q4_K(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q5_K:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_q5_K * block = (block_q5_K*)dst + start / QK_K;
+                result = ggml_quantize_q5_K(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q6_K:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_q6_K * block = (block_q6_K*)dst + start / QK_K;
+                result = ggml_quantize_q6_K(src + start, block, n, n, hist);
+            } break;
+#endif
         default:
             assert(false);
     }