llama_cpp 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +39 -8
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +242 -52
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +835 -82
- data/ext/llama_cpp/src/ggml.h +64 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +489 -134
- data/ext/llama_cpp/src/llama.h +43 -7
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
data/ext/llama_cpp/src/ggml.c  CHANGED
@@ -3,6 +3,10 @@
 
 #include "ggml.h"
 
+#ifdef GGML_USE_K_QUANTS
+#include "k_quants.h"
+#endif
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -21,6 +25,10 @@
 #include <float.h>
 #include <limits.h>
 
+#ifdef GGML_USE_METAL
+#include <unistd.h>
+#endif
+
 // if C99 - static_assert is noop
 // ref: https://stackoverflow.com/a/53923785/4039976
 #ifndef static_assert
@@ -121,7 +129,11 @@ typedef void* thread_ret_t;
 #else
 inline static void* ggml_aligned_malloc(size_t size) {
     void* aligned_memory = NULL;
+#ifdef GGML_USE_METAL
+    int result = posix_memalign(&aligned_memory, getpagesize(), size);
+#else
     int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+#endif
     if (result != 0) {
         // Handle allocation failure
         return NULL;
@@ -186,10 +198,12 @@ typedef double ggml_float;
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
+#if !defined(__riscv)
 #include <immintrin.h>
 #endif
 #endif
 #endif
+#endif
 
 #ifdef __F16C__
 
@@ -401,21 +415,27 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
 //
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
-static int64_t timer_freq;
+static int64_t timer_freq, timer_start;
 void ggml_time_init(void) {
-    LARGE_INTEGER frequency;
-    QueryPerformanceFrequency(&frequency);
-    timer_freq = frequency.QuadPart;
+    LARGE_INTEGER t;
+    QueryPerformanceFrequency(&t);
+    timer_freq = t.QuadPart;
+
+    // The multiplication by 1000 or 1000000 below can cause an overflow if timer_freq
+    // and the uptime is high enough.
+    // We subtract the program start time to reduce the likelihood of that happening.
+    QueryPerformanceCounter(&t);
+    timer_start = t.QuadPart;
 }
 int64_t ggml_time_ms(void) {
     LARGE_INTEGER t;
     QueryPerformanceCounter(&t);
-    return (t.QuadPart * 1000) / timer_freq;
+    return ((t.QuadPart-timer_start) * 1000) / timer_freq;
 }
 int64_t ggml_time_us(void) {
     LARGE_INTEGER t;
     QueryPerformanceCounter(&t);
-    return (t.QuadPart * 1000000) / timer_freq;
+    return ((t.QuadPart-timer_start) * 1000000) / timer_freq;
 }
 #else
 void ggml_time_init(void) {}
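The overflow the new comment describes is easy to quantify. Assuming a 10 MHz QueryPerformanceCounter (a common value on recent Windows; the rate is an assumption here), t.QuadPart * 1000000 exceeds INT64_MAX after roughly 10.7 days of machine uptime, which is exactly the failure the timer_start subtraction defers. A standalone sketch of the arithmetic, not part of the gem:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int64_t timer_freq = 10000000;            // assumed 10 MHz performance counter
        const int64_t max_ticks  = INT64_MAX / 1000000; // largest tick count safe to scale to microseconds
        // uptime after which (ticks * 1000000) would overflow int64_t:
        printf("overflow after %.1f days\n", (double) max_ticks / timer_freq / 86400.0);
        return 0;
    }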
@@ -472,6 +492,8 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 // quantization
 //
 
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 // multiply int8_t, add results pairwise twice
 static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
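MM256_SET_M128I is a drop-in replacement for _mm256_set_m128i, which some older GCC releases do not provide: it widens the low half b with _mm256_castsi128_si256 (upper lane undefined, immediately overwritten) and inserts a as the upper 128 bits. A minimal standalone check, assuming an AVX-capable compiler (not part of the gem):

    #include <immintrin.h>
    #include <stdio.h>

    #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

    int main(void) {
        const __m128i lo = _mm_set1_epi32(1);
        const __m128i hi = _mm_set1_epi32(2);
        const __m256i v  = MM256_SET_M128I(hi, lo); // elements 0-3 = 1, elements 4-7 = 2

        int out[8];
        _mm256_storeu_si256((__m256i *) out, v);
        printf("%d %d\n", out[0], out[7]); // prints "1 2"
        return 0;
    }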
@@ -531,7 +553,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
 static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
 {
     const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
-    const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp);
+    const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
     const __m256i lowMask = _mm256_set1_epi8( 0xF );
     return _mm256_and_si256(lowMask, bytes);
 }
@@ -604,7 +626,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
     bytesh = _mm_or_si128(bytesh, bit_mask);
     bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
     bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
-    return _mm256_set_m128i(bytesh, bytesl);
+    return MM256_SET_M128I(bytesh, bytesl);
 }
 
 // Unpack 32 4-bit fields into 32 bytes
@@ -617,7 +639,7 @@ static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
     const __m128i lowMask = _mm_set1_epi8(0xF);
     tmpl = _mm_and_si128(lowMask, tmpl);
     tmph = _mm_and_si128(lowMask, tmph);
-    return _mm256_set_m128i(tmph, tmpl);
+    return MM256_SET_M128I(tmph, tmpl);
 }
 
 // add int16_t pairwise and return as float vector
@@ -625,7 +647,7 @@ static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
     const __m128i ones = _mm_set1_epi16(1);
     const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
     const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
-    const __m256i summed_pairs = _mm256_set_m128i(summed_pairsh, summed_pairsl);
+    const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
     return _mm256_cvtepi32_ps(summed_pairs);
 }
 
@@ -1563,6 +1585,48 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
         .vec_dot_q                = NULL,   // TODO
         .vec_dot_type             = GGML_TYPE_Q8_1,
     },
+#ifdef GGML_USE_K_QUANTS
+    [GGML_TYPE_Q2_K] = {
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q2_K,
+        .quantize_row_q           = quantize_row_q2_K,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference,
+        .quantize_row_q_dot       = quantize_row_q8_K,
+        .vec_dot_q                = ggml_vec_dot_q2_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
+    [GGML_TYPE_Q3_K] = {
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q3_K,
+        .quantize_row_q           = quantize_row_q3_K,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference,
+        .quantize_row_q_dot       = quantize_row_q8_K,
+        .vec_dot_q                = ggml_vec_dot_q3_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
+    [GGML_TYPE_Q4_K] = {
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q4_K,
+        .quantize_row_q           = quantize_row_q4_K,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference,
+        .quantize_row_q_dot       = quantize_row_q8_K,
+        .vec_dot_q                = ggml_vec_dot_q4_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
+    [GGML_TYPE_Q5_K] = {
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q5_K,
+        .quantize_row_q           = quantize_row_q5_K,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference,
+        .quantize_row_q_dot       = quantize_row_q8_K,
+        .vec_dot_q                = ggml_vec_dot_q5_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
+    [GGML_TYPE_Q6_K] = {
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q6_K,
+        .quantize_row_q           = quantize_row_q6_K,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference,
+        .quantize_row_q_dot       = quantize_row_q8_K,
+        .vec_dot_q                = ggml_vec_dot_q6_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
+#endif
 };
 
 // For internal test use
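Each quantize_fns entry bundles all per-type kernels, so generic code looks the entry up once and calls through function pointers instead of switching on the type everywhere. A hedged sketch of the dispatch pattern (field and typedef names come from the table above; this helper itself is not in the diff):

    // dot product of one quantized row x with one quantized row y (sketch)
    static float vec_dot_row(enum ggml_type type, int n, const void * x, const void * y) {
        const quantize_fns_t fns = quantize_fns[type]; // per-type kernel bundle

        float result = 0.0f;
        fns.vec_dot_q(n, &result, x, y); // e.g. ggml_vec_dot_q4_K_q8_K for GGML_TYPE_Q4_K
        return result;
    }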
@@ -2288,7 +2352,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
 
         // Convert int32_t to float
-        __m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1));
+        __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
 
         // Apply the scale, and accumulate
         acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
@@ -2764,7 +2828,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         __m128i bxh = _mm256_extractf128_si256(bx, 1);
         bxl = _mm_or_si128(bxl, bxhil);
         bxh = _mm_or_si128(bxh, bxhih);
-        bx = _mm256_set_m128i(bxh, bxl);
+        bx = MM256_SET_M128I(bxh, bxl);
 
         const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
@@ -3020,7 +3084,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         __m128i bxh = _mm256_extractf128_si256(bx, 1);
         bxl = _mm_or_si128(bxl, bxhil);
         bxh = _mm_or_si128(bxh, bxhih);
-        bx = _mm256_set_m128i(bxh, bxl);
+        bx = MM256_SET_M128I(bxh, bxl);
 
         const __m256 dy = _mm256_set1_ps(y[i].d);
         const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
@@ -3442,11 +3506,19 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q5_1] = QK5_1,
     [GGML_TYPE_Q8_0] = QK8_0,
     [GGML_TYPE_Q8_1] = QK8_1,
+#ifdef GGML_USE_K_QUANTS
+    [GGML_TYPE_Q2_K] = QK_K,
+    [GGML_TYPE_Q3_K] = QK_K,
+    [GGML_TYPE_Q4_K] = QK_K,
+    [GGML_TYPE_Q5_K] = QK_K,
+    [GGML_TYPE_Q6_K] = QK_K,
+    [GGML_TYPE_Q8_K] = QK_K,
+#endif
     [GGML_TYPE_I8]   = 1,
     [GGML_TYPE_I16]  = 1,
     [GGML_TYPE_I32]  = 1,
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_BLCK_SIZE is outdated");
+static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
 
 static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32] = sizeof(float),
@@ -3457,11 +3529,19 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
     [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
     [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
+#ifdef GGML_USE_K_QUANTS
+    [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
+    [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
+    [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
+    [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
+    [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
+    [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
+#endif
     [GGML_TYPE_I8]  = sizeof(int8_t),
     [GGML_TYPE_I16] = sizeof(int16_t),
     [GGML_TYPE_I32] = sizeof(int32_t),
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_SIZE is outdated");
+static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
 
 
 static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
@@ -3473,11 +3553,17 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q5_1] = "q5_1",
     [GGML_TYPE_Q8_0] = "q8_0",
     [GGML_TYPE_Q8_1] = "q8_1",
+    [GGML_TYPE_Q2_K] = "q2_K",
+    [GGML_TYPE_Q3_K] = "q3_K",
+    [GGML_TYPE_Q4_K] = "q4_K",
+    [GGML_TYPE_Q5_K] = "q5_K",
+    [GGML_TYPE_Q6_K] = "q6_K",
+    [GGML_TYPE_Q8_K] = "q8_K",
     [GGML_TYPE_I8]  = "i8",
     [GGML_TYPE_I16] = "i16",
     [GGML_TYPE_I32] = "i32",
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_NAME is outdated");
+static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
 
 static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32] = false,
@@ -3488,13 +3574,19 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q5_1] = true,
     [GGML_TYPE_Q8_0] = true,
     [GGML_TYPE_Q8_1] = true,
+    [GGML_TYPE_Q2_K] = true,
+    [GGML_TYPE_Q3_K] = true,
+    [GGML_TYPE_Q4_K] = true,
+    [GGML_TYPE_Q5_K] = true,
+    [GGML_TYPE_Q6_K] = true,
+    [GGML_TYPE_Q8_K] = true,
     [GGML_TYPE_I8]  = false,
     [GGML_TYPE_I16] = false,
     [GGML_TYPE_I32] = false,
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
+static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
 
-static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
+static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "NONE",
 
     "DUP",
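These tables drive every size computation in ggml: a row of ne0 elements occupies ne0/GGML_BLCK_SIZE[type] blocks of GGML_TYPE_SIZE[type] bytes. A worked example, assuming QK_K == 256 and a 144-byte block_q4_K (two fp16 super-scales, 12 packed scale/min bytes, 128 quant bytes — values assumed from k_quants.h, which is not shown in this section):

    // bytes per 4096-wide q4_K row (sizes assumed, see above)
    const int    ne0       = 4096;
    const int    blck      = 256;                      // QK_K
    const size_t type_size = 144;                      // sizeof(block_q4_K)
    const size_t row_bytes = (ne0 / blck) * type_size; // 16 * 144 = 2304 bytes, ~4.5 bits/weight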
@@ -3629,6 +3721,7 @@ struct ggml_context {
     void * mem_buffer;
     bool   mem_buffer_owned;
     bool   no_alloc;
+    bool   no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
 
     int    n_objects;
 
@@ -3645,26 +3738,6 @@ struct ggml_context_container {
     struct ggml_context context;
 };
 
-//
-// compute types
-//
-
-enum ggml_task_type {
-    GGML_TASK_INIT = 0,
-    GGML_TASK_COMPUTE,
-    GGML_TASK_FINALIZE,
-};
-
-struct ggml_compute_params {
-    enum ggml_task_type type;
-
-    int ith, nth;
-
-    // work buffer for all threads
-    size_t wsize;
-    void * wdata;
-};
-
 //
 // ggml state
 //
@@ -3721,7 +3794,7 @@ int64_t ggml_nelements(const struct ggml_tensor * tensor) {
     return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
 }
 
-int ggml_nrows(const struct ggml_tensor * tensor) {
+int64_t ggml_nrows(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
@@ -3730,7 +3803,20 @@ int ggml_nrows(const struct ggml_tensor * tensor) {
 size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
-    return (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+    // this should handle cases where the tensor is not contiguous in memory
+    // probaby just:
+    //
+    //     return tensor->ne[3]*tensor->nb[3]
+    //
+    // is enough, but just in case, adding the second part
+
+    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+}
+
+size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
 }
 
 int ggml_blck_size(enum ggml_type type) {
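For a contiguous tensor the two arguments of MAX coincide; they diverge for views and permutations, where ne[3]*nb[3] follows the strides of the underlying buffer. A comment-only sketch of the contiguous case:

    // contiguous 2x3 f32 tensor: ne = {2, 3, 1, 1}, nb = {4, 8, 24, 24}
    //   ne[3]*nb[3]                   = 1 * 24   = 24 bytes
    //   nelements*TYPE_SIZE/BLCK_SIZE = 6 * 4 / 1 = 24 bytes  -> MAX changes nothing
    // a permuted view keeps the parent's nb, and then only the first term is large enough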
@@ -3749,6 +3835,9 @@ const char * ggml_type_name(enum ggml_type type) {
     return GGML_TYPE_NAME[type];
 }
 
+const char * ggml_op_name(enum ggml_op op) {
+    return GGML_OP_NAME[op];
+}
 
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return GGML_TYPE_SIZE[tensor->type];
@@ -3796,6 +3885,11 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_Q5_0:          wtype = GGML_TYPE_Q5_0;  break;
         case GGML_FTYPE_MOSTLY_Q5_1:          wtype = GGML_TYPE_Q5_1;  break;
         case GGML_FTYPE_MOSTLY_Q8_0:          wtype = GGML_TYPE_Q8_0;  break;
+        case GGML_FTYPE_MOSTLY_Q2_K:          wtype = GGML_TYPE_Q2_K;  break;
+        case GGML_FTYPE_MOSTLY_Q3_K:          wtype = GGML_TYPE_Q3_K;  break;
+        case GGML_FTYPE_MOSTLY_Q4_K:          wtype = GGML_TYPE_Q4_K;  break;
+        case GGML_FTYPE_MOSTLY_Q5_K:          wtype = GGML_TYPE_Q5_K;  break;
+        case GGML_FTYPE_MOSTLY_Q6_K:          wtype = GGML_TYPE_Q6_K;  break;
        case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT; break;
        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
     }
@@ -3805,11 +3899,15 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     return wtype;
 }
 
-static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+size_t ggml_tensor_overhead(void) {
+    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+}
+
+bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
 
-static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
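ggml_tensor_overhead makes no_alloc memory planning explicit: every tensor costs its object header plus the tensor struct plus up to 16 bytes of alignment slack, on top of its data. A hedged sizing helper (not part of the diff):

    // estimate the context size needed for n_tensors tensors of data_bytes each (sketch)
    static size_t plan_mem_size(int n_tensors, size_t data_bytes) {
        return (size_t) n_tensors * (ggml_tensor_overhead() + data_bytes);
    }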
@@ -3958,6 +4056,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
         /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
         /*.no_alloc         =*/ params.no_alloc,
+        /*.no_alloc_save    =*/ params.no_alloc,
         /*.n_objects        =*/ 0,
         /*.objects_begin    =*/ NULL,
         /*.objects_end      =*/ NULL,
@@ -4017,17 +4116,36 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }
 
+void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
+    ctx->no_alloc = no_alloc;
+}
+
+void * ggml_get_mem_buffer(struct ggml_context * ctx) {
+    return ctx->mem_buffer;
+}
+
+size_t ggml_get_mem_size(struct ggml_context * ctx) {
+    return ctx->mem_size;
+}
+
 // IMPORTANT:
 // when creating "opt" tensors, always save and load the scratch buffer
 // this is an error prone process, but it is necessary to support inplace
 // operators when using scratch buffers
 // TODO: implement a better way
 void ggml_scratch_save(struct ggml_context * ctx) {
+    // this is needed to allow opt tensors to store their data
+    // TODO: again, need to find a better way
+    ctx->no_alloc_save = ctx->no_alloc;
+    ctx->no_alloc      = false;
+
     ctx->scratch_save = ctx->scratch;
     ctx->scratch.data = NULL;
 }
 
 void ggml_scratch_load(struct ggml_context * ctx) {
+    ctx->no_alloc = ctx->no_alloc_save;
+
     ctx->scratch = ctx->scratch_save;
 }
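Saving and restoring no_alloc alongside the scratch state means the small "opt" parameter tensors created inside operators always receive real backing memory, even while the caller builds the graph with no_alloc enabled. The internal pattern, as a sketch:

    ggml_scratch_save(ctx);                     // scratch off, allocation forced on

    struct ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
    ((int32_t *) p->data)[0] = 42;              // safe: p->data is always allocated here

    ggml_scratch_load(ctx);                     // scratch and no_alloc state restored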
@@ -4061,7 +4179,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
     struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
 
     if (ctx->scratch.data == NULL || data != NULL) {
-        size_needed += sizeof(struct ggml_tensor);
+        size_needed += GGML_TENSOR_SIZE;
 
         if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
             GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
@@ -4077,14 +4195,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
         };
     } else {
         if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
-            GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
+            GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+                    __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
             assert(false);
             return NULL;
         }
 
-        if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
+        if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
             GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                    __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
+                    __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
             assert(false);
             return NULL;
         }
@@ -4093,7 +4212,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
 
     *obj_new = (struct ggml_object) {
         .offs = cur_end + GGML_OBJECT_SIZE,
-        .size = sizeof(struct ggml_tensor),
+        .size = GGML_TENSOR_SIZE,
         .next = NULL,
     };
@@ -4135,6 +4254,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_time_us =*/ 0,
         /*.data         =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
         /*.name         =*/ { 0 },
+        /*.extra        =*/ NULL,
         /*.pad          =*/ { 0 },
     };
@@ -4509,6 +4629,23 @@ struct ggml_tensor * ggml_view_tensor(
     return result;
 }
 
+struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
+    struct ggml_object * obj = ctx->objects_begin;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
+        if (strcmp(cur->name, name) == 0) {
+            return cur;
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 // ggml_dup
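ggml_get_tensor walks every object in the context and compares names, so lookups are O(number of tensors) and the first match wins. A hedged usage sketch (the tensor name is made up):

    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096);
    ggml_set_name(w, "layer0/wq");

    // ... later, given only the context ...
    struct ggml_tensor * found = ggml_get_tensor(ctx, "layer0/wq"); // == w, or NULL if absent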
@@ -5763,10 +5900,18 @@ struct ggml_tensor * ggml_view_1d(
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
 
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+    ggml_scratch_load(ctx);
+
     result->op   = GGML_OP_VIEW;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
+    result->opt[0] = offs;
 
     if (is_node) {
         memcpy(result->padding, &offset, sizeof(offset));
@@ -5795,6 +5940,13 @@ struct ggml_tensor * ggml_view_2d(
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
 
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+    ggml_scratch_load(ctx);
+
     result->nb[1] = nb1;
     result->nb[2] = result->nb[1]*ne1;
     result->nb[3] = result->nb[2];
@@ -5803,6 +5955,7 @@ struct ggml_tensor * ggml_view_2d(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
+    result->opt[0] = offs;
 
     if (is_node) {
         memcpy(result->padding, &offset, sizeof(offset));
@@ -5833,6 +5986,13 @@ struct ggml_tensor * ggml_view_3d(
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
 
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+    ggml_scratch_load(ctx);
+
     result->nb[1] = nb1;
     result->nb[2] = nb2;
     result->nb[3] = result->nb[2]*ne2;
@@ -5841,6 +6001,7 @@ struct ggml_tensor * ggml_view_3d(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
+    result->opt[0] = offs;
 
     if (is_node) {
         memcpy(result->padding, &offset, sizeof(offset));
@@ -5873,6 +6034,13 @@ struct ggml_tensor * ggml_view_4d(
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
 
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+    ggml_scratch_load(ctx);
+
     result->nb[1] = nb1;
     result->nb[2] = nb2;
     result->nb[3] = nb3;
@@ -5881,6 +6049,7 @@ struct ggml_tensor * ggml_view_4d(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
+    result->opt[0] = offs;
 
     if (is_node) {
         memcpy(result->padding, &offset, sizeof(offset));
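All four view constructors now stash the byte offset in a two-element I32 tensor hung off opt[0]. That turns the offset into graph data, which is what lets ggml_graph_export/ggml_graph_import (added further down in this diff) serialize a view and re-point it after loading; tensor->data alone cannot express this across processes. Reading it back, as the importer does:

    // recover the byte offset of a VIEW node (pattern from ggml_graph_import)
    static size_t view_byte_offset(const struct ggml_tensor * view) {
        uint64_t offs;
        memcpy(&offs, view->opt[0]->data, sizeof(offs)); // reassembles the two int32 halves
        return (size_t) offs;
    }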
@@ -6303,7 +6472,7 @@ struct ggml_tensor * ggml_alibi(
 
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_head;
@@ -7545,6 +7714,11 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
             {
                 ggml_compute_forward_add_q_f32(params, src0, src1, dst);
             } break;
@@ -7848,6 +8022,11 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
             {
                 ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
             } break;
@@ -7970,6 +8149,11 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
         default:
             {
                 GGML_ASSERT(false);
@@ -8088,10 +8272,10 @@ static void ggml_compute_forward_mul_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-#ifdef GGML_USE_CUBLAS
-    if (src1->backend == GGML_BACKEND_CUDA) {
+#ifdef GGML_USE_CLBLAST
+    if (src1->backend == GGML_BACKEND_GPU) {
         if (ith == 0) {
-            ggml_cuda_mul(src0, src1, dst);
+            ggml_cl_mul(src0, src1, dst);
         }
         return;
     }
@@ -9206,7 +9390,7 @@ static void ggml_compute_forward_rms_norm_f32(
                     sum += (ggml_float)(x[i00] * x[i00]);
                 }
 
-                float mean = sum/ne00;
+                const float mean = sum/ne00;
 
                 float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
 
@@ -9529,14 +9713,7 @@ static void ggml_compute_forward_mul_mat_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS)
-    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9701,14 +9878,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS)
-    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9913,14 +10083,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS)
-    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -10063,6 +10226,11 @@ static void ggml_compute_forward_mul_mat(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
             {
                 ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
             } break;
@@ -10246,6 +10414,11 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
         default:
             {
                 GGML_ASSERT(false);
@@ -10411,6 +10584,11 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
             {
                 ggml_compute_forward_get_rows_q(params, src0, src1, dst);
             } break;
@@ -10957,6 +11135,12 @@ static void ggml_compute_forward_alibi(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_Q8_K:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
@@ -11028,6 +11212,12 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_Q8_K:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
@@ -11117,7 +11307,7 @@ static void ggml_compute_forward_rope_f32(
                 theta *= theta_scale;
 
                 const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                float * dst_data = (float *)((char *) dst->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
                 const float x0 = src[0];
                 const float x1 = src[1];
@@ -11138,7 +11328,7 @@ static void ggml_compute_forward_rope_f32(
                 const int64_t i0 = ib*n_dims + ic/2;
 
                 const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                float * dst_data = (float *)((char *) dst->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
                 const float x0 = src[0];
                 const float x1 = src[n_dims/2];
@@ -12846,6 +13036,15 @@ static void ggml_compute_forward_map_binary(
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(params);
 
+#ifdef GGML_USE_CUBLAS
+    bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
+    if (skip_cpu) {
+        return;
+    }
+    GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
+    GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
+#endif // GGML_USE_CUBLAS
+
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
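This hook is the pivot of the new CUDA path: instead of each operator probing for a GPU implementation (as the mul_mat functions did before this release), ggml_cuda_compute_forward gets first refusal on every node and reports whether the CPU dispatcher should skip it. That is why the per-kernel GGML_USE_CUBLAS branches could be deleted from the three mul_mat variants earlier in this diff.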
@@ -13792,11 +13991,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         // reached a leaf node, not part of the gradient graph (e.g. a constant)
         GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
 
+        if (strlen(node->name) == 0) {
+            snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
+        }
+
         cgraph->leafs[cgraph->n_leafs] = node;
         cgraph->n_leafs++;
     } else {
         GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
 
+        if (strlen(node->name) == 0) {
+            snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
+        }
+
         cgraph->nodes[cgraph->n_nodes] = node;
         cgraph->grads[cgraph->n_nodes] = node->grad;
         cgraph->n_nodes++;
@@ -14144,7 +14351,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
                             node->n_tasks = 1; // TODO: this actually is doing nothing
                                                //       the threads are still spinning
-                            cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
                         }
                         else
 #elif defined(GGML_USE_CLBLAST)
@@ -14510,6 +14716,521 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
     }
 }
 
+struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
+    for (int i = 0; i < cgraph->n_leafs; i++) {
+        struct ggml_tensor * leaf = cgraph->leafs[i];
+
+        if (strcmp(leaf->name, name) == 0) {
+            return leaf;
+        }
+    }
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        if (strcmp(node->name, name) == 0) {
+            return node;
+        }
+    }
+
+    return NULL;
+}
+
+static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) {
+    const int64_t * ne = tensor->ne;
+    const size_t  * nb = tensor->nb;
+
+    fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
+            ggml_type_name(tensor->type),
+            ggml_op_name  (tensor->op),
+            tensor->n_dims,
+            ne[0], ne[1], ne[2], ne[3],
+            nb[0], nb[1], nb[2], nb[3],
+            tensor->data,
+            tensor->name);
+}
+
+static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
+    const int64_t * ne = tensor->ne;
+    const size_t  * nb = tensor->nb;
+
+    fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n",
+            arg,
+            ggml_type_name(tensor->type),
+            ggml_op_name  (tensor->op),
+            tensor->n_dims,
+            ne[0], ne[1], ne[2], ne[3],
+            nb[0], nb[1], nb[2], nb[3],
+            tensor->n_tasks,
+            tensor->data,
+            tensor->name);
+}
+
+void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
+    //assert(cgraph->work      == NULL);
+    //assert(cgraph->work_size == 0);
+
+    uint64_t size_eval = 0;
+
+    // compute size of intermediate results
+    // TODO: does not take into account scratch buffers !!!!
+    for (int i = 0; i < cgraph->n_nodes; ++i) {
+        size_eval += ggml_nbytes(cgraph->nodes[i]);
+    }
+
+    // print
+    {
+        FILE * fout = stdout;
+
+        fprintf(fout, "\n");
+        fprintf(fout, "%-16s %8x\n",         "magic",   GGML_FILE_MAGIC);
+        fprintf(fout, "%-16s %8d\n",         "version", GGML_FILE_VERSION);
+        fprintf(fout, "%-16s %8d\n",         "leafs",   cgraph->n_leafs);
+        fprintf(fout, "%-16s %8d\n",         "nodes",   cgraph->n_nodes);
+        fprintf(fout, "%-16s %" PRIu64 "\n", "eval",    size_eval);
+
+        // header
+        fprintf(fout, "\n");
+        fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n",
+                "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME");
+
+        for (int i = 0; i < cgraph->n_leafs; ++i) {
+            ggml_graph_export_leaf(cgraph->leafs[i], fout);
+
+            GGML_ASSERT(cgraph->leafs[i]->op   == GGML_OP_NONE);
+            GGML_ASSERT(cgraph->leafs[i]->src0 == NULL);
+            GGML_ASSERT(cgraph->leafs[i]->src1 == NULL);
+        }
+
+        // header
+        fprintf(fout, "\n");
+        fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n",
+                "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME");
+
+        for (int i = 0; i < cgraph->n_nodes; ++i) {
+            ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
+
+            if (cgraph->nodes[i]->src0) {
+                ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout);
+            }
+
+            if (cgraph->nodes[i]->src1) {
+                ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout);
+            }
+
+            for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                if (cgraph->nodes[i]->opt[j]) {
+                    ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout);
+                }
+            }
+
+            fprintf(fout, "\n");
+        }
+
+        fprintf(fout, "\n");
+    }
+
+    // write binary data
+    {
+        FILE * fout = fopen(fname, "wb");
+
+        if (!fout) {
+            fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+            return;
+        }
+
+        // header
+        {
+            const uint32_t magic   = GGML_FILE_MAGIC;
+            const uint32_t version = GGML_FILE_VERSION;
+            const uint32_t n_leafs = cgraph->n_leafs;
+            const uint32_t nodes   = cgraph->n_nodes;
+
+            fwrite(&magic,     sizeof(uint32_t), 1, fout);
+            fwrite(&version,   sizeof(uint32_t), 1, fout);
+            fwrite(&n_leafs,   sizeof(uint32_t), 1, fout);
+            fwrite(&nodes,     sizeof(uint32_t), 1, fout);
+            fwrite(&size_eval, sizeof(uint64_t), 1, fout);
+        }
+
+        // leafs
+        {
+            for (int i = 0; i < cgraph->n_leafs; ++i) {
+                const struct ggml_tensor * tensor = cgraph->leafs[i];
+
+                const uint32_t type   = tensor->type;
+                const uint32_t op     = tensor->op;
+                const uint32_t n_dims = tensor->n_dims;
+
+                fwrite(&type,   sizeof(uint32_t), 1, fout);
+                fwrite(&op,     sizeof(uint32_t), 1, fout);
+                fwrite(&n_dims, sizeof(uint32_t), 1, fout);
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    const uint64_t ne = tensor->ne[j];
+                    const uint64_t nb = tensor->nb[j];
+
+                    fwrite(&ne, sizeof(uint64_t), 1, fout);
+                    fwrite(&nb, sizeof(uint64_t), 1, fout);
+                }
+
+                // store the pointer address
+                {
+                    const uint64_t ptr = (uint64_t) tensor->data;
+
+                    fwrite(&ptr, sizeof(uint64_t), 1, fout);
+                }
+
+                fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+
+                // dump the data
+                // TODO: pad this to 32 byte boundary
+                {
+                    const size_t size = ggml_nbytes(tensor);
+
+                    fwrite(tensor->data, sizeof(char), size, fout);
+                }
+            }
+        }
+
+        // nodes
+        {
+            for (int i = 0; i < cgraph->n_nodes; ++i) {
+                const struct ggml_tensor * tensor = cgraph->nodes[i];
+
+                const uint32_t type   = tensor->type;
+                const uint32_t op     = tensor->op;
+                const uint32_t n_dims = tensor->n_dims;
+
+                fwrite(&type,   sizeof(uint32_t), 1, fout);
+                fwrite(&op,     sizeof(uint32_t), 1, fout);
+                fwrite(&n_dims, sizeof(uint32_t), 1, fout);
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    const uint64_t ne = tensor->ne[j];
+                    const uint64_t nb = tensor->nb[j];
+
+                    fwrite(&ne, sizeof(uint64_t), 1, fout);
+                    fwrite(&nb, sizeof(uint64_t), 1, fout);
+                }
+
+                // store the pointer address
+                {
+                    const uint64_t ptr = (uint64_t) tensor->data;
+
+                    fwrite(&ptr, sizeof(uint64_t), 1, fout);
+                }
+
+                fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+
+                // output the op arguments
+                {
+                    struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
+
+                    args[0] = tensor->src0;
+                    args[1] = tensor->src1;
+
+                    for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                        args[2 + j] = tensor->opt[j];
+                    }
+
+                    for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+                        if (args[j]) {
+                            int32_t idx = -1;
+
+                            // check if leaf
+                            {
+                                for (int k = 0; k < cgraph->n_leafs; ++k) {
+                                    if (args[j] == cgraph->leafs[k]) {
+                                        idx = k;
+                                        break;
+                                    }
+                                }
+                            }
+
+                            // check if node
+                            if (idx == -1) {
+                                for (int k = 0; k < cgraph->n_nodes; ++k) {
+                                    if (args[j] == cgraph->nodes[k]) {
+                                        idx = GGML_MAX_NODES + k;
+                                        break;
+                                    }
+                                }
+                            }
+
+                            if (idx == -1) {
+                                fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
+                                return;
+                            }
+
+                            fwrite(&idx, sizeof(int32_t), 1, fout);
+                        } else {
+                            const int32_t nul = -1;
+
+                            fwrite(&nul, sizeof(int32_t), 1, fout);
+                        }
+                    }
+                }
+            }
+        }
+
+        fclose(fout);
+    }
+}
+
+struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
+    assert(*ctx_data == NULL);
+    assert(*ctx_eval == NULL);
+
+    struct ggml_cgraph result = { 0 };
+
+    struct ggml_tensor * data = NULL;
+
+    // read file into data
+    {
+        FILE * fin = fopen(fname, "rb");
+        if (!fin) {
+            fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+            return result;
+        }
+
+        size_t fsize = 0;
+
+        fseek(fin, 0, SEEK_END);
+        fsize = ftell(fin);
+        fseek(fin, 0, SEEK_SET);
+
+        // create the data context
+        {
+            const size_t overhead = 1*ggml_tensor_overhead();
+
+            struct ggml_init_params params = {
+                .mem_size   = fsize + overhead,
+                .mem_buffer = NULL,
+                .no_alloc   = false,
+            };
+
+            *ctx_data = ggml_init(params);
+
+            if (!*ctx_data) {
+                fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+                return result;
+            }
+        }
+
+        data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
+
+        const size_t ret = fread(data->data, sizeof(char), fsize, fin);
+        if (ret != fsize) {
+            fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
+            return result;
+        }
+
+        fclose(fin);
+    }
+
+    // populate result
+    {
+        char * ptr = (char *) data->data;
+
+        const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic);
+
+        if (magic != GGML_FILE_MAGIC) {
+            fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic);
+            return result;
+        }
+
+        const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version);
+
+        if (version != GGML_FILE_VERSION) {
+            fprintf(stderr, "%s: invalid version number\n", __func__);
+            return result;
+        }
+
+        const uint32_t n_leafs   = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
+        const uint32_t n_nodes   = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
+        const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
+
+        result.n_leafs = n_leafs;
+        result.n_nodes = n_nodes;
+
+        // create the data context
+        {
+            const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
+
+            struct ggml_init_params params = {
+                .mem_size   = size_eval + overhead,
+                .mem_buffer = NULL,
+                .no_alloc   = true,
+            };
+
+            *ctx_eval = ggml_init(params);
+
+            if (!*ctx_eval) {
+                fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+                return result;
+            }
+        }
+
+        // leafs
+        {
+            uint32_t type;
+            uint32_t op;
+            uint32_t n_dims;
+
+            for (uint32_t i = 0; i < n_leafs; ++i) {
+                type   = *(const uint32_t *) ptr; ptr += sizeof(type);
+                op     = *(const uint32_t *) ptr; ptr += sizeof(op);
+                n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+
+                int64_t ne[GGML_MAX_DIMS];
+                size_t  nb[GGML_MAX_DIMS];
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    uint64_t ne_cur;
+                    uint64_t nb_cur;
+
+                    ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
+                    nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
+
+                    ne[j] = ne_cur;
+                    nb[j] = nb_cur;
+                }
+
+                struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+                tensor->op = (enum ggml_op) op;
+
+                uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+
+                memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+
+                tensor->data = (void *) ptr;
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    tensor->nb[j] = nb[j];
+                }
+
+                result.leafs[i] = tensor;
+
+                ptr += ggml_nbytes(tensor);
+
+                fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+            }
+        }
+
+        ggml_set_no_alloc(*ctx_eval, false);
+
+        // nodes
+        {
+            uint32_t type;
+            uint32_t op;
+            uint32_t n_dims;
+
+            for (uint32_t i = 0; i < n_nodes; ++i) {
+                type   = *(const uint32_t *) ptr; ptr += sizeof(type);
+                op     = *(const uint32_t *) ptr; ptr += sizeof(op);
+                n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+
+                enum ggml_op eop = (enum ggml_op) op;
+
+                int64_t ne[GGML_MAX_DIMS];
+                size_t  nb[GGML_MAX_DIMS];
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    uint64_t ne_cur;
+                    uint64_t nb_cur;
+
+                    ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
+                    nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
+
+                    ne[j] = ne_cur;
+                    nb[j] = nb_cur;
+                }
+
+                uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used
+
+                const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
+
+                const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t);
+
+                struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
+
+                // parse args
+                for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+                    const int32_t arg_idx = ptr_arg_idx[j];
+
+                    if (arg_idx == -1) {
+                        continue;
+                    }
+
+                    if (arg_idx < GGML_MAX_NODES) {
+                        args[j] = result.leafs[arg_idx];
+                    } else {
+                        args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
+                    }
+                }
+
+                // create the tensor
+                // "view" operations are handled differently
+                // TODO: handle inplace ops - currently a copy is always made
+
+                struct ggml_tensor * tensor = NULL;
+
+                switch (eop) {
+                    // TODO: implement other view ops
+                    case GGML_OP_RESHAPE:
+                        {
+                            tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]);
+                        } break;
+                    case GGML_OP_VIEW:
+                        {
+                            tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
+
+                            uint64_t offs;
+                            memcpy(&offs, args[2]->data, sizeof(offs));
+
+                            tensor->data = ((char *) tensor->data) + offs;
+                        } break;
+                    case GGML_OP_TRANSPOSE:
+                        {
+                            tensor = ggml_transpose(*ctx_eval, args[0]);
+                        } break;
+                    case GGML_OP_PERMUTE:
+                        {
+                            tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
+                        } break;
+                    default:
+                        {
+                            tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+                            tensor->op = eop;
+                        } break;
+                }
+
+                memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    tensor->nb[j] = nb[j];
+                }
+
+                tensor->src0 = args[0];
+                tensor->src1 = args[1];
+
+                for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                    tensor->opt[j] = args[2 + j];
+                }
+
+                result.nodes[i] = tensor;
+
+                fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+            }
+        }
+    }
+
+    return result;
+}
+
 void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
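ggml_graph_export writes a self-contained snapshot — magic, version, leaf/node counts, then per-tensor type, op, shape, strides, name, and argument indices, with leaf data inlined — and ggml_graph_import rebuilds the graph from it using two contexts, one holding the raw file and one for evaluation. A hedged round-trip sketch (file name and tensor name are made up):

    ggml_graph_export(&gf, "state.ggml");

    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;

    struct ggml_cgraph gf2 = ggml_graph_import("state.ggml", &ctx_data, &ctx_eval);
    struct ggml_tensor * out = ggml_graph_get_tensor(&gf2, "output"); // NULL if the name is absent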
|
@@ -14527,7 +15248,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
-                GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
+                GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
                 (double) node->perf_cycles  / (double) ggml_cycles_per_ms(),
                 (double) node->perf_cycles  / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
                 (double) node->perf_time_us / 1000.0,
@@ -14541,7 +15262,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
                 i,
                 node->ne[0], node->ne[1],
-                GGML_OP_LABEL[node->op]);
+                GGML_OP_NAME[node->op]);
     }
 
     for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -14549,7 +15270,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
             continue;
         }
 
-        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);
+        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
     }
 
     GGML_PRINT("========================================\n");
@@ -15548,6 +16269,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                 block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
                 result = ggml_quantize_q8_0(src + start, block, n, n, hist);
             } break;
+#ifdef GGML_USE_K_QUANTS
+        case GGML_TYPE_Q2_K:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_q2_K * block = (block_q2_K*)dst + start / QK_K;
+                result = ggml_quantize_q2_K(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q3_K:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_q3_K * block = (block_q3_K*)dst + start / QK_K;
+                result = ggml_quantize_q3_K(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q4_K:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_q4_K * block = (block_q4_K*)dst + start / QK_K;
+                result = ggml_quantize_q4_K(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q5_K:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_q5_K * block = (block_q5_K*)dst + start / QK_K;
+                result = ggml_quantize_q5_K(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q6_K:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_q6_K * block = (block_q6_K*)dst + start / QK_K;
+                result = ggml_quantize_q6_K(src + start, block, n, n, hist);
+            } break;
+#endif
         default:
             assert(false);
     }
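With these cases in place, ggml_quantize_chunk covers the k-quant formats as well. A hedged usage sketch (the 144-byte block_q4_K and QK_K == 256 are assumed, as earlier):

    // quantize 4096 f32 values to q4_K (sketch)
    float   src[4096];                      // filled with weights elsewhere
    uint8_t dst[(4096/256) * 144];          // 16 blocks * 144 bytes
    int64_t hist[16] = { 0 };               // histogram buckets filled by ggml

    size_t written = ggml_quantize_chunk(GGML_TYPE_Q4_K, src, dst, /*start =*/ 0, /*n =*/ 4096, hist);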