llama_cpp 0.1.4 → 0.2.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +29 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +235 -39
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +340 -109
- data/ext/llama_cpp/src/ggml.h +44 -6
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +484 -136
- data/ext/llama_cpp/src/llama.h +39 -8
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -3,6 +3,10 @@
 
 #include "ggml.h"
 
+#ifdef GGML_USE_K_QUANTS
+#include "k_quants.h"
+#endif
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -21,6 +25,10 @@
 #include <float.h>
 #include <limits.h>
 
+#ifdef GGML_USE_METAL
+#include <unistd.h>
+#endif
+
 // if C99 - static_assert is noop
 // ref: https://stackoverflow.com/a/53923785/4039976
 #ifndef static_assert
@@ -121,7 +129,11 @@ typedef void* thread_ret_t;
 #else
 inline static void* ggml_aligned_malloc(size_t size) {
     void* aligned_memory = NULL;
+#ifdef GGML_USE_METAL
+    int result = posix_memalign(&aligned_memory, getpagesize(), size);
+#else
     int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+#endif
     if (result != 0) {
         // Handle allocation failure
         return NULL;
@@ -403,21 +415,27 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
 //
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
-static int64_t timer_freq;
+static int64_t timer_freq, timer_start;
 void ggml_time_init(void) {
-    LARGE_INTEGER frequency;
-    QueryPerformanceFrequency(&frequency);
-    timer_freq = frequency.QuadPart;
+    LARGE_INTEGER t;
+    QueryPerformanceFrequency(&t);
+    timer_freq = t.QuadPart;
+
+    // The multiplication by 1000 or 1000000 below can cause an overflow if timer_freq
+    // and the uptime is high enough.
+    // We subtract the program start time to reduce the likelihood of that happening.
+    QueryPerformanceCounter(&t);
+    timer_start = t.QuadPart;
 }
 int64_t ggml_time_ms(void) {
     LARGE_INTEGER t;
     QueryPerformanceCounter(&t);
-    return (t.QuadPart * 1000) / timer_freq;
+    return ((t.QuadPart-timer_start) * 1000) / timer_freq;
 }
 int64_t ggml_time_us(void) {
     LARGE_INTEGER t;
     QueryPerformanceCounter(&t);
-    return (t.QuadPart * 1000000) / timer_freq;
+    return ((t.QuadPart-timer_start) * 1000000) / timer_freq;
 }
 #else
 void ggml_time_init(void) {}
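
An aside on the timer hunk above (not part of the diff): the overflow window it guards against is easy to put a number on. A standalone sketch, assuming a typical 10 MHz `QueryPerformanceCounter` frequency — the frequency is an assumption, the arithmetic is the point:

```c
#include <stdint.h>
#include <stdio.h>

// ggml_time_us() computes t.QuadPart * 1000000 / timer_freq. With a
// 10 MHz counter, the multiplication exceeds INT64_MAX once the raw
// tick count passes INT64_MAX / 1000000 -- about 10.7 days of uptime.
// Rebasing on timer_start restarts that budget at program launch.
int main(void) {
    const int64_t freq      = 10000000;              // assumed ticks per second
    const int64_t max_ticks = INT64_MAX / 1000000;   // last tick count safe to scale to us
    printf("overflow after %.1f days of counter uptime\n",
           (double) max_ticks / (double) freq / 86400.0);
    return 0;
}
```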
@@ -474,6 +492,8 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 // quantization
 //
 
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 // multiply int8_t, add results pairwise twice
 static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
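
For readers wondering why ggml defines its own `MM256_SET_M128I`: the convenience intrinsic `_mm256_set_m128i` is missing from some older compilers, so the macro rebuilds it from two intrinsics that are available everywhere. A minimal equivalence check, assuming an AVX-capable build (e.g. `-mavx`); the demo is not part of the diff:

```c
#include <immintrin.h>
#include <stdio.h>
#include <string.h>

// same definition as the hunk above; note the (high, low) argument order
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

int main(void) {
    const __m128i lo = _mm_set1_epi8(1);        // destined for bits 0..127
    const __m128i hi = _mm_set1_epi8(2);        // destined for bits 128..255
    const __m256i v  = MM256_SET_M128I(hi, lo);

    unsigned char out[32];
    memcpy(out, &v, sizeof(out));
    printf("byte 0 = %d, byte 16 = %d\n", out[0], out[16]); // prints 1, 2
    return 0;
}
```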
@@ -533,7 +553,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
 static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
 {
     const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
-    const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp);
+    const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
     const __m256i lowMask = _mm256_set1_epi8( 0xF );
     return _mm256_and_si256(lowMask, bytes);
 }
@@ -606,7 +626,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
     bytesh = _mm_or_si128(bytesh, bit_mask);
     bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
     bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
-    return _mm256_set_m128i(bytesh, bytesl);
+    return MM256_SET_M128I(bytesh, bytesl);
 }
 
 // Unpack 32 4-bit fields into 32 bytes
@@ -619,7 +639,7 @@ static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
     const __m128i lowMask = _mm_set1_epi8(0xF);
     tmpl = _mm_and_si128(lowMask, tmpl);
     tmph = _mm_and_si128(lowMask, tmph);
-    return _mm256_set_m128i(tmph, tmpl);
+    return MM256_SET_M128I(tmph, tmpl);
 }
 
 // add int16_t pairwise and return as float vector
@@ -627,7 +647,7 @@ static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
     const __m128i ones = _mm_set1_epi16(1);
     const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
     const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
-    const __m256i summed_pairs = _mm256_set_m128i(summed_pairsh, summed_pairsl);
+    const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
     return _mm256_cvtepi32_ps(summed_pairs);
 }
@@ -1565,6 +1585,48 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
         .vec_dot_q                = NULL,   // TODO
         .vec_dot_type             = GGML_TYPE_Q8_1,
     },
+#ifdef GGML_USE_K_QUANTS
+    [GGML_TYPE_Q2_K] = {
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q2_K,
+        .quantize_row_q           = quantize_row_q2_K,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference,
+        .quantize_row_q_dot       = quantize_row_q8_K,
+        .vec_dot_q                = ggml_vec_dot_q2_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
+    [GGML_TYPE_Q3_K] = {
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q3_K,
+        .quantize_row_q           = quantize_row_q3_K,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference,
+        .quantize_row_q_dot       = quantize_row_q8_K,
+        .vec_dot_q                = ggml_vec_dot_q3_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
+    [GGML_TYPE_Q4_K] = {
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q4_K,
+        .quantize_row_q           = quantize_row_q4_K,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference,
+        .quantize_row_q_dot       = quantize_row_q8_K,
+        .vec_dot_q                = ggml_vec_dot_q4_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
+    [GGML_TYPE_Q5_K] = {
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q5_K,
+        .quantize_row_q           = quantize_row_q5_K,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference,
+        .quantize_row_q_dot       = quantize_row_q8_K,
+        .vec_dot_q                = ggml_vec_dot_q5_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
+    [GGML_TYPE_Q6_K] = {
+        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q6_K,
+        .quantize_row_q           = quantize_row_q6_K,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference,
+        .quantize_row_q_dot       = quantize_row_q8_K,
+        .vec_dot_q                = ggml_vec_dot_q6_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
+#endif
 };
 
 // For internal test use
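
The table this hunk extends is ggml's per-type dispatch: a mat-mul over, say, Q2_K data quantizes the other operand to `vec_dot_type` (Q8_K) and then calls `vec_dot_q` row by row. The shape of that dispatch, reduced to a compilable toy (all `toy_*` names are invented, not ggml internals):

```c
#include <stddef.h>

typedef void (*vec_dot_t)(int n, float * s, const void * x, const void * y);

enum { TOY_TYPE_Q2_K, TOY_TYPE_Q8_K, TOY_TYPE_COUNT };

typedef struct {
    vec_dot_t vec_dot_q;    // kernel, e.g. ggml_vec_dot_q2_K_q8_K in ggml
    int       vec_dot_type; // type the second operand must be quantized to
} toy_quantize_fns;

static void toy_dot_q2K_q8K(int n, float * s, const void * x, const void * y) {
    (void) n; (void) x; (void) y;
    *s = 0.0f; // stand-in for the real SIMD kernel
}

// designated-initializer table indexed by type id, exactly the pattern above
static const toy_quantize_fns toy_fns[TOY_TYPE_COUNT] = {
    [TOY_TYPE_Q2_K] = { .vec_dot_q = toy_dot_q2K_q8K, .vec_dot_type = TOY_TYPE_Q8_K },
};

int main(void) {
    float s;
    toy_fns[TOY_TYPE_Q2_K].vec_dot_q(256, &s, NULL, NULL); // run-time dispatch by type
    return (int) s;
}
```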
@@ -2290,7 +2352,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
 
         // Convert int32_t to float
-        __m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1));
+        __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
 
         // Apply the scale, and accumulate
         acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
@@ -2766,7 +2828,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         __m128i bxh = _mm256_extractf128_si256(bx, 1);
         bxl = _mm_or_si128(bxl, bxhil);
         bxh = _mm_or_si128(bxh, bxhih);
-        bx = _mm256_set_m128i(bxh, bxl);
+        bx = MM256_SET_M128I(bxh, bxl);
 
         const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
@@ -3022,7 +3084,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         __m128i bxh = _mm256_extractf128_si256(bx, 1);
         bxl = _mm_or_si128(bxl, bxhil);
         bxh = _mm_or_si128(bxh, bxhih);
-        bx = _mm256_set_m128i(bxh, bxl);
+        bx = MM256_SET_M128I(bxh, bxl);
 
         const __m256 dy = _mm256_set1_ps(y[i].d);
         const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
@@ -3444,11 +3506,19 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q5_1] = QK5_1,
     [GGML_TYPE_Q8_0] = QK8_0,
     [GGML_TYPE_Q8_1] = QK8_1,
+#ifdef GGML_USE_K_QUANTS
+    [GGML_TYPE_Q2_K] = QK_K,
+    [GGML_TYPE_Q3_K] = QK_K,
+    [GGML_TYPE_Q4_K] = QK_K,
+    [GGML_TYPE_Q5_K] = QK_K,
+    [GGML_TYPE_Q6_K] = QK_K,
+    [GGML_TYPE_Q8_K] = QK_K,
+#endif
     [GGML_TYPE_I8]  = 1,
    [GGML_TYPE_I16] = 1,
     [GGML_TYPE_I32] = 1,
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_BLCK_SIZE is outdated");
+static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
 
 static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32] = sizeof(float),
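
The assert value here (and in the three tables that follow) moves to 19, which is pure counting: six new K-quant ids appended to the existing type enum. A sketch of the arithmetic, assuming the pre-diff count really was 13 (the old value is truncated in the rendered diff):

```c
// 13 type ids existed before this release; Q2_K, Q3_K, Q4_K, Q5_K,
// Q6_K and Q8_K are appended, giving the 19 the new asserts pin down.
enum {
    TOY_NTYPES_BEFORE = 13,
    TOY_NTYPES_ADDED  = 6,
    TOY_NTYPES_AFTER  = TOY_NTYPES_BEFORE + TOY_NTYPES_ADDED,
};

_Static_assert(TOY_NTYPES_AFTER == 19, "matches the new GGML_TYPE_COUNT");

int main(void) { return 0; }
```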
@@ -3459,11 +3529,19 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
     [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
     [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
+#ifdef GGML_USE_K_QUANTS
+    [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
+    [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
+    [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
+    [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
+    [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
+    [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
+#endif
     [GGML_TYPE_I8]  = sizeof(int8_t),
     [GGML_TYPE_I16] = sizeof(int16_t),
     [GGML_TYPE_I32] = sizeof(int32_t),
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_SIZE is outdated");
+static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
 
 
 static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
@@ -3475,11 +3553,17 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q5_1] = "q5_1",
     [GGML_TYPE_Q8_0] = "q8_0",
     [GGML_TYPE_Q8_1] = "q8_1",
+    [GGML_TYPE_Q2_K] = "q2_K",
+    [GGML_TYPE_Q3_K] = "q3_K",
+    [GGML_TYPE_Q4_K] = "q4_K",
+    [GGML_TYPE_Q5_K] = "q5_K",
+    [GGML_TYPE_Q6_K] = "q6_K",
+    [GGML_TYPE_Q8_K] = "q8_K",
     [GGML_TYPE_I8]  = "i8",
     [GGML_TYPE_I16] = "i16",
     [GGML_TYPE_I32] = "i32",
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_NAME is outdated");
+static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
 
 static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32] = false,
@@ -3490,11 +3574,17 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q5_1] = true,
     [GGML_TYPE_Q8_0] = true,
     [GGML_TYPE_Q8_1] = true,
+    [GGML_TYPE_Q2_K] = true,
+    [GGML_TYPE_Q3_K] = true,
+    [GGML_TYPE_Q4_K] = true,
+    [GGML_TYPE_Q5_K] = true,
+    [GGML_TYPE_Q6_K] = true,
+    [GGML_TYPE_Q8_K] = true,
     [GGML_TYPE_I8]  = false,
     [GGML_TYPE_I16] = false,
     [GGML_TYPE_I32] = false,
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
+static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
 
 static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "NONE",
@@ -3631,6 +3721,7 @@ struct ggml_context {
     void * mem_buffer;
     bool   mem_buffer_owned;
     bool   no_alloc;
+    bool   no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
 
     int    n_objects;
 
@@ -3647,26 +3738,6 @@ struct ggml_context_container {
     struct ggml_context context;
 };
 
-//
-// compute types
-//
-
-enum ggml_task_type {
-    GGML_TASK_INIT = 0,
-    GGML_TASK_COMPUTE,
-    GGML_TASK_FINALIZE,
-};
-
-struct ggml_compute_params {
-    enum ggml_task_type type;
-
-    int ith, nth;
-
-    // work buffer for all threads
-    size_t wsize;
-    void * wdata;
-};
-
 //
 // ggml state
 //
@@ -3723,7 +3794,7 @@ int64_t ggml_nelements(const struct ggml_tensor * tensor) {
     return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
 }
 
-int ggml_nrows(const struct ggml_tensor * tensor) {
+int64_t ggml_nrows(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
@@ -3732,7 +3803,20 @@ int ggml_nrows(const struct ggml_tensor * tensor) {
 size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
-    return (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+    // this should handle cases where the tensor is not contiguous in memory
+    // probaby just:
+    //
+    //     return tensor->ne[3]*tensor->nb[3]
+    //
+    // is enough, but just in case, adding the second part
+
+    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+}
+
+size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
 }
 
 int ggml_blck_size(enum ggml_type type) {
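
A worked example of why `ggml_nbytes()` now takes a MAX (a standalone sketch that mirrors the f32 formula rather than calling ggml): for a strided view, the element-count formula under-reports the span of memory the strides actually cover.

```c
#include <stdint.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

// f32 tensor: type size 4 bytes, block size 1. A 4x2 view whose rows sit
// 64 bytes apart holds 8 elements (32 bytes) but spans 128 bytes of the
// parent buffer; ne[3]*nb[3] captures that, the old formula did not.
int main(void) {
    const int64_t ne[4] = { 4, 2, 1, 1 };       // elements per dimension
    const size_t  nb[4] = { 4, 64, 128, 128 };  // byte strides per dimension
    const size_t by_elements = (size_t)(ne[0]*ne[1]*ne[2]*ne[3]) * 4;
    const size_t by_strides  = (size_t) ne[3] * nb[3];
    printf("elements: %zu bytes, strides: %zu bytes, nbytes: %zu\n",
           by_elements, by_strides, MAX(by_elements, by_strides)); // 32, 128, 128
    return 0;
}
```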
@@ -3801,6 +3885,11 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
         case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
         case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
+        case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break;
+        case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
+        case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
+        case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break;
+        case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
         case GGML_FTYPE_UNKNOWN:     wtype = GGML_TYPE_COUNT; break;
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
     }
@@ -3814,11 +3903,11 @@ size_t ggml_tensor_overhead(void) {
     return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
 }
 
-static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
 
-static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -3967,6 +4056,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
         /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
         /*.no_alloc         =*/ params.no_alloc,
+        /*.no_alloc_save    =*/ params.no_alloc,
         /*.n_objects        =*/ 0,
         /*.objects_begin    =*/ NULL,
         /*.objects_end      =*/ NULL,
@@ -4044,11 +4134,18 @@ size_t ggml_get_mem_size(struct ggml_context * ctx) {
 // operators when using scratch buffers
 // TODO: implement a better way
 void ggml_scratch_save(struct ggml_context * ctx) {
+    // this is needed to allow opt tensors to store their data
+    // TODO: again, need to find a better way
+    ctx->no_alloc_save = ctx->no_alloc;
+    ctx->no_alloc      = false;
+
     ctx->scratch_save = ctx->scratch;
     ctx->scratch.data = NULL;
 }
 
 void ggml_scratch_load(struct ggml_context * ctx) {
+    ctx->no_alloc = ctx->no_alloc_save;
+
     ctx->scratch = ctx->scratch_save;
 }
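
The save/load pair now also toggles `no_alloc`, so the tiny bookkeeping tensors allocated between the two calls (see the view functions further down) always get real backing memory even in a no-alloc context. The pairing in miniature (a toy struct, not ggml's context):

```c
#include <stdbool.h>
#include <stddef.h>

struct toy_ctx {
    bool   no_alloc, no_alloc_save;
    void * scratch,  * scratch_save;
};

static void toy_scratch_save(struct toy_ctx * ctx) {
    ctx->no_alloc_save = ctx->no_alloc;
    ctx->no_alloc      = false;        // next allocation really allocates...
    ctx->scratch_save  = ctx->scratch;
    ctx->scratch       = NULL;         // ...and bypasses the scratch buffer
}

static void toy_scratch_load(struct toy_ctx * ctx) {
    ctx->no_alloc = ctx->no_alloc_save; // restore whatever the caller had set
    ctx->scratch  = ctx->scratch_save;
}

int main(void) {
    struct toy_ctx ctx = { .no_alloc = true };
    toy_scratch_save(&ctx);
    // ... allocate a small bookkeeping tensor here ...
    toy_scratch_load(&ctx);
    return ctx.no_alloc ? 0 : 1;        // 0: the original mode came back
}
```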
@@ -4157,6 +4254,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_time_us =*/ 0,
         /*.data         =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
         /*.name         =*/ { 0 },
+        /*.extra        =*/ NULL,
         /*.pad          =*/ { 0 },
     };
 
@@ -5802,10 +5900,18 @@ struct ggml_tensor * ggml_view_1d(
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
 
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+    ggml_scratch_load(ctx);
+
     result->op   = GGML_OP_VIEW;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
+    result->opt[0] = offs;
 
     if (is_node) {
         memcpy(result->padding, &offset, sizeof(offset));
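
The new `offs` tensor records the view's byte offset inside the graph itself (via `opt[0]`), which is what lets `ggml_graph_import` near the end of this file rebuild views; the offset travels as two int32 slots. A round-trip sketch of that packing, assuming a 64-bit `size_t`:

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void) {
    const size_t offset = 0x12345678abcULL;  // byte offset of some view

    // pack: what ggml_view_1d now writes into the 2-element I32 tensor
    int32_t data[2];
    memcpy(data, &offset, 2*sizeof(int32_t));

    // unpack: what ggml_graph_import does for GGML_OP_VIEW
    uint64_t offs;
    memcpy(&offs, data, sizeof(offs));

    assert(offs == offset);  // exact round trip on 64-bit size_t platforms
    return 0;
}
```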
@@ -5834,6 +5940,13 @@ struct ggml_tensor * ggml_view_2d(
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
 
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+    ggml_scratch_load(ctx);
+
     result->nb[1] = nb1;
     result->nb[2] = result->nb[1]*ne1;
     result->nb[3] = result->nb[2];
@@ -5842,6 +5955,7 @@ struct ggml_tensor * ggml_view_2d(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
+    result->opt[0] = offs;
 
     if (is_node) {
         memcpy(result->padding, &offset, sizeof(offset));
@@ -5872,6 +5986,13 @@ struct ggml_tensor * ggml_view_3d(
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
 
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+    ggml_scratch_load(ctx);
+
     result->nb[1] = nb1;
     result->nb[2] = nb2;
     result->nb[3] = result->nb[2]*ne2;
@@ -5880,6 +6001,7 @@ struct ggml_tensor * ggml_view_3d(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
+    result->opt[0] = offs;
 
     if (is_node) {
         memcpy(result->padding, &offset, sizeof(offset));
@@ -5912,6 +6034,13 @@ struct ggml_tensor * ggml_view_4d(
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
 
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+    ggml_scratch_load(ctx);
+
     result->nb[1] = nb1;
     result->nb[2] = nb2;
     result->nb[3] = nb3;
@@ -5920,6 +6049,7 @@ struct ggml_tensor * ggml_view_4d(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
+    result->opt[0] = offs;
 
     if (is_node) {
         memcpy(result->padding, &offset, sizeof(offset));
@@ -7584,6 +7714,11 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
             {
                 ggml_compute_forward_add_q_f32(params, src0, src1, dst);
             } break;
@@ -7887,6 +8022,11 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
             {
                 ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
             } break;
@@ -8009,6 +8149,11 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
         default:
             {
                 GGML_ASSERT(false);
@@ -8127,10 +8272,10 @@ static void ggml_compute_forward_mul_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-#ifdef GGML_USE_CUBLAS
-    if (src1->backend == GGML_BACKEND_CUDA) {
+#ifdef GGML_USE_CLBLAST
+    if (src1->backend == GGML_BACKEND_GPU) {
         if (ith == 0) {
-            ggml_cuda_mul(src0, src1, dst);
+            ggml_cl_mul(src0, src1, dst);
         }
         return;
     }
@@ -9245,7 +9390,7 @@ static void ggml_compute_forward_rms_norm_f32(
         sum += (ggml_float)(x[i00] * x[i00]);
     }
 
-    float mean = sum/ne00;
+    const float mean = sum/ne00;
 
     float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
 
@@ -9568,14 +9713,7 @@ static void ggml_compute_forward_mul_mat_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS)
-    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9740,14 +9878,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS)
-    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9952,14 +10083,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS)
-    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -10102,6 +10226,11 @@ static void ggml_compute_forward_mul_mat(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
             {
                 ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
             } break;
@@ -10285,6 +10414,11 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
         default:
             {
                 GGML_ASSERT(false);
@@ -10450,6 +10584,11 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
             {
                 ggml_compute_forward_get_rows_q(params, src0, src1, dst);
             } break;
|
@@ -10996,6 +11135,12 @@ static void ggml_compute_forward_alibi(
|
|
10996
11135
|
case GGML_TYPE_Q5_1:
|
10997
11136
|
case GGML_TYPE_Q8_0:
|
10998
11137
|
case GGML_TYPE_Q8_1:
|
11138
|
+
case GGML_TYPE_Q2_K:
|
11139
|
+
case GGML_TYPE_Q3_K:
|
11140
|
+
case GGML_TYPE_Q4_K:
|
11141
|
+
case GGML_TYPE_Q5_K:
|
11142
|
+
case GGML_TYPE_Q6_K:
|
11143
|
+
case GGML_TYPE_Q8_K:
|
10999
11144
|
case GGML_TYPE_I8:
|
11000
11145
|
case GGML_TYPE_I16:
|
11001
11146
|
case GGML_TYPE_I32:
|
@@ -11067,6 +11212,12 @@ static void ggml_compute_forward_clamp(
|
|
11067
11212
|
case GGML_TYPE_Q5_1:
|
11068
11213
|
case GGML_TYPE_Q8_0:
|
11069
11214
|
case GGML_TYPE_Q8_1:
|
11215
|
+
case GGML_TYPE_Q2_K:
|
11216
|
+
case GGML_TYPE_Q3_K:
|
11217
|
+
case GGML_TYPE_Q4_K:
|
11218
|
+
case GGML_TYPE_Q5_K:
|
11219
|
+
case GGML_TYPE_Q6_K:
|
11220
|
+
case GGML_TYPE_Q8_K:
|
11070
11221
|
case GGML_TYPE_I8:
|
11071
11222
|
case GGML_TYPE_I16:
|
11072
11223
|
case GGML_TYPE_I32:
|
@@ -11156,7 +11307,7 @@ static void ggml_compute_forward_rope_f32(
             theta *= theta_scale;
 
             const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-            float * dst_data = (float *)((char *) dst->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+            float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
             const float x0 = src[0];
             const float x1 = src[1];
@@ -11177,7 +11328,7 @@ static void ggml_compute_forward_rope_f32(
             const int64_t i0 = ib*n_dims + ic/2;
 
             const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-            float * dst_data = (float *)((char *) dst->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+            float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
             const float x0 = src[0];
             const float x1 = src[n_dims/2];
@@ -12885,6 +13036,15 @@ static void ggml_compute_forward_map_binary(
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(params);
 
+#ifdef GGML_USE_CUBLAS
+    bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
+    if (skip_cpu) {
+        return;
+    }
+    GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
+    GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
+#endif // GGML_USE_CUBLAS
+
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
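
This is the new single choke point for CUDA offload: one call gets the first chance at every op, and only ops it declines fall through to the CPU switch below. The control flow as a stub (toy names, no CUDA dependency):

```c
#include <stdbool.h>

enum toy_backend { TOY_BACKEND_CPU, TOY_BACKEND_GPU };
struct toy_tensor { enum toy_backend backend; };

// stand-in for ggml_cuda_compute_forward(): true = op handled on the GPU
static bool toy_gpu_compute_forward(const struct toy_tensor * t) {
    return t->backend == TOY_BACKEND_GPU;
}

static const char * toy_compute_forward(const struct toy_tensor * t) {
    if (toy_gpu_compute_forward(t)) {
        return "gpu";   // skip_cpu: the big CPU op switch never runs
    }
    // in ggml, asserts here enforce that the inputs really are CPU-resident
    return "cpu";       // ... fall through to the per-op switch ...
}

int main(void) {
    const struct toy_tensor t = { TOY_BACKEND_GPU };
    return toy_compute_forward(&t)[0] == 'g' ? 0 : 1;
}
```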
@@ -14191,7 +14351,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
                             node->n_tasks = 1; // TODO: this actually is doing nothing
                                                //       the threads are still spinning
-                            cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
                         }
                         else
 #elif defined(GGML_USE_CLBLAST)
@@ -14581,7 +14740,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
     const int64_t * ne = tensor->ne;
     const size_t  * nb = tensor->nb;
 
-    fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %32s\n",
+    fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
             ggml_type_name(tensor->type),
             ggml_op_name (tensor->op),
             tensor->n_dims,
@@ -14595,7 +14754,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
     const int64_t * ne = tensor->ne;
     const size_t  * nb = tensor->nb;
 
-    fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %32s\n",
+    fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n",
             arg,
             ggml_type_name(tensor->type),
             ggml_op_name (tensor->op),
@@ -14608,8 +14767,8 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
 }
 
 void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
-    assert(cgraph->work      == NULL);
-    assert(cgraph->work_size == 0);
+    //assert(cgraph->work      == NULL);
+    //assert(cgraph->work_size == 0);
 
     uint64_t size_eval = 0;
 
@@ -14624,11 +14783,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
         FILE * fout = stdout;
 
         fprintf(fout, "\n");
-        fprintf(fout, "%-16s %8x\n",   "magic",   GGML_FILE_MAGIC);
-        fprintf(fout, "%-16s %8d\n",   "version", GGML_FILE_VERSION);
-        fprintf(fout, "%-16s %8d\n",   "leafs",   cgraph->n_leafs);
-        fprintf(fout, "%-16s %8d\n",   "nodes",   cgraph->n_nodes);
-        fprintf(fout, "%-16s %8llu\n", "eval",    size_eval);
+        fprintf(fout, "%-16s %8x\n", "magic",   GGML_FILE_MAGIC);
+        fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
+        fprintf(fout, "%-16s %8d\n", "leafs",   cgraph->n_leafs);
+        fprintf(fout, "%-16s %8d\n", "nodes",   cgraph->n_nodes);
+        fprintf(fout, "%-16s %" PRIu64 "\n", "eval", size_eval);
 
         // header
         fprintf(fout, "\n");
@@ -14830,7 +14989,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
     // read file into data
     {
         FILE * fin = fopen(fname, "rb");
-
         if (!fin) {
             fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
             return result;
@@ -14862,7 +15020,11 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
 
         data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
 
-        fread(data->data, sizeof(char), fsize, fin);
+        const size_t ret = fread(data->data, sizeof(char), fsize, fin);
+        if (ret != fsize) {
+            fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
+            return result;
+        }
 
         fclose(fin);
     }
@@ -14970,6 +15132,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
             op     = *(const uint32_t *) ptr; ptr += sizeof(op);
             n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
 
+            enum ggml_op eop = (enum ggml_op) op;
+
             int64_t ne[GGML_MAX_DIMS];
             size_t  nb[GGML_MAX_DIMS];
 
@@ -14984,42 +15148,77 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
                 nb[j] = nb_cur;
             }
 
-            // TODO: not yet used
-            uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+            uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used
 
-            const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
+            const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
 
-            const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += 2*sizeof(int32_t);
+            const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t);
 
-            struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
-
-            tensor->op = (enum ggml_op) op;
-
-            memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
-
-            for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                tensor->nb[j] = nb[j];
-            }
+            struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
 
             // parse args
-            {
-                struct ggml_tensor ** args[2] = {
-                    &tensor->src0,
-                    &tensor->src1,
-                };
+            for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+                const int32_t arg_idx = ptr_arg_idx[j];
 
-                for (int j = 0; j < 2; ++j) {
-                    const int32_t arg_idx = ptr_arg_idx[j];
+                if (arg_idx == -1) {
+                    continue;
                 }
 
-                if (arg_idx < GGML_MAX_NODES) {
-                    *args[j] = result.leafs[arg_idx];
-                } else {
-                    *args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
-                }
-            }
+                if (arg_idx < GGML_MAX_NODES) {
+                    args[j] = result.leafs[arg_idx];
+                } else {
+                    args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
+                }
+            }
 
-            result.nodes[i] = tensor;
+            // create the tensor
+            // "view" operations are handled differently
+            // TODO: handle inplace ops - currently a copy is always made
+
+            struct ggml_tensor * tensor = NULL;
+
+            switch (eop) {
+                // TODO: implement other view ops
+                case GGML_OP_RESHAPE:
+                    {
+                        tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]);
+                    } break;
+                case GGML_OP_VIEW:
+                    {
+                        tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
+
+                        uint64_t offs;
+                        memcpy(&offs, args[2]->data, sizeof(offs));
+
+                        tensor->data = ((char *) tensor->data) + offs;
+                    } break;
+                case GGML_OP_TRANSPOSE:
+                    {
+                        tensor = ggml_transpose(*ctx_eval, args[0]);
+                    } break;
+                case GGML_OP_PERMUTE:
+                    {
+                        tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
+                    } break;
+                default:
+                    {
+                        tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+                        tensor->op = eop;
+                    } break;
+            }
+
+            memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
+
+            for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                tensor->nb[j] = nb[j];
+            }
+
+            tensor->src0 = args[0];
+            tensor->src1 = args[1];
+
+            for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                tensor->opt[j] = args[2 + j];
+            }
 
             result.nodes[i] = tensor;
@@ -16070,6 +16269,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                 block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
                 result = ggml_quantize_q8_0(src + start, block, n, n, hist);
             } break;
+#ifdef GGML_USE_K_QUANTS
+        case GGML_TYPE_Q2_K:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_q2_K * block = (block_q2_K*)dst + start / QK_K;
+                result = ggml_quantize_q2_K(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q3_K:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_q3_K * block = (block_q3_K*)dst + start / QK_K;
+                result = ggml_quantize_q3_K(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q4_K:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_q4_K * block = (block_q4_K*)dst + start / QK_K;
+                result = ggml_quantize_q4_K(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q5_K:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_q5_K * block = (block_q5_K*)dst + start / QK_K;
+                result = ggml_quantize_q5_K(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q6_K:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_q6_K * block = (block_q6_K*)dst + start / QK_K;
+                result = ggml_quantize_q6_K(src + start, block, n, n, hist);
+            } break;
+#endif
         default:
             assert(false);
     }