llama_cpp 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +29 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +235 -39
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +340 -109
- data/ext/llama_cpp/src/ggml.h +44 -6
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +484 -136
- data/ext/llama_cpp/src/llama.h +39 -8
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -3,6 +3,10 @@
|
|
3
3
|
|
4
4
|
#include "ggml.h"
|
5
5
|
|
6
|
+
#ifdef GGML_USE_K_QUANTS
|
7
|
+
#include "k_quants.h"
|
8
|
+
#endif
|
9
|
+
|
6
10
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
7
11
|
#include <malloc.h> // using malloc.h with MSC/MINGW
|
8
12
|
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
|
@@ -21,6 +25,10 @@
|
|
21
25
|
#include <float.h>
|
22
26
|
#include <limits.h>
|
23
27
|
|
28
|
+
#ifdef GGML_USE_METAL
|
29
|
+
#include <unistd.h>
|
30
|
+
#endif
|
31
|
+
|
24
32
|
// if C99 - static_assert is noop
|
25
33
|
// ref: https://stackoverflow.com/a/53923785/4039976
|
26
34
|
#ifndef static_assert
|
@@ -121,7 +129,11 @@ typedef void* thread_ret_t;
|
|
121
129
|
#else
|
122
130
|
inline static void* ggml_aligned_malloc(size_t size) {
|
123
131
|
void* aligned_memory = NULL;
|
132
|
+
#ifdef GGML_USE_METAL
|
133
|
+
int result = posix_memalign(&aligned_memory, getpagesize(), size);
|
134
|
+
#else
|
124
135
|
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
|
136
|
+
#endif
|
125
137
|
if (result != 0) {
|
126
138
|
// Handle allocation failure
|
127
139
|
return NULL;
|
@@ -403,21 +415,27 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
|
|
403
415
|
//
|
404
416
|
|
405
417
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
406
|
-
static int64_t timer_freq;
|
418
|
+
static int64_t timer_freq, timer_start;
|
407
419
|
void ggml_time_init(void) {
|
408
|
-
LARGE_INTEGER
|
409
|
-
QueryPerformanceFrequency(&
|
410
|
-
timer_freq =
|
420
|
+
LARGE_INTEGER t;
|
421
|
+
QueryPerformanceFrequency(&t);
|
422
|
+
timer_freq = t.QuadPart;
|
423
|
+
|
424
|
+
// The multiplication by 1000 or 1000000 below can cause an overflow if timer_freq
|
425
|
+
// and the uptime is high enough.
|
426
|
+
// We subtract the program start time to reduce the likelihood of that happening.
|
427
|
+
QueryPerformanceCounter(&t);
|
428
|
+
timer_start = t.QuadPart;
|
411
429
|
}
|
412
430
|
int64_t ggml_time_ms(void) {
|
413
431
|
LARGE_INTEGER t;
|
414
432
|
QueryPerformanceCounter(&t);
|
415
|
-
return (t.QuadPart * 1000) / timer_freq;
|
433
|
+
return ((t.QuadPart-timer_start) * 1000) / timer_freq;
|
416
434
|
}
|
417
435
|
int64_t ggml_time_us(void) {
|
418
436
|
LARGE_INTEGER t;
|
419
437
|
QueryPerformanceCounter(&t);
|
420
|
-
return (t.QuadPart * 1000000) / timer_freq;
|
438
|
+
return ((t.QuadPart-timer_start) * 1000000) / timer_freq;
|
421
439
|
}
|
422
440
|
#else
|
423
441
|
void ggml_time_init(void) {}
|
@@ -474,6 +492,8 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
|
474
492
|
// quantization
|
475
493
|
//
|
476
494
|
|
495
|
+
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
|
496
|
+
|
477
497
|
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
478
498
|
// multiply int8_t, add results pairwise twice
|
479
499
|
static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
|
@@ -533,7 +553,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
|
|
533
553
|
static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
|
534
554
|
{
|
535
555
|
const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
|
536
|
-
const __m256i bytes =
|
556
|
+
const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
|
537
557
|
const __m256i lowMask = _mm256_set1_epi8( 0xF );
|
538
558
|
return _mm256_and_si256(lowMask, bytes);
|
539
559
|
}
|
@@ -606,7 +626,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
|
|
606
626
|
bytesh = _mm_or_si128(bytesh, bit_mask);
|
607
627
|
bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
|
608
628
|
bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
|
609
|
-
return
|
629
|
+
return MM256_SET_M128I(bytesh, bytesl);
|
610
630
|
}
|
611
631
|
|
612
632
|
// Unpack 32 4-bit fields into 32 bytes
|
@@ -619,7 +639,7 @@ static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
|
|
619
639
|
const __m128i lowMask = _mm_set1_epi8(0xF);
|
620
640
|
tmpl = _mm_and_si128(lowMask, tmpl);
|
621
641
|
tmph = _mm_and_si128(lowMask, tmph);
|
622
|
-
return
|
642
|
+
return MM256_SET_M128I(tmph, tmpl);
|
623
643
|
}
|
624
644
|
|
625
645
|
// add int16_t pairwise and return as float vector
|
@@ -627,7 +647,7 @@ static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
|
|
627
647
|
const __m128i ones = _mm_set1_epi16(1);
|
628
648
|
const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
|
629
649
|
const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
|
630
|
-
const __m256i summed_pairs =
|
650
|
+
const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
|
631
651
|
return _mm256_cvtepi32_ps(summed_pairs);
|
632
652
|
}
|
633
653
|
|
@@ -1565,6 +1585,48 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
|
|
1565
1585
|
.vec_dot_q = NULL, // TODO
|
1566
1586
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
1567
1587
|
},
|
1588
|
+
#ifdef GGML_USE_K_QUANTS
|
1589
|
+
[GGML_TYPE_Q2_K] = {
|
1590
|
+
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_K,
|
1591
|
+
.quantize_row_q = quantize_row_q2_K,
|
1592
|
+
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference,
|
1593
|
+
.quantize_row_q_dot = quantize_row_q8_K,
|
1594
|
+
.vec_dot_q = ggml_vec_dot_q2_K_q8_K,
|
1595
|
+
.vec_dot_type = GGML_TYPE_Q8_K,
|
1596
|
+
},
|
1597
|
+
[GGML_TYPE_Q3_K] = {
|
1598
|
+
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K,
|
1599
|
+
.quantize_row_q = quantize_row_q3_K,
|
1600
|
+
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference,
|
1601
|
+
.quantize_row_q_dot = quantize_row_q8_K,
|
1602
|
+
.vec_dot_q = ggml_vec_dot_q3_K_q8_K,
|
1603
|
+
.vec_dot_type = GGML_TYPE_Q8_K,
|
1604
|
+
},
|
1605
|
+
[GGML_TYPE_Q4_K] = {
|
1606
|
+
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_K,
|
1607
|
+
.quantize_row_q = quantize_row_q4_K,
|
1608
|
+
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference,
|
1609
|
+
.quantize_row_q_dot = quantize_row_q8_K,
|
1610
|
+
.vec_dot_q = ggml_vec_dot_q4_K_q8_K,
|
1611
|
+
.vec_dot_type = GGML_TYPE_Q8_K,
|
1612
|
+
},
|
1613
|
+
[GGML_TYPE_Q5_K] = {
|
1614
|
+
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_K,
|
1615
|
+
.quantize_row_q = quantize_row_q5_K,
|
1616
|
+
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference,
|
1617
|
+
.quantize_row_q_dot = quantize_row_q8_K,
|
1618
|
+
.vec_dot_q = ggml_vec_dot_q5_K_q8_K,
|
1619
|
+
.vec_dot_type = GGML_TYPE_Q8_K,
|
1620
|
+
},
|
1621
|
+
[GGML_TYPE_Q6_K] = {
|
1622
|
+
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K,
|
1623
|
+
.quantize_row_q = quantize_row_q6_K,
|
1624
|
+
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference,
|
1625
|
+
.quantize_row_q_dot = quantize_row_q8_K,
|
1626
|
+
.vec_dot_q = ggml_vec_dot_q6_K_q8_K,
|
1627
|
+
.vec_dot_type = GGML_TYPE_Q8_K,
|
1628
|
+
},
|
1629
|
+
#endif
|
1568
1630
|
};
|
1569
1631
|
|
1570
1632
|
// For internal test use
|
@@ -2290,7 +2352,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
2290
2352
|
const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
|
2291
2353
|
|
2292
2354
|
// Convert int32_t to float
|
2293
|
-
__m256 p = _mm256_cvtepi32_ps(
|
2355
|
+
__m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
|
2294
2356
|
|
2295
2357
|
// Apply the scale, and accumulate
|
2296
2358
|
acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
|
@@ -2766,7 +2828,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|
2766
2828
|
__m128i bxh = _mm256_extractf128_si256(bx, 1);
|
2767
2829
|
bxl = _mm_or_si128(bxl, bxhil);
|
2768
2830
|
bxh = _mm_or_si128(bxh, bxhih);
|
2769
|
-
bx =
|
2831
|
+
bx = MM256_SET_M128I(bxh, bxl);
|
2770
2832
|
|
2771
2833
|
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
2772
2834
|
|
@@ -3022,7 +3084,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|
3022
3084
|
__m128i bxh = _mm256_extractf128_si256(bx, 1);
|
3023
3085
|
bxl = _mm_or_si128(bxl, bxhil);
|
3024
3086
|
bxh = _mm_or_si128(bxh, bxhih);
|
3025
|
-
bx =
|
3087
|
+
bx = MM256_SET_M128I(bxh, bxl);
|
3026
3088
|
|
3027
3089
|
const __m256 dy = _mm256_set1_ps(y[i].d);
|
3028
3090
|
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
@@ -3444,11 +3506,19 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
|
|
3444
3506
|
[GGML_TYPE_Q5_1] = QK5_1,
|
3445
3507
|
[GGML_TYPE_Q8_0] = QK8_0,
|
3446
3508
|
[GGML_TYPE_Q8_1] = QK8_1,
|
3509
|
+
#ifdef GGML_USE_K_QUANTS
|
3510
|
+
[GGML_TYPE_Q2_K] = QK_K,
|
3511
|
+
[GGML_TYPE_Q3_K] = QK_K,
|
3512
|
+
[GGML_TYPE_Q4_K] = QK_K,
|
3513
|
+
[GGML_TYPE_Q5_K] = QK_K,
|
3514
|
+
[GGML_TYPE_Q6_K] = QK_K,
|
3515
|
+
[GGML_TYPE_Q8_K] = QK_K,
|
3516
|
+
#endif
|
3447
3517
|
[GGML_TYPE_I8] = 1,
|
3448
3518
|
[GGML_TYPE_I16] = 1,
|
3449
3519
|
[GGML_TYPE_I32] = 1,
|
3450
3520
|
};
|
3451
|
-
static_assert(GGML_TYPE_COUNT ==
|
3521
|
+
static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
|
3452
3522
|
|
3453
3523
|
static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
|
3454
3524
|
[GGML_TYPE_F32] = sizeof(float),
|
@@ -3459,11 +3529,19 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
|
|
3459
3529
|
[GGML_TYPE_Q5_1] = sizeof(block_q5_1),
|
3460
3530
|
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
|
3461
3531
|
[GGML_TYPE_Q8_1] = sizeof(block_q8_1),
|
3532
|
+
#ifdef GGML_USE_K_QUANTS
|
3533
|
+
[GGML_TYPE_Q2_K] = sizeof(block_q2_K),
|
3534
|
+
[GGML_TYPE_Q3_K] = sizeof(block_q3_K),
|
3535
|
+
[GGML_TYPE_Q4_K] = sizeof(block_q4_K),
|
3536
|
+
[GGML_TYPE_Q5_K] = sizeof(block_q5_K),
|
3537
|
+
[GGML_TYPE_Q6_K] = sizeof(block_q6_K),
|
3538
|
+
[GGML_TYPE_Q8_K] = sizeof(block_q8_K),
|
3539
|
+
#endif
|
3462
3540
|
[GGML_TYPE_I8] = sizeof(int8_t),
|
3463
3541
|
[GGML_TYPE_I16] = sizeof(int16_t),
|
3464
3542
|
[GGML_TYPE_I32] = sizeof(int32_t),
|
3465
3543
|
};
|
3466
|
-
static_assert(GGML_TYPE_COUNT ==
|
3544
|
+
static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
|
3467
3545
|
|
3468
3546
|
|
3469
3547
|
static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
|
@@ -3475,11 +3553,17 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
|
|
3475
3553
|
[GGML_TYPE_Q5_1] = "q5_1",
|
3476
3554
|
[GGML_TYPE_Q8_0] = "q8_0",
|
3477
3555
|
[GGML_TYPE_Q8_1] = "q8_1",
|
3556
|
+
[GGML_TYPE_Q2_K] = "q2_K",
|
3557
|
+
[GGML_TYPE_Q3_K] = "q3_K",
|
3558
|
+
[GGML_TYPE_Q4_K] = "q4_K",
|
3559
|
+
[GGML_TYPE_Q5_K] = "q5_K",
|
3560
|
+
[GGML_TYPE_Q6_K] = "q6_K",
|
3561
|
+
[GGML_TYPE_Q8_K] = "q8_K",
|
3478
3562
|
[GGML_TYPE_I8] = "i8",
|
3479
3563
|
[GGML_TYPE_I16] = "i16",
|
3480
3564
|
[GGML_TYPE_I32] = "i32",
|
3481
3565
|
};
|
3482
|
-
static_assert(GGML_TYPE_COUNT ==
|
3566
|
+
static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
|
3483
3567
|
|
3484
3568
|
static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
|
3485
3569
|
[GGML_TYPE_F32] = false,
|
@@ -3490,11 +3574,17 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
|
|
3490
3574
|
[GGML_TYPE_Q5_1] = true,
|
3491
3575
|
[GGML_TYPE_Q8_0] = true,
|
3492
3576
|
[GGML_TYPE_Q8_1] = true,
|
3577
|
+
[GGML_TYPE_Q2_K] = true,
|
3578
|
+
[GGML_TYPE_Q3_K] = true,
|
3579
|
+
[GGML_TYPE_Q4_K] = true,
|
3580
|
+
[GGML_TYPE_Q5_K] = true,
|
3581
|
+
[GGML_TYPE_Q6_K] = true,
|
3582
|
+
[GGML_TYPE_Q8_K] = true,
|
3493
3583
|
[GGML_TYPE_I8] = false,
|
3494
3584
|
[GGML_TYPE_I16] = false,
|
3495
3585
|
[GGML_TYPE_I32] = false,
|
3496
3586
|
};
|
3497
|
-
static_assert(GGML_TYPE_COUNT ==
|
3587
|
+
static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
|
3498
3588
|
|
3499
3589
|
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
3500
3590
|
"NONE",
|
@@ -3631,6 +3721,7 @@ struct ggml_context {
|
|
3631
3721
|
void * mem_buffer;
|
3632
3722
|
bool mem_buffer_owned;
|
3633
3723
|
bool no_alloc;
|
3724
|
+
bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
|
3634
3725
|
|
3635
3726
|
int n_objects;
|
3636
3727
|
|
@@ -3647,26 +3738,6 @@ struct ggml_context_container {
|
|
3647
3738
|
struct ggml_context context;
|
3648
3739
|
};
|
3649
3740
|
|
3650
|
-
//
|
3651
|
-
// compute types
|
3652
|
-
//
|
3653
|
-
|
3654
|
-
enum ggml_task_type {
|
3655
|
-
GGML_TASK_INIT = 0,
|
3656
|
-
GGML_TASK_COMPUTE,
|
3657
|
-
GGML_TASK_FINALIZE,
|
3658
|
-
};
|
3659
|
-
|
3660
|
-
struct ggml_compute_params {
|
3661
|
-
enum ggml_task_type type;
|
3662
|
-
|
3663
|
-
int ith, nth;
|
3664
|
-
|
3665
|
-
// work buffer for all threads
|
3666
|
-
size_t wsize;
|
3667
|
-
void * wdata;
|
3668
|
-
};
|
3669
|
-
|
3670
3741
|
//
|
3671
3742
|
// ggml state
|
3672
3743
|
//
|
@@ -3723,7 +3794,7 @@ int64_t ggml_nelements(const struct ggml_tensor * tensor) {
|
|
3723
3794
|
return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
3724
3795
|
}
|
3725
3796
|
|
3726
|
-
|
3797
|
+
int64_t ggml_nrows(const struct ggml_tensor * tensor) {
|
3727
3798
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
3728
3799
|
|
3729
3800
|
return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
@@ -3732,7 +3803,20 @@ int ggml_nrows(const struct ggml_tensor * tensor) {
|
|
3732
3803
|
size_t ggml_nbytes(const struct ggml_tensor * tensor) {
|
3733
3804
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
3734
3805
|
|
3735
|
-
|
3806
|
+
// this should handle cases where the tensor is not contiguous in memory
|
3807
|
+
// probaby just:
|
3808
|
+
//
|
3809
|
+
// return tensor->ne[3]*tensor->nb[3]
|
3810
|
+
//
|
3811
|
+
// is enough, but just in case, adding the second part
|
3812
|
+
|
3813
|
+
return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
|
3814
|
+
}
|
3815
|
+
|
3816
|
+
size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
|
3817
|
+
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
3818
|
+
|
3819
|
+
return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
|
3736
3820
|
}
|
3737
3821
|
|
3738
3822
|
int ggml_blck_size(enum ggml_type type) {
|
@@ -3801,6 +3885,11 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
|
3801
3885
|
case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
|
3802
3886
|
case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
|
3803
3887
|
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
|
3888
|
+
case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break;
|
3889
|
+
case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
|
3890
|
+
case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
|
3891
|
+
case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break;
|
3892
|
+
case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
|
3804
3893
|
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
3805
3894
|
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
3806
3895
|
}
|
@@ -3814,11 +3903,11 @@ size_t ggml_tensor_overhead(void) {
|
|
3814
3903
|
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
|
3815
3904
|
}
|
3816
3905
|
|
3817
|
-
|
3906
|
+
bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
3818
3907
|
return tensor->nb[0] > tensor->nb[1];
|
3819
3908
|
}
|
3820
3909
|
|
3821
|
-
|
3910
|
+
bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
3822
3911
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
3823
3912
|
|
3824
3913
|
return
|
@@ -3967,6 +4056,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
3967
4056
|
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
|
3968
4057
|
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
|
3969
4058
|
/*.no_alloc =*/ params.no_alloc,
|
4059
|
+
/*.no_alloc_save =*/ params.no_alloc,
|
3970
4060
|
/*.n_objects =*/ 0,
|
3971
4061
|
/*.objects_begin =*/ NULL,
|
3972
4062
|
/*.objects_end =*/ NULL,
|
@@ -4044,11 +4134,18 @@ size_t ggml_get_mem_size(struct ggml_context * ctx) {
|
|
4044
4134
|
// operators when using scratch buffers
|
4045
4135
|
// TODO: implement a better way
|
4046
4136
|
void ggml_scratch_save(struct ggml_context * ctx) {
|
4137
|
+
// this is needed to allow opt tensors to store their data
|
4138
|
+
// TODO: again, need to find a better way
|
4139
|
+
ctx->no_alloc_save = ctx->no_alloc;
|
4140
|
+
ctx->no_alloc = false;
|
4141
|
+
|
4047
4142
|
ctx->scratch_save = ctx->scratch;
|
4048
4143
|
ctx->scratch.data = NULL;
|
4049
4144
|
}
|
4050
4145
|
|
4051
4146
|
void ggml_scratch_load(struct ggml_context * ctx) {
|
4147
|
+
ctx->no_alloc = ctx->no_alloc_save;
|
4148
|
+
|
4052
4149
|
ctx->scratch = ctx->scratch_save;
|
4053
4150
|
}
|
4054
4151
|
|
@@ -4157,6 +4254,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
|
|
4157
4254
|
/*.perf_time_us =*/ 0,
|
4158
4255
|
/*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
|
4159
4256
|
/*.name =*/ { 0 },
|
4257
|
+
/*.extra =*/ NULL,
|
4160
4258
|
/*.pad =*/ { 0 },
|
4161
4259
|
};
|
4162
4260
|
|
@@ -5802,10 +5900,18 @@ struct ggml_tensor * ggml_view_1d(
|
|
5802
5900
|
|
5803
5901
|
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
|
5804
5902
|
|
5903
|
+
ggml_scratch_save(ctx);
|
5904
|
+
|
5905
|
+
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
5906
|
+
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
5907
|
+
|
5908
|
+
ggml_scratch_load(ctx);
|
5909
|
+
|
5805
5910
|
result->op = GGML_OP_VIEW;
|
5806
5911
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5807
5912
|
result->src0 = a;
|
5808
5913
|
result->src1 = NULL;
|
5914
|
+
result->opt[0] = offs;
|
5809
5915
|
|
5810
5916
|
if (is_node) {
|
5811
5917
|
memcpy(result->padding, &offset, sizeof(offset));
|
@@ -5834,6 +5940,13 @@ struct ggml_tensor * ggml_view_2d(
|
|
5834
5940
|
|
5835
5941
|
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
|
5836
5942
|
|
5943
|
+
ggml_scratch_save(ctx);
|
5944
|
+
|
5945
|
+
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
5946
|
+
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
5947
|
+
|
5948
|
+
ggml_scratch_load(ctx);
|
5949
|
+
|
5837
5950
|
result->nb[1] = nb1;
|
5838
5951
|
result->nb[2] = result->nb[1]*ne1;
|
5839
5952
|
result->nb[3] = result->nb[2];
|
@@ -5842,6 +5955,7 @@ struct ggml_tensor * ggml_view_2d(
|
|
5842
5955
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5843
5956
|
result->src0 = a;
|
5844
5957
|
result->src1 = NULL;
|
5958
|
+
result->opt[0] = offs;
|
5845
5959
|
|
5846
5960
|
if (is_node) {
|
5847
5961
|
memcpy(result->padding, &offset, sizeof(offset));
|
@@ -5872,6 +5986,13 @@ struct ggml_tensor * ggml_view_3d(
|
|
5872
5986
|
|
5873
5987
|
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
|
5874
5988
|
|
5989
|
+
ggml_scratch_save(ctx);
|
5990
|
+
|
5991
|
+
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
5992
|
+
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
5993
|
+
|
5994
|
+
ggml_scratch_load(ctx);
|
5995
|
+
|
5875
5996
|
result->nb[1] = nb1;
|
5876
5997
|
result->nb[2] = nb2;
|
5877
5998
|
result->nb[3] = result->nb[2]*ne2;
|
@@ -5880,6 +6001,7 @@ struct ggml_tensor * ggml_view_3d(
|
|
5880
6001
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5881
6002
|
result->src0 = a;
|
5882
6003
|
result->src1 = NULL;
|
6004
|
+
result->opt[0] = offs;
|
5883
6005
|
|
5884
6006
|
if (is_node) {
|
5885
6007
|
memcpy(result->padding, &offset, sizeof(offset));
|
@@ -5912,6 +6034,13 @@ struct ggml_tensor * ggml_view_4d(
|
|
5912
6034
|
|
5913
6035
|
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
|
5914
6036
|
|
6037
|
+
ggml_scratch_save(ctx);
|
6038
|
+
|
6039
|
+
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
6040
|
+
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
6041
|
+
|
6042
|
+
ggml_scratch_load(ctx);
|
6043
|
+
|
5915
6044
|
result->nb[1] = nb1;
|
5916
6045
|
result->nb[2] = nb2;
|
5917
6046
|
result->nb[3] = nb3;
|
@@ -5920,6 +6049,7 @@ struct ggml_tensor * ggml_view_4d(
|
|
5920
6049
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5921
6050
|
result->src0 = a;
|
5922
6051
|
result->src1 = NULL;
|
6052
|
+
result->opt[0] = offs;
|
5923
6053
|
|
5924
6054
|
if (is_node) {
|
5925
6055
|
memcpy(result->padding, &offset, sizeof(offset));
|
@@ -7584,6 +7714,11 @@ static void ggml_compute_forward_add(
|
|
7584
7714
|
case GGML_TYPE_Q5_0:
|
7585
7715
|
case GGML_TYPE_Q5_1:
|
7586
7716
|
case GGML_TYPE_Q8_0:
|
7717
|
+
case GGML_TYPE_Q2_K:
|
7718
|
+
case GGML_TYPE_Q3_K:
|
7719
|
+
case GGML_TYPE_Q4_K:
|
7720
|
+
case GGML_TYPE_Q5_K:
|
7721
|
+
case GGML_TYPE_Q6_K:
|
7587
7722
|
{
|
7588
7723
|
ggml_compute_forward_add_q_f32(params, src0, src1, dst);
|
7589
7724
|
} break;
|
@@ -7887,6 +8022,11 @@ static void ggml_compute_forward_add1(
|
|
7887
8022
|
case GGML_TYPE_Q5_1:
|
7888
8023
|
case GGML_TYPE_Q8_0:
|
7889
8024
|
case GGML_TYPE_Q8_1:
|
8025
|
+
case GGML_TYPE_Q2_K:
|
8026
|
+
case GGML_TYPE_Q3_K:
|
8027
|
+
case GGML_TYPE_Q4_K:
|
8028
|
+
case GGML_TYPE_Q5_K:
|
8029
|
+
case GGML_TYPE_Q6_K:
|
7890
8030
|
{
|
7891
8031
|
ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
|
7892
8032
|
} break;
|
@@ -8009,6 +8149,11 @@ static void ggml_compute_forward_acc(
|
|
8009
8149
|
case GGML_TYPE_Q5_1:
|
8010
8150
|
case GGML_TYPE_Q8_0:
|
8011
8151
|
case GGML_TYPE_Q8_1:
|
8152
|
+
case GGML_TYPE_Q2_K:
|
8153
|
+
case GGML_TYPE_Q3_K:
|
8154
|
+
case GGML_TYPE_Q4_K:
|
8155
|
+
case GGML_TYPE_Q5_K:
|
8156
|
+
case GGML_TYPE_Q6_K:
|
8012
8157
|
default:
|
8013
8158
|
{
|
8014
8159
|
GGML_ASSERT(false);
|
@@ -8127,10 +8272,10 @@ static void ggml_compute_forward_mul_f32(
|
|
8127
8272
|
const int ith = params->ith;
|
8128
8273
|
const int nth = params->nth;
|
8129
8274
|
|
8130
|
-
#ifdef
|
8131
|
-
if (src1->backend ==
|
8275
|
+
#ifdef GGML_USE_CLBLAST
|
8276
|
+
if (src1->backend == GGML_BACKEND_GPU) {
|
8132
8277
|
if (ith == 0) {
|
8133
|
-
|
8278
|
+
ggml_cl_mul(src0, src1, dst);
|
8134
8279
|
}
|
8135
8280
|
return;
|
8136
8281
|
}
|
@@ -9245,7 +9390,7 @@ static void ggml_compute_forward_rms_norm_f32(
|
|
9245
9390
|
sum += (ggml_float)(x[i00] * x[i00]);
|
9246
9391
|
}
|
9247
9392
|
|
9248
|
-
float mean = sum/ne00;
|
9393
|
+
const float mean = sum/ne00;
|
9249
9394
|
|
9250
9395
|
float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
|
9251
9396
|
|
@@ -9568,14 +9713,7 @@ static void ggml_compute_forward_mul_mat_f32(
|
|
9568
9713
|
// nb01 >= nb00 - src0 is not transposed
|
9569
9714
|
// compute by src0 rows
|
9570
9715
|
|
9571
|
-
#if defined(
|
9572
|
-
if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
|
9573
|
-
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
9574
|
-
ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
9575
|
-
}
|
9576
|
-
return;
|
9577
|
-
}
|
9578
|
-
#elif defined(GGML_USE_CLBLAST)
|
9716
|
+
#if defined(GGML_USE_CLBLAST)
|
9579
9717
|
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
9580
9718
|
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
9581
9719
|
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
@@ -9740,14 +9878,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
9740
9878
|
// nb01 >= nb00 - src0 is not transposed
|
9741
9879
|
// compute by src0 rows
|
9742
9880
|
|
9743
|
-
#if defined(
|
9744
|
-
if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
|
9745
|
-
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
9746
|
-
ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
9747
|
-
}
|
9748
|
-
return;
|
9749
|
-
}
|
9750
|
-
#elif defined(GGML_USE_CLBLAST)
|
9881
|
+
#if defined(GGML_USE_CLBLAST)
|
9751
9882
|
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
9752
9883
|
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
9753
9884
|
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
@@ -9952,14 +10083,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
9952
10083
|
// nb01 >= nb00 - src0 is not transposed
|
9953
10084
|
// compute by src0 rows
|
9954
10085
|
|
9955
|
-
#if defined(
|
9956
|
-
if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
|
9957
|
-
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
9958
|
-
ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
9959
|
-
}
|
9960
|
-
return;
|
9961
|
-
}
|
9962
|
-
#elif defined(GGML_USE_CLBLAST)
|
10086
|
+
#if defined(GGML_USE_CLBLAST)
|
9963
10087
|
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
9964
10088
|
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
9965
10089
|
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
@@ -10102,6 +10226,11 @@ static void ggml_compute_forward_mul_mat(
|
|
10102
10226
|
case GGML_TYPE_Q5_1:
|
10103
10227
|
case GGML_TYPE_Q8_0:
|
10104
10228
|
case GGML_TYPE_Q8_1:
|
10229
|
+
case GGML_TYPE_Q2_K:
|
10230
|
+
case GGML_TYPE_Q3_K:
|
10231
|
+
case GGML_TYPE_Q4_K:
|
10232
|
+
case GGML_TYPE_Q5_K:
|
10233
|
+
case GGML_TYPE_Q6_K:
|
10105
10234
|
{
|
10106
10235
|
ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
|
10107
10236
|
} break;
|
@@ -10285,6 +10414,11 @@ static void ggml_compute_forward_set(
|
|
10285
10414
|
case GGML_TYPE_Q5_1:
|
10286
10415
|
case GGML_TYPE_Q8_0:
|
10287
10416
|
case GGML_TYPE_Q8_1:
|
10417
|
+
case GGML_TYPE_Q2_K:
|
10418
|
+
case GGML_TYPE_Q3_K:
|
10419
|
+
case GGML_TYPE_Q4_K:
|
10420
|
+
case GGML_TYPE_Q5_K:
|
10421
|
+
case GGML_TYPE_Q6_K:
|
10288
10422
|
default:
|
10289
10423
|
{
|
10290
10424
|
GGML_ASSERT(false);
|
@@ -10450,6 +10584,11 @@ static void ggml_compute_forward_get_rows(
|
|
10450
10584
|
case GGML_TYPE_Q5_1:
|
10451
10585
|
case GGML_TYPE_Q8_0:
|
10452
10586
|
case GGML_TYPE_Q8_1:
|
10587
|
+
case GGML_TYPE_Q2_K:
|
10588
|
+
case GGML_TYPE_Q3_K:
|
10589
|
+
case GGML_TYPE_Q4_K:
|
10590
|
+
case GGML_TYPE_Q5_K:
|
10591
|
+
case GGML_TYPE_Q6_K:
|
10453
10592
|
{
|
10454
10593
|
ggml_compute_forward_get_rows_q(params, src0, src1, dst);
|
10455
10594
|
} break;
|
@@ -10996,6 +11135,12 @@ static void ggml_compute_forward_alibi(
|
|
10996
11135
|
case GGML_TYPE_Q5_1:
|
10997
11136
|
case GGML_TYPE_Q8_0:
|
10998
11137
|
case GGML_TYPE_Q8_1:
|
11138
|
+
case GGML_TYPE_Q2_K:
|
11139
|
+
case GGML_TYPE_Q3_K:
|
11140
|
+
case GGML_TYPE_Q4_K:
|
11141
|
+
case GGML_TYPE_Q5_K:
|
11142
|
+
case GGML_TYPE_Q6_K:
|
11143
|
+
case GGML_TYPE_Q8_K:
|
10999
11144
|
case GGML_TYPE_I8:
|
11000
11145
|
case GGML_TYPE_I16:
|
11001
11146
|
case GGML_TYPE_I32:
|
@@ -11067,6 +11212,12 @@ static void ggml_compute_forward_clamp(
|
|
11067
11212
|
case GGML_TYPE_Q5_1:
|
11068
11213
|
case GGML_TYPE_Q8_0:
|
11069
11214
|
case GGML_TYPE_Q8_1:
|
11215
|
+
case GGML_TYPE_Q2_K:
|
11216
|
+
case GGML_TYPE_Q3_K:
|
11217
|
+
case GGML_TYPE_Q4_K:
|
11218
|
+
case GGML_TYPE_Q5_K:
|
11219
|
+
case GGML_TYPE_Q6_K:
|
11220
|
+
case GGML_TYPE_Q8_K:
|
11070
11221
|
case GGML_TYPE_I8:
|
11071
11222
|
case GGML_TYPE_I16:
|
11072
11223
|
case GGML_TYPE_I32:
|
@@ -11156,7 +11307,7 @@ static void ggml_compute_forward_rope_f32(
|
|
11156
11307
|
theta *= theta_scale;
|
11157
11308
|
|
11158
11309
|
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
11159
|
-
float * dst_data = (float *)((char *) dst->data +
|
11310
|
+
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
11160
11311
|
|
11161
11312
|
const float x0 = src[0];
|
11162
11313
|
const float x1 = src[1];
|
@@ -11177,7 +11328,7 @@ static void ggml_compute_forward_rope_f32(
|
|
11177
11328
|
const int64_t i0 = ib*n_dims + ic/2;
|
11178
11329
|
|
11179
11330
|
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
11180
|
-
float * dst_data = (float *)((char *) dst->data +
|
11331
|
+
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
11181
11332
|
|
11182
11333
|
const float x0 = src[0];
|
11183
11334
|
const float x1 = src[n_dims/2];
|
@@ -12885,6 +13036,15 @@ static void ggml_compute_forward_map_binary(
|
|
12885
13036
|
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
12886
13037
|
GGML_ASSERT(params);
|
12887
13038
|
|
13039
|
+
#ifdef GGML_USE_CUBLAS
|
13040
|
+
bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
|
13041
|
+
if (skip_cpu) {
|
13042
|
+
return;
|
13043
|
+
}
|
13044
|
+
GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
|
13045
|
+
GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
|
13046
|
+
#endif // GGML_USE_CUBLAS
|
13047
|
+
|
12888
13048
|
switch (tensor->op) {
|
12889
13049
|
case GGML_OP_DUP:
|
12890
13050
|
{
|
@@ -14191,7 +14351,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
14191
14351
|
if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
|
14192
14352
|
node->n_tasks = 1; // TODO: this actually is doing nothing
|
14193
14353
|
// the threads are still spinning
|
14194
|
-
cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
|
14195
14354
|
}
|
14196
14355
|
else
|
14197
14356
|
#elif defined(GGML_USE_CLBLAST)
|
@@ -14581,7 +14740,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
|
|
14581
14740
|
const int64_t * ne = tensor->ne;
|
14582
14741
|
const size_t * nb = tensor->nb;
|
14583
14742
|
|
14584
|
-
fprintf(fout, "%-6s %-12s %8d %
|
14743
|
+
fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
|
14585
14744
|
ggml_type_name(tensor->type),
|
14586
14745
|
ggml_op_name (tensor->op),
|
14587
14746
|
tensor->n_dims,
|
@@ -14595,7 +14754,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
|
|
14595
14754
|
const int64_t * ne = tensor->ne;
|
14596
14755
|
const size_t * nb = tensor->nb;
|
14597
14756
|
|
14598
|
-
fprintf(fout, "%-6s %-6s %-12s %8d %
|
14757
|
+
fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n",
|
14599
14758
|
arg,
|
14600
14759
|
ggml_type_name(tensor->type),
|
14601
14760
|
ggml_op_name (tensor->op),
|
@@ -14608,8 +14767,8 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
|
|
14608
14767
|
}
|
14609
14768
|
|
14610
14769
|
void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
14611
|
-
assert(cgraph->work == NULL);
|
14612
|
-
assert(cgraph->work_size == 0);
|
14770
|
+
//assert(cgraph->work == NULL);
|
14771
|
+
//assert(cgraph->work_size == 0);
|
14613
14772
|
|
14614
14773
|
uint64_t size_eval = 0;
|
14615
14774
|
|
@@ -14624,11 +14783,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
|
14624
14783
|
FILE * fout = stdout;
|
14625
14784
|
|
14626
14785
|
fprintf(fout, "\n");
|
14627
|
-
fprintf(fout, "%-16s %8x\n",
|
14628
|
-
fprintf(fout, "%-16s %8d\n",
|
14629
|
-
fprintf(fout, "%-16s %8d\n",
|
14630
|
-
fprintf(fout, "%-16s %8d\n",
|
14631
|
-
fprintf(fout, "%-16s %
|
14786
|
+
fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
|
14787
|
+
fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
|
14788
|
+
fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
|
14789
|
+
fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
|
14790
|
+
fprintf(fout, "%-16s %" PRIu64 "\n", "eval", size_eval);
|
14632
14791
|
|
14633
14792
|
// header
|
14634
14793
|
fprintf(fout, "\n");
|
@@ -14830,7 +14989,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
|
|
14830
14989
|
// read file into data
|
14831
14990
|
{
|
14832
14991
|
FILE * fin = fopen(fname, "rb");
|
14833
|
-
|
14834
14992
|
if (!fin) {
|
14835
14993
|
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
|
14836
14994
|
return result;
|
@@ -14862,7 +15020,11 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
|
|
14862
15020
|
|
14863
15021
|
data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
|
14864
15022
|
|
14865
|
-
fread(data->data, sizeof(char), fsize, fin);
|
15023
|
+
const size_t ret = fread(data->data, sizeof(char), fsize, fin);
|
15024
|
+
if (ret != fsize) {
|
15025
|
+
fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
|
15026
|
+
return result;
|
15027
|
+
}
|
14866
15028
|
|
14867
15029
|
fclose(fin);
|
14868
15030
|
}
|
@@ -14970,6 +15132,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
|
|
14970
15132
|
op = *(const uint32_t *) ptr; ptr += sizeof(op);
|
14971
15133
|
n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
|
14972
15134
|
|
15135
|
+
enum ggml_op eop = (enum ggml_op) op;
|
15136
|
+
|
14973
15137
|
int64_t ne[GGML_MAX_DIMS];
|
14974
15138
|
size_t nb[GGML_MAX_DIMS];
|
14975
15139
|
|
@@ -14984,42 +15148,77 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
|
|
14984
15148
|
nb[j] = nb_cur;
|
14985
15149
|
}
|
14986
15150
|
|
14987
|
-
|
15151
|
+
uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used
|
14988
15152
|
|
14989
|
-
|
15153
|
+
const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
|
14990
15154
|
|
14991
|
-
|
15155
|
+
const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t);
|
14992
15156
|
|
14993
|
-
|
14994
|
-
|
14995
|
-
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
14996
|
-
tensor->nb[j] = nb[j];
|
14997
|
-
}
|
15157
|
+
struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
|
14998
15158
|
|
14999
15159
|
// parse args
|
15000
|
-
{
|
15001
|
-
|
15002
|
-
&tensor->src0,
|
15003
|
-
&tensor->src1,
|
15004
|
-
};
|
15160
|
+
for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
|
15161
|
+
const int32_t arg_idx = ptr_arg_idx[j];
|
15005
15162
|
|
15006
|
-
|
15007
|
-
|
15163
|
+
if (arg_idx == -1) {
|
15164
|
+
continue;
|
15008
15165
|
}
|
15009
15166
|
|
15010
|
-
|
15011
|
-
|
15167
|
+
if (arg_idx < GGML_MAX_NODES) {
|
15168
|
+
args[j] = result.leafs[arg_idx];
|
15169
|
+
} else {
|
15170
|
+
args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
|
15171
|
+
}
|
15172
|
+
}
|
15012
15173
|
|
15013
|
-
|
15014
|
-
|
15015
|
-
|
15174
|
+
// create the tensor
|
15175
|
+
// "view" operations are handled differently
|
15176
|
+
// TODO: handle inplace ops - currently a copy is always made
|
15177
|
+
|
15178
|
+
struct ggml_tensor * tensor = NULL;
|
15179
|
+
|
15180
|
+
switch (eop) {
|
15181
|
+
// TODO: implement other view ops
|
15182
|
+
case GGML_OP_RESHAPE:
|
15183
|
+
{
|
15184
|
+
tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]);
|
15185
|
+
} break;
|
15186
|
+
case GGML_OP_VIEW:
|
15187
|
+
{
|
15188
|
+
tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
|
15189
|
+
|
15190
|
+
uint64_t offs;
|
15191
|
+
memcpy(&offs, args[2]->data, sizeof(offs));
|
15192
|
+
|
15193
|
+
tensor->data = ((char *) tensor->data) + offs;
|
15194
|
+
} break;
|
15195
|
+
case GGML_OP_TRANSPOSE:
|
15196
|
+
{
|
15197
|
+
tensor = ggml_transpose(*ctx_eval, args[0]);
|
15198
|
+
} break;
|
15199
|
+
case GGML_OP_PERMUTE:
|
15200
|
+
{
|
15201
|
+
tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
|
15202
|
+
} break;
|
15203
|
+
default:
|
15204
|
+
{
|
15205
|
+
tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
|
15206
|
+
|
15207
|
+
tensor->op = eop;
|
15208
|
+
} break;
|
15209
|
+
}
|
15016
15210
|
|
15017
|
-
|
15018
|
-
|
15019
|
-
|
15020
|
-
|
15021
|
-
|
15022
|
-
|
15211
|
+
memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
|
15212
|
+
|
15213
|
+
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
15214
|
+
tensor->nb[j] = nb[j];
|
15215
|
+
}
|
15216
|
+
|
15217
|
+
tensor->src0 = args[0];
|
15218
|
+
tensor->src1 = args[1];
|
15219
|
+
|
15220
|
+
for (int j = 0; j < GGML_MAX_OPT; ++j) {
|
15221
|
+
tensor->opt[j] = args[2 + j];
|
15023
15222
|
}
|
15024
15223
|
|
15025
15224
|
result.nodes[i] = tensor;
|
@@ -16070,6 +16269,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|
16070
16269
|
block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
|
16071
16270
|
result = ggml_quantize_q8_0(src + start, block, n, n, hist);
|
16072
16271
|
} break;
|
16272
|
+
#ifdef GGML_USE_K_QUANTS
|
16273
|
+
case GGML_TYPE_Q2_K:
|
16274
|
+
{
|
16275
|
+
GGML_ASSERT(start % QK_K == 0);
|
16276
|
+
block_q2_K * block = (block_q2_K*)dst + start / QK_K;
|
16277
|
+
result = ggml_quantize_q2_K(src + start, block, n, n, hist);
|
16278
|
+
} break;
|
16279
|
+
case GGML_TYPE_Q3_K:
|
16280
|
+
{
|
16281
|
+
GGML_ASSERT(start % QK_K == 0);
|
16282
|
+
block_q3_K * block = (block_q3_K*)dst + start / QK_K;
|
16283
|
+
result = ggml_quantize_q3_K(src + start, block, n, n, hist);
|
16284
|
+
} break;
|
16285
|
+
case GGML_TYPE_Q4_K:
|
16286
|
+
{
|
16287
|
+
GGML_ASSERT(start % QK_K == 0);
|
16288
|
+
block_q4_K * block = (block_q4_K*)dst + start / QK_K;
|
16289
|
+
result = ggml_quantize_q4_K(src + start, block, n, n, hist);
|
16290
|
+
} break;
|
16291
|
+
case GGML_TYPE_Q5_K:
|
16292
|
+
{
|
16293
|
+
GGML_ASSERT(start % QK_K == 0);
|
16294
|
+
block_q5_K * block = (block_q5_K*)dst + start / QK_K;
|
16295
|
+
result = ggml_quantize_q5_K(src + start, block, n, n, hist);
|
16296
|
+
} break;
|
16297
|
+
case GGML_TYPE_Q6_K:
|
16298
|
+
{
|
16299
|
+
GGML_ASSERT(start % QK_K == 0);
|
16300
|
+
block_q6_K * block = (block_q6_K*)dst + start / QK_K;
|
16301
|
+
result = ggml_quantize_q6_K(src + start, block, n, n, hist);
|
16302
|
+
} break;
|
16303
|
+
#endif
|
16073
16304
|
default:
|
16074
16305
|
assert(false);
|
16075
16306
|
}
|