llama_cpp 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +29 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +235 -39
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +340 -109
- data/ext/llama_cpp/src/ggml.h +44 -6
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +484 -136
- data/ext/llama_cpp/src/llama.h +39 -8
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
@@ -0,0 +1,122 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include "ggml.h"
|
4
|
+
|
5
|
+
#include <stdint.h>
|
6
|
+
#include <assert.h>
|
7
|
+
#include <stddef.h>
|
8
|
+
|
9
|
+
// Super-block size
|
10
|
+
#define QK_K 256
|
11
|
+
|
12
|
+
//
|
13
|
+
// Super-block quantization structures
|
14
|
+
//
|
15
|
+
|
16
|
+
// 2-bit quantization
|
17
|
+
// weight is represented as x = a * q + b
|
18
|
+
// 16 blocks of 16 elements each
|
19
|
+
// Effectively 2.625 bits per weight
|
20
|
+
typedef struct {
|
21
|
+
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
22
|
+
uint8_t qs[QK_K/4]; // quants
|
23
|
+
ggml_fp16_t d; // super-block scale for quantized scales
|
24
|
+
ggml_fp16_t dmin; // super-block scale for quantized mins
|
25
|
+
} block_q2_K;
|
26
|
+
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
27
|
+
|
28
|
+
// 3-bit quantization
|
29
|
+
// weight is represented as x = a * q
|
30
|
+
// 16 blocks of 16 elements each
|
31
|
+
// Effectively 3.4375 bits per weight
|
32
|
+
typedef struct {
|
33
|
+
uint8_t hmask[QK_K/8]; // quants - high bit
|
34
|
+
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
35
|
+
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
36
|
+
ggml_fp16_t d; // super-block scale
|
37
|
+
} block_q3_K;
|
38
|
+
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
|
39
|
+
|
40
|
+
// 4-bit quantization
|
41
|
+
// 8 blocks of 32 elements each
|
42
|
+
// weight is represented as x = a * q + b
|
43
|
+
// Effectively 4.5 bits per weight
|
44
|
+
typedef struct {
|
45
|
+
ggml_fp16_t d; // super-block scale for quantized scales
|
46
|
+
ggml_fp16_t dmin; // super-block scale for quantized mins
|
47
|
+
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
|
48
|
+
uint8_t qs[QK_K/2]; // 4-bit quants
|
49
|
+
} block_q4_K;
|
50
|
+
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
|
51
|
+
|
52
|
+
// 5-bit quantization
|
53
|
+
// 8 blocks of 32 elements each
|
54
|
+
// weight is represented as x = a * q + b
|
55
|
+
// Effectively 5.5 bits per weight
|
56
|
+
typedef struct {
|
57
|
+
ggml_fp16_t d; // super-block scale for quantized scales
|
58
|
+
ggml_fp16_t dmin; // super-block scale for quantized mins
|
59
|
+
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
|
60
|
+
uint8_t qh[QK_K/8]; // quants, high bit
|
61
|
+
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
62
|
+
} block_q5_K;
|
63
|
+
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
64
|
+
|
65
|
+
// 6-bit quantization
|
66
|
+
// weight is represented as x = a * q
|
67
|
+
// 16 blocks of 16 elements each
|
68
|
+
// Effectively 6.5625 bits per weight
|
69
|
+
typedef struct {
|
70
|
+
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
71
|
+
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
72
|
+
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
73
|
+
ggml_fp16_t d; // super-block scale
|
74
|
+
} block_q6_K;
|
75
|
+
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
|
76
|
+
|
77
|
+
// This is only used for intermediate quantization and dot products
|
78
|
+
typedef struct {
|
79
|
+
float d; // delta
|
80
|
+
int8_t qs[QK_K]; // quants
|
81
|
+
int16_t bsums[QK_K/16]; // sum of quants in groups of 16
|
82
|
+
} block_q8_K;
|
83
|
+
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
|
84
|
+
|
85
|
+
|
86
|
+
// Quantization
|
87
|
+
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
|
88
|
+
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
|
89
|
+
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
|
90
|
+
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
|
91
|
+
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
92
|
+
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
|
93
|
+
|
94
|
+
void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
|
95
|
+
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
|
96
|
+
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
|
97
|
+
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
|
98
|
+
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
|
99
|
+
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
|
100
|
+
|
101
|
+
// Dequantization
|
102
|
+
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
|
103
|
+
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
|
104
|
+
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
|
105
|
+
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
|
106
|
+
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
|
107
|
+
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
|
108
|
+
|
109
|
+
// Dot product
|
110
|
+
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
111
|
+
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
112
|
+
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
113
|
+
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
114
|
+
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
115
|
+
|
116
|
+
// Quantization with histogram collection
|
117
|
+
size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
118
|
+
size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
119
|
+
size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
120
|
+
size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
121
|
+
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
122
|
+
|
@@ -405,13 +405,29 @@ struct llama_buffer {
|
|
405
405
|
llama_buffer() = default;
|
406
406
|
|
407
407
|
void resize(size_t len) {
|
408
|
+
#ifdef GGML_USE_METAL
|
409
|
+
free(addr);
|
410
|
+
int result = posix_memalign((void **) &addr, getpagesize(), len);
|
411
|
+
if (result == 0) {
|
412
|
+
memset(addr, 0, len);
|
413
|
+
}
|
414
|
+
else {
|
415
|
+
addr = NULL;
|
416
|
+
}
|
417
|
+
#else
|
408
418
|
delete[] addr;
|
409
419
|
addr = new uint8_t[len];
|
420
|
+
#endif
|
410
421
|
size = len;
|
411
422
|
}
|
412
423
|
|
413
424
|
~llama_buffer() {
|
425
|
+
#ifdef GGML_USE_METAL
|
426
|
+
free(addr);
|
427
|
+
#else
|
414
428
|
delete[] addr;
|
429
|
+
#endif
|
430
|
+
addr = NULL;
|
415
431
|
}
|
416
432
|
|
417
433
|
// disable copy and move
|