llama_cpp 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,122 @@
1
+ #pragma once
2
+
3
+ #include "ggml.h"
4
+
5
+ #include <stdint.h>
6
+ #include <assert.h>
7
+ #include <stddef.h>
8
+
9
+ // Super-block size
10
+ #define QK_K 256
11
+
12
+ //
13
+ // Super-block quantization structures
14
+ //
15
+
16
+ // 2-bit quantization
17
+ // weight is represented as x = a * q + b
18
+ // 16 blocks of 16 elements each
19
+ // Effectively 2.5625 bits per weight
20
+ typedef struct {
21
+ uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
22
+ uint8_t qs[QK_K/4]; // quants
23
+ ggml_fp16_t d; // super-block scale for quantized scales
24
+ ggml_fp16_t dmin; // super-block scale for quantized mins
25
+ } block_q2_K;
26
+ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
27
+
28
+ // 3-bit quantization
29
+ // weight is represented as x = a * q
30
+ // 16 blocks of 16 elements each
31
+ // Effectively 3.4375 bits per weight
32
+ typedef struct {
33
+ uint8_t hmask[QK_K/8]; // quants - high bit
34
+ uint8_t qs[QK_K/4]; // quants - low 2 bits
35
+ uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
36
+ ggml_fp16_t d; // super-block scale
37
+ } block_q3_K;
38
+ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
39
+
40
+ // 4-bit quantization
41
+ // 8 blocks of 32 elements each
42
+ // weight is represented as x = a * q + b
43
+ // Effectively 4.5 bits per weight
44
+ typedef struct {
45
+ ggml_fp16_t d; // super-block scale for quantized scales
46
+ ggml_fp16_t dmin; // super-block scale for quantized mins
47
+ uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
48
+     uint8_t qs[QK_K/2];        // 4-bit quants
49
+ } block_q4_K;
50
+ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
51
+
52
+ // 5-bit quantization
53
+ // 8 blocks of 32 elements each
54
+ // weight is represented as x = a * q + b
55
+ // Effectively 5.5 bits per weight
56
+ typedef struct {
57
+ ggml_fp16_t d; // super-block scale for quantized scales
58
+ ggml_fp16_t dmin; // super-block scale for quantized mins
59
+ uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
60
+ uint8_t qh[QK_K/8]; // quants, high bit
61
+ uint8_t qs[QK_K/2]; // quants, low 4 bits
62
+ } block_q5_K;
63
+ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
64
+
65
+ // 6-bit quantization
66
+ // weight is represented as x = a * q
67
+ // 16 blocks of 16 elements each
68
+ // Effectively 6.5625 bits per weight
69
+ typedef struct {
70
+ uint8_t ql[QK_K/2]; // quants, lower 4 bits
71
+ uint8_t qh[QK_K/4]; // quants, upper 2 bits
72
+ int8_t scales[QK_K/16]; // scales, quantized with 8 bits
73
+ ggml_fp16_t d; // super-block scale
74
+ } block_q6_K;
75
+ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
76
+
77
+ // This is only used for intermediate quantization and dot products
78
+ typedef struct {
79
+ float d; // delta
80
+ int8_t qs[QK_K]; // quants
81
+ int16_t bsums[QK_K/16]; // sum of quants in groups of 16
82
+ } block_q8_K;
83
+ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
84
+
85
+
86
+ // Quantization
87
+ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
88
+ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
89
+ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
90
+ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
91
+ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
92
+ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
93
+
94
+ void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
95
+ void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
96
+ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
97
+ void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
98
+ void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
99
+ void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
100
+
101
+ // Dequantization
102
+ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
103
+ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
104
+ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
105
+ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
106
+ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
107
+ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
108
+
109
+ // Dot product
110
+ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
111
+ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
112
+ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
113
+ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
114
+ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
115
+
116
+ // Quantization with histogram collection
117
+ size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
118
+ size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
119
+ size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
120
+ size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
121
+ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
122
+
@@ -405,13 +405,29 @@ struct llama_buffer {
405
405
  llama_buffer() = default;
406
406
 
407
407
  void resize(size_t len) {
408
+ #ifdef GGML_USE_METAL
409
+ free(addr);
410
+ int result = posix_memalign((void **) &addr, getpagesize(), len);
411
+ if (result == 0) {
412
+ memset(addr, 0, len);
413
+ }
414
+ else {
415
+ addr = NULL;
416
+ }
417
+ #else
408
418
  delete[] addr;
409
419
  addr = new uint8_t[len];
420
+ #endif
410
421
  size = len;
411
422
  }
412
423
 
413
424
  ~llama_buffer() {
425
+ #ifdef GGML_USE_METAL
426
+ free(addr);
427
+ #else
414
428
  delete[] addr;
429
+ #endif
430
+ addr = NULL;
415
431
  }
416
432
 
417
433
  // disable copy and move