llama_cpp 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-alloc.c +106 -24
- data/ext/llama_cpp/src/ggml-cuda.cu +83 -23
- data/ext/llama_cpp/src/ggml-metal.m +35 -11
- data/ext/llama_cpp/src/ggml-metal.metal +145 -92
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +25 -53
- data/ext/llama_cpp/src/k_quants.c +45 -12
- data/ext/llama_cpp/src/llama.cpp +146 -70
- data/ext/llama_cpp/src/llama.h +3 -0
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/k_quants.c CHANGED
@@ -13,6 +13,26 @@
 //
 #include <arm_neon.h>
 
+#if !defined(__aarch64__)
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+    return vcombine_s16(a0, b0);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+#endif
+
 #else
 
 #ifdef __wasm_simd128__
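Note (not part of the diff): the block added above supplies fallbacks for NEON horizontal-add and pairwise-add intrinsics that only exist on AArch64, so the k-quants kernels can still build on 32-bit ARM. A minimal standalone sketch of the same across-lane reduction, using a plain array instead of a NEON vector type so it compiles anywhere:

```cpp
#include <cstdint>
#include <cstdio>

// Sketch of the reduction vaddvq_s16 performs: sum all eight 16-bit lanes
// into one 32-bit result, widening each lane before accumulating.
static int32_t horizontal_add_s16(const int16_t lanes[8]) {
    int32_t sum = 0;
    for (int i = 0; i < 8; ++i) {
        sum += (int32_t) lanes[i];
    }
    return sum;
}

int main() {
    const int16_t v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    std::printf("sum = %d\n", horizontal_add_s16(v)); // prints 36
    return 0;
}
```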
@@ -63,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
         float ax = fabsf(x[i]);
         if (ax > amax) { amax = ax; max = x[i]; }
     }
-    if (
+    if (amax < 1e-30f) { // all zero
         for (int i = 0; i < n; ++i) {
             L[i] = 0;
         }
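Note (illustrative, not from the diff): the guard above now treats any maximum magnitude below 1e-30 as "all zero" before an inverse scale is derived from it, rather than testing for exact zero. A small sketch of why the threshold form is more robust:

```cpp
#include <cstdio>

// If amax is tiny but nonzero, nmax/amax can overflow to inf and poison the
// quantized block, so the guard compares against a small threshold instead of
// testing for exact zero.
static float inverse_scale_or_zero(float amax, int nmax) {
    if (amax < 1e-30f) { // treat the block as all zero
        return 0.0f;
    }
    return (float) nmax / amax;
}

int main() {
    std::printf("amax=0     -> %g\n", inverse_scale_or_zero(0.0f, 127));
    std::printf("amax=1e-38 -> %g\n", inverse_scale_or_zero(1e-38f, 127));
    std::printf("amax=0.5   -> %g\n", inverse_scale_or_zero(0.5f, 127));
    return 0;
}
```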
@@ -183,13 +203,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
         int ntry, float alpha) {
     float min = x[0];
     float max = x[0];
-    float sum_x = 0;
-    float sum_x2 = 0;
     for (int i = 1; i < n; ++i) {
         if (x[i] < min) min = x[i];
         if (x[i] > max) max = x[i];
-        sum_x += x[i];
-        sum_x2 += x[i]*x[i];
     }
     if (max == min) {
         for (int i = 0; i < n; ++i) L[i] = 0;
@@ -1070,6 +1086,13 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
 
         }
 
+        if (!max_abs_scale) {
+            memset(&y[i], 0, sizeof(block_q6_K));
+            y[i].d = ggml_fp32_to_fp16(0.f);
+            x += QK_K;
+            continue;
+        }
+
         float iscale = -128.f/max_scale;
         y[i].d = ggml_fp32_to_fp16(1/iscale);
         for (int ib = 0; ib < QK_K/16; ++ib) {
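Note (sketch with hypothetical names, not the real q6_K layout): the early-out added above writes an all-zero block and advances to the next one whenever max_abs_scale is zero, so the -128.f/max_scale division is never evaluated for an empty block. The same pattern in isolation:

```cpp
#include <cmath>
#include <cstdio>
#include <cstring>

struct block_stub { float d; signed char scales[16]; }; // hypothetical block layout

static void encode_scales(const float * scales, block_stub * out) {
    float max_abs_scale = 0.0f;
    for (int i = 0; i < 16; ++i) {
        const float a = std::fabs(scales[i]);
        if (a > max_abs_scale) max_abs_scale = a;
    }
    if (!max_abs_scale) {              // all-zero block: emit zeros and return
        std::memset(out, 0, sizeof(*out));
        return;
    }
    const float iscale = -127.f / max_abs_scale; // -127 keeps the cast in range
    out->d = 1.0f / iscale;
    for (int i = 0; i < 16; ++i) {
        out->scales[i] = (signed char) (iscale * scales[i]);
    }
}

int main() {
    const float zeros[16] = {0};
    block_stub b;
    encode_scales(zeros, &b);
    std::printf("d = %g\n", b.d); // 0 for the all-zero block
    return 0;
}
```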
@@ -1306,7 +1329,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
     const uint8x16_t m4 = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x2_t q2bytes;
     uint8_t aux[16];
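Note (generic sketch, not llama.cpp code): several hunks in this file wrap the vzero/mzero constants in #if defined(__ARM_FEATURE_DOTPROD), presumably because those zero vectors are only consumed by the dot-product code path and would otherwise sit unused (and warn) on NEON builds without that extension. The guard pattern on its own:

```cpp
#include <cstdio>

int main() {
#if defined(__ARM_FEATURE_DOTPROD)
    // Only the dot-product path needs a zero-seeded accumulator
    // (stands in for: const int32x4_t vzero = vdupq_n_s32(0);).
    const int vzero = 0;
    std::printf("dotprod path, seed = %d\n", vzero);
#else
    // The fallback path never references vzero, so it is not declared at all.
    std::printf("fallback path\n");
#endif
    return 0;
}
```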
@@ -1612,7 +1637,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q2bytes;
 
@@ -2060,7 +2087,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
     __m256 acc = _mm256_setzero_ps();
 
-    uint32_t *aux;
+    const uint32_t *aux;
 
     for (int i = 0; i < nb; ++i) {
 
@@ -2070,7 +2097,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const int8_t * restrict q8 = y[i].qs;
 
         // Set up scales
-        aux = (uint32_t *)x[i].scales;
+        aux = (const uint32_t *)x[i].scales;
         __m128i scales128 = _mm_set_epi32(
             ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
             ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
@@ -2596,8 +2623,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         const uint8_t * restrict q4 = x[i].qs;
         const int8_t * restrict q8 = y[i].qs;
 
-        //int32x4_t isum = mzero;
-
         int32_t sumi1 = 0;
         int32_t sumi2 = 0;
 
@@ -3096,9 +3121,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mone = vdupq_n_u8(1);
     const uint8x16_t mtwo = vdupq_n_u8(2);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
 
@@ -3441,8 +3468,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mh = vdupq_n_u8(16);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
     uint8x16x4_t q5h;
@@ -3660,7 +3689,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
     //const int8x16_t m32s = vdupq_n_s8(32);
 
     const uint8x16_t mone = vdupq_n_u8(3);
@@ -4049,8 +4080,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
-    const int32x4_t vzero = vdupq_n_s32(0);
     const int8x16_t m32s = vdupq_n_s8(32);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     const uint8x16_t mone = vdupq_n_u8(3);
 
data/ext/llama_cpp/src/llama.cpp CHANGED
@@ -126,6 +126,9 @@ void replace_all(std::string & s, const std::string & search, const std::string
     }
     s = std::move(result);
 }
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
+#endif
 
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
@@ -325,6 +328,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GPT2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTJ,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTNEOX,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_MPT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_UNKNOWN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
 };
 
 static llm_arch llm_arch_from_string(const std::string & name) {
@@ -412,6 +453,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 #elif GGML_USE_METAL
 # define llama_host_malloc(n) ggml_metal_host_malloc(n)
 # define llama_host_free(data) ggml_metal_host_free(data)
+#elif GGML_USE_CPU_HBM
+# define llama_host_malloc(n) hbw_malloc(n)
+# define llama_host_free(data) if (data != NULL) hbw_free(data)
 #else
 # define llama_host_malloc(n) malloc(n)
 # define llama_host_free(data) free(data)
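Note (sketch under the assumption that memkind's hbwmalloc.h is available whenever GGML_USE_CPU_HBM is defined): the new branch above routes llama_host_malloc/llama_host_free to high-bandwidth-memory allocations. The same compile-time dispatch, with a plain malloc fallback so the sketch builds without memkind:

```cpp
#include <cstdio>
#include <cstdlib>

#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>                 // memkind's HBM allocator
# define host_malloc(n)    hbw_malloc(n)
# define host_free(data)   if (data != NULL) hbw_free(data)
#else
# define host_malloc(n)    malloc(n)
# define host_free(data)   free(data)
#endif

int main() {
    void * buf = host_malloc(1024);    // HBM when available, regular heap otherwise
    if (buf == NULL) {
        std::fprintf(stderr, "allocation failed\n");
        return 1;
    }
    host_free(buf);
    return 0;
}
```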
@@ -568,16 +612,16 @@ struct llama_mmap {
 
         if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (
-                fprintf(stderr, "warning:
+            if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
+                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
         }
         if (numa) {
             // advise the kernel not to use readahead
             // (because the next page might not belong on the same node)
-            if (
-                fprintf(stderr, "warning:
+            if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
+                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
                         strerror(errno));
             }
         }
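Note (illustrative POSIX-only sketch; the file path is a placeholder): the hunk above hints the mapped region to the kernel with posix_madvise, which reports failure through its return value (0 on success) as the conditions show. A minimal usage example:

```cpp
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char ** argv) {
    const char * path = argc > 1 ? argv[1] : "/etc/hosts"; // any readable file
    const int fd = open(path, O_RDONLY);
    if (fd < 0) { std::perror("open"); return 1; }

    struct stat st;
    if (fstat(fd, &st) != 0 || st.st_size == 0) { close(fd); return 1; }
    const size_t len = (size_t) st.st_size;

    void * addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) { std::perror("mmap"); close(fd); return 1; }

    // Ask the kernel to prefetch the mapping; posix_madvise returns an error
    // number directly rather than setting errno.
    const int rc = posix_madvise(addr, len, POSIX_MADV_WILLNEED);
    if (rc != 0) {
        std::fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n", strerror(rc));
    }

    munmap(addr, len);
    close(fd);
    return 0;
}
```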
@@ -614,7 +658,9 @@ struct llama_mmap {
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch) {
             // Advise the kernel to preload the mapped memory
+
             WIN32_MEMORY_RANGE_ENTRY range;
+
             range.VirtualAddress = addr;
             range.NumberOfBytes = (SIZE_T)size;
             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
@@ -1446,7 +1492,11 @@ struct llama_model_loader {
         // allocate temp buffer if not using mmap
         if (!use_mmap && cur->data == NULL) {
             GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
-
+            #ifdef GGML_USE_CPU_HBM
+            cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
+            #else
+            cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
+            #endif
         }
 
         load_data_for(cur);
@@ -1600,9 +1650,13 @@ static void llm_load_hparams(
 
         GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
 
-        if (
-
+        if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+            if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
+                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+            }
         }
+        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
+        // gpt-j n_rot = rotary_dim
     }
 
     // arch-specific KVs
@@ -2895,7 +2949,12 @@ static bool llama_eval_internal(
 
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-
+    // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
+    //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
+    //       with the BLAS calls. need a better solution
+    if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+        n_threads = std::min(4, n_threads);
+    }
 
     struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
@@ -3000,33 +3059,10 @@ static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
 }
 
-static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
-}
-
-static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
-}
-
 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }
 
-static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(llama_is_control_token(vocab, id));
-    return id == vocab.special_bos_id;
-}
-
-static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
-    GGML_ASSERT(llama_is_control_token(vocab, id));
-    return id == vocab.special_eos_id;
-}
-
-static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
-    GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
-    return id == vocab.special_pad_id;
-}
-
 static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
@@ -3319,9 +3355,15 @@ struct llm_tokenizer_bpe {
                 std::string byte_str(1, *j);
                 auto token_multibyte = vocab.token_to_id.find(byte_str);
                 if (token_multibyte == vocab.token_to_id.end()) {
-
+                    try {
+                        llama_token token_byte = llama_byte_to_token(vocab, *j);
+                        output.push_back(token_byte);
+                    } catch (const std::out_of_range & err) {
+                        fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                    }
+                } else {
+                    output.push_back((*token_multibyte).second);
                 }
-                output.push_back((*token_multibyte).second);
             }
         } else {
             output.push_back((*token).second);
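Note (simplified sketch, not the real tokenizer types): the BPE change above falls back to llama_byte_to_token when a single-byte string is missing from the vocabulary, and catches the std::out_of_range that the fallback can raise so the failure becomes an error message instead of a crash. The control flow in miniature:

```cpp
#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

// byte_to_id stands in for llama_byte_to_token: .at() throws std::out_of_range
// when the byte has no token, which is the case the new catch block handles.
int main() {
    std::map<std::string, int>   token_to_id = { {"a", 1} };
    std::map<unsigned char, int> byte_to_id;   // left empty to force the error path

    const unsigned char byte = 'b';
    const std::string byte_str(1, (char) byte);

    auto token_multibyte = token_to_id.find(byte_str);
    if (token_multibyte == token_to_id.end()) {
        try {
            std::printf("byte token id = %d\n", byte_to_id.at(byte));
        } catch (const std::out_of_range &) {
            std::fprintf(stderr, "ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
        }
    } else {
        std::printf("token id = %d\n", token_multibyte->second);
    }
    return 0;
}
```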
@@ -3595,7 +3637,7 @@ static void llama_grammar_advance_stack(
         std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
 
     if (stack.empty()) {
-        new_stacks.
+        new_stacks.emplace_back(stack);
         return;
     }
 
@@ -3632,7 +3674,7 @@ static void llama_grammar_advance_stack(
             }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
-            new_stacks.
+            new_stacks.emplace_back(stack);
             break;
         default:
             // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -3797,6 +3839,25 @@ void llama_grammar_free(struct llama_grammar * grammar) {
     delete grammar;
 }
 
+struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
+    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
+
+    // redirect elements in stacks to point to new rules
+    for (size_t is = 0; is < result->stacks.size(); is++) {
+        for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
+            for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
+                for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
+                    if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
+                        result->stacks[is][ie] = &result->rules[ir0][ir1];
+                    }
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
 //
 // sampling
 //
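Note (standalone sketch with simplified types, not the llama.cpp structs): llama_grammar_copy above duplicates the rules and stacks by value and then walks every stack element, redirecting pointers that referenced the old rule storage to the matching element of the copy. The same fix-up in miniature:

```cpp
#include <cstdio>
#include <vector>

// grammar_like stands in for llama_grammar: the stack holds pointers into the
// rule storage, so a plain member-wise copy would leave them aimed at the
// original object's rules.
struct grammar_like {
    std::vector<std::vector<int>> rules;
    std::vector<const int *>      stack;
};

static grammar_like copy_grammar(const grammar_like & src) {
    grammar_like dst{src.rules, src.stack};
    for (size_t ie = 0; ie < dst.stack.size(); ++ie) {
        for (size_t ir0 = 0; ir0 < src.rules.size(); ++ir0) {
            for (size_t ir1 = 0; ir1 < src.rules[ir0].size(); ++ir1) {
                if (src.stack[ie] == &src.rules[ir0][ir1]) {
                    dst.stack[ie] = &dst.rules[ir0][ir1]; // re-point into the copy
                }
            }
        }
    }
    return dst;
}

int main() {
    grammar_like g;
    g.rules = {{10, 20}, {30}};
    g.stack = {&g.rules[0][1], &g.rules[1][0]};
    grammar_like h = copy_grammar(g);
    std::printf("copied stack points at %d and %d\n", *h.stack[0], *h.stack[1]);
    return 0;
}
```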
@@ -4388,7 +4449,7 @@ struct llama_logit_info {
         }
         return min_heap;
     }
-    float probability_from_logit(float logit) {
+    float probability_from_logit(float logit) const {
         return normalizer * std::exp(logit - max_l);
     }
 };
@@ -4678,6 +4739,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(*ml, model);
     llm_load_hparams(*ml, model, 0, 0, 0);
 
+    if (params->only_copy) {
+        ftype = model.ftype;
+    }
+
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
@@ -4719,9 +4784,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;
 
+#ifdef GGML_USE_K_QUANTS
     auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
     };
+#endif
 
     int idx = 0;
 
@@ -4764,18 +4831,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor->n_dims == 2);
         quantize &= params->quantize_output_tensor || name != "output.weight";
-        quantize &=
+        quantize &= !params->only_copy;
 
         enum ggml_type new_type;
         void * new_data;
        size_t new_size;
 
-        if (
-            new_type = tensor->type;
-            new_data = tensor->data;
-            new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
-        } else {
+        if (quantize) {
            new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
             // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -4874,7 +4936,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 }
             }
 #endif
-
+            // If we've decided to quantize to the same type the tensor is already
+            // in then there's nothing to do.
+            quantize = tensor->type != new_type;
+        }
+        if (!quantize) {
+            new_type = tensor->type;
+            new_data = tensor->data;
+            new_size = ggml_nbytes(tensor);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+        } else {
             const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
@@ -5279,7 +5350,7 @@ struct llama_context_params llama_context_default_params() {
         /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
-        /*.
+        /*.n_gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
         /*.rope_freq_base =*/ 10000.0f,
@@ -5296,6 +5367,10 @@ struct llama_context_params llama_context_default_params() {
         /*.embedding =*/ false,
     };
 
+#ifdef GGML_USE_METAL
+    result.n_gpu_layers = 1;
+#endif
+
     return result;
 }
 
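Note (hedged usage sketch, assuming the llama.h API shown elsewhere in this diff): with the change above, a Metal build now defaults to offloading one layer, so callers that want a CPU-only context should reset n_gpu_layers after fetching the defaults:

```cpp
#include <cstdio>
#include "llama.h"

int main() {
    llama_context_params params = llama_context_default_params();
    params.n_gpu_layers = 0; // opt out of the Metal default of one offloaded layer
    std::printf("n_gpu_layers = %d\n", (int) params.n_gpu_layers);
    return 0;
}
```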
|
@@ -5305,6 +5380,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.allow_requantize =*/ false,
         /*.quantize_output_tensor =*/ true,
+        /*.only_copy =*/ false,
     };
 
     return result;
@@ -5487,43 +5563,43 @@ struct llama_context * llama_new_context_with_model(
             }
 #endif
         }
-    }
 
 #ifdef GGML_USE_METAL
-
-
+    if (params.n_gpu_layers > 0) {
+        // this allocates all Metal resources and memory buffers
 
-
-
+        void * data_ptr = NULL;
+        size_t data_size = 0;
 
-
-
-
-
-
-
-
+        if (params.use_mmap) {
+            data_ptr = ctx->model.mapping->addr;
+            data_size = ctx->model.mapping->size;
+        } else {
+            data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+            data_size = ggml_get_mem_size (ctx->model.ctx);
+        }
 
-
+        const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-
+        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
 
 #define LLAMA_METAL_CHECK_BUF(result) \
-
-
-
-
-
+        if (!(result)) { \
+            LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
+            llama_free(ctx); \
+            return NULL; \
+        }
 
-
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-
-
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
 
-
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
-
+    }
 #endif
+    }
 
 #ifdef GGML_USE_MPI
     ctx->ctx_mpi = ggml_mpi_init();
@@ -5857,7 +5933,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         rng_ss.str(std::string(&rng_buf[0], rng_size));
         rng_ss >> ctx->rng;
 
-        GGML_ASSERT(rng_ss.fail()
+        GGML_ASSERT(!rng_ss.fail());
     }
 
     // set logits
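Note (self-contained sketch, independent of llama.cpp): the assertion above now checks !rng_ss.fail() after the RNG state has been streamed back in. Serializing and restoring a std::mt19937 through string streams works like this:

```cpp
#include <cassert>
#include <random>
#include <sstream>
#include <string>

int main() {
    std::mt19937 rng(1234);

    std::ostringstream out;
    out << rng;                      // serialize the RNG state as text
    const std::string state = out.str();

    std::mt19937 restored;
    std::istringstream in(state);
    in >> restored;                  // restore the RNG state
    assert(!in.fail());              // the stream must not be in a failed state

    assert(restored() == rng());     // both engines now produce the same next value
    return 0;
}
```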
data/ext/llama_cpp/src/llama.h CHANGED
@@ -164,6 +164,7 @@ extern "C" {
         enum llama_ftype ftype; // quantize to this llama_ftype
         bool allow_requantize; // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
+        bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
     } llama_model_quantize_params;
 
     // grammar types
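Note (hedged usage sketch, assuming the existing llama_model_quantize entry points in this header; the file names are placeholders): the new only_copy flag lets the quantization path rewrite a model file without changing any tensor types:

```cpp
#include <cstdio>
#include "llama.h"

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.only_copy = true; // copy tensors as-is; ftype and the other flags are ignored

    if (llama_model_quantize("model-in.gguf", "model-out.gguf", &params) != 0) {
        std::fprintf(stderr, "copy failed\n");
        return 1;
    }
    return 0;
}
```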
@@ -409,6 +410,8 @@ extern "C" {
 
     LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
 
+    LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
+
     //
     // Sampling functions
     //
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.5.0'
+  VERSION = '0.5.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1198'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.5.0
+  version: 0.5.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-09-
+date: 2023-09-08 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: