llama_cpp 0.5.0 → 0.5.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-alloc.c +106 -24
- data/ext/llama_cpp/src/ggml-cuda.cu +83 -23
- data/ext/llama_cpp/src/ggml-metal.m +35 -11
- data/ext/llama_cpp/src/ggml-metal.metal +145 -92
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +25 -53
- data/ext/llama_cpp/src/k_quants.c +45 -12
- data/ext/llama_cpp/src/llama.cpp +146 -70
- data/ext/llama_cpp/src/llama.h +3 -0
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/k_quants.c
CHANGED
@@ -13,6 +13,26 @@
 //
 #include <arm_neon.h>
 
+#if !defined(__aarch64__)
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+    return vcombine_s16(a0, b0);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+#endif
+
 #else
 
 #ifdef __wasm_simd128__
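The block added above gives 32-bit ARM builds scalar stand-ins for horizontal-reduction intrinsics that only exist on AArch64, so the NEON paths in k_quants.c keep compiling on armv7. A minimal sanity check of the idea, assuming an ARM toolchain that provides <arm_neon.h> (this program is an illustration, not part of the diff):

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void) {
        const int16_t lanes[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        const int16x8_t v = vld1q_s16(lanes);
        // On AArch64, vaddvq_s16 is a single horizontal-add instruction; on armv7
        // the fallback above extracts and sums the eight lanes one by one.
        // Either way the result here is 36.
        printf("vaddvq_s16 = %d\n", (int)vaddvq_s16(v));
        return 0;
    }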
@@ -63,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
         float ax = fabsf(x[i]);
         if (ax > amax) { amax = ax; max = x[i]; }
     }
-    if (!amax) { // all zero
+    if (amax < 1e-30f) { // all zero
         for (int i = 0; i < n; ++i) {
             L[i] = 0;
         }
@@ -183,13 +203,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
         int ntry, float alpha) {
     float min = x[0];
     float max = x[0];
-    float sum_x = 0;
-    float sum_x2 = 0;
     for (int i = 1; i < n; ++i) {
         if (x[i] < min) min = x[i];
         if (x[i] > max) max = x[i];
-        sum_x += x[i];
-        sum_x2 += x[i]*x[i];
     }
     if (max == min) {
         for (int i = 0; i < n; ++i) L[i] = 0;
@@ -1070,6 +1086,13 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
 
         }
 
+        if (!max_abs_scale) {
+            memset(&y[i], 0, sizeof(block_q6_K));
+            y[i].d = ggml_fp32_to_fp16(0.f);
+            x += QK_K;
+            continue;
+        }
+
         float iscale = -128.f/max_scale;
         y[i].d = ggml_fp32_to_fp16(1/iscale);
         for (int ib = 0; ib < QK_K/16; ++ib) {
@@ -1306,7 +1329,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
     const uint8x16_t m4 = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x2_t q2bytes;
     uint8_t aux[16];
@@ -1612,7 +1637,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q2bytes;
 
@@ -2060,7 +2087,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
     __m256 acc = _mm256_setzero_ps();
 
-    uint32_t *aux;
+    const uint32_t *aux;
 
     for (int i = 0; i < nb; ++i) {
 
@@ -2070,7 +2097,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const int8_t * restrict q8 = y[i].qs;
 
         // Set up scales
-        aux = (uint32_t *)x[i].scales;
+        aux = (const uint32_t *)x[i].scales;
         __m128i scales128 = _mm_set_epi32(
             ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
             ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
@@ -2596,8 +2623,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         const uint8_t * restrict q4 = x[i].qs;
         const int8_t  * restrict q8 = y[i].qs;
 
-        //int32x4_t isum = mzero;
-
         int32_t sumi1 = 0;
         int32_t sumi2 = 0;
 
@@ -3096,9 +3121,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mone = vdupq_n_u8(1);
     const uint8x16_t mtwo = vdupq_n_u8(2);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
 
@@ -3441,8 +3468,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mh = vdupq_n_u8(16);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
     uint8x16x4_t q5h;
@@ -3660,7 +3689,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
     //const int8x16_t m32s = vdupq_n_s8(32);
 
     const uint8x16_t mone = vdupq_n_u8(3);
@@ -4049,8 +4080,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
-    const int32x4_t vzero = vdupq_n_s32(0);
     const int8x16_t m32s = vdupq_n_s8(32);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     const uint8x16_t mone = vdupq_n_u8(3);
 
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -126,6 +126,9 @@ void replace_all(std::string & s, const std::string & search, const std::string
     }
     s = std::move(result);
 }
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
+#endif
 
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
@@ -325,6 +328,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GPT2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTJ,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTNEOX,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_MPT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_UNKNOWN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
 };
 
 static llm_arch llm_arch_from_string(const std::string & name) {
@@ -412,6 +453,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 #elif GGML_USE_METAL
 #  define llama_host_malloc(n)  ggml_metal_host_malloc(n)
 #  define llama_host_free(data) ggml_metal_host_free(data)
+#elif GGML_USE_CPU_HBM
+#  define llama_host_malloc(n)  hbw_malloc(n)
+#  define llama_host_free(data) if (data != NULL) hbw_free(data)
 #else
 #  define llama_host_malloc(n)  malloc(n)
 #  define llama_host_free(data) free(data)
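With GGML_USE_CPU_HBM defined, host buffers are drawn from libmemkind's hbwmalloc allocator instead of the ordinary heap, mirroring the existing CUDA and Metal branches. A hedged sketch of the same pattern outside the macro chain, assuming libmemkind and its hbwmalloc.h header are installed:

    #include <cstdlib>
    #ifdef GGML_USE_CPU_HBM
    #include <hbwmalloc.h>
    #endif

    static void * host_malloc(size_t n) {
    #ifdef GGML_USE_CPU_HBM
        return hbw_malloc(n);             // allocate from high-bandwidth memory
    #else
        return malloc(n);                 // regular heap otherwise
    #endif
    }

    static void host_free(void * data) {
    #ifdef GGML_USE_CPU_HBM
        if (data != NULL) hbw_free(data); // hbw_free must only see hbw_malloc'd pointers
    #else
        free(data);
    #endif
    }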
@@ -568,16 +612,16 @@ struct llama_mmap {
 
         if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
-                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+            if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
+                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
         }
         if (numa) {
             // advise the kernel not to use readahead
             // (because the next page might not belong on the same node)
-            if (madvise(addr, file->size, MADV_RANDOM)) {
-                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+            if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
+                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
                         strerror(errno));
             }
         }
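The mmap prefetch and NUMA hints now go through posix_madvise with the POSIX_MADV_* constants rather than the non-standard madvise/MADV_* pair. A standalone sketch of the same advice pattern, assuming a POSIX system (file name and error handling here are illustrative, not taken from the diff):

    #include <fcntl.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>
    #include <cstdio>
    #include <cstring>

    int main(int argc, char ** argv) {
        if (argc < 2) { return 1; }
        const int fd = open(argv[1], O_RDONLY);
        if (fd < 0) { return 1; }
        struct stat st;
        if (fstat(fd, &st) != 0) { close(fd); return 1; }
        void * addr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
        if (addr == MAP_FAILED) { close(fd); return 1; }
        // Ask the kernel to start reading the mapping in; posix_madvise reports
        // failure through its return value rather than errno.
        const int err = posix_madvise(addr, st.st_size, POSIX_MADV_WILLNEED);
        if (err != 0) {
            fprintf(stderr, "warning: posix_madvise failed: %s\n", strerror(err));
        }
        munmap(addr, st.st_size);
        close(fd);
        return 0;
    }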
@@ -614,7 +658,9 @@ struct llama_mmap {
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch) {
             // Advise the kernel to preload the mapped memory
+
             WIN32_MEMORY_RANGE_ENTRY range;
+
             range.VirtualAddress = addr;
             range.NumberOfBytes = (SIZE_T)size;
             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
@@ -1446,7 +1492,11 @@ struct llama_model_loader {
         // allocate temp buffer if not using mmap
         if (!use_mmap && cur->data == NULL) {
             GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
-            cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
+#ifdef GGML_USE_CPU_HBM
+            cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
+#else
+            cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
+#endif
         }
 
         load_data_for(cur);
@@ -1600,9 +1650,13 @@ static void llm_load_hparams(
 
         GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
 
-        if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
-            throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+        if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+            if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
+                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+            }
         }
+        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
+        // gpt-j n_rot = rotary_dim
     }
 
     // arch-specific KVs
@@ -2895,7 +2949,12 @@ static bool llama_eval_internal(
 
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
+    //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
+    //       with the BLAS calls. need a better solution
+    if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+        n_threads = std::min(4, n_threads);
+    }
 
     struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
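Instead of dropping to a single thread whenever a large batch runs through BLAS, the evaluation path now caps the thread count at four so the remaining non-mul_mat ops still get some parallelism. A hedged sketch of that policy as a standalone helper (the ggml_cpu_has_* calls are the real feature probes; the wrapper itself is illustrative):

    #include <algorithm>

    // n_tokens: batch size being evaluated; n_threads: thread count requested by the caller.
    static int effective_threads(int n_tokens, int n_threads, bool has_blas, bool has_gpublas) {
        if (n_tokens >= 32 && has_blas && !has_gpublas) {
            return std::min(4, n_threads); // previously this forced a single thread
        }
        return n_threads;
    }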
@@ -3000,33 +3059,10 @@ static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
 }
 
-static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
-}
-
-static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
-}
-
 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }
 
-static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(llama_is_control_token(vocab, id));
-    return id == vocab.special_bos_id;
-}
-
-static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
-    GGML_ASSERT(llama_is_control_token(vocab, id));
-    return id == vocab.special_eos_id;
-}
-
-static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
-    GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
-    return id == vocab.special_pad_id;
-}
-
 static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
@@ -3319,9 +3355,15 @@ struct llm_tokenizer_bpe {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
                     if (token_multibyte == vocab.token_to_id.end()) {
-                        fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                        try {
+                            llama_token token_byte = llama_byte_to_token(vocab, *j);
+                            output.push_back(token_byte);
+                        } catch (const std::out_of_range & err) {
+                            fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                        }
+                    } else {
+                        output.push_back((*token_multibyte).second);
                     }
-                    output.push_back((*token_multibyte).second);
                 }
             } else {
                 output.push_back((*token).second);
@@ -3595,7 +3637,7 @@ static void llama_grammar_advance_stack(
         std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
 
     if (stack.empty()) {
-        new_stacks.push_back(stack);
+        new_stacks.emplace_back(stack);
         return;
     }
 
@@ -3632,7 +3674,7 @@ static void llama_grammar_advance_stack(
         }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
-            new_stacks.push_back(stack);
+            new_stacks.emplace_back(stack);
             break;
         default:
             // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -3797,6 +3839,25 @@ void llama_grammar_free(struct llama_grammar * grammar) {
     delete grammar;
 }
 
+struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
+    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
+
+    // redirect elements in stacks to point to new rules
+    for (size_t is = 0; is < result->stacks.size(); is++) {
+        for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
+            for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
+                for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
+                    if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
+                        result->stacks[is][ie] = &result->rules[ir0][ir1];
+                    }
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
 //
 // sampling
 //
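llama_grammar_copy duplicates both the rule set and the active stacks, then rewrites every stack element so it points into the copied rules rather than the originals; without that fix-up the copy would dangle once the source grammar is freed. A hedged usage sketch, assuming a grammar handle created earlier with llama_grammar_init:

    // Duplicate a grammar so two sampling streams can advance it independently.
    struct llama_grammar * branch = llama_grammar_copy(grammar);

    // ... constrain sampling with `grammar` and `branch` separately ...

    llama_grammar_free(branch);   // each copy owns its rules and must be freed
    llama_grammar_free(grammar);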
@@ -4388,7 +4449,7 @@ struct llama_logit_info {
         }
         return min_heap;
     }
-    float probability_from_logit(float logit) {
+    float probability_from_logit(float logit) const {
         return normalizer * std::exp(logit - max_l);
     }
 };
@@ -4678,6 +4739,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(*ml, model);
     llm_load_hparams(*ml, model, 0, 0, 0);
 
+    if (params->only_copy) {
+        ftype = model.ftype;
+    }
+
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
@@ -4719,9 +4784,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;
 
+#ifdef GGML_USE_K_QUANTS
     auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
     };
+#endif
 
     int idx = 0;
 
@@ -4764,18 +4831,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
        quantize &= (tensor->n_dims == 2);
         quantize &= params->quantize_output_tensor || name != "output.weight";
-        quantize &= quantized_type != tensor->type;
+        quantize &= !params->only_copy;
 
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
 
-        if (!quantize) {
-            new_type = tensor->type;
-            new_data = tensor->data;
-            new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
-        } else {
+        if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
             // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -4874,7 +4936,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 }
             }
 #endif
-
+            // If we've decided to quantize to the same type the tensor is already
+            // in then there's nothing to do.
+            quantize = tensor->type != new_type;
+        }
+        if (!quantize) {
+            new_type = tensor->type;
+            new_data = tensor->data;
+            new_size = ggml_nbytes(tensor);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+        } else {
             const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
@@ -5279,7 +5350,7 @@ struct llama_context_params llama_context_default_params() {
         /*.seed                      =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx                     =*/ 512,
         /*.n_batch                   =*/ 512,
-        /*.
+        /*.n_gpu_layers              =*/ 0,
         /*.main_gpu                  =*/ 0,
         /*.tensor_split              =*/ nullptr,
         /*.rope_freq_base            =*/ 10000.0f,
@@ -5296,6 +5367,10 @@ struct llama_context_params llama_context_default_params() {
         /*.embedding                 =*/ false,
     };
 
+#ifdef GGML_USE_METAL
+    result.n_gpu_layers = 1;
+#endif
+
     return result;
 }
 
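Because the defaults now request one offloaded layer whenever the library is built with Metal, callers that want a pure-CPU context must reset the field explicitly. A hedged sketch, assuming a Metal-enabled build of the bundled llama.cpp:

    llama_context_params cparams = llama_context_default_params();
    cparams.n_gpu_layers = 0; // opt back out of the new Metal default of 1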
@@ -5305,6 +5380,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.ftype                     =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.allow_requantize          =*/ false,
         /*.quantize_output_tensor    =*/ true,
+        /*.only_copy                 =*/ false,
     };
 
     return result;
@@ -5487,43 +5563,43 @@ struct llama_context * llama_new_context_with_model(
         }
 #endif
     }
-    }
 
 #ifdef GGML_USE_METAL
-    if (params.n_gpu_layers > 0) {
-        // this allocates all Metal resources and memory buffers
+        if (params.n_gpu_layers > 0) {
+            // this allocates all Metal resources and memory buffers
 
-        void * data_ptr  = NULL;
-        size_t data_size = 0;
+            void * data_ptr  = NULL;
+            size_t data_size = 0;
 
-        if (params.use_mmap) {
-            data_ptr  = ctx->model.mapping->addr;
-            data_size = ctx->model.mapping->size;
-        } else {
-            data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
-            data_size = ggml_get_mem_size  (ctx->model.ctx);
-        }
+            if (params.use_mmap) {
+                data_ptr  = ctx->model.mapping->addr;
+                data_size = ctx->model.mapping->size;
+            } else {
+                data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
+                data_size = ggml_get_mem_size  (ctx->model.ctx);
+            }
 
-        const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+            const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
 
 #define LLAMA_METAL_CHECK_BUF(result) \
-    if (!(result)) { \
-        LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
-        llama_free(ctx); \
-        return NULL; \
-    }
+            if (!(result)) { \
+                LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
+                llama_free(ctx); \
+                return NULL; \
+            }
 
-    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
-    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
 
-    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
-    }
+        }
 #endif
+    }
 
 #ifdef GGML_USE_MPI
     ctx->ctx_mpi = ggml_mpi_init();
@@ -5857,7 +5933,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         rng_ss.str(std::string(&rng_buf[0], rng_size));
         rng_ss >> ctx->rng;
 
-        GGML_ASSERT(rng_ss.fail() == false);
+        GGML_ASSERT(!rng_ss.fail());
     }
 
     // set logits
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -164,6 +164,7 @@ extern "C" {
         enum llama_ftype ftype;      // quantize to this llama_ftype
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
+        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
     } llama_model_quantize_params;
 
     // grammar types
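The new only_copy flag turns llama_model_quantize into a plain tensor copy: the output keeps the input model's ftype and no tensor is requantized. A hedged usage sketch with placeholder file names:

    #include "llama.h"

    int main() {
        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.only_copy = true; // rewrite the GGUF file without changing tensor types
        // "input.gguf" and "copy.gguf" are illustrative paths, not from the diff.
        return llama_model_quantize("input.gguf", "copy.gguf", &qparams);
    }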
@@ -409,6 +410,8 @@ extern "C" {
 
     LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
 
+    LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
+
     //
     // Sampling functions
     //
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.5.0'
+  VERSION = '0.5.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1198'
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.5.0
+  version: 0.5.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-09-
+date: 2023-09-08 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: