cui-llama.rn 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +1 -0
- package/android/src/main/jni.cpp +3 -4
- package/cpp/common.cpp +183 -1990
- package/cpp/common.h +101 -130
- package/cpp/ggml-impl.h +32 -0
- package/cpp/ggml-metal.m +38 -28
- package/cpp/ggml-quants.c +275 -84
- package/cpp/ggml.c +89 -35
- package/cpp/ggml.h +30 -67
- package/cpp/llama-impl.h +1 -0
- package/cpp/llama-sampling.cpp +218 -102
- package/cpp/llama.cpp +599 -120
- package/cpp/llama.h +33 -25
- package/cpp/log.cpp +401 -0
- package/cpp/log.h +85 -703
- package/cpp/rn-llama.hpp +9 -11
- package/cpp/sampling.cpp +12 -9
- package/cpp/sampling.h +4 -56
- package/cpp/sgemm.cpp +38 -0
- package/package.json +1 -1
package/cpp/rn-llama.hpp
CHANGED
@@ -5,10 +5,7 @@
 #include <iostream>
 #include "common.h"
 #include "llama.h"
-
-#include <android/log.h>
-#define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
-#define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
+#include "sampling.h"
 
 namespace rnllama {
 
@@ -27,6 +24,7 @@ static void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, s
     batch->n_tokens += 1;
 }
 
+
 // NOTE: Edit from https://github.com/ggerganov/llama.cpp/blob/master/examples/server/server.cpp
 
 static void log(const char *level, const char *function, int line,
@@ -308,9 +306,9 @@ struct llama_rn_context
        }
        // compare the evaluated prompt with the new prompt
        n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
-
-
-
+        LOG_INFO("%s: n_past: %zu", __func__, n_past);
+        LOG_INFO("%s: embd size: %zu", __func__, embd.size());
+        LOG_INFO("%s: prompt_tokens size: %zu", __func__, prompt_tokens.size());
        embd = prompt_tokens;
        if (n_past == num_prompt_tokens)
        {
@@ -334,7 +332,7 @@ struct llama_rn_context
    {
        // number of tokens to keep when resetting context
        n_remain = params.n_predict;
-
+        llama_perf_context_reset(ctx);
        is_predicting = true;
    }
 
@@ -391,7 +389,7 @@ struct llama_rn_context
            n_past += n_eval;
 
            if(is_interrupted) {
-
+                LOG_INFO("Decoding Interrupted");
                embd.resize(n_past);
                has_next_token = false;
                return result;
@@ -797,7 +795,7 @@ void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context
 
    if(!purge_needed || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < short_fall_threshold)
    {
-
+        LOG_INFO("Fall Threshold: %d out of %d\n", new_tokens_len - trimstart, short_fall_threshold);
        return; //no purge is needed
    }
 
@@ -825,7 +823,7 @@ void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context
        current_context_tokens[i - diff] = current_context_tokens[i];
    }
 
-
+    LOG_INFO("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);
 
    current_context_tokens.resize(current_context_tokens.size() - diff);
}
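Note on the change above: the Android-only logging macros are dropped in favor of the shared printf-style LOG_INFO coming in via the reworked log facility, and the new diagnostics pass size_t values, hence the %zu conversions. A minimal stand-alone sketch of that logging convention, assuming only <cstdio> (the real macro lives in log.h and may add levels and extra sinks):

#include <cstdio>
#include <vector>

// Hypothetical stand-in for the shared LOG_INFO macro; the real one is
// provided by log.h and may route output elsewhere.
#define LOG_INFO(fmt, ...) std::fprintf(stderr, "[INFO] " fmt "\n", __VA_ARGS__)

int main() {
    std::vector<int> prompt_tokens = {1, 2, 3, 4};
    size_t n_past = 2;
    // size_t must be printed with %zu to stay correct on 32- and 64-bit targets.
    LOG_INFO("%s: n_past: %zu", __func__, n_past);
    LOG_INFO("%s: prompt_tokens size: %zu", __func__, prompt_tokens.size());
    return 0;
}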
package/cpp/sampling.cpp
CHANGED
@@ -2,6 +2,9 @@
 
 #include "common.h"
 
+#include <cmath>
+#include <unordered_map>
+
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
 template<typename T>
@@ -139,7 +142,7 @@ std::string gpt_sampler_params::print() const {
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
-    lparams.no_perf =
+    lparams.no_perf = params.no_perf;
 
     auto * result = new gpt_sampler {
         /* .params = */ params,
@@ -257,10 +260,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
     // TODO: measure grammar performance
 
     if (gsmpl) {
-
+        llama_perf_sampler_print(gsmpl->chain);
     }
     if (ctx) {
-
+        llama_perf_context_print(ctx);
     }
 }
 
@@ -310,6 +313,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
     return cur_p.data[cur_p.selected].id;
 }
 
+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
+    return llama_sampler_get_seed(gsmpl->chain);
+}
+
 // helpers
 
 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
@@ -321,7 +328,7 @@ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
 }
 
 std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
-    std::string result = "
+    std::string result = "logits ";
 
     for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
         const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
@@ -352,10 +359,6 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
     return result;
 }
 
-struct llama_sampler_timings gpt_sampler_get_timigs(const struct gpt_sampler * gsmpl){
-    return llama_sampler_chain_timings(gsmpl -> chain);
-}
-
 char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
     switch (cnstr) {
         case GPT_SAMPLER_TYPE_TOP_K: return 'k';
@@ -432,7 +435,7 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
 }
 
 std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
-    std::unordered_map<char, gpt_sampler_type> sampler_name_map {
+    std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
         { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K },
         { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z },
         { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
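Note on the change above: sampling.cpp gains gpt_sampler_get_seed, which reads back the seed from the underlying llama_sampler chain. A short usage sketch, assuming `model` is a llama_model * loaded elsewhere and that the usual gpt_sampler_free counterpart of gpt_sampler_init is available:

#include <cstdint>
#include <cstdio>
#include "sampling.h"

// Sketch only: `model` is assumed to be a valid, already-loaded llama_model *.
void report_sampling_seed(const llama_model * model) {
    gpt_sampler_params sparams;                  // defaults, seed = LLAMA_DEFAULT_SEED
    gpt_sampler * smpl = gpt_sampler_init(model, sparams);

    // Query the seed the chain actually uses, e.g. to make a run reproducible.
    const uint32_t seed = gpt_sampler_get_seed(smpl);
    std::printf("sampling seed: %u\n", seed);

    gpt_sampler_free(smpl);                      // assumed cleanup helper
}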
package/cpp/sampling.h
CHANGED
@@ -2,65 +2,11 @@
 
 #include "llama.h"
 
+#include "common.h"
+
 #include <string>
 #include <vector>
 
-enum gpt_sampler_type {
-    GPT_SAMPLER_TYPE_NONE        = 0,
-    GPT_SAMPLER_TYPE_TOP_K       = 1,
-    GPT_SAMPLER_TYPE_TOP_P       = 2,
-    GPT_SAMPLER_TYPE_MIN_P       = 3,
-    GPT_SAMPLER_TYPE_TFS_Z       = 4,
-    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
-    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
-    GPT_SAMPLER_TYPE_XTC         = 7,
-};
-
-// sampling parameters
-struct gpt_sampler_params {
-    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
-
-    int32_t n_prev            = 64;    // number of previous tokens to remember
-    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep          = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t top_k             = 40;    // <= 0 to use vocab size
-    float   top_p             = 0.95f; // 1.0 = disabled
-    float   min_p             = 0.05f; // 0.0 = disabled
-    float   tfs_z             = 1.00f; // 1.0 = disabled
-    float   xtc_t             = 0.0f;  // 0.0 = disabled
-    float   xtc_p             = 0.0f;
-    float   typ_p             = 1.00f; // typical_p, 1.0 = disabled
-    float   temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float   dynatemp_range    = 0.00f; // 0.0 = disabled
-    float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat    = 1.00f; // 1.0 = disabled
-    float   penalty_freq      = 0.00f; // 0.0 = disabled
-    float   penalty_present   = 0.00f; // 0.0 = disabled
-    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau      = 5.00f; // target entropy
-    float   mirostat_eta      = 0.10f; // learning rate
-    bool    penalize_nl       = false; // consider newlines as a repeatable token
-    bool    ignore_eos        = false;
-
-    std::vector<enum gpt_sampler_type> samplers = {
-        GPT_SAMPLER_TYPE_TOP_K,
-        GPT_SAMPLER_TYPE_TFS_Z,
-        GPT_SAMPLER_TYPE_TYPICAL_P,
-        GPT_SAMPLER_TYPE_TOP_P,
-        GPT_SAMPLER_TYPE_MIN_P,
-        GPT_SAMPLER_TYPE_XTC,
-        GPT_SAMPLER_TYPE_TEMPERATURE
-    };
-
-    std::string grammar; // optional BNF-like grammar to constrain sampling
-
-    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
-
-    // print the parameters into a string
-    std::string print() const;
-};
-
 // gpt_sampler extends llama_sampler with additional functionality:
 //
 // - grammar support
@@ -114,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
 //
 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
 
+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
+
 // helpers
 
 // access the internal list of current candidate tokens
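Note on the change above: the gpt_sampler_type enum and gpt_sampler_params struct move out of sampling.h; callers now pick them up through the common.h include added at the top of the header. Configuration code is otherwise unchanged. A small sketch using the fields shown in the removed block, assuming they keep the same names in their new home:

#include <cstdio>
#include "common.h"
#include "sampling.h"

int main() {
    gpt_sampler_params sparams;    // defaults as listed above (top_k = 40, temp = 0.80f, ...)
    sparams.temp  = 0.7f;
    sparams.top_k = 50;
    sparams.min_p = 0.1f;
    // Restrict the chain to the samplers we want, applied in this order.
    sparams.samplers = {
        GPT_SAMPLER_TYPE_TOP_K,
        GPT_SAMPLER_TYPE_MIN_P,
        GPT_SAMPLER_TYPE_TEMPERATURE,
    };
    std::printf("%s\n", sparams.print().c_str());
    return 0;
}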
package/cpp/sgemm.cpp
CHANGED
@@ -235,6 +235,14 @@ template <> inline __m512 load(const lm_ggml_fp16_t *p) {
 }
 #endif // __AVX512F__
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// CONSTANTS
+
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
+#endif
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // FLOATING POINT MATRIX MULTIPLICATION
 
@@ -933,6 +941,20 @@ class tinyBLAS_Q0_AVX {
         return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
     }
 
+    inline __m256i load(const block_iq4_nl *b) {
+        return MM256_SET_M128I(load1(b), load0(b));
+    }
+
+    inline __m128i load0(const block_iq4_nl *b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), x));
+    }
+
+    inline __m128i load1(const block_iq4_nl *b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)));
+    }
+
     inline __m256 updot(__m256i u, __m256i s) {
         __m256i res;
 #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
@@ -1159,6 +1181,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 #endif
     }
 
+    case LM_GGML_TYPE_IQ4_NL: {
+        if (Btype != LM_GGML_TYPE_Q8_0)
+            return false;
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+        tinyBLAS_Q0_AVX<block_iq4_nl, block_q8_0, float> tb{
+            k, (const block_iq4_nl *)A, lda,
+            (const block_q8_0 *)B, ldb,
+            (float *)C, ldc,
+            ith, nth};
+        tb.matmul(m, n);
+        return true;
+#else
+        return false;
+#endif
+    }
+
     default:
         return false;
     }
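Note on the sgemm.cpp changes: the IQ4_NL additions wire the non-linear 4-bit format into tinyBLAS. kvalues_iq4nl holds the 16 dequantized levels, and load0/load1 use _mm_shuffle_epi8 against iq4nlt to perform 16 table lookups at once on the low and high nibbles of b->qs; llamafile_sgemm then accepts IQ4_NL x Q8_0 on AVX builds. A scalar sketch of the same expansion, assuming a block packs 32 indices two per byte (low nibbles first, matching load0/load1); block scales are omitted:

#include <cstdint>

// The 16 non-linear quantization levels from the diff above.
static const int8_t kvalues_iq4nl[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
};

// Scalar equivalent of load0()/load1(): each 4-bit index selects one of the
// 16 levels. The low nibble of qs[i] produces element i, the high nibble
// produces element i + 16, mirroring the two SSE shuffles.
static void iq4nl_expand(const uint8_t qs[16], int8_t out[32]) {
    for (int i = 0; i < 16; ++i) {
        out[i]      = kvalues_iq4nl[qs[i] & 0x0F];
        out[i + 16] = kvalues_iq4nl[qs[i] >> 4];
    }
}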