cui-llama.rn 1.1.5 → 1.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +1 -0
- package/android/src/main/jni.cpp +1 -4
- package/cpp/common.cpp +157 -53
- package/cpp/common.h +11 -3
- package/cpp/ggml-metal.m +33 -22
- package/cpp/ggml-quants.c +33 -36
- package/cpp/ggml.h +5 -4
- package/cpp/llama-impl.h +1 -0
- package/cpp/llama-sampling.cpp +0 -8
- package/cpp/llama.cpp +519 -34
- package/cpp/llama.h +0 -17
- package/cpp/log.cpp +401 -0
- package/cpp/log.h +85 -703
- package/cpp/rn-llama.hpp +7 -10
- package/cpp/sampling.cpp +1 -5
- package/cpp/sgemm.cpp +38 -0
- package/package.json +1 -1
package/cpp/rn-llama.hpp
CHANGED
@@ -5,11 +5,7 @@
 #include <iostream>
 #include "common.h"
 #include "llama.h"
-
-#include <android/log.h>
 #include "sampling.h"
-#define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
-#define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)

 namespace rnllama {

@@ -28,6 +24,7 @@ static void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, s
     batch->n_tokens += 1;
 }

+
 // NOTE: Edit from https://github.com/ggerganov/llama.cpp/blob/master/examples/server/server.cpp

 static void log(const char *level, const char *function, int line,
@@ -309,9 +306,9 @@ struct llama_rn_context
     }
     // compare the evaluated prompt with the new prompt
     n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
-
-
-
+    LOG_INFO("%s: n_past: %zu", __func__, n_past);
+    LOG_INFO("%s: embd size: %zu", __func__, embd.size());
+    LOG_INFO("%s: prompt_tokens size: %zu", __func__, prompt_tokens.size());
     embd = prompt_tokens;
     if (n_past == num_prompt_tokens)
     {
@@ -392,7 +389,7 @@ struct llama_rn_context
     n_past += n_eval;

     if(is_interrupted) {
-
+        LOG_INFO("Decoding Interrupted");
         embd.resize(n_past);
         has_next_token = false;
         return result;
@@ -798,7 +795,7 @@ void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context

     if(!purge_needed || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < short_fall_threshold)
     {
-
+        LOG_INFO("Fall Threshold: %d out of %d\n", new_tokens_len - trimstart, short_fall_threshold);
         return; //no purge is needed
     }

@@ -826,7 +823,7 @@ void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context
         current_context_tokens[i - diff] = current_context_tokens[i];
     }

-
+    LOG_INFO("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);

     current_context_tokens.resize(current_context_tokens.size() - diff);
 }
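Net effect of the rn-llama.hpp hunks: the Android-only __android_log_print macro is dropped and logging goes through the shared LOG_INFO facility instead (note log.cpp +401 and the rewritten log.h in the file list above). For readers unfamiliar with the pattern, here is a minimal sketch of the printf-style helper behind such macros; the signatures below are illustrative assumptions, not the package's actual log.h:

// Hypothetical sketch only -- the real implementation lives in package/cpp/log.cpp.
#include <cstdarg>
#include <cstdio>

static void log(const char *level, const char *function, int line,
                const char *format, ...) {
    va_list args;
    va_start(args, format);
    fprintf(stderr, "[%s] %s:%d ", level, function, line); // metadata prefix
    vfprintf(stderr, format, args);                        // caller's printf-style payload
    fputc('\n', stderr);
    va_end(args);
}

#define LOG_INFO(...) log("INFO", __func__, __LINE__, __VA_ARGS__)

With such a definition, a call like LOG_INFO("%s: n_past: %zu", __func__, n_past) from the hunk above expands to one line on the log stream, with no Android NDK dependency.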
package/cpp/sampling.cpp
CHANGED
@@ -328,7 +328,7 @@ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
 }

 std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
-    std::string result = "
+    std::string result = "logits ";

     for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
         const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
@@ -359,10 +359,6 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
     return result;
 }

-struct llama_sampler_timings gpt_sampler_get_timigs(const struct gpt_sampler * gsmpl){
-    return llama_sampler_chain_timings(gsmpl -> chain);
-}
-
 char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
     switch (cnstr) {
         case GPT_SAMPLER_TYPE_TOP_K: return 'k';
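This removal lines up with the llama.h (-17) and llama-sampling.cpp (-8) entries in the file list: the misspelled gpt_sampler_get_timigs wrapper disappears along with, presumably, the llama_sampler_timings struct and llama_sampler_chain_timings helper it wrapped.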
package/cpp/sgemm.cpp
CHANGED
@@ -235,6 +235,14 @@ template <> inline __m512 load(const lm_ggml_fp16_t *p) {
 }
 #endif // __AVX512F__

+////////////////////////////////////////////////////////////////////////////////////////////////////
+// CONSTANTS
+
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
+#endif
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // FLOATING POINT MATRIX MULTIPLICATION

@@ -933,6 +941,20 @@ class tinyBLAS_Q0_AVX {
         return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
     }

+    inline __m256i load(const block_iq4_nl *b) {
+        return MM256_SET_M128I(load1(b), load0(b));
+    }
+
+    inline __m128i load0(const block_iq4_nl *b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), x));
+    }
+
+    inline __m128i load1(const block_iq4_nl *b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)));
+    }
+
     inline __m256 updot(__m256i u, __m256i s) {
         __m256i res;
 #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
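These load overloads are the whole trick for IQ4_NL inside tinyBLAS: each byte of b->qs packs two 4-bit indices, and _mm_shuffle_epi8 against the preloaded iq4nlt table maps sixteen indices to their nonlinear int8 values in a single instruction. A scalar sketch of the same computation, assuming the upstream ggml layout where block_iq4_nl stores 32 indices in qs[16] alongside an fp16 scale:

#include <cstdint>

static const int8_t kvalues_iq4nl[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10,
       1,   13,  25,  38,  53,  69,  89, 113,
};

// out[0..15] mirrors load0 (low nibbles), out[16..31] mirrors load1 (high nibbles).
void iq4nl_lookup(const uint8_t qs[16], int8_t out[32]) {
    for (int i = 0; i < 16; ++i) {
        out[i]      = kvalues_iq4nl[qs[i] & 0x0F];
        out[i + 16] = kvalues_iq4nl[qs[i] >> 4];
    }
}

The extra mask in load1 (_mm_and_si128 after _mm_srli_epi16) exists because the SSE shift operates on 16-bit lanes and would otherwise smear bits across byte boundaries; the scalar >> 4 on a uint8_t needs no mask.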
@@ -1159,6 +1181,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 #endif
     }

+    case LM_GGML_TYPE_IQ4_NL: {
+        if (Btype != LM_GGML_TYPE_Q8_0)
+            return false;
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+        tinyBLAS_Q0_AVX<block_iq4_nl, block_q8_0, float> tb{
+            k, (const block_iq4_nl *)A, lda,
+            (const block_q8_0 *)B, ldb,
+            (float *)C, ldc,
+            ith, nth};
+        tb.matmul(m, n);
+        return true;
+#else
+        return false;
+#endif
+    }
+
     default:
         return false;
     }
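The new case extends llamafile_sgemm's A-type dispatch so IQ4_NL weight blocks reuse the existing tinyBLAS_Q0_AVX kernel whenever the activations are Q8_0; on non-AVX builds, or for any other B type, it returns false and the caller falls back to ggml's generic vec_dot path. A hypothetical call sketch follows; the tail of the parameter list (the task tag and three type tags) is assumed from the upstream sgemm.h declaration, not quoted from this package, and sgemm.h/ggml.h are assumed included:

// Hypothetical wrapper showing how the new case would be reached from
// ggml's matmul op, running as thread ith of nth.
bool try_fast_iq4nl_mm(int64_t m, int64_t n, int64_t k,
                       const block_iq4_nl *A, int64_t lda,
                       const block_q8_0 *B, int64_t ldb,
                       float *C, int64_t ldc,
                       int ith, int nth) {
    return llamafile_sgemm(m, n, k,
                           A, lda,   // IQ4_NL weights: hits the new case
                           B, ldb,   // Q8_0 activations: required, else false
                           C, ldc,   // fp32 output
                           ith, nth, // thread index / thread count
                           LM_GGML_TASK_TYPE_COMPUTE, // assumed task tag
                           LM_GGML_TYPE_IQ4_NL,
                           LM_GGML_TYPE_Q8_0,
                           LM_GGML_TYPE_F32);
    // A false return means: use the generic quantized dot-product instead.
}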