cui-llama.rn 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/rn-llama.hpp CHANGED
@@ -5,10 +5,7 @@
  #include <iostream>
  #include "common.h"
  #include "llama.h"
-
- #include <android/log.h>
- #define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
- #define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
+ #include "sampling.h"

  namespace rnllama {

@@ -27,6 +24,7 @@ static void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, s
  batch->n_tokens += 1;
  }

+
  // NOTE: Edit from https://github.com/ggerganov/llama.cpp/blob/master/examples/server/server.cpp

  static void log(const char *level, const char *function, int line,
@@ -308,9 +306,9 @@ struct llama_rn_context
  }
  // compare the evaluated prompt with the new prompt
  n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
- LLAMA_LOG_INFO("%s: n_past: %zu", __func__, n_past);
- LLAMA_LOG_INFO("%s: embd size: %zu", __func__, embd.size());
- LLAMA_LOG_INFO("%s: prompt_tokens size: %zu", __func__, prompt_tokens.size());
+ LOG_INFO("%s: n_past: %zu", __func__, n_past);
+ LOG_INFO("%s: embd size: %zu", __func__, embd.size());
+ LOG_INFO("%s: prompt_tokens size: %zu", __func__, prompt_tokens.size());
  embd = prompt_tokens;
  if (n_past == num_prompt_tokens)
  {
@@ -334,7 +332,7 @@ struct llama_rn_context
  {
  // number of tokens to keep when resetting context
  n_remain = params.n_predict;
- llama_perf_reset(ctx, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_context_reset(ctx);
  is_predicting = true;
  }

@@ -391,7 +389,7 @@ struct llama_rn_context
  n_past += n_eval;

  if(is_interrupted) {
- LOG("Decoding Interrupted");
+ LOG_INFO("Decoding Interrupted");
  embd.resize(n_past);
  has_next_token = false;
  return result;
@@ -797,7 +795,7 @@ void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context

  if(!purge_needed || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < short_fall_threshold)
  {
- LLAMA_LOG_INFO("Fall Threshold: %d out of %d\n", new_tokens_len - trimstart, short_fall_threshold);
+ LOG_INFO("Fall Threshold: %d out of %d\n", new_tokens_len - trimstart, short_fall_threshold);
  return; //no purge is needed
  }

@@ -825,7 +823,7 @@ void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context
  current_context_tokens[i - diff] = current_context_tokens[i];
  }

- LLAMA_LOG_INFO("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);
+ LOG_INFO("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);

  current_context_tokens.resize(current_context_tokens.size() - diff);
  }
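Note: these hunks drop the Android-only LLAMA_LOG_INFO macro (built on __android_log_print) in favor of the shared LOG_INFO macro, and follow the upstream llama.cpp rename of llama_perf_reset(ctx, LLAMA_PERF_TYPE_CONTEXT) to llama_perf_context_reset(ctx). As a rough sketch only, a platform-neutral logging macro could be wired up like the following; the EXAMPLE_* names are hypothetical and this is not the macro actually provided by common.h:

    // Sketch: route logging to logcat on Android, stderr elsewhere.
    // EXAMPLE_LOG_TAG and EXAMPLE_LOG_INFO are illustrative names only.
    #include <cstdio>
    #if defined(__ANDROID__)
    #include <android/log.h>
    #define EXAMPLE_LOG_TAG "RNLLAMA_EXAMPLE"
    #define EXAMPLE_LOG_INFO(...) \
        __android_log_print(ANDROID_LOG_INFO, EXAMPLE_LOG_TAG, __VA_ARGS__)
    #else
    #define EXAMPLE_LOG_INFO(...) \
        do { fprintf(stderr, __VA_ARGS__); fputc('\n', stderr); } while (0)
    #endif
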
package/cpp/sampling.cpp CHANGED
@@ -2,6 +2,9 @@

  #include "common.h"

+ #include <cmath>
+ #include <unordered_map>
+
  // the ring buffer works similarly to std::deque, but with a fixed capacity
  // TODO: deduplicate with llama-impl.h
  template<typename T>
@@ -139,7 +142,7 @@ std::string gpt_sampler_params::print() const {
  struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
  llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

- lparams.no_perf = false; // TODO: control via params
+ lparams.no_perf = params.no_perf;

  auto * result = new gpt_sampler {
  /* .params = */ params,
@@ -257,10 +260,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
  // TODO: measure grammar performance

  if (gsmpl) {
- llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+ llama_perf_sampler_print(gsmpl->chain);
  }
  if (ctx) {
- llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_context_print(ctx);
  }
  }

@@ -310,6 +313,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
  return cur_p.data[cur_p.selected].id;
  }

+ uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
+ return llama_sampler_get_seed(gsmpl->chain);
+ }
+
  // helpers

  llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
@@ -321,7 +328,7 @@ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
  }

  std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
- std::string result = "\tlogits ";
+ std::string result = "logits ";

  for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
  const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
@@ -352,10 +359,6 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
  return result;
  }

- struct llama_sampler_timings gpt_sampler_get_timigs(const struct gpt_sampler * gsmpl){
- return llama_sampler_chain_timings(gsmpl -> chain);
- }
-
  char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
  switch (cnstr) {
  case GPT_SAMPLER_TYPE_TOP_K: return 'k';
@@ -432,7 +435,7 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
  }

  std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
- std::unordered_map<char, gpt_sampler_type> sampler_name_map {
+ std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
  { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K },
  { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z },
  { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
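Note: the new gpt_sampler_get_seed accessor wraps llama_sampler_get_seed on the sampler chain, which lets callers read back the seed actually in use, for example when the request passed LLAMA_DEFAULT_SEED and the effective (typically randomly chosen) seed needs to be reported. A minimal usage sketch, assuming `smpl` was obtained from gpt_sampler_init as in the sources above:

    // Sketch: report the effective sampling seed after initialization.
    #include <cstdint>
    #include <cstdio>

    static void report_seed(const struct gpt_sampler * smpl) {
        const uint32_t seed = gpt_sampler_get_seed(smpl); // wraps llama_sampler_get_seed(chain)
        printf("sampling seed in use: %u\n", seed);
    }
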
package/cpp/sampling.h CHANGED
@@ -2,65 +2,11 @@

  #include "llama.h"

+ #include "common.h"
+
  #include <string>
  #include <vector>

- enum gpt_sampler_type {
- GPT_SAMPLER_TYPE_NONE = 0,
- GPT_SAMPLER_TYPE_TOP_K = 1,
- GPT_SAMPLER_TYPE_TOP_P = 2,
- GPT_SAMPLER_TYPE_MIN_P = 3,
- GPT_SAMPLER_TYPE_TFS_Z = 4,
- GPT_SAMPLER_TYPE_TYPICAL_P = 5,
- GPT_SAMPLER_TYPE_TEMPERATURE = 6,
- GPT_SAMPLER_TYPE_XTC = 7,
- };
-
- // sampling parameters
- struct gpt_sampler_params {
- uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
-
- int32_t n_prev = 64; // number of previous tokens to remember
- int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
- int32_t top_k = 40; // <= 0 to use vocab size
- float top_p = 0.95f; // 1.0 = disabled
- float min_p = 0.05f; // 0.0 = disabled
- float tfs_z = 1.00f; // 1.0 = disabled
- float xtc_t = 0.0f; // 0.0 = disabled
- float xtc_p = 0.0f;
- float typ_p = 1.00f; // typical_p, 1.0 = disabled
- float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
- float dynatemp_range = 0.00f; // 0.0 = disabled
- float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
- int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
- float penalty_repeat = 1.00f; // 1.0 = disabled
- float penalty_freq = 0.00f; // 0.0 = disabled
- float penalty_present = 0.00f; // 0.0 = disabled
- int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
- float mirostat_tau = 5.00f; // target entropy
- float mirostat_eta = 0.10f; // learning rate
- bool penalize_nl = false; // consider newlines as a repeatable token
- bool ignore_eos = false;
-
- std::vector<enum gpt_sampler_type> samplers = {
- GPT_SAMPLER_TYPE_TOP_K,
- GPT_SAMPLER_TYPE_TFS_Z,
- GPT_SAMPLER_TYPE_TYPICAL_P,
- GPT_SAMPLER_TYPE_TOP_P,
- GPT_SAMPLER_TYPE_MIN_P,
- GPT_SAMPLER_TYPE_XTC,
- GPT_SAMPLER_TYPE_TEMPERATURE
- };
-
- std::string grammar; // optional BNF-like grammar to constrain sampling
-
- std::vector<llama_logit_bias> logit_bias; // logit biases to apply
-
- // print the parameters into a string
- std::string print() const;
- };
-
  // gpt_sampler extends llama_sampler with additional functionality:
  //
  // - grammar support
@@ -114,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
  //
  llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);

+ uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
+
  // helpers

  // access the internal list of current candidate tokens
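Note: the gpt_sampler_type enum and the gpt_sampler_params struct are removed from sampling.h, and the header now includes common.h, which presumably carries those declarations instead. Callers still build the parameter struct the same way, and gpt_sampler_init now also honours its no_perf field (see the sampling.cpp hunk above). A hedged usage sketch; `model` is a placeholder for a loaded llama_model pointer, and the field names are taken from the struct shown removed above plus the no_perf field used by gpt_sampler_init:

    // Sketch: configure sampler params and build a sampler chain.
    gpt_sampler_params sparams;
    sparams.seed    = LLAMA_DEFAULT_SEED; // let the sampler pick a seed
    sparams.top_k   = 40;
    sparams.top_p   = 0.95f;
    sparams.temp    = 0.80f;
    sparams.no_perf = false;              // keep sampler perf timings enabled

    struct gpt_sampler * smpl = gpt_sampler_init(model, sparams);
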
package/cpp/sgemm.cpp CHANGED
@@ -235,6 +235,14 @@ template <> inline __m512 load(const lm_ggml_fp16_t *p) {
  }
  #endif // __AVX512F__

+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+ // CONSTANTS
+
+ #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+ static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+ static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
+ #endif
+
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // FLOATING POINT MATRIX MULTIPLICATION

@@ -933,6 +941,20 @@ class tinyBLAS_Q0_AVX {
  return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
  }

+ inline __m256i load(const block_iq4_nl *b) {
+ return MM256_SET_M128I(load1(b), load0(b));
+ }
+
+ inline __m128i load0(const block_iq4_nl *b) {
+ const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+ return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), x));
+ }
+
+ inline __m128i load1(const block_iq4_nl *b) {
+ const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+ return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)));
+ }
+
  inline __m256 updot(__m256i u, __m256i s) {
  __m256i res;
  #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
@@ -1159,6 +1181,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
  #endif
  }

+ case LM_GGML_TYPE_IQ4_NL: {
+ if (Btype != LM_GGML_TYPE_Q8_0)
+ return false;
+ #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+ tinyBLAS_Q0_AVX<block_iq4_nl, block_q8_0, float> tb{
+ k, (const block_iq4_nl *)A, lda,
+ (const block_q8_0 *)B, ldb,
+ (float *)C, ldc,
+ ith, nth};
+ tb.matmul(m, n);
+ return true;
+ #else
+ return false;
+ #endif
+ }
+
  default:
  return false;
  }
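Note: these hunks add an IQ4_NL x Q8_0 path to llamafile_sgemm on AVX-class hardware. kvalues_iq4nl is the 16-entry non-linear codebook for the IQ4_NL 4-bit format, and the new load0/load1 helpers use _mm_shuffle_epi8 as a 16-way table lookup to expand the low and high nibbles of each block's 16 packed bytes into signed 8-bit codebook values. A scalar sketch of the same nibble-to-codebook expansion, for reference; the block layout is simplified here (the real block_iq4_nl also carries a per-block scale that is applied elsewhere):

    // Scalar sketch of the expansion performed by the AVX load0/load1 helpers.
    // `qs` packs 32 4-bit indices into 16 bytes; `out` receives 32 unscaled values.
    #include <cstdint>

    static const int8_t kvalues_iq4nl_ref[16] = {
        -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

    static void iq4nl_expand_ref(const uint8_t qs[16], int8_t out[32]) {
        for (int i = 0; i < 16; ++i) {
            out[i]      = kvalues_iq4nl_ref[qs[i] & 0x0F]; // low nibbles  (load0)
            out[i + 16] = kvalues_iq4nl_ref[qs[i] >> 4];   // high nibbles (load1)
        }
    }
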
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "cui-llama.rn",
- "version": "1.1.4",
+ "version": "1.1.6",
  "description": "Fork of llama.rn for ChatterUI",
  "main": "lib/commonjs/index",
  "module": "lib/module/index",