cui-llama.rn 1.1.5 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/rn-llama.hpp CHANGED
@@ -5,11 +5,7 @@
 #include <iostream>
 #include "common.h"
 #include "llama.h"
-
-#include <android/log.h>
 #include "sampling.h"
-#define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
-#define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
 
 namespace rnllama {
 
@@ -28,6 +24,7 @@ static void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, s
     batch->n_tokens += 1;
 }
 
+
 // NOTE: Edit from https://github.com/ggerganov/llama.cpp/blob/master/examples/server/server.cpp
 
 static void log(const char *level, const char *function, int line,
@@ -309,9 +306,9 @@ struct llama_rn_context
         }
         // compare the evaluated prompt with the new prompt
         n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
-        LLAMA_LOG_INFO("%s: n_past: %zu", __func__, n_past);
-        LLAMA_LOG_INFO("%s: embd size: %zu", __func__, embd.size());
-        LLAMA_LOG_INFO("%s: prompt_tokens size: %zu", __func__, prompt_tokens.size());
+        LOG_INFO("%s: n_past: %zu", __func__, n_past);
+        LOG_INFO("%s: embd size: %zu", __func__, embd.size());
+        LOG_INFO("%s: prompt_tokens size: %zu", __func__, prompt_tokens.size());
         embd = prompt_tokens;
         if (n_past == num_prompt_tokens)
         {
@@ -392,7 +389,7 @@ struct llama_rn_context
            n_past += n_eval;
 
            if(is_interrupted) {
-               LOG("Decoding Interrupted");
+               LOG_INFO("Decoding Interrupted");
                embd.resize(n_past);
                has_next_token = false;
                return result;
@@ -798,7 +795,7 @@ void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context
 
    if(!purge_needed || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < short_fall_threshold)
    {
-        LLAMA_LOG_INFO("Fall Threshold: %d out of %d\n", new_tokens_len - trimstart, short_fall_threshold);
+        LOG_INFO("Fall Threshold: %d out of %d\n", new_tokens_len - trimstart, short_fall_threshold);
        return; //no purge is needed
    }
 
@@ -826,7 +823,7 @@ void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context
        current_context_tokens[i - diff] = current_context_tokens[i];
    }
 
-    LLAMA_LOG_INFO("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);
+    LOG_INFO("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);
 
    current_context_tokens.resize(current_context_tokens.size() - diff);
 }
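
Note: the hunks above drop the Android-only logging path (<android/log.h> plus the LLAMA_ANDROID_TAG / LLAMA_LOG_INFO macros) and switch every call site to the shared LOG_INFO macro. As a hedged sketch only, assuming LOG_INFO forwards to the log(level, function, line, ...) helper visible in the hunk context (the real definition in rn-llama.hpp may differ), a platform-neutral replacement could look like:

// Hypothetical sketch, not code from the package: a platform-neutral LOG_INFO
// that routes through the log() helper instead of __android_log_print().
#define LOG_INFO(...) log("INFO", __func__, __LINE__, __VA_ARGS__)

Call sites such as LOG_INFO("%s: n_past: %zu", __func__, n_past) then compile without <android/log.h>, while log() itself stays free to forward to a platform logger internally.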
package/cpp/sampling.cpp CHANGED
@@ -328,7 +328,7 @@ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
 }
 
 std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
-    std::string result = "\tlogits ";
+    std::string result = "logits ";
 
     for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
         const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
@@ -359,10 +359,6 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
     return result;
 }
 
-struct llama_sampler_timings gpt_sampler_get_timigs(const struct gpt_sampler * gsmpl){
-    return llama_sampler_chain_timings(gsmpl -> chain);
-}
-
 char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
     switch (cnstr) {
         case GPT_SAMPLER_TYPE_TOP_K: return 'k';
package/cpp/sgemm.cpp CHANGED
@@ -235,6 +235,14 @@ template <> inline __m512 load(const lm_ggml_fp16_t *p) {
 }
 #endif // __AVX512F__
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// CONSTANTS
+
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
+#endif
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // FLOATING POINT MATRIX MULTIPLICATION
 
@@ -933,6 +941,20 @@ class tinyBLAS_Q0_AVX {
         return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
     }
 
+    inline __m256i load(const block_iq4_nl *b) {
+        return MM256_SET_M128I(load1(b), load0(b));
+    }
+
+    inline __m128i load0(const block_iq4_nl *b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), x));
+    }
+
+    inline __m128i load1(const block_iq4_nl *b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)));
+    }
+
     inline __m256 updot(__m256i u, __m256i s) {
         __m256i res;
 #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
@@ -1159,6 +1181,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 #endif
     }
 
+    case LM_GGML_TYPE_IQ4_NL: {
+        if (Btype != LM_GGML_TYPE_Q8_0)
+            return false;
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+        tinyBLAS_Q0_AVX<block_iq4_nl, block_q8_0, float> tb{
+            k, (const block_iq4_nl *)A, lda,
+            (const block_q8_0 *)B, ldb,
+            (float *)C, ldc,
+            ith, nth};
+        tb.matmul(m, n);
+        return true;
+#else
+        return false;
+#endif
+    }
+
     default:
         return false;
     }
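
Note: taken together, the sgemm.cpp hunks add IQ4_NL support to the tinyBLAS AVX path. Each byte of a block's qs array packs two 4-bit indices, and _mm_shuffle_epi8 uses iq4nlt as a 16-entry lookup table to map those indices onto the non-linear kvalues_iq4nl levels (low nibbles in load0(), high nibbles in load1()); the new LM_GGML_TYPE_IQ4_NL case then only needs to instantiate the existing tinyBLAS_Q0_AVX kernel against Q8_0 activations. Below is a scalar sketch of the same unpacking, given purely as an illustration; the helper name and the 16-byte/32-value block shape are assumptions based on the usual ggml IQ4_NL layout, not code from this package.

#include <cstdint>

// Illustrative only: scalar equivalent of the nibble-to-value lookup that the
// new load0()/load1() helpers perform with _mm_shuffle_epi8.
static const int8_t kvalues_iq4nl_ref[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113
};

// qs: 16 packed bytes of one IQ4_NL block; out: 32 dequantized (unscaled) values.
static void iq4nl_unpack_ref(const uint8_t qs[16], int8_t out[32]) {
    for (int i = 0; i < 16; ++i) {
        out[i]      = kvalues_iq4nl_ref[qs[i] & 0x0F]; // low nibbles  -> load0()
        out[i + 16] = kvalues_iq4nl_ref[qs[i] >> 4];   // high nibbles -> load1()
    }
}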
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "cui-llama.rn",
-  "version": "1.1.5",
+  "version": "1.1.7",
   "description": "Fork of llama.rn for ChatterUI",
   "main": "lib/commonjs/index",
   "module": "lib/module/index",