cui-llama.rn 1.1.2 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/sampling.h CHANGED
@@ -2,163 +2,82 @@
 
  #include "llama.h"
 
- #include "grammar-parser.h"
+ #include "common.h"
 
- #include <random>
  #include <string>
- #include <unordered_map>
  #include <vector>
 
- // sampler types
- enum class llama_sampler_type : char {
-     TOP_K       = 'k',
-     TOP_P       = 'p',
-     MIN_P       = 'm',
-     XTC         = 'x',
-     TFS_Z       = 'f',
-     TYPICAL_P   = 'y',
-     TEMPERATURE = 't'
- };
-
- // sampling parameters
- typedef struct llama_sampling_params {
-     int32_t  n_prev            = 64;    // number of previous tokens to remember
-     int32_t  n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
-     int32_t  min_keep          = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
-     int32_t  top_k             = 40;    // <= 0 to use vocab size
-     float    top_p             = 0.95f; // 1.0 = disabled
-     float    min_p             = 0.05f; // 0.0 = disabled
-     float    xtc_t             = 0.0f;  // 0.0 = disabled
-     float    xtc_p             = 0.0f;  // controls the probability of XTC removal
-     float    tfs_z             = 1.00f; // 1.0 = disabled
-     float    typical_p         = 1.00f; // 1.0 = disabled
-     float    temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
-     float    dynatemp_range    = 0.00f; // 0.0 = disabled
-     float    dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
-     int32_t  penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-     float    penalty_repeat    = 1.00f; // 1.0 = disabled
-     float    penalty_freq      = 0.00f; // 0.0 = disabled
-     float    penalty_present   = 0.00f; // 0.0 = disabled
-     int32_t  mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-     float    mirostat_tau      = 5.00f; // target entropy
-     float    mirostat_eta      = 0.10f; // learning rate
-     bool     penalize_nl       = false; // consider newlines as a repeatable token
-     uint32_t seed              = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
-
-     std::vector<llama_sampler_type> samplers_sequence = {
-         llama_sampler_type::TOP_K,
-         llama_sampler_type::TFS_Z,
-         llama_sampler_type::TYPICAL_P,
-         llama_sampler_type::TOP_P,
-         llama_sampler_type::MIN_P,
-         llama_sampler_type::XTC,
-         llama_sampler_type::TEMPERATURE
-     };
-
-     std::string grammar; // optional BNF-like grammar to constrain sampling
-
-     // Classifier-Free Guidance
-     // https://arxiv.org/abs/2306.17806
-     std::string cfg_negative_prompt; // string to help guidance
-     float       cfg_scale = 1.f;     // how strong is guidance
-
-     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
-
-     std::vector<llama_token> penalty_prompt_tokens;
-     bool                     use_penalty_prompt_tokens = false;
- } llama_sampling_params;
-
- // general sampler context
- // TODO: move to llama.h
- struct llama_sampling_context {
-     // parameters that will be used for sampling
-     llama_sampling_params params;
-
-     // mirostat sampler state
-     float mirostat_mu;
-
-     llama_grammar * grammar;
-
-     // internal
-     grammar_parser::parse_state parsed_grammar;
-
-     // TODO: replace with ring-buffer
-     std::vector<llama_token>      prev;
-     std::vector<llama_token_data> cur;
-     size_t n_valid; // Number of correct top tokens with correct probabilities.
-
-     std::mt19937 rng;
- };
+ // gpt_sampler extends llama_sampler with additional functionality:
+ //
+ // - grammar support
+ // - custom sampler logic based on the parameters
+ // - history of the last accepted tokens
+ // - performance metrics
+ //
+ // This goal is to have a common implementation of the sampling logic shared across the examples.
+ // For example, depending on the temperature, the sampling chain can be very simple (greedy) or more
+ // complex (top-k, top-p, etc).
+ //
+ // Another example is related to the grammar. In general, the grammar constraints applied on the full
+ // vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled
+ // token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
+ // grammar constraints are applied to the full vocabulary and the token is resampled.
+ //
+ // The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
+ // be moved into the core llama library.
+ //
+ // For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
+ // This can be used to access the probabilities of the rest of the non-sampled tokens.
+ //
+ // TODO: measure grammar performance
+ //
 
- #include "common.h"
+ struct gpt_sampler;
 
- // Create a new sampling context instance.
- struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
+ // llama_sampler API overloads
 
- void llama_sampling_free(struct llama_sampling_context * ctx);
+ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);
 
- // Reset the sampler context
- // - clear prev tokens
- // - reset grammar
- void llama_sampling_reset(llama_sampling_context * ctx);
+ void gpt_sampler_free(struct gpt_sampler * gsmpl);
 
- // Set the sampler seed
- void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
+ // if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
+ void                 gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar);
+ void                 gpt_sampler_reset (struct gpt_sampler * gsmpl);
+ struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl);
 
- // Copy the sampler context
- void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
+ // arguments can be nullptr to skip printing
+ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
 
- // Get the last sampled token
- llama_token llama_sampling_last(llama_sampling_context * ctx);
+ // extended sampling implementation:
+ //
+ // - set logits
+ // - apply the configured sampler chain
+ // - check if the token fits the grammar (if any)
+ // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
+ //
+ // if grammar_first is true, the grammar is applied before the samplers (slower)
+ // useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
+ //
+ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
 
- // Get a string representation of the last sampled tokens
- std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
+ uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
 
- // Print sampling parameters into a string
- std::string llama_sampling_print(const llama_sampling_params & params);
+ // helpers
 
- // Print sampling order into a string
- std::string llama_sampling_order_print(const llama_sampling_params & params);
+ // access the internal list of current candidate tokens
+ llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);
 
- std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
+ // get the last accepted token
+ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);
 
- std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
- std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
+ // print the sampler chain into a string
+ std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);
 
- // this is a common sampling function used across the examples for convenience
- // it can serve as a starting point for implementing your own sampling function
- // Note: When using multiple sequences, it is the caller's responsibility to call
- //       llama_sampling_reset when a sequence ends
- //
- // required:
- //  - ctx_main:     context to use for sampling
- //  - ctx_sampling: sampling-specific context
- //
- // optional:
- //  - ctx_cfg: context to use for classifier-free guidance
- //  - idx:     sample from llama_get_logits_ith(ctx, idx)
- //
- // returns:
- //  - token:      sampled token
- //  - candidates: vector of candidate tokens
- //
- llama_token llama_sampling_sample(
-         struct llama_sampling_context * ctx_sampling,
-         struct llama_context * ctx_main,
-         struct llama_context * ctx_cfg,
-         int idx = -1);
-
- // Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
- llama_token_data_array llama_sampling_prepare(
-         struct llama_sampling_context * ctx_sampling,
-         struct llama_context * ctx_main,
-         struct llama_context * ctx_cfg,
-         int idx = 0,
-         bool apply_grammar = true,
-         std::vector<float> * original_logits = nullptr);
-
- void llama_sampling_accept(
-         struct llama_sampling_context * ctx_sampling,
-         struct llama_context * ctx_main,
-         llama_token id,
-         bool apply_grammar);
+ // get a string representation of the last accepted tokens
+ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);
+
+ char        gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
+ std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);
+
+ std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+ std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);
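For orientation, the comments above translate into a small calling pattern. The sketch below is not part of the package; it assumes a llama_model and llama_context created elsewhere, a gpt_sampler_params struct declared outside this header (its fields are not shown in this diff), and it elides detokenization and the llama_decode() feedback step.

    #include "llama.h"
    #include "sampling.h"

    // Minimal sketch: sample up to n_predict tokens once the prompt has been decoded.
    static void sample_loop(llama_model * model, llama_context * ctx, int n_predict) {
        gpt_sampler_params sparams;                              // defaults assumed; declared outside this header
        gpt_sampler * gsmpl = gpt_sampler_init(model, sparams);

        for (int i = 0; i < n_predict; ++i) {
            // sample from the logits at idx (-1 = last decoded position); the grammar, if any,
            // is checked after the sampler chain because grammar_first defaults to false
            const llama_token id = gpt_sampler_sample(gsmpl, ctx, /*idx =*/ -1);

            // record the token in the sampler history and advance the grammar state
            gpt_sampler_accept(gsmpl, id, /*accept_grammar =*/ true);

            // ... detokenize `id`, stop on an end-of-generation token, feed it back via llama_decode() ...
        }

        gpt_perf_print(ctx, gsmpl);   // optional performance printout
        gpt_sampler_free(gsmpl);
    }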
package/cpp/sgemm.cpp CHANGED
@@ -606,17 +606,29 @@ class tinyBLAS_Q0_AVX {
          case 0x44:
              mc = 4;
              nc = 4;
+ #if defined(__AVX2__) && defined(__F16C__)
+             gemm4xN<4>(m0, m, n0, n);
+ #else
              gemm<4, 4>(m0, m, n0, n);
+ #endif
              break;
          case 0x43:
              mc = 4;
              nc = 3;
+ #if defined(__AVX2__) && defined(__F16C__)
+             gemm4xN<3>(m0, m, n0, n);
+ #else
              gemm<4, 3>(m0, m, n0, n);
+ #endif
              break;
          case 0x34:
              mc = 3;
              nc = 4;
+ #if defined(__AVX2__) && defined(__F16C__)
+             gemmMx4<3>(m0, m, n0, n);
+ #else
              gemm<3, 4>(m0, m, n0, n);
+ #endif
              break;
          case 0x33:
              mc = 3;
@@ -626,12 +638,20 @@ class tinyBLAS_Q0_AVX {
          case 0x42:
              mc = 4;
              nc = 2;
+ #if defined(__AVX2__) && defined(__F16C__)
+             gemm4xN<2>(m0, m, n0, n);
+ #else
              gemm<4, 2>(m0, m, n0, n);
+ #endif
              break;
          case 0x24:
              mc = 2;
              nc = 4;
+ #if defined(__AVX2__) && defined(__F16C__)
+             gemmMx4<2>(m0, m, n0, n);
+ #else
              gemm<2, 4>(m0, m, n0, n);
+ #endif
              break;
  #else
          case 0x44:
@@ -639,13 +659,21 @@ class tinyBLAS_Q0_AVX {
          case 0x42:
              mc = 4;
              nc = 2;
+ #if defined(__AVX2__) && defined(__F16C__)
+             gemm4xN<2>(m0, m, n0, n);
+ #else
              gemm<4, 2>(m0, m, n0, n);
+ #endif
              break;
          case 0x34:
          case 0x24:
              mc = 2;
              nc = 4;
+ #if defined(__AVX2__) && defined(__F16C__)
+             gemmMx4<2>(m0, m, n0, n);
+ #else
              gemm<2, 4>(m0, m, n0, n);
+ #endif
              break;
          case 0x33:
  #endif
@@ -662,7 +690,11 @@ class tinyBLAS_Q0_AVX {
          case 0x41:
              mc = 4;
              nc = 1;
+ #if defined(__AVX2__) && defined(__F16C__)
+             gemm4xN<1>(m0, m, n0, n);
+ #else
              gemm<4, 1>(m0, m, n0, n);
+ #endif
              break;
          case 0x22:
              mc = 2;
@@ -672,7 +704,11 @@ class tinyBLAS_Q0_AVX {
          case 0x14:
              mc = 1;
              nc = 4;
+ #if defined(__AVX2__) && defined(__F16C__)
+             gemmMx4<1>(m0, m, n0, n);
+ #else
              gemm<1, 4>(m0, m, n0, n);
+ #endif
              break;
          case 0x31:
              mc = 3;
@@ -708,6 +744,119 @@ class tinyBLAS_Q0_AVX {
          mnpack(m0, m, np, n);
      }
 
+ #if defined(__AVX2__) && defined(__F16C__)
+     // Templated functions for gemm of dimensions 4xN
+     template <int RN>
+     NOINLINE void gemm4xN(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+         int64_t ytiles = (m - m0) / 4;
+         int64_t xtiles = (n - n0) / RN;
+         int64_t tiles = xtiles * ytiles;
+         int64_t duty = (tiles + nth - 1) / nth;
+         int64_t start = duty * ith;
+         int64_t end = start + duty;
+         if (end > tiles)
+             end = tiles;
+         for (int64_t job = start; job < end; ++job) {
+             int64_t ii = m0 + job / xtiles * 4;
+             int64_t jj = n0 + job % xtiles * RN;
+             __m256 Cv[RN][4] = {};
+             for (int64_t l = 0; l < k; ++l) {
+                 uint64_t a_delta = ((uint64_t)A[lda * (ii + 3) + l].d << 48) | ((uint64_t)A[lda * (ii + 2) + l].d << 32) | ((uint64_t)A[lda * (ii + 1) + l].d << 16) | (A[lda * (ii + 0) + l].d);
+                 // Convert delta values for four blocks to float values
+                 __m128 da = _mm_cvtph_ps(_mm_set_epi64x(0, a_delta));
+                 __m256i avec0 = load(A + lda * (ii + 0) + l);
+                 __m256i avec1 = load(A + lda * (ii + 1) + l);
+                 __m256i avec2 = load(A + lda * (ii + 2) + l);
+                 __m256i avec3 = load(A + lda * (ii + 3) + l);
+                 for (int64_t j = 0; j < RN; ++j) {
+                     __m128 db = _mm_set1_ps(unhalf(B[ldb * (jj + j) + l].d));
+                     // Computation of product of delta values for four blocks and replicate it across 256 bit lane
+                     __m256 dvec = _mm256_castps128_ps256(_mm_mul_ps(da, db));
+                     dvec = _mm256_permute2f128_ps(dvec ,dvec, 0);
+                     // Computation of dot product and multiplication with appropriate delta value products
+                     Cv[j][0] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
+                                     updot(_mm256_sign_epi8(avec0, avec0),
+                                           _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec0)),
+                                     Cv[j][0]);
+                     Cv[j][1] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
+                                     updot(_mm256_sign_epi8(avec1, avec1),
+                                           _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec1)),
+                                     Cv[j][1]);
+                     Cv[j][2] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
+                                     updot(_mm256_sign_epi8(avec2, avec2),
+                                           _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec2)),
+                                     Cv[j][2]);
+                     Cv[j][3] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
+                                     updot(_mm256_sign_epi8(avec3, avec3),
+                                           _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec3)),
+                                     Cv[j][3]);
+                 }
+             }
+
+             for (int64_t j = 0; j < RN; ++j)
+                 for (int64_t i = 0; i < 4; ++i)
+                     C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
+         }
+     }
+
+     // Templated functions for gemm of dimensions Mx4
+     template <int RM>
+     NOINLINE void gemmMx4(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+         int64_t ytiles = (m - m0) / RM;
+         int64_t xtiles = (n - n0) / 4;
+         int64_t tiles = xtiles * ytiles;
+         int64_t duty = (tiles + nth - 1) / nth;
+         int64_t start = duty * ith;
+         int64_t end = start + duty;
+         if (end > tiles)
+             end = tiles;
+         for (int64_t job = start; job < end; ++job) {
+             int64_t ii = m0 + job / xtiles * RM;
+             int64_t jj = n0 + job % xtiles * 4;
+             __m256 Cv[4][RM] = {};
+             for (int64_t l = 0; l < k; ++l) {
+                 uint64_t b_delta = ((uint64_t)B[ldb * (jj + 3) + l].d << 48) | ((uint64_t)B[ldb * (jj + 2) + l].d << 32) | ((uint64_t)B[ldb * (jj + 1) + l].d << 16) | (B[ldb * (jj + 0) + l].d);
+                 // Convert delta values for four blocks to float values
+                 __m128 db = _mm_cvtph_ps(_mm_set_epi64x(0, b_delta));
+                 __m256i bvec0 = load(B + ldb * (jj + 0) + l);
+                 __m256i bvec1 = load(B + ldb * (jj + 1) + l);
+                 __m256i bvec2 = load(B + ldb * (jj + 2) + l);
+                 __m256i bvec3 = load(B + ldb * (jj + 3) + l);
+                 for (int64_t i = 0; i < RM; ++i) {
+                     __m128 da = _mm_set1_ps(unhalf((A[lda * (ii + i) + l].d)));
+                     // Computation of product of delta values for four blocks and replicate it across 256 bit lane
+                     __m256 dvec = _mm256_castps128_ps256(_mm_mul_ps(da, db));
+                     dvec = _mm256_permute2f128_ps(dvec ,dvec, 0);
+                     // Computation of dot product and multiplication with appropriate delta value products
+                     Cv[0][i] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
+                                     updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
+                                                            load(A + lda * (ii + i) + l)),
+                                           _mm256_sign_epi8(bvec0, load(A + lda * (ii + i) + l))),
+                                     Cv[0][i]);
+                     Cv[1][i] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
+                                     updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
+                                                            load(A + lda * (ii + i) + l)),
+                                           _mm256_sign_epi8(bvec1, load(A + lda * (ii + i) + l))),
+                                     Cv[1][i]);
+                     Cv[2][i] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
+                                     updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
+                                                            load(A + lda * (ii + i) + l)),
+                                           _mm256_sign_epi8(bvec2, load(A + lda * (ii + i) + l))),
+                                     Cv[2][i]);
+                     Cv[3][i] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
+                                     updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
+                                                            load(A + lda * (ii + i) + l)),
+                                           _mm256_sign_epi8(bvec3, load(A + lda * (ii + i) + l))),
+                                     Cv[3][i]);
+                 }
+             }
+             for (int64_t j = 0; j < 4; ++j)
+                 for (int64_t i = 0; i < RM; ++i)
+                     C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
+         }
+     }
+ #endif
+
      template <int RM, int RN>
      NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
          int64_t ytiles = (m - m0) / RM;
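The key trick in the gemm4xN/gemmMx4 kernels above is packing four half-precision block scales (the .d fields) into one 64-bit value so that a single _mm_cvtph_ps converts them all at once, instead of four separate unhalf() calls. A standalone illustration follows; it is not from the package, the scale values are made up, and it assumes an x86 compiler with AVX/F16C enabled (e.g. -mavx2 -mf16c):

    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
        // four example fp16 scales in raw 16-bit form: 1.0, 2.0, 3.0, 4.0
        const uint16_t d0 = 0x3C00, d1 = 0x4000, d2 = 0x4200, d3 = 0x4400;

        // pack them into one 64-bit integer, lowest scale in the lowest 16 bits
        const uint64_t packed = ((uint64_t)d3 << 48) | ((uint64_t)d2 << 32) |
                                ((uint64_t)d1 << 16) |  (uint64_t)d0;

        // convert all four halves to float at once (F16C)
        const __m128 scales = _mm_cvtph_ps(_mm_set_epi64x(0, packed));

        float out[4];
        _mm_storeu_ps(out, scales);
        std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);   // prints: 1 2 3 4
        return 0;
    }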
@@ -857,6 +1006,10 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
      assert(nth > 0);
      assert(ith < nth);
 
+     // only enable sgemm for prompt processing
+     if (n < 2)
+         return false;
+
      if (Ctype != LM_GGML_TYPE_F32)
          return false;
 
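The new early return above keeps this path for prompt processing, where many columns are handled per call; single-token decoding (n == 1) falls back to the regular matvec path. One detail shared by the new kernels and the existing gemm template is how work is split across threads: the tile count is ceil-divided by the thread count and the last thread's range is clamped. A small standalone sketch of that arithmetic with made-up sizes (not from the package):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t ytiles = 32, xtiles = 4;              // example tile counts
        const int64_t tiles  = xtiles * ytiles;             // 128 tiles to distribute
        const int     nth    = 6;                           // example thread count
        const int64_t duty   = (tiles + nth - 1) / nth;     // ceil(128 / 6) = 22 tiles per thread
        for (int ith = 0; ith < nth; ++ith) {               // ith = this thread's index
            int64_t start = duty * ith;
            int64_t end   = start + duty;
            if (end > tiles) end = tiles;                   // last thread keeps only the remainder
            std::printf("thread %d: tiles [%lld, %lld)\n", ith, (long long)start, (long long)end);
        }
        return 0;
    }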
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "cui-llama.rn",
-   "version": "1.1.2",
+   "version": "1.1.5",
    "description": "Fork of llama.rn for ChatterUI",
    "main": "lib/commonjs/index",
    "module": "lib/module/index",