cui-llama.rn 1.1.2 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,11 +2,115 @@
2
2
 
3
3
  #include "llama-impl.h"
4
4
 
5
+ #include <map>
6
+
5
7
  struct llama_vocab;
6
- struct llama_sampling;
8
+
9
+ // grammar element type
10
+ enum llama_gretype {
11
+ // end of rule definition
12
+ LLAMA_GRETYPE_END = 0,
13
+
14
+ // start of alternate definition for rule
15
+ LLAMA_GRETYPE_ALT = 1,
16
+
17
+ // non-terminal element: reference to rule
18
+ LLAMA_GRETYPE_RULE_REF = 2,
19
+
20
+ // terminal element: character (code point)
21
+ LLAMA_GRETYPE_CHAR = 3,
22
+
23
+ // inverse char(s) ([^a], [^a-b], [^abc])
24
+ LLAMA_GRETYPE_CHAR_NOT = 4,
25
+
26
+ // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
27
+ // be an inclusive range ([a-z])
28
+ LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
29
+
30
+ // modifies a preceding LLAMA_GRETYPE_CHAR or
31
+ // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
32
+ LLAMA_GRETYPE_CHAR_ALT = 6,
33
+
34
+ // any character (.)
35
+ LLAMA_GRETYPE_CHAR_ANY = 7,
36
+ };
37
+
38
+ typedef struct llama_grammar_element {
39
+ enum llama_gretype type;
40
+ uint32_t value; // Unicode code point or rule ID
41
+ } llama_grammar_element;
42
+
43
+ struct llama_partial_utf8 {
44
+ uint32_t value; // bit value so far (unshifted)
45
+ int n_remain; // num bytes remaining; -1 indicates invalid sequence
46
+ };
47
+
48
+ struct llama_grammar_candidate {
49
+ size_t index;
50
+ const uint32_t * code_points;
51
+ llama_partial_utf8 partial_utf8;
52
+ };
53
+
54
+ using llama_grammar_rule = std::vector< llama_grammar_element>;
55
+ using llama_grammar_stack = std::vector<const llama_grammar_element *>;
56
+
57
+ using llama_grammar_rules = std::vector<llama_grammar_rule>;
58
+ using llama_grammar_stacks = std::vector<llama_grammar_stack>;
59
+ using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
60
+
61
+ const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar * grammar);
62
+ llama_grammar_stacks & llama_grammar_get_stacks( struct llama_grammar * grammar);
63
+
64
+ // takes a set of possible pushdown stacks on a grammar, which are required to
65
+ // be positioned at a character range (see `llama_grammar_advance_stack`), and
66
+ // produces the N possible stacks if the given char is accepted at those
67
+ // positions
68
+ void llama_grammar_accept(
69
+ const llama_grammar_rules & rules,
70
+ const llama_grammar_stacks & stacks,
71
+ uint32_t chr,
72
+ llama_grammar_stacks & stacks_new);
73
+
74
+ std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
75
+ const llama_grammar_rules & rules,
76
+ const llama_grammar_stack & stack,
77
+ const llama_grammar_candidates & candidates);
78
+
79
+ struct llama_grammar_parser {
80
+ std::map<std::string, uint32_t> symbol_ids;
81
+
82
+ llama_grammar_rules rules;
83
+
84
+ llama_grammar_stack c_rules() const;
85
+
86
+ uint32_t get_symbol_id(const char * src, size_t len);
87
+ uint32_t generate_symbol_id(const std::string & base_name);
88
+
89
+ void add_rule(uint32_t rule_id, const llama_grammar_rule & rule);
90
+
91
+ const char * parse_alternates(
92
+ const char * src,
93
+ const std::string & rule_name,
94
+ uint32_t rule_id,
95
+ bool is_nested);
96
+
97
+ const char * parse_sequence(
98
+ const char * src,
99
+ const std::string & rule_name,
100
+ llama_grammar_rule & rule,
101
+ bool is_nested);
102
+
103
+ const char * parse_rule(const char * src);
104
+
105
+ bool parse(const char * src);
106
+ void print(FILE * file);
107
+ };
7
108
 
8
109
  struct llama_grammar {
9
- const llama_grammar_rules rules;
110
+ // note: allow null vocab for testing (not great)
111
+ const llama_vocab * vocab;
112
+
113
+ const llama_grammar_rules rules; // TODO: shared ptr
10
114
  llama_grammar_stacks stacks;
11
115
 
12
116
  // buffer for partially generated UTF-8 sequence from accepted tokens
@@ -17,23 +121,24 @@ struct llama_grammar {
17
121
  // internal API
18
122
  //
19
123
 
124
+ // note: needed for tests (not great)
20
125
  struct llama_grammar * llama_grammar_init_impl(
21
- const llama_grammar_element ** rules,
22
- size_t n_rules,
23
- size_t start_rule_index);
126
+ const struct llama_vocab * vocab,
127
+ const llama_grammar_element ** rules,
128
+ size_t n_rules,
129
+ size_t start_rule_index);
130
+
131
+ struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root);
24
132
 
25
133
  void llama_grammar_free_impl(struct llama_grammar * grammar);
26
134
 
27
- struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * grammar);
135
+ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar);
28
136
 
29
- void llama_grammar_sample_impl(
30
- const struct llama_grammar * grammar,
31
- const struct llama_vocab * vocab,
32
- const struct llama_sampling * smpl,
33
- llama_token_data_array * candidates);
137
+ // TODO: move the API below as member functions of llama_grammar
138
+ void llama_grammar_apply_impl(
139
+ const struct llama_grammar & grammar,
140
+ llama_token_data_array * cur_p);
34
141
 
35
- void llama_grammar_accept_token_impl(
36
- struct llama_grammar * grammar,
37
- const struct llama_vocab * vocab,
38
- const struct llama_sampling * smpl,
142
+ void llama_grammar_accept_impl(
143
+ struct llama_grammar & grammar,
39
144
  llama_token token);
package/cpp/llama-impl.h CHANGED
@@ -1,8 +1,11 @@
1
1
  #pragma once
2
2
 
3
- #define LLAMA_API_INTERNAL
4
3
  #include "llama.h"
5
4
 
5
+ #include <string>
6
+ #include <vector>
7
+ #include <stdexcept>
8
+
6
9
  #ifdef __GNUC__
7
10
  #ifdef __MINGW32__
8
11
  #define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
@@ -29,6 +32,20 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void
29
32
  // helpers
30
33
  //
31
34
 
35
+ struct time_meas {
36
+ time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : lm_ggml_time_us()), t_acc(t_acc) {}
37
+
38
+ ~time_meas() {
39
+ if (t_start_us >= 0) {
40
+ t_acc += lm_ggml_time_us() - t_start_us;
41
+ }
42
+ }
43
+
44
+ const int64_t t_start_us;
45
+
46
+ int64_t & t_acc;
47
+ };
48
+
32
49
  static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
33
50
  if (search.empty()) {
34
51
  return;
@@ -45,3 +62,117 @@ static void replace_all(std::string & s, const std::string & search, const std::
45
62
  builder.append(s, last_pos, std::string::npos);
46
63
  s = std::move(builder);
47
64
  }
65
+
66
+ const std::vector<std::pair<std::string, struct lm_ggml_tensor *>> & llama_internal_get_tensor_map(
67
+ struct llama_context * ctx
68
+ );
69
+
70
// the ring buffer works similarly to std::deque, but with a fixed capacity
//
// Elements are stored in a std::vector<T> of `capacity` slots that is allocated
// once in the constructor. `first` is the index of the oldest element, `pos` is
// the index of the slot the next push_back() will write to, and `sz` is the
// number of live elements. When the buffer is full, push_back() overwrites the
// oldest element.
//
// All accessors throw std::runtime_error on misuse (empty buffer, index out of
// bounds, zero capacity).
template<typename T>
struct ring_buffer {
    // Creates a buffer that can hold up to `cap` elements.
    ring_buffer(size_t cap) : capacity(cap), data(cap) {}

    // Returns the oldest element. Throws std::runtime_error if empty.
    T & front() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[first];
    }

    const T & front() const {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[first];
    }

    // Returns the most recently pushed element. Throws std::runtime_error if empty.
    T & back() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        // `pos` is the NEXT write slot, so the newest element sits one slot
        // before it; index it the same way rat(0) does
        return data[(first + sz - 1) % capacity];
    }

    const T & back() const {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        // see the non-const overload: the newest element is at first + sz - 1
        return data[(first + sz - 1) % capacity];
    }

    // Appends `value`. If the buffer is already full, the oldest element is
    // overwritten. Throws std::runtime_error if the capacity is zero.
    void push_back(const T & value) {
        if (capacity == 0) {
            throw std::runtime_error("ring buffer: capacity is zero");
        }

        if (sz == capacity) {
            // advance the start when buffer is full
            first = (first + 1) % capacity;
        } else {
            sz++;
        }
        data[pos] = value;
        pos = (pos + 1) % capacity;
    }

    // Removes the oldest element and returns a copy of it.
    // Throws std::runtime_error if empty.
    T pop_front() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        T value = data[first];
        first = (first + 1) % capacity;
        sz--;
        return value;
    }

    //T & operator[](size_t i) {
    //    if (i >= sz) {
    //        throw std::runtime_error("ring buffer: index out of bounds");
    //    }
    //    return data[(first + i) % capacity];
    //}

    //const T & at(size_t i) const {
    //    if (i >= sz) {
    //        throw std::runtime_error("ring buffer: index out of bounds");
    //    }
    //    return data[(first + i) % capacity];
    //}

    // Reverse access: rat(0) is the newest element, rat(size() - 1) the oldest.
    // Throws std::runtime_error if `i` is not smaller than size().
    const T & rat(size_t i) const {
        if (i >= sz) {
            throw std::runtime_error("ring buffer: index out of bounds");
        }
        return data[(first + sz - i - 1) % capacity];
    }

    // Returns a copy of the live elements ordered from oldest to newest.
    std::vector<T> to_vector() const {
        std::vector<T> result;
        result.reserve(sz);
        for (size_t i = 0; i < sz; i++) {
            result.push_back(data[(first + i) % capacity]);
        }
        return result;
    }

    // Empties the buffer.
    void clear() {
        // here only reset the status of the buffer; the stored slots are left
        // as they are and get overwritten by later push_back() calls
        sz = 0;
        first = 0;
        pos = 0;
    }

    bool empty() const {
        return sz == 0;
    }

    size_t size() const {
        return sz;
    }

    size_t capacity = 0; // maximum number of elements
    size_t sz = 0;       // current number of elements
    size_t first = 0;    // index of the oldest element
    size_t pos = 0;      // index of the next slot to write
    std::vector<T> data; // backing storage, `capacity` slots
};