@fugood/llama.node 1.4.1 → 1.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -1
- package/lib/binding.js +3 -0
- package/lib/binding.ts +2 -0
- package/package.json +16 -16
- package/scripts/llama.cpp.patch +25 -11
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/CMakeLists.txt +21 -6
- package/src/llama.cpp/common/CMakeLists.txt +6 -0
- package/src/llama.cpp/common/arg.cpp +65 -16
- package/src/llama.cpp/common/chat-parser.cpp +40 -0
- package/src/llama.cpp/common/chat-peg-parser.cpp +110 -0
- package/src/llama.cpp/common/chat-peg-parser.h +105 -0
- package/src/llama.cpp/common/chat.cpp +40 -29
- package/src/llama.cpp/common/chat.h +10 -1
- package/src/llama.cpp/common/common.cpp +24 -5
- package/src/llama.cpp/common/common.h +16 -5
- package/src/llama.cpp/common/download.cpp +18 -8
- package/src/llama.cpp/common/download.h +3 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +15 -1
- package/src/llama.cpp/common/log.h +19 -12
- package/src/llama.cpp/common/peg-parser.cpp +1712 -0
- package/src/llama.cpp/common/peg-parser.h +459 -0
- package/src/llama.cpp/common/unicode.cpp +64 -0
- package/src/llama.cpp/common/unicode.h +22 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +48 -48
- package/src/llama.cpp/ggml/include/ggml.h +7 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +60 -1
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +30 -1
- package/src/llama.cpp/src/llama-arch.h +3 -0
- package/src/llama.cpp/src/llama-graph.cpp +3 -6
- package/src/llama.cpp/src/llama-hparams.h +2 -2
- package/src/llama.cpp/src/llama-impl.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model.cpp +50 -6
- package/src/llama.cpp/src/llama-vocab.cpp +1 -2
- package/src/llama.cpp/src/models/mistral3.cpp +160 -0
- package/src/llama.cpp/src/models/models.h +4 -0
|
@@ -0,0 +1,1712 @@
|
|
|
1
|
+
#include "common.h"
|
|
2
|
+
#include "peg-parser.h"
|
|
3
|
+
#include "json-schema-to-grammar.h"
|
|
4
|
+
#include "unicode.h"
|
|
5
|
+
|
|
6
|
+
#include <nlohmann/json.hpp>
|
|
7
|
+
|
|
8
|
+
#include <algorithm>
|
|
9
|
+
#include <initializer_list>
|
|
10
|
+
#include <map>
|
|
11
|
+
#include <memory>
|
|
12
|
+
#include <regex>
|
|
13
|
+
#include <stdexcept>
|
|
14
|
+
#include <unordered_set>
|
|
15
|
+
|
|
16
|
+
// Trick to catch missing branches
|
|
17
|
+
template <typename T>
|
|
18
|
+
inline constexpr bool is_always_false_v = false;
|
|
19
|
+
|
|
20
|
+
const char * common_peg_parse_result_type_name(common_peg_parse_result_type type) {
|
|
21
|
+
switch (type) {
|
|
22
|
+
case COMMON_PEG_PARSE_RESULT_FAIL: return "fail";
|
|
23
|
+
case COMMON_PEG_PARSE_RESULT_SUCCESS: return "success";
|
|
24
|
+
case COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT: return "need_more_input";
|
|
25
|
+
default: return "unknown";
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
static bool is_hex_digit(const char c) {
|
|
30
|
+
return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
// Trie for matching multiple literals.
|
|
34
|
+
// This is used in common_peg_until_parser and to build a GBNF exclusion grammar
|
|
35
|
+
struct trie {
|
|
36
|
+
struct node {
|
|
37
|
+
size_t depth = 0;
|
|
38
|
+
std::map<unsigned char, size_t> children;
|
|
39
|
+
bool is_word;
|
|
40
|
+
};
|
|
41
|
+
|
|
42
|
+
std::vector<node> nodes;
|
|
43
|
+
|
|
44
|
+
trie(const std::vector<std::string> & words) {
|
|
45
|
+
create_node(); // root node
|
|
46
|
+
for (const auto & w : words) {
|
|
47
|
+
insert(w);
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
enum match_result { NO_MATCH, PARTIAL_MATCH, COMPLETE_MATCH };
|
|
52
|
+
|
|
53
|
+
// Check if a delimiter starts at the given position
|
|
54
|
+
match_result check_at(std::string_view sv, size_t start_pos) const {
|
|
55
|
+
size_t current = 0; // Start at root
|
|
56
|
+
size_t pos = start_pos;
|
|
57
|
+
|
|
58
|
+
while (pos < sv.size()) {
|
|
59
|
+
auto it = nodes[current].children.find(sv[pos]);
|
|
60
|
+
if (it == nodes[current].children.end()) {
|
|
61
|
+
// Can't continue matching
|
|
62
|
+
return match_result{match_result::NO_MATCH};
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
current = it->second;
|
|
66
|
+
pos++;
|
|
67
|
+
|
|
68
|
+
// Check if we've matched a complete word
|
|
69
|
+
if (nodes[current].is_word) {
|
|
70
|
+
return match_result{match_result::COMPLETE_MATCH};
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
// Reached end of input while still in the trie (not at root)
|
|
75
|
+
if (current != 0) {
|
|
76
|
+
// We're in the middle of a potential match
|
|
77
|
+
return match_result{match_result::PARTIAL_MATCH};
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
// Reached end at root (no match)
|
|
81
|
+
return match_result{match_result::NO_MATCH};
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
struct prefix_and_next {
|
|
85
|
+
std::string prefix;
|
|
86
|
+
std::string next_chars;
|
|
87
|
+
};
|
|
88
|
+
|
|
89
|
+
std::vector<prefix_and_next> collect_prefix_and_next() {
|
|
90
|
+
std::string prefix;
|
|
91
|
+
std::vector<prefix_and_next> result;
|
|
92
|
+
collect_prefix_and_next(0, prefix, result);
|
|
93
|
+
return result;
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
private:
|
|
97
|
+
void collect_prefix_and_next(size_t index, std::string & prefix, std::vector<prefix_and_next> & out) {
|
|
98
|
+
if (!nodes[index].is_word) {
|
|
99
|
+
if (!nodes[index].children.empty()) {
|
|
100
|
+
std::string chars;
|
|
101
|
+
chars.reserve(nodes[index].children.size());
|
|
102
|
+
for (const auto & p : nodes[index].children) {
|
|
103
|
+
chars.push_back(p.first);
|
|
104
|
+
}
|
|
105
|
+
out.emplace_back(prefix_and_next{prefix, chars});
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
for (const auto & p : nodes[index].children) {
|
|
110
|
+
unsigned char ch = p.first;
|
|
111
|
+
auto child = p.second;
|
|
112
|
+
prefix.push_back(ch);
|
|
113
|
+
collect_prefix_and_next(child, prefix, out);
|
|
114
|
+
prefix.pop_back();
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
size_t create_node() {
|
|
119
|
+
size_t index = nodes.size();
|
|
120
|
+
nodes.emplace_back();
|
|
121
|
+
return index;
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
void insert(const std::string & word) {
|
|
125
|
+
size_t current = 0;
|
|
126
|
+
for (unsigned char ch : word) {
|
|
127
|
+
auto it = nodes[current].children.find(ch);
|
|
128
|
+
if (it == nodes[current].children.end()) {
|
|
129
|
+
size_t child = create_node();
|
|
130
|
+
nodes[child].depth = nodes[current].depth + 1;
|
|
131
|
+
nodes[current].children[ch] = child;
|
|
132
|
+
current = child;
|
|
133
|
+
} else {
|
|
134
|
+
current = it->second;
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
nodes[current].is_word = true;
|
|
138
|
+
}
|
|
139
|
+
};
|
|
140
|
+
|
|
141
|
+
static std::pair<uint32_t, size_t> parse_hex_escape(const std::string & str, size_t pos, int hex_count) {
|
|
142
|
+
if (pos + hex_count > str.length()) {
|
|
143
|
+
return {0, 0};
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
uint32_t value = 0;
|
|
147
|
+
for (int i = 0; i < hex_count; i++) {
|
|
148
|
+
char c = str[pos + i];
|
|
149
|
+
if (!is_hex_digit(c)) {
|
|
150
|
+
return {0, 0};
|
|
151
|
+
}
|
|
152
|
+
value <<= 4;
|
|
153
|
+
if ('a' <= c && c <= 'f') {
|
|
154
|
+
value += c - 'a' + 10;
|
|
155
|
+
} else if ('A' <= c && c <= 'F') {
|
|
156
|
+
value += c - 'A' + 10;
|
|
157
|
+
} else if ('0' <= c && c <= '9') {
|
|
158
|
+
value += c - '0';
|
|
159
|
+
} else {
|
|
160
|
+
break;
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
return {value, static_cast<size_t>(hex_count)};
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
static std::pair<uint32_t, size_t> parse_char_class_char(const std::string & content, size_t pos) {
|
|
167
|
+
if (content[pos] == '\\' && pos + 1 < content.length()) {
|
|
168
|
+
switch (content[pos + 1]) {
|
|
169
|
+
case 'x': {
|
|
170
|
+
auto result = parse_hex_escape(content, pos + 2, 2);
|
|
171
|
+
if (result.second > 0) {
|
|
172
|
+
return {result.first, 2 + result.second};
|
|
173
|
+
}
|
|
174
|
+
// Invalid escape, treat as literal 'x'
|
|
175
|
+
return {static_cast<uint32_t>('x'), 2};
|
|
176
|
+
}
|
|
177
|
+
case 'u': {
|
|
178
|
+
auto result = parse_hex_escape(content, pos + 2, 4);
|
|
179
|
+
if (result.second > 0) {
|
|
180
|
+
return {result.first, 2 + result.second};
|
|
181
|
+
}
|
|
182
|
+
// Invalid escape, treat as literal 'u'
|
|
183
|
+
return {static_cast<uint32_t>('u'), 2};
|
|
184
|
+
}
|
|
185
|
+
case 'U': {
|
|
186
|
+
auto result = parse_hex_escape(content, pos + 2, 8);
|
|
187
|
+
if (result.second > 0) {
|
|
188
|
+
return {result.first, 2 + result.second};
|
|
189
|
+
}
|
|
190
|
+
// Invalid escape, treat as literal 'U'
|
|
191
|
+
return {static_cast<uint32_t>('U'), 2};
|
|
192
|
+
}
|
|
193
|
+
case 'n': return {'\n', 2};
|
|
194
|
+
case 't': return {'\t', 2};
|
|
195
|
+
case 'r': return {'\r', 2};
|
|
196
|
+
case '\\': return {'\\', 2};
|
|
197
|
+
case ']': return {']', 2};
|
|
198
|
+
case '[': return {'[', 2};
|
|
199
|
+
default: return {static_cast<uint32_t>(content[pos + 1]), 2};
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
// Regular character - return as codepoint
|
|
204
|
+
return {static_cast<uint32_t>(static_cast<unsigned char>(content[pos])), 1};
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
static std::pair<std::vector<common_peg_chars_parser::char_range>, bool> parse_char_classes(const std::string & classes) {
|
|
208
|
+
std::vector<common_peg_chars_parser::char_range> ranges;
|
|
209
|
+
bool negated = false;
|
|
210
|
+
|
|
211
|
+
std::string content = classes;
|
|
212
|
+
if (content.front() == '[') {
|
|
213
|
+
content = content.substr(1);
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
if (content.back() == ']') {
|
|
217
|
+
content.pop_back();
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
// Check for negation
|
|
221
|
+
if (!content.empty() && content.front() == '^') {
|
|
222
|
+
negated = true;
|
|
223
|
+
content = content.substr(1);
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
size_t i = 0;
|
|
227
|
+
while (i < content.length()) {
|
|
228
|
+
auto [start, start_len] = parse_char_class_char(content, i);
|
|
229
|
+
i += start_len;
|
|
230
|
+
|
|
231
|
+
if (i + 1 < content.length() && content[i] == '-') {
|
|
232
|
+
// Range detected
|
|
233
|
+
auto [end, end_len] = parse_char_class_char(content, i + 1);
|
|
234
|
+
ranges.push_back(common_peg_chars_parser::char_range{start, end});
|
|
235
|
+
i += 1 + end_len;
|
|
236
|
+
} else {
|
|
237
|
+
ranges.push_back(common_peg_chars_parser::char_range{start, start});
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
return {ranges, negated};
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
void common_peg_ast_arena::visit(common_peg_ast_id id, const common_peg_ast_visitor & visitor) const {
|
|
245
|
+
if (id == COMMON_PEG_INVALID_AST_ID) {
|
|
246
|
+
return;
|
|
247
|
+
}
|
|
248
|
+
const auto & node = get(id);
|
|
249
|
+
visitor(node);
|
|
250
|
+
for (const auto & child : node.children) {
|
|
251
|
+
visit(child, visitor);
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
void common_peg_ast_arena::visit(const common_peg_parse_result & result, const common_peg_ast_visitor & visitor) const {
|
|
256
|
+
for (const auto & node : result.nodes) {
|
|
257
|
+
visit(node, visitor);
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
struct parser_executor;
|
|
262
|
+
|
|
263
|
+
common_peg_parser_id common_peg_arena::add_parser(common_peg_parser_variant parser) {
|
|
264
|
+
common_peg_parser_id id = parsers_.size();
|
|
265
|
+
parsers_.push_back(std::move(parser));
|
|
266
|
+
return id;
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
void common_peg_arena::add_rule(const std::string & name, common_peg_parser_id id) {
|
|
270
|
+
rules_[name] = id;
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
common_peg_parser_id common_peg_arena::get_rule(const std::string & name) const {
|
|
274
|
+
auto it = rules_.find(name);
|
|
275
|
+
if (it == rules_.end()) {
|
|
276
|
+
throw std::runtime_error("Rule not found: " + name);
|
|
277
|
+
}
|
|
278
|
+
return it->second;
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
struct parser_executor {
|
|
282
|
+
const common_peg_arena & arena;
|
|
283
|
+
common_peg_parse_context & ctx;
|
|
284
|
+
size_t start_pos;
|
|
285
|
+
|
|
286
|
+
parser_executor(const common_peg_arena & arena, common_peg_parse_context & ctx, size_t start)
|
|
287
|
+
: arena(arena), ctx(ctx), start_pos(start) {}
|
|
288
|
+
|
|
289
|
+
common_peg_parse_result operator()(const common_peg_epsilon_parser & /* p */) const {
|
|
290
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos);
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
common_peg_parse_result operator()(const common_peg_start_parser & /* p */) const {
|
|
294
|
+
return common_peg_parse_result(
|
|
295
|
+
start_pos == 0 ? COMMON_PEG_PARSE_RESULT_SUCCESS : COMMON_PEG_PARSE_RESULT_FAIL,
|
|
296
|
+
start_pos
|
|
297
|
+
);
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
common_peg_parse_result operator()(const common_peg_end_parser & /* p */) const {
|
|
301
|
+
return common_peg_parse_result(
|
|
302
|
+
start_pos >= ctx.input.size() ? COMMON_PEG_PARSE_RESULT_SUCCESS : COMMON_PEG_PARSE_RESULT_FAIL,
|
|
303
|
+
start_pos
|
|
304
|
+
);
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
common_peg_parse_result operator()(const common_peg_literal_parser & p) {
|
|
308
|
+
auto pos = start_pos;
|
|
309
|
+
for (auto i = 0u; i < p.literal.size(); ++i) {
|
|
310
|
+
if (pos >= ctx.input.size()) {
|
|
311
|
+
if (!ctx.is_partial) {
|
|
312
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
|
313
|
+
}
|
|
314
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
|
|
315
|
+
}
|
|
316
|
+
if (ctx.input[pos] != p.literal[i]) {
|
|
317
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
|
318
|
+
}
|
|
319
|
+
++pos;
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
common_peg_parse_result operator()(const common_peg_sequence_parser & p) {
|
|
326
|
+
auto pos = start_pos;
|
|
327
|
+
std::vector<common_peg_ast_id> nodes;
|
|
328
|
+
|
|
329
|
+
for (const auto & child_id : p.children) {
|
|
330
|
+
auto result = arena.parse(child_id, ctx, pos);
|
|
331
|
+
if (result.fail()) {
|
|
332
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, result.end);
|
|
333
|
+
}
|
|
334
|
+
|
|
335
|
+
if (!result.nodes.empty()) {
|
|
336
|
+
nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end());
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
if (result.need_more_input()) {
|
|
340
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes));
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
pos = result.end;
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes));
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
common_peg_parse_result operator()(const common_peg_choice_parser & p) {
|
|
350
|
+
auto pos = start_pos;
|
|
351
|
+
for (const auto & child_id : p.children) {
|
|
352
|
+
auto result = arena.parse(child_id, ctx, pos);
|
|
353
|
+
if (!result.fail()) {
|
|
354
|
+
return result;
|
|
355
|
+
}
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
common_peg_parse_result operator()(const common_peg_repetition_parser & p) {
|
|
362
|
+
auto pos = start_pos;
|
|
363
|
+
int match_count = 0;
|
|
364
|
+
std::vector<common_peg_ast_id> nodes;
|
|
365
|
+
|
|
366
|
+
// Try to match up to max_count times (or unlimited if max_count is -1)
|
|
367
|
+
while (p.max_count == -1 || match_count < p.max_count) {
|
|
368
|
+
if (pos >= ctx.input.size()) {
|
|
369
|
+
break;
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
auto result = arena.parse(p.child, ctx, pos);
|
|
373
|
+
|
|
374
|
+
if (result.success()) {
|
|
375
|
+
// Prevent infinite loop on empty matches
|
|
376
|
+
if (result.end == pos) {
|
|
377
|
+
break;
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
if (!result.nodes.empty()) {
|
|
381
|
+
nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end());
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
pos = result.end;
|
|
385
|
+
match_count++;
|
|
386
|
+
continue;
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
if (result.need_more_input()) {
|
|
390
|
+
if (!result.nodes.empty()) {
|
|
391
|
+
nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end());
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes));
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
// Child failed - stop trying
|
|
398
|
+
break;
|
|
399
|
+
}
|
|
400
|
+
|
|
401
|
+
// Check if we got enough matches
|
|
402
|
+
if (p.min_count > 0 && match_count < p.min_count) {
|
|
403
|
+
if (pos >= ctx.input.size() && ctx.is_partial) {
|
|
404
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos, std::move(nodes));
|
|
405
|
+
}
|
|
406
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes));
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
common_peg_parse_result operator()(const common_peg_and_parser & p) {
|
|
413
|
+
auto result = arena.parse(p.child, ctx, start_pos);
|
|
414
|
+
// Pass result but don't consume input
|
|
415
|
+
return common_peg_parse_result(result.type, start_pos);
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
common_peg_parse_result operator()(const common_peg_not_parser & p) {
|
|
419
|
+
auto result = arena.parse(p.child, ctx, start_pos);
|
|
420
|
+
|
|
421
|
+
if (result.success()) {
|
|
422
|
+
// Fail if the underlying parser matches
|
|
423
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
if (result.need_more_input()) {
|
|
427
|
+
// Propagate - need to know what child would match before negating
|
|
428
|
+
return result;
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
// Child failed, so negation succeeds
|
|
432
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos);
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
common_peg_parse_result operator()(const common_peg_any_parser & /* p */) const {
|
|
436
|
+
// Parse a single UTF-8 codepoint (not just a single byte)
|
|
437
|
+
auto result = parse_utf8_codepoint(ctx.input, start_pos);
|
|
438
|
+
|
|
439
|
+
if (result.status == utf8_parse_result::INCOMPLETE) {
|
|
440
|
+
if (!ctx.is_partial) {
|
|
441
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
|
442
|
+
}
|
|
443
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
|
|
444
|
+
}
|
|
445
|
+
if (result.status == utf8_parse_result::INVALID) {
|
|
446
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
|
447
|
+
}
|
|
448
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, start_pos + result.bytes_consumed);
|
|
449
|
+
}
|
|
450
|
+
|
|
451
|
+
common_peg_parse_result operator()(const common_peg_space_parser & /* p */) {
|
|
452
|
+
auto pos = start_pos;
|
|
453
|
+
while (pos < ctx.input.size()) {
|
|
454
|
+
auto c = static_cast<unsigned char>(ctx.input[pos]);
|
|
455
|
+
if (std::isspace(c)) {
|
|
456
|
+
++pos;
|
|
457
|
+
} else {
|
|
458
|
+
break;
|
|
459
|
+
}
|
|
460
|
+
}
|
|
461
|
+
|
|
462
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
|
|
463
|
+
}
|
|
464
|
+
|
|
465
|
+
common_peg_parse_result operator()(const common_peg_chars_parser & p) const {
|
|
466
|
+
auto pos = start_pos;
|
|
467
|
+
int match_count = 0;
|
|
468
|
+
|
|
469
|
+
// Try to match up to max_count times (or unlimited if max_count is -1)
|
|
470
|
+
while (p.max_count == -1 || match_count < p.max_count) {
|
|
471
|
+
auto result = parse_utf8_codepoint(ctx.input, pos);
|
|
472
|
+
|
|
473
|
+
if (result.status == utf8_parse_result::INCOMPLETE) {
|
|
474
|
+
if (match_count >= p.min_count) {
|
|
475
|
+
// We have enough matches, succeed with what we have
|
|
476
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
|
|
477
|
+
}
|
|
478
|
+
// Not enough matches yet
|
|
479
|
+
if (!ctx.is_partial) {
|
|
480
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
|
481
|
+
}
|
|
482
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
if (result.status == utf8_parse_result::INVALID) {
|
|
486
|
+
// Malformed UTF-8 in input
|
|
487
|
+
if (match_count >= p.min_count) {
|
|
488
|
+
// We have enough matches, succeed up to here
|
|
489
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
|
|
490
|
+
}
|
|
491
|
+
// Not enough matches, fail
|
|
492
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
|
493
|
+
}
|
|
494
|
+
|
|
495
|
+
// Check if this codepoint matches our character class
|
|
496
|
+
bool matches = false;
|
|
497
|
+
for (const auto & range : p.ranges) {
|
|
498
|
+
if (range.contains(result.codepoint)) {
|
|
499
|
+
matches = true;
|
|
500
|
+
break;
|
|
501
|
+
}
|
|
502
|
+
}
|
|
503
|
+
|
|
504
|
+
// If negated, invert the match result
|
|
505
|
+
if (p.negated) {
|
|
506
|
+
matches = !matches;
|
|
507
|
+
}
|
|
508
|
+
|
|
509
|
+
if (matches) {
|
|
510
|
+
pos += result.bytes_consumed;
|
|
511
|
+
++match_count;
|
|
512
|
+
} else {
|
|
513
|
+
// Character doesn't match, stop matching
|
|
514
|
+
break;
|
|
515
|
+
}
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
// Check if we got enough matches
|
|
519
|
+
if (match_count < p.min_count) {
|
|
520
|
+
if (pos >= ctx.input.size() && ctx.is_partial) {
|
|
521
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
|
|
522
|
+
}
|
|
523
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
|
|
524
|
+
}
|
|
525
|
+
|
|
526
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
|
|
527
|
+
}
|
|
528
|
+
|
|
529
|
+
static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos) {
|
|
530
|
+
++pos; // consume '\'
|
|
531
|
+
if (pos >= ctx.input.size()) {
|
|
532
|
+
if (!ctx.is_partial) {
|
|
533
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
|
|
534
|
+
}
|
|
535
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
|
|
536
|
+
}
|
|
537
|
+
|
|
538
|
+
switch (ctx.input[pos]) {
|
|
539
|
+
case '"':
|
|
540
|
+
case '\\':
|
|
541
|
+
case '/':
|
|
542
|
+
case 'b':
|
|
543
|
+
case 'f':
|
|
544
|
+
case 'n':
|
|
545
|
+
case 'r':
|
|
546
|
+
case 't':
|
|
547
|
+
++pos;
|
|
548
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
|
|
549
|
+
case 'u':
|
|
550
|
+
return handle_unicode_escape(ctx, start, pos);
|
|
551
|
+
default:
|
|
552
|
+
// Invalid escape sequence
|
|
553
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
|
|
554
|
+
}
|
|
555
|
+
}
|
|
556
|
+
|
|
557
|
+
static common_peg_parse_result handle_unicode_escape(common_peg_parse_context & ctx, size_t start, size_t & pos) {
|
|
558
|
+
++pos; // consume 'u'
|
|
559
|
+
for (int i = 0; i < 4; ++i) {
|
|
560
|
+
if (pos >= ctx.input.size()) {
|
|
561
|
+
if (!ctx.is_partial) {
|
|
562
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
|
|
563
|
+
}
|
|
564
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
|
|
565
|
+
}
|
|
566
|
+
if (!is_hex_digit(ctx.input[pos])) {
|
|
567
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
|
|
568
|
+
}
|
|
569
|
+
++pos;
|
|
570
|
+
}
|
|
571
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
|
|
572
|
+
}
|
|
573
|
+
|
|
574
|
+
common_peg_parse_result operator()(const common_peg_json_string_parser & /* p */) {
|
|
575
|
+
auto pos = start_pos;
|
|
576
|
+
|
|
577
|
+
// Parse string content (without quotes)
|
|
578
|
+
while (pos < ctx.input.size()) {
|
|
579
|
+
char c = ctx.input[pos];
|
|
580
|
+
|
|
581
|
+
if (c == '"') {
|
|
582
|
+
// Found closing quote - success (don't consume it)
|
|
583
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
|
|
584
|
+
}
|
|
585
|
+
|
|
586
|
+
if (c == '\\') {
|
|
587
|
+
auto result = handle_escape_sequence(ctx, start_pos, pos);
|
|
588
|
+
if (!result.success()) {
|
|
589
|
+
return result;
|
|
590
|
+
}
|
|
591
|
+
} else {
|
|
592
|
+
auto utf8_result = parse_utf8_codepoint(ctx.input, pos);
|
|
593
|
+
|
|
594
|
+
if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
|
|
595
|
+
if (!ctx.is_partial) {
|
|
596
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
|
597
|
+
}
|
|
598
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
|
|
599
|
+
}
|
|
600
|
+
|
|
601
|
+
if (utf8_result.status == utf8_parse_result::INVALID) {
|
|
602
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
|
603
|
+
}
|
|
604
|
+
|
|
605
|
+
pos += utf8_result.bytes_consumed;
|
|
606
|
+
}
|
|
607
|
+
}
|
|
608
|
+
|
|
609
|
+
// Reached end without finding closing quote
|
|
610
|
+
if (!ctx.is_partial) {
|
|
611
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
|
|
612
|
+
}
|
|
613
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
|
|
614
|
+
}
|
|
615
|
+
|
|
616
|
+
common_peg_parse_result operator()(const common_peg_until_parser & p) const {
|
|
617
|
+
trie matcher(p.delimiters);
|
|
618
|
+
|
|
619
|
+
// Scan input and check for delimiters
|
|
620
|
+
size_t pos = start_pos;
|
|
621
|
+
size_t last_valid_pos = start_pos;
|
|
622
|
+
|
|
623
|
+
while (pos < ctx.input.size()) {
|
|
624
|
+
auto utf8_result = parse_utf8_codepoint(ctx.input, pos);
|
|
625
|
+
|
|
626
|
+
if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
|
|
627
|
+
// Incomplete UTF-8 sequence
|
|
628
|
+
if (!ctx.is_partial) {
|
|
629
|
+
// Input is complete but UTF-8 is incomplete = malformed
|
|
630
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
|
631
|
+
}
|
|
632
|
+
// Return what we have so far (before incomplete sequence)
|
|
633
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, last_valid_pos);
|
|
634
|
+
}
|
|
635
|
+
|
|
636
|
+
if (utf8_result.status == utf8_parse_result::INVALID) {
|
|
637
|
+
// Malformed UTF-8
|
|
638
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
|
639
|
+
}
|
|
640
|
+
|
|
641
|
+
// Check if a delimiter starts at this position
|
|
642
|
+
auto match = matcher.check_at(ctx.input, pos);
|
|
643
|
+
|
|
644
|
+
if (match == trie::COMPLETE_MATCH) {
|
|
645
|
+
// Found a complete delimiter, return everything before it
|
|
646
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
|
|
647
|
+
}
|
|
648
|
+
|
|
649
|
+
if (match == trie::PARTIAL_MATCH) {
|
|
650
|
+
// Found a partial match extending to end of input, return everything before it
|
|
651
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
|
|
652
|
+
}
|
|
653
|
+
|
|
654
|
+
pos += utf8_result.bytes_consumed;
|
|
655
|
+
last_valid_pos = pos;
|
|
656
|
+
}
|
|
657
|
+
|
|
658
|
+
if (last_valid_pos == ctx.input.size() && ctx.is_partial) {
|
|
659
|
+
// Reached the end of a partial stream, there might still be more input that we need to consume.
|
|
660
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, last_valid_pos);
|
|
661
|
+
}
|
|
662
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, last_valid_pos);
|
|
663
|
+
}
|
|
664
|
+
|
|
665
|
+
common_peg_parse_result operator()(const common_peg_schema_parser & p) {
|
|
666
|
+
return arena.parse(p.child, ctx, start_pos);
|
|
667
|
+
}
|
|
668
|
+
|
|
669
|
+
common_peg_parse_result operator()(const common_peg_rule_parser & p) {
|
|
670
|
+
// Parse the child
|
|
671
|
+
auto result = arena.parse(p.child, ctx, start_pos);
|
|
672
|
+
|
|
673
|
+
if (!result.fail()) {
|
|
674
|
+
std::string_view text;
|
|
675
|
+
if (result.start < ctx.input.size()) {
|
|
676
|
+
text = std::string_view(ctx.input).substr(result.start, result.end - result.start);
|
|
677
|
+
}
|
|
678
|
+
|
|
679
|
+
auto node_id = ctx.ast.add_node(
|
|
680
|
+
p.name,
|
|
681
|
+
"",
|
|
682
|
+
result.start,
|
|
683
|
+
result.end,
|
|
684
|
+
text,
|
|
685
|
+
std::move(result.nodes),
|
|
686
|
+
result.need_more_input()
|
|
687
|
+
);
|
|
688
|
+
|
|
689
|
+
return common_peg_parse_result(result.type, result.start, result.end, { node_id });
|
|
690
|
+
}
|
|
691
|
+
|
|
692
|
+
return result;
|
|
693
|
+
}
|
|
694
|
+
|
|
695
|
+
common_peg_parse_result operator()(const common_peg_tag_parser & p) {
|
|
696
|
+
// Parse the child
|
|
697
|
+
auto result = arena.parse(p.child, ctx, start_pos);
|
|
698
|
+
|
|
699
|
+
if (!result.fail()) {
|
|
700
|
+
std::string_view text;
|
|
701
|
+
if (result.start < ctx.input.size()) {
|
|
702
|
+
text = std::string_view(ctx.input).substr(result.start, result.end - result.start);
|
|
703
|
+
}
|
|
704
|
+
|
|
705
|
+
auto node_id = ctx.ast.add_node(
|
|
706
|
+
"",
|
|
707
|
+
p.tag,
|
|
708
|
+
result.start,
|
|
709
|
+
result.end,
|
|
710
|
+
text,
|
|
711
|
+
std::move(result.nodes),
|
|
712
|
+
result.need_more_input()
|
|
713
|
+
);
|
|
714
|
+
|
|
715
|
+
return common_peg_parse_result(result.type, result.start, result.end, { node_id });
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
return result;
|
|
719
|
+
}
|
|
720
|
+
|
|
721
|
+
common_peg_parse_result operator()(const common_peg_ref_parser & p) {
|
|
722
|
+
auto rule_id = arena.get_rule(p.name);
|
|
723
|
+
return arena.parse(rule_id, ctx, start_pos);
|
|
724
|
+
}
|
|
725
|
+
|
|
726
|
+
common_peg_parse_result operator()(const common_peg_atomic_parser & p) {
|
|
727
|
+
auto result = arena.parse(p.child, ctx, start_pos);
|
|
728
|
+
if (result.need_more_input()) {
|
|
729
|
+
// Clear nodes so they don't propagate up.
|
|
730
|
+
result.nodes.clear();
|
|
731
|
+
}
|
|
732
|
+
return result;
|
|
733
|
+
}
|
|
734
|
+
};
|
|
735
|
+
|
|
736
|
+
common_peg_parse_result common_peg_arena::parse(common_peg_parse_context & ctx, size_t start) const {
|
|
737
|
+
if (root_ == COMMON_PEG_INVALID_PARSER_ID) {
|
|
738
|
+
throw std::runtime_error("No root parser set");
|
|
739
|
+
}
|
|
740
|
+
return parse(root_, ctx, start);
|
|
741
|
+
}
|
|
742
|
+
|
|
743
|
+
common_peg_parse_result common_peg_arena::parse(common_peg_parser_id id, common_peg_parse_context & ctx, size_t start) const {
|
|
744
|
+
// Execute parser
|
|
745
|
+
const auto & parser = parsers_.at(id);
|
|
746
|
+
parser_executor exec(*this, ctx, start);
|
|
747
|
+
return std::visit(exec, parser);
|
|
748
|
+
}
|
|
749
|
+
|
|
750
|
+
common_peg_parser_id common_peg_arena::resolve_ref(common_peg_parser_id id) {
|
|
751
|
+
const auto & parser = parsers_.at(id);
|
|
752
|
+
if (auto ref = std::get_if<common_peg_ref_parser>(&parser)) {
|
|
753
|
+
return get_rule(ref->name);
|
|
754
|
+
}
|
|
755
|
+
return id;
|
|
756
|
+
}
|
|
757
|
+
|
|
758
|
+
void common_peg_arena::resolve_refs() {
|
|
759
|
+
// Walk through all parsers and replace refs with their corresponding rule IDs
|
|
760
|
+
for (auto & parser : parsers_) {
|
|
761
|
+
std::visit([this](auto & p) {
|
|
762
|
+
using T = std::decay_t<decltype(p)>;
|
|
763
|
+
|
|
764
|
+
if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
|
|
765
|
+
for (auto & child : p.children) {
|
|
766
|
+
child = resolve_ref(child);
|
|
767
|
+
}
|
|
768
|
+
} else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
|
|
769
|
+
for (auto & child : p.children) {
|
|
770
|
+
child = resolve_ref(child);
|
|
771
|
+
}
|
|
772
|
+
} else if constexpr (std::is_same_v<T, common_peg_repetition_parser> ||
|
|
773
|
+
std::is_same_v<T, common_peg_and_parser> ||
|
|
774
|
+
std::is_same_v<T, common_peg_not_parser> ||
|
|
775
|
+
std::is_same_v<T, common_peg_tag_parser> ||
|
|
776
|
+
std::is_same_v<T, common_peg_atomic_parser>) {
|
|
777
|
+
p.child = resolve_ref(p.child);
|
|
778
|
+
} else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
|
|
779
|
+
p.child = resolve_ref(p.child);
|
|
780
|
+
} else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
|
|
781
|
+
p.child = resolve_ref(p.child);
|
|
782
|
+
} else if constexpr (std::is_same_v<T, common_peg_epsilon_parser> ||
|
|
783
|
+
std::is_same_v<T, common_peg_start_parser> ||
|
|
784
|
+
std::is_same_v<T, common_peg_end_parser> ||
|
|
785
|
+
std::is_same_v<T, common_peg_ref_parser> ||
|
|
786
|
+
std::is_same_v<T, common_peg_until_parser> ||
|
|
787
|
+
std::is_same_v<T, common_peg_literal_parser> ||
|
|
788
|
+
std::is_same_v<T, common_peg_json_string_parser> ||
|
|
789
|
+
std::is_same_v<T, common_peg_chars_parser> ||
|
|
790
|
+
std::is_same_v<T, common_peg_any_parser> ||
|
|
791
|
+
std::is_same_v<T, common_peg_space_parser>) {
|
|
792
|
+
// These rules do not have children
|
|
793
|
+
} else {
|
|
794
|
+
static_assert(is_always_false_v<T>);
|
|
795
|
+
}
|
|
796
|
+
}, parser);
|
|
797
|
+
}
|
|
798
|
+
|
|
799
|
+
// Also flatten root if it's a ref
|
|
800
|
+
if (root_ != COMMON_PEG_INVALID_PARSER_ID) {
|
|
801
|
+
root_ = resolve_ref(root_);
|
|
802
|
+
}
|
|
803
|
+
}
|
|
804
|
+
|
|
805
|
+
std::string common_peg_arena::dump(common_peg_parser_id id) const {
|
|
806
|
+
const auto & parser = parsers_.at(id);
|
|
807
|
+
|
|
808
|
+
return std::visit([this](const auto & p) -> std::string {
|
|
809
|
+
using T = std::decay_t<decltype(p)>;
|
|
810
|
+
|
|
811
|
+
if constexpr (std::is_same_v<T, common_peg_epsilon_parser>) {
|
|
812
|
+
return "Epsilon";
|
|
813
|
+
} else if constexpr (std::is_same_v<T, common_peg_start_parser>) {
|
|
814
|
+
return "Start";
|
|
815
|
+
} else if constexpr (std::is_same_v<T, common_peg_end_parser>) {
|
|
816
|
+
return "End";
|
|
817
|
+
} else if constexpr (std::is_same_v<T, common_peg_literal_parser>) {
|
|
818
|
+
return "Literal(" + p.literal + ")";
|
|
819
|
+
} else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
|
|
820
|
+
std::vector<std::string> parts;
|
|
821
|
+
for (const auto & child : p.children) {
|
|
822
|
+
parts.push_back(dump(child));
|
|
823
|
+
}
|
|
824
|
+
return "Sequence(" + string_join(parts, ", ") + ")";
|
|
825
|
+
} else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
|
|
826
|
+
std::vector<std::string> parts;
|
|
827
|
+
for (const auto & child : p.children) {
|
|
828
|
+
parts.push_back(dump(child));
|
|
829
|
+
}
|
|
830
|
+
return "Choice(" + string_join(parts, ", ") + ")";
|
|
831
|
+
} else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
|
|
832
|
+
if (p.max_count == -1) {
|
|
833
|
+
return "Repetition(" + dump(p.child) + ", " + std::to_string(p.min_count) + ", unbounded)";
|
|
834
|
+
}
|
|
835
|
+
return "Repetition(" + dump(p.child) + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
|
|
836
|
+
} else if constexpr (std::is_same_v<T, common_peg_and_parser>) {
|
|
837
|
+
return "And(" + dump(p.child) + ")";
|
|
838
|
+
} else if constexpr (std::is_same_v<T, common_peg_not_parser>) {
|
|
839
|
+
return "Not(" + dump(p.child) + ")";
|
|
840
|
+
} else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
|
|
841
|
+
return "Any";
|
|
842
|
+
} else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
|
|
843
|
+
return "Space";
|
|
844
|
+
} else if constexpr (std::is_same_v<T, common_peg_chars_parser>) {
|
|
845
|
+
if (p.max_count == -1) {
|
|
846
|
+
return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", unbounded)";
|
|
847
|
+
}
|
|
848
|
+
return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
|
|
849
|
+
} else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
|
|
850
|
+
return "JsonString()";
|
|
851
|
+
} else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
|
|
852
|
+
return "Until(" + string_join(p.delimiters, " | ") + ")";
|
|
853
|
+
} else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
|
|
854
|
+
return "Schema(" + dump(p.child) + ", " + (p.schema ? p.schema->dump() : "null") + ")";
|
|
855
|
+
} else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
|
|
856
|
+
return "Rule(" + p.name + ", " + dump(p.child) + ")";
|
|
857
|
+
} else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
|
|
858
|
+
return "Ref(" + p.name + ")";
|
|
859
|
+
} else {
|
|
860
|
+
return "Unknown";
|
|
861
|
+
}
|
|
862
|
+
}, parser);
|
|
863
|
+
}
|
|
864
|
+
|
|
865
|
+
common_peg_parser & common_peg_parser::operator=(const common_peg_parser & other) {
|
|
866
|
+
id_ = other.id_;
|
|
867
|
+
return *this;
|
|
868
|
+
}
|
|
869
|
+
|
|
870
|
+
common_peg_parser & common_peg_parser::operator+=(const common_peg_parser & other) {
|
|
871
|
+
id_ = builder_.sequence({id_, other.id_});
|
|
872
|
+
return *this;
|
|
873
|
+
}
|
|
874
|
+
|
|
875
|
+
common_peg_parser & common_peg_parser::operator|=(const common_peg_parser & other) {
|
|
876
|
+
id_ = builder_.choice({id_, other.id_});
|
|
877
|
+
return *this;
|
|
878
|
+
}
|
|
879
|
+
|
|
880
|
+
common_peg_parser common_peg_parser::operator+(const common_peg_parser & other) const {
|
|
881
|
+
return builder_.sequence({id_, other.id_});
|
|
882
|
+
}
|
|
883
|
+
|
|
884
|
+
common_peg_parser common_peg_parser::operator|(const common_peg_parser & other) const {
|
|
885
|
+
return builder_.choice({id_, other.id_});
|
|
886
|
+
}
|
|
887
|
+
|
|
888
|
+
common_peg_parser common_peg_parser::operator<<(const common_peg_parser & other) const {
|
|
889
|
+
return builder_.sequence({id_, builder_.space(), other.id_});
|
|
890
|
+
}
|
|
891
|
+
|
|
892
|
+
common_peg_parser common_peg_parser::operator+(const char * str) const {
|
|
893
|
+
return *this + builder_.literal(str);
|
|
894
|
+
}
|
|
895
|
+
|
|
896
|
+
common_peg_parser common_peg_parser::operator+(const std::string & str) const {
|
|
897
|
+
return *this + builder_.literal(str);
|
|
898
|
+
}
|
|
899
|
+
|
|
900
|
+
common_peg_parser common_peg_parser::operator<<(const char * str) const {
|
|
901
|
+
return *this << builder_.literal(str);
|
|
902
|
+
}
|
|
903
|
+
|
|
904
|
+
common_peg_parser common_peg_parser::operator<<(const std::string & str) const {
|
|
905
|
+
return *this << builder_.literal(str);
|
|
906
|
+
}
|
|
907
|
+
|
|
908
|
+
common_peg_parser common_peg_parser::operator|(const char * str) const {
|
|
909
|
+
return *this | builder_.literal(str);
|
|
910
|
+
}
|
|
911
|
+
|
|
912
|
+
common_peg_parser common_peg_parser::operator|(const std::string & str) const {
|
|
913
|
+
return *this | builder_.literal(str);
|
|
914
|
+
}
|
|
915
|
+
|
|
916
|
+
common_peg_parser operator+(const char * str, const common_peg_parser & p) {
|
|
917
|
+
return p.builder().literal(str) + p;
|
|
918
|
+
}
|
|
919
|
+
|
|
920
|
+
common_peg_parser operator+(const std::string & str, const common_peg_parser & p) {
|
|
921
|
+
return operator+(str.c_str(), p);
|
|
922
|
+
}
|
|
923
|
+
|
|
924
|
+
common_peg_parser operator<<(const char * str, const common_peg_parser & p) {
|
|
925
|
+
return p.builder().literal(str) << p;
|
|
926
|
+
}
|
|
927
|
+
|
|
928
|
+
common_peg_parser operator<<(const std::string & str, const common_peg_parser & p) {
|
|
929
|
+
return operator<<(str.c_str(), p);
|
|
930
|
+
}
|
|
931
|
+
|
|
932
|
+
common_peg_parser operator|(const char * str, const common_peg_parser & p) {
|
|
933
|
+
return p.builder().literal(str) | p;
|
|
934
|
+
}
|
|
935
|
+
|
|
936
|
+
common_peg_parser operator|(const std::string & str, const common_peg_parser & p) {
|
|
937
|
+
return operator|(str.c_str(), p);
|
|
938
|
+
}
|
|
939
|
+
|
|
940
|
+
static std::string rule_name(const std::string & name) {
|
|
941
|
+
static const std::regex invalid_rule_chars_re("[^a-zA-Z0-9-]+");
|
|
942
|
+
return std::regex_replace(name, invalid_rule_chars_re, "-");
|
|
943
|
+
}
|
|
944
|
+
|
|
945
|
+
common_peg_parser_builder::common_peg_parser_builder() {}
|
|
946
|
+
|
|
947
|
+
common_peg_parser common_peg_parser_builder::sequence(const std::vector<common_peg_parser_id> & parsers) {
|
|
948
|
+
// Flatten nested sequences
|
|
949
|
+
std::vector<common_peg_parser_id> flattened;
|
|
950
|
+
for (const auto & p : parsers) {
|
|
951
|
+
const auto & parser = arena_.get(p);
|
|
952
|
+
if (auto seq = std::get_if<common_peg_sequence_parser>(&parser)) {
|
|
953
|
+
flattened.insert(flattened.end(), seq->children.begin(), seq->children.end());
|
|
954
|
+
} else {
|
|
955
|
+
flattened.push_back(p);
|
|
956
|
+
}
|
|
957
|
+
}
|
|
958
|
+
return wrap(arena_.add_parser(common_peg_sequence_parser{flattened}));
|
|
959
|
+
}
|
|
960
|
+
|
|
961
|
+
common_peg_parser common_peg_parser_builder::sequence(const std::vector<common_peg_parser> & parsers) {
|
|
962
|
+
std::vector<common_peg_parser_id> ids;
|
|
963
|
+
ids.reserve(parsers.size());
|
|
964
|
+
for (const auto & p : parsers) {
|
|
965
|
+
ids.push_back(p.id());
|
|
966
|
+
}
|
|
967
|
+
return sequence(ids);
|
|
968
|
+
}
|
|
969
|
+
|
|
970
|
+
common_peg_parser common_peg_parser_builder::sequence(std::initializer_list<common_peg_parser> parsers) {
|
|
971
|
+
std::vector<common_peg_parser_id> ids;
|
|
972
|
+
ids.reserve(parsers.size());
|
|
973
|
+
for (const auto & p : parsers) {
|
|
974
|
+
ids.push_back(p.id());
|
|
975
|
+
}
|
|
976
|
+
return sequence(ids);
|
|
977
|
+
}
|
|
978
|
+
|
|
979
|
+
common_peg_parser common_peg_parser_builder::choice(const std::vector<common_peg_parser_id> & parsers) {
|
|
980
|
+
// Flatten nested choices
|
|
981
|
+
std::vector<common_peg_parser_id> flattened;
|
|
982
|
+
for (const auto & p : parsers) {
|
|
983
|
+
const auto & parser = arena_.get(p);
|
|
984
|
+
if (auto choice = std::get_if<common_peg_choice_parser>(&parser)) {
|
|
985
|
+
flattened.insert(flattened.end(), choice->children.begin(), choice->children.end());
|
|
986
|
+
} else {
|
|
987
|
+
flattened.push_back(p);
|
|
988
|
+
}
|
|
989
|
+
}
|
|
990
|
+
return wrap(arena_.add_parser(common_peg_choice_parser{flattened}));
|
|
991
|
+
}
|
|
992
|
+
|
|
993
|
+
common_peg_parser common_peg_parser_builder::choice(const std::vector<common_peg_parser> & parsers) {
|
|
994
|
+
std::vector<common_peg_parser_id> ids;
|
|
995
|
+
ids.reserve(parsers.size());
|
|
996
|
+
for (const auto & p : parsers) {
|
|
997
|
+
ids.push_back(p.id());
|
|
998
|
+
}
|
|
999
|
+
return choice(ids);
|
|
1000
|
+
}
|
|
1001
|
+
|
|
1002
|
+
common_peg_parser common_peg_parser_builder::choice(std::initializer_list<common_peg_parser> parsers) {
|
|
1003
|
+
std::vector<common_peg_parser_id> ids;
|
|
1004
|
+
ids.reserve(parsers.size());
|
|
1005
|
+
for (const auto & p : parsers) {
|
|
1006
|
+
ids.push_back(p.id());
|
|
1007
|
+
}
|
|
1008
|
+
return choice(ids);
|
|
1009
|
+
}
|
|
1010
|
+
|
|
1011
|
+
common_peg_parser common_peg_parser_builder::chars(const std::string & classes, int min, int max) {
|
|
1012
|
+
auto [ranges, negated] = parse_char_classes(classes);
|
|
1013
|
+
return wrap(arena_.add_parser(common_peg_chars_parser{classes, ranges, negated, min, max}));
|
|
1014
|
+
}
|
|
1015
|
+
|
|
1016
|
+
common_peg_parser common_peg_parser_builder::schema(const common_peg_parser & p, const std::string & name, const nlohmann::ordered_json & schema, bool raw) {
|
|
1017
|
+
return wrap(arena_.add_parser(common_peg_schema_parser{p.id(), name, std::make_shared<nlohmann::ordered_json>(schema), raw}));
|
|
1018
|
+
}
|
|
1019
|
+
|
|
1020
|
+
common_peg_parser common_peg_parser_builder::rule(const std::string & name, const common_peg_parser & p, bool trigger) {
|
|
1021
|
+
auto clean_name = rule_name(name);
|
|
1022
|
+
auto rule_id = arena_.add_parser(common_peg_rule_parser{clean_name, p.id(), trigger});
|
|
1023
|
+
arena_.add_rule(clean_name, rule_id);
|
|
1024
|
+
return ref(clean_name);
|
|
1025
|
+
}
|
|
1026
|
+
|
|
1027
|
+
common_peg_parser common_peg_parser_builder::rule(const std::string & name, const std::function<common_peg_parser()> & builder_fn, bool trigger) {
|
|
1028
|
+
auto clean_name = rule_name(name);
|
|
1029
|
+
if (arena_.has_rule(clean_name)) {
|
|
1030
|
+
return ref(clean_name);
|
|
1031
|
+
}
|
|
1032
|
+
|
|
1033
|
+
// Create placeholder rule to allow recursive references
|
|
1034
|
+
auto placeholder = any(); // Temporary placeholder
|
|
1035
|
+
auto placeholder_rule_id = arena_.add_parser(common_peg_rule_parser{clean_name, placeholder.id(), trigger});
|
|
1036
|
+
arena_.add_rule(clean_name, placeholder_rule_id);
|
|
1037
|
+
|
|
1038
|
+
// Build the actual parser
|
|
1039
|
+
auto parser = builder_fn();
|
|
1040
|
+
|
|
1041
|
+
// Replace placeholder with actual rule
|
|
1042
|
+
auto rule_id = arena_.add_parser(common_peg_rule_parser{clean_name, parser.id(), trigger});
|
|
1043
|
+
arena_.rules_[clean_name] = rule_id;
|
|
1044
|
+
|
|
1045
|
+
return ref(clean_name);
|
|
1046
|
+
}
|
|
1047
|
+
|
|
1048
|
+
void common_peg_parser_builder::set_root(const common_peg_parser & p) {
|
|
1049
|
+
arena_.set_root(p.id());
|
|
1050
|
+
}
|
|
1051
|
+
|
|
1052
|
+
common_peg_arena common_peg_parser_builder::build() {
|
|
1053
|
+
arena_.resolve_refs();
|
|
1054
|
+
return std::move(arena_);
|
|
1055
|
+
}
|
|
1056
|
+
|
|
1057
|
+
// JSON parsers
|
|
1058
|
+
common_peg_parser common_peg_parser_builder::json_number() {
|
|
1059
|
+
return rule("json-number", [this]() {
|
|
1060
|
+
auto digit1_9 = chars("[1-9]", 1, 1);
|
|
1061
|
+
auto digits = chars("[0-9]");
|
|
1062
|
+
auto int_part = choice({literal("0"), sequence({digit1_9, chars("[0-9]", 0, -1)})});
|
|
1063
|
+
auto frac = sequence({literal("."), digits});
|
|
1064
|
+
auto exp = sequence({choice({literal("e"), literal("E")}), optional(chars("[+-]", 1, 1)), digits});
|
|
1065
|
+
return sequence({optional(literal("-")), int_part, optional(frac), optional(exp), space()});
|
|
1066
|
+
});
|
|
1067
|
+
}
|
|
1068
|
+
|
|
1069
|
+
common_peg_parser common_peg_parser_builder::json_string() {
|
|
1070
|
+
return rule("json-string", [this]() {
|
|
1071
|
+
return sequence({literal("\""), json_string_content(), literal("\""), space()});
|
|
1072
|
+
});
|
|
1073
|
+
}
|
|
1074
|
+
|
|
1075
|
+
common_peg_parser common_peg_parser_builder::json_bool() {
|
|
1076
|
+
return rule("json-bool", [this]() {
|
|
1077
|
+
return sequence({choice({literal("true"), literal("false")}), space()});
|
|
1078
|
+
});
|
|
1079
|
+
}
|
|
1080
|
+
|
|
1081
|
+
common_peg_parser common_peg_parser_builder::json_null() {
|
|
1082
|
+
return rule("json-null", [this]() {
|
|
1083
|
+
return sequence({literal("null"), space()});
|
|
1084
|
+
});
|
|
1085
|
+
}
|
|
1086
|
+
|
|
1087
|
+
common_peg_parser common_peg_parser_builder::json_object() {
|
|
1088
|
+
return rule("json-object", [this]() {
|
|
1089
|
+
auto ws = space();
|
|
1090
|
+
auto member = sequence({json_string(), ws, literal(":"), ws, json()});
|
|
1091
|
+
auto members = sequence({member, zero_or_more(sequence({ws, literal(","), ws, member}))});
|
|
1092
|
+
return sequence({
|
|
1093
|
+
literal("{"),
|
|
1094
|
+
ws,
|
|
1095
|
+
choice({
|
|
1096
|
+
literal("}"),
|
|
1097
|
+
sequence({members, ws, literal("}")})
|
|
1098
|
+
}),
|
|
1099
|
+
ws
|
|
1100
|
+
});
|
|
1101
|
+
});
|
|
1102
|
+
}
|
|
1103
|
+
|
|
1104
|
+
common_peg_parser common_peg_parser_builder::json_array() {
|
|
1105
|
+
return rule("json-array", [this]() {
|
|
1106
|
+
auto ws = space();
|
|
1107
|
+
auto elements = sequence({json(), zero_or_more(sequence({literal(","), ws, json()}))});
|
|
1108
|
+
return sequence({
|
|
1109
|
+
literal("["),
|
|
1110
|
+
ws,
|
|
1111
|
+
choice({
|
|
1112
|
+
literal("]"),
|
|
1113
|
+
sequence({elements, ws, literal("]")})
|
|
1114
|
+
}),
|
|
1115
|
+
ws
|
|
1116
|
+
});
|
|
1117
|
+
});
|
|
1118
|
+
}
|
|
1119
|
+
|
|
1120
|
+
common_peg_parser common_peg_parser_builder::json() {
|
|
1121
|
+
return rule("json-value", [this]() {
|
|
1122
|
+
return choice({
|
|
1123
|
+
json_object(),
|
|
1124
|
+
json_array(),
|
|
1125
|
+
json_string(),
|
|
1126
|
+
json_number(),
|
|
1127
|
+
json_bool(),
|
|
1128
|
+
json_null()
|
|
1129
|
+
});
|
|
1130
|
+
});
|
|
1131
|
+
}
|
|
1132
|
+
|
|
1133
|
+
common_peg_parser common_peg_parser_builder::json_string_content() {
|
|
1134
|
+
return wrap(arena_.add_parser(common_peg_json_string_parser{}));
|
|
1135
|
+
}
|
|
1136
|
+
|
|
1137
|
+
common_peg_parser common_peg_parser_builder::json_member(const std::string & key, const common_peg_parser & p) {
|
|
1138
|
+
auto ws = space();
|
|
1139
|
+
return sequence({
|
|
1140
|
+
literal("\"" + key + "\""),
|
|
1141
|
+
ws,
|
|
1142
|
+
literal(":"),
|
|
1143
|
+
ws,
|
|
1144
|
+
p,
|
|
1145
|
+
});
|
|
1146
|
+
}
|
|
1147
|
+
|
|
1148
|
+
|
|
1149
|
+
static std::string gbnf_escape_char_class(char c) {
|
|
1150
|
+
switch (c) {
|
|
1151
|
+
case '\n': return "\\n";
|
|
1152
|
+
case '\t': return "\\t";
|
|
1153
|
+
case '\r': return "\\r";
|
|
1154
|
+
case '\\': return "\\\\";
|
|
1155
|
+
case ']': return "\\]";
|
|
1156
|
+
case '[': return "\\[";
|
|
1157
|
+
default: return std::string(1, c);
|
|
1158
|
+
}
|
|
1159
|
+
}
|
|
1160
|
+
|
|
1161
|
+
static std::string gbnf_excluding_pattern(const std::vector<std::string> & strings) {
|
|
1162
|
+
trie matcher(strings);
|
|
1163
|
+
auto pieces = matcher.collect_prefix_and_next();
|
|
1164
|
+
|
|
1165
|
+
std::string pattern;
|
|
1166
|
+
for (size_t i = 0; i < pieces.size(); ++i) {
|
|
1167
|
+
if (i > 0) {
|
|
1168
|
+
pattern += " | ";
|
|
1169
|
+
}
|
|
1170
|
+
|
|
1171
|
+
const auto & pre = pieces[i].prefix;
|
|
1172
|
+
const auto & chars = pieces[i].next_chars;
|
|
1173
|
+
|
|
1174
|
+
std::string cls;
|
|
1175
|
+
cls.reserve(chars.size());
|
|
1176
|
+
for (const auto & ch : chars) {
|
|
1177
|
+
cls += gbnf_escape_char_class(ch);
|
|
1178
|
+
}
|
|
1179
|
+
|
|
1180
|
+
if (!pre.empty()) {
|
|
1181
|
+
pattern += gbnf_format_literal(pre) + " [^" + cls + "]";
|
|
1182
|
+
} else {
|
|
1183
|
+
pattern += "[^" + cls + "]";
|
|
1184
|
+
}
|
|
1185
|
+
}
|
|
1186
|
+
|
|
1187
|
+
return "(" + pattern + ")*";
|
|
1188
|
+
}
|
|
1189
|
+
|
|
1190
|
+
static std::unordered_set<std::string> collect_reachable_rules(
|
|
1191
|
+
const common_peg_arena & arena,
|
|
1192
|
+
const common_peg_parser_id & rule
|
|
1193
|
+
) {
|
|
1194
|
+
std::unordered_set<std::string> reachable;
|
|
1195
|
+
std::unordered_set<std::string> visited;
|
|
1196
|
+
|
|
1197
|
+
std::function<void(common_peg_parser_id)> visit = [&](common_peg_parser_id id) {
|
|
1198
|
+
const auto & parser = arena.get(id);
|
|
1199
|
+
|
|
1200
|
+
std::visit([&](const auto & p) {
|
|
1201
|
+
using T = std::decay_t<decltype(p)>;
|
|
1202
|
+
|
|
1203
|
+
if constexpr (std::is_same_v<T, common_peg_epsilon_parser> ||
|
|
1204
|
+
std::is_same_v<T, common_peg_start_parser> ||
|
|
1205
|
+
std::is_same_v<T, common_peg_end_parser> ||
|
|
1206
|
+
std::is_same_v<T, common_peg_until_parser> ||
|
|
1207
|
+
std::is_same_v<T, common_peg_literal_parser> ||
|
|
1208
|
+
std::is_same_v<T, common_peg_chars_parser> ||
|
|
1209
|
+
std::is_same_v<T, common_peg_space_parser> ||
|
|
1210
|
+
std::is_same_v<T, common_peg_any_parser> ||
|
|
1211
|
+
std::is_same_v<T, common_peg_json_string_parser>) {
|
|
1212
|
+
// These parsers do not have any children
|
|
1213
|
+
} else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
|
|
1214
|
+
for (auto child : p.children) {
|
|
1215
|
+
visit(child);
|
|
1216
|
+
}
|
|
1217
|
+
} else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
|
|
1218
|
+
for (auto child : p.children) {
|
|
1219
|
+
visit(child);
|
|
1220
|
+
}
|
|
1221
|
+
} else if constexpr (std::is_same_v<T, common_peg_repetition_parser> ||
|
|
1222
|
+
std::is_same_v<T, common_peg_and_parser> ||
|
|
1223
|
+
std::is_same_v<T, common_peg_not_parser> ||
|
|
1224
|
+
std::is_same_v<T, common_peg_tag_parser> ||
|
|
1225
|
+
std::is_same_v<T, common_peg_atomic_parser> ||
|
|
1226
|
+
std::is_same_v<T, common_peg_schema_parser>) {
|
|
1227
|
+
visit(p.child);
|
|
1228
|
+
} else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
|
|
1229
|
+
if (visited.find(p.name) == visited.end()) {
|
|
1230
|
+
visited.insert(p.name);
|
|
1231
|
+
reachable.insert(p.name);
|
|
1232
|
+
visit(p.child);
|
|
1233
|
+
}
|
|
1234
|
+
} else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
|
|
1235
|
+
// Traverse rules so we pick up everything
|
|
1236
|
+
auto referenced_rule = arena.get_rule(p.name);
|
|
1237
|
+
visit(referenced_rule);
|
|
1238
|
+
} else {
|
|
1239
|
+
static_assert(is_always_false_v<T>);
|
|
1240
|
+
}
|
|
1241
|
+
}, parser);
|
|
1242
|
+
};
|
|
1243
|
+
|
|
1244
|
+
visit(rule);
|
|
1245
|
+
return reachable;
|
|
1246
|
+
}
|
|
1247
|
+
|
|
1248
|
+
// GBNF generation implementation
|
|
1249
|
+
void common_peg_arena::build_grammar(const common_grammar_builder & builder, bool lazy) const {
|
|
1250
|
+
// Generate GBNF for a parser
|
|
1251
|
+
std::function<std::string(common_peg_parser_id)> to_gbnf = [&](common_peg_parser_id id) -> std::string {
|
|
1252
|
+
const auto & parser = parsers_.at(id);
|
|
1253
|
+
|
|
1254
|
+
return std::visit([&](const auto & p) -> std::string {
|
|
1255
|
+
using T = std::decay_t<decltype(p)>;
|
|
1256
|
+
|
|
1257
|
+
if constexpr (std::is_same_v<T, common_peg_epsilon_parser> ||
|
|
1258
|
+
std::is_same_v<T, common_peg_start_parser> ||
|
|
1259
|
+
std::is_same_v<T, common_peg_end_parser>) {
|
|
1260
|
+
return "";
|
|
1261
|
+
} else if constexpr (std::is_same_v<T, common_peg_literal_parser>) {
|
|
1262
|
+
return gbnf_format_literal(p.literal);
|
|
1263
|
+
} else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
|
|
1264
|
+
std::string s;
|
|
1265
|
+
for (const auto & child : p.children) {
|
|
1266
|
+
if (!s.empty()) {
|
|
1267
|
+
s += " ";
|
|
1268
|
+
}
|
|
1269
|
+
auto child_gbnf = to_gbnf(child);
|
|
1270
|
+
const auto & child_parser = parsers_.at(child);
|
|
1271
|
+
if (std::holds_alternative<common_peg_choice_parser>(child_parser) ||
|
|
1272
|
+
std::holds_alternative<common_peg_sequence_parser>(child_parser)) {
|
|
1273
|
+
s += "(" + child_gbnf + ")";
|
|
1274
|
+
} else {
|
|
1275
|
+
s += child_gbnf;
|
|
1276
|
+
}
|
|
1277
|
+
}
|
|
1278
|
+
return s;
|
|
1279
|
+
} else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
|
|
1280
|
+
std::string s;
|
|
1281
|
+
for (const auto & child : p.children) {
|
|
1282
|
+
if (!s.empty()) {
|
|
1283
|
+
s += " | ";
|
|
1284
|
+
}
|
|
1285
|
+
auto child_gbnf = to_gbnf(child);
|
|
1286
|
+
const auto & child_parser = parsers_.at(child);
|
|
1287
|
+
if (std::holds_alternative<common_peg_choice_parser>(child_parser)) {
|
|
1288
|
+
s += "(" + child_gbnf + ")";
|
|
1289
|
+
} else {
|
|
1290
|
+
s += child_gbnf;
|
|
1291
|
+
}
|
|
1292
|
+
}
|
|
1293
|
+
return s;
|
|
1294
|
+
} else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
|
|
1295
|
+
auto child_gbnf = to_gbnf(p.child);
|
|
1296
|
+
const auto & child_parser = parsers_.at(p.child);
|
|
1297
|
+
if (std::holds_alternative<common_peg_choice_parser>(child_parser) ||
|
|
1298
|
+
std::holds_alternative<common_peg_sequence_parser>(child_parser)) {
|
|
1299
|
+
child_gbnf = "(" + child_gbnf + ")";
|
|
1300
|
+
}
|
|
1301
|
+
if (p.min_count == 0 && p.max_count == 1) {
|
|
1302
|
+
return child_gbnf + "?";
|
|
1303
|
+
}
|
|
1304
|
+
if (p.min_count == 0 && p.max_count == -1) {
|
|
1305
|
+
return child_gbnf + "*";
|
|
1306
|
+
}
|
|
1307
|
+
if (p.min_count == 1 && p.max_count == -1) {
|
|
1308
|
+
return child_gbnf + "+";
|
|
1309
|
+
}
|
|
1310
|
+
if (p.max_count == -1) {
|
|
1311
|
+
return child_gbnf + "{" + std::to_string(p.min_count) + ",}";
|
|
1312
|
+
}
|
|
1313
|
+
if (p.min_count == p.max_count) {
|
|
1314
|
+
if (p.min_count == 1) {
|
|
1315
|
+
return child_gbnf;
|
|
1316
|
+
}
|
|
1317
|
+
return child_gbnf + "{" + std::to_string(p.min_count) + "}";
|
|
1318
|
+
}
|
|
1319
|
+
return child_gbnf + "{" + std::to_string(p.min_count) + "," + std::to_string(p.max_count) + "}";
|
|
1320
|
+
} else if constexpr (std::is_same_v<T, common_peg_and_parser> || std::is_same_v<T, common_peg_not_parser>) {
|
|
1321
|
+
return ""; // Lookahead not supported in GBNF
|
|
1322
|
+
} else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
|
|
1323
|
+
return ".";
|
|
1324
|
+
} else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
|
|
1325
|
+
return "space";
|
|
1326
|
+
} else if constexpr (std::is_same_v<T, common_peg_chars_parser>) {
|
|
1327
|
+
std::string result = p.pattern;
|
|
1328
|
+
if (p.min_count == 0 && p.max_count == 1) {
|
|
1329
|
+
return result + "?";
|
|
1330
|
+
}
|
|
1331
|
+
if (p.min_count == 0 && p.max_count == -1) {
|
|
1332
|
+
return result + "*";
|
|
1333
|
+
}
|
|
1334
|
+
if (p.min_count == 1 && p.max_count == -1) {
|
|
1335
|
+
return result + "+";
|
|
1336
|
+
}
|
|
1337
|
+
if (p.max_count == -1) {
|
|
1338
|
+
return result + "{" + std::to_string(p.min_count) + ",}";
|
|
1339
|
+
}
|
|
1340
|
+
if (p.min_count == p.max_count) {
|
|
1341
|
+
if (p.min_count == 1) {
|
|
1342
|
+
return result;
|
|
1343
|
+
}
|
|
1344
|
+
return result + "{" + std::to_string(p.min_count) + "}";
|
|
1345
|
+
}
|
|
1346
|
+
return result + "{" + std::to_string(p.min_count) + "," + std::to_string(p.max_count) + "}";
|
|
1347
|
+
} else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
|
|
1348
|
+
return R"(( [^"\\] | "\\" ( ["\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)";
|
|
1349
|
+
} else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
|
|
1350
|
+
if (p.delimiters.empty()) {
|
|
1351
|
+
return ".*";
|
|
1352
|
+
}
|
|
1353
|
+
return gbnf_excluding_pattern(p.delimiters);
|
|
1354
|
+
} else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
|
|
1355
|
+
if (p.schema) {
|
|
1356
|
+
if (p.raw && p.schema->contains("type") && p.schema->at("type").is_string() && p.schema->at("type") == "string") {
|
|
1357
|
+
// TODO: Implement more comprehensive grammar generation for raw strings.
|
|
1358
|
+
// For now, use the grammar emitted from the underlying parser.
|
|
1359
|
+
return to_gbnf(p.child);
|
|
1360
|
+
}
|
|
1361
|
+
return builder.add_schema(p.name, *p.schema);
|
|
1362
|
+
}
|
|
1363
|
+
return to_gbnf(p.child);
|
|
1364
|
+
} else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
|
|
1365
|
+
return p.name;
|
|
1366
|
+
} else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
|
|
1367
|
+
// Refs should not exist after flattening, but kept just in case
|
|
1368
|
+
return p.name;
|
|
1369
|
+
} else if constexpr (std::is_same_v<T, common_peg_tag_parser>) {
|
|
1370
|
+
return to_gbnf(p.child);
|
|
1371
|
+
} else if constexpr (std::is_same_v<T, common_peg_atomic_parser>) {
|
|
1372
|
+
return to_gbnf(p.child);
|
|
1373
|
+
} else {
|
|
1374
|
+
static_assert(is_always_false_v<T>);
|
|
1375
|
+
}
|
|
1376
|
+
}, parser);
|
|
1377
|
+
};
|
|
1378
|
+
|
|
1379
|
+
// Collect reachable rules
|
|
1380
|
+
std::unordered_set<std::string> reachable_rules;
|
|
1381
|
+
|
|
1382
|
+
if (lazy) {
|
|
1383
|
+
// Collect rules reachable from trigger rules
|
|
1384
|
+
for (const auto & [name, id] : rules_) {
|
|
1385
|
+
const auto & parser = parsers_.at(id);
|
|
1386
|
+
if (auto rule = std::get_if<common_peg_rule_parser>(&parser)) {
|
|
1387
|
+
if (rule->trigger) {
|
|
1388
|
+
// Mark trigger as reachable and visit it
|
|
1389
|
+
reachable_rules.insert(name);
|
|
1390
|
+
auto add_rules = collect_reachable_rules(*this, id);
|
|
1391
|
+
reachable_rules.insert(add_rules.begin(), add_rules.end());
|
|
1392
|
+
}
|
|
1393
|
+
}
|
|
1394
|
+
}
|
|
1395
|
+
} else {
|
|
1396
|
+
// Collect rules reachable from root
|
|
1397
|
+
reachable_rules = collect_reachable_rules(*this, root_);
|
|
1398
|
+
}
|
|
1399
|
+
|
|
1400
|
+
// Create GBNF rules for all reachable rules
|
|
1401
|
+
for (const auto & [name, rule_id] : rules_) {
|
|
1402
|
+
if (reachable_rules.find(name) == reachable_rules.end()) {
|
|
1403
|
+
continue;
|
|
1404
|
+
}
|
|
1405
|
+
|
|
1406
|
+
const auto & parser = parsers_.at(rule_id);
|
|
1407
|
+
if (auto rule = std::get_if<common_peg_rule_parser>(&parser)) {
|
|
1408
|
+
builder.add_rule(rule->name, to_gbnf(rule->child));
|
|
1409
|
+
}
|
|
1410
|
+
}
|
|
1411
|
+
|
|
1412
|
+
if (lazy) {
|
|
1413
|
+
// Generate root rule from trigger rules only
|
|
1414
|
+
std::vector<std::string> trigger_names;
|
|
1415
|
+
for (const auto & [name, rule_id] : rules_) {
|
|
1416
|
+
const auto & parser = parsers_.at(rule_id);
|
|
1417
|
+
if (auto rule = std::get_if<common_peg_rule_parser>(&parser)) {
|
|
1418
|
+
if (rule->trigger) {
|
|
1419
|
+
trigger_names.push_back(rule->name);
|
|
1420
|
+
}
|
|
1421
|
+
}
|
|
1422
|
+
}
|
|
1423
|
+
|
|
1424
|
+
// Sort for predictable order
|
|
1425
|
+
std::sort(trigger_names.begin(), trigger_names.end());
|
|
1426
|
+
builder.add_rule("root", string_join(trigger_names, " | "));
|
|
1427
|
+
} else if (root_ != COMMON_PEG_INVALID_PARSER_ID) {
|
|
1428
|
+
builder.add_rule("root", to_gbnf(root_));
|
|
1429
|
+
}
|
|
1430
|
+
}
|
|
1431
|
+
|
|
1432
|
+
static nlohmann::json serialize_parser_variant(const common_peg_parser_variant & variant) {
|
|
1433
|
+
using json = nlohmann::json;
|
|
1434
|
+
|
|
1435
|
+
return std::visit([](const auto & p) -> json {
|
|
1436
|
+
using T = std::decay_t<decltype(p)>;
|
|
1437
|
+
|
|
1438
|
+
if constexpr (std::is_same_v<T, common_peg_epsilon_parser>) {
|
|
1439
|
+
return json{{"type", "epsilon"}};
|
|
1440
|
+
} else if constexpr (std::is_same_v<T, common_peg_start_parser>) {
|
|
1441
|
+
return json{{"type", "start"}};
|
|
1442
|
+
} else if constexpr (std::is_same_v<T, common_peg_end_parser>) {
|
|
1443
|
+
return json{{"type", "end"}};
|
|
1444
|
+
} else if constexpr (std::is_same_v<T, common_peg_literal_parser>) {
|
|
1445
|
+
return json{{"type", "literal"}, {"literal", p.literal}};
|
|
1446
|
+
} else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
|
|
1447
|
+
return json{{"type", "sequence"}, {"children", p.children}};
|
|
1448
|
+
} else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
|
|
1449
|
+
return json{{"type", "choice"}, {"children", p.children}};
|
|
1450
|
+
} else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
|
|
1451
|
+
return json{
|
|
1452
|
+
{"type", "repetition"},
|
|
1453
|
+
{"child", p.child},
|
|
1454
|
+
{"min_count", p.min_count},
|
|
1455
|
+
{"max_count", p.max_count}
|
|
1456
|
+
};
|
|
1457
|
+
} else if constexpr (std::is_same_v<T, common_peg_and_parser>) {
|
|
1458
|
+
return json{{"type", "and"}, {"child", p.child}};
|
|
1459
|
+
} else if constexpr (std::is_same_v<T, common_peg_not_parser>) {
|
|
1460
|
+
return json{{"type", "not"}, {"child", p.child}};
|
|
1461
|
+
} else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
|
|
1462
|
+
return json{{"type", "any"}};
|
|
1463
|
+
} else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
|
|
1464
|
+
return json{{"type", "space"}};
|
|
1465
|
+
} else if constexpr (std::is_same_v<T, common_peg_chars_parser>) {
|
|
1466
|
+
json ranges = json::array();
|
|
1467
|
+
for (const auto & range : p.ranges) {
|
|
1468
|
+
ranges.push_back({{"start", range.start}, {"end", range.end}});
|
|
1469
|
+
}
|
|
1470
|
+
return json{
|
|
1471
|
+
{"type", "chars"},
|
|
1472
|
+
{"pattern", p.pattern},
|
|
1473
|
+
{"ranges", ranges},
|
|
1474
|
+
{"negated", p.negated},
|
|
1475
|
+
{"min_count", p.min_count},
|
|
1476
|
+
{"max_count", p.max_count}
|
|
1477
|
+
};
|
|
1478
|
+
} else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
|
|
1479
|
+
return json{{"type", "json_string"}};
|
|
1480
|
+
} else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
|
|
1481
|
+
return json{{"type", "until"}, {"delimiters", p.delimiters}};
|
|
1482
|
+
} else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
|
|
1483
|
+
return json{
|
|
1484
|
+
{"type", "schema"},
|
|
1485
|
+
{"child", p.child},
|
|
1486
|
+
{"name", p.name},
|
|
1487
|
+
{"schema", p.schema ? *p.schema : nullptr},
|
|
1488
|
+
{"raw", p.raw}
|
|
1489
|
+
};
|
|
1490
|
+
} else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
|
|
1491
|
+
return json{
|
|
1492
|
+
{"type", "rule"},
|
|
1493
|
+
{"name", p.name},
|
|
1494
|
+
{"child", p.child},
|
|
1495
|
+
{"trigger", p.trigger}
|
|
1496
|
+
};
|
|
1497
|
+
} else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
|
|
1498
|
+
return json{{"type", "ref"}, {"name", p.name}};
|
|
1499
|
+
} else if constexpr (std::is_same_v<T, common_peg_atomic_parser>) {
|
|
1500
|
+
return json{{"type", "atomic"}, {"child", p.child}};
|
|
1501
|
+
} else if constexpr (std::is_same_v<T, common_peg_tag_parser>) {
|
|
1502
|
+
return json{
|
|
1503
|
+
{"type", "tag"},
|
|
1504
|
+
{"child", p.child},
|
|
1505
|
+
{"tag", p.tag}
|
|
1506
|
+
};
|
|
1507
|
+
}
|
|
1508
|
+
}, variant);
|
|
1509
|
+
}
|
|
1510
|
+
|
|
1511
|
+
nlohmann::json common_peg_arena::to_json() const {
|
|
1512
|
+
auto parsers = nlohmann::json::array();
|
|
1513
|
+
for (const auto & parser : parsers_) {
|
|
1514
|
+
parsers.push_back(serialize_parser_variant(parser));
|
|
1515
|
+
}
|
|
1516
|
+
return nlohmann::json{
|
|
1517
|
+
{"parsers", parsers},
|
|
1518
|
+
{"rules", rules_},
|
|
1519
|
+
{"root", root_}
|
|
1520
|
+
};
|
|
1521
|
+
}
|
|
1522
|
+
|
|
1523
|
+
static common_peg_parser_variant deserialize_parser_variant(const nlohmann::json & j) {
|
|
1524
|
+
if (!j.contains("type") || !j["type"].is_string()) {
|
|
1525
|
+
throw std::runtime_error("Parser variant JSON missing or invalid 'type' field");
|
|
1526
|
+
}
|
|
1527
|
+
|
|
1528
|
+
std::string type = j["type"];
|
|
1529
|
+
|
|
1530
|
+
if (type == "epsilon") {
|
|
1531
|
+
return common_peg_epsilon_parser{};
|
|
1532
|
+
}
|
|
1533
|
+
if (type == "start") {
|
|
1534
|
+
return common_peg_start_parser{};
|
|
1535
|
+
}
|
|
1536
|
+
if (type == "end") {
|
|
1537
|
+
return common_peg_end_parser{};
|
|
1538
|
+
}
|
|
1539
|
+
if (type == "literal") {
|
|
1540
|
+
if (!j.contains("literal") || !j["literal"].is_string()) {
|
|
1541
|
+
throw std::runtime_error("literal parser missing or invalid 'literal' field");
|
|
1542
|
+
}
|
|
1543
|
+
return common_peg_literal_parser{j["literal"]};
|
|
1544
|
+
}
|
|
1545
|
+
if (type == "sequence") {
|
|
1546
|
+
if (!j.contains("children") || !j["children"].is_array()) {
|
|
1547
|
+
throw std::runtime_error("sequence parser missing or invalid 'children' field");
|
|
1548
|
+
}
|
|
1549
|
+
return common_peg_sequence_parser{j["children"].get<std::vector<common_peg_parser_id>>()};
|
|
1550
|
+
}
|
|
1551
|
+
if (type == "choice") {
|
|
1552
|
+
if (!j.contains("children") || !j["children"].is_array()) {
|
|
1553
|
+
throw std::runtime_error("choice parser missing or invalid 'children' field");
|
|
1554
|
+
}
|
|
1555
|
+
return common_peg_choice_parser{j["children"].get<std::vector<common_peg_parser_id>>()};
|
|
1556
|
+
}
|
|
1557
|
+
if (type == "repetition") {
|
|
1558
|
+
if (!j.contains("child") || !j.contains("min_count") || !j.contains("max_count")) {
|
|
1559
|
+
throw std::runtime_error("repetition parser missing required fields");
|
|
1560
|
+
}
|
|
1561
|
+
return common_peg_repetition_parser{
|
|
1562
|
+
j["child"].get<common_peg_parser_id>(),
|
|
1563
|
+
j["min_count"].get<int>(),
|
|
1564
|
+
j["max_count"].get<int>()
|
|
1565
|
+
};
|
|
1566
|
+
}
|
|
1567
|
+
if (type == "and") {
|
|
1568
|
+
if (!j.contains("child")) {
|
|
1569
|
+
throw std::runtime_error("and parser missing 'child' field");
|
|
1570
|
+
}
|
|
1571
|
+
return common_peg_and_parser{j["child"].get<common_peg_parser_id>()};
|
|
1572
|
+
}
|
|
1573
|
+
if (type == "not") {
|
|
1574
|
+
if (!j.contains("child")) {
|
|
1575
|
+
throw std::runtime_error("not parser missing 'child' field");
|
|
1576
|
+
}
|
|
1577
|
+
return common_peg_not_parser{j["child"].get<common_peg_parser_id>()};
|
|
1578
|
+
}
|
|
1579
|
+
if (type == "any") {
|
|
1580
|
+
return common_peg_any_parser{};
|
|
1581
|
+
}
|
|
1582
|
+
if (type == "space") {
|
|
1583
|
+
return common_peg_space_parser{};
|
|
1584
|
+
}
|
|
1585
|
+
if (type == "chars") {
|
|
1586
|
+
if (!j.contains("pattern") || !j.contains("ranges") || !j.contains("negated") ||
|
|
1587
|
+
!j.contains("min_count") || !j.contains("max_count")) {
|
|
1588
|
+
throw std::runtime_error("chars parser missing required fields");
|
|
1589
|
+
}
|
|
1590
|
+
common_peg_chars_parser parser;
|
|
1591
|
+
parser.pattern = j["pattern"];
|
|
1592
|
+
parser.negated = j["negated"];
|
|
1593
|
+
parser.min_count = j["min_count"];
|
|
1594
|
+
parser.max_count = j["max_count"];
|
|
1595
|
+
for (const auto & range_json : j["ranges"]) {
|
|
1596
|
+
if (!range_json.contains("start") || !range_json.contains("end")) {
|
|
1597
|
+
throw std::runtime_error("char_range missing 'start' or 'end' field");
|
|
1598
|
+
}
|
|
1599
|
+
parser.ranges.push_back({
|
|
1600
|
+
range_json["start"].get<uint32_t>(),
|
|
1601
|
+
range_json["end"].get<uint32_t>()
|
|
1602
|
+
});
|
|
1603
|
+
}
|
|
1604
|
+
return parser;
|
|
1605
|
+
}
|
|
1606
|
+
if (type == "json_string") {
|
|
1607
|
+
return common_peg_json_string_parser{};
|
|
1608
|
+
}
|
|
1609
|
+
if (type == "until") {
|
|
1610
|
+
if (!j.contains("delimiters") || !j["delimiters"].is_array()) {
|
|
1611
|
+
throw std::runtime_error("until parser missing or invalid 'delimiters' field");
|
|
1612
|
+
}
|
|
1613
|
+
return common_peg_until_parser{j["delimiters"].get<std::vector<std::string>>()};
|
|
1614
|
+
}
|
|
1615
|
+
if (type == "schema") {
|
|
1616
|
+
if (!j.contains("child") || !j.contains("name") || !j.contains("schema") || !j.contains("raw")) {
|
|
1617
|
+
throw std::runtime_error("schema parser missing required fields");
|
|
1618
|
+
}
|
|
1619
|
+
common_peg_schema_parser parser;
|
|
1620
|
+
parser.child = j["child"].get<common_peg_parser_id>();
|
|
1621
|
+
parser.name = j["name"];
|
|
1622
|
+
if (!j["schema"].is_null()) {
|
|
1623
|
+
parser.schema = std::make_shared<nlohmann::ordered_json>(j["schema"]);
|
|
1624
|
+
}
|
|
1625
|
+
parser.raw = j["raw"].get<bool>();
|
|
1626
|
+
return parser;
|
|
1627
|
+
}
|
|
1628
|
+
if (type == "rule") {
|
|
1629
|
+
if (!j.contains("name") || !j.contains("child") || !j.contains("trigger")) {
|
|
1630
|
+
throw std::runtime_error("rule parser missing required fields");
|
|
1631
|
+
}
|
|
1632
|
+
return common_peg_rule_parser{
|
|
1633
|
+
j["name"].get<std::string>(),
|
|
1634
|
+
j["child"].get<common_peg_parser_id>(),
|
|
1635
|
+
j["trigger"].get<bool>()
|
|
1636
|
+
};
|
|
1637
|
+
}
|
|
1638
|
+
if (type == "ref") {
|
|
1639
|
+
if (!j.contains("name") || !j["name"].is_string()) {
|
|
1640
|
+
throw std::runtime_error("ref parser missing or invalid 'name' field");
|
|
1641
|
+
}
|
|
1642
|
+
return common_peg_ref_parser{j["name"]};
|
|
1643
|
+
}
|
|
1644
|
+
if (type == "atomic") {
|
|
1645
|
+
if (!j.contains("child")) {
|
|
1646
|
+
throw std::runtime_error("tag parser missing required fields");
|
|
1647
|
+
}
|
|
1648
|
+
return common_peg_atomic_parser{
|
|
1649
|
+
j["child"].get<common_peg_parser_id>(),
|
|
1650
|
+
};
|
|
1651
|
+
}
|
|
1652
|
+
if (type == "tag") {
|
|
1653
|
+
if (!j.contains("child") || !j.contains("tag")) {
|
|
1654
|
+
throw std::runtime_error("tag parser missing required fields");
|
|
1655
|
+
}
|
|
1656
|
+
return common_peg_tag_parser{
|
|
1657
|
+
j["child"].get<common_peg_parser_id>(),
|
|
1658
|
+
j["tag"].get<std::string>(),
|
|
1659
|
+
};
|
|
1660
|
+
}
|
|
1661
|
+
|
|
1662
|
+
throw std::runtime_error("Unknown parser type: " + type);
|
|
1663
|
+
}
|
|
1664
|
+
|
|
1665
|
+
common_peg_arena common_peg_arena::from_json(const nlohmann::json & j) {
|
|
1666
|
+
if (!j.contains("parsers") || !j["parsers"].is_array()) {
|
|
1667
|
+
throw std::runtime_error("JSON missing or invalid 'parsers' array");
|
|
1668
|
+
}
|
|
1669
|
+
if (!j.contains("rules") || !j["rules"].is_object()) {
|
|
1670
|
+
throw std::runtime_error("JSON missing or invalid 'rules' object");
|
|
1671
|
+
}
|
|
1672
|
+
if (!j.contains("root")) {
|
|
1673
|
+
throw std::runtime_error("JSON missing 'root' field");
|
|
1674
|
+
}
|
|
1675
|
+
|
|
1676
|
+
common_peg_arena arena;
|
|
1677
|
+
|
|
1678
|
+
const auto & parsers_json = j["parsers"];
|
|
1679
|
+
arena.parsers_.reserve(parsers_json.size());
|
|
1680
|
+
for (const auto & parser_json : parsers_json) {
|
|
1681
|
+
arena.parsers_.push_back(deserialize_parser_variant(parser_json));
|
|
1682
|
+
}
|
|
1683
|
+
|
|
1684
|
+
arena.rules_ = j["rules"].get<std::unordered_map<std::string, common_peg_parser_id>>();
|
|
1685
|
+
|
|
1686
|
+
for (const auto & [name, id] : arena.rules_) {
|
|
1687
|
+
if (id >= arena.parsers_.size()) {
|
|
1688
|
+
throw std::runtime_error("Rule '" + name + "' references invalid parser ID: " + std::to_string(id));
|
|
1689
|
+
}
|
|
1690
|
+
}
|
|
1691
|
+
|
|
1692
|
+
arena.root_ = j["root"].get<common_peg_parser_id>();
|
|
1693
|
+
if (arena.root_ != COMMON_PEG_INVALID_PARSER_ID && arena.root_ >= arena.parsers_.size()) {
|
|
1694
|
+
throw std::runtime_error("Root references invalid parser ID: " + std::to_string(arena.root_));
|
|
1695
|
+
}
|
|
1696
|
+
|
|
1697
|
+
return arena;
|
|
1698
|
+
}
|
|
1699
|
+
|
|
1700
|
+
std::string common_peg_arena::save() const {
|
|
1701
|
+
return to_json().dump();
|
|
1702
|
+
}
|
|
1703
|
+
|
|
1704
|
+
void common_peg_arena::load(const std::string & data) {
|
|
1705
|
+
*this = from_json(nlohmann::json::parse(data));
|
|
1706
|
+
}
|
|
1707
|
+
|
|
1708
|
+
common_peg_arena build_peg_parser(const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn) {
|
|
1709
|
+
common_peg_parser_builder builder;
|
|
1710
|
+
builder.set_root(fn(builder));
|
|
1711
|
+
return builder.build();
|
|
1712
|
+
}
|