edn_turbo 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +16 -40
- data/ext/edn_turbo/edn_parser.cc +1651 -1529
- data/ext/edn_turbo/edn_parser.h +15 -12
- data/ext/edn_turbo/edn_parser.rl +375 -327
- data/ext/edn_turbo/edn_parser_unicode.cc +1 -1
- data/ext/edn_turbo/{edn_parser_def.cc → edn_parser_util.cc} +23 -15
- data/lib/edn_turbo/version.rb +2 -2
- data/test/test_output_diff.rb +1 -1
- metadata +3 -3
data/ext/edn_turbo/edn_parser.rl
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
#include <rice/Hash.hpp>
|
6
6
|
#include <rice/Array.hpp>
|
7
7
|
#include <rice/to_from_ruby.hpp>
|
8
|
+
#include <rice/Exception.hpp>
|
8
9
|
|
9
10
|
#include "edn_parser.h"
|
10
11
|
|
@@ -49,16 +50,19 @@
|
|
49
50
|
exp = ([Ee] [+\-]? digit+);
|
50
51
|
|
51
52
|
|
53
|
+
# common actions
|
52
54
|
action close_err {
|
53
55
|
std::stringstream s;
|
54
56
|
s << "unterminated " << EDN_TYPE;
|
55
57
|
error(__FUNCTION__, s.str());
|
56
58
|
fhold; fbreak;
|
57
59
|
}
|
60
|
+
|
61
|
+
action exit { fhold; fbreak; }
|
58
62
|
}%%
|
59
63
|
|
60
64
|
// ============================================================
|
61
|
-
// machine for parsing various types
|
65
|
+
// machine for parsing various EDN token types
|
62
66
|
//
|
63
67
|
|
64
68
|
%%{
|
@@ -67,46 +71,20 @@
|
|
67
71
|
|
68
72
|
write data;
|
69
73
|
|
70
|
-
action parse_dispatch {
|
71
|
-
const char *np = parse_dispatch(fpc + 1, pe, o);
|
72
|
-
if (np == NULL) { fhold; fbreak; } else fexec np;
|
73
|
-
}
|
74
|
-
|
75
|
-
action parse_char {
|
76
|
-
const char *np = parse_esc_char(fpc, pe, o);
|
77
|
-
if (np == NULL) { fhold; fbreak; } else fexec np;
|
78
|
-
}
|
79
|
-
|
80
74
|
action parse_string {
|
75
|
+
// string types within double-quotes
|
81
76
|
const char *np = parse_string(fpc, pe, o);
|
82
77
|
if (np == NULL) { fhold; fbreak; } else fexec np;
|
83
78
|
}
|
84
79
|
|
85
80
|
action parse_keyword {
|
81
|
+
// tokens with a leading ':'
|
86
82
|
const char *np = parse_keyword(fpc, pe, o);
|
87
83
|
if (np == NULL) { fhold; fbreak; } else fexec np;
|
88
84
|
}
|
89
85
|
|
90
|
-
action parse_operator {
|
91
|
-
const char *np = parse_operator(fpc, pe, o);
|
92
|
-
if (np == NULL) { fhold; fbreak; } else fexec np;
|
93
|
-
}
|
94
|
-
|
95
|
-
action parse_symbol {
|
96
|
-
std::string sym;
|
97
|
-
const char *np = parse_symbol(fpc, pe, sym);
|
98
|
-
if (np == NULL) { fhold; fbreak; } else {
|
99
|
-
if (sym == "true") { o = Qtrue; }
|
100
|
-
else if (sym == "false") { o = Qfalse; }
|
101
|
-
else if (sym == "nil") { o = Qnil; }
|
102
|
-
else {
|
103
|
-
o = Parser::make_edn_symbol(sym);
|
104
|
-
}
|
105
|
-
fexec np;
|
106
|
-
}
|
107
|
-
}
|
108
|
-
|
109
86
|
action parse_number {
|
87
|
+
// tokens w/ leading digits: non-negative integers & decimals.
|
110
88
|
// try to parse a decimal first
|
111
89
|
const char *np = parse_decimal(fpc, pe, o);
|
112
90
|
if (np == NULL) {
|
@@ -125,41 +103,75 @@
|
|
125
103
|
}
|
126
104
|
}
|
127
105
|
|
106
|
+
action parse_operator {
|
107
|
+
// stand-alone operators *, +, -, etc.
|
108
|
+
const char *np = parse_operator(fpc, pe, o);
|
109
|
+
if (np == NULL) { fhold; fbreak; } else fexec np;
|
110
|
+
}
|
111
|
+
|
112
|
+
action parse_char {
|
113
|
+
// tokens w/ leading \ (escaped characters \newline, \c, etc.)
|
114
|
+
const char *np = parse_esc_char(fpc, pe, o);
|
115
|
+
if (np == NULL) { fhold; fbreak; } else fexec np;
|
116
|
+
}
|
117
|
+
|
118
|
+
action parse_symbol {
|
119
|
+
// user identifiers and reserved keywords (true, false, nil)
|
120
|
+
std::string sym;
|
121
|
+
const char *np = parse_symbol(fpc, pe, sym);
|
122
|
+
if (np == NULL) { fhold; fbreak; } else {
|
123
|
+
if (sym == "true") { o = Qtrue; }
|
124
|
+
else if (sym == "false") { o = Qfalse; }
|
125
|
+
else if (sym == "nil") { o = Qnil; }
|
126
|
+
else {
|
127
|
+
o = Parser::make_edn_symbol(sym);
|
128
|
+
}
|
129
|
+
fexec np;
|
130
|
+
}
|
131
|
+
}
|
132
|
+
|
128
133
|
action parse_vector {
|
134
|
+
// [
|
129
135
|
const char *np = parse_vector(fpc, pe, o);
|
130
136
|
if (np == NULL) { fhold; fbreak; } else fexec np;
|
131
137
|
}
|
132
138
|
|
133
139
|
action parse_list {
|
140
|
+
// (
|
134
141
|
const char *np = parse_list(fpc, pe, o);
|
135
142
|
if (np == NULL) { fhold; fbreak; } else fexec np;
|
136
143
|
}
|
137
144
|
|
138
145
|
action parse_map {
|
146
|
+
// {
|
139
147
|
const char *np = parse_map(fpc, pe, o);
|
140
148
|
if (np == NULL) { fhold; fbreak; } else fexec np;
|
141
149
|
}
|
142
150
|
|
143
|
-
action
|
151
|
+
action parse_dispatch {
|
152
|
+
// handles tokens w/ leading # ("#_", "#{", and tagged elems)
|
153
|
+
const char *np = parse_dispatch(fpc + 1, pe, o);
|
154
|
+
if (np == NULL) { fhold; fbreak; } else fexec np;
|
155
|
+
}
|
156
|
+
|
144
157
|
|
145
158
|
main := (
|
146
|
-
begin_dispatch >parse_dispatch |
|
147
|
-
begin_char >parse_char |
|
148
159
|
string_delim >parse_string |
|
149
160
|
begin_keyword >parse_keyword |
|
161
|
+
begin_number >parse_number |
|
150
162
|
operators >parse_operator |
|
163
|
+
begin_char >parse_char |
|
151
164
|
begin_symbol >parse_symbol |
|
152
|
-
begin_number >parse_number |
|
153
165
|
begin_vector >parse_vector |
|
154
166
|
begin_list >parse_list |
|
155
|
-
begin_map >parse_map
|
167
|
+
begin_map >parse_map |
|
168
|
+
begin_dispatch >parse_dispatch
|
156
169
|
) %*exit;
|
157
170
|
}%%
|
158
171
|
|
159
172
|
|
160
173
|
const char *edn::Parser::parse_value(const char *p, const char *pe, Rice::Object& o)
|
161
174
|
{
|
162
|
-
//std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
|
163
175
|
int cs;
|
164
176
|
|
165
177
|
%% write init;
|
@@ -179,295 +191,279 @@ const char *edn::Parser::parse_value(const char *p, const char *pe, Rice::Object
|
|
179
191
|
|
180
192
|
|
181
193
|
// ============================================================
|
182
|
-
//
|
194
|
+
// string parsing - incoming string is raw so interpreting utf
|
195
|
+
// encodings & unicode values might be necessary. To optimize things a
|
196
|
+
// bit, we mark the string for encoding if anything outside of the
|
197
|
+
// ascii range is found.
|
183
198
|
//
|
184
199
|
%%{
|
185
|
-
machine
|
200
|
+
machine EDN_string;
|
186
201
|
include EDN_common;
|
187
202
|
|
188
203
|
write data;
|
189
204
|
|
190
|
-
action
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
o = Parser::make_edn_symbol(sym);
|
196
|
-
fexec np;
|
197
|
-
}
|
198
|
-
}
|
199
|
-
|
200
|
-
action parse_number {
|
201
|
-
// parse a number with the leading symbol - this is slightly
|
202
|
-
// different than the one within EDN_value since it includes
|
203
|
-
// the leading - or +
|
204
|
-
//
|
205
|
-
// try to parse a decimal first
|
206
|
-
const char *np = parse_decimal(p_save, pe, o);
|
207
|
-
if (np == NULL) {
|
208
|
-
// if we can't, try to parse it as an int
|
209
|
-
np = parse_integer(p_save, pe, o);
|
210
|
-
}
|
211
|
-
|
212
|
-
if (np) {
|
213
|
-
fexec np;
|
214
|
-
fhold;
|
215
|
-
fbreak;
|
216
|
-
}
|
217
|
-
else {
|
218
|
-
error(__FUNCTION__, *p);
|
219
|
-
fexec pe;
|
205
|
+
action parse_string {
|
206
|
+
if (Parser::parse_byte_stream(p_save + 1, p, s, encode)) {
|
207
|
+
fexec p + 1;
|
208
|
+
} else {
|
209
|
+
fhold; fbreak;
|
220
210
|
}
|
221
211
|
}
|
222
212
|
|
223
|
-
action
|
224
|
-
|
225
|
-
std::string sym;
|
226
|
-
sym += *(fpc - 1);
|
227
|
-
o = Parser::make_edn_symbol(sym);
|
213
|
+
action mark_for_encoding {
|
214
|
+
encode = true;
|
228
215
|
}
|
229
216
|
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
217
|
+
main := string_delim (
|
218
|
+
(^([\"\\] | 0..0x1f | 0xc2..0xf5) |
|
219
|
+
((0xc2..0xf5) |
|
220
|
+
'\\'[\"\\/bfnrt] |
|
221
|
+
'\\u'[0-9a-fA-F]{4}) $mark_for_encoding |
|
222
|
+
'\\'^([\"\\/bfnrtu]|0..0x1f))* %parse_string
|
223
|
+
) :>> string_delim @err(close_err) @exit;
|
237
224
|
}%%
|
238
225
|
|
239
226
|
|
240
|
-
const char* edn::Parser::
|
227
|
+
const char* edn::Parser::parse_string(const char *p, const char *pe, Rice::Object& o)
|
241
228
|
{
|
242
229
|
// std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
|
230
|
+
static const char* EDN_TYPE = "string";
|
243
231
|
int cs;
|
244
|
-
|
232
|
+
bool encode = false;
|
233
|
+
const char *eof = pe;
|
234
|
+
Rice::String s;
|
245
235
|
|
246
236
|
%% write init;
|
247
237
|
p_save = p;
|
248
238
|
%% write exec;
|
249
239
|
|
250
|
-
if (cs >=
|
251
|
-
|
240
|
+
if (cs >= EDN_string_first_final) {
|
241
|
+
o = s;
|
242
|
+
return p + 1;
|
252
243
|
}
|
253
|
-
else if (cs ==
|
254
|
-
error(__FUNCTION__, *p);
|
244
|
+
else if (cs == EDN_string_error) {
|
255
245
|
return pe;
|
256
246
|
}
|
257
|
-
else if (cs ==
|
247
|
+
else if (cs == EDN_string_en_main) {} // silence ragel warning
|
258
248
|
return NULL;
|
259
249
|
}
|
260
250
|
|
261
251
|
|
262
252
|
|
263
253
|
// ============================================================
|
264
|
-
//
|
254
|
+
// keyword parsing
|
265
255
|
//
|
266
256
|
%%{
|
267
|
-
machine
|
257
|
+
machine EDN_keyword;
|
268
258
|
include EDN_common;
|
269
259
|
|
270
|
-
|
260
|
+
keyword_chars = symbol_chars | operators;
|
261
|
+
keyword_start = symbol_start | [\#\./];
|
271
262
|
|
272
|
-
|
263
|
+
keyword_name = keyword_start (keyword_chars)*;
|
273
264
|
|
274
|
-
|
265
|
+
write data;
|
275
266
|
|
276
|
-
|
277
|
-
|
278
|
-
) (^(valid_chars | '\\')? @exit);
|
267
|
+
|
268
|
+
main := begin_keyword keyword_name (^keyword_chars? @exit);
|
279
269
|
}%%
|
280
270
|
|
281
271
|
|
282
|
-
const char* edn::Parser::
|
272
|
+
const char* edn::Parser::parse_keyword(const char *p, const char *pe, Rice::Object& o)
|
283
273
|
{
|
284
|
-
//std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
|
285
274
|
int cs;
|
286
275
|
|
287
276
|
%% write init;
|
288
277
|
p_save = p;
|
289
278
|
%% write exec;
|
290
279
|
|
291
|
-
if (cs >=
|
292
|
-
|
293
|
-
|
294
|
-
|
280
|
+
if (cs >= EDN_keyword_first_final) {
|
281
|
+
std::string buf;
|
282
|
+
uint32_t len = p - p_save;
|
283
|
+
// don't include leading ':' because Rice::Symbol will handle it
|
284
|
+
buf.append(p_save + 1, len - 1);
|
285
|
+
o = Rice::Symbol(buf);
|
295
286
|
return p;
|
296
287
|
}
|
297
|
-
else if (cs ==
|
288
|
+
else if (cs == EDN_keyword_error) {
|
298
289
|
error(__FUNCTION__, *p);
|
299
290
|
return pe;
|
300
291
|
}
|
301
|
-
else if (cs ==
|
292
|
+
else if (cs == EDN_keyword_en_main) {} // silence ragel warning
|
302
293
|
return NULL;
|
303
294
|
}
|
304
295
|
|
305
296
|
|
306
297
|
|
307
|
-
|
308
298
|
// ============================================================
|
309
|
-
//
|
299
|
+
// decimal parsing machine
|
310
300
|
//
|
311
301
|
%%{
|
312
|
-
machine
|
302
|
+
machine EDN_decimal;
|
313
303
|
include EDN_common;
|
314
304
|
|
315
|
-
write data;
|
305
|
+
write data noerror;
|
316
306
|
|
317
|
-
action exit { fhold; fbreak; }
|
318
307
|
|
319
|
-
main := (
|
320
|
-
|
321
|
-
|
322
|
-
|
308
|
+
main := ('-'|'+')? (
|
309
|
+
(integer '.' digit* (exp? [M]?)) |
|
310
|
+
(integer exp)
|
311
|
+
) (^[0-9Ee.+\-M]? @exit );
|
323
312
|
}%%
|
324
313
|
|
325
314
|
|
326
|
-
const char* edn::Parser::
|
315
|
+
const char* edn::Parser::parse_decimal(const char *p, const char *pe, Rice::Object& o)
|
327
316
|
{
|
328
|
-
// std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
|
329
317
|
int cs;
|
330
318
|
|
331
319
|
%% write init;
|
332
320
|
p_save = p;
|
333
321
|
%% write exec;
|
334
322
|
|
335
|
-
if (cs >=
|
336
|
-
|
337
|
-
|
338
|
-
buf.append(p_save, len);
|
339
|
-
|
340
|
-
s = buf;
|
341
|
-
return p;
|
342
|
-
}
|
343
|
-
else if (cs == EDN_symbol_error) {
|
344
|
-
error(__FUNCTION__, *p);
|
345
|
-
return pe;
|
323
|
+
if (cs >= EDN_decimal_first_final) {
|
324
|
+
o = Parser::float_to_ruby(p_save, p - p_save);
|
325
|
+
return p + 1;
|
346
326
|
}
|
347
|
-
else if (cs ==
|
327
|
+
else if (cs == EDN_decimal_en_main) {} // silence ragel warning
|
348
328
|
return NULL;
|
349
329
|
}
|
350
330
|
|
351
331
|
|
352
|
-
|
353
|
-
|
354
332
|
// ============================================================
|
355
|
-
//
|
333
|
+
// integer parsing machine
|
356
334
|
//
|
357
335
|
%%{
|
358
|
-
machine
|
336
|
+
machine EDN_integer;
|
359
337
|
include EDN_common;
|
360
338
|
|
361
|
-
|
362
|
-
keyword_start = symbol_start | [\#\./];
|
363
|
-
|
364
|
-
keyword_name = keyword_start (keyword_chars)*;
|
365
|
-
|
366
|
-
write data;
|
339
|
+
write data noerror;
|
367
340
|
|
368
|
-
action exit { fhold; fbreak; }
|
369
341
|
|
370
|
-
main :=
|
342
|
+
main := (
|
343
|
+
('-'|'+')? (integer [MN]?)
|
344
|
+
) (^[0-9MN+\-]? @exit);
|
371
345
|
}%%
|
372
346
|
|
373
|
-
|
374
|
-
const char* edn::Parser::parse_keyword(const char *p, const char *pe, Rice::Object& o)
|
347
|
+
const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Object& o)
|
375
348
|
{
|
376
349
|
int cs;
|
377
|
-
// std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
|
378
350
|
|
379
351
|
%% write init;
|
380
352
|
p_save = p;
|
381
353
|
%% write exec;
|
382
354
|
|
383
|
-
if (cs >=
|
384
|
-
|
385
|
-
|
386
|
-
buf.append(p_save + 1, len);
|
387
|
-
o = Rice::Symbol(buf);
|
388
|
-
return p;
|
389
|
-
}
|
390
|
-
else if (cs == EDN_keyword_error) {
|
391
|
-
error(__FUNCTION__, *p);
|
392
|
-
return pe;
|
355
|
+
if (cs >= EDN_integer_first_final) {
|
356
|
+
o = Parser::integer_to_ruby(p_save, p - p_save);
|
357
|
+
return p + 1;
|
393
358
|
}
|
394
|
-
else if (cs ==
|
359
|
+
else if (cs == EDN_integer_en_main) {} // silence ragel warning
|
395
360
|
return NULL;
|
396
361
|
}
|
397
362
|
|
398
363
|
|
399
364
|
|
400
365
|
// ============================================================
|
401
|
-
//
|
366
|
+
// operator parsing - handles tokens w/ a leading operator:
|
367
|
+
//
|
368
|
+
// 1. symbols w/ leading operator: -something, .somethingelse
|
369
|
+
// 2. number values w/ leading - or +
|
370
|
+
// 3. stand-alone operators: +, -, /, *, etc.
|
402
371
|
//
|
403
372
|
%%{
|
404
|
-
machine
|
373
|
+
machine EDN_operator;
|
405
374
|
include EDN_common;
|
406
375
|
|
407
376
|
write data;
|
408
377
|
|
409
|
-
action
|
410
|
-
|
378
|
+
action parse_symbol {
|
379
|
+
// parse a symbol including the leading operator (-, +, .)
|
380
|
+
std::string sym;
|
381
|
+
const char *np = parse_symbol(p_save, pe, sym);
|
382
|
+
if (np == NULL) { fhold; fbreak; } else {
|
383
|
+
o = Parser::make_edn_symbol(sym);
|
384
|
+
fexec np;
|
385
|
+
}
|
386
|
+
}
|
387
|
+
|
388
|
+
action parse_number {
|
389
|
+
// parse a number with the leading symbol - this is slightly
|
390
|
+
// different than the one within EDN_value since it includes
|
391
|
+
// the leading - or +
|
392
|
+
//
|
393
|
+
// try to parse a decimal first
|
394
|
+
const char *np = parse_decimal(p_save, pe, o);
|
395
|
+
if (np == NULL) {
|
396
|
+
// if we can't, try to parse it as an int
|
397
|
+
np = parse_integer(p_save, pe, o);
|
398
|
+
}
|
399
|
+
|
400
|
+
if (np) {
|
401
|
+
fexec np;
|
411
402
|
fhold;
|
412
403
|
fbreak;
|
413
|
-
} else {
|
414
|
-
fexec p + 1;
|
415
404
|
}
|
405
|
+
else {
|
406
|
+
error(__FUNCTION__, *p);
|
407
|
+
fexec pe;
|
408
|
+
}
|
409
|
+
}
|
410
|
+
|
411
|
+
action parse_operator {
|
412
|
+
// stand-alone operators (-, +, /, ... etc)
|
413
|
+
std::string sym;
|
414
|
+
sym += *(p_save);
|
415
|
+
o = Parser::make_edn_symbol(sym);
|
416
416
|
}
|
417
417
|
|
418
|
-
action exit { fhold; fbreak; }
|
419
418
|
|
420
|
-
main :=
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
) :>> string_delim @err(close_err) @exit;
|
419
|
+
main := (
|
420
|
+
('-'|'+'|'.') alpha >parse_symbol |
|
421
|
+
('-'|'+') begin_number >parse_number |
|
422
|
+
operators ignore* >parse_operator
|
423
|
+
) ^(operators|alpha|digit)? @exit;
|
426
424
|
}%%
|
427
425
|
|
428
426
|
|
429
|
-
const char* edn::Parser::
|
427
|
+
const char* edn::Parser::parse_operator(const char *p, const char *pe, Rice::Object& o)
|
430
428
|
{
|
431
|
-
// std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
|
432
|
-
static const char* EDN_TYPE = "string";
|
433
429
|
int cs;
|
434
|
-
const char *eof = pe;
|
435
430
|
|
436
|
-
Rice::String s;
|
437
431
|
%% write init;
|
438
432
|
p_save = p;
|
439
433
|
%% write exec;
|
440
434
|
|
441
|
-
if (cs >=
|
442
|
-
|
443
|
-
return p + 1;
|
435
|
+
if (cs >= EDN_operator_first_final) {
|
436
|
+
return p;
|
444
437
|
}
|
445
|
-
else if (cs ==
|
438
|
+
else if (cs == EDN_operator_error) {
|
439
|
+
error(__FUNCTION__, *p);
|
446
440
|
return pe;
|
447
441
|
}
|
448
|
-
else if (cs ==
|
442
|
+
else if (cs == EDN_operator_en_main) {} // silence ragel warning
|
449
443
|
return NULL;
|
450
444
|
}
|
451
445
|
|
446
|
+
|
447
|
+
|
452
448
|
// ============================================================
|
453
|
-
//
|
449
|
+
// escaped char parsing - handles \c, \newline, \formfeed, etc.
|
454
450
|
//
|
455
451
|
%%{
|
456
|
-
machine
|
452
|
+
machine EDN_escaped_char;
|
457
453
|
include EDN_common;
|
458
454
|
|
459
|
-
write data
|
455
|
+
write data;
|
456
|
+
|
457
|
+
valid_chars = alpha;
|
460
458
|
|
461
|
-
action exit { fhold; fbreak; }
|
462
459
|
|
463
|
-
main := (
|
464
|
-
|
465
|
-
|
466
|
-
) (^[0-9Ee.+\-M]? @exit );
|
460
|
+
main := (
|
461
|
+
begin_char valid_chars+ ignore*
|
462
|
+
) (^(valid_chars | '\\')? @exit);
|
467
463
|
}%%
|
468
464
|
|
469
465
|
|
470
|
-
const char* edn::Parser::
|
466
|
+
const char* edn::Parser::parse_esc_char(const char *p, const char *pe, Rice::Object& o)
|
471
467
|
{
|
472
468
|
int cs;
|
473
469
|
|
@@ -475,32 +471,43 @@ const char* edn::Parser::parse_decimal(const char *p, const char *pe, Rice::Obje
|
|
475
471
|
p_save = p;
|
476
472
|
%% write exec;
|
477
473
|
|
478
|
-
if (cs >=
|
479
|
-
|
480
|
-
|
474
|
+
if (cs >= EDN_escaped_char_first_final) {
|
475
|
+
// convert the escaped value to a character
|
476
|
+
if (!Parser::parse_escaped_char(p_save + 1, p, o)) {
|
477
|
+
return pe;
|
478
|
+
}
|
479
|
+
return p;
|
481
480
|
}
|
482
|
-
else if (cs ==
|
481
|
+
else if (cs == EDN_escaped_char_error) {
|
482
|
+
error(__FUNCTION__, *p);
|
483
|
+
return pe;
|
484
|
+
}
|
485
|
+
else if (cs == EDN_escaped_char_en_main) {} // silence ragel warning
|
483
486
|
return NULL;
|
484
487
|
}
|
485
488
|
|
486
489
|
|
490
|
+
|
491
|
+
|
487
492
|
// ============================================================
|
488
|
-
//
|
493
|
+
// symbol parsing - handles identifiers that begin with an alpha
|
494
|
+
// character and an optional leading operator (name, -today,
|
495
|
+
// .yesterday)
|
489
496
|
//
|
490
497
|
%%{
|
491
|
-
machine
|
498
|
+
machine EDN_symbol;
|
492
499
|
include EDN_common;
|
493
500
|
|
494
|
-
write data
|
501
|
+
write data;
|
495
502
|
|
496
|
-
action exit { fhold; fbreak; }
|
497
503
|
|
498
504
|
main := (
|
499
|
-
|
500
|
-
) (^
|
505
|
+
operators? symbol
|
506
|
+
) ignore* (^(symbol_chars | operators)? @exit);
|
501
507
|
}%%
|
502
508
|
|
503
|
-
|
509
|
+
|
510
|
+
const char* edn::Parser::parse_symbol(const char *p, const char *pe, std::string& sym)
|
504
511
|
{
|
505
512
|
int cs;
|
506
513
|
|
@@ -508,15 +515,22 @@ const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Obje
|
|
508
515
|
p_save = p;
|
509
516
|
%% write exec;
|
510
517
|
|
511
|
-
if (cs >=
|
512
|
-
|
513
|
-
|
518
|
+
if (cs >= EDN_symbol_first_final) {
|
519
|
+
// copy the symbol text
|
520
|
+
sym.clear();
|
521
|
+
sym.append(p_save, p - p_save);
|
522
|
+
return p;
|
514
523
|
}
|
515
|
-
else if (cs ==
|
524
|
+
else if (cs == EDN_symbol_error) {
|
525
|
+
error(__FUNCTION__, *p);
|
526
|
+
return pe;
|
527
|
+
}
|
528
|
+
else if (cs == EDN_symbol_en_main) {} // silence ragel warning
|
516
529
|
return NULL;
|
517
530
|
}
|
518
531
|
|
519
532
|
|
533
|
+
|
520
534
|
// ============================================================
|
521
535
|
// EDN_sequence_common is used to parse EDN containers - elements are
|
522
536
|
// initially stored in a rice array and then the final corresponding
|
@@ -527,30 +541,34 @@ const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Obje
|
|
527
541
|
machine EDN_sequence_common;
|
528
542
|
include EDN_common;
|
529
543
|
|
530
|
-
action
|
531
|
-
|
532
|
-
|
544
|
+
action parse_item {
|
545
|
+
// reads an item within a sequence (vector, list, map, or
|
546
|
+
// set). Regardless of the sequence type, an array of the
|
547
|
+
// items is built. Once done, the sequence parser will convert
|
548
|
+
// if needed
|
549
|
+
Rice::Object e;
|
550
|
+
const char *np = parse_value(fpc, pe, e);
|
533
551
|
if (np == NULL) {
|
534
552
|
fhold; fbreak;
|
535
553
|
} else {
|
536
554
|
// if there's an entry in the discard list, the current
|
537
|
-
// object is not meant to be kept
|
555
|
+
// object is not meant to be kept due to a #_ so don't
|
556
|
+
// push it into the list of elements
|
538
557
|
if (!discard.empty()) {
|
539
558
|
discard.pop();
|
540
559
|
}
|
541
560
|
else {
|
542
|
-
// otherwise we add it to the
|
543
|
-
|
561
|
+
// otherwise we add it to the list of elements for the
|
562
|
+
// corresponding container
|
563
|
+
elems.push(e);
|
544
564
|
}
|
545
565
|
fexec np;
|
546
566
|
}
|
547
567
|
}
|
548
568
|
|
549
|
-
element = begin_value >
|
569
|
+
element = begin_value >parse_item;
|
550
570
|
next_element = ignore* element;
|
551
571
|
sequence = ((element ignore*) (next_element ignore*)*);
|
552
|
-
|
553
|
-
action exit { fhold; fbreak; }
|
554
572
|
}%%
|
555
573
|
|
556
574
|
//
|
@@ -575,17 +593,16 @@ const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Obje
|
|
575
593
|
//
|
576
594
|
const char* edn::Parser::parse_vector(const char *p, const char *pe, Rice::Object& o)
|
577
595
|
{
|
578
|
-
//std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
|
579
596
|
static const char* EDN_TYPE = "vector";
|
580
597
|
|
581
598
|
int cs;
|
582
|
-
Rice::Array
|
599
|
+
Rice::Array elems; // will store the vector's elements
|
583
600
|
|
584
601
|
%% write init;
|
585
602
|
%% write exec;
|
586
603
|
|
587
604
|
if (cs >= EDN_vector_first_final) {
|
588
|
-
o =
|
605
|
+
o = elems;
|
589
606
|
return p + 1;
|
590
607
|
}
|
591
608
|
else if (cs == EDN_vector_error) {
|
@@ -622,13 +639,13 @@ const char* edn::Parser::parse_list(const char *p, const char *pe, Rice::Object&
|
|
622
639
|
static const char* EDN_TYPE = "list";
|
623
640
|
|
624
641
|
int cs;
|
625
|
-
Rice::Array
|
642
|
+
Rice::Array elems;
|
626
643
|
|
627
644
|
%% write init;
|
628
645
|
%% write exec;
|
629
646
|
|
630
647
|
if (cs >= EDN_list_first_final) {
|
631
|
-
o =
|
648
|
+
o = elems;
|
632
649
|
return p + 1;
|
633
650
|
}
|
634
651
|
else if (cs == EDN_list_error) {
|
@@ -642,161 +659,171 @@ const char* edn::Parser::parse_list(const char *p, const char *pe, Rice::Object&
|
|
642
659
|
|
643
660
|
|
644
661
|
// ============================================================
|
645
|
-
//
|
662
|
+
// hash parsing
|
646
663
|
//
|
647
664
|
%%{
|
648
|
-
machine
|
665
|
+
machine EDN_map;
|
649
666
|
include EDN_sequence_common;
|
650
667
|
|
668
|
+
end_map = '}';
|
669
|
+
|
651
670
|
write data;
|
652
671
|
|
653
|
-
begin_set = '{';
|
654
|
-
end_set = '}';
|
655
672
|
|
656
|
-
main :=
|
657
|
-
ignore* sequence? :>>
|
673
|
+
main := begin_map (
|
674
|
+
ignore* (sequence)? :>> end_map
|
658
675
|
) @err(close_err) @exit;
|
659
676
|
}%%
|
660
677
|
|
661
|
-
|
662
|
-
|
663
|
-
//
|
664
|
-
const char* edn::Parser::parse_set(const char *p, const char *pe, Rice::Object& o)
|
678
|
+
|
679
|
+
const char* edn::Parser::parse_map(const char *p, const char *pe, Rice::Object& o)
|
665
680
|
{
|
666
|
-
|
667
|
-
static const char* EDN_TYPE = "set";
|
681
|
+
static const char* EDN_TYPE = "map";
|
668
682
|
|
669
683
|
int cs;
|
670
|
-
|
684
|
+
// since we don't know whether we're looking at a key or value,
|
685
|
+
// initially store all elements in a list
|
686
|
+
Rice::Array elems;
|
671
687
|
|
672
688
|
%% write init;
|
673
689
|
%% write exec;
|
674
690
|
|
675
|
-
if (cs >=
|
676
|
-
|
691
|
+
if (cs >= EDN_map_first_final) {
|
692
|
+
|
693
|
+
// hash parsing is done. Make sure we have an even count
|
694
|
+
if ((elems.size() % 2) != 0) {
|
695
|
+
error(__FUNCTION__, "odd number of elements in map");
|
696
|
+
return pe;
|
697
|
+
}
|
698
|
+
|
699
|
+
// now convert the sequence to a hash
|
700
|
+
Rice::Hash rslt;
|
701
|
+
while (elems.size())
|
702
|
+
{
|
703
|
+
Rice::Object k = elems.shift();
|
704
|
+
rslt[k] = elems.shift();
|
705
|
+
}
|
706
|
+
|
707
|
+
o = rslt;
|
677
708
|
return p + 1;
|
678
709
|
}
|
679
|
-
else if (cs ==
|
680
|
-
error(__FUNCTION__, *p);
|
710
|
+
else if (cs == EDN_map_error) {
|
681
711
|
return pe;
|
682
712
|
}
|
683
|
-
else if (cs ==
|
713
|
+
else if (cs == EDN_map_en_main) {} // silence ragel warning
|
684
714
|
return NULL;
|
685
715
|
}
|
686
716
|
|
687
717
|
|
688
718
|
|
689
719
|
// ============================================================
|
690
|
-
//
|
720
|
+
// dispatch - handles all tokens with a leading #, then delegates to
|
721
|
+
// the corresponding machine. This machine consumes the # and passes
|
722
|
+
// the remaining data to the correct parser
|
691
723
|
//
|
692
724
|
%%{
|
693
|
-
machine
|
694
|
-
include
|
695
|
-
|
696
|
-
end_map = '}';
|
725
|
+
machine EDN_dispatch;
|
726
|
+
include EDN_common;
|
697
727
|
|
698
728
|
write data;
|
699
729
|
|
700
|
-
|
701
|
-
|
702
|
-
|
703
|
-
fexec
|
730
|
+
action parse_set {
|
731
|
+
// #{ }
|
732
|
+
const char *np = parse_set(fpc, pe, o);
|
733
|
+
if (np == NULL) { fhold; fbreak; } else fexec np;
|
734
|
+
}
|
735
|
+
|
736
|
+
action parse_discard {
|
737
|
+
// discard token #_
|
738
|
+
const char *np = parse_discard(fpc, pe);
|
739
|
+
if (np == NULL) { fhold; fbreak; } else fexec np;
|
704
740
|
}
|
705
741
|
|
706
|
-
|
707
|
-
|
708
|
-
|
742
|
+
action parse_tagged {
|
743
|
+
// #inst, #uuid, or #user/tag
|
744
|
+
const char *np = parse_tagged(fpc, pe, o);
|
745
|
+
if (np == NULL) { fhold; fbreak; } else fexec np;
|
746
|
+
}
|
747
|
+
|
748
|
+
|
749
|
+
main := (
|
750
|
+
('{' >parse_set |
|
751
|
+
'_' >parse_discard |
|
752
|
+
alpha >parse_tagged)
|
753
|
+
) @exit;
|
709
754
|
}%%
|
710
755
|
|
711
756
|
|
712
|
-
const char* edn::Parser::
|
757
|
+
const char* edn::Parser::parse_dispatch(const char *p, const char *pe, Rice::Object& o)
|
713
758
|
{
|
714
|
-
// std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
|
715
|
-
static const char* EDN_TYPE = "map";
|
716
|
-
Rice::Array arr;
|
717
759
|
int cs;
|
718
760
|
|
719
761
|
%% write init;
|
720
762
|
%% write exec;
|
721
763
|
|
722
|
-
if (cs >=
|
723
|
-
|
724
|
-
if ((arr.size() % 2) != 0) {
|
725
|
-
error(__FUNCTION__, "odd number of elements in map");
|
726
|
-
return pe;
|
727
|
-
}
|
728
|
-
|
729
|
-
Rice::Hash map;
|
730
|
-
while (arr.size())
|
731
|
-
{
|
732
|
-
Rice::Object k = arr.shift();
|
733
|
-
map[k] = arr.shift();
|
734
|
-
}
|
735
|
-
|
736
|
-
o = map;
|
764
|
+
if (cs >= EDN_dispatch_first_final) {
|
737
765
|
return p + 1;
|
738
766
|
}
|
739
|
-
else if (cs ==
|
767
|
+
else if (cs == EDN_dispatch_error) {
|
768
|
+
error(__FUNCTION__, *p);
|
740
769
|
return pe;
|
741
770
|
}
|
742
|
-
else if (cs ==
|
771
|
+
else if (cs == EDN_dispatch_en_main) {} // silence ragel warning
|
772
|
+
|
743
773
|
return NULL;
|
744
774
|
}
|
745
775
|
|
746
776
|
|
747
777
|
// ============================================================
|
748
|
-
//
|
778
|
+
// set parsing machine
|
749
779
|
//
|
750
780
|
%%{
|
751
|
-
machine
|
752
|
-
include
|
753
|
-
|
754
|
-
# inst = (string_delim [0-9+\-:\.TZ]* string_delim);
|
755
|
-
# uuid = (string_delim [a-f0-9\-]* string_delim);
|
781
|
+
machine EDN_set;
|
782
|
+
include EDN_sequence_common;
|
756
783
|
|
757
784
|
write data;
|
758
785
|
|
759
|
-
|
760
|
-
|
761
|
-
if (np == NULL) { fhold; fbreak; } else { fexec np; }
|
762
|
-
}
|
763
|
-
action parse_value {
|
764
|
-
const char *np = parse_value(fpc, pe, object);
|
765
|
-
if (np == NULL) { fhold; fbreak; } else { fexec np; }
|
766
|
-
}
|
767
|
-
|
768
|
-
action exit { fhold; fbreak; }
|
786
|
+
begin_set = '{';
|
787
|
+
end_set = '}';
|
769
788
|
|
770
|
-
main := (
|
789
|
+
main := begin_set (
|
790
|
+
ignore* sequence? :>> end_set
|
791
|
+
) @err(close_err) @exit;
|
771
792
|
}%%
|
772
793
|
|
773
|
-
|
774
|
-
|
794
|
+
//
|
795
|
+
// set parsing
|
796
|
+
//
|
797
|
+
const char* edn::Parser::parse_set(const char *p, const char *pe, Rice::Object& o)
|
775
798
|
{
|
776
|
-
|
777
|
-
std::string sym_name;
|
778
|
-
Rice::Object object;
|
799
|
+
static const char* EDN_TYPE = "set";
|
779
800
|
|
780
801
|
int cs;
|
802
|
+
Rice::Array elems; // stored as a vector
|
781
803
|
|
782
804
|
%% write init;
|
783
805
|
%% write exec;
|
784
806
|
|
785
|
-
if (cs >=
|
786
|
-
//
|
787
|
-
o = Parser::
|
807
|
+
if (cs >= EDN_set_first_final) {
|
808
|
+
// all elements collected; now convert to a set
|
809
|
+
o = Parser::make_ruby_set(elems);
|
788
810
|
return p + 1;
|
789
811
|
}
|
790
|
-
else if (cs ==
|
812
|
+
else if (cs == EDN_set_error) {
|
813
|
+
error(__FUNCTION__, *p);
|
791
814
|
return pe;
|
792
815
|
}
|
793
|
-
else if (cs ==
|
816
|
+
else if (cs == EDN_set_en_main) {} // silence ragel warning
|
794
817
|
return NULL;
|
795
818
|
}
|
796
819
|
|
797
820
|
|
821
|
+
|
798
822
|
// ============================================================
|
799
|
-
// discard
|
823
|
+
// discard - consume the discard token and parse the next value to
|
824
|
+
// discard. TODO: perhaps optimize this so no object data is built
|
825
|
+
// by defining a new machine(s) to consume items within container
|
826
|
+
// delimiters
|
800
827
|
//
|
801
828
|
%%{
|
802
829
|
machine EDN_discard;
|
@@ -808,12 +835,17 @@ const char* edn::Parser::parse_tagged(const char *p, const char *pe, Rice::Objec
|
|
808
835
|
|
809
836
|
action discard_value {
|
810
837
|
const char *np = parse_value(fpc, pe, o);
|
811
|
-
if (np
|
838
|
+
if (np) {
|
839
|
+
// this token is to be discarded, so store it in the
|
840
|
+
// discard stack - we really don't need to save it so this
|
841
|
+
// could be simplified
|
842
|
+
discard.push(o);
|
843
|
+
fexec np;
|
844
|
+
} else {
|
845
|
+
fhold; fbreak;
|
846
|
+
}
|
812
847
|
}
|
813
848
|
|
814
|
-
action exit {
|
815
|
-
fhold; fbreak;
|
816
|
-
}
|
817
849
|
|
818
850
|
main := begin_discard ignore* (
|
819
851
|
begin_value >discard_value
|
@@ -823,7 +855,6 @@ const char* edn::Parser::parse_tagged(const char *p, const char *pe, Rice::Objec
|
|
823
855
|
|
824
856
|
const char* edn::Parser::parse_discard(const char *p, const char *pe)
|
825
857
|
{
|
826
|
-
//std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
|
827
858
|
int cs;
|
828
859
|
Rice::Object o;
|
829
860
|
|
@@ -845,59 +876,77 @@ const char* edn::Parser::parse_discard(const char *p, const char *pe)
|
|
845
876
|
|
846
877
|
|
847
878
|
// ============================================================
|
848
|
-
//
|
879
|
+
// tagged element parsing - #uuid, #inst, #user/tag
|
880
|
+
//
|
881
|
+
// Current implementation expects a symbol followed by a value to
|
882
|
+
// match it against and does not check validity of uuid or rfc3339
|
883
|
+
// date characters.
|
884
|
+
//
|
885
|
+
// TODO:
|
886
|
+
// 1. need to check if we must support discard shenanigans such as
|
887
|
+
//
|
888
|
+
// #symbol #_ discard data
|
889
|
+
//
|
890
|
+
// 2. add parse checks for uuid and inst for better error reporting
|
849
891
|
//
|
850
892
|
%%{
|
851
|
-
machine
|
893
|
+
machine EDN_tagged;
|
852
894
|
include EDN_common;
|
853
895
|
|
854
|
-
|
896
|
+
# inst = (string_delim [0-9+\-:\.TZ]* string_delim);
|
897
|
+
# uuid = (string_delim [a-f0-9\-]* string_delim);
|
855
898
|
|
856
|
-
|
857
|
-
const char *np = parse_discard(fpc, pe);
|
858
|
-
if (np == NULL) { fhold; fbreak; } else fexec np;
|
859
|
-
}
|
899
|
+
write data;
|
860
900
|
|
861
|
-
action
|
862
|
-
|
863
|
-
|
901
|
+
action parse_symbol {
|
902
|
+
// parses the symbol portion of the pair
|
903
|
+
const char *np = parse_symbol(fpc, pe, sym_name);
|
904
|
+
if (np == NULL) { fhold; fbreak; } else { fexec np; }
|
864
905
|
}
|
865
|
-
|
866
|
-
|
867
|
-
const char *np =
|
868
|
-
if (np == NULL) { fhold; fbreak; } else fexec np;
|
906
|
+
action parse_value {
|
907
|
+
// parses the value portion
|
908
|
+
const char *np = parse_value(fpc, pe, data);
|
909
|
+
if (np == NULL) { fhold; fbreak; } else { fexec np; }
|
869
910
|
}
|
870
911
|
|
871
|
-
action exit { fhold; fbreak; }
|
872
912
|
|
873
|
-
main := (
|
874
|
-
('_' >parse_discard) |
|
875
|
-
'{' >parse_set |
|
876
|
-
alpha >parse_tagged
|
877
|
-
) @exit;
|
913
|
+
main := (symbol >parse_symbol ignore* begin_value >parse_value) @exit;
|
878
914
|
}%%
|
879
915
|
|
880
916
|
|
881
|
-
const char* edn::Parser::
|
917
|
+
const char* edn::Parser::parse_tagged(const char *p, const char *pe, Rice::Object& o)
|
882
918
|
{
|
883
|
-
|
919
|
+
std::string sym_name;
|
920
|
+
Rice::Object data;
|
921
|
+
|
884
922
|
int cs;
|
885
923
|
|
886
924
|
%% write init;
|
887
925
|
%% write exec;
|
888
926
|
|
889
|
-
if (cs >=
|
927
|
+
if (cs >= EDN_tagged_first_final) {
|
928
|
+
//std::cerr << __FUNCTION__ << " parse symbol name as '" << sym_name << "', value is: " << data << std::endl;
|
929
|
+
|
930
|
+
try {
|
931
|
+
// tagged_element makes a call to ruby which may throw an
|
932
|
+
// exception when parsing the data
|
933
|
+
o = Parser::tagged_element(sym_name, data);
|
934
|
+
} catch (Rice::Exception& e) {
|
935
|
+
error(__FUNCTION__, e.message().str());
|
936
|
+
return pe;
|
937
|
+
}
|
890
938
|
return p + 1;
|
891
939
|
}
|
892
|
-
else if (cs ==
|
893
|
-
error(__FUNCTION__, *p);
|
940
|
+
else if (cs == EDN_tagged_error) {
|
894
941
|
return pe;
|
895
942
|
}
|
896
|
-
else if (cs ==
|
897
|
-
|
943
|
+
else if (cs == EDN_tagged_en_main) {} // silence ragel warning
|
898
944
|
return NULL;
|
899
945
|
}
|
900
946
|
|
947
|
+
|
948
|
+
|
949
|
+
|
901
950
|
// ============================================================
|
902
951
|
// main parsing machine
|
903
952
|
//
|
@@ -916,13 +965,12 @@ const char* edn::Parser::parse_dispatch(const char *p, const char *pe, Rice::Obj
|
|
916
965
|
next_element = ignore* element;
|
917
966
|
sequence = ((element ignore*) (next_element ignore*)*);
|
918
967
|
|
919
|
-
# TODO: check this. Using a sequence to handle cases with a discard
|
920
968
|
main := ignore* sequence? ignore*;
|
921
969
|
}%%
|
922
970
|
|
923
971
|
//
|
924
|
-
//
|
925
|
-
//
|
972
|
+
// TODO: Currently using a sequence to handle cases with a discard
|
973
|
+
// but EDN's Reader allows token by token parsing
|
926
974
|
Rice::Object edn::Parser::parse(const char* buf, std::size_t len)
|
927
975
|
{
|
928
976
|
int cs;
|