edn_turbo 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +16 -40
- data/ext/edn_turbo/edn_parser.cc +1651 -1529
- data/ext/edn_turbo/edn_parser.h +15 -12
- data/ext/edn_turbo/edn_parser.rl +375 -327
- data/ext/edn_turbo/edn_parser_unicode.cc +1 -1
- data/ext/edn_turbo/{edn_parser_def.cc → edn_parser_util.cc} +23 -15
- data/lib/edn_turbo/version.rb +2 -2
- data/test/test_output_diff.rb +1 -1
- metadata +3 -3
data/ext/edn_turbo/edn_parser.rl
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
#include <rice/Hash.hpp>
|
6
6
|
#include <rice/Array.hpp>
|
7
7
|
#include <rice/to_from_ruby.hpp>
|
8
|
+
#include <rice/Exception.hpp>
|
8
9
|
|
9
10
|
#include "edn_parser.h"
|
10
11
|
|
@@ -49,16 +50,19 @@
|
|
49
50
|
exp = ([Ee] [+\-]? digit+);
|
50
51
|
|
51
52
|
|
53
|
+
# common actions
|
52
54
|
action close_err {
|
53
55
|
std::stringstream s;
|
54
56
|
s << "unterminated " << EDN_TYPE;
|
55
57
|
error(__FUNCTION__, s.str());
|
56
58
|
fhold; fbreak;
|
57
59
|
}
|
60
|
+
|
61
|
+
action exit { fhold; fbreak; }
|
58
62
|
}%%
|
59
63
|
|
60
64
|
// ============================================================
|
61
|
-
// machine for parsing various types
|
65
|
+
// machine for parsing various EDN token types
|
62
66
|
//
|
63
67
|
|
64
68
|
%%{
|
@@ -67,46 +71,20 @@
|
|
67
71
|
|
68
72
|
write data;
|
69
73
|
|
70
|
-
action parse_dispatch {
|
71
|
-
const char *np = parse_dispatch(fpc + 1, pe, o);
|
72
|
-
if (np == NULL) { fhold; fbreak; } else fexec np;
|
73
|
-
}
|
74
|
-
|
75
|
-
action parse_char {
|
76
|
-
const char *np = parse_esc_char(fpc, pe, o);
|
77
|
-
if (np == NULL) { fhold; fbreak; } else fexec np;
|
78
|
-
}
|
79
|
-
|
80
74
|
action parse_string {
|
75
|
+
// string types within double-quotes
|
81
76
|
const char *np = parse_string(fpc, pe, o);
|
82
77
|
if (np == NULL) { fhold; fbreak; } else fexec np;
|
83
78
|
}
|
84
79
|
|
85
80
|
action parse_keyword {
|
81
|
+
// tokens with a leading ':'
|
86
82
|
const char *np = parse_keyword(fpc, pe, o);
|
87
83
|
if (np == NULL) { fhold; fbreak; } else fexec np;
|
88
84
|
}
|
89
85
|
|
90
|
-
action parse_operator {
|
91
|
-
const char *np = parse_operator(fpc, pe, o);
|
92
|
-
if (np == NULL) { fhold; fbreak; } else fexec np;
|
93
|
-
}
|
94
|
-
|
95
|
-
action parse_symbol {
|
96
|
-
std::string sym;
|
97
|
-
const char *np = parse_symbol(fpc, pe, sym);
|
98
|
-
if (np == NULL) { fhold; fbreak; } else {
|
99
|
-
if (sym == "true") { o = Qtrue; }
|
100
|
-
else if (sym == "false") { o = Qfalse; }
|
101
|
-
else if (sym == "nil") { o = Qnil; }
|
102
|
-
else {
|
103
|
-
o = Parser::make_edn_symbol(sym);
|
104
|
-
}
|
105
|
-
fexec np;
|
106
|
-
}
|
107
|
-
}
|
108
|
-
|
109
86
|
action parse_number {
|
87
|
+
// tokens w/ leading digits: non-negative integers & decimals.
|
110
88
|
// try to parse a decimal first
|
111
89
|
const char *np = parse_decimal(fpc, pe, o);
|
112
90
|
if (np == NULL) {
|
@@ -125,41 +103,75 @@
|
|
125
103
|
}
|
126
104
|
}
|
127
105
|
|
106
|
+
action parse_operator {
|
107
|
+
// stand-alone operators *, +, -, etc.
|
108
|
+
const char *np = parse_operator(fpc, pe, o);
|
109
|
+
if (np == NULL) { fhold; fbreak; } else fexec np;
|
110
|
+
}
|
111
|
+
|
112
|
+
action parse_char {
|
113
|
+
// tokens w/ leading \ (escaped characters \newline, \c, etc.)
|
114
|
+
const char *np = parse_esc_char(fpc, pe, o);
|
115
|
+
if (np == NULL) { fhold; fbreak; } else fexec np;
|
116
|
+
}
|
117
|
+
|
118
|
+
action parse_symbol {
|
119
|
+
// user identifiers and reserved keywords (true, false, nil)
|
120
|
+
std::string sym;
|
121
|
+
const char *np = parse_symbol(fpc, pe, sym);
|
122
|
+
if (np == NULL) { fhold; fbreak; } else {
|
123
|
+
if (sym == "true") { o = Qtrue; }
|
124
|
+
else if (sym == "false") { o = Qfalse; }
|
125
|
+
else if (sym == "nil") { o = Qnil; }
|
126
|
+
else {
|
127
|
+
o = Parser::make_edn_symbol(sym);
|
128
|
+
}
|
129
|
+
fexec np;
|
130
|
+
}
|
131
|
+
}
|
132
|
+
|
128
133
|
action parse_vector {
|
134
|
+
// [
|
129
135
|
const char *np = parse_vector(fpc, pe, o);
|
130
136
|
if (np == NULL) { fhold; fbreak; } else fexec np;
|
131
137
|
}
|
132
138
|
|
133
139
|
action parse_list {
|
140
|
+
// (
|
134
141
|
const char *np = parse_list(fpc, pe, o);
|
135
142
|
if (np == NULL) { fhold; fbreak; } else fexec np;
|
136
143
|
}
|
137
144
|
|
138
145
|
action parse_map {
|
146
|
+
// {
|
139
147
|
const char *np = parse_map(fpc, pe, o);
|
140
148
|
if (np == NULL) { fhold; fbreak; } else fexec np;
|
141
149
|
}
|
142
150
|
|
143
|
-
action
|
151
|
+
action parse_dispatch {
|
152
|
+
// handles tokens w/ leading # ("#_", "#{", and tagged elems)
|
153
|
+
const char *np = parse_dispatch(fpc + 1, pe, o);
|
154
|
+
if (np == NULL) { fhold; fbreak; } else fexec np;
|
155
|
+
}
|
156
|
+
|
144
157
|
|
145
158
|
main := (
|
146
|
-
begin_dispatch >parse_dispatch |
|
147
|
-
begin_char >parse_char |
|
148
159
|
string_delim >parse_string |
|
149
160
|
begin_keyword >parse_keyword |
|
161
|
+
begin_number >parse_number |
|
150
162
|
operators >parse_operator |
|
163
|
+
begin_char >parse_char |
|
151
164
|
begin_symbol >parse_symbol |
|
152
|
-
begin_number >parse_number |
|
153
165
|
begin_vector >parse_vector |
|
154
166
|
begin_list >parse_list |
|
155
|
-
begin_map >parse_map
|
167
|
+
begin_map >parse_map |
|
168
|
+
begin_dispatch >parse_dispatch
|
156
169
|
) %*exit;
|
157
170
|
}%%
|
158
171
|
|
159
172
|
|
160
173
|
const char *edn::Parser::parse_value(const char *p, const char *pe, Rice::Object& o)
|
161
174
|
{
|
162
|
-
//std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
|
163
175
|
int cs;
|
164
176
|
|
165
177
|
%% write init;
|
@@ -179,295 +191,279 @@ const char *edn::Parser::parse_value(const char *p, const char *pe, Rice::Object
|
|
179
191
|
|
180
192
|
|
181
193
|
// ============================================================
|
182
|
-
//
|
194
|
+
// string parsing - incoming string is raw so interpreting utf
|
195
|
+
// encodings & unicode values might be necessary. To optimize things a
|
196
|
+
// bit, we mark the string for encoding if anything outside of the
|
197
|
+
// ascii range is found.
|
183
198
|
//
|
184
199
|
%%{
|
185
|
-
machine
|
200
|
+
machine EDN_string;
|
186
201
|
include EDN_common;
|
187
202
|
|
188
203
|
write data;
|
189
204
|
|
190
|
-
action
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
o = Parser::make_edn_symbol(sym);
|
196
|
-
fexec np;
|
197
|
-
}
|
198
|
-
}
|
199
|
-
|
200
|
-
action parse_number {
|
201
|
-
// parse a number with the leading symbol - this is slightly
|
202
|
-
// different than the one within EDN_value since it includes
|
203
|
-
// the leading - or +
|
204
|
-
//
|
205
|
-
// try to parse a decimal first
|
206
|
-
const char *np = parse_decimal(p_save, pe, o);
|
207
|
-
if (np == NULL) {
|
208
|
-
// if we can't, try to parse it as an int
|
209
|
-
np = parse_integer(p_save, pe, o);
|
210
|
-
}
|
211
|
-
|
212
|
-
if (np) {
|
213
|
-
fexec np;
|
214
|
-
fhold;
|
215
|
-
fbreak;
|
216
|
-
}
|
217
|
-
else {
|
218
|
-
error(__FUNCTION__, *p);
|
219
|
-
fexec pe;
|
205
|
+
action parse_string {
|
206
|
+
if (Parser::parse_byte_stream(p_save + 1, p, s, encode)) {
|
207
|
+
fexec p + 1;
|
208
|
+
} else {
|
209
|
+
fhold; fbreak;
|
220
210
|
}
|
221
211
|
}
|
222
212
|
|
223
|
-
action
|
224
|
-
|
225
|
-
std::string sym;
|
226
|
-
sym += *(fpc - 1);
|
227
|
-
o = Parser::make_edn_symbol(sym);
|
213
|
+
action mark_for_encoding {
|
214
|
+
encode = true;
|
228
215
|
}
|
229
216
|
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
217
|
+
main := string_delim (
|
218
|
+
(^([\"\\] | 0..0x1f | 0xc2..0xf5) |
|
219
|
+
((0xc2..0xf5) |
|
220
|
+
'\\'[\"\\/bfnrt] |
|
221
|
+
'\\u'[0-9a-fA-F]{4}) $mark_for_encoding |
|
222
|
+
'\\'^([\"\\/bfnrtu]|0..0x1f))* %parse_string
|
223
|
+
) :>> string_delim @err(close_err) @exit;
|
237
224
|
}%%
|
238
225
|
|
239
226
|
|
240
|
-
const char* edn::Parser::
|
227
|
+
const char* edn::Parser::parse_string(const char *p, const char *pe, Rice::Object& o)
|
241
228
|
{
|
242
229
|
// std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
|
230
|
+
static const char* EDN_TYPE = "string";
|
243
231
|
int cs;
|
244
|
-
|
232
|
+
bool encode = false;
|
233
|
+
const char *eof = pe;
|
234
|
+
Rice::String s;
|
245
235
|
|
246
236
|
%% write init;
|
247
237
|
p_save = p;
|
248
238
|
%% write exec;
|
249
239
|
|
250
|
-
if (cs >=
|
251
|
-
|
240
|
+
if (cs >= EDN_string_first_final) {
|
241
|
+
o = s;
|
242
|
+
return p + 1;
|
252
243
|
}
|
253
|
-
else if (cs ==
|
254
|
-
error(__FUNCTION__, *p);
|
244
|
+
else if (cs == EDN_string_error) {
|
255
245
|
return pe;
|
256
246
|
}
|
257
|
-
else if (cs ==
|
247
|
+
else if (cs == EDN_string_en_main) {} // silence ragel warning
|
258
248
|
return NULL;
|
259
249
|
}
|
260
250
|
|
261
251
|
|
262
252
|
|
263
253
|
// ============================================================
|
264
|
-
//
|
254
|
+
// keyword parsing
|
265
255
|
//
|
266
256
|
%%{
|
267
|
-
machine
|
257
|
+
machine EDN_keyword;
|
268
258
|
include EDN_common;
|
269
259
|
|
270
|
-
|
260
|
+
keyword_chars = symbol_chars | operators;
|
261
|
+
keyword_start = symbol_start | [\#\./];
|
271
262
|
|
272
|
-
|
263
|
+
keyword_name = keyword_start (keyword_chars)*;
|
273
264
|
|
274
|
-
|
265
|
+
write data;
|
275
266
|
|
276
|
-
|
277
|
-
|
278
|
-
) (^(valid_chars | '\\')? @exit);
|
267
|
+
|
268
|
+
main := begin_keyword keyword_name (^keyword_chars? @exit);
|
279
269
|
}%%
|
280
270
|
|
281
271
|
|
282
|
-
const char* edn::Parser::
|
272
|
+
const char* edn::Parser::parse_keyword(const char *p, const char *pe, Rice::Object& o)
|
283
273
|
{
|
284
|
-
//std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
|
285
274
|
int cs;
|
286
275
|
|
287
276
|
%% write init;
|
288
277
|
p_save = p;
|
289
278
|
%% write exec;
|
290
279
|
|
291
|
-
if (cs >=
|
292
|
-
|
293
|
-
|
294
|
-
|
280
|
+
if (cs >= EDN_keyword_first_final) {
|
281
|
+
std::string buf;
|
282
|
+
uint32_t len = p - p_save;
|
283
|
+
// don't include leading ':' because Rice::Symbol will handle it
|
284
|
+
buf.append(p_save + 1, len - 1);
|
285
|
+
o = Rice::Symbol(buf);
|
295
286
|
return p;
|
296
287
|
}
|
297
|
-
else if (cs ==
|
288
|
+
else if (cs == EDN_keyword_error) {
|
298
289
|
error(__FUNCTION__, *p);
|
299
290
|
return pe;
|
300
291
|
}
|
301
|
-
else if (cs ==
|
292
|
+
else if (cs == EDN_keyword_en_main) {} // silence ragel warning
|
302
293
|
return NULL;
|
303
294
|
}
|
304
295
|
|
305
296
|
|
306
297
|
|
307
|
-
|
308
298
|
// ============================================================
|
309
|
-
//
|
299
|
+
// decimal parsing machine
|
310
300
|
//
|
311
301
|
%%{
|
312
|
-
machine
|
302
|
+
machine EDN_decimal;
|
313
303
|
include EDN_common;
|
314
304
|
|
315
|
-
write data;
|
305
|
+
write data noerror;
|
316
306
|
|
317
|
-
action exit { fhold; fbreak; }
|
318
307
|
|
319
|
-
main := (
|
320
|
-
|
321
|
-
|
322
|
-
|
308
|
+
main := ('-'|'+')? (
|
309
|
+
(integer '.' digit* (exp? [M]?)) |
|
310
|
+
(integer exp)
|
311
|
+
) (^[0-9Ee.+\-M]? @exit );
|
323
312
|
}%%
|
324
313
|
|
325
314
|
|
326
|
-
const char* edn::Parser::
|
315
|
+
const char* edn::Parser::parse_decimal(const char *p, const char *pe, Rice::Object& o)
|
327
316
|
{
|
328
|
-
// std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
|
329
317
|
int cs;
|
330
318
|
|
331
319
|
%% write init;
|
332
320
|
p_save = p;
|
333
321
|
%% write exec;
|
334
322
|
|
335
|
-
if (cs >=
|
336
|
-
|
337
|
-
|
338
|
-
buf.append(p_save, len);
|
339
|
-
|
340
|
-
s = buf;
|
341
|
-
return p;
|
342
|
-
}
|
343
|
-
else if (cs == EDN_symbol_error) {
|
344
|
-
error(__FUNCTION__, *p);
|
345
|
-
return pe;
|
323
|
+
if (cs >= EDN_decimal_first_final) {
|
324
|
+
o = Parser::float_to_ruby(p_save, p - p_save);
|
325
|
+
return p + 1;
|
346
326
|
}
|
347
|
-
else if (cs ==
|
327
|
+
else if (cs == EDN_decimal_en_main) {} // silence ragel warning
|
348
328
|
return NULL;
|
349
329
|
}
|
350
330
|
|
351
331
|
|
352
|
-
|
353
|
-
|
354
332
|
// ============================================================
|
355
|
-
//
|
333
|
+
// integer parsing machine
|
356
334
|
//
|
357
335
|
%%{
|
358
|
-
machine
|
336
|
+
machine EDN_integer;
|
359
337
|
include EDN_common;
|
360
338
|
|
361
|
-
|
362
|
-
keyword_start = symbol_start | [\#\./];
|
363
|
-
|
364
|
-
keyword_name = keyword_start (keyword_chars)*;
|
365
|
-
|
366
|
-
write data;
|
339
|
+
write data noerror;
|
367
340
|
|
368
|
-
action exit { fhold; fbreak; }
|
369
341
|
|
370
|
-
main :=
|
342
|
+
main := (
|
343
|
+
('-'|'+')? (integer [MN]?)
|
344
|
+
) (^[0-9MN+\-]? @exit);
|
371
345
|
}%%
|
372
346
|
|
373
|
-
|
374
|
-
const char* edn::Parser::parse_keyword(const char *p, const char *pe, Rice::Object& o)
|
347
|
+
const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Object& o)
|
375
348
|
{
|
376
349
|
int cs;
|
377
|
-
// std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
|
378
350
|
|
379
351
|
%% write init;
|
380
352
|
p_save = p;
|
381
353
|
%% write exec;
|
382
354
|
|
383
|
-
if (cs >=
|
384
|
-
|
385
|
-
|
386
|
-
buf.append(p_save + 1, len);
|
387
|
-
o = Rice::Symbol(buf);
|
388
|
-
return p;
|
389
|
-
}
|
390
|
-
else if (cs == EDN_keyword_error) {
|
391
|
-
error(__FUNCTION__, *p);
|
392
|
-
return pe;
|
355
|
+
if (cs >= EDN_integer_first_final) {
|
356
|
+
o = Parser::integer_to_ruby(p_save, p - p_save);
|
357
|
+
return p + 1;
|
393
358
|
}
|
394
|
-
else if (cs ==
|
359
|
+
else if (cs == EDN_integer_en_main) {} // silence ragel warning
|
395
360
|
return NULL;
|
396
361
|
}
|
397
362
|
|
398
363
|
|
399
364
|
|
400
365
|
// ============================================================
|
401
|
-
//
|
366
|
+
// operator parsing - handles tokens w/ a leading operator:
|
367
|
+
//
|
368
|
+
// 1. symbols w/ leading operator: -something, .somethingelse
|
369
|
+
// 2. number values w/ leading - or +
|
370
|
+
// 3. stand-alone operators: +, -, /, *, etc.
|
402
371
|
//
|
403
372
|
%%{
|
404
|
-
machine
|
373
|
+
machine EDN_operator;
|
405
374
|
include EDN_common;
|
406
375
|
|
407
376
|
write data;
|
408
377
|
|
409
|
-
action
|
410
|
-
|
378
|
+
action parse_symbol {
|
379
|
+
// parse a symbol including the leading operator (-, +, .)
|
380
|
+
std::string sym;
|
381
|
+
const char *np = parse_symbol(p_save, pe, sym);
|
382
|
+
if (np == NULL) { fhold; fbreak; } else {
|
383
|
+
o = Parser::make_edn_symbol(sym);
|
384
|
+
fexec np;
|
385
|
+
}
|
386
|
+
}
|
387
|
+
|
388
|
+
action parse_number {
|
389
|
+
// parse a number with the leading symbol - this is slightly
|
390
|
+
// different than the one within EDN_value since it includes
|
391
|
+
// the leading - or +
|
392
|
+
//
|
393
|
+
// try to parse a decimal first
|
394
|
+
const char *np = parse_decimal(p_save, pe, o);
|
395
|
+
if (np == NULL) {
|
396
|
+
// if we can't, try to parse it as an int
|
397
|
+
np = parse_integer(p_save, pe, o);
|
398
|
+
}
|
399
|
+
|
400
|
+
if (np) {
|
401
|
+
fexec np;
|
411
402
|
fhold;
|
412
403
|
fbreak;
|
413
|
-
} else {
|
414
|
-
fexec p + 1;
|
415
404
|
}
|
405
|
+
else {
|
406
|
+
error(__FUNCTION__, *p);
|
407
|
+
fexec pe;
|
408
|
+
}
|
409
|
+
}
|
410
|
+
|
411
|
+
action parse_operator {
|
412
|
+
// stand-alone operators (-, +, /, ... etc)
|
413
|
+
std::string sym;
|
414
|
+
sym += *(p_save);
|
415
|
+
o = Parser::make_edn_symbol(sym);
|
416
416
|
}
|
417
417
|
|
418
|
-
action exit { fhold; fbreak; }
|
419
418
|
|
420
|
-
main :=
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
) :>> string_delim @err(close_err) @exit;
|
419
|
+
main := (
|
420
|
+
('-'|'+'|'.') alpha >parse_symbol |
|
421
|
+
('-'|'+') begin_number >parse_number |
|
422
|
+
operators ignore* >parse_operator
|
423
|
+
) ^(operators|alpha|digit)? @exit;
|
426
424
|
}%%
|
427
425
|
|
428
426
|
|
429
|
-
const char* edn::Parser::
|
427
|
+
const char* edn::Parser::parse_operator(const char *p, const char *pe, Rice::Object& o)
|
430
428
|
{
|
431
|
-
// std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
|
432
|
-
static const char* EDN_TYPE = "string";
|
433
429
|
int cs;
|
434
|
-
const char *eof = pe;
|
435
430
|
|
436
|
-
Rice::String s;
|
437
431
|
%% write init;
|
438
432
|
p_save = p;
|
439
433
|
%% write exec;
|
440
434
|
|
441
|
-
if (cs >=
|
442
|
-
|
443
|
-
return p + 1;
|
435
|
+
if (cs >= EDN_operator_first_final) {
|
436
|
+
return p;
|
444
437
|
}
|
445
|
-
else if (cs ==
|
438
|
+
else if (cs == EDN_operator_error) {
|
439
|
+
error(__FUNCTION__, *p);
|
446
440
|
return pe;
|
447
441
|
}
|
448
|
-
else if (cs ==
|
442
|
+
else if (cs == EDN_operator_en_main) {} // silence ragel warning
|
449
443
|
return NULL;
|
450
444
|
}
|
451
445
|
|
446
|
+
|
447
|
+
|
452
448
|
// ============================================================
|
453
|
-
//
|
449
|
+
// escaped char parsing - handles \c, \newline, \formfeed, etc.
|
454
450
|
//
|
455
451
|
%%{
|
456
|
-
machine
|
452
|
+
machine EDN_escaped_char;
|
457
453
|
include EDN_common;
|
458
454
|
|
459
|
-
write data
|
455
|
+
write data;
|
456
|
+
|
457
|
+
valid_chars = alpha;
|
460
458
|
|
461
|
-
action exit { fhold; fbreak; }
|
462
459
|
|
463
|
-
main := (
|
464
|
-
|
465
|
-
|
466
|
-
) (^[0-9Ee.+\-M]? @exit );
|
460
|
+
main := (
|
461
|
+
begin_char valid_chars+ ignore*
|
462
|
+
) (^(valid_chars | '\\')? @exit);
|
467
463
|
}%%
|
468
464
|
|
469
465
|
|
470
|
-
const char* edn::Parser::
|
466
|
+
const char* edn::Parser::parse_esc_char(const char *p, const char *pe, Rice::Object& o)
|
471
467
|
{
|
472
468
|
int cs;
|
473
469
|
|
@@ -475,32 +471,43 @@ const char* edn::Parser::parse_decimal(const char *p, const char *pe, Rice::Obje
|
|
475
471
|
p_save = p;
|
476
472
|
%% write exec;
|
477
473
|
|
478
|
-
if (cs >=
|
479
|
-
|
480
|
-
|
474
|
+
if (cs >= EDN_escaped_char_first_final) {
|
475
|
+
// convert the escaped value to a character
|
476
|
+
if (!Parser::parse_escaped_char(p_save + 1, p, o)) {
|
477
|
+
return pe;
|
478
|
+
}
|
479
|
+
return p;
|
481
480
|
}
|
482
|
-
else if (cs ==
|
481
|
+
else if (cs == EDN_escaped_char_error) {
|
482
|
+
error(__FUNCTION__, *p);
|
483
|
+
return pe;
|
484
|
+
}
|
485
|
+
else if (cs == EDN_escaped_char_en_main) {} // silence ragel warning
|
483
486
|
return NULL;
|
484
487
|
}
|
485
488
|
|
486
489
|
|
490
|
+
|
491
|
+
|
487
492
|
// ============================================================
|
488
|
-
//
|
493
|
+
// symbol parsing - handles identifiers that begin with an alpha
|
494
|
+
// character and an optional leading operator (name, -today,
|
495
|
+
// .yesterday)
|
489
496
|
//
|
490
497
|
%%{
|
491
|
-
machine
|
498
|
+
machine EDN_symbol;
|
492
499
|
include EDN_common;
|
493
500
|
|
494
|
-
write data
|
501
|
+
write data;
|
495
502
|
|
496
|
-
action exit { fhold; fbreak; }
|
497
503
|
|
498
504
|
main := (
|
499
|
-
|
500
|
-
) (^
|
505
|
+
operators? symbol
|
506
|
+
) ignore* (^(symbol_chars | operators)? @exit);
|
501
507
|
}%%
|
502
508
|
|
503
|
-
|
509
|
+
|
510
|
+
const char* edn::Parser::parse_symbol(const char *p, const char *pe, std::string& sym)
|
504
511
|
{
|
505
512
|
int cs;
|
506
513
|
|
@@ -508,15 +515,22 @@ const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Obje
|
|
508
515
|
p_save = p;
|
509
516
|
%% write exec;
|
510
517
|
|
511
|
-
if (cs >=
|
512
|
-
|
513
|
-
|
518
|
+
if (cs >= EDN_symbol_first_final) {
|
519
|
+
// copy the symbol text
|
520
|
+
sym.clear();
|
521
|
+
sym.append(p_save, p - p_save);
|
522
|
+
return p;
|
514
523
|
}
|
515
|
-
else if (cs ==
|
524
|
+
else if (cs == EDN_symbol_error) {
|
525
|
+
error(__FUNCTION__, *p);
|
526
|
+
return pe;
|
527
|
+
}
|
528
|
+
else if (cs == EDN_symbol_en_main) {} // silence ragel warning
|
516
529
|
return NULL;
|
517
530
|
}
|
518
531
|
|
519
532
|
|
533
|
+
|
520
534
|
// ============================================================
|
521
535
|
// EDN_sequence_common is used to parse EDN containers - elements are
|
522
536
|
// initially stored in a rice array and then the final corresponding
|
@@ -527,30 +541,34 @@ const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Obje
|
|
527
541
|
machine EDN_sequence_common;
|
528
542
|
include EDN_common;
|
529
543
|
|
530
|
-
action
|
531
|
-
|
532
|
-
|
544
|
+
action parse_item {
|
545
|
+
// reads an item within a sequence (vector, list, map, or
|
546
|
+
// set). Regardless of the sequence type, an array of the
|
547
|
+
// items is built. Once done, the sequence parser will convert
|
548
|
+
// if needed
|
549
|
+
Rice::Object e;
|
550
|
+
const char *np = parse_value(fpc, pe, e);
|
533
551
|
if (np == NULL) {
|
534
552
|
fhold; fbreak;
|
535
553
|
} else {
|
536
554
|
// if there's an entry in the discard list, the current
|
537
|
-
// object is not meant to be kept
|
555
|
+
// object is not meant to be kept due to a #_ so don't
|
556
|
+
// push it into the list of elements
|
538
557
|
if (!discard.empty()) {
|
539
558
|
discard.pop();
|
540
559
|
}
|
541
560
|
else {
|
542
|
-
// otherwise we add it to the
|
543
|
-
|
561
|
+
// otherwise we add it to the list of elements for the
|
562
|
+
// corresponding container
|
563
|
+
elems.push(e);
|
544
564
|
}
|
545
565
|
fexec np;
|
546
566
|
}
|
547
567
|
}
|
548
568
|
|
549
|
-
element = begin_value >
|
569
|
+
element = begin_value >parse_item;
|
550
570
|
next_element = ignore* element;
|
551
571
|
sequence = ((element ignore*) (next_element ignore*)*);
|
552
|
-
|
553
|
-
action exit { fhold; fbreak; }
|
554
572
|
}%%
|
555
573
|
|
556
574
|
//
|
@@ -575,17 +593,16 @@ const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Obje
|
|
575
593
|
//
|
576
594
|
const char* edn::Parser::parse_vector(const char *p, const char *pe, Rice::Object& o)
|
577
595
|
{
|
578
|
-
//std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
|
579
596
|
static const char* EDN_TYPE = "vector";
|
580
597
|
|
581
598
|
int cs;
|
582
|
-
Rice::Array
|
599
|
+
Rice::Array elems; // will store the vector's elements
|
583
600
|
|
584
601
|
%% write init;
|
585
602
|
%% write exec;
|
586
603
|
|
587
604
|
if (cs >= EDN_vector_first_final) {
|
588
|
-
o =
|
605
|
+
o = elems;
|
589
606
|
return p + 1;
|
590
607
|
}
|
591
608
|
else if (cs == EDN_vector_error) {
|
@@ -622,13 +639,13 @@ const char* edn::Parser::parse_list(const char *p, const char *pe, Rice::Object&
|
|
622
639
|
static const char* EDN_TYPE = "list";
|
623
640
|
|
624
641
|
int cs;
|
625
|
-
Rice::Array
|
642
|
+
Rice::Array elems;
|
626
643
|
|
627
644
|
%% write init;
|
628
645
|
%% write exec;
|
629
646
|
|
630
647
|
if (cs >= EDN_list_first_final) {
|
631
|
-
o =
|
648
|
+
o = elems;
|
632
649
|
return p + 1;
|
633
650
|
}
|
634
651
|
else if (cs == EDN_list_error) {
|
@@ -642,161 +659,171 @@ const char* edn::Parser::parse_list(const char *p, const char *pe, Rice::Object&
|
|
642
659
|
|
643
660
|
|
644
661
|
// ============================================================
|
645
|
-
//
|
662
|
+
// hash parsing
|
646
663
|
//
|
647
664
|
%%{
|
648
|
-
machine
|
665
|
+
machine EDN_map;
|
649
666
|
include EDN_sequence_common;
|
650
667
|
|
668
|
+
end_map = '}';
|
669
|
+
|
651
670
|
write data;
|
652
671
|
|
653
|
-
begin_set = '{';
|
654
|
-
end_set = '}';
|
655
672
|
|
656
|
-
main :=
|
657
|
-
ignore* sequence? :>>
|
673
|
+
main := begin_map (
|
674
|
+
ignore* (sequence)? :>> end_map
|
658
675
|
) @err(close_err) @exit;
|
659
676
|
}%%
|
660
677
|
|
661
|
-
|
662
|
-
|
663
|
-
//
|
664
|
-
const char* edn::Parser::parse_set(const char *p, const char *pe, Rice::Object& o)
|
678
|
+
|
679
|
+
const char* edn::Parser::parse_map(const char *p, const char *pe, Rice::Object& o)
|
665
680
|
{
|
666
|
-
|
667
|
-
static const char* EDN_TYPE = "set";
|
681
|
+
static const char* EDN_TYPE = "map";
|
668
682
|
|
669
683
|
int cs;
|
670
|
-
|
684
|
+
// since we don't know whether we're looking at a key or value,
|
685
|
+
// initially store all elements in a list
|
686
|
+
Rice::Array elems;
|
671
687
|
|
672
688
|
%% write init;
|
673
689
|
%% write exec;
|
674
690
|
|
675
|
-
if (cs >=
|
676
|
-
|
691
|
+
if (cs >= EDN_map_first_final) {
|
692
|
+
|
693
|
+
// hash parsing is done. Make sure we have an even count
|
694
|
+
if ((elems.size() % 2) != 0) {
|
695
|
+
error(__FUNCTION__, "odd number of elements in map");
|
696
|
+
return pe;
|
697
|
+
}
|
698
|
+
|
699
|
+
// now convert the sequence to a hash
|
700
|
+
Rice::Hash rslt;
|
701
|
+
while (elems.size())
|
702
|
+
{
|
703
|
+
Rice::Object k = elems.shift();
|
704
|
+
rslt[k] = elems.shift();
|
705
|
+
}
|
706
|
+
|
707
|
+
o = rslt;
|
677
708
|
return p + 1;
|
678
709
|
}
|
679
|
-
else if (cs ==
|
680
|
-
error(__FUNCTION__, *p);
|
710
|
+
else if (cs == EDN_map_error) {
|
681
711
|
return pe;
|
682
712
|
}
|
683
|
-
else if (cs ==
|
713
|
+
else if (cs == EDN_map_en_main) {} // silence ragel warning
|
684
714
|
return NULL;
|
685
715
|
}
|
686
716
|
|
687
717
|
|
688
718
|
|
689
719
|
// ============================================================
|
690
|
-
//
|
720
|
+
// dispatch - handles all tokens with a leading #, then delegates to
|
721
|
+
// the corresponding machine. This machine consumes the # and passes
|
722
|
+
// the remaining data to the correct parser
|
691
723
|
//
|
692
724
|
%%{
|
693
|
-
machine
|
694
|
-
include
|
695
|
-
|
696
|
-
end_map = '}';
|
725
|
+
machine EDN_dispatch;
|
726
|
+
include EDN_common;
|
697
727
|
|
698
728
|
write data;
|
699
729
|
|
700
|
-
|
701
|
-
|
702
|
-
|
703
|
-
fexec
|
730
|
+
action parse_set {
|
731
|
+
// #{ }
|
732
|
+
const char *np = parse_set(fpc, pe, o);
|
733
|
+
if (np == NULL) { fhold; fbreak; } else fexec np;
|
734
|
+
}
|
735
|
+
|
736
|
+
action parse_discard {
|
737
|
+
// discard token #_
|
738
|
+
const char *np = parse_discard(fpc, pe);
|
739
|
+
if (np == NULL) { fhold; fbreak; } else fexec np;
|
704
740
|
}
|
705
741
|
|
706
|
-
|
707
|
-
|
708
|
-
|
742
|
+
action parse_tagged {
|
743
|
+
// #inst, #uuid, or #user/tag
|
744
|
+
const char *np = parse_tagged(fpc, pe, o);
|
745
|
+
if (np == NULL) { fhold; fbreak; } else fexec np;
|
746
|
+
}
|
747
|
+
|
748
|
+
|
749
|
+
main := (
|
750
|
+
('{' >parse_set |
|
751
|
+
'_' >parse_discard |
|
752
|
+
alpha >parse_tagged)
|
753
|
+
) @exit;
|
709
754
|
}%%
|
710
755
|
|
711
756
|
|
712
|
-
const char* edn::Parser::
|
757
|
+
const char* edn::Parser::parse_dispatch(const char *p, const char *pe, Rice::Object& o)
|
713
758
|
{
|
714
|
-
// std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
|
715
|
-
static const char* EDN_TYPE = "map";
|
716
|
-
Rice::Array arr;
|
717
759
|
int cs;
|
718
760
|
|
719
761
|
%% write init;
|
720
762
|
%% write exec;
|
721
763
|
|
722
|
-
if (cs >=
|
723
|
-
|
724
|
-
if ((arr.size() % 2) != 0) {
|
725
|
-
error(__FUNCTION__, "odd number of elements in map");
|
726
|
-
return pe;
|
727
|
-
}
|
728
|
-
|
729
|
-
Rice::Hash map;
|
730
|
-
while (arr.size())
|
731
|
-
{
|
732
|
-
Rice::Object k = arr.shift();
|
733
|
-
map[k] = arr.shift();
|
734
|
-
}
|
735
|
-
|
736
|
-
o = map;
|
764
|
+
if (cs >= EDN_dispatch_first_final) {
|
737
765
|
return p + 1;
|
738
766
|
}
|
739
|
-
else if (cs ==
|
767
|
+
else if (cs == EDN_dispatch_error) {
|
768
|
+
error(__FUNCTION__, *p);
|
740
769
|
return pe;
|
741
770
|
}
|
742
|
-
else if (cs ==
|
771
|
+
else if (cs == EDN_dispatch_en_main) {} // silence ragel warning
|
772
|
+
|
743
773
|
return NULL;
|
744
774
|
}
|
745
775
|
|
746
776
|
|
747
777
|
// ============================================================
|
748
|
-
//
|
778
|
+
// set parsing machine
|
749
779
|
//
|
750
780
|
%%{
|
751
|
-
machine
|
752
|
-
include
|
753
|
-
|
754
|
-
# inst = (string_delim [0-9+\-:\.TZ]* string_delim);
|
755
|
-
# uuid = (string_delim [a-f0-9\-]* string_delim);
|
781
|
+
machine EDN_set;
|
782
|
+
include EDN_sequence_common;
|
756
783
|
|
757
784
|
write data;
|
758
785
|
|
759
|
-
|
760
|
-
|
761
|
-
if (np == NULL) { fhold; fbreak; } else { fexec np; }
|
762
|
-
}
|
763
|
-
action parse_value {
|
764
|
-
const char *np = parse_value(fpc, pe, object);
|
765
|
-
if (np == NULL) { fhold; fbreak; } else { fexec np; }
|
766
|
-
}
|
767
|
-
|
768
|
-
action exit { fhold; fbreak; }
|
786
|
+
begin_set = '{';
|
787
|
+
end_set = '}';
|
769
788
|
|
770
|
-
main := (
|
789
|
+
main := begin_set (
|
790
|
+
ignore* sequence? :>> end_set
|
791
|
+
) @err(close_err) @exit;
|
771
792
|
}%%
|
772
793
|
|
773
|
-
|
774
|
-
|
794
|
+
//
|
795
|
+
// set parsing
|
796
|
+
//
|
797
|
+
const char* edn::Parser::parse_set(const char *p, const char *pe, Rice::Object& o)
|
775
798
|
{
|
776
|
-
|
777
|
-
std::string sym_name;
|
778
|
-
Rice::Object object;
|
799
|
+
static const char* EDN_TYPE = "set";
|
779
800
|
|
780
801
|
int cs;
|
802
|
+
Rice::Array elems; // stored as a vector
|
781
803
|
|
782
804
|
%% write init;
|
783
805
|
%% write exec;
|
784
806
|
|
785
|
-
if (cs >=
|
786
|
-
//
|
787
|
-
o = Parser::
|
807
|
+
if (cs >= EDN_set_first_final) {
|
808
|
+
// all elements collected; now convert to a set
|
809
|
+
o = Parser::make_ruby_set(elems);
|
788
810
|
return p + 1;
|
789
811
|
}
|
790
|
-
else if (cs ==
|
812
|
+
else if (cs == EDN_set_error) {
|
813
|
+
error(__FUNCTION__, *p);
|
791
814
|
return pe;
|
792
815
|
}
|
793
|
-
else if (cs ==
|
816
|
+
else if (cs == EDN_set_en_main) {} // silence ragel warning
|
794
817
|
return NULL;
|
795
818
|
}
|
796
819
|
|
797
820
|
|
821
|
+
|
798
822
|
// ============================================================
|
799
|
-
// discard
|
823
|
+
// discard - consume the discard token and parse the next value to
|
824
|
+
// discard. TODO: perhaps optimize this so no object data is built
|
825
|
+
// by defining a new machine(s) to consume items within container
|
826
|
+
// delimiters
|
800
827
|
//
|
801
828
|
%%{
|
802
829
|
machine EDN_discard;
|
@@ -808,12 +835,17 @@ const char* edn::Parser::parse_tagged(const char *p, const char *pe, Rice::Objec
|
|
808
835
|
|
809
836
|
action discard_value {
|
810
837
|
const char *np = parse_value(fpc, pe, o);
|
811
|
-
if (np
|
838
|
+
if (np) {
|
839
|
+
// this token is to be discarded, so store it in the
|
840
|
+
// discard stack - we really don't need to save it so this
|
841
|
+
// could be simplified
|
842
|
+
discard.push(o);
|
843
|
+
fexec np;
|
844
|
+
} else {
|
845
|
+
fhold; fbreak;
|
846
|
+
}
|
812
847
|
}
|
813
848
|
|
814
|
-
action exit {
|
815
|
-
fhold; fbreak;
|
816
|
-
}
|
817
849
|
|
818
850
|
main := begin_discard ignore* (
|
819
851
|
begin_value >discard_value
|
@@ -823,7 +855,6 @@ const char* edn::Parser::parse_tagged(const char *p, const char *pe, Rice::Objec
|
|
823
855
|
|
824
856
|
const char* edn::Parser::parse_discard(const char *p, const char *pe)
|
825
857
|
{
|
826
|
-
//std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
|
827
858
|
int cs;
|
828
859
|
Rice::Object o;
|
829
860
|
|
@@ -845,59 +876,77 @@ const char* edn::Parser::parse_discard(const char *p, const char *pe)
|
|
845
876
|
|
846
877
|
|
847
878
|
// ============================================================
|
848
|
-
//
|
879
|
+
// tagged element parsing - #uuid, #inst, #user/tag (sets, "#{", are routed to parse_set by the dispatch machine)
|
880
|
+
//
|
881
|
+
// Current implementation expects a symbol followed by a value to
|
882
|
+
// match it against and does not check validity of uuid or rfc3339
|
883
|
+
// date characters.
|
884
|
+
//
|
885
|
+
// TODO:
|
886
|
+
// 1. need to check if we must support discard shenanigans such as
|
887
|
+
//
|
888
|
+
// #symbol #_ discard data
|
889
|
+
//
|
890
|
+
// 2. add parse checks for uuid and inst for better error reporting
|
849
891
|
//
|
850
892
|
%%{
|
851
|
-
machine
|
893
|
+
machine EDN_tagged;
|
852
894
|
include EDN_common;
|
853
895
|
|
854
|
-
|
896
|
+
# inst = (string_delim [0-9+\-:\.TZ]* string_delim);
|
897
|
+
# uuid = (string_delim [a-f0-9\-]* string_delim);
|
855
898
|
|
856
|
-
|
857
|
-
const char *np = parse_discard(fpc, pe);
|
858
|
-
if (np == NULL) { fhold; fbreak; } else fexec np;
|
859
|
-
}
|
899
|
+
write data;
|
860
900
|
|
861
|
-
action
|
862
|
-
|
863
|
-
|
901
|
+
action parse_symbol {
|
902
|
+
// parses the symbol portion of the pair
|
903
|
+
const char *np = parse_symbol(fpc, pe, sym_name);
|
904
|
+
if (np == NULL) { fhold; fbreak; } else { fexec np; }
|
864
905
|
}
|
865
|
-
|
866
|
-
|
867
|
-
const char *np =
|
868
|
-
if (np == NULL) { fhold; fbreak; } else fexec np;
|
906
|
+
action parse_value {
|
907
|
+
// parses the value portion
|
908
|
+
const char *np = parse_value(fpc, pe, data);
|
909
|
+
if (np == NULL) { fhold; fbreak; } else { fexec np; }
|
869
910
|
}
|
870
911
|
|
871
|
-
action exit { fhold; fbreak; }
|
872
912
|
|
873
|
-
main := (
|
874
|
-
('_' >parse_discard) |
|
875
|
-
'{' >parse_set |
|
876
|
-
alpha >parse_tagged
|
877
|
-
) @exit;
|
913
|
+
main := (symbol >parse_symbol ignore* begin_value >parse_value) @exit;
|
878
914
|
}%%
|
879
915
|
|
880
916
|
|
881
|
-
const char* edn::Parser::
|
917
|
+
const char* edn::Parser::parse_tagged(const char *p, const char *pe, Rice::Object& o)
|
882
918
|
{
|
883
|
-
|
919
|
+
std::string sym_name;
|
920
|
+
Rice::Object data;
|
921
|
+
|
884
922
|
int cs;
|
885
923
|
|
886
924
|
%% write init;
|
887
925
|
%% write exec;
|
888
926
|
|
889
|
-
if (cs >=
|
927
|
+
if (cs >= EDN_tagged_first_final) {
|
928
|
+
//std::cerr << __FUNCTION__ << " parse symbol name as '" << sym_name << "', value is: " << data << std::endl;
|
929
|
+
|
930
|
+
try {
|
931
|
+
// tagged_element makes a call to ruby which may throw an
|
932
|
+
// exception when parsing the data
|
933
|
+
o = Parser::tagged_element(sym_name, data);
|
934
|
+
} catch (Rice::Exception& e) {
|
935
|
+
error(__FUNCTION__, e.message().str());
|
936
|
+
return pe;
|
937
|
+
}
|
890
938
|
return p + 1;
|
891
939
|
}
|
892
|
-
else if (cs ==
|
893
|
-
error(__FUNCTION__, *p);
|
940
|
+
else if (cs == EDN_tagged_error) {
|
894
941
|
return pe;
|
895
942
|
}
|
896
|
-
else if (cs ==
|
897
|
-
|
943
|
+
else if (cs == EDN_tagged_en_main) {} // silence ragel warning
|
898
944
|
return NULL;
|
899
945
|
}
|
900
946
|
|
947
|
+
|
948
|
+
|
949
|
+
|
901
950
|
// ============================================================
|
902
951
|
// main parsing machine
|
903
952
|
//
|
@@ -916,13 +965,12 @@ const char* edn::Parser::parse_dispatch(const char *p, const char *pe, Rice::Obj
|
|
916
965
|
next_element = ignore* element;
|
917
966
|
sequence = ((element ignore*) (next_element ignore*)*);
|
918
967
|
|
919
|
-
# TODO: check this. Using a sequence to handle cases with a discard
|
920
968
|
main := ignore* sequence? ignore*;
|
921
969
|
}%%
|
922
970
|
|
923
971
|
//
|
924
|
-
//
|
925
|
-
//
|
972
|
+
// TODO: Currently using a sequence to handle cases with a discard
|
973
|
+
// but EDN's Reader allows token by token parsing
|
926
974
|
Rice::Object edn::Parser::parse(const char* buf, std::size_t len)
|
927
975
|
{
|
928
976
|
int cs;
|