edn_turbo 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,7 @@
5
5
  #include <rice/Hash.hpp>
6
6
  #include <rice/Array.hpp>
7
7
  #include <rice/to_from_ruby.hpp>
8
+ #include <rice/Exception.hpp>
8
9
 
9
10
  #include "edn_parser.h"
10
11
 
@@ -49,16 +50,19 @@
49
50
  exp = ([Ee] [+\-]? digit+);
50
51
 
51
52
 
53
+ # common actions
52
54
  action close_err {
53
55
  std::stringstream s;
54
56
  s << "unterminated " << EDN_TYPE;
55
57
  error(__FUNCTION__, s.str());
56
58
  fhold; fbreak;
57
59
  }
60
+
61
+ action exit { fhold; fbreak; }
58
62
  }%%
59
63
 
60
64
  // ============================================================
61
- // machine for parsing various types
65
+ // machine for parsing various EDN token types
62
66
  //
63
67
 
64
68
  %%{
@@ -67,46 +71,20 @@
67
71
 
68
72
  write data;
69
73
 
70
- action parse_dispatch {
71
- const char *np = parse_dispatch(fpc + 1, pe, o);
72
- if (np == NULL) { fhold; fbreak; } else fexec np;
73
- }
74
-
75
- action parse_char {
76
- const char *np = parse_esc_char(fpc, pe, o);
77
- if (np == NULL) { fhold; fbreak; } else fexec np;
78
- }
79
-
80
74
  action parse_string {
75
+ // string types within double-quotes
81
76
  const char *np = parse_string(fpc, pe, o);
82
77
  if (np == NULL) { fhold; fbreak; } else fexec np;
83
78
  }
84
79
 
85
80
  action parse_keyword {
81
+ // tokens with a leading ':'
86
82
  const char *np = parse_keyword(fpc, pe, o);
87
83
  if (np == NULL) { fhold; fbreak; } else fexec np;
88
84
  }
89
85
 
90
- action parse_operator {
91
- const char *np = parse_operator(fpc, pe, o);
92
- if (np == NULL) { fhold; fbreak; } else fexec np;
93
- }
94
-
95
- action parse_symbol {
96
- std::string sym;
97
- const char *np = parse_symbol(fpc, pe, sym);
98
- if (np == NULL) { fhold; fbreak; } else {
99
- if (sym == "true") { o = Qtrue; }
100
- else if (sym == "false") { o = Qfalse; }
101
- else if (sym == "nil") { o = Qnil; }
102
- else {
103
- o = Parser::make_edn_symbol(sym);
104
- }
105
- fexec np;
106
- }
107
- }
108
-
109
86
  action parse_number {
87
+ // tokens w/ leading digits: non-negative integers & decimals.
110
88
  // try to parse a decimal first
111
89
  const char *np = parse_decimal(fpc, pe, o);
112
90
  if (np == NULL) {
@@ -125,41 +103,75 @@
125
103
  }
126
104
  }
127
105
 
106
+ action parse_operator {
107
+ // stand-alone operators *, +, -, etc.
108
+ const char *np = parse_operator(fpc, pe, o);
109
+ if (np == NULL) { fhold; fbreak; } else fexec np;
110
+ }
111
+
112
+ action parse_char {
113
+ // tokens w/ leading \ (escaped characters \newline, \c, etc.)
114
+ const char *np = parse_esc_char(fpc, pe, o);
115
+ if (np == NULL) { fhold; fbreak; } else fexec np;
116
+ }
117
+
118
+ action parse_symbol {
119
+ // user identifiers and reserved keywords (true, false, nil)
120
+ std::string sym;
121
+ const char *np = parse_symbol(fpc, pe, sym);
122
+ if (np == NULL) { fhold; fbreak; } else {
123
+ if (sym == "true") { o = Qtrue; }
124
+ else if (sym == "false") { o = Qfalse; }
125
+ else if (sym == "nil") { o = Qnil; }
126
+ else {
127
+ o = Parser::make_edn_symbol(sym);
128
+ }
129
+ fexec np;
130
+ }
131
+ }
132
+
128
133
  action parse_vector {
134
+ // [
129
135
  const char *np = parse_vector(fpc, pe, o);
130
136
  if (np == NULL) { fhold; fbreak; } else fexec np;
131
137
  }
132
138
 
133
139
  action parse_list {
140
+ // (
134
141
  const char *np = parse_list(fpc, pe, o);
135
142
  if (np == NULL) { fhold; fbreak; } else fexec np;
136
143
  }
137
144
 
138
145
  action parse_map {
146
+ // {
139
147
  const char *np = parse_map(fpc, pe, o);
140
148
  if (np == NULL) { fhold; fbreak; } else fexec np;
141
149
  }
142
150
 
143
- action exit { fhold; fbreak; }
151
+ action parse_dispatch {
152
+ // handles tokens w/ leading # ("#_", "#{", and tagged elems)
153
+ const char *np = parse_dispatch(fpc + 1, pe, o);
154
+ if (np == NULL) { fhold; fbreak; } else fexec np;
155
+ }
156
+
144
157
 
145
158
  main := (
146
- begin_dispatch >parse_dispatch |
147
- begin_char >parse_char |
148
159
  string_delim >parse_string |
149
160
  begin_keyword >parse_keyword |
161
+ begin_number >parse_number |
150
162
  operators >parse_operator |
163
+ begin_char >parse_char |
151
164
  begin_symbol >parse_symbol |
152
- begin_number >parse_number |
153
165
  begin_vector >parse_vector |
154
166
  begin_list >parse_list |
155
- begin_map >parse_map
167
+ begin_map >parse_map |
168
+ begin_dispatch >parse_dispatch
156
169
  ) %*exit;
157
170
  }%%
158
171
 
159
172
 
160
173
  const char *edn::Parser::parse_value(const char *p, const char *pe, Rice::Object& o)
161
174
  {
162
- //std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
163
175
  int cs;
164
176
 
165
177
  %% write init;
@@ -179,295 +191,279 @@ const char *edn::Parser::parse_value(const char *p, const char *pe, Rice::Object
179
191
 
180
192
 
181
193
  // ============================================================
182
- // operator parsing
194
+ // string parsing - incoming string is raw so interpreting utf
195
+ // encodings & unicode values might be necessary. To optimize things a
196
+ // bit, we mark the string for encoding if anything outside of the
197
+ // ascii range is found.
183
198
  //
184
199
  %%{
185
- machine EDN_operator;
200
+ machine EDN_string;
186
201
  include EDN_common;
187
202
 
188
203
  write data;
189
204
 
190
- action parse_symbol {
191
- // parse a symbol including the leading operator (-, +, .)
192
- std::string sym;
193
- const char *np = parse_symbol(p_save, pe, sym);
194
- if (np == NULL) { fhold; fbreak; } else {
195
- o = Parser::make_edn_symbol(sym);
196
- fexec np;
197
- }
198
- }
199
-
200
- action parse_number {
201
- // parse a number with the leading symbol - this is slightly
202
- // different than the one within EDN_value since it includes
203
- // the leading - or +
204
- //
205
- // try to parse a decimal first
206
- const char *np = parse_decimal(p_save, pe, o);
207
- if (np == NULL) {
208
- // if we can't, try to parse it as an int
209
- np = parse_integer(p_save, pe, o);
210
- }
211
-
212
- if (np) {
213
- fexec np;
214
- fhold;
215
- fbreak;
216
- }
217
- else {
218
- error(__FUNCTION__, *p);
219
- fexec pe;
205
+ action parse_string {
206
+ if (Parser::parse_byte_stream(p_save + 1, p, s, encode)) {
207
+ fexec p + 1;
208
+ } else {
209
+ fhold; fbreak;
220
210
  }
221
211
  }
222
212
 
223
- action parse_operator {
224
- // stand-alone operators (-, +, /, ... etc)
225
- std::string sym;
226
- sym += *(fpc - 1);
227
- o = Parser::make_edn_symbol(sym);
213
+ action mark_for_encoding {
214
+ encode = true;
228
215
  }
229
216
 
230
- action exit { fhold; fbreak; }
231
-
232
- main := (
233
- ('-'|'+'|'.') alpha >parse_symbol |
234
- ('-'|'+') begin_number >parse_number |
235
- operators ignore* >parse_operator
236
- ) ^(operators|alpha|digit)? @exit;
217
+ main := string_delim (
218
+ (^([\"\\] | 0..0x1f | 0xc2..0xf5) |
219
+ ((0xc2..0xf5) |
220
+ '\\'[\"\\/bfnrt] |
221
+ '\\u'[0-9a-fA-F]{4}) $mark_for_encoding |
222
+ '\\'^([\"\\/bfnrtu]|0..0x1f))* %parse_string
223
+ ) :>> string_delim @err(close_err) @exit;
237
224
  }%%
238
225
 
239
226
 
240
- const char* edn::Parser::parse_operator(const char *p, const char *pe, Rice::Object& o)
227
+ const char* edn::Parser::parse_string(const char *p, const char *pe, Rice::Object& o)
241
228
  {
242
229
  // std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
230
+ static const char* EDN_TYPE = "string";
243
231
  int cs;
244
- std::string op;
232
+ bool encode = false;
233
+ const char *eof = pe;
234
+ Rice::String s;
245
235
 
246
236
  %% write init;
247
237
  p_save = p;
248
238
  %% write exec;
249
239
 
250
- if (cs >= EDN_operator_first_final) {
251
- return p;
240
+ if (cs >= EDN_string_first_final) {
241
+ o = s;
242
+ return p + 1;
252
243
  }
253
- else if (cs == EDN_operator_error) {
254
- error(__FUNCTION__, *p);
244
+ else if (cs == EDN_string_error) {
255
245
  return pe;
256
246
  }
257
- else if (cs == EDN_operator_en_main) {} // silence ragel warning
247
+ else if (cs == EDN_string_en_main) {} // silence ragel warning
258
248
  return NULL;
259
249
  }
260
250
 
261
251
 
262
252
 
263
253
  // ============================================================
264
- // escaped char parsing
254
+ // keyword parsing
265
255
  //
266
256
  %%{
267
- machine EDN_escaped_char;
257
+ machine EDN_keyword;
268
258
  include EDN_common;
269
259
 
270
- write data;
260
+ keyword_chars = symbol_chars | operators;
261
+ keyword_start = symbol_start | [\#\./];
271
262
 
272
- valid_chars = alpha;
263
+ keyword_name = keyword_start (keyword_chars)*;
273
264
 
274
- action exit { fhold; fbreak; }
265
+ write data;
275
266
 
276
- main := (
277
- begin_char valid_chars+ ignore*
278
- ) (^(valid_chars | '\\')? @exit);
267
+
268
+ main := begin_keyword keyword_name (^keyword_chars? @exit);
279
269
  }%%
280
270
 
281
271
 
282
- const char* edn::Parser::parse_esc_char(const char *p, const char *pe, Rice::Object& o)
272
+ const char* edn::Parser::parse_keyword(const char *p, const char *pe, Rice::Object& o)
283
273
  {
284
- //std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
285
274
  int cs;
286
275
 
287
276
  %% write init;
288
277
  p_save = p;
289
278
  %% write exec;
290
279
 
291
- if (cs >= EDN_escaped_char_first_final) {
292
- if (!Parser::parse_escaped_char(p_save + 1, p, o)) {
293
- return pe;
294
- }
280
+ if (cs >= EDN_keyword_first_final) {
281
+ std::string buf;
282
+ uint32_t len = p - p_save;
283
+ // don't include leading ':' because Rice::Symbol will handle it
284
+ buf.append(p_save + 1, len - 1);
285
+ o = Rice::Symbol(buf);
295
286
  return p;
296
287
  }
297
- else if (cs == EDN_escaped_char_error) {
288
+ else if (cs == EDN_keyword_error) {
298
289
  error(__FUNCTION__, *p);
299
290
  return pe;
300
291
  }
301
- else if (cs == EDN_escaped_char_en_main) {} // silence ragel warning
292
+ else if (cs == EDN_keyword_en_main) {} // silence ragel warning
302
293
  return NULL;
303
294
  }
304
295
 
305
296
 
306
297
 
307
-
308
298
  // ============================================================
309
- // symbol parsing
299
+ // decimal parsing machine
310
300
  //
311
301
  %%{
312
- machine EDN_symbol;
302
+ machine EDN_decimal;
313
303
  include EDN_common;
314
304
 
315
- write data;
305
+ write data noerror;
316
306
 
317
- action exit { fhold; fbreak; }
318
307
 
319
- main := (
320
- operators? symbol |
321
- operators
322
- ) ignore* (^(symbol_chars | operators)? @exit);
308
+ main := ('-'|'+')? (
309
+ (integer '.' digit* (exp? [M]?)) |
310
+ (integer exp)
311
+ ) (^[0-9Ee.+\-M]? @exit );
323
312
  }%%
324
313
 
325
314
 
326
- const char* edn::Parser::parse_symbol(const char *p, const char *pe, std::string& s)
315
+ const char* edn::Parser::parse_decimal(const char *p, const char *pe, Rice::Object& o)
327
316
  {
328
- // std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
329
317
  int cs;
330
318
 
331
319
  %% write init;
332
320
  p_save = p;
333
321
  %% write exec;
334
322
 
335
- if (cs >= EDN_symbol_first_final) {
336
- uint32_t len = p - p_save;
337
- std::string buf;
338
- buf.append(p_save, len);
339
-
340
- s = buf;
341
- return p;
342
- }
343
- else if (cs == EDN_symbol_error) {
344
- error(__FUNCTION__, *p);
345
- return pe;
323
+ if (cs >= EDN_decimal_first_final) {
324
+ o = Parser::float_to_ruby(p_save, p - p_save);
325
+ return p + 1;
346
326
  }
347
- else if (cs == EDN_symbol_en_main) {} // silence ragel warning
327
+ else if (cs == EDN_decimal_en_main) {} // silence ragel warning
348
328
  return NULL;
349
329
  }
350
330
 
351
331
 
352
-
353
-
354
332
  // ============================================================
355
- // keyword parsing
333
+ // integer parsing machine
356
334
  //
357
335
  %%{
358
- machine EDN_keyword;
336
+ machine EDN_integer;
359
337
  include EDN_common;
360
338
 
361
- keyword_chars = symbol_chars | operators;
362
- keyword_start = symbol_start | [\#\./];
363
-
364
- keyword_name = keyword_start (keyword_chars)*;
365
-
366
- write data;
339
+ write data noerror;
367
340
 
368
- action exit { fhold; fbreak; }
369
341
 
370
- main := begin_keyword keyword_name (^keyword_chars? @exit);
342
+ main := (
343
+ ('-'|'+')? (integer [MN]?)
344
+ ) (^[0-9MN+\-]? @exit);
371
345
  }%%
372
346
 
373
-
374
- const char* edn::Parser::parse_keyword(const char *p, const char *pe, Rice::Object& o)
347
+ const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Object& o)
375
348
  {
376
349
  int cs;
377
- // std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
378
350
 
379
351
  %% write init;
380
352
  p_save = p;
381
353
  %% write exec;
382
354
 
383
- if (cs >= EDN_keyword_first_final) {
384
- uint32_t len = p - p_save - 1; // don't include leading ':' because Rice::Symbol will handle it
385
- std::string buf;
386
- buf.append(p_save + 1, len);
387
- o = Rice::Symbol(buf);
388
- return p;
389
- }
390
- else if (cs == EDN_keyword_error) {
391
- error(__FUNCTION__, *p);
392
- return pe;
355
+ if (cs >= EDN_integer_first_final) {
356
+ o = Parser::integer_to_ruby(p_save, p - p_save);
357
+ return p + 1;
393
358
  }
394
- else if (cs == EDN_keyword_en_main) {} // silence ragel warning
359
+ else if (cs == EDN_integer_en_main) {} // silence ragel warning
395
360
  return NULL;
396
361
  }
397
362
 
398
363
 
399
364
 
400
365
  // ============================================================
401
- // string parsing
366
+ // operator parsing - handles tokens w/ a leading operator:
367
+ //
368
+ // 1. symbols w/ leading operator: -something, .somethingelse
369
+ // 2. number values w/ leading - or +
370
+ // 3. stand-alone operators: +, -, /, *, etc.
402
371
  //
403
372
  %%{
404
- machine EDN_string;
373
+ machine EDN_operator;
405
374
  include EDN_common;
406
375
 
407
376
  write data;
408
377
 
409
- action parse_string {
410
- if (!Parser::parse_byte_stream(p_save + 1, p, s)) {
378
+ action parse_symbol {
379
+ // parse a symbol including the leading operator (-, +, .)
380
+ std::string sym;
381
+ const char *np = parse_symbol(p_save, pe, sym);
382
+ if (np == NULL) { fhold; fbreak; } else {
383
+ o = Parser::make_edn_symbol(sym);
384
+ fexec np;
385
+ }
386
+ }
387
+
388
+ action parse_number {
389
+ // parse a number with the leading symbol - this is slightly
390
+ // different than the one within EDN_value since it includes
391
+ // the leading - or +
392
+ //
393
+ // try to parse a decimal first
394
+ const char *np = parse_decimal(p_save, pe, o);
395
+ if (np == NULL) {
396
+ // if we can't, try to parse it as an int
397
+ np = parse_integer(p_save, pe, o);
398
+ }
399
+
400
+ if (np) {
401
+ fexec np;
411
402
  fhold;
412
403
  fbreak;
413
- } else {
414
- fexec p + 1;
415
404
  }
405
+ else {
406
+ error(__FUNCTION__, *p);
407
+ fexec pe;
408
+ }
409
+ }
410
+
411
+ action parse_operator {
412
+ // stand-alone operators (-, +, /, ... etc)
413
+ std::string sym;
414
+ sym += *(p_save);
415
+ o = Parser::make_edn_symbol(sym);
416
416
  }
417
417
 
418
- action exit { fhold; fbreak; }
419
418
 
420
- main := string_delim (
421
- (^([\"\\] | 0..0x1f) |
422
- '\\'[\"\\/bfnrt] |
423
- '\\u'[0-9a-fA-F]{4} |
424
- '\\'^([\"\\/bfnrtu]|0..0x1f))* %parse_string
425
- ) :>> string_delim @err(close_err) @exit;
419
+ main := (
420
+ ('-'|'+'|'.') alpha >parse_symbol |
421
+ ('-'|'+') begin_number >parse_number |
422
+ operators ignore* >parse_operator
423
+ ) ^(operators|alpha|digit)? @exit;
426
424
  }%%
427
425
 
428
426
 
429
- const char* edn::Parser::parse_string(const char *p, const char *pe, Rice::Object& o)
427
+ const char* edn::Parser::parse_operator(const char *p, const char *pe, Rice::Object& o)
430
428
  {
431
- // std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
432
- static const char* EDN_TYPE = "string";
433
429
  int cs;
434
- const char *eof = pe;
435
430
 
436
- Rice::String s;
437
431
  %% write init;
438
432
  p_save = p;
439
433
  %% write exec;
440
434
 
441
- if (cs >= EDN_string_first_final) {
442
- o = s;
443
- return p + 1;
435
+ if (cs >= EDN_operator_first_final) {
436
+ return p;
444
437
  }
445
- else if (cs == EDN_string_error) {
438
+ else if (cs == EDN_operator_error) {
439
+ error(__FUNCTION__, *p);
446
440
  return pe;
447
441
  }
448
- else if (cs == EDN_string_en_main) {} // silence ragel warning
442
+ else if (cs == EDN_operator_en_main) {} // silence ragel warning
449
443
  return NULL;
450
444
  }
451
445
 
446
+
447
+
452
448
  // ============================================================
453
- // decimal parsing grammar
449
+ // escaped char parsing - handles \c, \newline, \formfeed, etc.
454
450
  //
455
451
  %%{
456
- machine EDN_decimal;
452
+ machine EDN_escaped_char;
457
453
  include EDN_common;
458
454
 
459
- write data noerror;
455
+ write data;
456
+
457
+ valid_chars = alpha;
460
458
 
461
- action exit { fhold; fbreak; }
462
459
 
463
- main := ('-'|'+')? (
464
- (integer '.' digit* (exp? [M]?)) |
465
- (integer exp)
466
- ) (^[0-9Ee.+\-M]? @exit );
460
+ main := (
461
+ begin_char valid_chars+ ignore*
462
+ ) (^(valid_chars | '\\')? @exit);
467
463
  }%%
468
464
 
469
465
 
470
- const char* edn::Parser::parse_decimal(const char *p, const char *pe, Rice::Object& o)
466
+ const char* edn::Parser::parse_esc_char(const char *p, const char *pe, Rice::Object& o)
471
467
  {
472
468
  int cs;
473
469
 
@@ -475,32 +471,43 @@ const char* edn::Parser::parse_decimal(const char *p, const char *pe, Rice::Obje
475
471
  p_save = p;
476
472
  %% write exec;
477
473
 
478
- if (cs >= EDN_decimal_first_final) {
479
- o = Parser::float_to_ruby(p_save, p - p_save);
480
- return p + 1;
474
+ if (cs >= EDN_escaped_char_first_final) {
475
+ // convert the escaped value to a character
476
+ if (!Parser::parse_escaped_char(p_save + 1, p, o)) {
477
+ return pe;
478
+ }
479
+ return p;
481
480
  }
482
- else if (cs == EDN_decimal_en_main) {} // silence ragel warning
481
+ else if (cs == EDN_escaped_char_error) {
482
+ error(__FUNCTION__, *p);
483
+ return pe;
484
+ }
485
+ else if (cs == EDN_escaped_char_en_main) {} // silence ragel warning
483
486
  return NULL;
484
487
  }
485
488
 
486
489
 
490
+
491
+
487
492
  // ============================================================
488
- // integer parsing grammar
493
+ // symbol parsing - handles identifiers that begin with an alpha
494
+ // character and an optional leading operator (name, -today,
495
+ // .yesterday)
489
496
  //
490
497
  %%{
491
- machine EDN_integer;
498
+ machine EDN_symbol;
492
499
  include EDN_common;
493
500
 
494
- write data noerror;
501
+ write data;
495
502
 
496
- action exit { fhold; fbreak; }
497
503
 
498
504
  main := (
499
- ('-'|'+')? (integer [MN]?)
500
- ) (^[0-9MN+\-]? @exit);
505
+ operators? symbol
506
+ ) ignore* (^(symbol_chars | operators)? @exit);
501
507
  }%%
502
508
 
503
- const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Object& o)
509
+
510
+ const char* edn::Parser::parse_symbol(const char *p, const char *pe, std::string& sym)
504
511
  {
505
512
  int cs;
506
513
 
@@ -508,15 +515,22 @@ const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Obje
508
515
  p_save = p;
509
516
  %% write exec;
510
517
 
511
- if (cs >= EDN_integer_first_final) {
512
- o = Parser::integer_to_ruby(p_save, p - p_save);
513
- return p + 1;
518
+ if (cs >= EDN_symbol_first_final) {
519
+ // copy the symbol text
520
+ sym.clear();
521
+ sym.append(p_save, p - p_save);
522
+ return p;
514
523
  }
515
- else if (cs == EDN_integer_en_main) {} // silence ragel warning
524
+ else if (cs == EDN_symbol_error) {
525
+ error(__FUNCTION__, *p);
526
+ return pe;
527
+ }
528
+ else if (cs == EDN_symbol_en_main) {} // silence ragel warning
516
529
  return NULL;
517
530
  }
518
531
 
519
532
 
533
+
520
534
  // ============================================================
521
535
  // EDN_sequence_common is used to parse EDN containers - elements are
522
536
  // initially stored in a rice array and then the final corresponding
@@ -527,30 +541,34 @@ const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Obje
527
541
  machine EDN_sequence_common;
528
542
  include EDN_common;
529
543
 
530
- action parse_value {
531
- Rice::Object v;
532
- const char *np = parse_value(fpc, pe, v);
544
+ action parse_item {
545
+ // reads an item within a sequence (vector, list, map, or
546
+ // set). Regardless of the sequence type, an array of the
547
+ // items is built. Once done, the sequence parser will convert
548
+ // if needed
549
+ Rice::Object e;
550
+ const char *np = parse_value(fpc, pe, e);
533
551
  if (np == NULL) {
534
552
  fhold; fbreak;
535
553
  } else {
536
554
  // if there's an entry in the discard list, the current
537
- // object is not meant to be kept
555
+ // object is not meant to be kept due to a #_ so don't
556
+ // push it into the list of elements
538
557
  if (!discard.empty()) {
539
558
  discard.pop();
540
559
  }
541
560
  else {
542
- // otherwise we add it to the sequence
543
- arr.push(v);
561
+ // otherwise we add it to the list of elements for the
562
+ // corresponding container
563
+ elems.push(e);
544
564
  }
545
565
  fexec np;
546
566
  }
547
567
  }
548
568
 
549
- element = begin_value >parse_value;
569
+ element = begin_value >parse_item;
550
570
  next_element = ignore* element;
551
571
  sequence = ((element ignore*) (next_element ignore*)*);
552
-
553
- action exit { fhold; fbreak; }
554
572
  }%%
555
573
 
556
574
  //
@@ -575,17 +593,16 @@ const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Obje
575
593
  //
576
594
  const char* edn::Parser::parse_vector(const char *p, const char *pe, Rice::Object& o)
577
595
  {
578
- //std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
579
596
  static const char* EDN_TYPE = "vector";
580
597
 
581
598
  int cs;
582
- Rice::Array arr;
599
+ Rice::Array elems; // will store the vector's elements
583
600
 
584
601
  %% write init;
585
602
  %% write exec;
586
603
 
587
604
  if (cs >= EDN_vector_first_final) {
588
- o = arr;
605
+ o = elems;
589
606
  return p + 1;
590
607
  }
591
608
  else if (cs == EDN_vector_error) {
@@ -622,13 +639,13 @@ const char* edn::Parser::parse_list(const char *p, const char *pe, Rice::Object&
622
639
  static const char* EDN_TYPE = "list";
623
640
 
624
641
  int cs;
625
- Rice::Array arr;
642
+ Rice::Array elems;
626
643
 
627
644
  %% write init;
628
645
  %% write exec;
629
646
 
630
647
  if (cs >= EDN_list_first_final) {
631
- o = arr;
648
+ o = elems;
632
649
  return p + 1;
633
650
  }
634
651
  else if (cs == EDN_list_error) {
@@ -642,161 +659,171 @@ const char* edn::Parser::parse_list(const char *p, const char *pe, Rice::Object&
642
659
 
643
660
 
644
661
  // ============================================================
645
- // set parsing machine
662
+ // hash parsing
646
663
  //
647
664
  %%{
648
- machine EDN_set;
665
+ machine EDN_map;
649
666
  include EDN_sequence_common;
650
667
 
668
+ end_map = '}';
669
+
651
670
  write data;
652
671
 
653
- begin_set = '{';
654
- end_set = '}';
655
672
 
656
- main := begin_set (
657
- ignore* sequence? :>> end_set
673
+ main := begin_map (
674
+ ignore* (sequence)? :>> end_map
658
675
  ) @err(close_err) @exit;
659
676
  }%%
660
677
 
661
- //
662
- // set parsing
663
- //
664
- const char* edn::Parser::parse_set(const char *p, const char *pe, Rice::Object& o)
678
+
679
+ const char* edn::Parser::parse_map(const char *p, const char *pe, Rice::Object& o)
665
680
  {
666
- // std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
667
- static const char* EDN_TYPE = "set";
681
+ static const char* EDN_TYPE = "map";
668
682
 
669
683
  int cs;
670
- Rice::Array arr; // store as a vector; then convert to a set once done
684
+ // since we don't know whether we're looking at a key or value,
685
+ // initially store all elements in a list
686
+ Rice::Array elems;
671
687
 
672
688
  %% write init;
673
689
  %% write exec;
674
690
 
675
- if (cs >= EDN_set_first_final) {
676
- o = Parser::make_ruby_set(arr);
691
+ if (cs >= EDN_map_first_final) {
692
+
693
+ // hash parsing is done. Make sure we have an even count
694
+ if ((elems.size() % 2) != 0) {
695
+ error(__FUNCTION__, "odd number of elements in map");
696
+ return pe;
697
+ }
698
+
699
+ // now convert the sequence to a hash
700
+ Rice::Hash rslt;
701
+ while (elems.size())
702
+ {
703
+ Rice::Object k = elems.shift();
704
+ rslt[k] = elems.shift();
705
+ }
706
+
707
+ o = rslt;
677
708
  return p + 1;
678
709
  }
679
- else if (cs == EDN_set_error) {
680
- error(__FUNCTION__, *p);
710
+ else if (cs == EDN_map_error) {
681
711
  return pe;
682
712
  }
683
- else if (cs == EDN_set_en_main) {} // silence ragel warning
713
+ else if (cs == EDN_map_en_main) {} // silence ragel warning
684
714
  return NULL;
685
715
  }
686
716
 
687
717
 
688
718
 
689
719
  // ============================================================
690
- // hash parsing
720
+ // dispatch - handles all tokens with a leading #, then delegates to
721
+ // the corresponding machine. This machine consumes the # and passes
722
+ // the remaining data to the correct parser
691
723
  //
692
724
  %%{
693
- machine EDN_map;
694
- include EDN_sequence_common;
695
-
696
- end_map = '}';
725
+ machine EDN_dispatch;
726
+ include EDN_common;
697
727
 
698
728
  write data;
699
729
 
700
- # action to report missing value in k/v pair
701
- action pair_err {
702
- error(__FUNCTION__, "map pair not found");
703
- fexec pe;
730
+ action parse_set {
731
+ // #{ }
732
+ const char *np = parse_set(fpc, pe, o);
733
+ if (np == NULL) { fhold; fbreak; } else fexec np;
734
+ }
735
+
736
+ action parse_discard {
737
+ // discard token #_
738
+ const char *np = parse_discard(fpc, pe);
739
+ if (np == NULL) { fhold; fbreak; } else fexec np;
704
740
  }
705
741
 
706
- main := begin_map (
707
- ignore* (sequence)? :>> end_map
708
- ) @err(close_err) @exit;
742
+ action parse_tagged {
743
+ // #inst, #uuid, or #user/tag
744
+ const char *np = parse_tagged(fpc, pe, o);
745
+ if (np == NULL) { fhold; fbreak; } else fexec np;
746
+ }
747
+
748
+
749
+ main := (
750
+ ('{' >parse_set |
751
+ '_' >parse_discard |
752
+ alpha >parse_tagged)
753
+ ) @exit;
709
754
  }%%
710
755
 
711
756
 
712
- const char* edn::Parser::parse_map(const char *p, const char *pe, Rice::Object& o)
757
+ const char* edn::Parser::parse_dispatch(const char *p, const char *pe, Rice::Object& o)
713
758
  {
714
- // std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
715
- static const char* EDN_TYPE = "map";
716
- Rice::Array arr;
717
759
  int cs;
718
760
 
719
761
  %% write init;
720
762
  %% write exec;
721
763
 
722
- if (cs >= EDN_map_first_final) {
723
-
724
- if ((arr.size() % 2) != 0) {
725
- error(__FUNCTION__, "odd number of elements in map");
726
- return pe;
727
- }
728
-
729
- Rice::Hash map;
730
- while (arr.size())
731
- {
732
- Rice::Object k = arr.shift();
733
- map[k] = arr.shift();
734
- }
735
-
736
- o = map;
764
+ if (cs >= EDN_dispatch_first_final) {
737
765
  return p + 1;
738
766
  }
739
- else if (cs == EDN_map_error) {
767
+ else if (cs == EDN_dispatch_error) {
768
+ error(__FUNCTION__, *p);
740
769
  return pe;
741
770
  }
742
- else if (cs == EDN_map_en_main) {} // silence ragel warning
771
+ else if (cs == EDN_dispatch_en_main) {} // silence ragel warning
772
+
743
773
  return NULL;
744
774
  }
745
775
 
746
776
 
747
777
  // ============================================================
748
- // tagged element parsing - #uuid, #inst, #{, #_
778
+ // set parsing machine
749
779
  //
750
780
  %%{
751
- machine EDN_tagged;
752
- include EDN_common;
753
-
754
- # inst = (string_delim [0-9+\-:\.TZ]* string_delim);
755
- # uuid = (string_delim [a-f0-9\-]* string_delim);
781
+ machine EDN_set;
782
+ include EDN_sequence_common;
756
783
 
757
784
  write data;
758
785
 
759
- action parse_symbol {
760
- const char *np = parse_symbol(fpc, pe, sym_name);
761
- if (np == NULL) { fhold; fbreak; } else { fexec np; }
762
- }
763
- action parse_value {
764
- const char *np = parse_value(fpc, pe, object);
765
- if (np == NULL) { fhold; fbreak; } else { fexec np; }
766
- }
767
-
768
- action exit { fhold; fbreak; }
786
+ begin_set = '{';
787
+ end_set = '}';
769
788
 
770
- main := (symbol >parse_symbol ignore* begin_value >parse_value) @exit;
789
+ main := begin_set (
790
+ ignore* sequence? :>> end_set
791
+ ) @err(close_err) @exit;
771
792
  }%%
772
793
 
773
-
774
- const char* edn::Parser::parse_tagged(const char *p, const char *pe, Rice::Object& o)
794
+ //
795
+ // set parsing
796
+ //
797
+ const char* edn::Parser::parse_set(const char *p, const char *pe, Rice::Object& o)
775
798
  {
776
- // std::cerr << __FUNCTION__ << " p '" << p << "'" << std::endl;
777
- std::string sym_name;
778
- Rice::Object object;
799
+ static const char* EDN_TYPE = "set";
779
800
 
780
801
  int cs;
802
+ Rice::Array elems; // stored as a vector
781
803
 
782
804
  %% write init;
783
805
  %% write exec;
784
806
 
785
- if (cs >= EDN_tagged_first_final) {
786
- //std::cerr << __FUNCTION__ << " parse symbol name as '" << sym_name << "', value is: " << object << std::endl;
787
- o = Parser::tagged_element(sym_name, object);
807
+ if (cs >= EDN_set_first_final) {
808
+ // all elements collected; now convert to a set
809
+ o = Parser::make_ruby_set(elems);
788
810
  return p + 1;
789
811
  }
790
- else if (cs == EDN_tagged_error) {
812
+ else if (cs == EDN_set_error) {
813
+ error(__FUNCTION__, *p);
791
814
  return pe;
792
815
  }
793
- else if (cs == EDN_tagged_en_main) {} // silence ragel warning
816
+ else if (cs == EDN_set_en_main) {} // silence ragel warning
794
817
  return NULL;
795
818
  }
796
819
 
797
820
 
821
+
798
822
  // ============================================================
799
- // discard
823
+ // discard - consume the discard token and parse the next value to
824
+ // discard. TODO: perhaps optimize this so no object data is built
825
+ // by defining a new machine(s) to consume items within container
826
+ // delimiters
800
827
  //
801
828
  %%{
802
829
  machine EDN_discard;
@@ -808,12 +835,17 @@ const char* edn::Parser::parse_tagged(const char *p, const char *pe, Rice::Objec
808
835
 
809
836
  action discard_value {
810
837
  const char *np = parse_value(fpc, pe, o);
811
- if (np == NULL) { fhold; fbreak; } else { discard.push(o); fexec np; }
838
+ if (np) {
839
+ // this token is to be discarded, so store it in the
840
+ // discard stack - we really don't need to save it so this
841
+ // could be simplified
842
+ discard.push(o);
843
+ fexec np;
844
+ } else {
845
+ fhold; fbreak;
846
+ }
812
847
  }
813
848
 
814
- action exit {
815
- fhold; fbreak;
816
- }
817
849
 
818
850
  main := begin_discard ignore* (
819
851
  begin_value >discard_value
@@ -823,7 +855,6 @@ const char* edn::Parser::parse_tagged(const char *p, const char *pe, Rice::Objec
823
855
 
824
856
  const char* edn::Parser::parse_discard(const char *p, const char *pe)
825
857
  {
826
- //std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
827
858
  int cs;
828
859
  Rice::Object o;
829
860
 
@@ -845,59 +876,77 @@ const char* edn::Parser::parse_discard(const char *p, const char *pe)
845
876
 
846
877
 
847
878
  // ============================================================
848
- // dispatch
879
+ // tagged element parsing - #uuid, #inst, #{, #user/tag
880
+ //
881
+ // Current implementation expects a symbol followed by a value to
882
+ // match it against and does not check validity of uuid or rfc3339
883
+ // date characters.
884
+ //
885
+ // TODO:
886
+ // 1. need to check if we must support discard shenanigans such as
887
+ //
888
+ // #symbol #_ discard data
889
+ //
890
+ // 2. add parse checks for uuid and inst for better error reporting
849
891
  //
850
892
  %%{
851
- machine EDN_dispatch;
893
+ machine EDN_tagged;
852
894
  include EDN_common;
853
895
 
854
- write data;
896
+ # inst = (string_delim [0-9+\-:\.TZ]* string_delim);
897
+ # uuid = (string_delim [a-f0-9\-]* string_delim);
855
898
 
856
- action parse_discard {
857
- const char *np = parse_discard(fpc, pe);
858
- if (np == NULL) { fhold; fbreak; } else fexec np;
859
- }
899
+ write data;
860
900
 
861
- action parse_set {
862
- const char *np = parse_set(fpc, pe, o);
863
- if (np == NULL) { fhold; fbreak; } else fexec np;
901
+ action parse_symbol {
902
+ // parses the symbol portion of the pair
903
+ const char *np = parse_symbol(fpc, pe, sym_name);
904
+ if (np == NULL) { fhold; fbreak; } else { fexec np; }
864
905
  }
865
-
866
- action parse_tagged {
867
- const char *np = parse_tagged(fpc, pe, o);
868
- if (np == NULL) { fhold; fbreak; } else fexec np;
906
+ action parse_value {
907
+ // parses the value portion
908
+ const char *np = parse_value(fpc, pe, data);
909
+ if (np == NULL) { fhold; fbreak; } else { fexec np; }
869
910
  }
870
911
 
871
- action exit { fhold; fbreak; }
872
912
 
873
- main := (
874
- ('_' >parse_discard) |
875
- '{' >parse_set |
876
- alpha >parse_tagged
877
- ) @exit;
913
+ main := (symbol >parse_symbol ignore* begin_value >parse_value) @exit;
878
914
  }%%
879
915
 
880
916
 
881
- const char* edn::Parser::parse_dispatch(const char *p, const char *pe, Rice::Object& o)
917
+ const char* edn::Parser::parse_tagged(const char *p, const char *pe, Rice::Object& o)
882
918
  {
883
- // std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
919
+ std::string sym_name;
920
+ Rice::Object data;
921
+
884
922
  int cs;
885
923
 
886
924
  %% write init;
887
925
  %% write exec;
888
926
 
889
- if (cs >= EDN_dispatch_first_final) {
927
+ if (cs >= EDN_tagged_first_final) {
928
+ //std::cerr << __FUNCTION__ << " parse symbol name as '" << sym_name << "', value is: " << data << std::endl;
929
+
930
+ try {
931
+ // tagged_element makes a call to ruby which may throw an
932
+ // exception when parsing the data
933
+ o = Parser::tagged_element(sym_name, data);
934
+ } catch (Rice::Exception& e) {
935
+ error(__FUNCTION__, e.message().str());
936
+ return pe;
937
+ }
890
938
  return p + 1;
891
939
  }
892
- else if (cs == EDN_dispatch_error) {
893
- error(__FUNCTION__, *p);
940
+ else if (cs == EDN_tagged_error) {
894
941
  return pe;
895
942
  }
896
- else if (cs == EDN_dispatch_en_main) {} // silence ragel warning
897
-
943
+ else if (cs == EDN_tagged_en_main) {} // silence ragel warning
898
944
  return NULL;
899
945
  }
900
946
 
947
+
948
+
949
+
901
950
  // ============================================================
902
951
  // main parsing machine
903
952
  //
@@ -916,13 +965,12 @@ const char* edn::Parser::parse_dispatch(const char *p, const char *pe, Rice::Obj
916
965
  next_element = ignore* element;
917
966
  sequence = ((element ignore*) (next_element ignore*)*);
918
967
 
919
- # TODO: check this. Using a sequence to handle cases with a discard
920
968
  main := ignore* sequence? ignore*;
921
969
  }%%
922
970
 
923
971
  //
924
- //
925
- //
972
+ // TODO: Currently using a sequence to handle cases with a discard
973
+ // but EDN's Reader allows token by token parsing
926
974
  Rice::Object edn::Parser::parse(const char* buf, std::size_t len)
927
975
  {
928
976
  int cs;