edn_turbo 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -5,6 +5,7 @@
5
5
  #include <rice/Hash.hpp>
6
6
  #include <rice/Array.hpp>
7
7
  #include <rice/to_from_ruby.hpp>
8
+ #include <rice/Exception.hpp>
8
9
 
9
10
  #include "edn_parser.h"
10
11
 
@@ -49,16 +50,19 @@
49
50
  exp = ([Ee] [+\-]? digit+);
50
51
 
51
52
 
53
+ # common actions
52
54
  action close_err {
53
55
  std::stringstream s;
54
56
  s << "unterminated " << EDN_TYPE;
55
57
  error(__FUNCTION__, s.str());
56
58
  fhold; fbreak;
57
59
  }
60
+
61
+ action exit { fhold; fbreak; }
58
62
  }%%
59
63
 
60
64
  // ============================================================
61
- // machine for parsing various types
65
+ // machine for parsing various EDN token types
62
66
  //
63
67
 
64
68
  %%{
@@ -67,46 +71,20 @@
67
71
 
68
72
  write data;
69
73
 
70
- action parse_dispatch {
71
- const char *np = parse_dispatch(fpc + 1, pe, o);
72
- if (np == NULL) { fhold; fbreak; } else fexec np;
73
- }
74
-
75
- action parse_char {
76
- const char *np = parse_esc_char(fpc, pe, o);
77
- if (np == NULL) { fhold; fbreak; } else fexec np;
78
- }
79
-
80
74
  action parse_string {
75
+ // string types within double-quotes
81
76
  const char *np = parse_string(fpc, pe, o);
82
77
  if (np == NULL) { fhold; fbreak; } else fexec np;
83
78
  }
84
79
 
85
80
  action parse_keyword {
81
+ // tokens with a leading ':'
86
82
  const char *np = parse_keyword(fpc, pe, o);
87
83
  if (np == NULL) { fhold; fbreak; } else fexec np;
88
84
  }
89
85
 
90
- action parse_operator {
91
- const char *np = parse_operator(fpc, pe, o);
92
- if (np == NULL) { fhold; fbreak; } else fexec np;
93
- }
94
-
95
- action parse_symbol {
96
- std::string sym;
97
- const char *np = parse_symbol(fpc, pe, sym);
98
- if (np == NULL) { fhold; fbreak; } else {
99
- if (sym == "true") { o = Qtrue; }
100
- else if (sym == "false") { o = Qfalse; }
101
- else if (sym == "nil") { o = Qnil; }
102
- else {
103
- o = Parser::make_edn_symbol(sym);
104
- }
105
- fexec np;
106
- }
107
- }
108
-
109
86
  action parse_number {
87
+ // tokens w/ leading digits: non-negative integers & decimals.
110
88
  // try to parse a decimal first
111
89
  const char *np = parse_decimal(fpc, pe, o);
112
90
  if (np == NULL) {
@@ -125,41 +103,75 @@
125
103
  }
126
104
  }
127
105
 
106
+ action parse_operator {
107
+ // stand-alone operators *, +, -, etc.
108
+ const char *np = parse_operator(fpc, pe, o);
109
+ if (np == NULL) { fhold; fbreak; } else fexec np;
110
+ }
111
+
112
+ action parse_char {
113
+ // tokens w/ leading \ (escaped characters \newline, \c, etc.)
114
+ const char *np = parse_esc_char(fpc, pe, o);
115
+ if (np == NULL) { fhold; fbreak; } else fexec np;
116
+ }
117
+
118
+ action parse_symbol {
119
+ // user identifiers and reserved keywords (true, false, nil)
120
+ std::string sym;
121
+ const char *np = parse_symbol(fpc, pe, sym);
122
+ if (np == NULL) { fhold; fbreak; } else {
123
+ if (sym == "true") { o = Qtrue; }
124
+ else if (sym == "false") { o = Qfalse; }
125
+ else if (sym == "nil") { o = Qnil; }
126
+ else {
127
+ o = Parser::make_edn_symbol(sym);
128
+ }
129
+ fexec np;
130
+ }
131
+ }
132
+
128
133
  action parse_vector {
134
+ // [
129
135
  const char *np = parse_vector(fpc, pe, o);
130
136
  if (np == NULL) { fhold; fbreak; } else fexec np;
131
137
  }
132
138
 
133
139
  action parse_list {
140
+ // (
134
141
  const char *np = parse_list(fpc, pe, o);
135
142
  if (np == NULL) { fhold; fbreak; } else fexec np;
136
143
  }
137
144
 
138
145
  action parse_map {
146
+ // {
139
147
  const char *np = parse_map(fpc, pe, o);
140
148
  if (np == NULL) { fhold; fbreak; } else fexec np;
141
149
  }
142
150
 
143
- action exit { fhold; fbreak; }
151
+ action parse_dispatch {
152
+ // handles tokens w/ leading # ("#_", "#{", and tagged elems)
153
+ const char *np = parse_dispatch(fpc + 1, pe, o);
154
+ if (np == NULL) { fhold; fbreak; } else fexec np;
155
+ }
156
+
144
157
 
145
158
  main := (
146
- begin_dispatch >parse_dispatch |
147
- begin_char >parse_char |
148
159
  string_delim >parse_string |
149
160
  begin_keyword >parse_keyword |
161
+ begin_number >parse_number |
150
162
  operators >parse_operator |
163
+ begin_char >parse_char |
151
164
  begin_symbol >parse_symbol |
152
- begin_number >parse_number |
153
165
  begin_vector >parse_vector |
154
166
  begin_list >parse_list |
155
- begin_map >parse_map
167
+ begin_map >parse_map |
168
+ begin_dispatch >parse_dispatch
156
169
  ) %*exit;
157
170
  }%%
158
171
 
159
172
 
160
173
  const char *edn::Parser::parse_value(const char *p, const char *pe, Rice::Object& o)
161
174
  {
162
- //std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
163
175
  int cs;
164
176
 
165
177
  %% write init;
@@ -179,295 +191,279 @@ const char *edn::Parser::parse_value(const char *p, const char *pe, Rice::Object
179
191
 
180
192
 
181
193
  // ============================================================
182
- // operator parsing
194
+ // string parsing - incoming string is raw so interpreting utf
195
+ // encodings & unicode values might be necessary. To optimize things a
196
+ // bit, we mark the string for encoding if anything outside of the
197
+ // ascii range is found.
183
198
  //
184
199
  %%{
185
- machine EDN_operator;
200
+ machine EDN_string;
186
201
  include EDN_common;
187
202
 
188
203
  write data;
189
204
 
190
- action parse_symbol {
191
- // parse a symbol including the leading operator (-, +, .)
192
- std::string sym;
193
- const char *np = parse_symbol(p_save, pe, sym);
194
- if (np == NULL) { fhold; fbreak; } else {
195
- o = Parser::make_edn_symbol(sym);
196
- fexec np;
197
- }
198
- }
199
-
200
- action parse_number {
201
- // parse a number with the leading symbol - this is slightly
202
- // different than the one within EDN_value since it includes
203
- // the leading - or +
204
- //
205
- // try to parse a decimal first
206
- const char *np = parse_decimal(p_save, pe, o);
207
- if (np == NULL) {
208
- // if we can't, try to parse it as an int
209
- np = parse_integer(p_save, pe, o);
210
- }
211
-
212
- if (np) {
213
- fexec np;
214
- fhold;
215
- fbreak;
216
- }
217
- else {
218
- error(__FUNCTION__, *p);
219
- fexec pe;
205
+ action parse_string {
206
+ if (Parser::parse_byte_stream(p_save + 1, p, s, encode)) {
207
+ fexec p + 1;
208
+ } else {
209
+ fhold; fbreak;
220
210
  }
221
211
  }
222
212
 
223
- action parse_operator {
224
- // stand-alone operators (-, +, /, ... etc)
225
- std::string sym;
226
- sym += *(fpc - 1);
227
- o = Parser::make_edn_symbol(sym);
213
+ action mark_for_encoding {
214
+ encode = true;
228
215
  }
229
216
 
230
- action exit { fhold; fbreak; }
231
-
232
- main := (
233
- ('-'|'+'|'.') alpha >parse_symbol |
234
- ('-'|'+') begin_number >parse_number |
235
- operators ignore* >parse_operator
236
- ) ^(operators|alpha|digit)? @exit;
217
+ main := string_delim (
218
+ (^([\"\\] | 0..0x1f | 0xc2..0xf5) |
219
+ ((0xc2..0xf5) |
220
+ '\\'[\"\\/bfnrt] |
221
+ '\\u'[0-9a-fA-F]{4}) $mark_for_encoding |
222
+ '\\'^([\"\\/bfnrtu]|0..0x1f))* %parse_string
223
+ ) :>> string_delim @err(close_err) @exit;
237
224
  }%%
238
225
 
239
226
 
240
- const char* edn::Parser::parse_operator(const char *p, const char *pe, Rice::Object& o)
227
+ const char* edn::Parser::parse_string(const char *p, const char *pe, Rice::Object& o)
241
228
  {
242
229
  // std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
230
+ static const char* EDN_TYPE = "string";
243
231
  int cs;
244
- std::string op;
232
+ bool encode = false;
233
+ const char *eof = pe;
234
+ Rice::String s;
245
235
 
246
236
  %% write init;
247
237
  p_save = p;
248
238
  %% write exec;
249
239
 
250
- if (cs >= EDN_operator_first_final) {
251
- return p;
240
+ if (cs >= EDN_string_first_final) {
241
+ o = s;
242
+ return p + 1;
252
243
  }
253
- else if (cs == EDN_operator_error) {
254
- error(__FUNCTION__, *p);
244
+ else if (cs == EDN_string_error) {
255
245
  return pe;
256
246
  }
257
- else if (cs == EDN_operator_en_main) {} // silence ragel warning
247
+ else if (cs == EDN_string_en_main) {} // silence ragel warning
258
248
  return NULL;
259
249
  }
260
250
 
261
251
 
262
252
 
263
253
  // ============================================================
264
- // escaped char parsing
254
+ // keyword parsing
265
255
  //
266
256
  %%{
267
- machine EDN_escaped_char;
257
+ machine EDN_keyword;
268
258
  include EDN_common;
269
259
 
270
- write data;
260
+ keyword_chars = symbol_chars | operators;
261
+ keyword_start = symbol_start | [\#\./];
271
262
 
272
- valid_chars = alpha;
263
+ keyword_name = keyword_start (keyword_chars)*;
273
264
 
274
- action exit { fhold; fbreak; }
265
+ write data;
275
266
 
276
- main := (
277
- begin_char valid_chars+ ignore*
278
- ) (^(valid_chars | '\\')? @exit);
267
+
268
+ main := begin_keyword keyword_name (^keyword_chars? @exit);
279
269
  }%%
280
270
 
281
271
 
282
- const char* edn::Parser::parse_esc_char(const char *p, const char *pe, Rice::Object& o)
272
+ const char* edn::Parser::parse_keyword(const char *p, const char *pe, Rice::Object& o)
283
273
  {
284
- //std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
285
274
  int cs;
286
275
 
287
276
  %% write init;
288
277
  p_save = p;
289
278
  %% write exec;
290
279
 
291
- if (cs >= EDN_escaped_char_first_final) {
292
- if (!Parser::parse_escaped_char(p_save + 1, p, o)) {
293
- return pe;
294
- }
280
+ if (cs >= EDN_keyword_first_final) {
281
+ std::string buf;
282
+ uint32_t len = p - p_save;
283
+ // don't include leading ':' because Rice::Symbol will handle it
284
+ buf.append(p_save + 1, len - 1);
285
+ o = Rice::Symbol(buf);
295
286
  return p;
296
287
  }
297
- else if (cs == EDN_escaped_char_error) {
288
+ else if (cs == EDN_keyword_error) {
298
289
  error(__FUNCTION__, *p);
299
290
  return pe;
300
291
  }
301
- else if (cs == EDN_escaped_char_en_main) {} // silence ragel warning
292
+ else if (cs == EDN_keyword_en_main) {} // silence ragel warning
302
293
  return NULL;
303
294
  }
304
295
 
305
296
 
306
297
 
307
-
308
298
  // ============================================================
309
- // symbol parsing
299
+ // decimal parsing machine
310
300
  //
311
301
  %%{
312
- machine EDN_symbol;
302
+ machine EDN_decimal;
313
303
  include EDN_common;
314
304
 
315
- write data;
305
+ write data noerror;
316
306
 
317
- action exit { fhold; fbreak; }
318
307
 
319
- main := (
320
- operators? symbol |
321
- operators
322
- ) ignore* (^(symbol_chars | operators)? @exit);
308
+ main := ('-'|'+')? (
309
+ (integer '.' digit* (exp? [M]?)) |
310
+ (integer exp)
311
+ ) (^[0-9Ee.+\-M]? @exit );
323
312
  }%%
324
313
 
325
314
 
326
- const char* edn::Parser::parse_symbol(const char *p, const char *pe, std::string& s)
315
+ const char* edn::Parser::parse_decimal(const char *p, const char *pe, Rice::Object& o)
327
316
  {
328
- // std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
329
317
  int cs;
330
318
 
331
319
  %% write init;
332
320
  p_save = p;
333
321
  %% write exec;
334
322
 
335
- if (cs >= EDN_symbol_first_final) {
336
- uint32_t len = p - p_save;
337
- std::string buf;
338
- buf.append(p_save, len);
339
-
340
- s = buf;
341
- return p;
342
- }
343
- else if (cs == EDN_symbol_error) {
344
- error(__FUNCTION__, *p);
345
- return pe;
323
+ if (cs >= EDN_decimal_first_final) {
324
+ o = Parser::float_to_ruby(p_save, p - p_save);
325
+ return p + 1;
346
326
  }
347
- else if (cs == EDN_symbol_en_main) {} // silence ragel warning
327
+ else if (cs == EDN_decimal_en_main) {} // silence ragel warning
348
328
  return NULL;
349
329
  }
350
330
 
351
331
 
352
-
353
-
354
332
  // ============================================================
355
- // keyword parsing
333
+ // integer parsing machine
356
334
  //
357
335
  %%{
358
- machine EDN_keyword;
336
+ machine EDN_integer;
359
337
  include EDN_common;
360
338
 
361
- keyword_chars = symbol_chars | operators;
362
- keyword_start = symbol_start | [\#\./];
363
-
364
- keyword_name = keyword_start (keyword_chars)*;
365
-
366
- write data;
339
+ write data noerror;
367
340
 
368
- action exit { fhold; fbreak; }
369
341
 
370
- main := begin_keyword keyword_name (^keyword_chars? @exit);
342
+ main := (
343
+ ('-'|'+')? (integer [MN]?)
344
+ ) (^[0-9MN+\-]? @exit);
371
345
  }%%
372
346
 
373
-
374
- const char* edn::Parser::parse_keyword(const char *p, const char *pe, Rice::Object& o)
347
+ const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Object& o)
375
348
  {
376
349
  int cs;
377
- // std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
378
350
 
379
351
  %% write init;
380
352
  p_save = p;
381
353
  %% write exec;
382
354
 
383
- if (cs >= EDN_keyword_first_final) {
384
- uint32_t len = p - p_save - 1; // don't include leading ':' because Rice::Symbol will handle it
385
- std::string buf;
386
- buf.append(p_save + 1, len);
387
- o = Rice::Symbol(buf);
388
- return p;
389
- }
390
- else if (cs == EDN_keyword_error) {
391
- error(__FUNCTION__, *p);
392
- return pe;
355
+ if (cs >= EDN_integer_first_final) {
356
+ o = Parser::integer_to_ruby(p_save, p - p_save);
357
+ return p + 1;
393
358
  }
394
- else if (cs == EDN_keyword_en_main) {} // silence ragel warning
359
+ else if (cs == EDN_integer_en_main) {} // silence ragel warning
395
360
  return NULL;
396
361
  }
397
362
 
398
363
 
399
364
 
400
365
  // ============================================================
401
- // string parsing
366
+ // operator parsing - handles tokens w/ a leading operator:
367
+ //
368
+ // 1. symbols w/ leading operator: -something, .somethingelse
369
+ // 2. number values w/ leading - or +
370
+ // 3. stand-alone operators: +, -, /, *, etc.
402
371
  //
403
372
  %%{
404
- machine EDN_string;
373
+ machine EDN_operator;
405
374
  include EDN_common;
406
375
 
407
376
  write data;
408
377
 
409
- action parse_string {
410
- if (!Parser::parse_byte_stream(p_save + 1, p, s)) {
378
+ action parse_symbol {
379
+ // parse a symbol including the leading operator (-, +, .)
380
+ std::string sym;
381
+ const char *np = parse_symbol(p_save, pe, sym);
382
+ if (np == NULL) { fhold; fbreak; } else {
383
+ o = Parser::make_edn_symbol(sym);
384
+ fexec np;
385
+ }
386
+ }
387
+
388
+ action parse_number {
389
+ // parse a number with the leading symbol - this is slightly
390
+ // different than the one within EDN_value since it includes
391
+ // the leading - or +
392
+ //
393
+ // try to parse a decimal first
394
+ const char *np = parse_decimal(p_save, pe, o);
395
+ if (np == NULL) {
396
+ // if we can't, try to parse it as an int
397
+ np = parse_integer(p_save, pe, o);
398
+ }
399
+
400
+ if (np) {
401
+ fexec np;
411
402
  fhold;
412
403
  fbreak;
413
- } else {
414
- fexec p + 1;
415
404
  }
405
+ else {
406
+ error(__FUNCTION__, *p);
407
+ fexec pe;
408
+ }
409
+ }
410
+
411
+ action parse_operator {
412
+ // stand-alone operators (-, +, /, ... etc)
413
+ std::string sym;
414
+ sym += *(p_save);
415
+ o = Parser::make_edn_symbol(sym);
416
416
  }
417
417
 
418
- action exit { fhold; fbreak; }
419
418
 
420
- main := string_delim (
421
- (^([\"\\] | 0..0x1f) |
422
- '\\'[\"\\/bfnrt] |
423
- '\\u'[0-9a-fA-F]{4} |
424
- '\\'^([\"\\/bfnrtu]|0..0x1f))* %parse_string
425
- ) :>> string_delim @err(close_err) @exit;
419
+ main := (
420
+ ('-'|'+'|'.') alpha >parse_symbol |
421
+ ('-'|'+') begin_number >parse_number |
422
+ operators ignore* >parse_operator
423
+ ) ^(operators|alpha|digit)? @exit;
426
424
  }%%
427
425
 
428
426
 
429
- const char* edn::Parser::parse_string(const char *p, const char *pe, Rice::Object& o)
427
+ const char* edn::Parser::parse_operator(const char *p, const char *pe, Rice::Object& o)
430
428
  {
431
- // std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
432
- static const char* EDN_TYPE = "string";
433
429
  int cs;
434
- const char *eof = pe;
435
430
 
436
- Rice::String s;
437
431
  %% write init;
438
432
  p_save = p;
439
433
  %% write exec;
440
434
 
441
- if (cs >= EDN_string_first_final) {
442
- o = s;
443
- return p + 1;
435
+ if (cs >= EDN_operator_first_final) {
436
+ return p;
444
437
  }
445
- else if (cs == EDN_string_error) {
438
+ else if (cs == EDN_operator_error) {
439
+ error(__FUNCTION__, *p);
446
440
  return pe;
447
441
  }
448
- else if (cs == EDN_string_en_main) {} // silence ragel warning
442
+ else if (cs == EDN_operator_en_main) {} // silence ragel warning
449
443
  return NULL;
450
444
  }
451
445
 
446
+
447
+
452
448
  // ============================================================
453
- // decimal parsing grammar
449
+ // escaped char parsing - handles \c, \newline, \formfeed, etc.
454
450
  //
455
451
  %%{
456
- machine EDN_decimal;
452
+ machine EDN_escaped_char;
457
453
  include EDN_common;
458
454
 
459
- write data noerror;
455
+ write data;
456
+
457
+ valid_chars = alpha;
460
458
 
461
- action exit { fhold; fbreak; }
462
459
 
463
- main := ('-'|'+')? (
464
- (integer '.' digit* (exp? [M]?)) |
465
- (integer exp)
466
- ) (^[0-9Ee.+\-M]? @exit );
460
+ main := (
461
+ begin_char valid_chars+ ignore*
462
+ ) (^(valid_chars | '\\')? @exit);
467
463
  }%%
468
464
 
469
465
 
470
- const char* edn::Parser::parse_decimal(const char *p, const char *pe, Rice::Object& o)
466
+ const char* edn::Parser::parse_esc_char(const char *p, const char *pe, Rice::Object& o)
471
467
  {
472
468
  int cs;
473
469
 
@@ -475,32 +471,43 @@ const char* edn::Parser::parse_decimal(const char *p, const char *pe, Rice::Obje
475
471
  p_save = p;
476
472
  %% write exec;
477
473
 
478
- if (cs >= EDN_decimal_first_final) {
479
- o = Parser::float_to_ruby(p_save, p - p_save);
480
- return p + 1;
474
+ if (cs >= EDN_escaped_char_first_final) {
475
+ // convert the escaped value to a character
476
+ if (!Parser::parse_escaped_char(p_save + 1, p, o)) {
477
+ return pe;
478
+ }
479
+ return p;
481
480
  }
482
- else if (cs == EDN_decimal_en_main) {} // silence ragel warning
481
+ else if (cs == EDN_escaped_char_error) {
482
+ error(__FUNCTION__, *p);
483
+ return pe;
484
+ }
485
+ else if (cs == EDN_escaped_char_en_main) {} // silence ragel warning
483
486
  return NULL;
484
487
  }
485
488
 
486
489
 
490
+
491
+
487
492
  // ============================================================
488
- // integer parsing grammar
493
+ // symbol parsing - handles identifiers that begin with an alpha
494
+ // character and an optional leading operator (name, -today,
495
+ // .yesterday)
489
496
  //
490
497
  %%{
491
- machine EDN_integer;
498
+ machine EDN_symbol;
492
499
  include EDN_common;
493
500
 
494
- write data noerror;
501
+ write data;
495
502
 
496
- action exit { fhold; fbreak; }
497
503
 
498
504
  main := (
499
- ('-'|'+')? (integer [MN]?)
500
- ) (^[0-9MN+\-]? @exit);
505
+ operators? symbol
506
+ ) ignore* (^(symbol_chars | operators)? @exit);
501
507
  }%%
502
508
 
503
- const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Object& o)
509
+
510
+ const char* edn::Parser::parse_symbol(const char *p, const char *pe, std::string& sym)
504
511
  {
505
512
  int cs;
506
513
 
@@ -508,15 +515,22 @@ const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Obje
508
515
  p_save = p;
509
516
  %% write exec;
510
517
 
511
- if (cs >= EDN_integer_first_final) {
512
- o = Parser::integer_to_ruby(p_save, p - p_save);
513
- return p + 1;
518
+ if (cs >= EDN_symbol_first_final) {
519
+ // copy the symbol text
520
+ sym.clear();
521
+ sym.append(p_save, p - p_save);
522
+ return p;
514
523
  }
515
- else if (cs == EDN_integer_en_main) {} // silence ragel warning
524
+ else if (cs == EDN_symbol_error) {
525
+ error(__FUNCTION__, *p);
526
+ return pe;
527
+ }
528
+ else if (cs == EDN_symbol_en_main) {} // silence ragel warning
516
529
  return NULL;
517
530
  }
518
531
 
519
532
 
533
+
520
534
  // ============================================================
521
535
  // EDN_sequence_common is used to parse EDN containers - elements are
522
536
  // initially stored in a rice array and then the final corresponding
@@ -527,30 +541,34 @@ const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Obje
527
541
  machine EDN_sequence_common;
528
542
  include EDN_common;
529
543
 
530
- action parse_value {
531
- Rice::Object v;
532
- const char *np = parse_value(fpc, pe, v);
544
+ action parse_item {
545
+ // reads an item within a sequence (vector, list, map, or
546
+ // set). Regardless of the sequence type, an array of the
547
+ // items is built. Once done, the sequence parser will convert
548
+ // if needed
549
+ Rice::Object e;
550
+ const char *np = parse_value(fpc, pe, e);
533
551
  if (np == NULL) {
534
552
  fhold; fbreak;
535
553
  } else {
536
554
  // if there's an entry in the discard list, the current
537
- // object is not meant to be kept
555
+ // object is not meant to be kept due to a #_ so don't
556
+ // push it into the list of elements
538
557
  if (!discard.empty()) {
539
558
  discard.pop();
540
559
  }
541
560
  else {
542
- // otherwise we add it to the sequence
543
- arr.push(v);
561
+ // otherwise we add it to the list of elements for the
562
+ // corresponding container
563
+ elems.push(e);
544
564
  }
545
565
  fexec np;
546
566
  }
547
567
  }
548
568
 
549
- element = begin_value >parse_value;
569
+ element = begin_value >parse_item;
550
570
  next_element = ignore* element;
551
571
  sequence = ((element ignore*) (next_element ignore*)*);
552
-
553
- action exit { fhold; fbreak; }
554
572
  }%%
555
573
 
556
574
  //
@@ -575,17 +593,16 @@ const char* edn::Parser::parse_integer(const char *p, const char *pe, Rice::Obje
575
593
  //
576
594
  const char* edn::Parser::parse_vector(const char *p, const char *pe, Rice::Object& o)
577
595
  {
578
- //std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
579
596
  static const char* EDN_TYPE = "vector";
580
597
 
581
598
  int cs;
582
- Rice::Array arr;
599
+ Rice::Array elems; // will store the vector's elements
583
600
 
584
601
  %% write init;
585
602
  %% write exec;
586
603
 
587
604
  if (cs >= EDN_vector_first_final) {
588
- o = arr;
605
+ o = elems;
589
606
  return p + 1;
590
607
  }
591
608
  else if (cs == EDN_vector_error) {
@@ -622,13 +639,13 @@ const char* edn::Parser::parse_list(const char *p, const char *pe, Rice::Object&
622
639
  static const char* EDN_TYPE = "list";
623
640
 
624
641
  int cs;
625
- Rice::Array arr;
642
+ Rice::Array elems;
626
643
 
627
644
  %% write init;
628
645
  %% write exec;
629
646
 
630
647
  if (cs >= EDN_list_first_final) {
631
- o = arr;
648
+ o = elems;
632
649
  return p + 1;
633
650
  }
634
651
  else if (cs == EDN_list_error) {
@@ -642,161 +659,171 @@ const char* edn::Parser::parse_list(const char *p, const char *pe, Rice::Object&
642
659
 
643
660
 
644
661
  // ============================================================
645
- // set parsing machine
662
+ // hash parsing
646
663
  //
647
664
  %%{
648
- machine EDN_set;
665
+ machine EDN_map;
649
666
  include EDN_sequence_common;
650
667
 
668
+ end_map = '}';
669
+
651
670
  write data;
652
671
 
653
- begin_set = '{';
654
- end_set = '}';
655
672
 
656
- main := begin_set (
657
- ignore* sequence? :>> end_set
673
+ main := begin_map (
674
+ ignore* (sequence)? :>> end_map
658
675
  ) @err(close_err) @exit;
659
676
  }%%
660
677
 
661
- //
662
- // set parsing
663
- //
664
- const char* edn::Parser::parse_set(const char *p, const char *pe, Rice::Object& o)
678
+
679
+ const char* edn::Parser::parse_map(const char *p, const char *pe, Rice::Object& o)
665
680
  {
666
- // std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
667
- static const char* EDN_TYPE = "set";
681
+ static const char* EDN_TYPE = "map";
668
682
 
669
683
  int cs;
670
- Rice::Array arr; // store as a vector; then convert to a set once done
684
+ // since we don't know whether we're looking at a key or value,
685
+ // initially store all elements in a list
686
+ Rice::Array elems;
671
687
 
672
688
  %% write init;
673
689
  %% write exec;
674
690
 
675
- if (cs >= EDN_set_first_final) {
676
- o = Parser::make_ruby_set(arr);
691
+ if (cs >= EDN_map_first_final) {
692
+
693
+ // hash parsing is done. Make sure we have an even count
694
+ if ((elems.size() % 2) != 0) {
695
+ error(__FUNCTION__, "odd number of elements in map");
696
+ return pe;
697
+ }
698
+
699
+ // now convert the sequence to a hash
700
+ Rice::Hash rslt;
701
+ while (elems.size())
702
+ {
703
+ Rice::Object k = elems.shift();
704
+ rslt[k] = elems.shift();
705
+ }
706
+
707
+ o = rslt;
677
708
  return p + 1;
678
709
  }
679
- else if (cs == EDN_set_error) {
680
- error(__FUNCTION__, *p);
710
+ else if (cs == EDN_map_error) {
681
711
  return pe;
682
712
  }
683
- else if (cs == EDN_set_en_main) {} // silence ragel warning
713
+ else if (cs == EDN_map_en_main) {} // silence ragel warning
684
714
  return NULL;
685
715
  }
686
716
 
687
717
 
688
718
 
689
719
  // ============================================================
690
- // hash parsing
720
+ // dispatch - handles all tokens with a leading #, then delegates to
721
+ // the corresponding machine. This machine consumes the # and passes
722
+ // the remaining data to the correct parser
691
723
  //
692
724
  %%{
693
- machine EDN_map;
694
- include EDN_sequence_common;
695
-
696
- end_map = '}';
725
+ machine EDN_dispatch;
726
+ include EDN_common;
697
727
 
698
728
  write data;
699
729
 
700
- # action to report missing value in k/v pair
701
- action pair_err {
702
- error(__FUNCTION__, "map pair not found");
703
- fexec pe;
730
+ action parse_set {
731
+ // #{ }
732
+ const char *np = parse_set(fpc, pe, o);
733
+ if (np == NULL) { fhold; fbreak; } else fexec np;
734
+ }
735
+
736
+ action parse_discard {
737
+ // discard token #_
738
+ const char *np = parse_discard(fpc, pe);
739
+ if (np == NULL) { fhold; fbreak; } else fexec np;
704
740
  }
705
741
 
706
- main := begin_map (
707
- ignore* (sequence)? :>> end_map
708
- ) @err(close_err) @exit;
742
+ action parse_tagged {
743
+ // #inst, #uuid, or #user/tag
744
+ const char *np = parse_tagged(fpc, pe, o);
745
+ if (np == NULL) { fhold; fbreak; } else fexec np;
746
+ }
747
+
748
+
749
+ main := (
750
+ ('{' >parse_set |
751
+ '_' >parse_discard |
752
+ alpha >parse_tagged)
753
+ ) @exit;
709
754
  }%%
710
755
 
711
756
 
712
- const char* edn::Parser::parse_map(const char *p, const char *pe, Rice::Object& o)
757
+ const char* edn::Parser::parse_dispatch(const char *p, const char *pe, Rice::Object& o)
713
758
  {
714
- // std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
715
- static const char* EDN_TYPE = "map";
716
- Rice::Array arr;
717
759
  int cs;
718
760
 
719
761
  %% write init;
720
762
  %% write exec;
721
763
 
722
- if (cs >= EDN_map_first_final) {
723
-
724
- if ((arr.size() % 2) != 0) {
725
- error(__FUNCTION__, "odd number of elements in map");
726
- return pe;
727
- }
728
-
729
- Rice::Hash map;
730
- while (arr.size())
731
- {
732
- Rice::Object k = arr.shift();
733
- map[k] = arr.shift();
734
- }
735
-
736
- o = map;
764
+ if (cs >= EDN_dispatch_first_final) {
737
765
  return p + 1;
738
766
  }
739
- else if (cs == EDN_map_error) {
767
+ else if (cs == EDN_dispatch_error) {
768
+ error(__FUNCTION__, *p);
740
769
  return pe;
741
770
  }
742
- else if (cs == EDN_map_en_main) {} // silence ragel warning
771
+ else if (cs == EDN_dispatch_en_main) {} // silence ragel warning
772
+
743
773
  return NULL;
744
774
  }
745
775
 
746
776
 
747
777
  // ============================================================
748
- // tagged element parsing - #uuid, #inst, #{, #_
778
+ // set parsing machine
749
779
  //
750
780
  %%{
751
- machine EDN_tagged;
752
- include EDN_common;
753
-
754
- # inst = (string_delim [0-9+\-:\.TZ]* string_delim);
755
- # uuid = (string_delim [a-f0-9\-]* string_delim);
781
+ machine EDN_set;
782
+ include EDN_sequence_common;
756
783
 
757
784
  write data;
758
785
 
759
- action parse_symbol {
760
- const char *np = parse_symbol(fpc, pe, sym_name);
761
- if (np == NULL) { fhold; fbreak; } else { fexec np; }
762
- }
763
- action parse_value {
764
- const char *np = parse_value(fpc, pe, object);
765
- if (np == NULL) { fhold; fbreak; } else { fexec np; }
766
- }
767
-
768
- action exit { fhold; fbreak; }
786
+ begin_set = '{';
787
+ end_set = '}';
769
788
 
770
- main := (symbol >parse_symbol ignore* begin_value >parse_value) @exit;
789
+ main := begin_set (
790
+ ignore* sequence? :>> end_set
791
+ ) @err(close_err) @exit;
771
792
  }%%
772
793
 
773
-
774
- const char* edn::Parser::parse_tagged(const char *p, const char *pe, Rice::Object& o)
794
+ //
795
+ // set parsing
796
+ //
797
+ const char* edn::Parser::parse_set(const char *p, const char *pe, Rice::Object& o)
775
798
  {
776
- // std::cerr << __FUNCTION__ << " p '" << p << "'" << std::endl;
777
- std::string sym_name;
778
- Rice::Object object;
799
+ static const char* EDN_TYPE = "set";
779
800
 
780
801
  int cs;
802
+ Rice::Array elems; // stored as a vector
781
803
 
782
804
  %% write init;
783
805
  %% write exec;
784
806
 
785
- if (cs >= EDN_tagged_first_final) {
786
- //std::cerr << __FUNCTION__ << " parse symbol name as '" << sym_name << "', value is: " << object << std::endl;
787
- o = Parser::tagged_element(sym_name, object);
807
+ if (cs >= EDN_set_first_final) {
808
+ // all elements collected; now convert to a set
809
+ o = Parser::make_ruby_set(elems);
788
810
  return p + 1;
789
811
  }
790
- else if (cs == EDN_tagged_error) {
812
+ else if (cs == EDN_set_error) {
813
+ error(__FUNCTION__, *p);
791
814
  return pe;
792
815
  }
793
- else if (cs == EDN_tagged_en_main) {} // silence ragel warning
816
+ else if (cs == EDN_set_en_main) {} // silence ragel warning
794
817
  return NULL;
795
818
  }
796
819
 
797
820
 
821
+
798
822
  // ============================================================
799
- // discard
823
+ // discard - consume the discard token and parse the next value to
824
+ // discard. TODO: perhaps optimize this so no object data is built
825
+ // by defining a new machine(s) to consume items within container
826
+ // delimiters
800
827
  //
801
828
  %%{
802
829
  machine EDN_discard;
@@ -808,12 +835,17 @@ const char* edn::Parser::parse_tagged(const char *p, const char *pe, Rice::Objec
808
835
 
809
836
  action discard_value {
810
837
  const char *np = parse_value(fpc, pe, o);
811
- if (np == NULL) { fhold; fbreak; } else { discard.push(o); fexec np; }
838
+ if (np) {
839
+ // this token is to be discarded, so store it in the
840
+ // discard stack - we really don't need to save it so this
841
+ // could be simplified
842
+ discard.push(o);
843
+ fexec np;
844
+ } else {
845
+ fhold; fbreak;
846
+ }
812
847
  }
813
848
 
814
- action exit {
815
- fhold; fbreak;
816
- }
817
849
 
818
850
  main := begin_discard ignore* (
819
851
  begin_value >discard_value
@@ -823,7 +855,6 @@ const char* edn::Parser::parse_tagged(const char *p, const char *pe, Rice::Objec
823
855
 
824
856
  const char* edn::Parser::parse_discard(const char *p, const char *pe)
825
857
  {
826
- //std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
827
858
  int cs;
828
859
  Rice::Object o;
829
860
 
@@ -845,59 +876,77 @@ const char* edn::Parser::parse_discard(const char *p, const char *pe)
845
876
 
846
877
 
847
878
  // ============================================================
848
- // dispatch
879
+ // tagged element parsing - #uuid, #inst, #user/tag (#{ is routed to the set parser by dispatch)
880
+ //
881
+ // Current implementation expects a symbol followed by a value to
882
+ // match it against and does not check validity of uuid or rfc3339
883
+ // date characters.
884
+ //
885
+ // TODO:
886
+ // 1. need to check if we must support discard shenanigans such as
887
+ //
888
+ // #symbol #_ discard data
889
+ //
890
+ // 2. add parse checks for uuid and inst for better error reporting
849
891
  //
850
892
  %%{
851
- machine EDN_dispatch;
893
+ machine EDN_tagged;
852
894
  include EDN_common;
853
895
 
854
- write data;
896
+ # inst = (string_delim [0-9+\-:\.TZ]* string_delim);
897
+ # uuid = (string_delim [a-f0-9\-]* string_delim);
855
898
 
856
- action parse_discard {
857
- const char *np = parse_discard(fpc, pe);
858
- if (np == NULL) { fhold; fbreak; } else fexec np;
859
- }
899
+ write data;
860
900
 
861
- action parse_set {
862
- const char *np = parse_set(fpc, pe, o);
863
- if (np == NULL) { fhold; fbreak; } else fexec np;
901
+ action parse_symbol {
902
+ // parses the symbol portion of the pair
903
+ const char *np = parse_symbol(fpc, pe, sym_name);
904
+ if (np == NULL) { fhold; fbreak; } else { fexec np; }
864
905
  }
865
-
866
- action parse_tagged {
867
- const char *np = parse_tagged(fpc, pe, o);
868
- if (np == NULL) { fhold; fbreak; } else fexec np;
906
+ action parse_value {
907
+ // parses the value portion
908
+ const char *np = parse_value(fpc, pe, data);
909
+ if (np == NULL) { fhold; fbreak; } else { fexec np; }
869
910
  }
870
911
 
871
- action exit { fhold; fbreak; }
872
912
 
873
- main := (
874
- ('_' >parse_discard) |
875
- '{' >parse_set |
876
- alpha >parse_tagged
877
- ) @exit;
913
+ main := (symbol >parse_symbol ignore* begin_value >parse_value) @exit;
878
914
  }%%
879
915
 
880
916
 
881
- const char* edn::Parser::parse_dispatch(const char *p, const char *pe, Rice::Object& o)
917
+ const char* edn::Parser::parse_tagged(const char *p, const char *pe, Rice::Object& o)
882
918
  {
883
- // std::cerr << __FUNCTION__ << " - p: '" << p << "'" << std::endl;
919
+ std::string sym_name;
920
+ Rice::Object data;
921
+
884
922
  int cs;
885
923
 
886
924
  %% write init;
887
925
  %% write exec;
888
926
 
889
- if (cs >= EDN_dispatch_first_final) {
927
+ if (cs >= EDN_tagged_first_final) {
928
+ //std::cerr << __FUNCTION__ << " parse symbol name as '" << sym_name << "', value is: " << data << std::endl;
929
+
930
+ try {
931
+ // tagged_element makes a call to ruby which may throw an
932
+ // exception when parsing the data
933
+ o = Parser::tagged_element(sym_name, data);
934
+ } catch (Rice::Exception& e) {
935
+ error(__FUNCTION__, e.message().str());
936
+ return pe;
937
+ }
890
938
  return p + 1;
891
939
  }
892
- else if (cs == EDN_dispatch_error) {
893
- error(__FUNCTION__, *p);
940
+ else if (cs == EDN_tagged_error) {
894
941
  return pe;
895
942
  }
896
- else if (cs == EDN_dispatch_en_main) {} // silence ragel warning
897
-
943
+ else if (cs == EDN_tagged_en_main) {} // silence ragel warning
898
944
  return NULL;
899
945
  }
900
946
 
947
+
948
+
949
+
901
950
  // ============================================================
902
951
  // main parsing machine
903
952
  //
@@ -916,13 +965,12 @@ const char* edn::Parser::parse_dispatch(const char *p, const char *pe, Rice::Obj
916
965
  next_element = ignore* element;
917
966
  sequence = ((element ignore*) (next_element ignore*)*);
918
967
 
919
- # TODO: check this. Using a sequence to handle cases with a discard
920
968
  main := ignore* sequence? ignore*;
921
969
  }%%
922
970
 
923
971
  //
924
- //
925
- //
972
+ // TODO: Currently using a sequence to handle cases with a discard
973
+ // but EDN's Reader allows token by token parsing
926
974
  Rice::Object edn::Parser::parse(const char* buf, std::size_t len)
927
975
  {
928
976
  int cs;