edn_turbo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,725 @@
1
+ #include <iostream>
2
+ #include <string>
3
+
4
+ #include <ruby/ruby.h>
5
+ #include <ruby/encoding.h>
6
+
7
+ #include <rice/Hash.hpp>
8
+ #include <rice/Array.hpp>
9
+ #include <rice/to_from_ruby.hpp>
10
+
11
+ #include "edn_parser.h"
12
+
13
+ // Ragel grammar for the EDN parser: each Ragel section in this file defines one state machine.
14
+ // EDN spec at: https://github.com/edn-format/edn
15
+ // EDN_common holds the character classes, keywords and the close_err action that other machines pull in via include EDN_common.
16
+
17
+ %%{
18
+ machine EDN_common;
19
+
20
+ cr = '\n';
21
+ counter = ( cr @{ line_number++; } );
22
+ cr_neg = [^\n];
23
+ ws = [\t\v\f\r ] | ',' | counter;
24
+ comment = ';' cr_neg* counter;
25
+ ignore = ws | comment;
26
+ k_nil = 'nil';
27
+ k_true = 'true';
28
+ k_false = 'false';
29
+ begin_keyword = ':';
30
+ begin_value = digit | [:nft\"\-\{\[\(\\];
31
+ begin_dispatch = '#';
32
+ begin_vector = '[';
33
+ end_vector = ']';
34
+ begin_map = '{';
35
+ end_map = '}';
36
+ begin_list = '(';
37
+ end_list = ')';
38
+ string_delim = '"';
39
+ begin_number = digit | '-';
40
+
41
+ action close_err {
42
+ std::stringstream s;
43
+ s << "unterminated " << EDN_TYPE; // EDN_TYPE is a local string declared by each function whose machine uses this action
44
+ error(s.str());
45
+ fexec pe; // continue from the end of the buffer so no further input is consumed
46
+ }
47
+ }%%
48
+
49
+ // ============================================================
50
+ // machine for parsing a single EDN value: nil, false, true, string, keyword, number, vector, list or map
51
+ // Everything except nil, false and true is handed to the matching EDN_parse_xxx member function on the first character of the value.
52
+
53
+ %%{
54
+ machine EDN_value;
55
+ include EDN_common;
56
+
57
+ write data;
58
+
59
+ action parse_nil {
60
+ o = Qnil;
61
+ }
62
+ action parse_false {
63
+ o = Qfalse;
64
+ }
65
+ action parse_true {
66
+ o = Qtrue;
67
+ }
68
+
69
+ action parse_keyword {
70
+ const char *np = EDN_parse_keyword(fpc, pe, o);
71
+ if (np == NULL) { fhold; fbreak; } else fexec np; // NULL: stop this machine here; otherwise resume scanning at the position the sub-parser returned
72
+ }
73
+
74
+ action parse_string {
75
+ const char *np = EDN_parse_string(fpc, pe, o);
76
+ if (np == NULL) { fhold; fbreak; } else fexec np;
77
+ }
78
+
79
+ action parse_number {
80
+ // try to parse a decimal first
81
+ const char *np = EDN_parse_decimal(fpc, pe, o);
82
+ if (np == NULL) {
83
+ // if we can't, try to parse it as an int
84
+ np = EDN_parse_integer(fpc, pe, o);
85
+ }
86
+
87
+ if (np) { // a number was parsed: reposition at np and leave the machine
88
+ fexec np;
89
+ fhold;
90
+ fbreak;
91
+ }
92
+ else {
93
+ error(*p); // neither the decimal nor the integer parser accepted the text
94
+ fexec pe;
95
+ }
96
+ }
97
+
98
+ action parse_vector {
99
+ const char *np = EDN_parse_vector(fpc, pe, o);
100
+ if (np == NULL) { fhold; fbreak; } else fexec np;
101
+ }
102
+
103
+ action parse_list {
104
+ const char *np = EDN_parse_list(fpc, pe, o);
105
+ if (np == NULL) { fhold; fbreak; } else fexec np;
106
+ }
107
+
108
+ action parse_map {
109
+ const char *np = EDN_parse_map(fpc, pe, o);
110
+ if (np == NULL) { fhold; fbreak; } else fexec np;
111
+ }
112
+
113
+ action exit { fhold; fbreak; }
114
+
115
+ main := (
116
+ k_nil @parse_nil |
117
+ k_false @parse_false |
118
+ k_true @parse_true |
119
+ string_delim >parse_string |
120
+ begin_keyword >parse_keyword |
121
+ begin_number >parse_number |
122
+ begin_vector >parse_vector |
123
+ begin_list >parse_list |
124
+ begin_map >parse_map
125
+ ) %*exit;
126
+ }%%
127
+ // Runs the EDN_value machine on the text starting at p (bounded by pe) and stores the parsed value in o.
128
+ // Returns p when the machine ends in a final state; calls error(*p) and returns pe in the error state; returns NULL otherwise.
129
+ const char *edn::Parser::EDN_parse_value(const char *p, const char *pe, Rice::Object& o)
130
+ {
131
+ int cs; // current state of the generated machine
132
+
133
+ %% write init;
134
+ %% write exec;
135
+
136
+ if (cs >= EDN_value_first_final) {
137
+ return p;
138
+ }
139
+ else if (cs == EDN_value_error) {
140
+ error(*p);
141
+ return pe;
142
+ }
143
+ else if (cs == EDN_value_en_main) {} // silence ragel warning
144
+ return NULL;
145
+ }
146
+
147
+
148
+ // ============================================================
149
+ // tagged element parsing - any of #uuid, #inst, #{, #(some symbol)
150
+ // discard (#_ <ident>) is handled by the top-level machine
151
+ // NOTE(review): main below only matches the #_, #inst and #uuid forms; begin_set and end_set are defined but never referenced - confirm intent
152
+ %%{
153
+ machine EDN_dispatch;
154
+ include EDN_common;
155
+
156
+ begin_discard = '_';
157
+ begin_set = '{';
158
+ end_set = '}';
159
+
160
+ write data;
161
+
162
+ action exit { fhold; fbreak; }
163
+
164
+ main := begin_dispatch (
165
+ (begin_discard (space)? ([a-zA-Z0-9\-\.]*)) |
166
+ ('inst ' string_delim ([0-9\-+:\.TZ])* string_delim) |
167
+ ('uuid ' string_delim ([a-f0-9\-]* string_delim))
168
+ )
169
+ (^[a-zA-Z0-9:\.\-+ ]* @exit);
170
+ }%%
171
+ // Runs the EDN_dispatch machine on text starting at '#'. For a discard (#_) it sets discard to true and returns p + 1 without touching o.
172
+ // For #inst / #uuid it stores the quoted text in o as a Rice::String and returns p (an empty string when nothing is copied). Error state: error(*p), returns pe; otherwise NULL.
173
+ const char* edn::Parser::EDN_parse_tagged(const char *p, const char *pe, Rice::Object& o, bool& discard)
174
+ {
175
+ int cs;
176
+ Rice::String str; // NOTE(review): never used in this function
177
+
178
+ %% write init;
179
+ p_save = p; // remember where the tag starts; p_save is not declared in this function (presumably a Parser member)
180
+ %% write exec;
181
+
182
+ if (cs >= EDN_dispatch_first_final) {
183
+
184
+ //is it a discard? if so, just drop the following token
185
+ if (*(p_save + 1) == '_')
186
+ {
187
+ discard = true;
188
+ return p + 1;
189
+ }
190
+
191
+ std::size_t len = p - p_save;
192
+ std::string buf;
193
+ buf.reserve(len);
194
+
195
+ if (len > 10)
196
+ {
197
+ // there's enough room to be #inst or #uuid, copy the
198
+ // string portion
199
+ if (std::strncmp(p_save + 1, "inst", 4) == 0) {
200
+ buf.append(p_save + 7, len - 8); // skip the 7 characters up to and including the opening quote; len - 8 also leaves out one trailing character (presumably the closing quote)
201
+ } else if (std::strncmp(p_save + 1, "uuid", 4) == 0) {
202
+ buf.append(p_save + 7, len - 8); // same offsets as the inst branch
203
+ }
204
+
205
+ o = Rice::String(buf);
206
+ return p;
207
+ }
208
+
209
+ // tagged element
210
+ o = Rice::String(buf); // buf is still empty on this path
211
+ return p;
212
+ }
213
+ else if (cs == EDN_dispatch_error) {
214
+ error(*p);
215
+ return pe;
216
+ }
217
+ else if (cs == EDN_dispatch_en_main) {} // silence ragel warning
218
+ return NULL;
219
+ }
220
+
221
+
222
+
223
+ // ============================================================
224
+ // keyword parsing: a colon followed by a name, optionally followed by a slash and a second name
225
+ // Each name starts with a letter or underscore and continues with letters, digits, underscores or hyphens.
226
+ %%{
227
+ machine EDN_keyword;
228
+ include EDN_common;
229
+
230
+ write data;
231
+
232
+ action exit { fhold; fbreak; }
233
+
234
+ main := begin_keyword
235
+ ([a-zA-Z_][a-zA-Z_0-9\-]* ('/' [a-zA-Z_][a-zA-Z_0-9\-]*)?)
236
+ (^[a-zA-Z_0-9\-'/']? @exit);
237
+ }%%
238
+ // Runs the EDN_keyword machine on text starting at the colon and stores the keyword in o as a Rice::Symbol built from the text after the colon.
239
+ // Returns p in a final state; calls error(*p) and returns pe in the error state; returns NULL otherwise.
240
+ const char* edn::Parser::EDN_parse_keyword(const char *p, const char *pe, Rice::Object& o)
241
+ {
242
+ int cs;
243
+
244
+ %% write init;
245
+ p_save = p; // start of the keyword, used below to measure the matched text
246
+ %% write exec;
247
+
248
+ if (cs >= EDN_keyword_first_final) {
249
+ uint32_t len = p - p_save - 1; // don't include leading ':' because Rice::Symbol will handle it
250
+ std::string buf;
251
+ buf.append(p_save + 1, len);
252
+ o = Rice::Symbol(buf);
253
+ return p;
254
+ }
255
+ else if (cs == EDN_keyword_error) {
256
+ error(*p);
257
+ return pe;
258
+ }
259
+ else if (cs == EDN_keyword_en_main) {} // silence ragel warning
260
+ return NULL;
261
+ }
262
+
263
+
264
+
265
+ // ============================================================
266
+ // string parsing: a double-quoted run of plain characters and backslash escapes
267
+ // On leaving the body, parse_string passes the text between the quotes to EDN_parse_byte_stream, which fills s.
268
+ %%{
269
+ machine EDN_string;
270
+ include EDN_common;
271
+
272
+ write data;
273
+
274
+ action parse_string {
275
+ if (!EDN_parse_byte_stream(p_save + 1, p, s)) { // p_save + 1 skips the opening quote; p is the current scan position
276
+ fhold;
277
+ fbreak;
278
+ } else {
279
+ fexec p + 1;
280
+ }
281
+ }
282
+
283
+ action exit { fhold; fbreak; }
284
+
285
+ main := string_delim (
286
+ (^([\"\\] | 0..0x1f) |
287
+ '\\'[\"\\/bfnrt] |
288
+ '\\u'[0-9a-fA-F]{4} |
289
+ '\\'^([\"\\/bfnrtu]|0..0x1f))* %parse_string)
290
+ string_delim @err(close_err)
291
+ @exit;
292
+ }%%
293
+
294
+
295
+ // Copies the bytes between p and pe into s as a Ruby string tagged as UTF-8, replacing the escape sequences for tab, newline, carriage return, double quote and backslash.
296
+ // Any other escaped character is reported on std::cerr and left out of the result; a backslash that is the last byte is ignored.
297
+ // Returns true after storing the string, or false without touching s when the range is empty (pe <= p).
298
+ bool edn::Parser::EDN_parse_byte_stream(const char *p, const char *pe, Rice::String& s)
299
+ {
300
+ if (pe > p) {
301
+ std::string buf;
302
+ std::size_t len = pe - p;
303
+
304
+ // pre-allocate storage needed
305
+ buf.reserve(len);
306
+
307
+ const char* cp = p;
308
+ std::size_t pos = 0; // write index; it always equals buf.size(), so each replace() below appends one character
309
+ char c, replacement;
310
+
311
+ while (cp < pe)
312
+ {
313
+ // append any other character that is not the escaping slash
314
+ if (*cp != '\\') {
315
+ buf.replace(pos++, 1, 1, *cp++);
316
+ continue;
317
+ }
318
+
319
+ // looking at a '\' - check what it escapes if there's a
320
+ // following character
321
+ if (++cp == pe)
322
+ break;
323
+
324
+ c = *cp++;
325
+ replacement = '?'; // sentinel meaning no replacement was chosen
326
+
327
+ switch (c)
328
+ {
329
+ case 't':
330
+ replacement = '\t';
331
+ break;
332
+ case 'n':
333
+ replacement = '\n';
334
+ break;
335
+ case 'r':
336
+ replacement = '\r';
337
+ break;
338
+ case '\"':
339
+ replacement = '\"';
340
+ break;
341
+ case '\\':
342
+ replacement = '\\';
343
+ break;
344
+ /* TODO: add support for this!
345
+ case 'u':
346
+ replacement = '\u';
347
+ break;
348
+ */
349
+ default:
350
+ std::cerr << "value must be unescaped but case is unhandled: '" << c << "'" << std::endl;
351
+ break;
352
+ }
353
+
354
+ // substitute the escaped value
355
+ if (replacement != '?')
356
+ buf.replace(pos++, 1, 1, replacement);
357
+ }
358
+
359
+ // build the Ruby string and associate the UTF-8 encoding with it
360
+ VALUE vs = Rice::protect( rb_str_new2, buf.c_str() );
361
+ VALUE s_utf8 = Rice::protect( rb_enc_associate, vs, rb_utf8_encoding() );
362
+ s = Rice::String(s_utf8);
363
+ return true;
364
+ }
365
+
366
+ return false;
367
+ }
368
+ // Runs the EDN_string machine on text starting at the opening quote and stores the resulting Rice::String in o.
369
+ // Returns p + 1 in a final state, pe in the error state (this function itself makes no error() call), and NULL otherwise.
370
+ const char* edn::Parser::EDN_parse_string(const char *p, const char *pe, Rice::Object& o)
371
+ {
372
+ static const char* EDN_TYPE = "string"; // read by the close_err action
373
+ int cs;
374
+ const char *eof = pe; // NOTE(review): local eof; parse() assigns an eof it says is defined in the Parser class - presumably this one is for the generated code
375
+
376
+ Rice::String s; // filled by the parse_string action through EDN_parse_byte_stream
377
+ %% write init;
378
+ p_save = p;
379
+ %% write exec;
380
+
381
+ if (cs >= EDN_string_first_final) {
382
+ o = s;
383
+ return p + 1;
384
+ }
385
+ else if (cs == EDN_string_error) {
386
+ return pe;
387
+ }
388
+ else if (cs == EDN_string_en_main) {} // silence ragel warning
389
+ return NULL;
390
+ }
391
+
392
+ // ============================================================
393
+ // decimal parsing grammar: optional minus sign, an integer part, then either a fraction (with an optional exponent or M suffix) or an exponent alone
394
+ // write data noerror: no error-state constant is emitted, so EDN_parse_decimal only tests for a final state.
395
+ %%{
396
+ machine EDN_decimal;
397
+ include EDN_common;
398
+
399
+ write data noerror;
400
+
401
+ action exit { fhold; fbreak; }
402
+
403
+ main := '-'? (
404
+ (('0' |
405
+ [1-9][0-9]*) '.' [0-9]+ ((([Ee] [+\-]?[0-9]+)?) | ([M]?))
406
+ ) |
407
+ (('0' | [1-9][0-9]*) ([Ee] [+\-]?[0-9]+))
408
+ )
409
+ (^[0-9Ee.\-M]? @exit );
410
+ }%%
411
+ // Runs the EDN_decimal machine; in a final state converts the matched text with Parser::buftotype<double>, stores the result in o and returns p + 1.
412
+ // Returns NULL when the machine does not reach a final state (the parse_number action then tries EDN_parse_integer).
413
+ const char* edn::Parser::EDN_parse_decimal(const char *p, const char *pe, Rice::Object& o)
414
+ {
415
+ int cs;
416
+
417
+ %% write init;
418
+ p_save = p; // start of the number text handed to buftotype below
419
+ %% write exec;
420
+
421
+ if (cs >= EDN_decimal_first_final) {
422
+ double value;
423
+ o = Parser::buftotype<double>(p_save, p - p_save, value);
424
+ return p + 1;
425
+ }
426
+ else if (cs == EDN_decimal_en_main) {} // silence ragel warning
427
+ return NULL;
428
+ }
429
+
430
+
431
+ // ============================================================
432
+ // integer parsing grammar: optional minus sign, then 0 or a run of digits that does not start with 0, with an optional M suffix
433
+ // This machine does not include EDN_common; it uses none of its definitions.
434
+ %%{
435
+ machine EDN_integer;
436
+
437
+ write data noerror;
438
+
439
+ action exit { fhold; fbreak; }
440
+
441
+ main := '-'? ('0' | [1-9][0-9]* [M]?) (^[0-9M]? @exit);
442
+ }%%
443
+ // Runs the EDN_integer machine; in a final state converts the matched text with Parser::buftotype<int>, stores the result in o and returns p + 1. Returns NULL otherwise.
444
+ const char* edn::Parser::EDN_parse_integer(const char *p, const char *pe, Rice::Object& o)
445
+ {
446
+ int cs;
447
+
448
+ %% write init;
449
+ p_save = p;
450
+ %% write exec;
451
+
452
+ if (cs >= EDN_integer_first_final) {
453
+ int value; // NOTE(review): how values outside the int range are handled depends on buftotype, which is not visible here
454
+ o = Parser::buftotype<int>(p_save, p - p_save, value);
455
+ return p + 1;
456
+ }
457
+ else if (cs == EDN_integer_en_main) {} // silence ragel warning
458
+ return NULL;
459
+ }
460
+
461
+
462
+ // ============================================================
463
+ // vector parsing machine. EDN_vector_common is used to parse EDN
464
+ // vectors and lists since they're both represented as vectors in ruby
465
+ // Both actions append to a local named arr that the function embedding the machine declares.
466
+ %%{
467
+ machine EDN_vector_common;
468
+ include EDN_common;
469
+
470
+ action parse_value {
471
+ Rice::Object v;
472
+ const char *np = EDN_parse_value(fpc, pe, v);
473
+ if (np == NULL) {
474
+ fhold; fbreak;
475
+ } else {
476
+ arr.push(v);
477
+ fexec np; // resume scanning where the value parser stopped
478
+ }
479
+ }
480
+
481
+ action parse_dispatch {
482
+ bool discard = false;
483
+ Rice::Object v;
484
+ const char *np = EDN_parse_tagged(fpc, pe, v, discard);
485
+ if (np == NULL) {
486
+ fhold; fbreak;
487
+ } else {
488
+ if (!discard) { // elements flagged as discarded by EDN_parse_tagged are not added
489
+ arr.push(v);
490
+ }
491
+ fexec np;
492
+ }
493
+ }
494
+
495
+ action exit { fhold; fbreak; }
496
+
497
+ element = (
498
+ begin_value >parse_value |
499
+ begin_dispatch >parse_dispatch
500
+ );
501
+
502
+ next_element = ignore* element;
503
+ }%%
504
+
505
+ //
506
+ // vector-specific machine: opening bracket, optional elements separated by ignorable text, closing bracket (close_err is attached as an error action on the closing bracket)
507
+ %%{
508
+ machine EDN_vector;
509
+ include EDN_vector_common;
510
+
511
+ write data;
512
+
513
+ main := begin_vector ignore* ((element ignore*)
514
+ (ignore* next_element ignore*)*)?
515
+ end_vector @err(close_err)
516
+ @exit;
517
+ }%%
518
+
519
+
520
+ //
521
+ // vector parsing: runs the EDN_vector machine on text starting at the opening bracket and stores the collected elements in o as a Rice::Array
522
+ // Returns p + 1 in a final state; calls error(*p) and returns pe in the error state; returns NULL otherwise.
523
+ const char* edn::Parser::EDN_parse_vector(const char *p, const char *pe, Rice::Object& o)
524
+ {
525
+ static const char* EDN_TYPE = "vector"; // read by the close_err action
526
+
527
+ int cs;
528
+ Rice::Array arr; // filled by the parse_value / parse_dispatch actions
529
+
530
+ %% write init;
531
+ %% write exec;
532
+
533
+ if (cs >= EDN_vector_first_final) {
534
+ o = arr;
535
+ return p + 1;
536
+ }
537
+ else if (cs == EDN_vector_error) {
538
+ error(*p);
539
+ return pe;
540
+ }
541
+ else if (cs == EDN_vector_en_main) {} // silence ragel warning
542
+ return NULL;
543
+ }
544
+
545
+
546
+
547
+ // ============================================================
548
+ // list parsing machine
549
+ // NOTE(review): the first element is matched with begin_value only, whereas EDN_vector uses element (value or dispatch) - confirm this is intended
550
+ %%{
551
+ machine EDN_list;
552
+ include EDN_vector_common;
553
+
554
+ write data;
555
+
556
+ main := begin_list ignore*
557
+ ((begin_value >parse_value ignore*)
558
+ (ignore* next_element ignore*)*)?
559
+ end_list @err(close_err)
560
+ @exit;
561
+ }%%
562
+
563
+ //
564
+ // list parsing: runs the EDN_list machine on text starting at the opening parenthesis and stores the collected elements in o as a Rice::Array
565
+ // Returns p + 1 in a final state; calls error(*p) and returns pe in the error state; returns NULL otherwise.
566
+ const char* edn::Parser::EDN_parse_list(const char *p, const char *pe, Rice::Object& o)
567
+ {
568
+ static const char* EDN_TYPE = "list"; // read by the close_err action
569
+
570
+ int cs;
571
+ Rice::Array arr; // filled by the parse_value / parse_dispatch actions
572
+
573
+ %% write init;
574
+ %% write exec;
575
+
576
+ if (cs >= EDN_list_first_final) {
577
+ o = arr;
578
+ return p + 1;
579
+ }
580
+ else if (cs == EDN_list_error) {
581
+ error(*p);
582
+ return pe;
583
+ }
584
+ else if (cs == EDN_list_en_main) {} // silence ragel warning
585
+ return NULL;
586
+ }
587
+
588
+
589
+
590
+ // ============================================================
591
+ // hash parsing: opening brace, zero or more key/value pairs, closing brace
592
+ // parse_key stores the key in k; parse_value stores the value in v and writes map[k] = v. k, v and map are locals of EDN_parse_map.
593
+ %%{
594
+ machine EDN_map;
595
+ include EDN_common;
596
+
597
+ write data;
598
+
599
+ action parse_key {
600
+ const char *np = EDN_parse_value(fpc, pe, k);
601
+ if (np == NULL) {
602
+ fhold; fbreak;
603
+ } else {
604
+ fexec np;
605
+ }
606
+ }
607
+
608
+ action parse_value {
609
+ const char *np = EDN_parse_value(fpc, pe, v);
610
+ if (np == NULL) {
611
+ fhold; fbreak;
612
+ } else {
613
+ map[k] = v; // k was set by the preceding parse_key action
614
+ fexec np;
615
+ }
616
+ }
617
+
618
+ # action to report missing value in k/v pair
619
+ action pair_err {
620
+ error("map pair not found");
621
+ fexec pe;
622
+ }
623
+
624
+ action exit { fhold; fbreak; }
625
+
626
+ pair = ignore* begin_value >parse_key ignore* begin_value >parse_value @err(pair_err);
627
+ next_pair = ignore* pair;
628
+
629
+ main := (
630
+ begin_map
631
+ (pair (next_pair)*)? ignore*
632
+ end_map @err(close_err)
633
+ ) @exit;
634
+ }%%
635
+ // Runs the EDN_map machine on text starting at the opening brace and stores the resulting Rice::Hash in o.
636
+ // Returns p + 1 in a final state, pe in the error state (this function itself does not call error(); the pair_err and close_err actions do), and NULL otherwise.
637
+ const char* edn::Parser::EDN_parse_map(const char *p, const char *pe, Rice::Object& o)
638
+ {
639
+ static const char* EDN_TYPE = "map"; // read by the close_err action
640
+
641
+ int cs;
642
+ Rice::Hash map;
643
+ Rice::Object k, v; // current key and value, written by the parse_key / parse_value actions
644
+
645
+ %% write init;
646
+ p_save = p;
647
+ %% write exec;
648
+
649
+ if (cs >= EDN_map_first_final) {
650
+ o = map;
651
+ return p + 1;
652
+ }
653
+ else if (cs == EDN_map_error) {
654
+ return pe;
655
+ }
656
+ else if (cs == EDN_map_en_main) {} // silence ragel warning
657
+ return NULL;
658
+ }
659
+
660
+
661
+
662
+ // ============================================================
663
+ // main parsing machine: ignorable text around a single vector, map or list, whose value is stored in result
664
+ // write data nofinal: the first-final constant is not emitted; parse() only checks for the error state.
665
+ %%{
666
+ machine EDN;
667
+ include EDN_common;
668
+
669
+ write data nofinal;
670
+
671
+ action parse_vector {
672
+ const char* np = EDN_parse_vector(fpc, pe, result);
673
+ if (np == NULL) { fhold; fbreak; } else fexec np; // NULL: stop this machine here; otherwise resume scanning at the returned position
674
+ }
675
+
676
+ action parse_map {
677
+ const char *np = EDN_parse_map(fpc, pe, result);
678
+ if (np == NULL) { fhold; fbreak; } else fexec np;
679
+ }
680
+
681
+ action parse_list {
682
+ const char *np = EDN_parse_list(fpc, pe, result);
683
+ if (np == NULL) { fhold; fbreak; } else fexec np;
684
+ }
685
+
686
+ main := ignore* (
687
+ begin_vector >parse_vector |
688
+ begin_map >parse_map |
689
+ begin_list >parse_list
690
+ ) ignore*;
691
+ }%%
692
+
693
+ // Entry point: parses len bytes starting at buf with the top-level EDN machine.
694
+ // Resets line_number to 1 before scanning.
695
+ // Returns Qnil after calling error(*p) when the machine ends in its error state; otherwise returns result (left default-constructed if no parse action stored a value).
696
+ Rice::Object edn::Parser::parse(const char* buf, std::size_t len)
697
+ {
698
+ int cs;
699
+ const char *p;
700
+ const char *pe;
701
+ Rice::Object result;
702
+
703
+ line_number = 1;
704
+
705
+ %% write init;
706
+ p = &buf[0];
707
+ pe = p + len;
708
+ eof = pe; // eof defined in Parser class
709
+ %% write exec;
710
+
711
+ if (cs == EDN_error) {
712
+ error(*p);
713
+ return Qnil;
714
+ }
715
+ else if (cs == EDN_en_main) {} // silence ragel warning
716
+ return result;
717
+ }
718
+
719
+ /*
720
+ * Local variables:
721
+ * mode: c
722
+ * c-file-style: ruby
723
+ * indent-tabs-mode: nil
724
+ * End:
725
+ */