edn_turbo 0.5.3 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/Gemfile +2 -0
- data/Rakefile +2 -3
- data/bin/ppedn +11 -12
- data/bin/ppedn-ruby +6 -5
- data/ext/edn_turbo/depend +5 -3
- data/ext/edn_turbo/edn_parser.cc +297 -295
- data/ext/edn_turbo/edn_parser.rl +20 -18
- data/ext/edn_turbo/extconf.rb +7 -11
- data/ext/edn_turbo/main.cc +28 -27
- data/ext/edn_turbo/{edn_parser.h → parser.h} +0 -42
- data/ext/edn_turbo/parser_def.cc +197 -0
- data/ext/edn_turbo/util.cc +240 -0
- data/ext/edn_turbo/util.h +39 -0
- data/ext/edn_turbo/util_unicode.cc +36 -0
- data/ext/edn_turbo/util_unicode.h +14 -0
- data/lib/edn_turbo/edn_parser.rb +6 -3
- data/lib/edn_turbo/version.rb +4 -2
- data/lib/edn_turbo.rb +2 -0
- data/test/test_output_diff.rb +38 -49
- metadata +9 -7
- data/ext/edn_turbo/edn_parser_util.cc +0 -424
- data/ext/edn_turbo/edn_parser_util.h +0 -11
- data/ext/edn_turbo/edn_parser_util_unicode.cc +0 -33
@@ -1,424 +0,0 @@
|
|
1
|
-
#include <iostream>
|
2
|
-
#include <string>
|
3
|
-
#include <stack>
|
4
|
-
#include <vector>
|
5
|
-
#include <limits>
|
6
|
-
#include <exception>
|
7
|
-
|
8
|
-
#include <cstring>
|
9
|
-
#include <stdexcept>
|
10
|
-
|
11
|
-
#include <ruby/ruby.h>
|
12
|
-
#include <ruby/encoding.h>
|
13
|
-
|
14
|
-
#include "edn_parser.h"
|
15
|
-
#include "edn_parser_util.h"
|
16
|
-
|
17
|
-
namespace edn
|
18
|
-
{
|
19
|
-
//
|
20
|
-
// Number of characters needed to print the largest value of T in fixed
// (non-scientific) notation. The argument only selects T; its value is
// never read.
template <typename T>
static std::size_t get_max_chars(T)
{
    std::stringstream formatted;
    formatted.setf(std::ios_base::fixed, std::ios_base::floatfield);
    formatted << std::numeric_limits<T>::max();

    const std::string digits = formatted.str();
    return digits.size();
}
|
28
|
-
|
29
|
-
static const std::size_t LL_max_chars = get_max_chars<>((long) 1);
|
30
|
-
static const std::size_t LD_max_chars = get_max_chars<>((double) 1);
|
31
|
-
|
32
|
-
|
33
|
-
// parser destructor
|
34
|
-
//
|
35
|
-
Parser::~Parser()
|
36
|
-
{
|
37
|
-
reset_state();
|
38
|
-
del_top_meta_list();
|
39
|
-
|
40
|
-
if (io_buffer) {
|
41
|
-
free(reinterpret_cast<void*>(io_buffer));
|
42
|
-
}
|
43
|
-
}
|
44
|
-
|
45
|
-
// =================================================================
|
46
|
-
// for token-by-token parsing. If a discard or metadata is parsed,
|
47
|
-
// attempt to get the following value
|
48
|
-
//
|
49
|
-
VALUE Parser::next()
|
50
|
-
{
|
51
|
-
VALUE token = EDNT_EOF_CONST;
|
52
|
-
|
53
|
-
// buffer if reading from an IO
|
54
|
-
if (core_io || (read_io != Qnil)) {
|
55
|
-
fill_buf();
|
56
|
-
}
|
57
|
-
|
58
|
-
while (!is_eof())
|
59
|
-
{
|
60
|
-
// fetch a token. If it's metadata or discard
|
61
|
-
VALUE v = EDNT_EOF_CONST;
|
62
|
-
eTokenState state = parse_next(v);
|
63
|
-
|
64
|
-
if (state == TOKEN_OK) {
|
65
|
-
// valid token
|
66
|
-
token = v;
|
67
|
-
break;
|
68
|
-
}
|
69
|
-
else if (state == TOKEN_ERROR) {
|
70
|
-
token = EDNT_EOF_CONST;
|
71
|
-
break;
|
72
|
-
}
|
73
|
-
}
|
74
|
-
|
75
|
-
return token;
|
76
|
-
}
|
77
|
-
|
78
|
-
// reset parsing state
|
79
|
-
//
|
80
|
-
void Parser::reset_state()
|
81
|
-
{
|
82
|
-
line_number = 1;
|
83
|
-
discard.clear();
|
84
|
-
|
85
|
-
// remove any remaining levels except for the first
|
86
|
-
while (metadata.size() > 1) {
|
87
|
-
del_top_meta_list();
|
88
|
-
}
|
89
|
-
// but clear any metadata on the first
|
90
|
-
metadata.top()->clear();
|
91
|
-
|
92
|
-
// clean up
|
93
|
-
core_io = NULL;
|
94
|
-
read_io = Qnil;
|
95
|
-
p = pe = eof = NULL;
|
96
|
-
}
|
97
|
-
|
98
|
-
//
|
99
|
-
// set a new source
|
100
|
-
void Parser::set_source(const char* src, std::size_t len)
|
101
|
-
{
|
102
|
-
reset_state();
|
103
|
-
// set ragel state
|
104
|
-
p = src;
|
105
|
-
pe = src + len;
|
106
|
-
eof = pe;
|
107
|
-
}
|
108
|
-
|
109
|
-
void Parser::set_source(FILE* fp)
|
110
|
-
{
|
111
|
-
reset_state();
|
112
|
-
core_io = fp;
|
113
|
-
}
|
114
|
-
|
115
|
-
// Use a ruby IO-like object as the new source. Only the object is
// recorded here; any previous parse state is reset first.
void Parser::set_source(VALUE str_io)
{
    reset_state();
    this->read_io = str_io;
}
|
120
|
-
|
121
|
-
//
|
122
|
-
// for IO sources, read and fill a buffer
|
123
|
-
void Parser::fill_buf()
|
124
|
-
{
|
125
|
-
std::string str_buf;
|
126
|
-
|
127
|
-
// read as much data available
|
128
|
-
if (core_io) {
|
129
|
-
// ruby core IO types
|
130
|
-
char c;
|
131
|
-
while (1)
|
132
|
-
{
|
133
|
-
c = fgetc(core_io);
|
134
|
-
if (c == EOF) {
|
135
|
-
break;
|
136
|
-
}
|
137
|
-
str_buf += c;
|
138
|
-
}
|
139
|
-
|
140
|
-
} else if (read_io != Qnil) {
|
141
|
-
// StringIO, etc. Call read() from ruby side
|
142
|
-
VALUE v = ruby_io_read(read_io);
|
143
|
-
if (TYPE(v) == T_STRING) {
|
144
|
-
str_buf.assign( StringValuePtr(v), RSTRING_LEN(v));
|
145
|
-
}
|
146
|
-
}
|
147
|
-
|
148
|
-
// set the buffer to read from
|
149
|
-
if (str_buf.length() > 0) {
|
150
|
-
// first time when io_buffer is NULL, pe & p = 0
|
151
|
-
uintmax_t new_length = (pe - p) + str_buf.length();
|
152
|
-
if (new_length > (((uintmax_t) 1 << 32) - 1)) {
|
153
|
-
// icu -> 32-bit. TODO: handle
|
154
|
-
rb_raise(rb_eRuntimeError, "Unsupported string buffer length");
|
155
|
-
}
|
156
|
-
char* start = NULL;
|
157
|
-
|
158
|
-
// allocate or extend storage needed
|
159
|
-
if (!io_buffer) {
|
160
|
-
io_buffer = reinterpret_cast<char*>(malloc(new_length));
|
161
|
-
start = io_buffer;
|
162
|
-
} else if (io_buffer_len < new_length) {
|
163
|
-
// resize the buffer
|
164
|
-
io_buffer = reinterpret_cast<char*>(realloc(reinterpret_cast<void*>(io_buffer), new_length));
|
165
|
-
}
|
166
|
-
|
167
|
-
if (!start) {
|
168
|
-
// appending to the buffer but move the data not yet
|
169
|
-
// parsed first to the front
|
170
|
-
memmove(io_buffer, p, pe - p);
|
171
|
-
start = io_buffer + (pe - p);
|
172
|
-
}
|
173
|
-
|
174
|
-
// and copy
|
175
|
-
memcpy(start, str_buf.c_str(), str_buf.length());
|
176
|
-
io_buffer_len = (uint32_t) new_length;
|
177
|
-
|
178
|
-
// set ragel state
|
179
|
-
p = io_buffer;
|
180
|
-
pe = p + new_length;
|
181
|
-
eof = pe;
|
182
|
-
}
|
183
|
-
}
|
184
|
-
|
185
|
-
|
186
|
-
// =================================================================
|
187
|
-
// work-around for idiotic rb_protect convention in order to avoid
|
188
|
-
// using ruby/rice
|
189
|
-
//
|
190
|
-
typedef VALUE (edn_rb_f_type)( VALUE arg );
|
191
|
-
|
192
|
-
// we're using at most 2 args
|
193
|
-
struct prot_args {
|
194
|
-
prot_args(VALUE r, ID m) :
|
195
|
-
receiver(r), method(m), count(0) {
|
196
|
-
}
|
197
|
-
prot_args(VALUE r, ID m, VALUE arg) :
|
198
|
-
receiver(r), method(m), count(1) {
|
199
|
-
args[0] = arg;
|
200
|
-
}
|
201
|
-
prot_args(VALUE r, ID m, VALUE arg1, VALUE arg2) :
|
202
|
-
receiver(r), method(m), count(2) {
|
203
|
-
args[0] = arg1;
|
204
|
-
args[1] = arg2;
|
205
|
-
}
|
206
|
-
|
207
|
-
VALUE call() const {
|
208
|
-
return ((count == 0) ?
|
209
|
-
rb_funcall( receiver, method, 0 ) :
|
210
|
-
rb_funcall2( receiver, method, count, args ));
|
211
|
-
}
|
212
|
-
|
213
|
-
private:
|
214
|
-
VALUE receiver;
|
215
|
-
ID method;
|
216
|
-
int count;
|
217
|
-
VALUE args[2];
|
218
|
-
};
|
219
|
-
|
220
|
-
// this allows us to wrap with rb_protect()
|
221
|
-
static inline VALUE edn_wrap_funcall2( VALUE arg )
|
222
|
-
{
|
223
|
-
const prot_args* a = reinterpret_cast<const prot_args*>(arg);
|
224
|
-
if (a)
|
225
|
-
return a->call();
|
226
|
-
return Qnil;
|
227
|
-
}
|
228
|
-
|
229
|
-
// Run func(args) under rb_protect(). A non-zero state reported by
// rb_protect() is passed to Parser::throw_error(); otherwise the
// callback's result is returned.
static inline VALUE edn_prot_rb_funcall( edn_rb_f_type func, VALUE args )
{
    int state = 0;
    const VALUE result = rb_protect( func, args, &state );
    if (state != 0) {
        Parser::throw_error(state);
    }
    return result;
}
|
236
|
-
|
237
|
-
// Create a ruby string from a NUL-terminated C string under
// rb_protect(). A non-zero state is passed to Parser::throw_error().
// NOTE(review): rb_str_new_cstr is invoked through a VALUE(*)(VALUE)
// pointer, which assumes a const char* and a VALUE are passed the same
// way - confirm for the target ABIs.
static inline VALUE edn_prot_rb_new_str(const char* str) {
    typedef VALUE (*protected_fn)(VALUE);

    const protected_fn make_str = reinterpret_cast<protected_fn>(rb_str_new_cstr);
    const VALUE raw_arg = reinterpret_cast<VALUE>(str);

    int state = 0;
    const VALUE s = rb_protect( make_str, raw_arg, &state );
    if (state != 0) {
        Parser::throw_error(state);
    }
    return s;
}
|
244
|
-
|
245
|
-
// Associate str with ruby's UTF-8 encoding and return the result of
// rb_enc_associate(). Takes and returns a single VALUE so it can be
// used as an rb_protect() callback.
static inline VALUE edn_rb_enc_associate_utf8(VALUE str)
{
    rb_encoding* const utf8 = rb_utf8_encoding();
    return rb_enc_associate(str, utf8);
}
|
249
|
-
|
250
|
-
// =================================================================
|
251
|
-
// utils
|
252
|
-
|
253
|
-
//
|
254
|
-
// convert to int.. if string rep has more digits than long can
|
255
|
-
// hold, call into ruby to get a big num
|
256
|
-
VALUE Parser::integer_to_ruby(const char* str, std::size_t len)
|
257
|
-
{
|
258
|
-
if (str[len-1] == 'M' || len >= LL_max_chars)
|
259
|
-
{
|
260
|
-
std::string buf(str, len);
|
261
|
-
VALUE vs = edn_prot_rb_new_str(buf.c_str());
|
262
|
-
prot_args args(vs, EDNT_STRING_TO_I_METHOD);
|
263
|
-
return edn_prot_rb_funcall( edn_wrap_funcall2, reinterpret_cast<VALUE>(&args) );
|
264
|
-
}
|
265
|
-
|
266
|
-
return LONG2NUM(buftotype<long>(str, len));
|
267
|
-
}
|
268
|
-
|
269
|
-
//
|
270
|
-
// as above.. TODO: check exponential..
|
271
|
-
VALUE Parser::float_to_ruby(const char* str, std::size_t len)
|
272
|
-
{
|
273
|
-
if (str[len-1] == 'M' || len >= LD_max_chars)
|
274
|
-
{
|
275
|
-
std::string buf(str, len);
|
276
|
-
VALUE vs = edn_prot_rb_new_str(buf.c_str());
|
277
|
-
|
278
|
-
if (str[len-1] == 'M') {
|
279
|
-
return Parser::make_edn_type(EDNT_MAKE_BIG_DECIMAL_METHOD, vs);
|
280
|
-
}
|
281
|
-
|
282
|
-
prot_args args(vs, EDNT_STRING_TO_F_METHOD);
|
283
|
-
return edn_prot_rb_funcall( edn_wrap_funcall2, reinterpret_cast<VALUE>(&args) );
|
284
|
-
}
|
285
|
-
|
286
|
-
return rb_float_new(buftotype<double>(str, len));
|
287
|
-
}
|
288
|
-
|
289
|
-
|
290
|
-
//
|
291
|
-
// read from a StringIO - expensive!!!
|
292
|
-
//
|
293
|
-
VALUE Parser::ruby_io_read(VALUE io)
|
294
|
-
{
|
295
|
-
prot_args args(io, EDNT_READ_METHOD);
|
296
|
-
return edn_prot_rb_funcall( edn_wrap_funcall2, reinterpret_cast<VALUE>(&args) );
|
297
|
-
}
|
298
|
-
|
299
|
-
//
|
300
|
-
// copies the string data, unescaping any present values that need to be replaced
|
301
|
-
//
|
302
|
-
bool Parser::parse_byte_stream(const char *p_start, const char *p_end, VALUE& v_utf8,
|
303
|
-
bool encode)
|
304
|
-
{
|
305
|
-
if (p_end > p_start) {
|
306
|
-
std::string buf;
|
307
|
-
|
308
|
-
if (encode) {
|
309
|
-
if (!util::to_utf8(p_start, (uint32_t) (p_end - p_start), buf))
|
310
|
-
return false;
|
311
|
-
}
|
312
|
-
else {
|
313
|
-
buf.append(p_start, p_end - p_start);
|
314
|
-
}
|
315
|
-
|
316
|
-
// utf-8 encode
|
317
|
-
VALUE vs = edn_prot_rb_new_str(buf.c_str());
|
318
|
-
int error;
|
319
|
-
v_utf8 = rb_protect( edn_rb_enc_associate_utf8, vs, &error);
|
320
|
-
if (error) Parser::throw_error(error);
|
321
|
-
return true;
|
322
|
-
} else if (p_end == p_start) {
|
323
|
-
v_utf8 = rb_str_new("", 0);
|
324
|
-
return true;
|
325
|
-
}
|
326
|
-
|
327
|
-
return false;
|
328
|
-
}
|
329
|
-
|
330
|
-
//
|
331
|
-
// handles things like \c, \newline
|
332
|
-
//
|
333
|
-
bool Parser::parse_escaped_char(const char *p, const char *pe, VALUE& v)
|
334
|
-
{
|
335
|
-
std::string buf;
|
336
|
-
std::size_t len = pe - p;
|
337
|
-
buf.append(p, len);
|
338
|
-
|
339
|
-
if (len > 1) {
|
340
|
-
if (buf == "newline") buf = '\n';
|
341
|
-
else if (buf == "tab") buf = '\t';
|
342
|
-
else if (buf == "return") buf = '\r';
|
343
|
-
else if (buf == "space") buf = ' ';
|
344
|
-
else if (buf == "formfeed") buf = '\f';
|
345
|
-
else if (buf == "backspace") buf = '\b';
|
346
|
-
// TODO: is this supported?
|
347
|
-
else if (buf == "verticaltab") buf = '\v';
|
348
|
-
else return false;
|
349
|
-
}
|
350
|
-
|
351
|
-
v = edn_prot_rb_new_str( buf.c_str() );
|
352
|
-
return true;
|
353
|
-
}
|
354
|
-
|
355
|
-
|
356
|
-
//
|
357
|
-
// get a set representation from the ruby side. See edn_turbo.rb
|
358
|
-
VALUE Parser::make_edn_type(ID method, VALUE sym)
|
359
|
-
{
|
360
|
-
VALUE edn_module = rb_const_get(rb_cObject, edn::EDN_MODULE_SYMBOL);
|
361
|
-
prot_args args(edn_module, method, sym);
|
362
|
-
return edn_prot_rb_funcall( edn_wrap_funcall2, reinterpret_cast<VALUE>(&args) );
|
363
|
-
}
|
364
|
-
|
365
|
-
// Two-argument variant: looks up the constant that EDN_MODULE_SYMBOL
// names under Object and forwards to the overload that takes an
// explicit receiver.
VALUE Parser::make_edn_type(ID method, VALUE name, VALUE data)
{
    return make_edn_type(rb_const_get(rb_cObject, edn::EDN_MODULE_SYMBOL),
                         method, name, data);
}
|
370
|
-
|
371
|
-
// Call `method` on `module` with the arguments (name, data) under
// rb_protect() and return the result.
VALUE Parser::make_edn_type(VALUE module, ID method, VALUE name, VALUE data)
{
    prot_args invocation(module, method, name, data);
    const VALUE as_arg = reinterpret_cast<VALUE>(&invocation);
    return edn_prot_rb_funcall( edn_wrap_funcall2, as_arg );
}
|
376
|
-
|
377
|
-
|
378
|
-
// =================================================================
|
379
|
-
// METADATA
|
380
|
-
//
|
381
|
-
// returns an array of metadata value(s) saved in reverse order
|
382
|
-
// (right to left) - the ruby side will interpret this
|
383
|
-
VALUE Parser::ruby_meta()
|
384
|
-
{
|
385
|
-
VALUE m_ary = rb_ary_new();
|
386
|
-
|
387
|
-
// pop from the back of the top-most list
|
388
|
-
while (!metadata.top()->empty()) {
|
389
|
-
rb_ary_push(m_ary, metadata.top()->back());
|
390
|
-
metadata.top()->pop_back();
|
391
|
-
}
|
392
|
-
|
393
|
-
return m_ary;
|
394
|
-
}
|
395
|
-
|
396
|
-
|
397
|
-
// =================================================================
|
398
|
-
//
|
399
|
-
// error reporting
|
400
|
-
void Parser::throw_error(int error)
|
401
|
-
{
|
402
|
-
if (error == 0)
|
403
|
-
return;
|
404
|
-
|
405
|
-
VALUE err = rb_errinfo();
|
406
|
-
VALUE klass = rb_class_path(CLASS_OF(err));
|
407
|
-
VALUE message = rb_obj_as_string(err);
|
408
|
-
std::stringstream msg;
|
409
|
-
msg << RSTRING_PTR(klass) << " exception: " << RSTRING_PTR(message);
|
410
|
-
throw std::runtime_error(msg.str());
|
411
|
-
}
|
412
|
-
|
413
|
-
void Parser::error(const std::string& func, const std::string& err, char c) const
|
414
|
-
{
|
415
|
-
std::cerr << "Parse error "
|
416
|
-
// "from " << func << "() "
|
417
|
-
;
|
418
|
-
if (err.length() > 0)
|
419
|
-
std::cerr << "(" << err << ") ";
|
420
|
-
if (c != '\0')
|
421
|
-
std::cerr << "at '" << c << "' ";
|
422
|
-
std::cerr << "on line " << line_number << std::endl;
|
423
|
-
}
|
424
|
-
}
|
@@ -1,33 +0,0 @@
|
|
1
|
-
#include <string>
|
2
|
-
|
3
|
-
//
|
4
|
-
// needed to define this in its own file because icu and ruby have
|
5
|
-
// differing definitions for Uchar and the compiler complains
|
6
|
-
//
|
7
|
-
#include <unicode/utypes.h>
|
8
|
-
#include <unicode/ustring.h>
|
9
|
-
#include <unicode/ucnv.h>
|
10
|
-
#include <unicode/unistr.h>
|
11
|
-
|
12
|
-
#include "edn_parser_util.h"
|
13
|
-
|
14
|
-
namespace edn
|
15
|
-
{
|
16
|
-
namespace util
|
17
|
-
{
|
18
|
-
//
|
19
|
-
// unescapes any values that need to be replaced, saves it to utf8
|
20
|
-
//
|
21
|
-
bool to_utf8(const char *s, uint32_t len, std::string& rslt)
|
22
|
-
{
|
23
|
-
icu::UnicodeString ustr(s, len);
|
24
|
-
|
25
|
-
if (ustr.isBogus()) {
|
26
|
-
return false;
|
27
|
-
}
|
28
|
-
|
29
|
-
ustr.unescape().toUTF8String(rslt);
|
30
|
-
return true;
|
31
|
-
}
|
32
|
-
}
|
33
|
-
}
|