zscan 1.0.1 → 1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/{bench.rb → benchmark/vs-strscan.rb} +1 -1
- data/benchmark/vs-unpack.rb +21 -0
- data/ext/bspec_exec.inc +156 -0
- data/ext/bspec_opcode_names.inc +3 -0
- data/ext/zscan.c +210 -18
- data/lib/zscan.rb +99 -2
- data/lib/zscan/instructions.rb +165 -0
- data/rakefile +137 -0
- data/readme.md +80 -8
- data/spec/binary_scan_spec.rb +28 -0
- data/spec/combinator_spec.rb +52 -0
- data/spec/spec_helper.rb +8 -0
- data/spec/typed_scan_spec.rb +48 -0
- data/spec/zscan_spec.rb +18 -24
- data/zscan.gemspec +2 -2
- metadata +11 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 998b97db8e9341f3920caa27bf11558954a777ba
|
4
|
+
data.tar.gz: 033986f8e4a4086985bca23c84f8acfabd2f5e29
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a8c23d9f29b57e113a55e46fb024bce0337aa8ee5b8317744ed9ce9c13ddf35e4b1b7af9bbf25d67c6b14e6c30bb7de00c214692cb5fea05c2e0f4fa6116478b
|
7
|
+
data.tar.gz: 06ccbc8c793f873a630c4774b746ad8ca1cfa4c2895414871c0b435812bd902e3f75618a6134551b5ab3f35b2838e12c9072e0bb15ff8f6010a7ba2ca4e8a83a
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require_relative "../lib/zscan"
|
2
|
+
require "benchmark"
|
3
|
+
|
4
|
+
spec = ZScan.binary_spec do
|
5
|
+
int8
|
6
|
+
double_le 2
|
7
|
+
single_be
|
8
|
+
end
|
9
|
+
|
10
|
+
arr = [1, 1.1, 1.2, 1.3]
|
11
|
+
str = arr.pack 'cE2g'
|
12
|
+
z = Zscan.new str.b
|
13
|
+
|
14
|
+
puts 'reference nop group'
|
15
|
+
puts Benchmark.measure{ 100000.times{ z.pos = 0 } }
|
16
|
+
puts 'ZScan#unpack'
|
17
|
+
puts Benchmark.measure{ 100000.times{ z.pos = 0; z.unpack 'cE2g' } }
|
18
|
+
puts 'ZScan#scan_binary'
|
19
|
+
puts Benchmark.measure{ 100000.times{ z.pos = 0; z.scan_binary spec } }
|
20
|
+
puts 'String#unpack'
|
21
|
+
puts Benchmark.measure{ 100000.times{ z.pos = 0; str.unpack 'cE2g' } }
|
data/ext/bspec_exec.inc
ADDED
@@ -0,0 +1,156 @@
|
|
1
|
+
// GENERATED WITH: rake gen
|
2
|
+
#line 2 "ext/bspec_exec.inc"
|
3
|
+
__attribute__((__noinline__))
|
4
|
+
static VALUE bspec_exec(void** ip, char* s, VALUE a) {
|
5
|
+
static void* opcodes[] = { &&BS_RET, &&BS_INT8, &&BS_INT16, &&BS_INT16_SWAP, &&BS_INT32, &&BS_INT32_SWAP, &&BS_INT64, &&BS_INT64_SWAP, &&BS_UINT8, &&BS_UINT16, &&BS_UINT16_SWAP, &&BS_UINT32, &&BS_UINT32_SWAP, &&BS_UINT64, &&BS_UINT64_SWAP, &&BS_SINGLE, &&BS_SINGLE_SWAP, &&BS_DOUBLE, &&BS_DOUBLE_SWAP };
|
6
|
+
if (ip == NULL) {
|
7
|
+
return (VALUE)opcodes;
|
8
|
+
}
|
9
|
+
goto **(ip++);
|
10
|
+
BS_RET:
|
11
|
+
return a;
|
12
|
+
BS_INT8:
|
13
|
+
{
|
14
|
+
uint8_t r = ((uint8_t*)s)[0];
|
15
|
+
rb_ary_push(a, INT2FIX(CAST(r, int8_t)));
|
16
|
+
s += 1;
|
17
|
+
goto **(ip++);
|
18
|
+
}
|
19
|
+
|
20
|
+
BS_INT16:
|
21
|
+
{
|
22
|
+
uint16_t r = ((uint16_t*)s)[0];
|
23
|
+
rb_ary_push(a, INT2FIX(CAST(r, int16_t)));
|
24
|
+
s += 2;
|
25
|
+
goto **(ip++);
|
26
|
+
}
|
27
|
+
|
28
|
+
BS_INT16_SWAP:
|
29
|
+
{
|
30
|
+
uint16_t r = swap16(((uint16_t*)s)[0]);
|
31
|
+
rb_ary_push(a, INT2FIX(CAST(r, int16_t)));
|
32
|
+
s += 2;
|
33
|
+
goto **(ip++);
|
34
|
+
}
|
35
|
+
|
36
|
+
BS_INT32:
|
37
|
+
{
|
38
|
+
uint32_t r = ((uint32_t*)s)[0];
|
39
|
+
rb_ary_push(a, INT2NUM(CAST(r, int32_t)));
|
40
|
+
s += 4;
|
41
|
+
goto **(ip++);
|
42
|
+
}
|
43
|
+
|
44
|
+
BS_INT32_SWAP:
|
45
|
+
{
|
46
|
+
uint32_t r = swap32(((uint32_t*)s)[0]);
|
47
|
+
rb_ary_push(a, INT2NUM(CAST(r, int32_t)));
|
48
|
+
s += 4;
|
49
|
+
goto **(ip++);
|
50
|
+
}
|
51
|
+
|
52
|
+
BS_INT64:
|
53
|
+
{
|
54
|
+
uint64_t r = ((uint64_t*)s)[0];
|
55
|
+
rb_ary_push(a, INT64toNUM(CAST(r, int64_t)));
|
56
|
+
s += 8;
|
57
|
+
goto **(ip++);
|
58
|
+
}
|
59
|
+
|
60
|
+
BS_INT64_SWAP:
|
61
|
+
{
|
62
|
+
uint64_t r = swap64(((uint64_t*)s)[0]);
|
63
|
+
rb_ary_push(a, INT64toNUM(CAST(r, int64_t)));
|
64
|
+
s += 8;
|
65
|
+
goto **(ip++);
|
66
|
+
}
|
67
|
+
|
68
|
+
BS_UINT8:
|
69
|
+
{
|
70
|
+
uint8_t r = ((uint8_t*)s)[0];
|
71
|
+
rb_ary_push(a, INT2FIX(CAST(r, uint8_t)));
|
72
|
+
s += 1;
|
73
|
+
goto **(ip++);
|
74
|
+
}
|
75
|
+
|
76
|
+
BS_UINT16:
|
77
|
+
{
|
78
|
+
uint16_t r = ((uint16_t*)s)[0];
|
79
|
+
rb_ary_push(a, INT2FIX(CAST(r, uint16_t)));
|
80
|
+
s += 2;
|
81
|
+
goto **(ip++);
|
82
|
+
}
|
83
|
+
|
84
|
+
BS_UINT16_SWAP:
|
85
|
+
{
|
86
|
+
uint16_t r = swap16(((uint16_t*)s)[0]);
|
87
|
+
rb_ary_push(a, INT2FIX(CAST(r, uint16_t)));
|
88
|
+
s += 2;
|
89
|
+
goto **(ip++);
|
90
|
+
}
|
91
|
+
|
92
|
+
BS_UINT32:
|
93
|
+
{
|
94
|
+
uint32_t r = ((uint32_t*)s)[0];
|
95
|
+
rb_ary_push(a, UINT64toNUM(r));
|
96
|
+
s += 4;
|
97
|
+
goto **(ip++);
|
98
|
+
}
|
99
|
+
|
100
|
+
BS_UINT32_SWAP:
|
101
|
+
{
|
102
|
+
uint32_t r = swap32(((uint32_t*)s)[0]);
|
103
|
+
rb_ary_push(a, UINT64toNUM(r));
|
104
|
+
s += 4;
|
105
|
+
goto **(ip++);
|
106
|
+
}
|
107
|
+
|
108
|
+
BS_UINT64:
|
109
|
+
{
|
110
|
+
uint64_t r = ((uint64_t*)s)[0];
|
111
|
+
rb_ary_push(a, UINT64toNUM(r));
|
112
|
+
s += 8;
|
113
|
+
goto **(ip++);
|
114
|
+
}
|
115
|
+
|
116
|
+
BS_UINT64_SWAP:
|
117
|
+
{
|
118
|
+
uint64_t r = swap64(((uint64_t*)s)[0]);
|
119
|
+
rb_ary_push(a, UINT64toNUM(r));
|
120
|
+
s += 8;
|
121
|
+
goto **(ip++);
|
122
|
+
}
|
123
|
+
|
124
|
+
BS_SINGLE:
|
125
|
+
{
|
126
|
+
uint32_t r = ((uint32_t*)s)[0];
|
127
|
+
rb_ary_push(a, DBL2NUM((double)CAST(r, float)));
|
128
|
+
s += 4;
|
129
|
+
goto **(ip++);
|
130
|
+
}
|
131
|
+
|
132
|
+
BS_SINGLE_SWAP:
|
133
|
+
{
|
134
|
+
uint32_t r = swap32(((uint32_t*)s)[0]);
|
135
|
+
rb_ary_push(a, DBL2NUM((double)CAST(r, float)));
|
136
|
+
s += 4;
|
137
|
+
goto **(ip++);
|
138
|
+
}
|
139
|
+
|
140
|
+
BS_DOUBLE:
|
141
|
+
{
|
142
|
+
uint64_t r = ((uint64_t*)s)[0];
|
143
|
+
rb_ary_push(a, DBL2NUM(CAST(r, double)));
|
144
|
+
s += 8;
|
145
|
+
goto **(ip++);
|
146
|
+
}
|
147
|
+
|
148
|
+
BS_DOUBLE_SWAP:
|
149
|
+
{
|
150
|
+
uint64_t r = swap64(((uint64_t*)s)[0]);
|
151
|
+
rb_ary_push(a, DBL2NUM(CAST(r, double)));
|
152
|
+
s += 8;
|
153
|
+
goto **(ip++);
|
154
|
+
}
|
155
|
+
|
156
|
+
}
|
@@ -0,0 +1,3 @@
|
|
1
|
+
// GENERATED WITH: rake gen
|
2
|
+
const char* bspec_opcode_names[] = {"RET", "INT8", "INT16", "INT16_SWAP", "INT32", "INT32_SWAP", "INT64", "INT64_SWAP", "UINT8", "UINT16", "UINT16_SWAP", "UINT32", "UINT32_SWAP", "UINT64", "UINT64_SWAP", "SINGLE", "SINGLE_SWAP", "DOUBLE", "DOUBLE_SWAP"};
|
3
|
+
long bspec_opcode_size = 19;
|
data/ext/zscan.c
CHANGED
@@ -1,23 +1,24 @@
|
|
1
1
|
#include <ruby/ruby.h>
|
2
2
|
#include <ruby/re.h>
|
3
3
|
#include <ruby/encoding.h>
|
4
|
+
#include <ctype.h>
|
5
|
+
|
6
|
+
// todo infect check
|
4
7
|
|
5
8
|
typedef struct {
|
6
|
-
|
7
|
-
|
9
|
+
long pos;
|
10
|
+
long bytepos;
|
8
11
|
} Pos;
|
9
12
|
|
10
13
|
typedef struct {
|
11
|
-
|
12
|
-
|
14
|
+
long pos;
|
15
|
+
long bytepos;
|
13
16
|
VALUE s;
|
14
|
-
|
15
|
-
|
17
|
+
long stack_i;
|
18
|
+
long stack_cap;
|
16
19
|
Pos* stack;
|
17
20
|
} ZScan;
|
18
21
|
|
19
|
-
#define P ZScan* p = rb_check_typeddata(self, &zscan_type)
|
20
|
-
|
21
22
|
static void zscan_mark(void* pp) {
|
22
23
|
ZScan* p = pp;
|
23
24
|
rb_gc_mark(p->s);
|
@@ -39,6 +40,8 @@ static const rb_data_type_t zscan_type = {
|
|
39
40
|
{zscan_mark, zscan_free, zscan_memsize}
|
40
41
|
};
|
41
42
|
|
43
|
+
#define P ZScan* p = rb_check_typeddata(self, &zscan_type)
|
44
|
+
|
42
45
|
static VALUE zscan_alloc(VALUE klass) {
|
43
46
|
ZScan* p = ALLOC(ZScan);
|
44
47
|
MEMZERO(p, ZScan, 1);
|
@@ -66,13 +69,12 @@ static VALUE zscan_pos(VALUE self) {
|
|
66
69
|
|
67
70
|
static VALUE zscan_advance(VALUE self, VALUE v_diff) {
|
68
71
|
P;
|
69
|
-
long
|
70
|
-
if (
|
72
|
+
long n = p->pos + NUM2LONG(v_diff);
|
73
|
+
if (n < 0) {
|
71
74
|
p->pos = 0;
|
72
75
|
p->bytepos = 0;
|
73
76
|
return self;
|
74
77
|
}
|
75
|
-
size_t n = signed_n;
|
76
78
|
|
77
79
|
// because there's no "reverse scan" API, we have a O(n) routine :(
|
78
80
|
if (n < p->pos) {
|
@@ -82,7 +84,7 @@ static VALUE zscan_advance(VALUE self, VALUE v_diff) {
|
|
82
84
|
|
83
85
|
if (n > p->pos) {
|
84
86
|
rb_encoding* enc = rb_enc_get(p->s);
|
85
|
-
|
87
|
+
long byteend = RSTRING_LEN(p->s);
|
86
88
|
char* ptr = RSTRING_PTR(p->s);
|
87
89
|
for (; p->pos < n && p->bytepos < byteend;) {
|
88
90
|
int n = rb_enc_mbclen(ptr + p->bytepos, ptr + byteend, enc);
|
@@ -105,7 +107,7 @@ static VALUE zscan_bytepos(VALUE self) {
|
|
105
107
|
static VALUE zscan_bytepos_eq(VALUE self, VALUE v_bytepos) {
|
106
108
|
P;
|
107
109
|
long signed_bytepos = NUM2LONG(v_bytepos);
|
108
|
-
|
110
|
+
long from, to, bytepos;
|
109
111
|
|
110
112
|
if (signed_bytepos > RSTRING_LEN(p->s)) {
|
111
113
|
bytepos = RSTRING_LEN(p->s);
|
@@ -127,7 +129,7 @@ static VALUE zscan_bytepos_eq(VALUE self, VALUE v_bytepos) {
|
|
127
129
|
|
128
130
|
rb_encoding* enc = rb_enc_get(p->s);
|
129
131
|
char* ptr = RSTRING_PTR(p->s);
|
130
|
-
|
132
|
+
long diff = 0;
|
131
133
|
for (; from < to;) {
|
132
134
|
int n = rb_enc_mbclen(ptr + from, ptr + to, enc);
|
133
135
|
if (n) {
|
@@ -154,7 +156,7 @@ static VALUE zscan_bytepos_eq(VALUE self, VALUE v_bytepos) {
|
|
154
156
|
|
155
157
|
static VALUE zscan_eos_p(VALUE self) {
|
156
158
|
P;
|
157
|
-
return (p->bytepos ==
|
159
|
+
return (p->bytepos == RSTRING_LEN(p->s) ? Qtrue : Qfalse);
|
158
160
|
}
|
159
161
|
|
160
162
|
regex_t *rb_reg_prepare_re(VALUE re, VALUE str);
|
@@ -262,10 +264,13 @@ static VALUE zscan_clear_pos_stack(VALUE self) {
|
|
262
264
|
return self;
|
263
265
|
}
|
264
266
|
|
265
|
-
|
266
|
-
if (!rb_block_given_p()) {
|
267
|
-
rb_raise(rb_eRuntimeError, "need a block")
|
267
|
+
#define REQUIRE_BLOCK \
|
268
|
+
if (!rb_block_given_p()) {\
|
269
|
+
rb_raise(rb_eRuntimeError, "need a block");\
|
268
270
|
}
|
271
|
+
|
272
|
+
static VALUE zscan_try(VALUE self) {
|
273
|
+
REQUIRE_BLOCK;
|
269
274
|
VALUE r;
|
270
275
|
zscan_push(self);
|
271
276
|
r = rb_yield(Qnil);
|
@@ -277,6 +282,175 @@ static VALUE zscan_try(VALUE self) {
|
|
277
282
|
return r;
|
278
283
|
}
|
279
284
|
|
285
|
+
static VALUE zscan_zero_or_one(int argc, VALUE* argv, VALUE self) {
|
286
|
+
REQUIRE_BLOCK;
|
287
|
+
volatile VALUE a = Qnil;
|
288
|
+
volatile VALUE r;
|
289
|
+
rb_scan_args(argc, argv, "01", &a);
|
290
|
+
if (a == Qnil) {
|
291
|
+
a = rb_ary_new();
|
292
|
+
}
|
293
|
+
zscan_push(self);
|
294
|
+
r = rb_yield(Qnil);
|
295
|
+
if (RTEST(r)) {
|
296
|
+
rb_funcall(a, rb_intern("<<"), 1, r);
|
297
|
+
zscan_drop(self);
|
298
|
+
} else {
|
299
|
+
zscan_pop(self);
|
300
|
+
}
|
301
|
+
return a;
|
302
|
+
}
|
303
|
+
|
304
|
+
static VALUE zscan_zero_or_more(int argc, VALUE* argv, VALUE self) {
|
305
|
+
REQUIRE_BLOCK;
|
306
|
+
volatile VALUE a = Qnil;
|
307
|
+
volatile VALUE r;
|
308
|
+
long backpos;
|
309
|
+
P;
|
310
|
+
rb_scan_args(argc, argv, "01", &a);
|
311
|
+
if (a == Qnil) {
|
312
|
+
a = rb_ary_new();
|
313
|
+
}
|
314
|
+
for (;;) {
|
315
|
+
zscan_push(self);
|
316
|
+
backpos = p->bytepos;
|
317
|
+
r = rb_yield(Qnil);
|
318
|
+
if (RTEST(r) && backpos != p->bytepos) {
|
319
|
+
rb_funcall(a, rb_intern("<<"), 1, r);
|
320
|
+
zscan_drop(self);
|
321
|
+
} else {
|
322
|
+
zscan_pop(self);
|
323
|
+
break;
|
324
|
+
}
|
325
|
+
}
|
326
|
+
return a;
|
327
|
+
}
|
328
|
+
|
329
|
+
static VALUE zscan_one_or_more(int argc, VALUE* argv, VALUE self) {
|
330
|
+
REQUIRE_BLOCK;
|
331
|
+
volatile VALUE a = Qnil;
|
332
|
+
volatile VALUE r;
|
333
|
+
|
334
|
+
r = rb_yield(Qnil);
|
335
|
+
if (RTEST(r)) {
|
336
|
+
long backpos;
|
337
|
+
P;
|
338
|
+
rb_scan_args(argc, argv, "01", &a);
|
339
|
+
if (a == Qnil) {
|
340
|
+
a = rb_ary_new();
|
341
|
+
}
|
342
|
+
|
343
|
+
rb_funcall(a, rb_intern("<<"), 1, r);
|
344
|
+
for (;;) {
|
345
|
+
zscan_push(self);
|
346
|
+
backpos = p->bytepos;
|
347
|
+
r = rb_yield(Qnil);
|
348
|
+
if (RTEST(r) && backpos != p->bytepos) {
|
349
|
+
rb_funcall(a, rb_intern("<<"), 1, r);
|
350
|
+
zscan_drop(self);
|
351
|
+
} else {
|
352
|
+
zscan_pop(self);
|
353
|
+
break;
|
354
|
+
}
|
355
|
+
}
|
356
|
+
return a;
|
357
|
+
} else {
|
358
|
+
return Qnil;
|
359
|
+
}
|
360
|
+
}
|
361
|
+
|
362
|
+
VALUE zscan_scan_float(VALUE self) {
|
363
|
+
P;
|
364
|
+
if (RSTRING_LEN(p->s) == p->bytepos) {
|
365
|
+
return Qnil;
|
366
|
+
}
|
367
|
+
|
368
|
+
char* s = RSTRING_PTR(p->s) + p->bytepos;
|
369
|
+
if (isspace(s[0])) {
|
370
|
+
return Qnil;
|
371
|
+
}
|
372
|
+
char* e;
|
373
|
+
double d = strtod(s, &e);
|
374
|
+
if (e == s || e - s > RSTRING_LEN(p->s) - p->bytepos) {
|
375
|
+
return Qnil;
|
376
|
+
} else {
|
377
|
+
// it ok to use advance because the source is ascii compatible
|
378
|
+
zscan_advance(self, LONG2NUM(e - s));
|
379
|
+
return DBL2NUM(d);
|
380
|
+
}
|
381
|
+
}
|
382
|
+
|
383
|
+
static VALUE bspec_big_endian_p(VALUE self) {
|
384
|
+
# ifdef DYNAMIC_ENDIAN
|
385
|
+
/* for universal binary of NEXTSTEP and MacOS X */
|
386
|
+
int init = 1;
|
387
|
+
char* p = (char*)&init;
|
388
|
+
return p[0] ? Qfalse : Qtrue;
|
389
|
+
# elif defined(WORDS_BIGENDIAN)
|
390
|
+
return Qtrue;
|
391
|
+
#else
|
392
|
+
return Qfalse;
|
393
|
+
#endif
|
394
|
+
}
|
395
|
+
|
396
|
+
#define GCC_VERSION_SINCE(major, minor, patchlevel) \
|
397
|
+
(defined(__GNUC__) && !defined(__INTEL_COMPILER) && \
|
398
|
+
((__GNUC__ > (major)) || \
|
399
|
+
(__GNUC__ == (major) && __GNUC_MINOR__ > (minor)) || \
|
400
|
+
(__GNUC__ == (major) && __GNUC_MINOR__ == (minor) && __GNUC_PATCHLEVEL__ >= (patchlevel))))
|
401
|
+
|
402
|
+
#if GCC_VERSION_SINCE(4,3,0) || defined(__clang__)
|
403
|
+
# define swap32(x) __builtin_bswap32(x)
|
404
|
+
# define swap64(x) __builtin_bswap64(x)
|
405
|
+
#endif
|
406
|
+
|
407
|
+
#ifndef swap16
|
408
|
+
# define swap16(x) ((uint16_t)((((x)&0xFF)<<8) | (((x)>>8)&0xFF)))
|
409
|
+
#endif
|
410
|
+
|
411
|
+
#ifndef swap32
|
412
|
+
# define swap32(x) ((uint32_t)((((x)&0xFF)<<24) \
|
413
|
+
|(((x)>>24)&0xFF) \
|
414
|
+
|(((x)&0x0000FF00)<<8) \
|
415
|
+
|(((x)&0x00FF0000)>>8) ))
|
416
|
+
#endif
|
417
|
+
|
418
|
+
#ifndef swap64
|
419
|
+
# ifdef HAVE_INT64_T
|
420
|
+
# define byte_in_64bit(n) ((uint64_t)0xff << (n))
|
421
|
+
# define swap64(x) ((uint64_t)((((x)&byte_in_64bit(0))<<56) \
|
422
|
+
|(((x)>>56)&0xFF) \
|
423
|
+
|(((x)&byte_in_64bit(8))<<40) \
|
424
|
+
|(((x)&byte_in_64bit(48))>>40) \
|
425
|
+
|(((x)&byte_in_64bit(16))<<24) \
|
426
|
+
|(((x)&byte_in_64bit(40))>>24) \
|
427
|
+
|(((x)&byte_in_64bit(24))<<8) \
|
428
|
+
|(((x)&byte_in_64bit(32))>>8)))
|
429
|
+
# endif
|
430
|
+
#endif
|
431
|
+
|
432
|
+
// NOTE can not use sizeof in preprocessor
|
433
|
+
#define INT64toNUM(x) (sizeof(long) == 8 ? LONG2NUM(x) : LL2NUM(x))
|
434
|
+
#define UINT64toNUM(x) (sizeof(long) == 8 ? ULONG2NUM(x) : ULL2NUM(x))
|
435
|
+
|
436
|
+
#define CAST(var, ty) *((ty*)(&(var)))
|
437
|
+
|
438
|
+
#include "bspec_exec.inc"
|
439
|
+
|
440
|
+
static VALUE zscan_scan_binary(VALUE self, VALUE spec) {
|
441
|
+
P;
|
442
|
+
long s_size = NUM2LONG(rb_iv_get(spec, "@s_size"));
|
443
|
+
if (p->bytepos + s_size > RSTRING_LEN(p->s)) {
|
444
|
+
return Qnil;
|
445
|
+
}
|
446
|
+
VALUE code = rb_iv_get(spec, "@code");
|
447
|
+
long a_size = RSTRING_LEN(code) / sizeof(void*);
|
448
|
+
volatile VALUE a = rb_ary_new2(a_size);
|
449
|
+
bspec_exec((void**)RSTRING_PTR(code), RSTRING_PTR(p->s) + p->bytepos, a);
|
450
|
+
zscan_bytepos_eq(self, LONG2NUM(p->bytepos + s_size));
|
451
|
+
return a;
|
452
|
+
}
|
453
|
+
|
280
454
|
void Init_zscan() {
|
281
455
|
VALUE zscan = rb_define_class("ZScan", rb_cObject);
|
282
456
|
rb_define_alloc_func(zscan, zscan_alloc);
|
@@ -295,5 +469,23 @@ void Init_zscan() {
|
|
295
469
|
rb_define_method(zscan, "drop", zscan_drop, 0);
|
296
470
|
rb_define_method(zscan, "restore", zscan_restore, 0);
|
297
471
|
rb_define_method(zscan, "clear_pos_stack", zscan_clear_pos_stack, 0);
|
472
|
+
|
298
473
|
rb_define_method(zscan, "try", zscan_try, 0);
|
474
|
+
rb_define_method(zscan, "zero_or_one", zscan_zero_or_one, -1);
|
475
|
+
rb_define_method(zscan, "zero_or_more", zscan_zero_or_more, -1);
|
476
|
+
rb_define_method(zscan, "one_or_more", zscan_one_or_more, -1);
|
477
|
+
|
478
|
+
rb_define_method(zscan, "scan_float", zscan_scan_float, 0);
|
479
|
+
rb_define_method(zscan, "scan_binary", zscan_scan_binary, 1);
|
480
|
+
|
481
|
+
VALUE bs = rb_define_class_under(zscan, "BinarySpec", rb_cObject);
|
482
|
+
rb_define_singleton_method(bs, "big_endian?", bspec_big_endian_p, 0);
|
483
|
+
|
484
|
+
# include "bspec_opcode_names.inc"
|
485
|
+
void** opcodes = (void**)bspec_exec(NULL, NULL, Qnil);
|
486
|
+
for (long i = 0; i < bspec_opcode_size; i++) {
|
487
|
+
VALUE bytecode = rb_str_new((char*)&opcodes[i], sizeof(void*));
|
488
|
+
OBJ_FREEZE(bytecode);
|
489
|
+
rb_define_const(bs, bspec_opcode_names[i], bytecode);
|
490
|
+
}
|
299
491
|
}
|
data/lib/zscan.rb
CHANGED
@@ -1,10 +1,17 @@
|
|
1
1
|
require_relative "../ext/zscan"
|
2
|
+
require_relative "zscan/instructions"
|
3
|
+
require "date"
|
2
4
|
|
3
5
|
class ZScan
|
4
|
-
VERSION = '1.
|
6
|
+
VERSION = '1.1'
|
5
7
|
|
6
8
|
def initialize s, dup=false
|
7
|
-
|
9
|
+
if s.encoding.ascii_compatible?
|
10
|
+
s = dup ? s.dup : s
|
11
|
+
else
|
12
|
+
s = s.encode 'utf-8'
|
13
|
+
end
|
14
|
+
_internal_init s
|
8
15
|
end
|
9
16
|
|
10
17
|
def string
|
@@ -17,6 +24,76 @@ class ZScan
|
|
17
24
|
end
|
18
25
|
end
|
19
26
|
|
27
|
+
def scan_int radix=nil
|
28
|
+
negative = false
|
29
|
+
r = try do
|
30
|
+
negative = (scan(/[+\-]/) == '-')
|
31
|
+
if radix.nil?
|
32
|
+
radix =
|
33
|
+
if scan(/0b/i)
|
34
|
+
2
|
35
|
+
elsif scan(/0x/i)
|
36
|
+
16
|
37
|
+
elsif scan('0')
|
38
|
+
8
|
39
|
+
else
|
40
|
+
10
|
41
|
+
end
|
42
|
+
end
|
43
|
+
scan \
|
44
|
+
case radix
|
45
|
+
when 2; /[01]+/
|
46
|
+
when 8; /[0-7]+/
|
47
|
+
when 10; /\d+/
|
48
|
+
when 16; /\h+/i
|
49
|
+
else
|
50
|
+
if radix < 10
|
51
|
+
/[0-#{radix}]+/
|
52
|
+
elsif radix > 36
|
53
|
+
raise ArgumentError, "invalid radix #{radix}"
|
54
|
+
else
|
55
|
+
end_char = ('a'.ord + (radix - 11)).chr
|
56
|
+
/[\da-#{end_char}]+/i
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
if r
|
61
|
+
r = r.to_i radix
|
62
|
+
negative ? -r : r
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
def scan_date format, start=Date::ITALY
|
67
|
+
s = rest
|
68
|
+
d = DateTime._strptime s, format
|
69
|
+
if d
|
70
|
+
# XXX need 2 parses because the handling is very complex ...
|
71
|
+
dt = DateTime.strptime s, format, start rescue return nil
|
72
|
+
|
73
|
+
len = s.bytesize
|
74
|
+
if leftover = d[:leftover]
|
75
|
+
len -= leftover.bytesize
|
76
|
+
end
|
77
|
+
self.bytepos += len
|
78
|
+
|
79
|
+
dt
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
def unpack format
|
84
|
+
if format.index('@')
|
85
|
+
raise ArgumentError, 'position instruction @ not supported'
|
86
|
+
end
|
87
|
+
r = rest.unpack format
|
88
|
+
if r.index(nil)
|
89
|
+
return
|
90
|
+
end
|
91
|
+
# XXX pack to get parsed length because no related API is exposed ...
|
92
|
+
len = r.pack(format).bytesize
|
93
|
+
self.bytepos += len
|
94
|
+
r
|
95
|
+
end
|
96
|
+
|
20
97
|
def pos= new_pos
|
21
98
|
advance new_pos - pos
|
22
99
|
end
|
@@ -57,6 +134,26 @@ class ZScan
|
|
57
134
|
_internal_string.bytesize
|
58
135
|
end
|
59
136
|
|
137
|
+
def line_index
|
138
|
+
_internal_string.byteslice(0, bytepos).count "\n"
|
139
|
+
end
|
140
|
+
|
141
|
+
def self.binary_spec &p
|
142
|
+
bs = BinarySpec.new
|
143
|
+
bs.instance_eval &p
|
144
|
+
bs.instance_variable_get(:@code) << BinarySpec::RET
|
145
|
+
bs
|
146
|
+
end
|
147
|
+
|
148
|
+
class BinarySpec
|
149
|
+
BLANK = ''.force_encoding 'binary'
|
150
|
+
|
151
|
+
def initialize
|
152
|
+
@code = BLANK.dup
|
153
|
+
@s_size = 0
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
60
157
|
private :_internal_init, :_internal_string
|
61
158
|
end
|
62
159
|
|
@@ -0,0 +1,165 @@
|
|
1
|
+
# GENERATED WITH: rake gen
|
2
|
+
class ZScan::BinarySpec
|
3
|
+
def int8 n=1
|
4
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
5
|
+
n.times do
|
6
|
+
@code << INT8
|
7
|
+
@s_size += 1
|
8
|
+
end
|
9
|
+
end
|
10
|
+
def int16 n=1
|
11
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
12
|
+
n.times do
|
13
|
+
@code << INT16
|
14
|
+
@s_size += 2
|
15
|
+
end
|
16
|
+
end
|
17
|
+
def int16_swap n=1
|
18
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
19
|
+
n.times do
|
20
|
+
@code << INT16_SWAP
|
21
|
+
@s_size += 2
|
22
|
+
end
|
23
|
+
end
|
24
|
+
def int32 n=1
|
25
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
26
|
+
n.times do
|
27
|
+
@code << INT32
|
28
|
+
@s_size += 4
|
29
|
+
end
|
30
|
+
end
|
31
|
+
def int32_swap n=1
|
32
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
33
|
+
n.times do
|
34
|
+
@code << INT32_SWAP
|
35
|
+
@s_size += 4
|
36
|
+
end
|
37
|
+
end
|
38
|
+
def int64 n=1
|
39
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
40
|
+
n.times do
|
41
|
+
@code << INT64
|
42
|
+
@s_size += 8
|
43
|
+
end
|
44
|
+
end
|
45
|
+
def int64_swap n=1
|
46
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
47
|
+
n.times do
|
48
|
+
@code << INT64_SWAP
|
49
|
+
@s_size += 8
|
50
|
+
end
|
51
|
+
end
|
52
|
+
def uint8 n=1
|
53
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
54
|
+
n.times do
|
55
|
+
@code << UINT8
|
56
|
+
@s_size += 1
|
57
|
+
end
|
58
|
+
end
|
59
|
+
def uint16 n=1
|
60
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
61
|
+
n.times do
|
62
|
+
@code << UINT16
|
63
|
+
@s_size += 2
|
64
|
+
end
|
65
|
+
end
|
66
|
+
def uint16_swap n=1
|
67
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
68
|
+
n.times do
|
69
|
+
@code << UINT16_SWAP
|
70
|
+
@s_size += 2
|
71
|
+
end
|
72
|
+
end
|
73
|
+
def uint32 n=1
|
74
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
75
|
+
n.times do
|
76
|
+
@code << UINT32
|
77
|
+
@s_size += 4
|
78
|
+
end
|
79
|
+
end
|
80
|
+
def uint32_swap n=1
|
81
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
82
|
+
n.times do
|
83
|
+
@code << UINT32_SWAP
|
84
|
+
@s_size += 4
|
85
|
+
end
|
86
|
+
end
|
87
|
+
def uint64 n=1
|
88
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
89
|
+
n.times do
|
90
|
+
@code << UINT64
|
91
|
+
@s_size += 8
|
92
|
+
end
|
93
|
+
end
|
94
|
+
def uint64_swap n=1
|
95
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
96
|
+
n.times do
|
97
|
+
@code << UINT64_SWAP
|
98
|
+
@s_size += 8
|
99
|
+
end
|
100
|
+
end
|
101
|
+
def single n=1
|
102
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
103
|
+
n.times do
|
104
|
+
@code << SINGLE
|
105
|
+
@s_size += 4
|
106
|
+
end
|
107
|
+
end
|
108
|
+
def single_swap n=1
|
109
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
110
|
+
n.times do
|
111
|
+
@code << SINGLE_SWAP
|
112
|
+
@s_size += 4
|
113
|
+
end
|
114
|
+
end
|
115
|
+
def double n=1
|
116
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
117
|
+
n.times do
|
118
|
+
@code << DOUBLE
|
119
|
+
@s_size += 8
|
120
|
+
end
|
121
|
+
end
|
122
|
+
def double_swap n=1
|
123
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
124
|
+
n.times do
|
125
|
+
@code << DOUBLE_SWAP
|
126
|
+
@s_size += 8
|
127
|
+
end
|
128
|
+
end
|
129
|
+
if ZScan::BinarySpec.big_endian?
|
130
|
+
alias int16_be int16
|
131
|
+
alias int16_le int16_swap
|
132
|
+
alias int32_be int32
|
133
|
+
alias int32_le int32_swap
|
134
|
+
alias int64_be int64
|
135
|
+
alias int64_le int64_swap
|
136
|
+
alias uint16_be uint16
|
137
|
+
alias uint16_le uint16_swap
|
138
|
+
alias uint32_be uint32
|
139
|
+
alias uint32_le uint32_swap
|
140
|
+
alias uint64_be uint64
|
141
|
+
alias uint64_le uint64_swap
|
142
|
+
alias single_be single
|
143
|
+
alias single_le single_swap
|
144
|
+
alias double_be double
|
145
|
+
alias double_le double_swap
|
146
|
+
else
|
147
|
+
alias int16_le int16
|
148
|
+
alias int16_be int16_swap
|
149
|
+
alias int32_le int32
|
150
|
+
alias int32_be int32_swap
|
151
|
+
alias int64_le int64
|
152
|
+
alias int64_be int64_swap
|
153
|
+
alias uint16_le uint16
|
154
|
+
alias uint16_be uint16_swap
|
155
|
+
alias uint32_le uint32
|
156
|
+
alias uint32_be uint32_swap
|
157
|
+
alias uint64_le uint64
|
158
|
+
alias uint64_be uint64_swap
|
159
|
+
alias single_le single
|
160
|
+
alias single_be single_swap
|
161
|
+
alias double_le double
|
162
|
+
alias double_be double_swap
|
163
|
+
end
|
164
|
+
undef int16_swap, int32_swap, int64_swap, uint16_swap, uint32_swap, uint64_swap, single_swap, double_swap
|
165
|
+
end
|
data/rakefile
CHANGED
@@ -4,6 +4,60 @@ version = `command grep 'VERSION =' lib/zscan.rb`[version_re]
|
|
4
4
|
gem_files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,c}}')
|
5
5
|
gem_package = "zscan-#{version}.gem"
|
6
6
|
|
7
|
+
bspec_types = %w[INT8 INT16 INT32 INT64 UINT8 UINT16 UINT32 UINT64 SINGLE DOUBLE]
|
8
|
+
bspec_insns = bspec_types.flat_map{|ty|
|
9
|
+
if ty =~ /INT8/
|
10
|
+
ty
|
11
|
+
else
|
12
|
+
[ty, "#{ty}_SWAP"]
|
13
|
+
end
|
14
|
+
}
|
15
|
+
def bspec_incr ins
|
16
|
+
case ins
|
17
|
+
when /INT(\d+)/; $1.to_i / 8
|
18
|
+
when /SINGLE/; 4
|
19
|
+
when /DOUBLE/; 8
|
20
|
+
else; raise 'bad'
|
21
|
+
end
|
22
|
+
end
|
23
|
+
def bspec_c_type ins
|
24
|
+
case ins
|
25
|
+
when /(U?INT\d+)/; "#{$1.downcase}_t"
|
26
|
+
when /SINGLE/; 'float'
|
27
|
+
when /DOUBLE/; 'double'
|
28
|
+
else; raise 'bad'
|
29
|
+
end
|
30
|
+
end
|
31
|
+
def bspec_extract ins
|
32
|
+
type = bspec_c_type ins
|
33
|
+
len = bspec_incr(ins) * 8
|
34
|
+
r = "((uint#{len}_t*)s)[0]"
|
35
|
+
if ins.end_with?('SWAP')
|
36
|
+
r = "swap#{len}(#{r})"
|
37
|
+
end
|
38
|
+
"uint#{len}_t r = #{r}"
|
39
|
+
end
|
40
|
+
def bspec_convert ins
|
41
|
+
case ins
|
42
|
+
when /(U)?INT64|UINT32/
|
43
|
+
if ins.start_with?('U')
|
44
|
+
"UINT64toNUM(r)"
|
45
|
+
else
|
46
|
+
"INT64toNUM(CAST(r, int64_t))"
|
47
|
+
end
|
48
|
+
when /INT32/
|
49
|
+
"INT2NUM(CAST(r, int32_t))"
|
50
|
+
when /INT(16|8)/
|
51
|
+
"INT2FIX(CAST(r, #{bspec_c_type ins}))"
|
52
|
+
when /SINGLE/
|
53
|
+
"DBL2NUM((double)CAST(r, float))"
|
54
|
+
when /DOUBLE/
|
55
|
+
"DBL2NUM(CAST(r, double))"
|
56
|
+
else
|
57
|
+
raise 'bad'
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
7
61
|
desc "build and test"
|
8
62
|
task :default => [:test, gem_package]
|
9
63
|
|
@@ -30,3 +84,86 @@ file gem_package => gem_files do
|
|
30
84
|
end
|
31
85
|
sh "gem build zscan.gemspec"
|
32
86
|
end
|
87
|
+
|
88
|
+
desc "generate files"
|
89
|
+
task :gen => %w[ext/bspec_exec.inc ext/bspec_opcode_names.inc lib/zscan/instructions.rb]
|
90
|
+
|
91
|
+
file 'ext/bspec_exec.inc' => __FILE__ do
|
92
|
+
puts "generating ext/bspec_exec.inc"
|
93
|
+
opcode_list = bspec_insns.map do |ins|
|
94
|
+
"&&BS_#{ins}"
|
95
|
+
end.join ', '
|
96
|
+
|
97
|
+
opcode_segs = bspec_insns.map do |ins|
|
98
|
+
%Q{BS_#{ins}:
|
99
|
+
{
|
100
|
+
#{bspec_extract ins};
|
101
|
+
rb_ary_push(a, #{bspec_convert ins});
|
102
|
+
s += #{bspec_incr ins};
|
103
|
+
goto **(ip++);
|
104
|
+
}
|
105
|
+
}
|
106
|
+
end.join "\n"
|
107
|
+
|
108
|
+
File.open 'ext/bspec_exec.inc', 'w' do |f|
|
109
|
+
f.puts %Q|// GENERATED WITH: rake gen
|
110
|
+
#line 2 "ext/bspec_exec.inc"
|
111
|
+
__attribute__((__noinline__))
|
112
|
+
static VALUE bspec_exec(void** ip, char* s, VALUE a) {
|
113
|
+
static void* opcodes[] = { &&BS_RET, #{opcode_list} };
|
114
|
+
if (ip == NULL) {
|
115
|
+
return (VALUE)opcodes;
|
116
|
+
}
|
117
|
+
goto **(ip++);
|
118
|
+
BS_RET:
|
119
|
+
return a;
|
120
|
+
#{opcode_segs}
|
121
|
+
}|
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
file 'ext/bspec_opcode_names.inc' => __FILE__ do
|
126
|
+
puts 'generating ext/bspec_opcode_names.inc'
|
127
|
+
opcode_names = bspec_insns.map(&:inspect).join ', '
|
128
|
+
File.open 'ext/bspec_opcode_names.inc', 'w' do |f|
|
129
|
+
f.puts "// GENERATED WITH: rake gen"
|
130
|
+
f.puts %Q|const char* bspec_opcode_names[] = {"RET", #{opcode_names}};|
|
131
|
+
f.puts %Q|long bspec_opcode_size = #{bspec_insns.size + 1};|
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
file 'lib/zscan/instructions.rb' => __FILE__ do
|
136
|
+
puts 'generating lib/zscan/instructions.rb'
|
137
|
+
File.open 'lib/zscan/instructions.rb', 'w' do |f|
|
138
|
+
f.puts "# GENERATED WITH: rake gen"
|
139
|
+
f.puts "class ZScan::BinarySpec"
|
140
|
+
|
141
|
+
bspec_insns.each do |ins|
|
142
|
+
f.puts <<-RUBY
|
143
|
+
def #{ins.downcase} n=1
|
144
|
+
raise ArgumentError, "repeat count should be >= 1, but got \#{n}" if n < 1
|
145
|
+
n.times do
|
146
|
+
@code << #{ins}
|
147
|
+
@s_size += #{bspec_incr ins}
|
148
|
+
end
|
149
|
+
end
|
150
|
+
RUBY
|
151
|
+
end
|
152
|
+
|
153
|
+
alias_ins = (bspec_types - ['INT8', 'UINT8']).map &:downcase
|
154
|
+
f.puts " if ZScan::BinarySpec.big_endian?"
|
155
|
+
alias_ins.each do |ins|
|
156
|
+
f.puts " alias #{ins}_be #{ins}"
|
157
|
+
f.puts " alias #{ins}_le #{ins}_swap"
|
158
|
+
end
|
159
|
+
f.puts " else"
|
160
|
+
alias_ins.each do |ins|
|
161
|
+
f.puts " alias #{ins}_le #{ins}"
|
162
|
+
f.puts " alias #{ins}_be #{ins}_swap"
|
163
|
+
end
|
164
|
+
f.puts " end"
|
165
|
+
swap_ins = alias_ins.map{|ins| "#{ins}_swap"}
|
166
|
+
f.puts " undef #{swap_ins.join ', '}"
|
167
|
+
f.puts "end"
|
168
|
+
end
|
169
|
+
end
|
data/readme.md
CHANGED
@@ -4,6 +4,7 @@
|
|
4
4
|
- `ZScan#pos` is the codepoint position, and `ZScan#bytepos` is byte position.
|
5
5
|
- Correctly scans anchors and look behind predicates.
|
6
6
|
- Pos stack manipulation.
|
7
|
+
- Typed scanning methods: `#scan_float`, `#scan_int radix=nil`, `#scan_date format`, `#scan_binary binary_spec`.
|
7
8
|
|
8
9
|
## Install
|
9
10
|
|
@@ -22,7 +23,7 @@ z.scan /\w+/ #=> 'world'
|
|
22
23
|
z.eos? #=> true
|
23
24
|
```
|
24
25
|
|
25
|
-
## Motivation
|
26
|
+
## Motivation - `StringScanner`
|
26
27
|
|
27
28
|
Ruby's stdlib `StringScanner` treats the scanning position as the beginning of the string:
|
28
29
|
|
@@ -46,41 +47,112 @@ z.scan /^/ #=> nil
|
|
46
47
|
|
47
48
|
See also https://bugs.ruby-lang.org/issues/7092
|
48
49
|
|
50
|
+
## Other motivations - `scanf` / `strptime` / `unpack`
|
51
|
+
|
52
|
+
- For scan and convert, ruby's stdlib `Scanf` is slow (creates a regexp array every time it is called) and cannot cooperate with a scanner.
|
53
|
+
- For date parsing, `strptime` doesn't tell the parsed length.
|
54
|
+
- For binary parsing, `unpack` is a slow interpreter, and the instructions are quite irregular.
|
55
|
+
|
49
56
|
## Essential methods
|
50
57
|
|
51
58
|
- `ZScan.new string, dup=false`
|
52
59
|
- `#scan regexp_or_string`
|
53
60
|
- `#skip regexp_or_string`
|
54
61
|
- `#match_bytesize regexp_or_string` return length of matched bytes or `nil`.
|
62
|
+
- `#scan_float` scan a float number that does not start with a space. It deals with multibyte encodings for you.
|
63
|
+
- `#scan_int radix=nil` if radix is nil, decide base by prefix: `0x` is 16, `0` is 8, `0b` is 2, otherwise 10. `radix` should be in range `2..36`.
|
64
|
+
- `#scan_date format_string, start=Date::ITALY` scan a `DateTime` object, see also [strptime](http://rubydoc.info/stdlib/date/DateTime.strptime).
|
65
|
+
- `#scan_binary binary_spec` optimized and readable binary scan, see below for how to create a `ZScan::BinarySpec`.
|
66
|
+
- `#unpack format_string`
|
55
67
|
- `#eos?`
|
56
68
|
- `#string` note: returns a dup. Don't worry about the performance because it is a copy-on-write string.
|
57
69
|
- `#rest`
|
58
70
|
|
71
|
+
## String delegates
|
72
|
+
|
73
|
+
For convenience
|
74
|
+
|
75
|
+
- `#<< append_string`
|
76
|
+
- `#[]= range, replace_string` note: if `range` starts before pos, moves pos left, also clears the stack.
|
77
|
+
- `#size`
|
78
|
+
- `#bytesize`
|
79
|
+
|
80
|
+
## Parsing combinators
|
81
|
+
|
82
|
+
Combinators that manage scanner pos and stack state for you. In the combinators, if the returned value of the given block is `nil` or `false`, stops iteration. Can be nested, useful for building parsers.
|
83
|
+
|
84
|
+
- `#try &block` returns `block`'s return.
|
85
|
+
- `#zero_or_one result=[], &block` try to execute 0 or 1 time, returns `result`.
|
86
|
+
- `#zero_or_more result=[], &block` try to execute 0 or more times, also stops iteration if the scanner does not advance, returns `result`.
|
87
|
+
- `#one_or_more result=[], &block` try to execute 1 or more times, also stops iteration if the scanner does not advance, returns `nil` or `result`.
|
88
|
+
|
59
89
|
## Pos management
|
60
90
|
|
61
91
|
- `#pos`
|
62
92
|
- `#pos= new_pos` note: complexity ~ `new_pos > pos ? new_pos - pos : new_pos`.
|
63
93
|
- `#bytepos`
|
64
94
|
- `#bytepos= new_bytepos` note: complexity ~ `abs(new_bytepos - bytepos)`.
|
95
|
+
- `#line_index` line index of current position, start from `0`.
|
65
96
|
- `#advance n` move forward `n` codepoints, if `n < 0`, move backward. Stops at beginning or end.
|
66
97
|
- `#reset` go to beginning.
|
67
98
|
- `#terminate` go to end of string.
|
68
99
|
|
69
|
-
## Efficient pos stack manipulation
|
100
|
+
## (Low level) Efficient pos stack manipulation
|
70
101
|
|
71
102
|
- `#push` push current pos into the stack.
|
72
103
|
- `#pop` set current pos to top of the stack, and pop it.
|
73
104
|
- `#drop` drop top of pos stack without changing current pos.
|
74
105
|
- `#restore` set current pos to top of the stack.
|
75
106
|
- `#clear_pos_stack` clear pos stack.
|
76
|
-
- `#try` try to do several scans in the given block, fall back to init pos if block returns `nil` or `false`. Returns block's return, can be nested.
|
77
107
|
|
78
|
-
##
|
108
|
+
## `ZScan::BinarySpec`
|
79
109
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
110
|
+
Specify a sequence of binary data. Designed for binary protocol parsing. Example:
|
111
|
+
|
112
|
+
```ruby
|
113
|
+
# create a ZScan::BinarySpec
|
114
|
+
s = ZScan.binary_spec do
|
115
|
+
int8 # once
|
116
|
+
uint32_le 2 # little endian, twice
|
117
|
+
double_be 1 # big endian, once
|
118
|
+
end
|
119
|
+
z = ZScan.new [-1, 2, 3, 4.0].pack('cI<2G') + "rest"
|
120
|
+
z.scan_binary s #=> [-1, 2, 3, 4.0]
|
121
|
+
z.rest #=> 'rest'
|
122
|
+
```
|
123
|
+
|
124
|
+
Integer instructions:
|
125
|
+
|
126
|
+
```ruby
|
127
|
+
int8 uint8
|
128
|
+
int16 uint16 int16_le uint16_le int16_be uint16_be
|
129
|
+
int32 uint32 int32_le uint32_le int32_be uint32_be
|
130
|
+
int64 uint64 int64_le uint64_le int64_be uint64_be
|
131
|
+
```
|
132
|
+
|
133
|
+
Single precision float instructions:
|
134
|
+
|
135
|
+
```ruby
|
136
|
+
single single_le single_be
|
137
|
+
```
|
138
|
+
|
139
|
+
Double precision float instructions:
|
140
|
+
|
141
|
+
```ruby
|
142
|
+
double double_le double_be
|
143
|
+
```
|
144
|
+
|
145
|
+
Endians:
|
146
|
+
|
147
|
+
- (without endian suffix) native endian
|
148
|
+
- `*_le` little endian (VAX, x86, Windows string code unit)
|
149
|
+
- `*_be` big endian, network endian (SPARC, Java string code unit)
|
150
|
+
|
151
|
+
Repeat count must be integer `>= 1`, default is `1`.
|
152
|
+
|
153
|
+
It is implemented as a direct-threaded bytecode interpreter. Performance vs `String#unpack`:
|
154
|
+
|
155
|
+
todo
|
84
156
|
|
85
157
|
## License
|
86
158
|
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require_relative "spec_helper"
|
2
|
+
|
3
|
+
describe 'ZScan binary scanning methods' do
|
4
|
+
it "#unpack" do
|
5
|
+
z = ZScan.new "\x01\x02\x03"
|
6
|
+
assert_raise ArgumentError do
|
7
|
+
z.unpack '@1C'
|
8
|
+
end
|
9
|
+
assert_equal [1, 2], (z.unpack 'CC')
|
10
|
+
assert_equal 2, z.pos
|
11
|
+
assert_equal nil, (z.unpack 'I')
|
12
|
+
assert_equal 2, z.pos
|
13
|
+
end
|
14
|
+
|
15
|
+
it "#scan_binary" do
|
16
|
+
s = ZScan.binary_spec do
|
17
|
+
int8 # once
|
18
|
+
uint32_le 2 # little endian, twice
|
19
|
+
double_be 1 # big endian, once
|
20
|
+
single 1
|
21
|
+
end
|
22
|
+
a = [-1, 2, 3, 4.0, 3.0]
|
23
|
+
z = ZScan.new(a.pack('cI<2Gf') + 'rest')
|
24
|
+
b = z.scan_binary s
|
25
|
+
assert_equal 'rest', z.rest
|
26
|
+
assert_equal a, b
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require_relative "spec_helper"
|
2
|
+
|
3
|
+
describe 'ZScan combinators' do
|
4
|
+
it "#try restores pos" do
|
5
|
+
z = ZScan.new "hello"
|
6
|
+
return1 = z.try do
|
7
|
+
z.scan 'h'
|
8
|
+
z.scan 'e'
|
9
|
+
end
|
10
|
+
assert_equal 'e', return1
|
11
|
+
assert_equal 2, z.pos
|
12
|
+
|
13
|
+
return2 = z.try do
|
14
|
+
z.scan 'l'
|
15
|
+
z.scan 'l'
|
16
|
+
z.scan 'p' # fails
|
17
|
+
end
|
18
|
+
assert_equal nil, return2
|
19
|
+
assert_equal 2, z.pos
|
20
|
+
end
|
21
|
+
|
22
|
+
it "#zero_or_one" do
|
23
|
+
z = Zscan.new "aab"
|
24
|
+
assert_equal ['a'], z.zero_or_one{z.scan 'a'}
|
25
|
+
assert_equal 1, z.pos
|
26
|
+
|
27
|
+
z = Zscan.new 'aab'
|
28
|
+
assert_equal [], z.zero_or_one{z.scan 'b'}
|
29
|
+
assert_equal 0, z.pos
|
30
|
+
end
|
31
|
+
|
32
|
+
it "#zero_or_more" do
|
33
|
+
z = Zscan.new "aab"
|
34
|
+
assert_equal ['a', 'a'], z.zero_or_more{z.scan 'a'}
|
35
|
+
assert_equal 2, z.pos
|
36
|
+
|
37
|
+
assert_equal 'aab', z.zero_or_more('aa'){z.scan 'c'; z.scan 'b'}
|
38
|
+
|
39
|
+
z = Zscan.new 'aab'
|
40
|
+
assert_equal [], z.zero_or_more{z.scan 'b'}
|
41
|
+
assert_equal 0, z.pos
|
42
|
+
end
|
43
|
+
|
44
|
+
it "#one_or_more" do
|
45
|
+
z = Zscan.new 'aab'
|
46
|
+
assert_equal ['a', 'a'], z.one_or_more{z.scan 'a'}
|
47
|
+
assert_equal 2, z.pos
|
48
|
+
|
49
|
+
z = Zscan.new 'aab'
|
50
|
+
assert_equal nil, z.one_or_more([]){z.scan 'b'}
|
51
|
+
end
|
52
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
require_relative "spec_helper"
|
2
|
+
|
3
|
+
describe "typed scan" do
|
4
|
+
it "#scan_int" do
|
5
|
+
z = Zscan.new " 1 0b10F5 10 030"
|
6
|
+
assert_equal nil, z.scan_int
|
7
|
+
z.advance 1
|
8
|
+
assert_equal 1, z.scan_int(10)
|
9
|
+
|
10
|
+
z.advance 1
|
11
|
+
assert_equal 0b10, z.scan_int
|
12
|
+
assert_equal 0xF5, z.scan_int(16)
|
13
|
+
|
14
|
+
z.advance 1
|
15
|
+
assert_equal 12, z.scan_int(12)
|
16
|
+
|
17
|
+
z.advance 1
|
18
|
+
assert_equal 030, z.scan_int
|
19
|
+
end
|
20
|
+
|
21
|
+
it "#scan_float" do
|
22
|
+
z = Zscan.new " -3.5e23"
|
23
|
+
assert_equal nil, z.scan_float
|
24
|
+
z.advance 1
|
25
|
+
assert_equal -3.5e23, z.scan_float
|
26
|
+
end
|
27
|
+
|
28
|
+
it "won't overflow in #scan_float" do
|
29
|
+
s = '1.23E15'.byteslice 0, 4
|
30
|
+
z = Zscan.new s
|
31
|
+
assert_equal 1.23, z.scan_float
|
32
|
+
assert_equal 4, z.pos
|
33
|
+
end
|
34
|
+
|
35
|
+
it "#scan_date" do
|
36
|
+
z = Zscan.new " 2001 04 6 04 05 06 +7 231rest"
|
37
|
+
assert_equal nil, z.scan_date('%Y %U %w %H %M %S %z %N')
|
38
|
+
z.advance 1
|
39
|
+
|
40
|
+
d = z.scan_date '%Y %U %w %H %M %S %z %N'
|
41
|
+
assert_equal 0.231, d.sec_fraction
|
42
|
+
assert_equal 'rest', z.rest
|
43
|
+
|
44
|
+
z.pos = 1
|
45
|
+
z.scan_date '%Y %U %w ahoy %H %M %S %z' # bad format
|
46
|
+
assert_equal 1, z.pos
|
47
|
+
end
|
48
|
+
end
|
data/spec/zscan_spec.rb
CHANGED
@@ -1,18 +1,10 @@
|
|
1
|
-
require_relative "
|
2
|
-
require 'rspec/autorun'
|
3
|
-
RSpec.configure do |config|
|
4
|
-
config.expect_with :stdlib
|
5
|
-
end
|
1
|
+
require_relative "spec_helper"
|
6
2
|
|
7
3
|
describe ZScan do
|
8
4
|
before :each do
|
9
5
|
@z = ZScan.new 'ab你好'
|
10
6
|
end
|
11
7
|
|
12
|
-
before :all do
|
13
|
-
GC.stress = true
|
14
|
-
end
|
15
|
-
|
16
8
|
it "random workflow" do
|
17
9
|
assert_equal 2, @z.match_bytesize('ab')
|
18
10
|
@z.pos = 4
|
@@ -76,21 +68,23 @@ describe ZScan do
|
|
76
68
|
assert_equal 3, @z.pos
|
77
69
|
end
|
78
70
|
|
79
|
-
it "#
|
80
|
-
z = ZScan.new
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
assert_equal
|
86
|
-
assert_equal 2, z.pos
|
71
|
+
it "#reset, #terminate and #line_index" do
|
72
|
+
z = ZScan.new ''
|
73
|
+
assert_equal 0, z.line_index
|
74
|
+
z.terminate
|
75
|
+
assert_equal 0, z.line_index
|
76
|
+
z.reset
|
77
|
+
assert_equal 0, z.line_index
|
87
78
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
assert_equal
|
94
|
-
|
79
|
+
z = ZScan.new "a\nb\nc"
|
80
|
+
assert_equal 0, z.line_index
|
81
|
+
z.terminate
|
82
|
+
assert_equal 2, z.line_index
|
83
|
+
z.reset
|
84
|
+
assert_equal 0, z.line_index
|
85
|
+
z.pos = 1
|
86
|
+
assert_equal 0, z.line_index
|
87
|
+
z.pos = 2
|
88
|
+
assert_equal 1, z.line_index
|
95
89
|
end
|
96
90
|
end
|
data/zscan.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = "zscan"
|
3
|
-
s.version = "1.
|
3
|
+
s.version = "1.1" # version mapped from zscan.rb, don't change here
|
4
4
|
s.author = "Zete Lui"
|
5
5
|
s.homepage = "https://github.com/luikore/zscan"
|
6
6
|
s.platform = Gem::Platform::RUBY
|
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.required_ruby_version = ">=1.9.2"
|
10
10
|
s.licenses = ['BSD']
|
11
11
|
|
12
|
-
s.files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,c}}')
|
12
|
+
s.files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,c,inc}}')
|
13
13
|
s.require_paths = ["lib"]
|
14
14
|
s.extensions = ["ext/extconf.rb"]
|
15
15
|
s.rubygems_version = '1.8.24'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zscan
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: '1.1'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Zete Lui
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-05-
|
11
|
+
date: 2013-05-14 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: improved string scanner, respects anchors and lookbehinds, supports codepoint
|
14
14
|
positioning
|
@@ -21,11 +21,19 @@ files:
|
|
21
21
|
- rakefile
|
22
22
|
- zscan.gemspec
|
23
23
|
- readme.md
|
24
|
-
-
|
24
|
+
- benchmark/vs-strscan.rb
|
25
|
+
- benchmark/vs-unpack.rb
|
25
26
|
- ext/extconf.rb
|
27
|
+
- lib/zscan/instructions.rb
|
26
28
|
- lib/zscan.rb
|
29
|
+
- spec/binary_scan_spec.rb
|
30
|
+
- spec/combinator_spec.rb
|
31
|
+
- spec/spec_helper.rb
|
32
|
+
- spec/typed_scan_spec.rb
|
27
33
|
- spec/zscan_spec.rb
|
28
34
|
- ext/zscan.c
|
35
|
+
- ext/bspec_exec.inc
|
36
|
+
- ext/bspec_opcode_names.inc
|
29
37
|
homepage: https://github.com/luikore/zscan
|
30
38
|
licenses:
|
31
39
|
- BSD
|