zscan 1.0.1 → 1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/{bench.rb → benchmark/vs-strscan.rb} +1 -1
- data/benchmark/vs-unpack.rb +21 -0
- data/ext/bspec_exec.inc +156 -0
- data/ext/bspec_opcode_names.inc +3 -0
- data/ext/zscan.c +210 -18
- data/lib/zscan.rb +99 -2
- data/lib/zscan/instructions.rb +165 -0
- data/rakefile +137 -0
- data/readme.md +80 -8
- data/spec/binary_scan_spec.rb +28 -0
- data/spec/combinator_spec.rb +52 -0
- data/spec/spec_helper.rb +8 -0
- data/spec/typed_scan_spec.rb +48 -0
- data/spec/zscan_spec.rb +18 -24
- data/zscan.gemspec +2 -2
- metadata +11 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 998b97db8e9341f3920caa27bf11558954a777ba
|
4
|
+
data.tar.gz: 033986f8e4a4086985bca23c84f8acfabd2f5e29
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a8c23d9f29b57e113a55e46fb024bce0337aa8ee5b8317744ed9ce9c13ddf35e4b1b7af9bbf25d67c6b14e6c30bb7de00c214692cb5fea05c2e0f4fa6116478b
|
7
|
+
data.tar.gz: 06ccbc8c793f873a630c4774b746ad8ca1cfa4c2895414871c0b435812bd902e3f75618a6134551b5ab3f35b2838e12c9072e0bb15ff8f6010a7ba2ca4e8a83a
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# Benchmark comparing ZScan#unpack, ZScan#scan_binary and String#unpack on the
# same packed record (one int8, two little-endian doubles, one big-endian single).
require_relative "../lib/zscan"
require "benchmark"

spec = ZScan.binary_spec do
  int8
  double_le 2
  single_be
end

arr = [1, 1.1, 1.2, 1.3]
str = arr.pack 'cE2g'
# The scanner class is ZScan; the misspelled `Zscan` constant raised NameError.
z = ZScan.new str.b

# Baseline: cost of the loop plus resetting the position, shared by every case below.
puts 'reference nop group'
puts Benchmark.measure{ 100000.times{ z.pos = 0 } }
puts 'ZScan#unpack'
puts Benchmark.measure{ 100000.times{ z.pos = 0; z.unpack 'cE2g' } }
puts 'ZScan#scan_binary'
puts Benchmark.measure{ 100000.times{ z.pos = 0; z.scan_binary spec } }
puts 'String#unpack'
puts Benchmark.measure{ 100000.times{ z.pos = 0; str.unpack 'cE2g' } }
|
data/ext/bspec_exec.inc
ADDED
@@ -0,0 +1,156 @@
|
|
1
|
+
// GENERATED WITH: rake gen
|
2
|
+
#line 2 "ext/bspec_exec.inc"
|
3
|
+
__attribute__((__noinline__))
|
4
|
+
static VALUE bspec_exec(void** ip, char* s, VALUE a) {
|
5
|
+
static void* opcodes[] = { &&BS_RET, &&BS_INT8, &&BS_INT16, &&BS_INT16_SWAP, &&BS_INT32, &&BS_INT32_SWAP, &&BS_INT64, &&BS_INT64_SWAP, &&BS_UINT8, &&BS_UINT16, &&BS_UINT16_SWAP, &&BS_UINT32, &&BS_UINT32_SWAP, &&BS_UINT64, &&BS_UINT64_SWAP, &&BS_SINGLE, &&BS_SINGLE_SWAP, &&BS_DOUBLE, &&BS_DOUBLE_SWAP };
|
6
|
+
if (ip == NULL) {
|
7
|
+
return (VALUE)opcodes;
|
8
|
+
}
|
9
|
+
goto **(ip++);
|
10
|
+
BS_RET:
|
11
|
+
return a;
|
12
|
+
BS_INT8:
|
13
|
+
{
|
14
|
+
uint8_t r = ((uint8_t*)s)[0];
|
15
|
+
rb_ary_push(a, INT2FIX(CAST(r, int8_t)));
|
16
|
+
s += 1;
|
17
|
+
goto **(ip++);
|
18
|
+
}
|
19
|
+
|
20
|
+
BS_INT16:
|
21
|
+
{
|
22
|
+
uint16_t r = ((uint16_t*)s)[0];
|
23
|
+
rb_ary_push(a, INT2FIX(CAST(r, int16_t)));
|
24
|
+
s += 2;
|
25
|
+
goto **(ip++);
|
26
|
+
}
|
27
|
+
|
28
|
+
BS_INT16_SWAP:
|
29
|
+
{
|
30
|
+
uint16_t r = swap16(((uint16_t*)s)[0]);
|
31
|
+
rb_ary_push(a, INT2FIX(CAST(r, int16_t)));
|
32
|
+
s += 2;
|
33
|
+
goto **(ip++);
|
34
|
+
}
|
35
|
+
|
36
|
+
BS_INT32:
|
37
|
+
{
|
38
|
+
uint32_t r = ((uint32_t*)s)[0];
|
39
|
+
rb_ary_push(a, INT2NUM(CAST(r, int32_t)));
|
40
|
+
s += 4;
|
41
|
+
goto **(ip++);
|
42
|
+
}
|
43
|
+
|
44
|
+
BS_INT32_SWAP:
|
45
|
+
{
|
46
|
+
uint32_t r = swap32(((uint32_t*)s)[0]);
|
47
|
+
rb_ary_push(a, INT2NUM(CAST(r, int32_t)));
|
48
|
+
s += 4;
|
49
|
+
goto **(ip++);
|
50
|
+
}
|
51
|
+
|
52
|
+
BS_INT64:
|
53
|
+
{
|
54
|
+
uint64_t r = ((uint64_t*)s)[0];
|
55
|
+
rb_ary_push(a, INT64toNUM(CAST(r, int64_t)));
|
56
|
+
s += 8;
|
57
|
+
goto **(ip++);
|
58
|
+
}
|
59
|
+
|
60
|
+
BS_INT64_SWAP:
|
61
|
+
{
|
62
|
+
uint64_t r = swap64(((uint64_t*)s)[0]);
|
63
|
+
rb_ary_push(a, INT64toNUM(CAST(r, int64_t)));
|
64
|
+
s += 8;
|
65
|
+
goto **(ip++);
|
66
|
+
}
|
67
|
+
|
68
|
+
BS_UINT8:
|
69
|
+
{
|
70
|
+
uint8_t r = ((uint8_t*)s)[0];
|
71
|
+
rb_ary_push(a, INT2FIX(CAST(r, uint8_t)));
|
72
|
+
s += 1;
|
73
|
+
goto **(ip++);
|
74
|
+
}
|
75
|
+
|
76
|
+
BS_UINT16:
|
77
|
+
{
|
78
|
+
uint16_t r = ((uint16_t*)s)[0];
|
79
|
+
rb_ary_push(a, INT2FIX(CAST(r, uint16_t)));
|
80
|
+
s += 2;
|
81
|
+
goto **(ip++);
|
82
|
+
}
|
83
|
+
|
84
|
+
BS_UINT16_SWAP:
|
85
|
+
{
|
86
|
+
uint16_t r = swap16(((uint16_t*)s)[0]);
|
87
|
+
rb_ary_push(a, INT2FIX(CAST(r, uint16_t)));
|
88
|
+
s += 2;
|
89
|
+
goto **(ip++);
|
90
|
+
}
|
91
|
+
|
92
|
+
BS_UINT32:
|
93
|
+
{
|
94
|
+
uint32_t r = ((uint32_t*)s)[0];
|
95
|
+
rb_ary_push(a, UINT64toNUM(r));
|
96
|
+
s += 4;
|
97
|
+
goto **(ip++);
|
98
|
+
}
|
99
|
+
|
100
|
+
BS_UINT32_SWAP:
|
101
|
+
{
|
102
|
+
uint32_t r = swap32(((uint32_t*)s)[0]);
|
103
|
+
rb_ary_push(a, UINT64toNUM(r));
|
104
|
+
s += 4;
|
105
|
+
goto **(ip++);
|
106
|
+
}
|
107
|
+
|
108
|
+
BS_UINT64:
|
109
|
+
{
|
110
|
+
uint64_t r = ((uint64_t*)s)[0];
|
111
|
+
rb_ary_push(a, UINT64toNUM(r));
|
112
|
+
s += 8;
|
113
|
+
goto **(ip++);
|
114
|
+
}
|
115
|
+
|
116
|
+
BS_UINT64_SWAP:
|
117
|
+
{
|
118
|
+
uint64_t r = swap64(((uint64_t*)s)[0]);
|
119
|
+
rb_ary_push(a, UINT64toNUM(r));
|
120
|
+
s += 8;
|
121
|
+
goto **(ip++);
|
122
|
+
}
|
123
|
+
|
124
|
+
BS_SINGLE:
|
125
|
+
{
|
126
|
+
uint32_t r = ((uint32_t*)s)[0];
|
127
|
+
rb_ary_push(a, DBL2NUM((double)CAST(r, float)));
|
128
|
+
s += 4;
|
129
|
+
goto **(ip++);
|
130
|
+
}
|
131
|
+
|
132
|
+
BS_SINGLE_SWAP:
|
133
|
+
{
|
134
|
+
uint32_t r = swap32(((uint32_t*)s)[0]);
|
135
|
+
rb_ary_push(a, DBL2NUM((double)CAST(r, float)));
|
136
|
+
s += 4;
|
137
|
+
goto **(ip++);
|
138
|
+
}
|
139
|
+
|
140
|
+
BS_DOUBLE:
|
141
|
+
{
|
142
|
+
uint64_t r = ((uint64_t*)s)[0];
|
143
|
+
rb_ary_push(a, DBL2NUM(CAST(r, double)));
|
144
|
+
s += 8;
|
145
|
+
goto **(ip++);
|
146
|
+
}
|
147
|
+
|
148
|
+
BS_DOUBLE_SWAP:
|
149
|
+
{
|
150
|
+
uint64_t r = swap64(((uint64_t*)s)[0]);
|
151
|
+
rb_ary_push(a, DBL2NUM(CAST(r, double)));
|
152
|
+
s += 8;
|
153
|
+
goto **(ip++);
|
154
|
+
}
|
155
|
+
|
156
|
+
}
|
@@ -0,0 +1,3 @@
|
|
1
|
+
// GENERATED WITH: rake gen
|
2
|
+
const char* bspec_opcode_names[] = {"RET", "INT8", "INT16", "INT16_SWAP", "INT32", "INT32_SWAP", "INT64", "INT64_SWAP", "UINT8", "UINT16", "UINT16_SWAP", "UINT32", "UINT32_SWAP", "UINT64", "UINT64_SWAP", "SINGLE", "SINGLE_SWAP", "DOUBLE", "DOUBLE_SWAP"};
|
3
|
+
long bspec_opcode_size = 19;
|
data/ext/zscan.c
CHANGED
@@ -1,23 +1,24 @@
|
|
1
1
|
#include <ruby/ruby.h>
|
2
2
|
#include <ruby/re.h>
|
3
3
|
#include <ruby/encoding.h>
|
4
|
+
#include <ctype.h>
|
5
|
+
|
6
|
+
// todo infect check
|
4
7
|
|
5
8
|
typedef struct {
|
6
|
-
|
7
|
-
|
9
|
+
long pos;
|
10
|
+
long bytepos;
|
8
11
|
} Pos;
|
9
12
|
|
10
13
|
typedef struct {
|
11
|
-
|
12
|
-
|
14
|
+
long pos;
|
15
|
+
long bytepos;
|
13
16
|
VALUE s;
|
14
|
-
|
15
|
-
|
17
|
+
long stack_i;
|
18
|
+
long stack_cap;
|
16
19
|
Pos* stack;
|
17
20
|
} ZScan;
|
18
21
|
|
19
|
-
#define P ZScan* p = rb_check_typeddata(self, &zscan_type)
|
20
|
-
|
21
22
|
static void zscan_mark(void* pp) {
|
22
23
|
ZScan* p = pp;
|
23
24
|
rb_gc_mark(p->s);
|
@@ -39,6 +40,8 @@ static const rb_data_type_t zscan_type = {
|
|
39
40
|
{zscan_mark, zscan_free, zscan_memsize}
|
40
41
|
};
|
41
42
|
|
43
|
+
#define P ZScan* p = rb_check_typeddata(self, &zscan_type)
|
44
|
+
|
42
45
|
static VALUE zscan_alloc(VALUE klass) {
|
43
46
|
ZScan* p = ALLOC(ZScan);
|
44
47
|
MEMZERO(p, ZScan, 1);
|
@@ -66,13 +69,12 @@ static VALUE zscan_pos(VALUE self) {
|
|
66
69
|
|
67
70
|
static VALUE zscan_advance(VALUE self, VALUE v_diff) {
|
68
71
|
P;
|
69
|
-
long
|
70
|
-
if (
|
72
|
+
long n = p->pos + NUM2LONG(v_diff);
|
73
|
+
if (n < 0) {
|
71
74
|
p->pos = 0;
|
72
75
|
p->bytepos = 0;
|
73
76
|
return self;
|
74
77
|
}
|
75
|
-
size_t n = signed_n;
|
76
78
|
|
77
79
|
// because there's no "reverse scan" API, we have a O(n) routine :(
|
78
80
|
if (n < p->pos) {
|
@@ -82,7 +84,7 @@ static VALUE zscan_advance(VALUE self, VALUE v_diff) {
|
|
82
84
|
|
83
85
|
if (n > p->pos) {
|
84
86
|
rb_encoding* enc = rb_enc_get(p->s);
|
85
|
-
|
87
|
+
long byteend = RSTRING_LEN(p->s);
|
86
88
|
char* ptr = RSTRING_PTR(p->s);
|
87
89
|
for (; p->pos < n && p->bytepos < byteend;) {
|
88
90
|
int n = rb_enc_mbclen(ptr + p->bytepos, ptr + byteend, enc);
|
@@ -105,7 +107,7 @@ static VALUE zscan_bytepos(VALUE self) {
|
|
105
107
|
static VALUE zscan_bytepos_eq(VALUE self, VALUE v_bytepos) {
|
106
108
|
P;
|
107
109
|
long signed_bytepos = NUM2LONG(v_bytepos);
|
108
|
-
|
110
|
+
long from, to, bytepos;
|
109
111
|
|
110
112
|
if (signed_bytepos > RSTRING_LEN(p->s)) {
|
111
113
|
bytepos = RSTRING_LEN(p->s);
|
@@ -127,7 +129,7 @@ static VALUE zscan_bytepos_eq(VALUE self, VALUE v_bytepos) {
|
|
127
129
|
|
128
130
|
rb_encoding* enc = rb_enc_get(p->s);
|
129
131
|
char* ptr = RSTRING_PTR(p->s);
|
130
|
-
|
132
|
+
long diff = 0;
|
131
133
|
for (; from < to;) {
|
132
134
|
int n = rb_enc_mbclen(ptr + from, ptr + to, enc);
|
133
135
|
if (n) {
|
@@ -154,7 +156,7 @@ static VALUE zscan_bytepos_eq(VALUE self, VALUE v_bytepos) {
|
|
154
156
|
|
155
157
|
static VALUE zscan_eos_p(VALUE self) {
|
156
158
|
P;
|
157
|
-
return (p->bytepos ==
|
159
|
+
return (p->bytepos == RSTRING_LEN(p->s) ? Qtrue : Qfalse);
|
158
160
|
}
|
159
161
|
|
160
162
|
regex_t *rb_reg_prepare_re(VALUE re, VALUE str);
|
@@ -262,10 +264,13 @@ static VALUE zscan_clear_pos_stack(VALUE self) {
|
|
262
264
|
return self;
|
263
265
|
}
|
264
266
|
|
265
|
-
|
266
|
-
if (!rb_block_given_p()) {
|
267
|
-
rb_raise(rb_eRuntimeError, "need a block")
|
267
|
+
#define REQUIRE_BLOCK \
|
268
|
+
if (!rb_block_given_p()) {\
|
269
|
+
rb_raise(rb_eRuntimeError, "need a block");\
|
268
270
|
}
|
271
|
+
|
272
|
+
static VALUE zscan_try(VALUE self) {
|
273
|
+
REQUIRE_BLOCK;
|
269
274
|
VALUE r;
|
270
275
|
zscan_push(self);
|
271
276
|
r = rb_yield(Qnil);
|
@@ -277,6 +282,175 @@ static VALUE zscan_try(VALUE self) {
|
|
277
282
|
return r;
|
278
283
|
}
|
279
284
|
|
285
|
+
// ZScan#zero_or_one(acc = nil) { ... }
// Requires a block. Pushes the current position, yields once, and:
//   - on a truthy result, appends it to acc with `<<` and drops the saved position;
//   - otherwise pops the saved position (presumably rewinding the scanner -- see zscan_pop).
// Returns acc; a fresh Array is created when acc is omitted or nil.
static VALUE zscan_zero_or_one(int argc, VALUE* argv, VALUE self) {
  REQUIRE_BLOCK;
  volatile VALUE acc = Qnil;
  volatile VALUE item;

  rb_scan_args(argc, argv, "01", &acc);
  if (acc == Qnil) {
    acc = rb_ary_new();
  }

  zscan_push(self);
  item = rb_yield(Qnil);
  if (!RTEST(item)) {
    // block did not match: discard whatever it consumed
    zscan_pop(self);
    return acc;
  }

  rb_funcall(acc, rb_intern("<<"), 1, item);
  zscan_drop(self);
  return acc;
}
|
303
|
+
|
304
|
+
// ZScan#zero_or_more(acc = nil) { ... }
// Requires a block. Repeatedly pushes the position and yields; each truthy result
// that also moved bytepos is appended to acc with `<<` and the saved position is
// dropped. The first falsy result, or a result that made no progress, pops the
// saved position and ends the loop. Returns acc (a fresh Array when omitted or nil).
static VALUE zscan_zero_or_more(int argc, VALUE* argv, VALUE self) {
  REQUIRE_BLOCK;
  volatile VALUE acc = Qnil;
  volatile VALUE item;
  long start;
  P;

  rb_scan_args(argc, argv, "01", &acc);
  if (acc == Qnil) {
    acc = rb_ary_new();
  }

  while (1) {
    zscan_push(self);
    start = p->bytepos;
    item = rb_yield(Qnil);
    // a match that consumed nothing would loop forever, so treat it as a stop
    if (!RTEST(item) || start == p->bytepos) {
      zscan_pop(self);
      break;
    }
    rb_funcall(acc, rb_intern("<<"), 1, item);
    zscan_drop(self);
  }
  return acc;
}
|
328
|
+
|
329
|
+
// ZScan#one_or_more(acc = nil) { ... }
// Requires a block. The block must succeed at least once:
//   - if the first call returns nil/false, the saved position is popped and nil is returned;
//   - otherwise the result is appended to acc with `<<` and the block is retried
//     until it returns nil/false or stops moving bytepos (same rule as zero_or_more).
// Returns acc (a fresh Array when omitted or nil) on success, nil on failure.
//
// The first attempt is now wrapped in push/pop like every later attempt, so a
// failing first call that consumed input no longer leaves the scanner advanced.
// Arguments are validated before the block runs.
static VALUE zscan_one_or_more(int argc, VALUE* argv, VALUE self) {
  REQUIRE_BLOCK;
  volatile VALUE a = Qnil;
  volatile VALUE r;
  long backpos;
  P;
  rb_scan_args(argc, argv, "01", &a);

  // first, mandatory match
  zscan_push(self);
  r = rb_yield(Qnil);
  if (!RTEST(r)) {
    zscan_pop(self);
    return Qnil;
  }
  zscan_drop(self);

  if (a == Qnil) {
    a = rb_ary_new();
  }
  rb_funcall(a, rb_intern("<<"), 1, r);

  // optional repetitions
  for (;;) {
    zscan_push(self);
    backpos = p->bytepos;
    r = rb_yield(Qnil);
    if (RTEST(r) && backpos != p->bytepos) {
      rb_funcall(a, rb_intern("<<"), 1, r);
      zscan_drop(self);
    } else {
      zscan_pop(self);
      break;
    }
  }
  return a;
}
|
361
|
+
|
362
|
+
// ZScan#scan_float
// Parses a floating point number at the current byte position with strtod().
// Returns the Float and advances past the consumed text, or returns nil (without
// moving) at end of string, when the next byte is whitespace, when nothing was
// parsed, or when strtod() read beyond the end of the string.
VALUE zscan_scan_float(VALUE self) {
  P;
  long remaining = RSTRING_LEN(p->s) - p->bytepos;
  if (remaining == 0) {
    return Qnil;
  }

  char* s = RSTRING_PTR(p->s) + p->bytepos;
  // strtod() silently skips leading whitespace, which a scanner must not do.
  // The cast is required: passing a negative char (any byte >= 0x80 where char
  // is signed) to a <ctype.h> function is undefined behavior.
  if (isspace((unsigned char)s[0])) {
    return Qnil;
  }

  char* e;
  double d = strtod(s, &e);
  if (e == s || e - s > remaining) {
    return Qnil;
  }

  // advance() counts codepoints, which equals the byte count here because the
  // text strtod() accepted is ASCII
  zscan_advance(self, LONG2NUM(e - s));
  return DBL2NUM(d);
}
|
382
|
+
|
383
|
+
static VALUE bspec_big_endian_p(VALUE self) {
|
384
|
+
# ifdef DYNAMIC_ENDIAN
|
385
|
+
/* for universal binary of NEXTSTEP and MacOS X */
|
386
|
+
int init = 1;
|
387
|
+
char* p = (char*)&init;
|
388
|
+
return p[0] ? Qfalse : Qtrue;
|
389
|
+
# elif defined(WORDS_BIGENDIAN)
|
390
|
+
return Qtrue;
|
391
|
+
#else
|
392
|
+
return Qfalse;
|
393
|
+
#endif
|
394
|
+
}
|
395
|
+
|
396
|
+
#define GCC_VERSION_SINCE(major, minor, patchlevel) \
|
397
|
+
(defined(__GNUC__) && !defined(__INTEL_COMPILER) && \
|
398
|
+
((__GNUC__ > (major)) || \
|
399
|
+
(__GNUC__ == (major) && __GNUC_MINOR__ > (minor)) || \
|
400
|
+
(__GNUC__ == (major) && __GNUC_MINOR__ == (minor) && __GNUC_PATCHLEVEL__ >= (patchlevel))))
|
401
|
+
|
402
|
+
#if GCC_VERSION_SINCE(4,3,0) || defined(__clang__)
|
403
|
+
# define swap32(x) __builtin_bswap32(x)
|
404
|
+
# define swap64(x) __builtin_bswap64(x)
|
405
|
+
#endif
|
406
|
+
|
407
|
+
#ifndef swap16
|
408
|
+
# define swap16(x) ((uint16_t)((((x)&0xFF)<<8) | (((x)>>8)&0xFF)))
|
409
|
+
#endif
|
410
|
+
|
411
|
+
#ifndef swap32
|
412
|
+
# define swap32(x) ((uint32_t)((((x)&0xFF)<<24) \
|
413
|
+
|(((x)>>24)&0xFF) \
|
414
|
+
|(((x)&0x0000FF00)<<8) \
|
415
|
+
|(((x)&0x00FF0000)>>8) ))
|
416
|
+
#endif
|
417
|
+
|
418
|
+
#ifndef swap64
|
419
|
+
# ifdef HAVE_INT64_T
|
420
|
+
# define byte_in_64bit(n) ((uint64_t)0xff << (n))
|
421
|
+
# define swap64(x) ((uint64_t)((((x)&byte_in_64bit(0))<<56) \
|
422
|
+
|(((x)>>56)&0xFF) \
|
423
|
+
|(((x)&byte_in_64bit(8))<<40) \
|
424
|
+
|(((x)&byte_in_64bit(48))>>40) \
|
425
|
+
|(((x)&byte_in_64bit(16))<<24) \
|
426
|
+
|(((x)&byte_in_64bit(40))>>24) \
|
427
|
+
|(((x)&byte_in_64bit(24))<<8) \
|
428
|
+
|(((x)&byte_in_64bit(32))>>8)))
|
429
|
+
# endif
|
430
|
+
#endif
|
431
|
+
|
432
|
+
// NOTE can not use sizeof in preprocessor
|
433
|
+
#define INT64toNUM(x) (sizeof(long) == 8 ? LONG2NUM(x) : LL2NUM(x))
|
434
|
+
#define UINT64toNUM(x) (sizeof(long) == 8 ? ULONG2NUM(x) : ULL2NUM(x))
|
435
|
+
|
436
|
+
#define CAST(var, ty) *((ty*)(&(var)))
|
437
|
+
|
438
|
+
#include "bspec_exec.inc"
|
439
|
+
|
440
|
+
// ZScan#scan_binary(spec)
// Executes the threaded bytecode stored in spec's @code against the bytes at the
// current position. @s_size is the number of input bytes the bytecode consumes.
// Returns an Array of decoded values and moves bytepos forward by @s_size, or
// returns nil (without moving) when fewer than @s_size bytes remain.
// Raises TypeError / ArgumentError when spec does not carry well-formed bytecode.
static VALUE zscan_scan_binary(VALUE self, VALUE spec) {
  P;
  long s_size = NUM2LONG(rb_iv_get(spec, "@s_size"));
  if (s_size < 0) {
    rb_raise(rb_eArgError, "negative binary spec size");
  }
  // compare against the remaining length so that bytepos + s_size cannot overflow
  if (s_size > RSTRING_LEN(p->s) - p->bytepos) {
    return Qnil;
  }

  VALUE code = rb_iv_get(spec, "@code");
  // an object without @code yields nil here; RSTRING_* on a non-String crashes
  Check_Type(code, T_STRING);
  long code_len = RSTRING_LEN(code);
  long n_insns = code_len / (long)sizeof(void*);
  if (n_insns < 1 || code_len % (long)sizeof(void*) != 0) {
    rb_raise(rb_eArgError, "malformed binary spec");
  }

  // bspec_exec only stops at RET (opcodes[0]); refuse bytecode that does not end
  // with it, otherwise the interpreter would jump through memory past the string
  void** ip = (void**)RSTRING_PTR(code);
  void** opcodes = (void**)bspec_exec(NULL, NULL, Qnil);
  if (ip[n_insns - 1] != opcodes[0]) {
    rb_raise(rb_eArgError, "binary spec is not terminated, build it with ZScan.binary_spec");
  }

  // every instruction except the trailing RET pushes one value
  volatile VALUE a = rb_ary_new2(n_insns - 1);
  bspec_exec(ip, RSTRING_PTR(p->s) + p->bytepos, a);
  zscan_bytepos_eq(self, LONG2NUM(p->bytepos + s_size));
  return a;
}
|
453
|
+
|
280
454
|
void Init_zscan() {
|
281
455
|
VALUE zscan = rb_define_class("ZScan", rb_cObject);
|
282
456
|
rb_define_alloc_func(zscan, zscan_alloc);
|
@@ -295,5 +469,23 @@ void Init_zscan() {
|
|
295
469
|
rb_define_method(zscan, "drop", zscan_drop, 0);
|
296
470
|
rb_define_method(zscan, "restore", zscan_restore, 0);
|
297
471
|
rb_define_method(zscan, "clear_pos_stack", zscan_clear_pos_stack, 0);
|
472
|
+
|
298
473
|
rb_define_method(zscan, "try", zscan_try, 0);
|
474
|
+
rb_define_method(zscan, "zero_or_one", zscan_zero_or_one, -1);
|
475
|
+
rb_define_method(zscan, "zero_or_more", zscan_zero_or_more, -1);
|
476
|
+
rb_define_method(zscan, "one_or_more", zscan_one_or_more, -1);
|
477
|
+
|
478
|
+
rb_define_method(zscan, "scan_float", zscan_scan_float, 0);
|
479
|
+
rb_define_method(zscan, "scan_binary", zscan_scan_binary, 1);
|
480
|
+
|
481
|
+
VALUE bs = rb_define_class_under(zscan, "BinarySpec", rb_cObject);
|
482
|
+
rb_define_singleton_method(bs, "big_endian?", bspec_big_endian_p, 0);
|
483
|
+
|
484
|
+
# include "bspec_opcode_names.inc"
|
485
|
+
void** opcodes = (void**)bspec_exec(NULL, NULL, Qnil);
|
486
|
+
for (long i = 0; i < bspec_opcode_size; i++) {
|
487
|
+
VALUE bytecode = rb_str_new((char*)&opcodes[i], sizeof(void*));
|
488
|
+
OBJ_FREEZE(bytecode);
|
489
|
+
rb_define_const(bs, bspec_opcode_names[i], bytecode);
|
490
|
+
}
|
299
491
|
}
|
data/lib/zscan.rb
CHANGED
@@ -1,10 +1,17 @@
|
|
1
1
|
require_relative "../ext/zscan"
|
2
|
+
require_relative "zscan/instructions"
|
3
|
+
require "date"
|
2
4
|
|
3
5
|
class ZScan
|
4
|
-
VERSION = '1.
|
6
|
+
VERSION = '1.1'
|
5
7
|
|
6
8
|
def initialize s, dup=false
|
7
|
-
|
9
|
+
if s.encoding.ascii_compatible?
|
10
|
+
s = dup ? s.dup : s
|
11
|
+
else
|
12
|
+
s = s.encode 'utf-8'
|
13
|
+
end
|
14
|
+
_internal_init s
|
8
15
|
end
|
9
16
|
|
10
17
|
def string
|
@@ -17,6 +24,76 @@ class ZScan
|
|
17
24
|
end
|
18
25
|
end
|
19
26
|
|
27
|
+
# Scans an optionally signed integer at the current position.
#
# radix - nil to choose the base from the prefix (0b -> 2, 0x -> 16, 0 -> 8,
#         otherwise 10), or an Integer in 2..36.
#
# Returns the Integer, or nil when no digits valid for the radix follow.
# Raises ArgumentError when an explicit radix is outside 2..36.
def scan_int radix=nil
  # validate before touching the scanner so that a bad radix consumes nothing
  if radix and (radix < 2 or radix > 36)
    raise ArgumentError, "invalid radix #{radix}"
  end

  negative = false
  r = try do
    negative = (scan(/[+\-]/) == '-')
    if radix.nil?
      radix =
        if scan(/0b/i)
          2
        elsif scan(/0x/i)
          16
        elsif scan('0')
          8
        else
          10
        end
    end
    scan \
      case radix
      when 2; /[01]+/
      when 8; /[0-7]+/
      when 10; /\d+/
      when 16; /\h+/i
      else
        if radix < 10
          # the largest digit in base N is N-1
          /[0-#{radix - 1}]+/
        else
          end_char = ('a'.ord + (radix - 11)).chr
          /[\da-#{end_char}]+/i
        end
      end
  end
  if r
    r = r.to_i radix
    negative ? -r : r
  end
end
|
65
|
+
|
66
|
+
def scan_date format, start=Date::ITALY
|
67
|
+
s = rest
|
68
|
+
d = DateTime._strptime s, format
|
69
|
+
if d
|
70
|
+
# XXX need 2 parses because the handling is very complex ...
|
71
|
+
dt = DateTime.strptime s, format, start rescue return nil
|
72
|
+
|
73
|
+
len = s.bytesize
|
74
|
+
if leftover = d[:leftover]
|
75
|
+
len -= leftover.bytesize
|
76
|
+
end
|
77
|
+
self.bytepos += len
|
78
|
+
|
79
|
+
dt
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
def unpack format
|
84
|
+
if format.index('@')
|
85
|
+
raise ArgumentError, 'position instruction @ not supported'
|
86
|
+
end
|
87
|
+
r = rest.unpack format
|
88
|
+
if r.index(nil)
|
89
|
+
return
|
90
|
+
end
|
91
|
+
# XXX pack to get parsed length because no related API is exposed ...
|
92
|
+
len = r.pack(format).bytesize
|
93
|
+
self.bytepos += len
|
94
|
+
r
|
95
|
+
end
|
96
|
+
|
20
97
|
def pos= new_pos
|
21
98
|
advance new_pos - pos
|
22
99
|
end
|
@@ -57,6 +134,26 @@ class ZScan
|
|
57
134
|
_internal_string.bytesize
|
58
135
|
end
|
59
136
|
|
137
|
+
def line_index
|
138
|
+
_internal_string.byteslice(0, bytepos).count "\n"
|
139
|
+
end
|
140
|
+
|
141
|
+
def self.binary_spec &p
|
142
|
+
bs = BinarySpec.new
|
143
|
+
bs.instance_eval &p
|
144
|
+
bs.instance_variable_get(:@code) << BinarySpec::RET
|
145
|
+
bs
|
146
|
+
end
|
147
|
+
|
148
|
+
class BinarySpec
|
149
|
+
BLANK = ''.force_encoding 'binary'
|
150
|
+
|
151
|
+
def initialize
|
152
|
+
@code = BLANK.dup
|
153
|
+
@s_size = 0
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
60
157
|
private :_internal_init, :_internal_string
|
61
158
|
end
|
62
159
|
|
@@ -0,0 +1,165 @@
|
|
1
|
+
# GENERATED WITH: rake gen
|
2
|
+
class ZScan::BinarySpec
|
3
|
+
def int8 n=1
|
4
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
5
|
+
n.times do
|
6
|
+
@code << INT8
|
7
|
+
@s_size += 1
|
8
|
+
end
|
9
|
+
end
|
10
|
+
def int16 n=1
|
11
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
12
|
+
n.times do
|
13
|
+
@code << INT16
|
14
|
+
@s_size += 2
|
15
|
+
end
|
16
|
+
end
|
17
|
+
def int16_swap n=1
|
18
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
19
|
+
n.times do
|
20
|
+
@code << INT16_SWAP
|
21
|
+
@s_size += 2
|
22
|
+
end
|
23
|
+
end
|
24
|
+
def int32 n=1
|
25
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
26
|
+
n.times do
|
27
|
+
@code << INT32
|
28
|
+
@s_size += 4
|
29
|
+
end
|
30
|
+
end
|
31
|
+
def int32_swap n=1
|
32
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
33
|
+
n.times do
|
34
|
+
@code << INT32_SWAP
|
35
|
+
@s_size += 4
|
36
|
+
end
|
37
|
+
end
|
38
|
+
def int64 n=1
|
39
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
40
|
+
n.times do
|
41
|
+
@code << INT64
|
42
|
+
@s_size += 8
|
43
|
+
end
|
44
|
+
end
|
45
|
+
def int64_swap n=1
|
46
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
47
|
+
n.times do
|
48
|
+
@code << INT64_SWAP
|
49
|
+
@s_size += 8
|
50
|
+
end
|
51
|
+
end
|
52
|
+
def uint8 n=1
|
53
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
54
|
+
n.times do
|
55
|
+
@code << UINT8
|
56
|
+
@s_size += 1
|
57
|
+
end
|
58
|
+
end
|
59
|
+
def uint16 n=1
|
60
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
61
|
+
n.times do
|
62
|
+
@code << UINT16
|
63
|
+
@s_size += 2
|
64
|
+
end
|
65
|
+
end
|
66
|
+
def uint16_swap n=1
|
67
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
68
|
+
n.times do
|
69
|
+
@code << UINT16_SWAP
|
70
|
+
@s_size += 2
|
71
|
+
end
|
72
|
+
end
|
73
|
+
def uint32 n=1
|
74
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
75
|
+
n.times do
|
76
|
+
@code << UINT32
|
77
|
+
@s_size += 4
|
78
|
+
end
|
79
|
+
end
|
80
|
+
def uint32_swap n=1
|
81
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
82
|
+
n.times do
|
83
|
+
@code << UINT32_SWAP
|
84
|
+
@s_size += 4
|
85
|
+
end
|
86
|
+
end
|
87
|
+
def uint64 n=1
|
88
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
89
|
+
n.times do
|
90
|
+
@code << UINT64
|
91
|
+
@s_size += 8
|
92
|
+
end
|
93
|
+
end
|
94
|
+
def uint64_swap n=1
|
95
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
96
|
+
n.times do
|
97
|
+
@code << UINT64_SWAP
|
98
|
+
@s_size += 8
|
99
|
+
end
|
100
|
+
end
|
101
|
+
def single n=1
|
102
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
103
|
+
n.times do
|
104
|
+
@code << SINGLE
|
105
|
+
@s_size += 4
|
106
|
+
end
|
107
|
+
end
|
108
|
+
def single_swap n=1
|
109
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
110
|
+
n.times do
|
111
|
+
@code << SINGLE_SWAP
|
112
|
+
@s_size += 4
|
113
|
+
end
|
114
|
+
end
|
115
|
+
def double n=1
|
116
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
117
|
+
n.times do
|
118
|
+
@code << DOUBLE
|
119
|
+
@s_size += 8
|
120
|
+
end
|
121
|
+
end
|
122
|
+
def double_swap n=1
|
123
|
+
raise ArgumentError, "repeat count should be >= 1, but got #{n}" if n < 1
|
124
|
+
n.times do
|
125
|
+
@code << DOUBLE_SWAP
|
126
|
+
@s_size += 8
|
127
|
+
end
|
128
|
+
end
|
129
|
+
if ZScan::BinarySpec.big_endian?
|
130
|
+
alias int16_be int16
|
131
|
+
alias int16_le int16_swap
|
132
|
+
alias int32_be int32
|
133
|
+
alias int32_le int32_swap
|
134
|
+
alias int64_be int64
|
135
|
+
alias int64_le int64_swap
|
136
|
+
alias uint16_be uint16
|
137
|
+
alias uint16_le uint16_swap
|
138
|
+
alias uint32_be uint32
|
139
|
+
alias uint32_le uint32_swap
|
140
|
+
alias uint64_be uint64
|
141
|
+
alias uint64_le uint64_swap
|
142
|
+
alias single_be single
|
143
|
+
alias single_le single_swap
|
144
|
+
alias double_be double
|
145
|
+
alias double_le double_swap
|
146
|
+
else
|
147
|
+
alias int16_le int16
|
148
|
+
alias int16_be int16_swap
|
149
|
+
alias int32_le int32
|
150
|
+
alias int32_be int32_swap
|
151
|
+
alias int64_le int64
|
152
|
+
alias int64_be int64_swap
|
153
|
+
alias uint16_le uint16
|
154
|
+
alias uint16_be uint16_swap
|
155
|
+
alias uint32_le uint32
|
156
|
+
alias uint32_be uint32_swap
|
157
|
+
alias uint64_le uint64
|
158
|
+
alias uint64_be uint64_swap
|
159
|
+
alias single_le single
|
160
|
+
alias single_be single_swap
|
161
|
+
alias double_le double
|
162
|
+
alias double_be double_swap
|
163
|
+
end
|
164
|
+
undef int16_swap, int32_swap, int64_swap, uint16_swap, uint32_swap, uint64_swap, single_swap, double_swap
|
165
|
+
end
|
data/rakefile
CHANGED
@@ -4,6 +4,60 @@ version = `command grep 'VERSION =' lib/zscan.rb`[version_re]
|
|
4
4
|
gem_files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,c}}')
|
5
5
|
gem_package = "zscan-#{version}.gem"
|
6
6
|
|
7
|
+
bspec_types = %w[INT8 INT16 INT32 INT64 UINT8 UINT16 UINT32 UINT64 SINGLE DOUBLE]
|
8
|
+
bspec_insns = bspec_types.flat_map{|ty|
|
9
|
+
if ty =~ /INT8/
|
10
|
+
ty
|
11
|
+
else
|
12
|
+
[ty, "#{ty}_SWAP"]
|
13
|
+
end
|
14
|
+
}
|
15
|
+
def bspec_incr ins
|
16
|
+
case ins
|
17
|
+
when /INT(\d+)/; $1.to_i / 8
|
18
|
+
when /SINGLE/; 4
|
19
|
+
when /DOUBLE/; 8
|
20
|
+
else; raise 'bad'
|
21
|
+
end
|
22
|
+
end
|
23
|
+
def bspec_c_type ins
|
24
|
+
case ins
|
25
|
+
when /(U?INT\d+)/; "#{$1.downcase}_t"
|
26
|
+
when /SINGLE/; 'float'
|
27
|
+
when /DOUBLE/; 'double'
|
28
|
+
else; raise 'bad'
|
29
|
+
end
|
30
|
+
end
|
31
|
+
def bspec_extract ins
|
32
|
+
type = bspec_c_type ins
|
33
|
+
len = bspec_incr(ins) * 8
|
34
|
+
r = "((uint#{len}_t*)s)[0]"
|
35
|
+
if ins.end_with?('SWAP')
|
36
|
+
r = "swap#{len}(#{r})"
|
37
|
+
end
|
38
|
+
"uint#{len}_t r = #{r}"
|
39
|
+
end
|
40
|
+
def bspec_convert ins
|
41
|
+
case ins
|
42
|
+
when /(U)?INT64|UINT32/
|
43
|
+
if ins.start_with?('U')
|
44
|
+
"UINT64toNUM(r)"
|
45
|
+
else
|
46
|
+
"INT64toNUM(CAST(r, int64_t))"
|
47
|
+
end
|
48
|
+
when /INT32/
|
49
|
+
"INT2NUM(CAST(r, int32_t))"
|
50
|
+
when /INT(16|8)/
|
51
|
+
"INT2FIX(CAST(r, #{bspec_c_type ins}))"
|
52
|
+
when /SINGLE/
|
53
|
+
"DBL2NUM((double)CAST(r, float))"
|
54
|
+
when /DOUBLE/
|
55
|
+
"DBL2NUM(CAST(r, double))"
|
56
|
+
else
|
57
|
+
raise 'bad'
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
7
61
|
desc "build and test"
|
8
62
|
task :default => [:test, gem_package]
|
9
63
|
|
@@ -30,3 +84,86 @@ file gem_package => gem_files do
|
|
30
84
|
end
|
31
85
|
sh "gem build zscan.gemspec"
|
32
86
|
end
|
87
|
+
|
88
|
+
desc "generate files"
|
89
|
+
task :gen => %w[ext/bspec_exec.inc ext/bspec_opcode_names.inc lib/zscan/instructions.rb]
|
90
|
+
|
91
|
+
file 'ext/bspec_exec.inc' => __FILE__ do
|
92
|
+
puts "generating ext/bspec_exec.inc"
|
93
|
+
opcode_list = bspec_insns.map do |ins|
|
94
|
+
"&&BS_#{ins}"
|
95
|
+
end.join ', '
|
96
|
+
|
97
|
+
opcode_segs = bspec_insns.map do |ins|
|
98
|
+
%Q{BS_#{ins}:
|
99
|
+
{
|
100
|
+
#{bspec_extract ins};
|
101
|
+
rb_ary_push(a, #{bspec_convert ins});
|
102
|
+
s += #{bspec_incr ins};
|
103
|
+
goto **(ip++);
|
104
|
+
}
|
105
|
+
}
|
106
|
+
end.join "\n"
|
107
|
+
|
108
|
+
File.open 'ext/bspec_exec.inc', 'w' do |f|
|
109
|
+
f.puts %Q|// GENERATED WITH: rake gen
|
110
|
+
#line 2 "ext/bspec_exec.inc"
|
111
|
+
__attribute__((__noinline__))
|
112
|
+
static VALUE bspec_exec(void** ip, char* s, VALUE a) {
|
113
|
+
static void* opcodes[] = { &&BS_RET, #{opcode_list} };
|
114
|
+
if (ip == NULL) {
|
115
|
+
return (VALUE)opcodes;
|
116
|
+
}
|
117
|
+
goto **(ip++);
|
118
|
+
BS_RET:
|
119
|
+
return a;
|
120
|
+
#{opcode_segs}
|
121
|
+
}|
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
file 'ext/bspec_opcode_names.inc' => __FILE__ do
|
126
|
+
puts 'generating ext/bspec_opcode_names.inc'
|
127
|
+
opcode_names = bspec_insns.map(&:inspect).join ', '
|
128
|
+
File.open 'ext/bspec_opcode_names.inc', 'w' do |f|
|
129
|
+
f.puts "// GENERATED WITH: rake gen"
|
130
|
+
f.puts %Q|const char* bspec_opcode_names[] = {"RET", #{opcode_names}};|
|
131
|
+
f.puts %Q|long bspec_opcode_size = #{bspec_insns.size + 1};|
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
file 'lib/zscan/instructions.rb' => __FILE__ do
|
136
|
+
puts 'generating lib/zscan/instructions.rb'
|
137
|
+
File.open 'lib/zscan/instructions.rb', 'w' do |f|
|
138
|
+
f.puts "# GENERATED WITH: rake gen"
|
139
|
+
f.puts "class ZScan::BinarySpec"
|
140
|
+
|
141
|
+
bspec_insns.each do |ins|
|
142
|
+
f.puts <<-RUBY
|
143
|
+
def #{ins.downcase} n=1
|
144
|
+
raise ArgumentError, "repeat count should be >= 1, but got \#{n}" if n < 1
|
145
|
+
n.times do
|
146
|
+
@code << #{ins}
|
147
|
+
@s_size += #{bspec_incr ins}
|
148
|
+
end
|
149
|
+
end
|
150
|
+
RUBY
|
151
|
+
end
|
152
|
+
|
153
|
+
alias_ins = (bspec_types - ['INT8', 'UINT8']).map &:downcase
|
154
|
+
f.puts " if ZScan::BinarySpec.big_endian?"
|
155
|
+
alias_ins.each do |ins|
|
156
|
+
f.puts " alias #{ins}_be #{ins}"
|
157
|
+
f.puts " alias #{ins}_le #{ins}_swap"
|
158
|
+
end
|
159
|
+
f.puts " else"
|
160
|
+
alias_ins.each do |ins|
|
161
|
+
f.puts " alias #{ins}_le #{ins}"
|
162
|
+
f.puts " alias #{ins}_be #{ins}_swap"
|
163
|
+
end
|
164
|
+
f.puts " end"
|
165
|
+
swap_ins = alias_ins.map{|ins| "#{ins}_swap"}
|
166
|
+
f.puts " undef #{swap_ins.join ', '}"
|
167
|
+
f.puts "end"
|
168
|
+
end
|
169
|
+
end
|
data/readme.md
CHANGED
@@ -4,6 +4,7 @@
|
|
4
4
|
- `ZScan#pos` is the codepoint position, and `ZScan#bytepos` is byte position.
|
5
5
|
- Correctly scans anchors and look behind predicates.
|
6
6
|
- Pos stack manipulation.
|
7
|
+
- Typed scanning methods: `#scan_float`, `#scan_int radix=nil`, `#scan_date format`, `#scan_binary format`.
|
7
8
|
|
8
9
|
## Install
|
9
10
|
|
@@ -22,7 +23,7 @@ z.scan /\w+/ #=> 'world'
|
|
22
23
|
z.eos? #=> true
|
23
24
|
```
|
24
25
|
|
25
|
-
## Motivation
|
26
|
+
## Motivation - `StringScanner`
|
26
27
|
|
27
28
|
Ruby's stdlib `StringScanner` treats the scanning position as beginning of string:
|
28
29
|
|
@@ -46,41 +47,112 @@ z.scan /^/ #=> nil
|
|
46
47
|
|
47
48
|
See also https://bugs.ruby-lang.org/issues/7092
|
48
49
|
|
50
|
+
## Other motivations - `scanf` / `strptime` / `unpack`
|
51
|
+
|
52
|
+
- For scan and convert, ruby's stdlib `Scanf` is slow (creates a regexp array every time it is called) and cannot cooperate with a scanner.
|
53
|
+
- For date parsing, `strptime` doesn't tell the parsed length.
|
54
|
+
- For binary parsing, `unpack` is a slow interpreter, and the instructions are quite irregular.
|
55
|
+
|
49
56
|
## Essential methods
|
50
57
|
|
51
58
|
- `ZScan.new string, dup=false`
|
52
59
|
- `#scan regexp_or_string`
|
53
60
|
- `#skip regexp_or_string`
|
54
61
|
- `#match_bytesize regexp_or_string` return length of matched bytes or `nil`.
|
62
|
+
- `#scan_float` scans a float number that does not start with a space. It deals with multibyte encodings for you.
|
63
|
+
- `#scan_int radix=nil` if radix is nil, decide base by prefix: `0x` is 16, `0` is 8, `0b` is 2, otherwise 10. `radix` should be in range `2..36`.
|
64
|
+
- `#scan_date format_string, start=Date::ITALY` scan a `DateTime` object, see also [strptime](http://rubydoc.info/stdlib/date/DateTime.strptime).
|
65
|
+
- `#scan_binary binary_spec` optimized and readable binary scan, see below for how to create a `ZScan::BinarySpec`.
|
66
|
+
- `#unpack format_string`
|
55
67
|
- `#eos?`
|
56
68
|
- `#string` note: return a dup. Don't worry about the performance because it is a copy-on-write string.
|
57
69
|
- `#rest`
|
58
70
|
|
71
|
+
## String delegates
|
72
|
+
|
73
|
+
For convenience
|
74
|
+
|
75
|
+
- `#<< append_string`
|
76
|
+
- `#[]= range, replace_string` note: if `range` starts before pos, moves pos left, also clears the stack.
|
77
|
+
- `#size`
|
78
|
+
- `#bytesize`
|
79
|
+
|
80
|
+
## Parsing combinators
|
81
|
+
|
82
|
+
Combinators that manage scanner pos and stack state for you. In the combinators, if the returned value of the given block is `nil` or `false`, stops iteration. Can be nested, useful for building parsers.
|
83
|
+
|
84
|
+
- `#try &block` returns `block`'s return.
|
85
|
+
- `#zero_or_one result=[], &block` try to execute 0 or 1 time, returns `result`.
|
86
|
+
- `#zero_or_more result=[], &block` try to execute 0 or more times, also stops iteration if the scanner does not advance, returns `result`.
|
87
|
+
- `#one_or_more result=[], &block` try to execute 1 or more times, also stops iteration if the scanner does not advance, returns `nil` or `result`.
|
88
|
+
|
59
89
|
## Pos management
|
60
90
|
|
61
91
|
- `#pos`
|
62
92
|
- `#pos= new_pos` note: complexity ~ `new_pos > pos ? new_pos - pos : new_pos`.
|
63
93
|
- `#bytepos`
|
64
94
|
- `#bytepos= new_bytepos` note: complexity ~ `abs(new_bytepos - bytepos)`.
|
95
|
+
- `#line_index` line index of current position, start from `0`.
|
65
96
|
- `#advance n` move forward `n` codepoints, if `n < 0`, move backward. Stops at beginning or end.
|
66
97
|
- `#reset` go to beginning.
|
67
98
|
- `#terminate` go to end of string.
|
68
99
|
|
69
|
-
## Efficient pos stack manipulation
|
100
|
+
## (Low level) Efficient pos stack manipulation
|
70
101
|
|
71
102
|
- `#push` push current pos into the stack.
|
72
103
|
- `#pop` set current pos to top of the stack, and pop it.
|
73
104
|
- `#drop` drop top of pos stack without changing current pos.
|
74
105
|
- `#restore` set current pos to top of the stack.
|
75
106
|
- `#clear_pos_stack` clear pos stack.
|
76
|
-
- `#try` try to do several scans in the given block, fall back to init pos if block returns `nil` or `false`. Returns block's return, can be nested.
|
77
107
|
|
78
|
-
##
|
108
|
+
## `ZScan::BinarySpec`
|
79
109
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
110
|
+
Specify a sequence of binary data. Designed for binary protocol parsing. Example:
|
111
|
+
|
112
|
+
```ruby
|
113
|
+
# create a ZScan::BinarySpec
|
114
|
+
s = ZScan.binary_spec do
|
115
|
+
int8 # once
|
116
|
+
uint32_le 2 # little endian, twice
|
117
|
+
double_be 1 # big endian, once
|
118
|
+
end
|
119
|
+
z = ZScan.new [-1, 2, 3, 4.0].pack('cI<2G') + "rest"
|
120
|
+
z.scan_binary s #=> [-1, 2, 3, 4.0]
|
121
|
+
z.rest #=> 'rest'
|
122
|
+
```
|
123
|
+
|
124
|
+
Integer instructions:
|
125
|
+
|
126
|
+
```ruby
|
127
|
+
int8 uint8
|
128
|
+
int16 uint16 int16_le uint16_le int16_be uint16_be
|
129
|
+
int32 uint32 int32_le uint32_le int32_be uint32_be
|
130
|
+
int64 uint64 int64_le uint64_le int64_be uint64_be
|
131
|
+
```
|
132
|
+
|
133
|
+
Single precision float instructions:
|
134
|
+
|
135
|
+
```ruby
|
136
|
+
single single_le single_be
|
137
|
+
```
|
138
|
+
|
139
|
+
Double precision float instructions:
|
140
|
+
|
141
|
+
```ruby
|
142
|
+
double double_le double_be
|
143
|
+
```
|
144
|
+
|
145
|
+
Endians:
|
146
|
+
|
147
|
+
- (without endian suffix) native endian
|
148
|
+
- `*_le` little endian (VAX, x86, Windows string code unit)
|
149
|
+
- `*_be` big endian, network endian (SPARC, Java string code unit)
|
150
|
+
|
151
|
+
Repeat count must be integer `>= 1`, default is `1`.
|
152
|
+
|
153
|
+
It is implemented as a direct-threaded bytecode interpreter. Performance vs `String#unpack`:
|
154
|
+
|
155
|
+
todo
|
84
156
|
|
85
157
|
## License
|
86
158
|
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require_relative "spec_helper"
|
2
|
+
|
3
|
+
describe 'ZScan binary scanning methods' do
|
4
|
+
it "#unpack" do
|
5
|
+
z = ZScan.new "\x01\x02\x03"
|
6
|
+
assert_raise ArgumentError do
|
7
|
+
z.unpack '@1C'
|
8
|
+
end
|
9
|
+
assert_equal [1, 2], (z.unpack 'CC')
|
10
|
+
assert_equal 2, z.pos
|
11
|
+
assert_equal nil, (z.unpack 'I')
|
12
|
+
assert_equal 2, z.pos
|
13
|
+
end
|
14
|
+
|
15
|
+
it "#scan_binary" do
|
16
|
+
s = ZScan.binary_spec do
|
17
|
+
int8 # once
|
18
|
+
uint32_le 2 # little endian, twice
|
19
|
+
double_be 1 # big endian, once
|
20
|
+
single 1
|
21
|
+
end
|
22
|
+
a = [-1, 2, 3, 4.0, 3.0]
|
23
|
+
z = ZScan.new(a.pack('cI<2Gf') + 'rest')
|
24
|
+
b = z.scan_binary s
|
25
|
+
assert_equal 'rest', z.rest
|
26
|
+
assert_equal a, b
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require_relative "spec_helper"
|
2
|
+
|
3
|
+
describe 'ZScan combinators' do
|
4
|
+
it "#try restores pos" do
|
5
|
+
z = ZScan.new "hello"
|
6
|
+
return1 = z.try do
|
7
|
+
z.scan 'h'
|
8
|
+
z.scan 'e'
|
9
|
+
end
|
10
|
+
assert_equal 'e', return1
|
11
|
+
assert_equal 2, z.pos
|
12
|
+
|
13
|
+
return2 = z.try do
|
14
|
+
z.scan 'l'
|
15
|
+
z.scan 'l'
|
16
|
+
z.scan 'p' # fails
|
17
|
+
end
|
18
|
+
assert_equal nil, return2
|
19
|
+
assert_equal 2, z.pos
|
20
|
+
end
|
21
|
+
|
22
|
+
it "#zero_or_one" do
|
23
|
+
z = Zscan.new "aab"
|
24
|
+
assert_equal ['a'], z.zero_or_one{z.scan 'a'}
|
25
|
+
assert_equal 1, z.pos
|
26
|
+
|
27
|
+
z = Zscan.new 'aab'
|
28
|
+
assert_equal [], z.zero_or_one{z.scan 'b'}
|
29
|
+
assert_equal 0, z.pos
|
30
|
+
end
|
31
|
+
|
32
|
+
it "#zero_or_more" do
|
33
|
+
z = Zscan.new "aab"
|
34
|
+
assert_equal ['a', 'a'], z.zero_or_more{z.scan 'a'}
|
35
|
+
assert_equal 2, z.pos
|
36
|
+
|
37
|
+
assert_equal 'aab', z.zero_or_more('aa'){z.scan 'c'; z.scan 'b'}
|
38
|
+
|
39
|
+
z = Zscan.new 'aab'
|
40
|
+
assert_equal [], z.zero_or_more{z.scan 'b'}
|
41
|
+
assert_equal 0, z.pos
|
42
|
+
end
|
43
|
+
|
44
|
+
it "#one_or_more" do
|
45
|
+
z = Zscan.new 'aab'
|
46
|
+
assert_equal ['a', 'a'], z.one_or_more{z.scan 'a'}
|
47
|
+
assert_equal 2, z.pos
|
48
|
+
|
49
|
+
z = Zscan.new 'aab'
|
50
|
+
assert_equal nil, z.one_or_more([]){z.scan 'b'}
|
51
|
+
end
|
52
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
require_relative "spec_helper"
|
2
|
+
|
3
|
+
describe "typed scan" do
|
4
|
+
it "#scan_int" do
|
5
|
+
z = Zscan.new " 1 0b10F5 10 030"
|
6
|
+
assert_equal nil, z.scan_int
|
7
|
+
z.advance 1
|
8
|
+
assert_equal 1, z.scan_int(10)
|
9
|
+
|
10
|
+
z.advance 1
|
11
|
+
assert_equal 0b10, z.scan_int
|
12
|
+
assert_equal 0xF5, z.scan_int(16)
|
13
|
+
|
14
|
+
z.advance 1
|
15
|
+
assert_equal 12, z.scan_int(12)
|
16
|
+
|
17
|
+
z.advance 1
|
18
|
+
assert_equal 030, z.scan_int
|
19
|
+
end
|
20
|
+
|
21
|
+
it "#scan_float" do
|
22
|
+
z = Zscan.new " -3.5e23"
|
23
|
+
assert_equal nil, z.scan_float
|
24
|
+
z.advance 1
|
25
|
+
assert_equal -3.5e23, z.scan_float
|
26
|
+
end
|
27
|
+
|
28
|
+
it "won't overflow in #scan_float" do
|
29
|
+
s = '1.23E15'.byteslice 0, 4
|
30
|
+
z = Zscan.new s
|
31
|
+
assert_equal 1.23, z.scan_float
|
32
|
+
assert_equal 4, z.pos
|
33
|
+
end
|
34
|
+
|
35
|
+
it "#scan_date" do
|
36
|
+
z = Zscan.new " 2001 04 6 04 05 06 +7 231rest"
|
37
|
+
assert_equal nil, z.scan_date('%Y %U %w %H %M %S %z %N')
|
38
|
+
z.advance 1
|
39
|
+
|
40
|
+
d = z.scan_date '%Y %U %w %H %M %S %z %N'
|
41
|
+
assert_equal 0.231, d.sec_fraction
|
42
|
+
assert_equal 'rest', z.rest
|
43
|
+
|
44
|
+
z.pos = 1
|
45
|
+
z.scan_date '%Y %U %w ahoy %H %M %S %z' # bad format
|
46
|
+
assert_equal 1, z.pos
|
47
|
+
end
|
48
|
+
end
|
data/spec/zscan_spec.rb
CHANGED
@@ -1,18 +1,10 @@
|
|
1
|
-
require_relative "
|
2
|
-
require 'rspec/autorun'
|
3
|
-
RSpec.configure do |config|
|
4
|
-
config.expect_with :stdlib
|
5
|
-
end
|
1
|
+
require_relative "spec_helper"
|
6
2
|
|
7
3
|
describe ZScan do
|
8
4
|
before :each do
|
9
5
|
@z = ZScan.new 'ab你好'
|
10
6
|
end
|
11
7
|
|
12
|
-
before :all do
|
13
|
-
GC.stress = true
|
14
|
-
end
|
15
|
-
|
16
8
|
it "random workflow" do
|
17
9
|
assert_equal 2, @z.match_bytesize('ab')
|
18
10
|
@z.pos = 4
|
@@ -76,21 +68,23 @@ describe ZScan do
|
|
76
68
|
assert_equal 3, @z.pos
|
77
69
|
end
|
78
70
|
|
79
|
-
it "#
|
80
|
-
z = ZScan.new
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
assert_equal
|
86
|
-
assert_equal 2, z.pos
|
71
|
+
it "#reset, #terminate and #line_index" do
|
72
|
+
z = ZScan.new ''
|
73
|
+
assert_equal 0, z.line_index
|
74
|
+
z.terminate
|
75
|
+
assert_equal 0, z.line_index
|
76
|
+
z.reset
|
77
|
+
assert_equal 0, z.line_index
|
87
78
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
assert_equal
|
94
|
-
|
79
|
+
z = ZScan.new "a\nb\nc"
|
80
|
+
assert_equal 0, z.line_index
|
81
|
+
z.terminate
|
82
|
+
assert_equal 2, z.line_index
|
83
|
+
z.reset
|
84
|
+
assert_equal 0, z.line_index
|
85
|
+
z.pos = 1
|
86
|
+
assert_equal 0, z.line_index
|
87
|
+
z.pos = 2
|
88
|
+
assert_equal 1, z.line_index
|
95
89
|
end
|
96
90
|
end
|
data/zscan.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = "zscan"
|
3
|
-
s.version = "1.
|
3
|
+
s.version = "1.1" # version mapped from zscan.rb, don't change here
|
4
4
|
s.author = "Zete Lui"
|
5
5
|
s.homepage = "https://github.com/luikore/zscan"
|
6
6
|
s.platform = Gem::Platform::RUBY
|
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.required_ruby_version = ">=1.9.2"
|
10
10
|
s.licenses = ['BSD']
|
11
11
|
|
12
|
-
s.files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,c}}')
|
12
|
+
s.files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,c,inc}}')
|
13
13
|
s.require_paths = ["lib"]
|
14
14
|
s.extensions = ["ext/extconf.rb"]
|
15
15
|
s.rubygems_version = '1.8.24'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zscan
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: '1.1'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Zete Lui
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-05-
|
11
|
+
date: 2013-05-14 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: improved string scanner, respects anchors and lookbehinds, supports codepoint
|
14
14
|
positioning
|
@@ -21,11 +21,19 @@ files:
|
|
21
21
|
- rakefile
|
22
22
|
- zscan.gemspec
|
23
23
|
- readme.md
|
24
|
-
-
|
24
|
+
- benchmark/vs-strscan.rb
|
25
|
+
- benchmark/vs-unpack.rb
|
25
26
|
- ext/extconf.rb
|
27
|
+
- lib/zscan/instructions.rb
|
26
28
|
- lib/zscan.rb
|
29
|
+
- spec/binary_scan_spec.rb
|
30
|
+
- spec/combinator_spec.rb
|
31
|
+
- spec/spec_helper.rb
|
32
|
+
- spec/typed_scan_spec.rb
|
27
33
|
- spec/zscan_spec.rb
|
28
34
|
- ext/zscan.c
|
35
|
+
- ext/bspec_exec.inc
|
36
|
+
- ext/bspec_opcode_names.inc
|
29
37
|
homepage: https://github.com/luikore/zscan
|
30
38
|
licenses:
|
31
39
|
- BSD
|