zscan 1.3 → 2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/benchmark/vs-unpack.rb +5 -4
- data/ext/bspec.c +34 -22
- data/ext/bspec_exec.inc +48 -11
- data/ext/bspec_init.inc +3 -3
- data/ext/extconf.rb +13 -2
- data/ext/pack/COPYING +56 -0
- data/ext/pack/COPYING.ja +51 -0
- data/ext/pack/internal.h +419 -0
- data/ext/pack/pack.c +2295 -0
- data/ext/zscan.c +37 -6
- data/ext/zscan.h +4 -0
- data/generate/bspec.rb +48 -0
- data/generate/bspec_exec.inc +29 -0
- data/generate/bspec_init.inc +2 -0
- data/generate/generate.rb +147 -0
- data/lib/zscan.rb +4 -37
- data/lib/zscan/bspec.rb +168 -0
- data/rakefile +29 -139
- data/readme.md +69 -24
- data/spec/binary_scan_spec.rb +35 -11
- data/spec/typed_scan_spec.rb +6 -0
- data/spec/zscan_spec.rb +8 -0
- data/zscan.gemspec +2 -2
- metadata +11 -3
- data/lib/zscan/instructions.rb +0 -147
data/rakefile
CHANGED
@@ -1,74 +1,28 @@
|
|
1
|
+
require_relative "generate/generate"
|
1
2
|
Dir.chdir __dir__
|
2
|
-
version_re
|
3
|
-
version
|
4
|
-
gem_files
|
5
|
-
gem_package
|
3
|
+
version_re = /\d+(\.\d+)*/
|
4
|
+
version = `command grep 'VERSION =' lib/zscan.rb`[version_re]
|
5
|
+
gem_files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,h,c,inc},ext/pack/COPYING*}')
|
6
|
+
gem_package = "zscan-#{version}.gem"
|
7
|
+
generate_src = [__FILE__, *Dir.glob('generate/*')]
|
6
8
|
|
7
|
-
|
8
|
-
|
9
|
-
if ty =~ /INT8/
|
10
|
-
ty
|
11
|
-
else
|
12
|
-
[ty, "#{ty}_SWAP"]
|
13
|
-
end
|
14
|
-
}
|
15
|
-
def bspec_incr ins
|
16
|
-
case ins
|
17
|
-
when /INT(\d+)/; $1.to_i / 8
|
18
|
-
when /SINGLE/; 4
|
19
|
-
when /DOUBLE/; 8
|
20
|
-
else; raise 'bad'
|
21
|
-
end
|
22
|
-
end
|
23
|
-
def bspec_c_type ins
|
24
|
-
case ins
|
25
|
-
when /(U?INT\d+)/; "#{$1.downcase}_t"
|
26
|
-
when /SINGLE/; 'float'
|
27
|
-
when /DOUBLE/; 'double'
|
28
|
-
else; raise 'bad'
|
29
|
-
end
|
30
|
-
end
|
31
|
-
def bspec_extract ins
|
32
|
-
type = bspec_c_type ins
|
33
|
-
len = bspec_incr(ins) * 8
|
34
|
-
r = "((uint#{len}_t*)s)[0]"
|
35
|
-
if ins.end_with?('SWAP')
|
36
|
-
r = "swap#{len}(#{r})"
|
37
|
-
end
|
38
|
-
"uint#{len}_t r = #{r}"
|
39
|
-
end
|
40
|
-
def bspec_convert ins
|
41
|
-
case ins
|
42
|
-
when /(U)?INT64|UINT32/
|
43
|
-
if ins.start_with?('U')
|
44
|
-
"UINT64toNUM(r)"
|
45
|
-
else
|
46
|
-
"INT64toNUM(CAST(r, int64_t))"
|
47
|
-
end
|
48
|
-
when /INT32/
|
49
|
-
"INT2NUM(CAST(r, int32_t))"
|
50
|
-
when /INT(16|8)/
|
51
|
-
"INT2FIX(CAST(r, #{bspec_c_type ins}))"
|
52
|
-
when /SINGLE/
|
53
|
-
"DBL2NUM((double)CAST(r, float))"
|
54
|
-
when /DOUBLE/
|
55
|
-
"DBL2NUM(CAST(r, double))"
|
56
|
-
else
|
57
|
-
raise 'bad'
|
58
|
-
end
|
59
|
-
end
|
9
|
+
desc "generate, build and test, and pack gem"
|
10
|
+
task :default => [:gen, :test, gem_package]
|
60
11
|
|
61
12
|
desc "build and test"
|
62
|
-
task :
|
63
|
-
|
64
|
-
desc "build and run test"
|
65
|
-
task :test do
|
13
|
+
task :test => 'ext/Makefile' do
|
66
14
|
sh "make -C ext"
|
67
15
|
sh "rspec"
|
68
16
|
end
|
69
17
|
|
18
|
+
file 'ext/Makefile' do
|
19
|
+
Dir.chdir 'ext' do
|
20
|
+
sh "ruby extconf.rb"
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
70
24
|
desc "pack gem"
|
71
|
-
file gem_package => gem_files do
|
25
|
+
file gem_package => gem_files + Generate.files do
|
72
26
|
sh "rm zscan-*.gem"
|
73
27
|
|
74
28
|
new_version = false
|
@@ -82,87 +36,23 @@ file gem_package => gem_files do
|
|
82
36
|
if new_version
|
83
37
|
File.open('zscan.gemspec', 'w'){|f| f << lines.join }
|
84
38
|
end
|
39
|
+
puts "packing files:"
|
40
|
+
puts '-' * 40
|
41
|
+
puts gem_files
|
42
|
+
puts '-' * 40
|
85
43
|
sh "gem build zscan.gemspec"
|
86
44
|
end
|
87
45
|
|
88
46
|
desc "generate files"
|
89
|
-
task :gen =>
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
%Q{BS_#{ins}:
|
99
|
-
{
|
100
|
-
#{bspec_extract ins};
|
101
|
-
rb_ary_push(a, #{bspec_convert ins});
|
102
|
-
s += #{bspec_incr ins};
|
103
|
-
goto **(ip++);
|
104
|
-
}
|
105
|
-
}
|
106
|
-
end.join "\n"
|
107
|
-
|
108
|
-
File.open 'ext/bspec_exec.inc', 'w' do |f|
|
109
|
-
f.puts %Q|// GENERATED WITH: rake gen
|
110
|
-
#line 2 "ext/bspec_exec.inc"
|
111
|
-
__attribute__((__noinline__))
|
112
|
-
static VALUE bspec_exec(void** ip, char* s, VALUE a) {
|
113
|
-
static void* opcodes[] = { &&BS_RET, #{opcode_list} };
|
114
|
-
if (ip == NULL) {
|
115
|
-
return (VALUE)opcodes;
|
116
|
-
}
|
117
|
-
goto **(ip++);
|
118
|
-
BS_RET:
|
119
|
-
return a;
|
120
|
-
#{opcode_segs}
|
121
|
-
}|
|
122
|
-
end
|
123
|
-
end
|
124
|
-
|
125
|
-
file 'ext/bspec_init.inc' => __FILE__ do
|
126
|
-
puts 'generating ext/bspec_init.inc'
|
127
|
-
opcode_incrs = bspec_insns.map{|ins| bspec_incr ins}.join ', '
|
128
|
-
File.open 'ext/bspec_init.inc', 'w' do |f|
|
129
|
-
f.puts "// GENERATED WITH: rake gen"
|
130
|
-
f.puts %Q|static const long bspec_s_sizes[] = {0, #{opcode_incrs}};|
|
131
|
-
f.puts %Q|static const long bspec_opcodes_size = #{bspec_insns.size + 1};|
|
132
|
-
end
|
133
|
-
end
|
134
|
-
|
135
|
-
file 'lib/zscan/instructions.rb' => __FILE__ do
|
136
|
-
puts 'generating lib/zscan/instructions.rb'
|
137
|
-
File.open 'lib/zscan/instructions.rb', 'w' do |f|
|
138
|
-
f.puts "# GENERATED WITH: rake gen"
|
139
|
-
f.puts "class ZScan::BinarySpec"
|
140
|
-
|
141
|
-
bspec_insns.each_with_index do |ins, i|
|
142
|
-
f.puts <<-RUBY
|
143
|
-
def #{ins.downcase} n=1
|
144
|
-
raise ArgumentError, "repeat count should be >= 1, but got \#{n}" if n < 1
|
145
|
-
n.times do
|
146
|
-
append #{i + 1}
|
147
|
-
end
|
148
|
-
end
|
149
|
-
RUBY
|
150
|
-
end
|
151
|
-
|
152
|
-
alias_ins = (bspec_types - ['INT8', 'UINT8']).map &:downcase
|
153
|
-
f.puts " if ZScan::BinarySpec.big_endian?"
|
154
|
-
alias_ins.each do |ins|
|
155
|
-
f.puts " alias #{ins}_be #{ins}"
|
156
|
-
f.puts " alias #{ins}_le #{ins}_swap"
|
157
|
-
end
|
158
|
-
f.puts " else"
|
159
|
-
alias_ins.each do |ins|
|
160
|
-
f.puts " alias #{ins}_le #{ins}"
|
161
|
-
f.puts " alias #{ins}_be #{ins}_swap"
|
47
|
+
task :gen => Generate.files
|
48
|
+
|
49
|
+
Generate.files.each do |name|
|
50
|
+
file name => generate_src do
|
51
|
+
puts "generating #{name}"
|
52
|
+
r = Generate.generate(name)
|
53
|
+
File.open name, 'w' do |f|
|
54
|
+
f.puts "#{name.end_with?('rb') ? '#' : '//'} generated by rake gen"
|
55
|
+
f << r
|
162
56
|
end
|
163
|
-
f.puts " end"
|
164
|
-
swap_ins = alias_ins.map{|ins| "#{ins}_swap"}
|
165
|
-
f.puts " undef #{swap_ins.join ', '}"
|
166
|
-
f.puts "end"
|
167
57
|
end
|
168
58
|
end
|
data/readme.md
CHANGED
@@ -4,7 +4,8 @@
|
|
4
4
|
- `ZScan#pos` is the codepoint position, and `ZScan#bytepos` is byte position.
|
5
5
|
- Correctly scans anchors and look behind predicates.
|
6
6
|
- Pos stack manipulation.
|
7
|
-
- Typed scanning methods: `#scan_float`, `#scan_int radix=nil`, `#scan_date format
|
7
|
+
- Typed scanning methods: `#scan_float`, `#scan_int radix=nil`, `#scan_date format`.
|
8
|
+
- Binary scanning methods: `#scan_bytes spec`, `#unpack format`.
|
8
9
|
|
9
10
|
## Install
|
10
11
|
|
@@ -50,22 +51,20 @@ See also https://bugs.ruby-lang.org/issues/7092
|
|
50
51
|
## Other motivations
|
51
52
|
|
52
53
|
- For scan and convert, ruby's stdlib `Scanf` is slow (creates a regexp array every time it is called) and cannot cooperate with a scanner.
|
53
|
-
- For date parsing, `strptime` doesn't tell the parsed length.
|
54
|
-
- For binary parsing, `unpack` is an slow interpreter, and the instructions are quite irregular.
|
54
|
+
- For date parsing, `Date#strptime` doesn't tell the parsed length.
|
55
|
+
- For binary parsing, `String#unpack` is a slow interpreter, it doesn't tell the parsed length either, and the instructions are quite irregular.
|
55
56
|
|
56
57
|
## Essential methods
|
57
58
|
|
58
59
|
- `ZScan.new string, dup=false`
|
59
60
|
- `#scan regexp_or_string`
|
60
|
-
- `#skip regexp_or_string`
|
61
|
+
- `#skip regexp_or_string` return new byte pos or `nil`
|
61
62
|
- `#match_bytesize regexp_or_string` return length of matched bytes or `nil`.
|
62
63
|
- `#slice n` slice sub string of n chars from current pos, advances the cursor.
|
63
64
|
- `#byteslice n` slice sub string of n bytes from cursor pos, advances the cursor.
|
64
65
|
- `#scan_float` scan a float number which is not starting with space. It deals with multibyte encodings for you.
|
65
66
|
- `#scan_int radix=nil` if radix is nil, decide base by prefix: `0x` is 16, `0` is 8, `0b` is 2, otherwise 10. `radix` should be in range `2..36`.
|
66
67
|
- `#scan_date format_string, start=Date::ITALY` scan a `DateTime` object, see also [strptime](http://rubydoc.info/stdlib/date/DateTime.strptime).
|
67
|
-
- `#scan_binary binary_spec` optimized and readable binary scan, see below for how to create a `ZScan::BinarySpec`.
|
68
|
-
- `#unpack format_string`
|
69
68
|
- `#eos?`
|
70
69
|
- `#string` note: return a dup. Don't worry the performance because it is a copy-on-write string.
|
71
70
|
- `#rest` rest unscanned sub string.
|
@@ -80,6 +79,7 @@ For convienience
|
|
80
79
|
- `#[]= range, replace_string` note: if `range` starts before pos, moves pos left, also clears the stack.
|
81
80
|
- `#size`
|
82
81
|
- `#bytesize`
|
82
|
+
- `#cleanup` cleanup substring before current pos.
|
83
83
|
|
84
84
|
## Pos management
|
85
85
|
|
@@ -92,53 +92,70 @@ For convienience
|
|
92
92
|
- `#reset` go to beginning.
|
93
93
|
- `#terminate` go to end of string.
|
94
94
|
|
95
|
-
## Binary
|
95
|
+
## Binary scanning
|
96
96
|
|
97
|
-
|
97
|
+
- `#scan_bytes bspec` optimized and readable binary scan, see below for how to create a `ZScan::BSpec`.
|
98
|
+
- `#unpack unpack_format_string` note that it always returns an array, whether or not anything matched (same behavior as `String#unpack`).
|
99
|
+
|
100
|
+
#### Bytes spec
|
101
|
+
|
102
|
+
Bytes spec is designed for fast binary protocol parsing. You can specify a sequence of binary data and how to expect the matching.
|
103
|
+
|
104
|
+
Unlike `#unpack`, bytes spec uses English names to specify the data sequence. It returns `nil` if any of the instructions does not match. However, there are no string, position-changing, or variable-length instructions.
|
105
|
+
|
106
|
+
Bytes spec is implemented as a direct-threaded VM, and it is faster than `#unpack`.
|
107
|
+
|
108
|
+
Example:
|
98
109
|
|
99
110
|
```ruby
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
111
|
+
s = ZScan::BSpec.new do
|
112
|
+
int8 expect: -1 # return nil if the first int8 is not -1
|
113
|
+
2.times{
|
114
|
+
uint32_le # le means: little endian
|
115
|
+
}
|
116
|
+
double_be # be means: big endian
|
105
117
|
end
|
118
|
+
|
106
119
|
z = ZScan.new [-1, 2, 3, 4.0].pack('cI<2G') + "rest"
|
107
|
-
z.
|
120
|
+
z.scan_bytes s #=> [-1, 2, 3, 4.0]
|
108
121
|
z.rest #=> 'rest'
|
122
|
+
|
123
|
+
bad_z = ZScan.new [1, 2, 3, 4.0].pack('cI<2G') # first byte does not match
|
124
|
+
bad_z.scan_bytes s #=> nil
|
109
125
|
```
|
110
126
|
|
111
127
|
Integer instructions:
|
112
128
|
|
113
129
|
```ruby
|
114
|
-
int8 uint8
|
130
|
+
int8 uint8 byte # byte is the same as uint8
|
115
131
|
int16 uint16 int16_le uint16_le int16_be uint16_be
|
116
132
|
int32 uint32 int32_le uint32_le int32_be uint32_be
|
117
133
|
int64 uint64 int64_le uint64_le int64_be uint64_be
|
118
134
|
```
|
119
135
|
|
120
|
-
|
136
|
+
Only integer instructions support the `:expect` option; matching stops immediately if the scanned result is not equal to the expected number.
|
137
|
+
|
138
|
+
Double precision float instructions:
|
121
139
|
|
122
140
|
```ruby
|
123
|
-
|
141
|
+
double double_le double_be
|
124
142
|
```
|
125
143
|
|
126
|
-
|
144
|
+
Single precision float instructions:
|
127
145
|
|
128
146
|
```ruby
|
129
|
-
|
147
|
+
float float_le float_be
|
148
|
+
single single_le single_be # same as float*
|
130
149
|
```
|
131
150
|
|
132
|
-
|
151
|
+
Note that Ruby floats are in fact doubles. In the rare case that you need to keep the original single-precision data instead of converting it into doubles, you can use `uint32` for the job.
|
152
|
+
|
153
|
+
A note on endians:
|
133
154
|
|
134
155
|
- (without endian suffix) native endian
|
135
156
|
- `*_le` little endian (VAX, x86, Windows string code unit)
|
136
157
|
- `*_be` big endian, network endian (SPARC, Java string code unit)
|
137
158
|
|
138
|
-
Repeat count must be integer `>= 1`, default is `1`.
|
139
|
-
|
140
|
-
It is implemented as a direct-threaded bytecode interpreter. A bit faster than `String#unpack`.
|
141
|
-
|
142
159
|
## Parsing combinators
|
143
160
|
|
144
161
|
Combinators that manage scanner pos and stack state for you. In the combinators, if the returned value of the given block is `nil` or `false`, stops iteration and restores scanner location. Can be nested, useful for building parsers.
|
@@ -157,6 +174,34 @@ Combinators that manage scanner pos and stack state for you. In the combinators,
|
|
157
174
|
- `#clear_pos_stack` clear pos stack.
|
158
175
|
- `z.push._try expr` equivalent to `z.try{ expr }`, but faster because no block is required
|
159
176
|
|
177
|
+
## Features QA
|
178
|
+
|
179
|
+
#### `scan_until` and `skip_until`?
|
180
|
+
|
181
|
+
For example, the StringScanner call
|
182
|
+
|
183
|
+
```ruby
|
184
|
+
strscan.scan_until /a/
|
185
|
+
```
|
186
|
+
|
187
|
+
Is equivalent to a slightly different regexp
|
188
|
+
|
189
|
+
```ruby
|
190
|
+
zscan.scan /.*a/m
|
191
|
+
```
|
192
|
+
|
193
|
+
#### Capture groups?
|
194
|
+
|
195
|
+
Not implemented yet. Maybe future?
|
196
|
+
|
197
|
+
#### `unscan`?
|
198
|
+
|
199
|
+
Use pos management methods.
|
200
|
+
|
201
|
+
#### Erlang style bitstring?
|
202
|
+
|
203
|
+
Thought of that but the API can be quirky... It's way beyond a string scanner.
|
204
|
+
|
160
205
|
## License
|
161
206
|
|
162
207
|
```
|
data/spec/binary_scan_spec.rb
CHANGED
@@ -4,27 +4,51 @@ require_relative "spec_helper"
|
|
4
4
|
describe 'ZScan binary scanning methods' do
|
5
5
|
it "#unpack" do
|
6
6
|
z = ZScan.new "\x01\x02\x03"
|
7
|
-
assert_raise ArgumentError do
|
8
|
-
z.unpack '@1C'
|
9
|
-
end
|
10
7
|
assert_equal [1, 2], (z.unpack 'CC')
|
11
8
|
assert_equal 2, z.pos
|
12
|
-
assert_equal nil, (z.unpack 'I')
|
9
|
+
assert_equal [nil], (z.unpack 'I')
|
13
10
|
assert_equal 2, z.pos
|
14
11
|
end
|
15
12
|
|
16
|
-
it "#
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
13
|
+
it "#unpack position-changing instructions and var-length instructions" do
|
14
|
+
z = ZScan.new "abcd\0abc"
|
15
|
+
s, _ = z.unpack 'Z*'
|
16
|
+
assert_equal "abcd", s
|
17
|
+
assert_equal 5, z.pos
|
18
|
+
|
19
|
+
z.reset
|
20
|
+
s, _ = z.unpack '@2Z*'
|
21
|
+
assert_equal 'cd', s
|
22
|
+
end
|
23
|
+
|
24
|
+
it "#scan_bytes" do
|
25
|
+
s = ZScan::BSpec.new do
|
26
|
+
int8
|
27
|
+
2.times{ uint32_le } # little endian
|
28
|
+
double_be # big endian
|
29
|
+
single
|
22
30
|
end
|
23
31
|
|
24
32
|
a = [-1, 2, 3, 4.0, 3.0]
|
25
33
|
z = ZScan.new(a.pack('cI<2Gf') + 'rest')
|
26
|
-
b = z.
|
34
|
+
b = z.scan_bytes s
|
27
35
|
assert_equal 'rest', z.rest
|
28
36
|
assert_equal a, b
|
29
37
|
end
|
38
|
+
|
39
|
+
it "#scan_bytes with expectation" do
|
40
|
+
s = ZScan::BSpec.new do
|
41
|
+
int8 expect: 3
|
42
|
+
float
|
43
|
+
end
|
44
|
+
|
45
|
+
a = [3, 4.0]
|
46
|
+
z = ZScan.new a.pack('cf')
|
47
|
+
assert_equal a, z.scan_bytes(s)
|
48
|
+
|
49
|
+
a = [2, 4.0]
|
50
|
+
z = ZScan.new a.pack('cf')
|
51
|
+
assert_equal nil, z.scan_bytes(s)
|
52
|
+
assert_equal 0, z.pos
|
53
|
+
end
|
30
54
|
end
|
data/spec/typed_scan_spec.rb
CHANGED
@@ -18,6 +18,12 @@ describe "typed scan" do
|
|
18
18
|
assert_equal 030, z.scan_int
|
19
19
|
end
|
20
20
|
|
21
|
+
it '#scan_int does not use unicode numbers' do
|
22
|
+
z = Zscan.new "一二".force_encoding('utf-8')
|
23
|
+
assert_equal nil, z.scan_int
|
24
|
+
assert_equal 0, z.pos
|
25
|
+
end
|
26
|
+
|
21
27
|
it "#scan_float" do
|
22
28
|
z = Zscan.new " -3.5e23"
|
23
29
|
assert_equal nil, z.scan_float
|
data/spec/zscan_spec.rb
CHANGED
data/zscan.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = "zscan"
|
3
|
-
s.version = "
|
3
|
+
s.version = "2.0" # version mapped from zscan.rb, don't change here
|
4
4
|
s.author = "Zete Lui"
|
5
5
|
s.homepage = "https://github.com/luikore/zscan"
|
6
6
|
s.platform = Gem::Platform::RUBY
|
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.required_ruby_version = ">=1.9.2"
|
10
10
|
s.licenses = ['BSD']
|
11
11
|
|
12
|
-
s.files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,h,c,inc}}')
|
12
|
+
s.files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,h,c,inc},ext/pack/COPYING*}')
|
13
13
|
s.require_paths = ["lib"]
|
14
14
|
s.extensions = ["ext/extconf.rb"]
|
15
15
|
s.rubygems_version = '1.8.24'
|