zscan 1.3 → 2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/benchmark/vs-unpack.rb +5 -4
- data/ext/bspec.c +34 -22
- data/ext/bspec_exec.inc +48 -11
- data/ext/bspec_init.inc +3 -3
- data/ext/extconf.rb +13 -2
- data/ext/pack/COPYING +56 -0
- data/ext/pack/COPYING.ja +51 -0
- data/ext/pack/internal.h +419 -0
- data/ext/pack/pack.c +2295 -0
- data/ext/zscan.c +37 -6
- data/ext/zscan.h +4 -0
- data/generate/bspec.rb +48 -0
- data/generate/bspec_exec.inc +29 -0
- data/generate/bspec_init.inc +2 -0
- data/generate/generate.rb +147 -0
- data/lib/zscan.rb +4 -37
- data/lib/zscan/bspec.rb +168 -0
- data/rakefile +29 -139
- data/readme.md +69 -24
- data/spec/binary_scan_spec.rb +35 -11
- data/spec/typed_scan_spec.rb +6 -0
- data/spec/zscan_spec.rb +8 -0
- data/zscan.gemspec +2 -2
- metadata +11 -3
- data/lib/zscan/instructions.rb +0 -147
data/rakefile
CHANGED
@@ -1,74 +1,28 @@
|
|
1
|
+
require_relative "generate/generate"
|
1
2
|
Dir.chdir __dir__
|
2
|
-
version_re
|
3
|
-
version
|
4
|
-
gem_files
|
5
|
-
gem_package
|
3
|
+
version_re = /\d+(\.\d+)*/
|
4
|
+
version = `command grep 'VERSION =' lib/zscan.rb`[version_re]
|
5
|
+
gem_files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,h,c,inc},ext/pack/COPYING*}')
|
6
|
+
gem_package = "zscan-#{version}.gem"
|
7
|
+
generate_src = [__FILE__, *Dir.glob('generate/*')]
|
6
8
|
|
7
|
-
|
8
|
-
|
9
|
-
if ty =~ /INT8/
|
10
|
-
ty
|
11
|
-
else
|
12
|
-
[ty, "#{ty}_SWAP"]
|
13
|
-
end
|
14
|
-
}
|
15
|
-
def bspec_incr ins
|
16
|
-
case ins
|
17
|
-
when /INT(\d+)/; $1.to_i / 8
|
18
|
-
when /SINGLE/; 4
|
19
|
-
when /DOUBLE/; 8
|
20
|
-
else; raise 'bad'
|
21
|
-
end
|
22
|
-
end
|
23
|
-
def bspec_c_type ins
|
24
|
-
case ins
|
25
|
-
when /(U?INT\d+)/; "#{$1.downcase}_t"
|
26
|
-
when /SINGLE/; 'float'
|
27
|
-
when /DOUBLE/; 'double'
|
28
|
-
else; raise 'bad'
|
29
|
-
end
|
30
|
-
end
|
31
|
-
def bspec_extract ins
|
32
|
-
type = bspec_c_type ins
|
33
|
-
len = bspec_incr(ins) * 8
|
34
|
-
r = "((uint#{len}_t*)s)[0]"
|
35
|
-
if ins.end_with?('SWAP')
|
36
|
-
r = "swap#{len}(#{r})"
|
37
|
-
end
|
38
|
-
"uint#{len}_t r = #{r}"
|
39
|
-
end
|
40
|
-
def bspec_convert ins
|
41
|
-
case ins
|
42
|
-
when /(U)?INT64|UINT32/
|
43
|
-
if ins.start_with?('U')
|
44
|
-
"UINT64toNUM(r)"
|
45
|
-
else
|
46
|
-
"INT64toNUM(CAST(r, int64_t))"
|
47
|
-
end
|
48
|
-
when /INT32/
|
49
|
-
"INT2NUM(CAST(r, int32_t))"
|
50
|
-
when /INT(16|8)/
|
51
|
-
"INT2FIX(CAST(r, #{bspec_c_type ins}))"
|
52
|
-
when /SINGLE/
|
53
|
-
"DBL2NUM((double)CAST(r, float))"
|
54
|
-
when /DOUBLE/
|
55
|
-
"DBL2NUM(CAST(r, double))"
|
56
|
-
else
|
57
|
-
raise 'bad'
|
58
|
-
end
|
59
|
-
end
|
9
|
+
desc "generate, build and test, and pack gem"
|
10
|
+
task :default => [:gen, :test, gem_package]
|
60
11
|
|
61
12
|
desc "build and test"
|
62
|
-
task :
|
63
|
-
|
64
|
-
desc "build and run test"
|
65
|
-
task :test do
|
13
|
+
task :test => 'ext/Makefile' do
|
66
14
|
sh "make -C ext"
|
67
15
|
sh "rspec"
|
68
16
|
end
|
69
17
|
|
18
|
+
file 'ext/Makefile' do
|
19
|
+
Dir.chdir 'ext' do
|
20
|
+
sh "ruby extconf.rb"
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
70
24
|
desc "pack gem"
|
71
|
-
file gem_package => gem_files do
|
25
|
+
file gem_package => gem_files + Generate.files do
|
72
26
|
sh "rm zscan-*.gem"
|
73
27
|
|
74
28
|
new_version = false
|
@@ -82,87 +36,23 @@ file gem_package => gem_files do
|
|
82
36
|
if new_version
|
83
37
|
File.open('zscan.gemspec', 'w'){|f| f << lines.join }
|
84
38
|
end
|
39
|
+
puts "packing files:"
|
40
|
+
puts '-' * 40
|
41
|
+
puts gem_files
|
42
|
+
puts '-' * 40
|
85
43
|
sh "gem build zscan.gemspec"
|
86
44
|
end
|
87
45
|
|
88
46
|
desc "generate files"
|
89
|
-
task :gen =>
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
%Q{BS_#{ins}:
|
99
|
-
{
|
100
|
-
#{bspec_extract ins};
|
101
|
-
rb_ary_push(a, #{bspec_convert ins});
|
102
|
-
s += #{bspec_incr ins};
|
103
|
-
goto **(ip++);
|
104
|
-
}
|
105
|
-
}
|
106
|
-
end.join "\n"
|
107
|
-
|
108
|
-
File.open 'ext/bspec_exec.inc', 'w' do |f|
|
109
|
-
f.puts %Q|// GENERATED WITH: rake gen
|
110
|
-
#line 2 "ext/bspec_exec.inc"
|
111
|
-
__attribute__((__noinline__))
|
112
|
-
static VALUE bspec_exec(void** ip, char* s, VALUE a) {
|
113
|
-
static void* opcodes[] = { &&BS_RET, #{opcode_list} };
|
114
|
-
if (ip == NULL) {
|
115
|
-
return (VALUE)opcodes;
|
116
|
-
}
|
117
|
-
goto **(ip++);
|
118
|
-
BS_RET:
|
119
|
-
return a;
|
120
|
-
#{opcode_segs}
|
121
|
-
}|
|
122
|
-
end
|
123
|
-
end
|
124
|
-
|
125
|
-
file 'ext/bspec_init.inc' => __FILE__ do
|
126
|
-
puts 'generating ext/bspec_init.inc'
|
127
|
-
opcode_incrs = bspec_insns.map{|ins| bspec_incr ins}.join ', '
|
128
|
-
File.open 'ext/bspec_init.inc', 'w' do |f|
|
129
|
-
f.puts "// GENERATED WITH: rake gen"
|
130
|
-
f.puts %Q|static const long bspec_s_sizes[] = {0, #{opcode_incrs}};|
|
131
|
-
f.puts %Q|static const long bspec_opcodes_size = #{bspec_insns.size + 1};|
|
132
|
-
end
|
133
|
-
end
|
134
|
-
|
135
|
-
file 'lib/zscan/instructions.rb' => __FILE__ do
|
136
|
-
puts 'generating lib/zscan/instructions.rb'
|
137
|
-
File.open 'lib/zscan/instructions.rb', 'w' do |f|
|
138
|
-
f.puts "# GENERATED WITH: rake gen"
|
139
|
-
f.puts "class ZScan::BinarySpec"
|
140
|
-
|
141
|
-
bspec_insns.each_with_index do |ins, i|
|
142
|
-
f.puts <<-RUBY
|
143
|
-
def #{ins.downcase} n=1
|
144
|
-
raise ArgumentError, "repeat count should be >= 1, but got \#{n}" if n < 1
|
145
|
-
n.times do
|
146
|
-
append #{i + 1}
|
147
|
-
end
|
148
|
-
end
|
149
|
-
RUBY
|
150
|
-
end
|
151
|
-
|
152
|
-
alias_ins = (bspec_types - ['INT8', 'UINT8']).map &:downcase
|
153
|
-
f.puts " if ZScan::BinarySpec.big_endian?"
|
154
|
-
alias_ins.each do |ins|
|
155
|
-
f.puts " alias #{ins}_be #{ins}"
|
156
|
-
f.puts " alias #{ins}_le #{ins}_swap"
|
157
|
-
end
|
158
|
-
f.puts " else"
|
159
|
-
alias_ins.each do |ins|
|
160
|
-
f.puts " alias #{ins}_le #{ins}"
|
161
|
-
f.puts " alias #{ins}_be #{ins}_swap"
|
47
|
+
task :gen => Generate.files
|
48
|
+
|
49
|
+
Generate.files.each do |name|
|
50
|
+
file name => generate_src do
|
51
|
+
puts "generating #{name}"
|
52
|
+
r = Generate.generate(name)
|
53
|
+
File.open name, 'w' do |f|
|
54
|
+
f.puts "#{name.end_with?('rb') ? '#' : '//'} generated by rake gen"
|
55
|
+
f << r
|
162
56
|
end
|
163
|
-
f.puts " end"
|
164
|
-
swap_ins = alias_ins.map{|ins| "#{ins}_swap"}
|
165
|
-
f.puts " undef #{swap_ins.join ', '}"
|
166
|
-
f.puts "end"
|
167
57
|
end
|
168
58
|
end
|
data/readme.md
CHANGED
@@ -4,7 +4,8 @@
|
|
4
4
|
- `ZScan#pos` is the codepoint position, and `ZScan#bytepos` is byte position.
|
5
5
|
- Correctly scans anchors and look behind predicates.
|
6
6
|
- Pos stack manipulation.
|
7
|
-
- Typed scanning methods: `#scan_float`, `#scan_int radix=nil`, `#scan_date format
|
7
|
+
- Typed scanning methods: `#scan_float`, `#scan_int radix=nil`, `#scan_date format`.
|
8
|
+
- Binary scanning methods: `#scan_bytes spec`, `#unpack format`.
|
8
9
|
|
9
10
|
## Install
|
10
11
|
|
@@ -50,22 +51,20 @@ See also https://bugs.ruby-lang.org/issues/7092
|
|
50
51
|
## Other motivations
|
51
52
|
|
52
53
|
- For scan and convert, ruby's stdlib `Scanf` is slow (it creates a regexp array every time it is called) and cannot cooperate with a scanner.
|
53
|
-
- For date parsing, `strptime` doesn't tell the parsed length.
|
54
|
-
- For binary parsing, `unpack` is an slow interpreter, and the instructions are quite irregular.
|
54
|
+
- For date parsing, `Date#strptime` doesn't tell the parsed length.
|
55
|
+
- For binary parsing, `String#unpack` is a slow interpreter, it doesn't tell the parsed length either, and the instructions are quite irregular.
|
55
56
|
|
56
57
|
## Essential methods
|
57
58
|
|
58
59
|
- `ZScan.new string, dup=false`
|
59
60
|
- `#scan regexp_or_string`
|
60
|
-
- `#skip regexp_or_string`
|
61
|
+
- `#skip regexp_or_string` return new byte pos or `nil`
|
61
62
|
- `#match_bytesize regexp_or_string` return length of matched bytes or `nil`.
|
62
63
|
- `#slice n` slice sub string of n chars from current pos, advances the cursor.
|
63
64
|
- `#byteslice n` slice sub string of n bytes from cursor pos, advances the cursor.
|
64
65
|
- `#scan_float` scan a float number which is not starting with space. It deals with multibyte encodings for you.
|
65
66
|
- `#scan_int radix=nil` if radix is nil, decide base by prefix: `0x` is 16, `0` is 8, `0b` is 2, otherwise 10. `radix` should be in range `2..36`.
|
66
67
|
- `#scan_date format_string, start=Date::ITALY` scan a `DateTime` object, see also [strptime](http://rubydoc.info/stdlib/date/DateTime.strptime).
|
67
|
-
- `#scan_binary binary_spec` optimized and readable binary scan, see below for how to create a `ZScan::BinarySpec`.
|
68
|
-
- `#unpack format_string`
|
69
68
|
- `#eos?`
|
70
69
|
- `#string` note: return a dup. Don't worry the performance because it is a copy-on-write string.
|
71
70
|
- `#rest` rest unscanned sub string.
|
@@ -80,6 +79,7 @@ For convienience
|
|
80
79
|
- `#[]= range, replace_string` note: if `range` starts before pos, moves pos left, also clears the stack.
|
81
80
|
- `#size`
|
82
81
|
- `#bytesize`
|
82
|
+
- `#cleanup` cleanup substring before current pos.
|
83
83
|
|
84
84
|
## Pos management
|
85
85
|
|
@@ -92,53 +92,70 @@ For convienience
|
|
92
92
|
- `#reset` go to beginning.
|
93
93
|
- `#terminate` go to end of string.
|
94
94
|
|
95
|
-
## Binary
|
95
|
+
## Binary scanning
|
96
96
|
|
97
|
-
|
97
|
+
- `#scan_bytes bspec` optimized and readable binary scan, see below for how to create a `ZScan::BSpec`.
|
98
|
+
- `#unpack unpack_format_string` note that it always returns an array no matter matched or not (same behavior as `String#unpack`).
|
99
|
+
|
100
|
+
#### Bytes spec
|
101
|
+
|
102
|
+
Bytes spec is designed for fast binary protocol parsing. You can specify a sequence of binary data and how to expect the matching.
|
103
|
+
|
104
|
+
Unlike `#unpack`, bytes spec uses English names to specify the data sequence. It returns `nil` if any of the instructions does not match. However, there are no string, position-changing, or variable-length instructions.
|
105
|
+
|
106
|
+
Bytes spec is implemented as a direct-threaded VM, so it is faster than `#unpack`.
|
107
|
+
|
108
|
+
Example:
|
98
109
|
|
99
110
|
```ruby
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
111
|
+
s = ZScan::BSpec.new do
|
112
|
+
int8 expect: -1 # return nil if the first int8 is not -1
|
113
|
+
2.times{
|
114
|
+
uint32_le # le means: little endian
|
115
|
+
}
|
116
|
+
double_be # be means: big endian
|
105
117
|
end
|
118
|
+
|
106
119
|
z = ZScan.new [-1, 2, 3, 4.0].pack('cI<2G') + "rest"
|
107
|
-
z.
|
120
|
+
z.scan_bytes s #=> [-1, 2, 3, 4.0]
|
108
121
|
z.rest #=> 'rest'
|
122
|
+
|
123
|
+
bad_z = ZScan.new [1, 2, 3, 4.0].pack('cI<2G') # first byte does not match
|
124
|
+
bad_z.scan_bytes s #=> nil
|
109
125
|
```
|
110
126
|
|
111
127
|
Integer instructions:
|
112
128
|
|
113
129
|
```ruby
|
114
|
-
int8 uint8
|
130
|
+
int8 uint8 byte # byte is the same as uint8
|
115
131
|
int16 uint16 int16_le uint16_le int16_be uint16_be
|
116
132
|
int32 uint32 int32_le uint32_le int32_be uint32_be
|
117
133
|
int64 uint64 int64_le uint64_le int64_be uint64_be
|
118
134
|
```
|
119
135
|
|
120
|
-
|
136
|
+
Only integer instructions support the `:expect` option. Matching stops immediately if the scanned result is not equal to the expected number.
|
137
|
+
|
138
|
+
Double precision float instructions:
|
121
139
|
|
122
140
|
```ruby
|
123
|
-
|
141
|
+
double double_le double_be
|
124
142
|
```
|
125
143
|
|
126
|
-
|
144
|
+
Single precision float instructions:
|
127
145
|
|
128
146
|
```ruby
|
129
|
-
|
147
|
+
float float_le float_be
|
148
|
+
single single_le single_be # same as float*
|
130
149
|
```
|
131
150
|
|
132
|
-
|
151
|
+
Note that ruby floats are in fact doubles. In the very rare case where you need to keep the original single-precision data instead of converting it into a double, you can use `uint32` for the job.
|
152
|
+
|
153
|
+
A note on endians:
|
133
154
|
|
134
155
|
- (without endian suffix) native endian
|
135
156
|
- `*_le` little endian (VAX, x86, Windows string code unit)
|
136
157
|
- `*_be` big endian, network endian (SPARC, Java string code unit)
|
137
158
|
|
138
|
-
Repeat count must be integer `>= 1`, default is `1`.
|
139
|
-
|
140
|
-
It is implemented as a direct-threaded bytecode interpreter. A bit faster than `String#unpack`.
|
141
|
-
|
142
159
|
## Parsing combinators
|
143
160
|
|
144
161
|
Combinators that manage scanner pos and stack state for you. In the combinators, if the returned value of the given block is `nil` or `false`, stops iteration and restores scanner location. Can be nested, useful for building parsers.
|
@@ -157,6 +174,34 @@ Combinators that manage scanner pos and stack state for you. In the combinators,
|
|
157
174
|
- `#clear_pos_stack` clear pos stack.
|
158
175
|
- `z.push._try expr` equivalent to `z.try{ expr }`, but faster because no block is required
|
159
176
|
|
177
|
+
## Features QA
|
178
|
+
|
179
|
+
#### `scan_until` and `skip_until`?
|
180
|
+
|
181
|
+
For example, the StringScanner call
|
182
|
+
|
183
|
+
```ruby
|
184
|
+
strscan.scan_until /a/
|
185
|
+
```
|
186
|
+
|
187
|
+
Is equivalent to a slightly different regexp
|
188
|
+
|
189
|
+
```ruby
|
190
|
+
zscan.scan /.*a/m
|
191
|
+
```
|
192
|
+
|
193
|
+
#### Capture groups?
|
194
|
+
|
195
|
+
Not implemented yet. Maybe future?
|
196
|
+
|
197
|
+
#### `unscan`?
|
198
|
+
|
199
|
+
Use pos management methods.
|
200
|
+
|
201
|
+
#### Erlang style bitstring?
|
202
|
+
|
203
|
+
Thought of that but the API can be quirky... It's way beyond a string scanner.
|
204
|
+
|
160
205
|
## License
|
161
206
|
|
162
207
|
```
|
data/spec/binary_scan_spec.rb
CHANGED
@@ -4,27 +4,51 @@ require_relative "spec_helper"
|
|
4
4
|
describe 'ZScan binary scanning methods' do
|
5
5
|
it "#unpack" do
|
6
6
|
z = ZScan.new "\x01\x02\x03"
|
7
|
-
assert_raise ArgumentError do
|
8
|
-
z.unpack '@1C'
|
9
|
-
end
|
10
7
|
assert_equal [1, 2], (z.unpack 'CC')
|
11
8
|
assert_equal 2, z.pos
|
12
|
-
assert_equal nil, (z.unpack 'I')
|
9
|
+
assert_equal [nil], (z.unpack 'I')
|
13
10
|
assert_equal 2, z.pos
|
14
11
|
end
|
15
12
|
|
16
|
-
it "#
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
13
|
+
it "#unpack position-changing instructions and var-length instructions" do
|
14
|
+
z = ZScan.new "abcd\0abc"
|
15
|
+
s, _ = z.unpack 'Z*'
|
16
|
+
assert_equal "abcd", s
|
17
|
+
assert_equal 5, z.pos
|
18
|
+
|
19
|
+
z.reset
|
20
|
+
s, _ = z.unpack '@2Z*'
|
21
|
+
assert_equal 'cd', s
|
22
|
+
end
|
23
|
+
|
24
|
+
it "#scan_bytes" do
|
25
|
+
s = ZScan::BSpec.new do
|
26
|
+
int8
|
27
|
+
2.times{ uint32_le } # little endian
|
28
|
+
double_be # big endian
|
29
|
+
single
|
22
30
|
end
|
23
31
|
|
24
32
|
a = [-1, 2, 3, 4.0, 3.0]
|
25
33
|
z = ZScan.new(a.pack('cI<2Gf') + 'rest')
|
26
|
-
b = z.
|
34
|
+
b = z.scan_bytes s
|
27
35
|
assert_equal 'rest', z.rest
|
28
36
|
assert_equal a, b
|
29
37
|
end
|
38
|
+
|
39
|
+
it "#scan_bytes with expectation" do
|
40
|
+
s = ZScan::BSpec.new do
|
41
|
+
int8 expect: 3
|
42
|
+
float
|
43
|
+
end
|
44
|
+
|
45
|
+
a = [3, 4.0]
|
46
|
+
z = ZScan.new a.pack('cf')
|
47
|
+
assert_equal a, z.scan_bytes(s)
|
48
|
+
|
49
|
+
a = [2, 4.0]
|
50
|
+
z = ZScan.new a.pack('cf')
|
51
|
+
assert_equal nil, z.scan_bytes(s)
|
52
|
+
assert_equal 0, z.pos
|
53
|
+
end
|
30
54
|
end
|
data/spec/typed_scan_spec.rb
CHANGED
@@ -18,6 +18,12 @@ describe "typed scan" do
|
|
18
18
|
assert_equal 030, z.scan_int
|
19
19
|
end
|
20
20
|
|
21
|
+
it '#scan_int does not use unicode numbers' do
|
22
|
+
z = Zscan.new "一二".force_encoding('utf-8')
|
23
|
+
assert_equal nil, z.scan_int
|
24
|
+
assert_equal 0, z.pos
|
25
|
+
end
|
26
|
+
|
21
27
|
it "#scan_float" do
|
22
28
|
z = Zscan.new " -3.5e23"
|
23
29
|
assert_equal nil, z.scan_float
|
data/spec/zscan_spec.rb
CHANGED
data/zscan.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = "zscan"
|
3
|
-
s.version = "
|
3
|
+
s.version = "2.0" # version mapped from zscan.rb, don't change here
|
4
4
|
s.author = "Zete Lui"
|
5
5
|
s.homepage = "https://github.com/luikore/zscan"
|
6
6
|
s.platform = Gem::Platform::RUBY
|
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.required_ruby_version = ">=1.9.2"
|
10
10
|
s.licenses = ['BSD']
|
11
11
|
|
12
|
-
s.files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,h,c,inc}}')
|
12
|
+
s.files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,h,c,inc},ext/pack/COPYING*}')
|
13
13
|
s.require_paths = ["lib"]
|
14
14
|
s.extensions = ["ext/extconf.rb"]
|
15
15
|
s.rubygems_version = '1.8.24'
|