zscan 1.3 → 2.0

Sign up to get free protection for your applications and to get access to all the features.
data/rakefile CHANGED
@@ -1,74 +1,28 @@
1
+ require_relative "generate/generate"
1
2
  Dir.chdir __dir__
2
- version_re = /\d+(\.\d+)*/
3
- version = `command grep 'VERSION =' lib/zscan.rb`[version_re]
4
- gem_files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,c}}')
5
- gem_package = "zscan-#{version}.gem"
3
+ version_re = /\d+(\.\d+)*/
4
+ version = `command grep 'VERSION =' lib/zscan.rb`[version_re]
5
+ gem_files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,h,c,inc},ext/pack/COPYING*}')
6
+ gem_package = "zscan-#{version}.gem"
7
+ generate_src = [__FILE__, *Dir.glob('generate/*')]
6
8
 
7
- bspec_types = %w[INT8 INT16 INT32 INT64 UINT8 UINT16 UINT32 UINT64 SINGLE DOUBLE]
8
- bspec_insns = bspec_types.flat_map{|ty|
9
- if ty =~ /INT8/
10
- ty
11
- else
12
- [ty, "#{ty}_SWAP"]
13
- end
14
- }
15
- def bspec_incr ins
16
- case ins
17
- when /INT(\d+)/; $1.to_i / 8
18
- when /SINGLE/; 4
19
- when /DOUBLE/; 8
20
- else; raise 'bad'
21
- end
22
- end
23
- def bspec_c_type ins
24
- case ins
25
- when /(U?INT\d+)/; "#{$1.downcase}_t"
26
- when /SINGLE/; 'float'
27
- when /DOUBLE/; 'double'
28
- else; raise 'bad'
29
- end
30
- end
31
- def bspec_extract ins
32
- type = bspec_c_type ins
33
- len = bspec_incr(ins) * 8
34
- r = "((uint#{len}_t*)s)[0]"
35
- if ins.end_with?('SWAP')
36
- r = "swap#{len}(#{r})"
37
- end
38
- "uint#{len}_t r = #{r}"
39
- end
40
- def bspec_convert ins
41
- case ins
42
- when /(U)?INT64|UINT32/
43
- if ins.start_with?('U')
44
- "UINT64toNUM(r)"
45
- else
46
- "INT64toNUM(CAST(r, int64_t))"
47
- end
48
- when /INT32/
49
- "INT2NUM(CAST(r, int32_t))"
50
- when /INT(16|8)/
51
- "INT2FIX(CAST(r, #{bspec_c_type ins}))"
52
- when /SINGLE/
53
- "DBL2NUM((double)CAST(r, float))"
54
- when /DOUBLE/
55
- "DBL2NUM(CAST(r, double))"
56
- else
57
- raise 'bad'
58
- end
59
- end
9
+ desc "generate, build and test, and pack gem"
10
+ task :default => [:gen, :test, gem_package]
60
11
 
61
12
  desc "build and test"
62
- task :default => [:test, gem_package]
63
-
64
- desc "build and run test"
65
- task :test do
13
+ task :test => 'ext/Makefile' do
66
14
  sh "make -C ext"
67
15
  sh "rspec"
68
16
  end
69
17
 
18
+ file 'ext/Makefile' do
19
+ Dir.chdir 'ext' do
20
+ sh "ruby extconf.rb"
21
+ end
22
+ end
23
+
70
24
  desc "pack gem"
71
- file gem_package => gem_files do
25
+ file gem_package => gem_files + Generate.files do
72
26
  sh "rm zscan-*.gem"
73
27
 
74
28
  new_version = false
@@ -82,87 +36,23 @@ file gem_package => gem_files do
82
36
  if new_version
83
37
  File.open('zscan.gemspec', 'w'){|f| f << lines.join }
84
38
  end
39
+ puts "packing files:"
40
+ puts '-' * 40
41
+ puts gem_files
42
+ puts '-' * 40
85
43
  sh "gem build zscan.gemspec"
86
44
  end
87
45
 
88
46
  desc "generate files"
89
- task :gen => %w[ext/bspec_exec.inc ext/bspec_init.inc lib/zscan/instructions.rb]
90
-
91
- file 'ext/bspec_exec.inc' => __FILE__ do
92
- puts "generating ext/bspec_exec.inc"
93
- opcode_list = bspec_insns.map do |ins|
94
- "&&BS_#{ins}"
95
- end.join ', '
96
-
97
- opcode_segs = bspec_insns.map do |ins|
98
- %Q{BS_#{ins}:
99
- {
100
- #{bspec_extract ins};
101
- rb_ary_push(a, #{bspec_convert ins});
102
- s += #{bspec_incr ins};
103
- goto **(ip++);
104
- }
105
- }
106
- end.join "\n"
107
-
108
- File.open 'ext/bspec_exec.inc', 'w' do |f|
109
- f.puts %Q|// GENERATED WITH: rake gen
110
- #line 2 "ext/bspec_exec.inc"
111
- __attribute__((__noinline__))
112
- static VALUE bspec_exec(void** ip, char* s, VALUE a) {
113
- static void* opcodes[] = { &&BS_RET, #{opcode_list} };
114
- if (ip == NULL) {
115
- return (VALUE)opcodes;
116
- }
117
- goto **(ip++);
118
- BS_RET:
119
- return a;
120
- #{opcode_segs}
121
- }|
122
- end
123
- end
124
-
125
- file 'ext/bspec_init.inc' => __FILE__ do
126
- puts 'generating ext/bspec_init.inc'
127
- opcode_incrs = bspec_insns.map{|ins| bspec_incr ins}.join ', '
128
- File.open 'ext/bspec_init.inc', 'w' do |f|
129
- f.puts "// GENERATED WITH: rake gen"
130
- f.puts %Q|static const long bspec_s_sizes[] = {0, #{opcode_incrs}};|
131
- f.puts %Q|static const long bspec_opcodes_size = #{bspec_insns.size + 1};|
132
- end
133
- end
134
-
135
- file 'lib/zscan/instructions.rb' => __FILE__ do
136
- puts 'generating lib/zscan/instructions.rb'
137
- File.open 'lib/zscan/instructions.rb', 'w' do |f|
138
- f.puts "# GENERATED WITH: rake gen"
139
- f.puts "class ZScan::BinarySpec"
140
-
141
- bspec_insns.each_with_index do |ins, i|
142
- f.puts <<-RUBY
143
- def #{ins.downcase} n=1
144
- raise ArgumentError, "repeat count should be >= 1, but got \#{n}" if n < 1
145
- n.times do
146
- append #{i + 1}
147
- end
148
- end
149
- RUBY
150
- end
151
-
152
- alias_ins = (bspec_types - ['INT8', 'UINT8']).map &:downcase
153
- f.puts " if ZScan::BinarySpec.big_endian?"
154
- alias_ins.each do |ins|
155
- f.puts " alias #{ins}_be #{ins}"
156
- f.puts " alias #{ins}_le #{ins}_swap"
157
- end
158
- f.puts " else"
159
- alias_ins.each do |ins|
160
- f.puts " alias #{ins}_le #{ins}"
161
- f.puts " alias #{ins}_be #{ins}_swap"
47
+ task :gen => Generate.files
48
+
49
+ Generate.files.each do |name|
50
+ file name => generate_src do
51
+ puts "generating #{name}"
52
+ r = Generate.generate(name)
53
+ File.open name, 'w' do |f|
54
+ f.puts "#{name.end_with?('rb') ? '#' : '//'} generated by rake gen"
55
+ f << r
162
56
  end
163
- f.puts " end"
164
- swap_ins = alias_ins.map{|ins| "#{ins}_swap"}
165
- f.puts " undef #{swap_ins.join ', '}"
166
- f.puts "end"
167
57
  end
168
58
  end
data/readme.md CHANGED
@@ -4,7 +4,8 @@
4
4
  - `ZScan#pos` is the codepoint position, and `ZScan#bytepos` is byte position.
5
5
  - Correctly scans anchors and look behind predicates.
6
6
  - Pos stack manipulation.
7
- - Typed scanning methods: `#scan_float`, `#scan_int radix=nil`, `#scan_date format`, `#scan_binary format`.
7
+ - Typed scanning methods: `#scan_float`, `#scan_int radix=nil`, `#scan_date format`.
8
+ - Binary scanning methods: `#scan_bytes spec`, `#unpack format`.
8
9
 
9
10
  ## Install
10
11
 
@@ -50,22 +51,20 @@ See also https://bugs.ruby-lang.org/issues/7092
50
51
  ## Other motivations
51
52
 
52
53
  - For scan and convert, ruby's stdlib `Scanf` is slow (it creates a regexp array every time it is called) and cannot cooperate with a scanner.
53
- - For date parsing, `strptime` doesn't tell the parsed length.
54
- - For binary parsing, `unpack` is an slow interpreter, and the instructions are quite irregular.
54
+ - For date parsing, `Date#strptime` doesn't tell the parsed length.
55
+ - For binary parsing, `String#unpack` is a slow interpreter, it doesn't tell the parsed length either, and the instructions are quite irregular.
55
56
 
56
57
  ## Essential methods
57
58
 
58
59
  - `ZScan.new string, dup=false`
59
60
  - `#scan regexp_or_string`
60
- - `#skip regexp_or_string`
61
+ - `#skip regexp_or_string` return new byte pos or `nil`
61
62
  - `#match_bytesize regexp_or_string` return length of matched bytes or `nil`.
62
63
  - `#slice n` slice sub string of n chars from current pos, advances the cursor.
63
64
  - `#byteslice n` slice sub string of n bytes from cursor pos, advances the cursor.
64
65
  - `#scan_float` scan a float number which is not starting with space. It deals with multibyte encodings for you.
65
66
  - `#scan_int radix=nil` if radix is nil, decide base by prefix: `0x` is 16, `0` is 8, `0b` is 2, otherwise 10. `radix` should be in range `2..36`.
66
67
  - `#scan_date format_string, start=Date::ITALY` scan a `DateTime` object, see also [strptime](http://rubydoc.info/stdlib/date/DateTime.strptime).
67
- - `#scan_binary binary_spec` optimized and readable binary scan, see below for how to create a `ZScan::BinarySpec`.
68
- - `#unpack format_string`
69
68
  - `#eos?`
70
69
  - `#string` note: return a dup. Don't worry the performance because it is a copy-on-write string.
71
70
  - `#rest` rest unscanned sub string.
@@ -80,6 +79,7 @@ For convienience
80
79
  - `#[]= range, replace_string` note: if `range` starts before pos, moves pos left, also clears the stack.
81
80
  - `#size`
82
81
  - `#bytesize`
82
+ - `#cleanup` cleanup substring before current pos.
83
83
 
84
84
  ## Pos management
85
85
 
@@ -92,53 +92,70 @@ For convienience
92
92
  - `#reset` go to beginning.
93
93
  - `#terminate` go to end of string.
94
94
 
95
- ## Binary parsing
95
+ ## Binary scanning
96
96
 
97
- Specify a sequence of binary data. Designed for binary protocol parsing. Example:
97
+ - `#scan_bytes bspec` optimized and readable binary scan, see below for how to create a `ZScan::BSpec`.
98
+ - `#unpack unpack_format_string` note that it always returns an array whether or not the format matched (same behavior as `String#unpack`).
99
+
100
+ #### Bytes spec
101
+
102
+ Bytes spec is designed for fast binary protocol parsing. You can specify a sequence of binary data and how to expect the matching.
103
+
104
+ Unlike `#unpack`, bytes spec uses English names to specify the data sequence. It returns `nil` if any of the instructions does not match. However, there are no string, position-changing, or variable-length instructions.
105
+
106
+ Bytes spec is implemented as a direct-threaded VM; it is faster than `#unpack`.
107
+
108
+ Example:
98
109
 
99
110
  ```ruby
100
- # create a ZScan::BinarySpec
101
- s = ZScan.binary_spec do
102
- int8 # once
103
- uint32_le 2 # little endian, twice
104
- double_be 1 # big endian, once
111
+ s = ZScan::BSpec.new do
112
+ int8 expect: -1 # return nil if the first int8 is not -1
113
+ 2.times{
114
+ uint32_le # le means: little endian
115
+ }
116
+ double_be # be means: big endian
105
117
  end
118
+
106
119
  z = ZScan.new [-1, 2, 3, 4.0].pack('cI<2G') + "rest"
107
- z.scan_binary s #=> [-1, 2, 3, 4.0]
120
+ z.scan_bytes s #=> [-1, 2, 3, 4.0]
108
121
  z.rest #=> 'rest'
122
+
123
+ bad_z = ZScan.new [1, 2, 3, 4.0].pack('cI<2G') # first byte does not match
124
+ bad_z.scan_bytes s #=> nil
109
125
  ```
110
126
 
111
127
  Integer instructions:
112
128
 
113
129
  ```ruby
114
- int8 uint8
130
+ int8 uint8 byte # byte is the same as uint8
115
131
  int16 uint16 int16_le uint16_le int16_be uint16_be
116
132
  int32 uint32 int32_le uint32_le int32_be uint32_be
117
133
  int64 uint64 int64_le uint64_le int64_be uint64_be
118
134
  ```
119
135
 
120
- Single precision float instructions:
136
+ Only integer instructions support the `:expect` option; matching stops immediately if the scanned result is not equal to the expected number.
137
+
138
+ Double precision float instructions:
121
139
 
122
140
  ```ruby
123
- single single_le single_be
141
+ double double_le double_be
124
142
  ```
125
143
 
126
- Double precision float instructions:
144
+ Single precision float instructions:
127
145
 
128
146
  ```ruby
129
- double double_le double_be
147
+ float float_le float_be
148
+ single single_le single_be # same as float*
130
149
  ```
131
150
 
132
- Endians:
151
+ Note that Ruby floats are in fact doubles. In the rare case that you need to keep the original single-precision data instead of converting it into doubles, you can use `uint32` for the job.
152
+
153
+ A note on endians:
133
154
 
134
155
  - (without endian suffix) native endian
135
156
  - `*_le` little endian (VAX, x86, Windows string code unit)
136
157
  - `*_be` big endian, network endian (SPARC, Java string code unit)
137
158
 
138
- Repeat count must be integer `>= 1`, default is `1`.
139
-
140
- It is implemented as a direct-threaded bytecode interpreter. A bit faster than `String#unpack`.
141
-
142
159
  ## Parsing combinators
143
160
 
144
161
  Combinators that manage scanner pos and stack state for you. In the combinators, if the returned value of the given block is `nil` or `false`, stops iteration and restores scanner location. Can be nested, useful for building parsers.
@@ -157,6 +174,34 @@ Combinators that manage scanner pos and stack state for you. In the combinators,
157
174
  - `#clear_pos_stack` clear pos stack.
158
175
  - `z.push._try expr` equivalent to `z.try{ expr }`, but faster because no block is required
159
176
 
177
+ ## Features QA
178
+
179
+ #### `scan_until` and `skip_until`?
180
+
181
+ For example, the StringScanner call
182
+
183
+ ```ruby
184
+ strscan.scan_until /a/
185
+ ```
186
+
187
+ Is equivalent to a slightly different regexp
188
+
189
+ ```ruby
190
+ zscan.scan /.*a/m
191
+ ```
192
+
193
+ #### Capture groups?
194
+
195
+ Not implemented yet. Maybe future?
196
+
197
+ #### `unscan`?
198
+
199
+ Use pos management methods.
200
+
201
+ #### Erlang style bitstring?
202
+
203
+ Thought of that but the API can be quirky... It's way beyond a string scanner.
204
+
160
205
  ## License
161
206
 
162
207
  ```
@@ -4,27 +4,51 @@ require_relative "spec_helper"
4
4
  describe 'ZScan binary scanning methods' do
5
5
  it "#unpack" do
6
6
  z = ZScan.new "\x01\x02\x03"
7
- assert_raise ArgumentError do
8
- z.unpack '@1C'
9
- end
10
7
  assert_equal [1, 2], (z.unpack 'CC')
11
8
  assert_equal 2, z.pos
12
- assert_equal nil, (z.unpack 'I')
9
+ assert_equal [nil], (z.unpack 'I')
13
10
  assert_equal 2, z.pos
14
11
  end
15
12
 
16
- it "#scan_binary" do
17
- s = ZScan.binary_spec do
18
- int8 # once
19
- uint32_le 2 # little endian, twice
20
- double_be 1 # big endian, once
21
- single 1
13
+ it "#unpack position-changing instructions and var-length instructions" do
14
+ z = ZScan.new "abcd\0abc"
15
+ s, _ = z.unpack 'Z*'
16
+ assert_equal "abcd", s
17
+ assert_equal 5, z.pos
18
+
19
+ z.reset
20
+ s, _ = z.unpack '@2Z*'
21
+ assert_equal 'cd', s
22
+ end
23
+
24
+ it "#scan_bytes" do
25
+ s = ZScan::BSpec.new do
26
+ int8
27
+ 2.times{ uint32_le } # little endian
28
+ double_be # big endian
29
+ single
22
30
  end
23
31
 
24
32
  a = [-1, 2, 3, 4.0, 3.0]
25
33
  z = ZScan.new(a.pack('cI<2Gf') + 'rest')
26
- b = z.scan_binary s
34
+ b = z.scan_bytes s
27
35
  assert_equal 'rest', z.rest
28
36
  assert_equal a, b
29
37
  end
38
+
39
+ it "#scan_bytes with expectation" do
40
+ s = ZScan::BSpec.new do
41
+ int8 expect: 3
42
+ float
43
+ end
44
+
45
+ a = [3, 4.0]
46
+ z = ZScan.new a.pack('cf')
47
+ assert_equal a, z.scan_bytes(s)
48
+
49
+ a = [2, 4.0]
50
+ z = ZScan.new a.pack('cf')
51
+ assert_equal nil, z.scan_bytes(s)
52
+ assert_equal 0, z.pos
53
+ end
30
54
  end
@@ -18,6 +18,12 @@ describe "typed scan" do
18
18
  assert_equal 030, z.scan_int
19
19
  end
20
20
 
21
+ it '#scan_int does not use unicode numbers' do
22
+ z = Zscan.new "一二".force_encoding('utf-8')
23
+ assert_equal nil, z.scan_int
24
+ assert_equal 0, z.pos
25
+ end
26
+
21
27
  it "#scan_float" do
22
28
  z = Zscan.new " -3.5e23"
23
29
  assert_equal nil, z.scan_float
data/spec/zscan_spec.rb CHANGED
@@ -96,4 +96,12 @@ describe ZScan do
96
96
  z.pos = 2
97
97
  assert_equal 1, z.line_index
98
98
  end
99
+
100
+ it '#cleanup' do
101
+ @z.scan /\w/
102
+ @z.cleanup
103
+ assert_equal 'b你好', @z.string
104
+ assert_equal 0, @z.pos
105
+ assert_equal 0, @z.bytepos
106
+ end
99
107
  end
data/zscan.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "zscan"
3
- s.version = "1.3" # version mapped from zscan.rb, don't change here
3
+ s.version = "2.0" # version mapped from zscan.rb, don't change here
4
4
  s.author = "Zete Lui"
5
5
  s.homepage = "https://github.com/luikore/zscan"
6
6
  s.platform = Gem::Platform::RUBY
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
9
9
  s.required_ruby_version = ">=1.9.2"
10
10
  s.licenses = ['BSD']
11
11
 
12
- s.files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,h,c,inc}}')
12
+ s.files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,h,c,inc},ext/pack/COPYING*}')
13
13
  s.require_paths = ["lib"]
14
14
  s.extensions = ["ext/extconf.rb"]
15
15
  s.rubygems_version = '1.8.24'