zscan 1.3 → 2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/rakefile CHANGED
@@ -1,74 +1,28 @@
1
+ require_relative "generate/generate"
1
2
  Dir.chdir __dir__
2
- version_re = /\d+(\.\d+)*/
3
- version = `command grep 'VERSION =' lib/zscan.rb`[version_re]
4
- gem_files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,c}}')
5
- gem_package = "zscan-#{version}.gem"
3
+ version_re = /\d+(\.\d+)*/
4
+ version = `command grep 'VERSION =' lib/zscan.rb`[version_re]
5
+ gem_files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,h,c,inc},ext/pack/COPYING*}')
6
+ gem_package = "zscan-#{version}.gem"
7
+ generate_src = [__FILE__, *Dir.glob('generate/*')]
6
8
 
7
- bspec_types = %w[INT8 INT16 INT32 INT64 UINT8 UINT16 UINT32 UINT64 SINGLE DOUBLE]
8
- bspec_insns = bspec_types.flat_map{|ty|
9
- if ty =~ /INT8/
10
- ty
11
- else
12
- [ty, "#{ty}_SWAP"]
13
- end
14
- }
15
- def bspec_incr ins
16
- case ins
17
- when /INT(\d+)/; $1.to_i / 8
18
- when /SINGLE/; 4
19
- when /DOUBLE/; 8
20
- else; raise 'bad'
21
- end
22
- end
23
- def bspec_c_type ins
24
- case ins
25
- when /(U?INT\d+)/; "#{$1.downcase}_t"
26
- when /SINGLE/; 'float'
27
- when /DOUBLE/; 'double'
28
- else; raise 'bad'
29
- end
30
- end
31
- def bspec_extract ins
32
- type = bspec_c_type ins
33
- len = bspec_incr(ins) * 8
34
- r = "((uint#{len}_t*)s)[0]"
35
- if ins.end_with?('SWAP')
36
- r = "swap#{len}(#{r})"
37
- end
38
- "uint#{len}_t r = #{r}"
39
- end
40
- def bspec_convert ins
41
- case ins
42
- when /(U)?INT64|UINT32/
43
- if ins.start_with?('U')
44
- "UINT64toNUM(r)"
45
- else
46
- "INT64toNUM(CAST(r, int64_t))"
47
- end
48
- when /INT32/
49
- "INT2NUM(CAST(r, int32_t))"
50
- when /INT(16|8)/
51
- "INT2FIX(CAST(r, #{bspec_c_type ins}))"
52
- when /SINGLE/
53
- "DBL2NUM((double)CAST(r, float))"
54
- when /DOUBLE/
55
- "DBL2NUM(CAST(r, double))"
56
- else
57
- raise 'bad'
58
- end
59
- end
9
+ desc "generate, build and test, and pack gem"
10
+ task :default => [:gen, :test, gem_package]
60
11
 
61
12
  desc "build and test"
62
- task :default => [:test, gem_package]
63
-
64
- desc "build and run test"
65
- task :test do
13
+ task :test => 'ext/Makefile' do
66
14
  sh "make -C ext"
67
15
  sh "rspec"
68
16
  end
69
17
 
18
+ file 'ext/Makefile' do
19
+ Dir.chdir 'ext' do
20
+ sh "ruby extconf.rb"
21
+ end
22
+ end
23
+
70
24
  desc "pack gem"
71
- file gem_package => gem_files do
25
+ file gem_package => gem_files + Generate.files do
72
26
  sh "rm zscan-*.gem"
73
27
 
74
28
  new_version = false
@@ -82,87 +36,23 @@ file gem_package => gem_files do
82
36
  if new_version
83
37
  File.open('zscan.gemspec', 'w'){|f| f << lines.join }
84
38
  end
39
+ puts "packing files:"
40
+ puts '-' * 40
41
+ puts gem_files
42
+ puts '-' * 40
85
43
  sh "gem build zscan.gemspec"
86
44
  end
87
45
 
88
46
  desc "generate files"
89
- task :gen => %w[ext/bspec_exec.inc ext/bspec_init.inc lib/zscan/instructions.rb]
90
-
91
- file 'ext/bspec_exec.inc' => __FILE__ do
92
- puts "generating ext/bspec_exec.inc"
93
- opcode_list = bspec_insns.map do |ins|
94
- "&&BS_#{ins}"
95
- end.join ', '
96
-
97
- opcode_segs = bspec_insns.map do |ins|
98
- %Q{BS_#{ins}:
99
- {
100
- #{bspec_extract ins};
101
- rb_ary_push(a, #{bspec_convert ins});
102
- s += #{bspec_incr ins};
103
- goto **(ip++);
104
- }
105
- }
106
- end.join "\n"
107
-
108
- File.open 'ext/bspec_exec.inc', 'w' do |f|
109
- f.puts %Q|// GENERATED WITH: rake gen
110
- #line 2 "ext/bspec_exec.inc"
111
- __attribute__((__noinline__))
112
- static VALUE bspec_exec(void** ip, char* s, VALUE a) {
113
- static void* opcodes[] = { &&BS_RET, #{opcode_list} };
114
- if (ip == NULL) {
115
- return (VALUE)opcodes;
116
- }
117
- goto **(ip++);
118
- BS_RET:
119
- return a;
120
- #{opcode_segs}
121
- }|
122
- end
123
- end
124
-
125
- file 'ext/bspec_init.inc' => __FILE__ do
126
- puts 'generating ext/bspec_init.inc'
127
- opcode_incrs = bspec_insns.map{|ins| bspec_incr ins}.join ', '
128
- File.open 'ext/bspec_init.inc', 'w' do |f|
129
- f.puts "// GENERATED WITH: rake gen"
130
- f.puts %Q|static const long bspec_s_sizes[] = {0, #{opcode_incrs}};|
131
- f.puts %Q|static const long bspec_opcodes_size = #{bspec_insns.size + 1};|
132
- end
133
- end
134
-
135
- file 'lib/zscan/instructions.rb' => __FILE__ do
136
- puts 'generating lib/zscan/instructions.rb'
137
- File.open 'lib/zscan/instructions.rb', 'w' do |f|
138
- f.puts "# GENERATED WITH: rake gen"
139
- f.puts "class ZScan::BinarySpec"
140
-
141
- bspec_insns.each_with_index do |ins, i|
142
- f.puts <<-RUBY
143
- def #{ins.downcase} n=1
144
- raise ArgumentError, "repeat count should be >= 1, but got \#{n}" if n < 1
145
- n.times do
146
- append #{i + 1}
147
- end
148
- end
149
- RUBY
150
- end
151
-
152
- alias_ins = (bspec_types - ['INT8', 'UINT8']).map &:downcase
153
- f.puts " if ZScan::BinarySpec.big_endian?"
154
- alias_ins.each do |ins|
155
- f.puts " alias #{ins}_be #{ins}"
156
- f.puts " alias #{ins}_le #{ins}_swap"
157
- end
158
- f.puts " else"
159
- alias_ins.each do |ins|
160
- f.puts " alias #{ins}_le #{ins}"
161
- f.puts " alias #{ins}_be #{ins}_swap"
47
+ task :gen => Generate.files
48
+
49
+ Generate.files.each do |name|
50
+ file name => generate_src do
51
+ puts "generating #{name}"
52
+ r = Generate.generate(name)
53
+ File.open name, 'w' do |f|
54
+ f.puts "#{name.end_with?('rb') ? '#' : '//'} generated by rake gen"
55
+ f << r
162
56
  end
163
- f.puts " end"
164
- swap_ins = alias_ins.map{|ins| "#{ins}_swap"}
165
- f.puts " undef #{swap_ins.join ', '}"
166
- f.puts "end"
167
57
  end
168
58
  end
data/readme.md CHANGED
@@ -4,7 +4,8 @@
4
4
  - `ZScan#pos` is the codepoint position, and `ZScan#bytepos` is byte position.
5
5
  - Correctly scans anchors and look behind predicates.
6
6
  - Pos stack manipulation.
7
- - Typed scanning methods: `#scan_float`, `#scan_int radix=nil`, `#scan_date format`, `#scan_binary format`.
7
+ - Typed scanning methods: `#scan_float`, `#scan_int radix=nil`, `#scan_date format`.
8
+ - Binary scanning methods: `#scan_bytes spec`, `#unpack format`.
8
9
 
9
10
  ## Install
10
11
 
@@ -50,22 +51,20 @@ See also https://bugs.ruby-lang.org/issues/7092
50
51
  ## Other motivations
51
52
 
52
53
  - For scan and convert, ruby's stdlib `Scanf` is slow (creates a regexp array every time it is called) and cannot cooperate with a scanner.
53
- - For date parsing, `strptime` doesn't tell the parsed length.
54
- - For binary parsing, `unpack` is an slow interpreter, and the instructions are quite irregular.
54
+ - For date parsing, `Date#strptime` doesn't tell the parsed length.
55
+ - For binary parsing, `String#unpack` is a slow interpreter, it doesn't tell the parsed length either, and its instructions are quite irregular.
55
56
 
56
57
  ## Essential methods
57
58
 
58
59
  - `ZScan.new string, dup=false`
59
60
  - `#scan regexp_or_string`
60
- - `#skip regexp_or_string`
61
+ - `#skip regexp_or_string` return new byte pos or `nil`
61
62
  - `#match_bytesize regexp_or_string` return length of matched bytes or `nil`.
62
63
  - `#slice n` slice sub string of n chars from current pos, advances the cursor.
63
64
  - `#byteslice n` slice sub string of n bytes from cursor pos, advances the cursor.
64
65
  - `#scan_float` scan a float number which is not starting with space. It deals with multibyte encodings for you.
65
66
  - `#scan_int radix=nil` if radix is nil, decide base by prefix: `0x` is 16, `0` is 8, `0b` is 2, otherwise 10. `radix` should be in range `2..36`.
66
67
  - `#scan_date format_string, start=Date::ITALY` scan a `DateTime` object, see also [strptime](http://rubydoc.info/stdlib/date/DateTime.strptime).
67
- - `#scan_binary binary_spec` optimized and readable binary scan, see below for how to create a `ZScan::BinarySpec`.
68
- - `#unpack format_string`
69
68
  - `#eos?`
70
69
  - `#string` note: return a dup. Don't worry the performance because it is a copy-on-write string.
71
70
  - `#rest` rest unscanned sub string.
@@ -80,6 +79,7 @@ For convienience
80
79
  - `#[]= range, replace_string` note: if `range` starts before pos, moves pos left, also clears the stack.
81
80
  - `#size`
82
81
  - `#bytesize`
82
+ - `#cleanup` cleanup substring before current pos.
83
83
 
84
84
  ## Pos management
85
85
 
@@ -92,53 +92,70 @@ For convienience
92
92
  - `#reset` go to beginning.
93
93
  - `#terminate` go to end of string.
94
94
 
95
- ## Binary parsing
95
+ ## Binary scanning
96
96
 
97
- Specify a sequence of binary data. Designed for binary protocol parsing. Example:
97
+ - `#scan_bytes bspec` optimized and readable binary scan, see below for how to create a `ZScan::BSpec`.
98
+ - `#unpack unpack_format_string` note that it always returns an array whether it matched or not (same behavior as `String#unpack`).
99
+
100
+ #### Bytes spec
101
+
102
+ Bytes spec is designed for fast binary protocol parsing. You can specify a sequence of binary data and how to expect the matching.
103
+
104
+ Unlike `#unpack`, bytes spec uses English names to specify the data sequence. It returns `nil` if any of the instructions does not match. However, there are no string, position-changing, or variable-length instructions.
105
+
106
+ Bytes spec is implemented as a direct-threaded VM, so it is faster than `#unpack`.
107
+
108
+ Example:
98
109
 
99
110
  ```ruby
100
- # create a ZScan::BinarySpec
101
- s = ZScan.binary_spec do
102
- int8 # once
103
- uint32_le 2 # little endian, twice
104
- double_be 1 # big endian, once
111
+ s = ZScan::BSpec.new do
112
+ int8 expect: -1 # return nil if the first int8 is not -1
113
+ 2.times{
114
+ uint32_le # le means: little endian
115
+ }
116
+ double_be # be means: big endian
105
117
  end
118
+
106
119
  z = ZScan.new [-1, 2, 3, 4.0].pack('cI<2G') + "rest"
107
- z.scan_binary s #=> [-1, 2, 3, 4.0]
120
+ z.scan_bytes s #=> [-1, 2, 3, 4.0]
108
121
  z.rest #=> 'rest'
122
+
123
+ bad_z = ZScan.new [1, 2, 3, 4.0].pack('cI<2G') # first byte does not match
124
+ bad_z.scan_bytes s #=> nil
109
125
  ```
110
126
 
111
127
  Integer instructions:
112
128
 
113
129
  ```ruby
114
- int8 uint8
130
+ int8 uint8 byte # byte is the same as uint8
115
131
  int16 uint16 int16_le uint16_le int16_be uint16_be
116
132
  int32 uint32 int32_le uint32_le int32_be uint32_be
117
133
  int64 uint64 int64_le uint64_le int64_be uint64_be
118
134
  ```
119
135
 
120
- Single precision float instructions:
136
+ Only integer instructions support the `:expect` option; matching stops immediately if the scanned result is not equal to the expected number.
137
+
138
+ Double precision float instructions:
121
139
 
122
140
  ```ruby
123
- single single_le single_be
141
+ double double_le double_be
124
142
  ```
125
143
 
126
- Double precision float instructions:
144
+ Single precision float instructions:
127
145
 
128
146
  ```ruby
129
- double double_le double_be
147
+ float float_le float_be
148
+ single single_le single_be # same as float*
130
149
  ```
131
150
 
132
- Endians:
151
+ Note that Ruby floats are in fact doubles. In the very rare case that you need to keep the original single-precision data instead of converting it into a double, you can use `uint32` for the job.
152
+
153
+ A note on endians:
133
154
 
134
155
  - (without endian suffix) native endian
135
156
  - `*_le` little endian (VAX, x86, Windows string code unit)
136
157
  - `*_be` big endian, network endian (SPARC, Java string code unit)
137
158
 
138
- Repeat count must be integer `>= 1`, default is `1`.
139
-
140
- It is implemented as a direct-threaded bytecode interpreter. A bit faster than `String#unpack`.
141
-
142
159
  ## Parsing combinators
143
160
 
144
161
  Combinators that manage scanner pos and stack state for you. In the combinators, if the returned value of the given block is `nil` or `false`, stops iteration and restores scanner location. Can be nested, useful for building parsers.
@@ -157,6 +174,34 @@ Combinators that manage scanner pos and stack state for you. In the combinators,
157
174
  - `#clear_pos_stack` clear pos stack.
158
175
  - `z.push._try expr` equivalent to `z.try{ expr }`, but faster because no block is required
159
176
 
177
+ ## Features QA
178
+
179
+ #### `scan_until` and `skip_until`?
180
+
181
+ For example, the StringScanner call
182
+
183
+ ```ruby
184
+ strscan.scan_until /a/
185
+ ```
186
+
187
+ Is equivalent to a slightly different regexp
188
+
189
+ ```ruby
190
+ zscan.scan /.*a/m
191
+ ```
192
+
193
+ #### Capture groups?
194
+
195
+ Not implemented yet. Maybe future?
196
+
197
+ #### `unscan`?
198
+
199
+ Use pos management methods.
200
+
201
+ #### Erlang style bitstring?
202
+
203
+ Thought of that but the API can be quirky... It's way beyond a string scanner.
204
+
160
205
  ## License
161
206
 
162
207
  ```
@@ -4,27 +4,51 @@ require_relative "spec_helper"
4
4
  describe 'ZScan binary scanning methods' do
5
5
  it "#unpack" do
6
6
  z = ZScan.new "\x01\x02\x03"
7
- assert_raise ArgumentError do
8
- z.unpack '@1C'
9
- end
10
7
  assert_equal [1, 2], (z.unpack 'CC')
11
8
  assert_equal 2, z.pos
12
- assert_equal nil, (z.unpack 'I')
9
+ assert_equal [nil], (z.unpack 'I')
13
10
  assert_equal 2, z.pos
14
11
  end
15
12
 
16
- it "#scan_binary" do
17
- s = ZScan.binary_spec do
18
- int8 # once
19
- uint32_le 2 # little endian, twice
20
- double_be 1 # big endian, once
21
- single 1
13
+ it "#unpack position-changing instructions and var-length instructions" do
14
+ z = ZScan.new "abcd\0abc"
15
+ s, _ = z.unpack 'Z*'
16
+ assert_equal "abcd", s
17
+ assert_equal 5, z.pos
18
+
19
+ z.reset
20
+ s, _ = z.unpack '@2Z*'
21
+ assert_equal 'cd', s
22
+ end
23
+
24
+ it "#scan_bytes" do
25
+ s = ZScan::BSpec.new do
26
+ int8
27
+ 2.times{ uint32_le } # little endian
28
+ double_be # big endian
29
+ single
22
30
  end
23
31
 
24
32
  a = [-1, 2, 3, 4.0, 3.0]
25
33
  z = ZScan.new(a.pack('cI<2Gf') + 'rest')
26
- b = z.scan_binary s
34
+ b = z.scan_bytes s
27
35
  assert_equal 'rest', z.rest
28
36
  assert_equal a, b
29
37
  end
38
+
39
+ it "#scan_bytes with expectation" do
40
+ s = ZScan::BSpec.new do
41
+ int8 expect: 3
42
+ float
43
+ end
44
+
45
+ a = [3, 4.0]
46
+ z = ZScan.new a.pack('cf')
47
+ assert_equal a, z.scan_bytes(s)
48
+
49
+ a = [2, 4.0]
50
+ z = ZScan.new a.pack('cf')
51
+ assert_equal nil, z.scan_bytes(s)
52
+ assert_equal 0, z.pos
53
+ end
30
54
  end
@@ -18,6 +18,12 @@ describe "typed scan" do
18
18
  assert_equal 030, z.scan_int
19
19
  end
20
20
 
21
+ it '#scan_int does not use unicode numbers' do
22
+ z = Zscan.new "一二".force_encoding('utf-8')
23
+ assert_equal nil, z.scan_int
24
+ assert_equal 0, z.pos
25
+ end
26
+
21
27
  it "#scan_float" do
22
28
  z = Zscan.new " -3.5e23"
23
29
  assert_equal nil, z.scan_float
data/spec/zscan_spec.rb CHANGED
@@ -96,4 +96,12 @@ describe ZScan do
96
96
  z.pos = 2
97
97
  assert_equal 1, z.line_index
98
98
  end
99
+
100
+ it '#cleanup' do
101
+ @z.scan /\w/
102
+ @z.cleanup
103
+ assert_equal 'b你好', @z.string
104
+ assert_equal 0, @z.pos
105
+ assert_equal 0, @z.bytepos
106
+ end
99
107
  end
data/zscan.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "zscan"
3
- s.version = "1.3" # version mapped from zscan.rb, don't change here
3
+ s.version = "2.0" # version mapped from zscan.rb, don't change here
4
4
  s.author = "Zete Lui"
5
5
  s.homepage = "https://github.com/luikore/zscan"
6
6
  s.platform = Gem::Platform::RUBY
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
9
9
  s.required_ruby_version = ">=1.9.2"
10
10
  s.licenses = ['BSD']
11
11
 
12
- s.files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,h,c,inc}}')
12
+ s.files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,h,c,inc},ext/pack/COPYING*}')
13
13
  s.require_paths = ["lib"]
14
14
  s.extensions = ["ext/extconf.rb"]
15
15
  s.rubygems_version = '1.8.24'