RubyGems - zscan - Versions diffs - 1.3 → 2.0 - Mend

zscan 1.3 → 2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

data/rakefile CHANGED Viewed

@@ -1,74 +1,28 @@
+require_relative "generate/generate"
 Dir.chdir __dir__
-version_re  = /\d+(\.\d+)*/
-version     = `command grep 'VERSION =' lib/zscan.rb`[version_re]
-gem_files   = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,c}}')
-gem_package = "zscan-#{version}.gem"
+version_re   = /\d+(\.\d+)*/
+version      = `command grep 'VERSION =' lib/zscan.rb`[version_re]
+gem_files    = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,h,c,inc},ext/pack/COPYING*}')
+gem_package  = "zscan-#{version}.gem"
+generate_src = [__FILE__, *Dir.glob('generate/*')]
-bspec_types = %w[INT8 INT16 INT32 INT64 UINT8 UINT16 UINT32 UINT64 SINGLE DOUBLE]
-bspec_insns = bspec_types.flat_map{|ty|
-  if ty =~ /INT8/
-    ty
-  else
-    [ty, "#{ty}_SWAP"]
-  end
-}
-def bspec_incr ins
-  case ins
-  when /INT(\d+)/; $1.to_i / 8
-  when /SINGLE/; 4
-  when /DOUBLE/; 8
-  else; raise 'bad'
-  end
-end
-def bspec_c_type ins
-  case ins
-  when /(U?INT\d+)/; "#{$1.downcase}_t"
-  when /SINGLE/; 'float'
-  when /DOUBLE/; 'double'
-  else; raise 'bad'
-  end
-end
-def bspec_extract ins
-  type = bspec_c_type ins
-  len = bspec_incr(ins) * 8
-  r = "((uint#{len}_t*)s)[0]"
-  if ins.end_with?('SWAP')
-    r = "swap#{len}(#{r})"
-  end
-  "uint#{len}_t r = #{r}"
-end
-def bspec_convert ins
-  case ins
-  when /(U)?INT64|UINT32/
-    if ins.start_with?('U')
-      "UINT64toNUM(r)"
-    else
-      "INT64toNUM(CAST(r, int64_t))"
-    end
-  when /INT32/
-    "INT2NUM(CAST(r, int32_t))"
-  when /INT(16|8)/
-    "INT2FIX(CAST(r, #{bspec_c_type ins}))"
-  when /SINGLE/
-    "DBL2NUM((double)CAST(r, float))"
-  when /DOUBLE/
-    "DBL2NUM(CAST(r, double))"
-  else
-    raise 'bad'
-  end
-end
+desc "generate, build and test, and pack gem"
+task :default => [:gen, :test, gem_package]
 desc "build and test"
-task :default => [:test, gem_package]
-desc "build and run test"
-task :test do
+task :test => 'ext/Makefile' do
   sh "make -C ext"
   sh "rspec"
 end
+file 'ext/Makefile' do
+  Dir.chdir 'ext' do
+    sh "ruby extconf.rb"
+  end
+end
 desc "pack gem"
-file gem_package => gem_files do
+file gem_package => gem_files + Generate.files do
   sh "rm zscan-*.gem"
   new_version = false
@@ -82,87 +36,23 @@ file gem_package => gem_files do
   if new_version
     File.open('zscan.gemspec', 'w'){|f| f << lines.join }
   end
+  puts "packing files:"
+  puts '-' * 40
+  puts gem_files
+  puts '-' * 40
   sh "gem build zscan.gemspec"
 end
 desc "generate files"
-task :gen => %w[ext/bspec_exec.inc ext/bspec_init.inc lib/zscan/instructions.rb]
-file 'ext/bspec_exec.inc' => __FILE__ do
-  puts "generating ext/bspec_exec.inc"
-  opcode_list = bspec_insns.map do |ins|
-    "&&BS_#{ins}"
-  end.join ', '
-  opcode_segs = bspec_insns.map do |ins|
-%Q{BS_#{ins}:
-  {
-    #{bspec_extract ins};
-    rb_ary_push(a, #{bspec_convert ins});
-    s += #{bspec_incr ins};
-    goto **(ip++);
-  }
-}
-  end.join "\n"
-  File.open 'ext/bspec_exec.inc', 'w' do |f|
-    f.puts %Q|// GENERATED WITH: rake gen
-#line 2 "ext/bspec_exec.inc"
-__attribute__((__noinline__))
-static VALUE bspec_exec(void** ip, char* s, VALUE a) {
-  static void* opcodes[] = { &&BS_RET, #{opcode_list} };
-  if (ip == NULL) {
-    return (VALUE)opcodes;
-  }
-  goto **(ip++);
-BS_RET:
-  return a;
-#{opcode_segs}
-}|
-  end
-end
-file 'ext/bspec_init.inc' => __FILE__ do
-  puts 'generating ext/bspec_init.inc'
-  opcode_incrs = bspec_insns.map{|ins| bspec_incr ins}.join ', '
-  File.open 'ext/bspec_init.inc', 'w' do |f|
-    f.puts "// GENERATED WITH: rake gen"
-    f.puts %Q|static const long bspec_s_sizes[] = {0, #{opcode_incrs}};|
-    f.puts %Q|static const long bspec_opcodes_size = #{bspec_insns.size + 1};|
-  end
-end
-file 'lib/zscan/instructions.rb' => __FILE__ do
-  puts 'generating lib/zscan/instructions.rb'
-  File.open 'lib/zscan/instructions.rb', 'w' do |f|
-    f.puts "# GENERATED WITH: rake gen"
-    f.puts "class ZScan::BinarySpec"
-    bspec_insns.each_with_index do |ins, i|
-      f.puts <<-RUBY
-  def #{ins.downcase} n=1
-    raise ArgumentError, "repeat count should be >= 1, but got \#{n}" if n < 1
-    n.times do
-      append #{i + 1}
-    end
-  end
-      RUBY
-    end
-    alias_ins = (bspec_types - ['INT8', 'UINT8']).map &:downcase
-    f.puts "  if ZScan::BinarySpec.big_endian?"
-    alias_ins.each do |ins|
-    f.puts "    alias #{ins}_be #{ins}"
-    f.puts "    alias #{ins}_le #{ins}_swap"
-    end
-    f.puts "  else"
-    alias_ins.each do |ins|
-    f.puts "    alias #{ins}_le #{ins}"
-    f.puts "    alias #{ins}_be #{ins}_swap"
+task :gen => Generate.files
+Generate.files.each do |name|
+  file name => generate_src do
+    puts "generating #{name}"
+    r = Generate.generate(name)
+    File.open name, 'w' do |f|
+      f.puts "#{name.end_with?('rb') ? '#' : '//'} generated by rake gen"
+      f << r
     end
-    f.puts "  end"
-    swap_ins = alias_ins.map{|ins| "#{ins}_swap"}
-    f.puts "  undef #{swap_ins.join ', '}"
-    f.puts "end"
   end
 end

data/readme.md CHANGED Viewed

@@ -4,7 +4,8 @@
 - `ZScan#pos` is the codepoint position, and `ZScan#bytepos` is byte position.
 - Correctly scans anchors and look behind predicates.
 - Pos stack manipulation.
-- Typed scanning methods: `#scan_float`, `#scan_int radix=nil`, `#scan_date format`, `#scan_binary format`.
+- Typed scanning methods: `#scan_float`, `#scan_int radix=nil`, `#scan_date format`.
+- Binary scanning methods: `#scan_bytes spec`, `#unpack format`.
 ## Install
@@ -50,22 +51,20 @@ See also https://bugs.ruby-lang.org/issues/7092
 ## Other motivations
 - For scan and convert, ruby's stdlib `Scanf` is slow (it creates a regexp array every time it is called) and cannot cooperate with a scanner.
-- For date parsing, `strptime` doesn't tell the parsed length.
-- For binary parsing, `unpack` is an slow interpreter, and the instructions are quite irregular.
+- For date parsing, `Date#strptime` doesn't tell the parsed length.
+- For binary parsing, `String#unpack` is a slow interpreter; it doesn't tell the parsed length either, and the instructions are quite irregular.
 ## Essential methods
 - `ZScan.new string, dup=false`
 - `#scan regexp_or_string`
-- `#skip regexp_or_string`
+- `#skip regexp_or_string` return new byte pos or `nil`
 - `#match_bytesize regexp_or_string` return length of matched bytes or `nil`.
 - `#slice n` slice sub string of n chars from current pos, advances the cursor.
 - `#byteslice n` slice sub string of n bytes from cursor pos, advances the cursor.
 - `#scan_float` scan a float number which is not starting with space. It deals with multibyte encodings for you.
 - `#scan_int radix=nil` if radix is nil, decide base by prefix: `0x` is 16, `0` is 8, `0b` is 2, otherwise 10. `radix` should be in range `2..36`.
 - `#scan_date format_string, start=Date::ITALY` scan a `DateTime` object, see also [strptime](http://rubydoc.info/stdlib/date/DateTime.strptime).
-- `#scan_binary binary_spec` optimized and readable binary scan, see below for how to create a `ZScan::BinarySpec`.
-- `#unpack format_string`
 - `#eos?`
 - `#string` note: return a dup. Don't worry the performance because it is a copy-on-write string.
 - `#rest` rest unscanned sub string.
@@ -80,6 +79,7 @@ For convienience
 - `#[]= range, replace_string` note: if `range` starts before pos, moves pos left, also clears the stack.
 - `#size`
 - `#bytesize`
+- `#cleanup` cleanup substring before current pos.
 ## Pos management
@@ -92,53 +92,70 @@ For convienience
 - `#reset` go to beginning.
 - `#terminate` go to end of string.
-## Binary parsing
+## Binary scanning
-Specify a sequence of binary data. Designed for binary protocol parsing. Example:
+- `#scan_bytes bspec` optimized and readable binary scan, see below for how to create a `ZScan::BSpec`.
+- `#unpack unpack_format_string` note that it always returns an array, whether or not the format matched (same behavior as `String#unpack`).
+#### Bytes spec
+Bytes spec is designed for fast binary protocol parsing. You can specify a sequence of binary data and how to expect the matching.
+Unlike `#unpack`, bytes spec uses English names to specify the data sequence. It returns `nil` if any of the instructions does not match. However, there are no string, position-changing, or variable-length instructions.
+Bytes spec is implemented as a direct-threaded VM; it is faster than `#unpack`.
+Example:
 ```ruby
-# create a ZScan::BinarySpec
-s = ZScan.binary_spec do
-  int8        # once
-  uint32_le 2 # little endian, twice
-  double_be 1 # big endian, once
+s = ZScan::BSpec.new do
+  int8 expect: -1 # return nil if the first int8 is not -1
+  2.times{
+    uint32_le # le means: little endian
+  }
+  double_be   # be means: big endian
 end
 z = ZScan.new [-1, 2, 3, 4.0].pack('cI<2G') + "rest"
-z.scan_binary s #=> [-1, 2, 3, 4.0]
+z.scan_bytes s #=> [-1, 2, 3, 4.0]
 z.rest #=> 'rest'
+bad_z = ZScan.new [1, 2, 3, 4.0].pack('cI<2G') # first byte does not match
+bad_z.scan_bytes s #=> nil
 ```
 Integer instructions:
 ```ruby
-int8  uint8
+int8  uint8  byte # byte is the same as uint8
 int16 uint16 int16_le uint16_le int16_be uint16_be
 int32 uint32 int32_le uint32_le int32_be uint32_be
 int64 uint64 int64_le uint64_le int64_be uint64_be
 ```
-Single precision float instructions:
+Only integer instructions support the `:expect` option; matching stops immediately if the scanned result is not equal to the expected number.
+Double precision float instructions:
 ```ruby
-single single_le single_be
+double double_le double_be
 ```
-Double precision float instructions:
+Single precision float instructions:
 ```ruby
-double double_le double_be
+float float_le float_be
+single single_le single_be # same as float*
 ```
-Endians:
+Note that ruby floats are in fact doubles. In the very rare case that you need to keep the original single-precision data instead of converting it into doubles, you can use `uint32` for the job.
+A note on endians:
 - (without endian suffix) native endian
 - `*_le` little endian (VAX, x86, Windows string code unit)
 - `*_be` big endian, network endian (SPARC, Java string code unit)
-Repeat count must be integer `>= 1`, default is `1`.
-It is implemented as a direct-threaded bytecode interpreter. A bit faster than `String#unpack`.
 ## Parsing combinators
 Combinators that manage scanner pos and stack state for you. In the combinators, if the returned value of the given block is `nil` or `false`, stops iteration and restores scanner location. Can be nested, useful for building parsers.
@@ -157,6 +174,34 @@ Combinators that manage scanner pos and stack state for you. In the combinators,
 - `#clear_pos_stack` clear pos stack.
 - `z.push._try expr` equivalent to `z.try{ expr }`, but faster because no block is required
+## Features QA
+#### `scan_until` and `skip_until`?
+For example, the StringScanner call
+```ruby
+strscan.scan_until /a/
+```
+Is equivalent to a slightly different regexp
+```ruby
+zscan.scan /.*a/m
+```
+#### Capture groups?
+Not implemented yet. Maybe future?
+#### `unscan`?
+Use pos management methods.
+#### Erlang style bitstring?
+Thought of that but the API can be quirky... It's way beyond a string scanner.
 ## License
 ```

data/spec/binary_scan_spec.rb CHANGED Viewed

@@ -4,27 +4,51 @@ require_relative "spec_helper"
 describe 'ZScan binary scanning methods' do
   it "#unpack" do
     z = ZScan.new "\x01\x02\x03"
-    assert_raise ArgumentError do
-      z.unpack '@1C'
-    end
     assert_equal [1, 2], (z.unpack 'CC')
     assert_equal 2, z.pos
-    assert_equal nil, (z.unpack 'I')
+    assert_equal [nil], (z.unpack 'I')
     assert_equal 2, z.pos
   end
-  it "#scan_binary" do
-    s = ZScan.binary_spec do
-      int8        # once
-      uint32_le 2 # little endian, twice
-      double_be 1 # big endian, once
-      single 1
+  it "#unpack position-changing instructions and var-length instructions" do
+    z = ZScan.new "abcd\0abc"
+    s, _ = z.unpack 'Z*'
+    assert_equal "abcd", s
+    assert_equal 5, z.pos
+    z.reset
+    s, _ = z.unpack '@2Z*'
+    assert_equal 'cd', s
+  end
+  it "#scan_bytes" do
+    s = ZScan::BSpec.new do
+      int8
+      2.times{ uint32_le } # little endian
+      double_be            # big endian
+      single
     end
     a = [-1, 2, 3, 4.0, 3.0]
     z = ZScan.new(a.pack('cI<2Gf') + 'rest')
-    b = z.scan_binary s
+    b = z.scan_bytes s
     assert_equal 'rest', z.rest
     assert_equal a, b
   end
+  it "#scan_bytes with expectation" do
+    s = ZScan::BSpec.new do
+      int8 expect: 3
+      float
+    end
+    a = [3, 4.0]
+    z = ZScan.new a.pack('cf')
+    assert_equal a, z.scan_bytes(s)
+    a = [2, 4.0]
+    z = ZScan.new a.pack('cf')
+    assert_equal nil, z.scan_bytes(s)
+    assert_equal 0, z.pos
+  end
 end

data/spec/typed_scan_spec.rb CHANGED Viewed

@@ -18,6 +18,12 @@ describe "typed scan" do
     assert_equal 030, z.scan_int
   end
+  it '#scan_int does not use unicode numbers' do
+    z = Zscan.new "一二".force_encoding('utf-8')
+    assert_equal nil, z.scan_int
+    assert_equal 0, z.pos
+  end
   it "#scan_float" do
     z = Zscan.new " -3.5e23"
     assert_equal nil, z.scan_float

data/spec/zscan_spec.rb CHANGED Viewed

@@ -96,4 +96,12 @@ describe ZScan do
     z.pos = 2
     assert_equal 1, z.line_index
   end
+  it '#cleanup' do
+    @z.scan /\w/
+    @z.cleanup
+    assert_equal 'b你好', @z.string
+    assert_equal 0, @z.pos
+    assert_equal 0, @z.bytepos
+  end
 end

data/zscan.gemspec CHANGED Viewed

@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
   s.name = "zscan"
-  s.version = "1.3" # version mapped from zscan.rb, don't change here
+  s.version = "2.0" # version mapped from zscan.rb, don't change here
   s.author = "Zete Lui"
   s.homepage = "https://github.com/luikore/zscan"
   s.platform = Gem::Platform::RUBY
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
   s.required_ruby_version = ">=1.9.2"
   s.licenses = ['BSD']
-  s.files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,h,c,inc}}')
+  s.files = Dir.glob('{rakefile,zscan.gemspec,readme.md,**/*.{rb,h,c,inc},ext/pack/COPYING*}')
   s.require_paths = ["lib"]
   s.extensions = ["ext/extconf.rb"]
   s.rubygems_version = '1.8.24'