RubyGems - unicode_scanner - Versions diffs - 1.0.0 - Mend

unicode_scanner 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

data/.document +5 -0
data/.rspec +1 -0
data/.rvmrc +1 -0
data/Gemfile +11 -0
data/Gemfile.lock +34 -0
data/LICENSE.txt +20 -0
data/README.md +45 -0
data/Rakefile +53 -0
data/VERSION +1 -0
data/lib/unicode_scanner.rb +655 -0
data/spec/spec_helper.rb +12 -0
data/spec/unicode_scanner_spec.rb +206 -0
data/unicode_scanner.gemspec +64 -0
metadata +143 -0

data/.document ADDED Viewed

@@ -0,0 +1,5 @@
+lib/**/*.rb
+bin/*
+-
+features/**/*.feature
+LICENSE.txt

data/.rspec ADDED Viewed

	@@ -0,0 +1 @@
1	+ --color

data/.rvmrc ADDED Viewed

	@@ -0,0 +1 @@
1	+ rvm 1.9.3@scanner --create

data/Gemfile ADDED Viewed

@@ -0,0 +1,11 @@
+source :rubygems
+group :development do
+  gem 'rspec'
+  gem 'redcarpet'
+  gem 'yard'
+  gem 'bundler'
+  gem 'jeweler'
+end

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,34 @@
+GEM
+  remote: http://rubygems.org/
+  specs:
+    diff-lcs (1.1.3)
+    git (1.2.5)
+    jeweler (1.8.4)
+      bundler (~> 1.0)
+      git (>= 1.2.5)
+      rake
+      rdoc
+    json (1.7.3)
+    rake (0.9.2.2)
+    rdoc (3.12)
+      json (~> 1.4)
+    redcarpet (2.1.1)
+    rspec (2.11.0)
+      rspec-core (~> 2.11.0)
+      rspec-expectations (~> 2.11.0)
+      rspec-mocks (~> 2.11.0)
+    rspec-core (2.11.0)
+    rspec-expectations (2.11.1)
+      diff-lcs (~> 1.1.3)
+    rspec-mocks (2.11.1)
+    yard (0.8.2.1)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  bundler
+  jeweler
+  redcarpet
+  rspec
+  yard

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,20 @@
+Copyright (c) 2012 Tim Morgan
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,45 @@
+Unicode String Scanner
+======================
+A Unicode-aware implementation of Ruby's `StringScanner`.
+|             |                                 |
+|:------------|:--------------------------------|
+| **Author**  | Tim Morgan                      |
+| **Version** | 1.0 (Jul 11, 2012)              |
+| **License** | Released under the MIT license. |
+About
+-----
+Did you know that `StringScanner` splits codepoints? Neither did I. This one
+doesn't.
+**When would I want to use this?** When you want to use `StringScanner` on a
+Unicode (UTF-_n_) string.
+**When would I _not_ want to use this?** If you're interested in speed. This is
+slower than StringScanner because a) it's not written in native C, and b) it's
+slower to traverse Unicode strings anyway because characters can have varying
+byte sizes.
+Installation
+------------
+Simply add this gem to your project's `Gemfile`:
+```` ruby
+gem 'unicode_scanner'
+````
+Usage
+-----
+The `UnicodeScanner` object responds to exactly the same API as
+[StringScanner](http://ruby-doc.org/stdlib-1.9.3/libdoc/strscan/rdoc/StringScanner.html),
+with the exception of the following methods:
+* `getbyte`
+* any obsolete methods
+For more information, see the {UnicodeScanner} class documentation.

data/Rakefile ADDED Viewed

@@ -0,0 +1,53 @@
+# encoding: utf-8
+require 'rubygems'
+require 'bundler'
+begin
+  Bundler.setup(:default, :development)
+rescue Bundler::BundlerError => e
+  $stderr.puts e.message
+  $stderr.puts "Run `bundle install` to install missing gems"
+  exit e.status_code
+end
+require 'rake'
+require 'jeweler'
+Jeweler::Tasks.new do |gem|
+  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
+  gem.name        = "unicode_scanner"
+  gem.homepage    = "http://github.com/RISCfuture/unicode_scanner"
+  gem.license     = "MIT"
+  gem.summary     = %Q{Unicode-aware implementation of StringScanner}
+  gem.description = %Q{An implementation of StringScanner that doesn't split multibyte characters.}
+  gem.email       = "git@timothymorgan.info"
+  gem.authors     = ["Tim Morgan"]
+  # dependencies defined in Gemfile
+end
+Jeweler::RubygemsDotOrgTasks.new
+require 'rspec/core'
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec) do |spec|
+  spec.pattern = FileList['spec/**/*_spec.rb']
+end
+task default: :spec
+require 'yard'
+# Override YARD's Markdown-to-HTML hook so the renderer is built with the
+# :tables extension (alongside GitHub-style/fenced code blocks and
+# autolinking); the README contains a Markdown table.
+module YARD::Templates::Helpers::HtmlHelper
+  def html_markup_markdown(text)
+    markup_class(:markdown).new(text, :gh_blockcode, :fenced_code, :autolink, :tables).to_html
+  end
+end
+YARD::Rake::YardocTask.new('doc') do |doc|
+  doc.options << '-m' << 'markdown' << '-M' << 'redcarpet'
+  doc.options << '--protected' << '--no-private'
+  doc.options << '-r' << 'README.md'
+  doc.options << '-o' << 'doc'
+  doc.options << '--title' << 'Unicode String Scanner Documentation'
+  doc.files = %w( lib/**/* README.md )
+end

data/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 1.0.0

data/lib/unicode_scanner.rb ADDED Viewed

@@ -0,0 +1,655 @@
+# UnicodeScanner provides for Unicode-aware lexical scanning operations on a
+# `String`.  Here is an example of its usage:
+#
+# ```` ruby
+# s = UnicodeScanner.new('This is an example string')
+# s.eos?               # -> false
+#
+# p s.scan(/\w+/)      # -> "This"
+# p s.scan(/\w+/)      # -> nil
+# p s.scan(/\s+/)      # -> " "
+# p s.scan(/\s+/)      # -> nil
+# p s.scan(/\w+/)      # -> "is"
+# s.eos?               # -> false
+#
+# p s.scan(/\s+/)      # -> " "
+# p s.scan(/\w+/)      # -> "an"
+# p s.scan(/\s+/)      # -> " "
+# p s.scan(/\w+/)      # -> "example"
+# p s.scan(/\s+/)      # -> " "
+# p s.scan(/\w+/)      # -> "string"
+# s.eos?               # -> true
+#
+# p s.scan(/\s+/)      # -> nil
+# p s.scan(/\w+/)      # -> nil
+# ````
+#
+# Scanning a string means remembering the position of a _scan pointer_, which is
+# just an index.  The point of scanning is to move forward a bit at a time, so
+# matches are sought after the scan pointer; usually immediately after it.
+#
+# Given the string "test string", here are the pertinent scan pointer positions:
+#
+# ````
+#   t e s t   s t r i n g
+# 0 1 2 ...             1
+#                       0
+# ````
+#
+# When you {#scan} for a pattern (a regular expression), the match must occur at
+# the character after the scan pointer.  If you use {#scan_until}, then the
+# match can occur anywhere after the scan pointer.  In both cases, the scan
+# pointer moves _just beyond_ the last character of the match, ready to scan
+# again from the next character onwards.  This is demonstrated by the example
+# above.
+#
+# Method Categories
+# -----------------
+#
+# There are other methods besides the plain scanners.  You can look ahead in the
+# string without actually scanning.  You can access the most recent match. You
+# can modify the string being scanned, reset or terminate the scanner, find out
+# or change the position of the scan pointer, skip ahead, and so on.
+#
+# ### Advancing the Scan Pointer
+#
+# - {#getch}
+# - {#scan}
+# - {#scan_until}
+# - {#skip}
+# - {#skip_until}
+#
+# ### Looking Ahead
+#
+# - {#check}
+# - {#check_until}
+# - {#exist?}
+# - {#match?}
+# - {#peek}
+#
+# ### Finding Where we Are
+#
+# - {#beginning_of_line?} ({#bol?})
+# - {#eos?}
+# - {#rest_size}
+# - {#pos}
+#
+# ### Setting Where we Are
+#
+# - {#reset}
+# - {#terminate}
+# - {#pos=}
+#
+# ### Match Data
+#
+# - {#matched}
+# - {#matched?}
+# - {#matched_size}
+# - {#[]}
+# - {#pre_match}
+# - {#post_match}
+#
+# ### Miscellaneous
+#
+# - {#<<}
+# - {#concat}
+# - {#string}
+# - {#string=}
+# - {#unscan}
+#
+# There are aliases to several of the methods.
+class UnicodeScanner
+  INSPECT_LENGTH = 5
+  # Creates a new UnicodeScanner object to scan over the given `string`.
+  #
+  # @param [String] string The string to iterate over.
+  def initialize(string)
+    @string   = string
+    @matches  = nil
+    @matched  = false
+    @current  = 0
+    @previous = 0
+  end
+  # Appends `str` to the string being scanned. This method does not affect scan
+  # pointer.
+  #
+  # @param [String] str The string to append.
+  #
+  # @example
+  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
+  #   s.scan(/Fri /)
+  #   s << " +1000 GMT"
+  #   s.string            # -> "Fri Dec 12 1975 14:39 +1000 GMT"
+  #   s.scan(/Dec/)       # -> "Dec"
+  def concat(str)
+    @string.concat str
+  end
+  alias << concat
+  # Return the <i>n</i>th subgroup in the most recent match.
+  #
+  # @param [Fixnum] n The index of the subgroup to return.
+  # @return [String, nil] The subgroup, if it exists.
+  #
+  # @example
+  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
+  #   s.scan(/(\w+) (\w+) (\d+) /)       # -> "Fri Dec 12 "
+  #   s[0]                               # -> "Fri Dec 12 "
+  #   s[1]                               # -> "Fri"
+  #   s[2]                               # -> "Dec"
+  #   s[3]                               # -> "12"
+  #   s.post_match                       # -> "1975 14:39"
+  #   s.pre_match                        # -> ""
+  def [](n)
+    @matched ? @matches[n] : nil
+  end
+  # @return [true, false] `true` iff the scan pointer is at the beginning of the
+  #   line.
+  #
+  # @example
+  #   s = UnicodeScanner.new("test\ntest\n")
+  #   s.bol?           # => true
+  #   s.scan(/te/)
+  #   s.bol?           # => false
+  #   s.scan(/st\n/)
+  #   s.bol?           # => true
+  #   s.terminate
+  #   s.bol?           # => true
+  def beginning_of_line?
+    return nil if @current > @string.length
+    return true if @current == 0
+    return @string[@current - 1] == "\n"
+  end
+  alias bol? beginning_of_line?
+  # This returns the value that {#scan} would return, without advancing the scan
+  # pointer. The match register is affected, though.
+  #
+  # Mnemonic: it "checks" to see whether a {#scan} will return a value.
+  #
+  # @param [Regexp] pattern The pattern to scan for.
+  # @return [String, nil] The matched segment, if matched.
+  #
+  # @example
+  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
+  #   s.check /Fri/               # -> "Fri"
+  #   s.pos                       # -> 0
+  #   s.matched                   # -> "Fri"
+  #   s.check /12/                # -> nil
+  #   s.matched                   # -> nil
+  def check(pattern)
+    do_scan pattern, false, true, true
+  end
+  # This returns the value that {#scan_until} would return, without advancing
+  # the scan pointer. The match register is affected, though.
+  #
+  # Mnemonic: it "checks" to see whether a {#scan_until} will return a value.
+  #
+  # @param [Regexp] pattern The pattern to scan until reaching.
+  # @return [String, nil] The matched segment, if matched.
+  #
+  # @example
+  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
+  #   s.check_until /12/          # -> "Fri Dec 12"
+  #   s.pos                       # -> 0
+  #   s.matched                   # -> "12"
+  def check_until(pattern)
+    do_scan pattern, false, true, false
+  end
+  # @return [true, false] `true` if the scan pointer is at the end of the string.
+  #
+  # @example
+  #   s = UnicodeScanner.new('test string')
+  #   p s.eos?          # => false
+  #   s.scan(/test/)
+  #   p s.eos?          # => false
+  #   s.terminate
+  #   p s.eos?          # => true
+  def eos?
+    @current >= @string.length
+  end
+  # Looks _ahead_ to see if the `pattern` exists _anywhere_ in the string,
+  # without advancing the scan pointer. This predicates whether a {#scan_until}
+  # will return a value.
+  #
+  # @param [Regexp] pattern The pattern to search for.
+  # @return [Fixnum, nil] The number of characters from the scan pointer to the
+  #   end of the match, or `nil` if the pattern does not exist ahead.
+  #
+  # @example
+  #   s = UnicodeScanner.new('test string')
+  #   s.exist? /s/            # -> 3
+  #   s.scan /test/           # -> "test"
+  #   s.exist? /s/            # -> 2
+  #   s.exist? /e/            # -> nil
+  def exist?(pattern)
+    do_scan pattern, false, false, false
+  end
+  # Scans one character and returns it.
+  #
+  # @return [String, nil] The character, or `nil` at the end of the string.
+  #
+  # @example
+  #   s = UnicodeScanner.new("ab")
+  #   s.getch           # => "a"
+  #   s.getch           # => "b"
+  #   s.getch           # => nil
+  #
+  #   s = UnicodeScanner.new("\u3041")
+  #   s.getch           # => "\u3041"     # one multibyte character, returned whole
+  #   s.getch           # => nil
+  def getch
+    return nil if eos?
+    do_scan /./u, true, true, true
+  end
+  # Returns a string that represents the UnicodeScanner object, showing:
+  #
+  # * the current position
+  # * the size of the string
+  # * the characters surrounding the scan pointer
+  #
+  # @return [String] A description of this object.
+  #
+  # @example
+  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
+  #   s.inspect # -> '#<UnicodeScanner 0/21 @ "Fri D...">'
+  #   s.scan_until /12/ # -> "Fri Dec 12"
+  #   s.inspect # -> '#<UnicodeScanner 10/21 "...ec 12" @ " 1975...">'
+  def inspect
+    return "#<#{self.class.to_s} (uninitialized)>" if @string.nil?
+    return "#<#{self.class.to_s} fin>" if eos?
+    if @current == 0
+      return "#<%s %d/%d @ %s>" % [self.class.to_s, @current, @string.length, inspect_after.inspect]
+    end
+    "#<%s %d/%d %s @ %s>" % [self.class.to_s, @current, @string.length, inspect_before.inspect, inspect_after.inspect]
+  end
+  # Tests whether the given `pattern` is matched from the current scan pointer.
+  # Returns the length of the match, or `nil`. The scan pointer is not advanced.
+  #
+  # @param [Regexp] pattern The pattern to match with.
+  # @return [Fixnum, nil] The length of the match, or `nil` if the pattern is not
+  #   matched at the scan pointer.
+  #
+  # @example
+  #   s = UnicodeScanner.new('test string')
+  #   p s.match?(/\w+/)   # -> 4
+  #   p s.match?(/\w+/)   # -> 4
+  #   p s.match?(/\s+/)   # -> nil
+  def match?(pattern)
+    do_scan pattern, false, false, true
+  end
+  # @return [String, nil] The last matched string.
+  # @example
+  #   s = UnicodeScanner.new('test string')
+  #   s.match?(/\w+/)     # -> 4
+  #   s.matched           # -> "test"
+  def matched
+    return nil unless @matched
+    @matches[0]
+  end
+  # @return [true, false] `true` iff the last match was successful.
+  # @example
+  #   s = UnicodeScanner.new('test string')
+  #   s.match?(/\w+/)     # => 4
+  #   s.matched?          # => true
+  #   s.match?(/\d+/)     # => nil
+  #   s.matched?          # => false
+  def matched?() @matched end
+  # @return [Fixnum, nil] The size of the most recent match (see {#matched}), or
+  #   `nil` if there was no recent match.
+  # @example
+  #   s = UnicodeScanner.new('test string')
+  #   s.check /\w+/           # -> "test"
+  #   s.matched_size          # -> 4
+  #   s.check /\d+/           # -> nil
+  #   s.matched_size          # -> nil
+  def matched_size
+    return nil unless @matched
+    @matches.end(0) - @matches.begin(0)
+  end
+  # Extracts a string corresponding to `string[pos,len]`, without advancing the
+  # scan pointer.
+  #
+  # @param [Fixnum] len The number of characters ahead to peek.
+  # @return [String] The string after the current position.
+  #
+  # @example
+  #   s = UnicodeScanner.new('test string')
+  #   s.peek(7)          # => "test st"
+  #   s.peek(7)          # => "test st"
+  def peek(len)
+    return '' if eos?
+    @string[@current, len]
+  end
+  # Returns the character position of the scan pointer. In the 'reset' position,
+  # this value is zero. In the 'terminated' position (i.e. the string is
+  # exhausted), this value is the length of the string in characters.
+  #
+  # In short, it's a 0-based index into the string.
+  #
+  # @return [Fixnum] The current scan position.
+  #
+  # @example
+  #   s = UnicodeScanner.new('test string')
+  #   s.pos               # -> 0
+  #   s.scan_until /str/  # -> "test str"
+  #   s.pos               # -> 8
+  #   s.terminate         # -> #<UnicodeScanner fin>
+  #   s.pos               # -> 11
+  def pos() @current end
+  alias pointer pos
+  # Set the character position of the scan pointer.
+  #
+  # @param [Fixnum] n The new position.
+  #
+  # @example
+  #   s = UnicodeScanner.new('test string')
+  #   s.pos = 7            # -> 7
+  #   s.rest               # -> "ring"
+  def pos=(n)
+    n += @string.length if n < 0
+    raise RangeError, "index out of range" if n < 0
+    raise RangeError, "index out of range" if n > @string.length
+    @current = n
+  end
+  # @return [String] The _**post**-match_ (in the regular expression sense) of
+  #   the last scan.
+  # @example
+  #   s = UnicodeScanner.new('test string')
+  #   s.scan(/\w+/)           # -> "test"
+  #   s.scan(/\s+/)           # -> " "
+  #   s.pre_match             # -> "test"
+  #   s.post_match            # -> "string"
+  def post_match
+    return nil unless @matched
+    @string[@previous + @matches.end(0), @string.length]
+  end
+  # @return [String] The _**pre**-match_ (in the regular expression sense) of
+  #   the last scan.
+  # @example
+  #   s = UnicodeScanner.new('test string')
+  #   s.scan(/\w+/)           # -> "test"
+  #   s.scan(/\s+/)           # -> " "
+  #   s.pre_match             # -> "test"
+  #   s.post_match            # -> "string"
+  def pre_match
+    return nil unless @matched
+    @string[0, @previous + @matches.begin(0)]
+  end
+  # Reset the scan pointer (index 0) and clear matching data.
+  def reset
+    @current = 0
+    @matched = false
+  end
+  # @return [String] The "rest" of the string (i.e. everything after the scan
+  #   pointer). If there is no more data (`eos? = true`), it returns `""`.
+  def rest
+    return '' if eos?
+    return @string[@current, @string.length]
+  end
+  # @return [Fixnum] The value returned by `s.rest.size`.
+  def rest_size
+    return 0 if eos?
+    @string.length - @current
+  end
+  # Tries to match with `pattern` at the current position. If there's a match,
+  # the scanner advances the "scan pointer" and returns the matched string.
+  # Otherwise, the scanner returns `nil`.
+  #
+  # @param [Regexp] pattern The pattern to match.
+  # @return [String, nil] The string that was matched, if a match was found.
+  #
+  # @example
+  #   s = UnicodeScanner.new('test string')
+  #   p s.scan(/\w+/)   # -> "test"
+  #   p s.scan(/\w+/)   # -> nil
+  #   p s.scan(/\s+/)   # -> " "
+  #   p s.scan(/\w+/)   # -> "string"
+  #   p s.scan(/./)     # -> nil
+  def scan(pattern)
+    do_scan pattern, true, true, true
+  end
+  # Tests whether the given `pattern` is matched from the current scan pointer.
+  # Advances the scan pointer if `advance_pointer` is `true`. Returns the
+  # matched string if `return_string` is true. The match register is affected.
+  #
+  # "full" means "scan with full parameters".
+  #
+  # @param [Regexp] pattern The pattern to scan.
+  # @param [true, false] advance_pointer Whether to advance the scan pointer if
+  #   a match is found.
+  # @param [true, false] return_string Whether to return the matched segment.
+  # @return [String, Fixnum, nil] The matched segment if `return_string` is
+  #   `true`, otherwise the number of characters advanced. `nil` if nothing
+  #   matched.
+  def scan_full(pattern, advance_pointer, return_string)
+    do_scan pattern, advance_pointer, return_string, true
+  end
+  # Scans the string _until_ the `pattern` is matched. Returns the substring up
+  # to and including the end of the match, advancing the scan pointer to that
+  # location. If there is no match, `nil` is returned.
+  #
+  # @param [Regexp] pattern The pattern to match.
+  # @return [String, nil] The segment that matched.
+  #
+  # @example
+  #   s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
+  #   s.scan_until(/1/)        # -> "Fri Dec 1"
+  #   s.pre_match              # -> "Fri Dec "
+  #   s.scan_until(/XYZ/)      # -> nil
+  def scan_until(pattern)
+    do_scan pattern, true, true, false
+  end
+  # Scans the string `until` the pattern is matched. Advances the scan pointer
+  # if `advance_pointer`, otherwise not. Returns the matched string if
+  # `return_string` is `true`, otherwise returns the number of characters
+  # advanced. This method does affect the match register.
+  #
+  # @param [Regexp] pattern The pattern to scan.
+  # @param [true, false] advance_pointer Whether to advance the scan pointer if
+  #   a match is found.
+  # @param [true, false] return_string Whether to return the matched segment.
+  # @return [String, Fixnum, nil] The matched segment if `return_string` is
+  #   `true`, otherwise the number of characters advanced. `nil` if nothing
+  #   matched.
+  def search_full(pattern, advance_pointer, return_string)
+    do_scan pattern, advance_pointer, return_string, false
+  end
+  # Attempts to skip over the given `pattern` beginning with the scan pointer.
+  # If it matches, the scan pointer is advanced to the end of the match, and the
+  # length of the match is returned. Otherwise, `nil` is returned.
+  #
+  # It's similar to {#scan}, but without returning the matched string.
+  #
+  # @param [Regexp] pattern The pattern to match.
+  # @return [Fixnum, nil] The number of characters advanced, if matched.
+  #
+  # @example
+  #   s = UnicodeScanner.new('test string')
+  #   p s.skip(/\w+/)   # -> 4
+  #   p s.skip(/\w+/)   # -> nil
+  #   p s.skip(/\s+/)   # -> 1
+  #   p s.skip(/\w+/)   # -> 6
+  #   p s.skip(/./)     # -> nil
+  def skip(pattern)
+    do_scan pattern, true, false, true
+  end
+  # Advances the scan pointer until `pattern` is matched and consumed. Returns
+  # the number of characters advanced, or `nil` if no match was found.
+  #
+  # Look ahead to match `pattern`, and advance the scan pointer to the _end_ of
+  # the match. Return the number of characters advanced, or `nil` if the match
+  # was unsuccessful.
+  #
+  # It's similar to {#scan_until}, but without returning the intervening string.
+  #
+  # @param [Regexp] pattern The pattern to match.
+  # @return [Fixnum, nil] The number of characters advanced, if matched.
+  def skip_until(pattern)
+    do_scan pattern, true, false, false
+  end
+  # @return [String] The string being scanned.
+  def string() @string end
+  # Changes the string being scanned to `str` and resets the scanner.
+  #
+  # @param [String] str The new string to scan.
+  # @return [String] `str`
+  def string=(str)
+    @string  = str
+    @matched = false
+    @current = 0
+    str
+  end
+  # Set the scan pointer to the end of the string and clear matching data.
+  def terminate
+    @current = @string.length
+    @matched = false
+    self
+  end
+  alias clear terminate
+  # Set the scan pointer to the previous position. Only one previous position is
+  # remembered, and it changes with each scanning operation.
+  #
+  # @example
+  #   s = UnicodeScanner.new('test string')
+  #   s.scan(/\w+/)        # => "test"
+  #   s.unscan
+  #   s.scan(/../)         # => "te"
+  #   s.scan(/\d/)         # => nil
+  #   s.unscan             # ScanError: unscan failed: previous match record not exist
+  def unscan
+    raise ScanError, "unscan failed: previous match record not exist" unless @matched
+    @current = @previous
+    @matched = false
+    self
+  end
+  private
+  # Shared implementation behind every scanning and look-ahead method.
+  #
+  # Matches `regex` against the text from the scan pointer to the end of the
+  # string; offsets taken from the resulting match are therefore relative to
+  # the scan pointer.
+  #
+  # @param [Regexp] regex The pattern to match.
+  # @param [true, false] advance_pointer Whether to move the scan pointer to
+  #   the end of the match.
+  # @param [true, false] return_string Whether to return the scanned text
+  #   rather than its length.
+  # @param [true, false] head_only Whether the match must start exactly at the
+  #   scan pointer (`true`) or may start anywhere after it (`false`).
+  # @return [String, Fixnum, nil] The text from the scan pointer through the
+  #   end of the match, or the length of that text; `nil` if nothing matched
+  #   or the scanner is at the end of the string.
+  # @raise [ArgumentError] If `regex` is not a Regexp.
+  def do_scan(regex, advance_pointer, return_string, head_only)
+    raise ArgumentError unless regex.kind_of?(Regexp)
+    # Every attempt clears the match flag; it is only set again on success.
+    @matched = false
+    return nil if eos?
+    @matches = regex.match(@string[@current, @string.length])
+    return nil unless @matches
+    # The search above is unanchored, so in head-only mode a match that begins
+    # past the scan pointer is treated as a failure.
+    if head_only && @matches.begin(0) > 0
+      @matches = nil
+      return nil
+    end
+    @matched = true
+    # Remember where this scan started; #unscan, #pre_match and #post_match
+    # read it.
+    @previous = @current
+    @current += @matches.end(0) if advance_pointer
+    if return_string
+      return @string[@previous, @matches.end(0)]
+    else
+      return @matches.end(0)
+    end
+  end
+  def inspect_before # inspect1
+    return '' if @current == 0
+    str = String.new
+    len = 0
+    if @current > INSPECT_LENGTH
+      str << '...'
+      len = INSPECT_LENGTH
+    else
+      len = @current
+    end
+    str << @string[@current - len, len]
+    return str
+  end
+  def inspect_after # inspect2
+    return '' if eos?
+    str = String.new
+    len = @string.length - @current
+    if len > INSPECT_LENGTH
+      len = INSPECT_LENGTH
+      str << @string[@current, len]
+      str << '...'
+    else
+      str << @string[@current, len]
+    end
+    return str
+  end
+end
+class ScanError < StandardError; end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,12 @@
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+$LOAD_PATH.unshift(File.dirname(__FILE__))
+require 'rspec'
+require 'unicode_scanner'
+# Requires supporting files with custom matchers and macros, etc,
+# in ./support/ and its subdirectories.
+Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
+RSpec.configure do |config|
+end

data/spec/unicode_scanner_spec.rb ADDED Viewed

@@ -0,0 +1,206 @@
+# encoding: utf-8
+require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+describe UnicodeScanner do
+  it "should pass all the class-level examples" do
+    s = UnicodeScanner.new('This is an example string')
+    s.eos?.should == false
+    s.scan(/\w+/).should == "This"
+    s.scan(/\w+/).should == nil
+    s.scan(/\s+/).should == " "
+    s.scan(/\s+/).should == nil
+    s.scan(/\w+/).should == "is"
+    s.eos?.should == false
+    s.scan(/\s+/).should == " "
+    s.scan(/\w+/).should == "an"
+    s.scan(/\s+/).should == " "
+    s.scan(/\w+/).should == "example"
+    s.scan(/\s+/).should == " "
+    s.scan(/\w+/).should == "string"
+    s.eos?.should == true
+    s.scan(/\s+/).should == nil
+    s.scan(/\w+/).should == nil
+  end
+  it "should pass the #concat example" do
+    s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
+    s.scan(/Fri /)
+    s << " +1000 GMT"
+    s.string.should == "Fri Dec 12 1975 14:39 +1000 GMT"
+    s.scan(/Dec/).should == "Dec"
+  end
+  it "should pass the #[] example" do
+    s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
+    s.scan(/(\w+) (\w+) (\d+) /).should == "Fri Dec 12 "
+    s[0].should == "Fri Dec 12 "
+    s[1].should == "Fri"
+    s[2].should == "Dec"
+    s[3].should == "12"
+    s.post_match.should == "1975 14:39"
+    s.pre_match.should == ""
+  end
+  it "should pass the #beginning_of_line? example" do
+    s = UnicodeScanner.new("test\ntest\n")
+    s.bol?.should == true
+    s.scan(/te/)
+    s.bol?.should == false
+    s.scan(/st\n/)
+    s.bol?.should == true
+    s.terminate
+    s.bol?.should == true
+  end
+  it "should pass the #check example" do
+    s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
+    s.check(/Fri/).should == "Fri"
+    s.pos.should == 0
+    s.matched.should == "Fri"
+    s.check(/12/).should == nil
+    s.matched.should == nil
+  end
+  it "should pass the #check_until example" do
+    s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
+    s.check_until(/12/).should == "Fri Dec 12"
+    s.pos.should == 0
+    s.matched.should == "12"
+  end
+  it "should pass the #eos? example" do
+    s = UnicodeScanner.new('test string')
+    s.eos?.should == false
+    s.scan(/test/)
+    s.eos?.should == false
+    s.terminate
+    s.eos?.should == true
+  end
+  it "should pass the #exist? example" do
+    s = UnicodeScanner.new('test string')
+    s.exist?(/s/).should == 3
+    s.scan(/test/).should == "test"
+    s.exist?(/s/).should == 2
+    s.exist?(/e/).should == nil
+  end
+  it "should pass a tweaked version of the #getch example" do
+    s = UnicodeScanner.new("ab")
+    s.getch.should == "a"
+    s.getch.should == "b"
+    s.getch.should == nil
+    s = UnicodeScanner.new("ぁ")
+    s.getch.should == "ぁ" # a single multibyte hiragana character (this file is UTF-8, not EUC-JP); it must come back whole
+    s.getch.should == nil
+  end
+  it "should pass the #inspect example" do
+    s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
+    s.inspect.should == '#<UnicodeScanner 0/21 @ "Fri D...">'
+    s.scan_until(/12/).should == "Fri Dec 12"
+    s.inspect.should == '#<UnicodeScanner 10/21 "...ec 12" @ " 1975...">'
+  end
+  it "should pass the #match? example" do
+    s = UnicodeScanner.new('test string')
+    s.match?(/\w+/).should == 4
+    s.match?(/\w+/).should == 4
+    s.match?(/\s+/).should == nil
+  end
+  it "should pass the #matched example" do
+    s = UnicodeScanner.new('test string')
+    s.match?(/\w+/).should == 4
+    s.matched.should == "test"
+  end
+  it "should pass the #matched? example" do
+    s = UnicodeScanner.new('test string')
+    s.match?(/\w+/).should == 4
+    s.matched?.should == true
+    s.match?(/\d+/).should == nil
+    s.matched?.should == false
+  end
+  it "should pass the #matched_size example" do
+    s = UnicodeScanner.new('test string')
+    s.check(/\w+/).should == "test"
+    s.matched_size.should == 4
+    s.check(/\d+/).should == nil
+    s.matched_size.should == nil
+  end
+  it "should pass the #peek example" do
+    s = UnicodeScanner.new('test string')
+    s.peek(7).should == "test st"
+    s.peek(7).should == "test st"
+  end
+  it "should pass the #pos example" do
+    s = UnicodeScanner.new('test string')
+    s.pos.should == 0
+    s.scan_until(/str/).should == "test str"
+    s.pos.should == 8
+    s.terminate.inspect.should == "#<UnicodeScanner fin>"
+    s.pos.should == 11
+  end
+  it "should pass the #pos= example" do
+    s = UnicodeScanner.new('test string')
+    (s.pos = 7).should == 7
+    s.rest.should == "ring"
+  end
+  it "should pass the #post_match/#pre_match example" do
+    s = UnicodeScanner.new('test string')
+    s.scan(/\w+/).should == "test"
+    s.scan(/\s+/).should == " "
+    s.pre_match.should == "test"
+    s.post_match.should == "string"
+  end
+  it "should pass the #scan example" do
+    s = UnicodeScanner.new('test string')
+    s.scan(/\w+/).should == "test"
+    s.scan(/\w+/).should == nil
+    s.scan(/\s+/).should == " "
+    s.scan(/\w+/).should == "string"
+    s.scan(/./).should == nil
+  end
+  it "should pass the #scan_until example" do
+    s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
+    s.scan_until(/1/).should == "Fri Dec 1"
+    s.pre_match.should == "Fri Dec "
+    s.scan_until(/XYZ/).should == nil
+  end
+  it "should pass the #skip example" do
+    s = UnicodeScanner.new('test string')
+    s.skip(/\w+/).should == 4
+    s.skip(/\w+/).should == nil
+    s.skip(/\s+/).should == 1
+    s.skip(/\w+/).should == 6
+    s.skip(/./).should == nil
+  end
+  it "should pass the half-finished #skip_until example" do
+    s = UnicodeScanner.new("Fri Dec 12 1975 14:39")
+    s.skip_until(/12/).should == 10
+  end
+  it "should pass the #unscan example" do
+    s = UnicodeScanner.new('test string')
+    s.scan(/\w+/).should == "test"
+    s.unscan
+    s.scan(/../).should == "te"
+    s.scan(/\d/).should == nil
+    -> { s.unscan }.should raise_error(ScanError, 'unscan failed: previous match record not exist')
+  end
+end

data/unicode_scanner.gemspec ADDED Viewed

@@ -0,0 +1,64 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE DIRECTLY
+# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
+# -*- encoding: utf-8 -*-
+Gem::Specification.new do |s|
+  s.name = "unicode_scanner"
+  s.version = "1.0.0"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Tim Morgan"]
+  s.date = "2012-07-12"
+  s.description = "An implementation of StringScanner that doesn't split multibyte characters."
+  s.email = "git@timothymorgan.info"
+  s.extra_rdoc_files = [
+    "LICENSE.txt",
+    "README.md"
+  ]
+  s.files = [
+    ".document",
+    ".rspec",
+    ".rvmrc",
+    "Gemfile",
+    "Gemfile.lock",
+    "LICENSE.txt",
+    "README.md",
+    "Rakefile",
+    "VERSION",
+    "lib/unicode_scanner.rb",
+    "spec/spec_helper.rb",
+    "spec/unicode_scanner_spec.rb",
+    "unicode_scanner.gemspec"
+  ]
+  s.homepage = "http://github.com/RISCfuture/unicode_scanner"
+  s.licenses = ["MIT"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = "1.8.24"
+  s.summary = "Unicode-aware implementation of StringScanner"
+  if s.respond_to? :specification_version then
+    s.specification_version = 3
+    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
+      s.add_development_dependency(%q<rspec>, [">= 0"])
+      s.add_development_dependency(%q<redcarpet>, [">= 0"])
+      s.add_development_dependency(%q<yard>, [">= 0"])
+      s.add_development_dependency(%q<bundler>, [">= 0"])
+      s.add_development_dependency(%q<jeweler>, [">= 0"])
+    else
+      s.add_dependency(%q<rspec>, [">= 0"])
+      s.add_dependency(%q<redcarpet>, [">= 0"])
+      s.add_dependency(%q<yard>, [">= 0"])
+      s.add_dependency(%q<bundler>, [">= 0"])
+      s.add_dependency(%q<jeweler>, [">= 0"])
+    end
+  else
+    s.add_dependency(%q<rspec>, [">= 0"])
+    s.add_dependency(%q<redcarpet>, [">= 0"])
+    s.add_dependency(%q<yard>, [">= 0"])
+    s.add_dependency(%q<bundler>, [">= 0"])
+    s.add_dependency(%q<jeweler>, [">= 0"])
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,143 @@
+--- !ruby/object:Gem::Specification
+name: unicode_scanner
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+  prerelease:
+platform: ruby
+authors:
+- Tim Morgan
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-07-12 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: redcarpet
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: yard
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: jeweler
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: An implementation of StringScanner that doesn't split multibyte characters.
+email: git@timothymorgan.info
+executables: []
+extensions: []
+extra_rdoc_files:
+- LICENSE.txt
+- README.md
+files:
+- .document
+- .rspec
+- .rvmrc
+- Gemfile
+- Gemfile.lock
+- LICENSE.txt
+- README.md
+- Rakefile
+- VERSION
+- lib/unicode_scanner.rb
+- spec/spec_helper.rb
+- spec/unicode_scanner_spec.rb
+- unicode_scanner.gemspec
+homepage: http://github.com/RISCfuture/unicode_scanner
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+      segments:
+      - 0
+      hash: -3935821298050612576
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.24
+signing_key:
+specification_version: 3
+summary: Unicode-aware implementation of StringScanner
+test_files: []