string-eater 0.2.2 → 1.0.0
- checksums.yaml +15 -0
- data/LICENSE +1 -1
- data/README.md +31 -15
- data/examples/address.rb +22 -17
- data/examples/nginx.rb +21 -13
- data/ext/string-eater/c-tokenizer.c +20 -6
- data/ext/string-eater/extconf.rb +1 -0
- data/lib/c-tokenizer.rb +52 -37
- data/lib/string-eater.rb +4 -2
- data/lib/token.rb +5 -3
- data/lib/version.rb +8 -4
- data/spec/nginx_spec.rb +33 -21
- data/spec/spec_helper.rb +1 -0
- data/spec/string_eater_spec.rb +122 -130
- metadata +5 -9
- data/lib/ruby-tokenizer-each-char.rb +0 -145
- data/lib/ruby-tokenizer.rb +0 -98
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    ZmU1MmM0ZDY2MjQ4ZWM5MmFjN2E0YzVlODJkYWIwOWFlZDYxYzYyOQ==
+  data.tar.gz: !binary |-
+    NjhjZGQ1ZDQwZDVjNmE0MjZjM2Q5YTljNjAwNzFhNTJhNmE2ZmFmNw==
+!binary "U0hBNTEy":
+  metadata.gz: !binary |-
+    YTg0NzZjZTFkYzhjNWVhMzE2YjNjMzQ4N2RlNWYzYTI1NWM5MTE1MjE4NGEw
+    ODdmNjRiYWZiODVmZGY0ZmI1MTk5MmZiZGMyYTBhNTRjODZjOGM2ODRiYjM5
+    MWMwMzJmNGVlOTAyYTI2YmY0NzM4MTEwNDM3NjI1MTE1ZmRmNDU=
+  data.tar.gz: !binary |-
+    NzQ2NTdlYjAzY2NiMWIzYTRkYTI1NGFhZjgxOWY0YjgxYzk4ZDkyMGU3MDAw
+    YmQ5YjQzNDAzNGViOGJmYjFmOGI1MDIyNGI2OWNiZGVhN2ZkNWJjYTYzNTBh
+    YWYzZWRiYjE4ODA3YjI1ZmM4NWExZmI2ZmJmMzljMDA1Nzc0ZGY=
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -87,25 +87,41 @@ We can also do something like this:
 For another example, see `examples/nginx.rb`, which defines an
 [nginx](http://nginx.org) log line tokenizer.
 
-##
+## Non-strict usage
+
+Use `set_non_strict` to indicate that separator finding should be
+non-strict. This means that if the tokenizer fails to find a
+separator before finishing a string, it will fill in the last token
+with the remainder of the string. Normally (i.e., strict usage), the
+token whose closing character was not found is left nil.
+
+Example:
+
+    class PersonTokenizer < StringEater::Tokenizer
+      add_field :last_name
+      look_for ", "
+      add_field :first_name, :extract => false
+      look_for " | "
+      add_field :street_address, :extract => false
+      look_for ", "
+      add_field :city
+      look_for ", "
+      add_field :state
+      look_for ", "
+      set_non_strict
+    end
 
-
-
+    tokenizer = PersonTokenizer.new
+    string = "Flinstone, Fred | 301 Cobblestone Way, Bedrock"
+    tokenizer.tokenize! string
 
-
-
-`StringEater::Tokenizer`.
+    puts tokenizer.last_name # => "Flinstone"
+    puts tokenizer.city # => "Bedrock" (if strict, would be nil)
 
-
-
-implementation that is faster on Ruby than a translation of the C
-algorithm. Probably not as fast (or not much faster) than using
-Ruby regular expressions.
+Non-strict can also be set on an instance tokenizer,
+i.e., call `tokenizer.set_non_strict` to make `tokenizer` non-strict.
 
-
-This is essentially the same as the C implementation, but written
-in pure Ruby. It uses `String#each_char` and is therefore VERY
-SLOW! It provides a good way to hack the algorithm, though.
+## Implementation
 
 The main algorithm works by finding the start and end points of tokens
 in a string. The search is done incrementally (i.e., loop through the
data/examples/address.rb
CHANGED
@@ -1,35 +1,40 @@
+# encoding: utf-8
+
 # once the gem is installed, you don't need this
-
-
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'lib')))
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'ext/string-eater')))
 
 # this is the example from the README
 require 'string-eater'
 
-
-
-
-
-
-
-
+# example tokenizer for addresses
+class PersonTokenizer < StringEater::Tokenizer
+  add_field :last_name
+  look_for ', '
+  add_field :first_name, extract: false
+  look_for ' | '
+  add_field :street_address, extract: false
+  look_for ', '
   add_field :city
-  look_for
-  add_field :state
-  look_for
+  look_for ', '
+  add_field :state
+  look_for ', '
 end
 
-if __FILE__ == $
+if __FILE__ == $PROGRAM_NAME
   tokenizer = PersonTokenizer.new
   puts tokenizer.describe_line
 
-  string =
+  string = 'Flinstone, Fred | 301 Cobblestone Way, Bedrock, NA, 00000'
   tokenizer.tokenize! string
 
-  puts tokenizer.last_name # => "Flinestone"
-  puts tokenizer.city # => "Bedrock"
+  puts tokenizer.last_name # => "Flinestone"
+  puts tokenizer.city # => "Bedrock"
   puts tokenizer.state # => "NA"
 
-  tokenizer.tokenize!(string) do |tokens|
+  tokenizer.tokenize!(string) do |tokens|
     puts "The #{tokens[:last_name]}s live in #{tokens[:city]}"
   end
 end
data/examples/nginx.rb
CHANGED
@@ -1,27 +1,32 @@
+# encoding: utf-8
+
 # once the gem is installed, you don't need this
-
-
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'lib')))
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'ext/string-eater')))
 
 require 'string-eater'
 
+# Example tokenizer for nginx log lines
 class NginxLogTokenizer < StringEater::CTokenizer
   add_field :ip
-  look_for
-  add_field :remote_user, :
-  look_for
-  add_field :timestamp, :
+  look_for ' - '
+  add_field :remote_user, extract: false
+  look_for ' ['
+  add_field :timestamp, extract: false
   look_for "] \""
   add_field :request
   look_for "\" "
   add_field :status_code
-  look_for
-  add_field :bytes_sent, :
+  look_for ' '
+  add_field :bytes_sent, extract: false
   look_for " \""
   add_field :referrer_url
   look_for "\" \""
   add_field :user_agent
   look_for "\" \""
-  add_field :compression, :
+  add_field :compression, extract: false
   look_for "\" "
   add_field :remainder
 
@@ -47,14 +52,17 @@ class NginxLogTokenizer < StringEater::CTokenizer
     end
   end
 
-if __FILE__ == $
+if __FILE__ == $PROGRAM_NAME
   tokenizer = NginxLogTokenizer.new
   puts tokenizer.describe_line
 
-  str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500]
+  str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] ' +
+        '"GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" ' +
+        '"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; ' +
+        'Trident/5.0)" "-" "there could be" other "stuff here"'
 
-  puts
-  puts
+  puts 'input string: ' + str
+  puts 'Tokens: '
 
   # use a block to work with the extracted tokens
   tokenizer.tokenize!(str) do |tokens|
data/ext/string-eater/c-tokenizer.c
CHANGED
@@ -10,11 +10,12 @@ static VALUE rb_cCTokenizer;
 static VALUE rb_mStringEater;
 
 static VALUE tokenize_string(VALUE self,
-
-
-
-
-
+                             VALUE string,
+                             VALUE tokens_to_find_indexes,
+                             VALUE tokens_to_find_strings,
+                             VALUE tokens_to_extract_indexes,
+                             VALUE tokens_to_extract_names,
+                             VALUE non_strict)
 {
   const char* input_string = StringValueCStr(string);
   VALUE extracted_tokens = rb_hash_new();
@@ -115,6 +116,19 @@ static VALUE tokenize_string(VALUE self,
     }
   }
 
+  /*
+    got to the end of the string
+    and have an incomplete token
+    and not strict
+  */
+  if(ix == str_len && curr_token_ix < n_tokens && RTEST(non_strict))
+  {
+    rb_hash_aset(extracted_tokens,
+                 rb_ary_entry(tokens_to_extract_names, curr_token_ix - 1),
+                 rb_usascii_str_new(input_string + startpoint,
+                                    str_len - startpoint));
+  }
+
   curr_token_ix = n_tokens - 1;
 
   if(ix < str_len && curr_token_ix == next_token_to_extract_ix)
@@ -139,7 +153,7 @@ void Init_c_tokenizer_ext(void)
   rb_cCTokenizer = rb_define_class_under(rb_mStringEater,
       "CTokenizer", rb_cObject);
 
-  rb_define_method(rb_cCTokenizer, "ctokenize!", tokenize_string,
+  rb_define_method(rb_cCTokenizer, "ctokenize!", tokenize_string, 6);
 
   /* set the callback for when the extension is unloaded */
   rb_set_end_proc(finalize_c_tokenizer_ext, 0);
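The block added in the second hunk above is the heart of the non-strict feature on the C side: if the scan reaches the end of the string (`ix == str_len`) while separators remain unmatched, the token that was in progress is stored under its name instead of being dropped. A minimal Ruby-level sketch of the resulting behavior (the `KeyValue` class is hypothetical, not part of the gem; assumes the 1.0.0 gem is installed):

    require 'string-eater'

    # hypothetical tokenizer for "key=value;" records
    class KeyValue < StringEater::Tokenizer
      add_field :key
      look_for '='
      add_field :value
      look_for ';'     # closing separator for :value
      set_non_strict   # fill the open token at end-of-string
    end

    kv = KeyValue.new
    kv.tokenize!('a=b') # note: no trailing ';'
    puts kv.key   # => "a"
    puts kv.value # => "b" (strict mode would leave this nil)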
data/ext/string-eater/extconf.rb
CHANGED
data/lib/c-tokenizer.rb
CHANGED
@@ -1,17 +1,22 @@
+# encoding: utf-8
+
 require 'c_tokenizer_ext'
 
+# Ruby interface to the c extension
 class StringEater::CTokenizer
+  attr_reader :tokens
+
   def self.tokens
     @tokens ||= []
   end
 
-  def self.add_field
-
-    define_method(name) {@extracted_tokens[name]}
+  def self.add_field(name, opts = {})
+    tokens << StringEater::Token.new_field(name, opts)
+    define_method(name) { @extracted_tokens[name] }
   end
 
-  def self.look_for
-
+  def self.look_for(look_for_tokens)
+    tokens << StringEater::Token.new_separator(look_for_tokens)
   end
 
   # This is very slow, only do it when necessary
@@ -19,12 +24,20 @@ class StringEater::CTokenizer
     Marshal.load(Marshal.dump(tokens))
   end
 
+  def self.set_non_strict
+    @class_non_strict = true
+  end
+
+  def self.non_strict?
+    @class_non_strict == true
+  end
+
   def initialize
     refresh_tokens
   end
 
-  def
-    @
+  def set_non_strict
+    @non_strict = true
   end
 
   def extract_all_fields
@@ -41,7 +54,7 @@ class StringEater::CTokenizer
     refresh_tokens
   end
 
-  def extract_fields
+  def extract_fields(*fields)
     @token_filter = lambda do |t|
       t.opts[:extract] = fields.include?(t.name)
     end
@@ -52,76 +65,78 @@ class StringEater::CTokenizer
   def refresh_tokens
     @tokens = self.class.dup_tokens
 
-    if @token_filter
-      @tokens.each{|t| @token_filter.call(t)}
-    end
-
-    tokens_to_find = tokens.each_with_index.map do |t, i|
-      [i, t.string] if t.string
-    end.compact
+    @tokens.each { |t| @token_filter.call(t) } if @token_filter
 
-
-    @
+    tokens_to_find = gen_tokens_to_find
+    @tokens_to_find_indexes = tokens_to_find.map { |t| t[0] }
+    @tokens_to_find_strings = tokens_to_find.map { |t| t[1] }
 
-    tokens_to_extract =
-
-
-
-    @tokens_to_extract_indexes = tokens_to_extract.map{|t| t[0]}
-    @tokens_to_extract_names = tokens.map{|t| t.name}
+    tokens_to_extract = gen_tokens_to_extract
+    @tokens_to_extract_indexes = tokens_to_extract.map { |t| t[0] }
+    @tokens_to_extract_names = tokens.map { |t| t.name }
 
     @have_tokens_to_extract = (@tokens_to_extract_indexes.size > 0)
   end
 
   def describe_line
-    tokens.
-      desc << (t.string || t.name.to_s ||
+    tokens.reduce('') do |desc, t|
+      desc << (t.string || t.name.to_s || 'xxxxxx')
     end
   end
 
   def do_extra_parsing
   end
 
-
+  # Not sure this could be much more concise
+  # rubocop:disable MethodLength
+  def tokenize!(string, &block)
     @string = string
     @extracted_tokens ||= {}
     @extracted_tokens.clear
+    @non_strict ||= self.class.non_strict?
 
     return unless @have_tokens_to_extract
 
-    @extracted_tokens = ctokenize!(@string,
+    @extracted_tokens = ctokenize!(@string,
                                    @tokens_to_find_indexes,
                                    @tokens_to_find_strings,
                                    @tokens_to_extract_indexes,
-                                   @tokens_to_extract_names
+                                   @tokens_to_extract_names,
+                                   @non_strict)
 
     # extra parsing hook
     do_extra_parsing
 
-    if block_given?
-      yield @extracted_tokens
-    end
+    yield @extracted_tokens if block_given?
 
     # return self for chaining
     self
-  end
-
+  end
+  # rubocop:enable MethodLength
+
   private
 
-  def set_token_startpoint
+  def set_token_startpoint(ix, startpoint)
     @tokens[ix].breakpoints[0] = startpoint
   end
 
-  def get_token_startpoint
+  def get_token_startpoint(ix)
     @tokens[ix].breakpoints[0]
   end
 
-  def set_token_endpoint
+  def set_token_endpoint(ix, endpoint)
     @tokens[ix].breakpoints[1] = endpoint
   end
 
-  def extract_token?
+  def extract_token?(ix)
     @tokens[ix].extract?
   end
 
+  def gen_tokens_to_find
+    tokens.each_with_index.map { |t, i| [i, t.string] if t.string }.compact
+  end
+
+  def gen_tokens_to_extract
+    tokens.each_with_index.map { |t, i| [i, t.name] if t.extract? }.compact
+  end
 end
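Note the flag resolution in `tokenize!` above: `@non_strict ||= self.class.non_strict?` means a class-level `set_non_strict` is picked up lazily on the first call, while any single instance can also opt in on its own. A small sketch of both styles (the `WordTokenizer` class is hypothetical; assumes the gem is installed):

    require 'string-eater'

    class WordTokenizer < StringEater::Tokenizer
      add_field :first
      look_for ' '
      add_field :rest
      look_for "\n"   # :rest has a closing separator
    end

    strict = WordTokenizer.new
    strict.tokenize!('hello world').rest   # => nil ("\n" never found)

    lenient = WordTokenizer.new
    lenient.set_non_strict                 # this instance only
    lenient.tokenize!('hello world').rest  # => "world"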
data/lib/string-eater.rb
CHANGED
@@ -1,10 +1,12 @@
+# encoding: utf-8
+
+# Namespacing module for StringEater
 module StringEater
   autoload :Token, 'token'
-  autoload :RubyTokenizer, 'ruby-tokenizer'
-  autoload :RubyTokenizerEachCHar, 'ruby-tokenizer-each-char'
   autoload :CTokenizer, 'c-tokenizer'
 
   autoload :VERSION, 'version'
 
+  # by default, Tokenizer is the c extension tokenizer
   class Tokenizer < CTokenizer; end
 end
data/lib/token.rb
CHANGED
@@ -1,9 +1,12 @@
+# encoding: utf-8
+
+# Token class used by tokenizers
 class StringEater::Token
   attr_accessor :name, :string, :opts, :breakpoints, :children
 
   def initialize
     @opts = {}
-    @breakpoints = [nil,nil]
+    @breakpoints = [nil, nil]
   end
 
   def extract?
@@ -13,7 +16,7 @@ class StringEater::Token
   def self.new_field(name, opts)
     t = new
     t.name = name
-    t.opts = {:extract => true}.merge(opts)
+    t.opts = { extract: true }.merge(opts)
     t
   end
 
@@ -22,5 +25,4 @@ class StringEater::Token
     t.string = string
     t
   end
-
 end
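The `new_field` change above keeps the old default: the caller's options are merged over `{ extract: true }`, so a field is extracted unless `extract: false` is passed explicitly. A tiny sketch (assumes `extract?` reads `opts[:extract]`, as the tokenizers' usage suggests):

    require 'string-eater'

    city = StringEater::Token.new_field(:city, {})
    city.extract?  # => true, from the { extract: true } default

    zip = StringEater::Token.new_field(:zip, extract: false)
    zip.extract?   # => false, the caller's option wins in the merge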
data/lib/version.rb
CHANGED
@@ -1,8 +1,12 @@
+# encoding: utf-8
+
+# Extend StringEater with Version
 module StringEater
-  module VERSION
-    MAJOR = 0
-    MINOR = 2
-    PATCH = 2
+  # Version constants
+  module VERSION
+    MAJOR = 1
+    MINOR = 0
+    PATCH = 0
     PRE = nil
     STRING = [MAJOR, MINOR, PATCH, PRE].compact.join('.')
   end
data/spec/nginx_spec.rb
CHANGED
@@ -1,32 +1,44 @@
+# encoding: utf-8
+
 require 'spec_helper'
 require 'string-eater'
 
-
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'examples')))
 
 require 'nginx'
 
 describe NginxLogTokenizer do
   before(:each) do
     @tokenizer = NginxLogTokenizer.new
-    @str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] ' +
+           '"GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" ' +
+           '"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; ' +
+           'Trident/5.0)" "-" "there could be" other "stuff here"'
+    @str2 = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] ' +
+            '"GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" ' +
+            '"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; ' +
+            'WOW64; Trident/5.0)" "-"'
+  end
+
+  user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; ' +
+               'WOW64; Trident/5.0)'
+
+  {
+    ip: '73.80.217.212',
+    request: 'GET /this_is_a_url HTTP/1.1',
+    status_code: 304,
+    referrer_url: 'http://referrer.com',
+    user_agent: user_agent,
+    remainder: "\"there could be\" other \"stuff here\"",
+  }.each_pair do |token, val|
+    it "finds the right value for #{token}" do
+      @tokenizer.tokenize!(@str).send(token).should == val
+    end
+  end
+
+  it 'correctly handles there not being a remainder' do
+    @tokenizer.tokenize!(@str2).remainder.should be_nil
+  end
 
 end
data/spec/spec_helper.rb
CHANGED
data/spec/string_eater_spec.rb
CHANGED
@@ -1,193 +1,185 @@
+# encoding: utf-8
+
 require 'spec_helper'
 require 'string-eater'
 
-TestedClass = StringEater::CTokenizer
-
 describe StringEater do
-  it
-    StringEater::VERSION::STRING.split(
+  it 'has a version' do
+    StringEater::VERSION::STRING.split('.').size.should >= 3
   end
 end
 
 # normal use
-class Example1 <
+class Example1 < StringEater::CTokenizer
   add_field :first_word
-  look_for
-  add_field :second_word, :
-  look_for
+  look_for ' '
+  add_field :second_word, extract: false
+  look_for '|'
   add_field :third_word
 end
 
 describe Example1 do
-
-
-
-
-
-
-
-
+  let(:tokenizer) { Example1.new }
+  let(:first_word1) { 'foo' }
+  let(:second_word1) { 'bar' }
+  let(:third_word1) { 'baz' }
+  let(:str1) { "#{first_word1} #{second_word1}|#{third_word1}" }
+
+  describe '#extract_all_fields' do
+    it 'extracts all of the fields' do
+      tokenizer.extract_all_fields
+      tokenizer.tokenize!(str1)
+      expect(tokenizer.first_word).to eq(first_word1)
+      expect(tokenizer.second_word).to eq(second_word1)
+      expect(tokenizer.third_word).to eq(third_word1)
+    end
   end
 
-  describe
-    it
-
+  describe '#extract_no_fields' do
+    it 'does not extract any of the fields' do
+      tokenizer.extract_no_fields
+      tokenizer.tokenize!(str1)
+      tokenizer.first_word.should be_nil
+      tokenizer.second_word.should be_nil
+      tokenizer.third_word.should be_nil
     end
   end
 
-  describe
-    it
-
-
-
-
-
+  describe '#extract_fields' do
+    it 'allows us to set which fields get extracted' do
+      tokenizer.extract_fields :second_word
+      tokenizer.tokenize!(str1)
+      tokenizer.first_word.should be_nil
+      expect(tokenizer.second_word).to eq(second_word1)
+      tokenizer.third_word.should be_nil
    end
  end
 
-  describe
-    it
-
-      @tokenizer.tokenize!(@str1)
-      @tokenizer.first_word.should be_nil
-      @tokenizer.second_word.should be_nil
-      @tokenizer.third_word.should be_nil
+  describe 'tokenize!' do
+    it 'returns itself' do
+      tokenizer.tokenize!(str1).should == tokenizer
    end
-  end
 
-
-
-      @tokenizer.extract_fields :second_word
-      @tokenizer.tokenize!(@str1)
-      @tokenizer.first_word.should be_nil
-      @tokenizer.second_word.should == @second_word1
-      @tokenizer.third_word.should be_nil
+    it 'sets the first word' do
+      tokenizer.tokenize!(str1).first_word.should == 'foo'
    end
-  end
 
-
-
-      @tokenizer.tokenize!(@str1).should == @tokenizer
+    it 'sets the third word' do
+      tokenizer.tokenize!(str1).third_word.should == 'baz'
    end
 
-    it
-
+    it 'does not set the second word' do
+      tokenizer.tokenize!(str1).second_word.should be_nil
    end
 
-    it
-
+    it 'yields a hash of tokens if a block is given' do
+      tokenizer.tokenize!(str1) do |tokens|
+        tokens[:first_word].should == 'foo'
+      end
    end
 
-    it
-
+    it 'returns everything to the end of the line for the last token' do
+      s = 'c defg asdf | foo , baa'
+      tokenizer.tokenize!("a b|#{s}").third_word.should == s
    end
 
-
-
-
+    context 'when the last delimiter is missing' do
+      let(:s) { 'a b' }
+      it 'still finds the first word' do
+        expect(tokenizer.tokenize!(s).first_word).to eq('a')
      end
-    end
 
-
-
-
-    end
+      it 'returns nil for the second word' do
+        expect(tokenizer.tokenize!(s).second_word).to be_nil
+      end
 
-
-
-
-      @tokenizer.tokenize!(s).third_word.should be_nil
+      it 'returns nil for the third word' do
+        expect(tokenizer.tokenize!(s).third_word).to be_nil
+      end
    end
 
-
+    context 'when non_strict is enabled' do
+      before do
+        tokenizer.extract_all_fields
+        tokenizer.set_non_strict
+      end
+
+      context 'when the last delimiter is missing' do
+        let(:s) { 'a b' }
+        it 'still finds the first word' do
+          expect(tokenizer.tokenize!(s).first_word).to eq('a')
+        end
+
+        it 'still finds the second word' do
+          expect(tokenizer.tokenize!(s).second_word).to eq('b')
+        end
+
+        it 'returns nil for the third word' do
+          expect(tokenizer.tokenize!(s).third_word).to be_nil
+        end
+      end
+
+      context 'when the last delimiter is not missing' do
+        let(:s) { 'a b|c' }
+        it 'still finds the first word' do
+          expect(tokenizer.tokenize!(s).first_word).to eq('a')
+        end
+
+        it 'still finds the second word' do
+          expect(tokenizer.tokenize!(s).second_word).to eq('b')
+        end
 
+        it 'returns nil for the third word' do
+          expect(tokenizer.tokenize!(s).third_word).to eq('c')
+        end
+      end
+    end
+  end
 end
 
 # an example where we ignore after a certain point in the string
-class Example2 <
-  add_field :first_word, :
-  look_for
+class Example2 < StringEater::CTokenizer
+  add_field :first_word, extract: false
+  look_for ' '
   add_field :second_word
-  look_for
-  add_field :third_word, :
-  look_for
+  look_for ' '
+  add_field :third_word, extract: false
+  look_for '-'
 end
 
 describe Example2 do
+  let(:tokenizer) { Example2.new }
+  let(:second_word1) { 'bar' }
+  let(:str1) { "foo #{second_word1} baz-" }
 
-
-
-
-
-  end
-
-  describe "tokenize!" do
-    it "should find the token when there is extra stuff at the end of the string" do
-      @tokenizer.tokenize!(@str1).second_word.should == @second_word1
+  describe 'tokenize!' do
+    it 'finds the token when there is extra stuff at the' +
+       'end of the string' do
+      tokenizer.tokenize!(str1).second_word.should == second_word1
    end
  end
 
 end
 
 # an example where the split is more than one char
-class Example3 <
-  look_for
+class Example3 < StringEater::CTokenizer
+  look_for 'foo='
   add_field :foo_val
-  look_for
+  look_for '&'
 end
 
 describe Example3 do
-
-    @tokenizer = Example3.new
-  end
+  let(:tokenizer) { Example3.new }
 
-  describe
-    it
-
+  describe 'tokenize!' do
+    it 'finds the token if there is only one occurrence ' +
+       'of the characters in the separator' do
+      tokenizer.tokenize!('abcd?foo=val&blah').foo_val.should == 'val'
    end
 
-    it
-
+    it 'still works if part of the separator token occurs' do
+      tokenizer.tokenize!('abcd?foo_blah=baz&foo=bar&buh')
+        .foo_val.should == 'bar'
    end
  end
 end
-
-# CTokenizer doesn't do combine_fields because
-# writing out breakpoints is a significant slow-down
-if TestedClass.respond_to?(:combine_fields)
-  # an example where we combine fields
-  class Example3 < TestedClass
-    add_field :first_word, :extract => false
-    look_for " \""
-    add_field :part1, :extract => false
-    look_for " "
-    add_field :part2
-    look_for " "
-    add_field :part3, :extract => false
-    look_for "\""
-
-    combine_fields :from => :part1, :to => :part3, :as => :parts
-  end
-
-  describe Example3 do
-    before(:each) do
-      @tokenizer = Example3.new
-      @str1 = "foo \"bar baz bang\""
-      @part2 = "baz"
-      @parts = "bar baz bang"
-    end
-
-    it "should extract like normal" do
-      @tokenizer.tokenize!(@str1).part2.should == @part2
-    end
-
-    it "should ignore like normal" do
-      @tokenizer.tokenize!(@str1).part1.should be_nil
-    end
-
-    it "should extract the combined field" do
-      @tokenizer.tokenize!(@str1).parts.should == @parts
-    end
-
-  end
-end
metadata
CHANGED
@@ -1,15 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: string-eater
 version: !ruby/object:Gem::Version
-  version: 0.2.2
-  prerelease:
+  version: 1.0.0
 platform: ruby
 authors:
 - Dan Swain
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2014-01-05 00:00:00.000000000 Z
 dependencies: []
 description: Fast string tokenizer. Nom strings.
 email:
@@ -20,8 +19,6 @@ extensions:
 extra_rdoc_files: []
 files:
 - lib/c-tokenizer.rb
-- lib/ruby-tokenizer-each-char.rb
-- lib/ruby-tokenizer.rb
 - lib/string-eater.rb
 - lib/token.rb
 - lib/version.rb
@@ -37,28 +34,27 @@ files:
 - README.md
 homepage: http://github.com/simplifi/string-eater
 licenses: []
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 - ext/string-eater
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version:
+rubygems_version: 2.0.6
 signing_key:
-specification_version:
+specification_version: 4
 summary: Fast string tokenizer. Nom strings.
 test_files:
 - spec/nginx_spec.rb
data/lib/ruby-tokenizer-each-char.rb
DELETED
@@ -1,145 +0,0 @@
-# this tokenizer is very slow, but it illustrates the
-# basic idea of the C tokenizer
-class StringEater::RubyTokenizerEachChar
-
-  def self.tokens
-    @tokens ||= []
-  end
-
-  def self.combined_tokens
-    @combined_tokens ||= []
-  end
-
-  def self.add_field name, opts={}
-    self.tokens << StringEater::Token::new_field(name, opts)
-    define_method(name) {@extracted_tokens[name]}
-  end
-
-  def self.look_for tokens
-    self.tokens << StringEater::Token::new_separator(tokens)
-  end
-
-  def self.combine_fields opts={}
-    from_token_index = self.tokens.index{|t| t.name == opts[:from]}
-    to_token_index = self.tokens.index{|t| t.name == opts[:to]}
-    self.combined_tokens << [opts[:as], from_token_index, to_token_index]
-    define_method(opts[:as]) {@extracted_tokens[opts[:as]]}
-  end
-
-  def tokens
-    @tokens ||= self.class.tokens
-  end
-
-  def combined_tokens
-    @combined_tokens ||= self.class.combined_tokens
-  end
-
-  def refresh_tokens
-    @combined_tokens = nil
-    @tokens = nil
-    tokens
-  end
-
-  def describe_line
-    tokens.inject("") do |desc, t|
-      desc << (t.string || t.name.to_s || "xxxxxx")
-    end
-  end
-
-  def find_breakpoints string
-    tokenize!(string) unless @string == string
-    tokens.inject([]) do |bp, t|
-      bp << t.breakpoints
-      bp
-    end.flatten.uniq
-  end
-
-  def tokenize! string, &block
-    @string = string
-    @extracted_tokens ||= {}
-    @extracted_tokens.clear
-    @tokens_to_find ||= tokens.each_with_index.map do |t, i|
-      [i, t.string] if t.string
-    end.compact
-    @tokens_to_extract_indeces ||= tokens.each_with_index.map do |t, i|
-      i if t.extract?
-    end.compact
-
-    tokens.first.breakpoints[0] = 0
-
-    find_index = 0
-
-    curr_token = @tokens_to_find[find_index]
-    curr_token_index = curr_token[0]
-    curr_token_length = curr_token[1].length
-    looking_for_index = 0
-    looking_for = curr_token[1][looking_for_index]
-
-    counter = 0
-    string.each_char do |c|
-      if c == looking_for
-        if looking_for_index == 0
-          # entering new token
-          if curr_token_index > 0
-            t = tokens[curr_token_index - 1]
-            t.breakpoints[1] = counter
-            if t.extract?
-              @extracted_tokens[t.name] = string[t.breakpoints[0]...t.breakpoints[1]]
-            end
-          end
-          tokens[curr_token_index].breakpoints[0] = counter
-        end
-        if looking_for_index >= (curr_token_length - 1)
-          # leaving token
-          tokens[curr_token_index].breakpoints[1] = counter
-
-          if curr_token_index >= tokens.size-1
-            # we're done!
-            break
-          else
-            tokens[curr_token_index + 1].breakpoints[0] = counter + 1
-          end
-
-          # next token
-          find_index += 1
-          if find_index >= @tokens_to_find.length
-            # we're done!
-            break
-          end
-          curr_token = @tokens_to_find[find_index]
-          curr_token_index = curr_token[0]
-          curr_token_length = curr_token[1].length
-          looking_for_index = 0
-        else
-          looking_for_index += 1
-        end
-      end
-      looking_for = curr_token[1][looking_for_index]
-      counter += 1
-    end
-
-    last_token = tokens.last
-    last_token.breakpoints[1] = string.length
-
-    if last_token.extract?
-      @extracted_tokens[last_token.name] = string[last_token.breakpoints[0]..last_token.breakpoints[1]]
-    end
-
-    combined_tokens.each do |combiner|
-      name = combiner[0]
-      from = @tokens[combiner[1]].breakpoints[0]
-      to = @tokens[combiner[2]].breakpoints[1]
-      @extracted_tokens[name] = string[from...to]
-    end
-
-    if block_given?
-      yield @extracted_tokens
-    end
-
-    # return self for chaining
-    self
-  end
-
-end
-
-
data/lib/ruby-tokenizer.rb
DELETED
@@ -1,98 +0,0 @@
-# this tokenizer is fairly fast, but not necessarily faster than regexps
-class StringEater::RubyTokenizer
-  def self.tokens
-    @tokens ||= []
-  end
-
-  def self.combined_tokens
-    @combined_tokens ||= []
-  end
-
-  def self.add_field name, opts={}
-    self.tokens << StringEater::Token::new_field(name, opts)
-    define_method(name) {@extracted_tokens[name]}
-  end
-
-  def self.look_for tokens
-    self.tokens << StringEater::Token::new_separator(tokens)
-  end
-
-  def self.combine_fields opts={}
-    from_token_index = self.tokens.index{|t| t.name == opts[:from]}
-    to_token_index = self.tokens.index{|t| t.name == opts[:to]}
-    self.combined_tokens << [opts[:as], from_token_index, to_token_index]
-    define_method(opts[:as]) {@extracted_tokens[opts[:as]]}
-  end
-
-  def tokens
-    @tokens ||= self.class.tokens
-  end
-
-  def combined_tokens
-    @combined_tokens ||= self.class.combined_tokens
-  end
-
-  def refresh_tokens
-    @combined_tokens = nil
-    @tokens = nil
-    tokens
-  end
-
-  def describe_line
-    tokens.inject("") do |desc, t|
-      desc << (t.string || t.name.to_s || "xxxxxx")
-    end
-  end
-
-  def find_breakpoints(string)
-    @literal_tokens ||= tokens.select{|t| t.string}
-    @breakpoints ||= Array.new(2*@literal_tokens.size + 2)
-    @breakpoints[0] = 0
-    @breakpoints[-1] = string.length
-    start_point = 0
-    @literal_tokens.each_with_index do |t, i|
-      @breakpoints[2*i+1], start_point = find_end_of(t, string, start_point)
-      @breakpoints[2*i+2] = start_point
-    end
-    @breakpoints
-  end
-
-  def tokenize! string, &block
-    @extracted_tokens ||= {}
-    @extracted_tokens.clear
-    @tokens_to_extract ||= tokens.select{|t| t.extract?}
-
-    find_breakpoints(string)
-    last_important_bp = [@breakpoints.length, tokens.size].min
-    (0...last_important_bp).each do |i|
-      tokens[i].breakpoints = [@breakpoints[i], @breakpoints[i+1]]
-    end
-
-    @tokens_to_extract.each do |t|
-      @extracted_tokens[t.name] = string[t.breakpoints[0]...t.breakpoints[1]]
-    end
-
-    combined_tokens.each do |combiner|
-      name = combiner[0]
-      from = @tokens[combiner[1]].breakpoints[0]
-      to = @tokens[combiner[2]].breakpoints[1]
-      @extracted_tokens[name] = string[from...to]
-    end
-
-    if block_given?
-      yield @extracted_tokens
-    end
-
-    # return self for chaining
-    self
-  end
-
-  protected
-
-  def find_end_of token, string, start_at
-    start = string.index(token.string, start_at+1) || string.length
-    [start, [start + token.string.length, string.length].min]
-  end
-
-end
-