string-eater 0.2.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
+ ---
+ !binary "U0hBMQ==":
+ metadata.gz: !binary |-
+ ZmU1MmM0ZDY2MjQ4ZWM5MmFjN2E0YzVlODJkYWIwOWFlZDYxYzYyOQ==
+ data.tar.gz: !binary |-
+ NjhjZGQ1ZDQwZDVjNmE0MjZjM2Q5YTljNjAwNzFhNTJhNmE2ZmFmNw==
+ !binary "U0hBNTEy":
+ metadata.gz: !binary |-
+ YTg0NzZjZTFkYzhjNWVhMzE2YjNjMzQ4N2RlNWYzYTI1NWM5MTE1MjE4NGEw
+ ODdmNjRiYWZiODVmZGY0ZmI1MTk5MmZiZGMyYTBhNTRjODZjOGM2ODRiYjM5
+ MWMwMzJmNGVlOTAyYTI2YmY0NzM4MTEwNDM3NjI1MTE1ZmRmNDU=
+ data.tar.gz: !binary |-
+ NzQ2NTdlYjAzY2NiMWIzYTRkYTI1NGFhZjgxOWY0YjgxYzk4ZDkyMGU3MDAw
+ YmQ5YjQzNDAzNGViOGJmYjFmOGI1MDIyNGI2OWNiZGVhN2ZkNWJjYTYzNTBh
+ YWYzZWRiYjE4ODA3YjI1ZmM4NWExZmI2ZmJmMzljMDA1Nzc0ZGY=
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
- Copyright (c) 2012 Dan Swain
+ Copyright (c) 2012 Simpli.fi
 
  MIT License
 
data/README.md CHANGED
@@ -87,25 +87,41 @@ We can also do something like this:
  For another example, see `examples/nginx.rb`, which defines an
  [nginx](http://nginx.org) log line tokenizer.
 
- ## Implementation
+ ## Non-strict usage
+
+ Use `set_non_strict` to indicate that separator finding should be
+ non-strict. This means that if the tokenizer fails to find a
+ separator before finishing a string, it will fill in the last token
+ with the remainder of the string. Normally (i.e., strict usage), the
+ token whose closing character was not found is left nil.
+
+ Example:
+
+ class PersonTokenizer < StringEater::Tokenizer
+ add_field :last_name
+ look_for ", "
+ add_field :first_name, :extract => false
+ look_for " | "
+ add_field :street_address, :extract => false
+ look_for ", "
+ add_field :city
+ look_for ", "
+ add_field :state
+ look_for ", "
+ set_non_strict
+ end
 
- There are actually three tokenizer algorithms provided here. The
- three algorithms should be interchangeable.
+ tokenizer = PersonTokenizer.new
+ string = "Flinstone, Fred | 301 Cobblestone Way, Bedrock"
+ tokenizer.tokenize! string
 
- 1. `StringEater::CTokenizer` - A C extension implementation. The
- fastest of the three. This is the default implementation for
- `StringEater::Tokenizer`.
+ puts tokenizer.last_name # => "Flinstone"
+ puts tokenizer.city # => "Bedrock" (if strict, would be nil)
 
- 2. `StringEater::RubyTokenizer` - A pure-Ruby implementation. This is
- a slightly different implementation of the algorithm - an
- implementation that is faster on Ruby than a translation of the C
- algorithm. Probably not as fast (or not much faster) than using
- Ruby regular expressions.
+ Non-strict can also be set on an instance tokenizer,
+ i.e., call `tokenizer.set_non_strict` to make `tokenizer` non-strict.
 
- 3. `StringEater::RubyTokenizerEachChar` - A pure-Ruby implementation.
- This is essentially the same as the C implementation, but written
- in pure Ruby. It uses `String#each_char` and is therefore VERY
- SLOW! It provides a good way to hack the algorithm, though.
+ ## Implementation
 
  The main algorithm works by finding the start and end points of tokens
  in a string. The search is done incrementally (i.e., loop through the
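A minimal sketch of the instance-level variant mentioned in the README text above. It assumes a PersonTokenizer class defined exactly as in the README example but without the class-level `set_non_strict` line; the input and expected output mirror the README's own non-strict example:

    tokenizer = PersonTokenizer.new
    tokenizer.set_non_strict  # per-instance non-strict; the class itself stays strict

    # the trailing ", <state>" separator is missing from this input
    tokenizer.tokenize! "Flinstone, Fred | 301 Cobblestone Way, Bedrock"

    puts tokenizer.last_name # => "Flinstone"
    puts tokenizer.city      # => "Bedrock" (a strict tokenizer would leave this nil)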
data/examples/address.rb CHANGED
@@ -1,35 +1,40 @@
+ # encoding: utf-8
+
  # once the gem is installed, you don't need this
- $: << File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
- $: << File.expand_path(File.join(File.dirname(__FILE__), '..', 'ext/string-eater'))
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+ '..', 'lib')))
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+ '..', 'ext/string-eater')))
 
  # this is the example from the README
  require 'string-eater'
 
- class PersonTokenizer < StringEater::Tokenizer
- add_field :last_name
- look_for ", "
- add_field :first_name, :extract => false
- look_for " | "
- add_field :street_address, :extract => false
- look_for ", "
+ # example tokenizer for addresses
+ class PersonTokenizer < StringEater::Tokenizer
+ add_field :last_name
+ look_for ', '
+ add_field :first_name, extract: false
+ look_for ' | '
+ add_field :street_address, extract: false
+ look_for ', '
  add_field :city
- look_for ", "
- add_field :state
- look_for ", "
+ look_for ', '
+ add_field :state
+ look_for ', '
  end
 
- if __FILE__ == $0
+ if __FILE__ == $PROGRAM_NAME
  tokenizer = PersonTokenizer.new
  puts tokenizer.describe_line
 
- string = "Flinstone, Fred | 301 Cobblestone Way, Bedrock, NA, 00000"
+ string = 'Flinstone, Fred | 301 Cobblestone Way, Bedrock, NA, 00000'
  tokenizer.tokenize! string
 
- puts tokenizer.last_name # => "Flinestone"
- puts tokenizer.city # => "Bedrock"
+ puts tokenizer.last_name # => "Flinestone"
+ puts tokenizer.city # => "Bedrock"
  puts tokenizer.state # => "NA"
 
- tokenizer.tokenize!(string) do |tokens|
+ tokenizer.tokenize!(string) do |tokens|
  puts "The #{tokens[:last_name]}s live in #{tokens[:city]}"
  end
  end
data/examples/nginx.rb CHANGED
@@ -1,27 +1,32 @@
+ # encoding: utf-8
+
  # once the gem is installed, you don't need this
- $: << File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
- $: << File.expand_path(File.join(File.dirname(__FILE__), '..', 'ext/string-eater'))
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+ '..', 'lib')))
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+ '..', 'ext/string-eater')))
 
  require 'string-eater'
 
+ # Example tokenizer for nginx log lines
  class NginxLogTokenizer < StringEater::CTokenizer
  add_field :ip
- look_for " - "
- add_field :remote_user, :extract => false
- look_for " ["
- add_field :timestamp, :extract => false
+ look_for ' - '
+ add_field :remote_user, extract: false
+ look_for ' ['
+ add_field :timestamp, extract: false
  look_for "] \""
  add_field :request
  look_for "\" "
  add_field :status_code
- look_for " "
- add_field :bytes_sent, :extract => false
+ look_for ' '
+ add_field :bytes_sent, extract: false
  look_for " \""
  add_field :referrer_url
  look_for "\" \""
  add_field :user_agent
  look_for "\" \""
- add_field :compression, :extract => false
+ add_field :compression, extract: false
  look_for "\" "
  add_field :remainder
 
@@ -47,14 +52,17 @@ class NginxLogTokenizer < StringEater::CTokenizer
  end
  end
 
- if __FILE__ == $0
+ if __FILE__ == $PROGRAM_NAME
  tokenizer = NginxLogTokenizer.new
  puts tokenizer.describe_line
 
- str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] "GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)" "-" "there could be" other "stuff here"'
+ str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] ' +
+ '"GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" ' +
+ '"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; ' +
+ 'Trident/5.0)" "-" "there could be" other "stuff here"'
 
- puts "input string: " + str
- puts "Tokens: "
+ puts 'input string: ' + str
+ puts 'Tokens: '
 
  # use a block to work with the extracted tokens
  tokenizer.tokenize!(str) do |tokens|
@@ -10,11 +10,12 @@ static VALUE rb_cCTokenizer;
  static VALUE rb_mStringEater;
 
  static VALUE tokenize_string(VALUE self,
- VALUE string,
- VALUE tokens_to_find_indexes,
- VALUE tokens_to_find_strings,
- VALUE tokens_to_extract_indexes,
- VALUE tokens_to_extract_names)
+ VALUE string,
+ VALUE tokens_to_find_indexes,
+ VALUE tokens_to_find_strings,
+ VALUE tokens_to_extract_indexes,
+ VALUE tokens_to_extract_names,
+ VALUE non_strict)
  {
  const char* input_string = StringValueCStr(string);
  VALUE extracted_tokens = rb_hash_new();
@@ -115,6 +116,19 @@ static VALUE tokenize_string(VALUE self,
  }
  }
 
+ /*
+ got to the end of the string
+ and have an incomplete token
+ and not strict
+ */
+ if(ix == str_len && curr_token_ix < n_tokens && RTEST(non_strict))
+ {
+ rb_hash_aset(extracted_tokens,
+ rb_ary_entry(tokens_to_extract_names, curr_token_ix - 1),
+ rb_usascii_str_new(input_string + startpoint,
+ str_len - startpoint));
+ }
+
  curr_token_ix = n_tokens - 1;
 
  if(ix < str_len && curr_token_ix == next_token_to_extract_ix)
@@ -139,7 +153,7 @@ void Init_c_tokenizer_ext(void)
  rb_cCTokenizer = rb_define_class_under(rb_mStringEater,
  "CTokenizer", rb_cObject);
 
- rb_define_method(rb_cCTokenizer, "ctokenize!", tokenize_string, 5);
+ rb_define_method(rb_cCTokenizer, "ctokenize!", tokenize_string, 6);
 
  /* set the callback for when the extension is unloaded */
  rb_set_end_proc(finalize_c_tokenizer_ext, 0);
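The new branch added to the C extension above is the heart of the non-strict behaviour: when the scan reaches the end of the string with a token still open and `non_strict` is truthy, the open token is filled with the remainder of the string. As a reading aid only, a rough Ruby rendering of that branch (variable names mirror the C locals; this code is not part of the gem):

    # end of string reached, a token still open, and non-strict enabled
    if ix == str_len && curr_token_ix < n_tokens && non_strict
      extracted_tokens[tokens_to_extract_names[curr_token_ix - 1]] =
        input_string[startpoint, str_len - startpoint]
    end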
@@ -1,2 +1,3 @@
+ # encoding: utf-8
  require 'mkmf'
  create_makefile('c_tokenizer_ext')
data/lib/c-tokenizer.rb CHANGED
@@ -1,17 +1,22 @@
+ # encoding: utf-8
+
  require 'c_tokenizer_ext'
 
+ # Ruby interface to the c extension
  class StringEater::CTokenizer
+ attr_reader :tokens
+
  def self.tokens
  @tokens ||= []
  end
 
- def self.add_field name, opts={}
- self.tokens << StringEater::Token::new_field(name, opts)
- define_method(name) {@extracted_tokens[name]}
+ def self.add_field(name, opts = {})
+ tokens << StringEater::Token.new_field(name, opts)
+ define_method(name) { @extracted_tokens[name] }
  end
 
- def self.look_for tokens
- self.tokens << StringEater::Token::new_separator(tokens)
+ def self.look_for(look_for_tokens)
+ tokens << StringEater::Token.new_separator(look_for_tokens)
  end
 
  # This is very slow, only do it when necessary
@@ -19,12 +24,20 @@ class StringEater::CTokenizer
  Marshal.load(Marshal.dump(tokens))
  end
 
+ def self.set_non_strict
+ @class_non_strict = true
+ end
+
+ def self.non_strict?
+ @class_non_strict == true
+ end
+
  def initialize
  refresh_tokens
  end
 
- def tokens
- @tokens
+ def set_non_strict
+ @non_strict = true
  end
 
  def extract_all_fields
@@ -41,7 +54,7 @@ class StringEater::CTokenizer
  refresh_tokens
  end
 
- def extract_fields *fields
+ def extract_fields(*fields)
  @token_filter = lambda do |t|
  t.opts[:extract] = fields.include?(t.name)
  end
@@ -52,76 +65,78 @@ class StringEater::CTokenizer
  def refresh_tokens
  @tokens = self.class.dup_tokens
 
- if @token_filter
- @tokens.each{|t| @token_filter.call(t)}
- end
-
- tokens_to_find = tokens.each_with_index.map do |t, i|
- [i, t.string] if t.string
- end.compact
+ @tokens.each { |t| @token_filter.call(t) } if @token_filter
 
- @tokens_to_find_indexes = tokens_to_find.map{|t| t[0]}
- @tokens_to_find_strings = tokens_to_find.map{|t| t[1]}
+ tokens_to_find = gen_tokens_to_find
+ @tokens_to_find_indexes = tokens_to_find.map { |t| t[0] }
+ @tokens_to_find_strings = tokens_to_find.map { |t| t[1] }
 
- tokens_to_extract = tokens.each_with_index.map do |t, i|
- [i, t.name] if t.extract?
- end.compact
-
- @tokens_to_extract_indexes = tokens_to_extract.map{|t| t[0]}
- @tokens_to_extract_names = tokens.map{|t| t.name}
+ tokens_to_extract = gen_tokens_to_extract
+ @tokens_to_extract_indexes = tokens_to_extract.map { |t| t[0] }
+ @tokens_to_extract_names = tokens.map { |t| t.name }
 
  @have_tokens_to_extract = (@tokens_to_extract_indexes.size > 0)
  end
 
  def describe_line
- tokens.inject("") do |desc, t|
- desc << (t.string || t.name.to_s || "xxxxxx")
+ tokens.reduce('') do |desc, t|
+ desc << (t.string || t.name.to_s || 'xxxxxx')
  end
  end
 
  def do_extra_parsing
  end
 
- def tokenize! string, &block
+ # Not sure this could be much more concise
+ # rubocop:disable MethodLength
+ def tokenize!(string, &block)
  @string = string
  @extracted_tokens ||= {}
  @extracted_tokens.clear
+ @non_strict ||= self.class.non_strict?
 
  return unless @have_tokens_to_extract
 
- @extracted_tokens = ctokenize!(@string,
+ @extracted_tokens = ctokenize!(@string,
  @tokens_to_find_indexes,
  @tokens_to_find_strings,
  @tokens_to_extract_indexes,
- @tokens_to_extract_names)
+ @tokens_to_extract_names,
+ @non_strict)
 
  # extra parsing hook
  do_extra_parsing
 
- if block_given?
- yield @extracted_tokens
- end
+ yield @extracted_tokens if block_given?
 
  # return self for chaining
  self
- end
-
+ end
+ # rubocop:enable MethodLength
+
  private
 
- def set_token_startpoint ix, startpoint
+ def set_token_startpoint(ix, startpoint)
  @tokens[ix].breakpoints[0] = startpoint
  end
 
- def get_token_startpoint ix
+ def get_token_startpoint(ix)
  @tokens[ix].breakpoints[0]
  end
 
- def set_token_endpoint ix, endpoint
+ def set_token_endpoint(ix, endpoint)
  @tokens[ix].breakpoints[1] = endpoint
  end
 
- def extract_token? ix
+ def extract_token?(ix)
  @tokens[ix].extract?
  end
 
+ def gen_tokens_to_find
+ tokens.each_with_index.map { |t, i| [i, t.string] if t.string }.compact
+ end
+
+ def gen_tokens_to_extract
+ tokens.each_with_index.map { |t, i| [i, t.name] if t.extract? }.compact
+ end
  end
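A note on the interplay of the two `set_non_strict` methods added above: `tokenize!` does `@non_strict ||= self.class.non_strict?`, so an instance ends up non-strict if either its class called `set_non_strict` in the class body or the instance itself did; otherwise it stays strict. A short sketch under that reading (`WordsTokenizer` is a made-up name; its fields mirror `Example1` from the spec further down, and the expected results match that spec):

    class WordsTokenizer < StringEater::CTokenizer
      add_field :first_word
      look_for ' '
      add_field :second_word
      look_for '|'
      add_field :third_word
    end

    strict = WordsTokenizer.new
    loose  = WordsTokenizer.new
    loose.set_non_strict                 # only this instance becomes non-strict

    strict.tokenize!('a b').second_word  # => nil ('|' is never found)
    loose.tokenize!('a b').second_word   # => "b" (filled with the remainder)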
data/lib/string-eater.rb CHANGED
@@ -1,10 +1,12 @@
+ # encoding: utf-8
+
+ # Namespacing module for StringEater
  module StringEater
  autoload :Token, 'token'
- autoload :RubyTokenizer, 'ruby-tokenizer'
- autoload :RubyTokenizerEachCHar, 'ruby-tokenizer-each-char'
  autoload :CTokenizer, 'c-tokenizer'
 
  autoload :VERSION, 'version'
 
+ # by default, Tokenizer is the c extension tokenizer
  class Tokenizer < CTokenizer; end
  end
data/lib/token.rb CHANGED
@@ -1,9 +1,12 @@
+ # encoding: utf-8
+
+ # Token class used by tokenizers
  class StringEater::Token
  attr_accessor :name, :string, :opts, :breakpoints, :children
 
  def initialize
  @opts = {}
- @breakpoints = [nil,nil]
+ @breakpoints = [nil, nil]
  end
 
  def extract?
@@ -13,7 +16,7 @@ class StringEater::Token
  def self.new_field(name, opts)
  t = new
  t.name = name
- t.opts = {:extract => true}.merge(opts)
+ t.opts = { extract: true }.merge(opts)
  t
  end
 
@@ -22,5 +25,4 @@ class StringEater::Token
  t.string = string
  t
  end
-
  end
data/lib/version.rb CHANGED
@@ -1,8 +1,12 @@
+ # encoding: utf-8
+
+ # Extend StringEater with Version
  module StringEater
- module VERSION
- MAJOR = 0
- MINOR = 2
- PATCH = 2
+ # Version constants
+ module VERSION
+ MAJOR = 1
+ MINOR = 0
+ PATCH = 0
  PRE = nil
  STRING = [MAJOR, MINOR, PATCH, PRE].compact.join('.')
  end
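For reference, a quick worked example of how the constants above assemble into the released version string (not part of the gem source):

    [1, 0, 0, nil].compact.join('.')    # => "1.0.0"
    [1, 0, 0, 'rc1'].compact.join('.')  # => "1.0.0.rc1" (how a PRE value would appear)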
data/spec/nginx_spec.rb CHANGED
@@ -1,32 +1,44 @@
+ # encoding: utf-8
+
  require 'spec_helper'
  require 'string-eater'
 
- $: << File.expand_path(File.join(File.dirname(__FILE__), '..', 'examples'))
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+ '..', 'examples')))
 
  require 'nginx'
 
  describe NginxLogTokenizer do
  before(:each) do
  @tokenizer = NginxLogTokenizer.new
- @str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] "GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)" "-" "there could be" other "stuff here"'
- @str2 = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] "GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)" "-"'
- end
-
- {
- :ip => "73.80.217.212",
- :request => "GET /this_is_a_url HTTP/1.1",
- :status_code => 304,
- :referrer_url => "http://referrer.com",
- :user_agent => "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
- :remainder => "\"there could be\" other \"stuff here\"",
- }.each_pair do |token,val|
- it "should find the right value for #{token}" do
- @tokenizer.tokenize!(@str).send(token).should == val
- end
- end
-
- it "should handle there not being a remainder correctly" do
- @tokenizer.tokenize!(@str2).remainder.should be_nil
- end
+ @str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] ' +
+ '"GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" ' +
+ '"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; ' +
+ 'Trident/5.0)" "-" "there could be" other "stuff here"'
+ @str2 = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] ' +
+ '"GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" ' +
+ '"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; ' +
+ 'WOW64; Trident/5.0)" "-"'
+ end
+
+ user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; ' +
+ 'WOW64; Trident/5.0)'
+
+ {
+ ip: '73.80.217.212',
+ request: 'GET /this_is_a_url HTTP/1.1',
+ status_code: 304,
+ referrer_url: 'http://referrer.com',
+ user_agent: user_agent,
+ remainder: "\"there could be\" other \"stuff here\"",
+ }.each_pair do |token, val|
+ it "finds the right value for #{token}" do
+ @tokenizer.tokenize!(@str).send(token).should == val
+ end
+ end
+
+ it 'correctly handles there not being a remainder' do
+ @tokenizer.tokenize!(@str2).remainder.should be_nil
+ end
 
  end
data/spec/spec_helper.rb CHANGED
@@ -1 +1,2 @@
+ # encoding: utf-8
  $LOAD_PATH.concat %w[./lib ./ext/string-eater]
@@ -1,193 +1,185 @@
+ # encoding: utf-8
+
  require 'spec_helper'
  require 'string-eater'
 
- TestedClass = StringEater::CTokenizer
-
  describe StringEater do
- it "should have a version" do
- StringEater::VERSION::STRING.split(".").size.should >= 3
+ it 'has a version' do
+ StringEater::VERSION::STRING.split('.').size.should >= 3
  end
  end
 
  # normal use
- class Example1 < TestedClass
+ class Example1 < StringEater::CTokenizer
  add_field :first_word
- look_for " "
- add_field :second_word, :extract => false
- look_for "|"
+ look_for ' '
+ add_field :second_word, extract: false
+ look_for '|'
  add_field :third_word
  end
 
  describe Example1 do
-
- before(:each) do
- @tokenizer = Example1.new
- @str1 = "foo bar|baz"
- @first_word1 = "foo"
- @second_word1 = "bar"
- @third_word1 = "baz"
- @bp1 = [0, 3,4,7,8,11]
+ let(:tokenizer) { Example1.new }
+ let(:first_word1) { 'foo' }
+ let(:second_word1) { 'bar' }
+ let(:third_word1) { 'baz' }
+ let(:str1) { "#{first_word1} #{second_word1}|#{third_word1}" }
+
+ describe '#extract_all_fields' do
+ it 'extracts all of the fields' do
+ tokenizer.extract_all_fields
+ tokenizer.tokenize!(str1)
+ expect(tokenizer.first_word).to eq(first_word1)
+ expect(tokenizer.second_word).to eq(second_word1)
+ expect(tokenizer.third_word).to eq(third_word1)
+ end
  end
 
- describe "find_breakpoints" do
- it "should return an array of the breakpoints" do
- @tokenizer.find_breakpoints(@str1).should == @bp1 if @tokenizer.respond_to?(:find_breakpoints)
+ describe '#extract_no_fields' do
+ it 'does not extract any of the fields' do
+ tokenizer.extract_no_fields
+ tokenizer.tokenize!(str1)
+ tokenizer.first_word.should be_nil
+ tokenizer.second_word.should be_nil
+ tokenizer.third_word.should be_nil
  end
  end
 
- describe "#extract_all_fields" do
- it "should extract all of the fields" do
- @tokenizer.extract_all_fields
- @tokenizer.tokenize!(@str1)
- @tokenizer.first_word.should == @first_word1
- @tokenizer.second_word.should == @second_word1
- @tokenizer.third_word.should == @third_word1
+ describe '#extract_fields' do
+ it 'allows us to set which fields get extracted' do
+ tokenizer.extract_fields :second_word
+ tokenizer.tokenize!(str1)
+ tokenizer.first_word.should be_nil
+ expect(tokenizer.second_word).to eq(second_word1)
+ tokenizer.third_word.should be_nil
  end
  end
 
- describe "#extract_no_fields" do
- it "should not extract any of the fields" do
- @tokenizer.extract_no_fields
- @tokenizer.tokenize!(@str1)
- @tokenizer.first_word.should be_nil
- @tokenizer.second_word.should be_nil
- @tokenizer.third_word.should be_nil
+ describe 'tokenize!' do
+ it 'returns itself' do
+ tokenizer.tokenize!(str1).should == tokenizer
  end
- end
 
- describe "#extract_fields" do
- it "should allow us to set which fields get extracted" do
- @tokenizer.extract_fields :second_word
- @tokenizer.tokenize!(@str1)
- @tokenizer.first_word.should be_nil
- @tokenizer.second_word.should == @second_word1
- @tokenizer.third_word.should be_nil
+ it 'sets the first word' do
+ tokenizer.tokenize!(str1).first_word.should == 'foo'
  end
- end
 
- describe "tokenize!" do
- it "should return itself" do
- @tokenizer.tokenize!(@str1).should == @tokenizer
+ it 'sets the third word' do
+ tokenizer.tokenize!(str1).third_word.should == 'baz'
  end
 
- it "should set the first word" do
- @tokenizer.tokenize!(@str1).first_word.should == "foo"
+ it 'does not set the second word' do
+ tokenizer.tokenize!(str1).second_word.should be_nil
  end
 
- it "should set the third word" do
- @tokenizer.tokenize!(@str1).third_word.should == "baz"
+ it 'yields a hash of tokens if a block is given' do
+ tokenizer.tokenize!(str1) do |tokens|
+ tokens[:first_word].should == 'foo'
+ end
  end
 
- it "should not set the second word" do
- @tokenizer.tokenize!(@str1).second_word.should be_nil
+ it 'returns everything to the end of the line for the last token' do
+ s = 'c defg asdf | foo , baa'
+ tokenizer.tokenize!("a b|#{s}").third_word.should == s
  end
 
- it "should yield a hash of tokens if a block is given" do
- @tokenizer.tokenize!(@str1) do |tokens|
- tokens[:first_word].should == "foo"
+ context 'when the last delimiter is missing' do
+ let(:s) { 'a b' }
+ it 'still finds the first word' do
+ expect(tokenizer.tokenize!(s).first_word).to eq('a')
  end
- end
 
- it "should return everything to the end of the line for the last token" do
- s = "c defg asdf | foo , baa"
- @tokenizer.tokenize!("a b|#{s}").third_word.should == s
- end
+ it 'returns nil for the second word' do
+ expect(tokenizer.tokenize!(s).second_word).to be_nil
+ end
 
- it "should work if the last delimeter is missing and the second-to-last field is not used" do
- s = "a b"
- # @tokenizer.extract_all_fields
- @tokenizer.tokenize!(s).third_word.should be_nil
+ it 'returns nil for the third word' do
+ expect(tokenizer.tokenize!(s).third_word).to be_nil
+ end
  end
 
- end
+ context 'when non_strict is enabled' do
+ before do
+ tokenizer.extract_all_fields
+ tokenizer.set_non_strict
+ end
+
+ context 'when the last delimiter is missing' do
+ let(:s) { 'a b' }
+ it 'still finds the first word' do
+ expect(tokenizer.tokenize!(s).first_word).to eq('a')
+ end
+
+ it 'still finds the second word' do
+ expect(tokenizer.tokenize!(s).second_word).to eq('b')
+ end
+
+ it 'returns nil for the third word' do
+ expect(tokenizer.tokenize!(s).third_word).to be_nil
+ end
+ end
+
+ context 'when the last delimiter is not missing' do
+ let(:s) { 'a b|c' }
+ it 'still finds the first word' do
+ expect(tokenizer.tokenize!(s).first_word).to eq('a')
+ end
+
+ it 'still finds the second word' do
+ expect(tokenizer.tokenize!(s).second_word).to eq('b')
+ end
 
+ it 'returns nil for the third word' do
+ expect(tokenizer.tokenize!(s).third_word).to eq('c')
+ end
+ end
+ end
+ end
  end
 
  # an example where we ignore after a certain point in the string
- class Example2 < TestedClass
- add_field :first_word, :extract => false
- look_for " "
+ class Example2 < StringEater::CTokenizer
+ add_field :first_word, extract: false
+ look_for ' '
  add_field :second_word
- look_for " "
- add_field :third_word, :extract => false
- look_for "-"
+ look_for ' '
+ add_field :third_word, extract: false
+ look_for '-'
  end
 
  describe Example2 do
+ let(:tokenizer) { Example2.new }
+ let(:second_word1) { 'bar' }
+ let(:str1) { "foo #{second_word1} baz-" }
 
- before(:each) do
- @tokenizer = Example2.new
- @str1 = "foo bar baz-"
- @second_word1 = "bar"
- end
-
- describe "tokenize!" do
- it "should find the token when there is extra stuff at the end of the string" do
- @tokenizer.tokenize!(@str1).second_word.should == @second_word1
+ describe 'tokenize!' do
+ it 'finds the token when there is extra stuff at the' +
+ 'end of the string' do
+ tokenizer.tokenize!(str1).second_word.should == second_word1
  end
  end
 
  end
 
  # an example where the split is more than one char
- class Example3 < TestedClass
- look_for "foo="
+ class Example3 < StringEater::CTokenizer
+ look_for 'foo='
  add_field :foo_val
- look_for "&"
+ look_for '&'
  end
 
  describe Example3 do
- before(:each) do
- @tokenizer = Example3.new
- end
+ let(:tokenizer) { Example3.new }
 
- describe "tokenize!" do
- it "should find the token if there is only one occurrence of the characters in the separator" do
- @tokenizer.tokenize!("abcd?foo=val&blah").foo_val.should == "val"
+ describe 'tokenize!' do
+ it 'finds the token if there is only one occurrence ' +
+ 'of the characters in the separator' do
+ tokenizer.tokenize!('abcd?foo=val&blah').foo_val.should == 'val'
  end
 
- it "should still work if part of the separator token occurs" do
- @tokenizer.tokenize!("abcd?foo_blah=baz&foo=bar&buh").foo_val.should == "bar"
+ it 'still works if part of the separator token occurs' do
+ tokenizer.tokenize!('abcd?foo_blah=baz&foo=bar&buh')
+ .foo_val.should == 'bar'
  end
  end
  end
-
- # CTokenizer doesn't do combine_fields because
- # writing out breakpoints is a significant slow-down
- if TestedClass.respond_to?(:combine_fields)
- # an example where we combine fields
- class Example3 < TestedClass
- add_field :first_word, :extract => false
- look_for " \""
- add_field :part1, :extract => false
- look_for " "
- add_field :part2
- look_for " "
- add_field :part3, :extract => false
- look_for "\""
-
- combine_fields :from => :part1, :to => :part3, :as => :parts
- end
-
- describe Example3 do
- before(:each) do
- @tokenizer = Example3.new
- @str1 = "foo \"bar baz bang\""
- @part2 = "baz"
- @parts = "bar baz bang"
- end
-
- it "should extract like normal" do
- @tokenizer.tokenize!(@str1).part2.should == @part2
- end
-
- it "should ignore like normal" do
- @tokenizer.tokenize!(@str1).part1.should be_nil
- end
-
- it "should extract the combined field" do
- @tokenizer.tokenize!(@str1).parts.should == @parts
- end
-
- end
- end
metadata CHANGED
@@ -1,15 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: string-eater
  version: !ruby/object:Gem::Version
- version: 0.2.2
- prerelease:
+ version: 1.0.0
  platform: ruby
  authors:
  - Dan Swain
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-11-30 00:00:00.000000000 Z
+ date: 2014-01-05 00:00:00.000000000 Z
  dependencies: []
  description: Fast string tokenizer. Nom strings.
  email:
@@ -20,8 +19,6 @@ extensions:
  extra_rdoc_files: []
  files:
  - lib/c-tokenizer.rb
- - lib/ruby-tokenizer-each-char.rb
- - lib/ruby-tokenizer.rb
  - lib/string-eater.rb
  - lib/token.rb
  - lib/version.rb
@@ -37,28 +34,27 @@ files:
  - README.md
  homepage: http://github.com/simplifi/string-eater
  licenses: []
+ metadata: {}
  post_install_message:
  rdoc_options: []
  require_paths:
  - lib
  - ext/string-eater
  required_ruby_version: !ruby/object:Gem::Requirement
- none: false
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
  required_rubygems_version: !ruby/object:Gem::Requirement
- none: false
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 1.8.24
+ rubygems_version: 2.0.6
  signing_key:
- specification_version: 3
+ specification_version: 4
  summary: Fast string tokenizer. Nom strings.
  test_files:
  - spec/nginx_spec.rb
data/lib/ruby-tokenizer-each-char.rb DELETED
@@ -1,145 +0,0 @@
- # this tokenizer is very slow, but it illustrates the
- # basic idea of the C tokenizer
- class StringEater::RubyTokenizerEachChar
-
- def self.tokens
- @tokens ||= []
- end
-
- def self.combined_tokens
- @combined_tokens ||= []
- end
-
- def self.add_field name, opts={}
- self.tokens << StringEater::Token::new_field(name, opts)
- define_method(name) {@extracted_tokens[name]}
- end
-
- def self.look_for tokens
- self.tokens << StringEater::Token::new_separator(tokens)
- end
-
- def self.combine_fields opts={}
- from_token_index = self.tokens.index{|t| t.name == opts[:from]}
- to_token_index = self.tokens.index{|t| t.name == opts[:to]}
- self.combined_tokens << [opts[:as], from_token_index, to_token_index]
- define_method(opts[:as]) {@extracted_tokens[opts[:as]]}
- end
-
- def tokens
- @tokens ||= self.class.tokens
- end
-
- def combined_tokens
- @combined_tokens ||= self.class.combined_tokens
- end
-
- def refresh_tokens
- @combined_tokens = nil
- @tokens = nil
- tokens
- end
-
- def describe_line
- tokens.inject("") do |desc, t|
- desc << (t.string || t.name.to_s || "xxxxxx")
- end
- end
-
- def find_breakpoints string
- tokenize!(string) unless @string == string
- tokens.inject([]) do |bp, t|
- bp << t.breakpoints
- bp
- end.flatten.uniq
- end
-
- def tokenize! string, &block
- @string = string
- @extracted_tokens ||= {}
- @extracted_tokens.clear
- @tokens_to_find ||= tokens.each_with_index.map do |t, i|
- [i, t.string] if t.string
- end.compact
- @tokens_to_extract_indeces ||= tokens.each_with_index.map do |t, i|
- i if t.extract?
- end.compact
-
- tokens.first.breakpoints[0] = 0
-
- find_index = 0
-
- curr_token = @tokens_to_find[find_index]
- curr_token_index = curr_token[0]
- curr_token_length = curr_token[1].length
- looking_for_index = 0
- looking_for = curr_token[1][looking_for_index]
-
- counter = 0
- string.each_char do |c|
- if c == looking_for
- if looking_for_index == 0
- # entering new token
- if curr_token_index > 0
- t = tokens[curr_token_index - 1]
- t.breakpoints[1] = counter
- if t.extract?
- @extracted_tokens[t.name] = string[t.breakpoints[0]...t.breakpoints[1]]
- end
- end
- tokens[curr_token_index].breakpoints[0] = counter
- end
- if looking_for_index >= (curr_token_length - 1)
- # leaving token
- tokens[curr_token_index].breakpoints[1] = counter
-
- if curr_token_index >= tokens.size-1
- # we're done!
- break
- else
- tokens[curr_token_index + 1].breakpoints[0] = counter + 1
- end
-
- # next token
- find_index += 1
- if find_index >= @tokens_to_find.length
- # we're done!
- break
- end
- curr_token = @tokens_to_find[find_index]
- curr_token_index = curr_token[0]
- curr_token_length = curr_token[1].length
- looking_for_index = 0
- else
- looking_for_index += 1
- end
- end
- looking_for = curr_token[1][looking_for_index]
- counter += 1
- end
-
- last_token = tokens.last
- last_token.breakpoints[1] = string.length
-
- if last_token.extract?
- @extracted_tokens[last_token.name] = string[last_token.breakpoints[0]..last_token.breakpoints[1]]
- end
-
- combined_tokens.each do |combiner|
- name = combiner[0]
- from = @tokens[combiner[1]].breakpoints[0]
- to = @tokens[combiner[2]].breakpoints[1]
- @extracted_tokens[name] = string[from...to]
- end
-
- if block_given?
- yield @extracted_tokens
- end
-
- # return self for chaining
- self
- end
-
- end
-
-
data/lib/ruby-tokenizer.rb DELETED
@@ -1,98 +0,0 @@
- # this tokenizer is fairly fast, but not necessarily faster than regexps
- class StringEater::RubyTokenizer
- def self.tokens
- @tokens ||= []
- end
-
- def self.combined_tokens
- @combined_tokens ||= []
- end
-
- def self.add_field name, opts={}
- self.tokens << StringEater::Token::new_field(name, opts)
- define_method(name) {@extracted_tokens[name]}
- end
-
- def self.look_for tokens
- self.tokens << StringEater::Token::new_separator(tokens)
- end
-
- def self.combine_fields opts={}
- from_token_index = self.tokens.index{|t| t.name == opts[:from]}
- to_token_index = self.tokens.index{|t| t.name == opts[:to]}
- self.combined_tokens << [opts[:as], from_token_index, to_token_index]
- define_method(opts[:as]) {@extracted_tokens[opts[:as]]}
- end
-
- def tokens
- @tokens ||= self.class.tokens
- end
-
- def combined_tokens
- @combined_tokens ||= self.class.combined_tokens
- end
-
- def refresh_tokens
- @combined_tokens = nil
- @tokens = nil
- tokens
- end
-
- def describe_line
- tokens.inject("") do |desc, t|
- desc << (t.string || t.name.to_s || "xxxxxx")
- end
- end
-
- def find_breakpoints(string)
- @literal_tokens ||= tokens.select{|t| t.string}
- @breakpoints ||= Array.new(2*@literal_tokens.size + 2)
- @breakpoints[0] = 0
- @breakpoints[-1] = string.length
- start_point = 0
- @literal_tokens.each_with_index do |t, i|
- @breakpoints[2*i+1], start_point = find_end_of(t, string, start_point)
- @breakpoints[2*i+2] = start_point
- end
- @breakpoints
- end
-
- def tokenize! string, &block
- @extracted_tokens ||= {}
- @extracted_tokens.clear
- @tokens_to_extract ||= tokens.select{|t| t.extract?}
-
- find_breakpoints(string)
- last_important_bp = [@breakpoints.length, tokens.size].min
- (0...last_important_bp).each do |i|
- tokens[i].breakpoints = [@breakpoints[i], @breakpoints[i+1]]
- end
-
- @tokens_to_extract.each do |t|
- @extracted_tokens[t.name] = string[t.breakpoints[0]...t.breakpoints[1]]
- end
-
- combined_tokens.each do |combiner|
- name = combiner[0]
- from = @tokens[combiner[1]].breakpoints[0]
- to = @tokens[combiner[2]].breakpoints[1]
- @extracted_tokens[name] = string[from...to]
- end
-
- if block_given?
- yield @extracted_tokens
- end
-
- # return self for chaining
- self
- end
-
- protected
-
- def find_end_of token, string, start_at
- start = string.index(token.string, start_at+1) || string.length
- [start, [start + token.string.length, string.length].min]
- end
-
- end
-