string-eater 0.2.2 → 1.0.0

checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ ZmU1MmM0ZDY2MjQ4ZWM5MmFjN2E0YzVlODJkYWIwOWFlZDYxYzYyOQ==
5
+ data.tar.gz: !binary |-
6
+ NjhjZGQ1ZDQwZDVjNmE0MjZjM2Q5YTljNjAwNzFhNTJhNmE2ZmFmNw==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ YTg0NzZjZTFkYzhjNWVhMzE2YjNjMzQ4N2RlNWYzYTI1NWM5MTE1MjE4NGEw
10
+ ODdmNjRiYWZiODVmZGY0ZmI1MTk5MmZiZGMyYTBhNTRjODZjOGM2ODRiYjM5
11
+ MWMwMzJmNGVlOTAyYTI2YmY0NzM4MTEwNDM3NjI1MTE1ZmRmNDU=
12
+ data.tar.gz: !binary |-
13
+ NzQ2NTdlYjAzY2NiMWIzYTRkYTI1NGFhZjgxOWY0YjgxYzk4ZDkyMGU3MDAw
14
+ YmQ5YjQzNDAzNGViOGJmYjFmOGI1MDIyNGI2OWNiZGVhN2ZkNWJjYTYzNTBh
15
+ YWYzZWRiYjE4ODA3YjI1ZmM4NWExZmI2ZmJmMzljMDA1Nzc0ZGY=
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2012 Dan Swain
1
+ Copyright (c) 2012 Simpli.fi
2
2
 
3
3
  MIT License
4
4
 
data/README.md CHANGED
@@ -87,25 +87,41 @@ We can also do something like this:
87
87
  For another example, see `examples/nginx.rb`, which defines an
88
88
  [nginx](http://nginx.org) log line tokenizer.
89
89
 
90
- ## Implementation
90
+ ## Non-strict usage
91
+
92
+ Use `set_non_strict` to indicate that separator finding should be
93
+ non-strict. This means that if the tokenizer fails to find a
94
+ separator before reaching the end of the string, it fills the token it
95
+ was reading with the remainder of the string. Normally (i.e., in strict
96
+ usage), the token whose closing separator was never found is left nil.
97
+
98
+ Example:
99
+
100
+ class PersonTokenizer < StringEater::Tokenizer
101
+ add_field :last_name
102
+ look_for ", "
103
+ add_field :first_name, :extract => false
104
+ look_for " | "
105
+ add_field :street_address, :extract => false
106
+ look_for ", "
107
+ add_field :city
108
+ look_for ", "
109
+ add_field :state
110
+ look_for ", "
111
+ set_non_strict
112
+ end
91
113
 
92
- There are actually three tokenizer algorithms provided here. The
93
- three algorithms should be interchangeable.
114
+ tokenizer = PersonTokenizer.new
115
+ string = "Flinstone, Fred | 301 Cobblestone Way, Bedrock"
116
+ tokenizer.tokenize! string
94
117
 
95
- 1. `StringEater::CTokenizer` - A C extension implementation. The
96
- fastest of the three. This is the default implementation for
97
- `StringEater::Tokenizer`.
118
+ puts tokenizer.last_name # => "Flinstone"
119
+ puts tokenizer.city # => "Bedrock" (if strict, would be nil)
98
120
 
99
- 2. `StringEater::RubyTokenizer` - A pure-Ruby implementation. This is
100
- a slightly different implementation of the algorithm - an
101
- implementation that is faster on Ruby than a translation of the C
102
- algorithm. Probably not as fast (or not much faster) than using
103
- Ruby regular expressions.
121
+ Non-strict can also be set on a single tokenizer instance,
122
+ i.e., call `tokenizer.set_non_strict` to make `tokenizer` non-strict.
104
123
 
105
- 3. `StringEater::RubyTokenizerEachChar` - A pure-Ruby implementation.
106
- This is essentially the same as the C implementation, but written
107
- in pure Ruby. It uses `String#each_char` and is therefore VERY
108
- SLOW! It provides a good way to hack the algorithm, though.
124
+ ## Implementation
109
125
 
110
126
  The main algorithm works by finding the start and end points of tokens
111
127
  in a string. The search is done incrementally (i.e., loop through the
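
To make the strict/non-strict contrast concrete, here is a minimal sketch (not part of the gem's README; `PairTokenizer` is a hypothetical two-field tokenizer, and the expected output follows the description above and the specs further down):

    require 'string-eater'

    class PairTokenizer < StringEater::Tokenizer
      add_field :key
      look_for '='
      add_field :value
      look_for ';'
    end

    strict = PairTokenizer.new        # strict is the default
    strict.tokenize! 'color=blue'     # note: no trailing ';'
    strict.key                        # => "color"
    strict.value                      # => nil (the ';' separator was never found)

    lenient = PairTokenizer.new
    lenient.set_non_strict            # flag just this instance
    lenient.tokenize! 'color=blue'
    lenient.value                     # => "blue" (remainder fills the open token)
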
data/examples/address.rb CHANGED
@@ -1,35 +1,40 @@
1
+ # encoding: utf-8
2
+
1
3
  # once the gem is installed, you don't need this
2
- $: << File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
3
- $: << File.expand_path(File.join(File.dirname(__FILE__), '..', 'ext/string-eater'))
4
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
5
+ '..', 'lib')))
6
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
7
+ '..', 'ext/string-eater')))
4
8
 
5
9
  # this is the example from the README
6
10
  require 'string-eater'
7
11
 
8
- class PersonTokenizer < StringEater::Tokenizer
9
- add_field :last_name
10
- look_for ", "
11
- add_field :first_name, :extract => false
12
- look_for " | "
13
- add_field :street_address, :extract => false
14
- look_for ", "
12
+ # example tokenizer for addresses
13
+ class PersonTokenizer < StringEater::Tokenizer
14
+ add_field :last_name
15
+ look_for ', '
16
+ add_field :first_name, extract: false
17
+ look_for ' | '
18
+ add_field :street_address, extract: false
19
+ look_for ', '
15
20
  add_field :city
16
- look_for ", "
17
- add_field :state
18
- look_for ", "
21
+ look_for ', '
22
+ add_field :state
23
+ look_for ', '
19
24
  end
20
25
 
21
- if __FILE__ == $0
26
+ if __FILE__ == $PROGRAM_NAME
22
27
  tokenizer = PersonTokenizer.new
23
28
  puts tokenizer.describe_line
24
29
 
25
- string = "Flinstone, Fred | 301 Cobblestone Way, Bedrock, NA, 00000"
30
+ string = 'Flinstone, Fred | 301 Cobblestone Way, Bedrock, NA, 00000'
26
31
  tokenizer.tokenize! string
27
32
 
28
- puts tokenizer.last_name # => "Flinestone"
29
- puts tokenizer.city # => "Bedrock"
33
+ puts tokenizer.last_name # => "Flinstone"
34
+ puts tokenizer.city # => "Bedrock"
30
35
  puts tokenizer.state # => "NA"
31
36
 
32
- tokenizer.tokenize!(string) do |tokens|
37
+ tokenizer.tokenize!(string) do |tokens|
33
38
  puts "The #{tokens[:last_name]}s live in #{tokens[:city]}"
34
39
  end
35
40
  end
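
For reference, the `puts tokenizer.describe_line` call above prints a template of the line the tokenizer expects, built by concatenating separator strings and field names (see `describe_line` in data/lib/c-tokenizer.rb below). For this tokenizer it should print roughly:

    puts tokenizer.describe_line
    # => last_name, first_name | street_address, city, state, 
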
data/examples/nginx.rb CHANGED
@@ -1,27 +1,32 @@
1
+ # encoding: utf-8
2
+
1
3
  # once the gem is installed, you don't need this
2
- $: << File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
3
- $: << File.expand_path(File.join(File.dirname(__FILE__), '..', 'ext/string-eater'))
4
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
5
+ '..', 'lib')))
6
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
7
+ '..', 'ext/string-eater')))
4
8
 
5
9
  require 'string-eater'
6
10
 
11
+ # Example tokenizer for nginx log lines
7
12
  class NginxLogTokenizer < StringEater::CTokenizer
8
13
  add_field :ip
9
- look_for " - "
10
- add_field :remote_user, :extract => false
11
- look_for " ["
12
- add_field :timestamp, :extract => false
14
+ look_for ' - '
15
+ add_field :remote_user, extract: false
16
+ look_for ' ['
17
+ add_field :timestamp, extract: false
13
18
  look_for "] \""
14
19
  add_field :request
15
20
  look_for "\" "
16
21
  add_field :status_code
17
- look_for " "
18
- add_field :bytes_sent, :extract => false
22
+ look_for ' '
23
+ add_field :bytes_sent, extract: false
19
24
  look_for " \""
20
25
  add_field :referrer_url
21
26
  look_for "\" \""
22
27
  add_field :user_agent
23
28
  look_for "\" \""
24
- add_field :compression, :extract => false
29
+ add_field :compression, extract: false
25
30
  look_for "\" "
26
31
  add_field :remainder
27
32
 
@@ -47,14 +52,17 @@ class NginxLogTokenizer < StringEater::CTokenizer
47
52
  end
48
53
  end
49
54
 
50
- if __FILE__ == $0
55
+ if __FILE__ == $PROGRAM_NAME
51
56
  tokenizer = NginxLogTokenizer.new
52
57
  puts tokenizer.describe_line
53
58
 
54
- str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] "GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)" "-" "there could be" other "stuff here"'
59
+ str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] ' +
60
+ '"GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" ' +
61
+ '"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; ' +
62
+ 'Trident/5.0)" "-" "there could be" other "stuff here"'
55
63
 
56
- puts "input string: " + str
57
- puts "Tokens: "
64
+ puts 'input string: ' + str
65
+ puts 'Tokens: '
58
66
 
59
67
  # use a block to work with the extracted tokens
60
68
  tokenizer.tokenize!(str) do |tokens|
@@ -10,11 +10,12 @@ static VALUE rb_cCTokenizer;
10
10
  static VALUE rb_mStringEater;
11
11
 
12
12
  static VALUE tokenize_string(VALUE self,
13
- VALUE string,
14
- VALUE tokens_to_find_indexes,
15
- VALUE tokens_to_find_strings,
16
- VALUE tokens_to_extract_indexes,
17
- VALUE tokens_to_extract_names)
13
+ VALUE string,
14
+ VALUE tokens_to_find_indexes,
15
+ VALUE tokens_to_find_strings,
16
+ VALUE tokens_to_extract_indexes,
17
+ VALUE tokens_to_extract_names,
18
+ VALUE non_strict)
18
19
  {
19
20
  const char* input_string = StringValueCStr(string);
20
21
  VALUE extracted_tokens = rb_hash_new();
@@ -115,6 +116,19 @@ static VALUE tokenize_string(VALUE self,
115
116
  }
116
117
  }
117
118
 
119
+ /*
120
+ got to the end of the string
121
+ and have an incomplete token
122
+ and not strict
123
+ */
124
+ if(ix == str_len && curr_token_ix < n_tokens && RTEST(non_strict))
125
+ {
126
+ rb_hash_aset(extracted_tokens,
127
+ rb_ary_entry(tokens_to_extract_names, curr_token_ix - 1),
128
+ rb_usascii_str_new(input_string + startpoint,
129
+ str_len - startpoint));
130
+ }
131
+
118
132
  curr_token_ix = n_tokens - 1;
119
133
 
120
134
  if(ix < str_len && curr_token_ix == next_token_to_extract_ix)
@@ -139,7 +153,7 @@ void Init_c_tokenizer_ext(void)
139
153
  rb_cCTokenizer = rb_define_class_under(rb_mStringEater,
140
154
  "CTokenizer", rb_cObject);
141
155
 
142
- rb_define_method(rb_cCTokenizer, "ctokenize!", tokenize_string, 5);
156
+ rb_define_method(rb_cCTokenizer, "ctokenize!", tokenize_string, 6);
143
157
 
144
158
  /* set the callback for when the extension is unloaded */
145
159
  rb_set_end_proc(finalize_c_tokenizer_ext, 0);
@@ -1,2 +1,3 @@
1
+ # encoding: utf-8
1
2
  require 'mkmf'
2
3
  create_makefile('c_tokenizer_ext')
data/lib/c-tokenizer.rb CHANGED
@@ -1,17 +1,22 @@
1
+ # encoding: utf-8
2
+
1
3
  require 'c_tokenizer_ext'
2
4
 
5
+ # Ruby interface to the c extension
3
6
  class StringEater::CTokenizer
7
+ attr_reader :tokens
8
+
4
9
  def self.tokens
5
10
  @tokens ||= []
6
11
  end
7
12
 
8
- def self.add_field name, opts={}
9
- self.tokens << StringEater::Token::new_field(name, opts)
10
- define_method(name) {@extracted_tokens[name]}
13
+ def self.add_field(name, opts = {})
14
+ tokens << StringEater::Token.new_field(name, opts)
15
+ define_method(name) { @extracted_tokens[name] }
11
16
  end
12
17
 
13
- def self.look_for tokens
14
- self.tokens << StringEater::Token::new_separator(tokens)
18
+ def self.look_for(look_for_tokens)
19
+ tokens << StringEater::Token.new_separator(look_for_tokens)
15
20
  end
16
21
 
17
22
  # This is very slow, only do it when necessary
@@ -19,12 +24,20 @@ class StringEater::CTokenizer
19
24
  Marshal.load(Marshal.dump(tokens))
20
25
  end
21
26
 
27
+ def self.set_non_strict
28
+ @class_non_strict = true
29
+ end
30
+
31
+ def self.non_strict?
32
+ @class_non_strict == true
33
+ end
34
+
22
35
  def initialize
23
36
  refresh_tokens
24
37
  end
25
38
 
26
- def tokens
27
- @tokens
39
+ def set_non_strict
40
+ @non_strict = true
28
41
  end
29
42
 
30
43
  def extract_all_fields
@@ -41,7 +54,7 @@ class StringEater::CTokenizer
41
54
  refresh_tokens
42
55
  end
43
56
 
44
- def extract_fields *fields
57
+ def extract_fields(*fields)
45
58
  @token_filter = lambda do |t|
46
59
  t.opts[:extract] = fields.include?(t.name)
47
60
  end
@@ -52,76 +65,78 @@ class StringEater::CTokenizer
52
65
  def refresh_tokens
53
66
  @tokens = self.class.dup_tokens
54
67
 
55
- if @token_filter
56
- @tokens.each{|t| @token_filter.call(t)}
57
- end
58
-
59
- tokens_to_find = tokens.each_with_index.map do |t, i|
60
- [i, t.string] if t.string
61
- end.compact
68
+ @tokens.each { |t| @token_filter.call(t) } if @token_filter
62
69
 
63
- @tokens_to_find_indexes = tokens_to_find.map{|t| t[0]}
64
- @tokens_to_find_strings = tokens_to_find.map{|t| t[1]}
70
+ tokens_to_find = gen_tokens_to_find
71
+ @tokens_to_find_indexes = tokens_to_find.map { |t| t[0] }
72
+ @tokens_to_find_strings = tokens_to_find.map { |t| t[1] }
65
73
 
66
- tokens_to_extract = tokens.each_with_index.map do |t, i|
67
- [i, t.name] if t.extract?
68
- end.compact
69
-
70
- @tokens_to_extract_indexes = tokens_to_extract.map{|t| t[0]}
71
- @tokens_to_extract_names = tokens.map{|t| t.name}
74
+ tokens_to_extract = gen_tokens_to_extract
75
+ @tokens_to_extract_indexes = tokens_to_extract.map { |t| t[0] }
76
+ @tokens_to_extract_names = tokens.map { |t| t.name }
72
77
 
73
78
  @have_tokens_to_extract = (@tokens_to_extract_indexes.size > 0)
74
79
  end
75
80
 
76
81
  def describe_line
77
- tokens.inject("") do |desc, t|
78
- desc << (t.string || t.name.to_s || "xxxxxx")
82
+ tokens.reduce('') do |desc, t|
83
+ desc << (t.string || t.name.to_s || 'xxxxxx')
79
84
  end
80
85
  end
81
86
 
82
87
  def do_extra_parsing
83
88
  end
84
89
 
85
- def tokenize! string, &block
90
+ # Not sure this could be much more concise
91
+ # rubocop:disable MethodLength
92
+ def tokenize!(string, &block)
86
93
  @string = string
87
94
  @extracted_tokens ||= {}
88
95
  @extracted_tokens.clear
96
+ @non_strict ||= self.class.non_strict?
89
97
 
90
98
  return unless @have_tokens_to_extract
91
99
 
92
- @extracted_tokens = ctokenize!(@string,
100
+ @extracted_tokens = ctokenize!(@string,
93
101
  @tokens_to_find_indexes,
94
102
  @tokens_to_find_strings,
95
103
  @tokens_to_extract_indexes,
96
- @tokens_to_extract_names)
104
+ @tokens_to_extract_names,
105
+ @non_strict)
97
106
 
98
107
  # extra parsing hook
99
108
  do_extra_parsing
100
109
 
101
- if block_given?
102
- yield @extracted_tokens
103
- end
110
+ yield @extracted_tokens if block_given?
104
111
 
105
112
  # return self for chaining
106
113
  self
107
- end
108
-
114
+ end
115
+ # rubocop:enable MethodLength
116
+
109
117
  private
110
118
 
111
- def set_token_startpoint ix, startpoint
119
+ def set_token_startpoint(ix, startpoint)
112
120
  @tokens[ix].breakpoints[0] = startpoint
113
121
  end
114
122
 
115
- def get_token_startpoint ix
123
+ def get_token_startpoint(ix)
116
124
  @tokens[ix].breakpoints[0]
117
125
  end
118
126
 
119
- def set_token_endpoint ix, endpoint
127
+ def set_token_endpoint(ix, endpoint)
120
128
  @tokens[ix].breakpoints[1] = endpoint
121
129
  end
122
130
 
123
- def extract_token? ix
131
+ def extract_token?(ix)
124
132
  @tokens[ix].extract?
125
133
  end
126
134
 
135
+ def gen_tokens_to_find
136
+ tokens.each_with_index.map { |t, i| [i, t.string] if t.string }.compact
137
+ end
138
+
139
+ def gen_tokens_to_extract
140
+ tokens.each_with_index.map { |t, i| [i, t.name] if t.extract? }.compact
141
+ end
127
142
  end
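
Note how the class-level and instance-level switches interact: `tokenize!` now does `@non_strict ||= self.class.non_strict?`, so an instance runs non-strict if either it or its class was flagged. A minimal sketch (hypothetical classes, not part of the gem):

    require 'string-eater'

    class LenientTokenizer < StringEater::Tokenizer
      add_field :head
      look_for ': '
      add_field :tail
      set_non_strict            # class-wide: every instance is non-strict
    end

    class StrictTokenizer < StringEater::Tokenizer
      add_field :head
      look_for ': '
      add_field :tail
    end

    t = StrictTokenizer.new
    t.set_non_strict            # per-instance: other StrictTokenizer instances stay strict
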
data/lib/string-eater.rb CHANGED
@@ -1,10 +1,12 @@
1
+ # encoding: utf-8
2
+
3
+ # Namespacing module for StringEater
1
4
  module StringEater
2
5
  autoload :Token, 'token'
3
- autoload :RubyTokenizer, 'ruby-tokenizer'
4
- autoload :RubyTokenizerEachCHar, 'ruby-tokenizer-each-char'
5
6
  autoload :CTokenizer, 'c-tokenizer'
6
7
 
7
8
  autoload :VERSION, 'version'
8
9
 
10
+ # by default, Tokenizer is the c extension tokenizer
9
11
  class Tokenizer < CTokenizer; end
10
12
  end
data/lib/token.rb CHANGED
@@ -1,9 +1,12 @@
1
+ # encoding: utf-8
2
+
3
+ # Token class used by tokenizers
1
4
  class StringEater::Token
2
5
  attr_accessor :name, :string, :opts, :breakpoints, :children
3
6
 
4
7
  def initialize
5
8
  @opts = {}
6
- @breakpoints = [nil,nil]
9
+ @breakpoints = [nil, nil]
7
10
  end
8
11
 
9
12
  def extract?
@@ -13,7 +16,7 @@ class StringEater::Token
13
16
  def self.new_field(name, opts)
14
17
  t = new
15
18
  t.name = name
16
- t.opts = {:extract => true}.merge(opts)
19
+ t.opts = { extract: true }.merge(opts)
17
20
  t
18
21
  end
19
22
 
@@ -22,5 +25,4 @@ class StringEater::Token
22
25
  t.string = string
23
26
  t
24
27
  end
25
-
26
28
  end
data/lib/version.rb CHANGED
@@ -1,8 +1,12 @@
1
+ # encoding: utf-8
2
+
3
+ # Extend StringEater with Version
1
4
  module StringEater
2
- module VERSION
3
- MAJOR = 0
4
- MINOR = 2
5
- PATCH = 2
5
+ # Version constants
6
+ module VERSION
7
+ MAJOR = 1
8
+ MINOR = 0
9
+ PATCH = 0
6
10
  PRE = nil
7
11
  STRING = [MAJOR, MINOR, PATCH, PRE].compact.join('.')
8
12
  end
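
Since `PRE` is nil, `compact` drops it before the join, so the released version string carries no trailing separator; a pre-release suffix would be picked up automatically if `PRE` were set:

    StringEater::VERSION::STRING         # => "1.0.0"
    [1, 0, 0, nil].compact.join('.')     # => "1.0.0"
    [1, 0, 0, 'rc1'].compact.join('.')   # => "1.0.0.rc1"
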
data/spec/nginx_spec.rb CHANGED
@@ -1,32 +1,44 @@
1
+ # encoding: utf-8
2
+
1
3
  require 'spec_helper'
2
4
  require 'string-eater'
3
5
 
4
- $: << File.expand_path(File.join(File.dirname(__FILE__), '..', 'examples'))
6
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
7
+ '..', 'examples')))
5
8
 
6
9
  require 'nginx'
7
10
 
8
11
  describe NginxLogTokenizer do
9
12
  before(:each) do
10
13
  @tokenizer = NginxLogTokenizer.new
11
- @str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] "GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)" "-" "there could be" other "stuff here"'
12
- @str2 = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] "GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)" "-"'
13
- end
14
-
15
- {
16
- :ip => "73.80.217.212",
17
- :request => "GET /this_is_a_url HTTP/1.1",
18
- :status_code => 304,
19
- :referrer_url => "http://referrer.com",
20
- :user_agent => "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
21
- :remainder => "\"there could be\" other \"stuff here\"",
22
- }.each_pair do |token,val|
23
- it "should find the right value for #{token}" do
24
- @tokenizer.tokenize!(@str).send(token).should == val
25
- end
26
- end
27
-
28
- it "should handle there not being a remainder correctly" do
29
- @tokenizer.tokenize!(@str2).remainder.should be_nil
30
- end
14
+ @str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] ' +
15
+ '"GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" ' +
16
+ '"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; ' +
17
+ 'Trident/5.0)" "-" "there could be" other "stuff here"'
18
+ @str2 = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] ' +
19
+ '"GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" ' +
20
+ '"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; ' +
21
+ 'WOW64; Trident/5.0)" "-"'
22
+ end
23
+
24
+ user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; ' +
25
+ 'WOW64; Trident/5.0)'
26
+
27
+ {
28
+ ip: '73.80.217.212',
29
+ request: 'GET /this_is_a_url HTTP/1.1',
30
+ status_code: 304,
31
+ referrer_url: 'http://referrer.com',
32
+ user_agent: user_agent,
33
+ remainder: "\"there could be\" other \"stuff here\"",
34
+ }.each_pair do |token, val|
35
+ it "finds the right value for #{token}" do
36
+ @tokenizer.tokenize!(@str).send(token).should == val
37
+ end
38
+ end
39
+
40
+ it 'correctly handles there not being a remainder' do
41
+ @tokenizer.tokenize!(@str2).remainder.should be_nil
42
+ end
31
43
 
32
44
  end
data/spec/spec_helper.rb CHANGED
@@ -1 +1,2 @@
1
+ # encoding: utf-8
1
2
  $LOAD_PATH.concat %w[./lib ./ext/string-eater]
@@ -1,193 +1,185 @@
1
+ # encoding: utf-8
2
+
1
3
  require 'spec_helper'
2
4
  require 'string-eater'
3
5
 
4
- TestedClass = StringEater::CTokenizer
5
-
6
6
  describe StringEater do
7
- it "should have a version" do
8
- StringEater::VERSION::STRING.split(".").size.should >= 3
7
+ it 'has a version' do
8
+ StringEater::VERSION::STRING.split('.').size.should >= 3
9
9
  end
10
10
  end
11
11
 
12
12
  # normal use
13
- class Example1 < TestedClass
13
+ class Example1 < StringEater::CTokenizer
14
14
  add_field :first_word
15
- look_for " "
16
- add_field :second_word, :extract => false
17
- look_for "|"
15
+ look_for ' '
16
+ add_field :second_word, extract: false
17
+ look_for '|'
18
18
  add_field :third_word
19
19
  end
20
20
 
21
21
  describe Example1 do
22
-
23
- before(:each) do
24
- @tokenizer = Example1.new
25
- @str1 = "foo bar|baz"
26
- @first_word1 = "foo"
27
- @second_word1 = "bar"
28
- @third_word1 = "baz"
29
- @bp1 = [0, 3,4,7,8,11]
22
+ let(:tokenizer) { Example1.new }
23
+ let(:first_word1) { 'foo' }
24
+ let(:second_word1) { 'bar' }
25
+ let(:third_word1) { 'baz' }
26
+ let(:str1) { "#{first_word1} #{second_word1}|#{third_word1}" }
27
+
28
+ describe '#extract_all_fields' do
29
+ it 'extracts all of the fields' do
30
+ tokenizer.extract_all_fields
31
+ tokenizer.tokenize!(str1)
32
+ expect(tokenizer.first_word).to eq(first_word1)
33
+ expect(tokenizer.second_word).to eq(second_word1)
34
+ expect(tokenizer.third_word).to eq(third_word1)
35
+ end
30
36
  end
31
37
 
32
- describe "find_breakpoints" do
33
- it "should return an array of the breakpoints" do
34
- @tokenizer.find_breakpoints(@str1).should == @bp1 if @tokenizer.respond_to?(:find_breakpoints)
38
+ describe '#extract_no_fields' do
39
+ it 'does not extract any of the fields' do
40
+ tokenizer.extract_no_fields
41
+ tokenizer.tokenize!(str1)
42
+ tokenizer.first_word.should be_nil
43
+ tokenizer.second_word.should be_nil
44
+ tokenizer.third_word.should be_nil
35
45
  end
36
46
  end
37
47
 
38
- describe "#extract_all_fields" do
39
- it "should extract all of the fields" do
40
- @tokenizer.extract_all_fields
41
- @tokenizer.tokenize!(@str1)
42
- @tokenizer.first_word.should == @first_word1
43
- @tokenizer.second_word.should == @second_word1
44
- @tokenizer.third_word.should == @third_word1
48
+ describe '#extract_fields' do
49
+ it 'allows us to set which fields get extracted' do
50
+ tokenizer.extract_fields :second_word
51
+ tokenizer.tokenize!(str1)
52
+ tokenizer.first_word.should be_nil
53
+ expect(tokenizer.second_word).to eq(second_word1)
54
+ tokenizer.third_word.should be_nil
45
55
  end
46
56
  end
47
57
 
48
- describe "#extract_no_fields" do
49
- it "should not extract any of the fields" do
50
- @tokenizer.extract_no_fields
51
- @tokenizer.tokenize!(@str1)
52
- @tokenizer.first_word.should be_nil
53
- @tokenizer.second_word.should be_nil
54
- @tokenizer.third_word.should be_nil
58
+ describe 'tokenize!' do
59
+ it 'returns itself' do
60
+ tokenizer.tokenize!(str1).should == tokenizer
55
61
  end
56
- end
57
62
 
58
- describe "#extract_fields" do
59
- it "should allow us to set which fields get extracted" do
60
- @tokenizer.extract_fields :second_word
61
- @tokenizer.tokenize!(@str1)
62
- @tokenizer.first_word.should be_nil
63
- @tokenizer.second_word.should == @second_word1
64
- @tokenizer.third_word.should be_nil
63
+ it 'sets the first word' do
64
+ tokenizer.tokenize!(str1).first_word.should == 'foo'
65
65
  end
66
- end
67
66
 
68
- describe "tokenize!" do
69
- it "should return itself" do
70
- @tokenizer.tokenize!(@str1).should == @tokenizer
67
+ it 'sets the third word' do
68
+ tokenizer.tokenize!(str1).third_word.should == 'baz'
71
69
  end
72
70
 
73
- it "should set the first word" do
74
- @tokenizer.tokenize!(@str1).first_word.should == "foo"
71
+ it 'does not set the second word' do
72
+ tokenizer.tokenize!(str1).second_word.should be_nil
75
73
  end
76
74
 
77
- it "should set the third word" do
78
- @tokenizer.tokenize!(@str1).third_word.should == "baz"
75
+ it 'yields a hash of tokens if a block is given' do
76
+ tokenizer.tokenize!(str1) do |tokens|
77
+ tokens[:first_word].should == 'foo'
78
+ end
79
79
  end
80
80
 
81
- it "should not set the second word" do
82
- @tokenizer.tokenize!(@str1).second_word.should be_nil
81
+ it 'returns everything to the end of the line for the last token' do
82
+ s = 'c defg asdf | foo , baa'
83
+ tokenizer.tokenize!("a b|#{s}").third_word.should == s
83
84
  end
84
85
 
85
- it "should yield a hash of tokens if a block is given" do
86
- @tokenizer.tokenize!(@str1) do |tokens|
87
- tokens[:first_word].should == "foo"
86
+ context 'when the last delimiter is missing' do
87
+ let(:s) { 'a b' }
88
+ it 'still finds the first word' do
89
+ expect(tokenizer.tokenize!(s).first_word).to eq('a')
88
90
  end
89
- end
90
91
 
91
- it "should return everything to the end of the line for the last token" do
92
- s = "c defg asdf | foo , baa"
93
- @tokenizer.tokenize!("a b|#{s}").third_word.should == s
94
- end
92
+ it 'returns nil for the second word' do
93
+ expect(tokenizer.tokenize!(s).second_word).to be_nil
94
+ end
95
95
 
96
- it "should work if the last delimeter is missing and the second-to-last field is not used" do
97
- s = "a b"
98
- # @tokenizer.extract_all_fields
99
- @tokenizer.tokenize!(s).third_word.should be_nil
96
+ it 'returns nil for the third word' do
97
+ expect(tokenizer.tokenize!(s).third_word).to be_nil
98
+ end
100
99
  end
101
100
 
102
- end
101
+ context 'when non_strict is enabled' do
102
+ before do
103
+ tokenizer.extract_all_fields
104
+ tokenizer.set_non_strict
105
+ end
106
+
107
+ context 'when the last delimiter is missing' do
108
+ let(:s) { 'a b' }
109
+ it 'still finds the first word' do
110
+ expect(tokenizer.tokenize!(s).first_word).to eq('a')
111
+ end
112
+
113
+ it 'still finds the second word' do
114
+ expect(tokenizer.tokenize!(s).second_word).to eq('b')
115
+ end
116
+
117
+ it 'returns nil for the third word' do
118
+ expect(tokenizer.tokenize!(s).third_word).to be_nil
119
+ end
120
+ end
121
+
122
+ context 'when the last delimiter is not missing' do
123
+ let(:s) { 'a b|c' }
124
+ it 'still finds the first word' do
125
+ expect(tokenizer.tokenize!(s).first_word).to eq('a')
126
+ end
127
+
128
+ it 'still finds the second word' do
129
+ expect(tokenizer.tokenize!(s).second_word).to eq('b')
130
+ end
103
131
 
132
+ it 'finds the third word' do
133
+ expect(tokenizer.tokenize!(s).third_word).to eq('c')
134
+ end
135
+ end
136
+ end
137
+ end
104
138
  end
105
139
 
106
140
  # an example where we ignore after a certain point in the string
107
- class Example2 < TestedClass
108
- add_field :first_word, :extract => false
109
- look_for " "
141
+ class Example2 < StringEater::CTokenizer
142
+ add_field :first_word, extract: false
143
+ look_for ' '
110
144
  add_field :second_word
111
- look_for " "
112
- add_field :third_word, :extract => false
113
- look_for "-"
145
+ look_for ' '
146
+ add_field :third_word, extract: false
147
+ look_for '-'
114
148
  end
115
149
 
116
150
  describe Example2 do
151
+ let(:tokenizer) { Example2.new }
152
+ let(:second_word1) { 'bar' }
153
+ let(:str1) { "foo #{second_word1} baz-" }
117
154
 
118
- before(:each) do
119
- @tokenizer = Example2.new
120
- @str1 = "foo bar baz-"
121
- @second_word1 = "bar"
122
- end
123
-
124
- describe "tokenize!" do
125
- it "should find the token when there is extra stuff at the end of the string" do
126
- @tokenizer.tokenize!(@str1).second_word.should == @second_word1
155
+ describe 'tokenize!' do
156
+ it 'finds the token when there is extra stuff at the ' +
157
+ 'end of the string' do
158
+ tokenizer.tokenize!(str1).second_word.should == second_word1
127
159
  end
128
160
  end
129
161
 
130
162
  end
131
163
 
132
164
  # an example where the split is more than one char
133
- class Example3 < TestedClass
134
- look_for "foo="
165
+ class Example3 < StringEater::CTokenizer
166
+ look_for 'foo='
135
167
  add_field :foo_val
136
- look_for "&"
168
+ look_for '&'
137
169
  end
138
170
 
139
171
  describe Example3 do
140
- before(:each) do
141
- @tokenizer = Example3.new
142
- end
172
+ let(:tokenizer) { Example3.new }
143
173
 
144
- describe "tokenize!" do
145
- it "should find the token if there is only one occurrence of the characters in the separator" do
146
- @tokenizer.tokenize!("abcd?foo=val&blah").foo_val.should == "val"
174
+ describe 'tokenize!' do
175
+ it 'finds the token if there is only one occurrence ' +
176
+ 'of the characters in the separator' do
177
+ tokenizer.tokenize!('abcd?foo=val&blah').foo_val.should == 'val'
147
178
  end
148
179
 
149
- it "should still work if part of the separator token occurs" do
150
- @tokenizer.tokenize!("abcd?foo_blah=baz&foo=bar&buh").foo_val.should == "bar"
180
+ it 'still works if part of the separator token occurs' do
181
+ tokenizer.tokenize!('abcd?foo_blah=baz&foo=bar&buh')
182
+ .foo_val.should == 'bar'
151
183
  end
152
184
  end
153
185
  end
154
-
155
- # CTokenizer doesn't do combine_fields because
156
- # writing out breakpoints is a significant slow-down
157
- if TestedClass.respond_to?(:combine_fields)
158
- # an example where we combine fields
159
- class Example3 < TestedClass
160
- add_field :first_word, :extract => false
161
- look_for " \""
162
- add_field :part1, :extract => false
163
- look_for " "
164
- add_field :part2
165
- look_for " "
166
- add_field :part3, :extract => false
167
- look_for "\""
168
-
169
- combine_fields :from => :part1, :to => :part3, :as => :parts
170
- end
171
-
172
- describe Example3 do
173
- before(:each) do
174
- @tokenizer = Example3.new
175
- @str1 = "foo \"bar baz bang\""
176
- @part2 = "baz"
177
- @parts = "bar baz bang"
178
- end
179
-
180
- it "should extract like normal" do
181
- @tokenizer.tokenize!(@str1).part2.should == @part2
182
- end
183
-
184
- it "should ignore like normal" do
185
- @tokenizer.tokenize!(@str1).part1.should be_nil
186
- end
187
-
188
- it "should extract the combined field" do
189
- @tokenizer.tokenize!(@str1).parts.should == @parts
190
- end
191
-
192
- end
193
- end
metadata CHANGED
@@ -1,15 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: string-eater
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
5
- prerelease:
4
+ version: 1.0.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Dan Swain
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2012-11-30 00:00:00.000000000 Z
11
+ date: 2014-01-05 00:00:00.000000000 Z
13
12
  dependencies: []
14
13
  description: Fast string tokenizer. Nom strings.
15
14
  email:
@@ -20,8 +19,6 @@ extensions:
20
19
  extra_rdoc_files: []
21
20
  files:
22
21
  - lib/c-tokenizer.rb
23
- - lib/ruby-tokenizer-each-char.rb
24
- - lib/ruby-tokenizer.rb
25
22
  - lib/string-eater.rb
26
23
  - lib/token.rb
27
24
  - lib/version.rb
@@ -37,28 +34,27 @@ files:
37
34
  - README.md
38
35
  homepage: http://github.com/simplifi/string-eater
39
36
  licenses: []
37
+ metadata: {}
40
38
  post_install_message:
41
39
  rdoc_options: []
42
40
  require_paths:
43
41
  - lib
44
42
  - ext/string-eater
45
43
  required_ruby_version: !ruby/object:Gem::Requirement
46
- none: false
47
44
  requirements:
48
45
  - - ! '>='
49
46
  - !ruby/object:Gem::Version
50
47
  version: '0'
51
48
  required_rubygems_version: !ruby/object:Gem::Requirement
52
- none: false
53
49
  requirements:
54
50
  - - ! '>='
55
51
  - !ruby/object:Gem::Version
56
52
  version: '0'
57
53
  requirements: []
58
54
  rubyforge_project:
59
- rubygems_version: 1.8.24
55
+ rubygems_version: 2.0.6
60
56
  signing_key:
61
- specification_version: 3
57
+ specification_version: 4
62
58
  summary: Fast string tokenizer. Nom strings.
63
59
  test_files:
64
60
  - spec/nginx_spec.rb
data/lib/ruby-tokenizer-each-char.rb DELETED
@@ -1,145 +0,0 @@
1
- # this tokenizer is very slow, but it illustrates the
2
- # basic idea of the C tokenizer
3
- class StringEater::RubyTokenizerEachChar
4
-
5
- def self.tokens
6
- @tokens ||= []
7
- end
8
-
9
- def self.combined_tokens
10
- @combined_tokens ||= []
11
- end
12
-
13
- def self.add_field name, opts={}
14
- self.tokens << StringEater::Token::new_field(name, opts)
15
- define_method(name) {@extracted_tokens[name]}
16
- end
17
-
18
- def self.look_for tokens
19
- self.tokens << StringEater::Token::new_separator(tokens)
20
- end
21
-
22
- def self.combine_fields opts={}
23
- from_token_index = self.tokens.index{|t| t.name == opts[:from]}
24
- to_token_index = self.tokens.index{|t| t.name == opts[:to]}
25
- self.combined_tokens << [opts[:as], from_token_index, to_token_index]
26
- define_method(opts[:as]) {@extracted_tokens[opts[:as]]}
27
- end
28
-
29
- def tokens
30
- @tokens ||= self.class.tokens
31
- end
32
-
33
- def combined_tokens
34
- @combined_tokens ||= self.class.combined_tokens
35
- end
36
-
37
- def refresh_tokens
38
- @combined_tokens = nil
39
- @tokens = nil
40
- tokens
41
- end
42
-
43
- def describe_line
44
- tokens.inject("") do |desc, t|
45
- desc << (t.string || t.name.to_s || "xxxxxx")
46
- end
47
- end
48
-
49
- def find_breakpoints string
50
- tokenize!(string) unless @string == string
51
- tokens.inject([]) do |bp, t|
52
- bp << t.breakpoints
53
- bp
54
- end.flatten.uniq
55
- end
56
-
57
- def tokenize! string, &block
58
- @string = string
59
- @extracted_tokens ||= {}
60
- @extracted_tokens.clear
61
- @tokens_to_find ||= tokens.each_with_index.map do |t, i|
62
- [i, t.string] if t.string
63
- end.compact
64
- @tokens_to_extract_indeces ||= tokens.each_with_index.map do |t, i|
65
- i if t.extract?
66
- end.compact
67
-
68
- tokens.first.breakpoints[0] = 0
69
-
70
- find_index = 0
71
-
72
- curr_token = @tokens_to_find[find_index]
73
- curr_token_index = curr_token[0]
74
- curr_token_length = curr_token[1].length
75
- looking_for_index = 0
76
- looking_for = curr_token[1][looking_for_index]
77
-
78
- counter = 0
79
- string.each_char do |c|
80
- if c == looking_for
81
- if looking_for_index == 0
82
- # entering new token
83
- if curr_token_index > 0
84
- t = tokens[curr_token_index - 1]
85
- t.breakpoints[1] = counter
86
- if t.extract?
87
- @extracted_tokens[t.name] = string[t.breakpoints[0]...t.breakpoints[1]]
88
- end
89
- end
90
- tokens[curr_token_index].breakpoints[0] = counter
91
- end
92
- if looking_for_index >= (curr_token_length - 1)
93
- # leaving token
94
- tokens[curr_token_index].breakpoints[1] = counter
95
-
96
- if curr_token_index >= tokens.size-1
97
- # we're done!
98
- break
99
- else
100
- tokens[curr_token_index + 1].breakpoints[0] = counter + 1
101
- end
102
-
103
- # next token
104
- find_index += 1
105
- if find_index >= @tokens_to_find.length
106
- # we're done!
107
- break
108
- end
109
- curr_token = @tokens_to_find[find_index]
110
- curr_token_index = curr_token[0]
111
- curr_token_length = curr_token[1].length
112
- looking_for_index = 0
113
- else
114
- looking_for_index += 1
115
- end
116
- end
117
- looking_for = curr_token[1][looking_for_index]
118
- counter += 1
119
- end
120
-
121
- last_token = tokens.last
122
- last_token.breakpoints[1] = string.length
123
-
124
- if last_token.extract?
125
- @extracted_tokens[last_token.name] = string[last_token.breakpoints[0]..last_token.breakpoints[1]]
126
- end
127
-
128
- combined_tokens.each do |combiner|
129
- name = combiner[0]
130
- from = @tokens[combiner[1]].breakpoints[0]
131
- to = @tokens[combiner[2]].breakpoints[1]
132
- @extracted_tokens[name] = string[from...to]
133
- end
134
-
135
- if block_given?
136
- yield @extracted_tokens
137
- end
138
-
139
- # return self for chaining
140
- self
141
- end
142
-
143
- end
144
-
145
-
data/lib/ruby-tokenizer.rb DELETED
@@ -1,98 +0,0 @@
1
- # this tokenizer is fairly fast, but not necessarily faster than regexps
2
- class StringEater::RubyTokenizer
3
- def self.tokens
4
- @tokens ||= []
5
- end
6
-
7
- def self.combined_tokens
8
- @combined_tokens ||= []
9
- end
10
-
11
- def self.add_field name, opts={}
12
- self.tokens << StringEater::Token::new_field(name, opts)
13
- define_method(name) {@extracted_tokens[name]}
14
- end
15
-
16
- def self.look_for tokens
17
- self.tokens << StringEater::Token::new_separator(tokens)
18
- end
19
-
20
- def self.combine_fields opts={}
21
- from_token_index = self.tokens.index{|t| t.name == opts[:from]}
22
- to_token_index = self.tokens.index{|t| t.name == opts[:to]}
23
- self.combined_tokens << [opts[:as], from_token_index, to_token_index]
24
- define_method(opts[:as]) {@extracted_tokens[opts[:as]]}
25
- end
26
-
27
- def tokens
28
- @tokens ||= self.class.tokens
29
- end
30
-
31
- def combined_tokens
32
- @combined_tokens ||= self.class.combined_tokens
33
- end
34
-
35
- def refresh_tokens
36
- @combined_tokens = nil
37
- @tokens = nil
38
- tokens
39
- end
40
-
41
- def describe_line
42
- tokens.inject("") do |desc, t|
43
- desc << (t.string || t.name.to_s || "xxxxxx")
44
- end
45
- end
46
-
47
- def find_breakpoints(string)
48
- @literal_tokens ||= tokens.select{|t| t.string}
49
- @breakpoints ||= Array.new(2*@literal_tokens.size + 2)
50
- @breakpoints[0] = 0
51
- @breakpoints[-1] = string.length
52
- start_point = 0
53
- @literal_tokens.each_with_index do |t, i|
54
- @breakpoints[2*i+1], start_point = find_end_of(t, string, start_point)
55
- @breakpoints[2*i+2] = start_point
56
- end
57
- @breakpoints
58
- end
59
-
60
- def tokenize! string, &block
61
- @extracted_tokens ||= {}
62
- @extracted_tokens.clear
63
- @tokens_to_extract ||= tokens.select{|t| t.extract?}
64
-
65
- find_breakpoints(string)
66
- last_important_bp = [@breakpoints.length, tokens.size].min
67
- (0...last_important_bp).each do |i|
68
- tokens[i].breakpoints = [@breakpoints[i], @breakpoints[i+1]]
69
- end
70
-
71
- @tokens_to_extract.each do |t|
72
- @extracted_tokens[t.name] = string[t.breakpoints[0]...t.breakpoints[1]]
73
- end
74
-
75
- combined_tokens.each do |combiner|
76
- name = combiner[0]
77
- from = @tokens[combiner[1]].breakpoints[0]
78
- to = @tokens[combiner[2]].breakpoints[1]
79
- @extracted_tokens[name] = string[from...to]
80
- end
81
-
82
- if block_given?
83
- yield @extracted_tokens
84
- end
85
-
86
- # return self for chaining
87
- self
88
- end
89
-
90
- protected
91
-
92
- def find_end_of token, string, start_at
93
- start = string.index(token.string, start_at+1) || string.length
94
- [start, [start + token.string.length, string.length].min]
95
- end
96
-
97
- end
98
-