string-eater 0.2.2 → 1.0.0
- checksums.yaml +15 -0
- data/LICENSE +1 -1
- data/README.md +31 -15
- data/examples/address.rb +22 -17
- data/examples/nginx.rb +21 -13
- data/ext/string-eater/c-tokenizer.c +20 -6
- data/ext/string-eater/extconf.rb +1 -0
- data/lib/c-tokenizer.rb +52 -37
- data/lib/string-eater.rb +4 -2
- data/lib/token.rb +5 -3
- data/lib/version.rb +8 -4
- data/spec/nginx_spec.rb +33 -21
- data/spec/spec_helper.rb +1 -0
- data/spec/string_eater_spec.rb +122 -130
- metadata +5 -9
- data/lib/ruby-tokenizer-each-char.rb +0 -145
- data/lib/ruby-tokenizer.rb +0 -98
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    ZmU1MmM0ZDY2MjQ4ZWM5MmFjN2E0YzVlODJkYWIwOWFlZDYxYzYyOQ==
+  data.tar.gz: !binary |-
+    NjhjZGQ1ZDQwZDVjNmE0MjZjM2Q5YTljNjAwNzFhNTJhNmE2ZmFmNw==
+!binary "U0hBNTEy":
+  metadata.gz: !binary |-
+    YTg0NzZjZTFkYzhjNWVhMzE2YjNjMzQ4N2RlNWYzYTI1NWM5MTE1MjE4NGEw
+    ODdmNjRiYWZiODVmZGY0ZmI1MTk5MmZiZGMyYTBhNTRjODZjOGM2ODRiYjM5
+    MWMwMzJmNGVlOTAyYTI2YmY0NzM4MTEwNDM3NjI1MTE1ZmRmNDU=
+  data.tar.gz: !binary |-
+    NzQ2NTdlYjAzY2NiMWIzYTRkYTI1NGFhZjgxOWY0YjgxYzk4ZDkyMGU3MDAw
+    YmQ5YjQzNDAzNGViOGJmYjFmOGI1MDIyNGI2OWNiZGVhN2ZkNWJjYTYzNTBh
+    YWYzZWRiYjE4ODA3YjI1ZmM4NWExZmI2ZmJmMzljMDA1Nzc0ZGY=
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -87,25 +87,41 @@ We can also do something like this:
 For another example, see `examples/nginx.rb`, which defines an
 [nginx](http://nginx.org) log line tokenizer.
 
-##
+## Non-strict usage
+
+Use `set_non_strict` to indicate that separator finding should be
+non-strict. This means that if the tokenizer fails to find a
+separator before finishing a string, it will fill in the last token
+with the remainder of the string. Normally (i.e., strict usage), the
+token whose closing character was not found is left nil.
+
+Example:
+
+    class PersonTokenizer < StringEater::Tokenizer
+      add_field :last_name
+      look_for ", "
+      add_field :first_name, :extract => false
+      look_for " | "
+      add_field :street_address, :extract => false
+      look_for ", "
+      add_field :city
+      look_for ", "
+      add_field :state
+      look_for ", "
+      set_non_strict
+    end
 
-
-
+    tokenizer = PersonTokenizer.new
+    string = "Flinstone, Fred | 301 Cobblestone Way, Bedrock"
+    tokenizer.tokenize! string
 
-
-
-`StringEater::Tokenizer`.
+    puts tokenizer.last_name # => "Flinstone"
+    puts tokenizer.city # => "Bedrock" (if strict, would be nil)
 
-
-
-implementation that is faster on Ruby than a translation of the C
-algorithm. Probably not as fast (or not much faster) than using
-Ruby regular expressions.
+Non-strict can also be set on an instance tokenizer,
+i.e., call `tokenizer.set_non_strict` to make `tokenizer` non-strict.
 
-
-This is essentially the same as the C implementation, but written
-in pure Ruby. It uses `String#each_char` and is therefore VERY
-SLOW! It provides a good way to hack the algorithm, though.
+## Implementation
 
 The main algorithm works by finding the start and end points of tokens
 in a string. The search is done incrementally (i.e., loop through the
data/examples/address.rb
CHANGED
@@ -1,35 +1,40 @@
+# encoding: utf-8
+
 # once the gem is installed, you don't need this
-
-
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'lib')))
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'ext/string-eater')))
 
 # this is the example from the README
 require 'string-eater'
 
-
-
-
-
-
-
-
+# example tokenizer for addresses
+class PersonTokenizer < StringEater::Tokenizer
+  add_field :last_name
+  look_for ', '
+  add_field :first_name, extract: false
+  look_for ' | '
+  add_field :street_address, extract: false
+  look_for ', '
   add_field :city
-  look_for
-  add_field :state
-  look_for
+  look_for ', '
+  add_field :state
+  look_for ', '
 end
 
-if __FILE__ == $
+if __FILE__ == $PROGRAM_NAME
   tokenizer = PersonTokenizer.new
   puts tokenizer.describe_line
 
-  string =
+  string = 'Flinstone, Fred | 301 Cobblestone Way, Bedrock, NA, 00000'
   tokenizer.tokenize! string
 
-  puts tokenizer.last_name # => "Flinestone"
-  puts tokenizer.city # => "Bedrock"
+  puts tokenizer.last_name # => "Flinestone"
+  puts tokenizer.city # => "Bedrock"
   puts tokenizer.state # => "NA"
 
-  tokenizer.tokenize!(string) do |tokens|
+  tokenizer.tokenize!(string) do |tokens|
     puts "The #{tokens[:last_name]}s live in #{tokens[:city]}"
   end
 end
data/examples/nginx.rb
CHANGED
@@ -1,27 +1,32 @@
+# encoding: utf-8
+
 # once the gem is installed, you don't need this
-
-
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'lib')))
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'ext/string-eater')))
 
 require 'string-eater'
 
+# Example tokenizer for nginx log lines
 class NginxLogTokenizer < StringEater::CTokenizer
   add_field :ip
-  look_for
-  add_field :remote_user, :
-  look_for
-  add_field :timestamp, :
+  look_for ' - '
+  add_field :remote_user, extract: false
+  look_for ' ['
+  add_field :timestamp, extract: false
   look_for "] \""
   add_field :request
   look_for "\" "
   add_field :status_code
-  look_for
-  add_field :bytes_sent, :
+  look_for ' '
+  add_field :bytes_sent, extract: false
   look_for " \""
   add_field :referrer_url
   look_for "\" \""
   add_field :user_agent
   look_for "\" \""
-  add_field :compression, :
+  add_field :compression, extract: false
   look_for "\" "
   add_field :remainder
 
@@ -47,14 +52,17 @@ class NginxLogTokenizer < StringEater::CTokenizer
     end
   end
 
-if __FILE__ == $
+if __FILE__ == $PROGRAM_NAME
   tokenizer = NginxLogTokenizer.new
   puts tokenizer.describe_line
 
-  str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500]
+  str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] ' +
+        '"GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" ' +
+        '"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; ' +
+        'Trident/5.0)" "-" "there could be" other "stuff here"'
 
-  puts
-  puts
+  puts 'input string: ' + str
+  puts 'Tokens: '
 
   # use a block to work with the extracted tokens
   tokenizer.tokenize!(str) do |tokens|
data/ext/string-eater/c-tokenizer.c
CHANGED
@@ -10,11 +10,12 @@ static VALUE rb_cCTokenizer;
 static VALUE rb_mStringEater;
 
 static VALUE tokenize_string(VALUE self,
-
-
-
-
-
+                             VALUE string,
+                             VALUE tokens_to_find_indexes,
+                             VALUE tokens_to_find_strings,
+                             VALUE tokens_to_extract_indexes,
+                             VALUE tokens_to_extract_names,
+                             VALUE non_strict)
 {
   const char* input_string = StringValueCStr(string);
   VALUE extracted_tokens = rb_hash_new();
@@ -115,6 +116,19 @@ static VALUE tokenize_string(VALUE self,
     }
   }
 
+  /*
+    got to the end of the string
+    and have an incomplete token
+    and not strict
+  */
+  if(ix == str_len && curr_token_ix < n_tokens && RTEST(non_strict))
+  {
+    rb_hash_aset(extracted_tokens,
+                 rb_ary_entry(tokens_to_extract_names, curr_token_ix - 1),
+                 rb_usascii_str_new(input_string + startpoint,
+                                    str_len - startpoint));
+  }
+
   curr_token_ix = n_tokens - 1;
 
   if(ix < str_len && curr_token_ix == next_token_to_extract_ix)
@@ -139,7 +153,7 @@ void Init_c_tokenizer_ext(void)
   rb_cCTokenizer = rb_define_class_under(rb_mStringEater,
       "CTokenizer", rb_cObject);
 
-  rb_define_method(rb_cCTokenizer, "ctokenize!", tokenize_string,
+  rb_define_method(rb_cCTokenizer, "ctokenize!", tokenize_string, 6);
 
   /* set the callback for when the extension is unloaded */
   rb_set_end_proc(finalize_c_tokenizer_ext, 0);
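The block added in the second hunk above is the heart of the non-strict feature on the C side: if the scan reaches the end of the string (`ix == str_len`) while separators remain unmatched, the token that was in progress is stored under its name instead of being dropped. A minimal Ruby-level sketch of the resulting behavior (the `KeyValue` class is hypothetical, not part of the gem; assumes the 1.0.0 gem is installed):

    require 'string-eater'

    # hypothetical tokenizer for "key=value;" records
    class KeyValue < StringEater::Tokenizer
      add_field :key
      look_for '='
      add_field :value
      look_for ';'     # closing separator for :value
      set_non_strict   # fill the open token at end-of-string
    end

    kv = KeyValue.new
    kv.tokenize!('a=b') # note: no trailing ';'
    puts kv.key   # => "a"
    puts kv.value # => "b" (strict mode would leave this nil)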
data/ext/string-eater/extconf.rb
CHANGED
data/lib/c-tokenizer.rb
CHANGED
@@ -1,17 +1,22 @@
+# encoding: utf-8
+
 require 'c_tokenizer_ext'
 
+# Ruby interface to the c extension
 class StringEater::CTokenizer
+  attr_reader :tokens
+
   def self.tokens
     @tokens ||= []
   end
 
-  def self.add_field
-
-    define_method(name) {@extracted_tokens[name]}
+  def self.add_field(name, opts = {})
+    tokens << StringEater::Token.new_field(name, opts)
+    define_method(name) { @extracted_tokens[name] }
   end
 
-  def self.look_for
-
+  def self.look_for(look_for_tokens)
+    tokens << StringEater::Token.new_separator(look_for_tokens)
   end
 
   # This is very slow, only do it when necessary
@@ -19,12 +24,20 @@ class StringEater::CTokenizer
     Marshal.load(Marshal.dump(tokens))
   end
 
+  def self.set_non_strict
+    @class_non_strict = true
+  end
+
+  def self.non_strict?
+    @class_non_strict == true
+  end
+
   def initialize
     refresh_tokens
   end
 
-  def
-    @
+  def set_non_strict
+    @non_strict = true
   end
 
   def extract_all_fields
@@ -41,7 +54,7 @@ class StringEater::CTokenizer
     refresh_tokens
   end
 
-  def extract_fields
+  def extract_fields(*fields)
     @token_filter = lambda do |t|
       t.opts[:extract] = fields.include?(t.name)
     end
@@ -52,76 +65,78 @@ class StringEater::CTokenizer
   def refresh_tokens
     @tokens = self.class.dup_tokens
 
-    if @token_filter
-      @tokens.each{|t| @token_filter.call(t)}
-    end
-
-    tokens_to_find = tokens.each_with_index.map do |t, i|
-      [i, t.string] if t.string
-    end.compact
+    @tokens.each { |t| @token_filter.call(t) } if @token_filter
 
-
-    @
+    tokens_to_find = gen_tokens_to_find
+    @tokens_to_find_indexes = tokens_to_find.map { |t| t[0] }
+    @tokens_to_find_strings = tokens_to_find.map { |t| t[1] }
 
-    tokens_to_extract =
-
-
-
-    @tokens_to_extract_indexes = tokens_to_extract.map{|t| t[0]}
-    @tokens_to_extract_names = tokens.map{|t| t.name}
+    tokens_to_extract = gen_tokens_to_extract
+    @tokens_to_extract_indexes = tokens_to_extract.map { |t| t[0] }
+    @tokens_to_extract_names = tokens.map { |t| t.name }
 
     @have_tokens_to_extract = (@tokens_to_extract_indexes.size > 0)
   end
 
   def describe_line
-    tokens.
-      desc << (t.string || t.name.to_s ||
+    tokens.reduce('') do |desc, t|
+      desc << (t.string || t.name.to_s || 'xxxxxx')
     end
   end
 
   def do_extra_parsing
   end
 
-
+  # Not sure this could be much more concise
+  # rubocop:disable MethodLength
+  def tokenize!(string, &block)
     @string = string
     @extracted_tokens ||= {}
     @extracted_tokens.clear
+    @non_strict ||= self.class.non_strict?
 
     return unless @have_tokens_to_extract
 
-    @extracted_tokens = ctokenize!(@string,
+    @extracted_tokens = ctokenize!(@string,
                                    @tokens_to_find_indexes,
                                    @tokens_to_find_strings,
                                    @tokens_to_extract_indexes,
-                                   @tokens_to_extract_names
+                                   @tokens_to_extract_names,
+                                   @non_strict)
 
     # extra parsing hook
     do_extra_parsing
 
-    if block_given?
-      yield @extracted_tokens
-    end
+    yield @extracted_tokens if block_given?
 
     # return self for chaining
     self
-  end
-
+  end
+  # rubocop:enable MethodLength
+
   private
 
-  def set_token_startpoint
+  def set_token_startpoint(ix, startpoint)
     @tokens[ix].breakpoints[0] = startpoint
   end
 
-  def get_token_startpoint
+  def get_token_startpoint(ix)
     @tokens[ix].breakpoints[0]
   end
 
-  def set_token_endpoint
+  def set_token_endpoint(ix, endpoint)
     @tokens[ix].breakpoints[1] = endpoint
   end
 
-  def extract_token?
+  def extract_token?(ix)
     @tokens[ix].extract?
   end
 
+  def gen_tokens_to_find
+    tokens.each_with_index.map { |t, i| [i, t.string] if t.string }.compact
+  end
+
+  def gen_tokens_to_extract
+    tokens.each_with_index.map { |t, i| [i, t.name] if t.extract? }.compact
+  end
 end
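Note the flag resolution in `tokenize!` above: `@non_strict ||= self.class.non_strict?` means a class-level `set_non_strict` is picked up lazily on the first call, while any single instance can also opt in on its own. A small sketch of both styles (the `WordTokenizer` class is hypothetical; assumes the gem is installed):

    require 'string-eater'

    class WordTokenizer < StringEater::Tokenizer
      add_field :first
      look_for ' '
      add_field :rest
      look_for "\n"   # :rest has a closing separator
    end

    strict = WordTokenizer.new
    strict.tokenize!('hello world').rest   # => nil ("\n" never found)

    lenient = WordTokenizer.new
    lenient.set_non_strict                 # this instance only
    lenient.tokenize!('hello world').rest  # => "world"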
data/lib/string-eater.rb
CHANGED
@@ -1,10 +1,12 @@
+# encoding: utf-8
+
+# Namespacing module for StringEater
 module StringEater
   autoload :Token, 'token'
-  autoload :RubyTokenizer, 'ruby-tokenizer'
-  autoload :RubyTokenizerEachCHar, 'ruby-tokenizer-each-char'
   autoload :CTokenizer, 'c-tokenizer'
 
   autoload :VERSION, 'version'
 
+  # by default, Tokenizer is the c extension tokenizer
   class Tokenizer < CTokenizer; end
 end
data/lib/token.rb
CHANGED
@@ -1,9 +1,12 @@
+# encoding: utf-8
+
+# Token class used by tokenizers
 class StringEater::Token
   attr_accessor :name, :string, :opts, :breakpoints, :children
 
   def initialize
     @opts = {}
-    @breakpoints = [nil,nil]
+    @breakpoints = [nil, nil]
   end
 
   def extract?
@@ -13,7 +16,7 @@ class StringEater::Token
   def self.new_field(name, opts)
     t = new
     t.name = name
-    t.opts = {:extract => true}.merge(opts)
+    t.opts = { extract: true }.merge(opts)
     t
   end
 
@@ -22,5 +25,4 @@ class StringEater::Token
     t.string = string
     t
   end
-
 end
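The `new_field` change above keeps the old default: the caller's options are merged over `{ extract: true }`, so a field is extracted unless `extract: false` is passed explicitly. A tiny sketch (assumes `extract?` reads `opts[:extract]`, as the tokenizers' usage suggests):

    require 'string-eater'

    city = StringEater::Token.new_field(:city, {})
    city.extract?  # => true, from the { extract: true } default

    zip = StringEater::Token.new_field(:zip, extract: false)
    zip.extract?   # => false, the caller's option wins in the merge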
data/lib/version.rb
CHANGED
@@ -1,8 +1,12 @@
+# encoding: utf-8
+
+# Extend StringEater with Version
 module StringEater
-  module VERSION
-    MAJOR = 0
-    MINOR = 2
-    PATCH = 2
+  # Version constants
+  module VERSION
+    MAJOR = 1
+    MINOR = 0
+    PATCH = 0
     PRE = nil
     STRING = [MAJOR, MINOR, PATCH, PRE].compact.join('.')
   end
data/spec/nginx_spec.rb
CHANGED
@@ -1,32 +1,44 @@
+# encoding: utf-8
+
 require 'spec_helper'
 require 'string-eater'
 
-
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'examples')))
 
 require 'nginx'
 
 describe NginxLogTokenizer do
   before(:each) do
     @tokenizer = NginxLogTokenizer.new
-    @str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] ' +
+           '"GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" ' +
+           '"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; ' +
+           'Trident/5.0)" "-" "there could be" other "stuff here"'
+    @str2 = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] ' +
+            '"GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" ' +
+            '"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; ' +
+            'WOW64; Trident/5.0)" "-"'
+  end
+
+  user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; ' +
+               'WOW64; Trident/5.0)'
+
+  {
+    ip: '73.80.217.212',
+    request: 'GET /this_is_a_url HTTP/1.1',
+    status_code: 304,
+    referrer_url: 'http://referrer.com',
+    user_agent: user_agent,
+    remainder: "\"there could be\" other \"stuff here\"",
+  }.each_pair do |token, val|
+    it "finds the right value for #{token}" do
+      @tokenizer.tokenize!(@str).send(token).should == val
+    end
+  end
+
+  it 'correctly handles there not being a remainder' do
+    @tokenizer.tokenize!(@str2).remainder.should be_nil
+  end
 
 end
data/spec/spec_helper.rb
CHANGED
data/spec/string_eater_spec.rb
CHANGED
@@ -1,193 +1,185 @@
+# encoding: utf-8
+
 require 'spec_helper'
 require 'string-eater'
 
-TestedClass = StringEater::CTokenizer
-
 describe StringEater do
-  it
-    StringEater::VERSION::STRING.split(
+  it 'has a version' do
+    StringEater::VERSION::STRING.split('.').size.should >= 3
   end
 end
 
 # normal use
-class Example1 <
+class Example1 < StringEater::CTokenizer
   add_field :first_word
-  look_for
-  add_field :second_word, :
-  look_for
+  look_for ' '
+  add_field :second_word, extract: false
+  look_for '|'
   add_field :third_word
 end
 
 describe Example1 do
-
-
-
-
-
-
-
-
+  let(:tokenizer) { Example1.new }
+  let(:first_word1) { 'foo' }
+  let(:second_word1) { 'bar' }
+  let(:third_word1) { 'baz' }
+  let(:str1) { "#{first_word1} #{second_word1}|#{third_word1}" }
+
+  describe '#extract_all_fields' do
+    it 'extracts all of the fields' do
+      tokenizer.extract_all_fields
+      tokenizer.tokenize!(str1)
+      expect(tokenizer.first_word).to eq(first_word1)
+      expect(tokenizer.second_word).to eq(second_word1)
+      expect(tokenizer.third_word).to eq(third_word1)
+    end
   end
 
-  describe
-    it
-
+  describe '#extract_no_fields' do
+    it 'does not extract any of the fields' do
+      tokenizer.extract_no_fields
+      tokenizer.tokenize!(str1)
+      tokenizer.first_word.should be_nil
+      tokenizer.second_word.should be_nil
+      tokenizer.third_word.should be_nil
     end
   end
 
-  describe
-    it
-
-
-
-
-
+  describe '#extract_fields' do
+    it 'allows us to set which fields get extracted' do
+      tokenizer.extract_fields :second_word
+      tokenizer.tokenize!(str1)
+      tokenizer.first_word.should be_nil
+      expect(tokenizer.second_word).to eq(second_word1)
+      tokenizer.third_word.should be_nil
    end
  end
 
-  describe
-    it
-
-      @tokenizer.tokenize!(@str1)
-      @tokenizer.first_word.should be_nil
-      @tokenizer.second_word.should be_nil
-      @tokenizer.third_word.should be_nil
+  describe 'tokenize!' do
+    it 'returns itself' do
+      tokenizer.tokenize!(str1).should == tokenizer
    end
-  end
 
-
-
-      @tokenizer.extract_fields :second_word
-      @tokenizer.tokenize!(@str1)
-      @tokenizer.first_word.should be_nil
-      @tokenizer.second_word.should == @second_word1
-      @tokenizer.third_word.should be_nil
+    it 'sets the first word' do
+      tokenizer.tokenize!(str1).first_word.should == 'foo'
    end
-  end
 
-
-
-      @tokenizer.tokenize!(@str1).should == @tokenizer
+    it 'sets the third word' do
+      tokenizer.tokenize!(str1).third_word.should == 'baz'
    end
 
-    it
-
+    it 'does not set the second word' do
+      tokenizer.tokenize!(str1).second_word.should be_nil
    end
 
-    it
-
+    it 'yields a hash of tokens if a block is given' do
+      tokenizer.tokenize!(str1) do |tokens|
+        tokens[:first_word].should == 'foo'
+      end
    end
 
-    it
-
+    it 'returns everything to the end of the line for the last token' do
+      s = 'c defg asdf | foo , baa'
+      tokenizer.tokenize!("a b|#{s}").third_word.should == s
    end
 
-
-
-
+    context 'when the last delimiter is missing' do
+      let(:s) { 'a b' }
+      it 'still finds the first word' do
+        expect(tokenizer.tokenize!(s).first_word).to eq('a')
      end
-    end
 
-
-
-
-    end
+      it 'returns nil for the second word' do
+        expect(tokenizer.tokenize!(s).second_word).to be_nil
+      end
 
-
-
-
-      @tokenizer.tokenize!(s).third_word.should be_nil
+      it 'returns nil for the third word' do
+        expect(tokenizer.tokenize!(s).third_word).to be_nil
+      end
    end
 
-
+    context 'when non_strict is enabled' do
+      before do
+        tokenizer.extract_all_fields
+        tokenizer.set_non_strict
+      end
+
+      context 'when the last delimiter is missing' do
+        let(:s) { 'a b' }
+        it 'still finds the first word' do
+          expect(tokenizer.tokenize!(s).first_word).to eq('a')
+        end
+
+        it 'still finds the second word' do
+          expect(tokenizer.tokenize!(s).second_word).to eq('b')
+        end
+
+        it 'returns nil for the third word' do
+          expect(tokenizer.tokenize!(s).third_word).to be_nil
+        end
+      end
+
+      context 'when the last delimiter is not missing' do
+        let(:s) { 'a b|c' }
+        it 'still finds the first word' do
+          expect(tokenizer.tokenize!(s).first_word).to eq('a')
+        end
+
+        it 'still finds the second word' do
+          expect(tokenizer.tokenize!(s).second_word).to eq('b')
+        end
 
+        it 'returns nil for the third word' do
+          expect(tokenizer.tokenize!(s).third_word).to eq('c')
+        end
+      end
+    end
+  end
 end
 
 # an example where we ignore after a certain point in the string
-class Example2 <
-  add_field :first_word, :
-  look_for
+class Example2 < StringEater::CTokenizer
+  add_field :first_word, extract: false
+  look_for ' '
   add_field :second_word
-  look_for
-  add_field :third_word, :
-  look_for
+  look_for ' '
+  add_field :third_word, extract: false
+  look_for '-'
 end
 
 describe Example2 do
+  let(:tokenizer) { Example2.new }
+  let(:second_word1) { 'bar' }
+  let(:str1) { "foo #{second_word1} baz-" }
 
-
-
-
-
-  end
-
-  describe "tokenize!" do
-    it "should find the token when there is extra stuff at the end of the string" do
-      @tokenizer.tokenize!(@str1).second_word.should == @second_word1
+  describe 'tokenize!' do
+    it 'finds the token when there is extra stuff at the' +
+       'end of the string' do
+      tokenizer.tokenize!(str1).second_word.should == second_word1
    end
  end
 
 end
 
 # an example where the split is more than one char
-class Example3 <
-  look_for
+class Example3 < StringEater::CTokenizer
+  look_for 'foo='
   add_field :foo_val
-  look_for
+  look_for '&'
 end
 
 describe Example3 do
-
-    @tokenizer = Example3.new
-  end
+  let(:tokenizer) { Example3.new }
 
-  describe
-    it
-
+  describe 'tokenize!' do
+    it 'finds the token if there is only one occurrence ' +
+       'of the characters in the separator' do
+      tokenizer.tokenize!('abcd?foo=val&blah').foo_val.should == 'val'
    end
 
-    it
-
+    it 'still works if part of the separator token occurs' do
+      tokenizer.tokenize!('abcd?foo_blah=baz&foo=bar&buh')
+        .foo_val.should == 'bar'
    end
  end
 end
-
-# CTokenizer doesn't do combine_fields because
-# writing out breakpoints is a significant slow-down
-if TestedClass.respond_to?(:combine_fields)
-  # an example where we combine fields
-  class Example3 < TestedClass
-    add_field :first_word, :extract => false
-    look_for " \""
-    add_field :part1, :extract => false
-    look_for " "
-    add_field :part2
-    look_for " "
-    add_field :part3, :extract => false
-    look_for "\""
-
-    combine_fields :from => :part1, :to => :part3, :as => :parts
-  end
-
-  describe Example3 do
-    before(:each) do
-      @tokenizer = Example3.new
-      @str1 = "foo \"bar baz bang\""
-      @part2 = "baz"
-      @parts = "bar baz bang"
-    end
-
-    it "should extract like normal" do
-      @tokenizer.tokenize!(@str1).part2.should == @part2
-    end
-
-    it "should ignore like normal" do
-      @tokenizer.tokenize!(@str1).part1.should be_nil
-    end
-
-    it "should extract the combined field" do
-      @tokenizer.tokenize!(@str1).parts.should == @parts
-    end
-
-  end
-end
metadata
CHANGED
@@ -1,15 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: string-eater
 version: !ruby/object:Gem::Version
-  version: 0.2.2
-  prerelease:
+  version: 1.0.0
 platform: ruby
 authors:
 - Dan Swain
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2014-01-05 00:00:00.000000000 Z
 dependencies: []
 description: Fast string tokenizer. Nom strings.
 email:
@@ -20,8 +19,6 @@ extensions:
 extra_rdoc_files: []
 files:
 - lib/c-tokenizer.rb
-- lib/ruby-tokenizer-each-char.rb
-- lib/ruby-tokenizer.rb
 - lib/string-eater.rb
 - lib/token.rb
 - lib/version.rb
@@ -37,28 +34,27 @@ files:
 - README.md
 homepage: http://github.com/simplifi/string-eater
 licenses: []
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 - ext/string-eater
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version:
+rubygems_version: 2.0.6
 signing_key:
-specification_version:
+specification_version: 4
 summary: Fast string tokenizer. Nom strings.
 test_files:
 - spec/nginx_spec.rb
data/lib/ruby-tokenizer-each-char.rb
DELETED
@@ -1,145 +0,0 @@
-# this tokenizer is very slow, but it illustrates the
-# basic idea of the C tokenizer
-class StringEater::RubyTokenizerEachChar
-
-  def self.tokens
-    @tokens ||= []
-  end
-
-  def self.combined_tokens
-    @combined_tokens ||= []
-  end
-
-  def self.add_field name, opts={}
-    self.tokens << StringEater::Token::new_field(name, opts)
-    define_method(name) {@extracted_tokens[name]}
-  end
-
-  def self.look_for tokens
-    self.tokens << StringEater::Token::new_separator(tokens)
-  end
-
-  def self.combine_fields opts={}
-    from_token_index = self.tokens.index{|t| t.name == opts[:from]}
-    to_token_index = self.tokens.index{|t| t.name == opts[:to]}
-    self.combined_tokens << [opts[:as], from_token_index, to_token_index]
-    define_method(opts[:as]) {@extracted_tokens[opts[:as]]}
-  end
-
-  def tokens
-    @tokens ||= self.class.tokens
-  end
-
-  def combined_tokens
-    @combined_tokens ||= self.class.combined_tokens
-  end
-
-  def refresh_tokens
-    @combined_tokens = nil
-    @tokens = nil
-    tokens
-  end
-
-  def describe_line
-    tokens.inject("") do |desc, t|
-      desc << (t.string || t.name.to_s || "xxxxxx")
-    end
-  end
-
-  def find_breakpoints string
-    tokenize!(string) unless @string == string
-    tokens.inject([]) do |bp, t|
-      bp << t.breakpoints
-      bp
-    end.flatten.uniq
-  end
-
-  def tokenize! string, &block
-    @string = string
-    @extracted_tokens ||= {}
-    @extracted_tokens.clear
-    @tokens_to_find ||= tokens.each_with_index.map do |t, i|
-      [i, t.string] if t.string
-    end.compact
-    @tokens_to_extract_indeces ||= tokens.each_with_index.map do |t, i|
-      i if t.extract?
-    end.compact
-
-    tokens.first.breakpoints[0] = 0
-
-    find_index = 0
-
-    curr_token = @tokens_to_find[find_index]
-    curr_token_index = curr_token[0]
-    curr_token_length = curr_token[1].length
-    looking_for_index = 0
-    looking_for = curr_token[1][looking_for_index]
-
-    counter = 0
-    string.each_char do |c|
-      if c == looking_for
-        if looking_for_index == 0
-          # entering new token
-          if curr_token_index > 0
-            t = tokens[curr_token_index - 1]
-            t.breakpoints[1] = counter
-            if t.extract?
-              @extracted_tokens[t.name] = string[t.breakpoints[0]...t.breakpoints[1]]
-            end
-          end
-          tokens[curr_token_index].breakpoints[0] = counter
-        end
-        if looking_for_index >= (curr_token_length - 1)
-          # leaving token
-          tokens[curr_token_index].breakpoints[1] = counter
-
-          if curr_token_index >= tokens.size-1
-            # we're done!
-            break
-          else
-            tokens[curr_token_index + 1].breakpoints[0] = counter + 1
-          end
-
-          # next token
-          find_index += 1
-          if find_index >= @tokens_to_find.length
-            # we're done!
-            break
-          end
-          curr_token = @tokens_to_find[find_index]
-          curr_token_index = curr_token[0]
-          curr_token_length = curr_token[1].length
-          looking_for_index = 0
-        else
-          looking_for_index += 1
-        end
-      end
-      looking_for = curr_token[1][looking_for_index]
-      counter += 1
-    end
-
-    last_token = tokens.last
-    last_token.breakpoints[1] = string.length
-
-    if last_token.extract?
-      @extracted_tokens[last_token.name] = string[last_token.breakpoints[0]..last_token.breakpoints[1]]
-    end
-
-    combined_tokens.each do |combiner|
-      name = combiner[0]
-      from = @tokens[combiner[1]].breakpoints[0]
-      to = @tokens[combiner[2]].breakpoints[1]
-      @extracted_tokens[name] = string[from...to]
-    end
-
-    if block_given?
-      yield @extracted_tokens
-    end
-
-    # return self for chaining
-    self
-  end
-
-end
-
-
data/lib/ruby-tokenizer.rb
DELETED
@@ -1,98 +0,0 @@
-# this tokenizer is fairly fast, but not necessarily faster than regexps
-class StringEater::RubyTokenizer
-  def self.tokens
-    @tokens ||= []
-  end
-
-  def self.combined_tokens
-    @combined_tokens ||= []
-  end
-
-  def self.add_field name, opts={}
-    self.tokens << StringEater::Token::new_field(name, opts)
-    define_method(name) {@extracted_tokens[name]}
-  end
-
-  def self.look_for tokens
-    self.tokens << StringEater::Token::new_separator(tokens)
-  end
-
-  def self.combine_fields opts={}
-    from_token_index = self.tokens.index{|t| t.name == opts[:from]}
-    to_token_index = self.tokens.index{|t| t.name == opts[:to]}
-    self.combined_tokens << [opts[:as], from_token_index, to_token_index]
-    define_method(opts[:as]) {@extracted_tokens[opts[:as]]}
-  end
-
-  def tokens
-    @tokens ||= self.class.tokens
-  end
-
-  def combined_tokens
-    @combined_tokens ||= self.class.combined_tokens
-  end
-
-  def refresh_tokens
-    @combined_tokens = nil
-    @tokens = nil
-    tokens
-  end
-
-  def describe_line
-    tokens.inject("") do |desc, t|
-      desc << (t.string || t.name.to_s || "xxxxxx")
-    end
-  end
-
-  def find_breakpoints(string)
-    @literal_tokens ||= tokens.select{|t| t.string}
-    @breakpoints ||= Array.new(2*@literal_tokens.size + 2)
-    @breakpoints[0] = 0
-    @breakpoints[-1] = string.length
-    start_point = 0
-    @literal_tokens.each_with_index do |t, i|
-      @breakpoints[2*i+1], start_point = find_end_of(t, string, start_point)
-      @breakpoints[2*i+2] = start_point
-    end
-    @breakpoints
-  end
-
-  def tokenize! string, &block
-    @extracted_tokens ||= {}
-    @extracted_tokens.clear
-    @tokens_to_extract ||= tokens.select{|t| t.extract?}
-
-    find_breakpoints(string)
-    last_important_bp = [@breakpoints.length, tokens.size].min
-    (0...last_important_bp).each do |i|
-      tokens[i].breakpoints = [@breakpoints[i], @breakpoints[i+1]]
-    end
-
-    @tokens_to_extract.each do |t|
-      @extracted_tokens[t.name] = string[t.breakpoints[0]...t.breakpoints[1]]
-    end
-
-    combined_tokens.each do |combiner|
-      name = combiner[0]
-      from = @tokens[combiner[1]].breakpoints[0]
-      to = @tokens[combiner[2]].breakpoints[1]
-      @extracted_tokens[name] = string[from...to]
-    end
-
-    if block_given?
-      yield @extracted_tokens
-    end
-
-    # return self for chaining
-    self
-  end
-
-  protected
-
-  def find_end_of token, string, start_at
-    start = string.index(token.string, start_at+1) || string.length
-    [start, [start + token.string.length, string.length].min]
-  end
-
-end
-