string-eater 0.2.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/LICENSE +1 -1
- data/README.md +31 -15
- data/examples/address.rb +22 -17
- data/examples/nginx.rb +21 -13
- data/ext/string-eater/c-tokenizer.c +20 -6
- data/ext/string-eater/extconf.rb +1 -0
- data/lib/c-tokenizer.rb +52 -37
- data/lib/string-eater.rb +4 -2
- data/lib/token.rb +5 -3
- data/lib/version.rb +8 -4
- data/spec/nginx_spec.rb +33 -21
- data/spec/spec_helper.rb +1 -0
- data/spec/string_eater_spec.rb +122 -130
- metadata +5 -9
- data/lib/ruby-tokenizer-each-char.rb +0 -145
- data/lib/ruby-tokenizer.rb +0 -98
checksums.yaml
ADDED
```diff
@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    ZmU1MmM0ZDY2MjQ4ZWM5MmFjN2E0YzVlODJkYWIwOWFlZDYxYzYyOQ==
+  data.tar.gz: !binary |-
+    NjhjZGQ1ZDQwZDVjNmE0MjZjM2Q5YTljNjAwNzFhNTJhNmE2ZmFmNw==
+!binary "U0hBNTEy":
+  metadata.gz: !binary |-
+    YTg0NzZjZTFkYzhjNWVhMzE2YjNjMzQ4N2RlNWYzYTI1NWM5MTE1MjE4NGEw
+    ODdmNjRiYWZiODVmZGY0ZmI1MTk5MmZiZGMyYTBhNTRjODZjOGM2ODRiYjM5
+    MWMwMzJmNGVlOTAyYTI2YmY0NzM4MTEwNDM3NjI1MTE1ZmRmNDU=
+  data.tar.gz: !binary |-
+    NzQ2NTdlYjAzY2NiMWIzYTRkYTI1NGFhZjgxOWY0YjgxYzk4ZDkyMGU3MDAw
+    YmQ5YjQzNDAzNGViOGJmYjFmOGI1MDIyNGI2OWNiZGVhN2ZkNWJjYTYzNTBh
+    YWYzZWRiYjE4ODA3YjI1ZmM4NWExZmI2ZmJmMzljMDA1Nzc0ZGY=
```
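The two top-level keys are base64-encoded digest names (`U0hBMQ==` decodes to `SHA1`, `U0hBNTEy` to `SHA512`), and each nested value is the digest of the named archive inside the gem. A minimal verification sketch, assuming `metadata.gz` and `data.tar.gz` have been extracted next to `checksums.yaml`; the loop and paths are illustrative assumptions, not part of this release:

```ruby
require 'yaml'
require 'digest'

# Psych decodes the !binary-tagged keys ("SHA1", "SHA512") and values
# (hex digest strings) when the file is loaded.
checksums = YAML.load_file('checksums.yaml')

checksums.each do |digest_name, files|
  files.each do |file, expected_hex|
    actual_hex = Digest(digest_name).file(file).hexdigest
    status = (actual_hex == expected_hex) ? 'ok' : 'MISMATCH'
    puts "#{file} (#{digest_name}): #{status}"
  end
end
```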
data/LICENSE
CHANGED
data/README.md
CHANGED
```diff
@@ -87,25 +87,41 @@ We can also do something like this:
 For another example, see `examples/nginx.rb`, which defines an
 [nginx](http://nginx.org) log line tokenizer.
 
-##
+## Non-strict usage
+
+Use `set_non_strict` to indicate that separator finding should be
+non-strict. This means that if the tokenizer fails to find a
+separator before finishing a string, it will fill in the last token
+with the remainder of the string. Normally (i.e., strict usage), the
+token whose closing character was not found is left nil.
+
+Example:
+
+    class PersonTokenizer < StringEater::Tokenizer
+      add_field :last_name
+      look_for ", "
+      add_field :first_name, :extract => false
+      look_for " | "
+      add_field :street_address, :extract => false
+      look_for ", "
+      add_field :city
+      look_for ", "
+      add_field :state
+      look_for ", "
+      set_non_strict
+    end
 
-
-
+    tokenizer = PersonTokenizer.new
+    string = "Flinstone, Fred | 301 Cobblestone Way, Bedrock"
+    tokenizer.tokenize! string
 
-
-
-`StringEater::Tokenizer`.
+    puts tokenizer.last_name # => "Flinstone"
+    puts tokenizer.city # => "Bedrock" (if strict, would be nil)
 
-
-
-implementation that is faster on Ruby than a translation of the C
-algorithm. Probably not as fast (or not much faster) than using
-Ruby regular expressions.
+Non-strict can also be set on an instance tokenizer,
+i.e., call `tokenizer.set_non_strict` to make `tokenizer` non-strict.
 
-
-This is essentially the same as the C implementation, but written
-in pure Ruby. It uses `String#each_char` and is therefore VERY
-SLOW! It provides a good way to hack the algorithm, though.
+## Implementation
 
 The main algorithm works by finding the start and end points of tokens
 in a string. The search is done incrementally (i.e., loop through the
```
data/examples/address.rb
CHANGED
```diff
@@ -1,35 +1,40 @@
+# encoding: utf-8
+
 # once the gem is installed, you don't need this
-
-
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'lib')))
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'ext/string-eater')))
 
 # this is the example from the README
 require 'string-eater'
 
-
-
-
-
-
-
-
+# example tokenizer for addresses
+class PersonTokenizer < StringEater::Tokenizer
+  add_field :last_name
+  look_for ', '
+  add_field :first_name, extract: false
+  look_for ' | '
+  add_field :street_address, extract: false
+  look_for ', '
   add_field :city
-  look_for
-  add_field :state
-  look_for
+  look_for ', '
+  add_field :state
+  look_for ', '
 end
 
-if __FILE__ == $
+if __FILE__ == $PROGRAM_NAME
   tokenizer = PersonTokenizer.new
   puts tokenizer.describe_line
 
-  string =
+  string = 'Flinstone, Fred | 301 Cobblestone Way, Bedrock, NA, 00000'
   tokenizer.tokenize! string
 
-  puts tokenizer.last_name # => "Flinestone"
-  puts tokenizer.city # => "Bedrock"
+  puts tokenizer.last_name # => "Flinestone"
+  puts tokenizer.city # => "Bedrock"
   puts tokenizer.state # => "NA"
 
-  tokenizer.tokenize!(string) do |tokens|
+  tokenizer.tokenize!(string) do |tokens|
     puts "The #{tokens[:last_name]}s live in #{tokens[:city]}"
   end
 end
```
data/examples/nginx.rb
CHANGED
```diff
@@ -1,27 +1,32 @@
+# encoding: utf-8
+
 # once the gem is installed, you don't need this
-
-
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'lib')))
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'ext/string-eater')))
 
 require 'string-eater'
 
+# Example tokenizer for nginx log lines
 class NginxLogTokenizer < StringEater::CTokenizer
   add_field :ip
-  look_for
-  add_field :remote_user, :
-  look_for
-  add_field :timestamp, :
+  look_for ' - '
+  add_field :remote_user, extract: false
+  look_for ' ['
+  add_field :timestamp, extract: false
   look_for "] \""
   add_field :request
   look_for "\" "
   add_field :status_code
-  look_for
-  add_field :bytes_sent, :
+  look_for ' '
+  add_field :bytes_sent, extract: false
   look_for " \""
   add_field :referrer_url
   look_for "\" \""
   add_field :user_agent
   look_for "\" \""
-  add_field :compression, :
+  add_field :compression, extract: false
   look_for "\" "
   add_field :remainder
 
@@ -47,14 +52,17 @@ class NginxLogTokenizer < StringEater::CTokenizer
   end
 end
 
-if __FILE__ == $
+if __FILE__ == $PROGRAM_NAME
   tokenizer = NginxLogTokenizer.new
   puts tokenizer.describe_line
 
-  str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500]
+  str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] ' +
+        '"GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" ' +
+        '"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; ' +
+        'Trident/5.0)" "-" "there could be" other "stuff here"'
 
-  puts
-  puts
+  puts 'input string: ' + str
+  puts 'Tokens: '
 
   # use a block to work with the extracted tokens
   tokenizer.tokenize!(str) do |tokens|
```
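The block receives the extracted fields as a hash keyed by field name. For reference, these are the values the spec suite (`spec/nginx_spec.rb`, shown further down in this diff) asserts for the sample line above; the snippet is an illustration of the expected extractions, not captured program output:

```ruby
tokenizer.tokenize!(str) do |tokens|
  tokens[:ip]           # => "73.80.217.212"
  tokens[:request]      # => "GET /this_is_a_url HTTP/1.1"
  tokens[:status_code]  # => 304
  tokens[:referrer_url] # => "http://referrer.com"
  tokens[:remainder]    # => "\"there could be\" other \"stuff here\""
end
```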
data/ext/string-eater/c-tokenizer.c
CHANGED

```diff
@@ -10,11 +10,12 @@ static VALUE rb_cCTokenizer;
 static VALUE rb_mStringEater;
 
 static VALUE tokenize_string(VALUE self,
-
-
-
-
-
+                             VALUE string,
+                             VALUE tokens_to_find_indexes,
+                             VALUE tokens_to_find_strings,
+                             VALUE tokens_to_extract_indexes,
+                             VALUE tokens_to_extract_names,
+                             VALUE non_strict)
 {
   const char* input_string = StringValueCStr(string);
   VALUE extracted_tokens = rb_hash_new();
@@ -115,6 +116,19 @@ static VALUE tokenize_string(VALUE self,
     }
   }
 
+  /*
+    got to the end of the string
+    and have an incomplete token
+    and not strict
+  */
+  if(ix == str_len && curr_token_ix < n_tokens && RTEST(non_strict))
+  {
+    rb_hash_aset(extracted_tokens,
+                 rb_ary_entry(tokens_to_extract_names, curr_token_ix - 1),
+                 rb_usascii_str_new(input_string + startpoint,
+                                    str_len - startpoint));
+  }
+
   curr_token_ix = n_tokens - 1;
 
   if(ix < str_len && curr_token_ix == next_token_to_extract_ix)
@@ -139,7 +153,7 @@ void Init_c_tokenizer_ext(void)
   rb_cCTokenizer = rb_define_class_under(rb_mStringEater,
       "CTokenizer", rb_cObject);
 
-  rb_define_method(rb_cCTokenizer, "ctokenize!", tokenize_string,
+  rb_define_method(rb_cCTokenizer, "ctokenize!", tokenize_string, 6);
 
   /* set the callback for when the extension is unloaded */
   rb_set_end_proc(finalize_c_tokenizer_ext, 0);
```
data/ext/string-eater/extconf.rb
CHANGED
data/lib/c-tokenizer.rb
CHANGED
```diff
@@ -1,17 +1,22 @@
+# encoding: utf-8
+
 require 'c_tokenizer_ext'
 
+# Ruby interface to the c extension
 class StringEater::CTokenizer
+  attr_reader :tokens
+
   def self.tokens
     @tokens ||= []
   end
 
-  def self.add_field
-
-    define_method(name) {@extracted_tokens[name]}
+  def self.add_field(name, opts = {})
+    tokens << StringEater::Token.new_field(name, opts)
+    define_method(name) { @extracted_tokens[name] }
   end
 
-  def self.look_for
-
+  def self.look_for(look_for_tokens)
+    tokens << StringEater::Token.new_separator(look_for_tokens)
   end
 
   # This is very slow, only do it when necessary
@@ -19,12 +24,20 @@ class StringEater::CTokenizer
     Marshal.load(Marshal.dump(tokens))
   end
 
+  def self.set_non_strict
+    @class_non_strict = true
+  end
+
+  def self.non_strict?
+    @class_non_strict == true
+  end
+
   def initialize
     refresh_tokens
   end
 
-  def
-    @
+  def set_non_strict
+    @non_strict = true
   end
 
   def extract_all_fields
@@ -41,7 +54,7 @@ class StringEater::CTokenizer
     refresh_tokens
   end
 
-  def extract_fields
+  def extract_fields(*fields)
     @token_filter = lambda do |t|
       t.opts[:extract] = fields.include?(t.name)
     end
@@ -52,76 +65,78 @@ class StringEater::CTokenizer
   def refresh_tokens
     @tokens = self.class.dup_tokens
 
-    if @token_filter
-      @tokens.each{|t| @token_filter.call(t)}
-    end
-
-    tokens_to_find = tokens.each_with_index.map do |t, i|
-      [i, t.string] if t.string
-    end.compact
+    @tokens.each { |t| @token_filter.call(t) } if @token_filter
 
-
-    @
+    tokens_to_find = gen_tokens_to_find
+    @tokens_to_find_indexes = tokens_to_find.map { |t| t[0] }
+    @tokens_to_find_strings = tokens_to_find.map { |t| t[1] }
 
-    tokens_to_extract =
-
-
-
-    @tokens_to_extract_indexes = tokens_to_extract.map{|t| t[0]}
-    @tokens_to_extract_names = tokens.map{|t| t.name}
+    tokens_to_extract = gen_tokens_to_extract
+    @tokens_to_extract_indexes = tokens_to_extract.map { |t| t[0] }
+    @tokens_to_extract_names = tokens.map { |t| t.name }
 
     @have_tokens_to_extract = (@tokens_to_extract_indexes.size > 0)
   end
 
   def describe_line
-    tokens.
-      desc << (t.string || t.name.to_s ||
+    tokens.reduce('') do |desc, t|
+      desc << (t.string || t.name.to_s || 'xxxxxx')
     end
   end
 
   def do_extra_parsing
   end
 
-
+  # Not sure this could be much more concise
+  # rubocop:disable MethodLength
+  def tokenize!(string, &block)
     @string = string
     @extracted_tokens ||= {}
     @extracted_tokens.clear
+    @non_strict ||= self.class.non_strict?
 
     return unless @have_tokens_to_extract
 
-    @extracted_tokens = ctokenize!(@string,
+    @extracted_tokens = ctokenize!(@string,
                                    @tokens_to_find_indexes,
                                    @tokens_to_find_strings,
                                    @tokens_to_extract_indexes,
-                                   @tokens_to_extract_names
+                                   @tokens_to_extract_names,
+                                   @non_strict)
 
     # extra parsing hook
     do_extra_parsing
 
-    if block_given?
-      yield @extracted_tokens
-    end
+    yield @extracted_tokens if block_given?
 
     # return self for chaining
     self
-  end
-
+  end
+  # rubocop:enable MethodLength
+
   private
 
-  def set_token_startpoint
+  def set_token_startpoint(ix, startpoint)
     @tokens[ix].breakpoints[0] = startpoint
   end
 
-  def get_token_startpoint
+  def get_token_startpoint(ix)
     @tokens[ix].breakpoints[0]
   end
 
-  def set_token_endpoint
+  def set_token_endpoint(ix, endpoint)
     @tokens[ix].breakpoints[1] = endpoint
   end
 
-  def extract_token?
+  def extract_token?(ix)
     @tokens[ix].extract?
   end
 
+  def gen_tokens_to_find
+    tokens.each_with_index.map { |t, i| [i, t.string] if t.string }.compact
+  end
+
+  def gen_tokens_to_extract
+    tokens.each_with_index.map { |t, i| [i, t.name] if t.extract? }.compact
+  end
 end
```
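With these additions, non-strict mode can be enabled either per instance or for every instance of a tokenizer class; `tokenize!` picks up the class-level default the first time it runs (`@non_strict ||= self.class.non_strict?`). A minimal sketch of both styles, using a made-up `WordsTokenizer` class for illustration (behaviour matches the spec examples later in this diff):

```ruby
# Hypothetical tokenizer, defined only to illustrate the two ways of
# enabling non-strict mode introduced in this release.
class WordsTokenizer < StringEater::CTokenizer
  add_field :first_word
  look_for ' '
  add_field :second_word
  look_for ' '
  add_field :third_word
end

# Per-instance opt-in:
tokenizer = WordsTokenizer.new
tokenizer.set_non_strict
tokenizer.tokenize!('a b').second_word # => "b" (nil under strict parsing)

# Or call set_non_strict in the class body to make every instance
# non-strict; tokenize! reads that flag via self.class.non_strict?.
```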
data/lib/string-eater.rb
CHANGED
```diff
@@ -1,10 +1,12 @@
+# encoding: utf-8
+
+# Namespacing module for StringEater
 module StringEater
   autoload :Token, 'token'
-  autoload :RubyTokenizer, 'ruby-tokenizer'
-  autoload :RubyTokenizerEachCHar, 'ruby-tokenizer-each-char'
   autoload :CTokenizer, 'c-tokenizer'
 
   autoload :VERSION, 'version'
 
+  # by default, Tokenizer is the c extension tokenizer
   class Tokenizer < CTokenizer; end
 end
```
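Since the pure-Ruby tokenizers are removed in this release, `StringEater::Tokenizer` and `StringEater::CTokenizer` now both give you the C-backed implementation. A small sketch (the class name is illustrative only):

```ruby
# Both base classes resolve to the C-backed tokenizer in 1.0.0.
class CsvishTokenizer < StringEater::Tokenizer # same as < StringEater::CTokenizer
  add_field :a
  look_for ','
  add_field :b
end

CsvishTokenizer.ancestors.include?(StringEater::CTokenizer) # => true
```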
data/lib/token.rb
CHANGED
```diff
@@ -1,9 +1,12 @@
+# encoding: utf-8
+
+# Token class used by tokenizers
 class StringEater::Token
   attr_accessor :name, :string, :opts, :breakpoints, :children
 
   def initialize
     @opts = {}
-    @breakpoints = [nil,nil]
+    @breakpoints = [nil, nil]
   end
 
   def extract?
@@ -13,7 +16,7 @@ class StringEater::Token
   def self.new_field(name, opts)
     t = new
     t.name = name
-    t.opts = {:
+    t.opts = { extract: true }.merge(opts)
     t
   end
 
@@ -22,5 +25,4 @@ class StringEater::Token
     t.string = string
     t
   end
-
 end
```
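The `new_field` change only modernizes the option-hash syntax; the behaviour stays the same: fields are extracted by default, and `extract: false` (or the older `:extract => false`) opts a field out. For example:

```ruby
StringEater::Token.new_field(:city, {}).extract?              # => true
StringEater::Token.new_field(:state, extract: false).extract? # => false
```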
data/lib/version.rb
CHANGED
```diff
@@ -1,8 +1,12 @@
+# encoding: utf-8
+
+# Extend StringEater with Version
 module StringEater
-
-
-
-
+  # Version constants
+  module VERSION
+    MAJOR = 1
+    MINOR = 0
+    PATCH = 0
     PRE = nil
     STRING = [MAJOR, MINOR, PATCH, PRE].compact.join('.')
   end
```
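The constants compose the user-facing version string; with a nil `PRE` the pre-release segment simply drops out:

```ruby
[1, 0, 0, nil].compact.join('.') # => "1.0.0"
StringEater::VERSION::STRING     # => "1.0.0"
```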
data/spec/nginx_spec.rb
CHANGED
```diff
@@ -1,32 +1,44 @@
+# encoding: utf-8
+
 require 'spec_helper'
 require 'string-eater'
 
-
+$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),
+                                              '..', 'examples')))
 
 require 'nginx'
 
 describe NginxLogTokenizer do
   before(:each) do
     @tokenizer = NginxLogTokenizer.new
-    @str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @str = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] ' +
+           '"GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" ' +
+           '"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; ' +
+           'Trident/5.0)" "-" "there could be" other "stuff here"'
+    @str2 = '73.80.217.212 - - [01/Aug/2012:09:14:25 -0500] ' +
+            '"GET /this_is_a_url HTTP/1.1" 304 152 "http://referrer.com" ' +
+            '"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; ' +
+            'WOW64; Trident/5.0)" "-"'
+  end
+
+  user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; ' +
+               'WOW64; Trident/5.0)'
+
+  {
+    ip: '73.80.217.212',
+    request: 'GET /this_is_a_url HTTP/1.1',
+    status_code: 304,
+    referrer_url: 'http://referrer.com',
+    user_agent: user_agent,
+    remainder: "\"there could be\" other \"stuff here\"",
+  }.each_pair do |token, val|
+    it "finds the right value for #{token}" do
+      @tokenizer.tokenize!(@str).send(token).should == val
+    end
+  end
+
+  it 'correctly handles there not being a remainder' do
+    @tokenizer.tokenize!(@str2).remainder.should be_nil
+  end
 
 end
```
data/spec/spec_helper.rb
CHANGED
data/spec/string_eater_spec.rb
CHANGED
```diff
@@ -1,193 +1,185 @@
+# encoding: utf-8
+
 require 'spec_helper'
 require 'string-eater'
 
-TestedClass = StringEater::CTokenizer
-
 describe StringEater do
-  it
-    StringEater::VERSION::STRING.split(
+  it 'has a version' do
+    StringEater::VERSION::STRING.split('.').size.should >= 3
   end
 end
 
 # normal use
-class Example1 <
+class Example1 < StringEater::CTokenizer
   add_field :first_word
-  look_for
-  add_field :second_word, :
-  look_for
+  look_for ' '
+  add_field :second_word, extract: false
+  look_for '|'
   add_field :third_word
 end
 
 describe Example1 do
-
-
-
-
-
-
-
-
+  let(:tokenizer) { Example1.new }
+  let(:first_word1) { 'foo' }
+  let(:second_word1) { 'bar' }
+  let(:third_word1) { 'baz' }
+  let(:str1) { "#{first_word1} #{second_word1}|#{third_word1}" }
+
+  describe '#extract_all_fields' do
+    it 'extracts all of the fields' do
+      tokenizer.extract_all_fields
+      tokenizer.tokenize!(str1)
+      expect(tokenizer.first_word).to eq(first_word1)
+      expect(tokenizer.second_word).to eq(second_word1)
+      expect(tokenizer.third_word).to eq(third_word1)
+    end
   end
 
-  describe
-    it
-
+  describe '#extract_no_fields' do
+    it 'does not extract any of the fields' do
+      tokenizer.extract_no_fields
+      tokenizer.tokenize!(str1)
+      tokenizer.first_word.should be_nil
+      tokenizer.second_word.should be_nil
+      tokenizer.third_word.should be_nil
     end
   end
 
-  describe
-    it
-
-
-
-
-
+  describe '#extract_fields' do
+    it 'allows us to set which fields get extracted' do
+      tokenizer.extract_fields :second_word
+      tokenizer.tokenize!(str1)
+      tokenizer.first_word.should be_nil
+      expect(tokenizer.second_word).to eq(second_word1)
+      tokenizer.third_word.should be_nil
     end
   end
 
-  describe
-    it
-
-      @tokenizer.tokenize!(@str1)
-      @tokenizer.first_word.should be_nil
-      @tokenizer.second_word.should be_nil
-      @tokenizer.third_word.should be_nil
+  describe 'tokenize!' do
+    it 'returns itself' do
+      tokenizer.tokenize!(str1).should == tokenizer
     end
-  end
 
-
-
-      @tokenizer.extract_fields :second_word
-      @tokenizer.tokenize!(@str1)
-      @tokenizer.first_word.should be_nil
-      @tokenizer.second_word.should == @second_word1
-      @tokenizer.third_word.should be_nil
+    it 'sets the first word' do
+      tokenizer.tokenize!(str1).first_word.should == 'foo'
     end
-  end
 
-
-
-      @tokenizer.tokenize!(@str1).should == @tokenizer
+    it 'sets the third word' do
+      tokenizer.tokenize!(str1).third_word.should == 'baz'
     end
 
-    it
-
+    it 'does not set the second word' do
+      tokenizer.tokenize!(str1).second_word.should be_nil
     end
 
-    it
-
+    it 'yields a hash of tokens if a block is given' do
+      tokenizer.tokenize!(str1) do |tokens|
+        tokens[:first_word].should == 'foo'
+      end
     end
 
-    it
-
+    it 'returns everything to the end of the line for the last token' do
+      s = 'c defg asdf | foo , baa'
+      tokenizer.tokenize!("a b|#{s}").third_word.should == s
     end
 
-
-
-
+    context 'when the last delimiter is missing' do
+      let(:s) { 'a b' }
+      it 'still finds the first word' do
+        expect(tokenizer.tokenize!(s).first_word).to eq('a')
      end
-    end
 
-
-
-
-    end
+      it 'returns nil for the second word' do
+        expect(tokenizer.tokenize!(s).second_word).to be_nil
+      end
 
-
-
-
-      @tokenizer.tokenize!(s).third_word.should be_nil
+      it 'returns nil for the third word' do
+        expect(tokenizer.tokenize!(s).third_word).to be_nil
+      end
     end
 
-
+    context 'when non_strict is enabled' do
+      before do
+        tokenizer.extract_all_fields
+        tokenizer.set_non_strict
+      end
+
+      context 'when the last delimiter is missing' do
+        let(:s) { 'a b' }
+        it 'still finds the first word' do
+          expect(tokenizer.tokenize!(s).first_word).to eq('a')
+        end
+
+        it 'still finds the second word' do
+          expect(tokenizer.tokenize!(s).second_word).to eq('b')
+        end
+
+        it 'returns nil for the third word' do
+          expect(tokenizer.tokenize!(s).third_word).to be_nil
+        end
+      end
+
+      context 'when the last delimiter is not missing' do
+        let(:s) { 'a b|c' }
+        it 'still finds the first word' do
+          expect(tokenizer.tokenize!(s).first_word).to eq('a')
+        end
+
+        it 'still finds the second word' do
+          expect(tokenizer.tokenize!(s).second_word).to eq('b')
+        end
 
+        it 'returns nil for the third word' do
+          expect(tokenizer.tokenize!(s).third_word).to eq('c')
+        end
+      end
+    end
+  end
 end
 
 # an example where we ignore after a certain point in the string
-class Example2 <
-  add_field :first_word, :
-  look_for
+class Example2 < StringEater::CTokenizer
+  add_field :first_word, extract: false
+  look_for ' '
   add_field :second_word
-  look_for
-  add_field :third_word, :
-  look_for
+  look_for ' '
+  add_field :third_word, extract: false
+  look_for '-'
 end
 
 describe Example2 do
+  let(:tokenizer) { Example2.new }
+  let(:second_word1) { 'bar' }
+  let(:str1) { "foo #{second_word1} baz-" }
 
-
-
-
-
-  end
-
-  describe "tokenize!" do
-    it "should find the token when there is extra stuff at the end of the string" do
-      @tokenizer.tokenize!(@str1).second_word.should == @second_word1
+  describe 'tokenize!' do
+    it 'finds the token when there is extra stuff at the' +
+       'end of the string' do
+      tokenizer.tokenize!(str1).second_word.should == second_word1
     end
   end
 
 end
 
 # an example where the split is more than one char
-class Example3 <
-  look_for
+class Example3 < StringEater::CTokenizer
+  look_for 'foo='
   add_field :foo_val
-  look_for
+  look_for '&'
 end
 
 describe Example3 do
-
-    @tokenizer = Example3.new
-  end
+  let(:tokenizer) { Example3.new }
 
-  describe
-    it
-
+  describe 'tokenize!' do
+    it 'finds the token if there is only one occurrence ' +
+       'of the characters in the separator' do
+      tokenizer.tokenize!('abcd?foo=val&blah').foo_val.should == 'val'
     end
 
-    it
-
+    it 'still works if part of the separator token occurs' do
+      tokenizer.tokenize!('abcd?foo_blah=baz&foo=bar&buh')
+        .foo_val.should == 'bar'
     end
   end
 end
-
-# CTokenizer doesn't do combine_fields because
-# writing out breakpoints is a significant slow-down
-if TestedClass.respond_to?(:combine_fields)
-  # an example where we combine fields
-  class Example3 < TestedClass
-    add_field :first_word, :extract => false
-    look_for " \""
-    add_field :part1, :extract => false
-    look_for " "
-    add_field :part2
-    look_for " "
-    add_field :part3, :extract => false
-    look_for "\""
-
-    combine_fields :from => :part1, :to => :part3, :as => :parts
-  end
-
-  describe Example3 do
-    before(:each) do
-      @tokenizer = Example3.new
-      @str1 = "foo \"bar baz bang\""
-      @part2 = "baz"
-      @parts = "bar baz bang"
-    end
-
-    it "should extract like normal" do
-      @tokenizer.tokenize!(@str1).part2.should == @part2
-    end
-
-    it "should ignore like normal" do
-      @tokenizer.tokenize!(@str1).part1.should be_nil
-    end
-
-    it "should extract the combined field" do
-      @tokenizer.tokenize!(@str1).parts.should == @parts
-    end
-
-  end
-end
```
metadata
CHANGED
```diff
@@ -1,15 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: string-eater
 version: !ruby/object:Gem::Version
-  version: 0.
-  prerelease:
+  version: 1.0.0
 platform: ruby
 authors:
 - Dan Swain
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2014-01-05 00:00:00.000000000 Z
 dependencies: []
 description: Fast string tokenizer. Nom strings.
 email:
@@ -20,8 +19,6 @@ extensions:
 extra_rdoc_files: []
 files:
 - lib/c-tokenizer.rb
-- lib/ruby-tokenizer-each-char.rb
-- lib/ruby-tokenizer.rb
 - lib/string-eater.rb
 - lib/token.rb
 - lib/version.rb
@@ -37,28 +34,27 @@ files:
 - README.md
 homepage: http://github.com/simplifi/string-eater
 licenses: []
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 - ext/string-eater
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version:
+rubygems_version: 2.0.6
 signing_key:
-specification_version:
+specification_version: 4
 summary: Fast string tokenizer. Nom strings.
 test_files:
 - spec/nginx_spec.rb
```
data/lib/ruby-tokenizer-each-char.rb
DELETED

```diff
@@ -1,145 +0,0 @@
-# this tokenizer is very slow, but it illustrates the
-# basic idea of the C tokenizer
-class StringEater::RubyTokenizerEachChar
-
-  def self.tokens
-    @tokens ||= []
-  end
-
-  def self.combined_tokens
-    @combined_tokens ||= []
-  end
-
-  def self.add_field name, opts={}
-    self.tokens << StringEater::Token::new_field(name, opts)
-    define_method(name) {@extracted_tokens[name]}
-  end
-
-  def self.look_for tokens
-    self.tokens << StringEater::Token::new_separator(tokens)
-  end
-
-  def self.combine_fields opts={}
-    from_token_index = self.tokens.index{|t| t.name == opts[:from]}
-    to_token_index = self.tokens.index{|t| t.name == opts[:to]}
-    self.combined_tokens << [opts[:as], from_token_index, to_token_index]
-    define_method(opts[:as]) {@extracted_tokens[opts[:as]]}
-  end
-
-  def tokens
-    @tokens ||= self.class.tokens
-  end
-
-  def combined_tokens
-    @combined_tokens ||= self.class.combined_tokens
-  end
-
-  def refresh_tokens
-    @combined_tokens = nil
-    @tokens = nil
-    tokens
-  end
-
-  def describe_line
-    tokens.inject("") do |desc, t|
-      desc << (t.string || t.name.to_s || "xxxxxx")
-    end
-  end
-
-  def find_breakpoints string
-    tokenize!(string) unless @string == string
-    tokens.inject([]) do |bp, t|
-      bp << t.breakpoints
-      bp
-    end.flatten.uniq
-  end
-
-  def tokenize! string, &block
-    @string = string
-    @extracted_tokens ||= {}
-    @extracted_tokens.clear
-    @tokens_to_find ||= tokens.each_with_index.map do |t, i|
-      [i, t.string] if t.string
-    end.compact
-    @tokens_to_extract_indeces ||= tokens.each_with_index.map do |t, i|
-      i if t.extract?
-    end.compact
-
-    tokens.first.breakpoints[0] = 0
-
-    find_index = 0
-
-    curr_token = @tokens_to_find[find_index]
-    curr_token_index = curr_token[0]
-    curr_token_length = curr_token[1].length
-    looking_for_index = 0
-    looking_for = curr_token[1][looking_for_index]
-
-    counter = 0
-    string.each_char do |c|
-      if c == looking_for
-        if looking_for_index == 0
-          # entering new token
-          if curr_token_index > 0
-            t = tokens[curr_token_index - 1]
-            t.breakpoints[1] = counter
-            if t.extract?
-              @extracted_tokens[t.name] = string[t.breakpoints[0]...t.breakpoints[1]]
-            end
-          end
-          tokens[curr_token_index].breakpoints[0] = counter
-        end
-        if looking_for_index >= (curr_token_length - 1)
-          # leaving token
-          tokens[curr_token_index].breakpoints[1] = counter
-
-          if curr_token_index >= tokens.size-1
-            # we're done!
-            break
-          else
-            tokens[curr_token_index + 1].breakpoints[0] = counter + 1
-          end
-
-          # next token
-          find_index += 1
-          if find_index >= @tokens_to_find.length
-            # we're done!
-            break
-          end
-          curr_token = @tokens_to_find[find_index]
-          curr_token_index = curr_token[0]
-          curr_token_length = curr_token[1].length
-          looking_for_index = 0
-        else
-          looking_for_index += 1
-        end
-      end
-      looking_for = curr_token[1][looking_for_index]
-      counter += 1
-    end
-
-    last_token = tokens.last
-    last_token.breakpoints[1] = string.length
-
-    if last_token.extract?
-      @extracted_tokens[last_token.name] = string[last_token.breakpoints[0]..last_token.breakpoints[1]]
-    end
-
-    combined_tokens.each do |combiner|
-      name = combiner[0]
-      from = @tokens[combiner[1]].breakpoints[0]
-      to = @tokens[combiner[2]].breakpoints[1]
-      @extracted_tokens[name] = string[from...to]
-    end
-
-    if block_given?
-      yield @extracted_tokens
-    end
-
-    # return self for chaining
-    self
-  end
-
-end
-
-
```
data/lib/ruby-tokenizer.rb
DELETED
```diff
@@ -1,98 +0,0 @@
-# this tokenizer is fairly fast, but not necessarily faster than regexps
-class StringEater::RubyTokenizer
-  def self.tokens
-    @tokens ||= []
-  end
-
-  def self.combined_tokens
-    @combined_tokens ||= []
-  end
-
-  def self.add_field name, opts={}
-    self.tokens << StringEater::Token::new_field(name, opts)
-    define_method(name) {@extracted_tokens[name]}
-  end
-
-  def self.look_for tokens
-    self.tokens << StringEater::Token::new_separator(tokens)
-  end
-
-  def self.combine_fields opts={}
-    from_token_index = self.tokens.index{|t| t.name == opts[:from]}
-    to_token_index = self.tokens.index{|t| t.name == opts[:to]}
-    self.combined_tokens << [opts[:as], from_token_index, to_token_index]
-    define_method(opts[:as]) {@extracted_tokens[opts[:as]]}
-  end
-
-  def tokens
-    @tokens ||= self.class.tokens
-  end
-
-  def combined_tokens
-    @combined_tokens ||= self.class.combined_tokens
-  end
-
-  def refresh_tokens
-    @combined_tokens = nil
-    @tokens = nil
-    tokens
-  end
-
-  def describe_line
-    tokens.inject("") do |desc, t|
-      desc << (t.string || t.name.to_s || "xxxxxx")
-    end
-  end
-
-  def find_breakpoints(string)
-    @literal_tokens ||= tokens.select{|t| t.string}
-    @breakpoints ||= Array.new(2*@literal_tokens.size + 2)
-    @breakpoints[0] = 0
-    @breakpoints[-1] = string.length
-    start_point = 0
-    @literal_tokens.each_with_index do |t, i|
-      @breakpoints[2*i+1], start_point = find_end_of(t, string, start_point)
-      @breakpoints[2*i+2] = start_point
-    end
-    @breakpoints
-  end
-
-  def tokenize! string, &block
-    @extracted_tokens ||= {}
-    @extracted_tokens.clear
-    @tokens_to_extract ||= tokens.select{|t| t.extract?}
-
-    find_breakpoints(string)
-    last_important_bp = [@breakpoints.length, tokens.size].min
-    (0...last_important_bp).each do |i|
-      tokens[i].breakpoints = [@breakpoints[i], @breakpoints[i+1]]
-    end
-
-    @tokens_to_extract.each do |t|
-      @extracted_tokens[t.name] = string[t.breakpoints[0]...t.breakpoints[1]]
-    end
-
-    combined_tokens.each do |combiner|
-      name = combiner[0]
-      from = @tokens[combiner[1]].breakpoints[0]
-      to = @tokens[combiner[2]].breakpoints[1]
-      @extracted_tokens[name] = string[from...to]
-    end
-
-    if block_given?
-      yield @extracted_tokens
-    end
-
-    # return self for chaining
-    self
-  end
-
-  protected
-
-  def find_end_of token, string, start_at
-    start = string.index(token.string, start_at+1) || string.length
-    [start, [start + token.string.length, string.length].min]
-  end
-
-end
-
```