string-eater 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +5 -1
- data/lib/c-tokenizer.rb +36 -2
- data/lib/version.rb +1 -1
- data/spec/string_eater_spec.rb +31 -0
- metadata +3 -2
data/README.md
CHANGED
@@ -18,7 +18,11 @@ implementations that provide support for C extensions.
|
|
18
18
|
|
19
19
|
## Installation
|
20
20
|
|
21
|
-
|
21
|
+
If your system is set up to allow it, you can just do
|
22
|
+
|
23
|
+
gem install string-eater
|
24
|
+
|
25
|
+
Or, if you prefer a more hands-on approach or want to hack at the source:
|
22
26
|
|
23
27
|
git clone git://github.com/dantswain/string-eater.git
|
24
28
|
cd string-eater
|
data/lib/c-tokenizer.rb
CHANGED
@@ -14,6 +14,11 @@ class StringEater::CTokenizer
|
|
14
14
|
self.tokens << StringEater::Token::new_separator(tokens)
|
15
15
|
end
|
16
16
|
|
17
|
+
# This is very slow, only do it when necessary
|
18
|
+
def self.dup_tokens
|
19
|
+
Marshal.load(Marshal.dump(tokens))
|
20
|
+
end
|
21
|
+
|
17
22
|
def initialize
|
18
23
|
refresh_tokens
|
19
24
|
end
|
@@ -22,8 +27,35 @@ class StringEater::CTokenizer
|
|
22
27
|
@tokens
|
23
28
|
end
|
24
29
|
|
30
|
+
def extract_all_fields
|
31
|
+
@token_filter = lambda do |t|
|
32
|
+
t.opts[:extract] = true if t.name
|
33
|
+
end
|
34
|
+
refresh_tokens
|
35
|
+
end
|
36
|
+
|
37
|
+
def extract_no_fields
|
38
|
+
@token_filter = lambda do |t|
|
39
|
+
t.opts[:extract] = false if t.name
|
40
|
+
end
|
41
|
+
refresh_tokens
|
42
|
+
end
|
43
|
+
|
44
|
+
def extract_fields *fields
|
45
|
+
@token_filter = lambda do |t|
|
46
|
+
t.opts[:extract] = fields.include?(t.name)
|
47
|
+
end
|
48
|
+
refresh_tokens
|
49
|
+
end
|
50
|
+
|
51
|
+
# This is very slow, only do it once before processing
|
25
52
|
def refresh_tokens
|
26
|
-
@tokens = self.class.
|
53
|
+
@tokens = self.class.dup_tokens
|
54
|
+
|
55
|
+
if @token_filter
|
56
|
+
@tokens.each{|t| @token_filter.call(t)}
|
57
|
+
end
|
58
|
+
|
27
59
|
tokens_to_find = tokens.each_with_index.map do |t, i|
|
28
60
|
[i, t.string] if t.string
|
29
61
|
end.compact
|
@@ -37,6 +69,8 @@ class StringEater::CTokenizer
|
|
37
69
|
|
38
70
|
@tokens_to_extract_indexes = tokens_to_extract.map{|t| t[0]}
|
39
71
|
@tokens_to_extract_names = tokens.map{|t| t.name}
|
72
|
+
|
73
|
+
@have_tokens_to_extract = (@tokens_to_extract_indexes.size > 0)
|
40
74
|
end
|
41
75
|
|
42
76
|
def describe_line
|
@@ -53,7 +87,7 @@ class StringEater::CTokenizer
|
|
53
87
|
@extracted_tokens ||= {}
|
54
88
|
@extracted_tokens.clear
|
55
89
|
|
56
|
-
|
90
|
+
return unless @have_tokens_to_extract
|
57
91
|
|
58
92
|
@extracted_tokens = ctokenize!(@string,
|
59
93
|
@tokens_to_find_indexes,
|
data/lib/version.rb
CHANGED
data/spec/string_eater_spec.rb
CHANGED
@@ -24,6 +24,7 @@ describe Example1 do
|
|
24
24
|
@tokenizer = Example1.new
|
25
25
|
@str1 = "foo bar|baz"
|
26
26
|
@first_word1 = "foo"
|
27
|
+
@second_word1 = "bar"
|
27
28
|
@third_word1 = "baz"
|
28
29
|
@bp1 = [0, 3,4,7,8,11]
|
29
30
|
end
|
@@ -34,6 +35,36 @@ describe Example1 do
|
|
34
35
|
end
|
35
36
|
end
|
36
37
|
|
38
|
+
describe "#extract_all_fields" do
|
39
|
+
it "should extract all of the fields" do
|
40
|
+
@tokenizer.extract_all_fields
|
41
|
+
@tokenizer.tokenize!(@str1)
|
42
|
+
@tokenizer.first_word.should == @first_word1
|
43
|
+
@tokenizer.second_word.should == @second_word1
|
44
|
+
@tokenizer.third_word.should == @third_word1
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
describe "#extract_no_fields" do
|
49
|
+
it "should not extract any of the fields" do
|
50
|
+
@tokenizer.extract_no_fields
|
51
|
+
@tokenizer.tokenize!(@str1)
|
52
|
+
@tokenizer.first_word.should be_nil
|
53
|
+
@tokenizer.second_word.should be_nil
|
54
|
+
@tokenizer.third_word.should be_nil
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
describe "#extract_fields" do
|
59
|
+
it "should allow us to set which fields get extracted" do
|
60
|
+
@tokenizer.extract_fields :second_word
|
61
|
+
@tokenizer.tokenize!(@str1)
|
62
|
+
@tokenizer.first_word.should be_nil
|
63
|
+
@tokenizer.second_word.should == @second_word1
|
64
|
+
@tokenizer.third_word.should be_nil
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
37
68
|
describe "tokenize!" do
|
38
69
|
it "should return itself" do
|
39
70
|
@tokenizer.tokenize!(@str1).should == @tokenizer
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: string-eater
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-08-
|
12
|
+
date: 2012-08-21 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: Fast string tokenizer. Nom strings.
|
15
15
|
email:
|
@@ -64,3 +64,4 @@ test_files:
|
|
64
64
|
- spec/nginx_spec.rb
|
65
65
|
- spec/spec_helper.rb
|
66
66
|
- spec/string_eater_spec.rb
|
67
|
+
has_rdoc:
|