picky 1.4.0 → 1.4.1
- data/spec/lib/tokenizers/base_spec.rb +48 -50
- data/spec/lib/tokenizers/index_spec.rb +5 -7
- data/spec/lib/tokenizers/query_spec.rb +18 -20
- metadata +1 -1
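The change is the same across all three spec files: per-example setup through a `before(:each)` block that assigns an instance variable is replaced by RSpec's lazily evaluated `let`, and every `@tokenizer` reference becomes `tokenizer`. A minimal sketch of the pattern as it appears in the diff below (the surrounding example is illustrative only, not part of the release):

    # Before: eager setup assigning an instance variable for every example.
    before(:each) do
      @tokenizer = Tokenizers::Base.new
    end

    # After: a memoized helper, built lazily the first time an example calls it.
    let(:tokenizer) { Tokenizers::Base.new }

    it 'rejects empty tokens by default' do
      tokenizer.reject(['a', nil, '', 'b']).should == ['a', 'b']
    end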
data/spec/lib/tokenizers/base_spec.rb
@@ -5,68 +5,65 @@ require 'spec_helper'
 describe Tokenizers::Base do

   context 'with special instance' do
-    before(:each) do
-      @tokenizer = Tokenizers::Base.new reject_token_if: lambda { |token| token.to_s.length < 2 || token == :hello }
-    end
+    let (:tokenizer) { Tokenizers::Base.new reject_token_if: lambda { |token| token.to_s.length < 2 || token == :hello } }
     it 'rejects tokens with length < 2' do
-      @tokenizer.reject([:'', :a, :ab, :abc]).should == [:ab, :abc]
+      tokenizer.reject([:'', :a, :ab, :abc]).should == [:ab, :abc]
     end
     it 'rejects tokens that are called :hello' do
-      @tokenizer.reject([:hel, :hell, :hello]).should == [:hel, :hell]
+      tokenizer.reject([:hel, :hell, :hello]).should == [:hel, :hell]
     end
   end

   context 'with normal instance' do
-    before(:each) do
-      @tokenizer = Tokenizers::Base.new
-    end
+    let(:tokenizer) { Tokenizers::Base.new }

     describe 'reject_token_if' do
       it 'rejects empty tokens by default' do
-        @tokenizer.reject(['a', nil, '', 'b']).should == ['a', 'b']
+        tokenizer.reject(['a', nil, '', 'b']).should == ['a', 'b']
       end
       it 'rejects tokens based on the given rejection criteria if set' do
-        @tokenizer.reject_token_if &:nil?
+        tokenizer.reject_token_if &:nil?

-        @tokenizer.reject(['a', nil, '', 'b']).should == ['a', '', 'b']
+        tokenizer.reject(['a', nil, '', 'b']).should == ['a', '', 'b']
       end
     end

     describe "substitute(s)_characters*" do
       it "doesn't substitute if there is no substituter" do
-        @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzäöü'
+        tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzäöü'
       end
       it "uses the substituter to replace characters" do
-        @tokenizer.substitutes_characters_with CharacterSubstituters::WestEuropean.new
+        tokenizer.substitutes_characters_with CharacterSubstituters::WestEuropean.new

-        @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
+        tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
       end
       it "uses the european substituter as default" do
-        @tokenizer.substitutes_characters_with
+        tokenizer.substitutes_characters_with

-        @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
+        tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
       end
     end

     describe "removes_characters_after_splitting" do
       context "without removes_characters_after_splitting called" do
         it "has remove_after_normalizing_illegals" do
-          expect { @tokenizer.remove_after_normalizing_illegals('any') }.to_not raise_error
+          expect { tokenizer.remove_after_normalizing_illegals('any') }.to_not raise_error
         end
         it 'should define a remove_after_normalizing_illegals normalize_with_patterns does nothing' do
           unchanging = stub :unchanging
-          @tokenizer.remove_after_normalizing_illegals unchanging
+
+          tokenizer.remove_after_normalizing_illegals unchanging
         end
       end
       context "with removes_characters_after_splitting called" do
         before(:each) do
-          @tokenizer.removes_characters_after_splitting(/[afo]/)
+          tokenizer.removes_characters_after_splitting(/[afo]/)
         end
         it "has remove_after_normalizing_illegals" do
-          expect { @tokenizer.remove_after_normalizing_illegals('abcdefghijklmnop') }.to_not raise_error
+          expect { tokenizer.remove_after_normalizing_illegals('abcdefghijklmnop') }.to_not raise_error
         end
         it "removes illegal characters" do
-          @tokenizer.remove_after_normalizing_illegals('abcdefghijklmnop').should == 'bcdeghijklmnp'
+          tokenizer.remove_after_normalizing_illegals('abcdefghijklmnop').should == 'bcdeghijklmnp'
         end
       end
     end
@@ -74,25 +71,26 @@ describe Tokenizers::Base do
     describe "normalizes_words" do
       context "without normalizes_words called" do
         it "has normalize_with_patterns" do
-          expect { @tokenizer.normalize_with_patterns('any') }.to_not raise_error
+          expect { tokenizer.normalize_with_patterns('any') }.to_not raise_error
         end
         it 'should define a method normalize_with_patterns does nothing' do
           unchanging = stub :unchanging
-          @tokenizer.normalize_with_patterns(unchanging).should == unchanging
+
+          tokenizer.normalize_with_patterns(unchanging).should == unchanging
         end
       end
       context "with normalizes_words called" do
         before(:each) do
-          @tokenizer.normalizes_words([
+          tokenizer.normalizes_words([
             [/st\./, 'sankt'],
             [/stras?s?e?/, 'str']
           ])
         end
         it "has normalize_with_patterns" do
-          expect { @tokenizer.normalize_with_patterns('a b/c.d') }.to_not raise_error
+          expect { tokenizer.normalize_with_patterns('a b/c.d') }.to_not raise_error
         end
         it "normalizes, but just the first one" do
-          @tokenizer.normalize_with_patterns('st. wegstrasse').should == 'sankt wegstrasse'
+          tokenizer.normalize_with_patterns('st. wegstrasse').should == 'sankt wegstrasse'
         end
       end
     end
@@ -100,24 +98,24 @@ describe Tokenizers::Base do
     describe "splits_text_on" do
       context "without splits_text_on called" do
         it "has split" do
-          lambda { @tokenizer.split('any') }.should_not raise_error
+          lambda { tokenizer.split('any') }.should_not raise_error
         end
         it 'should define a method split that splits by default on \s' do
-          @tokenizer.split('a b/c.d').should == ['a', 'b/c.d']
+          tokenizer.split('a b/c.d').should == ['a', 'b/c.d']
         end
         it 'splits text on /\s/ by default' do
-          @tokenizer.split('this is a test').should == ['this', 'is', 'a', 'test']
+          tokenizer.split('this is a test').should == ['this', 'is', 'a', 'test']
         end
       end
       context "with removes_characters called" do
         before(:each) do
-          @tokenizer.splits_text_on(/[\s\.\/]/)
+          tokenizer.splits_text_on(/[\s\.\/]/)
         end
         it "has split" do
-          expect { @tokenizer.split('a b/c.d') }.to_not raise_error
+          expect { tokenizer.split('a b/c.d') }.to_not raise_error
         end
         it "removes illegal characters" do
-          @tokenizer.split('a b/c.d').should == ['a','b','c','d']
+          tokenizer.split('a b/c.d').should == ['a','b','c','d']
         end
       end
     end
@@ -125,22 +123,23 @@ describe Tokenizers::Base do
     describe "removes_characters" do
       context "without removes_characters called" do
         it "has remove_illegals" do
-          expect { @tokenizer.remove_illegals('any') }.to_not raise_error
+          expect { tokenizer.remove_illegals('any') }.to_not raise_error
         end
         it 'should define a method remove_illegals that does nothing' do
           unchanging = stub :unchanging
-          @tokenizer.remove_illegals unchanging
+
+          tokenizer.remove_illegals unchanging
         end
       end
       context "with removes_characters called" do
         before(:each) do
-          @tokenizer.removes_characters(/[afo]/)
+          tokenizer.removes_characters(/[afo]/)
         end
         it "has remove_illegals" do
-          expect { @tokenizer.remove_illegals('abcdefghijklmnop') }.to_not raise_error
+          expect { tokenizer.remove_illegals('abcdefghijklmnop') }.to_not raise_error
         end
         it "removes illegal characters" do
-          @tokenizer.remove_illegals('abcdefghijklmnop').should == 'bcdeghijklmnp'
+          tokenizer.remove_illegals('abcdefghijklmnop').should == 'bcdeghijklmnp'
         end
       end
     end
@@ -148,45 +147,44 @@ describe Tokenizers::Base do
     describe 'stopwords' do
       context 'without stopwords given' do
        it 'should define a method remove_stopwords' do
-          lambda { @tokenizer.remove_stopwords('from this text') }.should_not raise_error
+          lambda { tokenizer.remove_stopwords('from this text') }.should_not raise_error
        end
        it 'should define a method remove_stopwords that does nothing' do
-          @tokenizer.remove_stopwords('from this text').should == 'from this text'
+          tokenizer.remove_stopwords('from this text').should == 'from this text'
        end
        it 'should define a method remove_non_single_stopwords' do
-          lambda { @tokenizer.remove_non_single_stopwords('from this text') }.
-            should_not raise_error
+          expect { tokenizer.remove_non_single_stopwords('from this text') }.to_not raise_error
        end
      end
      context 'with stopwords given' do
        before(:each) do
-          @tokenizer.stopwords(/r|e/)
+          tokenizer.stopwords(/r|e/)
        end
        it 'should define a method remove_stopwords' do
-          lambda { @tokenizer.remove_stopwords('from this text') }.should_not raise_error
+          lambda { tokenizer.remove_stopwords('from this text') }.should_not raise_error
        end
        it 'should define a method stopwords that removes stopwords' do
-          @tokenizer.remove_stopwords('from this text').should == 'fom this txt'
+          tokenizer.remove_stopwords('from this text').should == 'fom this txt'
        end
        it 'should define a method remove_non_single_stopwords' do
-          expect { @tokenizer.remove_non_single_stopwords('from this text') }.to_not raise_error
+          expect { tokenizer.remove_non_single_stopwords('from this text') }.to_not raise_error
        end
        it 'should define a method remove_non_single_stopwords that removes non-single stopwords' do
-          @tokenizer.remove_non_single_stopwords('rerere rerere').should == ' '
+          tokenizer.remove_non_single_stopwords('rerere rerere').should == ' '
        end
        it 'should define a method remove_non_single_stopwords that does not single stopwords' do
-          @tokenizer.remove_non_single_stopwords('rerere').should == 'rerere'
+          tokenizer.remove_non_single_stopwords('rerere').should == 'rerere'
        end
      end
      context 'error case' do
        before(:each) do
-          @tokenizer.stopwords(/any/)
+          tokenizer.stopwords(/any/)
        end
        it 'should not remove non-single stopwords with a star' do
-          @tokenizer.remove_non_single_stopwords('a*').should == 'a*'
+          tokenizer.remove_non_single_stopwords('a*').should == 'a*'
        end
        it 'should not remove non-single stopwords with a tilde' do
-          @tokenizer.remove_non_single_stopwords('a~').should == 'a~'
+          tokenizer.remove_non_single_stopwords('a~').should == 'a~'
        end
      end
    end
data/spec/lib/tokenizers/index_spec.rb
@@ -4,9 +4,7 @@ require 'spec_helper'

 describe Tokenizers::Index do

-  before(:each) do
-    @tokenizer = Tokenizers::Index.new
-  end
+  let(:tokenizer) { Tokenizers::Index.new }

   describe "default*" do
     before(:all) do
@@ -33,13 +31,13 @@ describe Tokenizers::Index do

   describe "remove_removes_characters" do
     it "should not remove ' from a query by default" do
-      @tokenizer.remove_illegals("Lugi's").should == "Lugi's"
+      tokenizer.remove_illegals("Lugi's").should == "Lugi's"
     end
   end

   describe "reject!" do
     it "should reject tokens if blank" do
-      @tokenizer.reject(['', 'not blank', '']).should == ['not blank']
+      tokenizer.reject(['', 'not blank', '']).should == ['not blank']
     end
   end

@@ -47,7 +45,7 @@ describe Tokenizers::Index do
   describe "normalizing" do
     def self.it_should_normalize_token(text, expected)
       it "should handle the #{text} case" do
-        @tokenizer.tokenize(text).to_a.should == [expected].compact
+        tokenizer.tokenize(text).to_a.should == [expected].compact
       end
     end
     # defaults
@@ -57,7 +55,7 @@ describe Tokenizers::Index do
   describe "tokenizing" do
     def self.it_should_tokenize_token(text, expected)
       it "should handle the #{text} case" do
-        @tokenizer.tokenize(text).to_a.should == expected
+        tokenizer.tokenize(text).to_a.should == expected
       end
     end
     # defaults
data/spec/lib/tokenizers/query_spec.rb
@@ -3,9 +3,7 @@ require 'spec_helper'

 describe Tokenizers::Query do

-  before(:each) do
-    @tokenizer = Tokenizers::Query.new
-  end
+  let(:tokenizer) { Tokenizers::Query.new }

   describe "default*" do
     before(:all) do
@@ -32,7 +30,7 @@ describe Tokenizers::Query do

   describe "maximum_tokens" do
     it "should be set to 5 by default" do
-      @tokenizer.maximum_tokens.should == 5
+      tokenizer.maximum_tokens.should == 5
     end
     it "should be settable" do
       Tokenizers::Query.new(maximum_tokens: 3).maximum_tokens.should == 3
@@ -43,15 +41,15 @@ describe Tokenizers::Query do
     it 'should call methods in order' do
       text = stub :text

-      @tokenizer.should_receive(:remove_illegals).once.ordered.with text
-      @tokenizer.should_receive(:remove_non_single_stopwords).once.ordered.with text
+      tokenizer.should_receive(:remove_illegals).once.ordered.with text
+      tokenizer.should_receive(:remove_non_single_stopwords).once.ordered.with text

-      @tokenizer.preprocess text
+      tokenizer.preprocess text
     end
     it 'should return the text unchanged by default' do
       text = "some text"

-      @tokenizer.preprocess(text).should == text
+      tokenizer.preprocess(text).should == text
     end
   end

@@ -60,9 +58,9 @@ describe Tokenizers::Query do
       @tokens = mock :tokens, :null_object => true
     end
     it 'should tokenize the tokens' do
-      @tokens.should_receive(:tokenize_with).once.with @tokenizer
+      @tokens.should_receive(:tokenize_with).once.with tokenizer

-      @tokenizer.process @tokens
+      tokenizer.process @tokens
     end
     it 'should call methods on the tokens in order' do
       @tokens.should_receive(:tokenize_with).once.ordered
@@ -70,17 +68,17 @@ describe Tokenizers::Query do
       @tokens.should_receive(:cap).once.ordered
       @tokens.should_receive(:partialize_last).once.ordered

-      @tokenizer.process @tokens
+      tokenizer.process @tokens
     end
     it 'should return the tokens' do
-      @tokenizer.process(@tokens).should == @tokens
+      tokenizer.process(@tokens).should == @tokens
     end
   end

   describe 'pretokenize' do
     def self.it_should_pretokenize text, expected
       it "should pretokenize #{text} as #{expected}" do
-        @tokenizer.pretokenize(text).should == expected
+        tokenizer.pretokenize(text).should == expected
       end
     end
     it_should_pretokenize 'test miau test', ['test', 'miau', 'test']
@@ -89,7 +87,7 @@ describe Tokenizers::Query do
   describe "tokenizing" do
     def self.it_should_tokenize_token(text, expected)
       it "should handle the #{text} case" do
-        @tokenizer.tokenize(text).map(&:text).should == expected
+        tokenizer.tokenize(text).map(&:text).should == expected
       end
     end
     it_should_tokenize_token 'simple tokenizing on \s', [:simple, :tokenizing, :on, :'\s']
@@ -98,7 +96,7 @@ describe Tokenizers::Query do
   describe 'normalize_with_patterns' do
     def self.it_should_pattern_normalize original, expected
       it "should normalize #{original} with pattern into #{expected}" do
-        @tokenizer.normalize_with_patterns(original).should == expected
+        tokenizer.normalize_with_patterns(original).should == expected
       end
     end
     it_should_pattern_normalize 'no pattern normalization', 'no pattern normalization'
@@ -106,22 +104,22 @@ describe Tokenizers::Query do

   describe 'reject' do
     it 'should reject blank tokens' do
-      @tokenizer.reject(["some token answering to blank?", nil, nil]).should == ["some token answering to blank?"]
+      tokenizer.reject(["some token answering to blank?", nil, nil]).should == ["some token answering to blank?"]
     end
   end

   describe "last token" do
     it "should be partial" do
-      @tokenizer.tokenize("First Second Third Last").last.instance_variable_get(:@partial).should be_true
+      tokenizer.tokenize("First Second Third Last").last.instance_variable_get(:@partial).should be_true
     end
   end

   describe ".tokenize" do
     it "should return an Array of tokens" do
-      @tokenizer.tokenize('test test').to_a.should be_instance_of(Array)
+      tokenizer.tokenize('test test').to_a.should be_instance_of(Array)
     end
     it "should return an empty tokenized query if the query string is blank or empty" do
-      @tokenizer.tokenize('').map(&:to_s).should == []
+      tokenizer.tokenize('').map(&:to_s).should == []
     end
   end
   describe "token_for" do
@@ -129,7 +127,7 @@ describe Tokenizers::Query do
       text = stub(:text)
       Query::Token.should_receive(:processed).with text

-      @tokenizer.token_for text
+      tokenizer.token_for text
     end
   end
