picky 1.4.0 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/spec/lib/tokenizers/base_spec.rb +48 -50
- data/spec/lib/tokenizers/index_spec.rb +5 -7
- data/spec/lib/tokenizers/query_spec.rb +18 -20
- metadata +1 -1
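
Every change in this release is a test-only refactoring: the three tokenizer spec files replace `before(:each)` blocks that assigned a `@tokenizer` instance variable with RSpec `let` definitions, and base_spec.rb additionally migrates several `lambda { ... }.should_not raise_error` assertions to the `expect { ... }.to_not raise_error` block form. As a condensed before/after sketch of the pattern (the Tokenizers::Base class and the old-style `should` syntax come from the picky spec suite, so this assumes that environment rather than standing alone):

    # 1.4.0 style: eager setup, shared through an instance variable.
    describe Tokenizers::Base do
      before(:each) do
        @tokenizer = Tokenizers::Base.new
      end

      it 'rejects empty tokens by default' do
        @tokenizer.reject(['a', nil, '', 'b']).should == ['a', 'b']
      end
    end

    # 1.4.1 style: let builds the tokenizer lazily and memoizes it per example.
    describe Tokenizers::Base do
      let(:tokenizer) { Tokenizers::Base.new }

      it 'rejects empty tokens by default' do
        tokenizer.reject(['a', nil, '', 'b']).should == ['a', 'b']
      end
    end

Behavior is unchanged for these specs; `let` simply defers construction to the first reference inside each example and caches the result for that example.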
data/spec/lib/tokenizers/base_spec.rb

@@ -5,68 +5,65 @@ require 'spec_helper'
 describe Tokenizers::Base do
 
   context 'with special instance' do
-    before(:each) do
-      @tokenizer = Tokenizers::Base.new reject_token_if: lambda { |token| token.to_s.length < 2 || token == :hello }
-    end
+    let (:tokenizer) { Tokenizers::Base.new reject_token_if: lambda { |token| token.to_s.length < 2 || token == :hello } }
     it 'rejects tokens with length < 2' do
-      @tokenizer.reject([:'', :a, :ab, :abc]).should == [:ab, :abc]
+      tokenizer.reject([:'', :a, :ab, :abc]).should == [:ab, :abc]
     end
     it 'rejects tokens that are called :hello' do
-      @tokenizer.reject([:hel, :hell, :hello]).should == [:hel, :hell]
+      tokenizer.reject([:hel, :hell, :hello]).should == [:hel, :hell]
     end
   end
 
   context 'with normal instance' do
-    before(:each) do
-      @tokenizer = Tokenizers::Base.new
-    end
+    let(:tokenizer) { Tokenizers::Base.new }
 
     describe 'reject_token_if' do
       it 'rejects empty tokens by default' do
-        @tokenizer.reject(['a', nil, '', 'b']).should == ['a', 'b']
+        tokenizer.reject(['a', nil, '', 'b']).should == ['a', 'b']
       end
       it 'rejects tokens based on the given rejection criteria if set' do
-        @tokenizer.reject_token_if &:nil?
+        tokenizer.reject_token_if &:nil?
 
-        @tokenizer.reject(['a', nil, '', 'b']).should == ['a', '', 'b']
+        tokenizer.reject(['a', nil, '', 'b']).should == ['a', '', 'b']
      end
    end
 
    describe "substitute(s)_characters*" do
      it "doesn't substitute if there is no substituter" do
-        @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzäöü'
+        tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzäöü'
      end
      it "uses the substituter to replace characters" do
-        @tokenizer.substitutes_characters_with CharacterSubstituters::WestEuropean.new
+        tokenizer.substitutes_characters_with CharacterSubstituters::WestEuropean.new
 
-        @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
+        tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
      end
      it "uses the european substituter as default" do
-        @tokenizer.substitutes_characters_with
+        tokenizer.substitutes_characters_with
 
-        @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
+        tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
      end
    end
 
    describe "removes_characters_after_splitting" do
      context "without removes_characters_after_splitting called" do
        it "has remove_after_normalizing_illegals" do
-          lambda { @tokenizer.remove_after_normalizing_illegals('any') }.should_not raise_error
+          expect { tokenizer.remove_after_normalizing_illegals('any') }.to_not raise_error
        end
        it 'should define a remove_after_normalizing_illegals normalize_with_patterns does nothing' do
          unchanging = stub :unchanging
-          @tokenizer.remove_after_normalizing_illegals unchanging
+
+          tokenizer.remove_after_normalizing_illegals unchanging
        end
      end
      context "with removes_characters_after_splitting called" do
        before(:each) do
-          @tokenizer.removes_characters_after_splitting(/[afo]/)
+          tokenizer.removes_characters_after_splitting(/[afo]/)
        end
        it "has remove_after_normalizing_illegals" do
-          lambda { @tokenizer.remove_after_normalizing_illegals('abcdefghijklmnop') }.should_not raise_error
+          expect { tokenizer.remove_after_normalizing_illegals('abcdefghijklmnop') }.to_not raise_error
        end
        it "removes illegal characters" do
-          @tokenizer.remove_after_normalizing_illegals('abcdefghijklmnop').should == 'bcdeghijklmnp'
+          tokenizer.remove_after_normalizing_illegals('abcdefghijklmnop').should == 'bcdeghijklmnp'
        end
      end
    end
@@ -74,25 +71,26 @@ describe Tokenizers::Base do
    describe "normalizes_words" do
      context "without normalizes_words called" do
        it "has normalize_with_patterns" do
-          lambda { @tokenizer.normalize_with_patterns('any') }.should_not raise_error
+          expect { tokenizer.normalize_with_patterns('any') }.to_not raise_error
        end
        it 'should define a method normalize_with_patterns does nothing' do
          unchanging = stub :unchanging
-          @tokenizer.normalize_with_patterns(unchanging).should == unchanging
+
+          tokenizer.normalize_with_patterns(unchanging).should == unchanging
        end
      end
      context "with normalizes_words called" do
        before(:each) do
-          @tokenizer.normalizes_words([
+          tokenizer.normalizes_words([
            [/st\./, 'sankt'],
            [/stras?s?e?/, 'str']
          ])
        end
        it "has normalize_with_patterns" do
-          lambda { @tokenizer.normalize_with_patterns('a b/c.d') }.should_not raise_error
+          expect { tokenizer.normalize_with_patterns('a b/c.d') }.to_not raise_error
        end
        it "normalizes, but just the first one" do
-          @tokenizer.normalize_with_patterns('st. wegstrasse').should == 'sankt wegstrasse'
+          tokenizer.normalize_with_patterns('st. wegstrasse').should == 'sankt wegstrasse'
        end
      end
    end
@@ -100,24 +98,24 @@ describe Tokenizers::Base do
    describe "splits_text_on" do
      context "without splits_text_on called" do
        it "has split" do
-          lambda { @tokenizer.split('any') }.should_not raise_error
+          lambda { tokenizer.split('any') }.should_not raise_error
        end
        it 'should define a method split that splits by default on \s' do
-          @tokenizer.split('a b/c.d').should == ['a', 'b/c.d']
+          tokenizer.split('a b/c.d').should == ['a', 'b/c.d']
        end
        it 'splits text on /\s/ by default' do
-          @tokenizer.split('this is a test').should == ['this', 'is', 'a', 'test']
+          tokenizer.split('this is a test').should == ['this', 'is', 'a', 'test']
        end
      end
      context "with removes_characters called" do
        before(:each) do
-          @tokenizer.splits_text_on(/[\s\.\/]/)
+          tokenizer.splits_text_on(/[\s\.\/]/)
        end
        it "has split" do
-          lambda { @tokenizer.split('a b/c.d') }.should_not raise_error
+          expect { tokenizer.split('a b/c.d') }.to_not raise_error
        end
        it "removes illegal characters" do
-          @tokenizer.split('a b/c.d').should == ['a','b','c','d']
+          tokenizer.split('a b/c.d').should == ['a','b','c','d']
        end
      end
    end
@@ -125,22 +123,23 @@ describe Tokenizers::Base do
    describe "removes_characters" do
      context "without removes_characters called" do
        it "has remove_illegals" do
-          lambda { @tokenizer.remove_illegals('any') }.should_not raise_error
+          expect { tokenizer.remove_illegals('any') }.to_not raise_error
        end
        it 'should define a method remove_illegals that does nothing' do
          unchanging = stub :unchanging
-          @tokenizer.remove_illegals unchanging
+
+          tokenizer.remove_illegals unchanging
        end
      end
      context "with removes_characters called" do
        before(:each) do
-          @tokenizer.removes_characters(/[afo]/)
+          tokenizer.removes_characters(/[afo]/)
        end
        it "has remove_illegals" do
-          lambda { @tokenizer.remove_illegals('abcdefghijklmnop') }.should_not raise_error
+          expect { tokenizer.remove_illegals('abcdefghijklmnop') }.to_not raise_error
        end
        it "removes illegal characters" do
-          @tokenizer.remove_illegals('abcdefghijklmnop').should == 'bcdeghijklmnp'
+          tokenizer.remove_illegals('abcdefghijklmnop').should == 'bcdeghijklmnp'
        end
      end
    end
@@ -148,45 +147,44 @@ describe Tokenizers::Base do
    describe 'stopwords' do
      context 'without stopwords given' do
        it 'should define a method remove_stopwords' do
-          lambda { @tokenizer.remove_stopwords('from this text') }.should_not raise_error
+          lambda { tokenizer.remove_stopwords('from this text') }.should_not raise_error
        end
        it 'should define a method remove_stopwords that does nothing' do
-          @tokenizer.remove_stopwords('from this text').should == 'from this text'
+          tokenizer.remove_stopwords('from this text').should == 'from this text'
        end
        it 'should define a method remove_non_single_stopwords' do
-          lambda { @tokenizer.remove_non_single_stopwords('from this text') }.should_not raise_error
-
+          expect { tokenizer.remove_non_single_stopwords('from this text') }.to_not raise_error
        end
      end
      context 'with stopwords given' do
        before(:each) do
-          @tokenizer.stopwords(/r|e/)
+          tokenizer.stopwords(/r|e/)
        end
        it 'should define a method remove_stopwords' do
-          lambda { @tokenizer.remove_stopwords('from this text') }.should_not raise_error
+          lambda { tokenizer.remove_stopwords('from this text') }.should_not raise_error
        end
        it 'should define a method stopwords that removes stopwords' do
-          @tokenizer.remove_stopwords('from this text').should == 'fom this txt'
+          tokenizer.remove_stopwords('from this text').should == 'fom this txt'
        end
        it 'should define a method remove_non_single_stopwords' do
-          lambda { @tokenizer.remove_non_single_stopwords('from this text') }.should_not raise_error
+          expect { tokenizer.remove_non_single_stopwords('from this text') }.to_not raise_error
        end
        it 'should define a method remove_non_single_stopwords that removes non-single stopwords' do
-          @tokenizer.remove_non_single_stopwords('rerere rerere').should == ' '
+          tokenizer.remove_non_single_stopwords('rerere rerere').should == ' '
        end
        it 'should define a method remove_non_single_stopwords that does not single stopwords' do
-          @tokenizer.remove_non_single_stopwords('rerere').should == 'rerere'
+          tokenizer.remove_non_single_stopwords('rerere').should == 'rerere'
        end
      end
      context 'error case' do
        before(:each) do
-          @tokenizer.stopwords(/any/)
+          tokenizer.stopwords(/any/)
        end
        it 'should not remove non-single stopwords with a star' do
-          @tokenizer.remove_non_single_stopwords('a*').should == 'a*'
+          tokenizer.remove_non_single_stopwords('a*').should == 'a*'
        end
        it 'should not remove non-single stopwords with a tilde' do
-          @tokenizer.remove_non_single_stopwords('a~').should == 'a~'
+          tokenizer.remove_non_single_stopwords('a~').should == 'a~'
        end
      end
    end
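
The base_spec.rb hunks above bundle a second, smaller migration alongside the `let` conversion: exception expectations move from the deprecated `lambda` receiver form to RSpec's block form. A minimal sketch, with the same environment assumptions as the example above:

    # 1.4.0
    lambda { @tokenizer.remove_illegals('any') }.should_not raise_error

    # 1.4.1
    expect { tokenizer.remove_illegals('any') }.to_not raise_error

The two assertions are equivalent here; `expect { ... }` is simply the wrapper RSpec prefers for raise_error matchers. The index and query specs below receive only the `let` conversion.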
data/spec/lib/tokenizers/index_spec.rb

@@ -4,9 +4,7 @@ require 'spec_helper'
 
 describe Tokenizers::Index do
 
-  before(:each) do
-    @tokenizer = Tokenizers::Index.new
-  end
+  let(:tokenizer) { Tokenizers::Index.new }
 
   describe "default*" do
    before(:all) do
@@ -33,13 +31,13 @@ describe Tokenizers::Index do
 
  describe "remove_removes_characters" do
    it "should not remove ' from a query by default" do
-      @tokenizer.remove_illegals("Lugi's").should == "Lugi's"
+      tokenizer.remove_illegals("Lugi's").should == "Lugi's"
    end
  end
 
  describe "reject!" do
    it "should reject tokens if blank" do
-      @tokenizer.reject(['', 'not blank', '']).should == ['not blank']
+      tokenizer.reject(['', 'not blank', '']).should == ['not blank']
    end
  end
 
@@ -47,7 +45,7 @@ describe Tokenizers::Index do
  describe "normalizing" do
    def self.it_should_normalize_token(text, expected)
      it "should handle the #{text} case" do
-        @tokenizer.tokenize(text).to_a.should == [expected].compact
+        tokenizer.tokenize(text).to_a.should == [expected].compact
      end
    end
    # defaults
@@ -57,7 +55,7 @@ describe Tokenizers::Index do
  describe "tokenizing" do
    def self.it_should_tokenize_token(text, expected)
      it "should handle the #{text} case" do
-        @tokenizer.tokenize(text).to_a.should == expected
+        tokenizer.tokenize(text).to_a.should == expected
      end
    end
    # defaults
data/spec/lib/tokenizers/query_spec.rb

@@ -3,9 +3,7 @@ require 'spec_helper'
 
 describe Tokenizers::Query do
 
-  before(:each) do
-    @tokenizer = Tokenizers::Query.new
-  end
+  let(:tokenizer) { Tokenizers::Query.new }
 
  describe "default*" do
    before(:all) do
@@ -32,7 +30,7 @@ describe Tokenizers::Query do
 
  describe "maximum_tokens" do
    it "should be set to 5 by default" do
-      @tokenizer.maximum_tokens.should == 5
+      tokenizer.maximum_tokens.should == 5
    end
    it "should be settable" do
      Tokenizers::Query.new(maximum_tokens: 3).maximum_tokens.should == 3
@@ -43,15 +41,15 @@ describe Tokenizers::Query do
    it 'should call methods in order' do
      text = stub :text
 
-      @tokenizer.should_receive(:remove_illegals).once.ordered.with text
-      @tokenizer.should_receive(:remove_non_single_stopwords).once.ordered.with text
+      tokenizer.should_receive(:remove_illegals).once.ordered.with text
+      tokenizer.should_receive(:remove_non_single_stopwords).once.ordered.with text
 
-      @tokenizer.preprocess text
+      tokenizer.preprocess text
    end
    it 'should return the text unchanged by default' do
      text = "some text"
 
-      @tokenizer.preprocess(text).should == text
+      tokenizer.preprocess(text).should == text
    end
  end
 
@@ -60,9 +58,9 @@ describe Tokenizers::Query do
      @tokens = mock :tokens, :null_object => true
    end
    it 'should tokenize the tokens' do
-      @tokens.should_receive(:tokenize_with).once.with @tokenizer
+      @tokens.should_receive(:tokenize_with).once.with tokenizer
 
-      @tokenizer.process @tokens
+      tokenizer.process @tokens
    end
    it 'should call methods on the tokens in order' do
      @tokens.should_receive(:tokenize_with).once.ordered
@@ -70,17 +68,17 @@ describe Tokenizers::Query do
      @tokens.should_receive(:cap).once.ordered
      @tokens.should_receive(:partialize_last).once.ordered
 
-      @tokenizer.process @tokens
+      tokenizer.process @tokens
    end
    it 'should return the tokens' do
-      @tokenizer.process(@tokens).should == @tokens
+      tokenizer.process(@tokens).should == @tokens
    end
  end
 
  describe 'pretokenize' do
    def self.it_should_pretokenize text, expected
      it "should pretokenize #{text} as #{expected}" do
-        @tokenizer.pretokenize(text).should == expected
+        tokenizer.pretokenize(text).should == expected
      end
    end
    it_should_pretokenize 'test miau test', ['test', 'miau', 'test']
@@ -89,7 +87,7 @@ describe Tokenizers::Query do
  describe "tokenizing" do
    def self.it_should_tokenize_token(text, expected)
      it "should handle the #{text} case" do
-        @tokenizer.tokenize(text).map(&:text).should == expected
+        tokenizer.tokenize(text).map(&:text).should == expected
      end
    end
    it_should_tokenize_token 'simple tokenizing on \s', [:simple, :tokenizing, :on, :'\s']
@@ -98,7 +96,7 @@ describe Tokenizers::Query do
  describe 'normalize_with_patterns' do
    def self.it_should_pattern_normalize original, expected
      it "should normalize #{original} with pattern into #{expected}" do
-        @tokenizer.normalize_with_patterns(original).should == expected
+        tokenizer.normalize_with_patterns(original).should == expected
      end
    end
    it_should_pattern_normalize 'no pattern normalization', 'no pattern normalization'
@@ -106,22 +104,22 @@ describe Tokenizers::Query do
 
  describe 'reject' do
    it 'should reject blank tokens' do
-      @tokenizer.reject(["some token answering to blank?", nil, nil]).should == ["some token answering to blank?"]
+      tokenizer.reject(["some token answering to blank?", nil, nil]).should == ["some token answering to blank?"]
    end
  end
 
  describe "last token" do
    it "should be partial" do
-      @tokenizer.tokenize("First Second Third Last").last.instance_variable_get(:@partial).should be_true
+      tokenizer.tokenize("First Second Third Last").last.instance_variable_get(:@partial).should be_true
    end
  end
 
  describe ".tokenize" do
    it "should return an Array of tokens" do
-      @tokenizer.tokenize('test test').to_a.should be_instance_of(Array)
+      tokenizer.tokenize('test test').to_a.should be_instance_of(Array)
    end
    it "should return an empty tokenized query if the query string is blank or empty" do
-      @tokenizer.tokenize('').map(&:to_s).should == []
+      tokenizer.tokenize('').map(&:to_s).should == []
    end
  end
  describe "token_for" do
@@ -129,7 +127,7 @@ describe Tokenizers::Query do
      text = stub(:text)
      Query::Token.should_receive(:processed).with text
 
-      @tokenizer.token_for text
+      tokenizer.token_for text
    end
  end
 