picky 0.3.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/picky/application.rb +2 -2
- data/lib/picky/cacher/partial/default.rb +1 -1
- data/lib/picky/configuration/field.rb +8 -10
- data/lib/picky/configuration/indexes.rb +6 -6
- data/lib/picky/configuration/queries.rb +4 -3
- data/lib/picky/cores.rb +2 -2
- data/lib/picky/extensions/array.rb +2 -12
- data/lib/picky/generator.rb +27 -4
- data/lib/picky/index/bundle.rb +5 -41
- data/lib/picky/index/bundle_checker.rb +58 -0
- data/lib/picky/index/type.rb +4 -1
- data/lib/picky/index/wrappers/exact_first.rb +57 -0
- data/lib/picky/indexes.rb +12 -19
- data/lib/picky/loader.rb +7 -8
- data/lib/picky/query/allocation.rb +1 -1
- data/lib/picky/query/combinations.rb +9 -6
- data/lib/picky/query/combinator.rb +11 -5
- data/lib/picky/rack/harakiri.rb +1 -1
- data/lib/picky/results/base.rb +4 -12
- data/lib/picky/results/live.rb +0 -6
- data/lib/picky/routing.rb +17 -17
- data/lib/picky/sources/csv.rb +1 -2
- data/lib/picky/sources/db.rb +0 -1
- data/lib/picky/sources/delicious.rb +41 -0
- data/lib/picky/tokenizers/base.rb +52 -43
- data/lib/picky/tokenizers/default/index.rb +7 -0
- data/lib/picky/tokenizers/default/query.rb +7 -0
- data/lib/picky/tokenizers/index.rb +0 -9
- data/lib/picky/tokenizers/query.rb +0 -9
- data/lib/tasks/application.rake +1 -1
- data/lib/tasks/cache.rake +41 -48
- data/lib/tasks/framework.rake +1 -1
- data/lib/tasks/index.rake +22 -12
- data/lib/tasks/server.rake +3 -3
- data/lib/tasks/shortcuts.rake +9 -2
- data/lib/tasks/statistics.rake +8 -8
- data/lib/tasks/try.rake +4 -2
- data/project_prototype/Gemfile +1 -1
- data/project_prototype/app/application.rb +7 -3
- data/spec/lib/cacher/partial/default_spec.rb +1 -1
- data/spec/lib/cacher/partial/none_spec.rb +12 -0
- data/spec/lib/cacher/partial/subtoken_spec.rb +29 -1
- data/spec/lib/configuration/field_spec.rb +162 -3
- data/spec/lib/configuration/indexes_spec.rb +150 -0
- data/spec/lib/cores_spec.rb +43 -0
- data/spec/lib/extensions/module_spec.rb +27 -16
- data/spec/lib/generator_spec.rb +3 -3
- data/spec/lib/index/bundle_checker_spec.rb +67 -0
- data/spec/lib/index/bundle_spec.rb +0 -50
- data/spec/lib/index/type_spec.rb +47 -0
- data/spec/lib/index/wrappers/exact_first_spec.rb +95 -0
- data/spec/lib/indexers/base_spec.rb +18 -2
- data/spec/lib/loader_spec.rb +21 -1
- data/spec/lib/query/allocation_spec.rb +25 -0
- data/spec/lib/query/base_spec.rb +37 -0
- data/spec/lib/query/combination_spec.rb +10 -1
- data/spec/lib/query/combinations_spec.rb +82 -3
- data/spec/lib/query/combinator_spec.rb +45 -0
- data/spec/lib/query/token_spec.rb +24 -0
- data/spec/lib/rack/harakiri_spec.rb +28 -0
- data/spec/lib/results/base_spec.rb +24 -0
- data/spec/lib/results/live_spec.rb +15 -0
- data/spec/lib/routing_spec.rb +5 -0
- data/spec/lib/sources/db_spec.rb +31 -1
- data/spec/lib/sources/delicious_spec.rb +75 -0
- data/spec/lib/tokenizers/base_spec.rb +160 -49
- data/spec/lib/tokenizers/default/index_spec.rb +11 -0
- data/spec/lib/tokenizers/default/query_spec.rb +11 -0
- metadata +26 -5
- data/lib/picky/index/combined.rb +0 -45
- data/lib/picky/tokenizers/default.rb +0 -3
@@ -6,56 +6,167 @@ describe Tokenizers::Base do
|
|
6
6
|
before(:each) do
|
7
7
|
@tokenizer = Tokenizers::Base.new
|
8
8
|
end
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
9
|
+
|
10
|
+
describe "removes_characters_after_splitting" do
|
11
|
+
context "without removes_characters_after_splitting called" do
|
12
|
+
it "has remove_after_normalizing_illegals" do
|
13
|
+
lambda { @tokenizer.remove_after_normalizing_illegals('any') }.should_not raise_error
|
14
|
+
end
|
15
|
+
it 'should define a remove_after_normalizing_illegals normalize_with_patterns does nothing' do
|
16
|
+
unchanging = stub :unchanging
|
17
|
+
@tokenizer.remove_after_normalizing_illegals unchanging
|
18
|
+
end
|
19
|
+
end
|
20
|
+
context "with removes_characters_after_splitting called" do
|
21
|
+
before(:each) do
|
22
|
+
@tokenizer.removes_characters_after_splitting(/[afo]/)
|
23
|
+
end
|
24
|
+
it "has remove_after_normalizing_illegals" do
|
25
|
+
lambda { @tokenizer.remove_after_normalizing_illegals('abcdefghijklmnop') }.should_not raise_error
|
26
|
+
end
|
27
|
+
it "removes illegal characters" do
|
28
|
+
@tokenizer.remove_after_normalizing_illegals('abcdefghijklmnop').should == 'bcdeghijklmnp'
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
describe "normalizes_words" do
|
34
|
+
context "without normalizes_words called" do
|
35
|
+
it "has normalize_with_patterns" do
|
36
|
+
lambda { @tokenizer.normalize_with_patterns('any') }.should_not raise_error
|
37
|
+
end
|
38
|
+
it 'should define a method normalize_with_patterns does nothing' do
|
39
|
+
unchanging = stub :unchanging
|
40
|
+
@tokenizer.normalize_with_patterns(unchanging).should == unchanging
|
41
|
+
end
|
42
|
+
end
|
43
|
+
context "with normalizes_words called" do
|
44
|
+
before(:each) do
|
45
|
+
@tokenizer.normalizes_words([
|
46
|
+
[/st\./, 'sankt'],
|
47
|
+
[/stras?s?e?/, 'str']
|
48
|
+
])
|
49
|
+
end
|
50
|
+
it "has normalize_with_patterns" do
|
51
|
+
lambda { @tokenizer.normalize_with_patterns('a b/c.d') }.should_not raise_error
|
52
|
+
end
|
53
|
+
it "normalizes, but just the first one" do
|
54
|
+
@tokenizer.normalize_with_patterns('st. wegstrasse').should == 'sankt wegstrasse'
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
describe "splits_text_on" do
|
60
|
+
context "without splits_text_on called" do
|
61
|
+
it "has split" do
|
62
|
+
lambda { @tokenizer.split('any') }.should_not raise_error
|
63
|
+
end
|
64
|
+
it 'should define a method split that splits by default on \s' do
|
65
|
+
@tokenizer.split('a b/c.d').should == ['a', 'b/c.d']
|
66
|
+
end
|
67
|
+
end
|
68
|
+
context "with removes_characters called" do
|
69
|
+
before(:each) do
|
70
|
+
@tokenizer.splits_text_on(/[\s\.\/]/)
|
71
|
+
end
|
72
|
+
it "has split" do
|
73
|
+
lambda { @tokenizer.split('a b/c.d') }.should_not raise_error
|
74
|
+
end
|
75
|
+
it "removes illegal characters" do
|
76
|
+
@tokenizer.split('a b/c.d').should == ['a','b','c','d']
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
describe "removes_characters" do
|
82
|
+
context "without removes_characters called" do
|
83
|
+
it "has remove_illegals" do
|
84
|
+
lambda { @tokenizer.remove_illegals('any') }.should_not raise_error
|
85
|
+
end
|
86
|
+
it 'should define a method remove_illegals that does nothing' do
|
87
|
+
unchanging = stub :unchanging
|
88
|
+
@tokenizer.remove_illegals unchanging
|
89
|
+
end
|
90
|
+
end
|
91
|
+
context "with removes_characters called" do
|
92
|
+
before(:each) do
|
93
|
+
@tokenizer.removes_characters(/[afo]/)
|
94
|
+
end
|
95
|
+
it "has remove_illegals" do
|
96
|
+
lambda { @tokenizer.remove_illegals('abcdefghijklmnop') }.should_not raise_error
|
97
|
+
end
|
98
|
+
it "removes illegal characters" do
|
99
|
+
@tokenizer.remove_illegals('abcdefghijklmnop').should == 'bcdeghijklmnp'
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
describe 'contracts_expressions' do
|
105
|
+
context 'without contract_expressions called' do
|
106
|
+
it 'should define a method contract' do
|
107
|
+
lambda { @tokenizer.contract('from this text') }.should_not raise_error
|
108
|
+
end
|
109
|
+
it 'should define a method contract that does nothing' do
|
110
|
+
unchanging = stub :unchanging
|
111
|
+
@tokenizer.contract unchanging
|
112
|
+
end
|
113
|
+
end
|
114
|
+
context 'with contracts_expressions called' do
|
115
|
+
before(:each) do
|
116
|
+
@tokenizer.contracts_expressions(/Mister|Mr./, 'mr')
|
117
|
+
end
|
118
|
+
it 'should define a method remove_stopwords' do
|
119
|
+
lambda { @tokenizer.contract('from this text') }.should_not raise_error
|
120
|
+
end
|
121
|
+
it 'should define a method contract that contracts expressions' do
|
122
|
+
@tokenizer.contract('Mister Meyer, Mr. Peter').should == 'mr Meyer, mr Peter'
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
describe 'stopwords' do
|
128
|
+
context 'without stopwords given' do
|
129
|
+
it 'should define a method remove_stopwords' do
|
130
|
+
lambda { @tokenizer.remove_stopwords('from this text') }.should_not raise_error
|
131
|
+
end
|
132
|
+
it 'should define a method remove_stopwords that does nothing' do
|
133
|
+
@tokenizer.remove_stopwords('from this text').should == 'from this text'
|
134
|
+
end
|
135
|
+
it 'should define a method remove_non_single_stopwords' do
|
136
|
+
lambda { @tokenizer.remove_non_single_stopwords('from this text') }.should_not raise_error
|
137
|
+
|
138
|
+
end
|
139
|
+
end
|
140
|
+
context 'with stopwords given' do
|
141
|
+
before(:each) do
|
142
|
+
@tokenizer.stopwords(/r|e/)
|
143
|
+
end
|
144
|
+
it 'should define a method remove_stopwords' do
|
145
|
+
lambda { @tokenizer.remove_stopwords('from this text') }.should_not raise_error
|
146
|
+
end
|
147
|
+
it 'should define a method stopwords that removes stopwords' do
|
148
|
+
@tokenizer.remove_stopwords('from this text').should == 'fom this txt'
|
149
|
+
end
|
150
|
+
it 'should define a method remove_non_single_stopwords' do
|
151
|
+
lambda { @tokenizer.remove_non_single_stopwords('from this text') }.should_not raise_error
|
152
|
+
end
|
153
|
+
it 'should define a method remove_non_single_stopwords that removes non-single stopwords' do
|
154
|
+
@tokenizer.remove_non_single_stopwords('rerere rerere').should == ' '
|
155
|
+
end
|
156
|
+
it 'should define a method remove_non_single_stopwords that does not single stopwords' do
|
157
|
+
@tokenizer.remove_non_single_stopwords('rerere').should == 'rerere'
|
158
|
+
end
|
159
|
+
end
|
160
|
+
context 'error case' do
|
161
|
+
before(:each) do
|
162
|
+
@tokenizer.stopwords(/any/)
|
163
|
+
end
|
164
|
+
it 'should not remove non-single stopwords with a star' do
|
165
|
+
@tokenizer.remove_non_single_stopwords('a*').should == 'a*'
|
166
|
+
end
|
167
|
+
it 'should not remove non-single stopwords with a tilde' do
|
168
|
+
@tokenizer.remove_non_single_stopwords('a~').should == 'a~'
|
57
169
|
end
|
58
170
|
end
|
59
171
|
end
|
60
|
-
|
61
172
|
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
- 3
|
7
|
+
- 9
|
8
8
|
- 0
|
9
|
-
version: 0.3.0
|
9
|
+
version: 0.9.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Florian Hanke
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-10-
|
17
|
+
date: 2010-10-26 00:00:00 +02:00
|
18
18
|
default_executable: picky
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -75,9 +75,10 @@ files:
|
|
75
75
|
- lib/picky/helpers/gc.rb
|
76
76
|
- lib/picky/helpers/measuring.rb
|
77
77
|
- lib/picky/index/bundle.rb
|
78
|
+
- lib/picky/index/bundle_checker.rb
|
78
79
|
- lib/picky/index/category.rb
|
79
|
-
- lib/picky/index/combined.rb
|
80
80
|
- lib/picky/index/type.rb
|
81
|
+
- lib/picky/index/wrappers/exact_first.rb
|
81
82
|
- lib/picky/indexers/base.rb
|
82
83
|
- lib/picky/indexers/default.rb
|
83
84
|
- lib/picky/indexers/field.rb
|
@@ -111,8 +112,10 @@ files:
|
|
111
112
|
- lib/picky/sources/base.rb
|
112
113
|
- lib/picky/sources/csv.rb
|
113
114
|
- lib/picky/sources/db.rb
|
115
|
+
- lib/picky/sources/delicious.rb
|
114
116
|
- lib/picky/tokenizers/base.rb
|
115
|
-
- lib/picky/tokenizers/default.rb
|
117
|
+
- lib/picky/tokenizers/default/index.rb
|
118
|
+
- lib/picky/tokenizers/default/query.rb
|
116
119
|
- lib/picky/tokenizers/index.rb
|
117
120
|
- lib/picky/tokenizers/query.rb
|
118
121
|
- lib/picky/umlaut_substituter.rb
|
@@ -145,6 +148,7 @@ files:
|
|
145
148
|
- spec/ext/performant_spec.rb
|
146
149
|
- spec/lib/application_spec.rb
|
147
150
|
- spec/lib/cacher/partial/default_spec.rb
|
151
|
+
- spec/lib/cacher/partial/none_spec.rb
|
148
152
|
- spec/lib/cacher/partial/subtoken_spec.rb
|
149
153
|
- spec/lib/cacher/partial_generator_spec.rb
|
150
154
|
- spec/lib/cacher/similarity/double_levenshtone_spec.rb
|
@@ -153,6 +157,7 @@ files:
|
|
153
157
|
- spec/lib/cacher/weights/logarithmic_spec.rb
|
154
158
|
- spec/lib/cacher/weights_generator_spec.rb
|
155
159
|
- spec/lib/configuration/field_spec.rb
|
160
|
+
- spec/lib/configuration/indexes_spec.rb
|
156
161
|
- spec/lib/configuration/type_spec.rb
|
157
162
|
- spec/lib/cores_spec.rb
|
158
163
|
- spec/lib/extensions/array_spec.rb
|
@@ -164,9 +169,12 @@ files:
|
|
164
169
|
- spec/lib/helpers/cache_spec.rb
|
165
170
|
- spec/lib/helpers/gc_spec.rb
|
166
171
|
- spec/lib/helpers/measuring_spec.rb
|
172
|
+
- spec/lib/index/bundle_checker_spec.rb
|
167
173
|
- spec/lib/index/bundle_partial_generation_speed_spec.rb
|
168
174
|
- spec/lib/index/bundle_spec.rb
|
169
175
|
- spec/lib/index/category_spec.rb
|
176
|
+
- spec/lib/index/type_spec.rb
|
177
|
+
- spec/lib/index/wrappers/exact_first_spec.rb
|
170
178
|
- spec/lib/indexers/base_spec.rb
|
171
179
|
- spec/lib/indexers/field_spec.rb
|
172
180
|
- spec/lib/loader_spec.rb
|
@@ -186,11 +194,15 @@ files:
|
|
186
194
|
- spec/lib/query/weights_spec.rb
|
187
195
|
- spec/lib/rack/harakiri_spec.rb
|
188
196
|
- spec/lib/results/base_spec.rb
|
197
|
+
- spec/lib/results/live_spec.rb
|
189
198
|
- spec/lib/routing_spec.rb
|
190
199
|
- spec/lib/solr/schema_generator_spec.rb
|
191
200
|
- spec/lib/sources/csv_spec.rb
|
192
201
|
- spec/lib/sources/db_spec.rb
|
202
|
+
- spec/lib/sources/delicious_spec.rb
|
193
203
|
- spec/lib/tokenizers/base_spec.rb
|
204
|
+
- spec/lib/tokenizers/default/index_spec.rb
|
205
|
+
- spec/lib/tokenizers/default/query_spec.rb
|
194
206
|
- spec/lib/tokenizers/index_spec.rb
|
195
207
|
- spec/lib/tokenizers/query_spec.rb
|
196
208
|
- spec/lib/umlaut_substituter_spec.rb
|
@@ -232,6 +244,7 @@ test_files:
|
|
232
244
|
- spec/ext/performant_spec.rb
|
233
245
|
- spec/lib/application_spec.rb
|
234
246
|
- spec/lib/cacher/partial/default_spec.rb
|
247
|
+
- spec/lib/cacher/partial/none_spec.rb
|
235
248
|
- spec/lib/cacher/partial/subtoken_spec.rb
|
236
249
|
- spec/lib/cacher/partial_generator_spec.rb
|
237
250
|
- spec/lib/cacher/similarity/double_levenshtone_spec.rb
|
@@ -240,6 +253,7 @@ test_files:
|
|
240
253
|
- spec/lib/cacher/weights/logarithmic_spec.rb
|
241
254
|
- spec/lib/cacher/weights_generator_spec.rb
|
242
255
|
- spec/lib/configuration/field_spec.rb
|
256
|
+
- spec/lib/configuration/indexes_spec.rb
|
243
257
|
- spec/lib/configuration/type_spec.rb
|
244
258
|
- spec/lib/cores_spec.rb
|
245
259
|
- spec/lib/extensions/array_spec.rb
|
@@ -251,9 +265,12 @@ test_files:
|
|
251
265
|
- spec/lib/helpers/cache_spec.rb
|
252
266
|
- spec/lib/helpers/gc_spec.rb
|
253
267
|
- spec/lib/helpers/measuring_spec.rb
|
268
|
+
- spec/lib/index/bundle_checker_spec.rb
|
254
269
|
- spec/lib/index/bundle_partial_generation_speed_spec.rb
|
255
270
|
- spec/lib/index/bundle_spec.rb
|
256
271
|
- spec/lib/index/category_spec.rb
|
272
|
+
- spec/lib/index/type_spec.rb
|
273
|
+
- spec/lib/index/wrappers/exact_first_spec.rb
|
257
274
|
- spec/lib/indexers/base_spec.rb
|
258
275
|
- spec/lib/indexers/field_spec.rb
|
259
276
|
- spec/lib/loader_spec.rb
|
@@ -273,11 +290,15 @@ test_files:
|
|
273
290
|
- spec/lib/query/weights_spec.rb
|
274
291
|
- spec/lib/rack/harakiri_spec.rb
|
275
292
|
- spec/lib/results/base_spec.rb
|
293
|
+
- spec/lib/results/live_spec.rb
|
276
294
|
- spec/lib/routing_spec.rb
|
277
295
|
- spec/lib/solr/schema_generator_spec.rb
|
278
296
|
- spec/lib/sources/csv_spec.rb
|
279
297
|
- spec/lib/sources/db_spec.rb
|
298
|
+
- spec/lib/sources/delicious_spec.rb
|
280
299
|
- spec/lib/tokenizers/base_spec.rb
|
300
|
+
- spec/lib/tokenizers/default/index_spec.rb
|
301
|
+
- spec/lib/tokenizers/default/query_spec.rb
|
281
302
|
- spec/lib/tokenizers/index_spec.rb
|
282
303
|
- spec/lib/tokenizers/query_spec.rb
|
283
304
|
- spec/lib/umlaut_substituter_spec.rb
|
data/lib/picky/index/combined.rb
DELETED
@@ -1,45 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
#
|
3
|
-
module Index
|
4
|
-
|
5
|
-
# This index combines an exact and partial index.
|
6
|
-
# It serves to order the results such that exact hits are found first.
|
7
|
-
#
|
8
|
-
# TODO Need to use the right subtokens. Bake in?
|
9
|
-
#
|
10
|
-
# TODO One can use it as a wrapper, and it will extract the indexes itself. Rename: ExactFirst.
|
11
|
-
#
|
12
|
-
class Combined < Bundle
|
13
|
-
|
14
|
-
delegate :similar,
|
15
|
-
:identifier,
|
16
|
-
:name,
|
17
|
-
:to => :@exact
|
18
|
-
delegate :type,
|
19
|
-
:category,
|
20
|
-
:weight,
|
21
|
-
:generate_partial_from,
|
22
|
-
:generate_caches_from_memory,
|
23
|
-
:generate_derived,
|
24
|
-
:dump,
|
25
|
-
:load,
|
26
|
-
:to => :@partial
|
27
|
-
|
28
|
-
# TODO initialize type_or_category # => installs itself on all exact and partial
|
29
|
-
#
|
30
|
-
def initialize exact, partial
|
31
|
-
@exact = exact
|
32
|
-
@partial = partial
|
33
|
-
end
|
34
|
-
|
35
|
-
def ids text
|
36
|
-
@exact.ids(text) + @partial.ids(text)
|
37
|
-
end
|
38
|
-
|
39
|
-
def weight text
|
40
|
-
[@exact.weight(text) || 0, @partial.weight(text) || 0].max
|
41
|
-
end
|
42
|
-
|
43
|
-
end
|
44
|
-
|
45
|
-
end
|