picky 1.5.2 → 1.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. data/lib/picky/analyzer.rb +154 -0
  2. data/lib/picky/application.rb +53 -33
  3. data/lib/picky/character_substituters/west_european.rb +10 -6
  4. data/lib/picky/cli.rb +18 -18
  5. data/lib/picky/index/base.rb +44 -13
  6. data/lib/picky/index_bundle.rb +13 -4
  7. data/lib/picky/indexed/indexes.rb +26 -10
  8. data/lib/picky/indexing/indexes.rb +26 -24
  9. data/lib/picky/interfaces/live_parameters.rb +23 -16
  10. data/lib/picky/internals/extensions/object.rb +13 -6
  11. data/lib/picky/internals/frontend_adapters/rack.rb +30 -34
  12. data/lib/picky/internals/index/backend.rb +1 -2
  13. data/lib/picky/internals/index/file/basic.rb +18 -14
  14. data/lib/picky/internals/index/files.rb +16 -6
  15. data/lib/picky/internals/index/redis/basic.rb +12 -5
  16. data/lib/picky/internals/index/redis.rb +2 -2
  17. data/lib/picky/internals/indexed/bundle/base.rb +58 -14
  18. data/lib/picky/internals/indexed/bundle/memory.rb +40 -14
  19. data/lib/picky/internals/indexed/bundle/redis.rb +9 -30
  20. data/lib/picky/internals/indexed/categories.rb +19 -14
  21. data/lib/picky/internals/indexed/category.rb +44 -20
  22. data/lib/picky/internals/indexed/index.rb +23 -13
  23. data/lib/picky/internals/indexed/wrappers/bundle/wrapper.rb +27 -9
  24. data/lib/picky/internals/indexers/serial.rb +1 -1
  25. data/lib/picky/internals/indexing/bundle/base.rb +28 -28
  26. data/lib/picky/internals/indexing/bundle/memory.rb +14 -7
  27. data/lib/picky/internals/indexing/categories.rb +15 -11
  28. data/lib/picky/internals/indexing/category.rb +30 -20
  29. data/lib/picky/internals/indexing/index.rb +22 -14
  30. data/lib/picky/internals/query/allocations.rb +0 -15
  31. data/lib/picky/internals/query/combinations/base.rb +0 -4
  32. data/lib/picky/internals/query/combinations/redis.rb +19 -8
  33. data/lib/picky/internals/query/indexes.rb +3 -6
  34. data/lib/picky/internals/query/token.rb +0 -4
  35. data/lib/picky/internals/query/weights.rb +2 -11
  36. data/lib/picky/internals/results/base.rb +3 -10
  37. data/lib/picky/internals/tokenizers/base.rb +64 -28
  38. data/lib/picky/internals/tokenizers/index.rb +8 -8
  39. data/lib/picky/loader.rb +59 -53
  40. data/lib/picky/query/base.rb +23 -29
  41. data/lib/picky/sources/base.rb +10 -10
  42. data/lib/picky/sources/couch.rb +14 -10
  43. data/lib/picky/sources/csv.rb +21 -14
  44. data/lib/picky/sources/db.rb +37 -31
  45. data/lib/picky/sources/delicious.rb +11 -8
  46. data/lib/picky/sources/wrappers/base.rb +3 -1
  47. data/lib/picky/statistics.rb +66 -0
  48. data/lib/tasks/application.rake +3 -0
  49. data/lib/tasks/checks.rake +11 -0
  50. data/lib/tasks/framework.rake +3 -0
  51. data/lib/tasks/index.rake +9 -11
  52. data/lib/tasks/routes.rake +3 -2
  53. data/lib/tasks/shortcuts.rake +17 -5
  54. data/lib/tasks/statistics.rake +20 -12
  55. data/lib/tasks/try.rake +14 -14
  56. data/spec/lib/application_spec.rb +3 -3
  57. data/spec/lib/index/base_spec.rb +25 -3
  58. data/spec/lib/internals/extensions/object_spec.rb +46 -20
  59. data/spec/lib/internals/frontend_adapters/rack_spec.rb +3 -3
  60. data/spec/lib/internals/index/redis/basic_spec.rb +67 -0
  61. data/spec/lib/internals/indexers/serial_spec.rb +1 -1
  62. data/spec/lib/internals/results/base_spec.rb +0 -12
  63. data/spec/lib/internals/tokenizers/base_spec.rb +49 -1
  64. data/spec/lib/query/allocations_spec.rb +0 -56
  65. data/spec/lib/query/base_spec.rb +25 -21
  66. data/spec/lib/query/combinations/redis_spec.rb +6 -1
  67. data/spec/lib/sources/delicious_spec.rb +2 -2
  68. data/spec/lib/statistics_spec.rb +31 -0
  69. metadata +9 -2
data/lib/picky/internals/tokenizers/base.rb CHANGED
@@ -1,18 +1,35 @@
1
1
  module Internals
2
-
2
+
3
3
  module Tokenizers # :nodoc:all
4
-
4
+
5
5
  # Defines tokenizing processes used both in indexing and querying.
6
6
  #
7
7
  class Base
8
-
8
+
9
9
  # TODO Move EMPTY_STRING top level.
10
10
  #
11
11
  EMPTY_STRING = ''.freeze
12
-
12
+
13
+ def to_s
14
+ reject_condition_location = @reject_condition.to_s[/:(\d+) \(lambda\)/, 1]
15
+ <<-TOKENIZER
16
+ Removes characters: #{@removes_characters_regexp ? "/#{@removes_characters_regexp.source}/" : '-'}
17
+ Stopwords: #{@remove_stopwords_regexp ? "/#{@remove_stopwords_regexp.source}/" : '-'}
18
+ Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on.source}/" : (@splits_text_on ? @splits_text_on : '-')}
19
+ Removes chars after split: #{@removes_characters_after_splitting_regexp ? "/#{@removes_characters_after_splitting_regexp.source}/" : '-'}
20
+ Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'}
21
+ Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
22
+ Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' }
23
+ TOKENIZER
24
+ end
25
+
13
26
  # Stopwords.
14
27
  #
28
+ # We only allow regexps (even if string would be okay
29
+ # too for gsub! - it's too hard to understand)
30
+ #
15
31
  def stopwords regexp
32
+ check_argument_in __method__, Regexp, regexp
16
33
  @remove_stopwords_regexp = regexp
17
34
  end
18
35
  def remove_stopwords text
@@ -24,36 +41,45 @@ module Internals
24
41
  return text if text.match @@non_single_stopword_regexp
25
42
  remove_stopwords text
26
43
  end
27
-
44
+
28
45
  # Illegals.
29
46
  #
30
- # TODO Should there be a legal?
47
+ # We only allow regexps (even if string would be okay
48
+ # too for gsub! - it's too hard to understand)
31
49
  #
32
50
  def removes_characters regexp
51
+ check_argument_in __method__, Regexp, regexp
33
52
  @removes_characters_regexp = regexp
34
53
  end
35
54
  def remove_illegals text
36
55
  text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
37
56
  text
38
57
  end
39
-
58
+
40
59
  # Splitting.
41
60
  #
42
- def splits_text_on regexp
43
- @splits_text_on_regexp = regexp
61
+ # We allow Strings and Regexps.
62
+ # Note: We do not test against to_str since symbols do not work with String#split.
63
+ #
64
+ def splits_text_on regexp_or_string
65
+ raise ArgumentError.new "#{__method__} takes a Regexp or String as argument, not a #{regexp_or_string.class}." unless Regexp === regexp_or_string || String === regexp_or_string
66
+ @splits_text_on = regexp_or_string
44
67
  end
45
68
  def split text
46
- text.split @splits_text_on_regexp
69
+ text.split @splits_text_on
47
70
  end
48
-
71
+
49
72
  # Normalizing.
50
73
  #
74
+ # We only allow arrays.
75
+ #
51
76
  def normalizes_words regexp_replaces
77
+ raise ArgumentError.new "#{__method__} takes an Array of replaces as argument, not a #{regexp_replaces.class}." unless regexp_replaces.respond_to?(:to_ary)
52
78
  @normalizes_words_regexp_replaces = regexp_replaces
53
79
  end
54
80
  def normalize_with_patterns text
55
81
  return text unless @normalizes_words_regexp_replaces
56
-
82
+
57
83
  @normalizes_words_regexp_replaces.each do |regex, replace|
58
84
  # This should be sufficient
59
85
  #
@@ -62,28 +88,32 @@ module Internals
62
88
  remove_after_normalizing_illegals text
63
89
  text
64
90
  end
65
-
91
+
66
92
  # Illegal after normalizing.
67
93
  #
94
+ # We only allow regexps (even if string would be okay
95
+ # too for gsub! - it's too hard to understand)
96
+ #
68
97
  def removes_characters_after_splitting regexp
98
+ check_argument_in __method__, Regexp, regexp
69
99
  @removes_characters_after_splitting_regexp = regexp
70
100
  end
71
101
  def remove_after_normalizing_illegals text
72
102
  text.gsub! @removes_characters_after_splitting_regexp, EMPTY_STRING if @removes_characters_after_splitting_regexp
73
103
  end
74
-
104
+
75
105
  # Substitute Characters with this substituter.
76
106
  #
77
107
  # Default is European Character substitution.
78
108
  #
79
109
  def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
80
- # TODO Raise if it doesn't quack substitute?
110
+ raise ArgumentError.new "The substitutes_characters_with option needs a character substituter, which responds to #substitute." unless substituter.respond_to?(:substitute)
81
111
  @substituter = substituter
82
112
  end
83
113
  def substitute_characters text
84
- substituter?? substituter.substitute(text) : text
114
+ substituter?? substituter.substitute(text) : text
85
115
  end
86
-
116
+
87
117
  # Reject tokens after tokenizing based on the given criteria.
88
118
  #
89
119
  # Note: Currently only for indexing. TODO Redesign and write for both!
@@ -94,8 +124,14 @@ module Internals
94
124
  def reject tokens
95
125
  tokens.reject! &@reject_condition
96
126
  end
97
-
98
-
127
+
128
+ # Checks if the right argument type has been given.
129
+ #
130
+ def check_argument_in method, type, argument, &condition
131
+ raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
132
+ end
133
+
134
+
99
135
  # Returns a number of tokens, generated from the given text.
100
136
  #
101
137
  # Note:
@@ -109,10 +145,10 @@ module Internals
109
145
  tokens = tokens_for words # creating tokens / strings
110
146
  process tokens # processing tokens / strings
111
147
  end
112
-
148
+
113
149
  attr_reader :substituter
114
150
  alias substituter? substituter
115
-
151
+
116
152
  def initialize options = {}
117
153
  removes_characters options[:removes_characters] if options[:removes_characters]
118
154
  contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
@@ -120,16 +156,16 @@ module Internals
120
156
  normalizes_words options[:normalizes_words] if options[:normalizes_words]
121
157
  removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
122
158
  substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
123
-
159
+
124
160
  # Defaults.
125
161
  #
126
162
  splits_text_on options[:splits_text_on] || /\s/
127
163
  reject_token_if &(options[:reject_token_if] || :blank?)
128
164
  end
129
-
165
+
130
166
  # Hooks.
131
167
  #
132
-
168
+
133
169
  # Preprocessing.
134
170
  #
135
171
  def preprocess text; end
@@ -142,7 +178,7 @@ module Internals
142
178
  reject tokens # Reject any tokens that don't meet criteria
143
179
  tokens
144
180
  end
145
-
181
+
146
182
  # Converts words into real tokens.
147
183
  #
148
184
  def tokens_for words
@@ -158,9 +194,9 @@ module Internals
158
194
  def empty_tokens
159
195
  Internals::Query::Tokens.new
160
196
  end
161
-
197
+
162
198
  end
163
-
199
+
164
200
  end
165
-
201
+
166
202
  end
data/lib/picky/internals/tokenizers/index.rb CHANGED
@@ -1,20 +1,20 @@
1
1
  module Internals
2
2
 
3
3
  module Tokenizers
4
-
4
+
5
5
  # The base indexing tokenizer.
6
6
  #
7
7
  # Override in indexing subclasses and define in configuration.
8
8
  #
9
9
  class Index < Base
10
-
10
+
11
11
  def self.default= new_default
12
12
  @default = new_default
13
13
  end
14
14
  def self.default
15
15
  @default ||= new
16
16
  end
17
-
17
+
18
18
  # Default indexing preprocessing hook.
19
19
  #
20
20
  # Does:
@@ -34,7 +34,7 @@ module Internals
34
34
  remove_non_single_stopwords text
35
35
  text
36
36
  end
37
-
37
+
38
38
  # Default indexing pretokenizing hook.
39
39
  #
40
40
  # Does:
@@ -48,16 +48,16 @@ module Internals
48
48
  word
49
49
  end
50
50
  end
51
-
51
+
52
52
  # Does not actually return a token, but a
53
53
  # symbol "token".
54
54
  #
55
55
  def token_for text
56
56
  symbolize text
57
57
  end
58
-
58
+
59
59
  end
60
-
60
+
61
61
  end
62
-
62
+
63
63
  end
data/lib/picky/loader.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # Loads the search engine and itself.
2
2
  #
3
3
  module Loader # :nodoc:all
4
-
4
+
5
5
  # Reloads the whole app.
6
6
  # First itself, then the app.
7
7
  #
@@ -21,7 +21,7 @@ module Loader # :nodoc:all
21
21
  exclaim 'Loader loading itself.'
22
22
  load __FILE__
23
23
  end
24
-
24
+
25
25
  def self.require_relative filename
26
26
  require File.join(File.dirname(__FILE__), filename)
27
27
  end
@@ -31,7 +31,7 @@ module Loader # :nodoc:all
31
31
  def self.load_internals filename_without_rb
32
32
  load File.join(File.dirname(__FILE__), "internals/#{filename_without_rb}.rb")
33
33
  end
34
-
34
+
35
35
  def self.load_user filename
36
36
  load File.join(PICKY_ROOT, "#{filename}.rb")
37
37
  end
@@ -43,14 +43,14 @@ module Loader # :nodoc:all
43
43
  load filename
44
44
  end
45
45
  end
46
-
46
+
47
47
  # Load the user's application.
48
48
  #
49
49
  def self.load_application
50
50
  # Add lib dir to load path.
51
51
  #
52
52
  # add_lib_dir
53
-
53
+
54
54
  # Picky autoloading.
55
55
  #
56
56
  begin
@@ -63,25 +63,27 @@ module Loader # :nodoc:all
63
63
  load_user_lib namespaced_class_name.underscore # Try it once.
64
64
  retry
65
65
  end
66
-
66
+
67
+ # Prepare the application for reload.
68
+ #
69
+ # TODO Application.prepare_for_reload
70
+
67
71
  # Load the user's config.
68
72
  #
69
73
  load_user 'app/logging'
70
74
  load_user 'app/application'
71
-
75
+
72
76
  # Finalize the applications.
73
77
  #
74
- # TODO Problem: Reload Routes. Throw them all away and do them again?
75
- #
76
78
  Application.finalize_apps
77
-
79
+
78
80
  # TODO Rewrite
79
81
  #
80
82
  Internals::Query::Qualifiers.instance.prepare
81
-
83
+
82
84
  exclaim "Application #{Application.apps.map(&:name).join(', ')} loaded."
83
85
  end
84
-
86
+
85
87
  # Loads the internal parts of the framework.
86
88
  # (Not for the user)
87
89
  #
@@ -89,7 +91,7 @@ module Loader # :nodoc:all
89
91
  # Load compiled C code.
90
92
  #
91
93
  load_internals 'ext/maybe_compile'
92
-
94
+
93
95
  # Load extensions.
94
96
  #
95
97
  load_internals 'extensions/object'
@@ -97,24 +99,24 @@ module Loader # :nodoc:all
97
99
  load_internals 'extensions/symbol'
98
100
  load_internals 'extensions/module'
99
101
  load_internals 'extensions/hash'
100
-
102
+
101
103
  # Requiring Helpers
102
104
  #
103
105
  load_internals 'helpers/measuring'
104
-
106
+
105
107
  # Calculations.
106
108
  #
107
109
  load_internals 'calculations/location'
108
-
110
+
109
111
  # Index generation strategies.
110
112
  #
111
113
  load_internals 'indexers/no_source_specified_error'
112
114
  load_internals 'indexers/serial'
113
-
115
+
114
116
  # Generators.
115
117
  #
116
118
  load_internals 'generators/strategy'
117
-
119
+
118
120
  # Partial index generation strategies.
119
121
  #
120
122
  load_internals 'generators/partial/strategy'
@@ -127,37 +129,37 @@ module Loader # :nodoc:all
127
129
  load_internals 'generators/weights/strategy'
128
130
  load_internals 'generators/weights/logarithmic'
129
131
  load_internals 'generators/weights/default'
130
-
132
+
131
133
  # Similarity index generation strategies.
132
134
  #
133
135
  load_internals 'generators/similarity/strategy'
134
136
  load_internals 'generators/similarity/none'
135
137
  load_internals 'generators/similarity/double_levenshtone'
136
138
  load_internals 'generators/similarity/default'
137
-
139
+
138
140
  # Index generators.
139
141
  #
140
142
  load_internals 'generators/base'
141
143
  load_internals 'generators/partial_generator'
142
144
  load_internals 'generators/weights_generator'
143
145
  load_internals 'generators/similarity_generator'
144
-
146
+
145
147
  # Index store handling.
146
148
  #
147
149
  load_internals 'index/backend'
148
-
150
+
149
151
  load_internals 'index/redis'
150
152
  load_internals 'index/redis/basic'
151
153
  load_internals 'index/redis/list_hash'
152
154
  load_internals 'index/redis/string_hash'
153
-
155
+
154
156
  load_internals 'index/file/basic'
155
157
  load_internals 'index/file/text'
156
158
  load_internals 'index/file/marshal'
157
159
  load_internals 'index/file/json'
158
-
160
+
159
161
  load_internals 'index/files'
160
-
162
+
161
163
  # Indexing and Indexed things.
162
164
  #
163
165
  load_internals 'indexing/bundle/super_base' # TODO Remove.
@@ -167,68 +169,68 @@ module Loader # :nodoc:all
167
169
  load_internals 'indexing/category'
168
170
  load_internals 'indexing/categories'
169
171
  load_internals 'indexing/index'
170
-
172
+
171
173
  load_internals 'indexed/bundle/base'
172
174
  load_internals 'indexed/bundle/memory'
173
175
  load_internals 'indexed/bundle/redis'
174
176
  load_internals 'indexed/category'
175
177
  load_internals 'indexed/categories'
176
178
  load_internals 'indexed/index'
177
-
179
+
178
180
  # TODO Ok here?
179
181
  #
180
182
  load_internals 'indexed/wrappers/exact_first'
181
-
183
+
182
184
  # Bundle Wrapper
183
185
  #
184
186
  load_internals 'indexed/wrappers/bundle/wrapper'
185
187
  load_internals 'indexed/wrappers/bundle/calculation'
186
188
  load_internals 'indexed/wrappers/bundle/location'
187
-
189
+
188
190
  # Tokens.
189
191
  #
190
192
  load_internals 'query/token'
191
193
  load_internals 'query/tokens'
192
-
194
+
193
195
  # Tokenizers types.
194
196
  #
195
197
  load_internals 'tokenizers/base'
196
198
  load_internals 'tokenizers/index'
197
199
  load_internals 'tokenizers/query'
198
-
200
+
199
201
  # Query combinations, qualifiers, weigher.
200
202
  #
201
203
  load_internals 'query/combination'
202
204
  load_internals 'query/combinations/base'
203
205
  load_internals 'query/combinations/memory'
204
206
  load_internals 'query/combinations/redis'
205
-
207
+
206
208
  load_internals 'query/allocation'
207
209
  load_internals 'query/allocations'
208
-
210
+
209
211
  load_internals 'query/qualifiers'
210
-
212
+
211
213
  load_internals 'query/weights'
212
-
214
+
213
215
  load_internals 'query/indexes'
214
-
216
+
215
217
  # Results.
216
218
  #
217
219
  load_internals 'results/base'
218
220
  load_internals 'results/full'
219
221
  load_internals 'results/live'
220
-
222
+
221
223
  # Configuration.
222
224
  #
223
225
  load_internals 'configuration/index'
224
-
226
+
225
227
  # Adapters.
226
228
  #
227
229
  load_internals 'adapters/rack/base'
228
230
  load_internals 'adapters/rack/query'
229
231
  load_internals 'adapters/rack/live_parameters'
230
232
  load_internals 'adapters/rack'
231
-
233
+
232
234
  # Routing.
233
235
  #
234
236
  load_internals 'frontend_adapters/rack'
@@ -239,35 +241,39 @@ module Loader # :nodoc:all
239
241
  # Load harakiri.
240
242
  #
241
243
  load_relative 'rack/harakiri'
242
-
244
+
245
+ # Load analyzer.
246
+ #
247
+ load_relative 'analyzer'
248
+
243
249
  # Character Substituters
244
250
  #
245
251
  load_relative 'character_substituters/west_european'
246
-
252
+
247
253
  # Signal handling
248
254
  #
249
255
  load_relative 'signals'
250
-
256
+
251
257
  # Logging.
252
258
  #
253
259
  load_relative 'loggers/search'
254
-
260
+
255
261
  # Convenience accessors for generators.
256
262
  #
257
263
  load_relative 'generators/aliases'
258
-
264
+
259
265
  # API.
260
266
  #
261
267
  load_relative 'index/base'
262
268
  load_relative 'index/memory'
263
269
  load_relative 'index/redis'
264
-
270
+
265
271
  load_relative 'indexing/indexes'
266
272
  load_relative 'indexed/indexes'
267
-
273
+
268
274
  load_relative 'index_bundle'
269
275
  load_relative 'aliases'
270
-
276
+
271
277
  # Query.
272
278
  #
273
279
  load_relative 'query/base'
@@ -275,7 +281,7 @@ module Loader # :nodoc:all
275
281
  load_relative 'query/full'
276
282
  #
277
283
  # load_relative 'query/solr'
278
-
284
+
279
285
  # Sources.
280
286
  #
281
287
  load_relative 'sources/base'
@@ -283,23 +289,23 @@ module Loader # :nodoc:all
283
289
  load_relative 'sources/csv'
284
290
  load_relative 'sources/delicious'
285
291
  load_relative 'sources/couch'
286
-
292
+
287
293
  load_relative 'sources/wrappers/base'
288
294
  load_relative 'sources/wrappers/location'
289
-
295
+
290
296
  # Interfaces
291
297
  #
292
298
  load_relative 'interfaces/live_parameters'
293
-
299
+
294
300
  # Application.
295
301
  #
296
302
  load_relative 'application'
297
-
303
+
298
304
  # Load tools. Load in specific case?
299
305
  #
300
306
  load_relative 'cores'
301
307
  end
302
-
308
+
303
309
  # Loads the framework.
304
310
  #
305
311
  def self.load_framework