picky 1.5.2 → 1.5.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. data/lib/picky/analyzer.rb +154 -0
  2. data/lib/picky/application.rb +53 -33
  3. data/lib/picky/character_substituters/west_european.rb +10 -6
  4. data/lib/picky/cli.rb +18 -18
  5. data/lib/picky/index/base.rb +44 -13
  6. data/lib/picky/index_bundle.rb +13 -4
  7. data/lib/picky/indexed/indexes.rb +26 -10
  8. data/lib/picky/indexing/indexes.rb +26 -24
  9. data/lib/picky/interfaces/live_parameters.rb +23 -16
  10. data/lib/picky/internals/extensions/object.rb +13 -6
  11. data/lib/picky/internals/frontend_adapters/rack.rb +30 -34
  12. data/lib/picky/internals/index/backend.rb +1 -2
  13. data/lib/picky/internals/index/file/basic.rb +18 -14
  14. data/lib/picky/internals/index/files.rb +16 -6
  15. data/lib/picky/internals/index/redis/basic.rb +12 -5
  16. data/lib/picky/internals/index/redis.rb +2 -2
  17. data/lib/picky/internals/indexed/bundle/base.rb +58 -14
  18. data/lib/picky/internals/indexed/bundle/memory.rb +40 -14
  19. data/lib/picky/internals/indexed/bundle/redis.rb +9 -30
  20. data/lib/picky/internals/indexed/categories.rb +19 -14
  21. data/lib/picky/internals/indexed/category.rb +44 -20
  22. data/lib/picky/internals/indexed/index.rb +23 -13
  23. data/lib/picky/internals/indexed/wrappers/bundle/wrapper.rb +27 -9
  24. data/lib/picky/internals/indexers/serial.rb +1 -1
  25. data/lib/picky/internals/indexing/bundle/base.rb +28 -28
  26. data/lib/picky/internals/indexing/bundle/memory.rb +14 -7
  27. data/lib/picky/internals/indexing/categories.rb +15 -11
  28. data/lib/picky/internals/indexing/category.rb +30 -20
  29. data/lib/picky/internals/indexing/index.rb +22 -14
  30. data/lib/picky/internals/query/allocations.rb +0 -15
  31. data/lib/picky/internals/query/combinations/base.rb +0 -4
  32. data/lib/picky/internals/query/combinations/redis.rb +19 -8
  33. data/lib/picky/internals/query/indexes.rb +3 -6
  34. data/lib/picky/internals/query/token.rb +0 -4
  35. data/lib/picky/internals/query/weights.rb +2 -11
  36. data/lib/picky/internals/results/base.rb +3 -10
  37. data/lib/picky/internals/tokenizers/base.rb +64 -28
  38. data/lib/picky/internals/tokenizers/index.rb +8 -8
  39. data/lib/picky/loader.rb +59 -53
  40. data/lib/picky/query/base.rb +23 -29
  41. data/lib/picky/sources/base.rb +10 -10
  42. data/lib/picky/sources/couch.rb +14 -10
  43. data/lib/picky/sources/csv.rb +21 -14
  44. data/lib/picky/sources/db.rb +37 -31
  45. data/lib/picky/sources/delicious.rb +11 -8
  46. data/lib/picky/sources/wrappers/base.rb +3 -1
  47. data/lib/picky/statistics.rb +66 -0
  48. data/lib/tasks/application.rake +3 -0
  49. data/lib/tasks/checks.rake +11 -0
  50. data/lib/tasks/framework.rake +3 -0
  51. data/lib/tasks/index.rake +9 -11
  52. data/lib/tasks/routes.rake +3 -2
  53. data/lib/tasks/shortcuts.rake +17 -5
  54. data/lib/tasks/statistics.rake +20 -12
  55. data/lib/tasks/try.rake +14 -14
  56. data/spec/lib/application_spec.rb +3 -3
  57. data/spec/lib/index/base_spec.rb +25 -3
  58. data/spec/lib/internals/extensions/object_spec.rb +46 -20
  59. data/spec/lib/internals/frontend_adapters/rack_spec.rb +3 -3
  60. data/spec/lib/internals/index/redis/basic_spec.rb +67 -0
  61. data/spec/lib/internals/indexers/serial_spec.rb +1 -1
  62. data/spec/lib/internals/results/base_spec.rb +0 -12
  63. data/spec/lib/internals/tokenizers/base_spec.rb +49 -1
  64. data/spec/lib/query/allocations_spec.rb +0 -56
  65. data/spec/lib/query/base_spec.rb +25 -21
  66. data/spec/lib/query/combinations/redis_spec.rb +6 -1
  67. data/spec/lib/sources/delicious_spec.rb +2 -2
  68. data/spec/lib/statistics_spec.rb +31 -0
  69. metadata +9 -2
@@ -1,18 +1,35 @@
1
1
  module Internals
2
-
2
+
3
3
  module Tokenizers # :nodoc:all
4
-
4
+
5
5
  # Defines tokenizing processes used both in indexing and querying.
6
6
  #
7
7
  class Base
8
-
8
+
9
9
  # TODO Move EMPTY_STRING top level.
10
10
  #
11
11
  EMPTY_STRING = ''.freeze
12
-
12
+
13
+ def to_s
14
+ reject_condition_location = @reject_condition.to_s[/:(\d+) \(lambda\)/, 1]
15
+ <<-TOKENIZER
16
+ Removes characters: #{@removes_characters_regexp ? "/#{@removes_characters_regexp.source}/" : '-'}
17
+ Stopwords: #{@remove_stopwords_regexp ? "/#{@remove_stopwords_regexp.source}/" : '-'}
18
+ Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on.source}/" : (@splits_text_on ? @splits_text_on : '-')}
19
+ Removes chars after split: #{@removes_characters_after_splitting_regexp ? "/#{@removes_characters_after_splitting_regexp.source}/" : '-'}
20
+ Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'}
21
+ Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
22
+ Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' }
23
+ TOKENIZER
24
+ end
25
+
13
26
  # Stopwords.
14
27
  #
28
+ # We only allow regexps (even if string would be okay
29
+ # too for gsub! - it's too hard to understand)
30
+ #
15
31
  def stopwords regexp
32
+ check_argument_in __method__, Regexp, regexp
16
33
  @remove_stopwords_regexp = regexp
17
34
  end
18
35
  def remove_stopwords text
@@ -24,36 +41,45 @@ module Internals
24
41
  return text if text.match @@non_single_stopword_regexp
25
42
  remove_stopwords text
26
43
  end
27
-
44
+
28
45
  # Illegals.
29
46
  #
30
- # TODO Should there be a legal?
47
+ # We only allow regexps (even if string would be okay
48
+ # too for gsub! - it's too hard to understand)
31
49
  #
32
50
  def removes_characters regexp
51
+ check_argument_in __method__, Regexp, regexp
33
52
  @removes_characters_regexp = regexp
34
53
  end
35
54
  def remove_illegals text
36
55
  text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
37
56
  text
38
57
  end
39
-
58
+
40
59
  # Splitting.
41
60
  #
42
- def splits_text_on regexp
43
- @splits_text_on_regexp = regexp
61
+ # We allow Strings and Regexps.
62
+ # Note: We do not test against to_str since symbols do not work with String#split.
63
+ #
64
+ def splits_text_on regexp_or_string
65
+ raise ArgumentError.new "#{__method__} takes a Regexp or String as argument, not a #{regexp_or_string.class}." unless Regexp === regexp_or_string || String === regexp_or_string
66
+ @splits_text_on = regexp_or_string
44
67
  end
45
68
  def split text
46
- text.split @splits_text_on_regexp
69
+ text.split @splits_text_on
47
70
  end
48
-
71
+
49
72
  # Normalizing.
50
73
  #
74
+ # We only allow arrays.
75
+ #
51
76
  def normalizes_words regexp_replaces
77
+ raise ArgumentError.new "#{__method__} takes an Array of replaces as argument, not a #{regexp_replaces.class}." unless regexp_replaces.respond_to?(:to_ary)
52
78
  @normalizes_words_regexp_replaces = regexp_replaces
53
79
  end
54
80
  def normalize_with_patterns text
55
81
  return text unless @normalizes_words_regexp_replaces
56
-
82
+
57
83
  @normalizes_words_regexp_replaces.each do |regex, replace|
58
84
  # This should be sufficient
59
85
  #
@@ -62,28 +88,32 @@ module Internals
62
88
  remove_after_normalizing_illegals text
63
89
  text
64
90
  end
65
-
91
+
66
92
  # Illegal after normalizing.
67
93
  #
94
+ # We only allow regexps (even if string would be okay
95
+ # too for gsub! - it's too hard to understand)
96
+ #
68
97
  def removes_characters_after_splitting regexp
98
+ check_argument_in __method__, Regexp, regexp
69
99
  @removes_characters_after_splitting_regexp = regexp
70
100
  end
71
101
  def remove_after_normalizing_illegals text
72
102
  text.gsub! @removes_characters_after_splitting_regexp, EMPTY_STRING if @removes_characters_after_splitting_regexp
73
103
  end
74
-
104
+
75
105
  # Substitute Characters with this substituter.
76
106
  #
77
107
  # Default is European Character substitution.
78
108
  #
79
109
  def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
80
- # TODO Raise if it doesn't quack substitute?
110
+ raise ArgumentError.new "The substitutes_characters_with option needs a character substituter, which responds to #substitute." unless substituter.respond_to?(:substitute)
81
111
  @substituter = substituter
82
112
  end
83
113
  def substitute_characters text
84
- substituter?? substituter.substitute(text) : text
114
+ substituter?? substituter.substitute(text) : text
85
115
  end
86
-
116
+
87
117
  # Reject tokens after tokenizing based on the given criteria.
88
118
  #
89
119
  # Note: Currently only for indexing. TODO Redesign and write for both!
@@ -94,8 +124,14 @@ module Internals
94
124
  def reject tokens
95
125
  tokens.reject! &@reject_condition
96
126
  end
97
-
98
-
127
+
128
+ # Checks if the right argument type has been given.
129
+ #
130
+ def check_argument_in method, type, argument, &condition
131
+ raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
132
+ end
133
+
134
+
99
135
  # Returns a number of tokens, generated from the given text.
100
136
  #
101
137
  # Note:
@@ -109,10 +145,10 @@ module Internals
109
145
  tokens = tokens_for words # creating tokens / strings
110
146
  process tokens # processing tokens / strings
111
147
  end
112
-
148
+
113
149
  attr_reader :substituter
114
150
  alias substituter? substituter
115
-
151
+
116
152
  def initialize options = {}
117
153
  removes_characters options[:removes_characters] if options[:removes_characters]
118
154
  contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
@@ -120,16 +156,16 @@ module Internals
120
156
  normalizes_words options[:normalizes_words] if options[:normalizes_words]
121
157
  removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
122
158
  substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
123
-
159
+
124
160
  # Defaults.
125
161
  #
126
162
  splits_text_on options[:splits_text_on] || /\s/
127
163
  reject_token_if &(options[:reject_token_if] || :blank?)
128
164
  end
129
-
165
+
130
166
  # Hooks.
131
167
  #
132
-
168
+
133
169
  # Preprocessing.
134
170
  #
135
171
  def preprocess text; end
@@ -142,7 +178,7 @@ module Internals
142
178
  reject tokens # Reject any tokens that don't meet criteria
143
179
  tokens
144
180
  end
145
-
181
+
146
182
  # Converts words into real tokens.
147
183
  #
148
184
  def tokens_for words
@@ -158,9 +194,9 @@ module Internals
158
194
  def empty_tokens
159
195
  Internals::Query::Tokens.new
160
196
  end
161
-
197
+
162
198
  end
163
-
199
+
164
200
  end
165
-
201
+
166
202
  end
@@ -1,20 +1,20 @@
1
1
  module Internals
2
2
 
3
3
  module Tokenizers
4
-
4
+
5
5
  # The base indexing tokenizer.
6
6
  #
7
7
  # Override in indexing subclasses and define in configuration.
8
8
  #
9
9
  class Index < Base
10
-
10
+
11
11
  def self.default= new_default
12
12
  @default = new_default
13
13
  end
14
14
  def self.default
15
15
  @default ||= new
16
16
  end
17
-
17
+
18
18
  # Default indexing preprocessing hook.
19
19
  #
20
20
  # Does:
@@ -34,7 +34,7 @@ module Internals
34
34
  remove_non_single_stopwords text
35
35
  text
36
36
  end
37
-
37
+
38
38
  # Default indexing pretokenizing hook.
39
39
  #
40
40
  # Does:
@@ -48,16 +48,16 @@ module Internals
48
48
  word
49
49
  end
50
50
  end
51
-
51
+
52
52
  # Does not actually return a token, but a
53
53
  # symbol "token".
54
54
  #
55
55
  def token_for text
56
56
  symbolize text
57
57
  end
58
-
58
+
59
59
  end
60
-
60
+
61
61
  end
62
-
62
+
63
63
  end
data/lib/picky/loader.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # Loads the search engine and itself.
2
2
  #
3
3
  module Loader # :nodoc:all
4
-
4
+
5
5
  # Reloads the whole app.
6
6
  # First itself, then the app.
7
7
  #
@@ -21,7 +21,7 @@ module Loader # :nodoc:all
21
21
  exclaim 'Loader loading itself.'
22
22
  load __FILE__
23
23
  end
24
-
24
+
25
25
  def self.require_relative filename
26
26
  require File.join(File.dirname(__FILE__), filename)
27
27
  end
@@ -31,7 +31,7 @@ module Loader # :nodoc:all
31
31
  def self.load_internals filename_without_rb
32
32
  load File.join(File.dirname(__FILE__), "internals/#{filename_without_rb}.rb")
33
33
  end
34
-
34
+
35
35
  def self.load_user filename
36
36
  load File.join(PICKY_ROOT, "#{filename}.rb")
37
37
  end
@@ -43,14 +43,14 @@ module Loader # :nodoc:all
43
43
  load filename
44
44
  end
45
45
  end
46
-
46
+
47
47
  # Load the user's application.
48
48
  #
49
49
  def self.load_application
50
50
  # Add lib dir to load path.
51
51
  #
52
52
  # add_lib_dir
53
-
53
+
54
54
  # Picky autoloading.
55
55
  #
56
56
  begin
@@ -63,25 +63,27 @@ module Loader # :nodoc:all
63
63
  load_user_lib namespaced_class_name.underscore # Try it once.
64
64
  retry
65
65
  end
66
-
66
+
67
+ # Prepare the application for reload.
68
+ #
69
+ # TODO Application.prepare_for_reload
70
+
67
71
  # Load the user's config.
68
72
  #
69
73
  load_user 'app/logging'
70
74
  load_user 'app/application'
71
-
75
+
72
76
  # Finalize the applications.
73
77
  #
74
- # TODO Problem: Reload Routes. Throw them all away and do them again?
75
- #
76
78
  Application.finalize_apps
77
-
79
+
78
80
  # TODO Rewrite
79
81
  #
80
82
  Internals::Query::Qualifiers.instance.prepare
81
-
83
+
82
84
  exclaim "Application #{Application.apps.map(&:name).join(', ')} loaded."
83
85
  end
84
-
86
+
85
87
  # Loads the internal parts of the framework.
86
88
  # (Not for the user)
87
89
  #
@@ -89,7 +91,7 @@ module Loader # :nodoc:all
89
91
  # Load compiled C code.
90
92
  #
91
93
  load_internals 'ext/maybe_compile'
92
-
94
+
93
95
  # Load extensions.
94
96
  #
95
97
  load_internals 'extensions/object'
@@ -97,24 +99,24 @@ module Loader # :nodoc:all
97
99
  load_internals 'extensions/symbol'
98
100
  load_internals 'extensions/module'
99
101
  load_internals 'extensions/hash'
100
-
102
+
101
103
  # Requiring Helpers
102
104
  #
103
105
  load_internals 'helpers/measuring'
104
-
106
+
105
107
  # Calculations.
106
108
  #
107
109
  load_internals 'calculations/location'
108
-
110
+
109
111
  # Index generation strategies.
110
112
  #
111
113
  load_internals 'indexers/no_source_specified_error'
112
114
  load_internals 'indexers/serial'
113
-
115
+
114
116
  # Generators.
115
117
  #
116
118
  load_internals 'generators/strategy'
117
-
119
+
118
120
  # Partial index generation strategies.
119
121
  #
120
122
  load_internals 'generators/partial/strategy'
@@ -127,37 +129,37 @@ module Loader # :nodoc:all
127
129
  load_internals 'generators/weights/strategy'
128
130
  load_internals 'generators/weights/logarithmic'
129
131
  load_internals 'generators/weights/default'
130
-
132
+
131
133
  # Similarity index generation strategies.
132
134
  #
133
135
  load_internals 'generators/similarity/strategy'
134
136
  load_internals 'generators/similarity/none'
135
137
  load_internals 'generators/similarity/double_levenshtone'
136
138
  load_internals 'generators/similarity/default'
137
-
139
+
138
140
  # Index generators.
139
141
  #
140
142
  load_internals 'generators/base'
141
143
  load_internals 'generators/partial_generator'
142
144
  load_internals 'generators/weights_generator'
143
145
  load_internals 'generators/similarity_generator'
144
-
146
+
145
147
  # Index store handling.
146
148
  #
147
149
  load_internals 'index/backend'
148
-
150
+
149
151
  load_internals 'index/redis'
150
152
  load_internals 'index/redis/basic'
151
153
  load_internals 'index/redis/list_hash'
152
154
  load_internals 'index/redis/string_hash'
153
-
155
+
154
156
  load_internals 'index/file/basic'
155
157
  load_internals 'index/file/text'
156
158
  load_internals 'index/file/marshal'
157
159
  load_internals 'index/file/json'
158
-
160
+
159
161
  load_internals 'index/files'
160
-
162
+
161
163
  # Indexing and Indexed things.
162
164
  #
163
165
  load_internals 'indexing/bundle/super_base' # TODO Remove.
@@ -167,68 +169,68 @@ module Loader # :nodoc:all
167
169
  load_internals 'indexing/category'
168
170
  load_internals 'indexing/categories'
169
171
  load_internals 'indexing/index'
170
-
172
+
171
173
  load_internals 'indexed/bundle/base'
172
174
  load_internals 'indexed/bundle/memory'
173
175
  load_internals 'indexed/bundle/redis'
174
176
  load_internals 'indexed/category'
175
177
  load_internals 'indexed/categories'
176
178
  load_internals 'indexed/index'
177
-
179
+
178
180
  # TODO Ok here?
179
181
  #
180
182
  load_internals 'indexed/wrappers/exact_first'
181
-
183
+
182
184
  # Bundle Wrapper
183
185
  #
184
186
  load_internals 'indexed/wrappers/bundle/wrapper'
185
187
  load_internals 'indexed/wrappers/bundle/calculation'
186
188
  load_internals 'indexed/wrappers/bundle/location'
187
-
189
+
188
190
  # Tokens.
189
191
  #
190
192
  load_internals 'query/token'
191
193
  load_internals 'query/tokens'
192
-
194
+
193
195
  # Tokenizers types.
194
196
  #
195
197
  load_internals 'tokenizers/base'
196
198
  load_internals 'tokenizers/index'
197
199
  load_internals 'tokenizers/query'
198
-
200
+
199
201
  # Query combinations, qualifiers, weigher.
200
202
  #
201
203
  load_internals 'query/combination'
202
204
  load_internals 'query/combinations/base'
203
205
  load_internals 'query/combinations/memory'
204
206
  load_internals 'query/combinations/redis'
205
-
207
+
206
208
  load_internals 'query/allocation'
207
209
  load_internals 'query/allocations'
208
-
210
+
209
211
  load_internals 'query/qualifiers'
210
-
212
+
211
213
  load_internals 'query/weights'
212
-
214
+
213
215
  load_internals 'query/indexes'
214
-
216
+
215
217
  # Results.
216
218
  #
217
219
  load_internals 'results/base'
218
220
  load_internals 'results/full'
219
221
  load_internals 'results/live'
220
-
222
+
221
223
  # Configuration.
222
224
  #
223
225
  load_internals 'configuration/index'
224
-
226
+
225
227
  # Adapters.
226
228
  #
227
229
  load_internals 'adapters/rack/base'
228
230
  load_internals 'adapters/rack/query'
229
231
  load_internals 'adapters/rack/live_parameters'
230
232
  load_internals 'adapters/rack'
231
-
233
+
232
234
  # Routing.
233
235
  #
234
236
  load_internals 'frontend_adapters/rack'
@@ -239,35 +241,39 @@ module Loader # :nodoc:all
239
241
  # Load harakiri.
240
242
  #
241
243
  load_relative 'rack/harakiri'
242
-
244
+
245
+ # Load analyzer.
246
+ #
247
+ load_relative 'analyzer'
248
+
243
249
  # Character Substituters
244
250
  #
245
251
  load_relative 'character_substituters/west_european'
246
-
252
+
247
253
  # Signal handling
248
254
  #
249
255
  load_relative 'signals'
250
-
256
+
251
257
  # Logging.
252
258
  #
253
259
  load_relative 'loggers/search'
254
-
260
+
255
261
  # Convenience accessors for generators.
256
262
  #
257
263
  load_relative 'generators/aliases'
258
-
264
+
259
265
  # API.
260
266
  #
261
267
  load_relative 'index/base'
262
268
  load_relative 'index/memory'
263
269
  load_relative 'index/redis'
264
-
270
+
265
271
  load_relative 'indexing/indexes'
266
272
  load_relative 'indexed/indexes'
267
-
273
+
268
274
  load_relative 'index_bundle'
269
275
  load_relative 'aliases'
270
-
276
+
271
277
  # Query.
272
278
  #
273
279
  load_relative 'query/base'
@@ -275,7 +281,7 @@ module Loader # :nodoc:all
275
281
  load_relative 'query/full'
276
282
  #
277
283
  # load_relative 'query/solr'
278
-
284
+
279
285
  # Sources.
280
286
  #
281
287
  load_relative 'sources/base'
@@ -283,23 +289,23 @@ module Loader # :nodoc:all
283
289
  load_relative 'sources/csv'
284
290
  load_relative 'sources/delicious'
285
291
  load_relative 'sources/couch'
286
-
292
+
287
293
  load_relative 'sources/wrappers/base'
288
294
  load_relative 'sources/wrappers/location'
289
-
295
+
290
296
  # Interfaces
291
297
  #
292
298
  load_relative 'interfaces/live_parameters'
293
-
299
+
294
300
  # Application.
295
301
  #
296
302
  load_relative 'application'
297
-
303
+
298
304
  # Load tools. Load in specific case?
299
305
  #
300
306
  load_relative 'cores'
301
307
  end
302
-
308
+
303
309
  # Loads the framework.
304
310
  #
305
311
  def self.load_framework