picky 3.0.1 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. data/lib/picky/application.rb +12 -12
  2. data/lib/picky/backends/backend.rb +17 -0
  3. data/lib/picky/{backend → backends}/file/basic.rb +1 -1
  4. data/lib/picky/{backend → backends}/file/json.rb +1 -1
  5. data/lib/picky/{backend → backends}/file/marshal.rb +1 -1
  6. data/lib/picky/{backend → backends}/file/text.rb +1 -1
  7. data/lib/picky/backends/memory.rb +53 -0
  8. data/lib/picky/{backend → backends}/redis/basic.rb +9 -14
  9. data/lib/picky/backends/redis/float_hash.rb +26 -0
  10. data/lib/picky/{backend → backends}/redis/list_hash.rb +7 -11
  11. data/lib/picky/{backend → backends}/redis/string_hash.rb +7 -11
  12. data/lib/picky/backends/redis.rb +87 -0
  13. data/lib/picky/bundle.rb +107 -11
  14. data/lib/picky/category.rb +5 -5
  15. data/lib/picky/index.rb +329 -0
  16. data/lib/picky/index_indexed.rb +31 -0
  17. data/lib/picky/index_indexing.rb +161 -0
  18. data/lib/picky/indexed/bundle.rb +112 -0
  19. data/lib/picky/indexed/wrappers/exact_first.rb +1 -1
  20. data/lib/picky/indexers/parallel.rb +2 -1
  21. data/lib/picky/indexers/serial.rb +2 -1
  22. data/lib/picky/indexes_indexing.rb +1 -1
  23. data/lib/picky/indexing/bundle.rb +188 -0
  24. data/lib/picky/indexing/wrappers/category/location.rb +1 -1
  25. data/lib/picky/interfaces/live_parameters.rb +8 -8
  26. data/lib/picky/loader.rb +24 -38
  27. data/lib/picky/migrations/from_30_to_31.rb +61 -0
  28. data/lib/picky/query/allocation.rb +10 -5
  29. data/lib/picky/query/combinations.rb +70 -0
  30. data/lib/picky/query/indexes.rb +8 -7
  31. data/lib/picky/query/indexes_check.rb +47 -0
  32. data/lib/picky/query/token.rb +16 -29
  33. data/lib/picky/query/tokens.rb +4 -20
  34. data/lib/picky/search.rb +51 -58
  35. data/lib/picky/tokenizer.rb +231 -0
  36. data/lib/picky/tokenizers/location.rb +1 -1
  37. data/lib/tasks/try.rake +4 -12
  38. data/lib/tasks/try.rb +37 -0
  39. data/spec/lib/application_spec.rb +5 -5
  40. data/spec/lib/{backend → backends}/file/basic_spec.rb +2 -2
  41. data/spec/lib/{backend → backends}/file/json_spec.rb +2 -2
  42. data/spec/lib/{backend → backends}/file/marshal_spec.rb +2 -2
  43. data/spec/lib/{backend → backends}/file/text_spec.rb +1 -1
  44. data/spec/lib/backends/memory_spec.rb +77 -0
  45. data/spec/lib/{backend → backends}/redis/basic_spec.rb +19 -21
  46. data/spec/lib/backends/redis/float_hash_spec.rb +38 -0
  47. data/spec/lib/backends/redis/list_hash_spec.rb +27 -0
  48. data/spec/lib/backends/redis/string_hash_spec.rb +38 -0
  49. data/spec/lib/backends/redis_spec.rb +79 -0
  50. data/spec/lib/categories_indexed_spec.rb +3 -3
  51. data/spec/lib/category_indexed_spec.rb +6 -6
  52. data/spec/lib/category_indexing_spec.rb +1 -1
  53. data/spec/lib/category_spec.rb +1 -1
  54. data/spec/lib/frontend_adapters/rack_spec.rb +2 -2
  55. data/spec/lib/{indexes/index_indexed_spec.rb → index_indexed_spec.rb} +1 -1
  56. data/spec/lib/{indexes/index_indexing_spec.rb → index_indexing_spec.rb} +1 -1
  57. data/spec/lib/{indexes/index_spec.rb → index_spec.rb} +1 -1
  58. data/spec/lib/indexed/{bundle/memory_spec.rb → memory_spec.rb} +18 -18
  59. data/spec/lib/indexed/wrappers/exact_first_spec.rb +2 -2
  60. data/spec/lib/indexing/{bundle/memory_partial_generation_speed_spec.rb → bundle_partial_generation_speed_spec.rb} +3 -3
  61. data/spec/lib/indexing/bundle_spec.rb +302 -0
  62. data/spec/lib/query/allocation_spec.rb +21 -11
  63. data/spec/lib/query/combination_spec.rb +2 -2
  64. data/spec/lib/query/{combinations/base_spec.rb → combinations_spec.rb} +1 -1
  65. data/spec/lib/query/indexes_check_spec.rb +25 -0
  66. data/spec/lib/query/indexes_spec.rb +5 -1
  67. data/spec/lib/query/token_spec.rb +18 -20
  68. data/spec/lib/query/tokens_spec.rb +14 -65
  69. data/spec/lib/search_spec.rb +36 -37
  70. data/spec/lib/tasks/try_spec.rb +51 -0
  71. data/spec/lib/{tokenizers/base_spec.rb → tokenizer_spec.rb} +15 -44
  72. metadata +64 -81
  73. data/lib/picky/backend/base.rb +0 -121
  74. data/lib/picky/backend/files.rb +0 -28
  75. data/lib/picky/backend/redis.rb +0 -44
  76. data/lib/picky/indexed/bundle/base.rb +0 -47
  77. data/lib/picky/indexed/bundle/memory.rb +0 -88
  78. data/lib/picky/indexed/bundle/redis.rb +0 -91
  79. data/lib/picky/indexes/index.rb +0 -328
  80. data/lib/picky/indexes/index_indexed.rb +0 -35
  81. data/lib/picky/indexes/index_indexing.rb +0 -165
  82. data/lib/picky/indexes/memory.rb +0 -20
  83. data/lib/picky/indexes/redis.rb +0 -20
  84. data/lib/picky/indexing/bundle/base.rb +0 -242
  85. data/lib/picky/indexing/bundle/memory.rb +0 -26
  86. data/lib/picky/indexing/bundle/redis.rb +0 -26
  87. data/lib/picky/query/combinations/base.rb +0 -74
  88. data/lib/picky/query/combinations/memory.rb +0 -52
  89. data/lib/picky/query/combinations/redis.rb +0 -90
  90. data/lib/picky/query.rb +0 -6
  91. data/lib/picky/tokenizers/base.rb +0 -231
  92. data/lib/picky/tokenizers/index.rb +0 -34
  93. data/lib/picky/tokenizers/query.rb +0 -61
  94. data/spec/lib/backend/files_spec.rb +0 -189
  95. data/spec/lib/backend/redis/list_hash_spec.rb +0 -40
  96. data/spec/lib/backend/redis/string_hash_spec.rb +0 -47
  97. data/spec/lib/backend/redis_spec.rb +0 -170
  98. data/spec/lib/indexed/bundle/redis_spec.rb +0 -41
  99. data/spec/lib/indexes/redis_spec.rb +0 -15
  100. data/spec/lib/indexing/bundle/base_spec.rb +0 -38
  101. data/spec/lib/indexing/bundle/memory_spec.rb +0 -287
  102. data/spec/lib/indexing/bundle/redis_spec.rb +0 -283
  103. data/spec/lib/query/combinations/memory_spec.rb +0 -158
  104. data/spec/lib/query/combinations/redis_spec.rb +0 -172
  105. data/spec/lib/tokenizers/index_spec.rb +0 -69
  106. data/spec/lib/tokenizers/query_spec.rb +0 -121
@@ -0,0 +1,329 @@
1
+ module Picky
2
+
3
+ # = Picky Indexes
4
+ #
5
+ # A Picky Index defines
6
+ # * what backend it uses.
7
+ # * where its data comes from (a data source).
8
+ # * how this data is indexed.
9
+ # * a number of categories that may or may not map directly to data categories.
10
+ #
11
+ # == Howto
12
+ #
13
+ # This is a step-by-step description on how to create an index.
14
+ #
15
+ # Start by creating an <tt>Index</tt>.
16
+ # In the example, we will be using an in-memory index, <tt>Index</tt>.
17
+ #
18
+ # books = Index.new(:books)
19
+ #
20
+ # That in itself won't do much good, that's why we add a data source:
21
+ #
22
+ # books = Index.new(:books) do
23
+ # source Sources::CSV.new(:title, :author, file: 'data/books.csv')
24
+ # end
25
+ #
26
+ # In the example, we use an explicit <tt>Sources::CSV</tt> of Picky.
27
+ # However, anything that responds to <tt>#each</tt>, and returns an object that
28
+ # answers to <tt>#id</tt>, works.
29
+ #
30
+ # For example, a 3.0 ActiveRecord class:
31
+ #
32
+ # books = Index.new(:books) do
33
+ # source Book.order('isbn ASC')
34
+ # end
35
+ #
36
+ # Now we know where the data comes from, but not how to categorize it.
37
+ #
38
+ # Let's add a few categories:
39
+ #
40
+ # books = Index.new(:books) do
41
+ # source Book.order('isbn ASC')
42
+ # category :title
43
+ # category :author
44
+ # category :isbn
45
+ # end
46
+ #
47
+ # Categories offer quite a few options, see <tt>Index#category</tt> for details.
48
+ #
49
+ # After adding more options, it might look like this:
50
+ #
51
+ # books = Index.new(:books) do
52
+ # source Book.order('isbn ASC')
53
+ # category :title,
54
+ # partial: Partial::Substring.new(from: 1),
55
+ # similarity: Similarity::DoubleMetaphone.new(3),
56
+ # qualifiers: [:t, :title, :titulo]
57
+ # category :author,
58
+ # similarity: Similarity::Metaphone.new(2)
59
+ # category :isbn,
60
+ # partial: Partial::None.new,
61
+ # from: :legacy_isbn_name
62
+ # end
63
+ #
64
+ # For this to work, a <tt>Book</tt> should support methods <tt>#title</tt>, <tt>#author</tt> and <tt>#legacy_isbn_name</tt>.
65
+ #
66
+ # If it uses <tt>String</tt> ids, use <tt>#key_format</tt> to define a formatting method:
67
+ #
68
+ # books = Index.new(:books) do
69
+ # key_format :to_s
70
+ # source Book.order('isbn ASC')
71
+ # category :title
72
+ # category :author
73
+ # category :isbn
74
+ # end
75
+ #
76
+ # Finally, use the index for a <tt>Search</tt>:
77
+ #
78
+ # route %r{^/media$} => Search.new(books, dvds, mp3s)
79
+ #
80
+ # This class defines the indexing and index API that is exposed to the user
81
+ # as the #index method inside the Application class.
82
+ #
83
+ # It provides a single front for both indexing and index options. We suggest to always use the index API.
84
+ #
85
+ # Note: An Index holds both an *Indexed*::*Index* and an *Indexing*::*Index*.
86
+ #
87
+ class Index
88
+
89
+ attr_reader :name,
90
+ :categories
91
+
92
+ delegate :[],
93
+ :each_category,
94
+ :to => :categories
95
+
96
+ # Create a new index with a given source.
97
+ #
98
+ # === Parameters
99
+ # * name: A name that will be used for the index directory and in the Picky front end.
100
+ #
101
+ # === Options (all are used in the block, see examples)
102
+ # * source: Where the data comes from, e.g. Sources::CSV.new(...). Optional, can be defined in the block using #source.
103
+ # * result_identifier: Use if you'd like a different identifier/name in the results than the name of the index.
104
+ # * after_indexing: As of this writing only used in the db source. Executes the given after_indexing as SQL after the indexing process.
105
+ # * tokenizer: Call and pass either a tokenizer (responds to #tokenize) or the options for a tokenizer.
106
+ # * key_format: Call and pass in a format method for the ids (default is #to_i).
107
+ #
108
+ # Example:
109
+ # my_index = Index.new(:my_index) do
110
+ # source Sources::CSV.new(file: 'data/index.csv')
111
+ # key_format :to_sym
112
+ # category :bla
113
+ # result_identifier :my_special_results
114
+ # end
115
+ #
116
+ def initialize name, options = {}
117
+ @name = name.to_sym
118
+
119
+ # TODO Move ignore_unassigned_tokens to query, somehow. Then, remove options.
120
+ #
121
+ @categories = Categories.new ignore_unassigned_tokens: (options[:ignore_unassigned_tokens] || false)
122
+
123
+ # Centralized registry.
124
+ #
125
+ Indexes.register self
126
+
127
+ instance_eval(&Proc.new) if block_given?
128
+ end
129
+
130
+ # API method.
131
+ #
132
+ # Sets/returns the backend used.
133
+ # Default is @Backends::Memory.new@.
134
+ #
135
+ def backend backend = nil
136
+ if backend
137
+ @backend = backend
138
+ else
139
+ @backend || Backends::Memory.new
140
+ end
141
+ end
142
+
143
+ # Defines a searchable category on the index.
144
+ #
145
+ # === Parameters
146
+ # * category_name: This identifier is used in the front end, but also to categorize query text. For example, “title:hobbit” will narrow the hobbit query on categories with the identifier :title.
147
+ #
148
+ # === Options
149
+ # * partial: Partial::None.new or Partial::Substring.new(from: starting_char, to: ending_char). Default is Partial::Substring.new(from: -3, to: -1).
150
+ # * similarity: Similarity::None.new or Similarity::DoubleMetaphone.new(similar_words_searched). Default is Similarity::None.new.
151
+ # * qualifiers: An array of qualifiers with which you can define which category you’d like to search, for example “title:hobbit” will search for hobbit in just title categories. Example: qualifiers: [:t, :titre, :title] (use it for example with multiple languages). Default is the name of the category.
152
+ # * qualifier: Convenience option if you just need a single qualifier, see above. Example: qualifier: :title. Default is the name of the category.
153
+ # * source: Use a different source than the index uses. If you think you need that, there might be a better solution to your problem. Please post to the mailing list first with your application.rb :)
154
+ # * from: Take the data from the data category with this name. Example: You have a source Sources::CSV.new(:title, file:'some_file.csv') but you want the category to be called differently. Then you use from: define_category(:similar_title, :from => :title).
155
+ #
156
+ def category category_name, options = {}
157
+ new_category = Category.new category_name.to_sym, self, options
158
+ categories << new_category
159
+
160
+ new_category = yield new_category if block_given?
161
+
162
+ new_category
163
+ end
164
+ alias define_category category
165
+
166
+ # Make this category range searchable with a fixed range. If you need other
167
+ # ranges, define another category with a different range value.
168
+ #
169
+ # Example:
170
+ # You have data values inside 1..100, and you want to have Picky return
171
+ # not only the results for 47 if you search for 47, but also results for
172
+ # 45, 46, or 47.2, 48.9, in a range of 2 around 47, so (45..49).
173
+ #
174
+ # Then you use:
175
+ # ranged_category :values_inside_1_100, 2
176
+ #
177
+ # Optionally, you give it a precision value to reduce the error margin
178
+ # around 47 (Picky is a bit liberal).
179
+ # Index.new :range do
180
+ # ranged_category :values_inside_1_100, 2, precision: 5
181
+ # end
182
+ #
183
+ # This will force Picky to maximally be wrong 5% of the given range value
184
+ # (5% of 2 = 0.1) instead of the default 20% (20% of 2 = 0.4).
185
+ #
186
+ # We suggest not to use much more than 5 as a higher precision is more
187
+ # performance intensive for less and less precision gain.
188
+ #
189
+ # == Protip 1
190
+ #
191
+ # Create two ranged categories to make an area search:
192
+ # Index.new :area do
193
+ # ranged_category :x, 1
194
+ # ranged_category :y, 1
195
+ # end
196
+ #
197
+ # Search for it using for example:
198
+ # x:133, y:120
199
+ #
200
+ # This will search this square area (* = 133, 120: The "search" point entered):
201
+ #
202
+ # 132 134
203
+ # | |
204
+ # --|---------|-- 121
205
+ # | |
206
+ # | * |
207
+ # | |
208
+ # --|---------|-- 119
209
+ # | |
210
+ #
211
+ # Note: The area does not need to be square, but can be rectangular.
212
+ #
213
+ # == Protip 2
214
+ #
215
+ # Create three ranged categories to make a volume search.
216
+ #
217
+ # Or go crazy and use 4 ranged categories for a space/time search! ;)
218
+ #
219
+ # === Parameters
220
+ # * category_name: The category_name as used in #define_category.
221
+ # * range: The range (in the units of your data values) around the query point where we search for results.
222
+ #
223
+ # -----|<- range ->*------------|-----
224
+ #
225
+ # === Options
226
+ # * precision: Default is 1 (20% error margin, very fast), up to 5 (5% error margin, slower) makes sense.
227
+ # * ... all options of #define_category.
228
+ #
229
+ def ranged_category category_name, range, options = {}
230
+ precision = options[:precision] || 1 # THINK options.delete?
231
+
232
+ # Note: :key_format => :to_f ?
233
+ #
234
+ options = { partial: Partial::None.new }.merge options
235
+
236
+ define_category category_name, options do |category|
237
+ Indexing::Wrappers::Category::Location.install_on category, range, precision
238
+ Indexed::Wrappers::Category::Location.install_on category, range, precision
239
+ end
240
+ end
241
+ alias define_ranged_category ranged_category
242
+
243
+ # HIGHLY EXPERIMENTAL Not correctly working yet. Try it if you feel "beta".
244
+ #
245
+ # Also a range search (see #ranged_category), but on the earth's surface.
246
+ #
247
+ # Parameters:
248
+ # * lat_name: The latitude's name as used in #define_category.
249
+ # * lng_name: The longitude's name as used in #define_category.
250
+ # * radius: The distance (in km) around the query point which we search for results.
251
+ #
252
+ # Note: Picky uses a square, not a circle. That should be ok for most usages.
253
+ #
254
+ # -----------------------------
255
+ # | |
256
+ # | |
257
+ # | |
258
+ # | |
259
+ # | |
260
+ # | *<- radius ->|
261
+ # | |
262
+ # | |
263
+ # | |
264
+ # | |
265
+ # | |
266
+ # -----------------------------
267
+ #
268
+ # Options
269
+ # * precision: Default 1 (20% error margin, very fast), up to 5 (5% error margin, slower) makes sense.
270
+ # * lat_from: The data category to take the data for the latitude from.
271
+ # * lng_from: The data category to take the data for the longitude from.
272
+ #
273
+ # TODO Will have to write a wrapper that combines two categories that are
274
+ # indexed simultaneously, since lat/lng are correlated.
275
+ #
276
+ def geo_categories lat_name, lng_name, radius, options = {} # :nodoc:
277
+
278
+ # Extract lat/lng specific options.
279
+ #
280
+ lat_from = options.delete :lat_from
281
+ lng_from = options.delete :lng_from
282
+
283
+ # One can be a normal ranged_category.
284
+ #
285
+ ranged_category lat_name, radius*0.00898312, options.merge(from: lat_from)
286
+
287
+ # The other needs to adapt the radius depending on the one.
288
+ #
289
+ # Depending on the latitude, the radius of the longitude
290
+ # needs to enlarge, the closer we get to the pole.
291
+ #
292
+ # In our simplified case, the radius is given as if all the
293
+ # locations were on the 45 degree line.
294
+ #
295
+ # This calculates km -> longitude (degrees).
296
+ #
297
+ # A degree of longitude is assumed to be equal to ~55.6599 km here (1 / 0.01796624).
298
+ # So a km on the 45 degree line is equal to 0.01796624 degrees.
299
+ #
300
+ ranged_category lng_name, radius*0.01796624, options.merge(from: lng_from)
301
+
302
+ end
303
+ alias define_geo_categories geo_categories
304
+
305
+ def to_stats # :nodoc:
306
+ stats = <<-INDEX
307
+ #{name} (#{self.class}):
308
+ #{"source: #{source}".indented_to_s}
309
+ #{"categories: #{categories.map(&:name).join(', ')}".indented_to_s}
310
+ INDEX
311
+ stats << " result identifier: \"#{result_identifier}\"".indented_to_s unless result_identifier.to_s == name.to_s
312
+ stats
313
+ end
314
+
315
+ # Identifier used for technical output.
316
+ #
317
+ def identifier
318
+ "#{PICKY_ENVIRONMENT}:#{name}"
319
+ end
320
+
321
+ #
322
+ #
323
+ def to_s
324
+ "#{self.class}(#{name}, result_id: #{result_identifier}, source: #{@source}, categories: #{categories})"
325
+ end
326
+
327
+ end
328
+
329
+ end
@@ -0,0 +1,31 @@
1
+ module Picky
2
+
3
+ #
4
+ #
5
+ class Index
6
+
7
+ attr_reader :combinator
8
+
9
+ delegate :load_from_cache,
10
+ :analyze,
11
+ :reindex,
12
+ :possible_combinations,
13
+ :to => :categories
14
+
15
+ alias reload load_from_cache
16
+
17
+ # Define how the results of this index are identified.
18
+ # (Shown in the client, for example)
19
+ #
20
+ # Default is the name of the index.
21
+ #
22
+ def result_identifier result_identifier = nil
23
+ result_identifier ? define_result_identifier(result_identifier) : (@result_identifier || @name)
24
+ end
25
+ def define_result_identifier result_identifier
26
+ @result_identifier = result_identifier
27
+ end
28
+
29
+ end
30
+
31
+ end
@@ -0,0 +1,161 @@
1
+ module Picky
2
+
3
+ #
4
+ #
5
+ class Index
6
+
7
+ attr_reader :bundle_class
8
+
9
+ # Delegators for indexing.
10
+ #
11
+ delegate :cache,
12
+ :check,
13
+ :clear,
14
+ :backup,
15
+ :restore,
16
+ :to => :categories
17
+
18
+ # Calling index on an index will call index
19
+ # on every category.
20
+ #
21
+ # Decides whether to use a parallel indexer or whether to
22
+ # delegate to each category to index themselves.
23
+ #
24
+ def index
25
+ if source.respond_to?(:each)
26
+ check_source_empty
27
+ index_in_parallel
28
+ else
29
+ with_data_snapshot do
30
+ categories.each &:index
31
+ end
32
+ end
33
+ end
34
+
35
+ # Define an index tokenizer on the index.
36
+ #
37
+ # Parameters are the exact same as for indexing.
38
+ #
39
+ def indexing options = {}
40
+ @tokenizer = if options.respond_to?(:tokenize)
41
+ options
42
+ else
43
+ options && Tokenizer.new(options)
44
+ end
45
+ end
46
+ alias define_indexing indexing
47
+
48
+ # Check if the given enumerable source is empty.
49
+ #
50
+ # Note: Checking as early as possible to tell the
51
+ # user as early as possible.
52
+ #
53
+ def check_source_empty
54
+ warn %Q{\n\033[1mWarning\033[m, source for index "#{name}" is empty: #{source} (responds true to empty?).\n} if source.respond_to?(:empty?) && source.empty?
55
+ end
56
+
57
+ # Note: Duplicated in category_indexing.rb.
58
+ #
59
+ # Take a data snapshot if the source offers it.
60
+ #
61
+ def with_data_snapshot
62
+ if source.respond_to? :with_snapshot
63
+ source.with_snapshot(self) do
64
+ yield
65
+ end
66
+ else
67
+ yield
68
+ end
69
+ end
70
+
71
+ # Indexes the categories in parallel.
72
+ #
73
+ # Only use where the category does have a #each source defined.
74
+ #
75
+ def index_in_parallel
76
+ indexer = Indexers::Parallel.new self
77
+ indexer.index categories
78
+ categories.each &:cache
79
+ end
80
+
81
+ # Returns the installed tokenizer or the default.
82
+ #
83
+ def tokenizer
84
+ @tokenizer || Indexes.tokenizer
85
+ end
86
+
87
+ # Define a source on the index.
88
+ #
89
+ # Parameter is a source, either one of the standard sources or
90
+ # anything responding to #each and returning objects that
91
+ # respond to id and the category names (or the category from option).
92
+ #
93
+ def source some_source = nil, &block
94
+ some_source ||= block
95
+ some_source ? define_source(some_source) : (@source && extract_source || raise_no_source)
96
+ end
97
+ # Extract the actual source if it is wrapped in a time
98
+ # capsule, i.e. a block/lambda.
99
+ #
100
+ # TODO Extract into module.
101
+ #
102
+ def extract_source
103
+ @source = @source.respond_to?(:call) ? @source.call : @source
104
+ end
105
+ def define_source source
106
+ check_source source
107
+ @source = source
108
+ end
109
+ def raise_no_source
110
+ raise NoSourceSpecifiedException.new(<<-NO_SOURCE
111
+
112
+
113
+ No source given for index #{name}. An index needs a source.
114
+ Example:
115
+ Index.new(:with_source) do
116
+ source Sources::CSV.new(:title, file: 'data/books.csv')
117
+ category :title
118
+ category :author
119
+ end
120
+
121
+ NO_SOURCE
122
+ )
123
+ end
124
+ def check_source source # :nodoc:
125
+ raise ArgumentError.new(<<-SOURCE
126
+
127
+
128
+ The index "#{name}" should use a data source that responds to either the method #each, or the method #harvest, which yields(id, text), OR it can be a lambda/block, returning such a source.
129
+ Or it could use one of the built-in sources:
130
+ Sources::#{(Sources.constants - [:Base, :Wrappers, :NoCSVFileGiven, :NoCouchDBGiven]).join(',
131
+ Sources::')}
132
+
133
+
134
+ SOURCE
135
+ ) unless source.respond_to?(:each) || source.respond_to?(:harvest) || source.respond_to?(:call)
136
+ end
137
+
138
+ # Define a key_format on the index.
139
+ #
140
+ # Parameter is a method name to use on the key (e.g. :to_i, :to_s, :strip).
141
+ #
142
+ def key_format format = nil
143
+ format ? define_key_format(format) : @key_format
144
+ end
145
+ def define_key_format key_format
146
+ @key_format = key_format
147
+ end
148
+
149
+ # Define what to do after indexing.
150
+ # (Only used in the Sources::DB)
151
+ #
152
+ def after_indexing after_indexing = nil
153
+ after_indexing ? define_after_indexing(after_indexing) : @after_indexing
154
+ end
155
+ def define_after_indexing after_indexing
156
+ @after_indexing = after_indexing
157
+ end
158
+
159
+ end
160
+
161
+ end
@@ -0,0 +1,112 @@
1
+ module Picky
2
+
3
+ module Indexed # :nodoc:all
4
+
5
+ # An indexed bundle is a number of memory/redis
6
+ # indexes that compose the indexes for a single category:
7
+ # * core (inverted) index
8
+ # * weights index
9
+ # * similarity index
10
+ # * index configuration
11
+ #
12
+ # Indexed refers to them being indexed.
13
+ # This class notably offers the methods:
14
+ # * load
15
+ # * clear
16
+ #
17
+ # To (re)load or clear the current indexes.
18
+ #
19
+ class Bundle < Picky::Bundle
20
+
21
+ # Get the ids for the given symbol.
22
+ #
23
+ # Returns a (potentially empty) array of ids.
24
+ #
25
+ def ids sym
26
+ @inverted[sym] || []
27
+ end
28
+
29
+ # Get a weight for the given symbol.
30
+ #
31
+ # Returns a number, or nil.
32
+ #
33
+ def weight sym
34
+ @weights[sym]
35
+ end
36
+
37
+ # Get settings for this bundle.
38
+ #
39
+ # Returns an object.
40
+ #
41
+ def [] sym
42
+ @configuration[sym]
43
+ end
44
+
45
+ # Loads all indexes.
46
+ #
47
+ # Loading loads index objects from the backend.
48
+ # They should each respond to [] and return something appropriate.
49
+ #
50
+ def load
51
+ load_inverted
52
+ load_weights
53
+ load_similarity
54
+ load_configuration
55
+ end
56
+
57
+ # Loads the core index.
58
+ #
59
+ def load_inverted
60
+ self.inverted = @backend_inverted.load
61
+ end
62
+ # Loads the weights index.
63
+ #
64
+ def load_weights
65
+ self.weights = @backend_weights.load
66
+ end
67
+ # Loads the similarity index.
68
+ #
69
+ def load_similarity
70
+ self.similarity = @backend_similarity.load
71
+ end
72
+ # Loads the configuration.
73
+ #
74
+ def load_configuration
75
+ self.configuration = @backend_configuration.load
76
+ end
77
+
78
+ # Clears all indexes.
79
+ #
80
+ def clear
81
+ clear_inverted
82
+ clear_weights
83
+ clear_similarity
84
+ clear_configuration
85
+ end
86
+
87
+ # Clears the core index.
88
+ #
89
+ def clear_inverted
90
+ inverted.clear
91
+ end
92
+ # Clears the weights index.
93
+ #
94
+ def clear_weights
95
+ weights.clear
96
+ end
97
+ # Clears the similarity index.
98
+ #
99
+ def clear_similarity
100
+ similarity.clear
101
+ end
102
+ # Clears the configuration.
103
+ #
104
+ def clear_configuration
105
+ configuration.clear
106
+ end
107
+
108
+ end
109
+
110
+ end
111
+
112
+ end
@@ -9,7 +9,7 @@ module Picky
9
9
  # This index combines an exact and partial index.
10
10
  # It serves to order the results such that exact hits are found first.
11
11
  #
12
- class ExactFirst < Indexed::Bundle::Base
12
+ class ExactFirst < Indexed::Bundle
13
13
 
14
14
  delegate :similar,
15
15
  :identifier,
@@ -45,7 +45,8 @@ module Picky
45
45
  # Is it a good idea that not the tokenizer has control over when he gets the next text?
46
46
  #
47
47
  combined.each do |category, cache, _, tokenizer|
48
- tokenizer.tokenize(object.send(category.from).to_s).each do |token_text|
48
+ tokens, _ = tokenizer.tokenize object.send(category.from).to_s # Note: Originals not needed.
49
+ tokens.each do |token_text|
49
50
  next unless token_text
50
51
  cache << id << comma << token_text << newline
51
52
  end
@@ -28,7 +28,8 @@ module Picky
28
28
  result = []
29
29
 
30
30
  source.harvest(category) do |indexed_id, text|
31
- tokenizer.tokenize(text).each do |token_text|
31
+ tokens, _ = tokenizer.tokenize text # Note: Originals not needed.
32
+ tokens.each do |token_text|
32
33
  next unless token_text
33
34
  result << indexed_id << comma << token_text << newline
34
35
  end
@@ -42,7 +42,7 @@ module Picky
42
42
  #
43
43
  #
44
44
  def tokenizer
45
- Tokenizers::Index.default
45
+ Tokenizer.index_default
46
46
  end
47
47
 
48
48
  end