picky 3.0.1 → 3.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (106) hide show
  1. data/lib/picky/application.rb +12 -12
  2. data/lib/picky/backends/backend.rb +17 -0
  3. data/lib/picky/{backend → backends}/file/basic.rb +1 -1
  4. data/lib/picky/{backend → backends}/file/json.rb +1 -1
  5. data/lib/picky/{backend → backends}/file/marshal.rb +1 -1
  6. data/lib/picky/{backend → backends}/file/text.rb +1 -1
  7. data/lib/picky/backends/memory.rb +53 -0
  8. data/lib/picky/{backend → backends}/redis/basic.rb +9 -14
  9. data/lib/picky/backends/redis/float_hash.rb +26 -0
  10. data/lib/picky/{backend → backends}/redis/list_hash.rb +7 -11
  11. data/lib/picky/{backend → backends}/redis/string_hash.rb +7 -11
  12. data/lib/picky/backends/redis.rb +87 -0
  13. data/lib/picky/bundle.rb +107 -11
  14. data/lib/picky/category.rb +5 -5
  15. data/lib/picky/index.rb +329 -0
  16. data/lib/picky/index_indexed.rb +31 -0
  17. data/lib/picky/index_indexing.rb +161 -0
  18. data/lib/picky/indexed/bundle.rb +112 -0
  19. data/lib/picky/indexed/wrappers/exact_first.rb +1 -1
  20. data/lib/picky/indexers/parallel.rb +2 -1
  21. data/lib/picky/indexers/serial.rb +2 -1
  22. data/lib/picky/indexes_indexing.rb +1 -1
  23. data/lib/picky/indexing/bundle.rb +188 -0
  24. data/lib/picky/indexing/wrappers/category/location.rb +1 -1
  25. data/lib/picky/interfaces/live_parameters.rb +8 -8
  26. data/lib/picky/loader.rb +24 -38
  27. data/lib/picky/migrations/from_30_to_31.rb +61 -0
  28. data/lib/picky/query/allocation.rb +10 -5
  29. data/lib/picky/query/combinations.rb +70 -0
  30. data/lib/picky/query/indexes.rb +8 -7
  31. data/lib/picky/query/indexes_check.rb +47 -0
  32. data/lib/picky/query/token.rb +16 -29
  33. data/lib/picky/query/tokens.rb +4 -20
  34. data/lib/picky/search.rb +51 -58
  35. data/lib/picky/tokenizer.rb +231 -0
  36. data/lib/picky/tokenizers/location.rb +1 -1
  37. data/lib/tasks/try.rake +4 -12
  38. data/lib/tasks/try.rb +37 -0
  39. data/spec/lib/application_spec.rb +5 -5
  40. data/spec/lib/{backend → backends}/file/basic_spec.rb +2 -2
  41. data/spec/lib/{backend → backends}/file/json_spec.rb +2 -2
  42. data/spec/lib/{backend → backends}/file/marshal_spec.rb +2 -2
  43. data/spec/lib/{backend → backends}/file/text_spec.rb +1 -1
  44. data/spec/lib/backends/memory_spec.rb +77 -0
  45. data/spec/lib/{backend → backends}/redis/basic_spec.rb +19 -21
  46. data/spec/lib/backends/redis/float_hash_spec.rb +38 -0
  47. data/spec/lib/backends/redis/list_hash_spec.rb +27 -0
  48. data/spec/lib/backends/redis/string_hash_spec.rb +38 -0
  49. data/spec/lib/backends/redis_spec.rb +79 -0
  50. data/spec/lib/categories_indexed_spec.rb +3 -3
  51. data/spec/lib/category_indexed_spec.rb +6 -6
  52. data/spec/lib/category_indexing_spec.rb +1 -1
  53. data/spec/lib/category_spec.rb +1 -1
  54. data/spec/lib/frontend_adapters/rack_spec.rb +2 -2
  55. data/spec/lib/{indexes/index_indexed_spec.rb → index_indexed_spec.rb} +1 -1
  56. data/spec/lib/{indexes/index_indexing_spec.rb → index_indexing_spec.rb} +1 -1
  57. data/spec/lib/{indexes/index_spec.rb → index_spec.rb} +1 -1
  58. data/spec/lib/indexed/{bundle/memory_spec.rb → memory_spec.rb} +18 -18
  59. data/spec/lib/indexed/wrappers/exact_first_spec.rb +2 -2
  60. data/spec/lib/indexing/{bundle/memory_partial_generation_speed_spec.rb → bundle_partial_generation_speed_spec.rb} +3 -3
  61. data/spec/lib/indexing/bundle_spec.rb +302 -0
  62. data/spec/lib/query/allocation_spec.rb +21 -11
  63. data/spec/lib/query/combination_spec.rb +2 -2
  64. data/spec/lib/query/{combinations/base_spec.rb → combinations_spec.rb} +1 -1
  65. data/spec/lib/query/indexes_check_spec.rb +25 -0
  66. data/spec/lib/query/indexes_spec.rb +5 -1
  67. data/spec/lib/query/token_spec.rb +18 -20
  68. data/spec/lib/query/tokens_spec.rb +14 -65
  69. data/spec/lib/search_spec.rb +36 -37
  70. data/spec/lib/tasks/try_spec.rb +51 -0
  71. data/spec/lib/{tokenizers/base_spec.rb → tokenizer_spec.rb} +15 -44
  72. metadata +64 -81
  73. data/lib/picky/backend/base.rb +0 -121
  74. data/lib/picky/backend/files.rb +0 -28
  75. data/lib/picky/backend/redis.rb +0 -44
  76. data/lib/picky/indexed/bundle/base.rb +0 -47
  77. data/lib/picky/indexed/bundle/memory.rb +0 -88
  78. data/lib/picky/indexed/bundle/redis.rb +0 -91
  79. data/lib/picky/indexes/index.rb +0 -328
  80. data/lib/picky/indexes/index_indexed.rb +0 -35
  81. data/lib/picky/indexes/index_indexing.rb +0 -165
  82. data/lib/picky/indexes/memory.rb +0 -20
  83. data/lib/picky/indexes/redis.rb +0 -20
  84. data/lib/picky/indexing/bundle/base.rb +0 -242
  85. data/lib/picky/indexing/bundle/memory.rb +0 -26
  86. data/lib/picky/indexing/bundle/redis.rb +0 -26
  87. data/lib/picky/query/combinations/base.rb +0 -74
  88. data/lib/picky/query/combinations/memory.rb +0 -52
  89. data/lib/picky/query/combinations/redis.rb +0 -90
  90. data/lib/picky/query.rb +0 -6
  91. data/lib/picky/tokenizers/base.rb +0 -231
  92. data/lib/picky/tokenizers/index.rb +0 -34
  93. data/lib/picky/tokenizers/query.rb +0 -61
  94. data/spec/lib/backend/files_spec.rb +0 -189
  95. data/spec/lib/backend/redis/list_hash_spec.rb +0 -40
  96. data/spec/lib/backend/redis/string_hash_spec.rb +0 -47
  97. data/spec/lib/backend/redis_spec.rb +0 -170
  98. data/spec/lib/indexed/bundle/redis_spec.rb +0 -41
  99. data/spec/lib/indexes/redis_spec.rb +0 -15
  100. data/spec/lib/indexing/bundle/base_spec.rb +0 -38
  101. data/spec/lib/indexing/bundle/memory_spec.rb +0 -287
  102. data/spec/lib/indexing/bundle/redis_spec.rb +0 -283
  103. data/spec/lib/query/combinations/memory_spec.rb +0 -158
  104. data/spec/lib/query/combinations/redis_spec.rb +0 -172
  105. data/spec/lib/tokenizers/index_spec.rb +0 -69
  106. data/spec/lib/tokenizers/query_spec.rb +0 -121
@@ -1,242 +0,0 @@
1
- module Picky
2
-
3
- module Indexing # :nodoc:all
4
-
5
- # A Bundle is a number of indexes
6
- # per [index, category] combination.
7
- #
8
- # At most, there are three indexes:
9
- # * *core* index (always used)
10
- # * *weights* index (always used)
11
- # * *similarity* index (used with similarity)
12
- #
13
- # In Picky, indexing is separated from the index
14
- # handling itself through a parallel structure.
15
- #
16
- # Both use methods provided by this base class, but
17
- # have very different goals:
18
- #
19
- # * *Indexing*::*Bundle* is just concerned with creating index files
20
- # and providing helper functions to e.g. check the indexes.
21
- #
22
- # * *Indexed*::*Bundle* is concerned with loading these index files into
23
- # memory and looking up search data as fast as possible.
24
- #
25
- module Bundle
26
-
27
- # This is the indexing bundle.
28
- #
29
- # It does all menial tasks that have nothing to do
30
- # with the actual index running etc.
31
- #
32
- class Base < Picky::Bundle
33
-
34
- attr_reader :backend,
35
- :prepared
36
-
37
- attr_accessor :partial_strategy,
38
- :weights_strategy
39
-
40
- def initialize name, category, weights_strategy, partial_strategy, similarity_strategy, options = {}
41
- super name, category, similarity_strategy, options
42
-
43
- @weights_strategy = weights_strategy
44
- @partial_strategy = partial_strategy
45
- @key_format = options[:key_format]
46
- @prepared = Backend::File::Text.new category.prepared_index_path
47
- end
48
-
49
- # Sets up a piece of the index for the given token.
50
- #
51
- def initialize_inverted_index_for token
52
- self.inverted[token] ||= []
53
- end
54
-
55
- # Generation
56
- #
57
-
58
- # This method
59
- # * Loads the base index from the "prepared..." file.
60
- # * Generates derived indexes.
61
- # * Dumps all the indexes into files.
62
- #
63
- def generate_caches_from_source
64
- load_from_prepared_index_file
65
- generate_caches_from_memory
66
- end
67
- # Generates derived indexes from the index and dumps.
68
- #
69
- # Note: assumes that there is something in the index
70
- #
71
- def generate_caches_from_memory
72
- cache_from_memory_generation_message
73
- generate_derived
74
- end
75
- def cache_from_memory_generation_message
76
- timed_exclaim %Q{"#{identifier}": Caching from intermediate in-memory index.}
77
- end
78
-
79
- # Generates the weights and similarity from the main index.
80
- #
81
- def generate_derived
82
- generate_weights
83
- generate_similarity
84
- end
85
-
86
- # Loads the data from the prepared index file.
87
- #
88
- def load_from_prepared_index_file
89
- load_from_prepared_index_generation_message
90
- clear
91
- retrieve
92
- end
93
- def load_from_prepared_index_generation_message
94
- timed_exclaim %Q{"#{identifier}": Loading prepared data into memory.}
95
- end
96
- # Retrieves the prepared index data into the index.
97
- #
98
- # This is in preparation for generating
99
- # derived indexes (like weights, similarity)
100
- # and later dumping the optimized index.
101
- #
102
- # TODO Move this out to the category?
103
- #
104
- def retrieve
105
- format = category.key_format || :to_i # Optimization.
106
- prepared.retrieve do |id, token|
107
- initialize_inverted_index_for token
108
- self.inverted[token] << id.send(format)
109
- end
110
- end
111
-
112
- # Generates a new index (writes its index) using the
113
- # partial caching strategy of this bundle.
114
- #
115
- def generate_partial
116
- generator = Generators::PartialGenerator.new self.inverted
117
- self.inverted = generator.generate self.partial_strategy
118
- end
119
- # Generate a partial index from the given exact inverted index.
120
- #
121
- def generate_partial_from exact_inverted_index
122
- timed_exclaim %Q{"#{identifier}": Generating partial index for index.}
123
- self.inverted = exact_inverted_index
124
- self.generate_partial
125
- self
126
- end
127
- # Generates a new weights index (writes its index) using the
128
- # given weight caching strategy.
129
- #
130
- def generate_weights
131
- generator = Generators::WeightsGenerator.new self.inverted
132
- self.weights = generator.generate self.weights_strategy
133
- end
134
- # Generates a new similarity index (writes its index) using the
135
- # given similarity caching strategy.
136
- #
137
- def generate_similarity
138
- generator = Generators::SimilarityGenerator.new self.inverted
139
- self.similarity = generator.generate self.similarity_strategy
140
- end
141
-
142
- # Saves the indexes in a dump file.
143
- #
144
- def dump
145
- timed_exclaim %Q{"#{identifier}": Dumping data.}
146
- dump_inverted
147
- dump_similarity
148
- dump_weights
149
- dump_configuration
150
- end
151
- # Dumps the core index.
152
- #
153
- def dump_inverted
154
- # timed_exclaim %Q{"#{identifier}": Dumping inverted index.}
155
- backend.dump_inverted self.inverted
156
- end
157
- # Dumps the weights index.
158
- #
159
- def dump_weights
160
- # timed_exclaim %Q{"#{identifier}": Dumping index weights.}
161
- backend.dump_weights self.weights
162
- end
163
- # Dumps the similarity index.
164
- #
165
- def dump_similarity
166
- # timed_exclaim %Q{"#{identifier}": Dumping similarity index.}
167
- backend.dump_similarity self.similarity
168
- end
169
- # Dumps the configuration.
170
- #
171
- def dump_configuration
172
- # timed_exclaim %Q{"#{identifier}": Dumping configuration.}
173
- backend.dump_configuration self.configuration
174
- end
175
-
176
- # Alerts the user if an index is missing.
177
- #
178
- def raise_unless_cache_exists
179
- raise_unless_index_exists
180
- raise_unless_similarity_exists
181
- end
182
- # Alerts the user if one of the necessary indexes
183
- # (core, weights) is missing.
184
- #
185
- def raise_unless_index_exists
186
- if partial_strategy.saved?
187
- warn_if_index_small
188
- raise_unless_index_ok
189
- end
190
- end
191
- # Alerts the user if the similarity
192
- # index is missing (given that it's used).
193
- #
194
- def raise_unless_similarity_exists
195
- if similarity_strategy.saved?
196
- warn_if_similarity_small
197
- raise_unless_similarity_ok
198
- end
199
- end
200
-
201
- # Outputs a warning for the given cache.
202
- #
203
- def warn_cache_small what
204
- warn "Warning: #{what} cache for #{identifier} smaller than 16 bytes."
205
- end
206
- # Raises an appropriate error message for the given cache.
207
- #
208
- def raise_cache_missing what
209
- raise "Error: The #{what} cache for #{identifier} is missing."
210
- end
211
-
212
- # Warns the user if the similarity index is small.
213
- #
214
- def warn_if_similarity_small
215
- warn_cache_small :similarity if backend.similarity_cache_small?
216
- end
217
- # Alerts the user if the similarity index is not there.
218
- #
219
- def raise_unless_similarity_ok
220
- raise_cache_missing :similarity unless backend.similarity_cache_ok?
221
- end
222
-
223
- # Warns the user if the core or weights indexes are small.
224
- #
225
- def warn_if_index_small
226
- warn_cache_small :inverted if backend.inverted_cache_small?
227
- warn_cache_small :weights if backend.weights_cache_small?
228
- end
229
- # Alerts the user if the core or weights indexes are not there.
230
- #
231
- def raise_unless_index_ok
232
- raise_cache_missing :inverted unless backend.inverted_cache_ok?
233
- raise_cache_missing :weights unless backend.weights_cache_ok?
234
- end
235
-
236
- end
237
-
238
- end
239
-
240
- end
241
-
242
- end
@@ -1,26 +0,0 @@
1
- module Picky
2
-
3
- # encoding: utf-8
4
- #
5
- module Indexing # :nodoc:all
6
-
7
- module Bundle
8
-
9
- # The memory version dumps its generated indexes to disk
10
- # (mostly JSON) to load them into memory on startup.
11
- #
12
- class Memory < Base
13
-
14
- def initialize name, category, *args
15
- super name, category, *args
16
-
17
- @backend = Backend::Files.new self
18
- end
19
-
20
- end
21
-
22
- end
23
-
24
- end
25
-
26
- end
@@ -1,26 +0,0 @@
1
- module Picky
2
-
3
- # encoding: utf-8
4
- #
5
- module Indexing # :nodoc:all
6
-
7
- module Bundle
8
-
9
- # The Redis version dumps its generated indexes to
10
- # the Redis backend.
11
- #
12
- class Redis < Base
13
-
14
- def initialize name, category, *args
15
- super name, category, *args
16
-
17
- @backend = Backend::Redis.new self
18
- end
19
-
20
- end
21
-
22
- end
23
-
24
- end
25
-
26
- end
@@ -1,74 +0,0 @@
1
- module Picky
2
-
3
- module Query
4
-
5
- # Combinations are a number of Combination-s.
6
- #
7
- # They are the core of an allocation.
8
- # An allocation consists of a number of combinations.
9
- #
10
- module Combinations # :nodoc:all
11
-
12
- # Base Combinations contain methods for calculating score and ids.
13
- #
14
- class Base
15
-
16
- attr_reader :combinations
17
-
18
- delegate :empty?, :to => :@combinations
19
-
20
- def initialize combinations = []
21
- @combinations = combinations
22
- end
23
-
24
- def hash
25
- @combinations.hash
26
- end
27
-
28
- # Uses user specific weights to calculate a score for the combinations.
29
- #
30
- def calculate_score weights
31
- total_score + weighted_score(weights)
32
- end
33
- def total_score
34
- @combinations.sum &:weight
35
- end
36
- def weighted_score weights
37
- weights.score_for @combinations
38
- end
39
-
40
- # Filters the tokens and identifiers such that only identifiers
41
- # that are passed in, remain, including their tokens.
42
- #
43
- # Note: This method is not totally independent of the calculate_ids one.
44
- # Since identifiers are only nullified, we need to not include the
45
- # ids that have an associated identifier that is nil.
46
- #
47
- def keep identifiers = []
48
- @combinations.reject! { |combination| !combination.in?(identifiers) }
49
- end
50
-
51
- # Filters the tokens and identifiers such that identifiers
52
- # that are passed in, are removed, including their tokens.
53
- #
54
- # Note: This method is not totally independent of the calculate_ids one.
55
- # Since identifiers are only nullified, we need to not include the
56
- # ids that have an associated identifier that is nil.
57
- #
58
- def remove identifiers = []
59
- @combinations.reject! { |combination| combination.in?(identifiers) }
60
- end
61
-
62
- #
63
- #
64
- def to_result
65
- @combinations.map &:to_result
66
- end
67
-
68
- end
69
-
70
- end
71
-
72
- end
73
-
74
- end
@@ -1,52 +0,0 @@
1
- module Picky
2
-
3
- module Query
4
-
5
- # Combinations are a number of Combination-s.
6
- #
7
- # They are the core of an allocation.
8
- # An allocation consists of a number of combinations.
9
- #
10
- module Combinations # :nodoc:all
11
-
12
- # Memory Combinations contain specific methods for
13
- # calculating score and ids in memory.
14
- #
15
- class Memory < Base
16
-
17
- # Returns the result ids for the allocation.
18
- #
19
- # Sorts the ids by size and & through them in the following order (sizes):
20
- # 0. [100_000, 400, 30, 2]
21
- # 1. [2, 30, 400, 100_000]
22
- # 2. (100_000 & (400 & (30 & 2))) # => result
23
- #
24
- # Note: Uses a C-optimized intersection routine (in performant.c)
25
- # for speed and memory efficiency.
26
- #
27
- # Note: In the memory based version we ignore the (amount) needed hint.
28
- # We cannot use the information to speed up the algorithm, unfortunately.
29
- #
30
- def ids _, _
31
- return [] if @combinations.empty?
32
-
33
- # Get the ids for each combination.
34
- #
35
- id_arrays = @combinations.inject([]) do |total, combination|
36
- total << combination.ids
37
- end
38
-
39
- # Call the optimized C algorithm.
40
- #
41
- # Note: It orders the passed arrays by size.
42
- #
43
- Performant::Array.memory_efficient_intersect id_arrays
44
- end
45
-
46
- end
47
-
48
- end
49
-
50
- end
51
-
52
- end
@@ -1,90 +0,0 @@
1
- module Picky
2
-
3
- module Query
4
-
5
- # Combinations are a number of Combination-s.
6
- #
7
- # They are the core of an allocation.
8
- # An allocation consists of a number of combinations.
9
- #
10
- module Combinations # :nodoc:all
11
-
12
- # Redis Combinations contain specific methods for
13
- # calculating score and ids using the Redis backend.
14
- #
15
- class Redis < Base
16
-
17
- # Connect to the backend.
18
- #
19
- # TODO Use specific Picky Redis wrapper.
20
- #
21
- def self.redis
22
- @redis ||= ::Redis.new :db => 15
23
- end
24
-
25
- attr_reader :redis
26
-
27
- #
28
- #
29
- def initialize combinations
30
- super combinations
31
-
32
- @redis = self.class.redis
33
- end
34
-
35
- # Returns the result ids for the allocation.
36
- #
37
- def ids amount, offset
38
- return [] if @combinations.empty?
39
-
40
- identifiers = @combinations.inject([]) do |identifiers, combination|
41
- identifiers << "#{combination.identifier}"
42
- end
43
-
44
- result_id = generate_intermediate_result_id
45
-
46
- # Intersect and store.
47
- #
48
- redis.zinterstore result_id, identifiers
49
-
50
- # Get the stored result.
51
- #
52
- results = redis.zrange result_id, offset, (offset + amount)
53
-
54
- # Delete the stored result as it was only for temporary purposes.
55
- #
56
- # Note: I could also not delete it, but that would not be clean at all.
57
- #
58
- redis.del result_id
59
-
60
- results
61
- end
62
-
63
- # Generate a multiple host/process safe result id.
64
- #
65
- # Note: Generated when this class loads.
66
- #
67
- require 'socket'
68
- def self.extract_host
69
- @host ||= Socket.gethostname
70
- end
71
- def host
72
- self.class.extract_host
73
- end
74
- extract_host
75
- def pid
76
- @pid ||= Process.pid
77
- end
78
- # Use the host and pid (generated lazily in child processes) for the result.
79
- #
80
- def generate_intermediate_result_id
81
- :"#{host}:#{pid}:picky:result"
82
- end
83
-
84
- end
85
-
86
- end
87
-
88
- end
89
-
90
- end
data/lib/picky/query.rb DELETED
@@ -1,6 +0,0 @@
1
- module Picky
2
-
3
- module Query # :nodoc:all
4
- end
5
-
6
- end