picky 0.10.5 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. data/lib/picky/alias_instances.rb +1 -0
  2. data/lib/picky/application.rb +6 -7
  3. data/lib/picky/bundle.rb +31 -0
  4. data/lib/picky/configuration/indexes.rb +30 -41
  5. data/lib/picky/configuration/type.rb +6 -40
  6. data/lib/picky/ext/maybe_compile.rb +9 -0
  7. data/lib/picky/index/bundle.rb +1 -139
  8. data/lib/picky/{query/combinator.rb → index/categories.rb} +16 -18
  9. data/lib/picky/index/category.rb +20 -46
  10. data/lib/picky/index/type.rb +16 -12
  11. data/lib/picky/index/types.rb +41 -0
  12. data/lib/picky/index/wrappers/exact_first.rb +5 -1
  13. data/lib/picky/indexers/base.rb +9 -8
  14. data/lib/picky/indexing/bundle.rb +152 -0
  15. data/lib/picky/indexing/categories.rb +36 -0
  16. data/lib/picky/indexing/category.rb +145 -0
  17. data/lib/picky/indexing/type.rb +45 -0
  18. data/lib/picky/indexing/types.rb +74 -0
  19. data/lib/picky/loader.rb +17 -7
  20. data/lib/picky/query/base.rb +5 -4
  21. data/lib/picky/sources/wrappers/base.rb +23 -0
  22. data/lib/picky/sources/wrappers/location.rb +92 -0
  23. data/lib/picky/tokenizers/index.rb +4 -1
  24. data/lib/picky/type.rb +46 -0
  25. data/lib/picky/types.rb +38 -0
  26. data/lib/tasks/index.rake +4 -0
  27. data/project_prototype/Gemfile +1 -1
  28. data/project_prototype/app/application.rb +12 -12
  29. data/spec/lib/application_spec.rb +6 -9
  30. data/spec/lib/configuration/indexes_spec.rb +0 -85
  31. data/spec/lib/index/bundle_spec.rb +2 -94
  32. data/spec/lib/index/category_spec.rb +7 -86
  33. data/spec/lib/index/type_spec.rb +14 -26
  34. data/spec/lib/index/wrappers/exact_first_spec.rb +12 -12
  35. data/spec/lib/{index → indexing}/bundle_partial_generation_speed_spec.rb +2 -2
  36. data/spec/lib/indexing/bundle_spec.rb +174 -0
  37. data/spec/lib/{query/combinator_spec.rb → indexing/categories_spec.rb} +30 -34
  38. data/spec/lib/indexing/category_spec.rb +257 -0
  39. data/spec/lib/indexing/type_spec.rb +32 -0
  40. data/spec/lib/loader_spec.rb +0 -2
  41. data/spec/lib/query/base_spec.rb +8 -17
  42. data/spec/lib/query/full_spec.rb +3 -6
  43. data/spec/lib/query/live_spec.rb +4 -3
  44. data/spec/lib/sources/wrappers/base_spec.rb +35 -0
  45. data/spec/lib/sources/wrappers/location_spec.rb +68 -0
  46. data/spec/lib/tokenizers/index_spec.rb +2 -5
  47. metadata +32 -16
  48. data/lib/picky/configuration/field.rb +0 -73
  49. data/lib/picky/indexes.rb +0 -179
  50. data/lib/picky/initializers/ext.rb +0 -1
  51. data/spec/lib/configuration/field_spec.rb +0 -208
  52. data/spec/lib/configuration/type_spec.rb +0 -49
@@ -7,22 +7,22 @@ module Indexers
7
7
  #
8
8
  class Base
9
9
 
10
- def initialize type, field
10
+ def initialize type, category
11
11
  @type = type
12
- @field = field
12
+ @category = category
13
13
  end
14
14
 
15
15
  # Convenience method for getting the right Tokenizer.
16
16
  #
17
17
  def tokenizer
18
- @field.tokenizer
18
+ @category.tokenizer
19
19
  end
20
20
  # Convenience methods for user subclasses.
21
21
  #
22
22
  # TODO Duplicate code in Index::Files.
23
23
  #
24
24
  def search_index_file_name
25
- @field.search_index_file_name
25
+ @category.search_index_file_name
26
26
  end
27
27
 
28
28
  # Executes the specific strategy.
@@ -34,10 +34,10 @@ module Indexers
34
34
  # Get the source where the data is taken from.
35
35
  #
36
36
  def source
37
- @field.source || raise_no_source
37
+ @category.source || raise_no_source
38
38
  end
39
39
  def raise_no_source
40
- raise NoSourceSpecifiedException.new "No source given for index:#{@type.name}, field:#{@field.name}." # TODO field.identifier
40
+ raise NoSourceSpecifiedException.new "No source given for index:#{@type.name}, category:#{@category.name}." # TODO field.identifier
41
41
  end
42
42
 
43
43
  # Selects the original id (indexed id) and a column to process. The column data is called "token".
@@ -54,8 +54,9 @@ module Indexers
54
54
  #
55
55
  File.open(search_index_file_name, 'w:binary') do |file|
56
56
  result = []
57
- source.harvest(@type, @field) do |indexed_id, text|
57
+ source.harvest(@type, @category) do |indexed_id, text|
58
58
  tokenizer.tokenize(text).each do |token_text|
59
+ next unless token_text
59
60
  result << indexed_id << comma << token_text << newline
60
61
  end
61
62
  file.write(result.join) && result.clear if result.size > 100_000
@@ -65,7 +66,7 @@ module Indexers
65
66
  end
66
67
 
67
68
  def indexing_message
68
- timed_exclaim "INDEX #{@type.name} #{@field.name}" #:#{@field.indexed_name}." # TODO field.identifier
69
+ timed_exclaim "INDEX #{@type.name} #{@category.name}" #:#{@category.indexed_as}." # TODO field.identifier
69
70
  end
70
71
 
71
72
  end
@@ -0,0 +1,152 @@
1
+ # encoding: utf-8
2
+ #
3
+ module Indexing
4
+
5
+ # This is the indexing bundle.
6
+ # It does all menial tasks that have nothing to do
7
+ # with the actual index running etc.
8
+ #
9
+ # TODO Superclass?
10
+ #
11
+ class Bundle < ::Bundle
12
+
13
+ attr_accessor :partial_strategy, :weights_strategy
14
+ attr_reader :files
15
+
16
+ # Path is in which directory the cache is located.
17
+ #
18
+ def initialize name, category, type, similarity_strategy, partial_strategy, weights_strategy
19
+ super name, category, type, similarity_strategy
20
+
21
+ @partial_strategy = partial_strategy
22
+ @weights_strategy = weights_strategy
23
+ end
24
+
25
+ # Generation
26
+ #
27
+
28
+ # This method
29
+ # * loads the base index from the db
30
+ # * generates derived indexes
31
+ # * dumps all the indexes into files
32
+ #
33
+ def generate_caches_from_source
34
+ load_from_index_file
35
+ generate_caches_from_memory
36
+ end
37
+ # Generates derived indexes from the index and dumps.
38
+ #
39
+ # Note: assumes that there is something in the index
40
+ #
41
+ def generate_caches_from_memory
42
+ cache_from_memory_generation_message
43
+ generate_derived
44
+ end
45
+ def cache_from_memory_generation_message
46
+ timed_exclaim "CACHE FROM MEMORY #{identifier}."
47
+ end
48
+
49
+ # Generates the weights and similarity from the main index.
50
+ #
51
+ def generate_derived
52
+ generate_weights
53
+ generate_similarity
54
+ end
55
+
56
+ # Load the data from the db.
57
+ #
58
+ def load_from_index_file
59
+ load_from_index_generation_message
60
+ clear
61
+ retrieve
62
+ end
63
+ def load_from_index_generation_message
64
+ timed_exclaim "LOAD INDEX #{identifier}."
65
+ end
66
+ # Retrieves the data into the index.
67
+ #
68
+ def retrieve
69
+ files.retrieve do |id, token|
70
+ initialize_index_for token
71
+ index[token] << id
72
+ end
73
+ end
74
+ def initialize_index_for token
75
+ index[token] ||= []
76
+ end
77
+
78
+ # Generators.
79
+ #
80
+ # TODO Move somewhere more fitting.
81
+ #
82
+
83
+ # Generates a new index (writes its index) using the
84
+ # given partial caching strategy.
85
+ #
86
+ def generate_partial
87
+ generator = Cacher::PartialGenerator.new self.index
88
+ self.index = generator.generate self.partial_strategy
89
+ end
90
+ def generate_partial_from exact_index
91
+ timed_exclaim "PARTIAL GENERATE #{identifier}."
92
+ self.index = exact_index
93
+ self.generate_partial
94
+ self
95
+ end
96
+ # Generates a new similarity index (writes its index) using the
97
+ # given similarity caching strategy.
98
+ #
99
+ def generate_similarity
100
+ generator = Cacher::SimilarityGenerator.new self.index
101
+ self.similarity = generator.generate self.similarity_strategy
102
+ end
103
+ # Generates a new weights index (writes its index) using the
104
+ # given weight caching strategy.
105
+ #
106
+ def generate_weights
107
+ generator = Cacher::WeightsGenerator.new self.index
108
+ self.weights = generator.generate self.weights_strategy
109
+ end
110
+
111
+ # Saves the index in a dump file.
112
+ #
113
+ def dump
114
+ dump_index
115
+ dump_similarity
116
+ dump_weights
117
+ end
118
+ def dump_index
119
+ timed_exclaim "DUMP INDEX #{identifier}."
120
+ files.dump_index index
121
+ end
122
+ def dump_similarity
123
+ timed_exclaim "DUMP SIMILARITY #{identifier}."
124
+ files.dump_similarity similarity
125
+ end
126
+ def dump_weights
127
+ timed_exclaim "DUMP WEIGHTS #{identifier}."
128
+ files.dump_weights weights
129
+ end
130
+
131
+ # Alerts the user if an index is missing.
132
+ #
133
+ def raise_unless_cache_exists
134
+ warn_cache_small :index if files.index_cache_small?
135
+ warn_cache_small :similarity if files.similarity_cache_small?
136
+ warn_cache_small :weights if files.weights_cache_small?
137
+
138
+ raise_cache_missing :index unless files.index_cache_ok?
139
+ raise_cache_missing :similarity unless files.similarity_cache_ok?
140
+ raise_cache_missing :weights unless files.weights_cache_ok?
141
+ end
142
+ def warn_cache_small what
143
+ puts "#{what} cache for #{identifier} smaller than 16 bytes."
144
+ end
145
+ # Raises an appropriate error message.
146
+ #
147
+ def raise_cache_missing what
148
+ raise "#{what} cache for #{identifier} missing."
149
+ end
150
+
151
+ end
152
+ end
@@ -0,0 +1,36 @@
1
+ module Indexing
2
+
3
+ class Categories
4
+
5
+ attr_reader :categories
6
+
7
+ each_delegate :index,
8
+ :cache,
9
+ :generate_caches,
10
+ :backup_caches,
11
+ :restore_caches,
12
+ :check_caches,
13
+ :clear_caches,
14
+ :create_directory_structure,
15
+ :to => :categories
16
+
17
+ def initialize
18
+ @categories = []
19
+ end
20
+
21
+ def << category
22
+ categories << category
23
+ end
24
+
25
+ def find category_name
26
+ category_name = category_name.to_sym
27
+
28
+ categories.each do |category|
29
+ next unless category.name == category_name
30
+ return category
31
+ end
32
+ end
33
+
34
+ end
35
+
36
+ end
@@ -0,0 +1,145 @@
1
+ module Indexing
2
+
3
+ class Category
4
+
5
+ attr_reader :name, :type, :indexed_as, :virtual, :tokenizer, :source, :exact, :partial
6
+
7
+ # TODO Dup the options?
8
+ #
9
+ def initialize name, type, options = {}
10
+ @name = name
11
+ @type = type
12
+
13
+ @source = options[:source]
14
+
15
+ @tokenizer = options[:tokenizer] || Tokenizers::Index.default
16
+ @indexer_class = options[:indexer] || Indexers::Default
17
+ @indexed_as = options[:as] || name
18
+ @virtual = options[:virtual] || false # TODO What is this again?
19
+
20
+ # TODO Push into Bundle.
21
+ #
22
+ partial = options[:partial] || Cacher::Partial::Default
23
+ weights = options[:weights] || Cacher::Weights::Default
24
+ similarity = options[:similarity] || Cacher::Similarity::Default
25
+
26
+ @exact = options[:exact_indexing_bundle] || Bundle.new(:exact, self, type, similarity, Cacher::Partial::None.new, weights)
27
+ @partial = options[:partial_indexing_bundle] || Bundle.new(:partial, self, type, Cacher::Similarity::None.new, partial, weights)
28
+
29
+ # @remove = options[:remove] || false
30
+ # @filter = options[:filter] || true
31
+
32
+ @options = options # TODO Remove?
33
+ end
34
+
35
+ # TODO Move to initializer?
36
+ #
37
+ def identifier
38
+ @identifier ||= "#{type.name} #{name}"
39
+ end
40
+
41
+ # Note: Most of the time the source of the type is used.
42
+ #
43
+ def source
44
+ @source || type.source
45
+ end
46
+
47
+ # TODO Spec.
48
+ #
49
+ def backup_caches
50
+ timed_exclaim "Backing up #{identifier}."
51
+ exact.backup
52
+ partial.backup
53
+ end
54
+ def restore_caches
55
+ timed_exclaim "Restoring #{identifier}."
56
+ exact.restore
57
+ partial.restore
58
+ end
59
+ def check_caches
60
+ timed_exclaim "Checking #{identifier}."
61
+ exact.raise_unless_cache_exists
62
+ partial.raise_unless_cache_exists
63
+ end
64
+ def clear_caches
65
+ timed_exclaim "Deleting #{identifier}."
66
+ exact.delete
67
+ partial.delete
68
+ end
69
+ def create_directory_structure
70
+ timed_exclaim "Creating directory structure for #{identifier}."
71
+ exact.create_directory
72
+ partial.create_directory
73
+ end
74
+
75
+ # Used for testing.
76
+ #
77
+ # TODO Remove?
78
+ #
79
+ def generate_indexes_from_exact_index
80
+ generate_derived_exact
81
+ generate_partial
82
+ generate_derived_partial
83
+ end
84
+ def generate_derived_exact
85
+ exact.generate_derived
86
+ end
87
+ def generate_derived_partial
88
+ partial.generate_derived
89
+ end
90
+
91
+ # Generates all caches for this category.
92
+ #
93
+ def cache
94
+ prepare_cache_directory
95
+ generate_caches
96
+ end
97
+ def generate_caches
98
+ generate_caches_from_source
99
+ generate_partial
100
+ generate_caches_from_memory
101
+ dump_caches
102
+ timed_exclaim "CACHE FINISHED #{identifier}."
103
+ end
104
+ def generate_caches_from_source
105
+ exact.generate_caches_from_source
106
+ end
107
+ def generate_partial
108
+ partial.generate_partial_from exact.index
109
+ end
110
+ def generate_caches_from_memory
111
+ partial.generate_caches_from_memory
112
+ end
113
+ def dump_caches
114
+ exact.dump
115
+ partial.dump
116
+ end
117
+
118
+ # TODO Partially move to type. Duplicate Code in indexers/field.rb.
119
+ #
120
+ def search_index_root
121
+ File.join PICKY_ROOT, 'index'
122
+ end
123
+ def cache_directory
124
+ File.join search_index_root, PICKY_ENVIRONMENT, type.name.to_s
125
+ end
126
+ def search_index_file_name
127
+ File.join cache_directory, "prepared_#{name}_index.txt"
128
+ end
129
+ def index
130
+ prepare_cache_directory
131
+ indexer.index
132
+ end
133
+ def prepare_cache_directory
134
+ FileUtils.mkdir_p cache_directory
135
+ end
136
+ def indexer
137
+ @indexer || @indexer = @indexer_class.new(type, self)
138
+ end
139
+ def virtual?
140
+ !!virtual
141
+ end
142
+
143
+ end
144
+
145
+ end
@@ -0,0 +1,45 @@
1
+ module Indexing
2
+
3
+ class Type
4
+
5
+ attr_reader :name, :source, :categories, :after_indexing
6
+
7
+ # Delegators for indexing.
8
+ #
9
+ delegate :connect_backend,
10
+ :to => :source
11
+
12
+ delegate :index,
13
+ :cache,
14
+ :generate_caches,
15
+ :backup_caches,
16
+ :restore_caches,
17
+ :check_caches,
18
+ :clear_caches,
19
+ :create_directory_structure,
20
+ :to => :categories
21
+
22
+ def initialize name, source, options = {}
23
+ @name = name
24
+ @source = source
25
+
26
+ @after_indexing = options[:after_indexing]
27
+
28
+ @categories = Categories.new
29
+ end
30
+
31
+ # TODO Spec. Doc.
32
+ #
33
+ def add_category name, options = {}
34
+ categories << Category.new(name, self, options)
35
+ end
36
+
37
+ # Indexing.
38
+ #
39
+ def take_snapshot
40
+ source.take_snapshot self
41
+ end
42
+
43
+ end
44
+
45
+ end
@@ -0,0 +1,74 @@
1
+ module Indexing
2
+
3
+ class Types
4
+
5
+ attr_reader :types
6
+
7
+ each_delegate :take_snapshot,
8
+ :generate_caches,
9
+ :backup_caches,
10
+ :restore_caches,
11
+ :check_caches,
12
+ :clear_caches,
13
+ :create_directory_structure,
14
+ :to => :types
15
+
16
+ def initialize
17
+ clear
18
+ end
19
+
20
+ # TODO Spec.
21
+ #
22
+ def clear
23
+ @types = []
24
+ end
25
+
26
+ # TODO Spec. Superclass?
27
+ #
28
+ def register type
29
+ self.types << type
30
+ end
31
+
32
+ # Runs the indexers in parallel (index + cache).
33
+ #
34
+ # TODO Spec.
35
+ #
36
+ def index randomly = true
37
+ take_snapshot
38
+
39
+ # Run in parallel.
40
+ #
41
+ timed_exclaim "INDEXING USING #{Cores.max_processors} PROCESSORS, IN #{randomly ? 'RANDOM' : 'GIVEN'} ORDER."
42
+ Cores.forked self.types, { randomly: randomly } do |type|
43
+ type.index
44
+ type.cache
45
+ end
46
+ timed_exclaim "INDEXING FINISHED."
47
+ end
48
+
49
+ # TODO Spec
50
+ #
51
+ def generate_index_only type_name, field_name
52
+ found = find type_name, field_name
53
+ found.index if found
54
+ end
55
+ def generate_cache_only type_name, category_name
56
+ found = find type_name, field_name
57
+ found.generate_caches if found
58
+ end
59
+
60
+ # TODO Spec
61
+ #
62
+ def find type_name, category_name
63
+ type_name = type_name.to_sym
64
+
65
+ types.each do |type|
66
+ next unless type.name == type_name
67
+
68
+ found = type.categories.find category_name
69
+ return found if found
70
+ end
71
+ end
72
+
73
+ end
74
+ end
data/lib/picky/loader.rb CHANGED
@@ -84,7 +84,7 @@ module Loader
84
84
  def self.load_framework
85
85
  # Load compiled C code.
86
86
  #
87
- require_relative 'initializers/ext'
87
+ require_relative 'ext/maybe_compile'
88
88
 
89
89
  # Load extensions.
90
90
  #
@@ -166,9 +166,23 @@ module Loader
166
166
 
167
167
  # Index types.
168
168
  #
169
+ load_relative 'bundle'
170
+
171
+ load_relative 'indexing/bundle'
172
+ load_relative 'indexing/category'
173
+ load_relative 'indexing/categories'
174
+ load_relative 'indexing/type'
175
+ load_relative 'indexing/types'
176
+
169
177
  load_relative 'index/bundle'
170
178
  load_relative 'index/category'
179
+ load_relative 'index/categories'
171
180
  load_relative 'index/type'
181
+ load_relative 'index/types'
182
+
183
+ load_relative 'types'
184
+ load_relative 'alias_instances'
185
+ load_relative 'type'
172
186
 
173
187
  load_relative 'index/wrappers/exact_first'
174
188
 
@@ -193,7 +207,6 @@ module Loader
193
207
 
194
208
  load_relative 'query/qualifiers'
195
209
  load_relative 'query/weigher'
196
- load_relative 'query/combinator'
197
210
 
198
211
  load_relative 'query/weights'
199
212
 
@@ -219,14 +232,11 @@ module Loader
219
232
  load_relative 'sources/delicious'
220
233
  load_relative 'sources/couch'
221
234
 
222
- # Indexes.
223
- #
224
- load_relative 'indexes'
235
+ load_relative 'sources/wrappers/base'
236
+ load_relative 'sources/wrappers/location'
225
237
 
226
238
  # Configuration.
227
239
  #
228
- load_relative 'configuration/field'
229
- load_relative 'configuration/type'
230
240
  load_relative 'configuration/indexes'
231
241
 
232
242
  # ... in Application.
@@ -17,10 +17,11 @@ module Query
17
17
  # * tokenizer: Tokenizers::Query.default by default.
18
18
  # * weights: A hash of weights, or a Query::Weights object.
19
19
  #
20
- def initialize *index_types
21
- options = Hash === index_types.last ? index_types.pop : {}
22
- @index_types = index_types
23
- @weigher = options[:weigher] || Weigher.new(index_types)
20
+ def initialize *index_type_definitions
21
+ options = Hash === index_type_definitions.last ? index_type_definitions.pop : {}
22
+ indexes = index_type_definitions.map &:index
23
+
24
+ @weigher = options[:weigher] || Weigher.new(indexes)
24
25
  @tokenizer = options[:tokenizer] || Tokenizers::Query.default
25
26
  weights = options[:weights] || Weights.new
26
27
  @weights = Hash === weights ? Weights.new(weights) : weights
@@ -0,0 +1,23 @@
1
+ module Sources
2
+
3
+ module Wrappers
4
+
5
+ class Base
6
+
7
+ attr_reader :backend
8
+
9
+ # Wraps a backend
10
+ #
11
+ def initialize backend
12
+ @backend = backend
13
+ end
14
+
15
+ # Default is delegation for all methods
16
+ #
17
+ delegate :harvest, :connect_backend, :take_snapshot, :to => :backend
18
+
19
+ end
20
+
21
+ end
22
+
23
+ end
@@ -0,0 +1,92 @@
1
+ module Sources
2
+
3
+ module Wrappers
4
+
5
+ class Location < Base
6
+
7
+ attr_reader :precision, :grid
8
+
9
+ # TODO Save min and grid!
10
+ #
11
+ def initialize backend, options = {}
12
+ super backend
13
+
14
+ @user_grid = extract_user_grid options
15
+ @precision = extract_precision options
16
+
17
+ @grid = @user_grid / (@precision + 0.5)
18
+ end
19
+
20
+ #
21
+ #
22
+ def extract_user_grid options
23
+ options[:grid] || raise # TODO
24
+ end
25
+ # Extracts the precision: the number of neighbouring grid
26
+ # cells (on each side) a location is also indexed into.
27
+ # 1 is low (up to 16.6% error), 5 is very high (up to 5% error).
28
+ #
29
+ # We don't recommend using values higher than 5.
30
+ #
31
+ # Default is 1.
32
+ #
33
+ def extract_precision options
34
+ options[:precision] || 1
35
+ end
36
+
37
+ def reset
38
+ @min = 1.0/0
39
+ end
40
+
41
+ # Yield the data (id, text for id) for the given type and field.
42
+ #
43
+ def harvest type, field
44
+ reset
45
+
46
+ # Cache. TODO Make option?
47
+ #
48
+ locations = []
49
+
50
+ # Gather min/max.
51
+ #
52
+ backend.harvest type, field do |indexed_id, location|
53
+ location = location.to_f
54
+ @min = location if location < @min
55
+ locations << [indexed_id, location]
56
+ end
57
+
58
+ # Add a margin.
59
+ #
60
+ marginize
61
+
62
+ # Recalculate locations.
63
+ #
64
+ locations.each do |indexed_id, location|
65
+ locations_for(location).each do |new_location|
66
+ yield indexed_id, new_location.to_s
67
+ end
68
+ end
69
+ end
70
+
71
+ def marginize
72
+ @min -= @user_grid
73
+ end
74
+
75
+ # Put location onto multiple places on a grid.
76
+ #
77
+ # Note: Returns an array of integer grid positions (given an integer precision).
78
+ #
79
+ def locations_for location
80
+ new_location = ((location - @min) / grid).floor
81
+
82
+ min_location = new_location - precision
83
+ max_location = new_location + precision
84
+
85
+ (min_location..max_location).to_a
86
+ end
87
+
88
+ end
89
+
90
+ end
91
+
92
+ end