picky 0.10.5 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. data/lib/picky/alias_instances.rb +1 -0
  2. data/lib/picky/application.rb +6 -7
  3. data/lib/picky/bundle.rb +31 -0
  4. data/lib/picky/configuration/indexes.rb +30 -41
  5. data/lib/picky/configuration/type.rb +6 -40
  6. data/lib/picky/ext/maybe_compile.rb +9 -0
  7. data/lib/picky/index/bundle.rb +1 -139
  8. data/lib/picky/{query/combinator.rb → index/categories.rb} +16 -18
  9. data/lib/picky/index/category.rb +20 -46
  10. data/lib/picky/index/type.rb +16 -12
  11. data/lib/picky/index/types.rb +41 -0
  12. data/lib/picky/index/wrappers/exact_first.rb +5 -1
  13. data/lib/picky/indexers/base.rb +9 -8
  14. data/lib/picky/indexing/bundle.rb +152 -0
  15. data/lib/picky/indexing/categories.rb +36 -0
  16. data/lib/picky/indexing/category.rb +145 -0
  17. data/lib/picky/indexing/type.rb +45 -0
  18. data/lib/picky/indexing/types.rb +74 -0
  19. data/lib/picky/loader.rb +17 -7
  20. data/lib/picky/query/base.rb +5 -4
  21. data/lib/picky/sources/wrappers/base.rb +23 -0
  22. data/lib/picky/sources/wrappers/location.rb +92 -0
  23. data/lib/picky/tokenizers/index.rb +4 -1
  24. data/lib/picky/type.rb +46 -0
  25. data/lib/picky/types.rb +38 -0
  26. data/lib/tasks/index.rake +4 -0
  27. data/project_prototype/Gemfile +1 -1
  28. data/project_prototype/app/application.rb +12 -12
  29. data/spec/lib/application_spec.rb +6 -9
  30. data/spec/lib/configuration/indexes_spec.rb +0 -85
  31. data/spec/lib/index/bundle_spec.rb +2 -94
  32. data/spec/lib/index/category_spec.rb +7 -86
  33. data/spec/lib/index/type_spec.rb +14 -26
  34. data/spec/lib/index/wrappers/exact_first_spec.rb +12 -12
  35. data/spec/lib/{index → indexing}/bundle_partial_generation_speed_spec.rb +2 -2
  36. data/spec/lib/indexing/bundle_spec.rb +174 -0
  37. data/spec/lib/{query/combinator_spec.rb → indexing/categories_spec.rb} +30 -34
  38. data/spec/lib/indexing/category_spec.rb +257 -0
  39. data/spec/lib/indexing/type_spec.rb +32 -0
  40. data/spec/lib/loader_spec.rb +0 -2
  41. data/spec/lib/query/base_spec.rb +8 -17
  42. data/spec/lib/query/full_spec.rb +3 -6
  43. data/spec/lib/query/live_spec.rb +4 -3
  44. data/spec/lib/sources/wrappers/base_spec.rb +35 -0
  45. data/spec/lib/sources/wrappers/location_spec.rb +68 -0
  46. data/spec/lib/tokenizers/index_spec.rb +2 -5
  47. metadata +32 -16
  48. data/lib/picky/configuration/field.rb +0 -73
  49. data/lib/picky/indexes.rb +0 -179
  50. data/lib/picky/initializers/ext.rb +0 -1
  51. data/spec/lib/configuration/field_spec.rb +0 -208
  52. data/spec/lib/configuration/type_spec.rb +0 -49
@@ -7,22 +7,22 @@ module Indexers
7
7
  #
8
8
  class Base
9
9
 
10
- def initialize type, field
10
+ def initialize type, category
11
11
  @type = type
12
- @field = field
12
+ @category = category
13
13
  end
14
14
 
15
15
  # Convenience method for getting the right Tokenizer.
16
16
  #
17
17
  def tokenizer
18
- @field.tokenizer
18
+ @category.tokenizer
19
19
  end
20
20
  # Convenience methods for user subclasses.
21
21
  #
22
22
  # TODO Duplicate code in Index::Files.
23
23
  #
24
24
  def search_index_file_name
25
- @field.search_index_file_name
25
+ @category.search_index_file_name
26
26
  end
27
27
 
28
28
  # Executes the specific strategy.
@@ -34,10 +34,10 @@ module Indexers
34
34
  # Get the source where the data is taken from.
35
35
  #
36
36
  def source
37
- @field.source || raise_no_source
37
+ @category.source || raise_no_source
38
38
  end
39
39
  def raise_no_source
40
- raise NoSourceSpecifiedException.new "No source given for index:#{@type.name}, field:#{@field.name}." # TODO field.identifier
40
+ raise NoSourceSpecifiedException.new "No source given for index:#{@type.name}, category:#{@category.name}." # TODO field.identifier
41
41
  end
42
42
 
43
43
  # Selects the original id (indexed id) and a column to process. The column data is called "token".
@@ -54,8 +54,9 @@ module Indexers
54
54
  #
55
55
  File.open(search_index_file_name, 'w:binary') do |file|
56
56
  result = []
57
- source.harvest(@type, @field) do |indexed_id, text|
57
+ source.harvest(@type, @category) do |indexed_id, text|
58
58
  tokenizer.tokenize(text).each do |token_text|
59
+ next unless token_text
59
60
  result << indexed_id << comma << token_text << newline
60
61
  end
61
62
  file.write(result.join) && result.clear if result.size > 100_000
@@ -65,7 +66,7 @@ module Indexers
65
66
  end
66
67
 
67
68
  def indexing_message
68
- timed_exclaim "INDEX #{@type.name} #{@field.name}" #:#{@field.indexed_name}." # TODO field.identifier
69
+ timed_exclaim "INDEX #{@type.name} #{@category.name}" #:#{@category.indexed_as}." # TODO field.identifier
69
70
  end
70
71
 
71
72
  end
@@ -0,0 +1,152 @@
1
+ # encoding: utf-8
2
+ #
3
+ module Indexing
4
+
5
+ # This is the indexing bundle.
6
+ # It does all menial tasks that have nothing to do
7
+ # with the actual index running etc.
8
+ #
9
+ # TODO Superclass?
10
+ #
11
+ class Bundle < ::Bundle
12
+
13
+ attr_accessor :partial_strategy, :weights_strategy
14
+ attr_reader :files
15
+
16
+ # Sets up the bundle via the superclass and remembers the partial and weights strategies.
17
+ #
18
+ def initialize name, category, type, similarity_strategy, partial_strategy, weights_strategy
19
+ super name, category, type, similarity_strategy
20
+
21
+ @partial_strategy = partial_strategy
22
+ @weights_strategy = weights_strategy
23
+ end
24
+
25
+ # Generation
26
+ #
27
+
28
+ # This method
29
+ # * loads the base index from the index file
30
+ # * generates derived indexes
31
+ # * dumps all the indexes into files
32
+ #
33
+ def generate_caches_from_source
34
+ load_from_index_file
35
+ generate_caches_from_memory
36
+ end
37
+ # Generates derived indexes from the index and dumps.
38
+ #
39
+ # Note: assumes that there is something in the index
40
+ #
41
+ def generate_caches_from_memory
42
+ cache_from_memory_generation_message
43
+ generate_derived
44
+ end
45
+ def cache_from_memory_generation_message
46
+ timed_exclaim "CACHE FROM MEMORY #{identifier}."
47
+ end
48
+
49
+ # Generates the weights and similarity from the main index.
50
+ #
51
+ def generate_derived
52
+ generate_weights
53
+ generate_similarity
54
+ end
55
+
56
+ # Load the data from the index file (clears the index, then retrieves via files).
57
+ #
58
+ def load_from_index_file
59
+ load_from_index_generation_message
60
+ clear
61
+ retrieve
62
+ end
63
+ def load_from_index_generation_message
64
+ timed_exclaim "LOAD INDEX #{identifier}."
65
+ end
66
+ # Retrieves the data into the index.
67
+ #
68
+ def retrieve
69
+ files.retrieve do |id, token|
70
+ initialize_index_for token
71
+ index[token] << id
72
+ end
73
+ end
74
+ def initialize_index_for token
75
+ index[token] ||= []
76
+ end
77
+
78
+ # Generators.
79
+ #
80
+ # TODO Move somewhere more fitting.
81
+ #
82
+
83
+ # Generates a new index (writes its index) using the
84
+ # given partial caching strategy.
85
+ #
86
+ def generate_partial
87
+ generator = Cacher::PartialGenerator.new self.index
88
+ self.index = generator.generate self.partial_strategy
89
+ end
90
+ def generate_partial_from exact_index
91
+ timed_exclaim "PARTIAL GENERATE #{identifier}."
92
+ self.index = exact_index
93
+ self.generate_partial
94
+ self
95
+ end
96
+ # Generates a new similarity index (writes its index) using the
97
+ # given similarity caching strategy.
98
+ #
99
+ def generate_similarity
100
+ generator = Cacher::SimilarityGenerator.new self.index
101
+ self.similarity = generator.generate self.similarity_strategy
102
+ end
103
+ # Generates a new weights index (writes its index) using the
104
+ # given weight caching strategy.
105
+ #
106
+ def generate_weights
107
+ generator = Cacher::WeightsGenerator.new self.index
108
+ self.weights = generator.generate self.weights_strategy
109
+ end
110
+
111
+ # Saves the index in a dump file.
112
+ #
113
+ def dump
114
+ dump_index
115
+ dump_similarity
116
+ dump_weights
117
+ end
118
+ def dump_index
119
+ timed_exclaim "DUMP INDEX #{identifier}."
120
+ files.dump_index index
121
+ end
122
+ def dump_similarity
123
+ timed_exclaim "DUMP SIMILARITY #{identifier}."
124
+ files.dump_similarity similarity
125
+ end
126
+ def dump_weights
127
+ timed_exclaim "DUMP WEIGHTS #{identifier}."
128
+ files.dump_weights weights
129
+ end
130
+
131
+ # Alerts the user if an index is missing.
132
+ #
133
+ def raise_unless_cache_exists
134
+ warn_cache_small :index if files.index_cache_small?
135
+ warn_cache_small :similarity if files.similarity_cache_small?
136
+ warn_cache_small :weights if files.weights_cache_small?
137
+
138
+ raise_cache_missing :index unless files.index_cache_ok?
139
+ raise_cache_missing :similarity unless files.similarity_cache_ok?
140
+ raise_cache_missing :weights unless files.weights_cache_ok?
141
+ end
142
+ def warn_cache_small what
143
+ puts "#{what} cache for #{identifier} smaller than 16 bytes."
144
+ end
145
+ # Raises an appropriate error message.
146
+ #
147
+ def raise_cache_missing what
148
+ raise "#{what} cache for #{identifier} missing."
149
+ end
150
+
151
+ end
152
+ end
@@ -0,0 +1,36 @@
1
+ module Indexing
2
+
3
+ class Categories
4
+
5
+ attr_reader :categories
6
+
7
+ each_delegate :index,
8
+ :cache,
9
+ :generate_caches,
10
+ :backup_caches,
11
+ :restore_caches,
12
+ :check_caches,
13
+ :clear_caches,
14
+ :create_directory_structure,
15
+ :to => :categories
16
+
17
+ def initialize
18
+ @categories = []
19
+ end
20
+
21
+ def << category
22
+ categories << category
23
+ end
24
+
25
+ def find category_name
26
+ category_name = category_name.to_sym
27
+
28
+ categories.each do |category|
29
+ next unless category.name == category_name
30
+ return category
31
+ end
32
+ end
33
+
34
+ end
35
+
36
+ end
@@ -0,0 +1,145 @@
1
+ module Indexing
2
+
3
+ class Category
4
+
5
+ attr_reader :name, :type, :indexed_as, :virtual, :tokenizer, :source, :exact, :partial
6
+
7
+ # TODO Dup the options?
8
+ #
9
+ def initialize name, type, options = {}
10
+ @name = name
11
+ @type = type
12
+
13
+ @source = options[:source]
14
+
15
+ @tokenizer = options[:tokenizer] || Tokenizers::Index.default
16
+ @indexer_class = options[:indexer] || Indexers::Default
17
+ @indexed_as = options[:as] || name
18
+ @virtual = options[:virtual] || false # TODO What is this again?
19
+
20
+ # TODO Push into Bundle.
21
+ #
22
+ partial = options[:partial] || Cacher::Partial::Default
23
+ weights = options[:weights] || Cacher::Weights::Default
24
+ similarity = options[:similarity] || Cacher::Similarity::Default
25
+
26
+ @exact = options[:exact_indexing_bundle] || Bundle.new(:exact, self, type, similarity, Cacher::Partial::None.new, weights)
27
+ @partial = options[:partial_indexing_bundle] || Bundle.new(:partial, self, type, Cacher::Similarity::None.new, partial, weights)
28
+
29
+ # @remove = options[:remove] || false
30
+ # @filter = options[:filter] || true
31
+
32
+ @options = options # TODO Remove?
33
+ end
34
+
35
+ # TODO Move to initializer?
36
+ #
37
+ def identifier
38
+ @identifier ||= "#{type.name} #{name}"
39
+ end
40
+
41
+ # Note: Most of the time the source of the type is used.
42
+ #
43
+ def source
44
+ @source || type.source
45
+ end
46
+
47
+ # TODO Spec.
48
+ #
49
+ def backup_caches
50
+ timed_exclaim "Backing up #{identifier}."
51
+ exact.backup
52
+ partial.backup
53
+ end
54
+ def restore_caches
55
+ timed_exclaim "Restoring #{identifier}."
56
+ exact.restore
57
+ partial.restore
58
+ end
59
+ def check_caches
60
+ timed_exclaim "Checking #{identifier}."
61
+ exact.raise_unless_cache_exists
62
+ partial.raise_unless_cache_exists
63
+ end
64
+ def clear_caches
65
+ timed_exclaim "Deleting #{identifier}."
66
+ exact.delete
67
+ partial.delete
68
+ end
69
+ def create_directory_structure
70
+ timed_exclaim "Creating directory structure for #{identifier}."
71
+ exact.create_directory
72
+ partial.create_directory
73
+ end
74
+
75
+ # Used for testing.
76
+ #
77
+ # TODO Remove?
78
+ #
79
+ def generate_indexes_from_exact_index
80
+ generate_derived_exact
81
+ generate_partial
82
+ generate_derived_partial
83
+ end
84
+ def generate_derived_exact
85
+ exact.generate_derived
86
+ end
87
+ def generate_derived_partial
88
+ partial.generate_derived
89
+ end
90
+
91
+ # Generates all caches for this category.
92
+ #
93
+ def cache
94
+ prepare_cache_directory
95
+ generate_caches
96
+ end
97
+ def generate_caches
98
+ generate_caches_from_source
99
+ generate_partial
100
+ generate_caches_from_memory
101
+ dump_caches
102
+ timed_exclaim "CACHE FINISHED #{identifier}."
103
+ end
104
+ def generate_caches_from_source
105
+ exact.generate_caches_from_source
106
+ end
107
+ def generate_partial
108
+ partial.generate_partial_from exact.index
109
+ end
110
+ def generate_caches_from_memory
111
+ partial.generate_caches_from_memory
112
+ end
113
+ def dump_caches
114
+ exact.dump
115
+ partial.dump
116
+ end
117
+
118
+ # TODO Partially move to type. Duplicate Code in indexers/field.rb.
119
+ #
120
+ def search_index_root
121
+ File.join PICKY_ROOT, 'index'
122
+ end
123
+ def cache_directory
124
+ File.join search_index_root, PICKY_ENVIRONMENT, type.name.to_s
125
+ end
126
+ def search_index_file_name
127
+ File.join cache_directory, "prepared_#{name}_index.txt"
128
+ end
129
+ def index
130
+ prepare_cache_directory
131
+ indexer.index
132
+ end
133
+ def prepare_cache_directory
134
+ FileUtils.mkdir_p cache_directory
135
+ end
136
+ def indexer
137
+ @indexer || @indexer = @indexer_class.new(type, self)
138
+ end
139
+ def virtual?
140
+ !!virtual
141
+ end
142
+
143
+ end
144
+
145
+ end
@@ -0,0 +1,45 @@
1
+ module Indexing
2
+
3
+ class Type
4
+
5
+ attr_reader :name, :source, :categories, :after_indexing
6
+
7
+ # Delegators for indexing.
8
+ #
9
+ delegate :connect_backend,
10
+ :to => :source
11
+
12
+ delegate :index,
13
+ :cache,
14
+ :generate_caches,
15
+ :backup_caches,
16
+ :restore_caches,
17
+ :check_caches,
18
+ :clear_caches,
19
+ :create_directory_structure,
20
+ :to => :categories
21
+
22
+ def initialize name, source, options = {}
23
+ @name = name
24
+ @source = source
25
+
26
+ @after_indexing = options[:after_indexing]
27
+
28
+ @categories = Categories.new
29
+ end
30
+
31
+ # TODO Spec. Doc.
32
+ #
33
+ def add_category name, options = {}
34
+ categories << Category.new(name, self, options)
35
+ end
36
+
37
+ # Indexing.
38
+ #
39
+ def take_snapshot
40
+ source.take_snapshot self
41
+ end
42
+
43
+ end
44
+
45
+ end
@@ -0,0 +1,74 @@
1
+ module Indexing
2
+
3
+ class Types
4
+
5
+ attr_reader :types
6
+
7
+ each_delegate :take_snapshot,
8
+ :generate_caches,
9
+ :backup_caches,
10
+ :restore_caches,
11
+ :check_caches,
12
+ :clear_caches,
13
+ :create_directory_structure,
14
+ :to => :types
15
+
16
+ def initialize
17
+ clear
18
+ end
19
+
20
+ # TODO Spec.
21
+ #
22
+ def clear
23
+ @types = []
24
+ end
25
+
26
+ # TODO Spec. Superclass?
27
+ #
28
+ def register type
29
+ self.types << type
30
+ end
31
+
32
+ # Runs the indexers in parallel (index + cache).
33
+ #
34
+ # TODO Spec.
35
+ #
36
+ def index randomly = true
37
+ take_snapshot
38
+
39
+ # Run in parallel.
40
+ #
41
+ timed_exclaim "INDEXING USING #{Cores.max_processors} PROCESSORS, IN #{randomly ? 'RANDOM' : 'GIVEN'} ORDER."
42
+ Cores.forked self.types, { randomly: randomly } do |type|
43
+ type.index
44
+ type.cache
45
+ end
46
+ timed_exclaim "INDEXING FINISHED."
47
+ end
48
+
49
+ # TODO Spec
50
+ #
51
+ def generate_index_only type_name, field_name
52
+ found = find type_name, field_name
53
+ found.index if found
54
+ end
55
+ def generate_cache_only type_name, category_name
56
+ found = find type_name, field_name
57
+ found.generate_caches if found
58
+ end
59
+
60
+ # TODO Spec
61
+ #
62
+ def find type_name, category_name
63
+ type_name = type_name.to_sym
64
+
65
+ types.each do |type|
66
+ next unless type.name == type_name
67
+
68
+ found = type.categories.find category_name
69
+ return found if found
70
+ end
71
+ end
72
+
73
+ end
74
+ end
data/lib/picky/loader.rb CHANGED
@@ -84,7 +84,7 @@ module Loader
84
84
  def self.load_framework
85
85
  # Load compiled C code.
86
86
  #
87
- require_relative 'initializers/ext'
87
+ require_relative 'ext/maybe_compile'
88
88
 
89
89
  # Load extensions.
90
90
  #
@@ -166,9 +166,23 @@ module Loader
166
166
 
167
167
  # Index types.
168
168
  #
169
+ load_relative 'bundle'
170
+
171
+ load_relative 'indexing/bundle'
172
+ load_relative 'indexing/category'
173
+ load_relative 'indexing/categories'
174
+ load_relative 'indexing/type'
175
+ load_relative 'indexing/types'
176
+
169
177
  load_relative 'index/bundle'
170
178
  load_relative 'index/category'
179
+ load_relative 'index/categories'
171
180
  load_relative 'index/type'
181
+ load_relative 'index/types'
182
+
183
+ load_relative 'types'
184
+ load_relative 'alias_instances'
185
+ load_relative 'type'
172
186
 
173
187
  load_relative 'index/wrappers/exact_first'
174
188
 
@@ -193,7 +207,6 @@ module Loader
193
207
 
194
208
  load_relative 'query/qualifiers'
195
209
  load_relative 'query/weigher'
196
- load_relative 'query/combinator'
197
210
 
198
211
  load_relative 'query/weights'
199
212
 
@@ -219,14 +232,11 @@ module Loader
219
232
  load_relative 'sources/delicious'
220
233
  load_relative 'sources/couch'
221
234
 
222
- # Indexes.
223
- #
224
- load_relative 'indexes'
235
+ load_relative 'sources/wrappers/base'
236
+ load_relative 'sources/wrappers/location'
225
237
 
226
238
  # Configuration.
227
239
  #
228
- load_relative 'configuration/field'
229
- load_relative 'configuration/type'
230
240
  load_relative 'configuration/indexes'
231
241
 
232
242
  # ... in Application.
@@ -17,10 +17,11 @@ module Query
17
17
  # * tokenizer: Tokenizers::Query.default by default.
18
18
  # * weights: A hash of weights, or a Query::Weights object.
19
19
  #
20
- def initialize *index_types
21
- options = Hash === index_types.last ? index_types.pop : {}
22
- @index_types = index_types
23
- @weigher = options[:weigher] || Weigher.new(index_types)
20
+ def initialize *index_type_definitions
21
+ options = Hash === index_type_definitions.last ? index_type_definitions.pop : {}
22
+ indexes = index_type_definitions.map &:index
23
+
24
+ @weigher = options[:weigher] || Weigher.new(indexes)
24
25
  @tokenizer = options[:tokenizer] || Tokenizers::Query.default
25
26
  weights = options[:weights] || Weights.new
26
27
  @weights = Hash === weights ? Weights.new(weights) : weights
@@ -0,0 +1,23 @@
1
+ module Sources
2
+
3
+ module Wrappers
4
+
5
+ class Base
6
+
7
+ attr_reader :backend
8
+
9
+ # Wraps a backend
10
+ #
11
+ def initialize backend
12
+ @backend = backend
13
+ end
14
+
15
+ # Default is delegation for all methods
16
+ #
17
+ delegate :harvest, :connect_backend, :take_snapshot, :to => :backend
18
+
19
+ end
20
+
21
+ end
22
+
23
+ end
@@ -0,0 +1,92 @@
1
+ module Sources
2
+
3
+ module Wrappers
4
+
5
+ class Location < Base
6
+
7
+ attr_reader :precision, :grid
8
+
9
+ # TODO Save min and grid!
10
+ #
11
+ def initialize backend, options = {}
12
+ super backend
13
+
14
+ @user_grid = extract_user_grid options
15
+ @precision = extract_precision options
16
+
17
+ @grid = @user_grid / (@precision + 0.5)
18
+ end
19
+
20
+ #
21
+ #
22
+ def extract_user_grid options
23
+ options[:grid] || raise # TODO
24
+ end
25
+ # Extracts the precision, i.e. how many neighbouring grid cells
26
+ # on each side a location is additionally indexed under.
27
+ # 1 is low (up to 16.6% error), 5 is very high (up to 5% error).
28
+ #
29
+ # We don't recommend using values higher than 5.
30
+ #
31
+ # Default is 1.
32
+ #
33
+ def extract_precision options
34
+ options[:precision] || 1
35
+ end
36
+
37
+ def reset
38
+ @min = 1.0/0
39
+ end
40
+
41
+ # Yield the data (id, text for id) for the given type and field.
42
+ #
43
+ def harvest type, field
44
+ reset
45
+
46
+ # Cache. TODO Make option?
47
+ #
48
+ locations = []
49
+
50
+ # Gather the minimum location and cache all locations.
51
+ #
52
+ backend.harvest type, field do |indexed_id, location|
53
+ location = location.to_f
54
+ @min = location if location < @min
55
+ locations << [indexed_id, location]
56
+ end
57
+
58
+ # Add a margin.
59
+ #
60
+ marginize
61
+
62
+ # Recalculate locations.
63
+ #
64
+ locations.each do |indexed_id, location|
65
+ locations_for(location).each do |new_location|
66
+ yield indexed_id, new_location.to_s
67
+ end
68
+ end
69
+ end
70
+
71
+ def marginize
72
+ @min -= @user_grid
73
+ end
74
+
75
+ # Put location onto multiple places on a grid.
76
+ #
77
+ # Note: Returns an array of grid positions (integers for an integer precision).
78
+ #
79
+ def locations_for location
80
+ new_location = ((location - @min) / grid).floor
81
+
82
+ min_location = new_location - precision
83
+ max_location = new_location + precision
84
+
85
+ (min_location..max_location).to_a
86
+ end
87
+
88
+ end
89
+
90
+ end
91
+
92
+ end