picky 2.6.0 → 2.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (90) hide show
  1. data/lib/picky/analyzer.rb +4 -4
  2. data/lib/picky/application.rb +6 -7
  3. data/lib/picky/backend/{backend.rb → base.rb} +31 -14
  4. data/lib/picky/backend/file/basic.rb +12 -4
  5. data/lib/picky/backend/file/json.rb +5 -5
  6. data/lib/picky/backend/file/text.rb +1 -1
  7. data/lib/picky/backend/files.rb +3 -9
  8. data/lib/picky/backend/redis/basic.rb +8 -0
  9. data/lib/picky/backend/redis/list_hash.rb +5 -5
  10. data/lib/picky/backend/redis/string_hash.rb +5 -5
  11. data/lib/picky/backend/redis.rb +5 -5
  12. data/lib/picky/bundle.rb +62 -0
  13. data/lib/picky/categories.rb +10 -9
  14. data/lib/picky/categories_indexed.rb +12 -7
  15. data/lib/picky/categories_indexing.rb +7 -9
  16. data/lib/picky/category.rb +38 -26
  17. data/lib/picky/category_indexed.rb +4 -20
  18. data/lib/picky/category_indexing.rb +71 -68
  19. data/lib/picky/generators/base.rb +6 -6
  20. data/lib/picky/generators/partial/substring.rb +28 -26
  21. data/lib/picky/generators/partial_generator.rb +3 -3
  22. data/lib/picky/generators/similarity/phonetic.rb +5 -5
  23. data/lib/picky/generators/similarity_generator.rb +2 -2
  24. data/lib/picky/generators/weights/logarithmic.rb +3 -3
  25. data/lib/picky/generators/weights_generator.rb +2 -2
  26. data/lib/picky/index/base.rb +13 -10
  27. data/lib/picky/index/base_indexed.rb +2 -0
  28. data/lib/picky/index/base_indexing.rb +65 -57
  29. data/lib/picky/indexed/bundle/base.rb +21 -86
  30. data/lib/picky/indexed/bundle/memory.rb +5 -12
  31. data/lib/picky/indexed/bundle/redis.rb +42 -0
  32. data/lib/picky/indexed/wrappers/bundle/wrapper.rb +3 -3
  33. data/lib/picky/indexers/base.rb +20 -3
  34. data/lib/picky/indexers/parallel.rb +32 -14
  35. data/lib/picky/indexers/serial.rb +29 -26
  36. data/lib/picky/indexes.rb +5 -3
  37. data/lib/picky/indexes_indexed.rb +3 -15
  38. data/lib/picky/indexes_indexing.rb +18 -21
  39. data/lib/picky/indexing/bundle/base.rb +64 -45
  40. data/lib/picky/indexing/bundle/memory.rb +0 -4
  41. data/lib/picky/loader.rb +7 -6
  42. data/lib/picky/query/allocation.rb +3 -3
  43. data/lib/picky/query/token.rb +5 -1
  44. data/lib/picky/search.rb +5 -0
  45. data/lib/picky/sources/base.rb +21 -2
  46. data/lib/picky/sources/db.rb +0 -7
  47. data/lib/picky/statistics.rb +9 -12
  48. data/lib/picky/tokenizers/location.rb +1 -1
  49. data/lib/tasks/checks.rake +8 -6
  50. data/lib/tasks/index.rake +14 -20
  51. data/lib/tasks/server.rake +18 -2
  52. data/lib/tasks/statistics.rake +27 -14
  53. data/lib/tasks/todo.rake +2 -2
  54. data/lib/tasks/try.rake +12 -27
  55. data/spec/lib/application_spec.rb +1 -1
  56. data/spec/lib/backend/file/basic_spec.rb +6 -6
  57. data/spec/lib/backend/file/json_spec.rb +11 -6
  58. data/spec/lib/backend/file/marshal_spec.rb +11 -6
  59. data/spec/lib/backend/files_spec.rb +21 -7
  60. data/spec/lib/backend/redis/basic_spec.rb +6 -0
  61. data/spec/lib/backend/redis/list_hash_spec.rb +9 -3
  62. data/spec/lib/backend/redis/string_hash_spec.rb +7 -1
  63. data/spec/lib/backend/redis_spec.rb +22 -12
  64. data/spec/lib/categories_indexed_spec.rb +2 -2
  65. data/spec/lib/category_indexing_spec.rb +12 -33
  66. data/spec/lib/category_spec.rb +22 -0
  67. data/spec/lib/index/base_indexing_spec.rb +30 -0
  68. data/spec/lib/indexed/bundle/memory_spec.rb +13 -20
  69. data/spec/lib/indexers/base_spec.rb +39 -4
  70. data/spec/lib/indexers/parallel_spec.rb +2 -10
  71. data/spec/lib/indexers/serial_spec.rb +11 -26
  72. data/spec/lib/indexes_class_spec.rb +4 -4
  73. data/spec/lib/indexes_indexed_spec.rb +2 -2
  74. data/spec/lib/indexes_indexing_spec.rb +6 -10
  75. data/spec/lib/indexes_spec.rb +3 -3
  76. data/spec/lib/indexing/bundle/{super_base_spec.rb → base_spec.rb} +2 -2
  77. data/spec/lib/indexing/bundle/memory_partial_generation_speed_spec.rb +3 -3
  78. data/spec/lib/indexing/bundle/memory_spec.rb +16 -14
  79. data/spec/lib/indexing/bundle/redis_spec.rb +18 -16
  80. data/spec/lib/query/allocation_spec.rb +1 -1
  81. data/spec/lib/query/token_spec.rb +5 -7
  82. data/spec/lib/sources/base_spec.rb +53 -0
  83. data/spec/lib/sources/db_spec.rb +0 -7
  84. metadata +11 -12
  85. data/lib/picky/indexers/solr.rb +0 -56
  86. data/lib/picky/indexing/bundle/super_base.rb +0 -61
  87. data/lib/picky/solr/schema_generator.rb +0 -74
  88. data/lib/tasks/search.rake +0 -9
  89. data/lib/tasks/shortcuts.rake +0 -32
  90. data/lib/tasks/solr.rake +0 -36
@@ -3,33 +3,67 @@ module Index
3
3
  #
4
4
  #
5
5
  class Base
6
-
6
+
7
7
  attr_reader :after_indexing,
8
- :bundle_class,
9
- :tokenizer
10
-
8
+ :bundle_class
9
+
11
10
  # Delegators for indexing.
12
11
  #
13
- delegate :backup_caches,
14
- :cache,
15
- :check_caches,
16
- :clear_caches,
17
- :create_directory_structure,
18
- :generate_caches,
19
- :restore_caches,
12
+ delegate :cache,
13
+ :check,
14
+ :clear,
15
+ :backup,
16
+ :restore,
20
17
  :to => :categories
21
-
22
- delegate :connect_backend,
23
- :to => :source
24
-
25
- # Calling index on an index will
26
- # * prepare (the data)
27
- # * cache (the data)
18
+
19
+ # Calling index on an index will call index
28
20
  # on every category.
29
21
  #
22
+ # Decides whether to use a parallel indexer or whether to
23
+ # delegate to each category to index themselves.
24
+ #
30
25
  def index
31
- prepare
32
- cache
26
+ if source.respond_to?(:each)
27
+ check_source_empty
28
+ index_in_parallel
29
+ else
30
+ with_data_snapshot do
31
+ categories.each &:index
32
+ end
33
+ end
34
+ end
35
+
36
+ # Check if the given enumerable source is empty.
37
+ #
38
+ # Note: Checking as early as possible to tell the
39
+ # user as early as possible.
40
+ #
41
+ def check_source_empty
42
+ warn %Q{\n\033[1mWarning\033[m, source for index "#{name}" is empty: #{source} (responds true to empty?).\n} if source.respond_to?(:empty?) && source.empty?
43
+ end
44
+
45
+ # Note: Duplicated in category_indexing.rb.
46
+ #
47
+ # Take a data snapshot if the source offers it.
48
+ #
49
+ def with_data_snapshot
50
+ if source.respond_to? :with_snapshot
51
+ source.with_snapshot(self) do
52
+ yield
53
+ end
54
+ else
55
+ yield
56
+ end
57
+ end
58
+
59
+ # Indexes the categories in parallel.
60
+ #
61
+ # Only use where the category does have a #each source defined.
62
+ #
63
+ def index_in_parallel
64
+ indexer = Indexers::Parallel.new self
65
+ indexer.index categories
66
+ categories.each &:cache
33
67
  end
34
68
 
35
69
  # Define an index tokenizer on the index.
@@ -40,7 +74,15 @@ module Index
40
74
  @tokenizer = Tokenizers::Index.new options
41
75
  end
42
76
  alias define_indexing indexing
43
-
77
+
78
+ # Returns the installed tokenizer or the default.
79
+ #
80
+ # TODO Spec.
81
+ #
82
+ def tokenizer
83
+ @tokenizer || Indexes.tokenizer
84
+ end
85
+
44
86
  # Define a source on the index.
45
87
  #
46
88
  # Parameter is a source, either one of the standard sources or
@@ -68,7 +110,7 @@ end
68
110
  NO_SOURCE
69
111
  )
70
112
  end
71
-
113
+
72
114
  # Define a key_format on the index.
73
115
  #
74
116
  # Parameter is a method name to use on the key (e.g. :to_i, :to_s, :strip).
@@ -79,41 +121,7 @@ end
79
121
  def define_key_format key_format
80
122
  @key_format = key_format
81
123
  end
82
-
83
- # Decides whether to use a parallel indexer or whether to
84
- # delegate to each category to index themselves.
85
- #
86
- # TODO Rename to prepare.
87
- #
88
- def prepare
89
- # TODO Duplicated in category.rb def indexer.
90
- #
91
- if source.respond_to?(:each)
92
- warn %Q{\n\033[1mWarning\033[m, source for index "#{name}" is empty: #{source} (responds true to empty?).\n} if source.respond_to?(:empty?) && source.empty?
93
- index_parallel
94
- else
95
- categories.each &:prepare
96
- end
97
- end
98
-
99
- # Indexes the categories in parallel.
100
- #
101
- # Only use where the category does not have a non-#each source defined.
102
- #
103
- def index_parallel
104
- indexer = Indexers::Parallel.new self
105
- categories.first.prepare_index_directory # TODO Unnice.
106
- indexer.index
107
- end
108
124
 
109
- # Indexing.
110
- #
111
- # Note: If it is an each source we do not take a snapshot.
112
- #
113
- def take_snapshot
114
- source.take_snapshot self unless source.respond_to? :each
115
- end
116
-
117
125
  end
118
-
126
+
119
127
  end
@@ -1,106 +1,41 @@
1
+ # TODO Extract duplicate code from base bundle classes.
2
+ #
1
3
  module Indexed # :nodoc:all
2
4
 
3
- # A Bundle is a number of indexes
4
- # per [index, category] combination.
5
+ # An indexed bundle is a number of memory/redis
6
+ # indexes that compose the indexes for a single category:
7
+ # * core (inverted) index
8
+ # * weights index
9
+ # * similarity index
10
+ # * index configuration
5
11
  #
6
- # At most, there are three indexes:
7
- # * *core* index (always used)
8
- # * *weights* index (always used)
9
- # * *similarity* index (used with similarity)
12
+ # Indexed refers to them being indexed.
13
+ # This class notably offers the methods:
14
+ # * load
15
+ # * clear
10
16
  #
11
- # In Picky, indexing is separated from the index
12
- # handling itself through a parallel structure.
13
- #
14
- # Both use methods provided by this base class, but
15
- # have very different goals:
16
- #
17
- # * *Indexing*::*Bundle* is just concerned with creating index files
18
- # and providing helper functions to e.g. check the indexes.
19
- #
20
- # * *Index*::*Bundle* is concerned with loading these index files into
21
- # memory and looking up search data as fast as possible.
17
+ # To (re)load or clear the current indexes.
22
18
  #
23
19
  module Bundle
24
20
 
25
- class Base
26
-
27
- attr_reader :identifier, :configuration
28
- attr_accessor :similarity_strategy
29
- attr_accessor :index, :weights, :similarity, :configuration
30
-
31
- delegate :[], :to => :configuration
32
- delegate :size, :to => :index
33
-
34
- def initialize name, category, similarity_strategy
35
- @identifier = "#{category.identifier}:#{name}"
36
-
37
- @index = {}
38
- @weights = {}
39
- @similarity = {}
40
-
41
- @similarity_strategy = similarity_strategy
42
- end
43
-
44
- # Get a list of similar texts.
45
- #
46
- # Note: Does not return itself.
47
- #
48
- def similar text
49
- code = similarity_strategy.encoded text
50
- similar_codes = code && @similarity[code]
51
- similar_codes.delete text if similar_codes
52
- similar_codes || []
53
- end
21
+ class Base < ::Bundle
54
22
 
55
23
  # Loads all indexes.
56
24
  #
57
25
  def load
58
- load_index
26
+ load_inverted
59
27
  load_weights
60
28
  load_similarity
61
29
  load_configuration
62
30
  end
63
31
 
64
- # Loads the core index.
65
- #
66
- def load_index
67
- # No loading needed.
68
- end
69
- # Loads the weights index.
70
- #
71
- def load_weights
72
- # No loading needed.
73
- end
74
- # Loads the similarity index.
75
- #
76
- def load_similarity
77
- # No loading needed.
78
- end
79
- # Loads the configuration.
80
- #
81
- def load_configuration
82
- # No loading needed.
83
- end
84
-
85
- # Loads the core index.
86
- #
87
- def clear_index
88
- # No loading needed.
89
- end
90
- # Loads the weights index.
91
- #
92
- def clear_weights
93
- # No loading needed.
94
- end
95
- # Loads the similarity index.
96
- #
97
- def clear_similarity
98
- # No loading needed.
99
- end
100
- # Loads the configuration.
32
+ # Clears all indexes.
101
33
  #
102
- def clear_configuration
103
- # No loading needed.
34
+ def clear
35
+ clear_inverted
36
+ clear_weights
37
+ clear_similarity
38
+ clear_configuration
104
39
  end
105
40
 
106
41
  end
@@ -24,17 +24,10 @@ module Indexed # :nodoc:all
24
24
  @backend = Backend::Files.new name, configuration
25
25
  end
26
26
 
27
- def to_s
28
- <<-MEMORY
29
- Memory
30
- #{@backend.indented_to_s}
31
- MEMORY
32
- end
33
-
34
27
  # Get the ids for the given symbol.
35
28
  #
36
29
  def ids sym
37
- @index[sym] || []
30
+ @inverted[sym] || []
38
31
  end
39
32
  # Get a weight for the given symbol.
40
33
  #
@@ -44,8 +37,8 @@ MEMORY
44
37
 
45
38
  # Loads the core index.
46
39
  #
47
- def load_index
48
- self.index = @backend.load_index
40
+ def load_inverted
41
+ self.inverted = @backend.load_inverted
49
42
  end
50
43
  # Loads the weights index.
51
44
  #
@@ -65,8 +58,8 @@ MEMORY
65
58
 
66
59
  # Clears the core index.
67
60
  #
68
- def clear_index
69
- self.index = {}
61
+ def clear_inverted
62
+ self.inverted = {}
70
63
  end
71
64
  # Loads the weights index.
72
65
  #
@@ -38,6 +38,48 @@ module Indexed # :nodoc:all
38
38
  @backend.setting sym
39
39
  end
40
40
 
41
+ # Loads the inverted index.
42
+ #
43
+ def load_inverted
44
+ # No loading needed.
45
+ end
46
+ # Loads the weights index.
47
+ #
48
+ def load_weights
49
+ # No loading needed.
50
+ end
51
+ # Loads the similarity index.
52
+ #
53
+ def load_similarity
54
+ # No loading needed.
55
+ end
56
+ # Loads the configuration.
57
+ #
58
+ def load_configuration
59
+ # No loading needed.
60
+ end
61
+
62
+ # Clears the inverted index.
63
+ #
64
+ def clear_inverted
65
+ # No clearing possible, currently.
66
+ end
67
+ # Clears the weights index.
68
+ #
69
+ def clear_weights
70
+ # No clearing possible, currently.
71
+ end
72
+ # Clears the similarity index.
73
+ #
74
+ def clear_similarity
75
+ # No clearing possible, currently.
76
+ end
77
+ # Clears the configuration.
78
+ #
79
+ def clear_configuration
80
+ # No clearing possible, currently.
81
+ end
82
+
41
83
  end
42
84
 
43
85
  end
@@ -16,11 +16,11 @@ module Indexed
16
16
  end
17
17
 
18
18
  delegate :load,
19
- :load_index,
19
+ :load_inverted,
20
20
  :load_weights,
21
21
  :load_similarity,
22
22
  :load_configuration,
23
- :clear_index,
23
+ :clear_inverted,
24
24
  :clear_weights,
25
25
  :clear_similarity,
26
26
  :clear_configuration,
@@ -29,7 +29,7 @@ module Indexed
29
29
  :identifier,
30
30
  :analyze,
31
31
  :size,
32
- :index,
32
+ :inverted,
33
33
  :weights,
34
34
  :similarity,
35
35
  :configuration,
@@ -6,11 +6,28 @@ module Indexers
6
6
  #
7
7
  class Base
8
8
 
9
+ attr_reader :index_or_category
10
+
11
+ delegate :source, :to => :index_or_category
12
+
13
+ def initialize index_or_category
14
+ @index_or_category = index_or_category
15
+ end
16
+
9
17
  # Starts the indexing process.
10
18
  #
11
- def index
12
- indexing_message
13
- process
19
+ def index categories
20
+ start_indexing_message
21
+ prepare categories
22
+ process categories
23
+ finish_indexing_message
24
+ end
25
+
26
+ # By default, an indexer
27
+ # * prepares the index directories.
28
+ #
29
+ def prepare categories
30
+ categories.each &:prepare_index_directory
14
31
  end
15
32
 
16
33
  end
@@ -1,4 +1,4 @@
1
- # encoding: utf-8
1
+ # encoding: utf-8
2
2
  #
3
3
  module Indexers
4
4
 
@@ -6,27 +6,35 @@ module Indexers
6
6
  #
7
7
  # The tokenizer is taken from each category if specified, from the index, if not.
8
8
  #
9
- # TODO Think about this one more. It should work on an index, but also a single category.
10
- #
11
9
  class Parallel < Base
12
10
 
13
- delegate :categories, :source, :to => :@index
14
-
15
- def initialize index
16
- @index = index
17
- end
18
-
19
- def process
11
+ # Process does the actual indexing.
12
+ #
13
+ # Parameters:
14
+ # * categories: An Enumerable of Category-s.
15
+ #
16
+ def process categories
20
17
  comma = ?,
21
18
  newline = ?\n
22
19
 
23
20
  # Prepare a combined object - array.
24
21
  #
25
- combined = categories.map { |category| [category, [], category.prepared_index_file, (category.tokenizer || tokenizer)] }
22
+ combined = categories.map do |category|
23
+ [category, [], category.prepared_index_file, (category.tokenizer || tokenizer)]
24
+ end
26
25
 
27
26
  # Index.
28
27
  #
28
+ # TODO Extract into flush_every(100_000) do
29
+ #
29
30
  i = 0
31
+
32
+ # Explicitly reset the source to avoid caching trouble.
33
+ #
34
+ source.reset if source.respond_to?(:reset)
35
+
36
+ # Go through each object in the source.
37
+ #
30
38
  source.each do |object|
31
39
  id = object.id
32
40
 
@@ -48,17 +56,27 @@ module Indexers
48
56
  i += 1
49
57
  end
50
58
  flush combined
51
- combined.each { |_, _, file, _| file.close }
59
+ combined.each do |_, _, file, _|
60
+ timed_exclaim %Q{"#{@index_or_category.identifier}": => #{file.path}.}
61
+ file.close
62
+ end
52
63
  end
64
+
65
+ # Flush the combined array into the file.
66
+ #
53
67
  def flush combined # :nodoc:
54
68
  combined.each do |_, cache, file, _|
55
69
  file.write(cache.join) && cache.clear
56
70
  end
57
71
  end
72
+
58
73
  #
59
74
  #
60
- def indexing_message # :nodoc:
61
- timed_exclaim %Q{"#{@index.name}": Starting parallel indexing.}
75
+ def start_indexing_message # :nodoc:
76
+ timed_exclaim %Q{"#{@index_or_category.identifier}": Starting parallel data preparation.}
77
+ end
78
+ def finish_indexing_message # :nodoc:
79
+ timed_exclaim %Q{"#{@index_or_category.identifier}": Finished parallel data preparation.}
62
80
  end
63
81
 
64
82
  end
@@ -8,44 +8,47 @@ module Indexers
8
8
  #
9
9
  class Serial < Base
10
10
 
11
- attr_reader :category
12
-
13
- delegate :source, :to => :category
14
-
15
- def initialize category
16
- @category = category
17
- end
18
-
19
- # The tokenizer used is a cached tokenizer from the category.
20
- #
21
- def tokenizer
22
- @tokenizer ||= category.tokenizer
23
- end
24
-
25
11
  # Harvest the data from the source, tokenize,
26
12
  # and write to an intermediate "prepared index" file.
27
13
  #
28
- def process
14
+ # Parameters:
15
+ # * categories: An enumerable of Category-s.
16
+ #
17
+ def process categories
29
18
  comma = ?,
30
19
  newline = ?\n
31
20
 
32
- local_tokenizer = tokenizer
33
- category.prepared_index_file do |file|
34
- result = []
35
- source.harvest(category) do |indexed_id, text|
36
- local_tokenizer.tokenize(text).each do |token_text|
37
- next unless token_text
38
- result << indexed_id << comma << token_text << newline
21
+ categories.each do |category|
22
+
23
+ tokenizer = category.tokenizer
24
+
25
+ category.prepared_index_file do |file|
26
+ result = []
27
+
28
+ source.harvest(category) do |indexed_id, text|
29
+ tokenizer.tokenize(text).each do |token_text|
30
+ next unless token_text
31
+ result << indexed_id << comma << token_text << newline
32
+ end
33
+ file.write(result.join) && result.clear if result.size > 100_000
39
34
  end
40
- file.write(result.join) && result.clear if result.size > 100_000
35
+
36
+ timed_exclaim %Q{"#{@index_or_category.identifier}": => #{file.path}.}
37
+
38
+ file.write result.join
41
39
  end
42
- file.write result.join
40
+
43
41
  end
42
+
44
43
  end
44
+
45
45
  #
46
46
  #
47
- def indexing_message # :nodoc:
48
- timed_exclaim %Q{"#{@category.identifier}": Starting serial indexing.}
47
+ def start_indexing_message # :nodoc:
48
+ timed_exclaim %Q{"#{@index_or_category.identifier}": Starting serial data preparation.}
49
+ end
50
+ def finish_indexing_message # :nodoc:
51
+ timed_exclaim %Q{"#{@index_or_category.identifier}": Finished serial data preparation.}
49
52
  end
50
53
 
51
54
  end
data/lib/picky/indexes.rb CHANGED
@@ -14,10 +14,11 @@ class Indexes
14
14
  :to => :indexes
15
15
 
16
16
  each_delegate :reindex,
17
+ :each_category,
17
18
  :to => :indexes
18
19
 
19
20
  def initialize
20
- clear
21
+ clear_indexes
21
22
  end
22
23
 
23
24
  # Return the Indexes instance.
@@ -32,11 +33,12 @@ class Indexes
32
33
  :[],
33
34
  :to_s,
34
35
  :size,
35
- :each
36
+ :each,
37
+ :each_category
36
38
 
37
39
  # Clears the indexes and the mapping.
38
40
  #
39
- def clear
41
+ def clear_indexes
40
42
  @indexes = []
41
43
  @index_mapping = {}
42
44
  end
@@ -1,29 +1,17 @@
1
1
  # Registers the indexes held at runtime, for queries.
2
2
  #
3
3
  class Indexes
4
-
4
+
5
5
  instance_delegate :load_from_cache,
6
6
  :reload,
7
7
  :analyze
8
-
8
+
9
9
  each_delegate :load_from_cache,
10
10
  :to => :indexes
11
-
11
+
12
12
  # Reloads all indexes, one after another,
13
13
  # in the order they were added.
14
14
  #
15
15
  alias reload load_from_cache
16
16
 
17
- # Load each index, and analyze it.
18
- #
19
- # Returns a hash with the findings.
20
- #
21
- def analyze
22
- result = {}
23
- indexes.each do |index|
24
- index.analyze result
25
- end
26
- result
27
- end
28
-
29
17
  end