picky 2.6.0 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. data/lib/picky/analyzer.rb +4 -4
  2. data/lib/picky/application.rb +6 -7
  3. data/lib/picky/backend/{backend.rb → base.rb} +31 -14
  4. data/lib/picky/backend/file/basic.rb +12 -4
  5. data/lib/picky/backend/file/json.rb +5 -5
  6. data/lib/picky/backend/file/text.rb +1 -1
  7. data/lib/picky/backend/files.rb +3 -9
  8. data/lib/picky/backend/redis/basic.rb +8 -0
  9. data/lib/picky/backend/redis/list_hash.rb +5 -5
  10. data/lib/picky/backend/redis/string_hash.rb +5 -5
  11. data/lib/picky/backend/redis.rb +5 -5
  12. data/lib/picky/bundle.rb +62 -0
  13. data/lib/picky/categories.rb +10 -9
  14. data/lib/picky/categories_indexed.rb +12 -7
  15. data/lib/picky/categories_indexing.rb +7 -9
  16. data/lib/picky/category.rb +38 -26
  17. data/lib/picky/category_indexed.rb +4 -20
  18. data/lib/picky/category_indexing.rb +71 -68
  19. data/lib/picky/generators/base.rb +6 -6
  20. data/lib/picky/generators/partial/substring.rb +28 -26
  21. data/lib/picky/generators/partial_generator.rb +3 -3
  22. data/lib/picky/generators/similarity/phonetic.rb +5 -5
  23. data/lib/picky/generators/similarity_generator.rb +2 -2
  24. data/lib/picky/generators/weights/logarithmic.rb +3 -3
  25. data/lib/picky/generators/weights_generator.rb +2 -2
  26. data/lib/picky/index/base.rb +13 -10
  27. data/lib/picky/index/base_indexed.rb +2 -0
  28. data/lib/picky/index/base_indexing.rb +65 -57
  29. data/lib/picky/indexed/bundle/base.rb +21 -86
  30. data/lib/picky/indexed/bundle/memory.rb +5 -12
  31. data/lib/picky/indexed/bundle/redis.rb +42 -0
  32. data/lib/picky/indexed/wrappers/bundle/wrapper.rb +3 -3
  33. data/lib/picky/indexers/base.rb +20 -3
  34. data/lib/picky/indexers/parallel.rb +32 -14
  35. data/lib/picky/indexers/serial.rb +29 -26
  36. data/lib/picky/indexes.rb +5 -3
  37. data/lib/picky/indexes_indexed.rb +3 -15
  38. data/lib/picky/indexes_indexing.rb +18 -21
  39. data/lib/picky/indexing/bundle/base.rb +64 -45
  40. data/lib/picky/indexing/bundle/memory.rb +0 -4
  41. data/lib/picky/loader.rb +7 -6
  42. data/lib/picky/query/allocation.rb +3 -3
  43. data/lib/picky/query/token.rb +5 -1
  44. data/lib/picky/search.rb +5 -0
  45. data/lib/picky/sources/base.rb +21 -2
  46. data/lib/picky/sources/db.rb +0 -7
  47. data/lib/picky/statistics.rb +9 -12
  48. data/lib/picky/tokenizers/location.rb +1 -1
  49. data/lib/tasks/checks.rake +8 -6
  50. data/lib/tasks/index.rake +14 -20
  51. data/lib/tasks/server.rake +18 -2
  52. data/lib/tasks/statistics.rake +27 -14
  53. data/lib/tasks/todo.rake +2 -2
  54. data/lib/tasks/try.rake +12 -27
  55. data/spec/lib/application_spec.rb +1 -1
  56. data/spec/lib/backend/file/basic_spec.rb +6 -6
  57. data/spec/lib/backend/file/json_spec.rb +11 -6
  58. data/spec/lib/backend/file/marshal_spec.rb +11 -6
  59. data/spec/lib/backend/files_spec.rb +21 -7
  60. data/spec/lib/backend/redis/basic_spec.rb +6 -0
  61. data/spec/lib/backend/redis/list_hash_spec.rb +9 -3
  62. data/spec/lib/backend/redis/string_hash_spec.rb +7 -1
  63. data/spec/lib/backend/redis_spec.rb +22 -12
  64. data/spec/lib/categories_indexed_spec.rb +2 -2
  65. data/spec/lib/category_indexing_spec.rb +12 -33
  66. data/spec/lib/category_spec.rb +22 -0
  67. data/spec/lib/index/base_indexing_spec.rb +30 -0
  68. data/spec/lib/indexed/bundle/memory_spec.rb +13 -20
  69. data/spec/lib/indexers/base_spec.rb +39 -4
  70. data/spec/lib/indexers/parallel_spec.rb +2 -10
  71. data/spec/lib/indexers/serial_spec.rb +11 -26
  72. data/spec/lib/indexes_class_spec.rb +4 -4
  73. data/spec/lib/indexes_indexed_spec.rb +2 -2
  74. data/spec/lib/indexes_indexing_spec.rb +6 -10
  75. data/spec/lib/indexes_spec.rb +3 -3
  76. data/spec/lib/indexing/bundle/{super_base_spec.rb → base_spec.rb} +2 -2
  77. data/spec/lib/indexing/bundle/memory_partial_generation_speed_spec.rb +3 -3
  78. data/spec/lib/indexing/bundle/memory_spec.rb +16 -14
  79. data/spec/lib/indexing/bundle/redis_spec.rb +18 -16
  80. data/spec/lib/query/allocation_spec.rb +1 -1
  81. data/spec/lib/query/token_spec.rb +5 -7
  82. data/spec/lib/sources/base_spec.rb +53 -0
  83. data/spec/lib/sources/db_spec.rb +0 -7
  84. metadata +11 -12
  85. data/lib/picky/indexers/solr.rb +0 -56
  86. data/lib/picky/indexing/bundle/super_base.rb +0 -61
  87. data/lib/picky/solr/schema_generator.rb +0 -74
  88. data/lib/tasks/search.rake +0 -9
  89. data/lib/tasks/shortcuts.rake +0 -32
  90. data/lib/tasks/solr.rake +0 -36
@@ -3,33 +3,67 @@ module Index
3
3
  #
4
4
  #
5
5
  class Base
6
-
6
+
7
7
  attr_reader :after_indexing,
8
- :bundle_class,
9
- :tokenizer
10
-
8
+ :bundle_class
9
+
11
10
  # Delegators for indexing.
12
11
  #
13
- delegate :backup_caches,
14
- :cache,
15
- :check_caches,
16
- :clear_caches,
17
- :create_directory_structure,
18
- :generate_caches,
19
- :restore_caches,
12
+ delegate :cache,
13
+ :check,
14
+ :clear,
15
+ :backup,
16
+ :restore,
20
17
  :to => :categories
21
-
22
- delegate :connect_backend,
23
- :to => :source
24
-
25
- # Calling index on an index will
26
- # * prepare (the data)
27
- # * cache (the data)
18
+
19
+ # Calling index on an index will call index
28
20
  # on every category.
29
21
  #
22
+ # Decides whether to use a parallel indexer or whether to
23
+ # delegate to each category to index themselves.
24
+ #
30
25
  def index
31
- prepare
32
- cache
26
+ if source.respond_to?(:each)
27
+ check_source_empty
28
+ index_in_parallel
29
+ else
30
+ with_data_snapshot do
31
+ categories.each &:index
32
+ end
33
+ end
34
+ end
35
+
36
+ # Check if the given enumerable source is empty.
37
+ #
38
+ # Note: Checking as early as possible to tell the
39
+ # user as early as possible.
40
+ #
41
+ def check_source_empty
42
+ warn %Q{\n\033[1mWarning\033[m, source for index "#{name}" is empty: #{source} (responds true to empty?).\n} if source.respond_to?(:empty?) && source.empty?
43
+ end
44
+
45
+ # Note: Duplicated in category_indexing.rb.
46
+ #
47
+ # Take a data snapshot if the source offers it.
48
+ #
49
+ def with_data_snapshot
50
+ if source.respond_to? :with_snapshot
51
+ source.with_snapshot(self) do
52
+ yield
53
+ end
54
+ else
55
+ yield
56
+ end
57
+ end
58
+
59
+ # Indexes the categories in parallel.
60
+ #
61
+ # Only use where the category does have a #each source defined.
62
+ #
63
+ def index_in_parallel
64
+ indexer = Indexers::Parallel.new self
65
+ indexer.index categories
66
+ categories.each &:cache
33
67
  end
34
68
 
35
69
  # Define an index tokenizer on the index.
@@ -40,7 +74,15 @@ module Index
40
74
  @tokenizer = Tokenizers::Index.new options
41
75
  end
42
76
  alias define_indexing indexing
43
-
77
+
78
+ # Returns the installed tokenizer or the default.
79
+ #
80
+ # TODO Spec.
81
+ #
82
+ def tokenizer
83
+ @tokenizer || Indexes.tokenizer
84
+ end
85
+
44
86
  # Define a source on the index.
45
87
  #
46
88
  # Parameter is a source, either one of the standard sources or
@@ -68,7 +110,7 @@ end
68
110
  NO_SOURCE
69
111
  )
70
112
  end
71
-
113
+
72
114
  # Define a key_format on the index.
73
115
  #
74
116
  # Parameter is a method name to use on the key (e.g. :to_i, :to_s, :strip).
@@ -79,41 +121,7 @@ end
79
121
  def define_key_format key_format
80
122
  @key_format = key_format
81
123
  end
82
-
83
- # Decides whether to use a parallel indexer or whether to
84
- # delegate to each category to index themselves.
85
- #
86
- # TODO Rename to prepare.
87
- #
88
- def prepare
89
- # TODO Duplicated in category.rb def indexer.
90
- #
91
- if source.respond_to?(:each)
92
- warn %Q{\n\033[1mWarning\033[m, source for index "#{name}" is empty: #{source} (responds true to empty?).\n} if source.respond_to?(:empty?) && source.empty?
93
- index_parallel
94
- else
95
- categories.each &:prepare
96
- end
97
- end
98
-
99
- # Indexes the categories in parallel.
100
- #
101
- # Only use where the category does not have a non-#each source defined.
102
- #
103
- def index_parallel
104
- indexer = Indexers::Parallel.new self
105
- categories.first.prepare_index_directory # TODO Unnice.
106
- indexer.index
107
- end
108
124
 
109
- # Indexing.
110
- #
111
- # Note: If it is an each source we do not take a snapshot.
112
- #
113
- def take_snapshot
114
- source.take_snapshot self unless source.respond_to? :each
115
- end
116
-
117
125
  end
118
-
126
+
119
127
  end
@@ -1,106 +1,41 @@
1
+ # TODO Extract duplicate code from base bundle classes.
2
+ #
1
3
  module Indexed # :nodoc:all
2
4
 
3
- # A Bundle is a number of indexes
4
- # per [index, category] combination.
5
+ # An indexed bundle is a number of memory/redis
6
+ # indexes that compose the indexes for a single category:
7
+ # * core (inverted) index
8
+ # * weights index
9
+ # * similarity index
10
+ # * index configuration
5
11
  #
6
- # At most, there are three indexes:
7
- # * *core* index (always used)
8
- # * *weights* index (always used)
9
- # * *similarity* index (used with similarity)
12
+ # Indexed refers to them being indexed.
13
+ # This class notably offers the methods:
14
+ # * load
15
+ # * clear
10
16
  #
11
- # In Picky, indexing is separated from the index
12
- # handling itself through a parallel structure.
13
- #
14
- # Both use methods provided by this base class, but
15
- # have very different goals:
16
- #
17
- # * *Indexing*::*Bundle* is just concerned with creating index files
18
- # and providing helper functions to e.g. check the indexes.
19
- #
20
- # * *Index*::*Bundle* is concerned with loading these index files into
21
- # memory and looking up search data as fast as possible.
17
+ # To (re)load or clear the current indexes.
22
18
  #
23
19
  module Bundle
24
20
 
25
- class Base
26
-
27
- attr_reader :identifier, :configuration
28
- attr_accessor :similarity_strategy
29
- attr_accessor :index, :weights, :similarity, :configuration
30
-
31
- delegate :[], :to => :configuration
32
- delegate :size, :to => :index
33
-
34
- def initialize name, category, similarity_strategy
35
- @identifier = "#{category.identifier}:#{name}"
36
-
37
- @index = {}
38
- @weights = {}
39
- @similarity = {}
40
-
41
- @similarity_strategy = similarity_strategy
42
- end
43
-
44
- # Get a list of similar texts.
45
- #
46
- # Note: Does not return itself.
47
- #
48
- def similar text
49
- code = similarity_strategy.encoded text
50
- similar_codes = code && @similarity[code]
51
- similar_codes.delete text if similar_codes
52
- similar_codes || []
53
- end
21
+ class Base < ::Bundle
54
22
 
55
23
  # Loads all indexes.
56
24
  #
57
25
  def load
58
- load_index
26
+ load_inverted
59
27
  load_weights
60
28
  load_similarity
61
29
  load_configuration
62
30
  end
63
31
 
64
- # Loads the core index.
65
- #
66
- def load_index
67
- # No loading needed.
68
- end
69
- # Loads the weights index.
70
- #
71
- def load_weights
72
- # No loading needed.
73
- end
74
- # Loads the similarity index.
75
- #
76
- def load_similarity
77
- # No loading needed.
78
- end
79
- # Loads the configuration.
80
- #
81
- def load_configuration
82
- # No loading needed.
83
- end
84
-
85
- # Loads the core index.
86
- #
87
- def clear_index
88
- # No loading needed.
89
- end
90
- # Loads the weights index.
91
- #
92
- def clear_weights
93
- # No loading needed.
94
- end
95
- # Loads the similarity index.
96
- #
97
- def clear_similarity
98
- # No loading needed.
99
- end
100
- # Loads the configuration.
32
+ # Clears all indexes.
101
33
  #
102
- def clear_configuration
103
- # No loading needed.
34
+ def clear
35
+ clear_inverted
36
+ clear_weights
37
+ clear_similarity
38
+ clear_configuration
104
39
  end
105
40
 
106
41
  end
@@ -24,17 +24,10 @@ module Indexed # :nodoc:all
24
24
  @backend = Backend::Files.new name, configuration
25
25
  end
26
26
 
27
- def to_s
28
- <<-MEMORY
29
- Memory
30
- #{@backend.indented_to_s}
31
- MEMORY
32
- end
33
-
34
27
  # Get the ids for the given symbol.
35
28
  #
36
29
  def ids sym
37
- @index[sym] || []
30
+ @inverted[sym] || []
38
31
  end
39
32
  # Get a weight for the given symbol.
40
33
  #
@@ -44,8 +37,8 @@ MEMORY
44
37
 
45
38
  # Loads the core index.
46
39
  #
47
- def load_index
48
- self.index = @backend.load_index
40
+ def load_inverted
41
+ self.inverted = @backend.load_inverted
49
42
  end
50
43
  # Loads the weights index.
51
44
  #
@@ -65,8 +58,8 @@ MEMORY
65
58
 
66
59
  # Clears the core index.
67
60
  #
68
- def clear_index
69
- self.index = {}
61
+ def clear_inverted
62
+ self.inverted = {}
70
63
  end
71
64
  # Loads the weights index.
72
65
  #
@@ -38,6 +38,48 @@ module Indexed # :nodoc:all
38
38
  @backend.setting sym
39
39
  end
40
40
 
41
+ # Loads the inverted index.
42
+ #
43
+ def load_inverted
44
+ # No loading needed.
45
+ end
46
+ # Loads the weights index.
47
+ #
48
+ def load_weights
49
+ # No loading needed.
50
+ end
51
+ # Loads the similarity index.
52
+ #
53
+ def load_similarity
54
+ # No loading needed.
55
+ end
56
+ # Loads the configuration.
57
+ #
58
+ def load_configuration
59
+ # No loading needed.
60
+ end
61
+
62
+ # Clears the inverted index.
63
+ #
64
+ def clear_inverted
65
+ # No clearing possible, currently.
66
+ end
67
+ # Clears the weights index.
68
+ #
69
+ def clear_weights
70
+ # No clearing possible, currently.
71
+ end
72
+ # Clears the similarity index.
73
+ #
74
+ def clear_similarity
75
+ # No clearing possible, currently.
76
+ end
77
+ # Clears the configuration.
78
+ #
79
+ def clear_configuration
80
+ # No clearing possible, currently.
81
+ end
82
+
41
83
  end
42
84
 
43
85
  end
@@ -16,11 +16,11 @@ module Indexed
16
16
  end
17
17
 
18
18
  delegate :load,
19
- :load_index,
19
+ :load_inverted,
20
20
  :load_weights,
21
21
  :load_similarity,
22
22
  :load_configuration,
23
- :clear_index,
23
+ :clear_inverted,
24
24
  :clear_weights,
25
25
  :clear_similarity,
26
26
  :clear_configuration,
@@ -29,7 +29,7 @@ module Indexed
29
29
  :identifier,
30
30
  :analyze,
31
31
  :size,
32
- :index,
32
+ :inverted,
33
33
  :weights,
34
34
  :similarity,
35
35
  :configuration,
@@ -6,11 +6,28 @@ module Indexers
6
6
  #
7
7
  class Base
8
8
 
9
+ attr_reader :index_or_category
10
+
11
+ delegate :source, :to => :index_or_category
12
+
13
+ def initialize index_or_category
14
+ @index_or_category = index_or_category
15
+ end
16
+
9
17
  # Starts the indexing process.
10
18
  #
11
- def index
12
- indexing_message
13
- process
19
+ def index categories
20
+ start_indexing_message
21
+ prepare categories
22
+ process categories
23
+ finish_indexing_message
24
+ end
25
+
26
+ # By default, an indexer
27
+ # * prepares the index directories.
28
+ #
29
+ def prepare categories
30
+ categories.each &:prepare_index_directory
14
31
  end
15
32
 
16
33
  end
@@ -1,4 +1,4 @@
1
- # encoding: utf-8
1
+ # encoding: utf-8
2
2
  #
3
3
  module Indexers
4
4
 
@@ -6,27 +6,35 @@ module Indexers
6
6
  #
7
7
  # The tokenizer is taken from each category if specified, from the index, if not.
8
8
  #
9
- # TODO Think about this one more. It should work on an index, but also a single category.
10
- #
11
9
  class Parallel < Base
12
10
 
13
- delegate :categories, :source, :to => :@index
14
-
15
- def initialize index
16
- @index = index
17
- end
18
-
19
- def process
11
+ # Process does the actual indexing.
12
+ #
13
+ # Parameters:
14
+ # * categories: An Enumerable of Category-s.
15
+ #
16
+ def process categories
20
17
  comma = ?,
21
18
  newline = ?\n
22
19
 
23
20
  # Prepare a combined object - array.
24
21
  #
25
- combined = categories.map { |category| [category, [], category.prepared_index_file, (category.tokenizer || tokenizer)] }
22
+ combined = categories.map do |category|
23
+ [category, [], category.prepared_index_file, (category.tokenizer || tokenizer)]
24
+ end
26
25
 
27
26
  # Index.
28
27
  #
28
+ # TODO Extract into flush_every(100_000) do
29
+ #
29
30
  i = 0
31
+
32
+ # Explicitly reset the source to avoid caching trouble.
33
+ #
34
+ source.reset if source.respond_to?(:reset)
35
+
36
+ # Go through each object in the source.
37
+ #
30
38
  source.each do |object|
31
39
  id = object.id
32
40
 
@@ -48,17 +56,27 @@ module Indexers
48
56
  i += 1
49
57
  end
50
58
  flush combined
51
- combined.each { |_, _, file, _| file.close }
59
+ combined.each do |_, _, file, _|
60
+ timed_exclaim %Q{"#{@index_or_category.identifier}": => #{file.path}.}
61
+ file.close
62
+ end
52
63
  end
64
+
65
+ # Flush the combined array into the file.
66
+ #
53
67
  def flush combined # :nodoc:
54
68
  combined.each do |_, cache, file, _|
55
69
  file.write(cache.join) && cache.clear
56
70
  end
57
71
  end
72
+
58
73
  #
59
74
  #
60
- def indexing_message # :nodoc:
61
- timed_exclaim %Q{"#{@index.name}": Starting parallel indexing.}
75
+ def start_indexing_message # :nodoc:
76
+ timed_exclaim %Q{"#{@index_or_category.identifier}": Starting parallel data preparation.}
77
+ end
78
+ def finish_indexing_message # :nodoc:
79
+ timed_exclaim %Q{"#{@index_or_category.identifier}": Finished parallel data preparation.}
62
80
  end
63
81
 
64
82
  end
@@ -8,44 +8,47 @@ module Indexers
8
8
  #
9
9
  class Serial < Base
10
10
 
11
- attr_reader :category
12
-
13
- delegate :source, :to => :category
14
-
15
- def initialize category
16
- @category = category
17
- end
18
-
19
- # The tokenizer used is a cached tokenizer from the category.
20
- #
21
- def tokenizer
22
- @tokenizer ||= category.tokenizer
23
- end
24
-
25
11
  # Harvest the data from the source, tokenize,
26
12
  # and write to an intermediate "prepared index" file.
27
13
  #
28
- def process
14
+ # Parameters:
15
+ # * categories: An enumerable of Category-s.
16
+ #
17
+ def process categories
29
18
  comma = ?,
30
19
  newline = ?\n
31
20
 
32
- local_tokenizer = tokenizer
33
- category.prepared_index_file do |file|
34
- result = []
35
- source.harvest(category) do |indexed_id, text|
36
- local_tokenizer.tokenize(text).each do |token_text|
37
- next unless token_text
38
- result << indexed_id << comma << token_text << newline
21
+ categories.each do |category|
22
+
23
+ tokenizer = category.tokenizer
24
+
25
+ category.prepared_index_file do |file|
26
+ result = []
27
+
28
+ source.harvest(category) do |indexed_id, text|
29
+ tokenizer.tokenize(text).each do |token_text|
30
+ next unless token_text
31
+ result << indexed_id << comma << token_text << newline
32
+ end
33
+ file.write(result.join) && result.clear if result.size > 100_000
39
34
  end
40
- file.write(result.join) && result.clear if result.size > 100_000
35
+
36
+ timed_exclaim %Q{"#{@index_or_category.identifier}": => #{file.path}.}
37
+
38
+ file.write result.join
41
39
  end
42
- file.write result.join
40
+
43
41
  end
42
+
44
43
  end
44
+
45
45
  #
46
46
  #
47
- def indexing_message # :nodoc:
48
- timed_exclaim %Q{"#{@category.identifier}": Starting serial indexing.}
47
+ def start_indexing_message # :nodoc:
48
+ timed_exclaim %Q{"#{@index_or_category.identifier}": Starting serial data preparation.}
49
+ end
50
+ def finish_indexing_message # :nodoc:
51
+ timed_exclaim %Q{"#{@index_or_category.identifier}": Finished serial data preparation.}
49
52
  end
50
53
 
51
54
  end
data/lib/picky/indexes.rb CHANGED
@@ -14,10 +14,11 @@ class Indexes
14
14
  :to => :indexes
15
15
 
16
16
  each_delegate :reindex,
17
+ :each_category,
17
18
  :to => :indexes
18
19
 
19
20
  def initialize
20
- clear
21
+ clear_indexes
21
22
  end
22
23
 
23
24
  # Return the Indexes instance.
@@ -32,11 +33,12 @@ class Indexes
32
33
  :[],
33
34
  :to_s,
34
35
  :size,
35
- :each
36
+ :each,
37
+ :each_category
36
38
 
37
39
  # Clears the indexes and the mapping.
38
40
  #
39
- def clear
41
+ def clear_indexes
40
42
  @indexes = []
41
43
  @index_mapping = {}
42
44
  end
@@ -1,29 +1,17 @@
1
1
  # Registers the indexes held at runtime, for queries.
2
2
  #
3
3
  class Indexes
4
-
4
+
5
5
  instance_delegate :load_from_cache,
6
6
  :reload,
7
7
  :analyze
8
-
8
+
9
9
  each_delegate :load_from_cache,
10
10
  :to => :indexes
11
-
11
+
12
12
  # Reloads all indexes, one after another,
13
13
  # in the order they were added.
14
14
  #
15
15
  alias reload load_from_cache
16
16
 
17
- # Load each index, and analyze it.
18
- #
19
- # Returns a hash with the findings.
20
- #
21
- def analyze
22
- result = {}
23
- indexes.each do |index|
24
- index.analyze result
25
- end
26
- result
27
- end
28
-
29
17
  end