picky 2.6.0 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/picky/analyzer.rb +4 -4
- data/lib/picky/application.rb +6 -7
- data/lib/picky/backend/{backend.rb → base.rb} +31 -14
- data/lib/picky/backend/file/basic.rb +12 -4
- data/lib/picky/backend/file/json.rb +5 -5
- data/lib/picky/backend/file/text.rb +1 -1
- data/lib/picky/backend/files.rb +3 -9
- data/lib/picky/backend/redis/basic.rb +8 -0
- data/lib/picky/backend/redis/list_hash.rb +5 -5
- data/lib/picky/backend/redis/string_hash.rb +5 -5
- data/lib/picky/backend/redis.rb +5 -5
- data/lib/picky/bundle.rb +62 -0
- data/lib/picky/categories.rb +10 -9
- data/lib/picky/categories_indexed.rb +12 -7
- data/lib/picky/categories_indexing.rb +7 -9
- data/lib/picky/category.rb +38 -26
- data/lib/picky/category_indexed.rb +4 -20
- data/lib/picky/category_indexing.rb +71 -68
- data/lib/picky/generators/base.rb +6 -6
- data/lib/picky/generators/partial/substring.rb +28 -26
- data/lib/picky/generators/partial_generator.rb +3 -3
- data/lib/picky/generators/similarity/phonetic.rb +5 -5
- data/lib/picky/generators/similarity_generator.rb +2 -2
- data/lib/picky/generators/weights/logarithmic.rb +3 -3
- data/lib/picky/generators/weights_generator.rb +2 -2
- data/lib/picky/index/base.rb +13 -10
- data/lib/picky/index/base_indexed.rb +2 -0
- data/lib/picky/index/base_indexing.rb +65 -57
- data/lib/picky/indexed/bundle/base.rb +21 -86
- data/lib/picky/indexed/bundle/memory.rb +5 -12
- data/lib/picky/indexed/bundle/redis.rb +42 -0
- data/lib/picky/indexed/wrappers/bundle/wrapper.rb +3 -3
- data/lib/picky/indexers/base.rb +20 -3
- data/lib/picky/indexers/parallel.rb +32 -14
- data/lib/picky/indexers/serial.rb +29 -26
- data/lib/picky/indexes.rb +5 -3
- data/lib/picky/indexes_indexed.rb +3 -15
- data/lib/picky/indexes_indexing.rb +18 -21
- data/lib/picky/indexing/bundle/base.rb +64 -45
- data/lib/picky/indexing/bundle/memory.rb +0 -4
- data/lib/picky/loader.rb +7 -6
- data/lib/picky/query/allocation.rb +3 -3
- data/lib/picky/query/token.rb +5 -1
- data/lib/picky/search.rb +5 -0
- data/lib/picky/sources/base.rb +21 -2
- data/lib/picky/sources/db.rb +0 -7
- data/lib/picky/statistics.rb +9 -12
- data/lib/picky/tokenizers/location.rb +1 -1
- data/lib/tasks/checks.rake +8 -6
- data/lib/tasks/index.rake +14 -20
- data/lib/tasks/server.rake +18 -2
- data/lib/tasks/statistics.rake +27 -14
- data/lib/tasks/todo.rake +2 -2
- data/lib/tasks/try.rake +12 -27
- data/spec/lib/application_spec.rb +1 -1
- data/spec/lib/backend/file/basic_spec.rb +6 -6
- data/spec/lib/backend/file/json_spec.rb +11 -6
- data/spec/lib/backend/file/marshal_spec.rb +11 -6
- data/spec/lib/backend/files_spec.rb +21 -7
- data/spec/lib/backend/redis/basic_spec.rb +6 -0
- data/spec/lib/backend/redis/list_hash_spec.rb +9 -3
- data/spec/lib/backend/redis/string_hash_spec.rb +7 -1
- data/spec/lib/backend/redis_spec.rb +22 -12
- data/spec/lib/categories_indexed_spec.rb +2 -2
- data/spec/lib/category_indexing_spec.rb +12 -33
- data/spec/lib/category_spec.rb +22 -0
- data/spec/lib/index/base_indexing_spec.rb +30 -0
- data/spec/lib/indexed/bundle/memory_spec.rb +13 -20
- data/spec/lib/indexers/base_spec.rb +39 -4
- data/spec/lib/indexers/parallel_spec.rb +2 -10
- data/spec/lib/indexers/serial_spec.rb +11 -26
- data/spec/lib/indexes_class_spec.rb +4 -4
- data/spec/lib/indexes_indexed_spec.rb +2 -2
- data/spec/lib/indexes_indexing_spec.rb +6 -10
- data/spec/lib/indexes_spec.rb +3 -3
- data/spec/lib/indexing/bundle/{super_base_spec.rb → base_spec.rb} +2 -2
- data/spec/lib/indexing/bundle/memory_partial_generation_speed_spec.rb +3 -3
- data/spec/lib/indexing/bundle/memory_spec.rb +16 -14
- data/spec/lib/indexing/bundle/redis_spec.rb +18 -16
- data/spec/lib/query/allocation_spec.rb +1 -1
- data/spec/lib/query/token_spec.rb +5 -7
- data/spec/lib/sources/base_spec.rb +53 -0
- data/spec/lib/sources/db_spec.rb +0 -7
- metadata +11 -12
- data/lib/picky/indexers/solr.rb +0 -56
- data/lib/picky/indexing/bundle/super_base.rb +0 -61
- data/lib/picky/solr/schema_generator.rb +0 -74
- data/lib/tasks/search.rake +0 -9
- data/lib/tasks/shortcuts.rake +0 -32
- data/lib/tasks/solr.rake +0 -36
@@ -3,33 +3,67 @@ module Index
|
|
3
3
|
#
|
4
4
|
#
|
5
5
|
class Base
|
6
|
-
|
6
|
+
|
7
7
|
attr_reader :after_indexing,
|
8
|
-
:bundle_class
|
9
|
-
|
10
|
-
|
8
|
+
:bundle_class
|
9
|
+
|
11
10
|
# Delegators for indexing.
|
12
11
|
#
|
13
|
-
delegate :
|
14
|
-
:
|
15
|
-
:
|
16
|
-
:
|
17
|
-
:
|
18
|
-
:generate_caches,
|
19
|
-
:restore_caches,
|
12
|
+
delegate :cache,
|
13
|
+
:check,
|
14
|
+
:clear,
|
15
|
+
:backup,
|
16
|
+
:restore,
|
20
17
|
:to => :categories
|
21
|
-
|
22
|
-
|
23
|
-
:to => :source
|
24
|
-
|
25
|
-
# Calling index on an index will
|
26
|
-
# * prepare (the data)
|
27
|
-
# * cache (the data)
|
18
|
+
|
19
|
+
# Calling index on an index will call index
|
28
20
|
# on every category.
|
29
21
|
#
|
22
|
+
# Decides whether to use a parallel indexer or whether to
|
23
|
+
# delegate to each category to index themselves.
|
24
|
+
#
|
30
25
|
def index
|
31
|
-
|
32
|
-
|
26
|
+
if source.respond_to?(:each)
|
27
|
+
check_source_empty
|
28
|
+
index_in_parallel
|
29
|
+
else
|
30
|
+
with_data_snapshot do
|
31
|
+
categories.each &:index
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
# Check if the given enumerable source is empty.
|
37
|
+
#
|
38
|
+
# Note: Checking as early as possible to tell the
|
39
|
+
# user as early as possible.
|
40
|
+
#
|
41
|
+
def check_source_empty
|
42
|
+
warn %Q{\n\033[1mWarning\033[m, source for index "#{name}" is empty: #{source} (responds true to empty?).\n} if source.respond_to?(:empty?) && source.empty?
|
43
|
+
end
|
44
|
+
|
45
|
+
# Note: Duplicated in category_indexing.rb.
|
46
|
+
#
|
47
|
+
# Take a data snapshot if the source offers it.
|
48
|
+
#
|
49
|
+
def with_data_snapshot
|
50
|
+
if source.respond_to? :with_snapshot
|
51
|
+
source.with_snapshot(self) do
|
52
|
+
yield
|
53
|
+
end
|
54
|
+
else
|
55
|
+
yield
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
# Indexes the categories in parallel.
|
60
|
+
#
|
61
|
+
# Only use where the category does have a #each source defined.
|
62
|
+
#
|
63
|
+
def index_in_parallel
|
64
|
+
indexer = Indexers::Parallel.new self
|
65
|
+
indexer.index categories
|
66
|
+
categories.each &:cache
|
33
67
|
end
|
34
68
|
|
35
69
|
# Define an index tokenizer on the index.
|
@@ -40,7 +74,15 @@ module Index
|
|
40
74
|
@tokenizer = Tokenizers::Index.new options
|
41
75
|
end
|
42
76
|
alias define_indexing indexing
|
43
|
-
|
77
|
+
|
78
|
+
# Returns the installed tokenizer or the default.
|
79
|
+
#
|
80
|
+
# TODO Spec.
|
81
|
+
#
|
82
|
+
def tokenizer
|
83
|
+
@tokenizer || Indexes.tokenizer
|
84
|
+
end
|
85
|
+
|
44
86
|
# Define a source on the index.
|
45
87
|
#
|
46
88
|
# Parameter is a source, either one of the standard sources or
|
@@ -68,7 +110,7 @@ end
|
|
68
110
|
NO_SOURCE
|
69
111
|
)
|
70
112
|
end
|
71
|
-
|
113
|
+
|
72
114
|
# Define a key_format on the index.
|
73
115
|
#
|
74
116
|
# Parameter is a method name to use on the key (e.g. :to_i, :to_s, :strip).
|
@@ -79,41 +121,7 @@ end
|
|
79
121
|
def define_key_format key_format
|
80
122
|
@key_format = key_format
|
81
123
|
end
|
82
|
-
|
83
|
-
# Decides whether to use a parallel indexer or whether to
|
84
|
-
# delegate to each category to index themselves.
|
85
|
-
#
|
86
|
-
# TODO Rename to prepare.
|
87
|
-
#
|
88
|
-
def prepare
|
89
|
-
# TODO Duplicated in category.rb def indexer.
|
90
|
-
#
|
91
|
-
if source.respond_to?(:each)
|
92
|
-
warn %Q{\n\033[1mWarning\033[m, source for index "#{name}" is empty: #{source} (responds true to empty?).\n} if source.respond_to?(:empty?) && source.empty?
|
93
|
-
index_parallel
|
94
|
-
else
|
95
|
-
categories.each &:prepare
|
96
|
-
end
|
97
|
-
end
|
98
|
-
|
99
|
-
# Indexes the categories in parallel.
|
100
|
-
#
|
101
|
-
# Only use where the category does not have a non-#each source defined.
|
102
|
-
#
|
103
|
-
def index_parallel
|
104
|
-
indexer = Indexers::Parallel.new self
|
105
|
-
categories.first.prepare_index_directory # TODO Unnice.
|
106
|
-
indexer.index
|
107
|
-
end
|
108
124
|
|
109
|
-
# Indexing.
|
110
|
-
#
|
111
|
-
# Note: If it is an each source we do not take a snapshot.
|
112
|
-
#
|
113
|
-
def take_snapshot
|
114
|
-
source.take_snapshot self unless source.respond_to? :each
|
115
|
-
end
|
116
|
-
|
117
125
|
end
|
118
|
-
|
126
|
+
|
119
127
|
end
|
@@ -1,106 +1,41 @@
|
|
1
|
+
# TODO Extract duplicate code from base bundle classes.
|
2
|
+
#
|
1
3
|
module Indexed # :nodoc:all
|
2
4
|
|
3
|
-
#
|
4
|
-
#
|
5
|
+
# An indexed bundle is a number of memory/redis
|
6
|
+
# indexes that compose the indexes for a single category:
|
7
|
+
# * core (inverted) index
|
8
|
+
# * weights index
|
9
|
+
# * similarity index
|
10
|
+
# * index configuration
|
5
11
|
#
|
6
|
-
#
|
7
|
-
#
|
8
|
-
#
|
9
|
-
#
|
12
|
+
# Indexed refers to them being indexed.
|
13
|
+
# This class notably offers the methods:
|
14
|
+
# * load
|
15
|
+
# * clear
|
10
16
|
#
|
11
|
-
#
|
12
|
-
# handling itself through a parallel structure.
|
13
|
-
#
|
14
|
-
# Both use methods provided by this base class, but
|
15
|
-
# have very different goals:
|
16
|
-
#
|
17
|
-
# * *Indexing*::*Bundle* is just concerned with creating index files
|
18
|
-
# and providing helper functions to e.g. check the indexes.
|
19
|
-
#
|
20
|
-
# * *Index*::*Bundle* is concerned with loading these index files into
|
21
|
-
# memory and looking up search data as fast as possible.
|
17
|
+
# To (re)load or clear the current indexes.
|
22
18
|
#
|
23
19
|
module Bundle
|
24
20
|
|
25
|
-
class Base
|
26
|
-
|
27
|
-
attr_reader :identifier, :configuration
|
28
|
-
attr_accessor :similarity_strategy
|
29
|
-
attr_accessor :index, :weights, :similarity, :configuration
|
30
|
-
|
31
|
-
delegate :[], :to => :configuration
|
32
|
-
delegate :size, :to => :index
|
33
|
-
|
34
|
-
def initialize name, category, similarity_strategy
|
35
|
-
@identifier = "#{category.identifier}:#{name}"
|
36
|
-
|
37
|
-
@index = {}
|
38
|
-
@weights = {}
|
39
|
-
@similarity = {}
|
40
|
-
|
41
|
-
@similarity_strategy = similarity_strategy
|
42
|
-
end
|
43
|
-
|
44
|
-
# Get a list of similar texts.
|
45
|
-
#
|
46
|
-
# Note: Does not return itself.
|
47
|
-
#
|
48
|
-
def similar text
|
49
|
-
code = similarity_strategy.encoded text
|
50
|
-
similar_codes = code && @similarity[code]
|
51
|
-
similar_codes.delete text if similar_codes
|
52
|
-
similar_codes || []
|
53
|
-
end
|
21
|
+
class Base < ::Bundle
|
54
22
|
|
55
23
|
# Loads all indexes.
|
56
24
|
#
|
57
25
|
def load
|
58
|
-
|
26
|
+
load_inverted
|
59
27
|
load_weights
|
60
28
|
load_similarity
|
61
29
|
load_configuration
|
62
30
|
end
|
63
31
|
|
64
|
-
#
|
65
|
-
#
|
66
|
-
def load_index
|
67
|
-
# No loading needed.
|
68
|
-
end
|
69
|
-
# Loads the weights index.
|
70
|
-
#
|
71
|
-
def load_weights
|
72
|
-
# No loading needed.
|
73
|
-
end
|
74
|
-
# Loads the similarity index.
|
75
|
-
#
|
76
|
-
def load_similarity
|
77
|
-
# No loading needed.
|
78
|
-
end
|
79
|
-
# Loads the configuration.
|
80
|
-
#
|
81
|
-
def load_configuration
|
82
|
-
# No loading needed.
|
83
|
-
end
|
84
|
-
|
85
|
-
# Loads the core index.
|
86
|
-
#
|
87
|
-
def clear_index
|
88
|
-
# No loading needed.
|
89
|
-
end
|
90
|
-
# Loads the weights index.
|
91
|
-
#
|
92
|
-
def clear_weights
|
93
|
-
# No loading needed.
|
94
|
-
end
|
95
|
-
# Loads the similarity index.
|
96
|
-
#
|
97
|
-
def clear_similarity
|
98
|
-
# No loading needed.
|
99
|
-
end
|
100
|
-
# Loads the configuration.
|
32
|
+
# Clears all indexes.
|
101
33
|
#
|
102
|
-
def
|
103
|
-
|
34
|
+
def clear
|
35
|
+
clear_inverted
|
36
|
+
clear_weights
|
37
|
+
clear_similarity
|
38
|
+
clear_configuration
|
104
39
|
end
|
105
40
|
|
106
41
|
end
|
@@ -24,17 +24,10 @@ module Indexed # :nodoc:all
|
|
24
24
|
@backend = Backend::Files.new name, configuration
|
25
25
|
end
|
26
26
|
|
27
|
-
def to_s
|
28
|
-
<<-MEMORY
|
29
|
-
Memory
|
30
|
-
#{@backend.indented_to_s}
|
31
|
-
MEMORY
|
32
|
-
end
|
33
|
-
|
34
27
|
# Get the ids for the given symbol.
|
35
28
|
#
|
36
29
|
def ids sym
|
37
|
-
@
|
30
|
+
@inverted[sym] || []
|
38
31
|
end
|
39
32
|
# Get a weight for the given symbol.
|
40
33
|
#
|
@@ -44,8 +37,8 @@ MEMORY
|
|
44
37
|
|
45
38
|
# Loads the core index.
|
46
39
|
#
|
47
|
-
def
|
48
|
-
self.
|
40
|
+
def load_inverted
|
41
|
+
self.inverted = @backend.load_inverted
|
49
42
|
end
|
50
43
|
# Loads the weights index.
|
51
44
|
#
|
@@ -65,8 +58,8 @@ MEMORY
|
|
65
58
|
|
66
59
|
# Loads the core index.
|
67
60
|
#
|
68
|
-
def
|
69
|
-
self.
|
61
|
+
def clear_inverted
|
62
|
+
self.inverted = {}
|
70
63
|
end
|
71
64
|
# Loads the weights index.
|
72
65
|
#
|
@@ -38,6 +38,48 @@ module Indexed # :nodoc:all
|
|
38
38
|
@backend.setting sym
|
39
39
|
end
|
40
40
|
|
41
|
+
# Loads the inverted index.
|
42
|
+
#
|
43
|
+
def load_inverted
|
44
|
+
# No loading needed.
|
45
|
+
end
|
46
|
+
# Loads the weights index.
|
47
|
+
#
|
48
|
+
def load_weights
|
49
|
+
# No loading needed.
|
50
|
+
end
|
51
|
+
# Loads the similarity index.
|
52
|
+
#
|
53
|
+
def load_similarity
|
54
|
+
# No loading needed.
|
55
|
+
end
|
56
|
+
# Loads the configuration.
|
57
|
+
#
|
58
|
+
def load_configuration
|
59
|
+
# No loading needed.
|
60
|
+
end
|
61
|
+
|
62
|
+
# Loads the inverted index.
|
63
|
+
#
|
64
|
+
def clear_inverted
|
65
|
+
# No clearing possible, currently.
|
66
|
+
end
|
67
|
+
# Loads the weights index.
|
68
|
+
#
|
69
|
+
def clear_weights
|
70
|
+
# No clearing possible, currently.
|
71
|
+
end
|
72
|
+
# Loads the similarity index.
|
73
|
+
#
|
74
|
+
def clear_similarity
|
75
|
+
# No clearing possible, currently.
|
76
|
+
end
|
77
|
+
# Loads the configuration.
|
78
|
+
#
|
79
|
+
def clear_configuration
|
80
|
+
# No clearing possible, currently.
|
81
|
+
end
|
82
|
+
|
41
83
|
end
|
42
84
|
|
43
85
|
end
|
@@ -16,11 +16,11 @@ module Indexed
|
|
16
16
|
end
|
17
17
|
|
18
18
|
delegate :load,
|
19
|
-
:
|
19
|
+
:load_inverted,
|
20
20
|
:load_weights,
|
21
21
|
:load_similarity,
|
22
22
|
:load_configuration,
|
23
|
-
:
|
23
|
+
:clear_inverted,
|
24
24
|
:clear_weights,
|
25
25
|
:clear_similarity,
|
26
26
|
:clear_configuration,
|
@@ -29,7 +29,7 @@ module Indexed
|
|
29
29
|
:identifier,
|
30
30
|
:analyze,
|
31
31
|
:size,
|
32
|
-
:
|
32
|
+
:inverted,
|
33
33
|
:weights,
|
34
34
|
:similarity,
|
35
35
|
:configuration,
|
data/lib/picky/indexers/base.rb
CHANGED
@@ -6,11 +6,28 @@ module Indexers
|
|
6
6
|
#
|
7
7
|
class Base
|
8
8
|
|
9
|
+
attr_reader :index_or_category
|
10
|
+
|
11
|
+
delegate :source, :to => :index_or_category
|
12
|
+
|
13
|
+
def initialize index_or_category
|
14
|
+
@index_or_category = index_or_category
|
15
|
+
end
|
16
|
+
|
9
17
|
# Starts the indexing process.
|
10
18
|
#
|
11
|
-
def index
|
12
|
-
|
13
|
-
|
19
|
+
def index categories
|
20
|
+
start_indexing_message
|
21
|
+
prepare categories
|
22
|
+
process categories
|
23
|
+
finish_indexing_message
|
24
|
+
end
|
25
|
+
|
26
|
+
# By default, an indexer
|
27
|
+
# * prepares the index directories.
|
28
|
+
#
|
29
|
+
def prepare categories
|
30
|
+
categories.each &:prepare_index_directory
|
14
31
|
end
|
15
32
|
|
16
33
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# encoding: utf-8
|
1
|
+
# encoding: utf-8
|
2
2
|
#
|
3
3
|
module Indexers
|
4
4
|
|
@@ -6,27 +6,35 @@ module Indexers
|
|
6
6
|
#
|
7
7
|
# The tokenizer is taken from each category if specified, from the index, if not.
|
8
8
|
#
|
9
|
-
# TODO Think about this one more. It should work on an index, but also a single category.
|
10
|
-
#
|
11
9
|
class Parallel < Base
|
12
10
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
def process
|
11
|
+
# Process does the actual indexing.
|
12
|
+
#
|
13
|
+
# Parameters:
|
14
|
+
# * categories: An Enumerable of Category-s.
|
15
|
+
#
|
16
|
+
def process categories
|
20
17
|
comma = ?,
|
21
18
|
newline = ?\n
|
22
19
|
|
23
20
|
# Prepare a combined object - array.
|
24
21
|
#
|
25
|
-
combined = categories.map
|
22
|
+
combined = categories.map do |category|
|
23
|
+
[category, [], category.prepared_index_file, (category.tokenizer || tokenizer)]
|
24
|
+
end
|
26
25
|
|
27
26
|
# Index.
|
28
27
|
#
|
28
|
+
# TODO Extract into flush_every(100_000) do
|
29
|
+
#
|
29
30
|
i = 0
|
31
|
+
|
32
|
+
# Explicitly reset the source to avoid caching trouble.
|
33
|
+
#
|
34
|
+
source.reset if source.respond_to?(:reset)
|
35
|
+
|
36
|
+
# Go through each object in the source.
|
37
|
+
#
|
30
38
|
source.each do |object|
|
31
39
|
id = object.id
|
32
40
|
|
@@ -48,17 +56,27 @@ module Indexers
|
|
48
56
|
i += 1
|
49
57
|
end
|
50
58
|
flush combined
|
51
|
-
combined.each
|
59
|
+
combined.each do |_, _, file, _|
|
60
|
+
timed_exclaim %Q{"#{@index_or_category.identifier}": => #{file.path}.}
|
61
|
+
file.close
|
62
|
+
end
|
52
63
|
end
|
64
|
+
|
65
|
+
# Flush the combined array into the file.
|
66
|
+
#
|
53
67
|
def flush combined # :nodoc:
|
54
68
|
combined.each do |_, cache, file, _|
|
55
69
|
file.write(cache.join) && cache.clear
|
56
70
|
end
|
57
71
|
end
|
72
|
+
|
58
73
|
#
|
59
74
|
#
|
60
|
-
def
|
61
|
-
timed_exclaim %Q{"#{@
|
75
|
+
def start_indexing_message # :nodoc:
|
76
|
+
timed_exclaim %Q{"#{@index_or_category.identifier}": Starting parallel data preparation.}
|
77
|
+
end
|
78
|
+
def finish_indexing_message # :nodoc:
|
79
|
+
timed_exclaim %Q{"#{@index_or_category.identifier}": Finished parallel data preparation.}
|
62
80
|
end
|
63
81
|
|
64
82
|
end
|
@@ -8,44 +8,47 @@ module Indexers
|
|
8
8
|
#
|
9
9
|
class Serial < Base
|
10
10
|
|
11
|
-
attr_reader :category
|
12
|
-
|
13
|
-
delegate :source, :to => :category
|
14
|
-
|
15
|
-
def initialize category
|
16
|
-
@category = category
|
17
|
-
end
|
18
|
-
|
19
|
-
# The tokenizer used is a cached tokenizer from the category.
|
20
|
-
#
|
21
|
-
def tokenizer
|
22
|
-
@tokenizer ||= category.tokenizer
|
23
|
-
end
|
24
|
-
|
25
11
|
# Harvest the data from the source, tokenize,
|
26
12
|
# and write to an intermediate "prepared index" file.
|
27
13
|
#
|
28
|
-
|
14
|
+
# Parameters:
|
15
|
+
# * categories: An enumerable of Category-s.
|
16
|
+
#
|
17
|
+
def process categories
|
29
18
|
comma = ?,
|
30
19
|
newline = ?\n
|
31
20
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
21
|
+
categories.each do |category|
|
22
|
+
|
23
|
+
tokenizer = category.tokenizer
|
24
|
+
|
25
|
+
category.prepared_index_file do |file|
|
26
|
+
result = []
|
27
|
+
|
28
|
+
source.harvest(category) do |indexed_id, text|
|
29
|
+
tokenizer.tokenize(text).each do |token_text|
|
30
|
+
next unless token_text
|
31
|
+
result << indexed_id << comma << token_text << newline
|
32
|
+
end
|
33
|
+
file.write(result.join) && result.clear if result.size > 100_000
|
39
34
|
end
|
40
|
-
|
35
|
+
|
36
|
+
timed_exclaim %Q{"#{@index_or_category.identifier}": => #{file.path}.}
|
37
|
+
|
38
|
+
file.write result.join
|
41
39
|
end
|
42
|
-
|
40
|
+
|
43
41
|
end
|
42
|
+
|
44
43
|
end
|
44
|
+
|
45
45
|
#
|
46
46
|
#
|
47
|
-
def
|
48
|
-
timed_exclaim %Q{"#{@
|
47
|
+
def start_indexing_message # :nodoc:
|
48
|
+
timed_exclaim %Q{"#{@index_or_category.identifier}": Starting serial data preparation.}
|
49
|
+
end
|
50
|
+
def finish_indexing_message # :nodoc:
|
51
|
+
timed_exclaim %Q{"#{@index_or_category.identifier}": Finished serial data preparation.}
|
49
52
|
end
|
50
53
|
|
51
54
|
end
|
data/lib/picky/indexes.rb
CHANGED
@@ -14,10 +14,11 @@ class Indexes
|
|
14
14
|
:to => :indexes
|
15
15
|
|
16
16
|
each_delegate :reindex,
|
17
|
+
:each_category,
|
17
18
|
:to => :indexes
|
18
19
|
|
19
20
|
def initialize
|
20
|
-
|
21
|
+
clear_indexes
|
21
22
|
end
|
22
23
|
|
23
24
|
# Return the Indexes instance.
|
@@ -32,11 +33,12 @@ class Indexes
|
|
32
33
|
:[],
|
33
34
|
:to_s,
|
34
35
|
:size,
|
35
|
-
:each
|
36
|
+
:each,
|
37
|
+
:each_category
|
36
38
|
|
37
39
|
# Clears the indexes and the mapping.
|
38
40
|
#
|
39
|
-
def
|
41
|
+
def clear_indexes
|
40
42
|
@indexes = []
|
41
43
|
@index_mapping = {}
|
42
44
|
end
|
@@ -1,29 +1,17 @@
|
|
1
1
|
# Registers the indexes held at runtime, for queries.
|
2
2
|
#
|
3
3
|
class Indexes
|
4
|
-
|
4
|
+
|
5
5
|
instance_delegate :load_from_cache,
|
6
6
|
:reload,
|
7
7
|
:analyze
|
8
|
-
|
8
|
+
|
9
9
|
each_delegate :load_from_cache,
|
10
10
|
:to => :indexes
|
11
|
-
|
11
|
+
|
12
12
|
# Reloads all indexes, one after another,
|
13
13
|
# in the order they were added.
|
14
14
|
#
|
15
15
|
alias reload load_from_cache
|
16
16
|
|
17
|
-
# Load each index, and analyze it.
|
18
|
-
#
|
19
|
-
# Returns a hash with the findings.
|
20
|
-
#
|
21
|
-
def analyze
|
22
|
-
result = {}
|
23
|
-
indexes.each do |index|
|
24
|
-
index.analyze result
|
25
|
-
end
|
26
|
-
result
|
27
|
-
end
|
28
|
-
|
29
17
|
end
|