picky 2.6.0 → 2.7.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/picky/analyzer.rb +4 -4
- data/lib/picky/application.rb +6 -7
- data/lib/picky/backend/{backend.rb → base.rb} +31 -14
- data/lib/picky/backend/file/basic.rb +12 -4
- data/lib/picky/backend/file/json.rb +5 -5
- data/lib/picky/backend/file/text.rb +1 -1
- data/lib/picky/backend/files.rb +3 -9
- data/lib/picky/backend/redis/basic.rb +8 -0
- data/lib/picky/backend/redis/list_hash.rb +5 -5
- data/lib/picky/backend/redis/string_hash.rb +5 -5
- data/lib/picky/backend/redis.rb +5 -5
- data/lib/picky/bundle.rb +62 -0
- data/lib/picky/categories.rb +10 -9
- data/lib/picky/categories_indexed.rb +12 -7
- data/lib/picky/categories_indexing.rb +7 -9
- data/lib/picky/category.rb +38 -26
- data/lib/picky/category_indexed.rb +4 -20
- data/lib/picky/category_indexing.rb +71 -68
- data/lib/picky/generators/base.rb +6 -6
- data/lib/picky/generators/partial/substring.rb +28 -26
- data/lib/picky/generators/partial_generator.rb +3 -3
- data/lib/picky/generators/similarity/phonetic.rb +5 -5
- data/lib/picky/generators/similarity_generator.rb +2 -2
- data/lib/picky/generators/weights/logarithmic.rb +3 -3
- data/lib/picky/generators/weights_generator.rb +2 -2
- data/lib/picky/index/base.rb +13 -10
- data/lib/picky/index/base_indexed.rb +2 -0
- data/lib/picky/index/base_indexing.rb +65 -57
- data/lib/picky/indexed/bundle/base.rb +21 -86
- data/lib/picky/indexed/bundle/memory.rb +5 -12
- data/lib/picky/indexed/bundle/redis.rb +42 -0
- data/lib/picky/indexed/wrappers/bundle/wrapper.rb +3 -3
- data/lib/picky/indexers/base.rb +20 -3
- data/lib/picky/indexers/parallel.rb +32 -14
- data/lib/picky/indexers/serial.rb +29 -26
- data/lib/picky/indexes.rb +5 -3
- data/lib/picky/indexes_indexed.rb +3 -15
- data/lib/picky/indexes_indexing.rb +18 -21
- data/lib/picky/indexing/bundle/base.rb +64 -45
- data/lib/picky/indexing/bundle/memory.rb +0 -4
- data/lib/picky/loader.rb +7 -6
- data/lib/picky/query/allocation.rb +3 -3
- data/lib/picky/query/token.rb +5 -1
- data/lib/picky/search.rb +5 -0
- data/lib/picky/sources/base.rb +21 -2
- data/lib/picky/sources/db.rb +0 -7
- data/lib/picky/statistics.rb +9 -12
- data/lib/picky/tokenizers/location.rb +1 -1
- data/lib/tasks/checks.rake +8 -6
- data/lib/tasks/index.rake +14 -20
- data/lib/tasks/server.rake +18 -2
- data/lib/tasks/statistics.rake +27 -14
- data/lib/tasks/todo.rake +2 -2
- data/lib/tasks/try.rake +12 -27
- data/spec/lib/application_spec.rb +1 -1
- data/spec/lib/backend/file/basic_spec.rb +6 -6
- data/spec/lib/backend/file/json_spec.rb +11 -6
- data/spec/lib/backend/file/marshal_spec.rb +11 -6
- data/spec/lib/backend/files_spec.rb +21 -7
- data/spec/lib/backend/redis/basic_spec.rb +6 -0
- data/spec/lib/backend/redis/list_hash_spec.rb +9 -3
- data/spec/lib/backend/redis/string_hash_spec.rb +7 -1
- data/spec/lib/backend/redis_spec.rb +22 -12
- data/spec/lib/categories_indexed_spec.rb +2 -2
- data/spec/lib/category_indexing_spec.rb +12 -33
- data/spec/lib/category_spec.rb +22 -0
- data/spec/lib/index/base_indexing_spec.rb +30 -0
- data/spec/lib/indexed/bundle/memory_spec.rb +13 -20
- data/spec/lib/indexers/base_spec.rb +39 -4
- data/spec/lib/indexers/parallel_spec.rb +2 -10
- data/spec/lib/indexers/serial_spec.rb +11 -26
- data/spec/lib/indexes_class_spec.rb +4 -4
- data/spec/lib/indexes_indexed_spec.rb +2 -2
- data/spec/lib/indexes_indexing_spec.rb +6 -10
- data/spec/lib/indexes_spec.rb +3 -3
- data/spec/lib/indexing/bundle/{super_base_spec.rb → base_spec.rb} +2 -2
- data/spec/lib/indexing/bundle/memory_partial_generation_speed_spec.rb +3 -3
- data/spec/lib/indexing/bundle/memory_spec.rb +16 -14
- data/spec/lib/indexing/bundle/redis_spec.rb +18 -16
- data/spec/lib/query/allocation_spec.rb +1 -1
- data/spec/lib/query/token_spec.rb +5 -7
- data/spec/lib/sources/base_spec.rb +53 -0
- data/spec/lib/sources/db_spec.rb +0 -7
- metadata +11 -12
- data/lib/picky/indexers/solr.rb +0 -56
- data/lib/picky/indexing/bundle/super_base.rb +0 -61
- data/lib/picky/solr/schema_generator.rb +0 -74
- data/lib/tasks/search.rake +0 -9
- data/lib/tasks/shortcuts.rake +0 -32
- data/lib/tasks/solr.rake +0 -36
@@ -3,33 +3,67 @@ module Index
|
|
3
3
|
#
|
4
4
|
#
|
5
5
|
class Base
|
6
|
-
|
6
|
+
|
7
7
|
attr_reader :after_indexing,
|
8
|
-
:bundle_class
|
9
|
-
|
10
|
-
|
8
|
+
:bundle_class
|
9
|
+
|
11
10
|
# Delegators for indexing.
|
12
11
|
#
|
13
|
-
delegate :
|
14
|
-
:
|
15
|
-
:
|
16
|
-
:
|
17
|
-
:
|
18
|
-
:generate_caches,
|
19
|
-
:restore_caches,
|
12
|
+
delegate :cache,
|
13
|
+
:check,
|
14
|
+
:clear,
|
15
|
+
:backup,
|
16
|
+
:restore,
|
20
17
|
:to => :categories
|
21
|
-
|
22
|
-
|
23
|
-
:to => :source
|
24
|
-
|
25
|
-
# Calling index on an index will
|
26
|
-
# * prepare (the data)
|
27
|
-
# * cache (the data)
|
18
|
+
|
19
|
+
# Calling index on an index will call index
|
28
20
|
# on every category.
|
29
21
|
#
|
22
|
+
# Decides whether to use a parallel indexer or whether to
|
23
|
+
# delegate to each category to index themselves.
|
24
|
+
#
|
30
25
|
def index
|
31
|
-
|
32
|
-
|
26
|
+
if source.respond_to?(:each)
|
27
|
+
check_source_empty
|
28
|
+
index_in_parallel
|
29
|
+
else
|
30
|
+
with_data_snapshot do
|
31
|
+
categories.each &:index
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
# Check if the given enumerable source is empty.
|
37
|
+
#
|
38
|
+
# Note: Checking as early as possible to tell the
|
39
|
+
# user as early as possible.
|
40
|
+
#
|
41
|
+
def check_source_empty
|
42
|
+
warn %Q{\n\033[1mWarning\033[m, source for index "#{name}" is empty: #{source} (responds true to empty?).\n} if source.respond_to?(:empty?) && source.empty?
|
43
|
+
end
|
44
|
+
|
45
|
+
# Note: Duplicated in category_indexing.rb.
|
46
|
+
#
|
47
|
+
# Take a data snapshot if the source offers it.
|
48
|
+
#
|
49
|
+
def with_data_snapshot
|
50
|
+
if source.respond_to? :with_snapshot
|
51
|
+
source.with_snapshot(self) do
|
52
|
+
yield
|
53
|
+
end
|
54
|
+
else
|
55
|
+
yield
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
# Indexes the categories in parallel.
|
60
|
+
#
|
61
|
+
# Only use where the category does have a #each source defined.
|
62
|
+
#
|
63
|
+
def index_in_parallel
|
64
|
+
indexer = Indexers::Parallel.new self
|
65
|
+
indexer.index categories
|
66
|
+
categories.each &:cache
|
33
67
|
end
|
34
68
|
|
35
69
|
# Define an index tokenizer on the index.
|
@@ -40,7 +74,15 @@ module Index
|
|
40
74
|
@tokenizer = Tokenizers::Index.new options
|
41
75
|
end
|
42
76
|
alias define_indexing indexing
|
43
|
-
|
77
|
+
|
78
|
+
# Returns the installed tokenizer or the default.
|
79
|
+
#
|
80
|
+
# TODO Spec.
|
81
|
+
#
|
82
|
+
def tokenizer
|
83
|
+
@tokenizer || Indexes.tokenizer
|
84
|
+
end
|
85
|
+
|
44
86
|
# Define a source on the index.
|
45
87
|
#
|
46
88
|
# Parameter is a source, either one of the standard sources or
|
@@ -68,7 +110,7 @@ end
|
|
68
110
|
NO_SOURCE
|
69
111
|
)
|
70
112
|
end
|
71
|
-
|
113
|
+
|
72
114
|
# Define a key_format on the index.
|
73
115
|
#
|
74
116
|
# Parameter is a method name to use on the key (e.g. :to_i, :to_s, :strip).
|
@@ -79,41 +121,7 @@ end
|
|
79
121
|
def define_key_format key_format
|
80
122
|
@key_format = key_format
|
81
123
|
end
|
82
|
-
|
83
|
-
# Decides whether to use a parallel indexer or whether to
|
84
|
-
# delegate to each category to index themselves.
|
85
|
-
#
|
86
|
-
# TODO Rename to prepare.
|
87
|
-
#
|
88
|
-
def prepare
|
89
|
-
# TODO Duplicated in category.rb def indexer.
|
90
|
-
#
|
91
|
-
if source.respond_to?(:each)
|
92
|
-
warn %Q{\n\033[1mWarning\033[m, source for index "#{name}" is empty: #{source} (responds true to empty?).\n} if source.respond_to?(:empty?) && source.empty?
|
93
|
-
index_parallel
|
94
|
-
else
|
95
|
-
categories.each &:prepare
|
96
|
-
end
|
97
|
-
end
|
98
|
-
|
99
|
-
# Indexes the categories in parallel.
|
100
|
-
#
|
101
|
-
# Only use where the category does not have a non-#each source defined.
|
102
|
-
#
|
103
|
-
def index_parallel
|
104
|
-
indexer = Indexers::Parallel.new self
|
105
|
-
categories.first.prepare_index_directory # TODO Unnice.
|
106
|
-
indexer.index
|
107
|
-
end
|
108
124
|
|
109
|
-
# Indexing.
|
110
|
-
#
|
111
|
-
# Note: If it is an each source we do not take a snapshot.
|
112
|
-
#
|
113
|
-
def take_snapshot
|
114
|
-
source.take_snapshot self unless source.respond_to? :each
|
115
|
-
end
|
116
|
-
|
117
125
|
end
|
118
|
-
|
126
|
+
|
119
127
|
end
|
@@ -1,106 +1,41 @@
|
|
1
|
+
# TODO Extract duplicate code from base bundle classes.
|
2
|
+
#
|
1
3
|
module Indexed # :nodoc:all
|
2
4
|
|
3
|
-
#
|
4
|
-
#
|
5
|
+
# An indexed bundle is a number of memory/redis
|
6
|
+
# indexes that compose the indexes for a single category:
|
7
|
+
# * core (inverted) index
|
8
|
+
# * weights index
|
9
|
+
# * similarity index
|
10
|
+
# * index configuration
|
5
11
|
#
|
6
|
-
#
|
7
|
-
#
|
8
|
-
#
|
9
|
-
#
|
12
|
+
# Indexed refers to them being indexed.
|
13
|
+
# This class notably offers the methods:
|
14
|
+
# * load
|
15
|
+
# * clear
|
10
16
|
#
|
11
|
-
#
|
12
|
-
# handling itself through a parallel structure.
|
13
|
-
#
|
14
|
-
# Both use methods provided by this base class, but
|
15
|
-
# have very different goals:
|
16
|
-
#
|
17
|
-
# * *Indexing*::*Bundle* is just concerned with creating index files
|
18
|
-
# and providing helper functions to e.g. check the indexes.
|
19
|
-
#
|
20
|
-
# * *Index*::*Bundle* is concerned with loading these index files into
|
21
|
-
# memory and looking up search data as fast as possible.
|
17
|
+
# To (re)load or clear the current indexes.
|
22
18
|
#
|
23
19
|
module Bundle
|
24
20
|
|
25
|
-
class Base
|
26
|
-
|
27
|
-
attr_reader :identifier, :configuration
|
28
|
-
attr_accessor :similarity_strategy
|
29
|
-
attr_accessor :index, :weights, :similarity, :configuration
|
30
|
-
|
31
|
-
delegate :[], :to => :configuration
|
32
|
-
delegate :size, :to => :index
|
33
|
-
|
34
|
-
def initialize name, category, similarity_strategy
|
35
|
-
@identifier = "#{category.identifier}:#{name}"
|
36
|
-
|
37
|
-
@index = {}
|
38
|
-
@weights = {}
|
39
|
-
@similarity = {}
|
40
|
-
|
41
|
-
@similarity_strategy = similarity_strategy
|
42
|
-
end
|
43
|
-
|
44
|
-
# Get a list of similar texts.
|
45
|
-
#
|
46
|
-
# Note: Does not return itself.
|
47
|
-
#
|
48
|
-
def similar text
|
49
|
-
code = similarity_strategy.encoded text
|
50
|
-
similar_codes = code && @similarity[code]
|
51
|
-
similar_codes.delete text if similar_codes
|
52
|
-
similar_codes || []
|
53
|
-
end
|
21
|
+
class Base < ::Bundle
|
54
22
|
|
55
23
|
# Loads all indexes.
|
56
24
|
#
|
57
25
|
def load
|
58
|
-
|
26
|
+
load_inverted
|
59
27
|
load_weights
|
60
28
|
load_similarity
|
61
29
|
load_configuration
|
62
30
|
end
|
63
31
|
|
64
|
-
#
|
65
|
-
#
|
66
|
-
def load_index
|
67
|
-
# No loading needed.
|
68
|
-
end
|
69
|
-
# Loads the weights index.
|
70
|
-
#
|
71
|
-
def load_weights
|
72
|
-
# No loading needed.
|
73
|
-
end
|
74
|
-
# Loads the similarity index.
|
75
|
-
#
|
76
|
-
def load_similarity
|
77
|
-
# No loading needed.
|
78
|
-
end
|
79
|
-
# Loads the configuration.
|
80
|
-
#
|
81
|
-
def load_configuration
|
82
|
-
# No loading needed.
|
83
|
-
end
|
84
|
-
|
85
|
-
# Loads the core index.
|
86
|
-
#
|
87
|
-
def clear_index
|
88
|
-
# No loading needed.
|
89
|
-
end
|
90
|
-
# Loads the weights index.
|
91
|
-
#
|
92
|
-
def clear_weights
|
93
|
-
# No loading needed.
|
94
|
-
end
|
95
|
-
# Loads the similarity index.
|
96
|
-
#
|
97
|
-
def clear_similarity
|
98
|
-
# No loading needed.
|
99
|
-
end
|
100
|
-
# Loads the configuration.
|
32
|
+
# Clears all indexes.
|
101
33
|
#
|
102
|
-
def
|
103
|
-
|
34
|
+
def clear
|
35
|
+
clear_inverted
|
36
|
+
clear_weights
|
37
|
+
clear_similarity
|
38
|
+
clear_configuration
|
104
39
|
end
|
105
40
|
|
106
41
|
end
|
@@ -24,17 +24,10 @@ module Indexed # :nodoc:all
|
|
24
24
|
@backend = Backend::Files.new name, configuration
|
25
25
|
end
|
26
26
|
|
27
|
-
def to_s
|
28
|
-
<<-MEMORY
|
29
|
-
Memory
|
30
|
-
#{@backend.indented_to_s}
|
31
|
-
MEMORY
|
32
|
-
end
|
33
|
-
|
34
27
|
# Get the ids for the given symbol.
|
35
28
|
#
|
36
29
|
def ids sym
|
37
|
-
@
|
30
|
+
@inverted[sym] || []
|
38
31
|
end
|
39
32
|
# Get a weight for the given symbol.
|
40
33
|
#
|
@@ -44,8 +37,8 @@ MEMORY
|
|
44
37
|
|
45
38
|
# Loads the core index.
|
46
39
|
#
|
47
|
-
def
|
48
|
-
self.
|
40
|
+
def load_inverted
|
41
|
+
self.inverted = @backend.load_inverted
|
49
42
|
end
|
50
43
|
# Loads the weights index.
|
51
44
|
#
|
@@ -65,8 +58,8 @@ MEMORY
|
|
65
58
|
|
66
59
|
# Clears the core index.
|
67
60
|
#
|
68
|
-
def
|
69
|
-
self.
|
61
|
+
def clear_inverted
|
62
|
+
self.inverted = {}
|
70
63
|
end
|
71
64
|
# Loads the weights index.
|
72
65
|
#
|
@@ -38,6 +38,48 @@ module Indexed # :nodoc:all
|
|
38
38
|
@backend.setting sym
|
39
39
|
end
|
40
40
|
|
41
|
+
# Loads the inverted index.
|
42
|
+
#
|
43
|
+
def load_inverted
|
44
|
+
# No loading needed.
|
45
|
+
end
|
46
|
+
# Loads the weights index.
|
47
|
+
#
|
48
|
+
def load_weights
|
49
|
+
# No loading needed.
|
50
|
+
end
|
51
|
+
# Loads the similarity index.
|
52
|
+
#
|
53
|
+
def load_similarity
|
54
|
+
# No loading needed.
|
55
|
+
end
|
56
|
+
# Loads the configuration.
|
57
|
+
#
|
58
|
+
def load_configuration
|
59
|
+
# No loading needed.
|
60
|
+
end
|
61
|
+
|
62
|
+
# Clears the inverted index.
|
63
|
+
#
|
64
|
+
def clear_inverted
|
65
|
+
# No clearing possible, currently.
|
66
|
+
end
|
67
|
+
# Clears the weights index.
|
68
|
+
#
|
69
|
+
def clear_weights
|
70
|
+
# No clearing possible, currently.
|
71
|
+
end
|
72
|
+
# Clears the similarity index.
|
73
|
+
#
|
74
|
+
def clear_similarity
|
75
|
+
# No clearing possible, currently.
|
76
|
+
end
|
77
|
+
# Clears the configuration.
|
78
|
+
#
|
79
|
+
def clear_configuration
|
80
|
+
# No clearing possible, currently.
|
81
|
+
end
|
82
|
+
|
41
83
|
end
|
42
84
|
|
43
85
|
end
|
@@ -16,11 +16,11 @@ module Indexed
|
|
16
16
|
end
|
17
17
|
|
18
18
|
delegate :load,
|
19
|
-
:
|
19
|
+
:load_inverted,
|
20
20
|
:load_weights,
|
21
21
|
:load_similarity,
|
22
22
|
:load_configuration,
|
23
|
-
:
|
23
|
+
:clear_inverted,
|
24
24
|
:clear_weights,
|
25
25
|
:clear_similarity,
|
26
26
|
:clear_configuration,
|
@@ -29,7 +29,7 @@ module Indexed
|
|
29
29
|
:identifier,
|
30
30
|
:analyze,
|
31
31
|
:size,
|
32
|
-
:
|
32
|
+
:inverted,
|
33
33
|
:weights,
|
34
34
|
:similarity,
|
35
35
|
:configuration,
|
data/lib/picky/indexers/base.rb
CHANGED
@@ -6,11 +6,28 @@ module Indexers
|
|
6
6
|
#
|
7
7
|
class Base
|
8
8
|
|
9
|
+
attr_reader :index_or_category
|
10
|
+
|
11
|
+
delegate :source, :to => :index_or_category
|
12
|
+
|
13
|
+
def initialize index_or_category
|
14
|
+
@index_or_category = index_or_category
|
15
|
+
end
|
16
|
+
|
9
17
|
# Starts the indexing process.
|
10
18
|
#
|
11
|
-
def index
|
12
|
-
|
13
|
-
|
19
|
+
def index categories
|
20
|
+
start_indexing_message
|
21
|
+
prepare categories
|
22
|
+
process categories
|
23
|
+
finish_indexing_message
|
24
|
+
end
|
25
|
+
|
26
|
+
# By default, an indexer
|
27
|
+
# * prepares the index directories.
|
28
|
+
#
|
29
|
+
def prepare categories
|
30
|
+
categories.each &:prepare_index_directory
|
14
31
|
end
|
15
32
|
|
16
33
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# encoding: utf-8
|
1
|
+
# encoding: utf-8
|
2
2
|
#
|
3
3
|
module Indexers
|
4
4
|
|
@@ -6,27 +6,35 @@ module Indexers
|
|
6
6
|
#
|
7
7
|
# The tokenizer is taken from each category if specified, from the index, if not.
|
8
8
|
#
|
9
|
-
# TODO Think about this one more. It should work on an index, but also a single category.
|
10
|
-
#
|
11
9
|
class Parallel < Base
|
12
10
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
def process
|
11
|
+
# Process does the actual indexing.
|
12
|
+
#
|
13
|
+
# Parameters:
|
14
|
+
# * categories: An Enumerable of Category-s.
|
15
|
+
#
|
16
|
+
def process categories
|
20
17
|
comma = ?,
|
21
18
|
newline = ?\n
|
22
19
|
|
23
20
|
# Prepare a combined object - array.
|
24
21
|
#
|
25
|
-
combined = categories.map
|
22
|
+
combined = categories.map do |category|
|
23
|
+
[category, [], category.prepared_index_file, (category.tokenizer || tokenizer)]
|
24
|
+
end
|
26
25
|
|
27
26
|
# Index.
|
28
27
|
#
|
28
|
+
# TODO Extract into flush_every(100_000) do
|
29
|
+
#
|
29
30
|
i = 0
|
31
|
+
|
32
|
+
# Explicitly reset the source to avoid caching trouble.
|
33
|
+
#
|
34
|
+
source.reset if source.respond_to?(:reset)
|
35
|
+
|
36
|
+
# Go through each object in the source.
|
37
|
+
#
|
30
38
|
source.each do |object|
|
31
39
|
id = object.id
|
32
40
|
|
@@ -48,17 +56,27 @@ module Indexers
|
|
48
56
|
i += 1
|
49
57
|
end
|
50
58
|
flush combined
|
51
|
-
combined.each
|
59
|
+
combined.each do |_, _, file, _|
|
60
|
+
timed_exclaim %Q{"#{@index_or_category.identifier}": => #{file.path}.}
|
61
|
+
file.close
|
62
|
+
end
|
52
63
|
end
|
64
|
+
|
65
|
+
# Flush the combined array into the file.
|
66
|
+
#
|
53
67
|
def flush combined # :nodoc:
|
54
68
|
combined.each do |_, cache, file, _|
|
55
69
|
file.write(cache.join) && cache.clear
|
56
70
|
end
|
57
71
|
end
|
72
|
+
|
58
73
|
#
|
59
74
|
#
|
60
|
-
def
|
61
|
-
timed_exclaim %Q{"#{@
|
75
|
+
def start_indexing_message # :nodoc:
|
76
|
+
timed_exclaim %Q{"#{@index_or_category.identifier}": Starting parallel data preparation.}
|
77
|
+
end
|
78
|
+
def finish_indexing_message # :nodoc:
|
79
|
+
timed_exclaim %Q{"#{@index_or_category.identifier}": Finished parallel data preparation.}
|
62
80
|
end
|
63
81
|
|
64
82
|
end
|
@@ -8,44 +8,47 @@ module Indexers
|
|
8
8
|
#
|
9
9
|
class Serial < Base
|
10
10
|
|
11
|
-
attr_reader :category
|
12
|
-
|
13
|
-
delegate :source, :to => :category
|
14
|
-
|
15
|
-
def initialize category
|
16
|
-
@category = category
|
17
|
-
end
|
18
|
-
|
19
|
-
# The tokenizer used is a cached tokenizer from the category.
|
20
|
-
#
|
21
|
-
def tokenizer
|
22
|
-
@tokenizer ||= category.tokenizer
|
23
|
-
end
|
24
|
-
|
25
11
|
# Harvest the data from the source, tokenize,
|
26
12
|
# and write to an intermediate "prepared index" file.
|
27
13
|
#
|
28
|
-
|
14
|
+
# Parameters:
|
15
|
+
# * categories: An enumerable of Category-s.
|
16
|
+
#
|
17
|
+
def process categories
|
29
18
|
comma = ?,
|
30
19
|
newline = ?\n
|
31
20
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
21
|
+
categories.each do |category|
|
22
|
+
|
23
|
+
tokenizer = category.tokenizer
|
24
|
+
|
25
|
+
category.prepared_index_file do |file|
|
26
|
+
result = []
|
27
|
+
|
28
|
+
source.harvest(category) do |indexed_id, text|
|
29
|
+
tokenizer.tokenize(text).each do |token_text|
|
30
|
+
next unless token_text
|
31
|
+
result << indexed_id << comma << token_text << newline
|
32
|
+
end
|
33
|
+
file.write(result.join) && result.clear if result.size > 100_000
|
39
34
|
end
|
40
|
-
|
35
|
+
|
36
|
+
timed_exclaim %Q{"#{@index_or_category.identifier}": => #{file.path}.}
|
37
|
+
|
38
|
+
file.write result.join
|
41
39
|
end
|
42
|
-
|
40
|
+
|
43
41
|
end
|
42
|
+
|
44
43
|
end
|
44
|
+
|
45
45
|
#
|
46
46
|
#
|
47
|
-
def
|
48
|
-
timed_exclaim %Q{"#{@
|
47
|
+
def start_indexing_message # :nodoc:
|
48
|
+
timed_exclaim %Q{"#{@index_or_category.identifier}": Starting serial data preparation.}
|
49
|
+
end
|
50
|
+
def finish_indexing_message # :nodoc:
|
51
|
+
timed_exclaim %Q{"#{@index_or_category.identifier}": Finished serial data preparation.}
|
49
52
|
end
|
50
53
|
|
51
54
|
end
|
data/lib/picky/indexes.rb
CHANGED
@@ -14,10 +14,11 @@ class Indexes
|
|
14
14
|
:to => :indexes
|
15
15
|
|
16
16
|
each_delegate :reindex,
|
17
|
+
:each_category,
|
17
18
|
:to => :indexes
|
18
19
|
|
19
20
|
def initialize
|
20
|
-
|
21
|
+
clear_indexes
|
21
22
|
end
|
22
23
|
|
23
24
|
# Return the Indexes instance.
|
@@ -32,11 +33,12 @@ class Indexes
|
|
32
33
|
:[],
|
33
34
|
:to_s,
|
34
35
|
:size,
|
35
|
-
:each
|
36
|
+
:each,
|
37
|
+
:each_category
|
36
38
|
|
37
39
|
# Clears the indexes and the mapping.
|
38
40
|
#
|
39
|
-
def
|
41
|
+
def clear_indexes
|
40
42
|
@indexes = []
|
41
43
|
@index_mapping = {}
|
42
44
|
end
|
@@ -1,29 +1,17 @@
|
|
1
1
|
# Registers the indexes held at runtime, for queries.
|
2
2
|
#
|
3
3
|
class Indexes
|
4
|
-
|
4
|
+
|
5
5
|
instance_delegate :load_from_cache,
|
6
6
|
:reload,
|
7
7
|
:analyze
|
8
|
-
|
8
|
+
|
9
9
|
each_delegate :load_from_cache,
|
10
10
|
:to => :indexes
|
11
|
-
|
11
|
+
|
12
12
|
# Reloads all indexes, one after another,
|
13
13
|
# in the order they were added.
|
14
14
|
#
|
15
15
|
alias reload load_from_cache
|
16
16
|
|
17
|
-
# Load each index, and analyze it.
|
18
|
-
#
|
19
|
-
# Returns a hash with the findings.
|
20
|
-
#
|
21
|
-
def analyze
|
22
|
-
result = {}
|
23
|
-
indexes.each do |index|
|
24
|
-
index.analyze result
|
25
|
-
end
|
26
|
-
result
|
27
|
-
end
|
28
|
-
|
29
17
|
end
|