picky 0.11.2 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/picky/Index_api.rb +49 -0
- data/lib/picky/alias_instances.rb +4 -1
- data/lib/picky/application.rb +16 -15
- data/lib/picky/cacher/partial/{subtoken.rb → substring.rb} +19 -18
- data/lib/picky/{character_substitution/european.rb → character_substituters/west_european.rb} +2 -2
- data/lib/picky/configuration/index.rb +67 -0
- data/lib/picky/cores.rb +3 -0
- data/lib/picky/index/bundle.rb +35 -51
- data/lib/picky/index/file/basic.rb +39 -5
- data/lib/picky/index/file/json.rb +10 -0
- data/lib/picky/index/file/marshal.rb +10 -0
- data/lib/picky/index/file/text.rb +22 -0
- data/lib/picky/index/files.rb +11 -36
- data/lib/picky/indexed/bundle.rb +61 -0
- data/lib/picky/{index → indexed}/categories.rb +1 -1
- data/lib/picky/{index → indexed}/category.rb +13 -16
- data/lib/picky/{index/type.rb → indexed/index.rb} +6 -6
- data/lib/picky/{index/types.rb → indexed/indexes.rb} +10 -10
- data/lib/picky/{index → indexed}/wrappers/exact_first.rb +8 -8
- data/lib/picky/indexers/no_source_specified_error.rb +1 -1
- data/lib/picky/indexers/serial.rb +64 -0
- data/lib/picky/indexers/solr.rb +1 -3
- data/lib/picky/indexes_api.rb +41 -0
- data/lib/picky/indexing/bundle.rb +43 -13
- data/lib/picky/indexing/category.rb +17 -64
- data/lib/picky/indexing/{type.rb → index.rb} +13 -3
- data/lib/picky/indexing/{types.rb → indexes.rb} +22 -22
- data/lib/picky/loader.rb +17 -22
- data/lib/picky/query/base.rb +1 -1
- data/lib/picky/rack/harakiri.rb +9 -2
- data/lib/picky/signals.rb +1 -1
- data/lib/picky/sources/base.rb +14 -14
- data/lib/picky/sources/couch.rb +8 -7
- data/lib/picky/sources/csv.rb +10 -10
- data/lib/picky/sources/db.rb +8 -8
- data/lib/picky/sources/delicious.rb +2 -2
- data/lib/picky/sources/wrappers/location.rb +3 -3
- data/lib/picky/tokenizers/base.rb +1 -11
- data/lib/picky/tokenizers/index.rb +0 -1
- data/lib/picky/tokenizers/query.rb +0 -1
- data/lib/tasks/index.rake +4 -4
- data/lib/tasks/shortcuts.rake +4 -4
- data/lib/tasks/try.rake +8 -8
- data/project_prototype/Gemfile +1 -1
- data/project_prototype/app/application.rb +13 -12
- data/spec/lib/application_spec.rb +10 -38
- data/spec/lib/cacher/partial/{subtoken_spec.rb → substring_spec.rb} +0 -0
- data/spec/lib/{character_substitution/european_spec.rb → character_substituters/west_european_spec.rb} +6 -2
- data/spec/lib/configuration/index_spec.rb +80 -0
- data/spec/lib/cores_spec.rb +1 -1
- data/spec/lib/index/file/text_spec.rb +1 -1
- data/spec/lib/index/files_spec.rb +12 -32
- data/spec/lib/indexed/bundle_spec.rb +119 -0
- data/spec/lib/{indexing → indexed}/categories_spec.rb +13 -14
- data/spec/lib/{index → indexed}/category_spec.rb +6 -6
- data/spec/lib/{index/type_spec.rb → indexed/index_spec.rb} +3 -3
- data/spec/lib/{index → indexed}/wrappers/exact_first_spec.rb +5 -5
- data/spec/lib/indexers/serial_spec.rb +62 -0
- data/spec/lib/indexing/bundle_partial_generation_speed_spec.rb +7 -5
- data/spec/lib/indexing/bundle_spec.rb +9 -14
- data/spec/lib/indexing/category_spec.rb +9 -125
- data/spec/lib/indexing/{type_spec.rb → index_spec.rb} +3 -3
- data/spec/lib/query/base_spec.rb +1 -1
- data/spec/lib/query/full_spec.rb +1 -1
- data/spec/lib/query/live_spec.rb +2 -4
- data/spec/lib/sources/couch_spec.rb +5 -5
- data/spec/lib/sources/db_spec.rb +6 -7
- data/spec/lib/tokenizers/base_spec.rb +1 -24
- data/spec/lib/tokenizers/query_spec.rb +0 -1
- metadata +38 -41
- data/lib/picky/bundle.rb +0 -33
- data/lib/picky/configuration/indexes.rb +0 -51
- data/lib/picky/configuration/queries.rb +0 -15
- data/lib/picky/indexers/base.rb +0 -85
- data/lib/picky/indexers/default.rb +0 -3
- data/lib/picky/type.rb +0 -46
- data/lib/picky/types.rb +0 -41
- data/lib/tasks/cache.rake +0 -46
- data/spec/lib/configuration/indexes_spec.rb +0 -28
- data/spec/lib/index/bundle_spec.rb +0 -151
- data/spec/lib/indexers/base_spec.rb +0 -89
@@ -1,8 +1,8 @@
|
|
1
1
|
module Indexing
|
2
2
|
|
3
|
-
class
|
3
|
+
class Indexes
|
4
4
|
|
5
|
-
attr_reader :
|
5
|
+
attr_reader :indexes
|
6
6
|
|
7
7
|
each_delegate :take_snapshot,
|
8
8
|
:generate_caches,
|
@@ -11,7 +11,7 @@ module Indexing
|
|
11
11
|
:check_caches,
|
12
12
|
:clear_caches,
|
13
13
|
:create_directory_structure,
|
14
|
-
:to => :
|
14
|
+
:to => :indexes
|
15
15
|
|
16
16
|
def initialize
|
17
17
|
clear
|
@@ -20,13 +20,13 @@ module Indexing
|
|
20
20
|
# TODO Spec.
|
21
21
|
#
|
22
22
|
def clear
|
23
|
-
@
|
23
|
+
@indexes = []
|
24
24
|
end
|
25
25
|
|
26
26
|
# TODO Spec. Superclass?
|
27
27
|
#
|
28
|
-
def register
|
29
|
-
self.
|
28
|
+
def register index
|
29
|
+
self.indexes << index
|
30
30
|
end
|
31
31
|
|
32
32
|
# Runs the indexers in parallel (index + cache).
|
@@ -39,9 +39,9 @@ module Indexing
|
|
39
39
|
# Run in parallel.
|
40
40
|
#
|
41
41
|
timed_exclaim "INDEXING USING #{Cores.max_processors} PROCESSORS, IN #{randomly ? 'RANDOM' : 'GIVEN'} ORDER."
|
42
|
-
Cores.forked self.
|
43
|
-
|
44
|
-
|
42
|
+
Cores.forked self.indexes, { randomly: randomly } do |an_index|
|
43
|
+
an_index.index
|
44
|
+
an_index.cache
|
45
45
|
end
|
46
46
|
timed_exclaim "INDEXING FINISHED."
|
47
47
|
end
|
@@ -51,36 +51,36 @@ module Indexing
|
|
51
51
|
def index_for_tests
|
52
52
|
take_snapshot
|
53
53
|
|
54
|
-
self.
|
55
|
-
|
56
|
-
|
54
|
+
self.indexes.each do |an_index|
|
55
|
+
an_index.index
|
56
|
+
an_index.cache
|
57
57
|
end
|
58
58
|
end
|
59
59
|
|
60
60
|
# TODO Spec
|
61
61
|
#
|
62
|
-
def generate_index_only
|
63
|
-
found = find
|
62
|
+
def generate_index_only index_name, category_name
|
63
|
+
found = find index_name, category_name
|
64
64
|
found.index if found
|
65
65
|
end
|
66
|
-
def generate_cache_only
|
67
|
-
found = find
|
66
|
+
def generate_cache_only index_name, category_name
|
67
|
+
found = find index_name, category_name
|
68
68
|
found.generate_caches if found
|
69
69
|
end
|
70
70
|
|
71
71
|
# TODO Spec
|
72
72
|
#
|
73
|
-
def find
|
74
|
-
|
73
|
+
def find index_name, category_name
|
74
|
+
index_name = index_name.to_sym
|
75
75
|
|
76
|
-
|
77
|
-
next unless
|
76
|
+
indexes.each do |index|
|
77
|
+
next unless index.name == index_name
|
78
78
|
|
79
|
-
found =
|
79
|
+
found = index.categories.find category_name
|
80
80
|
return found if found
|
81
81
|
end
|
82
82
|
|
83
|
-
raise %Q{Index "#{
|
83
|
+
raise %Q{Index "#{index_name}" not found. Possible indexes: "#{indexes.map(&:name).join('", "')}".}
|
84
84
|
end
|
85
85
|
|
86
86
|
end
|
data/lib/picky/loader.rb
CHANGED
@@ -104,9 +104,9 @@ module Loader
|
|
104
104
|
load_relative 'helpers/cache'
|
105
105
|
load_relative 'helpers/measuring'
|
106
106
|
|
107
|
-
# Character
|
107
|
+
# Character Substituters
|
108
108
|
#
|
109
|
-
load_relative '
|
109
|
+
load_relative 'character_substituters/west_european'
|
110
110
|
|
111
111
|
# Signal handling
|
112
112
|
#
|
@@ -119,8 +119,7 @@ module Loader
|
|
119
119
|
# Index generation strategies.
|
120
120
|
#
|
121
121
|
load_relative 'indexers/no_source_specified_error'
|
122
|
-
load_relative 'indexers/
|
123
|
-
load_relative 'indexers/default'
|
122
|
+
load_relative 'indexers/serial'
|
124
123
|
#
|
125
124
|
# load_relative 'indexers/solr'
|
126
125
|
|
@@ -132,7 +131,7 @@ module Loader
|
|
132
131
|
#
|
133
132
|
load_relative 'cacher/partial/strategy'
|
134
133
|
load_relative 'cacher/partial/none'
|
135
|
-
load_relative 'cacher/partial/
|
134
|
+
load_relative 'cacher/partial/substring'
|
136
135
|
load_relative 'cacher/partial/default'
|
137
136
|
|
138
137
|
# Weight index generation strategies.
|
@@ -167,27 +166,27 @@ module Loader
|
|
167
166
|
load_relative 'index/file/json'
|
168
167
|
load_relative 'index/files'
|
169
168
|
|
170
|
-
#
|
169
|
+
# Indexing and Indexed things.
|
171
170
|
#
|
172
|
-
load_relative 'bundle'
|
171
|
+
load_relative 'index/bundle'
|
173
172
|
|
174
173
|
load_relative 'indexing/bundle'
|
175
174
|
load_relative 'indexing/category'
|
176
175
|
load_relative 'indexing/categories'
|
177
|
-
load_relative 'indexing/
|
178
|
-
load_relative 'indexing/
|
176
|
+
load_relative 'indexing/index'
|
177
|
+
load_relative 'indexing/indexes'
|
179
178
|
|
180
|
-
load_relative '
|
181
|
-
load_relative '
|
182
|
-
load_relative '
|
183
|
-
load_relative 'index
|
184
|
-
load_relative '
|
179
|
+
load_relative 'indexed/bundle'
|
180
|
+
load_relative 'indexed/category'
|
181
|
+
load_relative 'indexed/categories'
|
182
|
+
load_relative 'indexed/index'
|
183
|
+
load_relative 'indexed/indexes'
|
185
184
|
|
186
|
-
load_relative '
|
185
|
+
load_relative 'indexes_api'
|
187
186
|
load_relative 'alias_instances'
|
188
|
-
load_relative '
|
187
|
+
load_relative 'index_api'
|
189
188
|
|
190
|
-
load_relative '
|
189
|
+
load_relative 'indexed/wrappers/exact_first'
|
191
190
|
|
192
191
|
# Tokens.
|
193
192
|
#
|
@@ -240,11 +239,7 @@ module Loader
|
|
240
239
|
|
241
240
|
# Configuration.
|
242
241
|
#
|
243
|
-
load_relative 'configuration/
|
244
|
-
|
245
|
-
# ... in Application.
|
246
|
-
#
|
247
|
-
load_relative 'configuration/queries'
|
242
|
+
load_relative 'configuration/index'
|
248
243
|
|
249
244
|
# Application and routing.
|
250
245
|
#
|
data/lib/picky/query/base.rb
CHANGED
@@ -19,7 +19,7 @@ module Query
|
|
19
19
|
#
|
20
20
|
def initialize *index_type_definitions
|
21
21
|
options = Hash === index_type_definitions.last ? index_type_definitions.pop : {}
|
22
|
-
indexes = index_type_definitions.map &:
|
22
|
+
indexes = index_type_definitions.map &:indexed
|
23
23
|
|
24
24
|
@weigher = options[:weigher] || Weigher.new(indexes)
|
25
25
|
@tokenizer = options[:tokenizer] || Tokenizers::Query.default
|
data/lib/picky/rack/harakiri.rb
CHANGED
@@ -4,8 +4,12 @@ module Rack
|
|
4
4
|
#
|
5
5
|
# Use as follows in e.g. your rackup File:
|
6
6
|
#
|
7
|
-
#
|
8
|
-
#
|
7
|
+
# Rack::Harakiri.after = 100
|
8
|
+
# use Rack::Harakiri
|
9
|
+
#
|
10
|
+
# Then the Unicorn will commit suicide after 100 requests (50 is the default).
|
11
|
+
#
|
12
|
+
# The Master Unicorn process forks a new child Unicorn to replace the old one.
|
9
13
|
#
|
10
14
|
class Harakiri
|
11
15
|
|
@@ -21,6 +25,9 @@ module Rack
|
|
21
25
|
@quit_after_requests = self.class.after || 50
|
22
26
|
end
|
23
27
|
|
28
|
+
# Harakiri is a middleware, so it passes the call on after checking if it
|
29
|
+
# is time to honorably retire.
|
30
|
+
#
|
24
31
|
def call env
|
25
32
|
harakiri
|
26
33
|
@app.call env
|
data/lib/picky/signals.rb
CHANGED
data/lib/picky/sources/base.rb
CHANGED
@@ -10,22 +10,9 @@ module Sources
|
|
10
10
|
# * take_snapshot: Optional, called once for each type.
|
11
11
|
class Base
|
12
12
|
|
13
|
-
# Note:
|
13
|
+
# Note: Default methods do nothing.
|
14
14
|
#
|
15
15
|
|
16
|
-
# Called by the indexer when gathering data.
|
17
|
-
#
|
18
|
-
# Yields the data (id, text for id) for the given type and field.
|
19
|
-
#
|
20
|
-
# When implementing or overriding your own,
|
21
|
-
# be sure to <tt>yield</tt> (or <tt>block.call</tt>) an id (as string or integer)
|
22
|
-
# and a corresponding text for the given type symbol and
|
23
|
-
# category symbol.
|
24
|
-
#
|
25
|
-
def harvest type, category
|
26
|
-
# yields nothing
|
27
|
-
end
|
28
|
-
|
29
16
|
# Connect to the backend.
|
30
17
|
#
|
31
18
|
# Note: Called once per index/category combination
|
@@ -47,6 +34,19 @@ module Sources
|
|
47
34
|
|
48
35
|
end
|
49
36
|
|
37
|
+
# Called by the indexer when gathering data.
|
38
|
+
#
|
39
|
+
# Yields the data (id, text for id) for the given type and category.
|
40
|
+
#
|
41
|
+
# When implementing or overriding your own,
|
42
|
+
# be sure to <tt>yield</tt> (or <tt>block.call</tt>) an id (as string or integer)
|
43
|
+
# and a corresponding text for the given type symbol and
|
44
|
+
# category symbol.
|
45
|
+
#
|
46
|
+
def harvest type, category
|
47
|
+
# yields nothing
|
48
|
+
end
|
49
|
+
|
50
50
|
end
|
51
51
|
|
52
52
|
end
|
data/lib/picky/sources/couch.rb
CHANGED
@@ -8,9 +8,9 @@ module Sources
|
|
8
8
|
|
9
9
|
class Couch < Base
|
10
10
|
|
11
|
-
def initialize *
|
11
|
+
def initialize *category_names, options
|
12
12
|
check_gem
|
13
|
-
Hash === options && options[:url] || raise_no_db_given(
|
13
|
+
Hash === options && options[:url] || raise_no_db_given(category_names)
|
14
14
|
@db = RestClient::Resource.new options.delete(:url), options
|
15
15
|
end
|
16
16
|
|
@@ -23,9 +23,10 @@ module Sources
|
|
23
23
|
|
24
24
|
# Harvests the data to index.
|
25
25
|
#
|
26
|
-
def harvest type,
|
26
|
+
def harvest type, category
|
27
|
+
category_name = category.name.to_s
|
27
28
|
get_data do |doc|
|
28
|
-
yield doc['_id'].to_i, doc[
|
29
|
+
yield doc['_id'].to_i, doc[category_name] || next
|
29
30
|
end
|
30
31
|
end
|
31
32
|
|
@@ -35,9 +36,9 @@ module Sources
|
|
35
36
|
map{|row| row['doc']}.
|
36
37
|
each &block
|
37
38
|
end
|
38
|
-
|
39
|
-
def raise_no_db_given
|
40
|
-
raise NoCouchDBGiven.new(
|
39
|
+
|
40
|
+
def raise_no_db_given category_names
|
41
|
+
raise NoCouchDBGiven.new(category_names.join(', '))
|
41
42
|
end
|
42
43
|
end
|
43
44
|
end
|
data/lib/picky/sources/csv.rb
CHANGED
@@ -1,32 +1,32 @@
|
|
1
1
|
module Sources
|
2
2
|
|
3
3
|
# Describes a CSV source, a file with csv in it.
|
4
|
-
# Give it a sequence of
|
4
|
+
# Give it a sequence of category names and a file option with the filename.
|
5
5
|
#
|
6
6
|
class NoCSVFileGiven < StandardError; end
|
7
7
|
|
8
8
|
class CSV < Base
|
9
9
|
|
10
|
-
attr_reader :file_name, :
|
10
|
+
attr_reader :file_name, :category_names
|
11
11
|
|
12
|
-
def initialize *
|
12
|
+
def initialize *category_names, options
|
13
13
|
require 'csv'
|
14
|
-
@
|
15
|
-
@file_name = Hash === options && options[:file] || raise_no_file_given(
|
14
|
+
@category_names = category_names
|
15
|
+
@file_name = Hash === options && options[:file] || raise_no_file_given(category_names)
|
16
16
|
end
|
17
17
|
|
18
18
|
#
|
19
19
|
#
|
20
|
-
def raise_no_file_given
|
21
|
-
raise NoCSVFileGiven.new(
|
20
|
+
def raise_no_file_given category_names
|
21
|
+
raise NoCSVFileGiven.new(category_names.join(', '))
|
22
22
|
end
|
23
23
|
|
24
24
|
# Harvests the data to index.
|
25
25
|
#
|
26
|
-
def harvest _,
|
27
|
-
index =
|
26
|
+
def harvest _, category
|
27
|
+
index = category_names.index category.name
|
28
28
|
get_data do |ary|
|
29
|
-
indexed_id = ary.shift.to_i
|
29
|
+
indexed_id = ary.shift.to_i # TODO is to_i necessary?
|
30
30
|
text = ary[index]
|
31
31
|
next unless text
|
32
32
|
text.force_encoding 'utf-8' # TODO Still needed?
|
data/lib/picky/sources/db.rb
CHANGED
@@ -93,11 +93,11 @@ module Sources
|
|
93
93
|
# Example:
|
94
94
|
# "SELECT indexed_id, value FROM bla_table st WHERE kind = 'bla'"
|
95
95
|
#
|
96
|
-
def harvest type,
|
96
|
+
def harvest type, category
|
97
97
|
connect_backend
|
98
98
|
|
99
99
|
(0..count(type)).step(chunksize) do |offset|
|
100
|
-
get_data(type,
|
100
|
+
get_data(type, category, offset).each do |indexed_id, text|
|
101
101
|
next unless text
|
102
102
|
text.force_encoding 'utf-8' # TODO Still needed?
|
103
103
|
yield indexed_id, text
|
@@ -107,16 +107,16 @@ module Sources
|
|
107
107
|
|
108
108
|
# Gets database from the backend.
|
109
109
|
#
|
110
|
-
def get_data type,
|
111
|
-
database.connection.execute harvest_statement_with_offset(type,
|
110
|
+
def get_data type, category, offset
|
111
|
+
database.connection.execute harvest_statement_with_offset(type, category, offset)
|
112
112
|
end
|
113
113
|
|
114
114
|
# Builds a harvest statement for getting data to index.
|
115
115
|
#
|
116
116
|
# TODO Use the adapter for this.
|
117
117
|
#
|
118
|
-
def harvest_statement_with_offset type,
|
119
|
-
statement = harvest_statement type,
|
118
|
+
def harvest_statement_with_offset type, category, offset
|
119
|
+
statement = harvest_statement type, category
|
120
120
|
|
121
121
|
statement += statement.include?('WHERE') ? ' AND' : ' WHERE'
|
122
122
|
|
@@ -125,8 +125,8 @@ module Sources
|
|
125
125
|
|
126
126
|
# Base harvest statement for dbs.
|
127
127
|
#
|
128
|
-
def harvest_statement type,
|
129
|
-
"SELECT indexed_id, #{
|
128
|
+
def harvest_statement type, category
|
129
|
+
"SELECT indexed_id, #{category.name} FROM #{snapshot_table_name(type)} st"
|
130
130
|
end
|
131
131
|
|
132
132
|
# Override in subclasses.
|
@@ -16,10 +16,10 @@ module Sources
|
|
16
16
|
|
17
17
|
# Harvests the data to index.
|
18
18
|
#
|
19
|
-
def harvest _,
|
19
|
+
def harvest _, category
|
20
20
|
get_data do |uid, data|
|
21
21
|
indexed_id = uid
|
22
|
-
text = data[
|
22
|
+
text = data[category.name]
|
23
23
|
next unless text
|
24
24
|
text.force_encoding 'utf-8' # TODO Still needed?
|
25
25
|
yield indexed_id, text
|
@@ -38,9 +38,9 @@ module Sources
|
|
38
38
|
@min = 1.0/0
|
39
39
|
end
|
40
40
|
|
41
|
-
# Yield the data (id, text for id) for the given type and
|
41
|
+
# Yield the data (id, text for id) for the given type and category.
|
42
42
|
#
|
43
|
-
def harvest type,
|
43
|
+
def harvest type, category
|
44
44
|
reset
|
45
45
|
|
46
46
|
# Cache. TODO Make option?
|
@@ -49,7 +49,7 @@ module Sources
|
|
49
49
|
|
50
50
|
# Gather min/max.
|
51
51
|
#
|
52
|
-
backend.harvest type,
|
52
|
+
backend.harvest type, category do |indexed_id, location|
|
53
53
|
location = location.to_f
|
54
54
|
@min = location if location < @min
|
55
55
|
locations << [indexed_id, location]
|
@@ -22,16 +22,6 @@ module Tokenizers
|
|
22
22
|
remove_stopwords text
|
23
23
|
end
|
24
24
|
|
25
|
-
# Contraction.
|
26
|
-
#
|
27
|
-
def contracts_expressions what, to_what
|
28
|
-
@contract_what = what
|
29
|
-
@contract_to_what = to_what
|
30
|
-
end
|
31
|
-
def contract text
|
32
|
-
text.gsub! @contract_what, @contract_to_what if @contract_what
|
33
|
-
end
|
34
|
-
|
35
25
|
# Illegals.
|
36
26
|
#
|
37
27
|
# TODO Should there be a legal?
|
@@ -83,7 +73,7 @@ module Tokenizers
|
|
83
73
|
#
|
84
74
|
# Default is European Character substitution.
|
85
75
|
#
|
86
|
-
def substitutes_characters_with substituter =
|
76
|
+
def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
|
87
77
|
# TODO Raise if it doesn't quack substitute?
|
88
78
|
@substituter = substituter
|
89
79
|
end
|
@@ -25,7 +25,6 @@ module Tokenizers
|
|
25
25
|
text = substitute_characters text
|
26
26
|
text.downcase!
|
27
27
|
remove_illegals text
|
28
|
-
contract text
|
29
28
|
# we do not remove single stopwords for an entirely different
|
30
29
|
# reason than in the query tokenizer.
|
31
30
|
# An indexed thing with just name "UND" (a stopword) should not lose its name.
|
data/lib/tasks/index.rake
CHANGED
@@ -17,10 +17,10 @@ namespace :index do
|
|
17
17
|
end
|
18
18
|
|
19
19
|
desc "Generates a specific index from index snapshots."
|
20
|
-
task :specific, [:
|
21
|
-
|
22
|
-
Indexes.generate_index_only
|
23
|
-
Indexes.generate_cache_only
|
20
|
+
task :specific, [:index, :category] => :application do |_, options|
|
21
|
+
index, category = options.index, options.category
|
22
|
+
Indexes.generate_index_only index.to_sym, category.to_sym
|
23
|
+
Indexes.generate_cache_only index.to_sym, category.to_sym
|
24
24
|
end
|
25
25
|
|
26
26
|
desc 'Checks the index files for files that are suspiciously small or missing.'
|
data/lib/tasks/shortcuts.rake
CHANGED
@@ -3,11 +3,11 @@ task :index => :application do
|
|
3
3
|
Rake::Task[:'index:randomly'].invoke
|
4
4
|
end
|
5
5
|
|
6
|
-
desc "Try the given text in the indexer/query (
|
7
|
-
task :try, [:text, :
|
8
|
-
text,
|
6
|
+
desc "Try the given text in the indexer/query (index:category optional)."
|
7
|
+
task :try, [:text, :index_and_category] => :application do |_, options|
|
8
|
+
text, index_and_category = options.text, options.index_and_category
|
9
9
|
|
10
|
-
Rake::Task[:'try:both'].invoke text,
|
10
|
+
Rake::Task[:'try:both'].invoke text, index_and_category
|
11
11
|
end
|
12
12
|
|
13
13
|
desc "Start the server."
|
data/lib/tasks/try.rake
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
#
|
3
3
|
namespace :try do
|
4
4
|
|
5
|
-
# desc "Try how a given word would be tokenized when indexing (type:
|
6
|
-
task :index, [:text, :
|
7
|
-
text,
|
5
|
+
# desc "Try how a given word would be tokenized when indexing (type:category optional)."
|
6
|
+
task :index, [:text, :index_and_category] => :application do |_, options|
|
7
|
+
text, index_and_category = options.text, options.index_and_category
|
8
8
|
|
9
|
-
tokenizer =
|
9
|
+
tokenizer = index_and_category ? Indexes.find(*index_and_category.split(':')).tokenizer : Tokenizers::Index.default
|
10
10
|
|
11
11
|
puts "\"#{text}\" is index tokenized as #{tokenizer.tokenize(text.dup).to_a}"
|
12
12
|
end
|
@@ -18,11 +18,11 @@ namespace :try do
|
|
18
18
|
puts "\"#{text}\" is query tokenized as #{Tokenizers::Query.default.tokenize(text.dup).to_a.map(&:to_s).map(&:to_sym)}"
|
19
19
|
end
|
20
20
|
|
21
|
-
# desc "Try the given text with both the index and the query (type:
|
22
|
-
task :both, [:text, :
|
23
|
-
text,
|
21
|
+
# desc "Try the given text with both the index and the query (type:category optional)."
|
22
|
+
task :both, [:text, :index_and_category] => :application do |_, options|
|
23
|
+
text, index_and_category = options.text, options.index_and_category
|
24
24
|
|
25
|
-
Rake::Task[:"try:index"].invoke text,
|
25
|
+
Rake::Task[:"try:index"].invoke text, index_and_category
|
26
26
|
Rake::Task[:"try:query"].invoke text
|
27
27
|
end
|
28
28
|
|
data/project_prototype/Gemfile
CHANGED
@@ -9,32 +9,33 @@
|
|
9
9
|
class PickySearch < Application
|
10
10
|
|
11
11
|
# Indexing: How text is indexed.
|
12
|
-
# Querying: How query text is handled.
|
13
12
|
#
|
14
13
|
default_indexing removes_characters: /[^a-zA-Z0-9\s\/\-\"\&\.]/,
|
15
14
|
stopwords: /\b(and|the|of|it|in|for)\b/,
|
16
15
|
splits_text_on: /[\s\/\-\"\&\.]/
|
17
16
|
|
17
|
+
# Querying: How query text is handled.
|
18
|
+
#
|
18
19
|
default_querying removes_characters: /[^a-zA-Z0-9\s\/\-\,\&\"\~\*\:]/, # Picky needs control chars *"~: to pass through.
|
19
20
|
stopwords: /\b(and|the|of|it|in|for)\b/,
|
20
21
|
splits_text_on: /[\s\/\-\,\&]+/,
|
21
22
|
|
22
|
-
maximum_tokens: 5, #
|
23
|
-
substitutes_characters_with:
|
23
|
+
maximum_tokens: 5, # Amount of tokens passing into a query (5 = default).
|
24
|
+
substitutes_characters_with: CharacterSubstituters::WestEuropean.new # Normalizes special user input, Ä -> Ae, ñ -> n etc.
|
24
25
|
|
25
26
|
# Define an index. Use a database etc. source?
|
26
27
|
# See http://github.com/floere/picky/wiki/Sources-Configuration#sources
|
27
28
|
#
|
28
29
|
books_index = index :books, Sources::CSV.new(:title, :author, :isbn, file: 'app/library.csv')
|
29
|
-
books_index.
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
books_index.
|
34
|
-
|
35
|
-
books_index.
|
36
|
-
|
37
|
-
|
30
|
+
books_index.define_category :title,
|
31
|
+
similarity: Similarity::Phonetic.new(3), # Up to three similar title word indexed (default: No similarity).
|
32
|
+
partial: Partial::Substring.new(from: 1) # Indexes substrings upwards from character 1 (default: -3),
|
33
|
+
# You'll find "picky" even when entering just a "p".
|
34
|
+
books_index.define_category :author,
|
35
|
+
partial: Partial::Substring.new(from: 1)
|
36
|
+
books_index.define_category :isbn,
|
37
|
+
partial: Partial::None.new # Partial substring searching on an ISBN does not make
|
38
|
+
# much sense, neither does similarity.
|
38
39
|
|
39
40
|
query_options = { :weights => { [:title, :author] => +3, [:title] => +1 } } # +/- points for ordered combinations.
|
40
41
|
|