picky 0.11.2 → 0.12.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/picky/Index_api.rb +49 -0
- data/lib/picky/alias_instances.rb +4 -1
- data/lib/picky/application.rb +16 -15
- data/lib/picky/cacher/partial/{subtoken.rb → substring.rb} +19 -18
- data/lib/picky/{character_substitution/european.rb → character_substituters/west_european.rb} +2 -2
- data/lib/picky/configuration/index.rb +67 -0
- data/lib/picky/cores.rb +3 -0
- data/lib/picky/index/bundle.rb +35 -51
- data/lib/picky/index/file/basic.rb +39 -5
- data/lib/picky/index/file/json.rb +10 -0
- data/lib/picky/index/file/marshal.rb +10 -0
- data/lib/picky/index/file/text.rb +22 -0
- data/lib/picky/index/files.rb +11 -36
- data/lib/picky/indexed/bundle.rb +61 -0
- data/lib/picky/{index → indexed}/categories.rb +1 -1
- data/lib/picky/{index → indexed}/category.rb +13 -16
- data/lib/picky/{index/type.rb → indexed/index.rb} +6 -6
- data/lib/picky/{index/types.rb → indexed/indexes.rb} +10 -10
- data/lib/picky/{index → indexed}/wrappers/exact_first.rb +8 -8
- data/lib/picky/indexers/no_source_specified_error.rb +1 -1
- data/lib/picky/indexers/serial.rb +64 -0
- data/lib/picky/indexers/solr.rb +1 -3
- data/lib/picky/indexes_api.rb +41 -0
- data/lib/picky/indexing/bundle.rb +43 -13
- data/lib/picky/indexing/category.rb +17 -64
- data/lib/picky/indexing/{type.rb → index.rb} +13 -3
- data/lib/picky/indexing/{types.rb → indexes.rb} +22 -22
- data/lib/picky/loader.rb +17 -22
- data/lib/picky/query/base.rb +1 -1
- data/lib/picky/rack/harakiri.rb +9 -2
- data/lib/picky/signals.rb +1 -1
- data/lib/picky/sources/base.rb +14 -14
- data/lib/picky/sources/couch.rb +8 -7
- data/lib/picky/sources/csv.rb +10 -10
- data/lib/picky/sources/db.rb +8 -8
- data/lib/picky/sources/delicious.rb +2 -2
- data/lib/picky/sources/wrappers/location.rb +3 -3
- data/lib/picky/tokenizers/base.rb +1 -11
- data/lib/picky/tokenizers/index.rb +0 -1
- data/lib/picky/tokenizers/query.rb +0 -1
- data/lib/tasks/index.rake +4 -4
- data/lib/tasks/shortcuts.rake +4 -4
- data/lib/tasks/try.rake +8 -8
- data/project_prototype/Gemfile +1 -1
- data/project_prototype/app/application.rb +13 -12
- data/spec/lib/application_spec.rb +10 -38
- data/spec/lib/cacher/partial/{subtoken_spec.rb → substring_spec.rb} +0 -0
- data/spec/lib/{character_substitution/european_spec.rb → character_substituters/west_european_spec.rb} +6 -2
- data/spec/lib/configuration/index_spec.rb +80 -0
- data/spec/lib/cores_spec.rb +1 -1
- data/spec/lib/index/file/text_spec.rb +1 -1
- data/spec/lib/index/files_spec.rb +12 -32
- data/spec/lib/indexed/bundle_spec.rb +119 -0
- data/spec/lib/{indexing → indexed}/categories_spec.rb +13 -14
- data/spec/lib/{index → indexed}/category_spec.rb +6 -6
- data/spec/lib/{index/type_spec.rb → indexed/index_spec.rb} +3 -3
- data/spec/lib/{index → indexed}/wrappers/exact_first_spec.rb +5 -5
- data/spec/lib/indexers/serial_spec.rb +62 -0
- data/spec/lib/indexing/bundle_partial_generation_speed_spec.rb +7 -5
- data/spec/lib/indexing/bundle_spec.rb +9 -14
- data/spec/lib/indexing/category_spec.rb +9 -125
- data/spec/lib/indexing/{type_spec.rb → index_spec.rb} +3 -3
- data/spec/lib/query/base_spec.rb +1 -1
- data/spec/lib/query/full_spec.rb +1 -1
- data/spec/lib/query/live_spec.rb +2 -4
- data/spec/lib/sources/couch_spec.rb +5 -5
- data/spec/lib/sources/db_spec.rb +6 -7
- data/spec/lib/tokenizers/base_spec.rb +1 -24
- data/spec/lib/tokenizers/query_spec.rb +0 -1
- metadata +38 -41
- data/lib/picky/bundle.rb +0 -33
- data/lib/picky/configuration/indexes.rb +0 -51
- data/lib/picky/configuration/queries.rb +0 -15
- data/lib/picky/indexers/base.rb +0 -85
- data/lib/picky/indexers/default.rb +0 -3
- data/lib/picky/type.rb +0 -46
- data/lib/picky/types.rb +0 -41
- data/lib/tasks/cache.rake +0 -46
- data/spec/lib/configuration/indexes_spec.rb +0 -28
- data/spec/lib/index/bundle_spec.rb +0 -151
- data/spec/lib/indexers/base_spec.rb +0 -89
@@ -1,8 +1,8 @@
|
|
1
1
|
module Indexing
|
2
2
|
|
3
|
-
class
|
3
|
+
class Indexes
|
4
4
|
|
5
|
-
attr_reader :
|
5
|
+
attr_reader :indexes
|
6
6
|
|
7
7
|
each_delegate :take_snapshot,
|
8
8
|
:generate_caches,
|
@@ -11,7 +11,7 @@ module Indexing
|
|
11
11
|
:check_caches,
|
12
12
|
:clear_caches,
|
13
13
|
:create_directory_structure,
|
14
|
-
:to => :
|
14
|
+
:to => :indexes
|
15
15
|
|
16
16
|
def initialize
|
17
17
|
clear
|
@@ -20,13 +20,13 @@ module Indexing
|
|
20
20
|
# TODO Spec.
|
21
21
|
#
|
22
22
|
def clear
|
23
|
-
@
|
23
|
+
@indexes = []
|
24
24
|
end
|
25
25
|
|
26
26
|
# TODO Spec. Superclass?
|
27
27
|
#
|
28
|
-
def register
|
29
|
-
self.
|
28
|
+
def register index
|
29
|
+
self.indexes << index
|
30
30
|
end
|
31
31
|
|
32
32
|
# Runs the indexers in parallel (index + cache).
|
@@ -39,9 +39,9 @@ module Indexing
|
|
39
39
|
# Run in parallel.
|
40
40
|
#
|
41
41
|
timed_exclaim "INDEXING USING #{Cores.max_processors} PROCESSORS, IN #{randomly ? 'RANDOM' : 'GIVEN'} ORDER."
|
42
|
-
Cores.forked self.
|
43
|
-
|
44
|
-
|
42
|
+
Cores.forked self.indexes, { randomly: randomly } do |an_index|
|
43
|
+
an_index.index
|
44
|
+
an_index.cache
|
45
45
|
end
|
46
46
|
timed_exclaim "INDEXING FINISHED."
|
47
47
|
end
|
@@ -51,36 +51,36 @@ module Indexing
|
|
51
51
|
def index_for_tests
|
52
52
|
take_snapshot
|
53
53
|
|
54
|
-
self.
|
55
|
-
|
56
|
-
|
54
|
+
self.indexes.each do |an_index|
|
55
|
+
an_index.index
|
56
|
+
an_index.cache
|
57
57
|
end
|
58
58
|
end
|
59
59
|
|
60
60
|
# TODO Spec
|
61
61
|
#
|
62
|
-
def generate_index_only
|
63
|
-
found = find
|
62
|
+
def generate_index_only index_name, category_name
|
63
|
+
found = find index_name, category_name
|
64
64
|
found.index if found
|
65
65
|
end
|
66
|
-
def generate_cache_only
|
67
|
-
found = find
|
66
|
+
def generate_cache_only index_name, category_name
|
67
|
+
found = find index_name, category_name
|
68
68
|
found.generate_caches if found
|
69
69
|
end
|
70
70
|
|
71
71
|
# TODO Spec
|
72
72
|
#
|
73
|
-
def find
|
74
|
-
|
73
|
+
def find index_name, category_name
|
74
|
+
index_name = index_name.to_sym
|
75
75
|
|
76
|
-
|
77
|
-
next unless
|
76
|
+
indexes.each do |index|
|
77
|
+
next unless index.name == index_name
|
78
78
|
|
79
|
-
found =
|
79
|
+
found = index.categories.find category_name
|
80
80
|
return found if found
|
81
81
|
end
|
82
82
|
|
83
|
-
raise %Q{Index "#{
|
83
|
+
raise %Q{Index "#{index_name}" not found. Possible indexes: "#{indexes.map(&:name).join('", "')}".}
|
84
84
|
end
|
85
85
|
|
86
86
|
end
|
data/lib/picky/loader.rb
CHANGED
@@ -104,9 +104,9 @@ module Loader
|
|
104
104
|
load_relative 'helpers/cache'
|
105
105
|
load_relative 'helpers/measuring'
|
106
106
|
|
107
|
-
# Character
|
107
|
+
# Character Substituters
|
108
108
|
#
|
109
|
-
load_relative '
|
109
|
+
load_relative 'character_substituters/west_european'
|
110
110
|
|
111
111
|
# Signal handling
|
112
112
|
#
|
@@ -119,8 +119,7 @@ module Loader
|
|
119
119
|
# Index generation strategies.
|
120
120
|
#
|
121
121
|
load_relative 'indexers/no_source_specified_error'
|
122
|
-
load_relative 'indexers/
|
123
|
-
load_relative 'indexers/default'
|
122
|
+
load_relative 'indexers/serial'
|
124
123
|
#
|
125
124
|
# load_relative 'indexers/solr'
|
126
125
|
|
@@ -132,7 +131,7 @@ module Loader
|
|
132
131
|
#
|
133
132
|
load_relative 'cacher/partial/strategy'
|
134
133
|
load_relative 'cacher/partial/none'
|
135
|
-
load_relative 'cacher/partial/
|
134
|
+
load_relative 'cacher/partial/substring'
|
136
135
|
load_relative 'cacher/partial/default'
|
137
136
|
|
138
137
|
# Weight index generation strategies.
|
@@ -167,27 +166,27 @@ module Loader
|
|
167
166
|
load_relative 'index/file/json'
|
168
167
|
load_relative 'index/files'
|
169
168
|
|
170
|
-
#
|
169
|
+
# Indexing and Indexed things.
|
171
170
|
#
|
172
|
-
load_relative 'bundle'
|
171
|
+
load_relative 'index/bundle'
|
173
172
|
|
174
173
|
load_relative 'indexing/bundle'
|
175
174
|
load_relative 'indexing/category'
|
176
175
|
load_relative 'indexing/categories'
|
177
|
-
load_relative 'indexing/
|
178
|
-
load_relative 'indexing/
|
176
|
+
load_relative 'indexing/index'
|
177
|
+
load_relative 'indexing/indexes'
|
179
178
|
|
180
|
-
load_relative '
|
181
|
-
load_relative '
|
182
|
-
load_relative '
|
183
|
-
load_relative 'index
|
184
|
-
load_relative '
|
179
|
+
load_relative 'indexed/bundle'
|
180
|
+
load_relative 'indexed/category'
|
181
|
+
load_relative 'indexed/categories'
|
182
|
+
load_relative 'indexed/index'
|
183
|
+
load_relative 'indexed/indexes'
|
185
184
|
|
186
|
-
load_relative '
|
185
|
+
load_relative 'indexes_api'
|
187
186
|
load_relative 'alias_instances'
|
188
|
-
load_relative '
|
187
|
+
load_relative 'index_api'
|
189
188
|
|
190
|
-
load_relative '
|
189
|
+
load_relative 'indexed/wrappers/exact_first'
|
191
190
|
|
192
191
|
# Tokens.
|
193
192
|
#
|
@@ -240,11 +239,7 @@ module Loader
|
|
240
239
|
|
241
240
|
# Configuration.
|
242
241
|
#
|
243
|
-
load_relative 'configuration/
|
244
|
-
|
245
|
-
# ... in Application.
|
246
|
-
#
|
247
|
-
load_relative 'configuration/queries'
|
242
|
+
load_relative 'configuration/index'
|
248
243
|
|
249
244
|
# Application and routing.
|
250
245
|
#
|
data/lib/picky/query/base.rb
CHANGED
@@ -19,7 +19,7 @@ module Query
|
|
19
19
|
#
|
20
20
|
def initialize *index_type_definitions
|
21
21
|
options = Hash === index_type_definitions.last ? index_type_definitions.pop : {}
|
22
|
-
indexes = index_type_definitions.map &:
|
22
|
+
indexes = index_type_definitions.map &:indexed
|
23
23
|
|
24
24
|
@weigher = options[:weigher] || Weigher.new(indexes)
|
25
25
|
@tokenizer = options[:tokenizer] || Tokenizers::Query.default
|
data/lib/picky/rack/harakiri.rb
CHANGED
@@ -4,8 +4,12 @@ module Rack
|
|
4
4
|
#
|
5
5
|
# Use as follows in e.g. your rackup File:
|
6
6
|
#
|
7
|
-
#
|
8
|
-
#
|
7
|
+
# Rack::Harakiri.after = 100
|
8
|
+
# use Rack::Harakiri
|
9
|
+
#
|
10
|
+
# Then the Unicorn will commit suicide after 100 requests (50 is the default).
|
11
|
+
#
|
12
|
+
# The Master Unicorn process forks a new child Unicorn to replace the old one.
|
9
13
|
#
|
10
14
|
class Harakiri
|
11
15
|
|
@@ -21,6 +25,9 @@ module Rack
|
|
21
25
|
@quit_after_requests = self.class.after || 50
|
22
26
|
end
|
23
27
|
|
28
|
+
# Harakiri is a middleware, so it passes the call on after checking if it
|
29
|
+
# is time to honorably retire.
|
30
|
+
#
|
24
31
|
def call env
|
25
32
|
harakiri
|
26
33
|
@app.call env
|
data/lib/picky/signals.rb
CHANGED
data/lib/picky/sources/base.rb
CHANGED
@@ -10,22 +10,9 @@ module Sources
|
|
10
10
|
# * take_snapshot: Optional, called once for each type.
|
11
11
|
class Base
|
12
12
|
|
13
|
-
# Note:
|
13
|
+
# Note: Default methods do nothing.
|
14
14
|
#
|
15
15
|
|
16
|
-
# Called by the indexer when gathering data.
|
17
|
-
#
|
18
|
-
# Yields the data (id, text for id) for the given type and field.
|
19
|
-
#
|
20
|
-
# When implementing or overriding your own,
|
21
|
-
# be sure to <tt>yield</tt> (or <tt>block.call</tt>) an id (as string or integer)
|
22
|
-
# and a corresponding text for the given type symbol and
|
23
|
-
# category symbol.
|
24
|
-
#
|
25
|
-
def harvest type, category
|
26
|
-
# yields nothing
|
27
|
-
end
|
28
|
-
|
29
16
|
# Connect to the backend.
|
30
17
|
#
|
31
18
|
# Note: Called once per index/category combination
|
@@ -47,6 +34,19 @@ module Sources
|
|
47
34
|
|
48
35
|
end
|
49
36
|
|
37
|
+
# Called by the indexer when gathering data.
|
38
|
+
#
|
39
|
+
# Yields the data (id, text for id) for the given type and category.
|
40
|
+
#
|
41
|
+
# When implementing or overriding your own,
|
42
|
+
# be sure to <tt>yield</tt> (or <tt>block.call</tt>) an id (as string or integer)
|
43
|
+
# and a corresponding text for the given type symbol and
|
44
|
+
# category symbol.
|
45
|
+
#
|
46
|
+
def harvest type, category
|
47
|
+
# yields nothing
|
48
|
+
end
|
49
|
+
|
50
50
|
end
|
51
51
|
|
52
52
|
end
|
data/lib/picky/sources/couch.rb
CHANGED
@@ -8,9 +8,9 @@ module Sources
|
|
8
8
|
|
9
9
|
class Couch < Base
|
10
10
|
|
11
|
-
def initialize *
|
11
|
+
def initialize *category_names, options
|
12
12
|
check_gem
|
13
|
-
Hash === options && options[:url] || raise_no_db_given(
|
13
|
+
Hash === options && options[:url] || raise_no_db_given(category_names)
|
14
14
|
@db = RestClient::Resource.new options.delete(:url), options
|
15
15
|
end
|
16
16
|
|
@@ -23,9 +23,10 @@ module Sources
|
|
23
23
|
|
24
24
|
# Harvests the data to index.
|
25
25
|
#
|
26
|
-
def harvest type,
|
26
|
+
def harvest type, category
|
27
|
+
category_name = category.name.to_s
|
27
28
|
get_data do |doc|
|
28
|
-
yield doc['_id'].to_i, doc[
|
29
|
+
yield doc['_id'].to_i, doc[category_name] || next
|
29
30
|
end
|
30
31
|
end
|
31
32
|
|
@@ -35,9 +36,9 @@ module Sources
|
|
35
36
|
map{|row| row['doc']}.
|
36
37
|
each &block
|
37
38
|
end
|
38
|
-
|
39
|
-
def raise_no_db_given
|
40
|
-
raise NoCouchDBGiven.new(
|
39
|
+
|
40
|
+
def raise_no_db_given category_names
|
41
|
+
raise NoCouchDBGiven.new(category_names.join(', '))
|
41
42
|
end
|
42
43
|
end
|
43
44
|
end
|
data/lib/picky/sources/csv.rb
CHANGED
@@ -1,32 +1,32 @@
|
|
1
1
|
module Sources
|
2
2
|
|
3
3
|
# Describes a CSV source, a file with csv in it.
|
4
|
-
# Give it a sequence of
|
4
|
+
# Give it a sequence of category names and a file option with the filename.
|
5
5
|
#
|
6
6
|
class NoCSVFileGiven < StandardError; end
|
7
7
|
|
8
8
|
class CSV < Base
|
9
9
|
|
10
|
-
attr_reader :file_name, :
|
10
|
+
attr_reader :file_name, :category_names
|
11
11
|
|
12
|
-
def initialize *
|
12
|
+
def initialize *category_names, options
|
13
13
|
require 'csv'
|
14
|
-
@
|
15
|
-
@file_name = Hash === options && options[:file] || raise_no_file_given(
|
14
|
+
@category_names = category_names
|
15
|
+
@file_name = Hash === options && options[:file] || raise_no_file_given(category_names)
|
16
16
|
end
|
17
17
|
|
18
18
|
#
|
19
19
|
#
|
20
|
-
def raise_no_file_given
|
21
|
-
raise NoCSVFileGiven.new(
|
20
|
+
def raise_no_file_given category_names
|
21
|
+
raise NoCSVFileGiven.new(category_names.join(', '))
|
22
22
|
end
|
23
23
|
|
24
24
|
# Harvests the data to index.
|
25
25
|
#
|
26
|
-
def harvest _,
|
27
|
-
index =
|
26
|
+
def harvest _, category
|
27
|
+
index = category_names.index category.name
|
28
28
|
get_data do |ary|
|
29
|
-
indexed_id = ary.shift.to_i
|
29
|
+
indexed_id = ary.shift.to_i # TODO is to_i necessary?
|
30
30
|
text = ary[index]
|
31
31
|
next unless text
|
32
32
|
text.force_encoding 'utf-8' # TODO Still needed?
|
data/lib/picky/sources/db.rb
CHANGED
@@ -93,11 +93,11 @@ module Sources
|
|
93
93
|
# Example:
|
94
94
|
# "SELECT indexed_id, value FROM bla_table st WHERE kind = 'bla'"
|
95
95
|
#
|
96
|
-
def harvest type,
|
96
|
+
def harvest type, category
|
97
97
|
connect_backend
|
98
98
|
|
99
99
|
(0..count(type)).step(chunksize) do |offset|
|
100
|
-
get_data(type,
|
100
|
+
get_data(type, category, offset).each do |indexed_id, text|
|
101
101
|
next unless text
|
102
102
|
text.force_encoding 'utf-8' # TODO Still needed?
|
103
103
|
yield indexed_id, text
|
@@ -107,16 +107,16 @@ module Sources
|
|
107
107
|
|
108
108
|
# Gets database from the backend.
|
109
109
|
#
|
110
|
-
def get_data type,
|
111
|
-
database.connection.execute harvest_statement_with_offset(type,
|
110
|
+
def get_data type, category, offset
|
111
|
+
database.connection.execute harvest_statement_with_offset(type, category, offset)
|
112
112
|
end
|
113
113
|
|
114
114
|
# Builds a harvest statement for getting data to index.
|
115
115
|
#
|
116
116
|
# TODO Use the adapter for this.
|
117
117
|
#
|
118
|
-
def harvest_statement_with_offset type,
|
119
|
-
statement = harvest_statement type,
|
118
|
+
def harvest_statement_with_offset type, category, offset
|
119
|
+
statement = harvest_statement type, category
|
120
120
|
|
121
121
|
statement += statement.include?('WHERE') ? ' AND' : ' WHERE'
|
122
122
|
|
@@ -125,8 +125,8 @@ module Sources
|
|
125
125
|
|
126
126
|
# Base harvest statement for dbs.
|
127
127
|
#
|
128
|
-
def harvest_statement type,
|
129
|
-
"SELECT indexed_id, #{
|
128
|
+
def harvest_statement type, category
|
129
|
+
"SELECT indexed_id, #{category.name} FROM #{snapshot_table_name(type)} st"
|
130
130
|
end
|
131
131
|
|
132
132
|
# Override in subclasses.
|
@@ -16,10 +16,10 @@ module Sources
|
|
16
16
|
|
17
17
|
# Harvests the data to index.
|
18
18
|
#
|
19
|
-
def harvest _,
|
19
|
+
def harvest _, category
|
20
20
|
get_data do |uid, data|
|
21
21
|
indexed_id = uid
|
22
|
-
text = data[
|
22
|
+
text = data[category.name]
|
23
23
|
next unless text
|
24
24
|
text.force_encoding 'utf-8' # TODO Still needed?
|
25
25
|
yield indexed_id, text
|
@@ -38,9 +38,9 @@ module Sources
|
|
38
38
|
@min = 1.0/0
|
39
39
|
end
|
40
40
|
|
41
|
-
# Yield the data (id, text for id) for the given type and
|
41
|
+
# Yield the data (id, text for id) for the given type and category.
|
42
42
|
#
|
43
|
-
def harvest type,
|
43
|
+
def harvest type, category
|
44
44
|
reset
|
45
45
|
|
46
46
|
# Cache. TODO Make option?
|
@@ -49,7 +49,7 @@ module Sources
|
|
49
49
|
|
50
50
|
# Gather min/max.
|
51
51
|
#
|
52
|
-
backend.harvest type,
|
52
|
+
backend.harvest type, category do |indexed_id, location|
|
53
53
|
location = location.to_f
|
54
54
|
@min = location if location < @min
|
55
55
|
locations << [indexed_id, location]
|
@@ -22,16 +22,6 @@ module Tokenizers
|
|
22
22
|
remove_stopwords text
|
23
23
|
end
|
24
24
|
|
25
|
-
# Contraction.
|
26
|
-
#
|
27
|
-
def contracts_expressions what, to_what
|
28
|
-
@contract_what = what
|
29
|
-
@contract_to_what = to_what
|
30
|
-
end
|
31
|
-
def contract text
|
32
|
-
text.gsub! @contract_what, @contract_to_what if @contract_what
|
33
|
-
end
|
34
|
-
|
35
25
|
# Illegals.
|
36
26
|
#
|
37
27
|
# TODO Should there be a legal?
|
@@ -83,7 +73,7 @@ module Tokenizers
|
|
83
73
|
#
|
84
74
|
# Default is European Character substitution.
|
85
75
|
#
|
86
|
-
def substitutes_characters_with substituter =
|
76
|
+
def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
|
87
77
|
# TODO Raise if it doesn't quack substitute?
|
88
78
|
@substituter = substituter
|
89
79
|
end
|
@@ -25,7 +25,6 @@ module Tokenizers
|
|
25
25
|
text = substitute_characters text
|
26
26
|
text.downcase!
|
27
27
|
remove_illegals text
|
28
|
-
contract text
|
29
28
|
# we do not remove single stopwords for an entirely different
|
30
29
|
# reason than in the query tokenizer.
|
31
30
|
# An indexed thing with just name "UND" (a stopword) should not lose its name.
|
data/lib/tasks/index.rake
CHANGED
@@ -17,10 +17,10 @@ namespace :index do
|
|
17
17
|
end
|
18
18
|
|
19
19
|
desc "Generates a specific index from index snapshots."
|
20
|
-
task :specific, [:
|
21
|
-
|
22
|
-
Indexes.generate_index_only
|
23
|
-
Indexes.generate_cache_only
|
20
|
+
task :specific, [:index, :category] => :application do |_, options|
|
21
|
+
index, category = options.index, options.category
|
22
|
+
Indexes.generate_index_only index.to_sym, category.to_sym
|
23
|
+
Indexes.generate_cache_only index.to_sym, category.to_sym
|
24
24
|
end
|
25
25
|
|
26
26
|
desc 'Checks the index files for files that are suspiciously small or missing.'
|
data/lib/tasks/shortcuts.rake
CHANGED
@@ -3,11 +3,11 @@ task :index => :application do
|
|
3
3
|
Rake::Task[:'index:randomly'].invoke
|
4
4
|
end
|
5
5
|
|
6
|
-
desc "Try the given text in the indexer/query (
|
7
|
-
task :try, [:text, :
|
8
|
-
text,
|
6
|
+
desc "Try the given text in the indexer/query (index:category optional)."
|
7
|
+
task :try, [:text, :index_and_category] => :application do |_, options|
|
8
|
+
text, index_and_category = options.text, options.index_and_category
|
9
9
|
|
10
|
-
Rake::Task[:'try:both'].invoke text,
|
10
|
+
Rake::Task[:'try:both'].invoke text, index_and_category
|
11
11
|
end
|
12
12
|
|
13
13
|
desc "Start the server."
|
data/lib/tasks/try.rake
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
#
|
3
3
|
namespace :try do
|
4
4
|
|
5
|
-
# desc "Try how a given word would be tokenized when indexing (type:
|
6
|
-
task :index, [:text, :
|
7
|
-
text,
|
5
|
+
# desc "Try how a given word would be tokenized when indexing (type:category optional)."
|
6
|
+
task :index, [:text, :index_and_category] => :application do |_, options|
|
7
|
+
text, index_and_category = options.text, options.index_and_category
|
8
8
|
|
9
|
-
tokenizer =
|
9
|
+
tokenizer = index_and_category ? Indexes.find(*index_and_category.split(':')).tokenizer : Tokenizers::Index.default
|
10
10
|
|
11
11
|
puts "\"#{text}\" is index tokenized as #{tokenizer.tokenize(text.dup).to_a}"
|
12
12
|
end
|
@@ -18,11 +18,11 @@ namespace :try do
|
|
18
18
|
puts "\"#{text}\" is query tokenized as #{Tokenizers::Query.default.tokenize(text.dup).to_a.map(&:to_s).map(&:to_sym)}"
|
19
19
|
end
|
20
20
|
|
21
|
-
# desc "Try the given text with both the index and the query (type:
|
22
|
-
task :both, [:text, :
|
23
|
-
text,
|
21
|
+
# desc "Try the given text with both the index and the query (type:category optional)."
|
22
|
+
task :both, [:text, :index_and_category] => :application do |_, options|
|
23
|
+
text, index_and_category = options.text, options.index_and_category
|
24
24
|
|
25
|
-
Rake::Task[:"try:index"].invoke text,
|
25
|
+
Rake::Task[:"try:index"].invoke text, index_and_category
|
26
26
|
Rake::Task[:"try:query"].invoke text
|
27
27
|
end
|
28
28
|
|
data/project_prototype/Gemfile
CHANGED
@@ -9,32 +9,33 @@
|
|
9
9
|
class PickySearch < Application
|
10
10
|
|
11
11
|
# Indexing: How text is indexed.
|
12
|
-
# Querying: How query text is handled.
|
13
12
|
#
|
14
13
|
default_indexing removes_characters: /[^a-zA-Z0-9\s\/\-\"\&\.]/,
|
15
14
|
stopwords: /\b(and|the|of|it|in|for)\b/,
|
16
15
|
splits_text_on: /[\s\/\-\"\&\.]/
|
17
16
|
|
17
|
+
# Querying: How query text is handled.
|
18
|
+
#
|
18
19
|
default_querying removes_characters: /[^a-zA-Z0-9\s\/\-\,\&\"\~\*\:]/, # Picky needs control chars *"~: to pass through.
|
19
20
|
stopwords: /\b(and|the|of|it|in|for)\b/,
|
20
21
|
splits_text_on: /[\s\/\-\,\&]+/,
|
21
22
|
|
22
|
-
maximum_tokens: 5, #
|
23
|
-
substitutes_characters_with:
|
23
|
+
maximum_tokens: 5, # Amount of tokens passing into a query (5 = default).
|
24
|
+
substitutes_characters_with: CharacterSubstituters::WestEuropean.new # Normalizes special user input, Ä -> Ae, ñ -> n etc.
|
24
25
|
|
25
26
|
# Define an index. Use a database etc. source?
|
26
27
|
# See http://github.com/floere/picky/wiki/Sources-Configuration#sources
|
27
28
|
#
|
28
29
|
books_index = index :books, Sources::CSV.new(:title, :author, :isbn, file: 'app/library.csv')
|
29
|
-
books_index.
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
books_index.
|
34
|
-
|
35
|
-
books_index.
|
36
|
-
|
37
|
-
|
30
|
+
books_index.define_category :title,
|
31
|
+
similarity: Similarity::Phonetic.new(3), # Up to three similar title word indexed (default: No similarity).
|
32
|
+
partial: Partial::Substring.new(from: 1) # Indexes substrings upwards from character 1 (default: -3),
|
33
|
+
# You'll find "picky" even when entering just a "p".
|
34
|
+
books_index.define_category :author,
|
35
|
+
partial: Partial::Substring.new(from: 1)
|
36
|
+
books_index.define_category :isbn,
|
37
|
+
partial: Partial::None.new # Partial substring searching on an ISBN does not make
|
38
|
+
# much sense, neither does similarity.
|
38
39
|
|
39
40
|
query_options = { :weights => { [:title, :author] => +3, [:title] => +1 } } # +/- points for ordered combinations.
|
40
41
|
|