picky 0.0.9 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/picky/application.rb +38 -37
- data/lib/picky/cacher/partial/default.rb +1 -3
- data/lib/picky/cacher/partial/subtoken.rb +44 -18
- data/lib/picky/configuration/field.rb +6 -2
- data/lib/picky/configuration/indexes.rb +16 -7
- data/lib/picky/configuration/queries.rb +3 -13
- data/lib/picky/extensions/symbol.rb +19 -4
- data/lib/picky/generator.rb +9 -0
- data/lib/picky/helpers/measuring.rb +3 -3
- data/lib/picky/index/bundle.rb +5 -4
- data/lib/picky/index/category.rb +14 -7
- data/lib/picky/index/combined.rb +6 -1
- data/lib/picky/indexers/no_source_specified_error.rb +2 -0
- data/lib/picky/indexes.rb +3 -9
- data/lib/picky/query/allocation.rb +1 -1
- data/lib/picky/query/allocations.rb +2 -2
- data/lib/picky/rack/harakiri.rb +10 -8
- data/lib/picky/routing.rb +19 -21
- data/lib/picky/solr/schema_generator.rb +4 -4
- data/lib/picky/sources/base.rb +16 -4
- data/lib/picky/sources/csv.rb +3 -0
- data/lib/picky/sources/db.rb +30 -22
- data/lib/picky/tokenizers/base.rb +7 -5
- data/lib/picky/tokenizers/index.rb +5 -5
- data/lib/picky/tokenizers/query.rb +9 -9
- data/prototype_project/app/application.rb +36 -29
- data/prototype_project/app/db.yml +1 -1
- data/prototype_project/config.ru +3 -2
- data/spec/ext/performant_spec.rb +2 -2
- data/spec/lib/application_spec.rb +54 -8
- data/spec/lib/cacher/partial/default_spec.rb +15 -0
- data/spec/lib/cacher/partial/subtoken_spec.rb +54 -2
- data/spec/lib/extensions/symbol_spec.rb +124 -30
- data/spec/lib/index/bundle_partial_generation_speed_spec.rb +1 -1
- data/spec/lib/query/allocations_spec.rb +5 -5
- data/spec/lib/query/combinations_spec.rb +3 -3
- data/spec/lib/rack/harakiri_spec.rb +29 -0
- data/spec/lib/routing_spec.rb +22 -98
- data/spec/lib/tokenizers/index_spec.rb +1 -1
- data/spec/specific/speed_spec.rb +4 -5
- metadata +7 -3
data/lib/picky/indexes.rb
CHANGED
@@ -171,16 +171,10 @@ module Indexes
|
|
171
171
|
end
|
172
172
|
end
|
173
173
|
|
174
|
-
# Loads all index definitions.
|
175
|
-
#
|
176
|
-
def self.setup
|
177
|
-
self.types ||= []
|
178
|
-
self.type_mapping ||= {}
|
179
|
-
configuration.types.each do |type|
|
180
|
-
add type.generate
|
181
|
-
end
|
182
|
-
end
|
183
174
|
def self.add type
|
175
|
+
self.type_mapping ||= {}
|
176
|
+
self.types ||= []
|
177
|
+
|
184
178
|
self.type_mapping[type.name] = type
|
185
179
|
self.types << type
|
186
180
|
end
|
@@ -61,7 +61,7 @@ module Query
|
|
61
61
|
# Transform the allocation into result form.
|
62
62
|
#
|
63
63
|
def to_result
|
64
|
-
[self.result_type, self.score,
|
64
|
+
[self.result_type, self.score, count, @combinations.to_result, self.ids] if count = self.count > 0
|
65
65
|
end
|
66
66
|
|
67
67
|
# Json representation of this allocation.
|
@@ -66,8 +66,8 @@ module Query
|
|
66
66
|
# TODO can there be no @allocations???
|
67
67
|
return [] if @allocations.empty?
|
68
68
|
ids = @allocations.first.ids
|
69
|
-
indexes = Array.new(ids.size) { |
|
70
|
-
indexes.first(amount).map { |
|
69
|
+
indexes = Array.new(ids.size) { |id| id }.sort_by { rand }
|
70
|
+
indexes.first(amount).map { |id| ids[id] }
|
71
71
|
end
|
72
72
|
|
73
73
|
# This is the main method of this class that will replace ids and count.
|
data/lib/picky/rack/harakiri.rb
CHANGED
@@ -1,22 +1,24 @@
|
|
1
|
-
# Simple Rack Middleware to kill Unicorns after X requests.
|
2
|
-
#
|
3
|
-
# Use as follows in e.g. your rackup File:
|
4
|
-
#
|
5
|
-
# Rack::Harakiri.after = 50
|
6
|
-
# use Rack::Harakiri
|
7
|
-
#
|
8
1
|
module Rack
|
2
|
+
|
3
|
+
# Simple Rack Middleware to kill Unicorns after X requests.
|
4
|
+
#
|
5
|
+
# Use as follows in e.g. your rackup File:
|
6
|
+
#
|
7
|
+
# Rack::Harakiri.after = 50
|
8
|
+
# use Rack::Harakiri
|
9
|
+
#
|
9
10
|
class Harakiri
|
10
11
|
|
11
12
|
# Set the amount of requests before the Unicorn commits Harakiri.
|
12
13
|
#
|
13
14
|
cattr_accessor :after
|
15
|
+
attr_reader :quit_after_requests
|
14
16
|
|
15
17
|
def initialize app
|
16
18
|
@app = app
|
17
19
|
|
18
20
|
@requests = 0
|
19
|
-
@quit_after_requests =
|
21
|
+
@quit_after_requests = self.class.after || 50
|
20
22
|
end
|
21
23
|
|
22
24
|
def call env
|
data/lib/picky/routing.rb
CHANGED
@@ -14,14 +14,6 @@ class Routing
|
|
14
14
|
@defaults = @@defaults.dup
|
15
15
|
end
|
16
16
|
|
17
|
-
# #
|
18
|
-
# #
|
19
|
-
# def define_using &block
|
20
|
-
# reset_routes
|
21
|
-
# instance_eval &block
|
22
|
-
# routes.freeze
|
23
|
-
# end
|
24
|
-
|
25
17
|
#
|
26
18
|
#
|
27
19
|
def reset_routes
|
@@ -60,21 +52,27 @@ class Routing
|
|
60
52
|
|
61
53
|
#
|
62
54
|
#
|
63
|
-
def route
|
64
|
-
|
65
|
-
|
55
|
+
def route options = {}
|
56
|
+
mappings, route_options = split options
|
57
|
+
mappings.each do |url, query|
|
58
|
+
route_one url, query, route_options
|
59
|
+
end
|
66
60
|
end
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
61
|
+
def split options
|
62
|
+
mappings = {}
|
63
|
+
route_options = {}
|
64
|
+
options.each_pair do |key, value|
|
65
|
+
if Regexp === key or String === key
|
66
|
+
mappings[key] = value
|
67
|
+
else
|
68
|
+
route_options[key] = value
|
69
|
+
end
|
70
|
+
end
|
71
|
+
[mappings, route_options]
|
72
72
|
end
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
route_options = Hash === indexes_and_options.last ? indexes_and_options.pop : {}
|
77
|
-
route url, Query::Full.new(*indexes_and_options), route_options
|
73
|
+
def route_one url, query, route_options = {}
|
74
|
+
query.tokenizer = @defaults[:tokenizer] if @defaults[:tokenizer]
|
75
|
+
routes.add_route generate_app(query, route_options), default_options(url, route_options)
|
78
76
|
end
|
79
77
|
#
|
80
78
|
#
|
@@ -54,8 +54,8 @@ module Solr
|
|
54
54
|
def read_template
|
55
55
|
template_path = File.join PICKY_ROOT, 'solr', 'conf', 'schema.xml.erb'
|
56
56
|
schema = ''
|
57
|
-
File.open(template_path, 'r') do |
|
58
|
-
schema =
|
57
|
+
File.open(template_path, 'r') do |file|
|
58
|
+
schema = file.read
|
59
59
|
end
|
60
60
|
schema
|
61
61
|
end
|
@@ -64,8 +64,8 @@ module Solr
|
|
64
64
|
#
|
65
65
|
def write result
|
66
66
|
schema_path = File.join PICKY_ROOT, 'solr', 'conf', 'schema.xml'
|
67
|
-
File.open(schema_path, 'w') do |
|
68
|
-
|
67
|
+
File.open(schema_path, 'w') do |file|
|
68
|
+
file << result
|
69
69
|
end
|
70
70
|
end
|
71
71
|
|
data/lib/picky/sources/base.rb
CHANGED
@@ -1,16 +1,28 @@
|
|
1
1
|
module Sources
|
2
2
|
|
3
|
+
# Sources are where your data comes from.
|
4
|
+
# Harvest is the most important method as it is used always to get data.
|
5
|
+
#
|
3
6
|
class Base
|
4
7
|
|
5
|
-
|
6
|
-
|
8
|
+
# Note: Methods listed for illustrative purposes.
|
9
|
+
#
|
10
|
+
|
11
|
+
# Yield the data (id, text for id) for the given type and field.
|
12
|
+
#
|
13
|
+
def harvest type, field
|
14
|
+
# yields nothing
|
7
15
|
end
|
8
16
|
|
9
|
-
|
17
|
+
# Connects to the backend.
|
18
|
+
#
|
19
|
+
def connect_backend
|
10
20
|
|
11
21
|
end
|
12
22
|
|
13
|
-
|
23
|
+
# Take a snapshot of your data, if it is fast changing.
|
24
|
+
#
|
25
|
+
def take_snapshot type
|
14
26
|
|
15
27
|
end
|
16
28
|
|
data/lib/picky/sources/csv.rb
CHANGED
data/lib/picky/sources/db.rb
CHANGED
@@ -1,13 +1,16 @@
|
|
1
1
|
module Sources
|
2
2
|
|
3
|
+
# Describes a database source. Just give it a select statement
|
4
|
+
# (with id in it), and a file option or the options from an AR config file.
|
5
|
+
#
|
3
6
|
class DB < Base
|
4
7
|
|
5
8
|
attr_reader :select_statement, :database, :connection_options
|
6
9
|
|
7
|
-
def initialize select_statement,
|
10
|
+
def initialize select_statement, options = { :file => 'app/db.yml' }
|
8
11
|
@select_statement = select_statement
|
9
12
|
@database = create_database_adapter
|
10
|
-
|
13
|
+
@options = options
|
11
14
|
end
|
12
15
|
|
13
16
|
# Get a configured Database backend.
|
@@ -34,7 +37,7 @@ module Sources
|
|
34
37
|
#
|
35
38
|
def configure options
|
36
39
|
@connection_options = if filename = options[:file]
|
37
|
-
File.open(File.join(PICKY_ROOT, filename)) { |
|
40
|
+
File.open(File.join(PICKY_ROOT, filename)) { |file| YAML::load(file) }
|
38
41
|
else
|
39
42
|
options
|
40
43
|
end
|
@@ -44,6 +47,7 @@ module Sources
|
|
44
47
|
# Connect the backend.
|
45
48
|
#
|
46
49
|
def connect_backend
|
50
|
+
configure @options
|
47
51
|
return if PICKY_ENVIRONMENT.to_s == 'test' # TODO Unclean.
|
48
52
|
raise "Database backend not configured" unless connection_options
|
49
53
|
database.establish_connection connection_options
|
@@ -55,15 +59,17 @@ module Sources
|
|
55
59
|
connect_backend
|
56
60
|
|
57
61
|
origin = snapshot_table_name type
|
58
|
-
|
59
|
-
database.connection
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
62
|
+
|
63
|
+
on_database = database.connection
|
64
|
+
|
65
|
+
on_database.execute "DROP TABLE IF EXISTS #{origin}"
|
66
|
+
on_database.execute "CREATE TABLE #{origin} AS #{select_statement}"
|
67
|
+
on_database.execute "ALTER TABLE #{origin} CHANGE COLUMN id indexed_id INTEGER"
|
68
|
+
on_database.execute "ALTER TABLE #{origin} ADD COLUMN id INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT"
|
69
|
+
|
64
70
|
# Execute any special queries this type needs executed.
|
65
71
|
#
|
66
|
-
|
72
|
+
on_database.execute type.after_indexing if type.after_indexing
|
67
73
|
end
|
68
74
|
|
69
75
|
# Counts all the entries that are used for the index.
|
@@ -86,6 +92,8 @@ module Sources
|
|
86
92
|
# Example:
|
87
93
|
# "SELECT indexed_id, value FROM bla_table st WHERE kind = 'bla'"
|
88
94
|
#
|
95
|
+
# TODO Perhaps it should be just harvest field.
|
96
|
+
#
|
89
97
|
def harvest type, field
|
90
98
|
connect_backend
|
91
99
|
|
@@ -98,24 +106,12 @@ module Sources
|
|
98
106
|
end
|
99
107
|
end
|
100
108
|
|
101
|
-
# Override in subclasses.
|
102
|
-
#
|
103
|
-
def chunksize
|
104
|
-
25_000
|
105
|
-
end
|
106
|
-
|
107
109
|
# Gets database from the backend.
|
108
110
|
#
|
109
111
|
def get_data type, field, offset
|
110
112
|
database.connection.execute harvest_statement_with_offset(type, field, offset)
|
111
113
|
end
|
112
114
|
|
113
|
-
# Base harvest statement for dbs.
|
114
|
-
#
|
115
|
-
def harvest_statement type, field
|
116
|
-
"SELECT indexed_id, #{field.name} FROM #{snapshot_table_name(type)} st"
|
117
|
-
end
|
118
|
-
|
119
115
|
# Builds a harvest statement for getting data to index.
|
120
116
|
#
|
121
117
|
# TODO Use the adapter for this.
|
@@ -128,6 +124,18 @@ module Sources
|
|
128
124
|
"#{statement} st.id > #{offset} LIMIT #{chunksize}"
|
129
125
|
end
|
130
126
|
|
127
|
+
# Base harvest statement for dbs.
|
128
|
+
#
|
129
|
+
def harvest_statement type, field
|
130
|
+
"SELECT indexed_id, #{field.name} FROM #{snapshot_table_name(type)} st"
|
131
|
+
end
|
132
|
+
|
133
|
+
# Override in subclasses.
|
134
|
+
#
|
135
|
+
def chunksize
|
136
|
+
25_000
|
137
|
+
end
|
138
|
+
|
131
139
|
end
|
132
140
|
|
133
141
|
end
|
@@ -1,5 +1,7 @@
|
|
1
1
|
module Tokenizers
|
2
2
|
|
3
|
+
# Defines tokenizing processes used both in indexing and querying.
|
4
|
+
#
|
3
5
|
class Base
|
4
6
|
|
5
7
|
# Stopwords.
|
@@ -21,7 +23,7 @@ module Tokenizers
|
|
21
23
|
|
22
24
|
# Contraction.
|
23
25
|
#
|
24
|
-
def self.
|
26
|
+
def self.contracts_expressions what, to_what
|
25
27
|
define_method :contract do |text|
|
26
28
|
text.gsub! what, to_what
|
27
29
|
end
|
@@ -32,7 +34,7 @@ module Tokenizers
|
|
32
34
|
#
|
33
35
|
# TODO Should there be a legal?
|
34
36
|
#
|
35
|
-
def self.
|
37
|
+
def self.removes_characters regexp
|
36
38
|
define_method :remove_illegals do |text|
|
37
39
|
text.gsub! regexp, ''
|
38
40
|
end
|
@@ -41,7 +43,7 @@ module Tokenizers
|
|
41
43
|
|
42
44
|
# Splitting.
|
43
45
|
#
|
44
|
-
def self.
|
46
|
+
def self.splits_text_on regexp
|
45
47
|
define_method :split do |text|
|
46
48
|
text.split regexp
|
47
49
|
end
|
@@ -50,7 +52,7 @@ module Tokenizers
|
|
50
52
|
|
51
53
|
# Normalizing.
|
52
54
|
#
|
53
|
-
def self.
|
55
|
+
def self.normalizes_words regexp_replaces
|
54
56
|
define_method :normalize_with_patterns do |text|
|
55
57
|
regexp_replaces.each do |regex, replace|
|
56
58
|
# This should be sufficient
|
@@ -65,7 +67,7 @@ module Tokenizers
|
|
65
67
|
|
66
68
|
# Illegal after normalizing.
|
67
69
|
#
|
68
|
-
def self.
|
70
|
+
def self.removes_characters_after_splitting regexp
|
69
71
|
define_method :remove_after_normalizing_illegals do |text|
|
70
72
|
text.gsub! regexp, ''
|
71
73
|
end
|
@@ -9,12 +9,12 @@ module Tokenizers
|
|
9
9
|
|
10
10
|
# Default handling definitions. Override in config.
|
11
11
|
#
|
12
|
-
|
12
|
+
removes_characters(//)
|
13
13
|
stopwords(//)
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
14
|
+
contracts_expressions(//, '')
|
15
|
+
splits_text_on(/\s/)
|
16
|
+
normalizes_words([])
|
17
|
+
removes_characters_after_splitting(//)
|
18
18
|
|
19
19
|
# Default indexing preprocessing hook.
|
20
20
|
#
|
@@ -4,12 +4,12 @@ module Tokenizers
|
|
4
4
|
|
5
5
|
# There are a few class methods that you can use to configure how a query works.
|
6
6
|
#
|
7
|
-
#
|
7
|
+
# removes_characters regexp
|
8
8
|
# illegal_after_normalizing regexp
|
9
9
|
# stopwords regexp
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
10
|
+
# contracts_expressions regexp, to_string
|
11
|
+
# splits_text_on regexp
|
12
|
+
# normalizes_words [[/regexp1/, 'replacement1'], [/regexp2/, 'replacement2']]
|
13
13
|
#
|
14
14
|
class Query < Base
|
15
15
|
|
@@ -17,12 +17,12 @@ module Tokenizers
|
|
17
17
|
|
18
18
|
# Default query tokenizer behaviour. Override in config.
|
19
19
|
#
|
20
|
-
|
20
|
+
removes_characters(//)
|
21
21
|
stopwords(//)
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
22
|
+
contracts_expressions(//, '')
|
23
|
+
splits_text_on(/\s/)
|
24
|
+
normalizes_words([])
|
25
|
+
removes_characters_after_splitting(//)
|
26
26
|
|
27
27
|
def preprocess text
|
28
28
|
remove_illegals text # Remove illegal characters
|
@@ -4,43 +4,50 @@
|
|
4
4
|
#
|
5
5
|
# Have fun with Picky!
|
6
6
|
#
|
7
|
-
class PickySearch < Application # The App Constant needs to be identical in
|
7
|
+
class PickySearch < Application # The App Constant needs to be identical in config.ru.
|
8
8
|
|
9
9
|
# This is an example with books that you can adapt.
|
10
10
|
#
|
11
11
|
# Note: Much more is possible, but let's start out super easy.
|
12
12
|
#
|
13
|
-
# Ask me if you have questions or specific requests
|
13
|
+
# Ask me if you have questions or specific requests.
|
14
14
|
#
|
15
15
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
split_text_on(/[\s\/\-\"\&\.]/)
|
16
|
+
indexing.removes_characters(/[^a-zA-Z0-9\s\/\-\"\&\.]/)
|
17
|
+
indexing.stopwords(/\b(and|the|of|it|in|for)\b/)
|
18
|
+
indexing.splits_text_on(/[\s\/\-\"\&\.]/)
|
20
19
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
end
|
20
|
+
books_index = index :books,
|
21
|
+
Sources::DB.new('SELECT id, title, author, isbn13 as isbn FROM books', :file => 'app/db.yml'),
|
22
|
+
field(:title, :similarity => Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed.
|
23
|
+
field(:author),
|
24
|
+
field(:isbn, :partial => Partial::None.new) # Partially searching on an ISBN makes not much sense.
|
27
25
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
26
|
+
# Defines the maximum tokens (words) that pass through to the engine.
|
27
|
+
#
|
28
|
+
querying.maximum_tokens 5
|
29
|
+
|
30
|
+
# Note that Picky needs the following characters to
|
31
|
+
# pass through, as they are control characters: *"~:
|
32
|
+
#
|
33
|
+
querying.removes_characters(/[^a-zA-Z0-9\s\/\-\,\&\"\~\*\:]/)
|
34
|
+
querying.stopwords(/\b(and|the|of|it|in|for)\b/)
|
35
|
+
querying.splits_text_on(/[\s\/\-\,\&]+/)
|
36
|
+
|
37
|
+
# The example defines two queries that use the same index(es).
|
38
|
+
#
|
39
|
+
# A Full query returns ids, combinations, and counts.
|
40
|
+
# A Live query does return all that Full returns, without ids.
|
41
|
+
#
|
42
|
+
# Note: You can pass a query multiple indexes and it will combine them.
|
43
|
+
#
|
44
|
+
full_books = Query::Full.new books_index
|
45
|
+
live_books = Query::Live.new books_index
|
46
|
+
|
47
|
+
# Routing is simple.
|
48
|
+
# A path regexp pointing to a query that will be run.
|
49
|
+
#
|
50
|
+
route %r{^/books/full} => full_books
|
51
|
+
route %r{^/books/live} => live_books
|
45
52
|
|
46
53
|
end
|