picky 0.0.9 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/picky/application.rb +38 -37
- data/lib/picky/cacher/partial/default.rb +1 -3
- data/lib/picky/cacher/partial/subtoken.rb +44 -18
- data/lib/picky/configuration/field.rb +6 -2
- data/lib/picky/configuration/indexes.rb +16 -7
- data/lib/picky/configuration/queries.rb +3 -13
- data/lib/picky/extensions/symbol.rb +19 -4
- data/lib/picky/generator.rb +9 -0
- data/lib/picky/helpers/measuring.rb +3 -3
- data/lib/picky/index/bundle.rb +5 -4
- data/lib/picky/index/category.rb +14 -7
- data/lib/picky/index/combined.rb +6 -1
- data/lib/picky/indexers/no_source_specified_error.rb +2 -0
- data/lib/picky/indexes.rb +3 -9
- data/lib/picky/query/allocation.rb +1 -1
- data/lib/picky/query/allocations.rb +2 -2
- data/lib/picky/rack/harakiri.rb +10 -8
- data/lib/picky/routing.rb +19 -21
- data/lib/picky/solr/schema_generator.rb +4 -4
- data/lib/picky/sources/base.rb +16 -4
- data/lib/picky/sources/csv.rb +3 -0
- data/lib/picky/sources/db.rb +30 -22
- data/lib/picky/tokenizers/base.rb +7 -5
- data/lib/picky/tokenizers/index.rb +5 -5
- data/lib/picky/tokenizers/query.rb +9 -9
- data/prototype_project/app/application.rb +36 -29
- data/prototype_project/app/db.yml +1 -1
- data/prototype_project/config.ru +3 -2
- data/spec/ext/performant_spec.rb +2 -2
- data/spec/lib/application_spec.rb +54 -8
- data/spec/lib/cacher/partial/default_spec.rb +15 -0
- data/spec/lib/cacher/partial/subtoken_spec.rb +54 -2
- data/spec/lib/extensions/symbol_spec.rb +124 -30
- data/spec/lib/index/bundle_partial_generation_speed_spec.rb +1 -1
- data/spec/lib/query/allocations_spec.rb +5 -5
- data/spec/lib/query/combinations_spec.rb +3 -3
- data/spec/lib/rack/harakiri_spec.rb +29 -0
- data/spec/lib/routing_spec.rb +22 -98
- data/spec/lib/tokenizers/index_spec.rb +1 -1
- data/spec/specific/speed_spec.rb +4 -5
- metadata +7 -3
data/lib/picky/indexes.rb
CHANGED
@@ -171,16 +171,10 @@ module Indexes
|
|
171
171
|
end
|
172
172
|
end
|
173
173
|
|
174
|
-
# Loads all index definitions.
|
175
|
-
#
|
176
|
-
def self.setup
|
177
|
-
self.types ||= []
|
178
|
-
self.type_mapping ||= {}
|
179
|
-
configuration.types.each do |type|
|
180
|
-
add type.generate
|
181
|
-
end
|
182
|
-
end
|
183
174
|
def self.add type
|
175
|
+
self.type_mapping ||= {}
|
176
|
+
self.types ||= []
|
177
|
+
|
184
178
|
self.type_mapping[type.name] = type
|
185
179
|
self.types << type
|
186
180
|
end
|
@@ -61,7 +61,7 @@ module Query
|
|
61
61
|
# Transform the allocation into result form.
|
62
62
|
#
|
63
63
|
def to_result
|
64
|
-
[self.result_type, self.score,
|
64
|
+
[self.result_type, self.score, count, @combinations.to_result, self.ids] if count = self.count > 0
|
65
65
|
end
|
66
66
|
|
67
67
|
# Json representation of this allocation.
|
@@ -66,8 +66,8 @@ module Query
|
|
66
66
|
# TODO can there be no @allocations???
|
67
67
|
return [] if @allocations.empty?
|
68
68
|
ids = @allocations.first.ids
|
69
|
-
indexes = Array.new(ids.size) { |
|
70
|
-
indexes.first(amount).map { |
|
69
|
+
indexes = Array.new(ids.size) { |id| id }.sort_by { rand }
|
70
|
+
indexes.first(amount).map { |id| ids[id] }
|
71
71
|
end
|
72
72
|
|
73
73
|
# This is the main method of this class that will replace ids and count.
|
data/lib/picky/rack/harakiri.rb
CHANGED
@@ -1,22 +1,24 @@
|
|
1
|
-
# Simple Rack Middleware to kill Unicorns after X requests.
|
2
|
-
#
|
3
|
-
# Use as follows in e.g. your rackup File:
|
4
|
-
#
|
5
|
-
# Rack::Harakiri.after = 50
|
6
|
-
# use Rack::Harakiri
|
7
|
-
#
|
8
1
|
module Rack
|
2
|
+
|
3
|
+
# Simple Rack Middleware to kill Unicorns after X requests.
|
4
|
+
#
|
5
|
+
# Use as follows in e.g. your rackup File:
|
6
|
+
#
|
7
|
+
# Rack::Harakiri.after = 50
|
8
|
+
# use Rack::Harakiri
|
9
|
+
#
|
9
10
|
class Harakiri
|
10
11
|
|
11
12
|
# Set the amount of requests before the Unicorn commits Harakiri.
|
12
13
|
#
|
13
14
|
cattr_accessor :after
|
15
|
+
attr_reader :quit_after_requests
|
14
16
|
|
15
17
|
def initialize app
|
16
18
|
@app = app
|
17
19
|
|
18
20
|
@requests = 0
|
19
|
-
@quit_after_requests =
|
21
|
+
@quit_after_requests = self.class.after || 50
|
20
22
|
end
|
21
23
|
|
22
24
|
def call env
|
data/lib/picky/routing.rb
CHANGED
@@ -14,14 +14,6 @@ class Routing
|
|
14
14
|
@defaults = @@defaults.dup
|
15
15
|
end
|
16
16
|
|
17
|
-
# #
|
18
|
-
# #
|
19
|
-
# def define_using &block
|
20
|
-
# reset_routes
|
21
|
-
# instance_eval &block
|
22
|
-
# routes.freeze
|
23
|
-
# end
|
24
|
-
|
25
17
|
#
|
26
18
|
#
|
27
19
|
def reset_routes
|
@@ -60,21 +52,27 @@ class Routing
|
|
60
52
|
|
61
53
|
#
|
62
54
|
#
|
63
|
-
def route
|
64
|
-
|
65
|
-
|
55
|
+
def route options = {}
|
56
|
+
mappings, route_options = split options
|
57
|
+
mappings.each do |url, query|
|
58
|
+
route_one url, query, route_options
|
59
|
+
end
|
66
60
|
end
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
61
|
+
def split options
|
62
|
+
mappings = {}
|
63
|
+
route_options = {}
|
64
|
+
options.each_pair do |key, value|
|
65
|
+
if Regexp === key or String === key
|
66
|
+
mappings[key] = value
|
67
|
+
else
|
68
|
+
route_options[key] = value
|
69
|
+
end
|
70
|
+
end
|
71
|
+
[mappings, route_options]
|
72
72
|
end
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
route_options = Hash === indexes_and_options.last ? indexes_and_options.pop : {}
|
77
|
-
route url, Query::Full.new(*indexes_and_options), route_options
|
73
|
+
def route_one url, query, route_options = {}
|
74
|
+
query.tokenizer = @defaults[:tokenizer] if @defaults[:tokenizer]
|
75
|
+
routes.add_route generate_app(query, route_options), default_options(url, route_options)
|
78
76
|
end
|
79
77
|
#
|
80
78
|
#
|
@@ -54,8 +54,8 @@ module Solr
|
|
54
54
|
def read_template
|
55
55
|
template_path = File.join PICKY_ROOT, 'solr', 'conf', 'schema.xml.erb'
|
56
56
|
schema = ''
|
57
|
-
File.open(template_path, 'r') do |
|
58
|
-
schema =
|
57
|
+
File.open(template_path, 'r') do |file|
|
58
|
+
schema = file.read
|
59
59
|
end
|
60
60
|
schema
|
61
61
|
end
|
@@ -64,8 +64,8 @@ module Solr
|
|
64
64
|
#
|
65
65
|
def write result
|
66
66
|
schema_path = File.join PICKY_ROOT, 'solr', 'conf', 'schema.xml'
|
67
|
-
File.open(schema_path, 'w') do |
|
68
|
-
|
67
|
+
File.open(schema_path, 'w') do |file|
|
68
|
+
file << result
|
69
69
|
end
|
70
70
|
end
|
71
71
|
|
data/lib/picky/sources/base.rb
CHANGED
@@ -1,16 +1,28 @@
|
|
1
1
|
module Sources
|
2
2
|
|
3
|
+
# Sources are where your data comes from.
|
4
|
+
# Harvest is the most important method as it is used always to get data.
|
5
|
+
#
|
3
6
|
class Base
|
4
7
|
|
5
|
-
|
6
|
-
|
8
|
+
# Note: Methods listed for illustrative purposes.
|
9
|
+
#
|
10
|
+
|
11
|
+
# Yield the data (id, text for id) for the given type and field.
|
12
|
+
#
|
13
|
+
def harvest type, field
|
14
|
+
# yields nothing
|
7
15
|
end
|
8
16
|
|
9
|
-
|
17
|
+
# Connects to the backend.
|
18
|
+
#
|
19
|
+
def connect_backend
|
10
20
|
|
11
21
|
end
|
12
22
|
|
13
|
-
|
23
|
+
# Take a snapshot of your data, if it is fast changing.
|
24
|
+
#
|
25
|
+
def take_snapshot type
|
14
26
|
|
15
27
|
end
|
16
28
|
|
data/lib/picky/sources/csv.rb
CHANGED
data/lib/picky/sources/db.rb
CHANGED
@@ -1,13 +1,16 @@
|
|
1
1
|
module Sources
|
2
2
|
|
3
|
+
# Describes a database source. Just give it a select statement
|
4
|
+
# (with id in it), and a file option or the options from an AR config file.
|
5
|
+
#
|
3
6
|
class DB < Base
|
4
7
|
|
5
8
|
attr_reader :select_statement, :database, :connection_options
|
6
9
|
|
7
|
-
def initialize select_statement,
|
10
|
+
def initialize select_statement, options = { :file => 'app/db.yml' }
|
8
11
|
@select_statement = select_statement
|
9
12
|
@database = create_database_adapter
|
10
|
-
|
13
|
+
@options = options
|
11
14
|
end
|
12
15
|
|
13
16
|
# Get a configured Database backend.
|
@@ -34,7 +37,7 @@ module Sources
|
|
34
37
|
#
|
35
38
|
def configure options
|
36
39
|
@connection_options = if filename = options[:file]
|
37
|
-
File.open(File.join(PICKY_ROOT, filename)) { |
|
40
|
+
File.open(File.join(PICKY_ROOT, filename)) { |file| YAML::load(file) }
|
38
41
|
else
|
39
42
|
options
|
40
43
|
end
|
@@ -44,6 +47,7 @@ module Sources
|
|
44
47
|
# Connect the backend.
|
45
48
|
#
|
46
49
|
def connect_backend
|
50
|
+
configure @options
|
47
51
|
return if PICKY_ENVIRONMENT.to_s == 'test' # TODO Unclean.
|
48
52
|
raise "Database backend not configured" unless connection_options
|
49
53
|
database.establish_connection connection_options
|
@@ -55,15 +59,17 @@ module Sources
|
|
55
59
|
connect_backend
|
56
60
|
|
57
61
|
origin = snapshot_table_name type
|
58
|
-
|
59
|
-
database.connection
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
62
|
+
|
63
|
+
on_database = database.connection
|
64
|
+
|
65
|
+
on_database.execute "DROP TABLE IF EXISTS #{origin}"
|
66
|
+
on_database.execute "CREATE TABLE #{origin} AS #{select_statement}"
|
67
|
+
on_database.execute "ALTER TABLE #{origin} CHANGE COLUMN id indexed_id INTEGER"
|
68
|
+
on_database.execute "ALTER TABLE #{origin} ADD COLUMN id INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT"
|
69
|
+
|
64
70
|
# Execute any special queries this type needs executed.
|
65
71
|
#
|
66
|
-
|
72
|
+
on_database.execute type.after_indexing if type.after_indexing
|
67
73
|
end
|
68
74
|
|
69
75
|
# Counts all the entries that are used for the index.
|
@@ -86,6 +92,8 @@ module Sources
|
|
86
92
|
# Example:
|
87
93
|
# "SELECT indexed_id, value FROM bla_table st WHERE kind = 'bla'"
|
88
94
|
#
|
95
|
+
# TODO Perhaps it should be just harvest field.
|
96
|
+
#
|
89
97
|
def harvest type, field
|
90
98
|
connect_backend
|
91
99
|
|
@@ -98,24 +106,12 @@ module Sources
|
|
98
106
|
end
|
99
107
|
end
|
100
108
|
|
101
|
-
# Override in subclasses.
|
102
|
-
#
|
103
|
-
def chunksize
|
104
|
-
25_000
|
105
|
-
end
|
106
|
-
|
107
109
|
# Gets database from the backend.
|
108
110
|
#
|
109
111
|
def get_data type, field, offset
|
110
112
|
database.connection.execute harvest_statement_with_offset(type, field, offset)
|
111
113
|
end
|
112
114
|
|
113
|
-
# Base harvest statement for dbs.
|
114
|
-
#
|
115
|
-
def harvest_statement type, field
|
116
|
-
"SELECT indexed_id, #{field.name} FROM #{snapshot_table_name(type)} st"
|
117
|
-
end
|
118
|
-
|
119
115
|
# Builds a harvest statement for getting data to index.
|
120
116
|
#
|
121
117
|
# TODO Use the adapter for this.
|
@@ -128,6 +124,18 @@ module Sources
|
|
128
124
|
"#{statement} st.id > #{offset} LIMIT #{chunksize}"
|
129
125
|
end
|
130
126
|
|
127
|
+
# Base harvest statement for dbs.
|
128
|
+
#
|
129
|
+
def harvest_statement type, field
|
130
|
+
"SELECT indexed_id, #{field.name} FROM #{snapshot_table_name(type)} st"
|
131
|
+
end
|
132
|
+
|
133
|
+
# Override in subclasses.
|
134
|
+
#
|
135
|
+
def chunksize
|
136
|
+
25_000
|
137
|
+
end
|
138
|
+
|
131
139
|
end
|
132
140
|
|
133
141
|
end
|
@@ -1,5 +1,7 @@
|
|
1
1
|
module Tokenizers
|
2
2
|
|
3
|
+
# Defines tokenizing processes used both in indexing and querying.
|
4
|
+
#
|
3
5
|
class Base
|
4
6
|
|
5
7
|
# Stopwords.
|
@@ -21,7 +23,7 @@ module Tokenizers
|
|
21
23
|
|
22
24
|
# Contraction.
|
23
25
|
#
|
24
|
-
def self.
|
26
|
+
def self.contracts_expressions what, to_what
|
25
27
|
define_method :contract do |text|
|
26
28
|
text.gsub! what, to_what
|
27
29
|
end
|
@@ -32,7 +34,7 @@ module Tokenizers
|
|
32
34
|
#
|
33
35
|
# TODO Should there be a legal?
|
34
36
|
#
|
35
|
-
def self.
|
37
|
+
def self.removes_characters regexp
|
36
38
|
define_method :remove_illegals do |text|
|
37
39
|
text.gsub! regexp, ''
|
38
40
|
end
|
@@ -41,7 +43,7 @@ module Tokenizers
|
|
41
43
|
|
42
44
|
# Splitting.
|
43
45
|
#
|
44
|
-
def self.
|
46
|
+
def self.splits_text_on regexp
|
45
47
|
define_method :split do |text|
|
46
48
|
text.split regexp
|
47
49
|
end
|
@@ -50,7 +52,7 @@ module Tokenizers
|
|
50
52
|
|
51
53
|
# Normalizing.
|
52
54
|
#
|
53
|
-
def self.
|
55
|
+
def self.normalizes_words regexp_replaces
|
54
56
|
define_method :normalize_with_patterns do |text|
|
55
57
|
regexp_replaces.each do |regex, replace|
|
56
58
|
# This should be sufficient
|
@@ -65,7 +67,7 @@ module Tokenizers
|
|
65
67
|
|
66
68
|
# Illegal after normalizing.
|
67
69
|
#
|
68
|
-
def self.
|
70
|
+
def self.removes_characters_after_splitting regexp
|
69
71
|
define_method :remove_after_normalizing_illegals do |text|
|
70
72
|
text.gsub! regexp, ''
|
71
73
|
end
|
@@ -9,12 +9,12 @@ module Tokenizers
|
|
9
9
|
|
10
10
|
# Default handling definitions. Override in config.
|
11
11
|
#
|
12
|
-
|
12
|
+
removes_characters(//)
|
13
13
|
stopwords(//)
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
14
|
+
contracts_expressions(//, '')
|
15
|
+
splits_text_on(/\s/)
|
16
|
+
normalizes_words([])
|
17
|
+
removes_characters_after_splitting(//)
|
18
18
|
|
19
19
|
# Default indexing preprocessing hook.
|
20
20
|
#
|
@@ -4,12 +4,12 @@ module Tokenizers
|
|
4
4
|
|
5
5
|
# There are a few class methods that you can use to configure how a query works.
|
6
6
|
#
|
7
|
-
#
|
7
|
+
# removes_characters regexp
|
8
8
|
# illegal_after_normalizing regexp
|
9
9
|
# stopwords regexp
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
10
|
+
# contracts_expressions regexp, to_string
|
11
|
+
# splits_text_on regexp
|
12
|
+
# normalizes_words [[/regexp1/, 'replacement1'], [/regexp2/, 'replacement2']]
|
13
13
|
#
|
14
14
|
class Query < Base
|
15
15
|
|
@@ -17,12 +17,12 @@ module Tokenizers
|
|
17
17
|
|
18
18
|
# Default query tokenizer behaviour. Override in config.
|
19
19
|
#
|
20
|
-
|
20
|
+
removes_characters(//)
|
21
21
|
stopwords(//)
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
22
|
+
contracts_expressions(//, '')
|
23
|
+
splits_text_on(/\s/)
|
24
|
+
normalizes_words([])
|
25
|
+
removes_characters_after_splitting(//)
|
26
26
|
|
27
27
|
def preprocess text
|
28
28
|
remove_illegals text # Remove illegal characters
|
@@ -4,43 +4,50 @@
|
|
4
4
|
#
|
5
5
|
# Have fun with Picky!
|
6
6
|
#
|
7
|
-
class PickySearch < Application # The App Constant needs to be identical in
|
7
|
+
class PickySearch < Application # The App Constant needs to be identical in config.ru.
|
8
8
|
|
9
9
|
# This is an example with books that you can adapt.
|
10
10
|
#
|
11
11
|
# Note: Much more is possible, but let's start out super easy.
|
12
12
|
#
|
13
|
-
# Ask me if you have questions or specific requests
|
13
|
+
# Ask me if you have questions or specific requests.
|
14
14
|
#
|
15
15
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
split_text_on(/[\s\/\-\"\&\.]/)
|
16
|
+
indexing.removes_characters(/[^a-zA-Z0-9\s\/\-\"\&\.]/)
|
17
|
+
indexing.stopwords(/\b(and|the|of|it|in|for)\b/)
|
18
|
+
indexing.splits_text_on(/[\s\/\-\"\&\.]/)
|
20
19
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
end
|
20
|
+
books_index = index :books,
|
21
|
+
Sources::DB.new('SELECT id, title, author, isbn13 as isbn FROM books', :file => 'app/db.yml'),
|
22
|
+
field(:title, :similarity => Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed.
|
23
|
+
field(:author),
|
24
|
+
field(:isbn, :partial => Partial::None.new) # Partially searching on an ISBN makes not much sense.
|
27
25
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
26
|
+
# Defines the maximum tokens (words) that pass through to the engine.
|
27
|
+
#
|
28
|
+
querying.maximum_tokens 5
|
29
|
+
|
30
|
+
# Note that Picky needs the following characters to
|
31
|
+
# pass through, as they are control characters: *"~:
|
32
|
+
#
|
33
|
+
querying.removes_characters(/[^a-zA-Z0-9\s\/\-\,\&\"\~\*\:]/)
|
34
|
+
querying.stopwords(/\b(and|the|of|it|in|for)\b/)
|
35
|
+
querying.splits_text_on(/[\s\/\-\,\&]+/)
|
36
|
+
|
37
|
+
# The example defines two queries that use the same index(es).
|
38
|
+
#
|
39
|
+
# A Full query returns ids, combinations, and counts.
|
40
|
+
# A Live query does return all that Full returns, without ids.
|
41
|
+
#
|
42
|
+
# Note: You can pass a query multiple indexes and it will combine them.
|
43
|
+
#
|
44
|
+
full_books = Query::Full.new books_index
|
45
|
+
live_books = Query::Live.new books_index
|
46
|
+
|
47
|
+
# Routing is simple.
|
48
|
+
# A path regexp pointing to a query that will be run.
|
49
|
+
#
|
50
|
+
route %r{^/books/full} => full_books
|
51
|
+
route %r{^/books/live} => live_books
|
45
52
|
|
46
53
|
end
|