picky 4.0.0pre1 → 4.0.0pre2
Sign up to get free protection for your applications and to get access to all the features.
- data/aux/picky/cli.rb +6 -2
- data/lib/picky.rb +10 -8
- data/lib/picky/backends/backend.rb +37 -0
- data/lib/picky/backends/file.rb +0 -20
- data/lib/picky/backends/memory.rb +0 -29
- data/lib/picky/backends/redis.rb +74 -15
- data/lib/picky/backends/redis/list.rb +1 -1
- data/lib/picky/backends/sqlite.rb +0 -27
- data/lib/picky/bundle.rb +2 -2
- data/lib/picky/bundle_indexed.rb +1 -1
- data/lib/picky/bundle_indexing.rb +1 -1
- data/lib/picky/categories_indexed.rb +1 -11
- data/lib/picky/category.rb +4 -4
- data/lib/picky/category/location.rb +25 -0
- data/lib/picky/category_realtime.rb +4 -3
- data/lib/picky/console.rb +1 -1
- data/lib/picky/constants.rb +1 -1
- data/lib/picky/ext/maybe_compile.rb +2 -2
- data/lib/picky/extensions/object.rb +3 -2
- data/lib/picky/generators/aliases.rb +7 -2
- data/lib/picky/generators/partial/default.rb +1 -0
- data/lib/picky/generators/similarity/default.rb +1 -0
- data/lib/picky/generators/similarity/phonetic.rb +13 -2
- data/lib/picky/generators/strategy.rb +0 -2
- data/lib/picky/generators/weights/constant.rb +1 -2
- data/lib/picky/generators/weights/default.rb +1 -0
- data/lib/picky/generators/weights/dynamic.rb +1 -1
- data/lib/picky/generators/weights/logarithmic.rb +1 -1
- data/lib/picky/generators/weights/{runtime.rb → stub.rb} +1 -3
- data/lib/picky/index.rb +3 -3
- data/lib/picky/index_indexing.rb +0 -2
- data/lib/picky/index_realtime.rb +1 -1
- data/lib/picky/indexers/base.rb +7 -0
- data/lib/picky/indexers/parallel.rb +2 -4
- data/lib/picky/indexers/serial.rb +2 -0
- data/lib/picky/indexes_indexing.rb +1 -1
- data/lib/picky/interfaces/live_parameters/master_child.rb +175 -0
- data/lib/picky/interfaces/live_parameters/unicorn.rb +37 -0
- data/lib/picky/loader.rb +238 -259
- data/lib/picky/query/allocation.rb +19 -10
- data/lib/picky/query/combination.rb +7 -1
- data/lib/picky/query/combinations.rb +1 -6
- data/lib/picky/query/token.rb +26 -36
- data/lib/picky/results.rb +18 -17
- data/lib/picky/scheduler.rb +2 -1
- data/lib/picky/search.rb +1 -1
- data/lib/picky/sinatra.rb +6 -6
- data/lib/picky/statistics.rb +2 -0
- data/lib/picky/tokenizer.rb +8 -8
- data/lib/picky/wrappers/bundle/calculation.rb +4 -4
- data/lib/picky/wrappers/bundle/location.rb +1 -2
- data/lib/tasks/framework.rake +1 -1
- data/lib/tasks/statistics.rake +1 -1
- data/lib/tasks/try.rake +1 -1
- data/lib/tasks/try.rb +1 -1
- data/spec/aux/picky/cli_spec.rb +12 -12
- data/spec/ext/performant_spec.rb +16 -16
- data/spec/functional/backends/file_spec.rb +78 -7
- data/spec/functional/backends/memory_spec.rb +78 -7
- data/spec/functional/backends/redis_spec.rb +73 -13
- data/spec/functional/dynamic_weights_spec.rb +3 -4
- data/spec/functional/realtime_spec.rb +2 -2
- data/spec/functional/speed_spec.rb +2 -2
- data/spec/functional/terminate_early_spec.rb +3 -3
- data/spec/lib/analytics_spec.rb +1 -1
- data/spec/lib/analyzer_spec.rb +5 -3
- data/spec/lib/categories_indexed_spec.rb +38 -20
- data/spec/lib/category/location_spec.rb +30 -0
- data/spec/lib/character_substituters/west_european_spec.rb +1 -0
- data/spec/lib/extensions/hash_spec.rb +6 -5
- data/spec/lib/extensions/module_spec.rb +6 -6
- data/spec/lib/extensions/object_spec.rb +9 -8
- data/spec/lib/extensions/string_spec.rb +1 -1
- data/spec/lib/generators/similarity/phonetic_spec.rb +11 -0
- data/spec/lib/index_realtime_spec.rb +5 -5
- data/spec/lib/interfaces/{live_parameters_spec.rb → live_parameters/master_child_spec.rb} +26 -26
- data/spec/lib/interfaces/live_parameters/unicorn_spec.rb +160 -0
- data/spec/lib/loader_spec.rb +65 -25
- data/spec/lib/query/allocation_spec.rb +25 -22
- data/spec/lib/query/combinations_spec.rb +13 -36
- data/spec/lib/query/token_spec.rb +144 -131
- data/spec/lib/query/tokens_spec.rb +14 -0
- data/spec/lib/results_spec.rb +14 -8
- data/spec/lib/search_spec.rb +1 -1
- data/spec/lib/sinatra_spec.rb +8 -8
- metadata +28 -91
- data/lib/picky/adapters/rack.rb +0 -34
- data/lib/picky/adapters/rack/base.rb +0 -27
- data/lib/picky/adapters/rack/live_parameters.rb +0 -37
- data/lib/picky/adapters/rack/search.rb +0 -67
- data/lib/picky/application.rb +0 -268
- data/lib/picky/frontend_adapters/rack.rb +0 -161
- data/lib/picky/interfaces/live_parameters.rb +0 -187
- data/lib/picky/sources/base.rb +0 -92
- data/lib/picky/sources/couch.rb +0 -76
- data/lib/picky/sources/csv.rb +0 -83
- data/lib/picky/sources/db.rb +0 -189
- data/lib/picky/sources/delicious.rb +0 -63
- data/lib/picky/sources/mongo.rb +0 -80
- data/lib/picky/wrappers/category/location.rb +0 -38
- data/lib/tasks/routes.rake +0 -8
- data/spec/lib/adapters/rack/base_spec.rb +0 -24
- data/spec/lib/adapters/rack/live_parameters_spec.rb +0 -26
- data/spec/lib/adapters/rack/query_spec.rb +0 -39
- data/spec/lib/application_spec.rb +0 -155
- data/spec/lib/frontend_adapters/rack_spec.rb +0 -294
- data/spec/lib/sources/base_spec.rb +0 -53
- data/spec/lib/sources/couch_spec.rb +0 -114
- data/spec/lib/sources/csv_spec.rb +0 -89
- data/spec/lib/sources/db_spec.rb +0 -125
- data/spec/lib/sources/delicious_spec.rb +0 -94
- data/spec/lib/sources/mongo_spec.rb +0 -50
@@ -1,187 +0,0 @@
|
|
1
|
-
module Picky
|
2
|
-
|
3
|
-
# This is very optional.
|
4
|
-
# Only load if the user wants it.
|
5
|
-
#
|
6
|
-
module Interfaces
|
7
|
-
|
8
|
-
# This is an interface that provides the user of
|
9
|
-
# Picky with the possibility to change parameters
|
10
|
-
# while the Application is running.
|
11
|
-
#
|
12
|
-
# Important Note: This will only work in Master/Child configurations.
|
13
|
-
#
|
14
|
-
class LiveParameters
|
15
|
-
|
16
|
-
def initialize
|
17
|
-
@child, @parent = IO.pipe
|
18
|
-
start_master_process_thread
|
19
|
-
end
|
20
|
-
|
21
|
-
# This runs a thread that listens to child processes.
|
22
|
-
#
|
23
|
-
def start_master_process_thread
|
24
|
-
# This thread is stopped in the children.
|
25
|
-
#
|
26
|
-
Thread.new do
|
27
|
-
loop do
|
28
|
-
IO.select([@child], nil, nil, 2) or next
|
29
|
-
result = @child.gets ';;;'
|
30
|
-
pid, configuration_hash = eval result
|
31
|
-
next unless Hash === configuration_hash
|
32
|
-
next if configuration_hash.empty?
|
33
|
-
exclaim "Trying to update MASTER configuration."
|
34
|
-
try_updating_configuration_with configuration_hash
|
35
|
-
kill_each_worker_except pid
|
36
|
-
|
37
|
-
# Fails hard on an error.
|
38
|
-
#
|
39
|
-
end
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
# TODO This needs to be webserver agnostic.
|
44
|
-
#
|
45
|
-
def worker_pids
|
46
|
-
Unicorn::HttpServer::WORKERS.keys
|
47
|
-
end
|
48
|
-
|
49
|
-
# Taken from Unicorn.
|
50
|
-
#
|
51
|
-
def kill_each_worker_except pid
|
52
|
-
worker_pids.each do |wpid|
|
53
|
-
next if wpid == pid
|
54
|
-
kill_worker :KILL, wpid
|
55
|
-
end
|
56
|
-
end
|
57
|
-
def kill_worker signal, wpid
|
58
|
-
Process.kill signal, wpid
|
59
|
-
exclaim "Killing worker ##{wpid} with signal #{signal}."
|
60
|
-
rescue Errno::ESRCH
|
61
|
-
remove_worker wpid
|
62
|
-
end
|
63
|
-
# TODO This needs to be Webserver agnostic.
|
64
|
-
#
|
65
|
-
def remove_worker wpid
|
66
|
-
worker = Unicorn::HttpServer::WORKERS.delete(wpid) and worker.tmp.close rescue nil
|
67
|
-
end
|
68
|
-
|
69
|
-
# Updates any parameters with the ones given and
|
70
|
-
# returns the updated params.
|
71
|
-
#
|
72
|
-
# The params are a strictly defined hash of:
|
73
|
-
# * querying_removes_characters: Regexp
|
74
|
-
# * querying_stopwords: Regexp
|
75
|
-
# * querying_splits_text_on: Regexp
|
76
|
-
#
|
77
|
-
# This first tries to update in the child process,
|
78
|
-
# and if successful, in the parent process
|
79
|
-
#
|
80
|
-
def parameters configuration_hash
|
81
|
-
close_child
|
82
|
-
exclaim "Trying to update worker child configuration." unless configuration_hash.empty?
|
83
|
-
try_updating_configuration_with configuration_hash
|
84
|
-
write_parent configuration_hash
|
85
|
-
extract_configuration
|
86
|
-
rescue CouldNotUpdateConfigurationError => e
|
87
|
-
# I need to die such that my broken config is never used.
|
88
|
-
#
|
89
|
-
exclaim "Child process #{Process.pid} performs harakiri because of broken config."
|
90
|
-
harakiri
|
91
|
-
{ e.config_key => :ERROR }
|
92
|
-
end
|
93
|
-
# Kills itself, but still answers the request honorably.
|
94
|
-
#
|
95
|
-
def harakiri
|
96
|
-
Process.kill :QUIT, Process.pid
|
97
|
-
end
|
98
|
-
# Write the parent.
|
99
|
-
#
|
100
|
-
# Note: The ;;; is the end marker for the message.
|
101
|
-
#
|
102
|
-
def write_parent configuration_hash
|
103
|
-
@parent.write "#{[Process.pid, configuration_hash]};;;"
|
104
|
-
end
|
105
|
-
# Close the child if it isn't yet closed.
|
106
|
-
#
|
107
|
-
def close_child
|
108
|
-
@child.close unless @child.closed?
|
109
|
-
end
|
110
|
-
|
111
|
-
class CouldNotUpdateConfigurationError < StandardError
|
112
|
-
attr_reader :config_key
|
113
|
-
def initialize config_key, message
|
114
|
-
super message
|
115
|
-
@config_key = config_key
|
116
|
-
end
|
117
|
-
end
|
118
|
-
|
119
|
-
# Tries updating the configuration in the child process or parent process.
|
120
|
-
#
|
121
|
-
def try_updating_configuration_with configuration_hash
|
122
|
-
current_key = nil
|
123
|
-
begin
|
124
|
-
configuration_hash.each_pair do |key, new_value|
|
125
|
-
exclaim " Setting #{key} with #{new_value}."
|
126
|
-
current_key = key
|
127
|
-
send :"#{key}=", new_value
|
128
|
-
end
|
129
|
-
rescue StandardError => e
|
130
|
-
# Catch any error and reraise as config error.
|
131
|
-
#
|
132
|
-
raise CouldNotUpdateConfigurationError.new current_key, e.message
|
133
|
-
end
|
134
|
-
end
|
135
|
-
|
136
|
-
def extract_configuration
|
137
|
-
{
|
138
|
-
querying_removes_characters: querying_removes_characters,
|
139
|
-
querying_stopwords: querying_stopwords,
|
140
|
-
querying_splits_text_on: querying_splits_text_on
|
141
|
-
}
|
142
|
-
end
|
143
|
-
|
144
|
-
# TODO Move to Interface object.
|
145
|
-
#
|
146
|
-
def querying_removes_characters
|
147
|
-
regexp = Tokenizer.query_default.instance_variable_get :@removes_characters_regexp
|
148
|
-
regexp && regexp.source
|
149
|
-
end
|
150
|
-
def querying_removes_characters= new_value
|
151
|
-
Tokenizer.query_default.instance_variable_set(:@removes_characters_regexp, %r{#{new_value}})
|
152
|
-
end
|
153
|
-
def querying_stopwords
|
154
|
-
regexp = Tokenizer.query_default.instance_variable_get :@remove_stopwords_regexp
|
155
|
-
regexp && regexp.source
|
156
|
-
end
|
157
|
-
def querying_stopwords= new_value
|
158
|
-
Tokenizer.query_default.instance_variable_set(:@remove_stopwords_regexp, %r{#{new_value}})
|
159
|
-
end
|
160
|
-
def querying_splits_text_on
|
161
|
-
splits = Tokenizer.query_default.instance_variable_get :@splits_text_on
|
162
|
-
splits && splits.respond_to?(:source) ? splits.source : splits
|
163
|
-
end
|
164
|
-
def querying_splits_text_on= new_value
|
165
|
-
splits = Tokenizer.query_default.instance_variable_get :@splits_text_on
|
166
|
-
if splits.respond_to?(:source)
|
167
|
-
Tokenizer.query_default.instance_variable_set(:@splits_text_on, %r{#{new_value}})
|
168
|
-
else
|
169
|
-
Tokenizer.query_default.instance_variable_set(:@splits_text_on, new_value)
|
170
|
-
end
|
171
|
-
end
|
172
|
-
|
173
|
-
#
|
174
|
-
#
|
175
|
-
def to_s
|
176
|
-
"Suckerfish Live Interface (Use the picky-live gem to introspect)"
|
177
|
-
end
|
178
|
-
|
179
|
-
end
|
180
|
-
|
181
|
-
end
|
182
|
-
|
183
|
-
# Aka.
|
184
|
-
#
|
185
|
-
LiveParameters = Interfaces::LiveParameters
|
186
|
-
|
187
|
-
end
|
data/lib/picky/sources/base.rb
DELETED
@@ -1,92 +0,0 @@
|
|
1
|
-
module Picky
|
2
|
-
|
3
|
-
# = Data Sources
|
4
|
-
#
|
5
|
-
# Currently, Picky offers the following Sources:
|
6
|
-
# * CSV (comma – or other – separated file)
|
7
|
-
# * Couch (CouchDB, key-value store)
|
8
|
-
# * DB (Databases, foremost MySQL)
|
9
|
-
# * Delicious (http://del.icio.us, online bookmarking service)
|
10
|
-
# See also:
|
11
|
-
# http://github.com/floere/picky/wiki/Sources-Configuration
|
12
|
-
#
|
13
|
-
# Don't worry if your source isn't here. Adding your own is easy:
|
14
|
-
# http://github.com/floere/picky/wiki/Contributing-sources
|
15
|
-
#
|
16
|
-
module Sources
|
17
|
-
|
18
|
-
# Sources are where your data comes from.
|
19
|
-
#
|
20
|
-
# A source has 1 mandatory and 2 optional methods:
|
21
|
-
# * connect_backend (_optional_): called once for each type/category pair.
|
22
|
-
# * harvest: Used by the indexer to gather data. Yields an indexed_id (string or integer) and a string value.
|
23
|
-
# * take_snapshot (_optional_): called once for each index or category (if indexing a single category).
|
24
|
-
#
|
25
|
-
# This base class "implements" all these methods, but they don't do anything.
|
26
|
-
# Subclass this class <tt>class MySource < Base</tt> and override the methods in your source to do something.
|
27
|
-
#
|
28
|
-
class Base
|
29
|
-
|
30
|
-
attr_reader :key_format
|
31
|
-
|
32
|
-
# Connect to the backend.
|
33
|
-
#
|
34
|
-
# Called once per index/category combination before harvesting.
|
35
|
-
#
|
36
|
-
# Examples:
|
37
|
-
# * The DB backend connects the DB adapter.
|
38
|
-
# * We open a connection to a key value store.
|
39
|
-
# * We open a file with data.
|
40
|
-
#
|
41
|
-
def connect_backend
|
42
|
-
|
43
|
-
end
|
44
|
-
|
45
|
-
# Called by the indexer when gathering data.
|
46
|
-
#
|
47
|
-
# Yields the data (id, text for id) for the given category.
|
48
|
-
#
|
49
|
-
# When implementing or overriding your own,
|
50
|
-
# be sure to <tt>yield(id, text_for_id)</tt> (or <tt>block.call(id, text_for_id)</tt>)
|
51
|
-
# for the given type symbol and category symbol.
|
52
|
-
#
|
53
|
-
# Note: Since harvest needs to be implemented, it has no default implementation.
|
54
|
-
#
|
55
|
-
# def harvest category # :yields: id, text_for_id
|
56
|
-
#
|
57
|
-
# end
|
58
|
-
|
59
|
-
# Used to take a snapshot of your data if it is fast changing.
|
60
|
-
#
|
61
|
-
# Called once for each index before harvesting.
|
62
|
-
# If it has been called on a source already by an index,
|
63
|
-
# it won't be called again for a category inside that index.
|
64
|
-
#
|
65
|
-
# Example:
|
66
|
-
# * In a DB source, a table based on the source's select statement is created.
|
67
|
-
#
|
68
|
-
def take_snapshot index
|
69
|
-
|
70
|
-
end
|
71
|
-
|
72
|
-
# Used to check if a snapshot has been done already.
|
73
|
-
#
|
74
|
-
# Example:
|
75
|
-
# * In a DB source, a table based on the source's select statement is created.
|
76
|
-
#
|
77
|
-
def with_snapshot index
|
78
|
-
connect_backend
|
79
|
-
@snapshot_taken ||= 0
|
80
|
-
if @snapshot_taken.zero?
|
81
|
-
take_snapshot index
|
82
|
-
end
|
83
|
-
@snapshot_taken += 1
|
84
|
-
yield
|
85
|
-
@snapshot_taken -= 1
|
86
|
-
end
|
87
|
-
|
88
|
-
end
|
89
|
-
|
90
|
-
end
|
91
|
-
|
92
|
-
end
|
data/lib/picky/sources/couch.rb
DELETED
@@ -1,76 +0,0 @@
|
|
1
|
-
module Picky
|
2
|
-
|
3
|
-
module Sources
|
4
|
-
|
5
|
-
# A Couch database source.
|
6
|
-
#
|
7
|
-
# Options:
|
8
|
-
# * url
|
9
|
-
# and all the options of a <tt>RestClient::Resource</tt>.
|
10
|
-
# See http://github.com/archiloque/rest-client.
|
11
|
-
#
|
12
|
-
# Examples:
|
13
|
-
# Picky::Sources::Couch.new(:title, :author, :isbn, url:'localhost:5984')
|
14
|
-
# Picky::Sources::Couch.new(:title, :author, :isbn, url:'localhost:5984', user:'someuser', password:'somepassword')
|
15
|
-
#
|
16
|
-
class Couch < Base
|
17
|
-
|
18
|
-
# Raised when a Couch source is instantiated without a url.
|
19
|
-
#
|
20
|
-
# Example:
|
21
|
-
# Picky::Sources::Couch.new(:column1, :column2) # without url option
|
22
|
-
#
|
23
|
-
class NoDBGiven < StandardError; end
|
24
|
-
|
25
|
-
#
|
26
|
-
#
|
27
|
-
def initialize *category_names, options
|
28
|
-
check_gem
|
29
|
-
|
30
|
-
Hash === options && options[:url] || raise_no_db_given(category_names)
|
31
|
-
|
32
|
-
@db = RestClient::Resource.new options.delete(:url), options
|
33
|
-
|
34
|
-
key_format = options.delete :key_format
|
35
|
-
@key_format = key_format && key_format.intern || :to_s
|
36
|
-
end
|
37
|
-
|
38
|
-
def to_s
|
39
|
-
self.class.name
|
40
|
-
end
|
41
|
-
|
42
|
-
# Tries to require the rest_client gem.
|
43
|
-
#
|
44
|
-
def check_gem # :nodoc:
|
45
|
-
require 'rest_client'
|
46
|
-
rescue LoadError
|
47
|
-
warn_gem_missing 'rest-client', 'the CouchDB source'
|
48
|
-
exit 1
|
49
|
-
end
|
50
|
-
|
51
|
-
# Harvests the data to index.
|
52
|
-
#
|
53
|
-
# See important note, above.
|
54
|
-
#
|
55
|
-
@@id_key = '_id'
|
56
|
-
def harvest category
|
57
|
-
category_name = category.from.to_s
|
58
|
-
get_data do |doc|
|
59
|
-
yield doc[@@id_key], doc[category_name] || next
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
def get_data &block # :nodoc:
|
64
|
-
resp = @db['_all_docs?include_docs=true'].get
|
65
|
-
JSON.parse(resp)['rows'].
|
66
|
-
map{|row| row['doc']}.
|
67
|
-
each &block
|
68
|
-
end
|
69
|
-
|
70
|
-
def raise_no_db_given category_names # :nodoc:
|
71
|
-
raise NoDBGiven.new(category_names.join(', '))
|
72
|
-
end
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
end
|
data/lib/picky/sources/csv.rb
DELETED
@@ -1,83 +0,0 @@
|
|
1
|
-
module Picky
|
2
|
-
|
3
|
-
module Sources
|
4
|
-
|
5
|
-
# Describes a CSV source, a file with comma separated values in it.
|
6
|
-
#
|
7
|
-
# The first column is implicitly assumed to be the id column.
|
8
|
-
#
|
9
|
-
# It takes the same options as the Ruby 1.9 CSV class.
|
10
|
-
#
|
11
|
-
# Examples:
|
12
|
-
# Sources::CSV.new(:title, :author, :isbn, file:'data/a_csv_file.csv')
|
13
|
-
# Sources::CSV.new(:title, :author, :isbn, file:'data/a_csv_file.csv', col_sep:';')
|
14
|
-
# Sources::CSV.new(:title, :author, :isbn, file:'data/a_csv_file.csv', row_sep:"\n")
|
15
|
-
#
|
16
|
-
class CSV < Base
|
17
|
-
|
18
|
-
# Raised when a CSV source is instantiated without a file.
|
19
|
-
#
|
20
|
-
# Example:
|
21
|
-
# Sources::CSV.new(:column1, :column2) # without file option
|
22
|
-
#
|
23
|
-
class NoFileGiven < StandardError; end
|
24
|
-
|
25
|
-
# The CSV file's path, relative to PICKY_ROOT.
|
26
|
-
#
|
27
|
-
attr_reader :file_name
|
28
|
-
|
29
|
-
# The options that were passed into #new.
|
30
|
-
#
|
31
|
-
attr_reader :csv_options, :key_format
|
32
|
-
|
33
|
-
# The data category names.
|
34
|
-
#
|
35
|
-
attr_reader :category_names
|
36
|
-
|
37
|
-
def initialize *category_names, options
|
38
|
-
require 'csv'
|
39
|
-
@category_names = category_names
|
40
|
-
|
41
|
-
@csv_options = Hash === options && options || {}
|
42
|
-
@file_name = @csv_options.delete(:file) || raise_no_file_given(category_names)
|
43
|
-
|
44
|
-
key_format = options.delete :key_format
|
45
|
-
@key_format = key_format && key_format.intern || :to_i
|
46
|
-
end
|
47
|
-
|
48
|
-
def to_s
|
49
|
-
parameters = category_names
|
50
|
-
parameters << { file: file_name }
|
51
|
-
parameters << csv_options unless csv_options.empty?
|
52
|
-
%Q{#{self.class.name}(#{parameters.join(', ')})}
|
53
|
-
end
|
54
|
-
|
55
|
-
# Raises a NoFileGiven exception.
|
56
|
-
#
|
57
|
-
def raise_no_file_given category_names # :nodoc:
|
58
|
-
raise NoFileGiven.new(category_names.join(', '))
|
59
|
-
end
|
60
|
-
|
61
|
-
# Harvests the data to index.
|
62
|
-
#
|
63
|
-
def harvest category
|
64
|
-
index = category_names.index category.from
|
65
|
-
get_data do |indexed_id, *ary|
|
66
|
-
text = ary[index]
|
67
|
-
next unless text
|
68
|
-
text.force_encoding 'utf-8' # TODO Still needed?
|
69
|
-
yield indexed_id, text
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
|
-
#
|
74
|
-
#
|
75
|
-
def get_data &block # :nodoc:
|
76
|
-
::CSV.foreach file_name, csv_options, &block
|
77
|
-
end
|
78
|
-
|
79
|
-
end
|
80
|
-
|
81
|
-
end
|
82
|
-
|
83
|
-
end
|
data/lib/picky/sources/db.rb
DELETED
@@ -1,189 +0,0 @@
|
|
1
|
-
module Picky
|
2
|
-
|
3
|
-
module Sources
|
4
|
-
|
5
|
-
# Describes a database source. Needs a SELECT statement
|
6
|
-
# (with id in it), and a file option or the options from an AR config file.
|
7
|
-
#
|
8
|
-
# The select statement can be as complicated as you want,
|
9
|
-
# as long as it has an id in it and as long as it can be
|
10
|
-
# used in a CREATE TABLE AS statement.
|
11
|
-
# (working on that last one)
|
12
|
-
#
|
13
|
-
# Examples:
|
14
|
-
# Sources::DB.new('SELECT id, title, author, year FROM books') # Uses the config from app/db.yml by default.
|
15
|
-
# Sources::DB.new('SELECT id, title, author, year FROM books', file: 'app/some_db.yml')
|
16
|
-
# Sources::DB.new('SELECT b.id, b.title, b.author, b.publishing_year as year FROM books b INNER JOIN ON ...', file: 'app/some_db.yml')
|
17
|
-
# Sources::DB.new('SELECT id, title, author, year FROM books', adapter: 'mysql', host:'localhost', ...)
|
18
|
-
#
|
19
|
-
class DB < Base
|
20
|
-
|
21
|
-
# The select statement that was passed in.
|
22
|
-
#
|
23
|
-
attr_reader :select_statement
|
24
|
-
|
25
|
-
# The database adapter.
|
26
|
-
#
|
27
|
-
attr_reader :database
|
28
|
-
|
29
|
-
# The database connection options that were either passed in or loaded from the given file.
|
30
|
-
#
|
31
|
-
attr_reader :connection_options, :options
|
32
|
-
|
33
|
-
@@traversal_id = :__picky_id
|
34
|
-
|
35
|
-
def initialize select_statement, options = { file: 'app/db.yml' }
|
36
|
-
check_gem
|
37
|
-
|
38
|
-
@select_statement = select_statement
|
39
|
-
@database = create_database_adapter
|
40
|
-
@options = options
|
41
|
-
end
|
42
|
-
|
43
|
-
# Tries to require the active_record gem.
|
44
|
-
#
|
45
|
-
def check_gem # :nodoc:
|
46
|
-
require 'active_record'
|
47
|
-
rescue LoadError
|
48
|
-
warn_gem_missing 'active_record', 'the (ActiveRecord) DB source'
|
49
|
-
exit 1
|
50
|
-
end
|
51
|
-
|
52
|
-
def to_s
|
53
|
-
parameters = [select_statement.inspect]
|
54
|
-
parameters << options unless options.empty?
|
55
|
-
%Q{#{self.class.name}(#{parameters.join(', ')})}
|
56
|
-
end
|
57
|
-
|
58
|
-
# Creates a database adapter for use with this source.
|
59
|
-
#
|
60
|
-
def create_database_adapter # :nodoc:
|
61
|
-
# THINK Do not use ActiveRecord directly? Use set_table_name etc?
|
62
|
-
#
|
63
|
-
adapter_class = Class.new ActiveRecord::Base
|
64
|
-
adapter_class.abstract_class = true
|
65
|
-
adapter_class
|
66
|
-
end
|
67
|
-
|
68
|
-
# Configure the backend.
|
69
|
-
#
|
70
|
-
# Options:
|
71
|
-
# Either
|
72
|
-
# * file => 'some/filename.yml' # With an active record configuration.
|
73
|
-
# Or
|
74
|
-
# * The configuration as a hash.
|
75
|
-
#
|
76
|
-
def configure options # :nodoc:
|
77
|
-
@connection_options = if filename = options[:file]
|
78
|
-
File.open(File.join(PICKY_ROOT, filename)) { |file| YAML::load(file) }
|
79
|
-
else
|
80
|
-
options
|
81
|
-
end
|
82
|
-
self
|
83
|
-
end
|
84
|
-
|
85
|
-
# Connect the backend.
|
86
|
-
#
|
87
|
-
# Will raise unless connection options have been given.
|
88
|
-
#
|
89
|
-
def connect_backend
|
90
|
-
configure @options
|
91
|
-
raise "Database backend not configured" unless connection_options
|
92
|
-
database.establish_connection connection_options
|
93
|
-
end
|
94
|
-
|
95
|
-
# Take a snapshot of the data.
|
96
|
-
#
|
97
|
-
# Uses CREATE TABLE AS with the given SELECT statement to create a snapshot of the data.
|
98
|
-
#
|
99
|
-
def take_snapshot index
|
100
|
-
timed_exclaim %Q{ "#{index.identifier}": Taking snapshot of database data.}
|
101
|
-
|
102
|
-
origin = snapshot_table_name index.name
|
103
|
-
on_database = database.connection
|
104
|
-
|
105
|
-
# Drop the table if it exists.
|
106
|
-
#
|
107
|
-
on_database.drop_table origin if on_database.table_exists?(origin)
|
108
|
-
|
109
|
-
# The adapters currently do not support this.
|
110
|
-
#
|
111
|
-
on_database.execute "CREATE TABLE #{origin} AS #{select_statement}"
|
112
|
-
|
113
|
-
# Add a column that Picky uses to traverse the table's entries.
|
114
|
-
#
|
115
|
-
on_database.add_column origin, @@traversal_id, :primary_key, :null => :false
|
116
|
-
|
117
|
-
# Execute any special queries this index needs executed.
|
118
|
-
#
|
119
|
-
on_database.execute index.after_indexing if index.after_indexing
|
120
|
-
end
|
121
|
-
|
122
|
-
# Counts all the entries that are used for the index.
|
123
|
-
#
|
124
|
-
def count index_name
|
125
|
-
database.connection.select_value("SELECT COUNT(#{@@traversal_id}) FROM #{snapshot_table_name(index_name)}").to_i
|
126
|
-
end
|
127
|
-
|
128
|
-
# The name of the snapshot table created by Picky.
|
129
|
-
#
|
130
|
-
def snapshot_table_name index_name
|
131
|
-
"picky_#{index_name}_index"
|
132
|
-
end
|
133
|
-
|
134
|
-
# Harvests the data to index in chunks.
|
135
|
-
#
|
136
|
-
def harvest category, &block
|
137
|
-
(0..count(category.index_name)).step(chunksize) do |offset|
|
138
|
-
get_data category, offset, &block
|
139
|
-
end
|
140
|
-
end
|
141
|
-
|
142
|
-
# Gets the data from the backend.
|
143
|
-
#
|
144
|
-
def get_data category, offset, &block # :nodoc:
|
145
|
-
select_statement = harvest_statement_with_offset category, offset
|
146
|
-
|
147
|
-
# THINK Not really nice like this. Rewrite if possible.
|
148
|
-
#
|
149
|
-
if database.connection.adapter_name == "PostgreSQL"
|
150
|
-
id_key = 'id'
|
151
|
-
text_key = category.from.to_s
|
152
|
-
database.connection.execute(select_statement).each do |hash|
|
153
|
-
id, text = hash.values_at id_key, text_key
|
154
|
-
yield id, text if text
|
155
|
-
end
|
156
|
-
else
|
157
|
-
database.connection.execute(select_statement).each do |id, text|
|
158
|
-
yield id, text if text
|
159
|
-
end
|
160
|
-
end
|
161
|
-
end
|
162
|
-
|
163
|
-
# Builds a harvest statement for getting data to index.
|
164
|
-
#
|
165
|
-
def harvest_statement_with_offset category, offset
|
166
|
-
statement = harvest_statement category
|
167
|
-
|
168
|
-
statement += statement.include?('WHERE') ? ' AND' : ' WHERE'
|
169
|
-
|
170
|
-
"#{statement} st.#{@@traversal_id} > #{offset} LIMIT #{chunksize}"
|
171
|
-
end
|
172
|
-
|
173
|
-
# The harvest statement used to pull data from the snapshot table.
|
174
|
-
#
|
175
|
-
def harvest_statement category
|
176
|
-
"SELECT id, #{category.from} FROM #{snapshot_table_name(category.index_name)} st"
|
177
|
-
end
|
178
|
-
|
179
|
-
# The amount of records that are loaded each chunk.
|
180
|
-
#
|
181
|
-
def chunksize
|
182
|
-
25_000
|
183
|
-
end
|
184
|
-
|
185
|
-
end
|
186
|
-
|
187
|
-
end
|
188
|
-
|
189
|
-
end
|