picky 4.0.0pre1 → 4.0.0pre2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/aux/picky/cli.rb +6 -2
- data/lib/picky.rb +10 -8
- data/lib/picky/backends/backend.rb +37 -0
- data/lib/picky/backends/file.rb +0 -20
- data/lib/picky/backends/memory.rb +0 -29
- data/lib/picky/backends/redis.rb +74 -15
- data/lib/picky/backends/redis/list.rb +1 -1
- data/lib/picky/backends/sqlite.rb +0 -27
- data/lib/picky/bundle.rb +2 -2
- data/lib/picky/bundle_indexed.rb +1 -1
- data/lib/picky/bundle_indexing.rb +1 -1
- data/lib/picky/categories_indexed.rb +1 -11
- data/lib/picky/category.rb +4 -4
- data/lib/picky/category/location.rb +25 -0
- data/lib/picky/category_realtime.rb +4 -3
- data/lib/picky/console.rb +1 -1
- data/lib/picky/constants.rb +1 -1
- data/lib/picky/ext/maybe_compile.rb +2 -2
- data/lib/picky/extensions/object.rb +3 -2
- data/lib/picky/generators/aliases.rb +7 -2
- data/lib/picky/generators/partial/default.rb +1 -0
- data/lib/picky/generators/similarity/default.rb +1 -0
- data/lib/picky/generators/similarity/phonetic.rb +13 -2
- data/lib/picky/generators/strategy.rb +0 -2
- data/lib/picky/generators/weights/constant.rb +1 -2
- data/lib/picky/generators/weights/default.rb +1 -0
- data/lib/picky/generators/weights/dynamic.rb +1 -1
- data/lib/picky/generators/weights/logarithmic.rb +1 -1
- data/lib/picky/generators/weights/{runtime.rb → stub.rb} +1 -3
- data/lib/picky/index.rb +3 -3
- data/lib/picky/index_indexing.rb +0 -2
- data/lib/picky/index_realtime.rb +1 -1
- data/lib/picky/indexers/base.rb +7 -0
- data/lib/picky/indexers/parallel.rb +2 -4
- data/lib/picky/indexers/serial.rb +2 -0
- data/lib/picky/indexes_indexing.rb +1 -1
- data/lib/picky/interfaces/live_parameters/master_child.rb +175 -0
- data/lib/picky/interfaces/live_parameters/unicorn.rb +37 -0
- data/lib/picky/loader.rb +238 -259
- data/lib/picky/query/allocation.rb +19 -10
- data/lib/picky/query/combination.rb +7 -1
- data/lib/picky/query/combinations.rb +1 -6
- data/lib/picky/query/token.rb +26 -36
- data/lib/picky/results.rb +18 -17
- data/lib/picky/scheduler.rb +2 -1
- data/lib/picky/search.rb +1 -1
- data/lib/picky/sinatra.rb +6 -6
- data/lib/picky/statistics.rb +2 -0
- data/lib/picky/tokenizer.rb +8 -8
- data/lib/picky/wrappers/bundle/calculation.rb +4 -4
- data/lib/picky/wrappers/bundle/location.rb +1 -2
- data/lib/tasks/framework.rake +1 -1
- data/lib/tasks/statistics.rake +1 -1
- data/lib/tasks/try.rake +1 -1
- data/lib/tasks/try.rb +1 -1
- data/spec/aux/picky/cli_spec.rb +12 -12
- data/spec/ext/performant_spec.rb +16 -16
- data/spec/functional/backends/file_spec.rb +78 -7
- data/spec/functional/backends/memory_spec.rb +78 -7
- data/spec/functional/backends/redis_spec.rb +73 -13
- data/spec/functional/dynamic_weights_spec.rb +3 -4
- data/spec/functional/realtime_spec.rb +2 -2
- data/spec/functional/speed_spec.rb +2 -2
- data/spec/functional/terminate_early_spec.rb +3 -3
- data/spec/lib/analytics_spec.rb +1 -1
- data/spec/lib/analyzer_spec.rb +5 -3
- data/spec/lib/categories_indexed_spec.rb +38 -20
- data/spec/lib/category/location_spec.rb +30 -0
- data/spec/lib/character_substituters/west_european_spec.rb +1 -0
- data/spec/lib/extensions/hash_spec.rb +6 -5
- data/spec/lib/extensions/module_spec.rb +6 -6
- data/spec/lib/extensions/object_spec.rb +9 -8
- data/spec/lib/extensions/string_spec.rb +1 -1
- data/spec/lib/generators/similarity/phonetic_spec.rb +11 -0
- data/spec/lib/index_realtime_spec.rb +5 -5
- data/spec/lib/interfaces/{live_parameters_spec.rb → live_parameters/master_child_spec.rb} +26 -26
- data/spec/lib/interfaces/live_parameters/unicorn_spec.rb +160 -0
- data/spec/lib/loader_spec.rb +65 -25
- data/spec/lib/query/allocation_spec.rb +25 -22
- data/spec/lib/query/combinations_spec.rb +13 -36
- data/spec/lib/query/token_spec.rb +144 -131
- data/spec/lib/query/tokens_spec.rb +14 -0
- data/spec/lib/results_spec.rb +14 -8
- data/spec/lib/search_spec.rb +1 -1
- data/spec/lib/sinatra_spec.rb +8 -8
- metadata +28 -91
- data/lib/picky/adapters/rack.rb +0 -34
- data/lib/picky/adapters/rack/base.rb +0 -27
- data/lib/picky/adapters/rack/live_parameters.rb +0 -37
- data/lib/picky/adapters/rack/search.rb +0 -67
- data/lib/picky/application.rb +0 -268
- data/lib/picky/frontend_adapters/rack.rb +0 -161
- data/lib/picky/interfaces/live_parameters.rb +0 -187
- data/lib/picky/sources/base.rb +0 -92
- data/lib/picky/sources/couch.rb +0 -76
- data/lib/picky/sources/csv.rb +0 -83
- data/lib/picky/sources/db.rb +0 -189
- data/lib/picky/sources/delicious.rb +0 -63
- data/lib/picky/sources/mongo.rb +0 -80
- data/lib/picky/wrappers/category/location.rb +0 -38
- data/lib/tasks/routes.rake +0 -8
- data/spec/lib/adapters/rack/base_spec.rb +0 -24
- data/spec/lib/adapters/rack/live_parameters_spec.rb +0 -26
- data/spec/lib/adapters/rack/query_spec.rb +0 -39
- data/spec/lib/application_spec.rb +0 -155
- data/spec/lib/frontend_adapters/rack_spec.rb +0 -294
- data/spec/lib/sources/base_spec.rb +0 -53
- data/spec/lib/sources/couch_spec.rb +0 -114
- data/spec/lib/sources/csv_spec.rb +0 -89
- data/spec/lib/sources/db_spec.rb +0 -125
- data/spec/lib/sources/delicious_spec.rb +0 -94
- data/spec/lib/sources/mongo_spec.rb +0 -50
|
@@ -1,187 +0,0 @@
|
|
|
1
|
-
module Picky

  # This is very optional.
  # Only load if the user wants it.
  #
  module Interfaces

    # An interface that lets the user of Picky change a small set of
    # query tokenizer parameters while the application is running.
    #
    # Important Note: This will only work in Master/Child configurations.
    #
    # How it works, as far as this file shows:
    # * The instance owns a pipe that is created in #initialize.
    # * A worker (child) process applies a new configuration to itself
    #   in #parameters and then writes it into the pipe.
    # * A thread started in #initialize reads from the pipe, applies the
    #   same configuration in its own process and kills every worker
    #   except the one that sent the update.
    #
    class LiveParameters

      # The only configuration keys that may be updated at runtime.
      # Each one has a matching reader and writer further down.
      #
      UPDATABLE_KEYS = %w[
        querying_removes_characters
        querying_stopwords
        querying_splits_text_on
      ].freeze

      # Raised when a configuration value could not be applied.
      #
      # config_key is the key that was being set when the error occurred.
      #
      class CouldNotUpdateConfigurationError < StandardError
        attr_reader :config_key
        def initialize config_key, message
          super message
          @config_key = config_key
        end
      end

      # Creates the pipe and starts the listener thread.
      #
      # IO.pipe returns [read_end, write_end]: @child is the end that is
      # read from (messages coming from children), @parent is the end
      # that children write to (messages going to the parent).
      #
      def initialize
        @child, @parent = IO.pipe
        start_master_process_thread
      end

      # This runs a thread that listens to child processes.
      #
      # Each message is "[pid, configuration_hash];;;" as written
      # by #write_parent.
      #
      def start_master_process_thread
        # This thread is stopped in the children.
        #
        Thread.new do
          loop do
            # Wake up at least every 2 seconds, even if nothing arrived.
            #
            IO.select([@child], nil, nil, 2) or next
            result = @child.gets ';;;'

            # nil means EOF: no writer is left, so there is nothing
            # more to listen for.
            #
            break unless result

            # SECURITY NOTE(review): This evaluates text received over the
            # pipe as Ruby code. The text is produced by #write_parent via
            # Array#to_s, so it is only as trustworthy as the inspected
            # configuration values. Consider a data-only format.
            #
            pid, configuration_hash = eval result
            next unless Hash === configuration_hash
            next if configuration_hash.empty?
            exclaim "Trying to update MASTER configuration."
            try_updating_configuration_with configuration_hash
            kill_each_worker_except pid

            # Fails hard on an error.
            #
          end
        end
      end

      # TODO This needs to be webserver agnostic.
      #
      def worker_pids
        Unicorn::HttpServer::WORKERS.keys
      end

      # Taken from Unicorn.
      #
      # Sends KILL to every known worker, except the one with the given pid.
      #
      def kill_each_worker_except pid
        worker_pids.each do |wpid|
          next if wpid == pid
          kill_worker :KILL, wpid
        end
      end

      # Sends the signal to the worker with the given pid.
      #
      # If no such process exists (Errno::ESRCH), the worker is removed
      # from the worker list instead.
      #
      def kill_worker signal, wpid
        Process.kill signal, wpid
        exclaim "Killing worker ##{wpid} with signal #{signal}."
      rescue Errno::ESRCH
        remove_worker wpid
      end

      # TODO This needs to be Webserver agnostic.
      #
      # Removes the worker from Unicorn's worker list and closes its
      # tmp handle.
      #
      # This is deliberately best effort: any StandardError (including
      # Unicorn not being loaded) is swallowed and nil is returned.
      #
      def remove_worker wpid
        worker = Unicorn::HttpServer::WORKERS.delete wpid
        worker.tmp.close if worker
      rescue StandardError
        nil
      end

      # Updates any parameters with the ones given and
      # returns the updated params.
      #
      # The params are a strictly defined hash of:
      #   * querying_removes_characters: Regexp
      #   * querying_stopwords:          Regexp
      #   * querying_splits_text_on:     Regexp
      #
      # This first tries to update in the child process,
      # and if successful, in the parent process.
      #
      # If updating fails, the child process sends itself a QUIT signal
      # and { failing_key => :ERROR } is returned.
      #
      def parameters configuration_hash
        close_child
        exclaim "Trying to update worker child configuration." unless configuration_hash.empty?
        try_updating_configuration_with configuration_hash
        write_parent configuration_hash
        extract_configuration
      rescue CouldNotUpdateConfigurationError => e
        # I need to die such that my broken config is never used.
        #
        exclaim "Child process #{Process.pid} performs harakiri because of broken config."
        harakiri
        { e.config_key => :ERROR }
      end

      # Kills itself, but still answering the request honorably.
      #
      def harakiri
        Process.kill :QUIT, Process.pid
      end

      # Write the parent.
      #
      # Note: The ;;; is the end marker for the message.
      #
      def write_parent configuration_hash
        @parent.write "#{[Process.pid, configuration_hash]};;;"
      end

      # Close the child if it isn't yet closed.
      #
      def close_child
        @child.close unless @child.closed?
      end

      # Tries updating the configuration in the child process or parent process.
      #
      # Only the keys in UPDATABLE_KEYS (as String or Symbol) are accepted.
      # The keys arrive from outside, so they must not be turned into
      # arbitrary method calls.
      #
      # Raises a CouldNotUpdateConfigurationError carrying the offending
      # key if a key is not allowed or if setting a value fails.
      #
      def try_updating_configuration_with configuration_hash
        current_key = nil
        configuration_hash.each_pair do |key, new_value|
          exclaim " Setting #{key} with #{new_value}."
          current_key = key
          unless UPDATABLE_KEYS.include? key.to_s
            raise ArgumentError, "#{key} is not an updatable parameter."
          end
          send :"#{key}=", new_value
        end
      rescue StandardError => e
        # Catch any error and reraise as config error.
        #
        raise CouldNotUpdateConfigurationError.new current_key, e.message
      end

      # Returns the current values of all updatable parameters.
      #
      def extract_configuration
        {
          querying_removes_characters: querying_removes_characters,
          querying_stopwords:          querying_stopwords,
          querying_splits_text_on:     querying_splits_text_on
        }
      end

      # TODO Move to Interface object.
      #
      # The readers return the regexp source (a String) or nil if unset.
      # The writers build a Regexp from the given value and store it
      # directly in the default query tokenizer's instance variables.
      #
      def querying_removes_characters
        regexp = Tokenizer.query_default.instance_variable_get :@removes_characters_regexp
        regexp && regexp.source
      end
      def querying_removes_characters= new_value
        Tokenizer.query_default.instance_variable_set(:@removes_characters_regexp, %r{#{new_value}})
      end
      def querying_stopwords
        regexp = Tokenizer.query_default.instance_variable_get :@remove_stopwords_regexp
        regexp && regexp.source
      end
      def querying_stopwords= new_value
        Tokenizer.query_default.instance_variable_set(:@remove_stopwords_regexp, %r{#{new_value}})
      end

      # The split pattern may be a Regexp or a plain value (e.g. a String).
      # A Regexp is returned as its source, anything else as is.
      #
      def querying_splits_text_on
        splits = Tokenizer.query_default.instance_variable_get :@splits_text_on
        splits.respond_to?(:source) ? splits.source : splits
      end

      # Keeps the kind of the current split pattern: if it currently is a
      # Regexp, the new value is compiled into a Regexp, otherwise the
      # new value is stored unchanged.
      #
      def querying_splits_text_on= new_value
        tokenizer = Tokenizer.query_default
        splits    = tokenizer.instance_variable_get :@splits_text_on
        new_value = %r{#{new_value}} if splits.respond_to?(:source)
        tokenizer.instance_variable_set(:@splits_text_on, new_value)
      end

      #
      #
      def to_s
        "Suckerfish Live Interface (Use the picky-live gem to introspect)"
      end

    end

  end

  # Aka.
  #
  LiveParameters = Interfaces::LiveParameters

end
|
data/lib/picky/sources/base.rb
DELETED
|
@@ -1,92 +0,0 @@
|
|
|
1
|
-
module Picky

  # = Data Sources
  #
  # Currently, Picky offers the following Sources:
  # * CSV (comma – or other – separated file)
  # * Couch (CouchDB, key-value store)
  # * DB (Databases, foremost MySQL)
  # * Delicious (http://del.icio.us, online bookmarking service)
  # See also:
  # http://github.com/floere/picky/wiki/Sources-Configuration
  #
  # Don't worry if your source isn't here. Adding your own is easy:
  # http://github.com/floere/picky/wiki/Contributing-sources
  #
  module Sources

    # Sources are where your data comes from.
    #
    # A source has 1 mandatory and 2 optional methods:
    # * connect_backend (_optional_): called once for each type/category pair.
    # * harvest: Used by the indexer to gather data. Yields an indexed_id (string or integer) and a string value.
    # * take_snapshot (_optional_): called once for each index or category (if indexing a single category).
    #
    # This base class "implements" all these methods, but they don't do anything.
    # Subclass this class <tt>class MySource < Base</tt> and override the methods in your source to do something.
    #
    class Base

      attr_reader :key_format

      # Connect to the backend.
      #
      # Called once per index/category combination before harvesting.
      #
      # Examples:
      # * The DB backend connects the DB adapter.
      # * We open a connection to a key value store.
      # * We open a file with data.
      #
      def connect_backend

      end

      # Called by the indexer when gathering data.
      #
      # Yields the data (id, text for id) for the given category.
      #
      # When implementing or overriding your own,
      # be sure to <tt>yield(id, text_for_id)</tt> (or <tt>block.call(id, text_for_id)</tt>)
      # for the given type symbol and category symbol.
      #
      # Note: Since harvest needs to be implemented, it has no default implementation.
      #
      # def harvest category # :yields: id, text_for_id
      #
      # end

      # Used to take a snapshot of your data if it is fast changing.
      #
      # Called once for each index before harvesting.
      # If it has been called on a source already by an index,
      # it won't be called again for a category inside that index.
      #
      # Example:
      # * In a DB source, a table based on the source's select statement is created.
      #
      def take_snapshot index

      end

      # Runs the given block with a snapshot in place.
      #
      # Connects the backend, then takes a snapshot unless this source is
      # already inside a (nested) with_snapshot call. @snapshot_taken
      # counts how many with_snapshot calls are currently active.
      #
      # The counter is decremented even if the block raises. Otherwise a
      # single failure would leave it above zero and no snapshot would
      # ever be taken again on this source.
      #
      # Returns the number of with_snapshot calls that are still active.
      #
      def with_snapshot index
        connect_backend
        @snapshot_taken ||= 0
        take_snapshot index if @snapshot_taken.zero?
        @snapshot_taken += 1
        begin
          yield
        ensure
          @snapshot_taken -= 1
        end
        @snapshot_taken
      end

    end

  end

end
|
data/lib/picky/sources/couch.rb
DELETED
|
@@ -1,76 +0,0 @@
|
|
|
1
|
-
module Picky

  module Sources

    # A Couch database source.
    #
    # Options:
    # * url
    # * key_format (optional): the method used to format ids, defaults to :to_s.
    # and all the options of a <tt>RestClient::Resource</tt>.
    # See http://github.com/archiloque/rest-client.
    #
    # Examples:
    #  Picky::Sources::Couch.new(:title, :author, :isbn, url:'localhost:5984')
    #  Picky::Sources::Couch.new(:title, :author, :isbn, url:'localhost:5984', user:'someuser', password:'somepassword')
    #
    class Couch < Base

      # Raised when a Couch source is instantiated without a url.
      #
      # Example:
      #   Picky::Sources::Couch.new(:column1, :column2) # without url option
      #
      class NoDBGiven < StandardError; end

      # The last argument is the options hash, everything before it
      # are category names.
      #
      # Raises NoDBGiven if the last argument is not a Hash or has no :url.
      #
      def initialize *category_names, options
        check_gem

        Hash === options && options[:url] || raise_no_db_given(category_names)

        # Work on a copy so the caller's hash is left untouched, and take
        # out Picky's own options before the rest is handed to RestClient.
        #
        resource_options = options.dup
        url              = resource_options.delete :url
        key_format       = resource_options.delete :key_format

        @db         = RestClient::Resource.new url, resource_options
        @key_format = key_format && key_format.intern || :to_s
      end

      def to_s
        self.class.name
      end

      # Tries to require the rest_client gem.
      #
      # Warns and exits the process with status 1 if it cannot be loaded.
      #
      def check_gem # :nodoc:
        require 'rest_client'
      rescue LoadError
        warn_gem_missing 'rest-client', 'the CouchDB source'
        exit 1
      end

      # Harvests the data to index.
      #
      # Yields the document's id and the value stored under the category's
      # +from+ name. Documents without a value for it are skipped.
      #
      @@id_key = '_id'
      def harvest category
        category_name = category.from.to_s
        get_data do |doc|
          id   = doc[@@id_key]
          text = doc[category_name]
          next unless text
          yield id, text
        end
      end

      # Fetches all documents in one request and passes each to the block.
      #
      # NOTE(review): JSON is not required in this file; presumably it is
      # loaded elsewhere - confirm.
      #
      def get_data &block # :nodoc:
        resp = @db['_all_docs?include_docs=true'].get
        JSON.parse(resp)['rows'].
          map{|row| row['doc']}.
          each &block
      end

      # Raises a NoDBGiven exception listing the category names.
      #
      def raise_no_db_given category_names # :nodoc:
        raise NoDBGiven.new(category_names.join(', '))
      end
    end
  end

end
|
data/lib/picky/sources/csv.rb
DELETED
|
@@ -1,83 +0,0 @@
|
|
|
1
|
-
module Picky

  module Sources

    # Describes a CSV source, a file with comma separated values in it.
    #
    # The first column is implicitly assumed to be the id column.
    #
    # It takes the same options as the Ruby 1.9 CSV class, plus:
    # * file: the path of the CSV file (mandatory).
    # * key_format (optional): the method used to format ids, defaults to :to_i.
    #
    # Examples:
    #  Sources::CSV.new(:title, :author, :isbn, file:'data/a_csv_file.csv')
    #  Sources::CSV.new(:title, :author, :isbn, file:'data/a_csv_file.csv', col_sep:';')
    #  Sources::CSV.new(:title, :author, :isbn, file:'data/a_csv_file.csv', row_sep:"\n")
    #
    class CSV < Base

      # Raised when a CSV source is instantiated without a file.
      #
      # Example:
      #   Sources::CSV.new(:column1, :column2) # without file option
      #
      class NoFileGiven < StandardError; end

      # The CSV file's path, relative to PICKY_ROOT.
      #
      attr_reader :file_name

      # The options that were passed into #new
      # (without Picky's own :file and :key_format).
      #
      attr_reader :csv_options, :key_format

      # The data category names.
      #
      attr_reader :category_names

      # The last argument is the options hash, everything before it
      # are category names.
      #
      # Raises NoFileGiven if the last argument is not a Hash or has no :file.
      #
      def initialize *category_names, options
        require 'csv'
        @category_names = category_names

        # Copy the options so the caller's hash is left untouched when
        # Picky's own keys are taken out.
        #
        @csv_options = Hash === options ? options.dup : {}
        @file_name   = @csv_options.delete(:file) || raise_no_file_given(category_names)

        key_format  = @csv_options.delete :key_format
        @key_format = key_format && key_format.intern || :to_i
      end

      # A description of this source with its parameters.
      #
      def to_s
        # Build the description on a copy: appending to category_names
        # itself would add the hashes to the source's categories on each call.
        #
        parameters = category_names.dup
        parameters << { file: file_name }
        parameters << csv_options unless csv_options.empty?
        %Q{#{self.class.name}(#{parameters.join(', ')})}
      end

      # Raises a NoFileGiven exception listing the category names.
      #
      def raise_no_file_given category_names # :nodoc:
        raise NoFileGiven.new(category_names.join(', '))
      end

      # Harvests the data to index.
      #
      # Yields the id (first column) and the text of the column that
      # belongs to the category. Rows without a value in it are skipped.
      #
      def harvest category
        index = category_names.index category.from
        get_data do |indexed_id, *ary|
          text = ary[index]
          next unless text
          text.force_encoding 'utf-8' # TODO Still needed?
          yield indexed_id, text
        end
      end

      # Passes each row of the file to the block.
      #
      # NOTE(review): The options are passed as a positional hash, which is
      # how the Ruby 1.9 CSV class takes them - newer Rubies expect
      # keyword arguments here.
      #
      def get_data &block # :nodoc:
        ::CSV.foreach file_name, csv_options, &block
      end

    end

  end

end
|
data/lib/picky/sources/db.rb
DELETED
|
@@ -1,189 +0,0 @@
|
|
|
1
|
-
module Picky

  module Sources

    # Describes a database source. Needs a SELECT statement
    # (with id in it), and a file option or the options from an AR config file.
    #
    # The select statement can be as complicated as you want,
    # as long as it has an id in it and as long as it can be
    # used in a CREATE TABLE AS statement.
    # (working on that last one)
    #
    # Examples:
    #  Sources::DB.new('SELECT id, title, author, year FROM books') # Uses the config from app/db.yml by default.
    #  Sources::DB.new('SELECT id, title, author, year FROM books', file: 'app/some_db.yml')
    #  Sources::DB.new('SELECT b.id, b.title, b.author, b.publishing_year as year FROM books b INNER JOIN ON ...', file: 'app/some_db.yml')
    #  Sources::DB.new('SELECT id, title, author, year FROM books', adapter: 'mysql', host:'localhost', ...)
    #
    class DB < Base

      # The select statement that was passed in.
      #
      attr_reader :select_statement

      # The database adapter.
      #
      attr_reader :database

      # The database connection options that were either passed in or loaded from the given file.
      #
      attr_reader :connection_options, :options

      # The name of the column Picky adds to the snapshot table
      # to traverse it in chunks.
      #
      @@traversal_id = :__picky_id

      def initialize select_statement, options = { file: 'app/db.yml' }
        check_gem

        @select_statement = select_statement
        @database         = create_database_adapter
        @options          = options
      end

      # Tries to require the active_record gem.
      #
      # Warns and exits the process with status 1 if it cannot be loaded.
      #
      def check_gem # :nodoc:
        require 'active_record'
      rescue LoadError
        warn_gem_missing 'active_record', 'the (ActiveRecord) DB source'
        exit 1
      end

      # A description of this source with its parameters.
      #
      def to_s
        parameters = [select_statement.inspect]
        parameters << options unless options.empty?
        %Q{#{self.class.name}(#{parameters.join(', ')})}
      end

      # Creates a database adapter for use with this source.
      #
      # This is an anonymous, abstract ActiveRecord::Base subclass.
      #
      def create_database_adapter # :nodoc:
        # THINK Do not use ActiveRecord directly? Use set_table_name etc?
        #
        adapter_class = Class.new ActiveRecord::Base
        adapter_class.abstract_class = true
        adapter_class
      end

      # Configure the backend.
      #
      # Options:
      #  Either
      #  * file => 'some/filename.yml' # With an active record configuration.
      #  Or
      #  * The configuration as a hash.
      #
      # Returns self.
      #
      def configure options # :nodoc:
        @connection_options = if filename = options[:file]
          File.open(File.join(PICKY_ROOT, filename)) { |file| YAML::load(file) }
        else
          options
        end
        self
      end

      # Connect the backend.
      #
      # Will raise unless connection options have been given.
      #
      def connect_backend
        configure @options
        raise "Database backend not configured" unless connection_options
        database.establish_connection connection_options
      end

      # Take a snapshot of the data.
      #
      # Uses CREATE TABLE AS with the given SELECT statement to create a snapshot of the data.
      #
      def take_snapshot index
        timed_exclaim %Q{  "#{index.identifier}": Taking snapshot of database data.}

        origin      = snapshot_table_name index.name
        on_database = database.connection

        # Drop the table if it exists.
        #
        on_database.drop_table origin if on_database.table_exists?(origin)

        # The adapters currently do not support this.
        #
        on_database.execute "CREATE TABLE #{origin} AS #{select_statement}"

        # Add a column that Picky uses to traverse the table's entries.
        #
        # Note: This has to be the boolean false. The Symbol :false is
        # truthy and would be taken as "NULL allowed".
        #
        on_database.add_column origin, @@traversal_id, :primary_key, :null => false

        # Execute any special queries this index needs executed.
        #
        on_database.execute index.after_indexing if index.after_indexing
      end

      # Counts all the entries that are used for the index.
      #
      def count index_name
        database.connection.select_value("SELECT COUNT(#{@@traversal_id}) FROM #{snapshot_table_name(index_name)}").to_i
      end

      # The name of the snapshot table created by Picky.
      #
      def snapshot_table_name index_name
        "picky_#{index_name}_index"
      end

      # Harvests the data to index in chunks.
      #
      # Runs one query per chunk, with offsets 0, chunksize, 2 * chunksize, ...
      # up to the number of entries in the snapshot table.
      #
      def harvest category, &block
        (0..count(category.index_name)).step(chunksize) do |offset|
          get_data category, offset, &block
        end
      end

      # Gets the data from the backend.
      #
      # Yields id and text for each row of the chunk that has a text.
      #
      def get_data category, offset, &block # :nodoc:
        statement = harvest_statement_with_offset category, offset

        # THINK Not really nice like this. Rewrite if possible.
        #
        if database.connection.adapter_name == "PostgreSQL"
          # Here the rows are accessed by column name.
          #
          id_key   = 'id'
          text_key = category.from.to_s
          database.connection.execute(statement).each do |hash|
            id, text = hash.values_at id_key, text_key
            yield id, text if text
          end
        else
          database.connection.execute(statement).each do |id, text|
            yield id, text if text
          end
        end
      end

      # Builds a harvest statement for getting data to index.
      #
      # The chunk is selected by traversal id. The rows are explicitly
      # ordered by it, since without an ORDER BY the database is free to
      # return any chunksize rows above the offset, and entries could be
      # skipped or harvested twice.
      #
      def harvest_statement_with_offset category, offset
        statement = harvest_statement category

        statement += statement.include?('WHERE') ? ' AND' : ' WHERE'

        "#{statement} st.#{@@traversal_id} > #{offset} ORDER BY st.#{@@traversal_id} LIMIT #{chunksize}"
      end

      # The harvest statement used to pull data from the snapshot table.
      #
      def harvest_statement category
        "SELECT id, #{category.from} FROM #{snapshot_table_name(category.index_name)} st"
      end

      # The amount of records that are loaded each chunk.
      #
      def chunksize
        25_000
      end

    end

  end

end
|