pupa 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +6 -0
- data/.travis.yml +5 -0
- data/.yardopts +4 -0
- data/Gemfile +4 -0
- data/LICENSE +20 -0
- data/README.md +52 -0
- data/Rakefile +37 -0
- data/USAGE +1 -0
- data/lib/pupa/errors.rb +30 -0
- data/lib/pupa/logger.rb +37 -0
- data/lib/pupa/models/base.rb +190 -0
- data/lib/pupa/models/concerns/contactable.rb +34 -0
- data/lib/pupa/models/concerns/identifiable.rb +26 -0
- data/lib/pupa/models/concerns/linkable.rb +26 -0
- data/lib/pupa/models/concerns/nameable.rb +34 -0
- data/lib/pupa/models/concerns/sourceable.rb +26 -0
- data/lib/pupa/models/concerns/timestamps.rb +22 -0
- data/lib/pupa/models/contact_detail_list.rb +28 -0
- data/lib/pupa/models/membership.rb +37 -0
- data/lib/pupa/models/organization.rb +40 -0
- data/lib/pupa/models/person.rb +35 -0
- data/lib/pupa/models/post.rb +28 -0
- data/lib/pupa/processor/client.rb +42 -0
- data/lib/pupa/processor/dependency_graph.rb +18 -0
- data/lib/pupa/processor/helper.rb +15 -0
- data/lib/pupa/processor/middleware/logger.rb +37 -0
- data/lib/pupa/processor/middleware/parse_html.rb +16 -0
- data/lib/pupa/processor/persistence.rb +80 -0
- data/lib/pupa/processor/yielder.rb +50 -0
- data/lib/pupa/processor.rb +351 -0
- data/lib/pupa/refinements/faraday_middleware.rb +32 -0
- data/lib/pupa/refinements/json-schema.rb +36 -0
- data/lib/pupa/runner.rb +185 -0
- data/lib/pupa/version.rb +3 -0
- data/lib/pupa.rb +31 -0
- data/pupa.gemspec +34 -0
- data/schemas/popolo/contact_detail.json +44 -0
- data/schemas/popolo/identifier.json +18 -0
- data/schemas/popolo/link.json +19 -0
- data/schemas/popolo/membership.json +86 -0
- data/schemas/popolo/organization.json +104 -0
- data/schemas/popolo/other_name.json +28 -0
- data/schemas/popolo/person.json +130 -0
- data/schemas/popolo/post.json +78 -0
- data/spec/cassettes/31ac91ccad069eefc07d96cfbe66fa66c1b41fcf.yml +56 -0
- data/spec/cassettes/4ff54d737afb5d693653752d7bf234a405a80172.yml +48 -0
- data/spec/cassettes/898049a22e6ca51dfa2510d9e0e0207a5c396524.yml +54 -0
- data/spec/cassettes/ce69ff734ce852d2bfaa482bbf55d7ffb4762e87.yml +26 -0
- data/spec/cassettes/da629b01e0836deda8a5540a4e6a08783dd7aef9.yml +46 -0
- data/spec/cassettes/e398f35bea86b3d4c87a6934bae1eb7fca8744f9.yml +26 -0
- data/spec/logger_spec.rb +4 -0
- data/spec/models/base_spec.rb +194 -0
- data/spec/models/concerns/contactable_spec.rb +37 -0
- data/spec/models/concerns/identifiable_spec.rb +25 -0
- data/spec/models/concerns/linkable_spec.rb +25 -0
- data/spec/models/concerns/nameable_spec.rb +25 -0
- data/spec/models/concerns/sourceable_spec.rb +25 -0
- data/spec/models/concerns/timestamps_spec.rb +32 -0
- data/spec/models/contact_detail_list_spec.rb +44 -0
- data/spec/models/membership_spec.rb +30 -0
- data/spec/models/organization_spec.rb +24 -0
- data/spec/models/person_spec.rb +24 -0
- data/spec/models/post_spec.rb +19 -0
- data/spec/processor/client_spec.rb +4 -0
- data/spec/processor/dependency_graph_spec.rb +4 -0
- data/spec/processor/helper_spec.rb +4 -0
- data/spec/processor/middleware/logger_spec.rb +87 -0
- data/spec/processor/middleware/parse_html_spec.rb +92 -0
- data/spec/processor/persistence_spec.rb +41 -0
- data/spec/processor/yielder_spec.rb +55 -0
- data/spec/processor_spec.rb +268 -0
- data/spec/runner_spec.rb +85 -0
- data/spec/spec_helper.rb +17 -0
- metadata +342 -0
@@ -0,0 +1,351 @@
|
|
1
|
+
require 'cgi'
require 'json'

require 'nokogiri'

require 'pupa/processor/client'
require 'pupa/processor/dependency_graph'
require 'pupa/processor/helper'
require 'pupa/processor/persistence'
require 'pupa/processor/yielder'
|
10
|
+
|
11
|
+
module Pupa
|
12
|
+
# An abstract processor class from which specific processors inherit.
|
13
|
+
class Processor
|
14
|
+
extend Forwardable
|
15
|
+
include Helper
|
16
|
+
|
17
|
+
class_attribute :tasks
|
18
|
+
self.tasks = []
|
19
|
+
|
20
|
+
def_delegators :@logger, :debug, :info, :warn, :error, :fatal
|
21
|
+
|
22
|
+
# @param [String] output_dir the directory in which to dump JSON documents
|
23
|
+
# @param [String] cache_dir the directory in which to cache HTTP responses
|
24
|
+
# @param [Integer] expires_in the cache's expiration time in seconds
|
25
|
+
# @param [String] level the log level
|
26
|
+
# @param [String,IO] logdev the log device
|
27
|
+
# @param [Hash] options criteria for selecting the methods to run
|
28
|
+
def initialize(output_dir, cache_dir: nil, expires_in: 86400, level: 'INFO', logdev: STDOUT, options: {})
  # Plain settings are stored as given.
  @output_dir, @options, @level = output_dir, options, level

  # The logger and the HTTP client are both configured with the same level.
  @logger = Logger.new('pupa', level: level, logdev: logdev)
  @client = Client.new(cache_dir: cache_dir, expires_in: expires_in, level: level)
end
|
35
|
+
|
36
|
+
# Retrieves and parses a document with a GET request.
|
37
|
+
#
|
38
|
+
# @param [String] url a URL to an HTML document
|
39
|
+
# @param [String,Hash] params query string parameters
|
40
|
+
# @return a parsed document
|
41
|
+
def get(url, params = {})
  # Faraday requires `params` to be a hash, so a raw query string is parsed.
  if params.is_a?(String)
    # `CGI.parse` wraps every value in an array. Single values are unwrapped so
    # that `a=1` is sent as a scalar. Building a fresh hash avoids rewriting
    # the hash while iterating it, and drops the default proc that `CGI.parse`
    # attaches (which would otherwise answer `[]` for unknown keys).
    params = CGI.parse(params).each_with_object({}) do |(key, values), flattened|
      flattened[key] = values.size == 1 ? values.first : values
    end
  end

  @client.get(url, params).body
end
|
56
|
+
|
57
|
+
# Retrieves and parses a document with a POST request.
|
58
|
+
#
|
59
|
+
# @param [String] url a URL to an HTML document
|
60
|
+
# @param [String,Hash] params query string parameters
|
61
|
+
# @return a parsed document
|
62
|
+
def post(url, params = {})
  # Only the body of the client's response is handed back to the caller.
  response = @client.post(url, params)
  response.body
end
|
65
|
+
|
66
|
+
# Adds a scraping task to Pupa.rb.
|
67
|
+
#
|
68
|
+
# Defines a method whose name is identical to `task_name`. This method
|
69
|
+
# selects a method to perform the scraping task using `scraping_task_method`
|
70
|
+
# and memoizes its return value. The return value is a lazy enumerator of
|
71
|
+
# objects scraped by the selected method. The selected method must yield
|
72
|
+
# objects to populate this lazy enumerator.
|
73
|
+
#
|
74
|
+
# For example, `MyProcessor.add_scraping_task(:people)` defines a `people`
|
75
|
+
# method on `MyProcessor`. This `people` method returns a lazy enumerator of
|
76
|
+
# objects (presumably Person objects in this case, but the enumerator can
|
77
|
+
# contain any object in the general case).
|
78
|
+
#
|
79
|
+
# In `MyProcessor`, you would define a `scrape_people` method, which must
|
80
|
+
# yield objects to populate the lazy enumerator. Alternatively, you may
|
81
|
+
# override `scraping_task_method` to change the method selected to perform
|
82
|
+
# the scraping task.
|
83
|
+
#
|
84
|
+
# The `people` method can then be called by transformation and import tasks.
|
85
|
+
#
|
86
|
+
# @param [Symbol] task_name a task name
|
87
|
+
# @see Pupa::Processor#scraping_task_method
|
88
|
+
def self.add_scraping_task(task_name)
  # Assign a new array rather than appending in place.
  self.tasks = tasks + [task_name]

  define_method(task_name) do
    memo = "@#{task_name}"
    # Build the enumerator once per instance; later calls reuse it.
    unless instance_variable_defined?(memo)
      instance_variable_set(memo, Yielder.new(&method(scraping_task_method(task_name))))
    end
    instance_variable_get(memo)
  end
end
|
99
|
+
|
100
|
+
# Dumps scraped objects to disk.
|
101
|
+
#
|
102
|
+
# @param [Symbol] task_name the name of the scraping task to perform
|
103
|
+
def dump_scraped_objects(task_name)
  # The task method is looked up dynamically by name.
  scraped = send(task_name)
  scraped.each { |object| dump_scraped_object(object) }
end
|
108
|
+
|
109
|
+
# Saves scraped objects to a database.
|
110
|
+
#
|
111
|
+
# @raise [TSort::Cyclic] if the dependency graph is cyclic
|
112
|
+
# @raise [Pupa::Errors::UnprocessableEntity] if an object's foreign keys or
|
113
|
+
# foreign objects cannot be resolved
|
114
|
+
# @raise [Pupa::Errors::DuplicateDocumentError] if duplicate objects were
|
115
|
+
# inadvertently saved to the database
|
116
|
+
def import
  objects = deduplicate(load_scraped_objects)
  object_id_to_database_id = {}

  if use_dependency_graph?(objects)
    # The dependency graph strategy only works if there are no foreign
    # objects. Objects are visited in the graph's topological order.
    build_dependency_graph(objects).tsort.each do |id|
      object = objects[id]
      # Replace object IDs with database IDs in foreign keys, then save.
      resolve_foreign_keys(object, object_id_to_database_id)
      object_id_to_database_id[id] = Persistence.new(object).save
    end
  else
    size = objects.size

    # Should be O(n²). With foreign objects, not all edges of the graph are
    # known, so no evaluation order can be derived up front. Instead, keep
    # sweeping over the remaining objects and save whatever is resolvable.
    #
    # An exception is raised if a foreign object matches multiple documents
    # in the database. However, if a matching object is not yet saved, this
    # exception may not be raised.
    loop do
      progress_made = false

      objects.delete_if do |id, object|
        # Both checks always run, even when the first one already failed.
        keys_resolvable = object.foreign_keys.all? do |property|
          value = object[property]
          value.nil? || object_id_to_database_id.key?(value)
        end
        foreign_objects_resolvable = object.foreign_objects.all? do |property|
          selector = object[property]
          selector.blank? || Persistence.find(selector)
        end

        next unless keys_resolvable && foreign_objects_resolvable

        progress_made = true
        resolve_foreign_keys(object, object_id_to_database_id)
        resolve_foreign_objects(object)
        # The block's value is the save result; a truthy value removes the
        # object from the set of pending objects.
        object_id_to_database_id[id] = Persistence.new(object).save
      end

      break if objects.empty? || !progress_made
    end

    unless objects.empty?
      raise Errors::UnprocessableEntity, "couldn't resolve #{objects.size}/#{size} objects:\n #{objects.values.map{|object| JSON.dump(object.foreign_properties)}.join("\n ")}"
    end
  end

  # Ensure that fingerprints uniquely identified objects: no two object IDs
  # may have been written to the same database ID.
  object_ids_by_database_id = object_id_to_database_id.keys.group_by do |object_id|
    object_id_to_database_id[object_id]
  end
  duplicates = object_ids_by_database_id.select { |_, object_ids| object_ids.size > 1 }

  unless duplicates.empty?
    raise Errors::DuplicateDocumentError, "multiple objects written to same document:\n" + duplicates.map{|database_id,object_ids| " #{database_id} <- #{object_ids.join(' ')}"}.join("\n")
  end
end
|
185
|
+
|
186
|
+
private
|
187
|
+
|
188
|
+
# Returns the name of the method - `scrape_<task_name>` by default - that
|
189
|
+
# would be used to perform the given scraping task.
|
190
|
+
#
|
191
|
+
# If you would like to change this default behavior, override this method in
|
192
|
+
# a subclass. For example, you may want to select a method according to the
|
193
|
+
# additional `options` passed from the command-line to the processor.
|
194
|
+
#
|
195
|
+
# @param [Symbol] task_name a task name
|
196
|
+
# @return [String] the name of the method to perform the scraping task
|
197
|
+
def scraping_task_method(task_name)
  # By default the scraping method is the task name prefixed with `scrape_`.
  'scrape_' + task_name.to_s
end
|
200
|
+
|
201
|
+
# Dumps a scraped object to disk.
|
202
|
+
#
|
203
|
+
# @param [Object] object a scraped object
|
204
|
+
# @raise [Pupa::Errors::DuplicateObjectIdError]
|
205
|
+
def dump_scraped_object(object)
  type = object.class.to_s.demodulize.underscore
  basename = "#{type}_#{object._id}.json"
  path = File.join(@output_dir, basename)

  # Never overwrite a document that was already dumped to the output directory.
  if File.exist?(path)
    raise Errors::DuplicateObjectIdError, "duplicate object ID: #{object._id} (was the same objected yielded twice?)"
  end

  info {"save #{type} #{object.to_s} as #{basename}"}

  document = JSON.dump(object.to_h(include_foreign_objects: true))
  File.open(path, 'w') { |file| file.write(document) }

  # Validation happens after the document is written: an invalid object is
  # reported with a warning but still stays on disk.
  begin
    object.validate!
  rescue JSON::Schema::ValidationError => error
    warn {error.message}
  end
end
|
226
|
+
|
227
|
+
# Loads scraped objects from disk.
|
228
|
+
#
|
229
|
+
# @return [Hash] a hash of scraped objects keyed by ID
|
230
|
+
def load_scraped_objects
  Dir[File.join(@output_dir, '*.json')].each_with_object({}) do |path, objects|
    attributes = JSON.load(File.read(path))
    # The `_type` attribute names the class to instantiate.
    object = attributes['_type'].camelize.constantize.new(attributes)
    objects[object._id] = object
  end
end
|
239
|
+
|
240
|
+
# Removes all duplicate objects and re-assigns any foreign keys.
|
241
|
+
#
|
242
|
+
# @param [Hash] objects a hash of scraped objects keyed by ID
|
243
|
+
# @return [Hash] the objects without duplicates
|
244
|
+
def deduplicate(objects)
  losers_to_winners = build_losers_to_winners_map(objects)

  # Drop every object that lost to a duplicate.
  losers_to_winners.each_key { |loser_id| objects.delete(loser_id) }

  # Point foreign keys that referenced a loser at its winner instead.
  objects.each_value do |object|
    object.foreign_keys.each do |property|
      referenced_id = object[property]
      next unless referenced_id && losers_to_winners.key?(referenced_id)

      object[property] = losers_to_winners[referenced_id]
    end
  end

  objects
end
|
264
|
+
|
265
|
+
# For each object, map its ID to the ID of its duplicate, if any.
|
266
|
+
#
|
267
|
+
# @param [Hash] objects a hash of scraped objects keyed by ID
|
268
|
+
# @return [Hash] a mapping from an object ID to the ID of its duplicate
|
269
|
+
def build_losers_to_winners_map(objects)
  map = {}
  entries = objects.to_a

  entries.each_with_index do |(winner_id, winner), index|
    # Don't search for duplicates of duplicates.
    next if map.key?(winner_id)

    # Only later entries can lose to this one; earlier ones were already
    # compared against it.
    entries[(index + 1)..-1].each do |candidate_id, candidate|
      map[candidate_id] = winner_id if winner == candidate
    end
  end

  map
end
|
282
|
+
|
283
|
+
# If any objects have unresolved foreign objects, we cannot derive an
|
284
|
+
# evaluation order using a dependency graph.
|
285
|
+
#
|
286
|
+
# @param [Hash] objects a hash of scraped objects keyed by ID
|
287
|
+
# @return [Boolean] whether a dependency graph can be used to derive an
|
288
|
+
# evaluation order
|
289
|
+
def use_dependency_graph?(objects)
  # A single populated foreign object anywhere rules the graph out.
  objects.each_value.none? do |object|
    object.foreign_objects.any? { |property| object[property].present? }
  end
end
|
299
|
+
|
300
|
+
# Builds a dependency graph.
|
301
|
+
#
|
302
|
+
# @param [Hash] objects a hash of scraped objects keyed by ID
|
303
|
+
# @return [DependencyGraph] the dependency graph
|
304
|
+
def build_dependency_graph(objects)
  graph = DependencyGraph.new

  objects.each do |id, object|
    # Every object gets a node, even when it depends on nothing.
    graph[id] = [] # no duplicate IDs
    object.foreign_keys.each do |property|
      dependency_id = object[property]
      graph[id] << dependency_id if dependency_id
    end
  end

  graph
end
|
317
|
+
|
318
|
+
# Resolves an object's foreign keys from object IDs to database IDs.
|
319
|
+
#
|
320
|
+
# @param [Object] object an object
|
321
|
+
# @param [Hash] map a map from object ID to database ID
|
322
|
+
# @raise [Pupa::Errors::MissingDatabaseIdError]
|
323
|
+
def resolve_foreign_keys(object, map)
  object.foreign_keys.each do |property|
    scraped_id = object[property]
    next unless scraped_id

    # If using a dependency graph, any foreign key that cannot be resolved
    # will cause a key error while building the dependency graph.
    #
    # If not using a dependency graph, this method will not be called
    # unless the foreign key is resolvable.
    object[property] = map[scraped_id]
  end
end
|
336
|
+
|
337
|
+
# Resolves an object's foreign objects to database IDs.
|
338
|
+
#
|
339
|
+
# @param [Object] object an object
|
340
|
+
# @raise [Pupa::Errors::MissingDatabaseIdError]
|
341
|
+
def resolve_foreign_objects(object)
  object.foreign_objects.each do |property|
    selector = object[property]
    next unless selector.present?

    # This method will not be called unless the foreign object is resolvable.
    document = Persistence.find(selector)
    object["#{property}_id"] = document['_id']
  end
end
|
350
|
+
end
|
351
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Pupa
|
2
|
+
class Refinements
|
3
|
+
# A refinement for the Faraday caching middleware to cache all requests, not
|
4
|
+
# only GET requests.
|
5
|
+
module FaradayMiddleware
|
6
|
+
refine ::FaradayMiddleware::Caching do
|
7
|
+
# Serves the response from the cache whatever the HTTP method is, calling the
# next middleware only when the cache has no entry for the request.
def call(env)
  # callback mode
  return cache_on_complete(env) if env[:parallel_manager]

  # synchronous mode
  response = cache.fetch(cache_key(env)) { @app.call(env) }
  finalize_response(response, env)
end
|
18
|
+
|
19
|
+
# Builds the cache key from the normalized request URI (minus any ignored
# query parameters) followed by the request body.
def cache_key(env)
  url = env[:url].dup # leave the request's own URL untouched

  strip_ignored_params = url.query && params_to_ignore.any?
  if strip_ignored_params
    query_params = parse_query(url.query)
    query_params.reject! { |name, _| params_to_ignore.include?(name) }
    url.query = build_query(query_params)
  end

  url.normalize!
  url.request_uri + env[:body].to_s # Add for POST requests.
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Pupa
|
2
|
+
class Refinements
|
3
|
+
# A refinement for JSON Schema to validate "email" and "uri" formats.
|
4
|
+
module Format
|
5
|
+
# @see http://my.rails-royce.org/2010/07/21/email-validation-in-ruby-on-rails-without-regexp/
|
6
|
+
# Validates the "email" and "uri" formats; every other format is handed to the
# implementation this module is prepended to.
#
# A value that fails the check is reported through `validation_error`. The
# method returns nil for the two formats handled here.
def validate(current_schema, data, fragments, processor, validator, options = {})
  case current_schema.schema['format']
  when 'email'
    error_message = "The property '#{build_fragment(fragments)}' must be a valid email address"
    # Constructing the address is part of the check: input that the parser
    # rejects with an exception is simply not a valid email address.
    valid = data.is_a?(String) && begin
      address = Mail::Address.new(data)
      address.address == data && address.domain && address.__send__(:tree).domain.dot_atom_text.elements.size > 1
    rescue StandardError
      false
    end
    # Reported outside the rescue so that an error raised by
    # `validation_error` itself is never swallowed.
    unless valid
      validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
    end
    nil
  when 'uri'
    error_message = "The property '#{build_fragment(fragments)}' must be a valid URI"
    # NOTE(review): newer Ruby versions appear to default to an RFC 3986
    # parser whose regexp table may lack :ABS_URI, so prefer the RFC 2396
    # parser when that constant exists - confirm against the targeted Rubies.
    parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
    unless data.is_a?(String) && parser.regexp[:ABS_URI].match(data)
      validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
    end
    nil
  else
    super
  end
end
|
28
|
+
end
|
29
|
+
|
30
|
+
class ::JSON::Schema::FormatAttribute
  # Put the refinement in front of the class-level methods. `prepend` is
  # invoked through `send` because it is a private method on older Rubies.
  singleton_class.send(:prepend, Pupa::Refinements::Format)
end
|
35
|
+
end
|
36
|
+
end
|
data/lib/pupa/runner.rb
ADDED
@@ -0,0 +1,185 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
require 'optparse'
|
3
|
+
require 'ostruct'
|
4
|
+
|
5
|
+
require 'moped'
|
6
|
+
|
7
|
+
module Pupa
|
8
|
+
class Runner
|
9
|
+
attr_reader :options, :actions
|
10
|
+
|
11
|
+
# @param [Pupa::Processor] processor_class a processor class
|
12
|
+
# @param [Hash] defaults change any default options
|
13
|
+
def initialize(processor_class, defaults = {})
  @processor_class = processor_class

  # Built-in defaults; any key given in `defaults` takes precedence.
  builtin_options = {
    actions: [],
    tasks: [],
    output_dir: File.expand_path('scraped_data', Dir.pwd),
    cache_dir: File.expand_path('web_cache', Dir.pwd),
    expires_in: 86400, # 1 day
    host_with_port: 'localhost:27017',
    database: 'pupa',
    dry_run: false,
    level: 'INFO',
  }
  @options = OpenStruct.new(builtin_options.merge(defaults))

  # The actions available out of the box, in the order they are listed.
  @actions = [
    ['scrape', 'Scrapes data from online sources'],
    ['import', 'Imports scraped data into a database'],
  ].map do |name, description|
    OpenStruct.new(name: name, description: description)
  end
end
|
35
|
+
|
36
|
+
# @param [Hash] attributes the action's attributes
|
37
|
+
# @option attributes [String] :name the action's label
|
38
|
+
# @option attributes [String] :description a description of the action
|
39
|
+
def add_action(attributes)
  # Actions are kept as structs exposing `name` and `description`.
  action = OpenStruct.new(attributes)
  @actions << action
end
|
42
|
+
|
43
|
+
# Returns the command-line option parser.
|
44
|
+
#
|
45
|
+
# @return [OptionParser] the command-line option parser
|
46
|
+
def opts
  # The parser is built once and memoized.
  @opts ||= OptionParser.new do |opts|
    opts.program_name = File.basename($PROGRAM_NAME)
    opts.banner = "Usage: #{opts.program_name}"

    opts.separator ''
    opts.separator 'Actions:'

    names = @actions.map(&:name)
    # Pad action names to a common width so the descriptions line up.
    padding = names.map(&:size).max
    @actions.each do |action|
      opts.separator " #{action.name.ljust(padding)} #{action.description}\n"
    end

    opts.separator ''
    opts.separator 'Tasks:'

    @processor_class.tasks.each do |task_name|
      opts.separator " #{task_name}"
    end

    opts.separator ''
    opts.separator 'Specific options:'
    opts.on('-a', '--action ACTION', names, 'Select an action to run (you may give this switch multiple times)', " (#{names.join(', ')})") do |v|
      options.actions << v
    end
    opts.on('-t', '--task TASK', @processor_class.tasks, 'Select a scraping task to run (you may give this switch multiple times)', " (#{@processor_class.tasks.join(', ')})") do |v|
      options.tasks << v
    end
    opts.on('-o', '--output_dir PATH', 'The directory in which to dump JSON documents') do |v|
      options.output_dir = v
    end
    opts.on('-c', '--cache_dir PATH', 'The directory in which to cache HTTP requests') do |v|
      options.cache_dir = v
    end
    # Coerce to Integer so the command-line value has the same type as the
    # default; a non-numeric value is rejected by the parser.
    opts.on('-e', '--expires_in SECONDS', Integer, "The cache's expiration time in seconds") do |v|
      options.expires_in = v
    end
    opts.on('-H', '--host HOST:PORT', 'The host and port to MongoDB') do |v|
      options.host_with_port = v
    end
    opts.on('-d', '--database NAME', 'The name of the MongoDB database') do |v|
      options.database = v
    end
    opts.on('-n', '--dry-run', 'Show the plan without running any actions') do
      options.dry_run = true
    end
    opts.on('-v', '--verbose', 'Show all messages') do
      options.level = 'DEBUG'
    end
    opts.on('-q', '--quiet', 'Show only warning and error messages') do
      options.level = 'WARN'
    end
    opts.on('-s', '--silent', 'Show no messages') do
      options.level = 'UNKNOWN'
    end

    opts.separator ''
    opts.separator 'Common options:'
    opts.on_tail('-h', '--help', 'Show this message') do
      puts opts
      exit
    end
    # `-v` already belongs to `--verbose`, so the version switch uses `-V`
    # to keep both short switches usable.
    opts.on_tail('-V', '--version', 'Show version') do
      puts Pupa::VERSION
      exit
    end
  end
end
|
115
|
+
|
116
|
+
# Runs the action.
|
117
|
+
#
|
118
|
+
# @example Run from a command-line script
|
119
|
+
#
|
120
|
+
# runner.run(ARGV)
|
121
|
+
#
|
122
|
+
# @example Override the command-line options
|
123
|
+
#
|
124
|
+
# runner.run(ARGV, expires_in: 3600) # 1 hour
|
125
|
+
#
|
126
|
+
# @param [Array] args command-line arguments
|
127
|
+
# @param [Hash] overrides any overridden options
|
128
|
+
def run(args, overrides = {})
  # Whatever the parser does not consume is passed on to the processor.
  rest = opts.parse!(args)

  # Overrides are merged on top of the parsed command-line options.
  @options = OpenStruct.new(options.to_h.merge(overrides))

  options.actions = %w(scrape import) if options.actions.empty?
  options.tasks = @processor_class.tasks if options.tasks.empty?

  processor = @processor_class.new(
    options.output_dir,
    cache_dir: options.cache_dir,
    expires_in: options.expires_in,
    level: options.level,
    options: Hash[*rest]
  )

  # Every action other than `scrape` must be a method of the processor.
  options.actions.each do |action|
    next if action == 'scrape' || processor.respond_to?(action)

    abort %(`#{action}` is not a #{opts.program_name} action. See `#{opts.program_name} --help` for a list of available actions.)
  end

  debugging = options.level == 'DEBUG'

  if debugging || options.level == 'INFO'
    puts "processor: #{@processor_class}"
    puts "actions: #{options.actions.join(', ')}"
    puts "tasks: #{options.tasks.join(', ')}"
  end

  if debugging
    %w(output_dir cache_dir expires_in host_with_port database level).each do |option|
      puts "#{option}: #{options[option]}"
    end
    puts "options: #{rest.join(' ')}" unless rest.empty?
  end

  # A dry run stops after the plan has been shown.
  exit if options.dry_run

  Pupa.session = Moped::Session.new([options.host_with_port], database: options.database)

  # Scraping is handled here; it is removed from the list so that the loop
  # below only sends the remaining actions to the processor.
  if options.actions.delete('scrape')
    FileUtils.mkdir_p(options.output_dir)
    FileUtils.mkdir_p(options.cache_dir)

    # Start from a clean slate: remove documents left over from earlier runs.
    Dir[File.join(options.output_dir, '*.json')].each { |path| FileUtils.rm(path) }

    options.tasks.each { |task_name| processor.dump_scraped_objects(task_name) }
  end

  options.actions.each { |action| processor.send(action) }
end
|
184
|
+
end
|
185
|
+
end
|
data/lib/pupa/version.rb
ADDED
data/lib/pupa.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
|
3
|
+
require 'active_support/concern'
|
4
|
+
require 'active_support/core_ext/class/attribute'
|
5
|
+
require 'active_support/core_ext/object/blank'
|
6
|
+
require 'active_support/inflector'
|
7
|
+
|
8
|
+
require 'pupa/errors'
|
9
|
+
require 'pupa/logger'
|
10
|
+
require 'pupa/processor'
|
11
|
+
require 'pupa/runner'
|
12
|
+
|
13
|
+
require 'pupa/models/concerns/contactable'
|
14
|
+
require 'pupa/models/concerns/identifiable'
|
15
|
+
require 'pupa/models/concerns/linkable'
|
16
|
+
require 'pupa/models/concerns/nameable'
|
17
|
+
require 'pupa/models/concerns/sourceable'
|
18
|
+
require 'pupa/models/concerns/timestamps'
|
19
|
+
|
20
|
+
require 'pupa/models/base'
|
21
|
+
require 'pupa/models/contact_detail_list'
|
22
|
+
require 'pupa/models/membership'
|
23
|
+
require 'pupa/models/organization'
|
24
|
+
require 'pupa/models/person'
|
25
|
+
require 'pupa/models/post'
|
26
|
+
|
27
|
+
module Pupa
  # Module-level reader and writer for the shared session.
  singleton_class.send(:attr_accessor, :session)
end
|
data/pupa.gemspec
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/pupa/version', __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |spec|
  # Identity
  spec.name = "pupa"
  spec.version = Pupa::VERSION
  spec.platform = Gem::Platform::RUBY
  spec.authors = ["Open North"]
  spec.email = ["info@opennorth.ca"]
  spec.homepage = "http://github.com/opennorth/pupa-ruby"
  spec.summary = 'A data scraping framework'
  spec.license = 'MIT'

  # Packaged files are whatever git tracks.
  spec.files = `git ls-files`.split("\n")
  spec.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Runtime dependencies
  spec.add_runtime_dependency('activesupport', '~> 4.0.0')
  spec.add_runtime_dependency('colored', '~> 1.2')
  spec.add_runtime_dependency('faraday_middleware', '~> 0.9.0')
  spec.add_runtime_dependency('json-schema', '~> 2.1.3')
  spec.add_runtime_dependency('mail')
  spec.add_runtime_dependency('moped', '~> 1.5.1')
  spec.add_runtime_dependency('nokogiri', '~> 1.6.0')

  # Development dependencies
  spec.add_development_dependency('coveralls')
  spec.add_development_dependency('json', '~> 1.7.7') # to silence coveralls warning
  spec.add_development_dependency('octokit') # to update Popolo schema
  spec.add_development_dependency('rake')
  spec.add_development_dependency('rspec', '~> 2.10')
  spec.add_development_dependency('vcr', '~> 2.5.0')
  spec.add_development_dependency('multi_xml')
end
|