pupa 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +6 -0
- data/.travis.yml +5 -0
- data/.yardopts +4 -0
- data/Gemfile +4 -0
- data/LICENSE +20 -0
- data/README.md +52 -0
- data/Rakefile +37 -0
- data/USAGE +1 -0
- data/lib/pupa/errors.rb +30 -0
- data/lib/pupa/logger.rb +37 -0
- data/lib/pupa/models/base.rb +190 -0
- data/lib/pupa/models/concerns/contactable.rb +34 -0
- data/lib/pupa/models/concerns/identifiable.rb +26 -0
- data/lib/pupa/models/concerns/linkable.rb +26 -0
- data/lib/pupa/models/concerns/nameable.rb +34 -0
- data/lib/pupa/models/concerns/sourceable.rb +26 -0
- data/lib/pupa/models/concerns/timestamps.rb +22 -0
- data/lib/pupa/models/contact_detail_list.rb +28 -0
- data/lib/pupa/models/membership.rb +37 -0
- data/lib/pupa/models/organization.rb +40 -0
- data/lib/pupa/models/person.rb +35 -0
- data/lib/pupa/models/post.rb +28 -0
- data/lib/pupa/processor/client.rb +42 -0
- data/lib/pupa/processor/dependency_graph.rb +18 -0
- data/lib/pupa/processor/helper.rb +15 -0
- data/lib/pupa/processor/middleware/logger.rb +37 -0
- data/lib/pupa/processor/middleware/parse_html.rb +16 -0
- data/lib/pupa/processor/persistence.rb +80 -0
- data/lib/pupa/processor/yielder.rb +50 -0
- data/lib/pupa/processor.rb +351 -0
- data/lib/pupa/refinements/faraday_middleware.rb +32 -0
- data/lib/pupa/refinements/json-schema.rb +36 -0
- data/lib/pupa/runner.rb +185 -0
- data/lib/pupa/version.rb +3 -0
- data/lib/pupa.rb +31 -0
- data/pupa.gemspec +34 -0
- data/schemas/popolo/contact_detail.json +44 -0
- data/schemas/popolo/identifier.json +18 -0
- data/schemas/popolo/link.json +19 -0
- data/schemas/popolo/membership.json +86 -0
- data/schemas/popolo/organization.json +104 -0
- data/schemas/popolo/other_name.json +28 -0
- data/schemas/popolo/person.json +130 -0
- data/schemas/popolo/post.json +78 -0
- data/spec/cassettes/31ac91ccad069eefc07d96cfbe66fa66c1b41fcf.yml +56 -0
- data/spec/cassettes/4ff54d737afb5d693653752d7bf234a405a80172.yml +48 -0
- data/spec/cassettes/898049a22e6ca51dfa2510d9e0e0207a5c396524.yml +54 -0
- data/spec/cassettes/ce69ff734ce852d2bfaa482bbf55d7ffb4762e87.yml +26 -0
- data/spec/cassettes/da629b01e0836deda8a5540a4e6a08783dd7aef9.yml +46 -0
- data/spec/cassettes/e398f35bea86b3d4c87a6934bae1eb7fca8744f9.yml +26 -0
- data/spec/logger_spec.rb +4 -0
- data/spec/models/base_spec.rb +194 -0
- data/spec/models/concerns/contactable_spec.rb +37 -0
- data/spec/models/concerns/identifiable_spec.rb +25 -0
- data/spec/models/concerns/linkable_spec.rb +25 -0
- data/spec/models/concerns/nameable_spec.rb +25 -0
- data/spec/models/concerns/sourceable_spec.rb +25 -0
- data/spec/models/concerns/timestamps_spec.rb +32 -0
- data/spec/models/contact_detail_list_spec.rb +44 -0
- data/spec/models/membership_spec.rb +30 -0
- data/spec/models/organization_spec.rb +24 -0
- data/spec/models/person_spec.rb +24 -0
- data/spec/models/post_spec.rb +19 -0
- data/spec/processor/client_spec.rb +4 -0
- data/spec/processor/dependency_graph_spec.rb +4 -0
- data/spec/processor/helper_spec.rb +4 -0
- data/spec/processor/middleware/logger_spec.rb +87 -0
- data/spec/processor/middleware/parse_html_spec.rb +92 -0
- data/spec/processor/persistence_spec.rb +41 -0
- data/spec/processor/yielder_spec.rb +55 -0
- data/spec/processor_spec.rb +268 -0
- data/spec/runner_spec.rb +85 -0
- data/spec/spec_helper.rb +17 -0
- metadata +342 -0
@@ -0,0 +1,351 @@
|
|
1
|
+
require 'cgi'
require 'json'

require 'nokogiri'

require 'pupa/processor/client'
require 'pupa/processor/dependency_graph'
require 'pupa/processor/helper'
require 'pupa/processor/persistence'
require 'pupa/processor/yielder'
|
10
|
+
|
11
|
+
module Pupa
|
12
|
+
# An abstract processor class from which specific processors inherit.
|
13
|
+
class Processor
|
14
|
+
extend Forwardable
|
15
|
+
include Helper
|
16
|
+
|
17
|
+
class_attribute :tasks
|
18
|
+
self.tasks = []
|
19
|
+
|
20
|
+
def_delegators :@logger, :debug, :info, :warn, :error, :fatal
|
21
|
+
|
22
|
+
# Stores the processor's settings and builds its logger and HTTP client.
#
# @param [String] output_dir the directory in which to dump JSON documents
# @param [String] cache_dir the directory in which to cache HTTP responses
# @param [Integer] expires_in the cache's expiration time in seconds
# @param [String] level the log level
# @param [String,IO] logdev the log device
# @param [Hash] options criteria for selecting the methods to run
def initialize(output_dir, cache_dir: nil, expires_in: 86400, level: 'INFO', logdev: STDOUT, options: {})
  @output_dir, @options, @level = output_dir, options, level

  # The logger and the HTTP client are given the same log level.
  @logger = Logger.new('pupa', level: level, logdev: logdev)
  @client = Client.new(cache_dir: cache_dir, expires_in: expires_in, level: level)
end
|
35
|
+
|
36
|
+
# Retrieves and parses a document with a GET request.
|
37
|
+
#
|
38
|
+
# @param [String] url a URL to an HTML document
|
39
|
+
# @param [String,Hash] params query string parameters
|
40
|
+
# @return a parsed document
|
41
|
+
def get(url, params = {})
|
42
|
+
# Faraday requires `params` to be a hash.
|
43
|
+
if String === params
|
44
|
+
params = CGI.parse(params)
|
45
|
+
|
46
|
+
# Flatten the parameters for Faraday.
|
47
|
+
params.each do |key,value|
|
48
|
+
if Array === value && value.size == 1
|
49
|
+
params[key] = value.first
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
@client.get(url, params).body
|
55
|
+
end
|
56
|
+
|
57
|
+
# Sends a POST request through the HTTP client and returns the body of the
# response.
#
# @param [String] url a URL to an HTML document
# @param [String,Hash] params the parameters to send with the request
# @return a parsed document
def post(url, params = {})
  response = @client.post(url, params)
  response.body
end
|
65
|
+
|
66
|
+
# Registers a scraping task with the processor class.
#
# The task name is appended to the class's `tasks`, and an instance method
# named after the task is defined. On its first call, that method looks up
# the method that performs the scraping task (see `scraping_task_method`),
# wraps it in a `Yielder` and stores the result in an instance variable named
# after the task; later calls return the stored value. The selected method
# must yield the objects that it scrapes.
#
# For example, `MyProcessor.add_scraping_task(:people)` defines a `people`
# method on `MyProcessor`. In `MyProcessor`, you would then define a
# `scrape_people` method that yields objects (presumably Person objects in
# this case, but any object may be yielded). Alternatively, you may override
# `scraping_task_method` to change the method selected to perform the
# scraping task.
#
# The `people` method can then be called by transformation and import tasks.
#
# @param [Symbol] task_name a task name
# @see Pupa::Processor#scraping_task_method
def self.add_scraping_task(task_name)
  self.tasks += [task_name]

  ivar = "@#{task_name}"

  define_method(task_name) do
    # Build the yielder only once per processor instance.
    unless instance_variable_defined?(ivar)
      instance_variable_set(ivar, Yielder.new(&method(scraping_task_method(task_name))))
    end
    instance_variable_get(ivar)
  end
end
|
99
|
+
|
100
|
+
# Runs a scraping task and dumps every object that it produces to disk.
#
# @param [Symbol] task_name the name of the scraping task to perform
def dump_scraped_objects(task_name)
  send(task_name).each { |scraped| dump_scraped_object(scraped) }
end
|
108
|
+
|
109
|
+
# Saves scraped objects to a database.
#
# Scraped objects are loaded from disk and deduplicated. If no object has a
# foreign object, the objects are saved in the order given by a dependency
# graph of their foreign keys. Otherwise, the objects are saved in repeated
# passes, each pass saving the objects whose foreign keys and foreign objects
# can be resolved at that point.
#
# @raise [TSort::Cyclic] if the dependency graph is cyclic
# @raise [Pupa::Errors::UnprocessableEntity] if an object's foreign keys or
#   foreign objects cannot be resolved
# @raise [Pupa::Errors::DuplicateDocumentError] if duplicate objects were
#   inadvertently saved to the database
def import
  objects = deduplicate(load_scraped_objects)

  object_id_to_database_id = {}

  if use_dependency_graph?(objects)
    dependency_graph = build_dependency_graph(objects)

    # Replace object IDs with database IDs in foreign keys and save objects.
    dependency_graph.tsort.each do |id|
      object = objects[id]
      resolve_foreign_keys(object, object_id_to_database_id)
      # The dependency graph strategy only works if there are no foreign objects.
      object_id_to_database_id[id] = Persistence.new(object).save
    end
  else
    size = objects.size

    # Should be O(n²). If there are foreign objects, we do not know all the
    # edges in the graph, and therefore cannot build a dependency graph or
    # derive any evaluation order.
    #
    # An exception is raised if a foreign object matches multiple documents
    # in the database. However, if a matching object is not yet saved, this
    # exception may not be raised.
    loop do
      progress_made = false

      objects.delete_if do |id,object|
        keys_resolvable = object.foreign_keys.all? do |property|
          value = object[property]
          value.nil? || object_id_to_database_id.key?(value)
        end

        # The foreign objects are looked up even if a foreign key is not yet
        # resolvable, so that lookup errors surface as early as before.
        objects_resolvable = object.foreign_objects.all? do |property|
          selector = object[property]
          selector.blank? || Persistence.find(selector)
        end

        resolvable = keys_resolvable && objects_resolvable

        if resolvable
          progress_made = true
          resolve_foreign_keys(object, object_id_to_database_id)
          resolve_foreign_objects(object)
          object_id_to_database_id[id] = Persistence.new(object).save
        end

        # Remove the object from the pending set if and only if it was saved.
        # Relying on the truthiness of the value returned by `save` would keep
        # a saved object pending forever if that value were nil or false.
        resolvable
      end

      break if objects.empty? || !progress_made
    end

    unless objects.empty?
      raise Errors::UnprocessableEntity, "couldn't resolve #{objects.size}/#{size} objects:\n #{objects.values.map{|object| JSON.dump(object.foreign_properties)}.join("\n ")}"
    end
  end

  # Ensure that fingerprints uniquely identified objects.
  counts = {}
  object_id_to_database_id.each do |object_id,database_id|
    (counts[database_id] ||= []) << object_id
  end
  duplicates = counts.select do |_,object_ids|
    object_ids.size > 1
  end
  unless duplicates.empty?
    raise Errors::DuplicateDocumentError, "multiple objects written to same document:\n" + duplicates.map{|database_id,object_ids| " #{database_id} <- #{object_ids.join(' ')}"}.join("\n")
  end
end
|
185
|
+
|
186
|
+
private
|
187
|
+
|
188
|
+
# Names the method that performs the given scraping task, which is
# `scrape_<task_name>` by default.
#
# Override this method in a subclass to change how the method is selected,
# for example according to the additional `options` passed from the
# command-line to the processor.
#
# @param [Symbol] task_name a task name
# @return [String] the name of the method to perform the scraping task
def scraping_task_method(task_name)
  'scrape_' + task_name.to_s
end
|
200
|
+
|
201
|
+
# Dumps a scraped object to disk.
#
# The object is written as JSON, including its foreign objects, to
# `<type>_<id>.json` in the output directory, where `<type>` is the
# underscored, demodulized name of the object's class. The object is validated
# after it is written; a validation error is logged as a warning and does not
# interrupt the dump.
#
# @param [Object] object a scraped object
# @raise [Pupa::Errors::DuplicateObjectIdError] if a file already exists for
#   the object's type and ID
def dump_scraped_object(object)
  type = object.class.to_s.demodulize.underscore
  basename = "#{type}_#{object._id}.json"
  path = File.join(@output_dir, basename)

  if File.exist?(path)
    raise Errors::DuplicateObjectIdError, "duplicate object ID: #{object._id} (was the same object yielded twice?)"
  end

  info {"save #{type} #{object} as #{basename}"}

  File.open(path, 'w') do |f|
    f.write(JSON.dump(object.to_h(include_foreign_objects: true)))
  end

  begin
    object.validate!
  rescue JSON::Schema::ValidationError => e
    warn {e.message}
  end
end
|
226
|
+
|
227
|
+
# Reads every JSON document in the output directory and instantiates the
# class named by the document's `_type` with the document's data.
#
# @return [Hash] a hash of scraped objects keyed by ID
def load_scraped_objects
  Dir[File.join(@output_dir, '*.json')].each_with_object({}) do |path, objects|
    attributes = JSON.load(File.read(path))
    model = attributes['_type'].camelize.constantize
    scraped = model.new(attributes)
    objects[scraped._id] = scraped
  end
end
|
239
|
+
|
240
|
+
# Drops every duplicate ("loser") from the given hash and points the foreign
# keys of the remaining objects at the object each duplicate was a copy of
# ("winner"). The given hash is modified in place.
#
# @param [Hash] objects a hash of scraped objects keyed by ID
# @return [Hash] the objects without duplicates
def deduplicate(objects)
  replacements = build_losers_to_winners_map(objects)

  # Remove all losers.
  replacements.each_key { |loser_id| objects.delete(loser_id) }

  # Swap the IDs of losers for the IDs of winners.
  objects.each_value do |object|
    object.foreign_keys.each do |property|
      current = object[property]
      object[property] = replacements[current] if current && replacements.key?(current)
    end
  end

  objects
end
|
264
|
+
|
265
|
+
# Compares each object with the objects that follow it and records, for every
# later object that is equal (`==`) to an earlier one, the ID of the earlier
# object.
#
# @param [Hash] objects a hash of scraped objects keyed by ID
# @return [Hash] a mapping from an object ID to the ID of its duplicate
def build_losers_to_winners_map(objects)
  pairs = objects.to_a
  map = {}

  pairs.each_with_index do |(winner_id, winner), position|
    # Don't search for duplicates of duplicates.
    next if map.key?(winner_id)

    pairs[(position + 1)..-1].each do |candidate_id, candidate|
      map[candidate_id] = winner_id if winner == candidate
    end
  end

  map
end
|
282
|
+
|
283
|
+
# Reports whether no object has a foreign object set. If any object has one,
# an evaluation order cannot be derived using a dependency graph.
#
# @param [Hash] objects a hash of scraped objects keyed by ID
# @return [Boolean] whether a dependency graph can be used to derive an
#   evaluation order
def use_dependency_graph?(objects)
  objects.none? do |_id, object|
    object.foreign_objects.any? { |property| object[property].present? }
  end
end
|
299
|
+
|
300
|
+
# Builds a dependency graph in which each object ID maps to the list of the
# object's non-empty foreign key values.
#
# @param [Hash] objects a hash of scraped objects keyed by ID
# @return [DependencyGraph] the dependency graph
def build_dependency_graph(objects)
  graph = DependencyGraph.new

  objects.each do |id, object|
    references = object.foreign_keys.map { |property| object[property] }
    # Every ID gets an entry, even if the object depends on nothing.
    graph[id] = references.select { |reference| reference }
  end

  graph
end
|
317
|
+
|
318
|
+
# Replaces the object ID held by each of an object's foreign keys with the
# value that the given map holds for that object ID. Empty foreign keys are
# left untouched.
#
# NOTE(review): earlier documentation advertised
# Pupa::Errors::MissingDatabaseIdError, but nothing in this method raises it;
# an object ID that is missing from the map is replaced by nil.
#
# @param [Object] object an object
# @param [Hash] map a map from object ID to database ID
def resolve_foreign_keys(object, map)
  object.foreign_keys.each do |property|
    object_id = object[property]
    next unless object_id

    # If using a dependency graph, any foreign key that cannot be resolved
    # will cause a key error while building the dependency graph.
    #
    # If not using a dependency graph, this method will not be called
    # unless the foreign key is resolvable.
    object[property] = map[object_id]
  end
end
|
336
|
+
|
337
|
+
# For each of an object's foreign objects that is set, looks the foreign
# object up with `Persistence.find` and stores the `_id` of the result in the
# object's `<property>_id` property.
#
# NOTE(review): earlier documentation advertised
# Pupa::Errors::MissingDatabaseIdError, but nothing in this method raises it.
#
# @param [Object] object an object
def resolve_foreign_objects(object)
  object.foreign_objects.each do |property|
    selector = object[property]
    next unless selector.present?

    # This method will not be called unless the foreign key is resolvable.
    match = Persistence.find(selector)
    object["#{property}_id"] = match['_id']
  end
end
|
350
|
+
end
|
351
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Pupa
|
2
|
+
class Refinements
|
3
|
+
# A refinement for the Faraday caching middleware to cache all requests, not
|
4
|
+
# only GET requests.
|
5
|
+
module FaradayMiddleware
|
6
|
+
refine ::FaradayMiddleware::Caching do
|
7
|
+
# Serves the response from the cache, whatever the request method.
#
# The check on the request method has been removed so that any request is
# cached, not only GET requests.
def call(env)
  # callback mode
  return cache_on_complete(env) if env[:parallel_manager]

  # synchronous mode
  cached = cache.fetch(cache_key(env)) { @app.call(env) }
  finalize_response(cached, env)
end
|
18
|
+
|
19
|
+
# Derives the cache key from the request URI, without the ignored query
# string parameters, followed by the request body.
def cache_key(env)
  url = env[:url].dup

  if url.query && params_to_ignore.any?
    kept = parse_query(url.query)
    kept.delete_if { |name, _value| params_to_ignore.include?(name) }
    url.query = build_query(kept)
  end

  url.normalize!

  # Add for POST requests.
  "#{url.request_uri}#{env[:body]}"
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Pupa
|
2
|
+
class Refinements
|
3
|
+
# A refinement for JSON Schema to validate "email" and "uri" formats.
|
4
|
+
module Format
|
5
|
+
# Validates the "email" and "uri" formats, and defers to the original
# implementation for any other format.
#
# A value fails the "email" format if it is not a string, if it cannot be
# parsed as an address, if the parsed address differs from the value, or if
# its domain does not have at least two dot-separated parts. A value fails the
# "uri" format if it is not a string or does not match the default URI
# parser's pattern for absolute URIs.
#
# @see http://my.rails-royce.org/2010/07/21/email-validation-in-ruby-on-rails-without-regexp/
def validate(current_schema, data, fragments, processor, validator, options = {})
  case current_schema.schema['format']
  when 'email'
    valid = false
    if String === data
      begin
        # Parsing is inside the `begin` so that an unparseable address is
        # reported as a validation error instead of escaping as an exception.
        address = Mail::Address.new(data)
        valid = address.address == data && address.domain && address.__send__(:tree).domain.dot_atom_text.elements.size > 1
      rescue StandardError
        valid = false
      end
    end

    unless valid
      validation_error(processor, "The property '#{build_fragment(fragments)}' must be a valid email address", fragments, current_schema, self, options[:record_errors])
    end
    nil
  when 'uri'
    unless String === data && URI::DEFAULT_PARSER.regexp[:ABS_URI].match(data)
      validation_error(processor, "The property '#{build_fragment(fragments)}' must be a valid URI", fragments, current_schema, self, options[:record_errors])
    end
    nil
  else
    super
  end
end
|
28
|
+
end
|
29
|
+
|
30
|
+
# Prepend the format validations to the class-level methods of JSON Schema's
# format attribute, so that `Format#validate` runs first and can defer to the
# original with `super`.
::JSON::Schema::FormatAttribute.singleton_class.send(:prepend, Pupa::Refinements::Format)
|
35
|
+
end
|
36
|
+
end
|
data/lib/pupa/runner.rb
ADDED
@@ -0,0 +1,185 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
require 'optparse'
|
3
|
+
require 'ostruct'
|
4
|
+
|
5
|
+
require 'moped'
|
6
|
+
|
7
|
+
module Pupa
|
8
|
+
class Runner
|
9
|
+
attr_reader :options, :actions
|
10
|
+
|
11
|
+
# Stores the processor class, the default options merged with the given
# defaults, and the built-in actions.
#
# @param [Pupa::Processor] processor_class a processor class
# @param [Hash] defaults change any default options
def initialize(processor_class, defaults = {})
  @processor_class = processor_class

  built_in_options = {
    actions: [],
    tasks: [],
    output_dir: File.expand_path('scraped_data', Dir.pwd),
    cache_dir: File.expand_path('web_cache', Dir.pwd),
    expires_in: 86400, # 1 day
    host_with_port: 'localhost:27017',
    database: 'pupa',
    dry_run: false,
    level: 'INFO',
  }
  @options = OpenStruct.new(built_in_options.merge(defaults))

  built_in_actions = {
    'scrape' => 'Scrapes data from online sources',
    'import' => 'Imports scraped data into a database',
  }
  @actions = built_in_actions.map do |name, description|
    OpenStruct.new(name: name, description: description)
  end
end
|
35
|
+
|
36
|
+
# Appends an action to the list of actions.
#
# @param [Hash] attributes the action's attributes
# @option attributes [String] :name the action's label
# @option attributes [String] :description a description of the action
def add_action(attributes)
  @actions.push(OpenStruct.new(attributes))
end
|
42
|
+
|
43
|
+
# Returns the command-line option parser.
#
# The parser is built on the first call and memoized, so it lists the actions
# and tasks that were known at that time.
#
# `-v` selects `--verbose`. `--version` has no short switch: when `-v` was
# declared for both, the `--verbose` declaration took precedence, so the `-v`
# advertised for `--version` never printed the version.
#
# @return [OptionParser] the command-line option parser
def opts
  @opts ||= OptionParser.new do |opts|
    opts.program_name = File.basename($PROGRAM_NAME)
    opts.banner = "Usage: #{opts.program_name}"

    opts.separator ''
    opts.separator 'Actions:'

    names = @actions.map(&:name)
    # `max` is nil if there are no actions.
    padding = names.map(&:size).max.to_i
    @actions.each do |action|
      opts.separator " #{action.name.ljust(padding)} #{action.description}\n"
    end

    opts.separator ''
    opts.separator 'Tasks:'

    @processor_class.tasks.each do |task_name|
      opts.separator " #{task_name}"
    end

    opts.separator ''
    opts.separator 'Specific options:'
    opts.on('-a', '--action ACTION', names, 'Select an action to run (you may give this switch multiple times)', " (#{names.join(', ')})") do |v|
      options.actions << v
    end
    opts.on('-t', '--task TASK', @processor_class.tasks, 'Select a scraping task to run (you may give this switch multiple times)', " (#{@processor_class.tasks.join(', ')})") do |v|
      options.tasks << v
    end
    opts.on('-o', '--output_dir PATH', 'The directory in which to dump JSON documents') do |v|
      options.output_dir = v
    end
    opts.on('-c', '--cache_dir PATH', 'The directory in which to cache HTTP requests') do |v|
      options.cache_dir = v
    end
    # Coerce to an integer, like the default value.
    opts.on('-e', '--expires_in SECONDS', Integer, "The cache's expiration time in seconds") do |v|
      options.expires_in = v
    end
    opts.on('-H', '--host HOST:PORT', 'The host and port to MongoDB') do |v|
      options.host_with_port = v
    end
    opts.on('-d', '--database NAME', 'The name of the MongoDB database') do |v|
      options.database = v
    end
    opts.on('-n', '--dry-run', 'Show the plan without running any actions') do
      options.dry_run = true
    end
    opts.on('-v', '--verbose', 'Show all messages') do
      options.level = 'DEBUG'
    end
    opts.on('-q', '--quiet', 'Show only warning and error messages') do
      options.level = 'WARN'
    end
    opts.on('-s', '--silent', 'Show no messages') do
      options.level = 'UNKNOWN'
    end

    opts.separator ''
    opts.separator 'Common options:'
    opts.on_tail('-h', '--help', 'Show this message') do
      puts opts
      exit
    end
    opts.on_tail('--version', 'Show version') do
      puts Pupa::VERSION
      exit
    end
  end
end
|
115
|
+
|
116
|
+
# Parses the command-line arguments, then runs the selected actions with a
# new processor.
#
# Without selected actions, "scrape" and "import" are run; without selected
# tasks, all the processor class's tasks are run. The arguments left over by
# the option parser are given to the processor as its `options`, as
# alternating keys and values. On a dry run, the process exits after the plan
# is printed.
#
# @example Run from a command-line script
#
#     runner.run(ARGV)
#
# @example Override the command-line options
#
#     runner.run(ARGV, expires_in: 3600) # 1 hour
#
# @param [Array] args command-line arguments
# @param [Hash] overrides any overridden options
def run(args, overrides = {})
  rest = opts.parse!(args)

  @options = OpenStruct.new(options.to_h.merge(overrides))

  options.actions = %w(scrape import) if options.actions.empty?
  options.tasks = @processor_class.tasks if options.tasks.empty?

  processor = @processor_class.new(options.output_dir, cache_dir: options.cache_dir, expires_in: options.expires_in, level: options.level, options: Hash[*rest])

  # "scrape" is handled by the runner; any other action must be a method that
  # the processor responds to.
  unknown = options.actions.find do |action|
    action != 'scrape' && !processor.respond_to?(action)
  end
  if unknown
    abort %(`#{unknown}` is not a #{opts.program_name} action. See `#{opts.program_name} --help` for a list of available actions.)
  end

  if %w(DEBUG INFO).include?(options.level)
    puts "processor: #{@processor_class}"
    puts "actions: #{options.actions.join(', ')}"
    puts "tasks: #{options.tasks.join(', ')}"
  end

  if options.level == 'DEBUG'
    %w(output_dir cache_dir expires_in host_with_port database level).each do |option|
      puts "#{option}: #{options[option]}"
    end
    puts "options: #{rest.join(' ')}" unless rest.empty?
  end

  exit if options.dry_run

  Pupa.session = Moped::Session.new([options.host_with_port], database: options.database)

  # `delete` removes "scrape" from the list, so that it is not sent to the
  # processor below.
  if options.actions.delete('scrape')
    FileUtils.mkdir_p(options.output_dir)
    FileUtils.mkdir_p(options.cache_dir)

    # Clear the JSON documents of any previous run.
    Dir[File.join(options.output_dir, '*.json')].each { |path| FileUtils.rm(path) }

    options.tasks.each { |task_name| processor.dump_scraped_objects(task_name) }
  end

  options.actions.each { |action| processor.send(action) }
end
|
184
|
+
end
|
185
|
+
end
|
data/lib/pupa/version.rb
ADDED
data/lib/pupa.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
|
3
|
+
require 'active_support/concern'
|
4
|
+
require 'active_support/core_ext/class/attribute'
|
5
|
+
require 'active_support/core_ext/object/blank'
|
6
|
+
require 'active_support/inflector'
|
7
|
+
|
8
|
+
require 'pupa/errors'
|
9
|
+
require 'pupa/logger'
|
10
|
+
require 'pupa/processor'
|
11
|
+
require 'pupa/runner'
|
12
|
+
|
13
|
+
require 'pupa/models/concerns/contactable'
|
14
|
+
require 'pupa/models/concerns/identifiable'
|
15
|
+
require 'pupa/models/concerns/linkable'
|
16
|
+
require 'pupa/models/concerns/nameable'
|
17
|
+
require 'pupa/models/concerns/sourceable'
|
18
|
+
require 'pupa/models/concerns/timestamps'
|
19
|
+
|
20
|
+
require 'pupa/models/base'
|
21
|
+
require 'pupa/models/contact_detail_list'
|
22
|
+
require 'pupa/models/membership'
|
23
|
+
require 'pupa/models/organization'
|
24
|
+
require 'pupa/models/person'
|
25
|
+
require 'pupa/models/post'
|
26
|
+
|
27
|
+
module Pupa
  # Module-level reader and writer for `session`.
  singleton_class.send(:attr_accessor, :session)
end
|
data/pupa.gemspec
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/pupa/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for pupa. The packaged files, test files and executables
# are listed by git.
Gem::Specification.new do |spec|
  spec.name        = 'pupa'
  spec.version     = Pupa::VERSION
  spec.platform    = Gem::Platform::RUBY
  spec.authors     = ['Open North']
  spec.email       = ['info@opennorth.ca']
  spec.homepage    = 'http://github.com/opennorth/pupa-ruby'
  spec.summary     = 'A data scraping framework'
  spec.license     = 'MIT'

  spec.files         = %x(git ls-files).split("\n")
  spec.test_files    = %x(git ls-files -- {test,spec,features}/*).split("\n")
  spec.executables   = %x(git ls-files -- bin/*).split("\n").map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  spec.add_runtime_dependency 'activesupport', '~> 4.0.0'
  spec.add_runtime_dependency 'colored', '~> 1.2'
  spec.add_runtime_dependency 'faraday_middleware', '~> 0.9.0'
  spec.add_runtime_dependency 'json-schema', '~> 2.1.3'
  spec.add_runtime_dependency 'mail'
  spec.add_runtime_dependency 'moped', '~> 1.5.1'
  spec.add_runtime_dependency 'nokogiri', '~> 1.6.0'

  spec.add_development_dependency 'coveralls'
  spec.add_development_dependency 'json', '~> 1.7.7' # to silence coveralls warning
  spec.add_development_dependency 'octokit' # to update Popolo schema
  spec.add_development_dependency 'rake'
  spec.add_development_dependency 'rspec', '~> 2.10'
  spec.add_development_dependency 'vcr', '~> 2.5.0'
  spec.add_development_dependency 'multi_xml'
end
|