pupa 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +6 -0
  3. data/.travis.yml +5 -0
  4. data/.yardopts +4 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE +20 -0
  7. data/README.md +52 -0
  8. data/Rakefile +37 -0
  9. data/USAGE +1 -0
  10. data/lib/pupa/errors.rb +30 -0
  11. data/lib/pupa/logger.rb +37 -0
  12. data/lib/pupa/models/base.rb +190 -0
  13. data/lib/pupa/models/concerns/contactable.rb +34 -0
  14. data/lib/pupa/models/concerns/identifiable.rb +26 -0
  15. data/lib/pupa/models/concerns/linkable.rb +26 -0
  16. data/lib/pupa/models/concerns/nameable.rb +34 -0
  17. data/lib/pupa/models/concerns/sourceable.rb +26 -0
  18. data/lib/pupa/models/concerns/timestamps.rb +22 -0
  19. data/lib/pupa/models/contact_detail_list.rb +28 -0
  20. data/lib/pupa/models/membership.rb +37 -0
  21. data/lib/pupa/models/organization.rb +40 -0
  22. data/lib/pupa/models/person.rb +35 -0
  23. data/lib/pupa/models/post.rb +28 -0
  24. data/lib/pupa/processor/client.rb +42 -0
  25. data/lib/pupa/processor/dependency_graph.rb +18 -0
  26. data/lib/pupa/processor/helper.rb +15 -0
  27. data/lib/pupa/processor/middleware/logger.rb +37 -0
  28. data/lib/pupa/processor/middleware/parse_html.rb +16 -0
  29. data/lib/pupa/processor/persistence.rb +80 -0
  30. data/lib/pupa/processor/yielder.rb +50 -0
  31. data/lib/pupa/processor.rb +351 -0
  32. data/lib/pupa/refinements/faraday_middleware.rb +32 -0
  33. data/lib/pupa/refinements/json-schema.rb +36 -0
  34. data/lib/pupa/runner.rb +185 -0
  35. data/lib/pupa/version.rb +3 -0
  36. data/lib/pupa.rb +31 -0
  37. data/pupa.gemspec +34 -0
  38. data/schemas/popolo/contact_detail.json +44 -0
  39. data/schemas/popolo/identifier.json +18 -0
  40. data/schemas/popolo/link.json +19 -0
  41. data/schemas/popolo/membership.json +86 -0
  42. data/schemas/popolo/organization.json +104 -0
  43. data/schemas/popolo/other_name.json +28 -0
  44. data/schemas/popolo/person.json +130 -0
  45. data/schemas/popolo/post.json +78 -0
  46. data/spec/cassettes/31ac91ccad069eefc07d96cfbe66fa66c1b41fcf.yml +56 -0
  47. data/spec/cassettes/4ff54d737afb5d693653752d7bf234a405a80172.yml +48 -0
  48. data/spec/cassettes/898049a22e6ca51dfa2510d9e0e0207a5c396524.yml +54 -0
  49. data/spec/cassettes/ce69ff734ce852d2bfaa482bbf55d7ffb4762e87.yml +26 -0
  50. data/spec/cassettes/da629b01e0836deda8a5540a4e6a08783dd7aef9.yml +46 -0
  51. data/spec/cassettes/e398f35bea86b3d4c87a6934bae1eb7fca8744f9.yml +26 -0
  52. data/spec/logger_spec.rb +4 -0
  53. data/spec/models/base_spec.rb +194 -0
  54. data/spec/models/concerns/contactable_spec.rb +37 -0
  55. data/spec/models/concerns/identifiable_spec.rb +25 -0
  56. data/spec/models/concerns/linkable_spec.rb +25 -0
  57. data/spec/models/concerns/nameable_spec.rb +25 -0
  58. data/spec/models/concerns/sourceable_spec.rb +25 -0
  59. data/spec/models/concerns/timestamps_spec.rb +32 -0
  60. data/spec/models/contact_detail_list_spec.rb +44 -0
  61. data/spec/models/membership_spec.rb +30 -0
  62. data/spec/models/organization_spec.rb +24 -0
  63. data/spec/models/person_spec.rb +24 -0
  64. data/spec/models/post_spec.rb +19 -0
  65. data/spec/processor/client_spec.rb +4 -0
  66. data/spec/processor/dependency_graph_spec.rb +4 -0
  67. data/spec/processor/helper_spec.rb +4 -0
  68. data/spec/processor/middleware/logger_spec.rb +87 -0
  69. data/spec/processor/middleware/parse_html_spec.rb +92 -0
  70. data/spec/processor/persistence_spec.rb +41 -0
  71. data/spec/processor/yielder_spec.rb +55 -0
  72. data/spec/processor_spec.rb +268 -0
  73. data/spec/runner_spec.rb +85 -0
  74. data/spec/spec_helper.rb +17 -0
  75. metadata +342 -0
@@ -0,0 +1,351 @@
1
+ require 'json'
2
+
3
+ require 'nokogiri'
4
+
5
+ require 'pupa/processor/client'
6
+ require 'pupa/processor/dependency_graph'
7
+ require 'pupa/processor/helper'
8
+ require 'pupa/processor/persistence'
9
+ require 'pupa/processor/yielder'
10
+
11
+ module Pupa
12
+ # An abstract processor class from which specific processors inherit.
13
+ class Processor
14
+ extend Forwardable
15
+ include Helper
16
+
17
+ class_attribute :tasks
18
+ self.tasks = []
19
+
20
+ def_delegators :@logger, :debug, :info, :warn, :error, :fatal
21
+
22
# Builds a processor together with its logger and HTTP client.
#
# @param [String] output_dir the directory in which to dump JSON documents
# @param [String] cache_dir the directory in which to cache HTTP responses
# @param [Integer] expires_in the cache's expiration time in seconds
# @param [String] level the log level
# @param [String,IO] logdev the log device
# @param [Hash] options criteria for selecting the methods to run
def initialize(output_dir, cache_dir: nil, expires_in: 86400, level: 'INFO', logdev: STDOUT, options: {})
  @output_dir, @options, @level = output_dir, options, level

  # The logger and the HTTP client are both configured with the same level.
  logger_settings = {level: level, logdev: logdev}
  client_settings = {cache_dir: cache_dir, expires_in: expires_in, level: level}

  @logger = Logger.new('pupa', **logger_settings)
  @client = Client.new(**client_settings)
end
35
+
36
+ # Retrieves and parses a document with a GET request.
37
+ #
38
+ # @param [String] url a URL to an HTML document
39
+ # @param [String,Hash] params query string parameters
40
+ # @return a parsed document
41
+ def get(url, params = {})
42
+ # Faraday requires `params` to be a hash.
43
+ if String === params
44
+ params = CGI.parse(params)
45
+
46
+ # Flatten the parameters for Faraday.
47
+ params.each do |key,value|
48
+ if Array === value && value.size == 1
49
+ params[key] = value.first
50
+ end
51
+ end
52
+ end
53
+
54
+ @client.get(url, params).body
55
+ end
56
+
57
# Issues a POST request through the processor's client and returns the
# response body.
#
# @param [String] url the URL to request
# @param [String,Hash] params the parameters handed to the client's `post`
# @return the body of the client's response
def post(url, params = {})
  response = @client.post(url, params)
  response.body
end
65
+
66
# Registers a scraping task on the processor class.
#
# The task name is appended to the class's `tasks`, and an instance method
# with the same name is defined. On its first call, that method asks
# `scraping_task_method` which method performs the task, wraps that method in
# a `Yielder` and memoizes the result in an instance variable named after the
# task; later calls return the memoized object.
#
# For example, `MyProcessor.add_scraping_task(:people)` defines a `people`
# method on `MyProcessor`. With the default `scraping_task_method`, the work
# is done by a `scrape_people` method, which must yield the objects it
# scrapes. Override `scraping_task_method` to select a different method.
#
# The `people` method can then be called by transformation and import tasks.
#
# @param [Symbol] task_name a task name
# @see Pupa::Processor#scraping_task_method
def self.add_scraping_task(task_name)
  self.tasks += [task_name]

  define_method(task_name) do
    ivar = "@#{task_name}"

    # Build the yielder only once per processor instance.
    unless instance_variable_defined?(ivar)
      scraper = method(scraping_task_method(task_name))
      instance_variable_set(ivar, Yielder.new(&scraper))
    end

    instance_variable_get(ivar)
  end
end
99
+
100
# Runs a scraping task and dumps every object it produces to disk.
#
# @param [Symbol] task_name the name of the scraping task to perform
def dump_scraped_objects(task_name)
  scraped = send(task_name)
  scraped.each { |scraped_object| dump_scraped_object(scraped_object) }
end
108
+
109
# Saves scraped objects to a database.
#
# The scraped objects are loaded from disk and deduplicated. When no object
# has a foreign object, a dependency graph gives the order in which to save
# them. Otherwise, objects are saved in repeated passes, each pass saving the
# objects whose foreign keys and foreign objects can currently be resolved.
# Finally, the method checks that no two objects were written to the same
# database document.
#
# @raise [TSort::Cyclic] if the dependency graph is cyclic (presumably raised
#   by `tsort`)
# @raise [Pupa::Errors::UnprocessableEntity] if an object's foreign keys or
#   foreign objects cannot be resolved
# @raise [Pupa::Errors::DuplicateDocumentError] if duplicate objects were
#   inadvertently saved to the database
def import
  objects = deduplicate(load_scraped_objects)

  # Maps each saved object's ID to the ID returned by `Persistence#save`.
  database_ids = {}

  if use_dependency_graph?(objects)
    # The dependency graph strategy only works if there are no foreign objects.
    build_dependency_graph(objects).tsort.each do |id|
      object = objects[id]
      # Replace object IDs with database IDs in foreign keys, then save.
      resolve_foreign_keys(object, database_ids)
      database_ids[id] = Persistence.new(object).save
    end
  else
    total = objects.size

    # Should be O(n²). If there are foreign objects, we do not know all the
    # edges in the graph, and therefore cannot build a dependency graph or
    # derive any evaluation order.
    #
    # An exception is raised if a foreign object matches multiple documents
    # in the database. However, if a matching object is not yet saved, this
    # exception may not be raised.
    loop do
      progress_made = false

      objects.delete_if do |id, object|
        # Both checks are always evaluated, even when the first one fails.
        keys_resolvable = object.foreign_keys.all? do |property|
          value = object[property]
          value.nil? || database_ids.key?(value)
        end

        objects_resolvable = object.foreign_objects.all? do |property|
          selector = object[property]
          selector.blank? || Persistence.find(selector)
        end

        if keys_resolvable && objects_resolvable
          progress_made = true
          resolve_foreign_keys(object, database_ids)
          resolve_foreign_objects(object)
          # The saved ID is also the block's value, which decides deletion.
          database_ids[id] = Persistence.new(object).save
        end
      end

      break if objects.empty? || !progress_made
    end

    unless objects.empty?
      raise Errors::UnprocessableEntity, "couldn't resolve #{objects.size}/#{total} objects:\n #{objects.values.map{|object| JSON.dump(object.foreign_properties)}.join("\n ")}"
    end
  end

  # Ensure that fingerprints uniquely identified objects: group the object
  # IDs by the database ID they were saved under.
  object_ids_by_database_id = database_ids.each_with_object({}) do |(object_id, database_id), grouped|
    (grouped[database_id] ||= []) << object_id
  end
  duplicates = object_ids_by_database_id.reject do |_, object_ids|
    object_ids.size <= 1
  end
  return if duplicates.empty?

  raise Errors::DuplicateDocumentError, "multiple objects written to same document:\n" + duplicates.map{|database_id,object_ids| " #{database_id} <- #{object_ids.join(' ')}"}.join("\n")
end
185
+
186
+ private
187
+
188
# Names the method that performs the given scraping task, which is
# `scrape_<task_name>` by default.
#
# Override this method in a subclass to change how the method is selected,
# for example according to the additional `options` passed from the
# command-line to the processor.
#
# @param [Symbol] task_name a task name
# @return [String] the name of the method to perform the scraping task
def scraping_task_method(task_name)
  'scrape_' + task_name.to_s
end
200
+
201
# Dumps a scraped object to disk as a JSON document.
#
# The file is named after the object's underscored class name and its `_id`.
# The object is validated after it is written, and a validation failure is
# logged as a warning rather than raised.
#
# @param [Object] object a scraped object
# @raise [Pupa::Errors::DuplicateObjectIdError] if a file already exists for
#   the object's ID
def dump_scraped_object(object)
  type = object.class.to_s.demodulize.underscore
  basename = "#{type}_#{object._id}.json"
  path = File.join(@output_dir, basename)

  raise Errors::DuplicateObjectIdError, "duplicate object ID: #{object._id} (was the same objected yielded twice?)" if File.exist?(path)

  info {"save #{type} #{object.to_s} as #{basename}"}

  File.open(path, 'w') do |file|
    document = object.to_h(include_foreign_objects: true)
    file.write(JSON.dump(document))
  end

  begin
    object.validate!
  rescue JSON::Schema::ValidationError => error
    # An invalid object has already been written; only report the problem.
    warn {error.message}
  end
end
226
+
227
# Loads the scraped objects from the JSON files in the output directory.
#
# Each document's `_type` names the class to instantiate with the document's
# data.
#
# @return [Hash] a hash of scraped objects keyed by ID
def load_scraped_objects
  Dir[File.join(@output_dir, '*.json')].each_with_object({}) do |path, objects|
    data = JSON.load(File.read(path))
    model = data['_type'].camelize.constantize
    object = model.new(data)
    objects[object._id] = object
  end
end
239
+
240
# Removes duplicate objects and points the remaining objects' foreign keys
# at the objects that were kept.
#
# The given hash is modified in place and returned.
#
# @param [Hash] objects a hash of scraped objects keyed by ID
# @return [Hash] the objects without duplicates
def deduplicate(objects)
  winners_by_loser = build_losers_to_winners_map(objects)

  # Remove all losers.
  winners_by_loser.each_key { |loser_id| objects.delete(loser_id) }

  # Swap the IDs of losers for the IDs of winners.
  objects.each_value do |object|
    object.foreign_keys.each do |property|
      referenced_id = object[property]
      next unless referenced_id && winners_by_loser.key?(referenced_id)
      object[property] = winners_by_loser[referenced_id]
    end
  end

  objects
end
264
+
265
# Maps the ID of every duplicate object to the ID of the first object it is
# equal to (compared with `==`).
#
# @param [Hash] objects a hash of scraped objects keyed by ID
# @return [Hash] a mapping from an object ID to the ID of its duplicate
def build_losers_to_winners_map(objects)
  pairs = objects.to_a
  losers = {}

  pairs.each_with_index do |(winner_id, winner), position|
    # Don't search for duplicates of duplicates.
    next if losers.key?(winner_id)

    # Only compare against the objects that come after this one.
    pairs[(position + 1)..-1].each do |candidate_id, candidate|
      losers[candidate_id] = winner_id if winner == candidate
    end
  end

  losers
end
282
+
283
# Reports whether a dependency graph can be used to derive an evaluation
# order, which is only the case when no object has a foreign object set.
#
# @param [Hash] objects a hash of scraped objects keyed by ID
# @return [Boolean] whether a dependency graph can be used to derive an
#   evaluation order
def use_dependency_graph?(objects)
  objects.each_value.none? do |object|
    object.foreign_objects.any? { |property| object[property].present? }
  end
end
299
+
300
# Builds a dependency graph in which every object ID maps to the IDs that
# the object's foreign keys refer to.
#
# @param [Hash] objects a hash of scraped objects keyed by ID
# @return [DependencyGraph] the dependency graph
def build_dependency_graph(objects)
  graph = DependencyGraph.new

  objects.each do |id, object|
    referenced_ids = object.foreign_keys.map { |property| object[property] }
    # Every object gets an entry (there are no duplicate IDs); foreign keys
    # that are not set contribute no edge.
    graph[id] = referenced_ids.select { |referenced_id| referenced_id }
  end

  graph
end
317
+
318
# Resolves an object's foreign keys from object IDs to database IDs.
#
# Foreign keys that are not set are left alone. A foreign key whose object ID
# is absent from the map is set to whatever the map returns for it.
#
# NOTE(review): earlier documentation listed
# Pupa::Errors::MissingDatabaseIdError as raised here, but nothing in this
# method raises it directly - confirm before relying on it.
#
# @param [Object] object an object
# @param [Hash] map a map from object ID to database ID
def resolve_foreign_keys(object, map)
  object.foreign_keys.each do |property|
    referenced_id = object[property]
    next unless referenced_id

    # If using a dependency graph, any foreign key that cannot be resolved
    # will cause a key error while building the dependency graph.
    #
    # If not using a dependency graph, this method will not be called
    # unless the foreign key is resolvable.
    object[property] = map[referenced_id]
  end
end
336
+
337
# Resolves an object's foreign objects to database IDs.
#
# For every foreign object that is set, the matching document is looked up
# with `Persistence.find` and its `_id` is stored in the object's
# `<property>_id` property.
#
# NOTE(review): earlier documentation listed
# Pupa::Errors::MissingDatabaseIdError as raised here, but nothing in this
# method raises it directly - confirm before relying on it.
#
# @param [Object] object an object
def resolve_foreign_objects(object)
  object.foreign_objects.each do |property|
    selector = object[property]
    next unless selector.present?

    # This method will not be called unless the foreign object is resolvable.
    document = Persistence.find(selector)
    object["#{property}_id"] = document['_id']
  end
end
350
+ end
351
+ end
@@ -0,0 +1,32 @@
1
module Pupa
  class Refinements
    # A refinement for the Faraday caching middleware to cache all requests, not
    # only GET requests.
    module FaradayMiddleware
      refine ::FaradayMiddleware::Caching do
        # Serves the request through the cache.
        #
        # There is deliberately no check on the request method here, so that
        # any request is cached, not only GET.
        def call(env)
          # callback mode
          return cache_on_complete(env) if env[:parallel_manager]

          # synchronous mode
          response = cache.fetch(cache_key(env)) { @app.call(env) }
          finalize_response(response, env)
        end

        # Builds the cache key from the normalized request URI, minus any
        # ignored query parameters, followed by the request body.
        def cache_key(env)
          url = env[:url].dup

          if url.query && params_to_ignore.any?
            kept = parse_query(url.query)
            kept.reject! { |name, _| params_to_ignore.include?(name) }
            url.query = build_query(kept)
          end

          url.normalize!
          # Add the body for POST requests.
          url.request_uri + env[:body].to_s
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ module Pupa
2
+ class Refinements
3
# A refinement for JSON Schema to validate "email" and "uri" formats.
module Format
  # Validates data against the schema's "email" or "uri" format and defers
  # to the original implementation for every other format.
  #
  # A failure is reported through `validation_error`. The method no longer
  # depends on that call returning a truthy value in order to stop: data that
  # is not a String is always reported as invalid and never reaches the
  # parsers. An exception raised while parsing an email address is likewise
  # reported as a validation failure instead of propagating.
  #
  # @param current_schema the schema whose `format` is being validated
  # @param data the value to validate
  # @param [Array] fragments the path to the property, used in the message
  # @param processor passed through to `validation_error`
  # @param validator unused here; part of the overridden method's signature
  # @param [Hash] options `:record_errors` is passed to `validation_error`
  # @return [nil] for the "email" and "uri" formats
  # @see http://my.rails-royce.org/2010/07/21/email-validation-in-ruby-on-rails-without-regexp/
  def validate(current_schema, data, fragments, processor, validator, options = {})
    case current_schema.schema['format']
    when 'email'
      label = 'email address'
      valid =
        if data.is_a?(String)
          begin
            address = Mail::Address.new(data)
            # The address must round-trip unchanged and its domain must have
            # at least two dot-separated parts.
            address.address == data && address.domain && address.__send__(:tree).domain.dot_atom_text.elements.size > 1
          rescue StandardError
            # An address that cannot be parsed or inspected is invalid.
            false
          end
        end
    when 'uri'
      label = 'URI'
      valid = data.is_a?(String) && URI::DEFAULT_PARSER.regexp[:ABS_URI].match(data)
    else
      return super
    end

    unless valid
      error_message = "The property '#{build_fragment(fragments)}' must be a valid #{label}"
      validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
    end

    nil
  end
end
29
+
30
# Prepend the refinement to the singleton class of JSON Schema's format
# attribute, so that its class-level `validate` is looked up in
# Pupa::Refinements::Format first.
::JSON::Schema::FormatAttribute.singleton_class.send(:prepend, Pupa::Refinements::Format)
35
+ end
36
+ end
@@ -0,0 +1,185 @@
1
+ require 'fileutils'
2
+ require 'optparse'
3
+ require 'ostruct'
4
+
5
+ require 'moped'
6
+
7
+ module Pupa
8
+ class Runner
9
+ attr_reader :options, :actions
10
+
11
# Builds a runner for a processor class, with the default options and the
# built-in `scrape` and `import` actions.
#
# @param [Pupa::Processor] processor_class a processor class
# @param [Hash] defaults change any default options
def initialize(processor_class, defaults = {})
  @processor_class = processor_class

  working_dir = Dir.pwd
  builtin_defaults = {
    actions: [],
    tasks: [],
    output_dir: File.expand_path('scraped_data', working_dir),
    cache_dir: File.expand_path('web_cache', working_dir),
    expires_in: 86400, # 1 day
    host_with_port: 'localhost:27017',
    database: 'pupa',
    dry_run: false,
    level: 'INFO',
  }
  # Caller-supplied defaults win over the built-in ones.
  @options = OpenStruct.new(builtin_defaults.merge(defaults))

  @actions = [
    ['scrape', 'Scrapes data from online sources'],
    ['import', 'Imports scraped data into a database'],
  ].map { |name, description| OpenStruct.new(name: name, description: description) }
end
35
+
36
# Adds an action to the list of actions shown by the option parser.
#
# @param [Hash] attributes the action's attributes
# @option attributes [String] :name the action's label
# @option attributes [String] :description a description of the action
def add_action(attributes)
  action = OpenStruct.new(attributes)
  @actions.push(action)
end
42
+
43
# Returns the command-line option parser, building it on first use.
#
# The parser lists the runner's actions and the processor class's tasks in
# its help output, and stores parsed values on `options`.
#
# `--version` uses the short switch `-V`, because `-v` is already taken by
# `--verbose` and one short switch cannot select two options. The value of
# `--expires_in` is converted to an Integer, which is what the option is
# documented to hold; a non-integer value is rejected by the parser.
#
# @return [OptionParser] the command-line option parser
def opts
  @opts ||= OptionParser.new do |opts|
    opts.program_name = File.basename($PROGRAM_NAME)
    opts.banner = "Usage: #{opts.program_name}"

    opts.separator ''
    opts.separator 'Actions:'

    names = @actions.map(&:name)
    # Pad the action names so that the descriptions line up.
    padding = names.map(&:size).max
    @actions.each do |action|
      opts.separator " #{action.name.ljust(padding)} #{action.description}\n"
    end

    opts.separator ''
    opts.separator 'Tasks:'

    @processor_class.tasks.each do |task_name|
      opts.separator " #{task_name}"
    end

    opts.separator ''
    opts.separator 'Specific options:'
    # Passing the list of names restricts the switch to those values.
    opts.on('-a', '--action ACTION', names, 'Select an action to run (you may give this switch multiple times)', " (#{names.join(', ')})") do |v|
      options.actions << v
    end
    opts.on('-t', '--task TASK', @processor_class.tasks, 'Select a scraping task to run (you may give this switch multiple times)', " (#{@processor_class.tasks.join(', ')})") do |v|
      options.tasks << v
    end
    opts.on('-o', '--output_dir PATH', 'The directory in which to dump JSON documents') do |v|
      options.output_dir = v
    end
    opts.on('-c', '--cache_dir PATH', 'The directory in which to cache HTTP requests') do |v|
      options.cache_dir = v
    end
    opts.on('-e', '--expires_in SECONDS', Integer, "The cache's expiration time in seconds") do |v|
      options.expires_in = v
    end
    opts.on('-H', '--host HOST:PORT', 'The host and port to MongoDB') do |v|
      options.host_with_port = v
    end
    opts.on('-d', '--database NAME', 'The name of the MongoDB database') do |v|
      options.database = v
    end
    opts.on('-n', '--dry-run', 'Show the plan without running any actions') do
      options.dry_run = true
    end
    opts.on('-v', '--verbose', 'Show all messages') do
      options.level = 'DEBUG'
    end
    opts.on('-q', '--quiet', 'Show only warning and error messages') do
      options.level = 'WARN'
    end
    opts.on('-s', '--silent', 'Show no messages') do
      options.level = 'UNKNOWN'
    end

    opts.separator ''
    opts.separator 'Common options:'
    opts.on_tail('-h', '--help', 'Show this message') do
      puts opts
      exit
    end
    opts.on_tail('-V', '--version', 'Show version') do
      puts Pupa::VERSION
      exit
    end
  end
end
115
+
116
# Parses the command-line arguments and runs the selected actions.
#
# Without any selected actions, `scrape` and `import` are run; without any
# selected tasks, all of the processor class's tasks are run. The arguments
# left over after parsing are turned into a hash and handed to the processor
# as its `options`.
#
# @example Run from a command-line script
#
#   runner.run(ARGV)
#
# @example Override the command-line options
#
#   runner.run(ARGV, expires_in: 3600) # 1 hour
#
# @param [Array] args command-line arguments
# @param [Hash] overrides any overridden options
def run(args, overrides = {})
  rest = opts.parse!(args)

  # Overrides take precedence over whatever was parsed from the arguments.
  @options = OpenStruct.new(options.to_h.merge(overrides))

  options.actions = %w(scrape import) if options.actions.empty?
  options.tasks = @processor_class.tasks if options.tasks.empty?

  processor = @processor_class.new(
    options.output_dir,
    cache_dir: options.cache_dir,
    expires_in: options.expires_in,
    level: options.level,
    options: Hash[*rest]
  )

  # `scrape` is handled by the runner itself; every other action must be a
  # method that the processor responds to.
  unknown = options.actions.find do |action|
    action != 'scrape' && !processor.respond_to?(action)
  end
  if unknown
    abort %(`#{unknown}` is not a #{opts.program_name} action. See `#{opts.program_name} --help` for a list of available actions.)
  end

  debugging = options.level == 'DEBUG'

  if debugging || options.level == 'INFO'
    puts "processor: #{@processor_class}"
    puts "actions: #{options.actions.join(', ')}"
    puts "tasks: #{options.tasks.join(', ')}"
  end

  if debugging
    %w(output_dir cache_dir expires_in host_with_port database level).each do |option|
      puts "#{option}: #{options[option]}"
    end
    puts "options: #{rest.join(' ')}" unless rest.empty?
  end

  exit if options.dry_run

  Pupa.session = Moped::Session.new([options.host_with_port], database: options.database)

  # `delete` removes `scrape` from the list, so that the loop over the
  # remaining actions below does not send it to the processor.
  if options.actions.delete('scrape')
    FileUtils.mkdir_p(options.output_dir)
    FileUtils.mkdir_p(options.cache_dir)

    # Clear the documents left over from a previous scrape.
    Dir[File.join(options.output_dir, '*.json')].each { |path| FileUtils.rm(path) }

    options.tasks.each { |task_name| processor.dump_scraped_objects(task_name) }
  end

  options.actions.each { |action| processor.send(action) }
end
184
+ end
185
+ end
@@ -0,0 +1,3 @@
1
module Pupa
  # The version of the gem.
  VERSION = '0.0.1'
end
data/lib/pupa.rb ADDED
@@ -0,0 +1,31 @@
1
+ require 'forwardable'
2
+
3
+ require 'active_support/concern'
4
+ require 'active_support/core_ext/class/attribute'
5
+ require 'active_support/core_ext/object/blank'
6
+ require 'active_support/inflector'
7
+
8
+ require 'pupa/errors'
9
+ require 'pupa/logger'
10
+ require 'pupa/processor'
11
+ require 'pupa/runner'
12
+
13
+ require 'pupa/models/concerns/contactable'
14
+ require 'pupa/models/concerns/identifiable'
15
+ require 'pupa/models/concerns/linkable'
16
+ require 'pupa/models/concerns/nameable'
17
+ require 'pupa/models/concerns/sourceable'
18
+ require 'pupa/models/concerns/timestamps'
19
+
20
+ require 'pupa/models/base'
21
+ require 'pupa/models/contact_detail_list'
22
+ require 'pupa/models/membership'
23
+ require 'pupa/models/organization'
24
+ require 'pupa/models/person'
25
+ require 'pupa/models/post'
26
+
27
module Pupa
  # Returns the module-level session, or nil if none has been assigned.
  def self.session
    @session
  end

  # Assigns the module-level session.
  def self.session=(session)
    @session = session
  end
end
data/pupa.gemspec ADDED
@@ -0,0 +1,34 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/pupa/version', __FILE__)
3
+
4
# The gem specification: metadata, packaged files and dependencies.
Gem::Specification.new do |spec|
  spec.name = "pupa"
  spec.version = Pupa::VERSION
  spec.platform = Gem::Platform::RUBY
  spec.authors = ["Open North"]
  spec.email = ["info@opennorth.ca"]
  spec.homepage = "http://github.com/opennorth/pupa-ruby"
  spec.summary = %q{A data scraping framework}
  spec.license = 'MIT'

  # The packaged files are whatever git tracks.
  spec.files = `git ls-files`.split("\n")
  spec.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Each entry is a gem name, optionally followed by a version requirement.
  runtime_dependencies = [
    ['activesupport', '~> 4.0.0'],
    ['colored', '~> 1.2'],
    ['faraday_middleware', '~> 0.9.0'],
    ['json-schema', '~> 2.1.3'],
    ['mail'],
    ['moped', '~> 1.5.1'],
    ['nokogiri', '~> 1.6.0'],
  ]
  runtime_dependencies.each do |dependency|
    spec.add_runtime_dependency(*dependency)
  end

  development_dependencies = [
    ['coveralls'],
    ['json', '~> 1.7.7'], # to silence coveralls warning
    ['octokit'], # to update Popolo schema
    ['rake'],
    ['rspec', '~> 2.10'],
    ['vcr', '~> 2.5.0'],
    ['multi_xml'],
  ]
  development_dependencies.each do |dependency|
    spec.add_development_dependency(*dependency)
  end
end