pupa 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (75) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +6 -0
  3. data/.travis.yml +5 -0
  4. data/.yardopts +4 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE +20 -0
  7. data/README.md +52 -0
  8. data/Rakefile +37 -0
  9. data/USAGE +1 -0
  10. data/lib/pupa/errors.rb +30 -0
  11. data/lib/pupa/logger.rb +37 -0
  12. data/lib/pupa/models/base.rb +190 -0
  13. data/lib/pupa/models/concerns/contactable.rb +34 -0
  14. data/lib/pupa/models/concerns/identifiable.rb +26 -0
  15. data/lib/pupa/models/concerns/linkable.rb +26 -0
  16. data/lib/pupa/models/concerns/nameable.rb +34 -0
  17. data/lib/pupa/models/concerns/sourceable.rb +26 -0
  18. data/lib/pupa/models/concerns/timestamps.rb +22 -0
  19. data/lib/pupa/models/contact_detail_list.rb +28 -0
  20. data/lib/pupa/models/membership.rb +37 -0
  21. data/lib/pupa/models/organization.rb +40 -0
  22. data/lib/pupa/models/person.rb +35 -0
  23. data/lib/pupa/models/post.rb +28 -0
  24. data/lib/pupa/processor/client.rb +42 -0
  25. data/lib/pupa/processor/dependency_graph.rb +18 -0
  26. data/lib/pupa/processor/helper.rb +15 -0
  27. data/lib/pupa/processor/middleware/logger.rb +37 -0
  28. data/lib/pupa/processor/middleware/parse_html.rb +16 -0
  29. data/lib/pupa/processor/persistence.rb +80 -0
  30. data/lib/pupa/processor/yielder.rb +50 -0
  31. data/lib/pupa/processor.rb +351 -0
  32. data/lib/pupa/refinements/faraday_middleware.rb +32 -0
  33. data/lib/pupa/refinements/json-schema.rb +36 -0
  34. data/lib/pupa/runner.rb +185 -0
  35. data/lib/pupa/version.rb +3 -0
  36. data/lib/pupa.rb +31 -0
  37. data/pupa.gemspec +34 -0
  38. data/schemas/popolo/contact_detail.json +44 -0
  39. data/schemas/popolo/identifier.json +18 -0
  40. data/schemas/popolo/link.json +19 -0
  41. data/schemas/popolo/membership.json +86 -0
  42. data/schemas/popolo/organization.json +104 -0
  43. data/schemas/popolo/other_name.json +28 -0
  44. data/schemas/popolo/person.json +130 -0
  45. data/schemas/popolo/post.json +78 -0
  46. data/spec/cassettes/31ac91ccad069eefc07d96cfbe66fa66c1b41fcf.yml +56 -0
  47. data/spec/cassettes/4ff54d737afb5d693653752d7bf234a405a80172.yml +48 -0
  48. data/spec/cassettes/898049a22e6ca51dfa2510d9e0e0207a5c396524.yml +54 -0
  49. data/spec/cassettes/ce69ff734ce852d2bfaa482bbf55d7ffb4762e87.yml +26 -0
  50. data/spec/cassettes/da629b01e0836deda8a5540a4e6a08783dd7aef9.yml +46 -0
  51. data/spec/cassettes/e398f35bea86b3d4c87a6934bae1eb7fca8744f9.yml +26 -0
  52. data/spec/logger_spec.rb +4 -0
  53. data/spec/models/base_spec.rb +194 -0
  54. data/spec/models/concerns/contactable_spec.rb +37 -0
  55. data/spec/models/concerns/identifiable_spec.rb +25 -0
  56. data/spec/models/concerns/linkable_spec.rb +25 -0
  57. data/spec/models/concerns/nameable_spec.rb +25 -0
  58. data/spec/models/concerns/sourceable_spec.rb +25 -0
  59. data/spec/models/concerns/timestamps_spec.rb +32 -0
  60. data/spec/models/contact_detail_list_spec.rb +44 -0
  61. data/spec/models/membership_spec.rb +30 -0
  62. data/spec/models/organization_spec.rb +24 -0
  63. data/spec/models/person_spec.rb +24 -0
  64. data/spec/models/post_spec.rb +19 -0
  65. data/spec/processor/client_spec.rb +4 -0
  66. data/spec/processor/dependency_graph_spec.rb +4 -0
  67. data/spec/processor/helper_spec.rb +4 -0
  68. data/spec/processor/middleware/logger_spec.rb +87 -0
  69. data/spec/processor/middleware/parse_html_spec.rb +92 -0
  70. data/spec/processor/persistence_spec.rb +41 -0
  71. data/spec/processor/yielder_spec.rb +55 -0
  72. data/spec/processor_spec.rb +268 -0
  73. data/spec/runner_spec.rb +85 -0
  74. data/spec/spec_helper.rb +17 -0
  75. metadata +342 -0
@@ -0,0 +1,351 @@
1
+ require 'json'
2
+
3
+ require 'nokogiri'
4
+
5
+ require 'pupa/processor/client'
6
+ require 'pupa/processor/dependency_graph'
7
+ require 'pupa/processor/helper'
8
+ require 'pupa/processor/persistence'
9
+ require 'pupa/processor/yielder'
10
+
11
+ module Pupa
12
+ # An abstract processor class from which specific processors inherit.
13
+ class Processor
14
+ extend Forwardable
15
+ include Helper
16
+
17
+ class_attribute :tasks
18
+ self.tasks = []
19
+
20
+ def_delegators :@logger, :debug, :info, :warn, :error, :fatal
21
+
22
+ # @param [String] output_dir the directory in which to dump JSON documents
23
+ # @param [String] cache_dir the directory in which to cache HTTP responses
24
+ # @param [Integer] expires_in the cache's expiration time in seconds
25
+ # @param [String] level the log level
26
+ # @param [String,IO] logdev the log device
27
+ # @param [Hash] options criteria for selecting the methods to run
28
+ def initialize(output_dir, cache_dir: nil, expires_in: 86400, level: 'INFO', logdev: STDOUT, options: {})
29
+ @output_dir = output_dir
30
+ @options = options
31
+ @level = level
32
+ @logger = Logger.new('pupa', level: level, logdev: logdev)
33
+ @client = Client.new(cache_dir: cache_dir, expires_in: expires_in, level: level)
34
+ end
35
+
36
+ # Retrieves and parses a document with a GET request.
37
+ #
38
+ # @param [String] url a URL to an HTML document
39
+ # @param [String,Hash] params query string parameters
40
+ # @return a parsed document
41
+ def get(url, params = {})
42
+ # Faraday requires `params` to be a hash.
43
+ if String === params
44
+ params = CGI.parse(params)
45
+
46
+ # Flatten the parameters for Faraday.
47
+ params.each do |key,value|
48
+ if Array === value && value.size == 1
49
+ params[key] = value.first
50
+ end
51
+ end
52
+ end
53
+
54
+ @client.get(url, params).body
55
+ end
56
+
57
+ # Retrieves and parses a document with a POST request.
58
+ #
59
+ # @param [String] url a URL to an HTML document
60
+ # @param [String,Hash] params query string parameters
61
+ # @return a parsed document
62
+ def post(url, params = {})
63
+ @client.post(url, params).body
64
+ end
65
+
66
+ # Adds a scraping task to Pupa.rb.
67
+ #
68
+ # Defines a method whose name is identical to `task_name`. This method
69
+ # selects a method to perform the scraping task using `scraping_task_method`
70
+ # and memoizes its return value. The return value is a lazy enumerator of
71
+ # objects scraped by the selected method. The selected method must yield
72
+ # objects to populate this lazy enumerator.
73
+ #
74
+ # For example, `MyProcessor.add_scraping_task(:people)` defines a `people`
75
+ # method on `MyProcessor`. This `people` method returns a lazy enumerator of
76
+ # objects (presumably Person objects in this case, but the enumerator can
77
+ # contain any object in the general case).
78
+ #
79
+ # In `MyProcessor`, you would define a `scrape_people` method, which must
80
+ # yield objects to populate the lazy enumerator. Alternatively, you may
81
+ # override `scraping_task_method` to change the method selected to perform
82
+ # the scraping task.
83
+ #
84
+ # The `people` method can then be called by transformation and import tasks.
85
+ #
86
+ # @param [Symbol] task_name a task name
87
+ # @see Pupa::Processor#scraping_task_method
88
+ def self.add_scraping_task(task_name)
89
+ self.tasks += [task_name]
90
+ define_method(task_name) do
91
+ ivar = "@#{task_name}"
92
+ if instance_variable_defined?(ivar)
93
+ instance_variable_get(ivar)
94
+ else
95
+ instance_variable_set(ivar, Yielder.new(&method(scraping_task_method(task_name))))
96
+ end
97
+ end
98
+ end
99
+
100
+ # Dumps scraped objects to disk.
101
+ #
102
+ # @param [Symbol] task_name the name of the scraping task to perform
103
+ def dump_scraped_objects(task_name)
104
+ send(task_name).each do |object|
105
+ dump_scraped_object(object)
106
+ end
107
+ end
108
+
109
+ # Saves scraped objects to a database.
110
+ #
111
+ # @raise [TSort::Cyclic] if the dependency graph is cyclic
112
+ # @raise [Pupa::Errors::UnprocessableEntity] if an object's foreign keys or
113
+ # foreign objects cannot be resolved
114
+ # @raise [Pupa::Errors::DuplicateDocumentError] if duplicate objects were
115
+ # inadvertently saved to the database
116
+ def import
117
+ objects = deduplicate(load_scraped_objects)
118
+
119
+ object_id_to_database_id = {}
120
+
121
+ if use_dependency_graph?(objects)
122
+ dependency_graph = build_dependency_graph(objects)
123
+
124
+ # Replace object IDs with database IDs in foreign keys and save objects.
125
+ dependency_graph.tsort.each do |id|
126
+ object = objects[id]
127
+ resolve_foreign_keys(object, object_id_to_database_id)
128
+ # The dependency graph strategy only works if there are no foreign objects.
129
+ object_id_to_database_id[id] = Persistence.new(object).save
130
+ end
131
+ else
132
+ size = objects.size
133
+
134
+ # Should be O(n²). If there are foreign objects, we do not know all the
135
+ # edges in the graph, and therefore cannot build a dependency graph or
136
+ # derive any evaluation order.
137
+ #
138
+ # An exception is raised if a foreign object matches multiple documents
139
+ # in the database. However, if a matching object is not yet saved, this
140
+ # exception may not be raised.
141
+ loop do
142
+ progress_made = false
143
+
144
+ objects.delete_if do |id,object|
145
+ resolvable = true
146
+
147
+ resolvable &= object.foreign_keys.all? do |property|
148
+ value = object[property]
149
+ value.nil? || object_id_to_database_id.key?(value)
150
+ end
151
+
152
+ resolvable &= object.foreign_objects.all? do |property|
153
+ selector = object[property]
154
+ selector.blank? || Persistence.find(selector)
155
+ end
156
+
157
+ if resolvable
158
+ progress_made = true
159
+ resolve_foreign_keys(object, object_id_to_database_id)
160
+ resolve_foreign_objects(object)
161
+ object_id_to_database_id[id] = Persistence.new(object).save
162
+ end
163
+ end
164
+
165
+ break if objects.empty? || !progress_made
166
+ end
167
+
168
+ unless objects.empty?
169
+ raise Errors::UnprocessableEntity, "couldn't resolve #{objects.size}/#{size} objects:\n #{objects.values.map{|object| JSON.dump(object.foreign_properties)}.join("\n ")}"
170
+ end
171
+ end
172
+
173
+ # Ensure that fingerprints uniquely identified objects.
174
+ counts = {}
175
+ object_id_to_database_id.each do |object_id,database_id|
176
+ (counts[database_id] ||= []) << object_id
177
+ end
178
+ duplicates = counts.select do |_,object_ids|
179
+ object_ids.size > 1
180
+ end
181
+ unless duplicates.empty?
182
+ raise Errors::DuplicateDocumentError, "multiple objects written to same document:\n" + duplicates.map{|database_id,object_ids| " #{database_id} <- #{object_ids.join(' ')}"}.join("\n")
183
+ end
184
+ end
185
+
186
+ private
187
+
188
+ # Returns the name of the method - `scrape_<task_name>` by default - that
189
+ # would be used to perform the given scraping task.
190
+ #
191
+ # If you would like to change this default behavior, override this method in
192
+ # a subclass. For example, you may want to select a method according to the
193
+ # additional `options` passed from the command-line to the processor.
194
+ #
195
+ # @param [Symbol] task_name a task name
196
+ # @return [String] the name of the method to perform the scraping task
197
+ def scraping_task_method(task_name)
198
+ "scrape_#{task_name}"
199
+ end
200
+
201
+ # Dumps a scraped object to disk.
202
+ #
203
+ # @param [Object] object a scraped object
204
+ # @raise [Pupa::Errors::DuplicateObjectIdError]
205
def dump_scraped_object(object)
  # The file name is derived from the object's unqualified, underscored class
  # name and its `_id`, e.g. `person_<id>.json`, inside `@output_dir`.
  type = object.class.to_s.demodulize.underscore
  basename = "#{type}_#{object._id}.json"
  path = File.join(@output_dir, basename)

  # An existing file means an object with this ID was already dumped.
  if File.exist?(path)
    raise Errors::DuplicateObjectIdError, "duplicate object ID: #{object._id} (was the same object yielded twice?)"
  end

  info {"save #{type} #{object} as #{basename}"}

  File.open(path, 'w') do |f|
    f.write(JSON.dump(object.to_h(include_foreign_objects: true)))
  end

  # Validation runs after the dump and is advisory only: a schema violation
  # is logged as a warning and does not abort.
  begin
    object.validate!
  rescue JSON::Schema::ValidationError => e
    warn {e.message}
  end
end
226
+
227
+ # Loads scraped objects from disk.
228
+ #
229
+ # @return [Hash] a hash of scraped objects keyed by ID
230
+ def load_scraped_objects
231
+ {}.tap do |objects|
232
+ Dir[File.join(@output_dir, '*.json')].each do |path|
233
+ data = JSON.load(File.read(path))
234
+ object = data['_type'].camelize.constantize.new(data)
235
+ objects[object._id] = object
236
+ end
237
+ end
238
+ end
239
+
240
+ # Removes all duplicate objects and re-assigns any foreign keys.
241
+ #
242
+ # @param [Hash] objects a hash of scraped objects keyed by ID
243
+ # @return [Hash] the objects without duplicates
244
+ def deduplicate(objects)
245
+ losers_to_winners = build_losers_to_winners_map(objects)
246
+
247
+ # Remove all losers.
248
+ losers_to_winners.each_key do |key|
249
+ objects.delete(key)
250
+ end
251
+
252
+ # Swap the IDs of losers for the IDs of winners.
253
+ objects.each do |id,object|
254
+ object.foreign_keys.each do |property|
255
+ value = object[property]
256
+ if value && losers_to_winners.key?(value)
257
+ object[property] = losers_to_winners[value]
258
+ end
259
+ end
260
+ end
261
+
262
+ objects
263
+ end
264
+
265
+ # For each object, map its ID to the ID of its duplicate, if any.
266
+ #
267
+ # @param [Hash] objects a hash of scraped objects keyed by ID
268
+ # @return [Hash] a mapping from an object ID to the ID of its duplicate
269
+ def build_losers_to_winners_map(objects)
270
+ {}.tap do |map|
271
+ objects.each_with_index do |(id1,object1),index|
272
+ unless map.key?(id1) # Don't search for duplicates of duplicates.
273
+ objects.drop(index + 1).each do |id2,object2|
274
+ if object1 == object2
275
+ map[id2] = id1
276
+ end
277
+ end
278
+ end
279
+ end
280
+ end
281
+ end
282
+
283
+ # If any objects have unresolved foreign objects, we cannot derive an
284
+ # evaluation order using a dependency graph.
285
+ #
286
+ # @param [Hash] objects a hash of scraped objects keyed by ID
287
+ # @return [Boolean] whether a dependency graph can be used to derive an
288
+ # evaluation order
289
def use_dependency_graph?(objects)
  # True only when no object has a non-blank foreign object selector.
  # `present?` comes from ActiveSupport's Object#blank? core extension.
  objects.each_value.none? do |object|
    object.foreign_objects.any? do |property|
      object[property].present?
    end
  end
end
299
+
300
+ # Builds a dependency graph.
301
+ #
302
+ # @param [Hash] objects a hash of scraped objects keyed by ID
303
+ # @return [DependencyGraph] the dependency graph
304
+ def build_dependency_graph(objects)
305
+ DependencyGraph.new.tap do |graph|
306
+ objects.each do |id,object|
307
+ graph[id] = [] # no duplicate IDs
308
+ object.foreign_keys.each do |property|
309
+ value = object[property]
310
+ if value
311
+ graph[id] << value
312
+ end
313
+ end
314
+ end
315
+ end
316
+ end
317
+
318
+ # Resolves an object's foreign keys from object IDs to database IDs.
319
+ #
320
+ # @param [Object] object an object
321
+ # @param [Hash] map a map from object ID to database ID
322
+ # @raise [Pupa::Errors::MissingDatabaseIdError]
323
def resolve_foreign_keys(object, map)
  # Replaces every non-nil foreign key on `object` with the database ID that
  # `map` holds for it.
  object.foreign_keys.each do |property|
    value = object[property]
    next unless value

    # Callers are expected to invoke this only once every foreign key is
    # resolvable. Fail loudly rather than silently overwriting the foreign
    # key with nil if that expectation is ever violated.
    unless map.key?(value)
      raise Errors::MissingDatabaseIdError, "couldn't resolve foreign key: #{property} #{value}"
    end

    object[property] = map[value]
  end
end
336
+
337
+ # Resolves an object's foreign objects to database IDs.
338
+ #
339
+ # @param [Object] object an object
340
+ # @raise [Pupa::Errors::MissingDatabaseIdError]
341
def resolve_foreign_objects(object)
  # For each non-blank foreign object selector, looks up the matching document
  # and stores its `_id` under `<property>_id`.
  object.foreign_objects.each do |property|
    selector = object[property]
    next unless selector.present?

    # Callers are expected to invoke this only once the selector is
    # resolvable. Guard anyway, so that a missing document surfaces as a
    # descriptive error rather than a NoMethodError on nil.
    document = Persistence.find(selector)
    unless document
      raise Errors::MissingDatabaseIdError, "couldn't resolve foreign object: #{property} #{selector}"
    end

    object["#{property}_id"] = document['_id']
  end
end
350
+ end
351
+ end
@@ -0,0 +1,32 @@
1
+ module Pupa
2
+ class Refinements
3
+ # A refinement for the Faraday caching middleware to cache all requests, not
4
+ # only GET requests.
5
+ module FaradayMiddleware
6
+ refine ::FaradayMiddleware::Caching do
7
+ def call(env)
8
+ # Remove if-statement to cache any request, not only GET.
9
+ if env[:parallel_manager]
10
+ # callback mode
11
+ cache_on_complete(env)
12
+ else
13
+ # synchronous mode
14
+ response = cache.fetch(cache_key(env)) { @app.call(env) }
15
+ finalize_response(response, env)
16
+ end
17
+ end
18
+
19
+ def cache_key(env)
20
+ url = env[:url].dup
21
+ if url.query && params_to_ignore.any?
22
+ params = parse_query url.query
23
+ params.reject! {|k,| params_to_ignore.include? k }
24
+ url.query = build_query params
25
+ end
26
+ url.normalize!
27
+ url.request_uri + env[:body].to_s # Add for POST requests.
28
+ end
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,36 @@
1
+ module Pupa
2
+ class Refinements
3
+ # A refinement for JSON Schema to validate "email" and "uri" formats.
4
+ module Format
5
+ # @see http://my.rails-royce.org/2010/07/21/email-validation-in-ruby-on-rails-without-regexp/
6
def validate(current_schema, data, fragments, processor, validator, options = {})
  # Handles the "email" and "uri" formats here; every other format is
  # delegated to the original implementation via `super`.
  #
  # Each branch computes validity first and reports at most one error, so the
  # outcome no longer depends on the return value of `validation_error`.
  case current_schema.schema['format']
  when 'email'
    error_message = "The property '#{build_fragment(fragments)}' must be a valid email address"
    valid = false
    if data.is_a?(String)
      address = Mail::Address.new(data)
      valid = begin
        # The address must round-trip unchanged, have a domain, and that
        # domain must have more than one dot-separated element.
        address.address == data && address.domain && address.__send__(:tree).domain.dot_atom_text.elements.size > 1
      rescue StandardError
        false
      end
    end
    unless valid
      validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
    end
    nil
  when 'uri'
    error_message = "The property '#{build_fragment(fragments)}' must be a valid URI"
    unless data.is_a?(String) && URI::DEFAULT_PARSER.regexp[:ABS_URI].match(data)
      validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
    end
    nil
  else
    super
  end
end
28
+ end
29
+
30
+ class ::JSON::Schema::FormatAttribute
31
+ class << self
32
+ prepend Pupa::Refinements::Format
33
+ end
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,185 @@
1
+ require 'fileutils'
2
+ require 'optparse'
3
+ require 'ostruct'
4
+
5
+ require 'moped'
6
+
7
+ module Pupa
8
+ class Runner
9
+ attr_reader :options, :actions
10
+
11
+ # @param [Pupa::Processor] processor_class a processor class
12
+ # @param [Hash] defaults change any default options
13
+ def initialize(processor_class, defaults = {})
14
+ @processor_class = processor_class
15
+
16
+ @options = OpenStruct.new({
17
+ actions: [],
18
+ tasks: [],
19
+ output_dir: File.expand_path('scraped_data', Dir.pwd),
20
+ cache_dir: File.expand_path('web_cache', Dir.pwd),
21
+ expires_in: 86400, # 1 day
22
+ host_with_port: 'localhost:27017',
23
+ database: 'pupa',
24
+ dry_run: false,
25
+ level: 'INFO',
26
+ }.merge(defaults))
27
+
28
+ @actions = {
29
+ 'scrape' => 'Scrapes data from online sources',
30
+ 'import' => 'Imports scraped data into a database',
31
+ }.map do |name,description|
32
+ OpenStruct.new(name: name, description: description)
33
+ end
34
+ end
35
+
36
+ # @param [Hash] attributes the action's attributes
37
+ # @option attributes [String] :name the action's label
38
+ # @option attributes [String] :description a description of the action
39
+ def add_action(attributes)
40
+ @actions << OpenStruct.new(attributes)
41
+ end
42
+
43
+ # Returns the command-line option parser.
44
+ #
45
+ # @return [OptionParser] the command-line option parser
46
def opts
  # The parser is built once and memoized. Switch handlers write into the
  # runner's `options` struct.
  @opts ||= OptionParser.new do |parser|
    parser.program_name = File.basename($PROGRAM_NAME)
    parser.banner = "Usage: #{parser.program_name}"

    parser.separator ''
    parser.separator 'Actions:'

    names = @actions.map(&:name)
    padding = names.map(&:size).max
    @actions.each do |action|
      parser.separator " #{action.name.ljust(padding)} #{action.description}\n"
    end

    parser.separator ''
    parser.separator 'Tasks:'

    @processor_class.tasks.each do |task_name|
      parser.separator " #{task_name}"
    end

    parser.separator ''
    parser.separator 'Specific options:'
    parser.on('-a', '--action ACTION', names, 'Select an action to run (you may give this switch multiple times)', " (#{names.join(', ')})") do |v|
      options.actions << v
    end
    parser.on('-t', '--task TASK', @processor_class.tasks, 'Select a scraping task to run (you may give this switch multiple times)', " (#{@processor_class.tasks.join(', ')})") do |v|
      options.tasks << v
    end
    parser.on('-o', '--output_dir PATH', 'The directory in which to dump JSON documents') do |v|
      options.output_dir = v
    end
    parser.on('-c', '--cache_dir PATH', 'The directory in which to cache HTTP requests') do |v|
      options.cache_dir = v
    end
    # Coerce to Integer so the command-line value has the same type as the
    # default; non-numeric input is rejected by the parser.
    parser.on('-e', '--expires_in SECONDS', Integer, "The cache's expiration time in seconds") do |v|
      options.expires_in = v
    end
    parser.on('-H', '--host HOST:PORT', 'The host and port to MongoDB') do |v|
      options.host_with_port = v
    end
    parser.on('-d', '--database NAME', 'The name of the MongoDB database') do |v|
      options.database = v
    end
    parser.on('-n', '--dry-run', 'Show the plan without running any actions') do
      options.dry_run = true
    end
    parser.on('-v', '--verbose', 'Show all messages') do
      options.level = 'DEBUG'
    end
    parser.on('-q', '--quiet', 'Show only warning and error messages') do
      options.level = 'WARN'
    end
    parser.on('-s', '--silent', 'Show no messages') do
      options.level = 'UNKNOWN'
    end

    parser.separator ''
    parser.separator 'Common options:'
    parser.on_tail('-h', '--help', 'Show this message') do
      puts parser
      exit
    end
    # Long form only: `-v` already belongs to --verbose above, which took
    # precedence, so advertising `-v` for the version was misleading.
    parser.on_tail('--version', 'Show version') do
      puts Pupa::VERSION
      exit
    end
  end
end
115
+
116
+ # Runs the action.
117
+ #
118
+ # @example Run from a command-line script
119
+ #
120
+ # runner.run(ARGV)
121
+ #
122
+ # @example Override the command-line options
123
+ #
124
+ # runner.run(ARGV, expires_in: 3600) # 1 hour
125
+ #
126
+ # @param [Array] args command-line arguments
127
+ # @param [Hash] overrides any overridden options
128
+ def run(args, overrides = {})
129
+ rest = opts.parse!(args)
130
+
131
+ @options = OpenStruct.new(options.to_h.merge(overrides))
132
+
133
+ if options.actions.empty?
134
+ options.actions = %w(scrape import)
135
+ end
136
+ if options.tasks.empty?
137
+ options.tasks = @processor_class.tasks
138
+ end
139
+
140
+ processor = @processor_class.new(options.output_dir, cache_dir: options.cache_dir, expires_in: options.expires_in, level: options.level, options: Hash[*rest])
141
+
142
+ options.actions.each do |action|
143
+ unless action == 'scrape' || processor.respond_to?(action)
144
+ abort %(`#{action}` is not a #{opts.program_name} action. See `#{opts.program_name} --help` for a list of available actions.)
145
+ end
146
+ end
147
+
148
+ if %w(DEBUG INFO).include?(options.level)
149
+ puts "processor: #{@processor_class}"
150
+ puts "actions: #{options.actions.join(', ')}"
151
+ puts "tasks: #{options.tasks.join(', ')}"
152
+ end
153
+
154
+ if options.level == 'DEBUG'
155
+ %w(output_dir cache_dir expires_in host_with_port database level).each do |option|
156
+ puts "#{option}: #{options[option]}"
157
+ end
158
+ unless rest.empty?
159
+ puts "options: #{rest.join(' ')}"
160
+ end
161
+ end
162
+
163
+ exit if options.dry_run
164
+
165
+ Pupa.session = Moped::Session.new([options.host_with_port], database: options.database)
166
+
167
+ if options.actions.delete('scrape')
168
+ FileUtils.mkdir_p(options.output_dir)
169
+ FileUtils.mkdir_p(options.cache_dir)
170
+
171
+ Dir[File.join(options.output_dir, '*.json')].each do |path|
172
+ FileUtils.rm(path)
173
+ end
174
+
175
+ options.tasks.each do |task_name|
176
+ processor.dump_scraped_objects(task_name)
177
+ end
178
+ end
179
+
180
+ options.actions.each do |action|
181
+ processor.send(action)
182
+ end
183
+ end
184
+ end
185
+ end
@@ -0,0 +1,3 @@
1
+ module Pupa
2
+ VERSION = "0.0.1"
3
+ end
data/lib/pupa.rb ADDED
@@ -0,0 +1,31 @@
1
+ require 'forwardable'
2
+
3
+ require 'active_support/concern'
4
+ require 'active_support/core_ext/class/attribute'
5
+ require 'active_support/core_ext/object/blank'
6
+ require 'active_support/inflector'
7
+
8
+ require 'pupa/errors'
9
+ require 'pupa/logger'
10
+ require 'pupa/processor'
11
+ require 'pupa/runner'
12
+
13
+ require 'pupa/models/concerns/contactable'
14
+ require 'pupa/models/concerns/identifiable'
15
+ require 'pupa/models/concerns/linkable'
16
+ require 'pupa/models/concerns/nameable'
17
+ require 'pupa/models/concerns/sourceable'
18
+ require 'pupa/models/concerns/timestamps'
19
+
20
+ require 'pupa/models/base'
21
+ require 'pupa/models/contact_detail_list'
22
+ require 'pupa/models/membership'
23
+ require 'pupa/models/organization'
24
+ require 'pupa/models/person'
25
+ require 'pupa/models/post'
26
+
27
+ module Pupa
28
+ class << self
29
+ attr_accessor :session
30
+ end
31
+ end
data/pupa.gemspec ADDED
@@ -0,0 +1,34 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/pupa/version', __FILE__)
3
+
4
+ Gem::Specification.new do |s|
5
+ s.name = "pupa"
6
+ s.version = Pupa::VERSION
7
+ s.platform = Gem::Platform::RUBY
8
+ s.authors = ["Open North"]
9
+ s.email = ["info@opennorth.ca"]
10
+ s.homepage = "http://github.com/opennorth/pupa-ruby"
11
+ s.summary = %q{A data scraping framework}
12
+ s.license = 'MIT'
13
+
14
+ s.files = `git ls-files`.split("\n")
15
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
16
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
17
+ s.require_paths = ["lib"]
18
+
19
+ s.add_runtime_dependency('activesupport', '~> 4.0.0')
20
+ s.add_runtime_dependency('colored', '~> 1.2')
21
+ s.add_runtime_dependency('faraday_middleware', '~> 0.9.0')
22
+ s.add_runtime_dependency('json-schema', '~> 2.1.3')
23
+ s.add_runtime_dependency('mail')
24
+ s.add_runtime_dependency('moped', '~> 1.5.1')
25
+ s.add_runtime_dependency('nokogiri', '~> 1.6.0')
26
+
27
+ s.add_development_dependency('coveralls')
28
+ s.add_development_dependency('json', '~> 1.7.7') # to silence coveralls warning
29
+ s.add_development_dependency('octokit') # to update Popolo schema
30
+ s.add_development_dependency('rake')
31
+ s.add_development_dependency('rspec', '~> 2.10')
32
+ s.add_development_dependency('vcr', '~> 2.5.0')
33
+ s.add_development_dependency('multi_xml')
34
+ end