pupa 0.0.13 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +1 -0
  3. data/README.md +17 -0
  4. data/lib/pupa.rb +5 -8
  5. data/lib/pupa/errors.rb +4 -0
  6. data/lib/pupa/models/model.rb +5 -5
  7. data/lib/pupa/models/organization.rb +10 -6
  8. data/lib/pupa/models/person.rb +10 -6
  9. data/lib/pupa/processor.rb +14 -15
  10. data/lib/pupa/processor/connection.rb +26 -0
  11. data/lib/pupa/processor/connection_adapters/mongodb_adapter.rb +92 -0
  12. data/lib/pupa/processor/connection_adapters/postgresql_adapter.rb +116 -0
  13. data/lib/pupa/processor/document_store.rb +3 -0
  14. data/lib/pupa/processor/middleware/raise_error.rb +1 -0
  15. data/lib/pupa/refinements/faraday_middleware.rb +1 -1
  16. data/lib/pupa/runner.rb +14 -21
  17. data/lib/pupa/version.rb +1 -1
  18. data/pupa.gemspec +3 -2
  19. data/schemas/popolo/contact_detail.json +10 -0
  20. data/schemas/popolo/membership.json +29 -9
  21. data/schemas/popolo/organization.json +9 -2
  22. data/schemas/popolo/other_name.json +24 -0
  23. data/schemas/popolo/person.json +6 -3
  24. data/schemas/popolo/post.json +16 -2
  25. data/spec/models/model_spec.rb +1 -1
  26. data/spec/processor/connection_adapters/mongodb_adapter_spec.rb +61 -0
  27. data/spec/processor/connection_adapters/postgresql_adapter_spec.rb +70 -0
  28. data/spec/processor/connection_spec.rb +15 -0
  29. data/spec/processor/middleware/parse_json_spec.rb +90 -0
  30. data/spec/processor_spec.rb +9 -10
  31. data/spec/spec_helper.rb +0 -10
  32. metadata +83 -75
  33. data/lib/pupa/processor/persistence.rb +0 -85
  34. data/spec/cassettes/31ac91ccad069eefc07d96cfbe66fa66c1b41fcf.yml +0 -56
  35. data/spec/cassettes/4ff54d737afb5d693653752d7bf234a405a80172.yml +0 -48
  36. data/spec/cassettes/898049a22e6ca51dfa2510d9e0e0207a5c396524.yml +0 -54
  37. data/spec/cassettes/ce69ff734ce852d2bfaa482bbf55d7ffb4762e87.yml +0 -26
  38. data/spec/cassettes/da629b01e0836deda8a5540a4e6a08783dd7aef9.yml +0 -46
  39. data/spec/cassettes/e398f35bea86b3d4c87a6934bae1eb7fca8744f9.yml +0 -26
  40. data/spec/cassettes/f861172f1df3bdb2052af5451f9922699d574b77.yml +0 -62
  41. data/spec/processor/persistence_spec.rb +0 -51
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 37fa9e87e20d4fef24b1f028f046b695a3323b88
4
- data.tar.gz: 0b2ae08ac3597955e7175812f05a700c76d91247
3
+ metadata.gz: e225ec9f62bb2c542da3f691b529cea5b6f99c15
4
+ data.tar.gz: 07726b314755421a2eac3d46756c46cd483ab913
5
5
  SHA512:
6
- metadata.gz: de98e5ed8f0b145e0ba77401c3e2e983cd47dc2b7a327476a03b6e04eecaeb31f976d76e98a7d51114afd87a95e36c9da491e6a4f4d788f71b02060121cbb94e
7
- data.tar.gz: 69fe22ea3079034b34aea5b244d233f66eebefd5010ea923ca9865d475bc81ccaebd6cab94e721ff2944cef3b4842dc297c932e8cbc69d949a593a74b2fc46a3
6
+ metadata.gz: 5732c1a25dcbcd7aaac28a049905c1fa2c6e4909f18e3b8c75b82e063264a871ec465b79aaa9c5371621b01f23ab4aa73135bb5757de6721e4521ec185d8e27a
7
+ data.tar.gz: d93fca74f1f8276c44de1ef228b85659875e839ce4a06a54a17f507423186485276c03c0f2be921b7f39e081fb622931f13659b116d76f9a920502f53427d5e6
data/.travis.yml CHANGED
@@ -8,4 +8,5 @@ env:
8
8
  services:
9
9
  - memcached
10
10
  - mongodb
11
+ - postgresql
11
12
  - redis
data/README.md CHANGED
@@ -7,6 +7,8 @@
7
7
 
8
8
  Pupa.rb is a Ruby 2.x fork of Sunlight Labs' [Pupa](https://github.com/opencivicdata/pupa). It implements an Extract, Transform and Load (ETL) process to scrape data from online sources, transform it, and write it to a database.
9
9
 
10
+ gem install pupa
11
+
10
12
  ## What it tries to solve
11
13
 
12
14
  Pupa.rb's goal is to make scraping less painful by solving common problems:
@@ -207,6 +209,21 @@ You may want to set the `CPUPROFILE_REALTIME=1` flag; however, it seems to inter
207
209
 
208
210
  pprof.rb --pdf /tmp/PROFILE_NAME > /tmp/PROFILE_NAME.pdf
209
211
 
212
+ ## Integration with ODMs
213
+
214
+ ### Mongoid
215
+
216
+ `Pupa::Model` is incompatible with `Mongoid::Document`. Don't do this:
217
+
218
+ ```ruby
219
+ class Cat
220
+ include Pupa::Model
221
+ include Mongoid::Document
222
+ end
223
+ ```
224
+
225
+ Instead, have a scraping model that includes `Pupa::Model` and an app model that includes `Mongoid::Document`.
226
+
210
227
  ## Testing
211
228
 
212
229
  **DO NOT** run this gem's specs if you are using Redis database number 15 on `localhost`!
data/lib/pupa.rb CHANGED
@@ -9,11 +9,6 @@ require 'active_support/core_ext/hash/slice'
9
9
  require 'active_support/core_ext/object/blank'
10
10
  require 'active_support/inflector'
11
11
 
12
- require 'pupa/errors'
13
- require 'pupa/logger'
14
- require 'pupa/processor'
15
- require 'pupa/runner'
16
-
17
12
  require 'pupa/models/concerns/indifferent_access'
18
13
  require 'pupa/models/concerns/contactable'
19
14
  require 'pupa/models/concerns/identifiable'
@@ -22,6 +17,11 @@ require 'pupa/models/concerns/nameable'
22
17
  require 'pupa/models/concerns/sourceable'
23
18
  require 'pupa/models/concerns/timestamps'
24
19
 
20
+ require 'pupa/errors'
21
+ require 'pupa/logger'
22
+ require 'pupa/processor'
23
+ require 'pupa/runner'
24
+
25
25
  require 'pupa/models/foreign_object'
26
26
  require 'pupa/models/model'
27
27
  require 'pupa/models/contact_detail_list'
@@ -32,9 +32,6 @@ require 'pupa/models/person'
32
32
  require 'pupa/models/post'
33
33
 
34
34
  module Pupa
35
- class << self
36
- attr_accessor :session
37
- end
38
35
  end
39
36
 
40
37
  # ActiveSupport's String methods become bottlenecks once:
data/lib/pupa/errors.rb CHANGED
@@ -19,6 +19,10 @@ module Pupa
19
19
  # not exist in an object.
20
20
  class MissingAttributeError < Error; end
21
21
 
22
+ # This error is raised when saving an object to a database if the object
23
+ # produces an empty selector.
24
+ class EmptySelectorError < Error; end
25
+
22
26
  # This error is raised when saving an object to a database if the object
23
27
  # matches more than one document in the database.
24
28
  class TooManyMatches < Error; end
data/lib/pupa/models/model.rb CHANGED
@@ -36,7 +36,7 @@ module Pupa
36
36
  attr_reader :extras
37
37
  # @return [String] The underscored, lowercase form of the object's class.
38
38
  attr_accessor :_type
39
- # @return [Moped::BSON::Document,nil] The object's matching document in
39
+ # @return [BSON::Document,nil] The object's matching document in
40
40
  # the database. Set before persisting the object to the database.
41
41
  attr_accessor :document
42
42
 
@@ -45,7 +45,7 @@ module Pupa
45
45
 
46
46
  module ClassMethods
47
47
  # Declare which properties should be dumped to JSON after a scraping task
48
- # is complete. A subset of these properties will be imported to MongoDB.
48
+ # is complete. A subset of these will be imported to the database.
49
49
  #
50
50
  # @param [Array<Symbol>] the properties to dump to JSON
51
51
  def dump(*attributes)
@@ -129,9 +129,9 @@ module Pupa
129
129
 
130
130
  # Sets the object's ID.
131
131
  #
132
- # @param [String,Moped::BSON::ObjectId] id an ID
132
+ # @param [String,BSON::ObjectId] id an ID
133
133
  def _id=(id)
134
- @_id = id.to_s # in case of Moped::BSON::ObjectId
134
+ @_id = id.to_s # in case of BSON::ObjectId
135
135
  end
136
136
 
137
137
  # Sets the extras.
@@ -176,7 +176,7 @@ module Pupa
176
176
  # Returns the object as a hash.
177
177
  #
178
178
  # @param [Boolean] persist whether the object is being persisted, validated,
179
- # or used as a MongoDB selector, in which case foreign objects (i.e. hints)
179
+ # or used as a database selector, in which case foreign objects (hints)
180
180
  # are excluded
181
181
  # @return [Hash] the object as a hash
182
182
  def to_h(persist: false)
data/lib/pupa/models/organization.rb CHANGED
@@ -33,12 +33,16 @@ module Pupa
33
33
  # same name. Add a `jurisdiction` property?
34
34
  def fingerprint
35
35
  hash = super.slice(:classification, :parent_id)
36
- {
37
- '$or' => [
38
- hash.merge('name' => name),
39
- hash.merge('other_names.name' => name),
40
- ],
41
- }
36
+ if name
37
+ {
38
+ '$or' => [
39
+ hash.merge('name' => name),
40
+ hash.merge('other_names.name' => name),
41
+ ],
42
+ }
43
+ else
44
+ hash
45
+ end
42
46
  end
43
47
  end
44
48
  end
data/lib/pupa/models/person.rb CHANGED
@@ -29,12 +29,16 @@ module Pupa
29
29
  # @todo This will obviously need to be scoped as in Python Pupa, to a
30
30
  # jurisdiction, post, etc.
31
31
  def fingerprint
32
- {
33
- '$or' => [
34
- {'name' => name},
35
- {'other_names.name' => name},
36
- ],
37
- }
32
+ if name
33
+ {
34
+ '$or' => [
35
+ {'name' => name},
36
+ {'other_names.name' => name},
37
+ ],
38
+ }
39
+ else
40
+ {}
41
+ end
38
42
  end
39
43
  end
40
44
  end
data/lib/pupa/processor.rb CHANGED
@@ -1,13 +1,10 @@
1
1
  require 'pupa/processor/client'
2
2
  require 'pupa/processor/dependency_graph'
3
3
  require 'pupa/processor/helper'
4
- require 'pupa/processor/persistence'
4
+ require 'pupa/processor/connection'
5
5
  require 'pupa/processor/document_store'
6
6
  require 'pupa/processor/yielder'
7
7
 
8
- require 'pupa/processor/document_store/file_store'
9
- require 'pupa/processor/document_store/redis_store'
10
-
11
8
  module Pupa
12
9
  # An abstract processor class from which specific processors inherit.
13
10
  class Processor
@@ -17,27 +14,29 @@ module Pupa
17
14
  class_attribute :tasks
18
15
  self.tasks = []
19
16
 
20
- attr_reader :report, :store, :client, :options
17
+ attr_reader :report, :store, :connection, :client, :options
21
18
 
22
19
  def_delegators :@logger, :debug, :info, :warn, :error, :fatal
23
20
 
24
21
  # @param [String] output_dir the directory or Redis address
25
22
  # (e.g. `redis://localhost:6379`) in which to dump JSON documents
23
+ # @param [Boolean] pipelined whether to dump JSON documents all at once
26
24
  # @param [String] cache_dir the directory or Memcached address
27
25
  # (e.g. `memcached://localhost:11211`) in which to cache HTTP responses
28
26
  # @param [Integer] expires_in the cache's expiration time in seconds
29
- # @param [Boolean] pipelined whether to dump JSON documents all at once
27
+ # @param [String] database_url the database URL
30
28
  # @param [Boolean] validate whether to validate JSON documents
31
29
  # @param [String] level the log level
32
30
  # @param [String,IO] logdev the log device
33
31
  # @param [Hash] options criteria for selecting the methods to run
34
- def initialize(output_dir, cache_dir: nil, expires_in: 86400, pipelined: false, validate: true, level: 'INFO', logdev: STDOUT, options: {})
35
- @store = DocumentStore.new(output_dir, pipelined: pipelined)
36
- @client = Client.new(cache_dir: cache_dir, expires_in: expires_in, level: level)
37
- @logger = Logger.new('pupa', level: level, logdev: logdev)
38
- @validate = validate
39
- @options = options
40
- @report = {}
32
+ def initialize(output_dir, pipelined: false, cache_dir: nil, expires_in: 86400, database_url: 'mongodb://localhost:27017/pupa', validate: true, level: 'INFO', logdev: STDOUT, options: {})
33
+ @store = DocumentStore.new(output_dir, pipelined: pipelined)
34
+ @client = Client.new(cache_dir: cache_dir, expires_in: expires_in, level: level)
35
+ @connection = Connection.new(database_url)
36
+ @logger = Logger.new('pupa', level: level, logdev: logdev)
37
+ @validate = validate
38
+ @options = options
39
+ @report = {}
41
40
  end
42
41
 
43
42
  # Retrieves and parses a document with a GET request.
@@ -369,7 +368,7 @@ module Pupa
369
368
  if value.present?
370
369
  foreign_object = ForeignObject.new(value)
371
370
  resolve_foreign_keys(foreign_object, map)
372
- document = Persistence.find(foreign_object.to_h)
371
+ document = connection.find(foreign_object.to_h)
373
372
 
374
373
  if document
375
374
  object["#{property}_id"] = document['_id']
@@ -382,7 +381,7 @@ module Pupa
382
381
 
383
382
  # @param [Object] object an object
384
383
  def import_object(object)
385
- inserted, id = Persistence.new(object).save
384
+ inserted, id = connection.save(object)
386
385
  @report[:import][object._type] ||= Hash.new(0)
387
386
  if inserted
388
387
  @report[:import][object._type][:insert] += 1
data/lib/pupa/processor/connection.rb ADDED
@@ -0,0 +1,26 @@
1
+ require 'pupa/processor/connection_adapters/mongodb_adapter'
2
+ require 'pupa/processor/connection_adapters/postgresql_adapter'
3
+
4
+ module Pupa
5
+ class Processor
6
+ # A database system connection factory.
7
+ class Connection
8
+ # Returns a configured connection to a database system.
9
+ #
10
+ # See each connection adapter for more information.
11
+ #
12
+ # @param [String] database_url the database URL
13
+ # @return a configured connection to a database system
14
+ def self.new(database_url)
15
+ case URI.parse(database_url).scheme
16
+ when 'postgres'
17
+ PostgreSQLAdapter.new(database_url)
18
+ when 'mongodb'
19
+ MongoDBAdapter.new(database_url)
20
+ else
21
+ raise NotImplementedError
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end
data/lib/pupa/processor/connection_adapters/mongodb_adapter.rb ADDED
@@ -0,0 +1,92 @@
1
+ require 'moped'
2
+
3
+ module Pupa
4
+ class Processor
5
+ class Connection
6
+ # A proxy class to save plain old Ruby objects to MongoDB.
7
+ class MongoDBAdapter
8
+ attr_reader :raw_connection
9
+
10
+ # @param [String] database_url the database URL
11
+ def initialize(database_url)
12
+ uri = URI.parse(database_url)
13
+ @raw_connection = Moped::Session.new(["#{uri.host}:#{uri.port}"], database: uri.path[1..-1])
14
+ @raw_connection.login(uri.user, uri.password) if uri.user && uri.password
15
+ end
16
+
17
+ # Finds a document matching the selection criteria.
18
+ #
19
+ # The selection criteria *must* set a `_type` key in order to determine
20
+ # the collection to query.
21
+ #
22
+ # @param [Hash] selector the selection criteria
23
+ # @return [Hash,nil] the matched document, or nil
24
+ # @raises [Pupa::Errors::TooManyMatches] if multiple documents are found
25
+ def find(selector)
26
+ collection_name = collection_name_from_class_name(selector[:_type].camelize)
27
+ if selector.except(:_type).empty?
28
+ raise Errors::EmptySelectorError, "selector is empty during find in collection #{collection_name}"
29
+ end
30
+ collection = raw_connection[collection_name]
31
+ query = collection.find(selector)
32
+
33
+ case query.count
34
+ when 0
35
+ nil
36
+ when 1
37
+ query.first
38
+ else
39
+ raise Errors::TooManyMatches, "selector matches multiple documents during find in collection #{collection_name}: #{JSON.dump(selector)}"
40
+ end
41
+ end
42
+
43
+ # Inserts or replaces a document in MongoDB.
44
+ #
45
+ # @param [Object] object an object
46
+ # @return [Array] whether the object was inserted and the object's database ID
47
+ # @raises [Pupa::Errors::TooManyMatches] if multiple documents would be updated
48
+ def save(object)
49
+ selector = object.fingerprint
50
+
51
+ collection_name = collection_name_from_class_name(object.class.to_s)
52
+ if selector.empty?
53
+ raise Errors::EmptySelectorError, "selector is empty during save in collection #{collection_name} for #{object._id}"
54
+ end
55
+ collection = raw_connection[collection_name]
56
+ query = collection.find(selector)
57
+
58
+ # Run query before callbacks to avoid e.g. timestamps in the selector.
59
+ case query.count
60
+ when 0
61
+ object.run_callbacks(:save) do
62
+ object.run_callbacks(:create) do
63
+ collection.insert(object.to_h(persist: true))
64
+ [true, object._id.to_s]
65
+ end
66
+ end
67
+ when 1
68
+ # Make the document available to the callbacks.
69
+ # @see https://github.com/opennorth/pupa-ruby/issues/17
70
+ object.document = query.first
71
+ object.run_callbacks(:save) do
72
+ query.update(object.to_h(persist: true).except(:_id))
73
+ [false, object.document['_id'].to_s]
74
+ end
75
+ else
76
+ raise Errors::TooManyMatches, "selector matches multiple documents during save in collection #{collection_name} for #{object._id}: #{JSON.dump(selector)}"
77
+ end
78
+ end
79
+
80
+ private
81
+
82
+ # Returns the name of the collection in which to save the object.
83
+ #
84
+ # @param [String] class_name the name of the object's class
85
+ # @return [String] the name of the collection in which to save the object
86
+ def collection_name_from_class_name(class_name)
87
+ class_name.demodulize.underscore.pluralize.to_sym
88
+ end
89
+ end
90
+ end
91
+ end
92
+ end
data/lib/pupa/processor/connection_adapters/postgresql_adapter.rb ADDED
@@ -0,0 +1,116 @@
1
+ require 'sequel'
2
+
3
+ module Pupa
4
+ class Processor
5
+ class Connection
6
+ # A proxy class to save plain old Ruby objects to PostgreSQL.
7
+ class PostgreSQLAdapter
8
+ include Pupa::Concerns::IndifferentAccess
9
+
10
+ attr_reader :raw_connection
11
+
12
+ # @param [String] database_url the database URL
13
+ def initialize(database_url)
14
+ @raw_connection = Sequel.connect(database_url)
15
+ end
16
+
17
+ # Finds a document matching the selection criteria.
18
+ #
19
+ # The selection criteria *must* set a `_type` key in order to determine
20
+ # the collection to query.
21
+ #
22
+ # @param [Hash] selector the selection criteria
23
+ # @return [Hash,nil] the matched document, or nil
24
+ # @raises [Pupa::Errors::TooManyMatches] if multiple documents are found
25
+ def find(selector)
26
+ collection_name = collection_name_from_class_name(selector[:_type].camelize)
27
+ if selector.except(:_type).empty?
28
+ raise Errors::EmptySelectorError, "selector is empty during find in collection #{collection_name}"
29
+ end
30
+ collection = raw_connection[collection_name]
31
+ query = collection.filter(symbolize_keys(selector))
32
+
33
+ case query.count
34
+ when 0
35
+ nil
36
+ when 1
37
+ stringify_keys(query.first)
38
+ else
39
+ raise Errors::TooManyMatches, "selector matches multiple documents during find in collection #{collection_name}: #{JSON.dump(selector)}"
40
+ end
41
+ end
42
+
43
+ # Inserts or replaces a document in PostgreSQL.
44
+ #
45
+ # @param [Object] object an object
46
+ # @return [Array] whether the object was inserted and the object's database ID
47
+ # @raises [Pupa::Errors::TooManyMatches] if multiple documents would be updated
48
+ def save(object)
49
+ fingerprint = symbolize_keys(object.fingerprint) # Sequel needs symbols
50
+ selector = if fingerprint.key?(:$or) && fingerprint.size == 1
51
+ Sequel.or(reject_subdocument_criteria(fingerprint[:$or]))
52
+ else
53
+ reject_subdocument_criteria(fingerprint)
54
+ end
55
+
56
+ collection_name = collection_name_from_class_name(object.class.to_s)
57
+ if fingerprint.empty?
58
+ raise Errors::EmptySelectorError, "selector is empty during save in collection #{collection_name} for #{object._id}"
59
+ end
60
+ collection = raw_connection[collection_name]
61
+ query = collection.filter(selector)
62
+
63
+ # Run query before callbacks to avoid e.g. timestamps in the selector.
64
+ case query.count
65
+ when 0
66
+ object.run_callbacks(:save) do
67
+ object.run_callbacks(:create) do
68
+ collection.insert(object.to_h(persist: true))
69
+ [true, object._id.to_s]
70
+ end
71
+ end
72
+ when 1
73
+ # Make the document available to the callbacks.
74
+ # @see https://github.com/opennorth/pupa-ruby/issues/17
75
+ object.document = stringify_keys(query.first)
76
+ object.run_callbacks(:save) do
77
+ query.update(object.to_h(persist: true).except(:_id))
78
+ [false, object.document['_id'].to_s]
79
+ end
80
+ else
81
+ raise Errors::TooManyMatches, "selector matches multiple documents during save in collection #{collection_name} for #{object._id}: #{JSON.dump(selector)}"
82
+ end
83
+ end
84
+
85
+ private
86
+
87
+ # Returns the name of the collection in which to save the object.
88
+ #
89
+ # @param [String] class_name the name of the object's class
90
+ # @return [String] the name of the collection in which to save the object
91
+ def collection_name_from_class_name(class_name)
92
+ class_name.demodulize.underscore.pluralize.to_sym
93
+ end
94
+
95
+ def reject_subdocument_criteria(object)
96
+ case object
97
+ when Hash
98
+ array = []
99
+ object.each do |key,value|
100
+ unless key.to_s['.'] # @todo Support MongoDB subdocument criteria.
101
+ array += [key, reject_subdocument_criteria(value)]
102
+ end
103
+ end
104
+ array
105
+ when Array
106
+ object.map do |value|
107
+ reject_subdocument_criteria(value)
108
+ end.reject(&:empty?)
109
+ else
110
+ object
111
+ end
112
+ end
113
+ end
114
+ end
115
+ end
116
+ end