pupa 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +6 -0
  3. data/.travis.yml +5 -0
  4. data/.yardopts +4 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE +20 -0
  7. data/README.md +52 -0
  8. data/Rakefile +37 -0
  9. data/USAGE +1 -0
  10. data/lib/pupa/errors.rb +30 -0
  11. data/lib/pupa/logger.rb +37 -0
  12. data/lib/pupa/models/base.rb +190 -0
  13. data/lib/pupa/models/concerns/contactable.rb +34 -0
  14. data/lib/pupa/models/concerns/identifiable.rb +26 -0
  15. data/lib/pupa/models/concerns/linkable.rb +26 -0
  16. data/lib/pupa/models/concerns/nameable.rb +34 -0
  17. data/lib/pupa/models/concerns/sourceable.rb +26 -0
  18. data/lib/pupa/models/concerns/timestamps.rb +22 -0
  19. data/lib/pupa/models/contact_detail_list.rb +28 -0
  20. data/lib/pupa/models/membership.rb +37 -0
  21. data/lib/pupa/models/organization.rb +40 -0
  22. data/lib/pupa/models/person.rb +35 -0
  23. data/lib/pupa/models/post.rb +28 -0
  24. data/lib/pupa/processor/client.rb +42 -0
  25. data/lib/pupa/processor/dependency_graph.rb +18 -0
  26. data/lib/pupa/processor/helper.rb +15 -0
  27. data/lib/pupa/processor/middleware/logger.rb +37 -0
  28. data/lib/pupa/processor/middleware/parse_html.rb +16 -0
  29. data/lib/pupa/processor/persistence.rb +80 -0
  30. data/lib/pupa/processor/yielder.rb +50 -0
  31. data/lib/pupa/processor.rb +351 -0
  32. data/lib/pupa/refinements/faraday_middleware.rb +32 -0
  33. data/lib/pupa/refinements/json-schema.rb +36 -0
  34. data/lib/pupa/runner.rb +185 -0
  35. data/lib/pupa/version.rb +3 -0
  36. data/lib/pupa.rb +31 -0
  37. data/pupa.gemspec +34 -0
  38. data/schemas/popolo/contact_detail.json +44 -0
  39. data/schemas/popolo/identifier.json +18 -0
  40. data/schemas/popolo/link.json +19 -0
  41. data/schemas/popolo/membership.json +86 -0
  42. data/schemas/popolo/organization.json +104 -0
  43. data/schemas/popolo/other_name.json +28 -0
  44. data/schemas/popolo/person.json +130 -0
  45. data/schemas/popolo/post.json +78 -0
  46. data/spec/cassettes/31ac91ccad069eefc07d96cfbe66fa66c1b41fcf.yml +56 -0
  47. data/spec/cassettes/4ff54d737afb5d693653752d7bf234a405a80172.yml +48 -0
  48. data/spec/cassettes/898049a22e6ca51dfa2510d9e0e0207a5c396524.yml +54 -0
  49. data/spec/cassettes/ce69ff734ce852d2bfaa482bbf55d7ffb4762e87.yml +26 -0
  50. data/spec/cassettes/da629b01e0836deda8a5540a4e6a08783dd7aef9.yml +46 -0
  51. data/spec/cassettes/e398f35bea86b3d4c87a6934bae1eb7fca8744f9.yml +26 -0
  52. data/spec/logger_spec.rb +4 -0
  53. data/spec/models/base_spec.rb +194 -0
  54. data/spec/models/concerns/contactable_spec.rb +37 -0
  55. data/spec/models/concerns/identifiable_spec.rb +25 -0
  56. data/spec/models/concerns/linkable_spec.rb +25 -0
  57. data/spec/models/concerns/nameable_spec.rb +25 -0
  58. data/spec/models/concerns/sourceable_spec.rb +25 -0
  59. data/spec/models/concerns/timestamps_spec.rb +32 -0
  60. data/spec/models/contact_detail_list_spec.rb +44 -0
  61. data/spec/models/membership_spec.rb +30 -0
  62. data/spec/models/organization_spec.rb +24 -0
  63. data/spec/models/person_spec.rb +24 -0
  64. data/spec/models/post_spec.rb +19 -0
  65. data/spec/processor/client_spec.rb +4 -0
  66. data/spec/processor/dependency_graph_spec.rb +4 -0
  67. data/spec/processor/helper_spec.rb +4 -0
  68. data/spec/processor/middleware/logger_spec.rb +87 -0
  69. data/spec/processor/middleware/parse_html_spec.rb +92 -0
  70. data/spec/processor/persistence_spec.rb +41 -0
  71. data/spec/processor/yielder_spec.rb +55 -0
  72. data/spec/processor_spec.rb +268 -0
  73. data/spec/runner_spec.rb +85 -0
  74. data/spec/spec_helper.rb +17 -0
  75. metadata +342 -0
@@ -0,0 +1,40 @@
1
+ module Pupa
2
+ # A group with a common purpose or reason for existence that goes beyond the set
3
+ # of people belonging to it.
4
+ class Organization < Base
5
+ self.schema = 'popolo/organization'
6
+
7
+ include Concerns::Timestamps
8
+ include Concerns::Sourceable
9
+ include Concerns::Nameable
10
+ include Concerns::Identifiable
11
+ include Concerns::Contactable
12
+ include Concerns::Linkable
13
+
14
+ attr_accessor :name, :classification, :parent_id, :parent, :founding_date,
15
+ :dissolution_date, :image
16
+
17
+ foreign_key :parent_id
18
+
19
+ foreign_object :parent
20
+
21
+ # Returns the name of the organization.
22
+ #
23
+ # @return [String] the name of the organization
24
+ def to_s
25
+ name
26
+ end
27
+
28
+ # @todo Parentless organizations in different jurisdictions can have the
29
+ # same name. Add a `jurisdiction` property?
30
+ def fingerprint
31
+ hash = super.slice(:classification, :parent_id)
32
+ {
33
+ '$or' => [
34
+ hash.merge('name' => name),
35
+ hash.merge('other_names.name' => name),
36
+ ],
37
+ }
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,35 @@
1
+ module Pupa
2
+ # A real person, alive or dead.
3
+ class Person < Base
4
+ self.schema = 'popolo/person'
5
+
6
+ include Concerns::Timestamps
7
+ include Concerns::Sourceable
8
+ include Concerns::Nameable
9
+ include Concerns::Identifiable
10
+ include Concerns::Contactable
11
+ include Concerns::Linkable
12
+
13
+ attr_accessor :name, :family_name, :given_name, :additional_name,
14
+ :honorific_prefix, :honorific_suffix, :patronymic_name, :sort_name,
15
+ :email, :gender, :birth_date, :death_date, :image, :summary, :biography
16
+
17
+ # Returns the person's name.
18
+ #
19
+ # @return [String] the person's name
20
+ def to_s
21
+ name
22
+ end
23
+
24
+ # @todo This will obviously need to be scoped as in Python Pupa, to a
25
+ # jurisdiction, post, etc.
26
+ def fingerprint
27
+ {
28
+ '$or' => [
29
+ {'name' => name},
30
+ {'other_names.name' => name},
31
+ ],
32
+ }
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,28 @@
1
+ module Pupa
2
+ # A position that exists independent of the person holding it.
3
+ class Post < Base
4
+ self.schema = 'popolo/post'
5
+
6
+ include Concerns::Timestamps
7
+ include Concerns::Sourceable
8
+ include Concerns::Contactable
9
+ include Concerns::Linkable
10
+
11
+ attr_accessor :label, :role, :organization_id, :start_date, :end_date
12
+
13
+ foreign_key :organization_id
14
+
15
+ # Returns the post's label and organization ID.
16
+ #
17
+ # @return [String] the post's label and organization ID
18
+ def to_s
19
+ "#{label} in #{organization_id}"
20
+ end
21
+
22
+ # A post should have a unique label within an organization, though it may
23
+ # share a label with a historical post.
24
+ def fingerprint
25
+ super.slice(:label, :organization_id, :end_date)
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,42 @@
1
+ require 'active_support/cache'
2
+ require 'faraday_middleware'
3
+ require 'faraday_middleware/response_middleware'
4
+
5
+ require 'pupa/processor/middleware/logger'
6
+ require 'pupa/processor/middleware/parse_html'
7
+ require 'pupa/refinements/faraday_middleware'
8
+
9
+ using Pupa::Refinements::FaradayMiddleware
10
+
11
+ module Pupa
12
+ class Processor
13
+ # An HTTP client factory.
14
+ class Client
15
+ # Returns a configured Faraday HTTP client.
16
+ #
17
+ # @param [String] cache_dir a directory in which to cache requests
18
+ # @param [Integer] expires_in the cache's expiration time in seconds
19
+ # @param [String] level the log level
20
+ # @return [Faraday::Connection] a configured Faraday HTTP client
21
+ def self.new(cache_dir: nil, expires_in: 86400, level: 'INFO') # 1 day
22
+ Faraday.new do |connection|
23
+ connection.request :url_encoded
24
+ connection.use Middleware::Logger, Logger.new('faraday', level: level)
25
+ # @see http://tools.ietf.org/html/rfc2854
26
+ # @see http://tools.ietf.org/html/rfc3236
27
+ connection.use Middleware::ParseHtml, content_type: %w(text/html application/xhtml+xml)
28
+ # @see http://tools.ietf.org/html/rfc4627
29
+ connection.use FaradayMiddleware::ParseJson, content_type: /\bjson$/
30
+ # @see http://tools.ietf.org/html/rfc3023
31
+ connection.use FaradayMiddleware::ParseXml, content_type: /\bxml$/
32
+ if cache_dir
33
+ connection.response :caching do
34
+ ActiveSupport::Cache::FileStore.new(cache_dir, expires_in: expires_in)
35
+ end
36
+ end
37
+ connection.adapter Faraday.default_adapter # must be last
38
+ end
39
+ end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,18 @@
1
+ require 'tsort'
2
+
3
+ module Pupa
4
+ class Processor
5
+ # A simple implementation of a dependency graph.
6
+ #
7
+ # @see http://ruby-doc.org/stdlib-2.0.0/libdoc/tsort/rdoc/TSort.html
8
+ class DependencyGraph < Hash
9
+ include TSort
10
+
11
+ alias tsort_each_node each_key
12
+
13
+ def tsort_each_child(node, &block)
14
+ fetch(node).each(&block)
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,15 @@
1
+ module Pupa
2
+ class Processor
3
+ # Processor helper methods.
4
+ module Helper
5
+ # Normalizes all whitespace to spaces, removes consecutive spaces, and
6
+ # strips leading and trailing spaces.
7
+ #
8
+ # @param [String] string a string
9
+ # @return [String] a clean string
10
+ def clean(string)
11
+ string.gsub(/[[:space:]]/, ' ').squeeze(' ').strip
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,37 @@
1
+ module Pupa
2
+ class Processor
3
+ module Middleware
4
+ # Customizes the Faraday default logger.
5
+ class Logger < FaradayMiddleware::ResponseMiddleware
6
+ extend Forwardable
7
+
8
+ def initialize(app, logger = nil)
9
+ super(app)
10
+ @logger = logger || begin
11
+ require 'logger'
12
+ ::Logger.new(STDOUT)
13
+ end
14
+ end
15
+
16
+ def_delegators :@logger, :debug, :info, :warn, :error, :fatal
17
+
18
+ def call(env)
19
+ info "#{env[:method]} #{env[:url].to_s} #{env[:body].to_s}" # XXX add POST body
20
+ debug('request') { dump_headers env[:request_headers] }
21
+ super
22
+ end
23
+
24
+ def on_complete(env)
25
+ debug('Status') { env[:status].to_s } # XXX switch from info
26
+ debug('response') { dump_headers env[:response_headers] }
27
+ end
28
+
29
+ private
30
+
31
+ def dump_headers(headers)
32
+ headers.map { |k, v| "#{k}: #{v.inspect}" }.join("\n")
33
+ end
34
+ end
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,16 @@
1
+ module Pupa
2
+ class Processor
3
+ module Middleware
4
+ # A Faraday response middleware for parsing HTML.
5
+ #
6
+ # @see https://github.com/lostisland/faraday_middleware/pull/18
7
+ class ParseHtml < FaradayMiddleware::ResponseMiddleware
8
+ dependency 'nokogiri'
9
+
10
+ define_parser { |body|
11
+ Nokogiri::HTML(body) unless body.empty?
12
+ }
13
+ end
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,80 @@
1
+ module Pupa
2
+ class Processor
3
+ # A proxy class to save plain old Ruby objects to MongoDB.
4
+ class Persistence
5
+ # @param [Object] object an object
6
+ def initialize(object)
7
+ @object = object
8
+ end
9
+
10
+ # Finds a document matching the selection criteria.
11
+ #
12
+ # The selection criteria *must* set a `_type` key in order to determine
13
+ # the collection to query.
14
+ #
15
+ # @param [Hash] selector the selection criteria
16
+ # @return [Hash,nil] the matched document, or nil
17
+ # @raise [Pupa::Errors::TooManyMatches] if multiple documents are found
18
+ def self.find(selector)
19
+ collection_name = collection_name_from_class_name(selector[:_type].camelize)
20
+ query = Pupa.session[collection_name].find(selector)
21
+ case query.count
22
+ when 0
23
+ nil
24
+ when 1
25
+ query.first
26
+ else
27
+ raise Errors::TooManyMatches, "selector matches multiple documents during find: #{collection_name} #{JSON.dump(selector)}"
28
+ end
29
+ end
30
+
31
+ # Saves an object to MongoDB.
32
+ #
33
+ # @return [String] the object's database ID
34
+ # @raise [Pupa::Errors::TooManyMatches] if multiple documents would be updated
35
+ def save
36
+ selector = @object.fingerprint
37
+ query = collection.find(selector)
38
+
39
+ # Run query before callbacks to avoid e.g. timestamps in the selector.
40
+ @object.run_callbacks(:save) do
41
+ case query.count
42
+ when 0
43
+ @object.run_callbacks(:create) do
44
+ collection.insert(@object.to_h)
45
+ @object._id.to_s
46
+ end
47
+ when 1
48
+ query.update(@object.to_h)
49
+ query.first['_id'].to_s
50
+ else
51
+ raise Errors::TooManyMatches, "selector matches multiple documents during save: #{collection_name} #{JSON.dump(selector)}"
52
+ end
53
+ end
54
+ end
55
+
56
+ private
57
+
58
+ # Returns the name of the collection in which to save the object.
59
+ #
60
+ # @return [Symbol] the collection name derived from the given class name
61
+ def self.collection_name_from_class_name(class_name)
62
+ class_name.demodulize.underscore.pluralize.to_sym
63
+ end
64
+
65
+ # Returns the name of the collection in which to save the object.
66
+ #
67
+ # @return [Symbol] the name of the collection in which to save the object
68
+ def collection_name
69
+ self.class.collection_name_from_class_name(@object.class.to_s)
70
+ end
71
+
72
+ # Returns the collection in which to save the object.
73
+ #
74
+ # @return [Moped::Collection] the collection in which to save the object
75
+ def collection
76
+ Pupa.session[collection_name]
77
+ end
78
+ end
79
+ end
80
+ end
@@ -0,0 +1,50 @@
1
+ # Using fibers instead of enumerators leads to less coupling in the processor.
2
+ # @see https://practicingruby.com/articles/building-enumerable-and-enumerator?u=dc2ab0f9bb
3
+ require 'fiber'
4
+
5
+ module Pupa
6
+ class Processor
7
+ # A lazy enumerator.
8
+ class Yielder
9
+ # The given block should yield objects to add to the enumerator.
10
+ def initialize
11
+ @fiber = Fiber.new do
12
+ yield
13
+ raise StopIteration
14
+ end
15
+ end
16
+
17
+ # Yields each object in the enumerator to the given block.
18
+ def each
19
+ if block_given?
20
+ loop do
21
+ yield self.next
22
+ end
23
+ else
24
+ to_enum
25
+ end
26
+ end
27
+
28
+ # Returns the next object in the enumerator, and moves the internal position
29
+ # forward. When the position reaches the end, `StopIteration` is raised.
30
+ def next
31
+ if @fiber.alive?
32
+ @fiber.resume
33
+ else
34
+ raise StopIteration
35
+ end
36
+ end
37
+
38
+ # Returns a lazy enumerator.
39
+ #
40
+ # @return [Enumerator] a lazy enumerator
41
+ def to_enum
42
+ Enumerator.new do |y|
43
+ loop do
44
+ y << self.next
45
+ end
46
+ end
47
+ end
48
+ end
49
+ end
50
+ end