pupa 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (75) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +6 -0
  3. data/.travis.yml +5 -0
  4. data/.yardopts +4 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE +20 -0
  7. data/README.md +52 -0
  8. data/Rakefile +37 -0
  9. data/USAGE +1 -0
  10. data/lib/pupa/errors.rb +30 -0
  11. data/lib/pupa/logger.rb +37 -0
  12. data/lib/pupa/models/base.rb +190 -0
  13. data/lib/pupa/models/concerns/contactable.rb +34 -0
  14. data/lib/pupa/models/concerns/identifiable.rb +26 -0
  15. data/lib/pupa/models/concerns/linkable.rb +26 -0
  16. data/lib/pupa/models/concerns/nameable.rb +34 -0
  17. data/lib/pupa/models/concerns/sourceable.rb +26 -0
  18. data/lib/pupa/models/concerns/timestamps.rb +22 -0
  19. data/lib/pupa/models/contact_detail_list.rb +28 -0
  20. data/lib/pupa/models/membership.rb +37 -0
  21. data/lib/pupa/models/organization.rb +40 -0
  22. data/lib/pupa/models/person.rb +35 -0
  23. data/lib/pupa/models/post.rb +28 -0
  24. data/lib/pupa/processor/client.rb +42 -0
  25. data/lib/pupa/processor/dependency_graph.rb +18 -0
  26. data/lib/pupa/processor/helper.rb +15 -0
  27. data/lib/pupa/processor/middleware/logger.rb +37 -0
  28. data/lib/pupa/processor/middleware/parse_html.rb +16 -0
  29. data/lib/pupa/processor/persistence.rb +80 -0
  30. data/lib/pupa/processor/yielder.rb +50 -0
  31. data/lib/pupa/processor.rb +351 -0
  32. data/lib/pupa/refinements/faraday_middleware.rb +32 -0
  33. data/lib/pupa/refinements/json-schema.rb +36 -0
  34. data/lib/pupa/runner.rb +185 -0
  35. data/lib/pupa/version.rb +3 -0
  36. data/lib/pupa.rb +31 -0
  37. data/pupa.gemspec +34 -0
  38. data/schemas/popolo/contact_detail.json +44 -0
  39. data/schemas/popolo/identifier.json +18 -0
  40. data/schemas/popolo/link.json +19 -0
  41. data/schemas/popolo/membership.json +86 -0
  42. data/schemas/popolo/organization.json +104 -0
  43. data/schemas/popolo/other_name.json +28 -0
  44. data/schemas/popolo/person.json +130 -0
  45. data/schemas/popolo/post.json +78 -0
  46. data/spec/cassettes/31ac91ccad069eefc07d96cfbe66fa66c1b41fcf.yml +56 -0
  47. data/spec/cassettes/4ff54d737afb5d693653752d7bf234a405a80172.yml +48 -0
  48. data/spec/cassettes/898049a22e6ca51dfa2510d9e0e0207a5c396524.yml +54 -0
  49. data/spec/cassettes/ce69ff734ce852d2bfaa482bbf55d7ffb4762e87.yml +26 -0
  50. data/spec/cassettes/da629b01e0836deda8a5540a4e6a08783dd7aef9.yml +46 -0
  51. data/spec/cassettes/e398f35bea86b3d4c87a6934bae1eb7fca8744f9.yml +26 -0
  52. data/spec/logger_spec.rb +4 -0
  53. data/spec/models/base_spec.rb +194 -0
  54. data/spec/models/concerns/contactable_spec.rb +37 -0
  55. data/spec/models/concerns/identifiable_spec.rb +25 -0
  56. data/spec/models/concerns/linkable_spec.rb +25 -0
  57. data/spec/models/concerns/nameable_spec.rb +25 -0
  58. data/spec/models/concerns/sourceable_spec.rb +25 -0
  59. data/spec/models/concerns/timestamps_spec.rb +32 -0
  60. data/spec/models/contact_detail_list_spec.rb +44 -0
  61. data/spec/models/membership_spec.rb +30 -0
  62. data/spec/models/organization_spec.rb +24 -0
  63. data/spec/models/person_spec.rb +24 -0
  64. data/spec/models/post_spec.rb +19 -0
  65. data/spec/processor/client_spec.rb +4 -0
  66. data/spec/processor/dependency_graph_spec.rb +4 -0
  67. data/spec/processor/helper_spec.rb +4 -0
  68. data/spec/processor/middleware/logger_spec.rb +87 -0
  69. data/spec/processor/middleware/parse_html_spec.rb +92 -0
  70. data/spec/processor/persistence_spec.rb +41 -0
  71. data/spec/processor/yielder_spec.rb +55 -0
  72. data/spec/processor_spec.rb +268 -0
  73. data/spec/runner_spec.rb +85 -0
  74. data/spec/spec_helper.rb +17 -0
  75. metadata +342 -0
@@ -0,0 +1,40 @@
1
module Pupa
  # An organization: a group whose common purpose or reason for existence goes
  # beyond the set of people belonging to it.
  class Organization < Base
    self.schema = 'popolo/organization'

    include Concerns::Timestamps
    include Concerns::Sourceable
    include Concerns::Nameable
    include Concerns::Identifiable
    include Concerns::Contactable
    include Concerns::Linkable

    attr_accessor :name, :classification, :parent_id, :parent,
      :founding_date, :dissolution_date, :image

    foreign_key :parent_id

    foreign_object :parent

    # The organization's name doubles as its string representation.
    #
    # @return [String] the name of the organization
    def to_s
      name
    end

    # Builds a selector matching organizations that share this organization's
    # classification and parent, and whose name or one of whose other names
    # equals this organization's name.
    #
    # @todo Parentless organizations in different jurisdictions can have the
    #   same name. Add a `jurisdiction` property?
    # @return [Hash] a selector with a single `$or` key
    def fingerprint
      # Only the classification and parent scope the name comparison.
      scope = super.slice(:classification, :parent_id)
      alternatives = ['name', 'other_names.name'].map do |field|
        scope.merge(field => name)
      end
      {'$or' => alternatives}
    end
  end
end
@@ -0,0 +1,35 @@
1
module Pupa
  # A real person, alive or dead.
  class Person < Base
    self.schema = 'popolo/person'

    include Concerns::Timestamps
    include Concerns::Sourceable
    include Concerns::Nameable
    include Concerns::Identifiable
    include Concerns::Contactable
    include Concerns::Linkable

    attr_accessor :name, :family_name, :given_name, :additional_name,
      :honorific_prefix, :honorific_suffix, :patronymic_name, :sort_name,
      :email, :gender, :birth_date, :death_date, :image, :summary, :biography

    # The person's name doubles as the string representation.
    #
    # @return [String] the person's name
    def to_s
      name
    end

    # Builds a selector matching people whose name, or one of whose other
    # names, equals this person's name.
    #
    # @todo This will obviously need to be scoped as in Python Pupa, to a
    #   jurisdiction, post, etc.
    # @return [Hash] a selector with a single `$or` key
    def fingerprint
      candidates = ['name', 'other_names.name'].map do |field|
        {field => name}
      end
      {'$or' => candidates}
    end
  end
end
@@ -0,0 +1,28 @@
1
module Pupa
  # A position that exists independent of the person holding it.
  class Post < Base
    self.schema = 'popolo/post'

    include Concerns::Timestamps
    include Concerns::Sourceable
    include Concerns::Contactable
    include Concerns::Linkable

    attr_accessor :label, :role, :organization_id, :start_date, :end_date

    foreign_key :organization_id

    # Describes the post by its label and the ID of its organization.
    #
    # @return [String] the post's label and organization ID
    def to_s
      "#{label} in #{organization_id}"
    end

    # A post should have a unique label within an organization, though it may
    # share a label with a historical post; the end date is therefore part of
    # the fingerprint alongside the label and organization ID.
    #
    # @return [Hash] the subset of the inherited fingerprint used for matching
    def fingerprint
      identifying_keys = [:label, :organization_id, :end_date]
      super.slice(*identifying_keys)
    end
  end
end
@@ -0,0 +1,42 @@
1
+ require 'active_support/cache'
2
+ require 'faraday_middleware'
3
+ require 'faraday_middleware/response_middleware'
4
+
5
+ require 'pupa/processor/middleware/logger'
6
+ require 'pupa/processor/middleware/parse_html'
7
+ require 'pupa/refinements/faraday_middleware'
8
+
9
+ using Pupa::Refinements::FaradayMiddleware
10
+
11
module Pupa
  class Processor
    # An HTTP client factory.
    class Client
      # Builds a Faraday connection with logging, response parsing and, when a
      # cache directory is given, file-based response caching.
      #
      # @param [String] cache_dir a directory in which to cache requests
      # @param [Integer] expires_in the cache's expiration time in seconds
      # @param [String] level the log level
      # @return [Faraday::Connection] a configured Faraday HTTP client
      def self.new(cache_dir: nil, expires_in: 86400, level: 'INFO') # 1 day
        # @see http://tools.ietf.org/html/rfc2854
        # @see http://tools.ietf.org/html/rfc3236
        html_types = %w(text/html application/xhtml+xml)

        Faraday.new do |conn|
          conn.request :url_encoded
          conn.use Middleware::Logger, Logger.new('faraday', level: level)
          conn.use Middleware::ParseHtml, content_type: html_types
          # @see http://tools.ietf.org/html/rfc4627
          conn.use FaradayMiddleware::ParseJson, content_type: /\bjson$/
          # @see http://tools.ietf.org/html/rfc3023
          conn.use FaradayMiddleware::ParseXml, content_type: /\bxml$/

          # Caching is opt-in: it is only configured when a directory is given.
          if cache_dir
            conn.response(:caching) do
              ActiveSupport::Cache::FileStore.new(cache_dir, expires_in: expires_in)
            end
          end

          # The adapter must be the last entry in the stack.
          conn.adapter Faraday.default_adapter
        end
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'tsort'
2
+
3
module Pupa
  class Processor
    # A simple dependency graph: a hash whose keys are nodes and whose values
    # are the collections of nodes each key depends on, sortable with TSort.
    #
    # @see http://ruby-doc.org/stdlib-2.0.0/libdoc/tsort/rdoc/TSort.html
    class DependencyGraph < Hash
      include TSort

      # Every key of the hash is a node of the graph.
      alias_method :tsort_each_node, :each_key

      # Yields each dependency of the given node.
      #
      # @param node a key of the graph
      # @raise [KeyError] if the node is not a key of the graph
      def tsort_each_child(node, &block)
        dependencies = fetch(node)
        dependencies.each(&block)
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module Pupa
  class Processor
    # Processor helper methods.
    module Helper
      # Normalizes all whitespace to spaces, collapses consecutive whitespace
      # into a single space, and strips leading and trailing whitespace.
      #
      # @param [String] string a string
      # @return [String] a clean string
      def clean(string)
        # Replacing each run of whitespace in one pass yields the same result
        # as replacing every character and then squeezing the spaces.
        string.gsub(/[[:space:]]+/, ' ').strip
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
# Forwardable is part of the standard library but is not loaded by default;
# require it here rather than relying on another library having loaded it.
require 'forwardable'

module Pupa
  class Processor
    module Middleware
      # Customizes the Faraday default logger.
      class Logger < FaradayMiddleware::ResponseMiddleware
        extend Forwardable

        # Log calls made on the middleware are forwarded to the wrapped logger.
        def_delegators :@logger, :debug, :info, :warn, :error, :fatal

        # @param app the next application in the middleware stack
        # @param logger the logger to write to; when omitted, a standard
        #   library logger writing to standard output is used
        def initialize(app, logger = nil)
          super(app)
          @logger = logger || begin
            # Loaded lazily: only needed when no logger is supplied.
            require 'logger'
            ::Logger.new(STDOUT)
          end
        end

        # Logs the request's method, URL and body at the info level and its
        # headers at the debug level, then passes control to the superclass.
        #
        # @param env the request environment
        def call(env)
          info "#{env[:method]} #{env[:url]} #{env[:body]}" # XXX add POST body
          debug('request') { dump_headers env[:request_headers] }
          super
        end

        # Logs the response's status and headers at the debug level.
        #
        # NOTE(review): presumably invoked by the superclass once the response
        # is complete - confirm against FaradayMiddleware::ResponseMiddleware.
        #
        # @param env the response environment
        def on_complete(env)
          debug('Status') { env[:status].to_s } # XXX switch from info
          debug('response') { dump_headers env[:response_headers] }
        end

        private

        # Formats headers as one `name: value` line per header, with each
        # value shown in its inspected form.
        #
        # @param headers the headers to format
        # @return [String] the formatted headers
        def dump_headers(headers)
          headers.map { |k, v| "#{k}: #{v.inspect}" }.join("\n")
        end
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
module Pupa
  class Processor
    module Middleware
      # A Faraday response middleware that parses HTML bodies with Nokogiri.
      #
      # @see https://github.com/lostisland/faraday_middleware/pull/18
      class ParseHtml < FaradayMiddleware::ResponseMiddleware
        dependency 'nokogiri'

        # An empty body is not parsed; the parser returns nil for it.
        define_parser do |body|
          body.empty? ? nil : Nokogiri::HTML(body)
        end
      end
    end
  end
end
@@ -0,0 +1,80 @@
1
module Pupa
  class Processor
    # A proxy class to save plain old Ruby objects to MongoDB.
    class Persistence
      # @param [Object] object an object
      def initialize(object)
        @object = object
      end

      # Finds a document matching the selection criteria.
      #
      # The selection criteria *must* set a `_type` key in order to determine
      # the collection to query.
      #
      # @param [Hash] selector the selection criteria
      # @return [Hash,nil] the matched document, or nil
      # @raise [Pupa::Errors::TooManyMatches] if multiple documents are found
      def self.find(selector)
        collection_name = collection_name_from_class_name(selector[:_type].camelize)
        query = Pupa.session[collection_name].find(selector)
        case query.count
        when 0
          nil
        when 1
          query.first
        else
          raise Errors::TooManyMatches, "selector matches multiple documents during find: #{collection_name} #{JSON.dump(selector)}"
        end
      end

      # Derives a collection name from a class name by removing the module
      # part, underscoring and pluralizing it.
      #
      # This is a public class method: `private` has no effect on methods
      # defined with `def self.`, and instances call it with an explicit
      # receiver, so it is listed with the public methods.
      #
      # @param [String] class_name a class name, with or without its namespace
      # @return [Symbol] the name of the collection for that class
      def self.collection_name_from_class_name(class_name)
        class_name.demodulize.underscore.pluralize.to_sym
      end

      # Saves an object to MongoDB.
      #
      # The object is inserted if no document matches its fingerprint, and the
      # matching document is updated if exactly one does.
      #
      # @return [String] the object's database ID
      # @raise [Pupa::Errors::TooManyMatches] if multiple documents would be updated
      def save
        # Build the selector and query before running callbacks, so that
        # values set by callbacks (e.g. timestamps) stay out of the selector.
        selector = @object.fingerprint
        query = collection.find(selector)

        @object.run_callbacks(:save) do
          case query.count
          when 0
            @object.run_callbacks(:create) do
              collection.insert(@object.to_h)
              @object._id.to_s
            end
          when 1
            query.update(@object.to_h)
            query.first['_id'].to_s
          else
            raise Errors::TooManyMatches, "selector matches multiple documents during save: #{collection_name} #{JSON.dump(selector)}"
          end
        end
      end

      private

      # Returns the name of the collection in which to save the object.
      #
      # @return [Symbol] the name of the collection in which to save the object
      def collection_name
        self.class.collection_name_from_class_name(@object.class.to_s)
      end

      # Returns the collection in which to save the object.
      #
      # @return [Moped::Collection] the collection in which to save the object
      def collection
        Pupa.session[collection_name]
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
+ # Using fibers instead of enumerators leads to less coupling in the processor.
2
+ # @see https://practicingruby.com/articles/building-enumerable-and-enumerator?u=dc2ab0f9bb
3
+ require 'fiber'
4
+
5
module Pupa
  class Processor
    # A lazy enumerator backed by a fiber.
    class Yielder
      # Wraps the given block in a fiber. The block should hand over each
      # object with `Fiber.yield`; once the block returns, the fiber signals
      # the end of the sequence by raising `StopIteration`.
      def initialize
        @fiber = Fiber.new do
          yield
          raise StopIteration
        end
      end

      # Yields each remaining object to the given block. Without a block,
      # returns an enumerator over the remaining objects instead.
      def each
        return to_enum unless block_given?

        # `loop` ends quietly when `next` raises `StopIteration`.
        loop { yield self.next }
      end

      # Returns the next object in the enumerator, and moves the internal
      # position forward. When the position reaches the end, `StopIteration`
      # is raised.
      def next
        raise StopIteration unless @fiber.alive?

        @fiber.resume
      end

      # Returns a lazy enumerator that draws its objects from `next`.
      #
      # @return [Enumerator] a lazy enumerator
      def to_enum
        Enumerator.new do |sink|
          loop { sink << self.next }
        end
      end
    end
  end
end