pupa 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +6 -0
- data/.travis.yml +5 -0
- data/.yardopts +4 -0
- data/Gemfile +4 -0
- data/LICENSE +20 -0
- data/README.md +52 -0
- data/Rakefile +37 -0
- data/USAGE +1 -0
- data/lib/pupa/errors.rb +30 -0
- data/lib/pupa/logger.rb +37 -0
- data/lib/pupa/models/base.rb +190 -0
- data/lib/pupa/models/concerns/contactable.rb +34 -0
- data/lib/pupa/models/concerns/identifiable.rb +26 -0
- data/lib/pupa/models/concerns/linkable.rb +26 -0
- data/lib/pupa/models/concerns/nameable.rb +34 -0
- data/lib/pupa/models/concerns/sourceable.rb +26 -0
- data/lib/pupa/models/concerns/timestamps.rb +22 -0
- data/lib/pupa/models/contact_detail_list.rb +28 -0
- data/lib/pupa/models/membership.rb +37 -0
- data/lib/pupa/models/organization.rb +40 -0
- data/lib/pupa/models/person.rb +35 -0
- data/lib/pupa/models/post.rb +28 -0
- data/lib/pupa/processor/client.rb +42 -0
- data/lib/pupa/processor/dependency_graph.rb +18 -0
- data/lib/pupa/processor/helper.rb +15 -0
- data/lib/pupa/processor/middleware/logger.rb +37 -0
- data/lib/pupa/processor/middleware/parse_html.rb +16 -0
- data/lib/pupa/processor/persistence.rb +80 -0
- data/lib/pupa/processor/yielder.rb +50 -0
- data/lib/pupa/processor.rb +351 -0
- data/lib/pupa/refinements/faraday_middleware.rb +32 -0
- data/lib/pupa/refinements/json-schema.rb +36 -0
- data/lib/pupa/runner.rb +185 -0
- data/lib/pupa/version.rb +3 -0
- data/lib/pupa.rb +31 -0
- data/pupa.gemspec +34 -0
- data/schemas/popolo/contact_detail.json +44 -0
- data/schemas/popolo/identifier.json +18 -0
- data/schemas/popolo/link.json +19 -0
- data/schemas/popolo/membership.json +86 -0
- data/schemas/popolo/organization.json +104 -0
- data/schemas/popolo/other_name.json +28 -0
- data/schemas/popolo/person.json +130 -0
- data/schemas/popolo/post.json +78 -0
- data/spec/cassettes/31ac91ccad069eefc07d96cfbe66fa66c1b41fcf.yml +56 -0
- data/spec/cassettes/4ff54d737afb5d693653752d7bf234a405a80172.yml +48 -0
- data/spec/cassettes/898049a22e6ca51dfa2510d9e0e0207a5c396524.yml +54 -0
- data/spec/cassettes/ce69ff734ce852d2bfaa482bbf55d7ffb4762e87.yml +26 -0
- data/spec/cassettes/da629b01e0836deda8a5540a4e6a08783dd7aef9.yml +46 -0
- data/spec/cassettes/e398f35bea86b3d4c87a6934bae1eb7fca8744f9.yml +26 -0
- data/spec/logger_spec.rb +4 -0
- data/spec/models/base_spec.rb +194 -0
- data/spec/models/concerns/contactable_spec.rb +37 -0
- data/spec/models/concerns/identifiable_spec.rb +25 -0
- data/spec/models/concerns/linkable_spec.rb +25 -0
- data/spec/models/concerns/nameable_spec.rb +25 -0
- data/spec/models/concerns/sourceable_spec.rb +25 -0
- data/spec/models/concerns/timestamps_spec.rb +32 -0
- data/spec/models/contact_detail_list_spec.rb +44 -0
- data/spec/models/membership_spec.rb +30 -0
- data/spec/models/organization_spec.rb +24 -0
- data/spec/models/person_spec.rb +24 -0
- data/spec/models/post_spec.rb +19 -0
- data/spec/processor/client_spec.rb +4 -0
- data/spec/processor/dependency_graph_spec.rb +4 -0
- data/spec/processor/helper_spec.rb +4 -0
- data/spec/processor/middleware/logger_spec.rb +87 -0
- data/spec/processor/middleware/parse_html_spec.rb +92 -0
- data/spec/processor/persistence_spec.rb +41 -0
- data/spec/processor/yielder_spec.rb +55 -0
- data/spec/processor_spec.rb +268 -0
- data/spec/runner_spec.rb +85 -0
- data/spec/spec_helper.rb +17 -0
- metadata +342 -0
@@ -0,0 +1,40 @@
|
|
1
|
+
module Pupa
  # A group with a common purpose or reason for existence that goes beyond the set
  # of people belonging to it.
  class Organization < Base
    self.schema = 'popolo/organization'

    include Concerns::Timestamps
    include Concerns::Sourceable
    include Concerns::Nameable
    include Concerns::Identifiable
    include Concerns::Contactable
    include Concerns::Linkable

    attr_accessor :name, :classification, :parent_id, :parent, :founding_date,
      :dissolution_date, :image

    foreign_key :parent_id

    foreign_object :parent

    # Returns the name of the organization.
    #
    # @return [String] the name of the organization
    def to_s
      name
    end

    # Returns the criteria used to recognize this organization: the
    # `classification` and `parent_id` entries of the superclass fingerprint,
    # combined with this organization's name matched against either `name` or
    # `other_names.name`.
    #
    # @return [Hash] an `$or` selector with one alternative per name field
    # @todo Parentless organizations in different jurisdictions can have the
    #   same name. Add a `jurisdiction` property?
    def fingerprint
      # Only these two properties of the inherited fingerprint scope the match.
      criteria = super.slice(:classification, :parent_id)
      {
        '$or' => [
          criteria.merge('name' => name),
          criteria.merge('other_names.name' => name),
        ],
      }
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Pupa
  # A real person, alive or dead.
  class Person < Base
    self.schema = 'popolo/person'

    include Concerns::Timestamps
    include Concerns::Sourceable
    include Concerns::Nameable
    include Concerns::Identifiable
    include Concerns::Contactable
    include Concerns::Linkable

    attr_accessor :name, :family_name, :given_name, :additional_name,
      :honorific_prefix, :honorific_suffix, :patronymic_name, :sort_name,
      :email, :gender, :birth_date, :death_date, :image, :summary, :biography

    # Returns the person's name.
    #
    # @return [String] the person's name
    def to_s
      name
    end

    # Returns the criteria used to recognize this person: the person's name
    # matched against either `name` or `other_names.name`.
    #
    # @return [Hash] an `$or` selector with one alternative per name field
    # @todo This will obviously need to be scoped as in Python Pupa, to a
    #   jurisdiction, post, etc.
    def fingerprint
      {
        '$or' => [
          {'name' => name},
          {'other_names.name' => name},
        ],
      }
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Pupa
  # A position that exists independent of the person holding it.
  class Post < Base
    self.schema = 'popolo/post'

    include Concerns::Timestamps
    include Concerns::Sourceable
    include Concerns::Contactable
    include Concerns::Linkable

    attr_accessor :label, :role, :organization_id, :start_date, :end_date

    foreign_key :organization_id

    # Returns the post's label and organization ID.
    #
    # @return [String] the post's label and organization ID
    def to_s
      "#{label} in #{organization_id}"
    end

    # A post should have a unique label within an organization, though it may
    # share a label with a historical post. The criteria are therefore limited
    # to the `label`, `organization_id` and `end_date` entries of the
    # superclass fingerprint.
    #
    # @return [Hash] the subset of the inherited fingerprint that identifies
    #   the post
    def fingerprint
      super.slice(:label, :organization_id, :end_date)
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'active_support/cache'
|
2
|
+
require 'faraday_middleware'
|
3
|
+
require 'faraday_middleware/response_middleware'
|
4
|
+
|
5
|
+
require 'pupa/processor/middleware/logger'
|
6
|
+
require 'pupa/processor/middleware/parse_html'
|
7
|
+
require 'pupa/refinements/faraday_middleware'
|
8
|
+
|
9
|
+
using Pupa::Refinements::FaradayMiddleware
|
10
|
+
|
11
|
+
module Pupa
  class Processor
    # An HTTP client factory.
    class Client
      # Returns a configured Faraday HTTP client.
      #
      # The connection logs requests, parses HTML, JSON and XML response
      # bodies according to their content type and, only when `cache_dir` is
      # given, caches responses in a file store.
      #
      # @param [String] cache_dir a directory in which to cache requests
      # @param [Integer] expires_in the cache's expiration time in seconds
      # @param [String] level the log level
      # @return [Faraday::Connection] a configured Faraday HTTP client
      def self.new(cache_dir: nil, expires_in: 86400, level: 'INFO') # 1 day
        Faraday.new do |connection|
          connection.request :url_encoded
          connection.use Middleware::Logger, Logger.new('faraday', level: level)
          # @see http://tools.ietf.org/html/rfc2854
          # @see http://tools.ietf.org/html/rfc3236
          connection.use Middleware::ParseHtml, content_type: %w(text/html application/xhtml+xml)
          # @see http://tools.ietf.org/html/rfc4627
          connection.use FaradayMiddleware::ParseJson, content_type: /\bjson$/
          # @see http://tools.ietf.org/html/rfc3023
          connection.use FaradayMiddleware::ParseXml, content_type: /\bxml$/
          # Caching is opt-in: no cache directory means no caching middleware.
          if cache_dir
            connection.response :caching do
              ActiveSupport::Cache::FileStore.new(cache_dir, expires_in: expires_in)
            end
          end
          connection.adapter Faraday.default_adapter # must be last
        end
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'tsort'
|
2
|
+
|
3
|
+
module Pupa
  class Processor
    # A simple implementation of a dependency graph.
    #
    # Each key is a node and its value is the collection of nodes it depends
    # on; `TSort` supplies the topological sorting methods.
    #
    # @see http://ruby-doc.org/stdlib-2.0.0/libdoc/tsort/rdoc/TSort.html
    class DependencyGraph < Hash
      include TSort

      # The graph's nodes are the hash's keys.
      alias tsort_each_node each_key

      # Yields each dependency of the given node.
      #
      # @param node a key of the graph
      # @raise [KeyError] if the node is not a key of the graph
      def tsort_each_child(node, &block)
        fetch(node).each(&block)
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Pupa
  class Processor
    # Processor helper methods.
    module Helper
      # Normalizes all whitespace to spaces, removes consecutive spaces, and
      # strips leading and ending spaces.
      #
      # @param [String] string a string
      # @return [String] a clean string
      def clean(string)
        string.gsub(/[[:space:]]/, ' ').squeeze(' ').strip
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Pupa
  class Processor
    module Middleware
      # Customizes the Faraday default logger.
      class Logger < FaradayMiddleware::ResponseMiddleware
        extend Forwardable

        # @param app the Faraday app, handed on to the superclass
        # @param logger the logger that receives the messages; when omitted, a
        #   standard library logger writing to STDOUT is used
        def initialize(app, logger = nil)
          super(app)
          @logger = logger || begin
            # Loaded lazily: only needed when no logger was supplied.
            require 'logger'
            ::Logger.new(STDOUT)
          end
        end

        def_delegators :@logger, :debug, :info, :warn, :error, :fatal

        # Logs the request's method, URL and body at the info level and its
        # headers at the debug level, then continues with the superclass.
        #
        # @param env the request environment
        def call(env)
          info "#{env[:method]} #{env[:url]} #{env[:body]}" # XXX add POST body
          debug('request') { dump_headers env[:request_headers] }
          super
        end

        # Logs the response's status and headers at the debug level.
        #
        # @param env the response environment
        def on_complete(env)
          debug('Status') { env[:status].to_s } # XXX switch from info
          debug('response') { dump_headers env[:response_headers] }
        end

        private

        # Formats headers as one `name: value` line per header.
        #
        # @param headers the headers to format
        # @return [String] the formatted headers
        def dump_headers(headers)
          headers.map { |k, v| "#{k}: #{v.inspect}" }.join("\n")
        end
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Pupa
  class Processor
    module Middleware
      # A Faraday response middleware for parsing HTML.
      #
      # @see https://github.com/lostisland/faraday_middleware/pull/18
      class ParseHtml < FaradayMiddleware::ResponseMiddleware
        dependency 'nokogiri'

        # An empty body is left unparsed: the parser yields nil for it.
        define_parser { |body|
          Nokogiri::HTML(body) unless body.empty?
        }
      end
    end
  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
module Pupa
  class Processor
    # A proxy class to save plain old Ruby objects to MongoDB.
    class Persistence
      # @param [Object] object an object
      def initialize(object)
        @object = object
      end

      # Finds a document matching the selection criteria.
      #
      # The selection criteria *must* set a `_type` key in order to determine
      # the collection to query.
      #
      # @param [Hash] selector the selection criteria
      # @return [Hash,nil] the matched document, or nil
      # @raise [Pupa::Errors::TooManyMatches] if multiple documents are found
      def self.find(selector)
        collection_name = collection_name_from_class_name(selector[:_type].camelize)
        query = Pupa.session[collection_name].find(selector)
        case query.count
        when 0
          nil
        when 1
          query.first
        else
          raise Errors::TooManyMatches, "selector matches multiple documents during find: #{collection_name} #{JSON.dump(selector)}"
        end
      end

      # Returns the name of the collection corresponding to a class name.
      #
      # Defined above `private` on purpose: `private` does not affect methods
      # defined with `def self.`, and `#collection_name` calls this method with
      # an explicit receiver.
      #
      # @param [String] class_name a class name, optionally namespaced
      # @return [Symbol] the demodulized, underscored and pluralized class name
      def self.collection_name_from_class_name(class_name)
        class_name.demodulize.underscore.pluralize.to_sym
      end

      # Saves an object to MongoDB.
      #
      # Inserts the object if no document matches its fingerprint, and updates
      # the matching document if exactly one does.
      #
      # @return [String] the object's database ID
      # @raise [Pupa::Errors::TooManyMatches] if multiple documents would be updated
      def save
        # Build the selector before running callbacks to avoid e.g. timestamps
        # in the selector.
        selector = @object.fingerprint
        query = collection.find(selector)

        @object.run_callbacks(:save) do
          case query.count
          when 0
            @object.run_callbacks(:create) do
              collection.insert(@object.to_h)
              @object._id.to_s
            end
          when 1
            query.update(@object.to_h)
            query.first['_id'].to_s
          else
            raise Errors::TooManyMatches, "selector matches multiple documents during save: #{collection_name} #{JSON.dump(selector)}"
          end
        end
      end

      private

      # Returns the name of the collection in which to save the object.
      #
      # @return [Symbol] the name of the collection in which to save the object
      def collection_name
        self.class.collection_name_from_class_name(@object.class.to_s)
      end

      # Returns the collection in which to save the object.
      #
      # @return [Moped::Collection] the collection in which to save the object
      def collection
        Pupa.session[collection_name]
      end
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# Using fibers instead of enumerators leads to less coupling in the processor.
|
2
|
+
# @see https://practicingruby.com/articles/building-enumerable-and-enumerator?u=dc2ab0f9bb
|
3
|
+
require 'fiber'
|
4
|
+
|
5
|
+
module Pupa
  class Processor
    # A lazy enumerator.
    class Yielder
      # The given block should yield objects to add to the enumerator. The
      # block runs inside a fiber, so each object must be handed over with
      # `Fiber.yield`; the block's own return value is never enumerated.
      def initialize
        @fiber = Fiber.new do
          yield
          # Signal the end of the enumeration once the block has finished.
          raise StopIteration
        end
      end

      # Yields each object in the enumerator to the given block.
      #
      # @return [Enumerator] a lazy enumerator, if no block is given
      def each
        if block_given?
          # `loop` ends quietly when `next` raises `StopIteration`.
          loop do
            yield self.next
          end
        else
          to_enum
        end
      end

      # Returns the next object in the enumerator, and moves the internal position
      # forward. When the position reaches the end, `StopIteration` is raised.
      #
      # @return the next object
      # @raise [StopIteration] if there are no more objects
      def next
        if @fiber.alive?
          @fiber.resume
        else
          raise StopIteration
        end
      end

      # Returns a lazy enumerator.
      #
      # @return [Enumerator] a lazy enumerator
      def to_enum
        Enumerator.new do |y|
          loop do
            y << self.next
          end
        end
      end
    end
  end
end
|