pupa 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +6 -0
- data/.travis.yml +5 -0
- data/.yardopts +4 -0
- data/Gemfile +4 -0
- data/LICENSE +20 -0
- data/README.md +52 -0
- data/Rakefile +37 -0
- data/USAGE +1 -0
- data/lib/pupa/errors.rb +30 -0
- data/lib/pupa/logger.rb +37 -0
- data/lib/pupa/models/base.rb +190 -0
- data/lib/pupa/models/concerns/contactable.rb +34 -0
- data/lib/pupa/models/concerns/identifiable.rb +26 -0
- data/lib/pupa/models/concerns/linkable.rb +26 -0
- data/lib/pupa/models/concerns/nameable.rb +34 -0
- data/lib/pupa/models/concerns/sourceable.rb +26 -0
- data/lib/pupa/models/concerns/timestamps.rb +22 -0
- data/lib/pupa/models/contact_detail_list.rb +28 -0
- data/lib/pupa/models/membership.rb +37 -0
- data/lib/pupa/models/organization.rb +40 -0
- data/lib/pupa/models/person.rb +35 -0
- data/lib/pupa/models/post.rb +28 -0
- data/lib/pupa/processor/client.rb +42 -0
- data/lib/pupa/processor/dependency_graph.rb +18 -0
- data/lib/pupa/processor/helper.rb +15 -0
- data/lib/pupa/processor/middleware/logger.rb +37 -0
- data/lib/pupa/processor/middleware/parse_html.rb +16 -0
- data/lib/pupa/processor/persistence.rb +80 -0
- data/lib/pupa/processor/yielder.rb +50 -0
- data/lib/pupa/processor.rb +351 -0
- data/lib/pupa/refinements/faraday_middleware.rb +32 -0
- data/lib/pupa/refinements/json-schema.rb +36 -0
- data/lib/pupa/runner.rb +185 -0
- data/lib/pupa/version.rb +3 -0
- data/lib/pupa.rb +31 -0
- data/pupa.gemspec +34 -0
- data/schemas/popolo/contact_detail.json +44 -0
- data/schemas/popolo/identifier.json +18 -0
- data/schemas/popolo/link.json +19 -0
- data/schemas/popolo/membership.json +86 -0
- data/schemas/popolo/organization.json +104 -0
- data/schemas/popolo/other_name.json +28 -0
- data/schemas/popolo/person.json +130 -0
- data/schemas/popolo/post.json +78 -0
- data/spec/cassettes/31ac91ccad069eefc07d96cfbe66fa66c1b41fcf.yml +56 -0
- data/spec/cassettes/4ff54d737afb5d693653752d7bf234a405a80172.yml +48 -0
- data/spec/cassettes/898049a22e6ca51dfa2510d9e0e0207a5c396524.yml +54 -0
- data/spec/cassettes/ce69ff734ce852d2bfaa482bbf55d7ffb4762e87.yml +26 -0
- data/spec/cassettes/da629b01e0836deda8a5540a4e6a08783dd7aef9.yml +46 -0
- data/spec/cassettes/e398f35bea86b3d4c87a6934bae1eb7fca8744f9.yml +26 -0
- data/spec/logger_spec.rb +4 -0
- data/spec/models/base_spec.rb +194 -0
- data/spec/models/concerns/contactable_spec.rb +37 -0
- data/spec/models/concerns/identifiable_spec.rb +25 -0
- data/spec/models/concerns/linkable_spec.rb +25 -0
- data/spec/models/concerns/nameable_spec.rb +25 -0
- data/spec/models/concerns/sourceable_spec.rb +25 -0
- data/spec/models/concerns/timestamps_spec.rb +32 -0
- data/spec/models/contact_detail_list_spec.rb +44 -0
- data/spec/models/membership_spec.rb +30 -0
- data/spec/models/organization_spec.rb +24 -0
- data/spec/models/person_spec.rb +24 -0
- data/spec/models/post_spec.rb +19 -0
- data/spec/processor/client_spec.rb +4 -0
- data/spec/processor/dependency_graph_spec.rb +4 -0
- data/spec/processor/helper_spec.rb +4 -0
- data/spec/processor/middleware/logger_spec.rb +87 -0
- data/spec/processor/middleware/parse_html_spec.rb +92 -0
- data/spec/processor/persistence_spec.rb +41 -0
- data/spec/processor/yielder_spec.rb +55 -0
- data/spec/processor_spec.rb +268 -0
- data/spec/runner_spec.rb +85 -0
- data/spec/spec_helper.rb +17 -0
- metadata +342 -0
@@ -0,0 +1,40 @@
|
|
1
|
+
module Pupa
  # A group with a common purpose or reason for existence that goes beyond the
  # set of people belonging to it.
  class Organization < Base
    self.schema = 'popolo/organization'

    include Concerns::Timestamps
    include Concerns::Sourceable
    include Concerns::Nameable
    include Concerns::Identifiable
    include Concerns::Contactable
    include Concerns::Linkable

    attr_accessor :name,
                  :classification,
                  :parent_id,
                  :parent,
                  :founding_date,
                  :dissolution_date,
                  :image

    foreign_key :parent_id

    foreign_object :parent

    # Uses the organization's name as its string representation.
    #
    # @return [String] the name of the organization
    def to_s
      name
    end

    # Builds a selector that keeps the `:classification` and `:parent_id`
    # entries of the inherited fingerprint and matches the organization's name
    # against either the `name` field or the `other_names.name` field.
    #
    # @todo Parentless organizations in different jurisdictions can have the
    #   same name. Add a `jurisdiction` property?
    # @return [Hash] a selector with a single `$or` clause
    def fingerprint
      scope = super.slice(:classification, :parent_id)
      alternatives = ['name', 'other_names.name'].map do |field|
        scope.merge(field => name)
      end
      {'$or' => alternatives}
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Pupa
  # A real person, alive or dead.
  class Person < Base
    self.schema = 'popolo/person'

    include Concerns::Timestamps
    include Concerns::Sourceable
    include Concerns::Nameable
    include Concerns::Identifiable
    include Concerns::Contactable
    include Concerns::Linkable

    # Name parts.
    attr_accessor :name, :family_name, :given_name, :additional_name,
                  :honorific_prefix, :honorific_suffix, :patronymic_name,
                  :sort_name
    # Other descriptive properties.
    attr_accessor :email, :gender, :birth_date, :death_date, :image, :summary,
                  :biography

    # Uses the person's name as their string representation.
    #
    # @return [String] the person's name
    def to_s
      name
    end

    # Builds a selector that matches the person's name against either the
    # `name` field or the `other_names.name` field.
    #
    # @todo This will obviously need to be scoped as in Python Pupa, to a
    #   jurisdiction, post, etc.
    # @return [Hash] a selector with a single `$or` clause
    def fingerprint
      alternatives = ['name', 'other_names.name'].map do |field|
        {field => name}
      end
      {'$or' => alternatives}
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Pupa
  # A position that exists independent of the person holding it.
  class Post < Base
    self.schema = 'popolo/post'

    include Concerns::Timestamps
    include Concerns::Sourceable
    include Concerns::Contactable
    include Concerns::Linkable

    attr_accessor :label,
                  :role,
                  :organization_id,
                  :start_date,
                  :end_date

    foreign_key :organization_id

    # Describes the post by its label and the ID of its organization.
    #
    # @return [String] the post's label and organization ID
    def to_s
      "#{label} in #{organization_id}"
    end

    # A post should have a unique label within an organization, though it may
    # share a label with a historical post; the end date is therefore part of
    # the fingerprint alongside the label and organization ID.
    #
    # @return [Hash] the `:label`, `:organization_id` and `:end_date` entries
    #   of the inherited fingerprint
    def fingerprint
      identifying_keys = [:label, :organization_id, :end_date]
      super.slice(*identifying_keys)
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'active_support/cache'
|
2
|
+
require 'faraday_middleware'
|
3
|
+
require 'faraday_middleware/response_middleware'
|
4
|
+
|
5
|
+
require 'pupa/processor/middleware/logger'
|
6
|
+
require 'pupa/processor/middleware/parse_html'
|
7
|
+
require 'pupa/refinements/faraday_middleware'
|
8
|
+
|
9
|
+
using Pupa::Refinements::FaradayMiddleware
|
10
|
+
|
11
|
+
module Pupa
  class Processor
    # An HTTP client factory.
    class Client
      # Builds a Faraday connection with logging, HTML/JSON/XML response
      # parsing and, when a cache directory is given, file-based caching.
      #
      # Note that this overrides `new`, so no `Client` instance is created.
      #
      # @param [String] cache_dir a directory in which to cache requests
      # @param [Integer] expires_in the cache's expiration time in seconds
      # @param [String] level the log level
      # @return [Faraday::Connection] a configured Faraday HTTP client
      def self.new(cache_dir: nil, expires_in: 86400, level: 'INFO') # 1 day
        Faraday.new do |builder|
          builder.request :url_encoded
          builder.use Middleware::Logger, Logger.new('faraday', level: level)

          # @see http://tools.ietf.org/html/rfc2854
          # @see http://tools.ietf.org/html/rfc3236
          builder.use Middleware::ParseHtml, content_type: %w(text/html application/xhtml+xml)

          # @see http://tools.ietf.org/html/rfc4627
          builder.use FaradayMiddleware::ParseJson, content_type: /\bjson$/

          # @see http://tools.ietf.org/html/rfc3023
          builder.use FaradayMiddleware::ParseXml, content_type: /\bxml$/

          # Caching is opt-in: it is only registered when a directory is given.
          if cache_dir
            builder.response(:caching) do
              ActiveSupport::Cache::FileStore.new(cache_dir, expires_in: expires_in)
            end
          end

          builder.adapter Faraday.default_adapter # must be last
        end
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'tsort'
|
2
|
+
|
3
|
+
module Pupa
  class Processor
    # A simple implementation of a dependency graph: a hash whose keys are
    # nodes and whose values are the collections of nodes they depend on.
    #
    # @see http://ruby-doc.org/stdlib-2.0.0/libdoc/tsort/rdoc/TSort.html
    class DependencyGraph < Hash
      include TSort

      # TSort hook: the graph's nodes are the hash's keys.
      def tsort_each_node(&block)
        each_key(&block)
      end

      # TSort hook: a node's children are the entries stored under its key.
      # `fetch` is used, so a node that is not a key raises `KeyError`.
      def tsort_each_child(node, &block)
        dependencies = fetch(node)
        dependencies.each(&block)
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Pupa
  class Processor
    # Processor helper methods.
    module Helper
      # Replaces every whitespace character with a space, collapses runs of
      # spaces into one, and strips leading and trailing whitespace.
      #
      # @param [String] string a string
      # @return [String] a clean string
      def clean(string)
        spaced = string.gsub(/[[:space:]]/, ' ')
        collapsed = spaced.squeeze(' ')
        collapsed.strip
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Pupa
  class Processor
    module Middleware
      # Customizes the Faraday default logger.
      class Logger < FaradayMiddleware::ResponseMiddleware
        extend Forwardable

        def_delegators :@logger, :debug, :info, :warn, :error, :fatal

        # @param app the next middleware in the stack
        # @param logger the logger to write to; when omitted, a standard
        #   library logger writing to STDOUT is created
        def initialize(app, logger = nil)
          super(app)
          @logger = logger
          return if @logger

          # Only load the standard library logger when it is actually needed.
          require 'logger'
          @logger = ::Logger.new(STDOUT)
        end

        # Logs the request line at INFO level and the request headers at DEBUG
        # level, then hands the request to the rest of the stack.
        def call(env)
          info "#{env[:method]} #{env[:url]} #{env[:body]}" # XXX add POST body
          debug('request') { dump_headers env[:request_headers] }
          super
        end

        # Logs the response status and the response headers at DEBUG level.
        def on_complete(env)
          debug('Status') { env[:status].to_s } # XXX switch from info
          debug('response') { dump_headers env[:response_headers] }
        end

        private

        # Formats headers as one `name: value` line per header, with each
        # value rendered via `inspect`.
        def dump_headers(headers)
          lines = headers.map do |field, value|
            "#{field}: #{value.inspect}"
          end
          lines.join("\n")
        end
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Pupa
  class Processor
    module Middleware
      # A Faraday response middleware for parsing HTML.
      #
      # @see https://github.com/lostisland/faraday_middleware/pull/18
      class ParseHtml < FaradayMiddleware::ResponseMiddleware
        dependency 'nokogiri'

        # An empty body yields nil; any other body is parsed with Nokogiri.
        define_parser do |body|
          body.empty? ? nil : Nokogiri::HTML(body)
        end
      end
    end
  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
module Pupa
  class Processor
    # A proxy class to save plain old Ruby objects to MongoDB.
    class Persistence
      # @param [Object] object an object
      def initialize(object)
        @object = object
      end

      # Finds a document matching the selection criteria.
      #
      # The selection criteria *must* set a `_type` key in order to determine
      # the collection to query.
      #
      # @param [Hash] selector the selection criteria
      # @return [Hash,nil] the matched document, or nil
      # @raise [Pupa::Errors::TooManyMatches] if multiple documents are found
      def self.find(selector)
        collection_name = collection_name_from_class_name(selector[:_type].camelize)
        query = Pupa.session[collection_name].find(selector)
        case query.count
        when 0
          nil
        when 1
          query.first
        else
          raise Errors::TooManyMatches, "selector matches multiple documents during find: #{collection_name} #{JSON.dump(selector)}"
        end
      end

      # Derives a collection name from a class name by demodulizing,
      # underscoring and pluralizing it.
      #
      # This method is deliberately defined above `private`: a bare `private`
      # does not affect `def self.` methods, and `#collection_name` calls this
      # method with an explicit receiver, so it has to remain public.
      #
      # @param [String] class_name a class name
      # @return [Symbol] the name of the collection for that class
      def self.collection_name_from_class_name(class_name)
        class_name.demodulize.underscore.pluralize.to_sym
      end

      # Saves an object to MongoDB: inserts it when no document matches the
      # object's fingerprint, and updates the single matching document
      # otherwise.
      #
      # @return [String] the object's database ID
      # @raise [Pupa::Errors::TooManyMatches] if multiple documents would be updated
      def save
        # Compute the selector before the callbacks run to avoid e.g.
        # timestamps in the selector.
        selector = @object.fingerprint
        query = collection.find(selector)

        @object.run_callbacks(:save) do
          case query.count
          when 0
            @object.run_callbacks(:create) do
              collection.insert(@object.to_h)
              @object._id.to_s
            end
          when 1
            query.update(@object.to_h)
            query.first['_id'].to_s
          else
            raise Errors::TooManyMatches, "selector matches multiple documents during save: #{collection_name} #{JSON.dump(selector)}"
          end
        end
      end

      private

      # Returns the name of the collection in which to save the object.
      #
      # @return [Symbol] the name of the collection in which to save the object
      def collection_name
        self.class.collection_name_from_class_name(@object.class.to_s)
      end

      # Returns the collection in which to save the object.
      #
      # @return [Moped::Collection] the collection in which to save the object
      def collection
        Pupa.session[collection_name]
      end
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# Using fibers instead of enumerators leads to less coupling in the processor.
|
2
|
+
# @see https://practicingruby.com/articles/building-enumerable-and-enumerator?u=dc2ab0f9bb
|
3
|
+
require 'fiber'
|
4
|
+
|
5
|
+
module Pupa
  class Processor
    # A lazy enumerator.
    class Yielder
      # Wraps the given block in a fiber. The block should hand over each
      # object with `Fiber.yield`; once the block returns, `StopIteration` is
      # raised to signal the end of the sequence.
      def initialize
        @fiber = Fiber.new do
          yield
          raise StopIteration
        end
      end

      # Yields each object in the enumerator to the given block. Without a
      # block, returns the enumerator built by {#to_enum}.
      def each
        return to_enum unless block_given?

        # `loop` ends quietly once `next` raises `StopIteration`.
        loop { yield self.next }
      end

      # Returns the next object in the enumerator, and moves the internal
      # position forward. When the position reaches the end, `StopIteration`
      # is raised.
      def next
        raise StopIteration unless @fiber.alive?

        @fiber.resume
      end

      # Returns a lazy enumerator.
      #
      # @return [Enumerator] a lazy enumerator
      def to_enum
        Enumerator.new do |sink|
          loop { sink << self.next }
        end
      end
    end
  end
end
|