pupa 0.0.13 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +1 -0
  3. data/README.md +17 -0
  4. data/lib/pupa.rb +5 -8
  5. data/lib/pupa/errors.rb +4 -0
  6. data/lib/pupa/models/model.rb +5 -5
  7. data/lib/pupa/models/organization.rb +10 -6
  8. data/lib/pupa/models/person.rb +10 -6
  9. data/lib/pupa/processor.rb +14 -15
  10. data/lib/pupa/processor/connection.rb +26 -0
  11. data/lib/pupa/processor/connection_adapters/mongodb_adapter.rb +92 -0
  12. data/lib/pupa/processor/connection_adapters/postgresql_adapter.rb +116 -0
  13. data/lib/pupa/processor/document_store.rb +3 -0
  14. data/lib/pupa/processor/middleware/raise_error.rb +1 -0
  15. data/lib/pupa/refinements/faraday_middleware.rb +1 -1
  16. data/lib/pupa/runner.rb +14 -21
  17. data/lib/pupa/version.rb +1 -1
  18. data/pupa.gemspec +3 -2
  19. data/schemas/popolo/contact_detail.json +10 -0
  20. data/schemas/popolo/membership.json +29 -9
  21. data/schemas/popolo/organization.json +9 -2
  22. data/schemas/popolo/other_name.json +24 -0
  23. data/schemas/popolo/person.json +6 -3
  24. data/schemas/popolo/post.json +16 -2
  25. data/spec/models/model_spec.rb +1 -1
  26. data/spec/processor/connection_adapters/mongodb_adapter_spec.rb +61 -0
  27. data/spec/processor/connection_adapters/postgresql_adapter_spec.rb +70 -0
  28. data/spec/processor/connection_spec.rb +15 -0
  29. data/spec/processor/middleware/parse_json_spec.rb +90 -0
  30. data/spec/processor_spec.rb +9 -10
  31. data/spec/spec_helper.rb +0 -10
  32. metadata +83 -75
  33. data/lib/pupa/processor/persistence.rb +0 -85
  34. data/spec/cassettes/31ac91ccad069eefc07d96cfbe66fa66c1b41fcf.yml +0 -56
  35. data/spec/cassettes/4ff54d737afb5d693653752d7bf234a405a80172.yml +0 -48
  36. data/spec/cassettes/898049a22e6ca51dfa2510d9e0e0207a5c396524.yml +0 -54
  37. data/spec/cassettes/ce69ff734ce852d2bfaa482bbf55d7ffb4762e87.yml +0 -26
  38. data/spec/cassettes/da629b01e0836deda8a5540a4e6a08783dd7aef9.yml +0 -46
  39. data/spec/cassettes/e398f35bea86b3d4c87a6934bae1eb7fca8744f9.yml +0 -26
  40. data/spec/cassettes/f861172f1df3bdb2052af5451f9922699d574b77.yml +0 -62
  41. data/spec/processor/persistence_spec.rb +0 -51
@@ -1,3 +1,6 @@
1
+ require 'pupa/processor/document_store/file_store'
2
+ require 'pupa/processor/document_store/redis_store'
3
+
1
4
  module Pupa
2
5
  class Processor
3
6
  # An JSON document store factory.
@@ -4,6 +4,7 @@ module Pupa
4
4
  # A Faraday response middleware for raising an error if unsuccessful.
5
5
  #
6
6
  # @see Faraday::Response::RaiseError
7
+ # @note Faraday has no tests for this middleware.
7
8
  class RaiseError < Faraday::Response::Middleware
8
9
  def on_complete(env)
9
10
  case env[:status]
@@ -17,7 +17,7 @@ class FaradayMiddleware::Caching
17
17
  if url.query && params_to_ignore.any?
18
18
  params = parse_query url.query
19
19
  params.reject! {|k,| params_to_ignore.include? k }
20
- url.query = build_query params
20
+ url.query = params.any? ? build_query(params) : nil
21
21
  end
22
22
  url.normalize!
23
23
  url.request_uri + env[:body].to_s # XXX add for POST requests
data/lib/pupa/runner.rb CHANGED
@@ -1,8 +1,6 @@
1
1
  require 'optparse'
2
2
  require 'ostruct'
3
3
 
4
- require 'moped'
5
-
6
4
  module Pupa
7
5
  class Runner
8
6
  attr_reader :options, :actions
@@ -16,14 +14,13 @@ module Pupa
16
14
  actions: [],
17
15
  tasks: [],
18
16
  output_dir: File.expand_path('scraped_data', Dir.pwd),
17
+ pipelined: false,
19
18
  cache_dir: File.expand_path('web_cache', Dir.pwd),
20
19
  expires_in: 86400, # 1 day
21
- pipelined: false,
20
+ database_url: 'mongodb://localhost:27017/pupa',
22
21
  validate: true,
23
- host_with_port: 'localhost:27017',
24
- database: 'pupa',
25
- dry_run: false,
26
22
  level: 'INFO',
23
+ dry_run: false,
27
24
  }.merge(defaults))
28
25
 
29
26
  @actions = {
@@ -76,27 +73,21 @@ module Pupa
76
73
  opts.on('-o', '--output_dir PATH', 'The directory or Redis address (e.g. redis://localhost:6379/0) in which to dump JSON documents') do |v|
77
74
  options.output_dir = v
78
75
  end
76
+ opts.on('--pipelined', 'Dump JSON documents all at once') do |v|
77
+ options.pipelined = v
78
+ end
79
79
  opts.on('-c', '--cache_dir PATH', 'The directory or Memcached address (e.g. memcached://localhost:11211) in which to cache HTTP requests') do |v|
80
80
  options.cache_dir = v
81
81
  end
82
82
  opts.on('-e', '--expires_in SECONDS', "The cache's expiration time in seconds") do |v|
83
83
  options.expires_in = v
84
84
  end
85
- opts.on('--pipelined', 'Dump JSON documents all at once') do |v|
86
- options.pipelined = v
85
+ opts.on('-d', '--database_url SCHEME://USERNAME:PASSWORD@HOST:PORT/DATABASE', 'The database URL') do |v|
86
+ options.database_url = v
87
87
  end
88
88
  opts.on('--[no-]validate', 'Validate JSON documents') do |v|
89
89
  options.validate = v
90
90
  end
91
- opts.on('-H', '--host HOST:PORT', 'The host and port to MongoDB') do |v|
92
- options.host_with_port = v
93
- end
94
- opts.on('-d', '--database NAME', 'The name of the MongoDB database') do |v|
95
- options.database = v
96
- end
97
- opts.on('-n', '--dry-run', 'Show the plan without running any actions') do
98
- options.dry_run = true
99
- end
100
91
  opts.on('-v', '--verbose', 'Show all messages') do
101
92
  options.level = 'DEBUG'
102
93
  end
@@ -106,6 +97,9 @@ module Pupa
106
97
  opts.on('-s', '--silent', 'Show no messages') do
107
98
  options.level = 'UNKNOWN'
108
99
  end
100
+ opts.on('-n', '--dry-run', 'Show the plan without running any actions') do
101
+ options.dry_run = true
102
+ end
109
103
 
110
104
  opts.separator ''
111
105
  opts.separator 'Common options:'
@@ -145,9 +139,10 @@ module Pupa
145
139
  end
146
140
 
147
141
  processor = @processor_class.new(options.output_dir,
142
+ pipelined: options.pipelined,
148
143
  cache_dir: options.cache_dir,
149
144
  expires_in: options.expires_in,
150
- pipelined: options.pipelined,
145
+ database_url: options.database_url,
151
146
  validate: options.validate,
152
147
  level: options.level,
153
148
  options: Hash[*rest])
@@ -165,7 +160,7 @@ module Pupa
165
160
  end
166
161
 
167
162
  if options.level == 'DEBUG'
168
- %w(output_dir cache_dir expires_in host_with_port database level).each do |option|
163
+ %w(output_dir pipelined cache_dir expires_in database_url validate level).each do |option|
169
164
  puts "#{option}: #{options[option]}"
170
165
  end
171
166
  unless rest.empty?
@@ -184,8 +179,6 @@ module Pupa
184
179
  start: Time.now.utc,
185
180
  }
186
181
 
187
- Pupa.session = Moped::Session.new([options.host_with_port], database: options.database)
188
-
189
182
  if options.actions.delete('scrape')
190
183
  processor.store.clear
191
184
  report[:scrape] = {}
data/lib/pupa/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Pupa
2
- VERSION = "0.0.13"
2
+ VERSION = "0.1.0"
3
3
  end
data/pupa.gemspec CHANGED
@@ -21,8 +21,10 @@ Gem::Specification.new do |s|
21
21
  s.add_runtime_dependency('faraday_middleware', '~> 0.9.0')
22
22
  s.add_runtime_dependency('json-schema', '~> 2.1.3')
23
23
  s.add_runtime_dependency('mail')
24
- s.add_runtime_dependency('moped', '~> 1.5.1')
24
+ s.add_runtime_dependency('moped', '~> 2.0.0.rc1')
25
25
  s.add_runtime_dependency('oj', '~> 2.1')
26
+ s.add_runtime_dependency('sequel', '~> 4.10.0')
27
+ s.add_runtime_dependency('pg', '~> 0.17.0')
26
28
 
27
29
  s.add_development_dependency('coveralls')
28
30
  s.add_development_dependency('dalli')
@@ -34,5 +36,4 @@ Gem::Specification.new do |s|
34
36
  s.add_development_dependency('redis-store')
35
37
  s.add_development_dependency('rspec', '~> 2.10')
36
38
  s.add_development_dependency('typhoeus')
37
- s.add_development_dependency('vcr', '~> 2.5.0')
38
39
  end
@@ -23,6 +23,16 @@
23
23
  "description": "A note, e.g. for grouping contact details by physical location",
24
24
  "type": ["string", "null"]
25
25
  },
26
+ "valid_from": {
27
+ "description": "The date from which the contact detail is valid",
28
+ "type": ["string", "null"],
29
+ "pattern": "^[0-9]{4}(-[0-9]{2}){0,2}$"
30
+ },
31
+ "valid_through": {
32
+ "description": "The date from which the contact detail is no longer valid",
33
+ "type": ["string", "null"],
34
+ "pattern": "^[0-9]{4}(-[0-9]{2}){0,2}$"
35
+ },
26
36
  "created_at": {
27
37
  "description": "The time at which the resource was created",
28
38
  "type": ["string", "null"],
@@ -2,7 +2,7 @@
2
2
  "$schema": "http://json-schema.org/draft-03/schema#",
3
3
  "id": "http://popoloproject.com/schemas/membership.json#",
4
4
  "title": "Membership",
5
- "description": "A relationship between a person and an organization",
5
+ "description": "A relationship between a member and an organization",
6
6
  "type": "object",
7
7
  "properties": {
8
8
  "id": {
@@ -14,33 +14,53 @@
14
14
  "type": ["string", "null"]
15
15
  },
16
16
  "role": {
17
- "description": "The role that the person fulfills in the organization",
17
+ "description": "The role that the member fulfills in the organization",
18
18
  "type": ["string", "null"]
19
19
  },
20
+ "member": {
21
+ "description": "The person or organization that is a member of the organization",
22
+ "type": ["object"]
23
+ },
20
24
  "person_id": {
21
- "description": "The ID of the person who is a party to the relationship",
25
+ "description": "The ID of the person who is a member of the organization",
22
26
  "type": ["string", "null"]
23
27
  },
24
28
  "person": {
25
- "description": "The person who is a party to the relationship",
29
+ "description": "The person who is a member of the organization",
26
30
  "$ref": "http://popoloproject.com/schemas/person.json#"
27
31
  },
28
32
  "organization_id": {
29
- "description": "The ID of the organization that is a party to the relationship",
33
+ "description": "The ID of the organization in which the person or organization is a member",
30
34
  "type": ["string", "null"]
31
35
  },
32
36
  "organization": {
33
- "description": "The organization that is a party to the relationship",
37
+ "description": "The organization in which the person or organization is a member",
34
38
  "$ref": "http://popoloproject.com/schemas/organization.json#"
35
39
  },
36
40
  "post_id": {
37
- "description": "The ID of the post held by the person in the organization through this membership",
41
+ "description": "The ID of the post held by the member in the organization through this membership",
38
42
  "type": ["string", "null"]
39
43
  },
40
44
  "post": {
41
- "description": "The post held by the person in the organization through this membership",
45
+ "description": "The post held by the member in the organization through this membership",
42
46
  "$ref": "http://popoloproject.com/schemas/post.json#"
43
47
  },
48
+ "on_behalf_of_id": {
49
+ "description": "The ID of the organization on whose behalf the person is a member of the organization",
50
+ "type": ["string", "null"]
51
+ },
52
+ "on_behalf_of": {
53
+ "description": "The organization on whose behalf the person is a member of the organization",
54
+ "$ref": "http://popoloproject.com/schemas/organization.json#"
55
+ },
56
+ "area_id": {
57
+ "description": "The ID of the geographic area to which this membership is related",
58
+ "type": ["string", "null"]
59
+ },
60
+ "area": {
61
+ "description": "The geographic area to which this membership is related",
62
+ "$ref": "http://popoloproject.com/schemas/area.json#"
63
+ },
44
64
  "start_date": {
45
65
  "description": "The date on which the relationship began",
46
66
  "type": ["string", "null"],
@@ -52,7 +72,7 @@
52
72
  "pattern": "^[0-9]{4}(-[0-9]{2}){0,2}$"
53
73
  },
54
74
  "contact_details": {
55
- "description": "Means of contacting the person who is a party to the relationship",
75
+ "description": "Means of contacting the member of the organization",
56
76
  "type": "array",
57
77
  "items": {
58
78
  "$ref": "http://popoloproject.com/schemas/contact_detail.json#"
@@ -11,8 +11,7 @@
11
11
  },
12
12
  "name": {
13
13
  "description": "A primary name, e.g. a legally recognized name",
14
- "type": "string",
15
- "required": true
14
+ "type": ["string", "null"]
16
15
  },
17
16
  "other_names": {
18
17
  "description": "Alternate or former names",
@@ -40,6 +39,14 @@
40
39
  "description": "The organization that contains this organization",
41
40
  "$ref": "http://popoloproject.com/schemas/organization.json#"
42
41
  },
42
+ "area_id": {
43
+ "description": "The ID of the geographic area to which this organization is related",
44
+ "type": ["string", "null"]
45
+ },
46
+ "area": {
47
+ "description": "The geographic area to which this organization is related",
48
+ "$ref": "http://popoloproject.com/schemas/area.json#"
49
+ },
43
50
  "founding_date": {
44
51
  "description": "A date of founding",
45
52
  "type": ["string", "null"],
@@ -10,6 +10,30 @@
10
10
  "type": "string",
11
11
  "required": true
12
12
  },
13
+ "family_name": {
14
+ "description": "One or more family names",
15
+ "type": ["string", "null"]
16
+ },
17
+ "given_name": {
18
+ "description": "One or more primary given names",
19
+ "type": ["string", "null"]
20
+ },
21
+ "additional_name": {
22
+ "description": "One or more secondary given names",
23
+ "type": ["string", "null"]
24
+ },
25
+ "honorific_prefix": {
26
+ "description": "One or more honorifics preceding a person's name",
27
+ "type": ["string", "null"]
28
+ },
29
+ "honorific_suffix": {
30
+ "description": "One or more honorifics following a person's name",
31
+ "type": ["string", "null"]
32
+ },
33
+ "patronymic_name": {
34
+ "description": "One or more patronymic names",
35
+ "type": ["string", "null"]
36
+ },
13
37
  "start_date": {
14
38
  "description": "The date on which the name was adopted",
15
39
  "type": ["string", "null"],
@@ -11,8 +11,7 @@
11
11
  },
12
12
  "name": {
13
13
  "description": "A person's preferred full name",
14
- "type": "string",
15
- "required": true
14
+ "type": ["string", "null"]
16
15
  },
17
16
  "other_names": {
18
17
  "description": "Alternate or former names",
@@ -53,7 +52,7 @@
53
52
  "type": ["string", "null"]
54
53
  },
55
54
  "sort_name": {
56
- "description": "A name to use in an lexicographically ordered list",
55
+ "description": "A name to use in a lexicographically ordered list",
57
56
  "type": ["string", "null"]
58
57
  },
59
58
  "email": {
@@ -88,6 +87,10 @@
88
87
  "description": "An extended account of a person's life",
89
88
  "type": ["string", "null"]
90
89
  },
90
+ "national_identity": {
91
+ "description": "A national identity",
92
+ "type": ["string", "null"]
93
+ },
91
94
  "contact_details": {
92
95
  "description": "Means of contacting the person",
93
96
  "type": "array",
@@ -11,8 +11,14 @@
11
11
  },
12
12
  "label": {
13
13
  "description": "A label describing the post",
14
- "type": "string",
15
- "required": true
14
+ "type": ["string", "null"]
15
+ },
16
+ "other_label": {
17
+ "description": "An alternate label",
18
+ "type": "array",
19
+ "items": {
20
+ "type": "string"
21
+ }
16
22
  },
17
23
  "role": {
18
24
  "description": "The function that the holder of the post fulfills",
@@ -26,6 +32,14 @@
26
32
  "description": "The organization in which the post is held",
27
33
  "$ref": "http://popoloproject.com/schemas/organization.json#"
28
34
  },
35
+ "area_id": {
36
+ "description": "The ID of the geographic area to which this post is related",
37
+ "type": ["string", "null"]
38
+ },
39
+ "area": {
40
+ "description": "The geographic area to which this post is related",
41
+ "$ref": "http://popoloproject.com/schemas/area.json#"
42
+ },
29
43
  "start_date": {
30
44
  "description": "The date on which the post was created",
31
45
  "type": ["string", "null"],
@@ -146,7 +146,7 @@ describe Pupa::Model do
146
146
  end
147
147
 
148
148
  it 'should coerce the _id to a string' do
149
- object._id = Moped::BSON::ObjectId.new
149
+ object._id = BSON::ObjectId.new
150
150
  object._id.should be_a(String)
151
151
  end
152
152
  end
@@ -0,0 +1,61 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../../spec_helper')
2
+
3
+ describe Pupa::Processor::Connection::MongoDBAdapter do
4
+ def _type
5
+ if testing_python_compatibility?
6
+ 'person'
7
+ else
8
+ 'pupa/person'
9
+ end
10
+ end
11
+
12
+ def connection
13
+ Pupa::Processor::Connection::MongoDBAdapter.new('mongodb://localhost:27017/pupa_test')
14
+ end
15
+
16
+ before :all do
17
+ connection.raw_connection[:people].drop
18
+
19
+ connection.save(Pupa::Person.new(_id: 'existing', name: 'existing', email: 'existing@example.com'))
20
+ connection.raw_connection[:people].insert(_type: 'pupa/person', name: 'non-unique')
21
+ connection.raw_connection[:people].insert(_type: 'pupa/person', name: 'non-unique')
22
+ end
23
+
24
+ describe '.find' do
25
+ it 'should raise an error if selector is empty' do
26
+ expect{connection.find(_type: _type)}.to raise_error(Pupa::Errors::EmptySelectorError)
27
+ end
28
+
29
+ it 'should return nil if no matches' do
30
+ connection.find(_type: _type, name: 'nonexistent').should == nil
31
+ end
32
+
33
+ it 'should return a document if one match' do
34
+ connection.find(_type: _type, name: 'existing').should be_a(Hash)
35
+ end
36
+
37
+ it 'should raise an error if many matches' do
38
+ expect{connection.find(_type: 'pupa/person', name: 'non-unique')}.to raise_error(Pupa::Errors::TooManyMatches)
39
+ end
40
+ end
41
+
42
+ describe '.save' do
43
+ it 'should raise an error if selector is empty' do
44
+ expect{connection.save(Pupa::Person.new)}.to raise_error(Pupa::Errors::EmptySelectorError)
45
+ end
46
+
47
+ it 'should insert a document if no matches' do
48
+ connection.save(Pupa::Person.new(_id: 'new', name: 'new', email: 'new@example.com')).should == [true, 'new']
49
+ connection.find(_type: _type, name: 'new')['email'].should == 'new@example.com'
50
+ end
51
+
52
+ it 'should update a document if one match' do
53
+ connection.save(Pupa::Person.new(_id: 'changed', name: 'existing', email: 'changed@example.com')).should == [false, 'existing']
54
+ connection.find(_type: _type, name: 'existing')['email'].should == 'changed@example.com'
55
+ end
56
+
57
+ it 'should raise an error if many matches' do
58
+ expect{connection.save(Pupa::Person.new(name: 'non-unique'))}.to raise_error(Pupa::Errors::TooManyMatches)
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,70 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../../spec_helper')
2
+
3
+ describe Pupa::Processor::Connection::PostgreSQLAdapter do
4
+ def _type
5
+ if testing_python_compatibility?
6
+ 'person'
7
+ else
8
+ 'pupa/person'
9
+ end
10
+ end
11
+
12
+ def connection
13
+ Pupa::Processor::Connection::PostgreSQLAdapter.new('postgres://localhost:5432/pupa_test')
14
+ end
15
+
16
+ before :all do
17
+ connection.raw_connection.drop_table(:people)
18
+ connection.raw_connection.create_table(:people) do
19
+ primary_key :id
20
+ String :_id
21
+ String :_type
22
+ String :name
23
+ String :email
24
+ Time :created_at
25
+ Time :updated_at
26
+ end
27
+
28
+ connection.save(Pupa::Person.new(_id: 'existing', name: 'existing', email: 'existing@example.com'))
29
+ connection.raw_connection[:people].insert(_type: 'pupa/person', name: 'non-unique')
30
+ connection.raw_connection[:people].insert(_type: 'pupa/person', name: 'non-unique')
31
+ end
32
+
33
+ describe '.find' do
34
+ it 'should raise an error if selector is empty' do
35
+ expect{connection.find(_type: _type)}.to raise_error(Pupa::Errors::EmptySelectorError)
36
+ end
37
+
38
+ it 'should return nil if no matches' do
39
+ connection.find(_type: _type, name: 'nonexistent').should == nil
40
+ end
41
+
42
+ it 'should return a document if one match' do
43
+ connection.find(_type: _type, name: 'existing').should be_a(Hash)
44
+ end
45
+
46
+ it 'should raise an error if many matches' do
47
+ expect{connection.find(_type: 'pupa/person', name: 'non-unique')}.to raise_error(Pupa::Errors::TooManyMatches)
48
+ end
49
+ end
50
+
51
+ describe '.save' do
52
+ it 'should raise an error if selector is empty' do
53
+ expect{connection.save(Pupa::Person.new)}.to raise_error(Pupa::Errors::EmptySelectorError)
54
+ end
55
+
56
+ it 'should insert a document if no matches' do
57
+ connection.save(Pupa::Person.new(_id: 'new', name: 'new', email: 'new@example.com')).should == [true, 'new']
58
+ connection.find(_type: _type, name: 'new')['email'].should == 'new@example.com'
59
+ end
60
+
61
+ it 'should update a document if one match' do
62
+ connection.save(Pupa::Person.new(_id: 'changed', name: 'existing', email: 'changed@example.com')).should == [false, 'existing']
63
+ connection.find(_type: _type, name: 'existing')['email'].should == 'changed@example.com'
64
+ end
65
+
66
+ it 'should raise an error if many matches' do
67
+ expect{connection.save(Pupa::Person.new(name: 'non-unique'))}.to raise_error(Pupa::Errors::TooManyMatches)
68
+ end
69
+ end
70
+ end