pupa 0.0.13 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +1 -0
  3. data/README.md +17 -0
  4. data/lib/pupa.rb +5 -8
  5. data/lib/pupa/errors.rb +4 -0
  6. data/lib/pupa/models/model.rb +5 -5
  7. data/lib/pupa/models/organization.rb +10 -6
  8. data/lib/pupa/models/person.rb +10 -6
  9. data/lib/pupa/processor.rb +14 -15
  10. data/lib/pupa/processor/connection.rb +26 -0
  11. data/lib/pupa/processor/connection_adapters/mongodb_adapter.rb +92 -0
  12. data/lib/pupa/processor/connection_adapters/postgresql_adapter.rb +116 -0
  13. data/lib/pupa/processor/document_store.rb +3 -0
  14. data/lib/pupa/processor/middleware/raise_error.rb +1 -0
  15. data/lib/pupa/refinements/faraday_middleware.rb +1 -1
  16. data/lib/pupa/runner.rb +14 -21
  17. data/lib/pupa/version.rb +1 -1
  18. data/pupa.gemspec +3 -2
  19. data/schemas/popolo/contact_detail.json +10 -0
  20. data/schemas/popolo/membership.json +29 -9
  21. data/schemas/popolo/organization.json +9 -2
  22. data/schemas/popolo/other_name.json +24 -0
  23. data/schemas/popolo/person.json +6 -3
  24. data/schemas/popolo/post.json +16 -2
  25. data/spec/models/model_spec.rb +1 -1
  26. data/spec/processor/connection_adapters/mongodb_adapter_spec.rb +61 -0
  27. data/spec/processor/connection_adapters/postgresql_adapter_spec.rb +70 -0
  28. data/spec/processor/connection_spec.rb +15 -0
  29. data/spec/processor/middleware/parse_json_spec.rb +90 -0
  30. data/spec/processor_spec.rb +9 -10
  31. data/spec/spec_helper.rb +0 -10
  32. metadata +83 -75
  33. data/lib/pupa/processor/persistence.rb +0 -85
  34. data/spec/cassettes/31ac91ccad069eefc07d96cfbe66fa66c1b41fcf.yml +0 -56
  35. data/spec/cassettes/4ff54d737afb5d693653752d7bf234a405a80172.yml +0 -48
  36. data/spec/cassettes/898049a22e6ca51dfa2510d9e0e0207a5c396524.yml +0 -54
  37. data/spec/cassettes/ce69ff734ce852d2bfaa482bbf55d7ffb4762e87.yml +0 -26
  38. data/spec/cassettes/da629b01e0836deda8a5540a4e6a08783dd7aef9.yml +0 -46
  39. data/spec/cassettes/e398f35bea86b3d4c87a6934bae1eb7fca8744f9.yml +0 -26
  40. data/spec/cassettes/f861172f1df3bdb2052af5451f9922699d574b77.yml +0 -62
  41. data/spec/processor/persistence_spec.rb +0 -51
@@ -1,3 +1,6 @@
1
+ require 'pupa/processor/document_store/file_store'
2
+ require 'pupa/processor/document_store/redis_store'
3
+
1
4
  module Pupa
2
5
  class Processor
3
6
  # A JSON document store factory.
@@ -4,6 +4,7 @@ module Pupa
4
4
  # A Faraday response middleware for raising an error if unsuccessful.
5
5
  #
6
6
  # @see Faraday::Response::RaiseError
7
+ # @note Faraday has no tests for this middleware.
7
8
  class RaiseError < Faraday::Response::Middleware
8
9
  def on_complete(env)
9
10
  case env[:status]
@@ -17,7 +17,7 @@ class FaradayMiddleware::Caching
17
17
  if url.query && params_to_ignore.any?
18
18
  params = parse_query url.query
19
19
  params.reject! {|k,| params_to_ignore.include? k }
20
- url.query = build_query params
20
+ url.query = params.any? ? build_query(params) : nil
21
21
  end
22
22
  url.normalize!
23
23
  url.request_uri + env[:body].to_s # XXX add for POST requests
data/lib/pupa/runner.rb CHANGED
@@ -1,8 +1,6 @@
1
1
  require 'optparse'
2
2
  require 'ostruct'
3
3
 
4
- require 'moped'
5
-
6
4
  module Pupa
7
5
  class Runner
8
6
  attr_reader :options, :actions
@@ -16,14 +14,13 @@ module Pupa
16
14
  actions: [],
17
15
  tasks: [],
18
16
  output_dir: File.expand_path('scraped_data', Dir.pwd),
17
+ pipelined: false,
19
18
  cache_dir: File.expand_path('web_cache', Dir.pwd),
20
19
  expires_in: 86400, # 1 day
21
- pipelined: false,
20
+ database_url: 'mongodb://localhost:27017/pupa',
22
21
  validate: true,
23
- host_with_port: 'localhost:27017',
24
- database: 'pupa',
25
- dry_run: false,
26
22
  level: 'INFO',
23
+ dry_run: false,
27
24
  }.merge(defaults))
28
25
 
29
26
  @actions = {
@@ -76,27 +73,21 @@ module Pupa
76
73
  opts.on('-o', '--output_dir PATH', 'The directory or Redis address (e.g. redis://localhost:6379/0) in which to dump JSON documents') do |v|
77
74
  options.output_dir = v
78
75
  end
76
+ opts.on('--pipelined', 'Dump JSON documents all at once') do |v|
77
+ options.pipelined = v
78
+ end
79
79
  opts.on('-c', '--cache_dir PATH', 'The directory or Memcached address (e.g. memcached://localhost:11211) in which to cache HTTP requests') do |v|
80
80
  options.cache_dir = v
81
81
  end
82
82
  opts.on('-e', '--expires_in SECONDS', "The cache's expiration time in seconds") do |v|
83
83
  options.expires_in = v
84
84
  end
85
- opts.on('--pipelined', 'Dump JSON documents all at once') do |v|
86
- options.pipelined = v
85
+ opts.on('-d', '--database_url SCHEME://USERNAME:PASSWORD@HOST:PORT/DATABASE', 'The database URL') do |v|
86
+ options.database_url = v
87
87
  end
88
88
  opts.on('--[no-]validate', 'Validate JSON documents') do |v|
89
89
  options.validate = v
90
90
  end
91
- opts.on('-H', '--host HOST:PORT', 'The host and port to MongoDB') do |v|
92
- options.host_with_port = v
93
- end
94
- opts.on('-d', '--database NAME', 'The name of the MongoDB database') do |v|
95
- options.database = v
96
- end
97
- opts.on('-n', '--dry-run', 'Show the plan without running any actions') do
98
- options.dry_run = true
99
- end
100
91
  opts.on('-v', '--verbose', 'Show all messages') do
101
92
  options.level = 'DEBUG'
102
93
  end
@@ -106,6 +97,9 @@ module Pupa
106
97
  opts.on('-s', '--silent', 'Show no messages') do
107
98
  options.level = 'UNKNOWN'
108
99
  end
100
+ opts.on('-n', '--dry-run', 'Show the plan without running any actions') do
101
+ options.dry_run = true
102
+ end
109
103
 
110
104
  opts.separator ''
111
105
  opts.separator 'Common options:'
@@ -145,9 +139,10 @@ module Pupa
145
139
  end
146
140
 
147
141
  processor = @processor_class.new(options.output_dir,
142
+ pipelined: options.pipelined,
148
143
  cache_dir: options.cache_dir,
149
144
  expires_in: options.expires_in,
150
- pipelined: options.pipelined,
145
+ database_url: options.database_url,
151
146
  validate: options.validate,
152
147
  level: options.level,
153
148
  options: Hash[*rest])
@@ -165,7 +160,7 @@ module Pupa
165
160
  end
166
161
 
167
162
  if options.level == 'DEBUG'
168
- %w(output_dir cache_dir expires_in host_with_port database level).each do |option|
163
+ %w(output_dir pipelined cache_dir expires_in database_url validate level).each do |option|
169
164
  puts "#{option}: #{options[option]}"
170
165
  end
171
166
  unless rest.empty?
@@ -184,8 +179,6 @@ module Pupa
184
179
  start: Time.now.utc,
185
180
  }
186
181
 
187
- Pupa.session = Moped::Session.new([options.host_with_port], database: options.database)
188
-
189
182
  if options.actions.delete('scrape')
190
183
  processor.store.clear
191
184
  report[:scrape] = {}
data/lib/pupa/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Pupa
2
- VERSION = "0.0.13"
2
+ VERSION = "0.1.0"
3
3
  end
data/pupa.gemspec CHANGED
@@ -21,8 +21,10 @@ Gem::Specification.new do |s|
21
21
  s.add_runtime_dependency('faraday_middleware', '~> 0.9.0')
22
22
  s.add_runtime_dependency('json-schema', '~> 2.1.3')
23
23
  s.add_runtime_dependency('mail')
24
- s.add_runtime_dependency('moped', '~> 1.5.1')
24
+ s.add_runtime_dependency('moped', '~> 2.0.0.rc1')
25
25
  s.add_runtime_dependency('oj', '~> 2.1')
26
+ s.add_runtime_dependency('sequel', '~> 4.10.0')
27
+ s.add_runtime_dependency('pg', '~> 0.17.0')
26
28
 
27
29
  s.add_development_dependency('coveralls')
28
30
  s.add_development_dependency('dalli')
@@ -34,5 +36,4 @@ Gem::Specification.new do |s|
34
36
  s.add_development_dependency('redis-store')
35
37
  s.add_development_dependency('rspec', '~> 2.10')
36
38
  s.add_development_dependency('typhoeus')
37
- s.add_development_dependency('vcr', '~> 2.5.0')
38
39
  end
@@ -23,6 +23,16 @@
23
23
  "description": "A note, e.g. for grouping contact details by physical location",
24
24
  "type": ["string", "null"]
25
25
  },
26
+ "valid_from": {
27
+ "description": "The date from which the contact detail is valid",
28
+ "type": ["string", "null"],
29
+ "pattern": "^[0-9]{4}(-[0-9]{2}){0,2}$"
30
+ },
31
+ "valid_through": {
32
+ "description": "The date from which the contact detail is no longer valid",
33
+ "type": ["string", "null"],
34
+ "pattern": "^[0-9]{4}(-[0-9]{2}){0,2}$"
35
+ },
26
36
  "created_at": {
27
37
  "description": "The time at which the resource was created",
28
38
  "type": ["string", "null"],
@@ -2,7 +2,7 @@
2
2
  "$schema": "http://json-schema.org/draft-03/schema#",
3
3
  "id": "http://popoloproject.com/schemas/membership.json#",
4
4
  "title": "Membership",
5
- "description": "A relationship between a person and an organization",
5
+ "description": "A relationship between a member and an organization",
6
6
  "type": "object",
7
7
  "properties": {
8
8
  "id": {
@@ -14,33 +14,53 @@
14
14
  "type": ["string", "null"]
15
15
  },
16
16
  "role": {
17
- "description": "The role that the person fulfills in the organization",
17
+ "description": "The role that the member fulfills in the organization",
18
18
  "type": ["string", "null"]
19
19
  },
20
+ "member": {
21
+ "description": "The person or organization that is a member of the organization",
22
+ "type": ["object"]
23
+ },
20
24
  "person_id": {
21
- "description": "The ID of the person who is a party to the relationship",
25
+ "description": "The ID of the person who is a member of the organization",
22
26
  "type": ["string", "null"]
23
27
  },
24
28
  "person": {
25
- "description": "The person who is a party to the relationship",
29
+ "description": "The person who is a member of the organization",
26
30
  "$ref": "http://popoloproject.com/schemas/person.json#"
27
31
  },
28
32
  "organization_id": {
29
- "description": "The ID of the organization that is a party to the relationship",
33
+ "description": "The ID of the organization in which the person or organization is a member",
30
34
  "type": ["string", "null"]
31
35
  },
32
36
  "organization": {
33
- "description": "The organization that is a party to the relationship",
37
+ "description": "The organization in which the person or organization is a member",
34
38
  "$ref": "http://popoloproject.com/schemas/organization.json#"
35
39
  },
36
40
  "post_id": {
37
- "description": "The ID of the post held by the person in the organization through this membership",
41
+ "description": "The ID of the post held by the member in the organization through this membership",
38
42
  "type": ["string", "null"]
39
43
  },
40
44
  "post": {
41
- "description": "The post held by the person in the organization through this membership",
45
+ "description": "The post held by the member in the organization through this membership",
42
46
  "$ref": "http://popoloproject.com/schemas/post.json#"
43
47
  },
48
+ "on_behalf_of_id": {
49
+ "description": "The ID of the organization on whose behalf the person is a member of the organization",
50
+ "type": ["string", "null"]
51
+ },
52
+ "on_behalf_of": {
53
+ "description": "The organization on whose behalf the person is a member of the organization",
54
+ "$ref": "http://popoloproject.com/schemas/organization.json#"
55
+ },
56
+ "area_id": {
57
+ "description": "The ID of the geographic area to which this membership is related",
58
+ "type": ["string", "null"]
59
+ },
60
+ "area": {
61
+ "description": "The geographic area to which this membership is related",
62
+ "$ref": "http://popoloproject.com/schemas/area.json#"
63
+ },
44
64
  "start_date": {
45
65
  "description": "The date on which the relationship began",
46
66
  "type": ["string", "null"],
@@ -52,7 +72,7 @@
52
72
  "pattern": "^[0-9]{4}(-[0-9]{2}){0,2}$"
53
73
  },
54
74
  "contact_details": {
55
- "description": "Means of contacting the person who is a party to the relationship",
75
+ "description": "Means of contacting the member of the organization",
56
76
  "type": "array",
57
77
  "items": {
58
78
  "$ref": "http://popoloproject.com/schemas/contact_detail.json#"
@@ -11,8 +11,7 @@
11
11
  },
12
12
  "name": {
13
13
  "description": "A primary name, e.g. a legally recognized name",
14
- "type": "string",
15
- "required": true
14
+ "type": ["string", "null"]
16
15
  },
17
16
  "other_names": {
18
17
  "description": "Alternate or former names",
@@ -40,6 +39,14 @@
40
39
  "description": "The organization that contains this organization",
41
40
  "$ref": "http://popoloproject.com/schemas/organization.json#"
42
41
  },
42
+ "area_id": {
43
+ "description": "The ID of the geographic area to which this organization is related",
44
+ "type": ["string", "null"]
45
+ },
46
+ "area": {
47
+ "description": "The geographic area to which this organization is related",
48
+ "$ref": "http://popoloproject.com/schemas/area.json#"
49
+ },
43
50
  "founding_date": {
44
51
  "description": "A date of founding",
45
52
  "type": ["string", "null"],
@@ -10,6 +10,30 @@
10
10
  "type": "string",
11
11
  "required": true
12
12
  },
13
+ "family_name": {
14
+ "description": "One or more family names",
15
+ "type": ["string", "null"]
16
+ },
17
+ "given_name": {
18
+ "description": "One or more primary given names",
19
+ "type": ["string", "null"]
20
+ },
21
+ "additional_name": {
22
+ "description": "One or more secondary given names",
23
+ "type": ["string", "null"]
24
+ },
25
+ "honorific_prefix": {
26
+ "description": "One or more honorifics preceding a person's name",
27
+ "type": ["string", "null"]
28
+ },
29
+ "honorific_suffix": {
30
+ "description": "One or more honorifics following a person's name",
31
+ "type": ["string", "null"]
32
+ },
33
+ "patronymic_name": {
34
+ "description": "One or more patronymic names",
35
+ "type": ["string", "null"]
36
+ },
13
37
  "start_date": {
14
38
  "description": "The date on which the name was adopted",
15
39
  "type": ["string", "null"],
@@ -11,8 +11,7 @@
11
11
  },
12
12
  "name": {
13
13
  "description": "A person's preferred full name",
14
- "type": "string",
15
- "required": true
14
+ "type": ["string", "null"]
16
15
  },
17
16
  "other_names": {
18
17
  "description": "Alternate or former names",
@@ -53,7 +52,7 @@
53
52
  "type": ["string", "null"]
54
53
  },
55
54
  "sort_name": {
56
- "description": "A name to use in an lexicographically ordered list",
55
+ "description": "A name to use in a lexicographically ordered list",
57
56
  "type": ["string", "null"]
58
57
  },
59
58
  "email": {
@@ -88,6 +87,10 @@
88
87
  "description": "An extended account of a person's life",
89
88
  "type": ["string", "null"]
90
89
  },
90
+ "national_identity": {
91
+ "description": "A national identity",
92
+ "type": ["string", "null"]
93
+ },
91
94
  "contact_details": {
92
95
  "description": "Means of contacting the person",
93
96
  "type": "array",
@@ -11,8 +11,14 @@
11
11
  },
12
12
  "label": {
13
13
  "description": "A label describing the post",
14
- "type": "string",
15
- "required": true
14
+ "type": ["string", "null"]
15
+ },
16
+ "other_label": {
17
+ "description": "An alternate label",
18
+ "type": "array",
19
+ "items": {
20
+ "type": "string"
21
+ }
16
22
  },
17
23
  "role": {
18
24
  "description": "The function that the holder of the post fulfills",
@@ -26,6 +32,14 @@
26
32
  "description": "The organization in which the post is held",
27
33
  "$ref": "http://popoloproject.com/schemas/organization.json#"
28
34
  },
35
+ "area_id": {
36
+ "description": "The ID of the geographic area to which this post is related",
37
+ "type": ["string", "null"]
38
+ },
39
+ "area": {
40
+ "description": "The geographic area to which this post is related",
41
+ "$ref": "http://popoloproject.com/schemas/area.json#"
42
+ },
29
43
  "start_date": {
30
44
  "description": "The date on which the post was created",
31
45
  "type": ["string", "null"],
@@ -146,7 +146,7 @@ describe Pupa::Model do
146
146
  end
147
147
 
148
148
  it 'should coerce the _id to a string' do
149
- object._id = Moped::BSON::ObjectId.new
149
+ object._id = BSON::ObjectId.new
150
150
  object._id.should be_a(String)
151
151
  end
152
152
  end
@@ -0,0 +1,61 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../../spec_helper')
2
+
3
+ describe Pupa::Processor::Connection::MongoDBAdapter do
4
+ def _type
5
+ if testing_python_compatibility?
6
+ 'person'
7
+ else
8
+ 'pupa/person'
9
+ end
10
+ end
11
+
12
+ def connection
13
+ Pupa::Processor::Connection::MongoDBAdapter.new('mongodb://localhost:27017/pupa_test')
14
+ end
15
+
16
+ before :all do
17
+ connection.raw_connection[:people].drop
18
+
19
+ connection.save(Pupa::Person.new(_id: 'existing', name: 'existing', email: 'existing@example.com'))
20
+ connection.raw_connection[:people].insert(_type: 'pupa/person', name: 'non-unique')
21
+ connection.raw_connection[:people].insert(_type: 'pupa/person', name: 'non-unique')
22
+ end
23
+
24
+ describe '.find' do
25
+ it 'should raise an error if selector is empty' do
26
+ expect{connection.find(_type: _type)}.to raise_error(Pupa::Errors::EmptySelectorError)
27
+ end
28
+
29
+ it 'should return nil if no matches' do
30
+ connection.find(_type: _type, name: 'nonexistent').should == nil
31
+ end
32
+
33
+ it 'should return a document if one match' do
34
+ connection.find(_type: _type, name: 'existing').should be_a(Hash)
35
+ end
36
+
37
+ it 'should raise an error if many matches' do
38
+ expect{connection.find(_type: 'pupa/person', name: 'non-unique')}.to raise_error(Pupa::Errors::TooManyMatches)
39
+ end
40
+ end
41
+
42
+ describe '.save' do
43
+ it 'should raise an error if selector is empty' do
44
+ expect{connection.save(Pupa::Person.new)}.to raise_error(Pupa::Errors::EmptySelectorError)
45
+ end
46
+
47
+ it 'should insert a document if no matches' do
48
+ connection.save(Pupa::Person.new(_id: 'new', name: 'new', email: 'new@example.com')).should == [true, 'new']
49
+ connection.find(_type: _type, name: 'new')['email'].should == 'new@example.com'
50
+ end
51
+
52
+ it 'should update a document if one match' do
53
+ connection.save(Pupa::Person.new(_id: 'changed', name: 'existing', email: 'changed@example.com')).should == [false, 'existing']
54
+ connection.find(_type: _type, name: 'existing')['email'].should == 'changed@example.com'
55
+ end
56
+
57
+ it 'should raise an error if many matches' do
58
+ expect{connection.save(Pupa::Person.new(name: 'non-unique'))}.to raise_error(Pupa::Errors::TooManyMatches)
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,70 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../../spec_helper')
2
+
3
+ describe Pupa::Processor::Connection::PostgreSQLAdapter do
4
+ def _type
5
+ if testing_python_compatibility?
6
+ 'person'
7
+ else
8
+ 'pupa/person'
9
+ end
10
+ end
11
+
12
+ def connection
13
+ Pupa::Processor::Connection::PostgreSQLAdapter.new('postgres://localhost:5432/pupa_test')
14
+ end
15
+
16
+ before :all do
17
+ connection.raw_connection.drop_table(:people)
18
+ connection.raw_connection.create_table(:people) do
19
+ primary_key :id
20
+ String :_id
21
+ String :_type
22
+ String :name
23
+ String :email
24
+ Time :created_at
25
+ Time :updated_at
26
+ end
27
+
28
+ connection.save(Pupa::Person.new(_id: 'existing', name: 'existing', email: 'existing@example.com'))
29
+ connection.raw_connection[:people].insert(_type: 'pupa/person', name: 'non-unique')
30
+ connection.raw_connection[:people].insert(_type: 'pupa/person', name: 'non-unique')
31
+ end
32
+
33
+ describe '.find' do
34
+ it 'should raise an error if selector is empty' do
35
+ expect{connection.find(_type: _type)}.to raise_error(Pupa::Errors::EmptySelectorError)
36
+ end
37
+
38
+ it 'should return nil if no matches' do
39
+ connection.find(_type: _type, name: 'nonexistent').should == nil
40
+ end
41
+
42
+ it 'should return a document if one match' do
43
+ connection.find(_type: _type, name: 'existing').should be_a(Hash)
44
+ end
45
+
46
+ it 'should raise an error if many matches' do
47
+ expect{connection.find(_type: 'pupa/person', name: 'non-unique')}.to raise_error(Pupa::Errors::TooManyMatches)
48
+ end
49
+ end
50
+
51
+ describe '.save' do
52
+ it 'should raise an error if selector is empty' do
53
+ expect{connection.save(Pupa::Person.new)}.to raise_error(Pupa::Errors::EmptySelectorError)
54
+ end
55
+
56
+ it 'should insert a document if no matches' do
57
+ connection.save(Pupa::Person.new(_id: 'new', name: 'new', email: 'new@example.com')).should == [true, 'new']
58
+ connection.find(_type: _type, name: 'new')['email'].should == 'new@example.com'
59
+ end
60
+
61
+ it 'should update a document if one match' do
62
+ connection.save(Pupa::Person.new(_id: 'changed', name: 'existing', email: 'changed@example.com')).should == [false, 'existing']
63
+ connection.find(_type: _type, name: 'existing')['email'].should == 'changed@example.com'
64
+ end
65
+
66
+ it 'should raise an error if many matches' do
67
+ expect{connection.save(Pupa::Person.new(name: 'non-unique'))}.to raise_error(Pupa::Errors::TooManyMatches)
68
+ end
69
+ end
70
+ end