pupa 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +6 -0
- data/.travis.yml +5 -0
- data/.yardopts +4 -0
- data/Gemfile +4 -0
- data/LICENSE +20 -0
- data/README.md +52 -0
- data/Rakefile +37 -0
- data/USAGE +1 -0
- data/lib/pupa/errors.rb +30 -0
- data/lib/pupa/logger.rb +37 -0
- data/lib/pupa/models/base.rb +190 -0
- data/lib/pupa/models/concerns/contactable.rb +34 -0
- data/lib/pupa/models/concerns/identifiable.rb +26 -0
- data/lib/pupa/models/concerns/linkable.rb +26 -0
- data/lib/pupa/models/concerns/nameable.rb +34 -0
- data/lib/pupa/models/concerns/sourceable.rb +26 -0
- data/lib/pupa/models/concerns/timestamps.rb +22 -0
- data/lib/pupa/models/contact_detail_list.rb +28 -0
- data/lib/pupa/models/membership.rb +37 -0
- data/lib/pupa/models/organization.rb +40 -0
- data/lib/pupa/models/person.rb +35 -0
- data/lib/pupa/models/post.rb +28 -0
- data/lib/pupa/processor/client.rb +42 -0
- data/lib/pupa/processor/dependency_graph.rb +18 -0
- data/lib/pupa/processor/helper.rb +15 -0
- data/lib/pupa/processor/middleware/logger.rb +37 -0
- data/lib/pupa/processor/middleware/parse_html.rb +16 -0
- data/lib/pupa/processor/persistence.rb +80 -0
- data/lib/pupa/processor/yielder.rb +50 -0
- data/lib/pupa/processor.rb +351 -0
- data/lib/pupa/refinements/faraday_middleware.rb +32 -0
- data/lib/pupa/refinements/json-schema.rb +36 -0
- data/lib/pupa/runner.rb +185 -0
- data/lib/pupa/version.rb +3 -0
- data/lib/pupa.rb +31 -0
- data/pupa.gemspec +34 -0
- data/schemas/popolo/contact_detail.json +44 -0
- data/schemas/popolo/identifier.json +18 -0
- data/schemas/popolo/link.json +19 -0
- data/schemas/popolo/membership.json +86 -0
- data/schemas/popolo/organization.json +104 -0
- data/schemas/popolo/other_name.json +28 -0
- data/schemas/popolo/person.json +130 -0
- data/schemas/popolo/post.json +78 -0
- data/spec/cassettes/31ac91ccad069eefc07d96cfbe66fa66c1b41fcf.yml +56 -0
- data/spec/cassettes/4ff54d737afb5d693653752d7bf234a405a80172.yml +48 -0
- data/spec/cassettes/898049a22e6ca51dfa2510d9e0e0207a5c396524.yml +54 -0
- data/spec/cassettes/ce69ff734ce852d2bfaa482bbf55d7ffb4762e87.yml +26 -0
- data/spec/cassettes/da629b01e0836deda8a5540a4e6a08783dd7aef9.yml +46 -0
- data/spec/cassettes/e398f35bea86b3d4c87a6934bae1eb7fca8744f9.yml +26 -0
- data/spec/logger_spec.rb +4 -0
- data/spec/models/base_spec.rb +194 -0
- data/spec/models/concerns/contactable_spec.rb +37 -0
- data/spec/models/concerns/identifiable_spec.rb +25 -0
- data/spec/models/concerns/linkable_spec.rb +25 -0
- data/spec/models/concerns/nameable_spec.rb +25 -0
- data/spec/models/concerns/sourceable_spec.rb +25 -0
- data/spec/models/concerns/timestamps_spec.rb +32 -0
- data/spec/models/contact_detail_list_spec.rb +44 -0
- data/spec/models/membership_spec.rb +30 -0
- data/spec/models/organization_spec.rb +24 -0
- data/spec/models/person_spec.rb +24 -0
- data/spec/models/post_spec.rb +19 -0
- data/spec/processor/client_spec.rb +4 -0
- data/spec/processor/dependency_graph_spec.rb +4 -0
- data/spec/processor/helper_spec.rb +4 -0
- data/spec/processor/middleware/logger_spec.rb +87 -0
- data/spec/processor/middleware/parse_html_spec.rb +92 -0
- data/spec/processor/persistence_spec.rb +41 -0
- data/spec/processor/yielder_spec.rb +55 -0
- data/spec/processor_spec.rb +268 -0
- data/spec/runner_spec.rb +85 -0
- data/spec/spec_helper.rb +17 -0
- metadata +342 -0
@@ -0,0 +1,87 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../spec_helper')
|
2
|
+
|
3
|
+
# @see test/adapters/logger_test.rb in faraday
|
4
|
+
# Specs for the Faraday logging middleware. The examples expect the request
# line to be written at both log levels, and the request headers to be written
# only when the logger is at DEBUG level.
describe Pupa::Processor::Middleware::Logger do
  # Captures everything the logger writes so examples can inspect it.
  let(:io) { StringIO.new }

  context 'with DEBUG log level' do
    let(:logger) do
      Logger.new(io).tap { |log| log.level = Logger::DEBUG }
    end

    # A Faraday connection using the middleware under test and a stubbed adapter.
    let(:connection) do
      Faraday.new do |builder|
        builder.use Pupa::Processor::Middleware::Logger, logger
        builder.adapter :test do |stubs|
          stubs.get('/hello') { [200, {'Content-Type' => 'text/html'}, 'hello'] }
        end
      end
    end

    before(:each) do
      @response = connection.get('/hello', nil, :accept => 'text/html')
    end

    it 'should still return output' do
      @response.body.should == 'hello'
    end

    it 'should log the method and URL' do
      io.string.should match('get http:/hello')
    end

    it 'should log request headers' do
      io.string.should match('Accept: "text/html')
    end
  end

  context 'with INFO log level' do
    let(:logger) do
      Logger.new(io).tap { |log| log.level = Logger::INFO }
    end

    # Same stubbed connection as above, with an additional POST endpoint.
    let(:connection) do
      Faraday.new do |builder|
        builder.use Pupa::Processor::Middleware::Logger, logger
        builder.adapter :test do |stubs|
          stubs.get('/hello') { [200, {'Content-Type' => 'text/html'}, 'hello'] }
          stubs.post('/hello') { [200, {'Content-Type' => 'text/html'}, 'hello'] }
        end
      end
    end

    context 'with GET request' do
      before(:each) do
        connection.get('/hello', nil, :accept => 'text/html')
      end

      it 'should log the method and URL' do
        io.string.should match('get http:/hello')
      end

      it 'should not log request headers' do
        io.string.should_not match('Accept: "text/html')
      end
    end

    context 'with POST request' do
      before(:each) do
        connection.post('/hello', 'foo=bar', :accept => 'text/html')
      end

      # For POST requests the request body is expected on the same log line.
      it 'should log the method and URL' do
        io.string.should match('post http:/hello foo=bar')
      end

      it 'should not log request headers' do
        io.string.should_not match('Accept: "text/html')
      end
    end
  end
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../spec_helper')
|
2
|
+
|
3
|
+
# @see spec/helper.rb and spec/parse_xml_spec.rb in faraday_middleware
|
4
|
+
# Specs for the HTML-parsing response middleware: body parsing, the
# :preserve_raw option and the :content_type filter (regexp and array forms).
describe Pupa::Processor::Middleware::ParseHtml do
  let(:options) { {} }
  let(:headers) { {} }

  # The middleware under test, wrapping a terminal app that simply turns the
  # env into a Faraday response.
  let(:middleware) do
    app = ->(env) { Faraday::Response.new(env) }
    described_class.new(app, options)
  end

  # Runs +body+ through the middleware and returns the response.
  #
  # @param body [String, nil] the raw response body
  # @param content_type [String, nil] value for the content-type response header, if any
  # @param options [Hash] per-request options (note: shadows the `options` let)
  def process(body, content_type = nil, options = {})
    response_headers = Faraday::Utils::Headers.new(headers)
    response_headers['content-type'] = content_type if content_type

    middleware.call(
      :body => body,
      :request => options,
      :response_headers => response_headers
    )
  end

  let(:html) { '<html><head><title>foo</title></head><body>bar</body></html>' }
  let(:title) { 'foo' }
  let(:body) { 'bar' }

  context "no type matching" do
    it "doesn't change nil body" do
      expect(process(nil).body).to be_nil
    end

    it "turns empty body into nil" do
      expect(process('').body).to be_nil
    end

    it "parses html body" do
      response = process(html)
      document = response.body

      expect(document.at_css('title').text).to eq(title)
      expect(document.at_css('body').text).to eq(body)
      expect(response.env[:raw_body]).to be_nil
    end
  end

  context "with preserving raw" do
    let(:options) { {:preserve_raw => true} }

    it "parses html body" do
      response = process(html)
      document = response.body

      expect(document.at_css('title').text).to eq(title)
      expect(document.at_css('body').text).to eq(body)
      expect(response.env[:raw_body]).to eq(html)
    end

    it "can opt out of preserving raw" do
      response = process(html, nil, :preserve_raw => false)
      expect(response.env[:raw_body]).to be_nil
    end
  end

  context "with regexp type matching" do
    let(:options) { {:content_type => /\bhtml$/} }

    it "parses html body of correct type" do
      document = process(html, 'text/html').body

      expect(document.at_css('title').text).to eq(title)
      expect(document.at_css('body').text).to eq(body)
    end

    it "ignores html body of incorrect type" do
      response = process(html, 'application/xml')
      expect(response.body).to eq(html)
    end
  end

  context "with array type matching" do
    let(:options) { {:content_type => %w[a/b c/d]} }

    it "parses html body of correct type" do
      expect(process(html, 'a/b').body).to be_a(Nokogiri::HTML::Document)
      expect(process(html, 'c/d').body).to be_a(Nokogiri::HTML::Document)
    end

    it "ignores html body of incorrect type" do
      expect(process(html, 'a/d').body).not_to be_a(Nokogiri::HTML::Document)
    end
  end

  it "doesn't choke on invalid html" do
    invalid_bodies = ['{!', '"a"', 'true', 'null', '1']

    invalid_bodies.each do |data|
      expect{ process(data) }.to_not raise_error
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
# Specs for finding and saving documents through Pupa::Processor::Persistence.
# Requires a MongoDB server on localhost:27017; uses the `pupa_test` database.
describe Pupa::Processor::Persistence do
  before(:all) do
    Pupa.session = Moped::Session.new(['localhost:27017'], database: 'pupa_test')
    Pupa.session.collections.each(&:drop)

    # One document saved through the persistence layer...
    Pupa::Processor::Persistence.new(Pupa::Person.new(name: 'existing')).save

    # ...and two identical documents inserted directly, to trigger the
    # "too many matches" code path.
    2.times do
      Pupa.session[:people].insert(_type: 'pupa/person', name: 'non-unique')
    end
  end

  describe '#find' do
    it 'should return nil if no matches' do
      Pupa::Processor::Persistence.find(_type: 'pupa/person', name: 'nonexistent').should be_nil
    end

    it 'should return a document if one match' do
      Pupa::Processor::Persistence.find(_type: 'pupa/person', name: 'existing').should be_a(Hash)
    end

    it 'should raise an error if many matches' do
      expect {
        Pupa::Processor::Persistence.find(_type: 'pupa/person', name: 'non-unique')
      }.to raise_error(Pupa::Errors::TooManyMatches)
    end
  end

  describe '#save' do
    it 'should insert a document if no matches' do
      person = Pupa::Person.new(_id: 'new', name: 'new')
      Pupa::Processor::Persistence.new(person).save.should == 'new'
    end

    # The returned ID is expected to differ from the ID of the object passed in.
    it 'should update a document if one match' do
      person = Pupa::Person.new(_id: 'existing', name: 'existing')
      Pupa::Processor::Persistence.new(person).save.should_not == 'existing'
    end

    it 'should raise an error if many matches' do
      person = Pupa::Person.new(name: 'non-unique')
      expect {
        Pupa::Processor::Persistence.new(person).save
      }.to raise_error(Pupa::Errors::TooManyMatches)
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
# Specs for Pupa::Processor::Yielder, which wraps a block that calls
# `Fiber.yield` and exposes the yielded values through #each, #next and #to_enum.
describe Pupa::Processor::Yielder do
  # Yields the integers 0 through 9.
  let(:yielder) do
    Pupa::Processor::Yielder.new do
      10.times { |n| Fiber.yield(n) }
    end
  end

  # Raises as soon as its block is run.
  let(:raiser) do
    Pupa::Processor::Yielder.new { raise }
  end

  describe '#each' do
    it 'should iterate over the items in the enumeration' do
      collected = []
      yielder.each { |item| collected << item }
      collected.should == (0..9).to_a
    end

    it 'should be composable with other iterators' do
      yielder.each.map { |item| item }.should == (0..9).to_a
    end
  end

  describe '#next' do
    it 'should return the next item in the enumeration' do
      collected = Array.new(10) { yielder.next }
      collected.should == (0..9).to_a
    end

    it 'should raise an error if the enumerator is at the end' do
      expect { 11.times { yielder.next } }.to raise_error(StopIteration)
    end
  end

  describe '#to_enum' do
    it 'should return an enumerator' do
      yielder.to_enum.should be_a(Enumerator)
    end

    # Building the enumerator must not run the block, otherwise `raiser` would raise.
    it 'should return a lazy enumerator' do
      expect { raiser.to_enum }.to_not raise_error
    end
  end
end
|
@@ -0,0 +1,268 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
# Specs for Pupa::Processor: HTTP helpers, scraping-task registration, dumping
# scraped objects to disk, and importing them into the database.
describe Pupa::Processor do
  # Minimal processor whose only scraping task yields a single person. Examples
  # can swap that person for one built without a name via #make_person_invalid.
  class PersonProcessor < Pupa::Processor
    def person
      @person ||= make_person_valid
    end

    def make_person_valid
      @person = Pupa::Person.new(name: 'foo')
    end

    def make_person_invalid
      @person = Pupa::Person.new
    end

    def scrape_people
      Fiber.yield(person)
    end
  end

  before :all do
    PersonProcessor.add_scraping_task(:people)
  end

  # Captures log output so examples can inspect it.
  let :io do
    StringIO.new
  end

  let :processor do
    PersonProcessor.new('/tmp', level: 'WARN', logdev: io)
  end

  describe '#get' do
    it 'should send a GET request' do
      processor.get('http://httpbin.org/get', 'foo=bar')['args'].should == {'foo' => 'bar'}
    end

    it 'should automatically parse the response' do
      processor.get('http://httpbin.org/get').should be_a(Hash)
    end
  end

  describe '#post' do
    it 'should send a POST request' do
      processor.post('http://httpbin.org/post', 'foo=bar')['form'].should == {'foo' => 'bar'}
    end

    it 'should automatically parse the response' do
      processor.post('http://httpbin.org/post').should be_a(Hash)
    end
  end

  describe '#add_scraping_task' do
    it 'should add a scraping task and define a lazy method' do
      PersonProcessor.tasks.should == [:people]
      processor.should respond_to(:people)
    end
  end

  describe '#dump_scraped_objects' do
    # Location at which the processor's current person is expected to be dumped.
    let :path do
      "/tmp/person_#{processor.person._id}.json"
    end

    # Always remove the file, even when an expectation fails, so that no
    # example leaves files behind in /tmp. `rm_f` ignores a missing file.
    after :each do
      FileUtils.rm_f(path)
    end

    it 'should not overwrite an existing file' do
      FileUtils.touch(path)
      expect{processor.dump_scraped_objects(:people)}.to raise_error(Pupa::Errors::DuplicateObjectIdError)
    end

    it 'should dump a JSON document' do
      processor.dump_scraped_objects(:people)
      File.exist?(path).should == true
      # A valid person should not produce a schema validation message in the log.
      io.string.should_not match('http://popoloproject.com/schemas/person.json')
    end

    it 'should validate the object' do
      processor.make_person_invalid
      processor.dump_scraped_objects(:people)
      # The log is expected to mention the schema the person was validated against.
      io.string.should match('http://popoloproject.com/schemas/person.json')
    end
  end

  describe '#import' do
    # Requires a MongoDB server on localhost:27017; every example starts from
    # an empty `pupa_test` database.
    before :each do
      Pupa.session = Moped::Session.new(['localhost:27017'], database: 'pupa_test')
      Pupa.session.collections.each(&:drop)
    end

    # Objects that reference each other through foreign keys only. Objects '2'
    # and '3' are duplicates of each other; the child points at '3'.
    let :graphable do
      {
        '1' => Pupa::Organization.new({
          _id: '1',
          name: 'Child',
          parent_id: '3',
        }),
        '2' => Pupa::Organization.new({
          _id: '2',
          name: 'Parent',
        }),
        '3' => Pupa::Organization.new({
          _id: '3',
          name: 'Parent',
        }),
      }
    end

    # Same shape as `graphable`, but the child references its parent through a
    # foreign object (`parent`) rather than a foreign key.
    let :ungraphable do
      {
        '4' => Pupa::Organization.new({
          _id: '4',
          name: 'Child',
          parent: {_type: 'pupa/organization', name: 'Parent'},
        }),
        '5' => Pupa::Organization.new({
          _id: '5',
          name: 'Parent',
        }),
        '6' => Pupa::Organization.new({
          _id: '6',
          name: 'Parent',
        }),
      }
    end

    it 'should use a dependency graph if possible' do
      processor.should_receive(:load_scraped_objects).and_return(graphable)

      Pupa::Processor::DependencyGraph.any_instance.should_receive(:tsort).and_return(['2', '1'])
      processor.import
    end

    it 'should not use a dependency graph if not possible' do
      processor.should_receive(:load_scraped_objects).and_return(ungraphable)

      Pupa::Processor::DependencyGraph.any_instance.should_not_receive(:tsort)
      processor.import
    end

    it 'should remove duplicate objects and re-assign foreign keys' do
      processor.should_receive(:load_scraped_objects).and_return(graphable)

      processor.import
      documents = Pupa.session[:organizations].find.entries
      documents.size.should == 2
      documents[0].slice('_id', '_type', 'name', 'parent_id').should == {'_id' => '2', '_type' => 'pupa/organization', 'name' => 'Parent'}
      documents[1].slice('_id', '_type', 'name', 'parent_id').should == {'_id' => '1', '_type' => 'pupa/organization', 'name' => 'Child', 'parent_id' => '2'}
    end

    it 'should resolve foreign objects' do
      processor.should_receive(:load_scraped_objects).and_return(ungraphable)

      processor.import
      documents = Pupa.session[:organizations].find.entries
      documents.size.should == 2
      documents[0].slice('_id', '_type', 'name', 'parent_id').should == {'_id' => '5', '_type' => 'pupa/organization', 'name' => 'Parent'}
      documents[1].slice('_id', '_type', 'name', 'parent_id').should == {'_id' => '4', '_type' => 'pupa/organization', 'name' => 'Child', 'parent_id' => '5'}
    end

    context 'with existing documents' do
      # Seed the database with the `graphable` objects before each example.
      before :each do
        processor.should_receive(:load_scraped_objects).and_return(graphable)
        processor.import
      end

      # Re-scraped copies of the seeded objects under different IDs.
      let :resolvable_foreign_key do
        {
          'a' => Pupa::Organization.new({
            _id: 'a',
            name: 'Child',
            parent_id: 'c',
          }),
          'b' => Pupa::Organization.new({
            _id: 'b',
            name: 'Parent',
          }),
          'c' => Pupa::Organization.new({
            _id: 'c',
            name: 'Parent',
          }),
        }
      end

      # Object 'c' points at an ID that no scraped object has.
      let :unresolvable_foreign_key do
        {
          'a' => Pupa::Organization.new({
            _id: 'a',
            name: 'Child',
            parent: {_type: 'pupa/organization', name: 'Parent'},
          }),
          'b' => Pupa::Organization.new({
            _id: 'b',
            name: 'Parent',
          }),
          'c' => Pupa::Organization.new({
            _id: 'c',
            name: 'Child',
            parent_id: 'nonexistent',
          }),
        }
      end

      # Object 'a' references a parent by a name that no object has.
      let :unresolvable_foreign_object do
        {
          'a' => Pupa::Organization.new({
            _id: 'a',
            name: 'Child',
            parent: {_type: 'pupa/organization', name: 'Nonexistent'},
          }),
          'b' => Pupa::Organization.new({
            _id: 'b',
            name: 'Parent',
          }),
          'c' => Pupa::Organization.new({
            _id: 'c',
            name: 'Child',
            parent_id: 'b',
          }),
        }
      end

      # Objects 'a' and 'c' describe the same child, once through a foreign
      # object and once through a foreign key.
      let :duplicate_documents do
        {
          'a' => Pupa::Organization.new({
            _id: 'a',
            name: 'Child',
            parent: {_type: 'pupa/organization', name: 'Parent'},
          }),
          'b' => Pupa::Organization.new({
            _id: 'b',
            name: 'Parent',
          }),
          'c' => Pupa::Organization.new({
            _id: 'c',
            name: 'Child',
            parent_id: 'b',
          }),
        }
      end

      # The database is expected to still hold only the two seeded documents.
      it 'should resolve foreign keys' do
        processor.should_receive(:load_scraped_objects).and_return(resolvable_foreign_key)

        processor.import
        documents = Pupa.session[:organizations].find.entries
        documents.size.should == 2
        documents[0].slice('_id', '_type', 'name', 'parent_id').should == {'_id' => '2', '_type' => 'pupa/organization', 'name' => 'Parent'}
        documents[1].slice('_id', '_type', 'name', 'parent_id').should == {'_id' => '1', '_type' => 'pupa/organization', 'name' => 'Child', 'parent_id' => '2'}
      end

      it 'should raise an error if a foreign key cannot be resolved' do
        processor.should_receive(:load_scraped_objects).and_return(unresolvable_foreign_key)
        expect{processor.import}.to raise_error(Pupa::Errors::UnprocessableEntity)
      end

      it 'should raise an error if a foreign object cannot be resolved' do
        processor.should_receive(:load_scraped_objects).and_return(unresolvable_foreign_object)
        expect{processor.import}.to raise_error(Pupa::Errors::UnprocessableEntity)
      end

      it 'should raise an error if a duplicate was inadvertently saved' do
        processor.should_receive(:load_scraped_objects).and_return(duplicate_documents)
        expect{processor.import}.to raise_error(Pupa::Errors::DuplicateDocumentError)
      end
    end
  end
end
|
data/spec/runner_spec.rb
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
# Specs for Pupa::Runner: option handling, action registration and the
# dispatching of actions and scraping tasks to the processor.
describe Pupa::Runner do
  # Processor with two no-op scraping tasks.
  class TestProcessor < Pupa::Processor
    def scrape_people
    end

    def scrape_organizations
    end
  end

  before(:all) do
    TestProcessor.add_scraping_task(:people)
    TestProcessor.add_scraping_task(:organizations)
  end

  # A runner in dry-run mode with one extra registered action, `example`.
  let(:dry_runner) do
    Pupa::Runner.new(TestProcessor, level: 'UNKNOWN', dry_run: true).tap do |instance|
      instance.add_action(name: 'example', description: 'An example action')
    end
  end

  let(:runner) { Pupa::Runner.new(TestProcessor, level: 'UNKNOWN') }

  describe '#initialize' do
    it 'should accept default options' do
      dry_runner.options.level.should_not == 'INFO'
    end
  end

  describe '#add_action' do
    it 'should add an action' do
      expected = {name: 'example', description: 'An example action'}
      dry_runner.actions.last.to_h.should == expected
    end
  end

  describe '#run' do
    # Runs the dry runner, swallowing the SystemExit it may raise so that the
    # example can inspect the runner's options afterwards.
    def dry_run(argv = [], **kwargs)
      dry_runner.run(argv, kwargs)
    rescue SystemExit
      # pass
    end

    it 'should accept overridden options' do
      dry_run(['--quiet'], level: 'ERROR')
      dry_runner.options.level.should == 'ERROR'
    end

    it 'should use default actions if none set' do
      dry_run
      dry_runner.options.actions.should == ['scrape', 'import']
    end

    it 'should use default tasks if none set' do
      dry_run
      dry_runner.options.tasks.should == [:people, :organizations]
    end

    # Unlike an action, it's not possible for a task to be undefined, because
    # `add_scraping_task` would raise an error first.
    it 'should abort if the action is not defined' do
      expect {
        dry_runner.run(['--action', 'example'])
      }.to raise_error(SystemExit, "`example` is not a rspec action. See `rspec --help` for a list of available actions.")
    end

    it 'should not run any actions on a dry run' do
      expect { dry_runner.run([]) }.to raise_error(SystemExit, nil)
    end

    it 'should run actions' do
      TestProcessor.any_instance.should_receive(:dump_scraped_objects).twice
      TestProcessor.any_instance.should_receive(:import)
      runner.run([])
    end

    it 'should run tasks' do
      TestProcessor.any_instance.should_receive(:people).and_return([])
      TestProcessor.any_instance.should_receive(:organizations).and_return([])
      runner.run([])
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'rubygems'

# Coverage reporting is started before the library under test is loaded.
require 'coveralls'
Coveralls.wear!

# Digest::SHA1 is used below to name cassettes; require it explicitly rather
# than relying on another library having loaded it.
require 'digest/sha1'

require 'rspec'
require 'vcr'
require File.dirname(__FILE__) + '/../lib/pupa'

VCR.configure do |c|
  c.cassette_library_dir = 'spec/cassettes'
  c.hook_into :faraday

  # Wrap every HTTP request in a cassette named after the SHA-1 digest of the
  # request's URI, body and headers.
  c.around_http_request do |request|
    # Interpolation yields the same string as concatenation when the body is
    # a String, and tolerates a nil body instead of raising a TypeError.
    cassette_name = Digest::SHA1.hexdigest("#{request.uri}#{request.body}#{request.headers}")
    VCR.use_cassette(cassette_name, &request)
  end
end
|