turbot-runner 0.1.38 → 0.1.39
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/lib/turbot_runner/processor.rb +3 -1
- data/lib/turbot_runner/runner.rb +11 -4
- data/lib/turbot_runner/validator.rb +25 -11
- data/lib/turbot_runner/version.rb +1 -1
- data/spec/bots/bot-that-is-allowed-to-produce-duplicates/manifest.json +9 -0
- data/spec/bots/bot-that-is-allowed-to-produce-duplicates/scraper.rb +10 -0
- data/spec/bots/bot-that-produces-duplicates/manifest.json +8 -0
- data/spec/bots/bot-that-produces-duplicates/scraper.rb +10 -0
- data/spec/lib/processor_spec.rb +1 -1
- data/spec/lib/runner_spec.rb +20 -0
- data/spec/lib/validator_spec.rb +18 -3
- data/spec/spec_helper.rb +2 -2
- metadata +20 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MTk1NGQ2ZjNlYTZhODFkMDk4OWU4YzNhOTc3YzMzNjVmMDQwOWY1Ng==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
YzcyOTQyMTdkYTc0ZmNkNDYwNzY5YTM1YWIxYjc2M2UyOGNhY2FhYQ==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
NmMzYzJhYmIwMTkxMTUwMWM0MmI5ZjBiZDQwOWE1NjVkYzQ3OGVmNDZmZTZj
|
10
|
+
ZGI2YmNlODQxYTE2NzdiYTQ4YmQ5ZWRhMWY2NGZkOTZiYzM3NzNlYjNjZmJk
|
11
|
+
NWYzOWU2NTZjYzM3ZWVjNDIwMjliZDA5OTJmMGFmOWQ4ZTZhMDc=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
OWZiNWM0YTNmZDBhMmE0NGIyOWY4N2VlMTM0NmE2YTFhMWYxODNmZGU5Y2Zl
|
14
|
+
MmYxYmVlMTZkZmE1MTM0MTIyZjFjZWQ2Yzc5YzhkMDE5NzE5M2E0MWQ2ODMy
|
15
|
+
ZTEwNzg2ZGE4Mzk4YTcyOGFiZTY4YjZhNTcyMmNlMWFiNWYxZTA=
|
@@ -7,6 +7,7 @@ module TurbotRunner
|
|
7
7
|
@data_type = script_config[:data_type]
|
8
8
|
@identifying_fields = script_config[:identifying_fields]
|
9
9
|
@record_handler = record_handler
|
10
|
+
@seen_uids = script_config[:duplicates_allowed] ? nil : Set.new
|
10
11
|
end
|
11
12
|
|
12
13
|
def process(line)
|
@@ -22,7 +23,8 @@ module TurbotRunner
|
|
22
23
|
error_message = Validator.validate(
|
23
24
|
@data_type,
|
24
25
|
record_to_validate,
|
25
|
-
@identifying_fields
|
26
|
+
@identifying_fields,
|
27
|
+
@seen_uids
|
26
28
|
)
|
27
29
|
|
28
30
|
if error_message.nil?
|
data/lib/turbot_runner/runner.rb
CHANGED
@@ -27,9 +27,11 @@ module TurbotRunner
|
|
27
27
|
succeeded = run_script(scraper_config)
|
28
28
|
# Run the transformers even if the scraper fails
|
29
29
|
transformers.each do |transformer_config|
|
30
|
-
|
31
|
-
|
32
|
-
|
30
|
+
config = transformer_config.merge(
|
31
|
+
:base_directory => @base_directory,
|
32
|
+
:duplicates_allowed => duplicates_allowed
|
33
|
+
)
|
34
|
+
succeeded = run_script(config, input_file=scraper_output_file) && succeeded
|
33
35
|
end
|
34
36
|
succeeded
|
35
37
|
end
|
@@ -142,7 +144,8 @@ module TurbotRunner
|
|
142
144
|
:base_directory => @base_directory,
|
143
145
|
:file => scraper_script,
|
144
146
|
:data_type => scraper_data_type,
|
145
|
-
:identifying_fields => scraper_identifying_fields
|
147
|
+
:identifying_fields => scraper_identifying_fields,
|
148
|
+
:duplicates_allowed => duplicates_allowed
|
146
149
|
}
|
147
150
|
end
|
148
151
|
|
@@ -170,6 +173,10 @@ module TurbotRunner
|
|
170
173
|
@config[:identifying_fields]
|
171
174
|
end
|
172
175
|
|
176
|
+
def duplicates_allowed
|
177
|
+
@config[:duplicates_allowed]
|
178
|
+
end
|
179
|
+
|
173
180
|
def assert_absolute_path(path)
|
174
181
|
unless Pathname.new(path).absolute?
|
175
182
|
raise "#{path} must be an absolute path"
|
@@ -1,28 +1,42 @@
|
|
1
|
+
require 'active_support/core_ext/hash/slice'
|
2
|
+
require 'active_support/core_ext/object/to_query'
|
3
|
+
|
1
4
|
module TurbotRunner
|
2
5
|
module Validator
|
3
6
|
extend self
|
4
7
|
|
5
|
-
def validate(data_type, record, identifying_fields)
|
8
|
+
def validate(data_type, record, identifying_fields, seen_uids)
|
6
9
|
schema_path = TurbotRunner.schema_path(data_type)
|
7
10
|
error = Openc::JsonSchema.validate(schema_path, record)
|
8
11
|
|
9
|
-
message = nil
|
10
|
-
|
11
|
-
if error.nil?
|
12
|
-
flattened_record = TurbotRunner::Utils.flatten(record)
|
13
|
-
|
14
|
-
identifying_attributes = flattened_record.reject do |k, v|
|
15
|
-
!identifying_fields.include?(k) || v.nil? || v == ''
|
16
|
-
end
|
12
|
+
message = error.nil? ? nil : error[:message]
|
17
13
|
|
14
|
+
if message.nil?
|
15
|
+
identifying_hash = identifying_hash(record, identifying_fields)
|
16
|
+
identifying_attributes = identifying_hash.reject {|k, v| v.nil? || v == ''}
|
18
17
|
if identifying_attributes.empty?
|
19
18
|
message = "There were no values provided for any of the identifying fields: #{identifying_fields.join(', ')}"
|
20
19
|
end
|
21
|
-
|
22
|
-
|
20
|
+
end
|
21
|
+
|
22
|
+
if message.nil? && !seen_uids.nil?
|
23
|
+
record_uid = record_uid(identifying_hash)
|
24
|
+
if seen_uids.include?(record_uid)
|
25
|
+
message = "Already seen record with these identifying fields: #{identifying_hash}"
|
26
|
+
else
|
27
|
+
seen_uids.add(record_uid)
|
28
|
+
end
|
23
29
|
end
|
24
30
|
|
25
31
|
message
|
26
32
|
end
|
33
|
+
|
34
|
+
def identifying_hash(record, identifying_fields)
|
35
|
+
TurbotRunner::Utils.flatten(record).slice(*identifying_fields)
|
36
|
+
end
|
37
|
+
|
38
|
+
def record_uid(identifying_hash)
|
39
|
+
Digest::SHA1.hexdigest(identifying_hash.to_query)
|
40
|
+
end
|
27
41
|
end
|
28
42
|
end
|
@@ -0,0 +1,9 @@
|
|
1
|
+
{
|
2
|
+
"bot_id": "bot-that-is-allowed-to-produce-duplicates",
|
3
|
+
"description": "This bot produces duplicates, but that's ok",
|
4
|
+
"language": "ruby",
|
5
|
+
"data_type": "primary data",
|
6
|
+
"identifying_fields": ["licence_number"],
|
7
|
+
"files": ["scraper.rb"],
|
8
|
+
"duplicates_allowed": true
|
9
|
+
}
|
data/spec/lib/processor_spec.rb
CHANGED
@@ -142,7 +142,7 @@ describe TurbotRunner::Processor do
|
|
142
142
|
}
|
143
143
|
|
144
144
|
expect(TurbotRunner::Validator).to receive(:validate).
|
145
|
-
with('primary data', expected_record_to_validate, ['number'])
|
145
|
+
with('primary data', expected_record_to_validate, ['number'], Set.new)
|
146
146
|
@processor.process(record.to_json)
|
147
147
|
end
|
148
148
|
end
|
data/spec/lib/runner_spec.rb
CHANGED
@@ -236,6 +236,26 @@ describe TurbotRunner::Runner do
|
|
236
236
|
expect{@runner.run}.to raise_error(TurbotRunner::InvalidDataType)
|
237
237
|
end
|
238
238
|
end
|
239
|
+
|
240
|
+
context 'with a bot that produces duplicate data' do
|
241
|
+
before do
|
242
|
+
@runner = test_runner('bot-that-produces-duplicates')
|
243
|
+
end
|
244
|
+
|
245
|
+
it 'raises returns false' do
|
246
|
+
expect(@runner.run).to be(false)
|
247
|
+
end
|
248
|
+
end
|
249
|
+
|
250
|
+
context 'with a bot that is expected to produce duplicate data' do
|
251
|
+
before do
|
252
|
+
@runner = test_runner('bot-that-is-allowed-to-produce-duplicates')
|
253
|
+
end
|
254
|
+
|
255
|
+
it 'raises returns false' do
|
256
|
+
expect(@runner.run).to be(true)
|
257
|
+
end
|
258
|
+
end
|
239
259
|
end
|
240
260
|
|
241
261
|
describe '#process_output' do
|
data/spec/lib/validator_spec.rb
CHANGED
@@ -58,7 +58,7 @@ describe TurbotRunner::Validator do
|
|
58
58
|
'four' => {}
|
59
59
|
}
|
60
60
|
identifying_fields = ['one.two.three', 'four.five.six']
|
61
|
-
error = TurbotRunner::Validator.validate('primary-data', record, identifying_fields)
|
61
|
+
error = TurbotRunner::Validator.validate('primary-data', record, identifying_fields, Set.new)
|
62
62
|
expect(error).to eq('There were no values provided for any of the identifying fields: one.two.three, four.five.six')
|
63
63
|
end
|
64
64
|
|
@@ -69,7 +69,7 @@ describe TurbotRunner::Validator do
|
|
69
69
|
'one' => {'two' => {'three' => 123}}
|
70
70
|
}
|
71
71
|
identifying_fields = ['one.two.three', 'four.five.six']
|
72
|
-
error = TurbotRunner::Validator.validate('primary-data', record, identifying_fields)
|
72
|
+
error = TurbotRunner::Validator.validate('primary-data', record, identifying_fields, Set.new)
|
73
73
|
expect(error).to eq(nil)
|
74
74
|
end
|
75
75
|
|
@@ -81,9 +81,24 @@ describe TurbotRunner::Validator do
|
|
81
81
|
'four' => {'five' => {'six' => 456}}
|
82
82
|
}
|
83
83
|
identifying_fields = ['one.two.three', 'four.five.six']
|
84
|
-
error = TurbotRunner::Validator.validate('primary-data', record, identifying_fields)
|
84
|
+
error = TurbotRunner::Validator.validate('primary-data', record, identifying_fields, Set.new)
|
85
85
|
expect(error).to eq(nil)
|
86
86
|
end
|
87
87
|
end
|
88
|
+
|
89
|
+
specify 'with duplicate record' do
|
90
|
+
record = {
|
91
|
+
'sample_date' => '2014-06-01',
|
92
|
+
'source_url' => 'http://example.com/123',
|
93
|
+
'number' => 123
|
94
|
+
}
|
95
|
+
|
96
|
+
seen_uids = Set.new
|
97
|
+
error = TurbotRunner::Validator.validate('primary-data', record, 'number', seen_uids)
|
98
|
+
expect(error).to eq(nil)
|
99
|
+
|
100
|
+
error = TurbotRunner::Validator.validate('primary-data', record, 'number', seen_uids)
|
101
|
+
expect(error).to eq('Already seen record with these identifying fields: {"number"=>123}')
|
102
|
+
end
|
88
103
|
end
|
89
104
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -3,7 +3,7 @@ require 'turbot_runner'
|
|
3
3
|
RSpec::Matchers.define(:fail_validation_with) do |expected_error|
|
4
4
|
match do |record|
|
5
5
|
identifying_fields = ['number']
|
6
|
-
@error = TurbotRunner::Validator.validate('primary-data', record, identifying_fields)
|
6
|
+
@error = TurbotRunner::Validator.validate('primary-data', record, identifying_fields, Set.new)
|
7
7
|
expect(@error).to eq(expected_error)
|
8
8
|
end
|
9
9
|
|
@@ -15,6 +15,6 @@ end
|
|
15
15
|
RSpec::Matchers.define(:be_valid) do
|
16
16
|
match do |record|
|
17
17
|
identifying_fields = ['number']
|
18
|
-
expect(TurbotRunner::Validator.validate('primary-data', record, identifying_fields)).to eq(nil)
|
18
|
+
expect(TurbotRunner::Validator.validate('primary-data', record, identifying_fields, Set.new)).to eq(nil)
|
19
19
|
end
|
20
20
|
end
|
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: turbot-runner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.38
|
4
|
+
version: 0.1.39
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- OpenCorporates
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
11
|
+
date: 2015-03-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: activesupport
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 4.1.4
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 4.1.4
|
13
27
|
- !ruby/object:Gem::Dependency
|
14
28
|
name: openc-json_schema
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -86,6 +100,10 @@ files:
|
|
86
100
|
- spec/bots/bot-that-expects-file/manifest.json
|
87
101
|
- spec/bots/bot-that-expects-file/scraper.rb
|
88
102
|
- spec/bots/bot-that-expects-file/something.txt
|
103
|
+
- spec/bots/bot-that-is-allowed-to-produce-duplicates/manifest.json
|
104
|
+
- spec/bots/bot-that-is-allowed-to-produce-duplicates/scraper.rb
|
105
|
+
- spec/bots/bot-that-produces-duplicates/manifest.json
|
106
|
+
- spec/bots/bot-that-produces-duplicates/scraper.rb
|
89
107
|
- spec/bots/bot-with-invalid-data-type/manifest.json
|
90
108
|
- spec/bots/bot-with-invalid-data-type/scraper.rb
|
91
109
|
- spec/bots/bot-with-invalid-sample-date/manifest.json
|