turbot-runner 0.1.38 → 0.1.39

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- YmMwMGQ1NTRjODA5MDQ5NzEyMjc2NWEzNzQzNzMxNmI5ZmMzNTJmYQ==
4
+ MTk1NGQ2ZjNlYTZhODFkMDk4OWU4YzNhOTc3YzMzNjVmMDQwOWY1Ng==
5
5
  data.tar.gz: !binary |-
6
- MjA3NGU1ZTk2ZDM3NDdjZjVjNTY5Zjc0MGE2ZDAxNjRiYTM1ZmY0NQ==
6
+ YzcyOTQyMTdkYTc0ZmNkNDYwNzY5YTM1YWIxYjc2M2UyOGNhY2FhYQ==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- ZjQwYTQwNzQ5YmNmZWJjYTY1NjI2ZTM3MjVlYjM1ZWZiMGY0ZGQ1M2U1MDY5
10
- NTZiZDFiZDI3YzZhYzMwYmU0ZTMxZWZkOWQ4ZjYwZWI5YzgxMzZjZGJjMDc4
11
- MmQ0OWRiZmEzY2Q2YmY3Yzg0OTMyNTY3NDUxMzA1ZTc0ODZiM2U=
9
+ NmMzYzJhYmIwMTkxMTUwMWM0MmI5ZjBiZDQwOWE1NjVkYzQ3OGVmNDZmZTZj
10
+ ZGI2YmNlODQxYTE2NzdiYTQ4YmQ5ZWRhMWY2NGZkOTZiYzM3NzNlYjNjZmJk
11
+ NWYzOWU2NTZjYzM3ZWVjNDIwMjliZDA5OTJmMGFmOWQ4ZTZhMDc=
12
12
  data.tar.gz: !binary |-
13
- Y2RjYmVkZjk5YmI4MTY4MGEyYjNkZDUwZTAzNjhlZThkZWZjZjQ1OGZmOTY3
14
- YTU4ZmFmMzZmNjhmODc4ZDI4YTQwNTEyNDE0NmRhZDdhODQzNThhZGYwNWQ2
15
- YjlkZTMwNzQwN2U3YmI0NTAzNmI2ZDJiMjI3MzZmYzgwZWMyNzc=
13
+ OWZiNWM0YTNmZDBhMmE0NGIyOWY4N2VlMTM0NmE2YTFhMWYxODNmZGU5Y2Zl
14
+ MmYxYmVlMTZkZmE1MTM0MTIyZjFjZWQ2Yzc5YzhkMDE5NzE5M2E0MWQ2ODMy
15
+ ZTEwNzg2ZGE4Mzk4YTcyOGFiZTY4YjZhNTcyMmNlMWFiNWYxZTA=
@@ -7,6 +7,7 @@ module TurbotRunner
7
7
  @data_type = script_config[:data_type]
8
8
  @identifying_fields = script_config[:identifying_fields]
9
9
  @record_handler = record_handler
10
+ @seen_uids = script_config[:duplicates_allowed] ? nil : Set.new
10
11
  end
11
12
 
12
13
  def process(line)
@@ -22,7 +23,8 @@ module TurbotRunner
22
23
  error_message = Validator.validate(
23
24
  @data_type,
24
25
  record_to_validate,
25
- @identifying_fields
26
+ @identifying_fields,
27
+ @seen_uids
26
28
  )
27
29
 
28
30
  if error_message.nil?
@@ -27,9 +27,11 @@ module TurbotRunner
27
27
  succeeded = run_script(scraper_config)
28
28
  # Run the transformers even if the scraper fails
29
29
  transformers.each do |transformer_config|
30
- succeeded = run_script(
31
- transformer_config.merge(:base_directory => @base_directory),
32
- input_file=scraper_output_file) && succeeded
30
+ config = transformer_config.merge(
31
+ :base_directory => @base_directory,
32
+ :duplicates_allowed => duplicates_allowed
33
+ )
34
+ succeeded = run_script(config, input_file=scraper_output_file) && succeeded
33
35
  end
34
36
  succeeded
35
37
  end
@@ -142,7 +144,8 @@ module TurbotRunner
142
144
  :base_directory => @base_directory,
143
145
  :file => scraper_script,
144
146
  :data_type => scraper_data_type,
145
- :identifying_fields => scraper_identifying_fields
147
+ :identifying_fields => scraper_identifying_fields,
148
+ :duplicates_allowed => duplicates_allowed
146
149
  }
147
150
  end
148
151
 
@@ -170,6 +173,10 @@ module TurbotRunner
170
173
  @config[:identifying_fields]
171
174
  end
172
175
 
176
+ def duplicates_allowed
177
+ @config[:duplicates_allowed]
178
+ end
179
+
173
180
  def assert_absolute_path(path)
174
181
  unless Pathname.new(path).absolute?
175
182
  raise "#{path} must be an absolute path"
@@ -1,28 +1,42 @@
1
+ require 'active_support/core_ext/hash/slice'
2
+ require 'active_support/core_ext/object/to_query'
3
+
1
4
  module TurbotRunner
2
5
  module Validator
3
6
  extend self
4
7
 
5
- def validate(data_type, record, identifying_fields)
8
+ def validate(data_type, record, identifying_fields, seen_uids)
6
9
  schema_path = TurbotRunner.schema_path(data_type)
7
10
  error = Openc::JsonSchema.validate(schema_path, record)
8
11
 
9
- message = nil
10
-
11
- if error.nil?
12
- flattened_record = TurbotRunner::Utils.flatten(record)
13
-
14
- identifying_attributes = flattened_record.reject do |k, v|
15
- !identifying_fields.include?(k) || v.nil? || v == ''
16
- end
12
+ message = error.nil? ? nil : error[:message]
17
13
 
14
+ if message.nil?
15
+ identifying_hash = identifying_hash(record, identifying_fields)
16
+ identifying_attributes = identifying_hash.reject {|k, v| v.nil? || v == ''}
18
17
  if identifying_attributes.empty?
19
18
  message = "There were no values provided for any of the identifying fields: #{identifying_fields.join(', ')}"
20
19
  end
21
- else
22
- message = error[:message]
20
+ end
21
+
22
+ if message.nil? && !seen_uids.nil?
23
+ record_uid = record_uid(identifying_hash)
24
+ if seen_uids.include?(record_uid)
25
+ message = "Already seen record with these identifying fields: #{identifying_hash}"
26
+ else
27
+ seen_uids.add(record_uid)
28
+ end
23
29
  end
24
30
 
25
31
  message
26
32
  end
33
+
34
+ def identifying_hash(record, identifying_fields)
35
+ TurbotRunner::Utils.flatten(record).slice(*identifying_fields)
36
+ end
37
+
38
+ def record_uid(identifying_hash)
39
+ Digest::SHA1.hexdigest(identifying_hash.to_query)
40
+ end
27
41
  end
28
42
  end
@@ -1,3 +1,3 @@
1
1
  module TurbotRunner
2
- VERSION = '0.1.38'
2
+ VERSION = '0.1.39'
3
3
  end
@@ -0,0 +1,9 @@
1
+ {
2
+ "bot_id": "bot-that-is-allowed-to-produce-duplicates",
3
+ "description": "This bot produces duplicates, but that's ok",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb"],
8
+ "duplicates_allowed": true
9
+ }
@@ -0,0 +1,10 @@
1
+ require 'json'
2
+
3
+ 0.upto(9) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n % 5}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '2014-06-01'
8
+ }
9
+ puts(record.to_json)
10
+ end
@@ -0,0 +1,8 @@
1
+ {
2
+ "bot_id": "bot-that-produces-duplicates",
3
+ "description": "This bot produces duplicates",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb"]
8
+ }
@@ -0,0 +1,10 @@
1
+ require 'json'
2
+
3
+ 0.upto(9) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n % 5}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '2014-06-01'
8
+ }
9
+ puts(record.to_json)
10
+ end
@@ -142,7 +142,7 @@ describe TurbotRunner::Processor do
142
142
  }
143
143
 
144
144
  expect(TurbotRunner::Validator).to receive(:validate).
145
- with('primary data', expected_record_to_validate, ['number'])
145
+ with('primary data', expected_record_to_validate, ['number'], Set.new)
146
146
  @processor.process(record.to_json)
147
147
  end
148
148
  end
@@ -236,6 +236,26 @@ describe TurbotRunner::Runner do
236
236
  expect{@runner.run}.to raise_error(TurbotRunner::InvalidDataType)
237
237
  end
238
238
  end
239
+
240
+ context 'with a bot that produces duplicate data' do
241
+ before do
242
+ @runner = test_runner('bot-that-produces-duplicates')
243
+ end
244
+
245
+ it 'returns false' do
246
+ expect(@runner.run).to be(false)
247
+ end
248
+ end
249
+
250
+ context 'with a bot that is expected to produce duplicate data' do
251
+ before do
252
+ @runner = test_runner('bot-that-is-allowed-to-produce-duplicates')
253
+ end
254
+
255
+ it 'returns true' do
256
+ expect(@runner.run).to be(true)
257
+ end
258
+ end
239
259
  end
240
260
 
241
261
  describe '#process_output' do
@@ -58,7 +58,7 @@ describe TurbotRunner::Validator do
58
58
  'four' => {}
59
59
  }
60
60
  identifying_fields = ['one.two.three', 'four.five.six']
61
- error = TurbotRunner::Validator.validate('primary-data', record, identifying_fields)
61
+ error = TurbotRunner::Validator.validate('primary-data', record, identifying_fields, Set.new)
62
62
  expect(error).to eq('There were no values provided for any of the identifying fields: one.two.three, four.five.six')
63
63
  end
64
64
 
@@ -69,7 +69,7 @@ describe TurbotRunner::Validator do
69
69
  'one' => {'two' => {'three' => 123}}
70
70
  }
71
71
  identifying_fields = ['one.two.three', 'four.five.six']
72
- error = TurbotRunner::Validator.validate('primary-data', record, identifying_fields)
72
+ error = TurbotRunner::Validator.validate('primary-data', record, identifying_fields, Set.new)
73
73
  expect(error).to eq(nil)
74
74
  end
75
75
 
@@ -81,9 +81,24 @@ describe TurbotRunner::Validator do
81
81
  'four' => {'five' => {'six' => 456}}
82
82
  }
83
83
  identifying_fields = ['one.two.three', 'four.five.six']
84
- error = TurbotRunner::Validator.validate('primary-data', record, identifying_fields)
84
+ error = TurbotRunner::Validator.validate('primary-data', record, identifying_fields, Set.new)
85
85
  expect(error).to eq(nil)
86
86
  end
87
87
  end
88
+
89
+ specify 'with duplicate record' do
90
+ record = {
91
+ 'sample_date' => '2014-06-01',
92
+ 'source_url' => 'http://example.com/123',
93
+ 'number' => 123
94
+ }
95
+
96
+ seen_uids = Set.new
97
+ error = TurbotRunner::Validator.validate('primary-data', record, 'number', seen_uids)
98
+ expect(error).to eq(nil)
99
+
100
+ error = TurbotRunner::Validator.validate('primary-data', record, 'number', seen_uids)
101
+ expect(error).to eq('Already seen record with these identifying fields: {"number"=>123}')
102
+ end
88
103
  end
89
104
  end
data/spec/spec_helper.rb CHANGED
@@ -3,7 +3,7 @@ require 'turbot_runner'
3
3
  RSpec::Matchers.define(:fail_validation_with) do |expected_error|
4
4
  match do |record|
5
5
  identifying_fields = ['number']
6
- @error = TurbotRunner::Validator.validate('primary-data', record, identifying_fields)
6
+ @error = TurbotRunner::Validator.validate('primary-data', record, identifying_fields, Set.new)
7
7
  expect(@error).to eq(expected_error)
8
8
  end
9
9
 
@@ -15,6 +15,6 @@ end
15
15
  RSpec::Matchers.define(:be_valid) do
16
16
  match do |record|
17
17
  identifying_fields = ['number']
18
- expect(TurbotRunner::Validator.validate('primary-data', record, identifying_fields)).to eq(nil)
18
+ expect(TurbotRunner::Validator.validate('primary-data', record, identifying_fields, Set.new)).to eq(nil)
19
19
  end
20
20
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: turbot-runner
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.38
4
+ version: 0.1.39
5
5
  platform: ruby
6
6
  authors:
7
7
  - OpenCorporates
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-12 00:00:00.000000000 Z
11
+ date: 2015-03-20 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: activesupport
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 4.1.4
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 4.1.4
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: openc-json_schema
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -86,6 +100,10 @@ files:
86
100
  - spec/bots/bot-that-expects-file/manifest.json
87
101
  - spec/bots/bot-that-expects-file/scraper.rb
88
102
  - spec/bots/bot-that-expects-file/something.txt
103
+ - spec/bots/bot-that-is-allowed-to-produce-duplicates/manifest.json
104
+ - spec/bots/bot-that-is-allowed-to-produce-duplicates/scraper.rb
105
+ - spec/bots/bot-that-produces-duplicates/manifest.json
106
+ - spec/bots/bot-that-produces-duplicates/scraper.rb
89
107
  - spec/bots/bot-with-invalid-data-type/manifest.json
90
108
  - spec/bots/bot-with-invalid-data-type/scraper.rb
91
109
  - spec/bots/bot-with-invalid-sample-date/manifest.json