turbot-runner 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MWZiMTU5NGIzYjc4ZTdlNTJiOGRiMzkwNzI0NjRiY2MwYWU2Yzk0Ng==
4
+ M2Q4MGEyNmViMzEzODU2YWJkNTRkNjY2NzQ4N2JkODUzYmM3MTVhOQ==
5
5
  data.tar.gz: !binary |-
6
- YWZkY2NhNTNhNzdjMWQ1MzBkNzc2NWNhNTY0ZDg2YzU0MWI0YTgxMA==
6
+ MjJhZTJjZjMwYmZjNTY5Mjk3YWU4OTUwMTA1M2I4MjNiZWFiNzVlYw==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- NzlhZjdmMjZhZTNjNDQyOTY1MWVmZGEzZDFmM2RjZWY1MjAwMDM5OTlmZDAx
10
- N2E2ZGExNDljYTYxOTE3NDc3NTEyODYyMDkzOTUxYzVmODdmMzVjMTc5MmE3
11
- NzUyNDhjNTRmNTk2NjRhZDU4M2IzY2VjZTAwOWJmMmFmNTRkYjc=
9
+ ZjYzMTZhODVkNmU3MWE3ZjkzNTNkMzRhNzZjMWUzODlhODc0YmFlODhlMWE3
10
+ YzUxMDU3Y2E5MTBhZTg2MTY3MjIyMDNkZmE2OTIwOWI1M2RmMTQ4MDQzODlm
11
+ MjE0MGMzNTc2NjZlZWI5MGI5NTgxZTQzZDRkYmE4YzUxZjAzOWE=
12
12
  data.tar.gz: !binary |-
13
- MDA5NGJhOTBiZTIyZGYyNTEyN2NhNWEwOTVjYTVlNTQ2YTE2YTg1ODNlYTlj
14
- NTJjYzAxZTMzYzkwMDgzN2MzNGExZGQwMWU1Mzk3YzAwMTVjNDQxMGFkMGU3
15
- MWFlMzQ3Yjg1Njk4ODM3NWUwNGIwNTI0YTk3MWM4YThhM2YxNWY=
13
+ ZGE3NDZmZGRhYWI2NjNlZTM4MWExYzg3MmZiYzNiMzExMTFhZDlkYWVhOTIy
14
+ MTk3Mzk1NTU3Y2RhZDY2ZGQ1Yzc5ZTg0YTMxMmQyN2Y0YzVmMjA5ZjVlODk4
15
+ MDg1MDQ0ZWI4MDk1ZWZjZjk2ZmMyOTEwMDY5Mzc5NjFjZWRjZjg=
@@ -3,9 +3,10 @@ require 'json-schema'
3
3
 
4
4
  module TurbotRunner
5
5
  class Processor
6
- def initialize(runner, data_type, record_handler)
6
+ def initialize(runner, script_config, record_handler)
7
7
  @runner = runner
8
- @data_type = data_type
8
+ @data_type = script_config[:data_type]
9
+ @identifying_fields = script_config[:identifying_fields]
9
10
  @record_handler = record_handler
10
11
  end
11
12
 
@@ -44,6 +45,18 @@ module TurbotRunner
44
45
  error[:message]
45
46
  end
46
47
  end
48
+
49
+ if messages.empty?
50
+ identifying_attributes = record.reject do |k, v|
51
+ !@identifying_fields.include?(k) || v.nil? || v == ''
52
+ end
53
+
54
+ if identifying_attributes.empty?
55
+ messages << "There were no values provided for any of the identifying fields: #{@identifying_fields.join(', ')}"
56
+ end
57
+ end
58
+
59
+ messages
47
60
  end
48
61
 
49
62
  def schema
@@ -17,20 +17,20 @@ module TurbotRunner
17
17
  def run
18
18
  FileUtils.mkdir_p(@output_directory)
19
19
 
20
- return false if not run_scraper
20
+ return false if not run_script(scraper_config)
21
21
 
22
- transformers.each do |transformer|
23
- return false if not run_transformer(transformer)
22
+ transformers.each do |transformer_config|
23
+ return false if not run_script(transformer_config, input_file=scraper_output_file)
24
24
  end
25
25
 
26
26
  true
27
27
  end
28
28
 
29
29
  def process_output
30
- return false if not process_scraper_output
30
+ return false if not process_script_output(scraper_config)
31
31
 
32
- transformers.each do |transformer|
33
- return false if not process_transformer_output(transformer)
32
+ transformers.each do |transformer_config|
33
+ return false if not process_script_output(transformer_config)
34
34
  end
35
35
 
36
36
  true
@@ -50,25 +50,13 @@ module TurbotRunner
50
50
  end
51
51
  end
52
52
 
53
- def run_scraper
54
- run_script(scraper_script, scraper_data_type)
55
- end
56
-
57
- def run_transformer(transformer)
58
- run_script(
59
- transformer[:file],
60
- transformer[:data_type],
61
- input_file=scraper_output_file
62
- )
63
- end
64
-
65
- def run_script(script, data_type, input_file=nil)
66
- command = build_command(script, input_file)
53
+ def run_script(script_config, input_file=nil)
54
+ command = build_command(script_config[:file], input_file)
67
55
 
68
56
  runner = ScriptRunner.new(
69
57
  command,
70
- output_file(script),
71
- data_type,
58
+ output_file(script_config[:file]),
59
+ script_config,
72
60
  :record_handler => @record_handler,
73
61
  :timeout => @timeout
74
62
  )
@@ -76,18 +64,10 @@ module TurbotRunner
76
64
  runner.run
77
65
  end
78
66
 
79
- def process_scraper_output
80
- process_script_output(scraper_script, scraper_data_type)
81
- end
82
-
83
- def process_transformer_output(transformer)
84
- process_script_output(transformer[:file], transformer[:data_type])
85
- end
86
-
87
- def process_script_output(script, data_type)
88
- processor = Processor.new(nil, data_type, @record_handler)
67
+ def process_script_output(script_config)
68
+ processor = Processor.new(nil, script_config, @record_handler)
89
69
 
90
- File.open(output_file(script)) do |f|
70
+ File.open(output_file(script_config[:file])) do |f|
91
71
  f.each_line do |line|
92
72
  processor.process(line)
93
73
  end
@@ -123,6 +103,14 @@ module TurbotRunner
123
103
  }[language]
124
104
  end
125
105
 
106
+ def scraper_config
107
+ {
108
+ :file => scraper_script,
109
+ :data_type => scraper_data_type,
110
+ :identifying_fields => scraper_identifying_fields
111
+ }
112
+ end
113
+
126
114
  def scraper_script
127
115
  "scraper#{script_extension}"
128
116
  end
@@ -142,5 +130,9 @@ module TurbotRunner
142
130
  def scraper_data_type
143
131
  @config[:data_type]
144
132
  end
133
+
134
+ def scraper_identifying_fields
135
+ @config[:identifying_fields]
136
+ end
145
137
  end
146
138
  end
@@ -8,12 +8,12 @@ trap('INT') {}
8
8
 
9
9
  module TurbotRunner
10
10
  class ScriptRunner
11
- def initialize(command, output_file, data_type, options={})
11
+ def initialize(command, output_file, script_config, options={})
12
12
  @command = command
13
13
  @output_file = output_file
14
14
 
15
15
  record_handler = options[:record_handler] || BaseHandler.new # A BaseHandler does nothing
16
- @processor = Processor.new(self, data_type, record_handler)
16
+ @processor = Processor.new(self, script_config, record_handler)
17
17
 
18
18
  @timeout = options[:timeout] || 3600
19
19
  end
@@ -1,3 +1,3 @@
1
1
  module TurbotRunner
2
- VERSION = '0.1.3'
2
+ VERSION = '0.1.4'
3
3
  end
@@ -6,7 +6,13 @@ describe TurbotRunner::Processor do
6
6
  before do
7
7
  @handler = TurbotRunner::BaseHandler.new
8
8
  @data_type = 'primary data'
9
- @processor = TurbotRunner::Processor.new(@handler, @data_type)
9
+ script_config = {
10
+ :data_type => @data_type,
11
+ :identifying_fields => ['number']
12
+ }
13
+ script_runner = instance_double('ScriptRunner')
14
+ allow(script_runner).to receive(:interrupt_and_mark_as_failed)
15
+ @processor = TurbotRunner::Processor.new(script_runner, script_config, @handler)
10
16
  end
11
17
 
12
18
  context 'with valid record' do
@@ -21,15 +27,15 @@ describe TurbotRunner::Processor do
21
27
  end
22
28
  end
23
29
 
24
- context 'with invalid record' do
25
- it 'calls Handler#handle_invalid_record' do
26
- before do
27
- @record = {
28
- 'sample_date' => '2014-06-01',
29
- 'number' => 123
30
- }
31
- end
30
+ context 'with record missing required field' do
31
+ before do
32
+ @record = {
33
+ 'sample_date' => '2014-06-01',
34
+ 'number' => 123
35
+ }
36
+ end
32
37
 
38
+ it 'calls Handler#handle_invalid_record' do
33
39
  expected_errors = ['Missing required attribute: source_url']
34
40
  expect(@handler).to receive(:handle_invalid_record).
35
41
  with(@record, @data_type, expected_errors)
@@ -37,6 +43,22 @@ describe TurbotRunner::Processor do
37
43
  end
38
44
  end
39
45
 
46
+ context 'with record missing all identifying fields' do
47
+ before do
48
+ @record = {
49
+ 'sample_date' => '2014-06-01',
50
+ 'source_url' => 'http://example.com/123'
51
+ }
52
+ end
53
+
54
+ it 'calls Handler#handle_invalid_record' do
55
+ expected_errors = ['There were no values provided for any of the identifying fields: number']
56
+ expect(@handler).to receive(:handle_invalid_record).
57
+ with(@record, @data_type, expected_errors)
58
+ @processor.process(@record.to_json)
59
+ end
60
+ end
61
+
40
62
  context 'with invalid JSON' do
41
63
  it 'calls Handler#handle_invalid_json' do
42
64
  line = 'this is not JSON'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: turbot-runner
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - OpenCorporates
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-08-07 00:00:00.000000000 Z
11
+ date: 2014-08-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json-schema
@@ -88,7 +88,7 @@ files:
88
88
  - spec/bots/ruby-bot/scraper.rb
89
89
  - spec/bots/slow-bot/manifest.json
90
90
  - spec/bots/slow-bot/scraper.rb
91
- - spec/lib/processor.rb
91
+ - spec/lib/processor_spec.rb
92
92
  - spec/lib/runner_spec.rb
93
93
  - spec/manual_spec.rb
94
94
  - spec/outputs/full-scraper.out