turbot-runner 0.1.3 → 0.1.4

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MWZiMTU5NGIzYjc4ZTdlNTJiOGRiMzkwNzI0NjRiY2MwYWU2Yzk0Ng==
4
+ M2Q4MGEyNmViMzEzODU2YWJkNTRkNjY2NzQ4N2JkODUzYmM3MTVhOQ==
5
5
  data.tar.gz: !binary |-
6
- YWZkY2NhNTNhNzdjMWQ1MzBkNzc2NWNhNTY0ZDg2YzU0MWI0YTgxMA==
6
+ MjJhZTJjZjMwYmZjNTY5Mjk3YWU4OTUwMTA1M2I4MjNiZWFiNzVlYw==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- NzlhZjdmMjZhZTNjNDQyOTY1MWVmZGEzZDFmM2RjZWY1MjAwMDM5OTlmZDAx
10
- N2E2ZGExNDljYTYxOTE3NDc3NTEyODYyMDkzOTUxYzVmODdmMzVjMTc5MmE3
11
- NzUyNDhjNTRmNTk2NjRhZDU4M2IzY2VjZTAwOWJmMmFmNTRkYjc=
9
+ ZjYzMTZhODVkNmU3MWE3ZjkzNTNkMzRhNzZjMWUzODlhODc0YmFlODhlMWE3
10
+ YzUxMDU3Y2E5MTBhZTg2MTY3MjIyMDNkZmE2OTIwOWI1M2RmMTQ4MDQzODlm
11
+ MjE0MGMzNTc2NjZlZWI5MGI5NTgxZTQzZDRkYmE4YzUxZjAzOWE=
12
12
  data.tar.gz: !binary |-
13
- MDA5NGJhOTBiZTIyZGYyNTEyN2NhNWEwOTVjYTVlNTQ2YTE2YTg1ODNlYTlj
14
- NTJjYzAxZTMzYzkwMDgzN2MzNGExZGQwMWU1Mzk3YzAwMTVjNDQxMGFkMGU3
15
- MWFlMzQ3Yjg1Njk4ODM3NWUwNGIwNTI0YTk3MWM4YThhM2YxNWY=
13
+ ZGE3NDZmZGRhYWI2NjNlZTM4MWExYzg3MmZiYzNiMzExMTFhZDlkYWVhOTIy
14
+ MTk3Mzk1NTU3Y2RhZDY2ZGQ1Yzc5ZTg0YTMxMmQyN2Y0YzVmMjA5ZjVlODk4
15
+ MDg1MDQ0ZWI4MDk1ZWZjZjk2ZmMyOTEwMDY5Mzc5NjFjZWRjZjg=
@@ -3,9 +3,10 @@ require 'json-schema'
3
3
 
4
4
  module TurbotRunner
5
5
  class Processor
6
- def initialize(runner, data_type, record_handler)
6
+ def initialize(runner, script_config, record_handler)
7
7
  @runner = runner
8
- @data_type = data_type
8
+ @data_type = script_config[:data_type]
9
+ @identifying_fields = script_config[:identifying_fields]
9
10
  @record_handler = record_handler
10
11
  end
11
12
 
@@ -44,6 +45,18 @@ module TurbotRunner
44
45
  error[:message]
45
46
  end
46
47
  end
48
+
49
+ if messages.empty?
50
+ identifying_attributes = record.reject do |k, v|
51
+ !@identifying_fields.include?(k) || v.nil? || v == ''
52
+ end
53
+
54
+ if identifying_attributes.empty?
55
+ messages << "There were no values provided for any of the identifying fields: #{@identifying_fields.join(', ')}"
56
+ end
57
+ end
58
+
59
+ messages
47
60
  end
48
61
 
49
62
  def schema
@@ -17,20 +17,20 @@ module TurbotRunner
17
17
  def run
18
18
  FileUtils.mkdir_p(@output_directory)
19
19
 
20
- return false if not run_scraper
20
+ return false if not run_script(scraper_config)
21
21
 
22
- transformers.each do |transformer|
23
- return false if not run_transformer(transformer)
22
+ transformers.each do |transformer_config|
23
+ return false if not run_script(transformer_config, input_file=scraper_output_file)
24
24
  end
25
25
 
26
26
  true
27
27
  end
28
28
 
29
29
  def process_output
30
- return false if not process_scraper_output
30
+ return false if not process_script_output(scraper_config)
31
31
 
32
- transformers.each do |transformer|
33
- return false if not process_transformer_output(transformer)
32
+ transformers.each do |transformer_config|
33
+ return false if not process_script_output(transformer_config)
34
34
  end
35
35
 
36
36
  true
@@ -50,25 +50,13 @@ module TurbotRunner
50
50
  end
51
51
  end
52
52
 
53
- def run_scraper
54
- run_script(scraper_script, scraper_data_type)
55
- end
56
-
57
- def run_transformer(transformer)
58
- run_script(
59
- transformer[:file],
60
- transformer[:data_type],
61
- input_file=scraper_output_file
62
- )
63
- end
64
-
65
- def run_script(script, data_type, input_file=nil)
66
- command = build_command(script, input_file)
53
+ def run_script(script_config, input_file=nil)
54
+ command = build_command(script_config[:file], input_file)
67
55
 
68
56
  runner = ScriptRunner.new(
69
57
  command,
70
- output_file(script),
71
- data_type,
58
+ output_file(script_config[:file]),
59
+ script_config,
72
60
  :record_handler => @record_handler,
73
61
  :timeout => @timeout
74
62
  )
@@ -76,18 +64,10 @@ module TurbotRunner
76
64
  runner.run
77
65
  end
78
66
 
79
- def process_scraper_output
80
- process_script_output(scraper_script, scraper_data_type)
81
- end
82
-
83
- def process_transformer_output(transformer)
84
- process_script_output(transformer[:file], transformer[:data_type])
85
- end
86
-
87
- def process_script_output(script, data_type)
88
- processor = Processor.new(nil, data_type, @record_handler)
67
+ def process_script_output(script_config)
68
+ processor = Processor.new(nil, script_config, @record_handler)
89
69
 
90
- File.open(output_file(script)) do |f|
70
+ File.open(output_file(script_config[:file])) do |f|
91
71
  f.each_line do |line|
92
72
  processor.process(line)
93
73
  end
@@ -123,6 +103,14 @@ module TurbotRunner
123
103
  }[language]
124
104
  end
125
105
 
106
+ def scraper_config
107
+ {
108
+ :file => scraper_script,
109
+ :data_type => scraper_data_type,
110
+ :identifying_fields => scraper_identifying_fields
111
+ }
112
+ end
113
+
126
114
  def scraper_script
127
115
  "scraper#{script_extension}"
128
116
  end
@@ -142,5 +130,9 @@ module TurbotRunner
142
130
  def scraper_data_type
143
131
  @config[:data_type]
144
132
  end
133
+
134
+ def scraper_identifying_fields
135
+ @config[:identifying_fields]
136
+ end
145
137
  end
146
138
  end
@@ -8,12 +8,12 @@ trap('INT') {}
8
8
 
9
9
  module TurbotRunner
10
10
  class ScriptRunner
11
- def initialize(command, output_file, data_type, options={})
11
+ def initialize(command, output_file, script_config, options={})
12
12
  @command = command
13
13
  @output_file = output_file
14
14
 
15
15
  record_handler = options[:record_handler] || BaseHandler.new # A BaseHandler does nothing
16
- @processor = Processor.new(self, data_type, record_handler)
16
+ @processor = Processor.new(self, script_config, record_handler)
17
17
 
18
18
  @timeout = options[:timeout] || 3600
19
19
  end
@@ -1,3 +1,3 @@
1
1
  module TurbotRunner
2
- VERSION = '0.1.3'
2
+ VERSION = '0.1.4'
3
3
  end
@@ -6,7 +6,13 @@ describe TurbotRunner::Processor do
6
6
  before do
7
7
  @handler = TurbotRunner::BaseHandler.new
8
8
  @data_type = 'primary data'
9
- @processor = TurbotRunner::Processor.new(@handler, @data_type)
9
+ script_config = {
10
+ :data_type => @data_type,
11
+ :identifying_fields => ['number']
12
+ }
13
+ script_runner = instance_double('ScriptRunner')
14
+ allow(script_runner).to receive(:interrupt_and_mark_as_failed)
15
+ @processor = TurbotRunner::Processor.new(script_runner, script_config, @handler)
10
16
  end
11
17
 
12
18
  context 'with valid record' do
@@ -21,15 +27,15 @@ describe TurbotRunner::Processor do
21
27
  end
22
28
  end
23
29
 
24
- context 'with invalid record' do
25
- it 'calls Handler#handle_invalid_record' do
26
- before do
27
- @record = {
28
- 'sample_date' => '2014-06-01',
29
- 'number' => 123
30
- }
31
- end
30
+ context 'with record missing required field' do
31
+ before do
32
+ @record = {
33
+ 'sample_date' => '2014-06-01',
34
+ 'number' => 123
35
+ }
36
+ end
32
37
 
38
+ it 'calls Handler#handle_invalid_record' do
33
39
  expected_errors = ['Missing required attribute: source_url']
34
40
  expect(@handler).to receive(:handle_invalid_record).
35
41
  with(@record, @data_type, expected_errors)
@@ -37,6 +43,22 @@ describe TurbotRunner::Processor do
37
43
  end
38
44
  end
39
45
 
46
+ context 'with record missing all identifying fields' do
47
+ before do
48
+ @record = {
49
+ 'sample_date' => '2014-06-01',
50
+ 'source_url' => 'http://example.com/123'
51
+ }
52
+ end
53
+
54
+ it 'calls Handler#handle_invalid_record' do
55
+ expected_errors = ['There were no values provided for any of the identifying fields: number']
56
+ expect(@handler).to receive(:handle_invalid_record).
57
+ with(@record, @data_type, expected_errors)
58
+ @processor.process(@record.to_json)
59
+ end
60
+ end
61
+
40
62
  context 'with invalid JSON' do
41
63
  it 'calls Handler#handle_invalid_json' do
42
64
  line = 'this is not JSON'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: turbot-runner
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - OpenCorporates
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-08-07 00:00:00.000000000 Z
11
+ date: 2014-08-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json-schema
@@ -88,7 +88,7 @@ files:
88
88
  - spec/bots/ruby-bot/scraper.rb
89
89
  - spec/bots/slow-bot/manifest.json
90
90
  - spec/bots/slow-bot/scraper.rb
91
- - spec/lib/processor.rb
91
+ - spec/lib/processor_spec.rb
92
92
  - spec/lib/runner_spec.rb
93
93
  - spec/manual_spec.rb
94
94
  - spec/outputs/full-scraper.out