turbot-runner 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
M2Q4MGEyNmViMzEzODU2YWJkNTRkNjY2NzQ4N2JkODUzYmM3MTVhOQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
MjJhZTJjZjMwYmZjNTY5Mjk3YWU4OTUwMTA1M2I4MjNiZWFiNzVlYw==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ZjYzMTZhODVkNmU3MWE3ZjkzNTNkMzRhNzZjMWUzODlhODc0YmFlODhlMWE3
|
10
|
+
YzUxMDU3Y2E5MTBhZTg2MTY3MjIyMDNkZmE2OTIwOWI1M2RmMTQ4MDQzODlm
|
11
|
+
MjE0MGMzNTc2NjZlZWI5MGI5NTgxZTQzZDRkYmE4YzUxZjAzOWE=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ZGE3NDZmZGRhYWI2NjNlZTM4MWExYzg3MmZiYzNiMzExMTFhZDlkYWVhOTIy
|
14
|
+
MTk3Mzk1NTU3Y2RhZDY2ZGQ1Yzc5ZTg0YTMxMmQyN2Y0YzVmMjA5ZjVlODk4
|
15
|
+
MDg1MDQ0ZWI4MDk1ZWZjZjk2ZmMyOTEwMDY5Mzc5NjFjZWRjZjg=
|
@@ -3,9 +3,10 @@ require 'json-schema'
|
|
3
3
|
|
4
4
|
module TurbotRunner
|
5
5
|
class Processor
|
6
|
-
def initialize(runner,
|
6
|
+
def initialize(runner, script_config, record_handler)
|
7
7
|
@runner = runner
|
8
|
-
@data_type = data_type
|
8
|
+
@data_type = script_config[:data_type]
|
9
|
+
@identifying_fields = script_config[:identifying_fields]
|
9
10
|
@record_handler = record_handler
|
10
11
|
end
|
11
12
|
|
@@ -44,6 +45,18 @@ module TurbotRunner
|
|
44
45
|
error[:message]
|
45
46
|
end
|
46
47
|
end
|
48
|
+
|
49
|
+
if messages.empty?
|
50
|
+
identifying_attributes = record.reject do |k, v|
|
51
|
+
!@identifying_fields.include?(k) || v.nil? || v == ''
|
52
|
+
end
|
53
|
+
|
54
|
+
if identifying_attributes.empty?
|
55
|
+
messages << "There were no values provided for any of the identifying fields: #{@identifying_fields.join(', ')}"
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
messages
|
47
60
|
end
|
48
61
|
|
49
62
|
def schema
|
data/lib/turbot_runner/runner.rb
CHANGED
@@ -17,20 +17,20 @@ module TurbotRunner
|
|
17
17
|
def run
|
18
18
|
FileUtils.mkdir_p(@output_directory)
|
19
19
|
|
20
|
-
return false if not
|
20
|
+
return false if not run_script(scraper_config)
|
21
21
|
|
22
|
-
transformers.each do |
|
23
|
-
return false if not
|
22
|
+
transformers.each do |transformer_config|
|
23
|
+
return false if not run_script(transformer_config, input_file=scraper_output_file)
|
24
24
|
end
|
25
25
|
|
26
26
|
true
|
27
27
|
end
|
28
28
|
|
29
29
|
def process_output
|
30
|
-
return false if not
|
30
|
+
return false if not process_script_output(scraper_config)
|
31
31
|
|
32
|
-
transformers.each do |
|
33
|
-
return false if not
|
32
|
+
transformers.each do |transformer_config|
|
33
|
+
return false if not process_script_output(transformer_config)
|
34
34
|
end
|
35
35
|
|
36
36
|
true
|
@@ -50,25 +50,13 @@ module TurbotRunner
|
|
50
50
|
end
|
51
51
|
end
|
52
52
|
|
53
|
-
def
|
54
|
-
|
55
|
-
end
|
56
|
-
|
57
|
-
def run_transformer(transformer)
|
58
|
-
run_script(
|
59
|
-
transformer[:file],
|
60
|
-
transformer[:data_type],
|
61
|
-
input_file=scraper_output_file
|
62
|
-
)
|
63
|
-
end
|
64
|
-
|
65
|
-
def run_script(script, data_type, input_file=nil)
|
66
|
-
command = build_command(script, input_file)
|
53
|
+
def run_script(script_config, input_file=nil)
|
54
|
+
command = build_command(script_config[:file], input_file)
|
67
55
|
|
68
56
|
runner = ScriptRunner.new(
|
69
57
|
command,
|
70
|
-
output_file(
|
71
|
-
|
58
|
+
output_file(script_config[:file]),
|
59
|
+
script_config,
|
72
60
|
:record_handler => @record_handler,
|
73
61
|
:timeout => @timeout
|
74
62
|
)
|
@@ -76,18 +64,10 @@ module TurbotRunner
|
|
76
64
|
runner.run
|
77
65
|
end
|
78
66
|
|
79
|
-
def
|
80
|
-
|
81
|
-
end
|
82
|
-
|
83
|
-
def process_transformer_output(transformer)
|
84
|
-
process_script_output(transformer[:file], transformer[:data_type])
|
85
|
-
end
|
86
|
-
|
87
|
-
def process_script_output(script, data_type)
|
88
|
-
processor = Processor.new(nil, data_type, @record_handler)
|
67
|
+
def process_script_output(script_config)
|
68
|
+
processor = Processor.new(nil, script_config, @record_handler)
|
89
69
|
|
90
|
-
File.open(output_file(
|
70
|
+
File.open(output_file(script_config[:file])) do |f|
|
91
71
|
f.each_line do |line|
|
92
72
|
processor.process(line)
|
93
73
|
end
|
@@ -123,6 +103,14 @@ module TurbotRunner
|
|
123
103
|
}[language]
|
124
104
|
end
|
125
105
|
|
106
|
+
def scraper_config
|
107
|
+
{
|
108
|
+
:file => scraper_script,
|
109
|
+
:data_type => scraper_data_type,
|
110
|
+
:identifying_fields => scraper_identifying_fields
|
111
|
+
}
|
112
|
+
end
|
113
|
+
|
126
114
|
def scraper_script
|
127
115
|
"scraper#{script_extension}"
|
128
116
|
end
|
@@ -142,5 +130,9 @@ module TurbotRunner
|
|
142
130
|
def scraper_data_type
|
143
131
|
@config[:data_type]
|
144
132
|
end
|
133
|
+
|
134
|
+
def scraper_identifying_fields
|
135
|
+
@config[:identifying_fields]
|
136
|
+
end
|
145
137
|
end
|
146
138
|
end
|
@@ -8,12 +8,12 @@ trap('INT') {}
|
|
8
8
|
|
9
9
|
module TurbotRunner
|
10
10
|
class ScriptRunner
|
11
|
-
def initialize(command, output_file,
|
11
|
+
def initialize(command, output_file, script_config, options={})
|
12
12
|
@command = command
|
13
13
|
@output_file = output_file
|
14
14
|
|
15
15
|
record_handler = options[:record_handler] || BaseHandler.new # A BaseHandler does nothing
|
16
|
-
@processor = Processor.new(self,
|
16
|
+
@processor = Processor.new(self, script_config, record_handler)
|
17
17
|
|
18
18
|
@timeout = options[:timeout] || 3600
|
19
19
|
end
|
@@ -6,7 +6,13 @@ describe TurbotRunner::Processor do
|
|
6
6
|
before do
|
7
7
|
@handler = TurbotRunner::BaseHandler.new
|
8
8
|
@data_type = 'primary data'
|
9
|
-
|
9
|
+
script_config = {
|
10
|
+
:data_type => @data_type,
|
11
|
+
:identifying_fields => ['number']
|
12
|
+
}
|
13
|
+
script_runner = instance_double('ScriptRunner')
|
14
|
+
allow(script_runner).to receive(:interrupt_and_mark_as_failed)
|
15
|
+
@processor = TurbotRunner::Processor.new(script_runner, script_config, @handler)
|
10
16
|
end
|
11
17
|
|
12
18
|
context 'with valid record' do
|
@@ -21,15 +27,15 @@ describe TurbotRunner::Processor do
|
|
21
27
|
end
|
22
28
|
end
|
23
29
|
|
24
|
-
context 'with
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
end
|
30
|
+
context 'with record missing required field' do
|
31
|
+
before do
|
32
|
+
@record = {
|
33
|
+
'sample_date' => '2014-06-01',
|
34
|
+
'number' => 123
|
35
|
+
}
|
36
|
+
end
|
32
37
|
|
38
|
+
it 'calls Handler#handle_invalid_record' do
|
33
39
|
expected_errors = ['Missing required attribute: source_url']
|
34
40
|
expect(@handler).to receive(:handle_invalid_record).
|
35
41
|
with(@record, @data_type, expected_errors)
|
@@ -37,6 +43,22 @@ describe TurbotRunner::Processor do
|
|
37
43
|
end
|
38
44
|
end
|
39
45
|
|
46
|
+
context 'with record missing all identifying fields' do
|
47
|
+
before do
|
48
|
+
@record = {
|
49
|
+
'sample_date' => '2014-06-01',
|
50
|
+
'source_url' => 'http://example.com/123'
|
51
|
+
}
|
52
|
+
end
|
53
|
+
|
54
|
+
it 'calls Handler#handle_invalid_record' do
|
55
|
+
expected_errors = ['There were no values provided for any of the identifying fields: number']
|
56
|
+
expect(@handler).to receive(:handle_invalid_record).
|
57
|
+
with(@record, @data_type, expected_errors)
|
58
|
+
@processor.process(@record.to_json)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
40
62
|
context 'with invalid JSON' do
|
41
63
|
it 'calls Handler#handle_invalid_json' do
|
42
64
|
line = 'this is not JSON'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: turbot-runner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- OpenCorporates
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-08-
|
11
|
+
date: 2014-08-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json-schema
|
@@ -88,7 +88,7 @@ files:
|
|
88
88
|
- spec/bots/ruby-bot/scraper.rb
|
89
89
|
- spec/bots/slow-bot/manifest.json
|
90
90
|
- spec/bots/slow-bot/scraper.rb
|
91
|
-
- spec/lib/
|
91
|
+
- spec/lib/processor_spec.rb
|
92
92
|
- spec/lib/runner_spec.rb
|
93
93
|
- spec/manual_spec.rb
|
94
94
|
- spec/outputs/full-scraper.out
|