turbot-runner 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
M2Q4MGEyNmViMzEzODU2YWJkNTRkNjY2NzQ4N2JkODUzYmM3MTVhOQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
MjJhZTJjZjMwYmZjNTY5Mjk3YWU4OTUwMTA1M2I4MjNiZWFiNzVlYw==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ZjYzMTZhODVkNmU3MWE3ZjkzNTNkMzRhNzZjMWUzODlhODc0YmFlODhlMWE3
|
10
|
+
YzUxMDU3Y2E5MTBhZTg2MTY3MjIyMDNkZmE2OTIwOWI1M2RmMTQ4MDQzODlm
|
11
|
+
MjE0MGMzNTc2NjZlZWI5MGI5NTgxZTQzZDRkYmE4YzUxZjAzOWE=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ZGE3NDZmZGRhYWI2NjNlZTM4MWExYzg3MmZiYzNiMzExMTFhZDlkYWVhOTIy
|
14
|
+
MTk3Mzk1NTU3Y2RhZDY2ZGQ1Yzc5ZTg0YTMxMmQyN2Y0YzVmMjA5ZjVlODk4
|
15
|
+
MDg1MDQ0ZWI4MDk1ZWZjZjk2ZmMyOTEwMDY5Mzc5NjFjZWRjZjg=
|
@@ -3,9 +3,10 @@ require 'json-schema'
|
|
3
3
|
|
4
4
|
module TurbotRunner
|
5
5
|
class Processor
|
6
|
-
def initialize(runner,
|
6
|
+
def initialize(runner, script_config, record_handler)
|
7
7
|
@runner = runner
|
8
|
-
@data_type = data_type
|
8
|
+
@data_type = script_config[:data_type]
|
9
|
+
@identifying_fields = script_config[:identifying_fields]
|
9
10
|
@record_handler = record_handler
|
10
11
|
end
|
11
12
|
|
@@ -44,6 +45,18 @@ module TurbotRunner
|
|
44
45
|
error[:message]
|
45
46
|
end
|
46
47
|
end
|
48
|
+
|
49
|
+
if messages.empty?
|
50
|
+
identifying_attributes = record.reject do |k, v|
|
51
|
+
!@identifying_fields.include?(k) || v.nil? || v == ''
|
52
|
+
end
|
53
|
+
|
54
|
+
if identifying_attributes.empty?
|
55
|
+
messages << "There were no values provided for any of the identifying fields: #{@identifying_fields.join(', ')}"
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
messages
|
47
60
|
end
|
48
61
|
|
49
62
|
def schema
|
data/lib/turbot_runner/runner.rb
CHANGED
@@ -17,20 +17,20 @@ module TurbotRunner
|
|
17
17
|
def run
|
18
18
|
FileUtils.mkdir_p(@output_directory)
|
19
19
|
|
20
|
-
return false if not
|
20
|
+
return false if not run_script(scraper_config)
|
21
21
|
|
22
|
-
transformers.each do |
|
23
|
-
return false if not
|
22
|
+
transformers.each do |transformer_config|
|
23
|
+
return false if not run_script(transformer_config, input_file=scraper_output_file)
|
24
24
|
end
|
25
25
|
|
26
26
|
true
|
27
27
|
end
|
28
28
|
|
29
29
|
def process_output
|
30
|
-
return false if not
|
30
|
+
return false if not process_script_output(scraper_config)
|
31
31
|
|
32
|
-
transformers.each do |
|
33
|
-
return false if not
|
32
|
+
transformers.each do |transformer_config|
|
33
|
+
return false if not process_script_output(transformer_config)
|
34
34
|
end
|
35
35
|
|
36
36
|
true
|
@@ -50,25 +50,13 @@ module TurbotRunner
|
|
50
50
|
end
|
51
51
|
end
|
52
52
|
|
53
|
-
def
|
54
|
-
|
55
|
-
end
|
56
|
-
|
57
|
-
def run_transformer(transformer)
|
58
|
-
run_script(
|
59
|
-
transformer[:file],
|
60
|
-
transformer[:data_type],
|
61
|
-
input_file=scraper_output_file
|
62
|
-
)
|
63
|
-
end
|
64
|
-
|
65
|
-
def run_script(script, data_type, input_file=nil)
|
66
|
-
command = build_command(script, input_file)
|
53
|
+
def run_script(script_config, input_file=nil)
|
54
|
+
command = build_command(script_config[:file], input_file)
|
67
55
|
|
68
56
|
runner = ScriptRunner.new(
|
69
57
|
command,
|
70
|
-
output_file(
|
71
|
-
|
58
|
+
output_file(script_config[:file]),
|
59
|
+
script_config,
|
72
60
|
:record_handler => @record_handler,
|
73
61
|
:timeout => @timeout
|
74
62
|
)
|
@@ -76,18 +64,10 @@ module TurbotRunner
|
|
76
64
|
runner.run
|
77
65
|
end
|
78
66
|
|
79
|
-
def
|
80
|
-
|
81
|
-
end
|
82
|
-
|
83
|
-
def process_transformer_output(transformer)
|
84
|
-
process_script_output(transformer[:file], transformer[:data_type])
|
85
|
-
end
|
86
|
-
|
87
|
-
def process_script_output(script, data_type)
|
88
|
-
processor = Processor.new(nil, data_type, @record_handler)
|
67
|
+
def process_script_output(script_config)
|
68
|
+
processor = Processor.new(nil, script_config, @record_handler)
|
89
69
|
|
90
|
-
File.open(output_file(
|
70
|
+
File.open(output_file(script_config[:file])) do |f|
|
91
71
|
f.each_line do |line|
|
92
72
|
processor.process(line)
|
93
73
|
end
|
@@ -123,6 +103,14 @@ module TurbotRunner
|
|
123
103
|
}[language]
|
124
104
|
end
|
125
105
|
|
106
|
+
def scraper_config
|
107
|
+
{
|
108
|
+
:file => scraper_script,
|
109
|
+
:data_type => scraper_data_type,
|
110
|
+
:identifying_fields => scraper_identifying_fields
|
111
|
+
}
|
112
|
+
end
|
113
|
+
|
126
114
|
def scraper_script
|
127
115
|
"scraper#{script_extension}"
|
128
116
|
end
|
@@ -142,5 +130,9 @@ module TurbotRunner
|
|
142
130
|
def scraper_data_type
|
143
131
|
@config[:data_type]
|
144
132
|
end
|
133
|
+
|
134
|
+
def scraper_identifying_fields
|
135
|
+
@config[:identifying_fields]
|
136
|
+
end
|
145
137
|
end
|
146
138
|
end
|
@@ -8,12 +8,12 @@ trap('INT') {}
|
|
8
8
|
|
9
9
|
module TurbotRunner
|
10
10
|
class ScriptRunner
|
11
|
-
def initialize(command, output_file,
|
11
|
+
def initialize(command, output_file, script_config, options={})
|
12
12
|
@command = command
|
13
13
|
@output_file = output_file
|
14
14
|
|
15
15
|
record_handler = options[:record_handler] || BaseHandler.new # A BaseHandler does nothing
|
16
|
-
@processor = Processor.new(self,
|
16
|
+
@processor = Processor.new(self, script_config, record_handler)
|
17
17
|
|
18
18
|
@timeout = options[:timeout] || 3600
|
19
19
|
end
|
@@ -6,7 +6,13 @@ describe TurbotRunner::Processor do
|
|
6
6
|
before do
|
7
7
|
@handler = TurbotRunner::BaseHandler.new
|
8
8
|
@data_type = 'primary data'
|
9
|
-
|
9
|
+
script_config = {
|
10
|
+
:data_type => @data_type,
|
11
|
+
:identifying_fields => ['number']
|
12
|
+
}
|
13
|
+
script_runner = instance_double('ScriptRunner')
|
14
|
+
allow(script_runner).to receive(:interrupt_and_mark_as_failed)
|
15
|
+
@processor = TurbotRunner::Processor.new(script_runner, script_config, @handler)
|
10
16
|
end
|
11
17
|
|
12
18
|
context 'with valid record' do
|
@@ -21,15 +27,15 @@ describe TurbotRunner::Processor do
|
|
21
27
|
end
|
22
28
|
end
|
23
29
|
|
24
|
-
context 'with
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
end
|
30
|
+
context 'with record missing required field' do
|
31
|
+
before do
|
32
|
+
@record = {
|
33
|
+
'sample_date' => '2014-06-01',
|
34
|
+
'number' => 123
|
35
|
+
}
|
36
|
+
end
|
32
37
|
|
38
|
+
it 'calls Handler#handle_invalid_record' do
|
33
39
|
expected_errors = ['Missing required attribute: source_url']
|
34
40
|
expect(@handler).to receive(:handle_invalid_record).
|
35
41
|
with(@record, @data_type, expected_errors)
|
@@ -37,6 +43,22 @@ describe TurbotRunner::Processor do
|
|
37
43
|
end
|
38
44
|
end
|
39
45
|
|
46
|
+
context 'with record missing all identifying fields' do
|
47
|
+
before do
|
48
|
+
@record = {
|
49
|
+
'sample_date' => '2014-06-01',
|
50
|
+
'source_url' => 'http://example.com/123'
|
51
|
+
}
|
52
|
+
end
|
53
|
+
|
54
|
+
it 'calls Handler#handle_invalid_record' do
|
55
|
+
expected_errors = ['There were no values provided for any of the identifying fields: number']
|
56
|
+
expect(@handler).to receive(:handle_invalid_record).
|
57
|
+
with(@record, @data_type, expected_errors)
|
58
|
+
@processor.process(@record.to_json)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
40
62
|
context 'with invalid JSON' do
|
41
63
|
it 'calls Handler#handle_invalid_json' do
|
42
64
|
line = 'this is not JSON'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: turbot-runner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- OpenCorporates
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-08-
|
11
|
+
date: 2014-08-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json-schema
|
@@ -88,7 +88,7 @@ files:
|
|
88
88
|
- spec/bots/ruby-bot/scraper.rb
|
89
89
|
- spec/bots/slow-bot/manifest.json
|
90
90
|
- spec/bots/slow-bot/scraper.rb
|
91
|
-
- spec/lib/
|
91
|
+
- spec/lib/processor_spec.rb
|
92
92
|
- spec/lib/runner_spec.rb
|
93
93
|
- spec/manual_spec.rb
|
94
94
|
- spec/outputs/full-scraper.out
|