turbot-runner 0.2.13 → 0.2.14
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +47 -0
- data/lib/turbot_runner/version.rb +1 -1
- data/schema/schemas/accounts-statement-schema.json +1 -1
- data/schema/schemas/company-schema.json +1 -1
- data/schema/schemas/filing-schema.json +2 -2
- data/schema/schemas/gazette-notice-schema.json +2 -9
- data/schema/schemas/includes/company-for-nesting.json +4 -13
- data/schema/schemas/licence-schema.json +2 -2
- data/spec/lib/runner_spec.rb +0 -5
- metadata +4 -62
- data/bin/rspec +0 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a93e7b12e6d13b95affabf91eb2bcf3914260948
|
4
|
+
data.tar.gz: f129aea4ce1e9f857fa1e9a587dc5fd69a732c81
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 21329ceb668293687d5c0822ae29192a4c30ba3cc263cff784e0373cf94a8e9033d05305eafe53c5508622002c93db493571667a7c20149ac375750f24301134
|
7
|
+
data.tar.gz: f48480dacda4fdf1c3e3d4b21e01ac86851a6d91b463449c2670749fac9079fcdce794b833a9f7315b4dd1405afe525becb19b0e5ce4cb1cd1c3340a3d14b1d2
|
data/README.md
CHANGED
@@ -18,3 +18,50 @@ Bump the version in `lib/turbot_runner/version.rb` according to the [Semantic Ve
|
|
18
18
|
rake release # requires Rubygems credentials
|
19
19
|
|
20
20
|
Finally, [rebuild the Docker image](https://github.com/openc/morph-docker-ruby#readme).
|
21
|
+
|
22
|
+
## Rough outline of how it works
|
23
|
+
|
24
|
+
TurbotRunner is responsible for running a scraper, transforming its data, and
|
25
|
+
then validating and processing any output.
|
26
|
+
|
27
|
+
Work is coordinated by an instance of `Runner`. Most of the interesting work
|
28
|
+
is done in `Runner#run_script`, which constructs a command like:
|
29
|
+
|
30
|
+
python transformer.py >transformer.out 2>transformer.err <scraper.out
|
31
|
+
|
32
|
+
This command is then passed to an instance of `ScriptRunner` which runs the
|
33
|
+
command via `system` in a new thread. The main thread then monitors the output
|
34
|
+
file, and processes each complete line of output.
|
35
|
+
|
36
|
+
A line is processed by an instance of `Processor`, which checks that the line
|
37
|
+
is valid JSON, and then passes it on to the instance of a subclass of
|
38
|
+
`BaseHandler` that was passed to the `Runner` when it was created.
|
39
|
+
|
40
|
+
The subclass of `BaseHandler` can implement any of the following methods:
|
41
|
+
|
42
|
+
* `handle_valid_record`
|
43
|
+
* `handle_invalid_record`
|
44
|
+
* `handle_invalid_json`
|
45
|
+
* `handle_snapshot_ended`
|
46
|
+
|
47
|
+
If the `Processor` finds an invalid record, it interrupts the `ScriptRunner`,
|
48
|
+
and marks the run as having failed.
|
49
|
+
|
50
|
+
The `Processor` will catch an `InterruptRun` that's raised by
|
51
|
+
`handler.handle_valid_record`, which will interrupt the `ScriptRunner`, but
|
52
|
+
will not mark the run as having failed.
|
53
|
+
|
54
|
+
When the `ScriptRunner` is interrupted, it will kill the running process, by
|
55
|
+
sending SIGINT to all the processes in the current process group. The current
|
56
|
+
process is set up (via `trap('INT') {}`) to ignore this.
|
57
|
+
|
58
|
+
If the `ScriptRunner` reads no output from the command within a timeout (by
|
59
|
+
default, 24 hours) it interrupts itself, and marks the run as having failed.
|
60
|
+
|
61
|
+
## Running the tests
|
62
|
+
|
63
|
+
Tests are run with rspec:
|
64
|
+
|
65
|
+
`./bin/rspec`
|
66
|
+
|
67
|
+
The first two specs to run require some manual input.
|
@@ -64,9 +64,9 @@
|
|
64
64
|
"format": "date"
|
65
65
|
},
|
66
66
|
"retrieved_at": {
|
67
|
-
"description": "
|
67
|
+
"description": "Date-time this was retrieved from the source",
|
68
68
|
"type": "string",
|
69
|
-
"format": "date"
|
69
|
+
"format": "date-time"
|
70
70
|
},
|
71
71
|
"other_attributes": {
|
72
72
|
"description": "Use for other attributes for which we don't yet have curated schema attributes",
|
@@ -267,13 +267,6 @@
|
|
267
267
|
"type": "string",
|
268
268
|
"enum": ["other"]
|
269
269
|
},
|
270
|
-
"classification": {
|
271
|
-
"description": "The type of judgment",
|
272
|
-
"type": "array",
|
273
|
-
"items": {
|
274
|
-
"$ref": "includes/classification.json"
|
275
|
-
}
|
276
|
-
},
|
277
270
|
"body": {
|
278
271
|
"description": "The unstructured prose content",
|
279
272
|
"$ref": "#/definitions/body"
|
@@ -412,9 +405,9 @@
|
|
412
405
|
"format": "date"
|
413
406
|
},
|
414
407
|
"retrieved_at": {
|
415
|
-
"description": "The time
|
408
|
+
"description": "The time at which the source URL was requested",
|
416
409
|
"type": "string",
|
417
|
-
"format": "date"
|
410
|
+
"format": "date-time"
|
418
411
|
},
|
419
412
|
"confidence": {
|
420
413
|
"description": "The scraper's author's confidence in the accuracy of the data",
|
@@ -33,7 +33,7 @@
|
|
33
33
|
},
|
34
34
|
"retrieved_at": {
|
35
35
|
"type": "string",
|
36
|
-
"format": "date"
|
36
|
+
"format": "date-time"
|
37
37
|
},
|
38
38
|
"current_status": {
|
39
39
|
"type": [
|
@@ -239,16 +239,7 @@
|
|
239
239
|
}
|
240
240
|
},
|
241
241
|
"additionalProperties": false,
|
242
|
-
"
|
243
|
-
|
244
|
-
"required": [
|
245
|
-
"name"
|
246
|
-
]
|
247
|
-
},
|
248
|
-
{
|
249
|
-
"required": [
|
250
|
-
"company_number"
|
251
|
-
]
|
252
|
-
}
|
242
|
+
"required": [
|
243
|
+
"name"
|
253
244
|
]
|
254
|
-
}
|
245
|
+
}
|
@@ -59,9 +59,9 @@
|
|
59
59
|
"format": "date"
|
60
60
|
},
|
61
61
|
"retrieved_at": {
|
62
|
-
"description": "
|
62
|
+
"description": "Date-time this was retrieved from the source",
|
63
63
|
"type": "string",
|
64
|
-
"format": "date"
|
64
|
+
"format": "date-time"
|
65
65
|
},
|
66
66
|
"licence_url": {
|
67
67
|
"type": "string",
|
data/spec/lib/runner_spec.rb
CHANGED
@@ -2,11 +2,6 @@ require 'json'
|
|
2
2
|
require 'turbot_runner'
|
3
3
|
|
4
4
|
describe TurbotRunner::Runner do
|
5
|
-
after(:all) do
|
6
|
-
puts
|
7
|
-
puts 'If all specs passed, you should now run `ruby spec/manual_spec.rb`'
|
8
|
-
end
|
9
|
-
|
10
5
|
describe '#run' do
|
11
6
|
context 'with a bot written in ruby' do
|
12
7
|
before do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: turbot-runner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.14
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- OpenCorporates
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-02-
|
11
|
+
date: 2016-02-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -96,8 +96,7 @@ dependencies:
|
|
96
96
|
version: '3.0'
|
97
97
|
description:
|
98
98
|
email: bots@opencorporates.com
|
99
|
-
executables:
|
100
|
-
- rspec
|
99
|
+
executables: []
|
101
100
|
extensions: []
|
102
101
|
extra_rdoc_files: []
|
103
102
|
files:
|
@@ -106,7 +105,6 @@ files:
|
|
106
105
|
- Gemfile
|
107
106
|
- README.md
|
108
107
|
- Rakefile
|
109
|
-
- bin/rspec
|
110
108
|
- lib/turbot_runner.rb
|
111
109
|
- lib/turbot_runner/base_handler.rb
|
112
110
|
- lib/turbot_runner/exceptions.rb
|
@@ -234,60 +232,4 @@ rubygems_version: 2.4.5
|
|
234
232
|
signing_key:
|
235
233
|
specification_version: 4
|
236
234
|
summary: Utilities for running bots with Turbot
|
237
|
-
test_files:
|
238
|
-
- spec/bots/bot-that-crashes-immediately/manifest.json
|
239
|
-
- spec/bots/bot-that-crashes-immediately/scraper.rb
|
240
|
-
- spec/bots/bot-that-crashes-immediately/transformer1.rb
|
241
|
-
- spec/bots/bot-that-crashes-in-scraper/manifest.json
|
242
|
-
- spec/bots/bot-that-crashes-in-scraper/scraper.rb
|
243
|
-
- spec/bots/bot-that-crashes-in-scraper/transformer1.rb
|
244
|
-
- spec/bots/bot-that-crashes-in-transformer/manifest.json
|
245
|
-
- spec/bots/bot-that-crashes-in-transformer/scraper.rb
|
246
|
-
- spec/bots/bot-that-crashes-in-transformer/transformer1.rb
|
247
|
-
- spec/bots/bot-that-crashes-in-transformer/transformer2.rb
|
248
|
-
- spec/bots/bot-that-emits-run-ended/manifest.json
|
249
|
-
- spec/bots/bot-that-emits-run-ended/scraper.rb
|
250
|
-
- spec/bots/bot-that-emits-snapshot-ended/manifest.json
|
251
|
-
- spec/bots/bot-that-emits-snapshot-ended/scraper.rb
|
252
|
-
- spec/bots/bot-that-expects-file/manifest.json
|
253
|
-
- spec/bots/bot-that-expects-file/scraper.rb
|
254
|
-
- spec/bots/bot-that-expects-file/something.txt
|
255
|
-
- spec/bots/bot-that-is-allowed-to-produce-duplicates/manifest.json
|
256
|
-
- spec/bots/bot-that-is-allowed-to-produce-duplicates/scraper.rb
|
257
|
-
- spec/bots/bot-that-produces-duplicates/manifest.json
|
258
|
-
- spec/bots/bot-that-produces-duplicates/scraper.rb
|
259
|
-
- spec/bots/bot-with-invalid-data-type/manifest.json
|
260
|
-
- spec/bots/bot-with-invalid-data-type/scraper.rb
|
261
|
-
- spec/bots/bot-with-invalid-sample-date/manifest.json
|
262
|
-
- spec/bots/bot-with-invalid-sample-date/scraper.rb
|
263
|
-
- spec/bots/bot-with-pause/manifest.json
|
264
|
-
- spec/bots/bot-with-pause/scraper.rb
|
265
|
-
- spec/bots/bot-with-transformer/manifest.json
|
266
|
-
- spec/bots/bot-with-transformer/scraper.rb
|
267
|
-
- spec/bots/bot-with-transformer/transformer.rb
|
268
|
-
- spec/bots/bot-with-transformers/manifest.json
|
269
|
-
- spec/bots/bot-with-transformers/scraper.rb
|
270
|
-
- spec/bots/bot-with-transformers/transformer1.rb
|
271
|
-
- spec/bots/bot-with-transformers/transformer2.rb
|
272
|
-
- spec/bots/invalid-json-bot/manifest.json
|
273
|
-
- spec/bots/invalid-json-bot/scraper.rb
|
274
|
-
- spec/bots/invalid-record-bot/manifest.json
|
275
|
-
- spec/bots/invalid-record-bot/scraper.rb
|
276
|
-
- spec/bots/logging-bot/manifest.json
|
277
|
-
- spec/bots/logging-bot/scraper.rb
|
278
|
-
- spec/bots/python-bot/manifest.json
|
279
|
-
- spec/bots/python-bot/scraper.py
|
280
|
-
- spec/bots/ruby-bot/manifest.json
|
281
|
-
- spec/bots/ruby-bot/scraper.rb
|
282
|
-
- spec/bots/slow-bot/manifest.json
|
283
|
-
- spec/bots/slow-bot/scraper.rb
|
284
|
-
- spec/lib/processor_spec.rb
|
285
|
-
- spec/lib/runner_spec.rb
|
286
|
-
- spec/lib/utils_spec.rb
|
287
|
-
- spec/lib/validator_spec.rb
|
288
|
-
- spec/manual_spec.rb
|
289
|
-
- spec/outputs/full-scraper.out
|
290
|
-
- spec/outputs/full-transformer.out
|
291
|
-
- spec/outputs/truncated-scraper.out
|
292
|
-
- spec/spec_helper.rb
|
293
|
-
has_rdoc:
|
235
|
+
test_files: []
|
data/bin/rspec
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# This file was generated by Bundler.
|
4
|
-
#
|
5
|
-
# The application 'rspec' is installed as part of a gem, and
|
6
|
-
# this file is here to facilitate running it.
|
7
|
-
#
|
8
|
-
|
9
|
-
require 'pathname'
|
10
|
-
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
-
Pathname.new(__FILE__).realpath)
|
12
|
-
|
13
|
-
require 'rubygems'
|
14
|
-
require 'bundler/setup'
|
15
|
-
|
16
|
-
load Gem.bin_path('rspec-core', 'rspec')
|