turbot-runner 0.2.13 → 0.2.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +47 -0
- data/lib/turbot_runner/version.rb +1 -1
- data/schema/schemas/accounts-statement-schema.json +1 -1
- data/schema/schemas/company-schema.json +1 -1
- data/schema/schemas/filing-schema.json +2 -2
- data/schema/schemas/gazette-notice-schema.json +2 -9
- data/schema/schemas/includes/company-for-nesting.json +4 -13
- data/schema/schemas/licence-schema.json +2 -2
- data/spec/lib/runner_spec.rb +0 -5
- metadata +4 -62
- data/bin/rspec +0 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a93e7b12e6d13b95affabf91eb2bcf3914260948
|
4
|
+
data.tar.gz: f129aea4ce1e9f857fa1e9a587dc5fd69a732c81
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 21329ceb668293687d5c0822ae29192a4c30ba3cc263cff784e0373cf94a8e9033d05305eafe53c5508622002c93db493571667a7c20149ac375750f24301134
|
7
|
+
data.tar.gz: f48480dacda4fdf1c3e3d4b21e01ac86851a6d91b463449c2670749fac9079fcdce794b833a9f7315b4dd1405afe525becb19b0e5ce4cb1cd1c3340a3d14b1d2
|
data/README.md
CHANGED
@@ -18,3 +18,50 @@ Bump the version in `lib/turbot_runner/version.rb` according to the [Semantic Ve
|
|
18
18
|
rake release # requires Rubygems credentials
|
19
19
|
|
20
20
|
Finally, [rebuild the Docker image](https://github.com/openc/morph-docker-ruby#readme).
|
21
|
+
|
22
|
+
## Rough outline of how it works
|
23
|
+
|
24
|
+
TurbotRunner is responsible for running a scraper, transforming its data, and
|
25
|
+
then validating and processing any output.
|
26
|
+
|
27
|
+
Work is coordinated by an instance of `Runner`. Most of the interesting work
|
28
|
+
is done in `Runner#run_script`, which constructs a command like:
|
29
|
+
|
30
|
+
python transformer.py >transformer.out 2>transformer.err <scraper.out
|
31
|
+
|
32
|
+
This command is then passed to an instance of `ScriptRunner` which runs the
|
33
|
+
command via `system` in a new thread. The main thread then monitors the output
|
34
|
+
file, and processes each complete line of output.
|
35
|
+
|
36
|
+
A line is processed by an instance of `Processor`, which checks that the line
|
37
|
+
is valid JSON, and then passes it on to the instance of a subclass of
|
38
|
+
`BaseHandler` that was passed to the `Runner` when it was created.
|
39
|
+
|
40
|
+
The subclass of `BaseHandler` can implement any of the following methods:
|
41
|
+
|
42
|
+
* `handle_valid_record`
|
43
|
+
* `handle_invalid_record`
|
44
|
+
* `handle_invalid_json`
|
45
|
+
* `handle_snapshot_ended`
|
46
|
+
|
47
|
+
If the `Processor` finds an invalid record, it interrupts the `ScriptRunner`,
|
48
|
+
and marks the run as having failed.
|
49
|
+
|
50
|
+
The `Processor` will catch an `InterruptRun` that's raised by
|
51
|
+
`handler.handle_valid_record`, which will interrupt the `ScriptRunner`, but
|
52
|
+
will not mark the run as having failed.
|
53
|
+
|
54
|
+
When the `ScriptRunner` is interrupted, it will kill the running process, by
|
55
|
+
sending SIGINT to all the processes in the current process group. The current
|
56
|
+
process is set up (via `trap('INT') {}`) to ignore this.
|
57
|
+
|
58
|
+
If the `ScriptRunner` reads no output from the command within a timeout (by
|
59
|
+
default, 24 hours) it interrupts itself, and marks the run as having failed.
|
60
|
+
|
61
|
+
## Running the tests
|
62
|
+
|
63
|
+
Tests are run with rspec:
|
64
|
+
|
65
|
+
`./bin/rspec`
|
66
|
+
|
67
|
+
The first two specs to run require some manual input.
|
@@ -64,9 +64,9 @@
|
|
64
64
|
"format": "date"
|
65
65
|
},
|
66
66
|
"retrieved_at": {
|
67
|
-
"description": "
|
67
|
+
"description": "Date-time this was retrieved from the source",
|
68
68
|
"type": "string",
|
69
|
-
"format": "date"
|
69
|
+
"format": "date-time"
|
70
70
|
},
|
71
71
|
"other_attributes": {
|
72
72
|
"description": "Use for other attributes for which we don't yet have curated schema attributes",
|
@@ -267,13 +267,6 @@
|
|
267
267
|
"type": "string",
|
268
268
|
"enum": ["other"]
|
269
269
|
},
|
270
|
-
"classification": {
|
271
|
-
"description": "The type of judgment",
|
272
|
-
"type": "array",
|
273
|
-
"items": {
|
274
|
-
"$ref": "includes/classification.json"
|
275
|
-
}
|
276
|
-
},
|
277
270
|
"body": {
|
278
271
|
"description": "The unstructured prose content",
|
279
272
|
"$ref": "#/definitions/body"
|
@@ -412,9 +405,9 @@
|
|
412
405
|
"format": "date"
|
413
406
|
},
|
414
407
|
"retrieved_at": {
|
415
|
-
"description": "The time
|
408
|
+
"description": "The time at which the source URL was requested",
|
416
409
|
"type": "string",
|
417
|
-
"format": "date"
|
410
|
+
"format": "date-time"
|
418
411
|
},
|
419
412
|
"confidence": {
|
420
413
|
"description": "The scraper's author's confidence in the accuracy of the data",
|
@@ -33,7 +33,7 @@
|
|
33
33
|
},
|
34
34
|
"retrieved_at": {
|
35
35
|
"type": "string",
|
36
|
-
"format": "date"
|
36
|
+
"format": "date-time"
|
37
37
|
},
|
38
38
|
"current_status": {
|
39
39
|
"type": [
|
@@ -239,16 +239,7 @@
|
|
239
239
|
}
|
240
240
|
},
|
241
241
|
"additionalProperties": false,
|
242
|
-
"
|
243
|
-
|
244
|
-
"required": [
|
245
|
-
"name"
|
246
|
-
]
|
247
|
-
},
|
248
|
-
{
|
249
|
-
"required": [
|
250
|
-
"company_number"
|
251
|
-
]
|
252
|
-
}
|
242
|
+
"required": [
|
243
|
+
"name"
|
253
244
|
]
|
254
|
-
}
|
245
|
+
}
|
@@ -59,9 +59,9 @@
|
|
59
59
|
"format": "date"
|
60
60
|
},
|
61
61
|
"retrieved_at": {
|
62
|
-
"description": "
|
62
|
+
"description": "Date-time this was retrieved from the source",
|
63
63
|
"type": "string",
|
64
|
-
"format": "date"
|
64
|
+
"format": "date-time"
|
65
65
|
},
|
66
66
|
"licence_url": {
|
67
67
|
"type": "string",
|
data/spec/lib/runner_spec.rb
CHANGED
@@ -2,11 +2,6 @@ require 'json'
|
|
2
2
|
require 'turbot_runner'
|
3
3
|
|
4
4
|
describe TurbotRunner::Runner do
|
5
|
-
after(:all) do
|
6
|
-
puts
|
7
|
-
puts 'If all specs passed, you should now run `ruby spec/manual_spec.rb`'
|
8
|
-
end
|
9
|
-
|
10
5
|
describe '#run' do
|
11
6
|
context 'with a bot written in ruby' do
|
12
7
|
before do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: turbot-runner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.13
|
4
|
+
version: 0.2.14
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- OpenCorporates
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-02-
|
11
|
+
date: 2016-02-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -96,8 +96,7 @@ dependencies:
|
|
96
96
|
version: '3.0'
|
97
97
|
description:
|
98
98
|
email: bots@opencorporates.com
|
99
|
-
executables:
|
100
|
-
- rspec
|
99
|
+
executables: []
|
101
100
|
extensions: []
|
102
101
|
extra_rdoc_files: []
|
103
102
|
files:
|
@@ -106,7 +105,6 @@ files:
|
|
106
105
|
- Gemfile
|
107
106
|
- README.md
|
108
107
|
- Rakefile
|
109
|
-
- bin/rspec
|
110
108
|
- lib/turbot_runner.rb
|
111
109
|
- lib/turbot_runner/base_handler.rb
|
112
110
|
- lib/turbot_runner/exceptions.rb
|
@@ -234,60 +232,4 @@ rubygems_version: 2.4.5
|
|
234
232
|
signing_key:
|
235
233
|
specification_version: 4
|
236
234
|
summary: Utilities for running bots with Turbot
|
237
|
-
test_files:
|
238
|
-
- spec/bots/bot-that-crashes-immediately/manifest.json
|
239
|
-
- spec/bots/bot-that-crashes-immediately/scraper.rb
|
240
|
-
- spec/bots/bot-that-crashes-immediately/transformer1.rb
|
241
|
-
- spec/bots/bot-that-crashes-in-scraper/manifest.json
|
242
|
-
- spec/bots/bot-that-crashes-in-scraper/scraper.rb
|
243
|
-
- spec/bots/bot-that-crashes-in-scraper/transformer1.rb
|
244
|
-
- spec/bots/bot-that-crashes-in-transformer/manifest.json
|
245
|
-
- spec/bots/bot-that-crashes-in-transformer/scraper.rb
|
246
|
-
- spec/bots/bot-that-crashes-in-transformer/transformer1.rb
|
247
|
-
- spec/bots/bot-that-crashes-in-transformer/transformer2.rb
|
248
|
-
- spec/bots/bot-that-emits-run-ended/manifest.json
|
249
|
-
- spec/bots/bot-that-emits-run-ended/scraper.rb
|
250
|
-
- spec/bots/bot-that-emits-snapshot-ended/manifest.json
|
251
|
-
- spec/bots/bot-that-emits-snapshot-ended/scraper.rb
|
252
|
-
- spec/bots/bot-that-expects-file/manifest.json
|
253
|
-
- spec/bots/bot-that-expects-file/scraper.rb
|
254
|
-
- spec/bots/bot-that-expects-file/something.txt
|
255
|
-
- spec/bots/bot-that-is-allowed-to-produce-duplicates/manifest.json
|
256
|
-
- spec/bots/bot-that-is-allowed-to-produce-duplicates/scraper.rb
|
257
|
-
- spec/bots/bot-that-produces-duplicates/manifest.json
|
258
|
-
- spec/bots/bot-that-produces-duplicates/scraper.rb
|
259
|
-
- spec/bots/bot-with-invalid-data-type/manifest.json
|
260
|
-
- spec/bots/bot-with-invalid-data-type/scraper.rb
|
261
|
-
- spec/bots/bot-with-invalid-sample-date/manifest.json
|
262
|
-
- spec/bots/bot-with-invalid-sample-date/scraper.rb
|
263
|
-
- spec/bots/bot-with-pause/manifest.json
|
264
|
-
- spec/bots/bot-with-pause/scraper.rb
|
265
|
-
- spec/bots/bot-with-transformer/manifest.json
|
266
|
-
- spec/bots/bot-with-transformer/scraper.rb
|
267
|
-
- spec/bots/bot-with-transformer/transformer.rb
|
268
|
-
- spec/bots/bot-with-transformers/manifest.json
|
269
|
-
- spec/bots/bot-with-transformers/scraper.rb
|
270
|
-
- spec/bots/bot-with-transformers/transformer1.rb
|
271
|
-
- spec/bots/bot-with-transformers/transformer2.rb
|
272
|
-
- spec/bots/invalid-json-bot/manifest.json
|
273
|
-
- spec/bots/invalid-json-bot/scraper.rb
|
274
|
-
- spec/bots/invalid-record-bot/manifest.json
|
275
|
-
- spec/bots/invalid-record-bot/scraper.rb
|
276
|
-
- spec/bots/logging-bot/manifest.json
|
277
|
-
- spec/bots/logging-bot/scraper.rb
|
278
|
-
- spec/bots/python-bot/manifest.json
|
279
|
-
- spec/bots/python-bot/scraper.py
|
280
|
-
- spec/bots/ruby-bot/manifest.json
|
281
|
-
- spec/bots/ruby-bot/scraper.rb
|
282
|
-
- spec/bots/slow-bot/manifest.json
|
283
|
-
- spec/bots/slow-bot/scraper.rb
|
284
|
-
- spec/lib/processor_spec.rb
|
285
|
-
- spec/lib/runner_spec.rb
|
286
|
-
- spec/lib/utils_spec.rb
|
287
|
-
- spec/lib/validator_spec.rb
|
288
|
-
- spec/manual_spec.rb
|
289
|
-
- spec/outputs/full-scraper.out
|
290
|
-
- spec/outputs/full-transformer.out
|
291
|
-
- spec/outputs/truncated-scraper.out
|
292
|
-
- spec/spec_helper.rb
|
293
|
-
has_rdoc:
|
235
|
+
test_files: []
|
data/bin/rspec
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# This file was generated by Bundler.
|
4
|
-
#
|
5
|
-
# The application 'rspec' is installed as part of a gem, and
|
6
|
-
# this file is here to facilitate running it.
|
7
|
-
#
|
8
|
-
|
9
|
-
require 'pathname'
|
10
|
-
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
-
Pathname.new(__FILE__).realpath)
|
12
|
-
|
13
|
-
require 'rubygems'
|
14
|
-
require 'bundler/setup'
|
15
|
-
|
16
|
-
load Gem.bin_path('rspec-core', 'rspec')
|