turbot-runner-morph 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/bin/rspec +16 -0
- data/lib/turbot_runner.rb +28 -0
- data/lib/turbot_runner/base_handler.rb +15 -0
- data/lib/turbot_runner/exceptions.rb +4 -0
- data/lib/turbot_runner/prerun.rb +3 -0
- data/lib/turbot_runner/processor.rb +53 -0
- data/lib/turbot_runner/runner.rb +179 -0
- data/lib/turbot_runner/script_runner.rb +98 -0
- data/lib/turbot_runner/utils.rb +47 -0
- data/lib/turbot_runner/validator.rb +28 -0
- data/lib/turbot_runner/version.rb +3 -0
- data/schema/schemas/company-schema.json +243 -0
- data/schema/schemas/financial-payment-schema.json +32 -0
- data/schema/schemas/includes/address.json +53 -0
- data/schema/schemas/includes/alternative_name.json +36 -0
- data/schema/schemas/includes/company-for-nesting.json +245 -0
- data/schema/schemas/includes/company.json +25 -0
- data/schema/schemas/includes/entity.json +58 -0
- data/schema/schemas/includes/filing.json +52 -0
- data/schema/schemas/includes/financial-payment-data-object.json +112 -0
- data/schema/schemas/includes/identifier.json +20 -0
- data/schema/schemas/includes/industry_code.json +29 -0
- data/schema/schemas/includes/licence-data-object.json +63 -0
- data/schema/schemas/includes/officer.json +70 -0
- data/schema/schemas/includes/organisation.json +58 -0
- data/schema/schemas/includes/permission.json +46 -0
- data/schema/schemas/includes/person.json +62 -0
- data/schema/schemas/includes/person_name.json +71 -0
- data/schema/schemas/includes/previous_name.json +24 -0
- data/schema/schemas/includes/share-parcel-data.json +82 -0
- data/schema/schemas/includes/share-parcel.json +78 -0
- data/schema/schemas/includes/subsidiary-relationship-data.json +58 -0
- data/schema/schemas/includes/total-shares.json +17 -0
- data/schema/schemas/includes/unknown_entity_type.json +58 -0
- data/schema/schemas/licence-schema.json +105 -0
- data/schema/schemas/primary-data-schema.json +20 -0
- data/schema/schemas/share-parcel-schema.json +22 -0
- data/schema/schemas/simple-financial-payment-schema.json +122 -0
- data/schema/schemas/simple-licence-schema.json +82 -0
- data/schema/schemas/simple-subsidiary-schema.json +85 -0
- data/schema/schemas/subsidiary-relationship-schema.json +46 -0
- data/spec/bots/bot-that-crashes-immediately/manifest.json +15 -0
- data/spec/bots/bot-that-crashes-immediately/scraper.rb +1 -0
- data/spec/bots/bot-that-crashes-immediately/transformer1.rb +15 -0
- data/spec/bots/bot-that-crashes-in-scraper/manifest.json +15 -0
- data/spec/bots/bot-that-crashes-in-scraper/scraper.rb +11 -0
- data/spec/bots/bot-that-crashes-in-scraper/transformer1.rb +15 -0
- data/spec/bots/bot-that-crashes-in-transformer/manifest.json +20 -0
- data/spec/bots/bot-that-crashes-in-transformer/scraper.rb +10 -0
- data/spec/bots/bot-that-crashes-in-transformer/transformer1.rb +15 -0
- data/spec/bots/bot-that-crashes-in-transformer/transformer2.rb +17 -0
- data/spec/bots/bot-that-emits-run-ended/manifest.json +8 -0
- data/spec/bots/bot-that-emits-run-ended/scraper.rb +11 -0
- data/spec/bots/bot-that-expects-file/manifest.json +8 -0
- data/spec/bots/bot-that-expects-file/scraper.rb +11 -0
- data/spec/bots/bot-that-expects-file/something.txt +1 -0
- data/spec/bots/bot-with-invalid-data-type/manifest.json +8 -0
- data/spec/bots/bot-with-invalid-data-type/scraper.rb +10 -0
- data/spec/bots/bot-with-invalid-sample-date/manifest.json +8 -0
- data/spec/bots/bot-with-invalid-sample-date/scraper.rb +10 -0
- data/spec/bots/bot-with-pause/manifest.json +8 -0
- data/spec/bots/bot-with-pause/scraper.rb +16 -0
- data/spec/bots/bot-with-transformer/manifest.json +15 -0
- data/spec/bots/bot-with-transformer/scraper.rb +10 -0
- data/spec/bots/bot-with-transformer/transformer.rb +15 -0
- data/spec/bots/bot-with-transformers/manifest.json +20 -0
- data/spec/bots/bot-with-transformers/scraper.rb +10 -0
- data/spec/bots/bot-with-transformers/transformer1.rb +15 -0
- data/spec/bots/bot-with-transformers/transformer2.rb +15 -0
- data/spec/bots/invalid-json-bot/manifest.json +8 -0
- data/spec/bots/invalid-json-bot/scraper.rb +11 -0
- data/spec/bots/invalid-record-bot/manifest.json +8 -0
- data/spec/bots/invalid-record-bot/scraper.rb +11 -0
- data/spec/bots/logging-bot/manifest.json +8 -0
- data/spec/bots/logging-bot/scraper.rb +14 -0
- data/spec/bots/python-bot/manifest.json +8 -0
- data/spec/bots/python-bot/scraper.py +11 -0
- data/spec/bots/ruby-bot/manifest.json +8 -0
- data/spec/bots/ruby-bot/scraper.rb +10 -0
- data/spec/bots/slow-bot/manifest.json +8 -0
- data/spec/bots/slow-bot/scraper.rb +11 -0
- data/spec/lib/processor_spec.rb +181 -0
- data/spec/lib/runner_spec.rb +330 -0
- data/spec/lib/utils_spec.rb +23 -0
- data/spec/lib/validator_spec.rb +89 -0
- data/spec/manual_spec.rb +57 -0
- data/spec/outputs/full-scraper.out +10 -0
- data/spec/outputs/full-transformer.out +10 -0
- data/spec/outputs/truncated-scraper.out +5 -0
- data/spec/spec_helper.rb +20 -0
- metadata +148 -0
@@ -0,0 +1,46 @@
|
|
1
|
+
{
|
2
|
+
"$schema": "http://json-schema.org/draft-04/schema#",
|
3
|
+
"description": "A relationship of control between two companies",
|
4
|
+
"type": "object",
|
5
|
+
"properties": {
|
6
|
+
"sample_date": {
|
7
|
+
"type": "string",
|
8
|
+
"format": "date"
|
9
|
+
},
|
10
|
+
"start_date": {
|
11
|
+
"type": "string",
|
12
|
+
"format": "date"
|
13
|
+
},
|
14
|
+
"start_date_type": {
|
15
|
+
"type": "string"
|
16
|
+
},
|
17
|
+
"end_date": {
|
18
|
+
"type": "string",
|
19
|
+
"format": "date"
|
20
|
+
},
|
21
|
+
"end_date_type": {
|
22
|
+
"type": "string"
|
23
|
+
},
|
24
|
+
"source_jurisdiction": {
|
25
|
+
"description": "Jurisdiction of the source of the data",
|
26
|
+
"type": "string"
|
27
|
+
},
|
28
|
+
"company": {
|
29
|
+
"$ref": "includes/company.json"
|
30
|
+
},
|
31
|
+
"data": {
|
32
|
+
"type": "array",
|
33
|
+
"description": "This is an array of data objects, that is, the objects which actually contain the data about the datum. Generally there will only be one element in this, although for some data_types there may naturally be several related to the same company, e.g. shareholders",
|
34
|
+
"items": {
|
35
|
+
"$ref": "includes/subsidiary-relationship-data.json"
|
36
|
+
},
|
37
|
+
"additionalItems": false
|
38
|
+
}
|
39
|
+
},
|
40
|
+
"additionalProperties": false,
|
41
|
+
"required": [
|
42
|
+
"company",
|
43
|
+
"data",
|
44
|
+
"sample_date"
|
45
|
+
]
|
46
|
+
}
|
@@ -0,0 +1,15 @@
|
|
1
|
+
{
|
2
|
+
"bot_id": "bot-that-crashes-immediately",
|
3
|
+
"description": "This is a bot that crashes immediately",
|
4
|
+
"language": "ruby",
|
5
|
+
"data_type": "primary data",
|
6
|
+
"identifying_fields": ["licence_number"],
|
7
|
+
"files": ["scraper.rb", "transformer1.rb"],
|
8
|
+
"transformers": [
|
9
|
+
{
|
10
|
+
"file": "transformer1.rb",
|
11
|
+
"data_type": "simple-licence",
|
12
|
+
"identifying_fields": ["licence_number"]
|
13
|
+
}
|
14
|
+
]
|
15
|
+
}
|
@@ -0,0 +1 @@
|
|
1
|
+
raise 'Oh no'
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
STDIN.each_line do |line|
|
4
|
+
raw_record = JSON.parse(line)
|
5
|
+
|
6
|
+
transformed_record = {
|
7
|
+
:company_name => 'Foo Widgets',
|
8
|
+
:company_jurisdiction => 'gb',
|
9
|
+
:licence_number => raw_record['licence_number'],
|
10
|
+
:source_url => raw_record['source_url'],
|
11
|
+
:sample_date => raw_record['sample_date'],
|
12
|
+
}
|
13
|
+
|
14
|
+
puts transformed_record.to_json
|
15
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
{
|
2
|
+
"bot_id": "bot-that-crashes-in-scraper",
|
3
|
+
"description": "This is a bot that crashes in the scraper",
|
4
|
+
"language": "ruby",
|
5
|
+
"data_type": "primary data",
|
6
|
+
"identifying_fields": ["licence_number"],
|
7
|
+
"files": ["scraper.rb", "transformer1.rb"],
|
8
|
+
"transformers": [
|
9
|
+
{
|
10
|
+
"file": "transformer1.rb",
|
11
|
+
"data_type": "simple-licence",
|
12
|
+
"identifying_fields": ["licence_number"]
|
13
|
+
}
|
14
|
+
]
|
15
|
+
}
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
STDIN.each_line do |line|
|
4
|
+
raw_record = JSON.parse(line)
|
5
|
+
|
6
|
+
transformed_record = {
|
7
|
+
:company_name => 'Foo Widgets',
|
8
|
+
:company_jurisdiction => 'gb',
|
9
|
+
:licence_number => raw_record['licence_number'],
|
10
|
+
:source_url => raw_record['source_url'],
|
11
|
+
:sample_date => raw_record['sample_date'],
|
12
|
+
}
|
13
|
+
|
14
|
+
puts transformed_record.to_json
|
15
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
{
|
2
|
+
"bot_id": "bot-that-crashes-in-transformer",
|
3
|
+
"description": "This is a bot that crashes in the transformer",
|
4
|
+
"language": "ruby",
|
5
|
+
"data_type": "primary data",
|
6
|
+
"identifying_fields": ["licence_number"],
|
7
|
+
"files": ["scraper.rb", "transformer1.rb", "transformer2.rb"],
|
8
|
+
"transformers": [
|
9
|
+
{
|
10
|
+
"file": "transformer1.rb",
|
11
|
+
"data_type": "simple-licence",
|
12
|
+
"identifying_fields": ["licence_number"]
|
13
|
+
},
|
14
|
+
{
|
15
|
+
"file": "transformer2.rb",
|
16
|
+
"data_type": "simple-licence",
|
17
|
+
"identifying_fields": ["licence_number"]
|
18
|
+
}
|
19
|
+
]
|
20
|
+
}
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
STDIN.each_line do |line|
|
4
|
+
raw_record = JSON.parse(line)
|
5
|
+
|
6
|
+
transformed_record = {
|
7
|
+
:company_name => 'Foo Widgets',
|
8
|
+
:company_jurisdiction => 'gb',
|
9
|
+
:licence_number => raw_record['licence_number'],
|
10
|
+
:source_url => raw_record['source_url'],
|
11
|
+
:sample_date => raw_record['sample_date'],
|
12
|
+
}
|
13
|
+
|
14
|
+
puts transformed_record.to_json
|
15
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
STDIN.each_line do |line|
|
4
|
+
raw_record = JSON.parse(line)
|
5
|
+
|
6
|
+
transformed_record = {
|
7
|
+
:company_name => 'Foo Widgets',
|
8
|
+
:company_jurisdiction => 'gb',
|
9
|
+
:licence_number => raw_record['licence_number'],
|
10
|
+
:source_url => raw_record['source_url'],
|
11
|
+
:sample_date => raw_record['sample_date'],
|
12
|
+
}
|
13
|
+
|
14
|
+
puts transformed_record.to_json
|
15
|
+
|
16
|
+
raise 'Oh no' if raw_record['licence_number'] == 'XYZ4'
|
17
|
+
end
|
@@ -0,0 +1,8 @@
|
|
1
|
+
{
|
2
|
+
"bot_id": "bot-that-expects-file",
|
3
|
+
"description": "This bot depends on being able to open a file in the current directory",
|
4
|
+
"language": "ruby",
|
5
|
+
"data_type": "primary data",
|
6
|
+
"identifying_fields": ["licence_number"],
|
7
|
+
"files": ["scraper.rb", "something.txt"]
|
8
|
+
}
|
@@ -0,0 +1 @@
|
|
1
|
+
ehlo
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
0.upto(9) do |n|
|
4
|
+
record = {
|
5
|
+
:licence_number => "XYZ#{n}",
|
6
|
+
:source_url => 'http://example.com',
|
7
|
+
:sample_date => '2014-06-01'
|
8
|
+
}
|
9
|
+
puts(record.to_json)
|
10
|
+
|
11
|
+
if n == 4
|
12
|
+
$stderr.puts 'The scraper will sleep for ten seconds...'
|
13
|
+
sleep 10
|
14
|
+
$stderr.puts 'The scraper is resuming...'
|
15
|
+
end
|
16
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
{
|
2
|
+
"bot_id": "bot-with-transformer",
|
3
|
+
"description": "This is a bot with a transformer",
|
4
|
+
"language": "ruby",
|
5
|
+
"data_type": "primary data",
|
6
|
+
"identifying_fields": ["licence_number"],
|
7
|
+
"files": ["scraper.rb", "transformer1.rb", "transformer2.rb"],
|
8
|
+
"transformers": [
|
9
|
+
{
|
10
|
+
"file": "transformer.rb",
|
11
|
+
"data_type": "simple-licence",
|
12
|
+
"identifying_fields": ["licence_number"]
|
13
|
+
}
|
14
|
+
]
|
15
|
+
}
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
STDIN.each_line do |line|
|
4
|
+
raw_record = JSON.parse(line)
|
5
|
+
|
6
|
+
transformed_record = {
|
7
|
+
:company_name => 'Foo Widgets',
|
8
|
+
:company_jurisdiction => 'gb',
|
9
|
+
:licence_number => raw_record['licence_number'],
|
10
|
+
:source_url => raw_record['source_url'],
|
11
|
+
:sample_date => raw_record['sample_date'],
|
12
|
+
}
|
13
|
+
|
14
|
+
puts transformed_record.to_json
|
15
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
{
|
2
|
+
"bot_id": "bot-with-transformers",
|
3
|
+
"description": "This is a bot with multiple transformers",
|
4
|
+
"language": "ruby",
|
5
|
+
"data_type": "primary data",
|
6
|
+
"identifying_fields": ["licence_number"],
|
7
|
+
"files": ["scraper.rb"],
|
8
|
+
"transformers": [
|
9
|
+
{
|
10
|
+
"file": "transformer1.rb",
|
11
|
+
"data_type": "simple-licence",
|
12
|
+
"identifying_fields": ["licence_number"]
|
13
|
+
},
|
14
|
+
{
|
15
|
+
"file": "transformer2.rb",
|
16
|
+
"data_type": "simple-licence",
|
17
|
+
"identifying_fields": ["licence_number"]
|
18
|
+
}
|
19
|
+
]
|
20
|
+
}
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
STDIN.each_line do |line|
|
4
|
+
raw_record = JSON.parse(line)
|
5
|
+
|
6
|
+
transformed_record = {
|
7
|
+
:company_name => 'Foo Widgets',
|
8
|
+
:company_jurisdiction => 'gb',
|
9
|
+
:licence_number => raw_record['licence_number'],
|
10
|
+
:source_url => raw_record['source_url'],
|
11
|
+
:sample_date => raw_record['sample_date'],
|
12
|
+
}
|
13
|
+
|
14
|
+
puts transformed_record.to_json
|
15
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
STDIN.each_line do |line|
|
4
|
+
raw_record = JSON.parse(line)
|
5
|
+
|
6
|
+
transformed_record = {
|
7
|
+
:company_name => 'Foo Widgets',
|
8
|
+
:company_jurisdiction => 'gb',
|
9
|
+
:licence_number => raw_record['licence_number'],
|
10
|
+
:source_url => raw_record['source_url'],
|
11
|
+
:sample_date => raw_record['sample_date'],
|
12
|
+
}
|
13
|
+
|
14
|
+
puts transformed_record.to_json
|
15
|
+
end
|