turbot-runner-morph 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (92) hide show
  1. checksums.yaml +15 -0
  2. data/bin/rspec +16 -0
  3. data/lib/turbot_runner.rb +28 -0
  4. data/lib/turbot_runner/base_handler.rb +15 -0
  5. data/lib/turbot_runner/exceptions.rb +4 -0
  6. data/lib/turbot_runner/prerun.rb +3 -0
  7. data/lib/turbot_runner/processor.rb +53 -0
  8. data/lib/turbot_runner/runner.rb +179 -0
  9. data/lib/turbot_runner/script_runner.rb +98 -0
  10. data/lib/turbot_runner/utils.rb +47 -0
  11. data/lib/turbot_runner/validator.rb +28 -0
  12. data/lib/turbot_runner/version.rb +3 -0
  13. data/schema/schemas/company-schema.json +243 -0
  14. data/schema/schemas/financial-payment-schema.json +32 -0
  15. data/schema/schemas/includes/address.json +53 -0
  16. data/schema/schemas/includes/alternative_name.json +36 -0
  17. data/schema/schemas/includes/company-for-nesting.json +245 -0
  18. data/schema/schemas/includes/company.json +25 -0
  19. data/schema/schemas/includes/entity.json +58 -0
  20. data/schema/schemas/includes/filing.json +52 -0
  21. data/schema/schemas/includes/financial-payment-data-object.json +112 -0
  22. data/schema/schemas/includes/identifier.json +20 -0
  23. data/schema/schemas/includes/industry_code.json +29 -0
  24. data/schema/schemas/includes/licence-data-object.json +63 -0
  25. data/schema/schemas/includes/officer.json +70 -0
  26. data/schema/schemas/includes/organisation.json +58 -0
  27. data/schema/schemas/includes/permission.json +46 -0
  28. data/schema/schemas/includes/person.json +62 -0
  29. data/schema/schemas/includes/person_name.json +71 -0
  30. data/schema/schemas/includes/previous_name.json +24 -0
  31. data/schema/schemas/includes/share-parcel-data.json +82 -0
  32. data/schema/schemas/includes/share-parcel.json +78 -0
  33. data/schema/schemas/includes/subsidiary-relationship-data.json +58 -0
  34. data/schema/schemas/includes/total-shares.json +17 -0
  35. data/schema/schemas/includes/unknown_entity_type.json +58 -0
  36. data/schema/schemas/licence-schema.json +105 -0
  37. data/schema/schemas/primary-data-schema.json +20 -0
  38. data/schema/schemas/share-parcel-schema.json +22 -0
  39. data/schema/schemas/simple-financial-payment-schema.json +122 -0
  40. data/schema/schemas/simple-licence-schema.json +82 -0
  41. data/schema/schemas/simple-subsidiary-schema.json +85 -0
  42. data/schema/schemas/subsidiary-relationship-schema.json +46 -0
  43. data/spec/bots/bot-that-crashes-immediately/manifest.json +15 -0
  44. data/spec/bots/bot-that-crashes-immediately/scraper.rb +1 -0
  45. data/spec/bots/bot-that-crashes-immediately/transformer1.rb +15 -0
  46. data/spec/bots/bot-that-crashes-in-scraper/manifest.json +15 -0
  47. data/spec/bots/bot-that-crashes-in-scraper/scraper.rb +11 -0
  48. data/spec/bots/bot-that-crashes-in-scraper/transformer1.rb +15 -0
  49. data/spec/bots/bot-that-crashes-in-transformer/manifest.json +20 -0
  50. data/spec/bots/bot-that-crashes-in-transformer/scraper.rb +10 -0
  51. data/spec/bots/bot-that-crashes-in-transformer/transformer1.rb +15 -0
  52. data/spec/bots/bot-that-crashes-in-transformer/transformer2.rb +17 -0
  53. data/spec/bots/bot-that-emits-run-ended/manifest.json +8 -0
  54. data/spec/bots/bot-that-emits-run-ended/scraper.rb +11 -0
  55. data/spec/bots/bot-that-expects-file/manifest.json +8 -0
  56. data/spec/bots/bot-that-expects-file/scraper.rb +11 -0
  57. data/spec/bots/bot-that-expects-file/something.txt +1 -0
  58. data/spec/bots/bot-with-invalid-data-type/manifest.json +8 -0
  59. data/spec/bots/bot-with-invalid-data-type/scraper.rb +10 -0
  60. data/spec/bots/bot-with-invalid-sample-date/manifest.json +8 -0
  61. data/spec/bots/bot-with-invalid-sample-date/scraper.rb +10 -0
  62. data/spec/bots/bot-with-pause/manifest.json +8 -0
  63. data/spec/bots/bot-with-pause/scraper.rb +16 -0
  64. data/spec/bots/bot-with-transformer/manifest.json +15 -0
  65. data/spec/bots/bot-with-transformer/scraper.rb +10 -0
  66. data/spec/bots/bot-with-transformer/transformer.rb +15 -0
  67. data/spec/bots/bot-with-transformers/manifest.json +20 -0
  68. data/spec/bots/bot-with-transformers/scraper.rb +10 -0
  69. data/spec/bots/bot-with-transformers/transformer1.rb +15 -0
  70. data/spec/bots/bot-with-transformers/transformer2.rb +15 -0
  71. data/spec/bots/invalid-json-bot/manifest.json +8 -0
  72. data/spec/bots/invalid-json-bot/scraper.rb +11 -0
  73. data/spec/bots/invalid-record-bot/manifest.json +8 -0
  74. data/spec/bots/invalid-record-bot/scraper.rb +11 -0
  75. data/spec/bots/logging-bot/manifest.json +8 -0
  76. data/spec/bots/logging-bot/scraper.rb +14 -0
  77. data/spec/bots/python-bot/manifest.json +8 -0
  78. data/spec/bots/python-bot/scraper.py +11 -0
  79. data/spec/bots/ruby-bot/manifest.json +8 -0
  80. data/spec/bots/ruby-bot/scraper.rb +10 -0
  81. data/spec/bots/slow-bot/manifest.json +8 -0
  82. data/spec/bots/slow-bot/scraper.rb +11 -0
  83. data/spec/lib/processor_spec.rb +181 -0
  84. data/spec/lib/runner_spec.rb +330 -0
  85. data/spec/lib/utils_spec.rb +23 -0
  86. data/spec/lib/validator_spec.rb +89 -0
  87. data/spec/manual_spec.rb +57 -0
  88. data/spec/outputs/full-scraper.out +10 -0
  89. data/spec/outputs/full-transformer.out +10 -0
  90. data/spec/outputs/truncated-scraper.out +5 -0
  91. data/spec/spec_helper.rb +20 -0
  92. metadata +148 -0
@@ -0,0 +1,46 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-04/schema#",
3
+ "description": "A relationship of control between two companies",
4
+ "type": "object",
5
+ "properties": {
6
+ "sample_date": {
7
+ "type": "string",
8
+ "format": "date"
9
+ },
10
+ "start_date": {
11
+ "type": "string",
12
+ "format": "date"
13
+ },
14
+ "start_date_type": {
15
+ "type": "string"
16
+ },
17
+ "end_date": {
18
+ "type": "string",
19
+ "format": "date"
20
+ },
21
+ "end_date_type": {
22
+ "type": "string"
23
+ },
24
+ "source_jurisdiction": {
25
+ "description": "Jurisdiction of the source of the data",
26
+ "type": "string"
27
+ },
28
+ "company": {
29
+ "$ref": "includes/company.json"
30
+ },
31
+ "data": {
32
+ "type": "array",
33
+ "description": "This is an array of data objects, that is, the objects which actually contain the data about the datum. Generally there will only be one element in this, although for some data_types there may naturally be several related to the same company, e.g. shareholders",
34
+ "items": {
35
+ "$ref": "includes/subsidiary-relationship-data.json"
36
+ },
37
+ "additionalItems": false
38
+ }
39
+ },
40
+ "additionalProperties": false,
41
+ "required": [
42
+ "company",
43
+ "data",
44
+ "sample_date"
45
+ ]
46
+ }
@@ -0,0 +1,15 @@
1
+ {
2
+ "bot_id": "bot-that-crashes-immediately",
3
+ "description": "This is a bot that crashes immediately",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb", "transformer1.rb"],
8
+ "transformers": [
9
+ {
10
+ "file": "transformer1.rb",
11
+ "data_type": "simple-licence",
12
+ "identifying_fields": ["licence_number"]
13
+ }
14
+ ]
15
+ }
@@ -0,0 +1,15 @@
1
+ require 'json'
2
+
3
+ STDIN.each_line do |line|
4
+ raw_record = JSON.parse(line)
5
+
6
+ transformed_record = {
7
+ :company_name => 'Foo Widgets',
8
+ :company_jurisdiction => 'gb',
9
+ :licence_number => raw_record['licence_number'],
10
+ :source_url => raw_record['source_url'],
11
+ :sample_date => raw_record['sample_date'],
12
+ }
13
+
14
+ puts transformed_record.to_json
15
+ end
@@ -0,0 +1,15 @@
1
+ {
2
+ "bot_id": "bot-that-crashes-in-scraper",
3
+ "description": "This is a bot that crashes in the scraper",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb", "transformer1.rb"],
8
+ "transformers": [
9
+ {
10
+ "file": "transformer1.rb",
11
+ "data_type": "simple-licence",
12
+ "identifying_fields": ["licence_number"]
13
+ }
14
+ ]
15
+ }
@@ -0,0 +1,11 @@
1
+ require 'json'
2
+
3
+ 0.upto(9) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '2014-06-01'
8
+ }
9
+ puts(record.to_json)
10
+ raise 'Oh no' if n == 4
11
+ end
@@ -0,0 +1,15 @@
1
+ require 'json'
2
+
3
+ STDIN.each_line do |line|
4
+ raw_record = JSON.parse(line)
5
+
6
+ transformed_record = {
7
+ :company_name => 'Foo Widgets',
8
+ :company_jurisdiction => 'gb',
9
+ :licence_number => raw_record['licence_number'],
10
+ :source_url => raw_record['source_url'],
11
+ :sample_date => raw_record['sample_date'],
12
+ }
13
+
14
+ puts transformed_record.to_json
15
+ end
@@ -0,0 +1,20 @@
1
+ {
2
+ "bot_id": "bot-that-crashes-in-transformer",
3
+ "description": "This is a bot that crashes in the transformer",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb", "transformer1.rb", "transformer2.rb"],
8
+ "transformers": [
9
+ {
10
+ "file": "transformer1.rb",
11
+ "data_type": "simple-licence",
12
+ "identifying_fields": ["licence_number"]
13
+ },
14
+ {
15
+ "file": "transformer2.rb",
16
+ "data_type": "simple-licence",
17
+ "identifying_fields": ["licence_number"]
18
+ }
19
+ ]
20
+ }
@@ -0,0 +1,10 @@
1
+ require 'json'
2
+
3
+ 0.upto(9) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '2014-06-01'
8
+ }
9
+ puts(record.to_json)
10
+ end
@@ -0,0 +1,15 @@
1
+ require 'json'
2
+
3
+ STDIN.each_line do |line|
4
+ raw_record = JSON.parse(line)
5
+
6
+ transformed_record = {
7
+ :company_name => 'Foo Widgets',
8
+ :company_jurisdiction => 'gb',
9
+ :licence_number => raw_record['licence_number'],
10
+ :source_url => raw_record['source_url'],
11
+ :sample_date => raw_record['sample_date'],
12
+ }
13
+
14
+ puts transformed_record.to_json
15
+ end
@@ -0,0 +1,17 @@
1
+ require 'json'
2
+
3
+ STDIN.each_line do |line|
4
+ raw_record = JSON.parse(line)
5
+
6
+ transformed_record = {
7
+ :company_name => 'Foo Widgets',
8
+ :company_jurisdiction => 'gb',
9
+ :licence_number => raw_record['licence_number'],
10
+ :source_url => raw_record['source_url'],
11
+ :sample_date => raw_record['sample_date'],
12
+ }
13
+
14
+ puts transformed_record.to_json
15
+
16
+ raise 'Oh no' if raw_record['licence_number'] == 'XYZ4'
17
+ end
@@ -0,0 +1,8 @@
1
+ {
2
+ "bot_id": "bot-that-emits-run-ended",
3
+ "description": "This is a bot that emits RUN ENDED",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb"]
8
+ }
@@ -0,0 +1,11 @@
1
+ require 'json'
2
+
3
+ 0.upto(2) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '2014-06-01'
8
+ }
9
+ puts(record.to_json)
10
+ end
11
+ puts "RUN ENDED"
@@ -0,0 +1,8 @@
1
+ {
2
+ "bot_id": "bot-that-expects-file",
3
+ "description": "This bot depends on being able to open a file in the current directory",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb", "something.txt"]
8
+ }
@@ -0,0 +1,11 @@
1
+ require 'json'
2
+
3
+ asd = open("something.txt", "r")
4
+ 0.upto(2) do |n|
5
+ record = {
6
+ :licence_number => "XYZ#{n}",
7
+ :source_url => 'http://example.com',
8
+ :sample_date => '2014-06-01'
9
+ }
10
+ puts(record.to_json)
11
+ end
@@ -0,0 +1,8 @@
1
+ {
2
+ "bot_id": "bot-with-invalid-data-type",
3
+ "description": "This bot has an invalid data type",
4
+ "language": "ruby",
5
+ "data_type": "fishmonger licence",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb"]
8
+ }
@@ -0,0 +1,10 @@
1
+ require 'json'
2
+
3
+ 0.upto(9) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '2014-06-01'
8
+ }
9
+ puts(record.to_json)
10
+ end
@@ -0,0 +1,8 @@
1
+ {
2
+ "bot_id": "bot-with-invalid-sample-date",
3
+ "description": "This bot produces data with an invalid sample date",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb"]
8
+ }
@@ -0,0 +1,10 @@
1
+ require 'json'
2
+
3
+ 0.upto(9) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '01/06/2014'
8
+ }
9
+ puts(record.to_json)
10
+ end
@@ -0,0 +1,8 @@
1
+ {
2
+ "bot_id": "bot-with-pause",
3
+ "description": "This bot pauses for ten seconds after outputting five lines",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb"]
8
+ }
@@ -0,0 +1,16 @@
1
+ require 'json'
2
+
3
+ 0.upto(9) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '2014-06-01'
8
+ }
9
+ puts(record.to_json)
10
+
11
+ if n == 4
12
+ $stderr.puts 'The scraper will sleep for ten seconds...'
13
+ sleep 10
14
+ $stderr.puts 'The scraper is resuming...'
15
+ end
16
+ end
@@ -0,0 +1,15 @@
1
+ {
2
+ "bot_id": "bot-with-transformer",
3
+ "description": "This is a bot with a transformer",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb", "transformer.rb"],
8
+ "transformers": [
9
+ {
10
+ "file": "transformer.rb",
11
+ "data_type": "simple-licence",
12
+ "identifying_fields": ["licence_number"]
13
+ }
14
+ ]
15
+ }
@@ -0,0 +1,10 @@
1
+ require 'json'
2
+
3
+ 0.upto(9) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '2014-06-01'
8
+ }
9
+ puts(record.to_json)
10
+ end
@@ -0,0 +1,15 @@
1
+ require 'json'
2
+
3
+ STDIN.each_line do |line|
4
+ raw_record = JSON.parse(line)
5
+
6
+ transformed_record = {
7
+ :company_name => 'Foo Widgets',
8
+ :company_jurisdiction => 'gb',
9
+ :licence_number => raw_record['licence_number'],
10
+ :source_url => raw_record['source_url'],
11
+ :sample_date => raw_record['sample_date'],
12
+ }
13
+
14
+ puts transformed_record.to_json
15
+ end
@@ -0,0 +1,20 @@
1
+ {
2
+ "bot_id": "bot-with-transformers",
3
+ "description": "This is a bot with multiple transformers",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb", "transformer1.rb", "transformer2.rb"],
8
+ "transformers": [
9
+ {
10
+ "file": "transformer1.rb",
11
+ "data_type": "simple-licence",
12
+ "identifying_fields": ["licence_number"]
13
+ },
14
+ {
15
+ "file": "transformer2.rb",
16
+ "data_type": "simple-licence",
17
+ "identifying_fields": ["licence_number"]
18
+ }
19
+ ]
20
+ }
@@ -0,0 +1,10 @@
1
+ require 'json'
2
+
3
+ 0.upto(9) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '2014-06-01'
8
+ }
9
+ puts(record.to_json)
10
+ end
@@ -0,0 +1,15 @@
1
+ require 'json'
2
+
3
+ STDIN.each_line do |line|
4
+ raw_record = JSON.parse(line)
5
+
6
+ transformed_record = {
7
+ :company_name => 'Foo Widgets',
8
+ :company_jurisdiction => 'gb',
9
+ :licence_number => raw_record['licence_number'],
10
+ :source_url => raw_record['source_url'],
11
+ :sample_date => raw_record['sample_date'],
12
+ }
13
+
14
+ puts transformed_record.to_json
15
+ end
@@ -0,0 +1,15 @@
1
+ require 'json'
2
+
3
+ STDIN.each_line do |line|
4
+ raw_record = JSON.parse(line)
5
+
6
+ transformed_record = {
7
+ :company_name => 'Foo Widgets',
8
+ :company_jurisdiction => 'gb',
9
+ :licence_number => raw_record['licence_number'],
10
+ :source_url => raw_record['source_url'],
11
+ :sample_date => raw_record['sample_date'],
12
+ }
13
+
14
+ puts transformed_record.to_json
15
+ end
@@ -0,0 +1,8 @@
1
+ {
2
+ "bot_id": "invalid-json-bot",
3
+ "description": "This is a bot that produces invalid JSON",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb"]
8
+ }
@@ -0,0 +1,11 @@
1
+ require 'json'
2
+
3
+ 0.upto(9) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '2014-06-01'
8
+ }
9
+ record = 'This is not JSON' if n == 5
10
+ puts(record.to_json)
11
+ end