turbot-runner-morph 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. checksums.yaml +15 -0
  2. data/bin/rspec +16 -0
  3. data/lib/turbot_runner.rb +28 -0
  4. data/lib/turbot_runner/base_handler.rb +15 -0
  5. data/lib/turbot_runner/exceptions.rb +4 -0
  6. data/lib/turbot_runner/prerun.rb +3 -0
  7. data/lib/turbot_runner/processor.rb +53 -0
  8. data/lib/turbot_runner/runner.rb +179 -0
  9. data/lib/turbot_runner/script_runner.rb +98 -0
  10. data/lib/turbot_runner/utils.rb +47 -0
  11. data/lib/turbot_runner/validator.rb +28 -0
  12. data/lib/turbot_runner/version.rb +3 -0
  13. data/schema/schemas/company-schema.json +243 -0
  14. data/schema/schemas/financial-payment-schema.json +32 -0
  15. data/schema/schemas/includes/address.json +53 -0
  16. data/schema/schemas/includes/alternative_name.json +36 -0
  17. data/schema/schemas/includes/company-for-nesting.json +245 -0
  18. data/schema/schemas/includes/company.json +25 -0
  19. data/schema/schemas/includes/entity.json +58 -0
  20. data/schema/schemas/includes/filing.json +52 -0
  21. data/schema/schemas/includes/financial-payment-data-object.json +112 -0
  22. data/schema/schemas/includes/identifier.json +20 -0
  23. data/schema/schemas/includes/industry_code.json +29 -0
  24. data/schema/schemas/includes/licence-data-object.json +63 -0
  25. data/schema/schemas/includes/officer.json +70 -0
  26. data/schema/schemas/includes/organisation.json +58 -0
  27. data/schema/schemas/includes/permission.json +46 -0
  28. data/schema/schemas/includes/person.json +62 -0
  29. data/schema/schemas/includes/person_name.json +71 -0
  30. data/schema/schemas/includes/previous_name.json +24 -0
  31. data/schema/schemas/includes/share-parcel-data.json +82 -0
  32. data/schema/schemas/includes/share-parcel.json +78 -0
  33. data/schema/schemas/includes/subsidiary-relationship-data.json +58 -0
  34. data/schema/schemas/includes/total-shares.json +17 -0
  35. data/schema/schemas/includes/unknown_entity_type.json +58 -0
  36. data/schema/schemas/licence-schema.json +105 -0
  37. data/schema/schemas/primary-data-schema.json +20 -0
  38. data/schema/schemas/share-parcel-schema.json +22 -0
  39. data/schema/schemas/simple-financial-payment-schema.json +122 -0
  40. data/schema/schemas/simple-licence-schema.json +82 -0
  41. data/schema/schemas/simple-subsidiary-schema.json +85 -0
  42. data/schema/schemas/subsidiary-relationship-schema.json +46 -0
  43. data/spec/bots/bot-that-crashes-immediately/manifest.json +15 -0
  44. data/spec/bots/bot-that-crashes-immediately/scraper.rb +1 -0
  45. data/spec/bots/bot-that-crashes-immediately/transformer1.rb +15 -0
  46. data/spec/bots/bot-that-crashes-in-scraper/manifest.json +15 -0
  47. data/spec/bots/bot-that-crashes-in-scraper/scraper.rb +11 -0
  48. data/spec/bots/bot-that-crashes-in-scraper/transformer1.rb +15 -0
  49. data/spec/bots/bot-that-crashes-in-transformer/manifest.json +20 -0
  50. data/spec/bots/bot-that-crashes-in-transformer/scraper.rb +10 -0
  51. data/spec/bots/bot-that-crashes-in-transformer/transformer1.rb +15 -0
  52. data/spec/bots/bot-that-crashes-in-transformer/transformer2.rb +17 -0
  53. data/spec/bots/bot-that-emits-run-ended/manifest.json +8 -0
  54. data/spec/bots/bot-that-emits-run-ended/scraper.rb +11 -0
  55. data/spec/bots/bot-that-expects-file/manifest.json +8 -0
  56. data/spec/bots/bot-that-expects-file/scraper.rb +11 -0
  57. data/spec/bots/bot-that-expects-file/something.txt +1 -0
  58. data/spec/bots/bot-with-invalid-data-type/manifest.json +8 -0
  59. data/spec/bots/bot-with-invalid-data-type/scraper.rb +10 -0
  60. data/spec/bots/bot-with-invalid-sample-date/manifest.json +8 -0
  61. data/spec/bots/bot-with-invalid-sample-date/scraper.rb +10 -0
  62. data/spec/bots/bot-with-pause/manifest.json +8 -0
  63. data/spec/bots/bot-with-pause/scraper.rb +16 -0
  64. data/spec/bots/bot-with-transformer/manifest.json +15 -0
  65. data/spec/bots/bot-with-transformer/scraper.rb +10 -0
  66. data/spec/bots/bot-with-transformer/transformer.rb +15 -0
  67. data/spec/bots/bot-with-transformers/manifest.json +20 -0
  68. data/spec/bots/bot-with-transformers/scraper.rb +10 -0
  69. data/spec/bots/bot-with-transformers/transformer1.rb +15 -0
  70. data/spec/bots/bot-with-transformers/transformer2.rb +15 -0
  71. data/spec/bots/invalid-json-bot/manifest.json +8 -0
  72. data/spec/bots/invalid-json-bot/scraper.rb +11 -0
  73. data/spec/bots/invalid-record-bot/manifest.json +8 -0
  74. data/spec/bots/invalid-record-bot/scraper.rb +11 -0
  75. data/spec/bots/logging-bot/manifest.json +8 -0
  76. data/spec/bots/logging-bot/scraper.rb +14 -0
  77. data/spec/bots/python-bot/manifest.json +8 -0
  78. data/spec/bots/python-bot/scraper.py +11 -0
  79. data/spec/bots/ruby-bot/manifest.json +8 -0
  80. data/spec/bots/ruby-bot/scraper.rb +10 -0
  81. data/spec/bots/slow-bot/manifest.json +8 -0
  82. data/spec/bots/slow-bot/scraper.rb +11 -0
  83. data/spec/lib/processor_spec.rb +181 -0
  84. data/spec/lib/runner_spec.rb +330 -0
  85. data/spec/lib/utils_spec.rb +23 -0
  86. data/spec/lib/validator_spec.rb +89 -0
  87. data/spec/manual_spec.rb +57 -0
  88. data/spec/outputs/full-scraper.out +10 -0
  89. data/spec/outputs/full-transformer.out +10 -0
  90. data/spec/outputs/truncated-scraper.out +5 -0
  91. data/spec/spec_helper.rb +20 -0
  92. metadata +148 -0
@@ -0,0 +1,46 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-04/schema#",
3
+ "description": "A relationship of control between two companies",
4
+ "type": "object",
5
+ "properties": {
6
+ "sample_date": {
7
+ "type": "string",
8
+ "format": "date"
9
+ },
10
+ "start_date": {
11
+ "type": "string",
12
+ "format": "date"
13
+ },
14
+ "start_date_type": {
15
+ "type": "string"
16
+ },
17
+ "end_date": {
18
+ "type": "string",
19
+ "format": "date"
20
+ },
21
+ "end_date_type": {
22
+ "type": "string"
23
+ },
24
+ "source_jurisdiction": {
25
+ "description": "Jurisdiction of the source of the data",
26
+ "type": "string"
27
+ },
28
+ "company": {
29
+ "$ref": "includes/company.json"
30
+ },
31
+ "data": {
32
+ "type": "array",
33
+ "description": "This is an array of data objects, that is, the objects which actually contain the data about the datum. Generally there will only be one element in this array, although for some data_types there may naturally be several related to the same company, e.g. shareholders",
34
+ "items": {
35
+ "$ref": "includes/subsidiary-relationship-data.json"
36
+ },
37
+ "additionalItems": false
38
+ }
39
+ },
40
+ "additionalProperties": false,
41
+ "required": [
42
+ "company",
43
+ "data",
44
+ "sample_date"
45
+ ]
46
+ }
@@ -0,0 +1,15 @@
1
+ {
2
+ "bot_id": "bot-that-crashes-immediately",
3
+ "description": "This is a bot that crashes immediately",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb", "transformer1.rb"],
8
+ "transformers": [
9
+ {
10
+ "file": "transformer1.rb",
11
+ "data_type": "simple-licence",
12
+ "identifying_fields": ["licence_number"]
13
+ }
14
+ ]
15
+ }
@@ -0,0 +1,15 @@
1
+ require 'json'
2
+
3
+ STDIN.each_line do |line|
4
+ raw_record = JSON.parse(line)
5
+
6
+ transformed_record = {
7
+ :company_name => 'Foo Widgets',
8
+ :company_jurisdiction => 'gb',
9
+ :licence_number => raw_record['licence_number'],
10
+ :source_url => raw_record['source_url'],
11
+ :sample_date => raw_record['sample_date'],
12
+ }
13
+
14
+ puts transformed_record.to_json
15
+ end
@@ -0,0 +1,15 @@
1
+ {
2
+ "bot_id": "bot-that-crashes-in-scraper",
3
+ "description": "This is a bot that crashes in the scraper",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb", "transformer1.rb"],
8
+ "transformers": [
9
+ {
10
+ "file": "transformer1.rb",
11
+ "data_type": "simple-licence",
12
+ "identifying_fields": ["licence_number"]
13
+ }
14
+ ]
15
+ }
@@ -0,0 +1,11 @@
1
+ require 'json'
2
+
3
+ 0.upto(9) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '2014-06-01'
8
+ }
9
+ puts(record.to_json)
10
+ raise 'Oh no' if n == 4
11
+ end
@@ -0,0 +1,15 @@
1
+ require 'json'
2
+
3
+ STDIN.each_line do |line|
4
+ raw_record = JSON.parse(line)
5
+
6
+ transformed_record = {
7
+ :company_name => 'Foo Widgets',
8
+ :company_jurisdiction => 'gb',
9
+ :licence_number => raw_record['licence_number'],
10
+ :source_url => raw_record['source_url'],
11
+ :sample_date => raw_record['sample_date'],
12
+ }
13
+
14
+ puts transformed_record.to_json
15
+ end
@@ -0,0 +1,20 @@
1
+ {
2
+ "bot_id": "bot-that-crashes-in-transformer",
3
+ "description": "This is a bot that crashes in the transformer",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb", "transformer1.rb", "transformer2.rb"],
8
+ "transformers": [
9
+ {
10
+ "file": "transformer1.rb",
11
+ "data_type": "simple-licence",
12
+ "identifying_fields": ["licence_number"]
13
+ },
14
+ {
15
+ "file": "transformer2.rb",
16
+ "data_type": "simple-licence",
17
+ "identifying_fields": ["licence_number"]
18
+ }
19
+ ]
20
+ }
@@ -0,0 +1,10 @@
1
+ require 'json'
2
+
3
+ 0.upto(9) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '2014-06-01'
8
+ }
9
+ puts(record.to_json)
10
+ end
@@ -0,0 +1,15 @@
1
+ require 'json'
2
+
3
+ STDIN.each_line do |line|
4
+ raw_record = JSON.parse(line)
5
+
6
+ transformed_record = {
7
+ :company_name => 'Foo Widgets',
8
+ :company_jurisdiction => 'gb',
9
+ :licence_number => raw_record['licence_number'],
10
+ :source_url => raw_record['source_url'],
11
+ :sample_date => raw_record['sample_date'],
12
+ }
13
+
14
+ puts transformed_record.to_json
15
+ end
@@ -0,0 +1,17 @@
1
+ require 'json'
2
+
3
+ STDIN.each_line do |line|
4
+ raw_record = JSON.parse(line)
5
+
6
+ transformed_record = {
7
+ :company_name => 'Foo Widgets',
8
+ :company_jurisdiction => 'gb',
9
+ :licence_number => raw_record['licence_number'],
10
+ :source_url => raw_record['source_url'],
11
+ :sample_date => raw_record['sample_date'],
12
+ }
13
+
14
+ puts transformed_record.to_json
15
+
16
+ raise 'Oh no' if raw_record['licence_number'] == 'XYZ4'
17
+ end
@@ -0,0 +1,8 @@
1
+ {
2
+ "bot_id": "bot-that-emits-run-ended",
3
+ "description": "This is a bot that emits RUN ENDED",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb"]
8
+ }
@@ -0,0 +1,11 @@
1
+ require 'json'
2
+
3
+ 0.upto(2) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '2014-06-01'
8
+ }
9
+ puts(record.to_json)
10
+ end
11
+ puts "RUN ENDED"
@@ -0,0 +1,8 @@
1
+ {
2
+ "bot_id": "bot-that-expects-file",
3
+ "description": "This bot depends on being able to open a file in the current directory",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb", "something.txt"]
8
+ }
@@ -0,0 +1,11 @@
1
+ require 'json'
2
+
3
+ asd = open("something.txt", "r")
4
+ 0.upto(2) do |n|
5
+ record = {
6
+ :licence_number => "XYZ#{n}",
7
+ :source_url => 'http://example.com',
8
+ :sample_date => '2014-06-01'
9
+ }
10
+ puts(record.to_json)
11
+ end
@@ -0,0 +1,8 @@
1
+ {
2
+ "bot_id": "bot-with-invalid-data-type",
3
+ "description": "This bot has an invalid data type",
4
+ "language": "ruby",
5
+ "data_type": "fishmonger licence",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb"]
8
+ }
@@ -0,0 +1,10 @@
1
+ require 'json'
2
+
3
+ 0.upto(9) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '2014-06-01'
8
+ }
9
+ puts(record.to_json)
10
+ end
@@ -0,0 +1,8 @@
1
+ {
2
+ "bot_id": "bot-with-invalid-sample-date",
3
+ "description": "This bot produces data with an invalid sample date",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb"]
8
+ }
@@ -0,0 +1,10 @@
1
+ require 'json'
2
+
3
+ 0.upto(9) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '01/06/2014'
8
+ }
9
+ puts(record.to_json)
10
+ end
@@ -0,0 +1,8 @@
1
+ {
2
+ "bot_id": "bot-with-pause",
3
+ "description": "This bot pauses for ten seconds halfway through, after outputting five lines",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb"]
8
+ }
@@ -0,0 +1,16 @@
1
+ require 'json'
2
+
3
+ 0.upto(9) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '2014-06-01'
8
+ }
9
+ puts(record.to_json)
10
+
11
+ if n == 4
12
+ $stderr.puts 'The scraper will sleep for ten seconds...'
13
+ sleep 10
14
+ $stderr.puts 'The scraper is resuming...'
15
+ end
16
+ end
@@ -0,0 +1,15 @@
1
+ {
2
+ "bot_id": "bot-with-transformer",
3
+ "description": "This is a bot with a transformer",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb", "transformer1.rb", "transformer2.rb"],
8
+ "transformers": [
9
+ {
10
+ "file": "transformer.rb",
11
+ "data_type": "simple-licence",
12
+ "identifying_fields": ["licence_number"]
13
+ }
14
+ ]
15
+ }
@@ -0,0 +1,10 @@
1
+ require 'json'
2
+
3
+ 0.upto(9) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '2014-06-01'
8
+ }
9
+ puts(record.to_json)
10
+ end
@@ -0,0 +1,15 @@
1
+ require 'json'
2
+
3
+ STDIN.each_line do |line|
4
+ raw_record = JSON.parse(line)
5
+
6
+ transformed_record = {
7
+ :company_name => 'Foo Widgets',
8
+ :company_jurisdiction => 'gb',
9
+ :licence_number => raw_record['licence_number'],
10
+ :source_url => raw_record['source_url'],
11
+ :sample_date => raw_record['sample_date'],
12
+ }
13
+
14
+ puts transformed_record.to_json
15
+ end
@@ -0,0 +1,20 @@
1
+ {
2
+ "bot_id": "bot-with-transformers",
3
+ "description": "This is a bot with multiple transformers",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb"],
8
+ "transformers": [
9
+ {
10
+ "file": "transformer1.rb",
11
+ "data_type": "simple-licence",
12
+ "identifying_fields": ["licence_number"]
13
+ },
14
+ {
15
+ "file": "transformer2.rb",
16
+ "data_type": "simple-licence",
17
+ "identifying_fields": ["licence_number"]
18
+ }
19
+ ]
20
+ }
@@ -0,0 +1,10 @@
1
+ require 'json'
2
+
3
+ 0.upto(9) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '2014-06-01'
8
+ }
9
+ puts(record.to_json)
10
+ end
@@ -0,0 +1,15 @@
1
+ require 'json'
2
+
3
+ STDIN.each_line do |line|
4
+ raw_record = JSON.parse(line)
5
+
6
+ transformed_record = {
7
+ :company_name => 'Foo Widgets',
8
+ :company_jurisdiction => 'gb',
9
+ :licence_number => raw_record['licence_number'],
10
+ :source_url => raw_record['source_url'],
11
+ :sample_date => raw_record['sample_date'],
12
+ }
13
+
14
+ puts transformed_record.to_json
15
+ end
@@ -0,0 +1,15 @@
1
+ require 'json'
2
+
3
+ STDIN.each_line do |line|
4
+ raw_record = JSON.parse(line)
5
+
6
+ transformed_record = {
7
+ :company_name => 'Foo Widgets',
8
+ :company_jurisdiction => 'gb',
9
+ :licence_number => raw_record['licence_number'],
10
+ :source_url => raw_record['source_url'],
11
+ :sample_date => raw_record['sample_date'],
12
+ }
13
+
14
+ puts transformed_record.to_json
15
+ end
@@ -0,0 +1,8 @@
1
+ {
2
+ "bot_id": "invalid-json-bot",
3
+ "description": "This is a bot that produces invalid JSON",
4
+ "language": "ruby",
5
+ "data_type": "primary data",
6
+ "identifying_fields": ["licence_number"],
7
+ "files": ["scraper.rb"]
8
+ }
@@ -0,0 +1,11 @@
1
+ require 'json'
2
+
3
+ 0.upto(9) do |n|
4
+ record = {
5
+ :licence_number => "XYZ#{n}",
6
+ :source_url => 'http://example.com',
7
+ :sample_date => '2014-06-01'
8
+ }
9
+ record = 'This is not JSON' if n == 5
10
+ puts(record.to_json)
11
+ end