turbot-runner-morph 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. checksums.yaml +15 -0
  2. data/bin/rspec +16 -0
  3. data/lib/turbot_runner.rb +28 -0
  4. data/lib/turbot_runner/base_handler.rb +15 -0
  5. data/lib/turbot_runner/exceptions.rb +4 -0
  6. data/lib/turbot_runner/prerun.rb +3 -0
  7. data/lib/turbot_runner/processor.rb +53 -0
  8. data/lib/turbot_runner/runner.rb +179 -0
  9. data/lib/turbot_runner/script_runner.rb +98 -0
  10. data/lib/turbot_runner/utils.rb +47 -0
  11. data/lib/turbot_runner/validator.rb +28 -0
  12. data/lib/turbot_runner/version.rb +3 -0
  13. data/schema/schemas/company-schema.json +243 -0
  14. data/schema/schemas/financial-payment-schema.json +32 -0
  15. data/schema/schemas/includes/address.json +53 -0
  16. data/schema/schemas/includes/alternative_name.json +36 -0
  17. data/schema/schemas/includes/company-for-nesting.json +245 -0
  18. data/schema/schemas/includes/company.json +25 -0
  19. data/schema/schemas/includes/entity.json +58 -0
  20. data/schema/schemas/includes/filing.json +52 -0
  21. data/schema/schemas/includes/financial-payment-data-object.json +112 -0
  22. data/schema/schemas/includes/identifier.json +20 -0
  23. data/schema/schemas/includes/industry_code.json +29 -0
  24. data/schema/schemas/includes/licence-data-object.json +63 -0
  25. data/schema/schemas/includes/officer.json +70 -0
  26. data/schema/schemas/includes/organisation.json +58 -0
  27. data/schema/schemas/includes/permission.json +46 -0
  28. data/schema/schemas/includes/person.json +62 -0
  29. data/schema/schemas/includes/person_name.json +71 -0
  30. data/schema/schemas/includes/previous_name.json +24 -0
  31. data/schema/schemas/includes/share-parcel-data.json +82 -0
  32. data/schema/schemas/includes/share-parcel.json +78 -0
  33. data/schema/schemas/includes/subsidiary-relationship-data.json +58 -0
  34. data/schema/schemas/includes/total-shares.json +17 -0
  35. data/schema/schemas/includes/unknown_entity_type.json +58 -0
  36. data/schema/schemas/licence-schema.json +105 -0
  37. data/schema/schemas/primary-data-schema.json +20 -0
  38. data/schema/schemas/share-parcel-schema.json +22 -0
  39. data/schema/schemas/simple-financial-payment-schema.json +122 -0
  40. data/schema/schemas/simple-licence-schema.json +82 -0
  41. data/schema/schemas/simple-subsidiary-schema.json +85 -0
  42. data/schema/schemas/subsidiary-relationship-schema.json +46 -0
  43. data/spec/bots/bot-that-crashes-immediately/manifest.json +15 -0
  44. data/spec/bots/bot-that-crashes-immediately/scraper.rb +1 -0
  45. data/spec/bots/bot-that-crashes-immediately/transformer1.rb +15 -0
  46. data/spec/bots/bot-that-crashes-in-scraper/manifest.json +15 -0
  47. data/spec/bots/bot-that-crashes-in-scraper/scraper.rb +11 -0
  48. data/spec/bots/bot-that-crashes-in-scraper/transformer1.rb +15 -0
  49. data/spec/bots/bot-that-crashes-in-transformer/manifest.json +20 -0
  50. data/spec/bots/bot-that-crashes-in-transformer/scraper.rb +10 -0
  51. data/spec/bots/bot-that-crashes-in-transformer/transformer1.rb +15 -0
  52. data/spec/bots/bot-that-crashes-in-transformer/transformer2.rb +17 -0
  53. data/spec/bots/bot-that-emits-run-ended/manifest.json +8 -0
  54. data/spec/bots/bot-that-emits-run-ended/scraper.rb +11 -0
  55. data/spec/bots/bot-that-expects-file/manifest.json +8 -0
  56. data/spec/bots/bot-that-expects-file/scraper.rb +11 -0
  57. data/spec/bots/bot-that-expects-file/something.txt +1 -0
  58. data/spec/bots/bot-with-invalid-data-type/manifest.json +8 -0
  59. data/spec/bots/bot-with-invalid-data-type/scraper.rb +10 -0
  60. data/spec/bots/bot-with-invalid-sample-date/manifest.json +8 -0
  61. data/spec/bots/bot-with-invalid-sample-date/scraper.rb +10 -0
  62. data/spec/bots/bot-with-pause/manifest.json +8 -0
  63. data/spec/bots/bot-with-pause/scraper.rb +16 -0
  64. data/spec/bots/bot-with-transformer/manifest.json +15 -0
  65. data/spec/bots/bot-with-transformer/scraper.rb +10 -0
  66. data/spec/bots/bot-with-transformer/transformer.rb +15 -0
  67. data/spec/bots/bot-with-transformers/manifest.json +20 -0
  68. data/spec/bots/bot-with-transformers/scraper.rb +10 -0
  69. data/spec/bots/bot-with-transformers/transformer1.rb +15 -0
  70. data/spec/bots/bot-with-transformers/transformer2.rb +15 -0
  71. data/spec/bots/invalid-json-bot/manifest.json +8 -0
  72. data/spec/bots/invalid-json-bot/scraper.rb +11 -0
  73. data/spec/bots/invalid-record-bot/manifest.json +8 -0
  74. data/spec/bots/invalid-record-bot/scraper.rb +11 -0
  75. data/spec/bots/logging-bot/manifest.json +8 -0
  76. data/spec/bots/logging-bot/scraper.rb +14 -0
  77. data/spec/bots/python-bot/manifest.json +8 -0
  78. data/spec/bots/python-bot/scraper.py +11 -0
  79. data/spec/bots/ruby-bot/manifest.json +8 -0
  80. data/spec/bots/ruby-bot/scraper.rb +10 -0
  81. data/spec/bots/slow-bot/manifest.json +8 -0
  82. data/spec/bots/slow-bot/scraper.rb +11 -0
  83. data/spec/lib/processor_spec.rb +181 -0
  84. data/spec/lib/runner_spec.rb +330 -0
  85. data/spec/lib/utils_spec.rb +23 -0
  86. data/spec/lib/validator_spec.rb +89 -0
  87. data/spec/manual_spec.rb +57 -0
  88. data/spec/outputs/full-scraper.out +10 -0
  89. data/spec/outputs/full-transformer.out +10 -0
  90. data/spec/outputs/truncated-scraper.out +5 -0
  91. data/spec/spec_helper.rb +20 -0
  92. metadata +148 -0
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ M2U4ZDM0MzAxNWY3ZDJkOWExYTJhNGY2NzRiYjZlMDY3OTM4YzBlYQ==
5
+ data.tar.gz: !binary |-
6
+ MzQ3MmUzY2I5MzhhMWIxYmI4NWU3NGNlMmJkMWIyYjE1MmNmMTZhOQ==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ ZWMxODdmOTNjYzg0ZmIxZTI4MDQyMmRjMzYwNzZiMDA2YTE0M2EyYmJjNDZk
10
+ NjY4MjA3YWRhMzNlOGU5NmUzNDgyYmE3MjMyMDgwMjM5ZDkwMWE2OWU0MGYz
11
+ MzEzZWQ0YzE5NDdkMzc3M2YzMzJjNWM5OWI5YjY2ZmFiYzc4MDM=
12
+ data.tar.gz: !binary |-
13
+ NDVkNTg4ZjlmMjMxNGUwZmJjMTAzZjhhNDE2YWUzZGQyZjNhNTIyMjMwYTJm
14
+ Zjc4ZGRlMTJiMzRlZTI0ZDZiZjVjYjZjNjgxMTFhZDE5YmFkMjViOTcwNDVh
15
+ NjdjMWRiYmJkMGM4NjYwNGJlNGMzMWRiOTE3MzQ3NjNmZmMzNWQ=
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'rspec' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ require 'pathname'
10
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
11
+ Pathname.new(__FILE__).realpath)
12
+
13
+ require 'rubygems'
14
+ require 'bundler/setup'
15
+
16
+ load Gem.bin_path('rspec-core', 'rspec')
@@ -0,0 +1,28 @@
1
+ require 'set'
2
+
3
+ require 'turbot_runner/base_handler'
4
+ require 'turbot_runner/exceptions'
5
+ require 'turbot_runner/processor'
6
+ require 'turbot_runner/runner'
7
+ require 'turbot_runner/script_runner'
8
+ require 'turbot_runner/utils'
9
+ require 'turbot_runner/validator'
10
+ require 'turbot_runner/version'
11
+
12
+ module TurbotRunner
13
+ SCHEMAS_PATH = File.expand_path('../../schema/schemas', __FILE__)
14
+
15
+ def self.schema_path(data_type)
16
+ @schema_paths ||= Hash.new do |h, k|
17
+ h[k] = get_and_validate_schema_path(k)
18
+ end
19
+ @schema_paths[data_type]
20
+ end
21
+
22
+ def self.get_and_validate_schema_path(data_type)
23
+ hyphenated_name = data_type.to_s.gsub("_", "-").gsub(" ", "-")
24
+ path = File.join(SCHEMAS_PATH, "#{hyphenated_name}-schema.json")
25
+ raise TurbotRunner::InvalidDataType unless File.exists?(path)
26
+ path
27
+ end
28
+ end
@@ -0,0 +1,15 @@
1
+ module TurbotRunner
2
+ class BaseHandler
3
+ def handle_valid_record(record, data_type)
4
+ end
5
+
6
+ def handle_run_ended
7
+ end
8
+
9
+ def handle_invalid_record(record, data_type, error_message)
10
+ end
11
+
12
+ def handle_invalid_json(line)
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,4 @@
1
+ module TurbotRunner
2
+ class InterruptRun < StandardError; end
3
+ class InvalidDataType < StandardError; end
4
+ end
@@ -0,0 +1,3 @@
1
+ # Disable output buffering
2
+ STDOUT.sync = true
3
+ STDERR.sync = true
@@ -0,0 +1,53 @@
1
+ require 'openc/json_schema'
2
+
3
+ module TurbotRunner
4
+ class Processor
5
+ def initialize(runner, script_config, record_handler)
6
+ @runner = runner
7
+ @data_type = script_config[:data_type]
8
+ @identifying_fields = script_config[:identifying_fields]
9
+ @record_handler = record_handler
10
+ end
11
+
12
+ def process(line)
13
+ begin
14
+ if line.strip == "RUN ENDED"
15
+ @record_handler.handle_run_ended
16
+ @runner.interrupt if @runner
17
+ else
18
+ record = Openc::JsonSchema.convert_dates(schema_path, JSON.parse(line))
19
+
20
+ record_to_validate = record.select {|k, v| k != 'retrieved_at'}
21
+
22
+ error_message = Validator.validate(
23
+ @data_type,
24
+ record_to_validate,
25
+ @identifying_fields
26
+ )
27
+
28
+ if error_message.nil?
29
+ begin
30
+ @record_handler.handle_valid_record(record, @data_type)
31
+ rescue InterruptRun
32
+ @runner.interrupt if @runner
33
+ end
34
+ else
35
+ @record_handler.handle_invalid_record(record, @data_type, error_message)
36
+ @runner.interrupt_and_mark_as_failed if @runner
37
+ end
38
+ end
39
+ rescue JSON::ParserError
40
+ @record_handler.handle_invalid_json(line)
41
+ @runner.interrupt_and_mark_as_failed if @runner
42
+ end
43
+ end
44
+
45
+ def interrupt
46
+ @runner.interrupt
47
+ end
48
+
49
+ def schema_path
50
+ TurbotRunner.schema_path(@data_type)
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,179 @@
1
+ require 'json'
2
+ require 'fileutils'
3
+ require 'pathname'
4
+
5
+ module TurbotRunner
6
+ class Runner
7
+ attr_reader :base_directory
8
+
9
+ def initialize(directory, options={})
10
+ assert_absolute_path(directory)
11
+ @base_directory = directory
12
+ @config = load_config(directory)
13
+ @record_handler = options[:record_handler]
14
+ @log_to_file = options[:log_to_file]
15
+ @timeout = options[:timeout]
16
+ if options[:output_directory]
17
+ assert_absolute_path(options[:output_directory])
18
+ @output_directory = options[:output_directory]
19
+ else
20
+ @output_directory = File.join(@base_directory, 'output')
21
+ end
22
+ end
23
+
24
+ def run
25
+ set_up_output_directory
26
+
27
+ succeeded = run_script(scraper_config)
28
+ # Run the transformers even if the scraper fails
29
+ transformers.each do |transformer_config|
30
+ succeeded = run_script(
31
+ transformer_config.merge(:base_directory => @base_directory),
32
+ input_file=scraper_output_file) && succeeded
33
+ end
34
+ succeeded
35
+ end
36
+
37
+ def set_up_output_directory
38
+ FileUtils.mkdir_p(@output_directory)
39
+ FileUtils.rm_f(File.join(@output_directory, 'scraper.out'))
40
+ FileUtils.rm_f(File.join(@output_directory, 'scraper.err'))
41
+
42
+ transformers.each do |transformer_config|
43
+ FileUtils.rm_f(File.join(@output_directory, "#{transformer_config[:file]}.out"))
44
+ FileUtils.rm_f(File.join(@output_directory, "#{transformer_config[:file]}.err"))
45
+ end
46
+ end
47
+
48
+ def process_output
49
+ process_script_output(scraper_config)
50
+
51
+ transformers.each do |transformer_config|
52
+ process_script_output(transformer_config.merge(:base_directory => @base_directory))
53
+ end
54
+ end
55
+
56
+ private
57
+ def full_interpreter_path
58
+ if language == "ruby"
59
+ # Ensure we use the same ruby as the current interpreter when
60
+ # creating a subshell. Necessary for OSX packaged version.
61
+ RbConfig.ruby
62
+ else
63
+ # Assume the first python in PATH
64
+ language
65
+ end
66
+ end
67
+
68
+ def load_config(directory)
69
+ manifest_path = File.join(directory, 'manifest.json')
70
+ raise "Could not find #{manifest_path}" unless File.exist?(manifest_path)
71
+
72
+ begin
73
+ json = open(manifest_path) {|f| f.read}
74
+ JSON.parse(json, :symbolize_names => true)
75
+ rescue JSON::ParserError
76
+ # TODO provide better error message
77
+ raise "Could not parse #{manifest_path} as JSON"
78
+ end
79
+ end
80
+
81
+
82
+ def run_script(script_config, input_file=nil)
83
+ command = build_command(script_config[:file], input_file)
84
+ script_runner = ScriptRunner.new(
85
+ command,
86
+ output_file(script_config[:file]),
87
+ script_config,
88
+ :record_handler => @record_handler,
89
+ :timeout => @timeout
90
+ )
91
+
92
+ script_runner.run # returns boolean indicating success
93
+ end
94
+
95
+ def process_script_output(script_config)
96
+ # The first argument to the Processor constructor is a nil
97
+ # Runner. This is because no running behaviour
98
+ # (e.g. interruptions etc) is required; we just want to do
99
+ # record handling.
100
+ processor = Processor.new(nil, script_config, @record_handler)
101
+ file = output_file(script_config[:file])
102
+ File.open(file) do |f|
103
+ f.each_line do |line|
104
+ processor.process(line)
105
+ end
106
+ end
107
+ rescue Errno::ENOENT => e
108
+ # We only want to catch ENOENT if the output file doesn't exist, and not
109
+ # if, for instance, a schema file is missing.
110
+ raise unless e.message == "No such file or directory - #{output_file(script_config[:file])}"
111
+ end
112
+
113
+ def build_command(script, input_file=nil)
114
+ raise "Could not run #{script} with #{language}" unless script_extension == File.extname(script)
115
+ command = "#{full_interpreter_path} #{additional_args} #{script} >#{output_file(script)}"
116
+ command << " 2>#{output_file(script, '.err')}" if @log_to_file
117
+ command << " <#{input_file}" unless input_file.nil?
118
+ command
119
+ end
120
+
121
+ def output_file(script, extension='.out')
122
+ basename = File.basename(script, script_extension)
123
+ File.join(@output_directory, basename) + extension
124
+ end
125
+
126
+ def script_extension
127
+ {
128
+ 'ruby' => '.rb',
129
+ 'python' => '.py',
130
+ }[language]
131
+ end
132
+
133
+ def additional_args
134
+ {
135
+ 'ruby' => "-r#{File.expand_path('../prerun.rb', __FILE__)}",
136
+ 'python' => '-u',
137
+ }[language]
138
+ end
139
+
140
+ def scraper_config
141
+ {
142
+ :base_directory => @base_directory,
143
+ :file => scraper_script,
144
+ :data_type => scraper_data_type,
145
+ :identifying_fields => scraper_identifying_fields
146
+ }
147
+ end
148
+
149
+ def scraper_script
150
+ "scraper#{script_extension}"
151
+ end
152
+
153
+ def transformers
154
+ @config[:transformers] || []
155
+ end
156
+
157
+ def scraper_output_file
158
+ File.join(@output_directory, 'scraper.out')
159
+ end
160
+
161
+ def language
162
+ @config[:language].downcase
163
+ end
164
+
165
+ def scraper_data_type
166
+ @config[:data_type]
167
+ end
168
+
169
+ def scraper_identifying_fields
170
+ @config[:identifying_fields]
171
+ end
172
+
173
+ def assert_absolute_path(path)
174
+ unless Pathname.new(path).absolute?
175
+ raise "#{path} must be an absolute path"
176
+ end
177
+ end
178
+ end
179
+ end
@@ -0,0 +1,98 @@
1
+ # This is a useful blog post:
2
+ # http://blog.robseaman.com/2008/12/12/sending-ctrl-c-to-a-subprocess-with-ruby
3
+
4
+ # Ensure that SIGINT is ignored by the process running this.
5
+ trap('INT') {}
6
+
7
+ module TurbotRunner
8
+ class ScriptRunner
9
+ def initialize(command, output_file, script_config, options={})
10
+ @command = command
11
+ @output_file = output_file
12
+ @script_config = script_config
13
+ record_handler = options[:record_handler] || BaseHandler.new # A BaseHandler does nothing
14
+ @processor = Processor.new(self, script_config, record_handler)
15
+ @timeout = options[:timeout] || 3600
16
+ end
17
+
18
+ def run
19
+ Dir.chdir(@script_config[:base_directory]) do
20
+
21
+ begin
22
+ @interrupted = false
23
+ @failed = false
24
+
25
+ # Start a thread that spawns a subprocess that runs the script and
26
+ # redirects the script's output to a file at a known location.
27
+ script_thread = Thread.new { run_command(@command) }
28
+
29
+ # Wait for the output file to be created, so that we can start to read
30
+ # from it.
31
+ begin
32
+ f = File.open(@output_file, "r")
33
+ rescue Errno::ENOENT
34
+ sleep 0.1
35
+ retry
36
+ end
37
+ # Read from the output file, building up lines byte by byte,
38
+ # until either we reach the end of the file and the script has exited, or
39
+ # @interrupted becomes true. We cannot use IO#readline here because if
40
+ # only half a line has been synced to the file by the time we read it,
41
+ # then the incomplete line will be read, causing chaos down the line.
42
+ line = ''
43
+
44
+ time_of_last_read = Time.now
45
+ until @interrupted do
46
+ byte = f.read(1)
47
+ if byte.nil?
48
+ if script_thread.alive?
49
+ sleep 0.1
50
+ interrupt_and_mark_as_failed if (Time.now - time_of_last_read) > @timeout
51
+ else
52
+ break
53
+ end
54
+ elsif byte == "\n"
55
+ @processor.process(line)
56
+ time_of_last_read = Time.now
57
+ line = ''
58
+ else
59
+ time_of_last_read = Time.now
60
+ line << byte
61
+ end
62
+ end
63
+
64
+ # script_thread may still be alive if we exited the loop above because
65
+ # @interrupted became true, and so we must kill it.
66
+ kill_running_processes if script_thread.alive?
67
+
68
+ @failed ? false : script_thread.join.value
69
+ ensure
70
+ f.close if f
71
+ end
72
+ end
73
+ end
74
+
75
+ def interrupt
76
+ @interrupted = true
77
+ end
78
+
79
+ def interrupt_and_mark_as_failed
80
+ @interrupted = true
81
+ @failed = true
82
+ end
83
+
84
+ private
85
+ def run_command(command)
86
+ system(command)
87
+ # A nil exitstatus indicates that the script was interrupted. A
88
+ # termsig of 2 indicates that the script was interrupted by a SIGINT.
89
+ $?.exitstatus == 0 || ($?.exitstatus.nil? && $?.termsig == 2)
90
+ end
91
+
92
+ def kill_running_processes
93
+ # Send SIGINT to each process in the current process group, having
94
+ # already ensured that the current process itself ignores the signal.
95
+ Process.kill('INT', 0)
96
+ end
97
+ end
98
+ end
@@ -0,0 +1,47 @@
1
+ module TurbotRunner
2
+ module Utils
3
+ extend self
4
+
5
+ def deep_copy(thing)
6
+ Marshal.load(Marshal.dump(thing))
7
+ end
8
+
9
+ # This turns a hash of the form:
10
+ #
11
+ # {
12
+ # 'a' => {
13
+ # 'b' => {
14
+ # 'c' => '123',
15
+ # 'd' => '124',
16
+ # },
17
+ # 'e' => {
18
+ # 'f' => '156',
19
+ # }
20
+ # }
21
+ # }
22
+ #
23
+ # into a hash of the form:
24
+ #
25
+ # {
26
+ # 'a.b.c' => '123',
27
+ # 'a.b.d' => '124',
28
+ # 'a.e.f' => '156',
29
+ # }
30
+ def flatten(hash)
31
+ pairs = []
32
+
33
+ hash.each do |k, v|
34
+ case v
35
+ when Hash
36
+ flatten(v).each do |k1, v1|
37
+ pairs << ["#{k}.#{k1}", v1]
38
+ end
39
+ else
40
+ pairs << [k, v]
41
+ end
42
+ end
43
+
44
+ Hash[pairs]
45
+ end
46
+ end
47
+ end