turbot-runner-morph 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (92) hide show
  1. checksums.yaml +15 -0
  2. data/bin/rspec +16 -0
  3. data/lib/turbot_runner.rb +28 -0
  4. data/lib/turbot_runner/base_handler.rb +15 -0
  5. data/lib/turbot_runner/exceptions.rb +4 -0
  6. data/lib/turbot_runner/prerun.rb +3 -0
  7. data/lib/turbot_runner/processor.rb +53 -0
  8. data/lib/turbot_runner/runner.rb +179 -0
  9. data/lib/turbot_runner/script_runner.rb +98 -0
  10. data/lib/turbot_runner/utils.rb +47 -0
  11. data/lib/turbot_runner/validator.rb +28 -0
  12. data/lib/turbot_runner/version.rb +3 -0
  13. data/schema/schemas/company-schema.json +243 -0
  14. data/schema/schemas/financial-payment-schema.json +32 -0
  15. data/schema/schemas/includes/address.json +53 -0
  16. data/schema/schemas/includes/alternative_name.json +36 -0
  17. data/schema/schemas/includes/company-for-nesting.json +245 -0
  18. data/schema/schemas/includes/company.json +25 -0
  19. data/schema/schemas/includes/entity.json +58 -0
  20. data/schema/schemas/includes/filing.json +52 -0
  21. data/schema/schemas/includes/financial-payment-data-object.json +112 -0
  22. data/schema/schemas/includes/identifier.json +20 -0
  23. data/schema/schemas/includes/industry_code.json +29 -0
  24. data/schema/schemas/includes/licence-data-object.json +63 -0
  25. data/schema/schemas/includes/officer.json +70 -0
  26. data/schema/schemas/includes/organisation.json +58 -0
  27. data/schema/schemas/includes/permission.json +46 -0
  28. data/schema/schemas/includes/person.json +62 -0
  29. data/schema/schemas/includes/person_name.json +71 -0
  30. data/schema/schemas/includes/previous_name.json +24 -0
  31. data/schema/schemas/includes/share-parcel-data.json +82 -0
  32. data/schema/schemas/includes/share-parcel.json +78 -0
  33. data/schema/schemas/includes/subsidiary-relationship-data.json +58 -0
  34. data/schema/schemas/includes/total-shares.json +17 -0
  35. data/schema/schemas/includes/unknown_entity_type.json +58 -0
  36. data/schema/schemas/licence-schema.json +105 -0
  37. data/schema/schemas/primary-data-schema.json +20 -0
  38. data/schema/schemas/share-parcel-schema.json +22 -0
  39. data/schema/schemas/simple-financial-payment-schema.json +122 -0
  40. data/schema/schemas/simple-licence-schema.json +82 -0
  41. data/schema/schemas/simple-subsidiary-schema.json +85 -0
  42. data/schema/schemas/subsidiary-relationship-schema.json +46 -0
  43. data/spec/bots/bot-that-crashes-immediately/manifest.json +15 -0
  44. data/spec/bots/bot-that-crashes-immediately/scraper.rb +1 -0
  45. data/spec/bots/bot-that-crashes-immediately/transformer1.rb +15 -0
  46. data/spec/bots/bot-that-crashes-in-scraper/manifest.json +15 -0
  47. data/spec/bots/bot-that-crashes-in-scraper/scraper.rb +11 -0
  48. data/spec/bots/bot-that-crashes-in-scraper/transformer1.rb +15 -0
  49. data/spec/bots/bot-that-crashes-in-transformer/manifest.json +20 -0
  50. data/spec/bots/bot-that-crashes-in-transformer/scraper.rb +10 -0
  51. data/spec/bots/bot-that-crashes-in-transformer/transformer1.rb +15 -0
  52. data/spec/bots/bot-that-crashes-in-transformer/transformer2.rb +17 -0
  53. data/spec/bots/bot-that-emits-run-ended/manifest.json +8 -0
  54. data/spec/bots/bot-that-emits-run-ended/scraper.rb +11 -0
  55. data/spec/bots/bot-that-expects-file/manifest.json +8 -0
  56. data/spec/bots/bot-that-expects-file/scraper.rb +11 -0
  57. data/spec/bots/bot-that-expects-file/something.txt +1 -0
  58. data/spec/bots/bot-with-invalid-data-type/manifest.json +8 -0
  59. data/spec/bots/bot-with-invalid-data-type/scraper.rb +10 -0
  60. data/spec/bots/bot-with-invalid-sample-date/manifest.json +8 -0
  61. data/spec/bots/bot-with-invalid-sample-date/scraper.rb +10 -0
  62. data/spec/bots/bot-with-pause/manifest.json +8 -0
  63. data/spec/bots/bot-with-pause/scraper.rb +16 -0
  64. data/spec/bots/bot-with-transformer/manifest.json +15 -0
  65. data/spec/bots/bot-with-transformer/scraper.rb +10 -0
  66. data/spec/bots/bot-with-transformer/transformer.rb +15 -0
  67. data/spec/bots/bot-with-transformers/manifest.json +20 -0
  68. data/spec/bots/bot-with-transformers/scraper.rb +10 -0
  69. data/spec/bots/bot-with-transformers/transformer1.rb +15 -0
  70. data/spec/bots/bot-with-transformers/transformer2.rb +15 -0
  71. data/spec/bots/invalid-json-bot/manifest.json +8 -0
  72. data/spec/bots/invalid-json-bot/scraper.rb +11 -0
  73. data/spec/bots/invalid-record-bot/manifest.json +8 -0
  74. data/spec/bots/invalid-record-bot/scraper.rb +11 -0
  75. data/spec/bots/logging-bot/manifest.json +8 -0
  76. data/spec/bots/logging-bot/scraper.rb +14 -0
  77. data/spec/bots/python-bot/manifest.json +8 -0
  78. data/spec/bots/python-bot/scraper.py +11 -0
  79. data/spec/bots/ruby-bot/manifest.json +8 -0
  80. data/spec/bots/ruby-bot/scraper.rb +10 -0
  81. data/spec/bots/slow-bot/manifest.json +8 -0
  82. data/spec/bots/slow-bot/scraper.rb +11 -0
  83. data/spec/lib/processor_spec.rb +181 -0
  84. data/spec/lib/runner_spec.rb +330 -0
  85. data/spec/lib/utils_spec.rb +23 -0
  86. data/spec/lib/validator_spec.rb +89 -0
  87. data/spec/manual_spec.rb +57 -0
  88. data/spec/outputs/full-scraper.out +10 -0
  89. data/spec/outputs/full-transformer.out +10 -0
  90. data/spec/outputs/truncated-scraper.out +5 -0
  91. data/spec/spec_helper.rb +20 -0
  92. metadata +148 -0
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ M2U4ZDM0MzAxNWY3ZDJkOWExYTJhNGY2NzRiYjZlMDY3OTM4YzBlYQ==
5
+ data.tar.gz: !binary |-
6
+ MzQ3MmUzY2I5MzhhMWIxYmI4NWU3NGNlMmJkMWIyYjE1MmNmMTZhOQ==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ ZWMxODdmOTNjYzg0ZmIxZTI4MDQyMmRjMzYwNzZiMDA2YTE0M2EyYmJjNDZk
10
+ NjY4MjA3YWRhMzNlOGU5NmUzNDgyYmE3MjMyMDgwMjM5ZDkwMWE2OWU0MGYz
11
+ MzEzZWQ0YzE5NDdkMzc3M2YzMzJjNWM5OWI5YjY2ZmFiYzc4MDM=
12
+ data.tar.gz: !binary |-
13
+ NDVkNTg4ZjlmMjMxNGUwZmJjMTAzZjhhNDE2YWUzZGQyZjNhNTIyMjMwYTJm
14
+ Zjc4ZGRlMTJiMzRlZTI0ZDZiZjVjYjZjNjgxMTFhZDE5YmFkMjViOTcwNDVh
15
+ NjdjMWRiYmJkMGM4NjYwNGJlNGMzMWRiOTE3MzQ3NjNmZmMzNWQ=
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'rspec' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ require 'pathname'
10
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
11
+ Pathname.new(__FILE__).realpath)
12
+
13
+ require 'rubygems'
14
+ require 'bundler/setup'
15
+
16
+ load Gem.bin_path('rspec-core', 'rspec')
@@ -0,0 +1,28 @@
1
+ require 'set'
2
+
3
+ require 'turbot_runner/base_handler'
4
+ require 'turbot_runner/exceptions'
5
+ require 'turbot_runner/processor'
6
+ require 'turbot_runner/runner'
7
+ require 'turbot_runner/script_runner'
8
+ require 'turbot_runner/utils'
9
+ require 'turbot_runner/validator'
10
+ require 'turbot_runner/version'
11
+
12
+ module TurbotRunner
13
+ SCHEMAS_PATH = File.expand_path('../../schema/schemas', __FILE__)
14
+
15
+ def self.schema_path(data_type)
16
+ @schema_paths ||= Hash.new do |h, k|
17
+ h[k] = get_and_validate_schema_path(k)
18
+ end
19
+ @schema_paths[data_type]
20
+ end
21
+
22
+ def self.get_and_validate_schema_path(data_type)
23
+ hyphenated_name = data_type.to_s.gsub("_", "-").gsub(" ", "-")
24
+ path = File.join(SCHEMAS_PATH, "#{hyphenated_name}-schema.json")
25
+ raise TurbotRunner::InvalidDataType unless File.exists?(path)
26
+ path
27
+ end
28
+ end
@@ -0,0 +1,15 @@
1
+ module TurbotRunner
2
+ class BaseHandler
3
+ def handle_valid_record(record, data_type)
4
+ end
5
+
6
+ def handle_run_ended
7
+ end
8
+
9
+ def handle_invalid_record(record, data_type, error_message)
10
+ end
11
+
12
+ def handle_invalid_json(line)
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,4 @@
1
+ module TurbotRunner
2
+ class InterruptRun < StandardError; end
3
+ class InvalidDataType < StandardError; end
4
+ end
@@ -0,0 +1,3 @@
1
+ # Disable output buffering
2
+ STDOUT.sync = true
3
+ STDERR.sync = true
@@ -0,0 +1,53 @@
1
+ require 'openc/json_schema'
2
+
3
+ module TurbotRunner
4
+ class Processor
5
+ def initialize(runner, script_config, record_handler)
6
+ @runner = runner
7
+ @data_type = script_config[:data_type]
8
+ @identifying_fields = script_config[:identifying_fields]
9
+ @record_handler = record_handler
10
+ end
11
+
12
+ def process(line)
13
+ begin
14
+ if line.strip == "RUN ENDED"
15
+ @record_handler.handle_run_ended
16
+ @runner.interrupt if @runner
17
+ else
18
+ record = Openc::JsonSchema.convert_dates(schema_path, JSON.parse(line))
19
+
20
+ record_to_validate = record.select {|k, v| k != 'retrieved_at'}
21
+
22
+ error_message = Validator.validate(
23
+ @data_type,
24
+ record_to_validate,
25
+ @identifying_fields
26
+ )
27
+
28
+ if error_message.nil?
29
+ begin
30
+ @record_handler.handle_valid_record(record, @data_type)
31
+ rescue InterruptRun
32
+ @runner.interrupt if @runner
33
+ end
34
+ else
35
+ @record_handler.handle_invalid_record(record, @data_type, error_message)
36
+ @runner.interrupt_and_mark_as_failed if @runner
37
+ end
38
+ end
39
+ rescue JSON::ParserError
40
+ @record_handler.handle_invalid_json(line)
41
+ @runner.interrupt_and_mark_as_failed if @runner
42
+ end
43
+ end
44
+
45
+ def interrupt
46
+ @runner.interrupt
47
+ end
48
+
49
+ def schema_path
50
+ TurbotRunner.schema_path(@data_type)
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,179 @@
1
+ require 'json'
2
+ require 'fileutils'
3
+ require 'pathname'
4
+
5
+ module TurbotRunner
6
+ class Runner
7
+ attr_reader :base_directory
8
+
9
+ def initialize(directory, options={})
10
+ assert_absolute_path(directory)
11
+ @base_directory = directory
12
+ @config = load_config(directory)
13
+ @record_handler = options[:record_handler]
14
+ @log_to_file = options[:log_to_file]
15
+ @timeout = options[:timeout]
16
+ if options[:output_directory]
17
+ assert_absolute_path(options[:output_directory])
18
+ @output_directory = options[:output_directory]
19
+ else
20
+ @output_directory = File.join(@base_directory, 'output')
21
+ end
22
+ end
23
+
24
+ def run
25
+ set_up_output_directory
26
+
27
+ succeeded = run_script(scraper_config)
28
+ # Run the transformers even if the scraper fails
29
+ transformers.each do |transformer_config|
30
+ succeeded = run_script(
31
+ transformer_config.merge(:base_directory => @base_directory),
32
+ input_file=scraper_output_file) && succeeded
33
+ end
34
+ succeeded
35
+ end
36
+
37
+ def set_up_output_directory
38
+ FileUtils.mkdir_p(@output_directory)
39
+ FileUtils.rm_f(File.join(@output_directory, 'scraper.out'))
40
+ FileUtils.rm_f(File.join(@output_directory, 'scraper.err'))
41
+
42
+ transformers.each do |transformer_config|
43
+ FileUtils.rm_f(File.join(@output_directory, "#{transformer_config[:file]}.out"))
44
+ FileUtils.rm_f(File.join(@output_directory, "#{transformer_config[:file]}.err"))
45
+ end
46
+ end
47
+
48
+ def process_output
49
+ process_script_output(scraper_config)
50
+
51
+ transformers.each do |transformer_config|
52
+ process_script_output(transformer_config.merge(:base_directory => @base_directory))
53
+ end
54
+ end
55
+
56
+ private
57
+ def full_interpreter_path
58
+ if language == "ruby"
59
+ # Ensure we use the same ruby as the current interpreter when
60
+ # creating a subshell. Necessary for OSX packaged version.
61
+ RbConfig.ruby
62
+ else
63
+ # Assume the first python in PATH
64
+ language
65
+ end
66
+ end
67
+
68
+ def load_config(directory)
69
+ manifest_path = File.join(directory, 'manifest.json')
70
+ raise "Could not find #{manifest_path}" unless File.exist?(manifest_path)
71
+
72
+ begin
73
+ json = open(manifest_path) {|f| f.read}
74
+ JSON.parse(json, :symbolize_names => true)
75
+ rescue JSON::ParserError
76
+ # TODO provide better error message
77
+ raise "Could not parse #{manifest_path} as JSON"
78
+ end
79
+ end
80
+
81
+
82
+ def run_script(script_config, input_file=nil)
83
+ command = build_command(script_config[:file], input_file)
84
+ script_runner = ScriptRunner.new(
85
+ command,
86
+ output_file(script_config[:file]),
87
+ script_config,
88
+ :record_handler => @record_handler,
89
+ :timeout => @timeout
90
+ )
91
+
92
+ script_runner.run # returns boolean indicating success
93
+ end
94
+
95
+ def process_script_output(script_config)
96
+ # The first argument to the Processor constructor is a nil
97
+ # Runner. This is because no running behaviour
98
+ # (e.g. interruptions etc) is required; we just want to do
99
+ # record handling.
100
+ processor = Processor.new(nil, script_config, @record_handler)
101
+ file = output_file(script_config[:file])
102
+ File.open(file) do |f|
103
+ f.each_line do |line|
104
+ processor.process(line)
105
+ end
106
+ end
107
+ rescue Errno::ENOENT => e
108
+ # We only want to catch ENOENT if the output file doesn't exist, and not
109
+ # if, for instance, a schema file is missing.
110
+ raise unless e.message == "No such file or directory - #{output_file(script_config[:file])}"
111
+ end
112
+
113
+ def build_command(script, input_file=nil)
114
+ raise "Could not run #{script} with #{language}" unless script_extension == File.extname(script)
115
+ command = "#{full_interpreter_path} #{additional_args} #{script} >#{output_file(script)}"
116
+ command << " 2>#{output_file(script, '.err')}" if @log_to_file
117
+ command << " <#{input_file}" unless input_file.nil?
118
+ command
119
+ end
120
+
121
+ def output_file(script, extension='.out')
122
+ basename = File.basename(script, script_extension)
123
+ File.join(@output_directory, basename) + extension
124
+ end
125
+
126
+ def script_extension
127
+ {
128
+ 'ruby' => '.rb',
129
+ 'python' => '.py',
130
+ }[language]
131
+ end
132
+
133
+ def additional_args
134
+ {
135
+ 'ruby' => "-r#{File.expand_path('../prerun.rb', __FILE__)}",
136
+ 'python' => '-u',
137
+ }[language]
138
+ end
139
+
140
+ def scraper_config
141
+ {
142
+ :base_directory => @base_directory,
143
+ :file => scraper_script,
144
+ :data_type => scraper_data_type,
145
+ :identifying_fields => scraper_identifying_fields
146
+ }
147
+ end
148
+
149
+ def scraper_script
150
+ "scraper#{script_extension}"
151
+ end
152
+
153
+ def transformers
154
+ @config[:transformers] || []
155
+ end
156
+
157
+ def scraper_output_file
158
+ File.join(@output_directory, 'scraper.out')
159
+ end
160
+
161
+ def language
162
+ @config[:language].downcase
163
+ end
164
+
165
+ def scraper_data_type
166
+ @config[:data_type]
167
+ end
168
+
169
+ def scraper_identifying_fields
170
+ @config[:identifying_fields]
171
+ end
172
+
173
+ def assert_absolute_path(path)
174
+ unless Pathname.new(path).absolute?
175
+ raise "#{path} must be an absolute path"
176
+ end
177
+ end
178
+ end
179
+ end
@@ -0,0 +1,98 @@
1
+ # This is a useful blog post:
2
+ # http://blog.robseaman.com/2008/12/12/sending-ctrl-c-to-a-subprocess-with-ruby
3
+
4
+ # Ensure that SIGINT is ignored by the process running this.
5
+ trap('INT') {}
6
+
7
+ module TurbotRunner
8
+ class ScriptRunner
9
+ def initialize(command, output_file, script_config, options={})
10
+ @command = command
11
+ @output_file = output_file
12
+ @script_config = script_config
13
+ record_handler = options[:record_handler] || BaseHandler.new # A BaseHandler does nothing
14
+ @processor = Processor.new(self, script_config, record_handler)
15
+ @timeout = options[:timeout] || 3600
16
+ end
17
+
18
+ def run
19
+ Dir.chdir(@script_config[:base_directory]) do
20
+
21
+ begin
22
+ @interrupted = false
23
+ @failed = false
24
+
25
+ # Start a thread that spawns a subprocess that runs the script and
26
+ # redirects the script's output to a file at a known location.
27
+ script_thread = Thread.new { run_command(@command) }
28
+
29
+ # Wait for the output file to be created, so that we can start to read
30
+ # from it.
31
+ begin
32
+ f = File.open(@output_file, "r")
33
+ rescue Errno::ENOENT
34
+ sleep 0.1
35
+ retry
36
+ end
37
+ # Read from output file, building up lines byte by byte,
38
+ # until either we reach the end of the file and the script has exited, or
39
+ # @interrupted becomes true. We cannot use IO#readline here because if
40
+ # only half a line has been synced to the file by the time we read it,
41
+ # then the incomplete line will be read, causing chaos down the line.
42
+ line = ''
43
+
44
+ time_of_last_read = Time.now
45
+ until @interrupted do
46
+ byte = f.read(1)
47
+ if byte.nil?
48
+ if script_thread.alive?
49
+ sleep 0.1
50
+ interrupt_and_mark_as_failed if (Time.now - time_of_last_read) > @timeout
51
+ else
52
+ break
53
+ end
54
+ elsif byte == "\n"
55
+ @processor.process(line)
56
+ time_of_last_read = Time.now
57
+ line = ''
58
+ else
59
+ time_of_last_read = Time.now
60
+ line << byte
61
+ end
62
+ end
63
+
64
+ # script_thread may still be alive if we exited the loop above because
65
+ # @interrupted became true, and so we must kill it.
66
+ kill_running_processes if script_thread.alive?
67
+
68
+ @failed ? false : script_thread.join.value
69
+ ensure
70
+ f.close if f
71
+ end
72
+ end
73
+ end
74
+
75
+ def interrupt
76
+ @interrupted = true
77
+ end
78
+
79
+ def interrupt_and_mark_as_failed
80
+ @interrupted = true
81
+ @failed = true
82
+ end
83
+
84
+ private
85
+ def run_command(command)
86
+ system(command)
87
+ # A nil exitstatus indicates that the script was interrupted. A
88
+ # termsig of 2 indicates that the script was interrupted by a SIGINT.
89
+ $?.exitstatus == 0 || ($?.exitstatus.nil? && $?.termsig == 2)
90
+ end
91
+
92
+ def kill_running_processes
93
+ # Send SIGINT to each process in the current process group, having
94
+ # already ensured that the current process itself ignores the signal.
95
+ Process.kill('INT', 0)
96
+ end
97
+ end
98
+ end
@@ -0,0 +1,47 @@
1
+ module TurbotRunner
2
+ module Utils
3
+ extend self
4
+
5
+ def deep_copy(thing)
6
+ Marshal.load(Marshal.dump(thing))
7
+ end
8
+
9
+ # This turns a hash of the form:
10
+ #
11
+ # {
12
+ # 'a' => {
13
+ # 'b' => {
14
+ # 'c' => '123',
15
+ # 'd' => '124',
16
+ # },
17
+ # 'e' => {
18
+ # 'f' => '156',
19
+ # }
20
+ # }
21
+ # }
22
+ #
23
+ # into a hash of the form:
24
+ #
25
+ # {
26
+ # 'a.b.c' => '123',
27
+ # 'a.b.d' => '124',
28
+ # 'a.e.f' => '156',
29
+ # }
30
+ def flatten(hash)
31
+ pairs = []
32
+
33
+ hash.each do |k, v|
34
+ case v
35
+ when Hash
36
+ flatten(v).each do |k1, v1|
37
+ pairs << ["#{k}.#{k1}", v1]
38
+ end
39
+ else
40
+ pairs << [k, v]
41
+ end
42
+ end
43
+
44
+ Hash[pairs]
45
+ end
46
+ end
47
+ end