RubyGems - turbot-runner-morph - Versions diffs - 0.0.1 - Mend

turbot-runner-morph 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (92) hide show

checksums.yaml +15 -0
data/bin/rspec +16 -0
data/lib/turbot_runner.rb +28 -0
data/lib/turbot_runner/base_handler.rb +15 -0
data/lib/turbot_runner/exceptions.rb +4 -0
data/lib/turbot_runner/prerun.rb +3 -0
data/lib/turbot_runner/processor.rb +53 -0
data/lib/turbot_runner/runner.rb +179 -0
data/lib/turbot_runner/script_runner.rb +98 -0
data/lib/turbot_runner/utils.rb +47 -0
data/lib/turbot_runner/validator.rb +28 -0
data/lib/turbot_runner/version.rb +3 -0
data/schema/schemas/company-schema.json +243 -0
data/schema/schemas/financial-payment-schema.json +32 -0
data/schema/schemas/includes/address.json +53 -0
data/schema/schemas/includes/alternative_name.json +36 -0
data/schema/schemas/includes/company-for-nesting.json +245 -0
data/schema/schemas/includes/company.json +25 -0
data/schema/schemas/includes/entity.json +58 -0
data/schema/schemas/includes/filing.json +52 -0
data/schema/schemas/includes/financial-payment-data-object.json +112 -0
data/schema/schemas/includes/identifier.json +20 -0
data/schema/schemas/includes/industry_code.json +29 -0
data/schema/schemas/includes/licence-data-object.json +63 -0
data/schema/schemas/includes/officer.json +70 -0
data/schema/schemas/includes/organisation.json +58 -0
data/schema/schemas/includes/permission.json +46 -0
data/schema/schemas/includes/person.json +62 -0
data/schema/schemas/includes/person_name.json +71 -0
data/schema/schemas/includes/previous_name.json +24 -0
data/schema/schemas/includes/share-parcel-data.json +82 -0
data/schema/schemas/includes/share-parcel.json +78 -0
data/schema/schemas/includes/subsidiary-relationship-data.json +58 -0
data/schema/schemas/includes/total-shares.json +17 -0
data/schema/schemas/includes/unknown_entity_type.json +58 -0
data/schema/schemas/licence-schema.json +105 -0
data/schema/schemas/primary-data-schema.json +20 -0
data/schema/schemas/share-parcel-schema.json +22 -0
data/schema/schemas/simple-financial-payment-schema.json +122 -0
data/schema/schemas/simple-licence-schema.json +82 -0
data/schema/schemas/simple-subsidiary-schema.json +85 -0
data/schema/schemas/subsidiary-relationship-schema.json +46 -0
data/spec/bots/bot-that-crashes-immediately/manifest.json +15 -0
data/spec/bots/bot-that-crashes-immediately/scraper.rb +1 -0
data/spec/bots/bot-that-crashes-immediately/transformer1.rb +15 -0
data/spec/bots/bot-that-crashes-in-scraper/manifest.json +15 -0
data/spec/bots/bot-that-crashes-in-scraper/scraper.rb +11 -0
data/spec/bots/bot-that-crashes-in-scraper/transformer1.rb +15 -0
data/spec/bots/bot-that-crashes-in-transformer/manifest.json +20 -0
data/spec/bots/bot-that-crashes-in-transformer/scraper.rb +10 -0
data/spec/bots/bot-that-crashes-in-transformer/transformer1.rb +15 -0
data/spec/bots/bot-that-crashes-in-transformer/transformer2.rb +17 -0
data/spec/bots/bot-that-emits-run-ended/manifest.json +8 -0
data/spec/bots/bot-that-emits-run-ended/scraper.rb +11 -0
data/spec/bots/bot-that-expects-file/manifest.json +8 -0
data/spec/bots/bot-that-expects-file/scraper.rb +11 -0
data/spec/bots/bot-that-expects-file/something.txt +1 -0
data/spec/bots/bot-with-invalid-data-type/manifest.json +8 -0
data/spec/bots/bot-with-invalid-data-type/scraper.rb +10 -0
data/spec/bots/bot-with-invalid-sample-date/manifest.json +8 -0
data/spec/bots/bot-with-invalid-sample-date/scraper.rb +10 -0
data/spec/bots/bot-with-pause/manifest.json +8 -0
data/spec/bots/bot-with-pause/scraper.rb +16 -0
data/spec/bots/bot-with-transformer/manifest.json +15 -0
data/spec/bots/bot-with-transformer/scraper.rb +10 -0
data/spec/bots/bot-with-transformer/transformer.rb +15 -0
data/spec/bots/bot-with-transformers/manifest.json +20 -0
data/spec/bots/bot-with-transformers/scraper.rb +10 -0
data/spec/bots/bot-with-transformers/transformer1.rb +15 -0
data/spec/bots/bot-with-transformers/transformer2.rb +15 -0
data/spec/bots/invalid-json-bot/manifest.json +8 -0
data/spec/bots/invalid-json-bot/scraper.rb +11 -0
data/spec/bots/invalid-record-bot/manifest.json +8 -0
data/spec/bots/invalid-record-bot/scraper.rb +11 -0
data/spec/bots/logging-bot/manifest.json +8 -0
data/spec/bots/logging-bot/scraper.rb +14 -0
data/spec/bots/python-bot/manifest.json +8 -0
data/spec/bots/python-bot/scraper.py +11 -0
data/spec/bots/ruby-bot/manifest.json +8 -0
data/spec/bots/ruby-bot/scraper.rb +10 -0
data/spec/bots/slow-bot/manifest.json +8 -0
data/spec/bots/slow-bot/scraper.rb +11 -0
data/spec/lib/processor_spec.rb +181 -0
data/spec/lib/runner_spec.rb +330 -0
data/spec/lib/utils_spec.rb +23 -0
data/spec/lib/validator_spec.rb +89 -0
data/spec/manual_spec.rb +57 -0
data/spec/outputs/full-scraper.out +10 -0
data/spec/outputs/full-transformer.out +10 -0
data/spec/outputs/truncated-scraper.out +5 -0
data/spec/spec_helper.rb +20 -0
metadata +148 -0

checksums.yaml ADDED

@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    M2U4ZDM0MzAxNWY3ZDJkOWExYTJhNGY2NzRiYjZlMDY3OTM4YzBlYQ==
+  data.tar.gz: !binary |-
+    MzQ3MmUzY2I5MzhhMWIxYmI4NWU3NGNlMmJkMWIyYjE1MmNmMTZhOQ==
+SHA512:
+  metadata.gz: !binary |-
+    ZWMxODdmOTNjYzg0ZmIxZTI4MDQyMmRjMzYwNzZiMDA2YTE0M2EyYmJjNDZk
+    NjY4MjA3YWRhMzNlOGU5NmUzNDgyYmE3MjMyMDgwMjM5ZDkwMWE2OWU0MGYz
+    MzEzZWQ0YzE5NDdkMzc3M2YzMzJjNWM5OWI5YjY2ZmFiYzc4MDM=
+  data.tar.gz: !binary |-
+    NDVkNTg4ZjlmMjMxNGUwZmJjMTAzZjhhNDE2YWUzZGQyZjNhNTIyMjMwYTJm
+    Zjc4ZGRlMTJiMzRlZTI0ZDZiZjVjYjZjNjgxMTFhZDE5YmFkMjViOTcwNDVh
+    NjdjMWRiYmJkMGM4NjYwNGJlNGMzMWRiOTE3MzQ3NjNmZmMzNWQ=

data/bin/rspec ADDED

@@ -0,0 +1,16 @@
+#!/usr/bin/env ruby
+#
+# This file was generated by Bundler.
+#
+# The application 'rspec' is installed as part of a gem, and
+# this file is here to facilitate running it.
+#
+require 'pathname'
+# Point Bundler at the Gemfile two levels above this file's real
+# (symlink-resolved) location, unless BUNDLE_GEMFILE is already set.
+ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
+  Pathname.new(__FILE__).realpath)
+require 'rubygems'
+require 'bundler/setup'
+# Hand over to the executable named 'rspec' shipped by the rspec-core gem.
+load Gem.bin_path('rspec-core', 'rspec')

data/lib/turbot_runner.rb ADDED

@@ -0,0 +1,28 @@
+require 'set'
+require 'turbot_runner/base_handler'
+require 'turbot_runner/exceptions'
+require 'turbot_runner/processor'
+require 'turbot_runner/runner'
+require 'turbot_runner/script_runner'
+require 'turbot_runner/utils'
+require 'turbot_runner/validator'
+require 'turbot_runner/version'
+module TurbotRunner
+  SCHEMAS_PATH = File.expand_path('../../schema/schemas', __FILE__)
+  def self.schema_path(data_type)
+    @schema_paths ||= Hash.new do |h, k|
+      h[k] = get_and_validate_schema_path(k)
+    end
+    @schema_paths[data_type]
+  end
+  def self.get_and_validate_schema_path(data_type)
+    hyphenated_name = data_type.to_s.gsub("_", "-").gsub(" ", "-")
+    path = File.join(SCHEMAS_PATH, "#{hyphenated_name}-schema.json")
+    raise TurbotRunner::InvalidDataType unless File.exists?(path)
+    path
+  end
+end

data/lib/turbot_runner/base_handler.rb ADDED

@@ -0,0 +1,15 @@
+module TurbotRunner
+  class BaseHandler
+    def handle_valid_record(record, data_type)
+    end
+    def handle_run_ended
+    end
+    def handle_invalid_record(record, data_type, error_message)
+    end
+    def handle_invalid_json(line)
+    end
+  end
+end

data/lib/turbot_runner/exceptions.rb ADDED

@@ -0,0 +1,4 @@
+module TurbotRunner
+  class InterruptRun < StandardError; end
+  class InvalidDataType < StandardError; end
+end

data/lib/turbot_runner/prerun.rb ADDED

@@ -0,0 +1,3 @@
+# Disable output buffering
+STDOUT.sync = true
+STDERR.sync = true

data/lib/turbot_runner/processor.rb ADDED

@@ -0,0 +1,53 @@
+require 'openc/json_schema'
+module TurbotRunner
+  class Processor
+    def initialize(runner, script_config, record_handler)
+      @runner = runner
+      @data_type = script_config[:data_type]
+      @identifying_fields = script_config[:identifying_fields]
+      @record_handler = record_handler
+    end
+    def process(line)
+      begin
+        if line.strip == "RUN ENDED"
+          @record_handler.handle_run_ended
+          @runner.interrupt if @runner
+        else
+          record = Openc::JsonSchema.convert_dates(schema_path, JSON.parse(line))
+          record_to_validate = record.select {|k, v| k != 'retrieved_at'}
+          error_message = Validator.validate(
+            @data_type,
+            record_to_validate,
+            @identifying_fields
+          )
+          if error_message.nil?
+            begin
+              @record_handler.handle_valid_record(record, @data_type)
+            rescue InterruptRun
+              @runner.interrupt if @runner
+            end
+          else
+            @record_handler.handle_invalid_record(record, @data_type, error_message)
+            @runner.interrupt_and_mark_as_failed if @runner
+          end
+        end
+      rescue JSON::ParserError
+        @record_handler.handle_invalid_json(line)
+        @runner.interrupt_and_mark_as_failed if @runner
+      end
+    end
+    def interrupt
+      @runner.interrupt
+    end
+    def schema_path
+      TurbotRunner.schema_path(@data_type)
+    end
+  end
+end

data/lib/turbot_runner/runner.rb ADDED

@@ -0,0 +1,179 @@
+require 'fileutils'
+require 'json'
+require 'pathname'
+require 'shellwords'
+module TurbotRunner
+  class Runner
+    attr_reader :base_directory
+    def initialize(directory, options={})
+      assert_absolute_path(directory)
+      @base_directory = directory
+      @config = load_config(directory)
+      @record_handler = options[:record_handler]
+      @log_to_file = options[:log_to_file]
+      @timeout = options[:timeout]
+      if options[:output_directory]
+        assert_absolute_path(options[:output_directory])
+        @output_directory = options[:output_directory]
+      else
+        @output_directory = File.join(@base_directory, 'output')
+      end
+    end
+    def run
+      set_up_output_directory
+      succeeded = run_script(scraper_config)
+      # Run the transformers even if the scraper fails
+      transformers.each do |transformer_config|
+        succeeded = run_script(
+          transformer_config.merge(:base_directory => @base_directory),
+          input_file=scraper_output_file) && succeeded
+      end
+      succeeded
+    end
+    def set_up_output_directory
+      FileUtils.mkdir_p(@output_directory)
+      FileUtils.rm_f(File.join(@output_directory, 'scraper.out'))
+      FileUtils.rm_f(File.join(@output_directory, 'scraper.err'))
+      transformers.each do |transformer_config|
+        FileUtils.rm_f(File.join(@output_directory, "#{transformer_config[:file]}.out"))
+        FileUtils.rm_f(File.join(@output_directory, "#{transformer_config[:file]}.err"))
+      end
+    end
+    def process_output
+      process_script_output(scraper_config)
+      transformers.each do |transformer_config|
+        process_script_output(transformer_config.merge(:base_directory => @base_directory))
+      end
+    end
+    private
+    def full_interpreter_path
+      if language == "ruby"
+        # Ensure we use the same ruby as the current interpreter when
+        # creating a subshell. Necessary for OSX packaged version.
+        RbConfig.ruby
+      else
+        # Assume the first python in PATH
+        language
+      end
+    end
+    def load_config(directory)
+      manifest_path = File.join(directory, 'manifest.json')
+      raise "Could not find #{manifest_path}" unless File.exist?(manifest_path)
+      begin
+        json = open(manifest_path) {|f| f.read}
+        JSON.parse(json, :symbolize_names => true)
+      rescue JSON::ParserError
+        # TODO provide better error message
+        raise "Could not parse #{manifest_path} as JSON"
+      end
+    end
+    def run_script(script_config, input_file=nil)
+      command = build_command(script_config[:file], input_file)
+      script_runner = ScriptRunner.new(
+        command,
+        output_file(script_config[:file]),
+        script_config,
+        :record_handler => @record_handler,
+        :timeout => @timeout
+      )
+      script_runner.run # returns boolean indicating success
+    end
+    def process_script_output(script_config)
+      # The first argument to the Processor constructor is a nil
+      # Runner. This is because no running behaviour
+      # (e.g. interruptions etc) is required; we just want to do
+      # record handling.
+      processor = Processor.new(nil, script_config, @record_handler)
+      file = output_file(script_config[:file])
+      File.open(file) do |f|
+        f.each_line do |line|
+          processor.process(line)
+        end
+      end
+    rescue Errno::ENOENT => e
+      # We only want to catch ENOENT if the output file doesn't exist, and not
+      # if, for instance, a schema file is missing.
+      raise unless e.message == "No such file or directory - #{output_file(script_config[:file])}"
+    end
+    def build_command(script, input_file=nil)
+      raise "Could not run #{script} with #{language}" unless script_extension == File.extname(script)
+      command = "#{full_interpreter_path} #{additional_args} #{script} >#{output_file(script)}"
+      command << " 2>#{output_file(script, '.err')}" if @log_to_file
+      command << " <#{input_file}" unless input_file.nil?
+      command
+    end
+    def output_file(script, extension='.out')
+      basename = File.basename(script, script_extension)
+      File.join(@output_directory, basename) + extension
+    end
+    def script_extension
+      {
+        'ruby' => '.rb',
+        'python' => '.py',
+      }[language]
+    end
+    def additional_args
+      {
+        'ruby' => "-r#{File.expand_path('../prerun.rb', __FILE__)}",
+        'python' => '-u',
+      }[language]
+    end
+    def scraper_config
+      {
+        :base_directory => @base_directory,
+        :file => scraper_script,
+        :data_type => scraper_data_type,
+        :identifying_fields => scraper_identifying_fields
+      }
+    end
+    def scraper_script
+      "scraper#{script_extension}"
+    end
+    def transformers
+      @config[:transformers] || []
+    end
+    def scraper_output_file
+      File.join(@output_directory, 'scraper.out')
+    end
+    def language
+      @config[:language].downcase
+    end
+    def scraper_data_type
+      @config[:data_type]
+    end
+    def scraper_identifying_fields
+      @config[:identifying_fields]
+    end
+    def assert_absolute_path(path)
+      unless Pathname.new(path).absolute?
+        raise "#{path} must be an absolute path"
+      end
+    end
+  end
+end

data/lib/turbot_runner/script_runner.rb ADDED

@@ -0,0 +1,98 @@
+# This is a useful blog post:
+# http://blog.robseaman.com/2008/12/12/sending-ctrl-c-to-a-subprocess-with-ruby
+# Ensure that SIGINT is ignored by the process running this.
+trap('INT') {}
+module TurbotRunner
+  class ScriptRunner
+    def initialize(command, output_file, script_config, options={})
+      @command = command
+      @output_file = output_file
+      @script_config = script_config
+      record_handler = options[:record_handler] || BaseHandler.new  # A BaseHandler does nothing
+      @processor = Processor.new(self, script_config, record_handler)
+      @timeout = options[:timeout] || 3600
+    end
+    def run
+      Dir.chdir(@script_config[:base_directory]) do
+        begin
+          @interrupted = false
+          @failed = false
+          # Start a thread that spawns a subprocess that runs the script and
+          # redirects the script's output to a file at a known location.
+          script_thread = Thread.new { run_command(@command) }
+          # Wait for the output file to be created, so that we can start to read
+          # from it.
+          begin
+            f = File.open(@output_file, "r")
+          rescue Errno::ENOENT
+            sleep 0.1
+            retry
+          end
+          # Read from output file buildling up lines byte by byte byte by byte
+          # until either we reach the end of the file and the script has exited, or
+          # @interrupted becomes true.  We cannot use IO#readline here because if
+          # only half a line has been synced to the file by the time we read it,
+          # then the incomplete line will be read, causing chaos down the line.
+          line = ''
+          time_of_last_read = Time.now
+          until @interrupted do
+            byte = f.read(1)
+            if byte.nil?
+              if script_thread.alive?
+                sleep 0.1
+                interrupt_and_mark_as_failed if (Time.now - time_of_last_read) > @timeout
+              else
+                break
+              end
+            elsif byte == "\n"
+              @processor.process(line)
+              time_of_last_read = Time.now
+              line = ''
+            else
+              time_of_last_read = Time.now
+              line << byte
+            end
+          end
+          # script_thread may still be alive if we exited the loop above becuase
+          # @interrupted became true, and so we must kill it.
+          kill_running_processes if script_thread.alive?
+          @failed ? false : script_thread.join.value
+        ensure
+          f.close if f
+        end
+      end
+    end
+    def interrupt
+      @interrupted = true
+    end
+    def interrupt_and_mark_as_failed
+      @interrupted = true
+      @failed = true
+    end
+    private
+    def run_command(command)
+      system(command)
+      # A nil exitstatus indicates that the script was interrupted.  A
+      # termsig of 2 indicates that the script was interrupted by a SIGINT.
+      $?.exitstatus == 0 || ($?.exitstatus.nil? && $?.termsig == 2)
+    end
+    def kill_running_processes
+      # Send SIGINT to each process in the current proceess group, having
+      # already ensured that the current process itself ignores the signal.
+      Process.kill('INT', 0)
+    end
+  end
+end

data/lib/turbot_runner/utils.rb ADDED

@@ -0,0 +1,47 @@
+module TurbotRunner
+  module Utils
+    extend self
+    def deep_copy(thing)
+      Marshal.load(Marshal.dump(thing))
+    end
+    # This turns a hash of the form:
+    #
+    # {
+    #   'a' => {
+    #     'b' => {
+    #       'c' => '123',
+    #       'd' => '124',
+    #     },
+    #     'e' => {
+    #       'f' => '156',
+    #     }
+    #   }
+    # }
+    #
+    # into a hash of the form:
+    #
+    # {
+    #   'a.b.c' => '123',
+    #   'a.b.d' => '124',
+    #   'a.e.f' => '156',
+    # }
+    def flatten(hash)
+      pairs = []
+      hash.each do |k, v|
+        case v
+        when Hash
+          flatten(v).each do |k1, v1|
+            pairs << ["#{k}.#{k1}", v1]
+          end
+        else
+          pairs << [k, v]
+        end
+      end
+      Hash[pairs]
+    end
+  end
+end