RubyGems - openc_bot - Versions diffs - 0.0.11 - Mend

openc_bot 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85) hide show

checksums.yaml +7 -0
data/.gitignore +22 -0
data/.travis.yml +8 -0
data/CHANGELOG.md +2 -0
data/Gemfile +8 -0
data/LICENSE.txt +22 -0
data/README.md +253 -0
data/Rakefile +14 -0
data/bin/openc_bot +13 -0
data/create_bot.sh +30 -0
data/create_company_bot.sh +16 -0
data/create_simple_licence_bot.sh +31 -0
data/db/.gitkeep +0 -0
data/examples/basic/.gitignore +3 -0
data/examples/basic/Gemfile +7 -0
data/examples/basic/config.yml +21 -0
data/examples/basic/lib/basic.rb +88 -0
data/examples/basic_with_proxy/Gemfile +7 -0
data/examples/basic_with_proxy/config.yml +21 -0
data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
data/examples/bot_with_simple_iterator/Gemfile +6 -0
data/examples/bot_with_simple_iterator/config.yml +21 -0
data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
data/examples/company_fetchers/basic.rb +49 -0
data/lib/monkey_patches/mechanize.rb +53 -0
data/lib/openc_bot.rb +89 -0
data/lib/openc_bot/bot_data_validator.rb +18 -0
data/lib/openc_bot/company_fetcher_bot.rb +40 -0
data/lib/openc_bot/exceptions.rb +17 -0
data/lib/openc_bot/helpers/_csv.rb +10 -0
data/lib/openc_bot/helpers/alpha_search.rb +73 -0
data/lib/openc_bot/helpers/dates.rb +33 -0
data/lib/openc_bot/helpers/html.rb +8 -0
data/lib/openc_bot/helpers/incremental_search.rb +106 -0
data/lib/openc_bot/helpers/register_methods.rb +205 -0
data/lib/openc_bot/helpers/text.rb +18 -0
data/lib/openc_bot/incrementers.rb +2 -0
data/lib/openc_bot/incrementers/base.rb +214 -0
data/lib/openc_bot/incrementers/common.rb +47 -0
data/lib/openc_bot/tasks.rb +385 -0
data/lib/openc_bot/templates/README.md +35 -0
data/lib/openc_bot/templates/bin/export_data +28 -0
data/lib/openc_bot/templates/bin/fetch_data +23 -0
data/lib/openc_bot/templates/bin/verify_data +1 -0
data/lib/openc_bot/templates/config.yml +21 -0
data/lib/openc_bot/templates/lib/bot.rb +43 -0
data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
data/lib/openc_bot/version.rb +3 -0
data/lib/simple_openc_bot.rb +289 -0
data/openc_bot.gemspec +35 -0
data/schemas/company-schema.json +112 -0
data/schemas/includes/address.json +23 -0
data/schemas/includes/base-statement.json +27 -0
data/schemas/includes/company.json +14 -0
data/schemas/includes/filing.json +20 -0
data/schemas/includes/license-data.json +27 -0
data/schemas/includes/officer.json +14 -0
data/schemas/includes/previous_name.json +11 -0
data/schemas/includes/share-parcel-data.json +67 -0
data/schemas/includes/share-parcel.json +60 -0
data/schemas/includes/subsidiary-relationship-data.json +52 -0
data/schemas/includes/total-shares.json +10 -0
data/schemas/licence-schema.json +21 -0
data/schemas/share-parcel-schema.json +21 -0
data/schemas/subsidiary-relationship-schema.json +19 -0
data/spec/dummy_classes/foo_bot.rb +4 -0
data/spec/lib/bot_data_validator_spec.rb +69 -0
data/spec/lib/company_fetcher_bot_spec.rb +93 -0
data/spec/lib/exceptions_spec.rb +25 -0
data/spec/lib/helpers/alpha_search_spec.rb +173 -0
data/spec/lib/helpers/dates_spec.rb +65 -0
data/spec/lib/helpers/incremental_search_spec.rb +471 -0
data/spec/lib/helpers/register_methods_spec.rb +558 -0
data/spec/lib/helpers/text_spec.rb +50 -0
data/spec/lib/openc_bot/db/.gitkeep +0 -0
data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
data/spec/lib/openc_bot_spec.rb +116 -0
data/spec/schemas/company-schema_spec.rb +676 -0
data/spec/simple_openc_bot_spec.rb +302 -0
data/spec/spec_helper.rb +19 -0
metadata +300 -0

data/lib/monkey_patches/mechanize.rb ADDED Viewed

@@ -0,0 +1,53 @@
# Reopens Mechanize's HTTP agent to wrap #fetch with retry handling for the
# "too many connection resets" error.
class Mechanize::HTTP::Agent
  # Maximum number of retries for a single request after a reset error.
  MAX_RESET_RETRIES = 10

  # We need to replace the core Mechanize HTTP method:
  #
  #   Mechanize::HTTP::Agent#fetch
  #
  # with a wrapper that handles the infamous "too many connection resets"
  # Mechanize bug that is described here:
  #
  #   https://github.com/sparklemotion/mechanize/issues/123
  #
  # The wrapper shuts down the persistent HTTP connection when it fails with
  # this error, and simply tries again. In practice, this only ever needs to
  # be retried once, but I am going to let it retry a few times
  # (MAX_RESET_RETRIES), just in case.
  #
  # Takes the same arguments as the wrapped #fetch and passes them through
  # unchanged. Returns whatever the wrapped #fetch returns. Any
  # Net::HTTP::Persistent::Error whose message does not mention
  # "too many connection resets" is re-raised immediately, as is the reset
  # error itself once MAX_RESET_RETRIES retries have been used up.
  def fetch_with_retry(
    uri,
    method    = :get,
    headers   = {},
    params    = [],
    referer   = current_page,
    redirects = 0
  )
    action      = "#{method.to_s.upcase} #{uri}"
    retry_count = 0
    begin
      fetch_without_retry(uri, method, headers, params, referer, redirects)
    rescue Net::HTTP::Persistent::Error => e
      # Pass on any other type of error.
      raise unless e.message =~ /too many connection resets/
      # Pass on the error if we've tried too many times.
      if retry_count >= MAX_RESET_RETRIES
        puts "**** WARN: Mechanize retried connection reset #{MAX_RESET_RETRIES} times and never succeeded: #{action}"
        raise
      end
      # Otherwise, shutdown the persistent HTTP connection and try again.
      puts "**** WARN: Mechanize retrying connection reset error: #{action}"
      retry_count += 1
      http.shutdown
      retry
    end
  end

  # Alias so #fetch actually uses our new #fetch_with_retry to wrap the
  # old one aliased as #fetch_without_retry.
  #
  # Only do this once: if this file is loaded a second time, #fetch is already
  # the wrapper, and aliasing it to #fetch_without_retry again would make the
  # wrapper call itself until the stack overflows.
  unless method_defined?(:fetch_without_retry) || private_method_defined?(:fetch_without_retry)
    alias_method :fetch_without_retry, :fetch
    alias_method :fetch, :fetch_with_retry
  end
end

data/lib/openc_bot.rb ADDED Viewed

@@ -0,0 +1,89 @@
+# encoding: UTF-8
+require 'openc_bot/version'
+require 'json'
+require 'scraperwiki'
+require_relative 'openc_bot/bot_data_validator'
+require 'openc_bot/helpers/text'
+require 'openc_bot/exceptions'
# Core mixin for bots. A bot extends this module to get the persistence,
# export and database helpers defined below; the directory of the extending
# file is captured at that moment (see .extended).
module OpencBot
  class OpencBotError < StandardError; end
  class DatabaseError < OpencBotError; end
  class InvalidDataError < OpencBotError; end
  class NotFoundError < OpencBotError; end

  include ScraperWiki
  # Included by default, as some of its methods previously lived in the main openc_bot file
  include Helpers::Text

  # Delegates to the #insert_or_update method of the bot's SqliteMagic
  # connection, targeting the 'ocdata' table unless tbl_name is given.
  def insert_or_update(uniq_keys, values_hash, tbl_name='ocdata')
    sqlite_magic_connection.insert_or_update(uniq_keys, values_hash, tbl_name)
  end

  # Passes its arguments straight to #save_sqlite, which is not defined in
  # this file (presumably supplied by the ScraperWiki mixin).
  def save_data(uniq_keys, values_array, tbl_name='ocdata')
    save_sqlite(uniq_keys, values_array, tbl_name)
  end

  # Stores report_hash, serialised as JSON, together with the current time in
  # the :ocrunreports table, keyed on :run_at.
  def save_run_report(report_hash)
    json_report = report_hash.to_json
    save_data([:run_at], { :report => json_report, :run_at => Time.now.to_s }, :ocrunreports)
  end

  # Returns the root directory of the bot (not this gem).
  # Assumes the bot file that extends its functionality using this bot is in a directory (lib) inside the root directory
  def root_directory
    @@app_directory
  end

  # Runs an empty transaction ("BEGIN TRANSACTION; END;") on the connection.
  def unlock_database
    sqlite_magic_connection.execute("BEGIN TRANSACTION; END;")
  end

  # Convenience method that returns a truthy value if the VERBOSE environment
  # variable is set (at the moment whatever it is set to). The value returned
  # is the variable's content, or nil when it is unset.
  def verbose?
    ENV['VERBOSE']
  end

  # Writes each record returned by #export_data as one line of JSON to
  # $stdout, flushing after every record. #export_data is not defined in this
  # module, so the bot is expected to supply it.
  def export(opts={})
    export_data(opts).each do |record|
      $stdout.puts record.to_json
      $stdout.flush
    end
  end

  # Pretty-prints the result of #spotcheck_data as JSON to $stdout.
  # #spotcheck_data is not defined in this module.
  def spotcheck
    $stdout.puts JSON.pretty_generate(spotcheck_data)
  end

  # When deciding on the location of the SQLite databases we need to
  # set the directory relative to the directory of the file/app that
  # includes the gem, not the gem itself.  Doing it this way, and
  # setting a class variable feels ugly, but this appears to be
  # difficult in Ruby, esp as the file may ultimately be called by
  # another process, e.g. the main OpenCorporates app or the console,
  # whose main directory is unrelated to where the databases are
  # stored (which means we can't use Dir.pwd etc). The only time we
  # know about the directory is when the module is called to extend
  # the file, and we capture that in the @@app_directory class variable
  def self.extended(obj)
    location = caller[0]
    # A caller entry looks like "path:line:in ...". Cut at the ":<digits>"
    # line marker rather than at the first colon, so a path that itself
    # contains a colon (e.g. a Windows drive letter, "C:/bots/lib/x.rb")
    # is not truncated.
    path = location[/\A(.+?):\d+/, 1] || location.partition(":").first
    @@app_directory = File.expand_path(File.join(File.dirname(path), '..'))
  end

  # Name of the SQLite file: the downcased name of the module itself when the
  # receiver is a Module, otherwise the downcased name of its class.
  def db_name
    if is_a?(Module)
      "#{self.name.downcase}.db"
    else
      "#{self.class.name.downcase}.db"
    end
  end

  # Override default in ScraperWiki gem
  #
  # Returns a memoized SqliteMagic::Connection. The database path is
  # @config[:db] when @config is set, otherwise db/<db_name> under the bot's
  # root directory. The path is only worked out when the connection is first
  # created, so later calls neither rebuild it nor touch @@app_directory.
  def sqlite_magic_connection
    @sqlite_magic_connection ||= begin
      db = @config ? @config[:db] : File.expand_path(File.join(@@app_directory, 'db', db_name))
      SqliteMagic::Connection.new(db)
    end
  end
end

data/lib/openc_bot/bot_data_validator.rb ADDED Viewed

@@ -0,0 +1,18 @@
+# encoding: UTF-8
module OpencBot
  # Structural sanity check for the data a bot produces.
  module BotDataValidator
    extend self

    # Returns true when +datum+ is a Hash that has a non-blank
    # datum[:company][:name], a non-blank datum[:source_url] and a non-empty
    # datum[:data] collection in which every item has a non-blank :data_type
    # and non-empty :properties. Returns false in every other case, including
    # when the datum is so malformed that inspecting it raises an error.
    def validate(datum)
      return false unless datum.kind_of?(Hash)

      company_name = datum[:company][:name]
      return false unless company_name && !company_name.strip.empty?
      return false if datum[:source_url].strip.empty?

      data = datum[:data]
      return false if data.empty?

      data.all? { |item| !item[:data_type].to_s.strip.empty? && !item[:properties].empty? }
    rescue StandardError
      # Any problem inspecting the datum means it is invalid. Only
      # StandardError is rescued so that interrupts and exit requests are not
      # swallowed.
      false
    end
  end
end

data/lib/openc_bot/company_fetcher_bot.rb ADDED Viewed

@@ -0,0 +1,40 @@
+require 'openc_bot'
+require 'openc_bot/helpers/incremental_search'
+require 'openc_bot/helpers/alpha_search'
module OpencBot
  # Mixin for company-fetching bots. It pulls in the core OpencBot methods
  # plus the incremental and alpha search helpers, and supplies
  # company-specific defaults on top of them.
  module CompanyFetcherBot
    include OpencBot
    include OpencBot::Helpers::IncrementalSearch
    include OpencBot::Helpers::AlphaSearch

    # This is called by #update_datum
    #
    # Wraps the result of #fetch_registry_page (not defined in this module)
    # in a Hash under the :company_page key.
    def fetch_datum(company_number)
      { :company_page => fetch_registry_page(company_number) }
    end

    # Derives a jurisdiction code from the receiver's name: the text
    # "CompaniesFetcher" is removed, the remainder is underscored, and the
    # result is returned only if it has the form "xx" or "xx_yy" (lower-case
    # letters); otherwise nil.
    def inferred_jurisdiction_code
      candidate = name.sub(/CompaniesFetcher/, '').underscore
      candidate[/^[a-z]{2}$|^[a-z]{2}_[a-z]{2}$/]
    end

    # The column that identifies an entity for this kind of bot.
    def primary_key_name
      :company_number
    end

    # This overrides default #save_entity (defined in RegisterMethods) and adds
    # the inferred jurisdiction_code, unless it is overridden in entity_info
    # Does nothing when entity_info is blank.
    def save_entity(entity_info)
      return if entity_info.blank?

      super({ :jurisdiction_code => inferred_jurisdiction_code }.merge(entity_info))
    end

    # Uses the schema name from the parent implementation when it returns
    # one, and falls back to 'company-schema' otherwise.
    def schema_name
      from_parent = super
      from_parent || 'company-schema'
    end
  end
end

data/lib/openc_bot/exceptions.rb ADDED Viewed

@@ -0,0 +1,17 @@
module OpencBot
  # Generic Error class for OpencBot exceptions
  class OpencBotError < StandardError; end

  #
  # Raised by <tt>save_entity!</tt> when the record is invalid.
  # Use the +validation_errors+ method to retrieve the, er, validation errors.
  class RecordInvalid < OpencBotError
    attr_reader :validation_errors

    # validation_errors - the errors that made the record invalid. They are
    #                     kept as given for #validation_errors and are also
    #                     summarised in the exception message, so that an
    #                     unrescued error or a log line shows what failed
    #                     instead of just the class name.
    def initialize(validation_errors)
      @validation_errors = validation_errors
      details = validation_errors.respond_to?(:join) ? validation_errors.join('; ') : validation_errors.to_s
      super("Validation failed: #{details}")
    end
  end
end

data/lib/openc_bot/helpers/_csv.rb ADDED Viewed

@@ -0,0 +1,10 @@
# This is in _csv.rb to avoid requiring it when we mean to require the system
# csv library.
module OpencBot
  module Helpers
    # Placeholder namespace for CSV helpers; it defines no methods yet.
    module Csv
      # This module will eventually hold some helper methods for
      # dealing with CSV content
    end
  end
end

data/lib/openc_bot/helpers/alpha_search.rb ADDED Viewed

@@ -0,0 +1,73 @@
+# encoding: UTF-8
+require 'openc_bot/helpers/register_methods'
module OpencBot
  module Helpers
    # Helpers for bots that find entities by searching a register for every
    # combination of letters and digits of a fixed length.
    module AlphaSearch
      include OpencBot::Helpers::RegisterMethods

      # Returns every search term (all repeated permutations of
      # #letters_and_numbers of length #numbers_of_chars_in_search, joined
      # into strings), starting from +starting_term+ when it is given and is
      # one of the terms, or from the first term otherwise.
      def alpha_terms(starting_term=nil)
        all_perms = letters_and_numbers.repeated_permutation(numbers_of_chars_in_search).map(&:join)
        # get starting position from given term
        starting_position = starting_term && all_perms.index(starting_term)
        # start from starting_position if we have it or from start of array (pos 0) if not
        # (nil.to_i is 0, so an unknown or missing term also starts at 0)
        all_perms[starting_position.to_i..-1]
      end

      # Works through the search terms, saving every entity yielded by
      # #search_for_entities_for_term. The term being processed is stored in
      # the 'starting_term' variable before it is searched, so a later run can
      # pick up from it; the variable is cleared once all terms are done.
      # options[:starting_term] takes precedence over the stored variable.
      def fetch_data_via_alpha_search(options={})
        starting_term = options[:starting_term] || get_var('starting_term')
        each_search_term(starting_term) do |term|
          save_var('starting_term', term)
          search_for_entities_for_term(term, options) do |entity_datum|
            save_entity(entity_datum)
          end
        end
        # reset pointer
        save_var('starting_term', nil)
      end

      # Iterates through each search term, yielding the result to a block, or returning
      # the array of search_terms if no block given
      def each_search_term(starting_term=nil)
        alpha_terms(starting_term).each { |term| yield term if block_given? }
      end

      # The characters search terms are built from: 'A'-'Z' then '0'-'9'.
      def letters_and_numbers
        ('A'..'Z').to_a + ('0'..'9').to_a
      end

      # Length of each search term: the receiver's NUMBER_OF_CHARS_IN_SEARCH
      # constant when it defines one, otherwise 1.
      def numbers_of_chars_in_search
        self.const_defined?('NUMBER_OF_CHARS_IN_SEARCH') ? self.const_get('NUMBER_OF_CHARS_IN_SEARCH') : 1
      end

      # Hook to be overridden by the bot; this default always raises.
      def search_for_entities_for_term(term, options={})
        raise "The #search_for_entities_for_term method has not been implemented for this case.\nIt needs to be, and should yield a company data Hash"
      end

      # Pages through the search results for +prefix+, ten at a time, starting
      # at +search_offset+, until a page without company links is returned.
      # Each results page is passed to #scrape_search_results_page (not
      # defined in this module) and the offset reached is stored in the
      # 'search_offset' variable.
      #
      # NOTE(review): the URL is hard-coded to one specific register, and the
      # bare open(url) presumably relies on open-uri having been loaded
      # elsewhere - confirm before reusing this for another source.
      # NOTE(review): a page that keeps failing is retried indefinitely with
      # the same offset.
      def get_results_and_extract_data_for(prefix, search_offset)
        while search_offset do
          url = "http://www.oera.li/WebServices/ZefixFL/ZefixFL.asmx/SearchFirm?name=#{prefix}%20&suche_nach=-&rf=&sitz=&id=&language=&phonetisch=no&posMin=#{search_offset}"
          response =
            begin
              open(url).read.encode!('utf-8','iso-8859-1')
            rescue StandardError, Timeout::Error => e
              # Deliberately not rescuing Exception: because a failed page is
              # retried, swallowing Interrupt/SystemExit here would make the
              # loop impossible to stop from the keyboard.
              puts "Problem getting/parsing data from #{url}: #{e.inspect}"
              nil
            end
          next unless response
          if response.match(/webservices\/HRG/) # check has links to companies
            puts "****Scraping page #{(search_offset+10)/10}"
            scrape_search_results_page(response, url)
            save_var('search_offset', search_offset)
            search_offset += 10
          else
            search_offset = false
          end
        end
      end
    end
  end
end

data/lib/openc_bot/helpers/dates.rb ADDED Viewed

@@ -0,0 +1,33 @@
+# encoding: UTF-8
+require 'date'
module OpencBot
  module Helpers
    # Helpers that turn loosely formatted dates into ISO 8601 (YYYY-MM-DD)
    # strings.
    module Dates
      extend self

      # Matches a leading month/day/year date written with slashes; the year
      # may have four or two digits.
      AMERICAN_DATE_RE = %r_\A\s*(\d{1,2})/(\d{1,2})/(\d{4}|\d{2})_.freeze

      # Normalises a day-first date.
      #
      # raw_date - a String, or any other object whose #to_s already gives
      #            the wanted form (e.g. a Date). Strings of the form d/m/y
      #            (whitespace is ignored when recognising the form) have
      #            their slashes replaced with dashes, and a two-digit year
      #            is expanded: 00-19 becomes 20xx and 90-99 becomes 19xx.
      #
      # Returns the date as a String, or nil if raw_date is nil or blank.
      # Raises whatever Date.parse raises for a string it cannot parse.
      def normalise_uk_date(raw_date)
        return if raw_date.nil? or raw_date.to_s.strip.empty?
        if raw_date.is_a?(String)
          # Recognise AND convert the whitespace-free form. Converting the
          # untouched string instead would leave stray spaces that stop the
          # anchored century expansions below from matching, so " 01/02/13"
          # would be read as 2001-02-13.
          compact_date = raw_date.gsub(/\s+/, '')
          cleaned_up_date = compact_date.match(/^\d+\/[\d\w]+\/\d+$/) ? compact_date.gsub('/', '-') : raw_date
          with_century = cleaned_up_date.
            sub(/^(\d{1,2}-)([\w\d]+-)([01]\d)$/, '\1\220\3').
            sub(/^(\d{1,2}-)([\w\d]+-)([9]\d)$/, '\1\219\3')
          raw_date = to_date(with_century)
        end
        raw_date.to_s
      end

      # Normalises a month-first date.
      #
      # raw_date - anything whose #to_s is a date; a leading m/d/y written
      #            with slashes is rearranged to y-m-d before parsing.
      #
      # Returns the date as a String, or nil if raw_date is nil or blank.
      # Raises whatever Date.parse raises for a string it cannot parse.
      def normalise_us_date(raw_date)
        return if raw_date.nil? or raw_date.to_s.strip.empty?
        # we want to set century to 19 if there's none set and the years are in the 20s or later
        with_century = raw_date.to_s.sub(/^(\s*\d{1,2}[\/-]\d{1,2}[\/-])([2-9]\d)$/, '\119\2')
        iso_date = with_century.sub(AMERICAN_DATE_RE, '\3-\1-\2')
        to_date(iso_date, true).to_s
      end

      private

      # Parses +date+ with Date.parse, passing +comp+ through as its second
      # argument. Returns nil for nil.
      def to_date(date, comp=false)
        return if date.nil?
        Date.parse(date, comp)
      end
    end
  end
end

data/lib/openc_bot/helpers/html.rb ADDED Viewed

@@ -0,0 +1,8 @@
module OpencBot
  module Helpers
    # Placeholder namespace for HTML helpers; it defines no methods yet.
    module Html
      # Helper methods for working with HTML content are
      # intended to be added here in due course
    end
  end
end

data/lib/openc_bot/helpers/incremental_search.rb ADDED Viewed

@@ -0,0 +1,106 @@
+# encoding: UTF-8
+require 'openc_bot/helpers/register_methods'
module OpencBot
  module Helpers
    # Helpers for bots that find new entities by stepping through consecutive
    # uids, starting from the highest uid already stored.
    #
    # Relies on methods that are not defined in this module (#get_var,
    # #save_var, #select, #primary_key_name, #update_datum, #datum_exists?,
    # #verbose?) - presumably supplied by RegisterMethods and the bot itself.
    module IncrementalSearch
      include OpencBot::Helpers::RegisterMethods

      # Gets new records using an incremental search
      #
      # options - :highest_entry_uids overrides the stored starting uids; the
      #           remaining options are handed to #incremental_search.
      #
      # Runs #incremental_search once per starting uid and stores the results
      # in the :highest_entry_uids variable. Returns nil without doing
      # anything when there are no starting uids.
      def fetch_data_via_incremental_search(options={})
        # work on a copy so the caller's hash is left untouched
        options = options.dup
        old_highest_numbers = options.delete(:highest_entry_uids) || highest_entry_uids
        return unless old_highest_numbers
        # offset by rewind count if set and also in that case assume by default we want to skip_existing_companies
        rewind_count = incremental_rewind_count
        options = {:offset => (0 - rewind_count), :skip_existing_entries => true}.merge(options) if rewind_count
        new_highest_numbers = old_highest_numbers.collect do |old_highest_number|
          incremental_search(old_highest_number, options)
        end
        save_var(:highest_entry_uids, new_highest_numbers)
      end

      # Returns the highest known uids: the stored 'highest_entry_uids'
      # variable when it is complete, otherwise one freshly queried uid per
      # entry in #entity_uid_prefixes (nil results dropped). Returns nil when
      # that leaves nothing.
      #
      # NOTE(review): force_get is accepted but never read.
      def highest_entry_uids(force_get = false)
        bad_results = []
        results = get_var('highest_entry_uids')
        if results.nil? || results.empty? || (results.is_a?(Array) && results.any?{ |r| r.nil? || r.empty? })
          results = entity_uid_prefixes.collect do |prefix|
            hcn = highest_entry_uid_result(:prefix => prefix)
            bad_results << prefix if (hcn.nil? || hcn.empty?)
            hcn
          end
        end
        results.compact! unless bad_results.empty?
        return results unless results.empty?
      end

      # Queries the ocdata table for the highest uid, optionally restricted
      # to uids with the given :prefix or :suffix.
      #
      # Returns the uid, nil when the table holds no matching row, or
      # "<prefix>0" when the table does not exist yet.
      def highest_entry_uid_result(options={})
        if options[:prefix]
          sql_query = ["ocdata.#{primary_key_name} FROM ocdata WHERE #{primary_key_name} LIKE ? ORDER BY cast(substr(#{primary_key_name},?) as real) DESC LIMIT 1", ["#{options[:prefix]}%", options[:prefix].length + 1]]
        elsif options[:suffix]
          sql_query = ["ocdata.#{primary_key_name} FROM ocdata WHERE #{primary_key_name} LIKE ? ORDER BY cast(#{primary_key_name} as real) DESC LIMIT 1", "%#{options[:suffix]}"]
        else
          sql_query = "ocdata.#{primary_key_name} FROM ocdata ORDER BY cast(#{primary_key_name} as real) DESC LIMIT 1"
        end
        # an existing table with no matching rows gives no first row; report
        # that as nil (which #highest_entry_uids treats as a bad result)
        # rather than failing on nil
        row = select(*sql_query).first
        row && row[primary_key_name.to_s]
      rescue SqliteMagic::NoSuchTable
        # first run, so no table or database yet
        return "#{options[:prefix]}0"
      end

      # The receiver's INCREMENTAL_REWIND_COUNT constant, or nil if undefined.
      def incremental_rewind_count
        self.const_defined?('INCREMENTAL_REWIND_COUNT') ? self.const_get('INCREMENTAL_REWIND_COUNT') : nil
      end

      # The receiver's ENTITY_UID_PREFIXES constant, or [nil] if undefined.
      def entity_uid_prefixes
        self.const_defined?('ENTITY_UID_PREFIXES') ? self.const_get('ENTITY_UID_PREFIXES') : [nil]
      end

      # The receiver's ENTITY_UID_SUFFIXES constant, or [nil] if undefined.
      def entity_uid_suffixes
        self.const_defined?('ENTITY_UID_SUFFIXES') ? self.const_get('ENTITY_UID_SUFFIXES') : [nil]
      end

      # Steps through consecutive uids from +uid+, calling #update_datum for
      # each, until more than #max_failed_count lookups in a row have failed.
      #
      # options - :offset shifts the starting uid by that amount;
      #           :skip_existing_entries skips uids for which #datum_exists?
      #           is true. The hash is only read, never modified, because the
      #           same hash is passed in for every starting uid.
      #
      # Returns the last uid that was updated successfully (as a String), or
      # the original uid if none was.
      def incremental_search(uid, options={})
        first_number = uid.dup
        error_count = 0
        last_good_co_no = nil
        skip_existing_entries = options[:skip_existing_entries]
        # start at given number but offset by given amount. i.e. by offset
        uid = increment_number(uid, options[:offset]) if options[:offset]
        loop do
          current_number = uid
          if skip_existing_entries and datum_exists?(uid)
            uid = increment_number(uid)
            error_count = 0 # reset error count
            next
          elsif update_datum(current_number, false)
            last_good_co_no = current_number
            error_count = 0 # reset error count
          else
            error_count += 1
            puts "Failed to find company with uid #{current_number}. Error count: #{error_count}" if verbose?
            break if error_count > max_failed_count
          end
          uid = increment_number(uid)
        end
        # return orig uid if we haven't had any new entities
        last_good_co_no ? last_good_co_no.to_s : first_number
      end

      # Returns +uid+ as a String with its first run of digits changed by
      # +increment_amount+. The digit run keeps its width (zero-padded),
      # except that a run without a leading zero may shrink when counting
      # down (e.g. "100" - 1 gives "99").
      def increment_number(uid, increment_amount=1)
        uid.to_s.sub(/\d+/) do |digits|
          # never go below zero: a negative value would be written with a
          # minus sign in front of the digits, and the next increment would
          # then act on those digits alone and move further away from zero
          incremented_number = [digits.to_i + increment_amount, 0].max
          length = digits.length
          length = incremented_number.to_s.length if increment_amount < 0 and not digits[/^0/]
          sprintf("%0#{length}d", incremented_number)
        end
      end

      # The receiver's MAX_FAILED_COUNT constant, or 10 if undefined.
      def max_failed_count
        self.const_defined?('MAX_FAILED_COUNT') ? self.const_get('MAX_FAILED_COUNT') : 10
      end
    end
  end
end