openc_bot 0.0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.travis.yml +8 -0
- data/CHANGELOG.md +2 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +22 -0
- data/README.md +253 -0
- data/Rakefile +14 -0
- data/bin/openc_bot +13 -0
- data/create_bot.sh +30 -0
- data/create_company_bot.sh +16 -0
- data/create_simple_licence_bot.sh +31 -0
- data/db/.gitkeep +0 -0
- data/examples/basic/.gitignore +3 -0
- data/examples/basic/Gemfile +7 -0
- data/examples/basic/config.yml +21 -0
- data/examples/basic/lib/basic.rb +88 -0
- data/examples/basic_with_proxy/Gemfile +7 -0
- data/examples/basic_with_proxy/config.yml +21 -0
- data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
- data/examples/bot_with_simple_iterator/Gemfile +6 -0
- data/examples/bot_with_simple_iterator/config.yml +21 -0
- data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
- data/examples/company_fetchers/basic.rb +49 -0
- data/lib/monkey_patches/mechanize.rb +53 -0
- data/lib/openc_bot.rb +89 -0
- data/lib/openc_bot/bot_data_validator.rb +18 -0
- data/lib/openc_bot/company_fetcher_bot.rb +40 -0
- data/lib/openc_bot/exceptions.rb +17 -0
- data/lib/openc_bot/helpers/_csv.rb +10 -0
- data/lib/openc_bot/helpers/alpha_search.rb +73 -0
- data/lib/openc_bot/helpers/dates.rb +33 -0
- data/lib/openc_bot/helpers/html.rb +8 -0
- data/lib/openc_bot/helpers/incremental_search.rb +106 -0
- data/lib/openc_bot/helpers/register_methods.rb +205 -0
- data/lib/openc_bot/helpers/text.rb +18 -0
- data/lib/openc_bot/incrementers.rb +2 -0
- data/lib/openc_bot/incrementers/base.rb +214 -0
- data/lib/openc_bot/incrementers/common.rb +47 -0
- data/lib/openc_bot/tasks.rb +385 -0
- data/lib/openc_bot/templates/README.md +35 -0
- data/lib/openc_bot/templates/bin/export_data +28 -0
- data/lib/openc_bot/templates/bin/fetch_data +23 -0
- data/lib/openc_bot/templates/bin/verify_data +1 -0
- data/lib/openc_bot/templates/config.yml +21 -0
- data/lib/openc_bot/templates/lib/bot.rb +43 -0
- data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
- data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
- data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
- data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
- data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
- data/lib/openc_bot/version.rb +3 -0
- data/lib/simple_openc_bot.rb +289 -0
- data/openc_bot.gemspec +35 -0
- data/schemas/company-schema.json +112 -0
- data/schemas/includes/address.json +23 -0
- data/schemas/includes/base-statement.json +27 -0
- data/schemas/includes/company.json +14 -0
- data/schemas/includes/filing.json +20 -0
- data/schemas/includes/license-data.json +27 -0
- data/schemas/includes/officer.json +14 -0
- data/schemas/includes/previous_name.json +11 -0
- data/schemas/includes/share-parcel-data.json +67 -0
- data/schemas/includes/share-parcel.json +60 -0
- data/schemas/includes/subsidiary-relationship-data.json +52 -0
- data/schemas/includes/total-shares.json +10 -0
- data/schemas/licence-schema.json +21 -0
- data/schemas/share-parcel-schema.json +21 -0
- data/schemas/subsidiary-relationship-schema.json +19 -0
- data/spec/dummy_classes/foo_bot.rb +4 -0
- data/spec/lib/bot_data_validator_spec.rb +69 -0
- data/spec/lib/company_fetcher_bot_spec.rb +93 -0
- data/spec/lib/exceptions_spec.rb +25 -0
- data/spec/lib/helpers/alpha_search_spec.rb +173 -0
- data/spec/lib/helpers/dates_spec.rb +65 -0
- data/spec/lib/helpers/incremental_search_spec.rb +471 -0
- data/spec/lib/helpers/register_methods_spec.rb +558 -0
- data/spec/lib/helpers/text_spec.rb +50 -0
- data/spec/lib/openc_bot/db/.gitkeep +0 -0
- data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
- data/spec/lib/openc_bot_spec.rb +116 -0
- data/spec/schemas/company-schema_spec.rb +676 -0
- data/spec/simple_openc_bot_spec.rb +302 -0
- data/spec/spec_helper.rb +19 -0
- metadata +300 -0
@@ -0,0 +1,53 @@
|
|
1
|
+
# Monkey patch that wraps Mechanize::HTTP::Agent#fetch so that requests
# failing with "too many connection resets" are retried on a fresh connection.
class Mechanize::HTTP::Agent
  # Maximum number of retries for a single request after connection resets.
  # Guarded so that loading this file twice does not redefine the constant.
  MAX_RESET_RETRIES = 10 unless const_defined?(:MAX_RESET_RETRIES, false)

  # We need to replace the core Mechanize HTTP method:
  #
  #   Mechanize::HTTP::Agent#fetch
  #
  # with a wrapper that handles the infamous "too many connection resets"
  # Mechanize bug that is described here:
  #
  #   https://github.com/sparklemotion/mechanize/issues/123
  #
  # The wrapper shuts down the persistent HTTP connection when it fails with
  # this error, and simply tries again. In practice, this only ever needs to
  # be retried once, but we let it retry a few times (MAX_RESET_RETRIES),
  # just in case.
  #
  # Takes the same positional arguments as the original #fetch and returns
  # whatever the original #fetch returns. Any Net::HTTP::Persistent::Error
  # that is not a connection-reset error, or that persists after
  # MAX_RESET_RETRIES retries, is re-raised unchanged.
  def fetch_with_retry(
    uri,
    method    = :get,
    headers   = {},
    params    = [],
    referer   = current_page,
    redirects = 0
  )
    action      = "#{method.to_s.upcase} #{uri.to_s}"
    retry_count = 0

    begin
      fetch_without_retry(uri, method, headers, params, referer, redirects)
    rescue Net::HTTP::Persistent::Error => e
      # Pass on any other type of error.
      raise unless e.message =~ /too many connection resets/

      # Pass on the error if we've tried too many times.
      if retry_count >= MAX_RESET_RETRIES
        puts "**** WARN: Mechanize retried connection reset #{MAX_RESET_RETRIES} times and never succeeded: #{action}"
        raise
      end

      # Otherwise, shutdown the persistent HTTP connection and try again.
      puts "**** WARN: Mechanize retrying connection reset error: #{action}"
      retry_count += 1
      self.http.shutdown
      retry
    end
  end

  # Alias so #fetch actually uses our new #fetch_with_retry to wrap the
  # old one aliased as #fetch_without_retry.
  #
  # Only alias once: if this file were loaded a second time, #fetch would
  # already be the wrapper, so re-aliasing would make #fetch_without_retry
  # point at the wrapper itself and every request would recurse forever.
  unless method_defined?(:fetch_without_retry)
    alias_method :fetch_without_retry, :fetch
    alias_method :fetch, :fetch_with_retry
  end
end
|
data/lib/openc_bot.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'openc_bot/version'
|
3
|
+
require 'json'
|
4
|
+
require 'scraperwiki'
|
5
|
+
require_relative 'openc_bot/bot_data_validator'
|
6
|
+
require 'openc_bot/helpers/text'
|
7
|
+
require 'openc_bot/exceptions'
|
8
|
+
|
9
|
+
# Core mixin for bots. A bot extends this module to get persistence helpers
# (backed by the ScraperWiki/SqliteMagic connection), export/spotcheck output
# and a per-bot SQLite database located relative to the bot's own directory.
module OpencBot

  class OpencBotError < StandardError;end
  class DatabaseError < OpencBotError;end
  class InvalidDataError < OpencBotError;end
  class NotFoundError < OpencBotError;end

  include ScraperWiki
  # Included by default, as some of these helpers previously lived in the
  # main openc_bot file.
  include Helpers::Text

  # Delegates to the SqliteMagic connection's #insert_or_update for the given
  # table (default 'ocdata').
  def insert_or_update(uniq_keys, values_hash, tbl_name='ocdata')
    sqlite_magic_connection.insert_or_update(uniq_keys, values_hash, tbl_name)
  end

  # Saves the given values via #save_sqlite into the given table
  # (default 'ocdata').
  def save_data(uniq_keys, values_array, tbl_name='ocdata')
    save_sqlite(uniq_keys, values_array, tbl_name)
  end

  # Stores the report hash as JSON in the :ocrunreports table, keyed by the
  # current time.
  def save_run_report(report_hash)
    json_report = report_hash.to_json
    save_data([:run_at], { :report => json_report, :run_at => Time.now.to_s }, :ocrunreports)
  end

  # Returns the root directory of the bot (not this gem).
  # Assumes the bot file that extends its functionality using this bot is in
  # a directory (lib) inside the root directory.
  def root_directory
    @@app_directory
  end

  # Runs an empty transaction against the database connection.
  def unlock_database
    sqlite_magic_connection.execute("BEGIN TRANSACTION; END;")
  end

  # Convenience method that is truthy if the VERBOSE environment variable is
  # set (at the moment whatever it is set to). Returns the raw value, or nil.
  def verbose?
    ENV['VERBOSE']
  end

  # Writes each record returned by #export_data to stdout as one JSON
  # document per line, flushing after every record.
  def export(opts={})
    export_data(opts).each do |record|
      $stdout.puts record.to_json
      $stdout.flush
    end
  end

  # Pretty-prints the result of #spotcheck_data to stdout as JSON.
  def spotcheck
    $stdout.puts JSON.pretty_generate(spotcheck_data)
  end

  # When deciding on the location of the SQLite databases we need to
  # set the directory relative to the directory of the file/app that
  # includes the gem, not the gem itself. Doing it this way, and
  # setting a class variable feels ugly, but this appears to be
  # difficult in Ruby, esp as the file may ultimately be called by
  # another process, e.g. the main OpenCorporates app or the console,
  # whose main directory is unrelated to where the databases are
  # stored (which means we can't use Dir.pwd etc). The only time we
  # know about the directory is when the module is called to extend
  # the file, and we capture that in the @@app_directory class variable.
  def self.extended(obj)
    # A backtrace entry looks like "path:LINE:in `method'". Strip the
    # trailing line/method part rather than splitting on the first colon,
    # so paths that themselves contain a colon (e.g. "C:/bots/lib/bot.rb")
    # are not truncated.
    path = caller[0].sub(/:\d+(?::in .*)?\z/m, '')
    @@app_directory = File.expand_path(File.join(File.dirname(path), '..'))
  end

  # Database file name derived from the downcased module name (when the
  # receiver is a module/class) or from the receiver's class name.
  def db_name
    if is_a?(Module)
      "#{self.name.downcase}.db"
    else
      "#{self.class.name.downcase}.db"
    end
  end

  # Override default in ScraperWiki gem.
  # Uses @config[:db] when @config is set, otherwise <app dir>/db/<db_name>.
  # The path is only worked out when the connection is first created, so a
  # memoized connection never needs @@app_directory again.
  def sqlite_magic_connection
    @sqlite_magic_connection ||= begin
      db = @config ? @config[:db] : File.expand_path(File.join(@@app_directory, 'db', db_name))
      SqliteMagic::Connection.new(db)
    end
  end

end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module OpencBot
  # Minimal structural validation for a datum produced by a bot.
  module BotDataValidator

    extend self

    # Returns a truthy value when +datum+ is a Hash that has:
    # * a non-blank datum[:company][:name]
    # * a non-blank datum[:source_url]
    # * a non-empty datum[:data] collection in which every item has a
    #   non-blank :data_type and non-empty :properties
    #
    # Returns a falsy value (false or nil) otherwise. Any ordinary error
    # raised while inspecting the datum (e.g. a missing key leading to a
    # NoMethodError on nil) also means the datum is invalid.
    def validate(datum)
      datum.kind_of?(Hash) &&
        datum[:company][:name] &&
        !datum[:company][:name].strip.empty? &&
        !datum[:source_url].strip.empty? &&
        !datum[:data].empty? &&
        datum[:data].all? { |data| !data[:data_type].to_s.strip.empty? && !data[:properties].empty? }
    rescue StandardError
      # Any probs then it's invalid. Only StandardError is rescued so that
      # Interrupt, SystemExit and similar are not silently swallowed.
      false
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'openc_bot'
|
2
|
+
require 'openc_bot/helpers/incremental_search'
|
3
|
+
require 'openc_bot/helpers/alpha_search'
|
4
|
+
|
5
|
+
|
6
|
+
module OpencBot
  # Mixin for company-fetching bots. Combines the core OpencBot behaviour
  # with the incremental and alpha search helpers and supplies
  # company-specific defaults.
  module CompanyFetcherBot
    include OpencBot
    include OpencBot::Helpers::IncrementalSearch
    include OpencBot::Helpers::AlphaSearch

    # Called by #update_datum. Fetches the registry page for the given
    # company number and wraps it in a hash under :company_page.
    def fetch_datum(company_number)
      { :company_page => fetch_registry_page(company_number) }
    end

    # Derives a jurisdiction code from the receiver's name: removes
    # "CompaniesFetcher", underscores what is left and returns it only when
    # it looks like "xx" or "xx_yy" (lower-case letters); otherwise nil.
    def inferred_jurisdiction_code
      candidate = name.sub(/CompaniesFetcher/, '').underscore
      candidate[/^[a-z]{2}$|^[a-z]{2}_[a-z]{2}$/]
    end

    # Name of the primary key used for company records.
    def primary_key_name
      :company_number
    end

    # Overrides the default #save_entity (defined in RegisterMethods) to add
    # the inferred jurisdiction_code, unless entity_info already supplies
    # one. Does nothing when entity_info is blank.
    def save_entity(entity_info)
      return if entity_info.blank?

      with_defaults = { :jurisdiction_code => inferred_jurisdiction_code }.merge(entity_info)
      super(with_defaults)
    end

    # Schema name from the parent implementation, falling back to
    # 'company-schema' when that returns nothing.
    def schema_name
      inherited = super
      inherited || 'company-schema'
    end

  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module OpencBot

  # Generic Error class for OpencBot exceptions
  class OpencBotError < StandardError;end

  #
  # Raised by <tt>save_entity!</tt> when the record is invalid.
  # Use the +validation_errors+ method to retrieve the validation errors.
  class RecordInvalid < OpencBotError
    attr_reader :validation_errors

    # validation_errors:: the errors describing why the record is invalid.
    #                     Stored as given and also summarised in the
    #                     exception message, so that logs and backtraces
    #                     show what failed rather than just the class name.
    def initialize(validation_errors)
      @validation_errors = validation_errors
      details = validation_errors.respond_to?(:join) ? validation_errors.join('; ') : validation_errors.to_s
      super(details.empty? ? 'Validation failed' : "Validation failed: #{details}")
    end
  end

end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'openc_bot/helpers/register_methods'
|
3
|
+
|
4
|
+
module OpencBot
  module Helpers
    # Helpers for fetching entities by searching a register for every
    # alphanumeric term of a given length.
    module AlphaSearch

      include OpencBot::Helpers::RegisterMethods

      # Returns every search term (all repeated permutations of
      # #letters_and_numbers of length #numbers_of_chars_in_search, joined
      # into strings), starting from +starting_term+ when it is given and
      # found, otherwise from the first term.
      def alpha_terms(starting_term=nil)
        all_perms = letters_and_numbers.repeated_permutation(numbers_of_chars_in_search).
          collect(&:join)
        # get starting position from given term
        starting_position = starting_term && all_perms.index(starting_term)
        # start from starting_position if we have it or from start of array (pos 0) if not
        all_perms[starting_position.to_i..-1]
      end

      # Runs an alpha search, saving every entity yielded by
      # #search_for_entities_for_term. The current term is stored in the
      # 'starting_term' variable before each search so a later run can
      # resume from it; the variable is cleared once all terms are done.
      def fetch_data_via_alpha_search(options={})
        starting_term = options[:starting_term]||get_var('starting_term')
        each_search_term(starting_term) do |term|
          save_var('starting_term', term)
          search_for_entities_for_term(term, options) do |entity_datum|
            save_entity(entity_datum)
          end
        end
        # reset pointer
        save_var('starting_term',nil)
      end

      # Iterates through each search term, yielding it to the block when one
      # is given. Returns the array of search terms.
      def each_search_term(starting_term=nil)
        alpha_terms(starting_term).each{ |t| yield t if block_given?}
      end

      # Characters used to build search terms: 'A'..'Z' followed by '0'..'9'.
      def letters_and_numbers
        ('A'..'Z').to_a + ('0'..'9').to_a
      end

      # Length of each search term: the receiver's NUMBER_OF_CHARS_IN_SEARCH
      # constant when defined, otherwise 1.
      def numbers_of_chars_in_search
        self.const_defined?('NUMBER_OF_CHARS_IN_SEARCH') ? self.const_get('NUMBER_OF_CHARS_IN_SEARCH') : 1
      end

      # Placeholder to be overridden by the bot; must yield a company data
      # Hash for each entity found for +term+.
      def search_for_entities_for_term(term, options={})
        raise "The #search_for_entities_for_term method has not been implemented for this case.\nIt needs to be, and should yield a company data Hash"
      end

      # Pages through the search results for +prefix+, ten results at a time
      # starting at +search_offset+, passing each page that contains company
      # links to #scrape_search_results_page and recording the offset in the
      # 'search_offset' variable. Stops at the first page without company
      # links.
      #
      # A failed request is retried, but only a bounded number of times in a
      # row, and only ordinary errors are rescued: rescuing Exception here
      # previously swallowed Interrupt, so a persistently failing URL made
      # the loop spin forever with no way to stop it.
      def get_results_and_extract_data_for(prefix, search_offset)
        max_consecutive_failures = 10 # arbitrary cap on back-to-back failures
        consecutive_failures = 0
        while search_offset do
          url = "http://www.oera.li/WebServices/ZefixFL/ZefixFL.asmx/SearchFirm?name=#{prefix}%20&suche_nach=-&rf=&sitz=&id=&language=&phonetisch=no&posMin=#{search_offset}"
          response =
            begin
              open(url).read.encode!('utf-8','iso-8859-1')
            rescue StandardError, Timeout::Error => e
              puts "Problem getting/parsing data from #{url}: #{e.inspect}"
              nil
            end
          unless response
            consecutive_failures += 1
            if consecutive_failures >= max_consecutive_failures
              puts "Giving up on #{url} after #{consecutive_failures} consecutive failures"
              break
            end
            next
          end
          consecutive_failures = 0
          if response.match(/webservices\/HRG/) # check has links to companies
            puts "****Scraping page #{(search_offset+10)/10}"
            scrape_search_results_page(response, url)
            save_var('search_offset', search_offset)
            search_offset += 10
          else
            search_offset = false
          end
        end
      end
    end

  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'date'
|
3
|
+
module OpencBot
  module Helpers
    # Helpers that turn loosely formatted UK (day first) and US (month
    # first) dates into ISO 8601 date strings (YYYY-MM-DD).
    module Dates
      extend self
      # Matches a leading slash-separated m/d/yy or m/d/yyyy date.
      AMERICAN_DATE_RE = %r{\A\s*(\d{1,2})/(\d{1,2})/(\d{4}|\d{2})}.freeze

      # Normalises a day-first date to an ISO date string.
      #
      # raw_date:: a String such as "23/10/2011", "23/10/11", "23-Oct-11" or
      #            "23 October 2011", or any other object (e.g. a Date),
      #            which is simply converted with #to_s.
      #
      # Returns nil for nil or blank input. Two-digit years 00-19 are taken
      # to be 20xx and 20-99 to be 19xx (the same rule normalise_us_date uses
      # for 20 and later). Raises ArgumentError (from Date.parse) when the
      # string cannot be parsed.
      def normalise_uk_date(raw_date)
        return if raw_date.nil? || raw_date.to_s.strip.empty?
        return raw_date.to_s unless raw_date.is_a?(String)

        # Decide on, and then use, the same whitespace-free text: previously
        # the pattern was checked against a stripped copy but the unstripped
        # string was parsed, so " 23/10/11 " missed the century fix-up below
        # and was read by Date.parse as year-month-day.
        compact = raw_date.gsub(/\s+/, '')
        cleaned_up_date = compact =~ %r{\A\d+/[\d\w]+/\d+\z} ? compact.tr('/', '-') : raw_date.strip

        # Expand two-digit years; without a century Date.parse would treat
        # "dd-mm-yy" as "yy-mm-dd".
        with_century = cleaned_up_date.
          sub(/\A(\d{1,2}-)([\w\d]+-)([01]\d)\z/, '\1\220\3').
          sub(/\A(\d{1,2}-)([\w\d]+-)([2-9]\d)\z/, '\1\219\3')
        to_date(with_century).to_s
      end

      # Normalises a month-first date (e.g. "10/23/2011" or "10/23/11") to an
      # ISO date string. Non-String input is converted with #to_s first.
      #
      # Returns nil for nil or blank input. Raises ArgumentError (from
      # Date.parse) when the value cannot be parsed.
      def normalise_us_date(raw_date)
        return if raw_date.nil? || raw_date.to_s.strip.empty?
        # we want to set century to 19 if there's none set and the years are in the 20s or later
        with_century = raw_date.to_s.sub(/^(\s*\d{1,2}[\/-]\d{1,2}[\/-])([2-9]\d)$/, '\119\2')
        # reorder m/d/y into y-m-d so Date.parse reads it unambiguously
        iso_date = with_century.sub(AMERICAN_DATE_RE) { "#{$3}-#{$1}-#{$2}" }
        to_date(iso_date, true).to_s
      end

      private
      # Parses +date+ with Date.parse, passing +comp+ through as its second
      # argument. Returns nil when +date+ is nil.
      def to_date(date, comp=false)
        return if date.nil?
        Date.parse(date,comp)
      end
    end
  end
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'openc_bot/helpers/register_methods'
|
3
|
+
|
4
|
+
module OpencBot
  module Helpers
    # Helpers for finding new entities by counting upwards from the highest
    # uid already stored.
    module IncrementalSearch

      include OpencBot::Helpers::RegisterMethods

      # Gets new records using an incremental search.
      #
      # options:: may contain :highest_entry_uids (uids to start from, used
      #           instead of #highest_entry_uids), :offset and
      #           :skip_existing_entries (both passed to #incremental_search).
      #
      # Runs #incremental_search once per starting uid and stores the
      # resulting uids in the :highest_entry_uids variable. Does nothing when
      # there are no starting uids. The caller's hash is left untouched.
      def fetch_data_via_incremental_search(options={})
        options = options.dup
        old_highest_numbers = options.delete(:highest_entry_uids) || highest_entry_uids
        return unless old_highest_numbers
        # offset by rewind count if set and also in that case assume by default we want to skip_existing_companies
        options = {:offset => (0 - incremental_rewind_count), :skip_existing_entries => true}.merge(options) if incremental_rewind_count
        new_highest_numbers = old_highest_numbers.collect do |old_highest_number|
          incremental_search(old_highest_number, options)
        end
        save_var(:highest_entry_uids, new_highest_numbers)
      end

      # Returns the stored 'highest_entry_uids' variable, or, when that is
      # missing, empty or contains a nil/empty entry, the highest uid found
      # for each of #entity_uid_prefixes (prefixes with no usable uid are
      # dropped). Returns nil when nothing usable is found.
      #
      # force_get:: accepted for compatibility; not used by this method.
      def highest_entry_uids(force_get = false)
        results = get_var('highest_entry_uids')
        if results.nil? || results.empty? || (results.is_a?(Array) && results.any?{ |r| r.nil? || r.empty? })
          results = entity_uid_prefixes.collect do |prefix|
            highest_entry_uid_result(:prefix => prefix)
          end
          # Drop blank uids as well as nil ones: an empty uid has no digits
          # to increment, so searching from it can never make progress.
          results = results.reject { |uid| uid.nil? || uid.empty? }
        end
        return results unless results.empty?
      end

      # Returns the highest stored value of the primary key, optionally
      # restricted to keys with the given :prefix or :suffix. Returns nil
      # when no row matches, and "<prefix>0" when the table does not exist.
      def highest_entry_uid_result(options={})
        if options[:prefix]
          sql_query = ["ocdata.#{primary_key_name} FROM ocdata WHERE #{primary_key_name} LIKE ? ORDER BY cast(substr(#{primary_key_name},?) as real) DESC LIMIT 1", ["#{options[:prefix]}%", options[:prefix].length + 1]]
        elsif options[:suffix]
          sql_query = ["ocdata.#{primary_key_name} FROM ocdata WHERE #{primary_key_name} LIKE ? ORDER BY cast(#{primary_key_name} as real) DESC LIMIT 1", "%#{options[:suffix]}"]
        else
          sql_query = "ocdata.#{primary_key_name} FROM ocdata ORDER BY cast(#{primary_key_name} as real) DESC LIMIT 1"
        end
        # The query may match nothing (table exists but is empty for this
        # prefix/suffix); return nil then instead of calling [] on nil.
        row = select(*sql_query).first
        row && row[primary_key_name.to_s]
      rescue SqliteMagic::NoSuchTable
        # first run, so no table or database yet
        return "#{options[:prefix]}0"
      end

      # The receiver's INCREMENTAL_REWIND_COUNT constant, or nil.
      def incremental_rewind_count
        self.const_defined?('INCREMENTAL_REWIND_COUNT') ? self.const_get('INCREMENTAL_REWIND_COUNT') : nil
      end

      # The receiver's ENTITY_UID_PREFIXES constant, or [nil].
      def entity_uid_prefixes
        self.const_defined?('ENTITY_UID_PREFIXES') ? self.const_get('ENTITY_UID_PREFIXES') : [nil]
      end

      # The receiver's ENTITY_UID_SUFFIXES constant, or [nil].
      def entity_uid_suffixes
        self.const_defined?('ENTITY_UID_SUFFIXES') ? self.const_get('ENTITY_UID_SUFFIXES') : [nil]
      end

      # Walks upwards from +uid+ (first shifted by options[:offset] when
      # given), calling #update_datum for each uid until more than
      # #max_failed_count consecutive uids fail. With
      # options[:skip_existing_entries], uids for which #datum_exists? is
      # true are skipped without being updated.
      #
      # Returns the last uid that was updated successfully (as a String), or
      # a copy of the original uid if none was.
      def incremental_search(uid, options={})
        first_number = uid.dup
        error_count = 0
        last_good_co_no = nil
        # Read rather than delete: the same options hash is passed in for
        # every prefix, and deleting the flag here switched it off for all
        # prefixes after the first.
        skip_existing_entries = options[:skip_existing_entries]
        # start at given number but offset by given amount. i.e. by offset
        uid = increment_number(uid, options[:offset]) if options[:offset]
        loop do
          if skip_existing_entries && datum_exists?(uid)
            error_count = 0 # reset error count
          elsif update_datum(uid, false)
            last_good_co_no = uid
            error_count = 0 # reset error count
          else
            error_count += 1
            puts "Failed to find company with uid #{uid}. Error count: #{error_count}" if verbose?
            break if error_count > max_failed_count
          end
          uid = increment_number(uid)
        end
        # return orig uid if we haven't had any new entities
        last_good_co_no ? last_good_co_no.to_s : first_number
      end

      # Returns +uid+ as a String with its first run of digits changed by
      # +increment_amount+ (which may be negative).
      #
      # The digit run keeps its width (zero padded) except when counting
      # down from a number without a leading zero, where it may shrink
      # ("100" - 1 => "99"). The number never goes below zero: a negative
      # value would be written with a minus sign, and the next increment
      # would then act on the digits after the sign and move further away
      # from zero.
      def increment_number(uid,increment_amount=1)
        uid.to_s.sub(/\d+/) do |d|
          incremented_number = [d.to_i + increment_amount, 0].max
          length = d.length
          length = incremented_number.to_s.length if increment_amount < 0 && !d[/^0/]
          sprintf("%0#{length}d", incremented_number)
        end
      end

      # The receiver's MAX_FAILED_COUNT constant, or 10.
      def max_failed_count
        self.const_defined?('MAX_FAILED_COUNT') ? self.const_get('MAX_FAILED_COUNT') : 10
      end

    end
  end
end
|