openc_bot 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.travis.yml +8 -0
  4. data/CHANGELOG.md +2 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +253 -0
  8. data/Rakefile +14 -0
  9. data/bin/openc_bot +13 -0
  10. data/create_bot.sh +30 -0
  11. data/create_company_bot.sh +16 -0
  12. data/create_simple_licence_bot.sh +31 -0
  13. data/db/.gitkeep +0 -0
  14. data/examples/basic/.gitignore +3 -0
  15. data/examples/basic/Gemfile +7 -0
  16. data/examples/basic/config.yml +21 -0
  17. data/examples/basic/lib/basic.rb +88 -0
  18. data/examples/basic_with_proxy/Gemfile +7 -0
  19. data/examples/basic_with_proxy/config.yml +21 -0
  20. data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
  21. data/examples/bot_with_simple_iterator/Gemfile +6 -0
  22. data/examples/bot_with_simple_iterator/config.yml +21 -0
  23. data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
  24. data/examples/company_fetchers/basic.rb +49 -0
  25. data/lib/monkey_patches/mechanize.rb +53 -0
  26. data/lib/openc_bot.rb +89 -0
  27. data/lib/openc_bot/bot_data_validator.rb +18 -0
  28. data/lib/openc_bot/company_fetcher_bot.rb +40 -0
  29. data/lib/openc_bot/exceptions.rb +17 -0
  30. data/lib/openc_bot/helpers/_csv.rb +10 -0
  31. data/lib/openc_bot/helpers/alpha_search.rb +73 -0
  32. data/lib/openc_bot/helpers/dates.rb +33 -0
  33. data/lib/openc_bot/helpers/html.rb +8 -0
  34. data/lib/openc_bot/helpers/incremental_search.rb +106 -0
  35. data/lib/openc_bot/helpers/register_methods.rb +205 -0
  36. data/lib/openc_bot/helpers/text.rb +18 -0
  37. data/lib/openc_bot/incrementers.rb +2 -0
  38. data/lib/openc_bot/incrementers/base.rb +214 -0
  39. data/lib/openc_bot/incrementers/common.rb +47 -0
  40. data/lib/openc_bot/tasks.rb +385 -0
  41. data/lib/openc_bot/templates/README.md +35 -0
  42. data/lib/openc_bot/templates/bin/export_data +28 -0
  43. data/lib/openc_bot/templates/bin/fetch_data +23 -0
  44. data/lib/openc_bot/templates/bin/verify_data +1 -0
  45. data/lib/openc_bot/templates/config.yml +21 -0
  46. data/lib/openc_bot/templates/lib/bot.rb +43 -0
  47. data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
  48. data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
  49. data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
  50. data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
  51. data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
  52. data/lib/openc_bot/version.rb +3 -0
  53. data/lib/simple_openc_bot.rb +289 -0
  54. data/openc_bot.gemspec +35 -0
  55. data/schemas/company-schema.json +112 -0
  56. data/schemas/includes/address.json +23 -0
  57. data/schemas/includes/base-statement.json +27 -0
  58. data/schemas/includes/company.json +14 -0
  59. data/schemas/includes/filing.json +20 -0
  60. data/schemas/includes/license-data.json +27 -0
  61. data/schemas/includes/officer.json +14 -0
  62. data/schemas/includes/previous_name.json +11 -0
  63. data/schemas/includes/share-parcel-data.json +67 -0
  64. data/schemas/includes/share-parcel.json +60 -0
  65. data/schemas/includes/subsidiary-relationship-data.json +52 -0
  66. data/schemas/includes/total-shares.json +10 -0
  67. data/schemas/licence-schema.json +21 -0
  68. data/schemas/share-parcel-schema.json +21 -0
  69. data/schemas/subsidiary-relationship-schema.json +19 -0
  70. data/spec/dummy_classes/foo_bot.rb +4 -0
  71. data/spec/lib/bot_data_validator_spec.rb +69 -0
  72. data/spec/lib/company_fetcher_bot_spec.rb +93 -0
  73. data/spec/lib/exceptions_spec.rb +25 -0
  74. data/spec/lib/helpers/alpha_search_spec.rb +173 -0
  75. data/spec/lib/helpers/dates_spec.rb +65 -0
  76. data/spec/lib/helpers/incremental_search_spec.rb +471 -0
  77. data/spec/lib/helpers/register_methods_spec.rb +558 -0
  78. data/spec/lib/helpers/text_spec.rb +50 -0
  79. data/spec/lib/openc_bot/db/.gitkeep +0 -0
  80. data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
  81. data/spec/lib/openc_bot_spec.rb +116 -0
  82. data/spec/schemas/company-schema_spec.rb +676 -0
  83. data/spec/simple_openc_bot_spec.rb +302 -0
  84. data/spec/spec_helper.rb +19 -0
  85. metadata +300 -0
@@ -0,0 +1,53 @@
1
class Mechanize::HTTP::Agent
  # Maximum number of times one request is re-attempted after the
  # "too many connection resets" failure before the error is passed on.
  MAX_RESET_RETRIES = 10

  # Wrapper around the core Mechanize HTTP method
  #
  #   Mechanize::HTTP::Agent#fetch
  #
  # that works around the "too many connection resets" Mechanize bug
  # described at:
  #
  #   https://github.com/sparklemotion/mechanize/issues/123
  #
  # When the wrapped call fails with that error, the persistent HTTP
  # connection is shut down and the request is issued again, up to
  # MAX_RESET_RETRIES times. Any other Net::HTTP::Persistent::Error is
  # re-raised untouched. Takes the same arguments as the wrapped #fetch and
  # returns whatever it returns.
  def fetch_with_retry(uri, method = :get, headers = {}, params = [], referer = current_page, redirects = 0)
    request_label = "#{method.to_s.upcase} #{uri.to_s}"
    resets_seen = 0

    begin
      fetch_without_retry(uri, method, headers, params, referer, redirects)
    rescue Net::HTTP::Persistent::Error => reset_error
      # Only the connection-reset failure is handled here; pass on the rest.
      raise if reset_error.message !~ /too many connection resets/

      if resets_seen < MAX_RESET_RETRIES
        # Shut down the persistent HTTP connection and try again.
        puts "**** WARN: Mechanize retrying connection reset error: #{request_label}"
        resets_seen += 1
        http.shutdown
        retry
      end

      # Retry budget exhausted: report and re-raise the original error.
      puts "**** WARN: Mechanize retried connection reset #{MAX_RESET_RETRIES} times and never succeeded: #{request_label}"
      raise
    end
  end

  # Alias chain: #fetch now runs #fetch_with_retry, which wraps the original
  # implementation kept as #fetch_without_retry.
  alias_method :fetch_without_retry, :fetch
  alias_method :fetch, :fetch_with_retry
end
data/lib/openc_bot.rb ADDED
@@ -0,0 +1,89 @@
1
+ # encoding: UTF-8
2
+ require 'openc_bot/version'
3
+ require 'json'
4
+ require 'scraperwiki'
5
+ require_relative 'openc_bot/bot_data_validator'
6
+ require 'openc_bot/helpers/text'
7
+ require 'openc_bot/exceptions'
8
+
9
module OpencBot

  class OpencBotError < StandardError;end
  class DatabaseError < OpencBotError;end
  class InvalidDataError < OpencBotError;end
  class NotFoundError < OpencBotError;end

  include ScraperWiki
  # include by default, as some were previously in main openc_bot file
  include Helpers::Text

  # Passes the arguments straight to #insert_or_update on the SQLite
  # connection returned by #sqlite_magic_connection.
  def insert_or_update(uniq_keys, values_hash, tbl_name='ocdata')
    sqlite_magic_connection.insert_or_update(uniq_keys, values_hash, tbl_name)
  end

  # Saves values_array into tbl_name via #save_sqlite (not defined in this
  # file; presumably supplied by the ScraperWiki mixin).
  def save_data(uniq_keys, values_array, tbl_name='ocdata')
    save_sqlite(uniq_keys, values_array, tbl_name)
  end

  # Stores report_hash, serialised as JSON, in the :ocrunreports table keyed
  # on the current time.
  def save_run_report(report_hash)
    json_report = report_hash.to_json
    save_data([:run_at], { :report => json_report, :run_at => Time.now.to_s }, :ocrunreports)
  end

  # Returns the root directory of the bot (not this gem).
  # Assumes the bot file that extends its functionality using this bot is in a directory (lib) inside the root directory
  def root_directory
    @@app_directory
  end

  # Runs an empty transaction against the database.
  def unlock_database
    sqlite_magic_connection.execute("BEGIN TRANSACTION; END;")
  end

  # Convenience method that returns a truthy value if the VERBOSE
  # environmental variable is set (at the moment whatever it is set to).
  # The value returned is the variable's content, not a strict boolean.
  def verbose?
    ENV['VERBOSE']
  end

  # Writes each record yielded by #export_data (expected to be defined by the
  # bot) to stdout as one JSON document per line, flushing after each.
  def export(opts={})
    export_data(opts).each do |record|
      $stdout.puts record.to_json
      $stdout.flush
    end
  end

  # Pretty-prints the result of #spotcheck_data (expected to be defined by
  # the bot) to stdout as JSON.
  def spotcheck
    $stdout.puts JSON.pretty_generate(spotcheck_data)
  end

  # When deciding on the location of the SQLite databases we need to
  # set the directory relative to the directory of the file/app that
  # includes the gem, not the gem itself. Doing it this way, and
  # setting a class variable feels ugly, but this appears to be
  # difficult in Ruby, esp as the file may ultimately be called by
  # another process, e.g. the main OpenCorporates app or the console,
  # whose main directory is unrelated to where the databases are
  # stored (which means we can't use Dir.pwd etc). The only time we
  # know about the directory is when the module is called to extend
  # the file, and we capture that in the @@app_directory class variable
  def self.extended(obj)
    # A caller entry looks like "path:LINE" or "path:LINE:in `meth'". Strip
    # the trailing line/method part rather than splitting on the first ":",
    # which would truncate paths that themselves contain a colon (e.g. a
    # Windows drive letter such as "C:/bots/lib/foo.rb:3").
    caller_file = caller[0].sub(/:\d+(?::in .*)?\z/, '')
    @@app_directory = File.expand_path(File.join(File.dirname(caller_file), '..'))
  end

  # Database file name: the lower-cased name of the module/class when self is
  # a Module, otherwise the lower-cased name of self's class, plus ".db".
  def db_name
    if is_a?(Module)
      "#{self.name.downcase}.db"
    else
      "#{self.class.name.downcase}.db"
    end
  end

  # Override default in ScraperWiki gem
  #
  # Returns a memoised SqliteMagic::Connection. The database location is
  # @config[:db] when @config is set, otherwise db/<db_name> under the bot's
  # root directory. The path is only worked out when the connection is first
  # created, so later calls neither rebuild it nor touch @@app_directory.
  def sqlite_magic_connection
    @sqlite_magic_connection ||= begin
      db = @config ? @config[:db] : File.expand_path(File.join(@@app_directory, 'db', db_name))
      SqliteMagic::Connection.new(db)
    end
  end

end
@@ -0,0 +1,18 @@
1
+ # encoding: UTF-8
2
module OpencBot
  module BotDataValidator

    extend self

    # Checks that a datum has the minimum structure expected of bot output.
    #
    # A datum is valid when it is a Hash and:
    # * datum[:company][:name] is present and not blank
    # * datum[:source_url] is not blank
    # * datum[:data] is not empty, and every element has a non-blank
    #   :data_type and a non-empty :properties
    #
    # Returns true when all checks pass and false otherwise. Any
    # StandardError raised while inspecting the datum (missing keys, wrong
    # types, ...) also means "invalid". Non-standard exceptions such as
    # Interrupt or SystemExit are deliberately NOT swallowed.
    def validate(datum)
      return false unless datum.kind_of?(Hash)

      company_name = datum[:company][:name]
      return false unless company_name
      return false if company_name.strip.empty?
      return false if datum[:source_url].strip.empty?

      data = datum[:data]
      return false if data.empty?

      data.all? do |item|
        !item[:data_type].to_s.strip.empty? && !item[:properties].empty?
      end
    rescue StandardError
      # any probs then it's invalid
      false
    end
  end
end
@@ -0,0 +1,40 @@
1
+ require 'openc_bot'
2
+ require 'openc_bot/helpers/incremental_search'
3
+ require 'openc_bot/helpers/alpha_search'
4
+
5
+
6
module OpencBot
  module CompanyFetcherBot
    include OpencBot
    include OpencBot::Helpers::IncrementalSearch
    include OpencBot::Helpers::AlphaSearch

    # Called by #update_datum. Fetches the registry page for the given
    # company number (via #fetch_registry_page, not defined in this file) and
    # returns it wrapped in a Hash under :company_page.
    def fetch_datum(company_number)
      { :company_page => fetch_registry_page(company_number) }
    end

    # Derives a jurisdiction code from the name of the extending module/class
    # by removing "CompaniesFetcher" and underscoring what is left. Returns
    # the code only if it has the form "xx" or "xx_yy" (lower-case letters),
    # otherwise nil.
    # NOTE(review): #underscore is not core Ruby; presumably it comes from
    # ActiveSupport or a project helper.
    def inferred_jurisdiction_code
      candidate = self.name.sub(/CompaniesFetcher/,'').underscore
      candidate[/^[a-z]{2}$|^[a-z]{2}_[a-z]{2}$/]
    end

    # Name of the column that uniquely identifies an entity.
    def primary_key_name
      :company_number
    end

    # Overrides the default #save_entity (defined in RegisterMethods) so that
    # the inferred jurisdiction_code is added, unless entity_info already
    # supplies one. Does nothing for blank entity_info.
    def save_entity(entity_info)
      return if entity_info.blank?
      super({ :jurisdiction_code => inferred_jurisdiction_code }.merge(entity_info))
    end

    # Uses the schema name from the parent implementation when it gives one,
    # falling back to 'company-schema'.
    def schema_name
      super || 'company-schema'
    end

  end
end
@@ -0,0 +1,17 @@
1
module OpencBot

  # Generic Error class for OpencBot exceptions
  class OpencBotError < StandardError;end

  #
  # Raised by <tt>save_entity!</tt> when the record is invalid.
  # Use the +validation_errors+ method to retrieve the validation errors.
  class RecordInvalid < OpencBotError
    attr_reader :validation_errors

    # validation_errors - the errors that made the record invalid; stored
    #                     as-is and exposed through #validation_errors.
    # message           - optional exception message. When omitted the
    #                     message is the class name, exactly as before.
    def initialize(validation_errors, message = nil)
      @validation_errors = validation_errors
      # Hand over to StandardError so the exception is fully initialised.
      super(message)
    end
  end

end
@@ -0,0 +1,10 @@
1
# This file is named _csv.rb so that it is not picked up by mistake when the
# intention is to require the system csv library.
module OpencBot
  module Helpers
    # Placeholder namespace: helper methods for working with CSV content are
    # intended to be added here; it currently defines nothing.
    module Csv
    end
  end
end
@@ -0,0 +1,73 @@
1
+ # encoding: UTF-8
2
+ require 'openc_bot/helpers/register_methods'
3
+
4
module OpencBot
  module Helpers
    module AlphaSearch

      include OpencBot::Helpers::RegisterMethods

      # How many fetches in a row may fail in
      # #get_results_and_extract_data_for before it gives up.
      MAX_CONSECUTIVE_FETCH_FAILURES = 3

      # Returns every search term (all repeated permutations of
      # #letters_and_numbers, #numbers_of_chars_in_search characters long),
      # beginning at starting_term. When starting_term is nil or is not one
      # of the terms, the full list is returned.
      def alpha_terms(starting_term=nil)
        all_perms = letters_and_numbers.repeated_permutation(numbers_of_chars_in_search).
          collect(&:join)
        # get starting position from given term
        starting_position = starting_term && all_perms.index(starting_term)
        # start from starting_position if we have it or from start of array (pos 0) if not
        all_perms[starting_position.to_i..-1]
      end

      # Runs #search_for_entities_for_term for each search term, saving every
      # entity it yields with #save_entity. The term being processed is
      # stored with save_var('starting_term', ...) before it is searched, and
      # that stored value is used as the starting point when
      # options[:starting_term] is not given. The pointer is cleared once all
      # terms have been processed.
      def fetch_data_via_alpha_search(options={})
        starting_term = options[:starting_term]||get_var('starting_term')
        each_search_term(starting_term) do |term|
          save_var('starting_term', term)
          search_for_entities_for_term(term, options) do |entity_datum|
            save_entity(entity_datum)
          end
        end
        # reset pointer
        save_var('starting_term',nil)
      end

      # Iterates through each search term, yielding the result to a block, or returning
      # the array of search_terms if no block given
      def each_search_term(starting_term=nil)
        alpha_terms(starting_term).each{ |t| yield t if block_given?}
      end

      # The characters search terms are built from: A-Z followed by 0-9.
      def letters_and_numbers
        ('A'..'Z').to_a + ('0'..'9').to_a
      end

      # Length of each search term: the extending module's
      # NUMBER_OF_CHARS_IN_SEARCH constant when defined, otherwise 1.
      def numbers_of_chars_in_search
        self.const_defined?('NUMBER_OF_CHARS_IN_SEARCH') ? self.const_get('NUMBER_OF_CHARS_IN_SEARCH') : 1
      end

      # Hook to be overridden by the bot; the default always raises.
      def search_for_entities_for_term(term, options={})
        raise "The #search_for_entities_for_term method has not been implemented for this case.\nIt needs to be, and should yield a company data Hash"
      end

      # Pages through search results for prefix, starting at search_offset
      # and advancing 10 at a time, handing each page that contains company
      # links to #scrape_search_results_page (not defined in this file) and
      # recording the offset with save_var. Stops at the first page without
      # company links.
      #
      # A failed fetch is retried, but only until
      # MAX_CONSECUTIVE_FETCH_FAILURES fetches in a row have failed; the
      # method then reports the problem and stops instead of looping forever
      # on a permanently failing URL.
      #
      # NOTE(review): the URL is hard-coded to one specific registry and
      # Kernel#open is used to fetch it, which relies on open-uri being
      # loaded elsewhere.
      def get_results_and_extract_data_for(prefix, search_offset)
        failed_fetches = 0
        while search_offset do
          url = "http://www.oera.li/WebServices/ZefixFL/ZefixFL.asmx/SearchFirm?name=#{prefix}%20&suche_nach=-&rf=&sitz=&id=&language=&phonetisch=no&posMin=#{search_offset}"
          response =
            begin
              open(url).read.encode!('utf-8','iso-8859-1')
            rescue StandardError, Timeout::Error => e
              puts "Problem getting/parsing data from #{url}: #{e.inspect}"
              nil
            end
          unless response
            failed_fetches += 1
            if failed_fetches >= MAX_CONSECUTIVE_FETCH_FAILURES
              puts "Giving up on #{url} after #{failed_fetches} consecutive failures"
              break
            end
            next
          end
          failed_fetches = 0
          if response.match(/webservices\/HRG/) # check has links to companies
            puts "****Scraping page #{(search_offset+10)/10}"
            scrape_search_results_page(response, url)
            save_var('search_offset', search_offset)
            search_offset += 10
          else
            search_offset = false
          end
        end
      end
    end

  end
end
@@ -0,0 +1,33 @@
1
+ # encoding: UTF-8
2
+ require 'date'
3
module OpencBot
  module Helpers
    module Dates
      extend self
      AMERICAN_DATE_RE = %r_\A\s*(\d{1,2})/(\d{1,2})/(\d{4}|\d{2})_.freeze

      # Converts a day-first date to its ISO 8601 string form.
      #
      # Returns nil for nil or blank input. Non-String input is returned via
      # #to_s unchanged. For Strings, "dd/mm/yy(yy)" has its slashes turned
      # into dashes, a trailing two-digit year starting with 0 or 1 is
      # prefixed with 20, one starting with 9 is prefixed with 19, and the
      # result is handed to Date.parse.
      def normalise_uk_date(raw_date)
        return if raw_date.nil? || raw_date.to_s.strip.empty?
        return raw_date.to_s unless raw_date.is_a?(String)

        slash_separated = raw_date.gsub(/\s+/,'').match(/^\d+\/[\d\w]+\/\d+$/)
        candidate = slash_separated ? raw_date.gsub('/','-') : raw_date
        candidate = candidate.sub(/^(\d{1,2}-)([\w\d]+-)([01]\d)$/,'\1\220\3')
        candidate = candidate.sub(/^(\d{1,2}-)([\w\d]+-)([9]\d)$/,'\1\219\3')
        to_date(candidate).to_s
      end

      # Converts a month-first date to its ISO 8601 string form.
      #
      # Returns nil for nil or blank input. A trailing two-digit year of 20
      # or above is first given the century 19; a leading "m/d/y" is then
      # rearranged to "y-m-d" before parsing.
      def normalise_us_date(raw_date)
        text = raw_date.to_s
        return if raw_date.nil? || text.strip.empty?

        # we want to set century to 19 if there's none set and the years are in the 20s or later
        with_century = text.sub(/^(\s*\d{1,2}[\/-]\d{1,2}[\/-])([2-9]\d)$/,'\119\2')
        iso_date = with_century.sub(AMERICAN_DATE_RE) do
          "#{Regexp.last_match(3)}-#{Regexp.last_match(1)}-#{Regexp.last_match(2)}"
        end
        to_date(iso_date, true).to_s
      end

      # Parses date with Date.parse, passing comp through as its second
      # argument; nil stays nil.
      def to_date(date, comp=false)
        date.nil? ? nil : Date.parse(date,comp)
      end
      private :to_date
    end
  end
end
@@ -0,0 +1,8 @@
1
module OpencBot
  module Helpers
    # Placeholder namespace: helper methods for working with HTML content
    # are intended to be added here; it currently defines nothing.
    module Html
    end
  end
end
@@ -0,0 +1,106 @@
1
+ # encoding: UTF-8
2
+ require 'openc_bot/helpers/register_methods'
3
+
4
module OpencBot
  module Helpers
    module IncrementalSearch

      include OpencBot::Helpers::RegisterMethods

      # Gets new records using an incremental search.
      #
      # Starts from options[:highest_entry_uids] when given, otherwise from
      # #highest_entry_uids, and does nothing when neither yields a value.
      # Runs #incremental_search once per starting uid and stores the
      # resulting uids with save_var(:highest_entry_uids, ...).
      #
      # The options hash passed in is not modified.
      def fetch_data_via_incremental_search(options={})
        # work on a copy so the caller's hash is left untouched
        options = options.dup
        return unless old_highest_numbers = options.delete(:highest_entry_uids) || highest_entry_uids
        # offset by rewind count if set and also in that case assume by default we want to skip_existing_companies
        options = {:offset => (0 - incremental_rewind_count), :skip_existing_entries => true}.merge(options) if incremental_rewind_count
        new_highest_numbers = old_highest_numbers.collect do |old_highest_number|
          incremental_search(old_highest_number, options)
        end
        save_var(:highest_entry_uids, new_highest_numbers)
      end

      # Returns the highest known uid for each prefix, or nil when there is
      # none.
      #
      # Uses the value stored under 'highest_entry_uids' when it is complete;
      # otherwise looks up the highest uid for every entry of
      # #entity_uid_prefixes. Prefixes whose lookup gives nil or an empty
      # string are dropped from the result.
      #
      # NOTE(review): force_get is accepted but not used anywhere in the body.
      def highest_entry_uids(force_get = false)
        bad_results = []
        results = get_var('highest_entry_uids')
        if results.nil? || results.empty? || (results.is_a?(Array) && results.any?{ |r| r.nil? || r.empty? })
          results = entity_uid_prefixes.collect do |prefix|
            hcn = highest_entry_uid_result(:prefix => prefix)
            bad_results << prefix if (hcn.nil? || hcn.empty?)
            hcn
          end
        end
        # drop every result that was flagged as bad above (nil AND empty)
        results = results.reject{ |r| r.nil? || r.empty? } unless bad_results.empty?
        return results unless results.empty?
      end

      # Queries the ocdata table for the numerically highest value of the
      # primary key, restricted to keys starting with options[:prefix] or
      # ending with options[:suffix] when given. When the table does not
      # exist yet, returns the prefix followed by "0".
      #
      # NOTE(review): when the table exists but the query matches no row,
      # `first` is nil and the lookup raises NoMethodError.
      def highest_entry_uid_result(options={})
        if options[:prefix]
          sql_query = ["ocdata.#{primary_key_name} FROM ocdata WHERE #{primary_key_name} LIKE ? ORDER BY cast(substr(#{primary_key_name},?) as real) DESC LIMIT 1", ["#{options[:prefix]}%", options[:prefix].length + 1]]
        elsif options[:suffix]
          sql_query = ["ocdata.#{primary_key_name} FROM ocdata WHERE #{primary_key_name} LIKE ? ORDER BY cast(#{primary_key_name} as real) DESC LIMIT 1", "%#{options[:suffix]}"]
        else
          sql_query = "ocdata.#{primary_key_name} FROM ocdata ORDER BY cast(#{primary_key_name} as real) DESC LIMIT 1"
        end
        select(*sql_query).first[primary_key_name.to_s]# rescue nil
      rescue SqliteMagic::NoSuchTable
        # first run, so no table or database yet
        return "#{options[:prefix]}0"
      end

      # The extending module's INCREMENTAL_REWIND_COUNT constant, or nil.
      def incremental_rewind_count
        self.const_defined?('INCREMENTAL_REWIND_COUNT') ? self.const_get('INCREMENTAL_REWIND_COUNT') : nil
      end

      # The extending module's ENTITY_UID_PREFIXES constant, or [nil].
      def entity_uid_prefixes
        self.const_defined?('ENTITY_UID_PREFIXES') ? self.const_get('ENTITY_UID_PREFIXES') : [nil]
      end

      # The extending module's ENTITY_UID_SUFFIXES constant, or [nil].
      def entity_uid_suffixes
        self.const_defined?('ENTITY_UID_SUFFIXES') ? self.const_get('ENTITY_UID_SUFFIXES') : [nil]
      end

      # Walks upwards from uid (shifted by options[:offset] when given),
      # calling #update_datum for each uid until more than #max_failed_count
      # consecutive uids have failed. With options[:skip_existing_entries],
      # uids for which #datum_exists? is true are passed over without being
      # updated.
      #
      # Returns the last uid that was updated successfully (as a String), or
      # the original uid when none was.
      #
      # options is only read, never modified: the same hash is shared between
      # the calls made for each prefix by #fetch_data_via_incremental_search,
      # so removing :skip_existing_entries here would silently switch the
      # behaviour off for every prefix after the first.
      def incremental_search(uid, options={})
        first_number = uid.dup
        error_count = 0
        last_good_co_no = nil
        skip_existing_entries = options[:skip_existing_entries]
        # start at given number but offset by given amount. i.e. by offset
        uid = increment_number(uid, options[:offset]) if options[:offset]
        loop do
          if skip_existing_entries and datum_exists?(uid)
            error_count = 0 # reset error count
          elsif update_datum(uid, false)
            last_good_co_no = uid
            error_count = 0 # reset error count
          else
            error_count += 1
            puts "Failed to find company with uid #{uid}. Error count: #{error_count}" if verbose?
            break if error_count > max_failed_count
          end
          uid = increment_number(uid)
        end
        # return orig uid if we haven't had any new entities
        last_good_co_no ? last_good_co_no.to_s : first_number
      end

      # Adds increment_amount to the first run of digits in uid and returns
      # the result as a String; text around the digits is left alone. The
      # number keeps its original width (zero-padded), except that when
      # decrementing a number without a leading zero the width is allowed to
      # shrink.
      def increment_number(uid,increment_amount=1)
        uid.to_s.sub(/\d+/) do |d|
          incremented_number = d.to_i + increment_amount
          length = d.length
          length = incremented_number.to_s.length if increment_amount < 0 and not d[/^0/]
          sprintf("%0#{length}d", incremented_number)
        end
      end

      # The extending module's MAX_FAILED_COUNT constant, or 10.
      def max_failed_count
        self.const_defined?('MAX_FAILED_COUNT') ? self.const_get('MAX_FAILED_COUNT') : 10
      end

    end
  end
end