openc_bot 0.0.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (85) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.travis.yml +8 -0
  4. data/CHANGELOG.md +2 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +253 -0
  8. data/Rakefile +14 -0
  9. data/bin/openc_bot +13 -0
  10. data/create_bot.sh +30 -0
  11. data/create_company_bot.sh +16 -0
  12. data/create_simple_licence_bot.sh +31 -0
  13. data/db/.gitkeep +0 -0
  14. data/examples/basic/.gitignore +3 -0
  15. data/examples/basic/Gemfile +7 -0
  16. data/examples/basic/config.yml +21 -0
  17. data/examples/basic/lib/basic.rb +88 -0
  18. data/examples/basic_with_proxy/Gemfile +7 -0
  19. data/examples/basic_with_proxy/config.yml +21 -0
  20. data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
  21. data/examples/bot_with_simple_iterator/Gemfile +6 -0
  22. data/examples/bot_with_simple_iterator/config.yml +21 -0
  23. data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
  24. data/examples/company_fetchers/basic.rb +49 -0
  25. data/lib/monkey_patches/mechanize.rb +53 -0
  26. data/lib/openc_bot.rb +89 -0
  27. data/lib/openc_bot/bot_data_validator.rb +18 -0
  28. data/lib/openc_bot/company_fetcher_bot.rb +40 -0
  29. data/lib/openc_bot/exceptions.rb +17 -0
  30. data/lib/openc_bot/helpers/_csv.rb +10 -0
  31. data/lib/openc_bot/helpers/alpha_search.rb +73 -0
  32. data/lib/openc_bot/helpers/dates.rb +33 -0
  33. data/lib/openc_bot/helpers/html.rb +8 -0
  34. data/lib/openc_bot/helpers/incremental_search.rb +106 -0
  35. data/lib/openc_bot/helpers/register_methods.rb +205 -0
  36. data/lib/openc_bot/helpers/text.rb +18 -0
  37. data/lib/openc_bot/incrementers.rb +2 -0
  38. data/lib/openc_bot/incrementers/base.rb +214 -0
  39. data/lib/openc_bot/incrementers/common.rb +47 -0
  40. data/lib/openc_bot/tasks.rb +385 -0
  41. data/lib/openc_bot/templates/README.md +35 -0
  42. data/lib/openc_bot/templates/bin/export_data +28 -0
  43. data/lib/openc_bot/templates/bin/fetch_data +23 -0
  44. data/lib/openc_bot/templates/bin/verify_data +1 -0
  45. data/lib/openc_bot/templates/config.yml +21 -0
  46. data/lib/openc_bot/templates/lib/bot.rb +43 -0
  47. data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
  48. data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
  49. data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
  50. data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
  51. data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
  52. data/lib/openc_bot/version.rb +3 -0
  53. data/lib/simple_openc_bot.rb +289 -0
  54. data/openc_bot.gemspec +35 -0
  55. data/schemas/company-schema.json +112 -0
  56. data/schemas/includes/address.json +23 -0
  57. data/schemas/includes/base-statement.json +27 -0
  58. data/schemas/includes/company.json +14 -0
  59. data/schemas/includes/filing.json +20 -0
  60. data/schemas/includes/license-data.json +27 -0
  61. data/schemas/includes/officer.json +14 -0
  62. data/schemas/includes/previous_name.json +11 -0
  63. data/schemas/includes/share-parcel-data.json +67 -0
  64. data/schemas/includes/share-parcel.json +60 -0
  65. data/schemas/includes/subsidiary-relationship-data.json +52 -0
  66. data/schemas/includes/total-shares.json +10 -0
  67. data/schemas/licence-schema.json +21 -0
  68. data/schemas/share-parcel-schema.json +21 -0
  69. data/schemas/subsidiary-relationship-schema.json +19 -0
  70. data/spec/dummy_classes/foo_bot.rb +4 -0
  71. data/spec/lib/bot_data_validator_spec.rb +69 -0
  72. data/spec/lib/company_fetcher_bot_spec.rb +93 -0
  73. data/spec/lib/exceptions_spec.rb +25 -0
  74. data/spec/lib/helpers/alpha_search_spec.rb +173 -0
  75. data/spec/lib/helpers/dates_spec.rb +65 -0
  76. data/spec/lib/helpers/incremental_search_spec.rb +471 -0
  77. data/spec/lib/helpers/register_methods_spec.rb +558 -0
  78. data/spec/lib/helpers/text_spec.rb +50 -0
  79. data/spec/lib/openc_bot/db/.gitkeep +0 -0
  80. data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
  81. data/spec/lib/openc_bot_spec.rb +116 -0
  82. data/spec/schemas/company-schema_spec.rb +676 -0
  83. data/spec/simple_openc_bot_spec.rb +302 -0
  84. data/spec/spec_helper.rb +19 -0
  85. metadata +300 -0
@@ -0,0 +1,53 @@
1
class Mechanize::HTTP::Agent
  # Upper bound on how many times a single fetch is retried after a
  # "too many connection resets" error. Guarded so that evaluating this file
  # a second time does not trigger a constant-redefinition warning.
  MAX_RESET_RETRIES = 10 unless const_defined?(:MAX_RESET_RETRIES)

  # We need to replace the core Mechanize HTTP method:
  #
  #   Mechanize::HTTP::Agent#fetch
  #
  # with a wrapper that handles the infamous "too many connection resets"
  # Mechanize bug that is described here:
  #
  #   https://github.com/sparklemotion/mechanize/issues/123
  #
  # The wrapper shuts down the persistent HTTP connection when it fails with
  # this error, and simply tries again. In practice, this only ever needs to
  # be retried once, but it is allowed to retry a few times
  # (MAX_RESET_RETRIES), just in case.
  #
  # The parameters are passed straight through to the original #fetch
  # (aliased below as #fetch_without_retry), whose return value is returned.
  #
  # Raises Net::HTTP::Persistent::Error when the error is not a connection
  # reset, or when the retry budget has been used up.
  def fetch_with_retry(
    uri,
    method = :get,
    headers = {},
    params = [],
    referer = current_page,
    redirects = 0
  )
    action = "#{method.to_s.upcase} #{uri}"
    retry_count = 0

    begin
      fetch_without_retry(uri, method, headers, params, referer, redirects)
    rescue Net::HTTP::Persistent::Error => e
      # Pass on any other type of error.
      raise unless e.message =~ /too many connection resets/

      # Pass on the error if we've tried too many times.
      if retry_count >= MAX_RESET_RETRIES
        puts "**** WARN: Mechanize retried connection reset #{MAX_RESET_RETRIES} times and never succeeded: #{action}"
        raise
      end

      # Otherwise, shutdown the persistent HTTP connection and try again.
      puts "**** WARN: Mechanize retrying connection reset error: #{action}"
      retry_count += 1
      self.http.shutdown
      retry
    end
  end

  # Alias so #fetch actually uses our new #fetch_with_retry to wrap the
  # old one aliased as #fetch_without_retry.
  #
  # Only do this once: if the aliases were applied a second time,
  # #fetch_without_retry would point at #fetch_with_retry itself and every
  # fetch would recurse until the stack overflowed.
  unless method_defined?(:fetch_without_retry)
    alias_method :fetch_without_retry, :fetch
    alias_method :fetch, :fetch_with_retry
  end
end
data/lib/openc_bot.rb ADDED
@@ -0,0 +1,89 @@
1
+ # encoding: UTF-8
2
+ require 'openc_bot/version'
3
+ require 'json'
4
+ require 'scraperwiki'
5
+ require_relative 'openc_bot/bot_data_validator'
6
+ require 'openc_bot/helpers/text'
7
+ require 'openc_bot/exceptions'
8
+
9
module OpencBot

  class OpencBotError < StandardError;end
  class DatabaseError < OpencBotError;end
  class InvalidDataError < OpencBotError;end
  class NotFoundError < OpencBotError;end

  include ScraperWiki
  # Included by default, as some of these helpers previously lived directly
  # in the openc_bot file
  include Helpers::Text

  # Delegates to #insert_or_update on the bot's SqliteMagic connection,
  # writing to +tbl_name+ (defaults to 'ocdata').
  def insert_or_update(uniq_keys, values_hash, tbl_name='ocdata')
    sqlite_magic_connection.insert_or_update(uniq_keys, values_hash, tbl_name)
  end

  # Saves +values_array+ to +tbl_name+ (defaults to 'ocdata') via
  # #save_sqlite, which is not defined in this module (presumably it comes
  # from the ScraperWiki mixin).
  def save_data(uniq_keys, values_array, tbl_name='ocdata')
    save_sqlite(uniq_keys, values_array, tbl_name)
  end

  # Stores +report_hash+ as JSON in the ocrunreports table, keyed on the
  # current time.
  def save_run_report(report_hash)
    json_report = report_hash.to_json
    save_data([:run_at], { :report => json_report, :run_at => Time.now.to_s }, :ocrunreports)
  end

  # Returns the root directory of the bot (not this gem).
  # Assumes the bot file that extends its functionality using this bot is in
  # a directory (lib) inside the root directory. The value is captured by
  # OpencBot.extended, so it is only available once something has extended
  # this module.
  def root_directory
    @@app_directory
  end

  # Runs an empty transaction against the database.
  def unlock_database
    sqlite_magic_connection.execute("BEGIN TRANSACTION; END;")
  end

  # Convenience method that is truthy if the VERBOSE environment variable is
  # set (at the moment whatever it is set to). Returns the variable's value,
  # or nil when it is unset.
  def verbose?
    ENV['VERBOSE']
  end

  # Writes each record returned by #export_data (not defined in this module)
  # to stdout as one line of JSON, flushing after every record.
  def export(opts={})
    export_data(opts).each do |record|
      $stdout.puts record.to_json
      $stdout.flush
    end
  end

  # Pretty-prints the result of #spotcheck_data (not defined in this module)
  # to stdout as JSON.
  def spotcheck
    $stdout.puts JSON.pretty_generate(spotcheck_data)
  end

  # When deciding on the location of the SQLite databases we need to
  # set the directory relative to the directory of the file/app that
  # includes the gem, not the gem itself. Doing it this way, and
  # setting a class variable feels ugly, but this appears to be
  # difficult in Ruby, esp as the file may ultimately be called by
  # another process, e.g. the main OpenCorporates app or the console,
  # whose main directory is unrelated to where the databases are
  # stored (which means we can't use Dir.pwd etc). The only time we
  # know about the directory is when the module is called to extend
  # the file, and we capture that in the @@app_directory class variable
  def self.extended(obj)
    caller_entry = caller[0]
    # A caller entry looks like "path/to/file.rb:LINE:in ...". Take everything
    # before the first ":<digits>" rather than before the first ":", so that
    # paths with a drive letter ("C:/bots/foo/lib/foo.rb:3") are kept whole.
    path = caller_entry[/\A(.+?):\d+/, 1] || caller_entry.partition(":").first
    @@app_directory = File.expand_path(File.join(File.dirname(path), '..'))
  end

  # Name of the SQLite database file: the downcased name of the module (when
  # self is a Module) or of the object's class, with a ".db" extension.
  def db_name
    if is_a?(Module)
      "#{self.name.downcase}.db"
    else
      "#{self.class.name.downcase}.db"
    end
  end

  # Override default in ScraperWiki gem.
  # The connection is memoised; the database path is only worked out the
  # first time, when the connection is actually created. It is taken from
  # @config[:db] when @config is set, otherwise it is db/<db_name> inside the
  # bot's root directory.
  def sqlite_magic_connection
    @sqlite_magic_connection ||= begin
      db = @config ? @config[:db] : File.expand_path(File.join(@@app_directory, 'db', db_name))
      SqliteMagic::Connection.new(db)
    end
  end

end
@@ -0,0 +1,18 @@
1
+ # encoding: UTF-8
2
module OpencBot
  module BotDataValidator

    extend self

    # Checks that +datum+ has the minimum structure expected of bot data.
    #
    # A datum is valid when all of the following hold:
    # * it is a Hash
    # * datum[:company][:name] is present and not blank
    # * datum[:source_url] is not blank
    # * datum[:data] is not empty, and every element has a non-blank
    #   :data_type and non-empty :properties
    #
    # Returns true or false. Any error raised while inspecting the datum
    # (e.g. a missing :company key) means the structure is wrong, so the
    # datum is reported as invalid rather than raising.
    def validate(datum)
      return false unless datum.kind_of?(Hash)

      company_name = datum[:company][:name]
      return false unless company_name && !company_name.strip.empty?
      return false if datum[:source_url].strip.empty?
      return false if datum[:data].empty?

      datum[:data].all? do |item|
        !item[:data_type].to_s.strip.empty? && !item[:properties].empty?
      end
    rescue StandardError
      # Any problems then it's invalid. Only StandardError is rescued so that
      # interrupts and exit requests are not swallowed.
      false
    end
  end
end
@@ -0,0 +1,40 @@
1
+ require 'openc_bot'
2
+ require 'openc_bot/helpers/incremental_search'
3
+ require 'openc_bot/helpers/alpha_search'
4
+
5
+
6
module OpencBot
  module CompanyFetcherBot
    include OpencBot
    include OpencBot::Helpers::IncrementalSearch
    include OpencBot::Helpers::AlphaSearch

    # Called by #update_datum. Fetches the registry page for the given
    # company number (via #fetch_registry_page, which is not defined in this
    # module) and wraps it in a Hash under :company_page.
    def fetch_datum(company_number)
      { :company_page => fetch_registry_page(company_number) }
    end

    # Derives a jurisdiction code from the name of the fetcher: strips
    # "CompaniesFetcher" from the name, underscores what is left, and returns
    # it only when it looks like "xx" or "xx_yy" (lower-case letters);
    # otherwise returns nil.
    def inferred_jurisdiction_code
      candidate = self.name.sub(/CompaniesFetcher/,'').underscore
      candidate[/^[a-z]{2}$|^[a-z]{2}_[a-z]{2}$/]
    end

    # Companies are keyed on their company number.
    def primary_key_name
      :company_number
    end

    # Overrides the default #save_entity (defined in RegisterMethods), adding
    # the inferred jurisdiction_code unless entity_info supplies its own.
    # Does nothing when entity_info is blank.
    def save_entity(entity_info)
      return if entity_info.blank?
      defaults = { :jurisdiction_code => inferred_jurisdiction_code }
      super(defaults.merge(entity_info))
    end

    # Uses the schema name from the superclass implementation when there is
    # one, falling back to 'company-schema'.
    def schema_name
      super || 'company-schema'
    end

  end
end
@@ -0,0 +1,17 @@
1
module OpencBot

  # Generic Error class for OpencBot exceptions
  class OpencBotError < StandardError;end

  #
  # Raised by <tt>save_entity!</tt> when the record is invalid.
  # Use the +validation_errors+ method to retrieve the validation errors;
  # they are also summarised in the exception message so that they show up
  # in logs and backtraces.
  class RecordInvalid < OpencBotError
    attr_reader :validation_errors

    # validation_errors:: the error(s) describing why the record is invalid.
    #                     Stored as given; a single value or a collection is
    #                     accepted when building the message.
    def initialize(validation_errors)
      @validation_errors = validation_errors
      super("Validation failed: #{Array(validation_errors).join(', ')}")
    end
  end

end
@@ -0,0 +1,10 @@
1
+ # This is in _csv.rb to avoid requiring it when we mean to require the system
2
+ # csv library.
3
+ module OpencBot
4
+ module Helpers
5
+ module Csv
6
+ # This module will eventually hold some helper methods for
7
+ # dealing with CSV content
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,73 @@
1
+ # encoding: UTF-8
2
+ require 'openc_bot/helpers/register_methods'
3
+
4
module OpencBot
  module Helpers
    module AlphaSearch

      include OpencBot::Helpers::RegisterMethods

      # How many times in a row #get_results_and_extract_data_for may fail to
      # fetch a results page before it stops trying.
      MAX_CONSECUTIVE_FETCH_FAILURES = 3

      # Returns every search term: all strings of #numbers_of_chars_in_search
      # characters built from #letters_and_numbers. When +starting_term+ is
      # given and is one of the terms, the list starts from that term;
      # otherwise it starts from the beginning.
      def alpha_terms(starting_term=nil)
        all_perms = letters_and_numbers.repeated_permutation(numbers_of_chars_in_search).
          map(&:join)
        # get starting position from given term
        starting_position = starting_term && all_perms.index(starting_term)
        # start from starting_position if we have it or from start of array (pos 0) if not
        all_perms[starting_position.to_i..-1]
      end

      # Runs #search_for_entities_for_term for each search term and saves
      # every entity it yields. The term being processed is stored in the
      # 'starting_term' variable before its search starts, and that variable
      # is cleared once every term has been processed. The first term is
      # options[:starting_term] or, failing that, the stored 'starting_term'.
      def fetch_data_via_alpha_search(options={})
        starting_term = options[:starting_term]||get_var('starting_term')
        each_search_term(starting_term) do |term|
          save_var('starting_term', term)
          search_for_entities_for_term(term, options) do |entity_datum|
            save_entity(entity_datum)
          end
        end
        # reset pointer
        save_var('starting_term',nil)
      end

      # Iterates through each search term, yielding it to the block if one is
      # given. Returns the array of search terms either way.
      def each_search_term(starting_term=nil)
        terms = alpha_terms(starting_term)
        terms.each { |term| yield term } if block_given?
        terms
      end

      # The characters search terms are built from: A-Z followed by 0-9.
      def letters_and_numbers
        ('A'..'Z').to_a + ('0'..'9').to_a
      end

      # Length of each search term: the NUMBER_OF_CHARS_IN_SEARCH constant of
      # the receiver if it defines one, otherwise 1.
      def numbers_of_chars_in_search
        self.const_defined?('NUMBER_OF_CHARS_IN_SEARCH') ? self.const_get('NUMBER_OF_CHARS_IN_SEARCH') : 1
      end

      # To be overridden: should search for +term+ and yield a company data
      # Hash for each entity found. Raises unless overridden.
      def search_for_entities_for_term(term, options={})
        raise "The #search_for_entities_for_term method has not been implemented for this case.\nIt needs to be, and should yield a company data Hash"
      end

      # Pages through the search results for +prefix+, starting at
      # +search_offset+ and advancing 10 at a time, passing each page that
      # contains company links to #scrape_search_results_page and recording
      # the offset in the 'search_offset' variable. Stops at the first page
      # without company links.
      #
      # A page that cannot be fetched is retried, but only up to
      # MAX_CONSECUTIVE_FETCH_FAILURES times in a row; after that the method
      # gives up instead of looping forever on a page that keeps failing.
      #
      # NOTE(review): the URL is specific to one registry (www.oera.li), and
      # open(url) relies on open-uri, which is not required in this file -
      # confirm this method belongs in this generic helper.
      def get_results_and_extract_data_for(prefix, search_offset)
        consecutive_failures = 0
        while search_offset do
          url = "http://www.oera.li/WebServices/ZefixFL/ZefixFL.asmx/SearchFirm?name=#{prefix}%20&suche_nach=-&rf=&sitz=&id=&language=&phonetisch=no&posMin=#{search_offset}"
          response =
            begin
              open(url).read.encode!('utf-8','iso-8859-1')
            rescue StandardError, Timeout::Error => e
              # Timeout::Error is listed explicitly because it is not a
              # StandardError on every Ruby version.
              puts "Problem getting/parsing data from #{url}: #{e.inspect}"
              nil
            end
          unless response
            consecutive_failures += 1
            if consecutive_failures >= MAX_CONSECUTIVE_FETCH_FAILURES
              puts "Giving up on #{url} after #{consecutive_failures} consecutive failures"
              break
            end
            next
          end
          consecutive_failures = 0
          if response.match(/webservices\/HRG/) # check has links to companies
            puts "****Scraping page #{(search_offset+10)/10}"
            scrape_search_results_page(response, url)
            save_var('search_offset', search_offset)
            search_offset += 10
          else
            search_offset = false
          end
        end
      end

    end
  end
end
@@ -0,0 +1,33 @@
1
+ # encoding: UTF-8
2
+ require 'date'
3
module OpencBot
  module Helpers
    module Dates
      extend self

      # Matches month/day/year at the start of a string (leading whitespace
      # allowed), with a 4- or 2-digit year.
      AMERICAN_DATE_RE = %r{\A\s*(\d{1,2})/(\d{1,2})/(\d{4}|\d{2})}.freeze

      # Converts a UK-style (day first) date to an ISO 8601 date string.
      #
      # Returns nil when +raw_date+ is nil or blank. Strings are parsed with
      # Date.parse; non-String values are simply converted with #to_s.
      #
      # Dates written as d/m/y (where m may be a number or a word) have their
      # slashes turned into dashes and any whitespace removed. A two-digit
      # year starting with 0 or 1 is then taken to be 20xx, and one starting
      # with 9 to be 19xx. Other strings are handed to Date.parse unchanged.
      def normalise_uk_date(raw_date)
        return if raw_date.nil? || raw_date.to_s.strip.empty?
        if raw_date.is_a?(String)
          compacted = raw_date.gsub(/\s+/, '')
          # Convert the whitespace-free version, i.e. the same string the
          # pattern was matched against. Otherwise stray whitespace
          # (" 23/04/06") defeats the ^...$ anchored century expansion below
          # and the date is parsed year-first.
          cleaned_up_date =
            if compacted.match(/^\d+\/[\d\w]+\/\d+$/)
              compacted.gsub('/', '-')
            else
              raw_date
            end
          with_century = cleaned_up_date.
            sub(/^(\d{1,2}-)([\w\d]+-)([01]\d)$/, '\1\220\3').
            sub(/^(\d{1,2}-)([\w\d]+-)([9]\d)$/, '\1\219\3')
          raw_date = to_date(with_century)
        end
        raw_date.to_s
      end

      # Converts a US-style (month first) date to an ISO 8601 date string.
      #
      # Returns nil when +raw_date+ is nil or blank. A leading m/d/y is
      # rearranged to y-m-d before being parsed with Date.parse.
      def normalise_us_date(raw_date)
        return if raw_date.nil? || raw_date.to_s.strip.empty?
        # we want to set century to 19 if there's none set and the years are in the 20s or later
        with_century = raw_date.to_s.sub(/^(\s*\d{1,2}[\/-]\d{1,2}[\/-])([2-9]\d)$/, '\119\2')
        iso_date = with_century.sub(AMERICAN_DATE_RE) do
          parts = Regexp.last_match
          "#{parts[3]}-#{parts[1]}-#{parts[2]}"
        end
        to_date(iso_date, true).to_s
      end

      private

      # Parses +date+ with Date.parse, passing +comp+ through as its second
      # argument. Returns nil when +date+ is nil.
      def to_date(date, comp=false)
        return if date.nil?
        Date.parse(date, comp)
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
+ module OpencBot
2
+ module Helpers
3
+ module Html
4
+ # This module will eventually hold some helper methods for
5
+ # dealing with HTML content
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,106 @@
1
+ # encoding: UTF-8
2
+ require 'openc_bot/helpers/register_methods'
3
+
4
module OpencBot
  module Helpers
    module IncrementalSearch

      include OpencBot::Helpers::RegisterMethods

      # Gets new records using an incremental search.
      #
      # Starts from options[:highest_entry_uids] if given, otherwise from
      # #highest_entry_uids, and returns without doing anything when neither
      # yields a value. Runs #incremental_search once per starting uid and
      # stores the resulting uids in the :highest_entry_uids variable.
      #
      # The caller's +options+ hash is not modified.
      def fetch_data_via_incremental_search(options={})
        options = options.dup
        old_highest_numbers = options.delete(:highest_entry_uids) || highest_entry_uids
        return unless old_highest_numbers
        # offset by rewind count if set and also in that case assume by default we want to skip existing entries
        rewind_count = incremental_rewind_count
        options = {:offset => (0 - rewind_count), :skip_existing_entries => true}.merge(options) if rewind_count
        new_highest_numbers = old_highest_numbers.map do |old_highest_number|
          incremental_search(old_highest_number, options)
        end
        save_var(:highest_entry_uids, new_highest_numbers)
      end

      # Returns the highest known uids, or nil when there are none.
      #
      # Uses the stored 'highest_entry_uids' variable when it is present and
      # complete; otherwise looks up the highest uid for each of
      # #entity_uid_prefixes, dropping prefixes for which nothing was found.
      #
      # +force_get+ is accepted but currently unused.
      def highest_entry_uids(force_get = false)
        bad_results = []
        results = get_var('highest_entry_uids')
        if results.nil? || results.empty? || (results.is_a?(Array) && results.any?{ |r| r.nil? || r.empty? })
          results = entity_uid_prefixes.map do |prefix|
            hcn = highest_entry_uid_result(:prefix => prefix)
            bad_results << prefix if (hcn.nil? || hcn.empty?)
            hcn
          end
        end
        results.compact! unless bad_results.empty?
        return results unless results.empty?
      end

      # Returns the highest value of the primary key in the ocdata table,
      # ordering numerically. options[:prefix] or options[:suffix] restrict
      # the search to keys starting or ending with that string; with a prefix
      # the ordering uses the part of the key after the prefix.
      #
      # Returns nil when the table exists but no row matches, and
      # "<prefix>0" when the table does not exist yet.
      def highest_entry_uid_result(options={})
        if options[:prefix]
          sql_query = ["ocdata.#{primary_key_name} FROM ocdata WHERE #{primary_key_name} LIKE ? ORDER BY cast(substr(#{primary_key_name},?) as real) DESC LIMIT 1", ["#{options[:prefix]}%", options[:prefix].length + 1]]
        elsif options[:suffix]
          sql_query = ["ocdata.#{primary_key_name} FROM ocdata WHERE #{primary_key_name} LIKE ? ORDER BY cast(#{primary_key_name} as real) DESC LIMIT 1", "%#{options[:suffix]}"]
        else
          sql_query = "ocdata.#{primary_key_name} FROM ocdata ORDER BY cast(#{primary_key_name} as real) DESC LIMIT 1"
        end
        # The query may return no rows (e.g. nothing stored for this prefix
        # yet), in which case there is no highest uid to report.
        row = select(*sql_query).first
        row && row[primary_key_name.to_s]
      rescue SqliteMagic::NoSuchTable
        # first run, so no table or database yet
        return "#{options[:prefix]}0"
      end

      # INCREMENTAL_REWIND_COUNT constant of the receiver if defined, else nil.
      def incremental_rewind_count
        self.const_defined?('INCREMENTAL_REWIND_COUNT') ? self.const_get('INCREMENTAL_REWIND_COUNT') : nil
      end

      # ENTITY_UID_PREFIXES constant of the receiver if defined, else [nil].
      def entity_uid_prefixes
        self.const_defined?('ENTITY_UID_PREFIXES') ? self.const_get('ENTITY_UID_PREFIXES') : [nil]
      end

      # ENTITY_UID_SUFFIXES constant of the receiver if defined, else [nil].
      def entity_uid_suffixes
        self.const_defined?('ENTITY_UID_SUFFIXES') ? self.const_get('ENTITY_UID_SUFFIXES') : [nil]
      end

      # Walks upwards from +uid+ (shifted by options[:offset] if given),
      # calling #update_datum for each uid in turn, until more than
      # #max_failed_count consecutive uids have failed. When
      # options[:skip_existing_entries] is set, uids for which #datum_exists?
      # is true are skipped without being updated.
      #
      # Returns the last uid that was updated successfully (as a String), or
      # a copy of the original +uid+ if none was.
      #
      # +options+ is only read, never modified: the same hash is passed in
      # for every uid prefix by #fetch_data_via_incremental_search, so
      # removing :skip_existing_entries here would switch the behaviour off
      # for every prefix after the first.
      def incremental_search(uid, options={})
        first_number = uid.dup
        current_number = nil # set up outside of loop
        error_count = 0
        last_good_co_no = nil
        skip_existing_entries = options[:skip_existing_entries]
        # start at given number but offset by given amount. i.e. by offset
        uid = increment_number(uid, options[:offset]) if options[:offset]
        loop do
          current_number = uid
          if skip_existing_entries && datum_exists?(uid)
            uid = increment_number(uid)
            error_count = 0 # reset error count
            next
          elsif update_datum(current_number, false)
            last_good_co_no = current_number
            error_count = 0 # reset error count
          else
            error_count += 1
            puts "Failed to find company with uid #{current_number}. Error count: #{error_count}" if verbose?
            break if error_count > max_failed_count
          end
          uid = increment_number(uid)
        end
        # return orig uid if we haven't had any new entities
        last_good_co_no ? last_good_co_no.to_s : first_number
      end

      # Adds +increment_amount+ (default 1) to the first run of digits in
      # +uid+ and returns the resulting String. The digits keep their
      # original width (zero-padded), except that when decrementing a number
      # without a leading zero the width is allowed to shrink.
      def increment_number(uid,increment_amount=1)
        uid.to_s.sub(/\d+/) do |digits|
          incremented_number = digits.to_i + increment_amount
          width = digits.length
          width = incremented_number.to_s.length if increment_amount < 0 && !digits[/^0/]
          sprintf("%0#{width}d", incremented_number)
        end
      end

      # MAX_FAILED_COUNT constant of the receiver if defined, otherwise 10.
      def max_failed_count
        self.const_defined?('MAX_FAILED_COUNT') ? self.const_get('MAX_FAILED_COUNT') : 10
      end

    end
  end
end