openc_bot 0.0.27 → 0.0.46
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -13
- data/.travis.yml +1 -0
- data/Gemfile +2 -0
- data/lib/openc_bot.rb +16 -1
- data/lib/openc_bot/company_fetcher_bot.rb +50 -0
- data/lib/openc_bot/helpers/register_methods.rb +59 -15
- data/lib/openc_bot/tasks.rb +15 -2
- data/lib/openc_bot/templates/spec/spec_helper.rb +4 -0
- data/lib/openc_bot/version.rb +1 -1
- data/openc_bot.gemspec +6 -3
- data/spec/lib/company_fetcher_bot_spec.rb +23 -0
- data/spec/lib/helpers/register_methods_spec.rb +120 -2
- data/spec/lib/openc_bot_spec.rb +14 -1
- data/spec/simple_openc_bot_spec.rb +1 -0
- data/spec/spec_helper.rb +3 -1
- metadata +43 -220
- data/schemas/.gitignore +0 -13
- data/schemas/.travis.yml +0 -5
- data/schemas/Gemfile +0 -9
- data/schemas/README.md +0 -2
- data/schemas/Rakefile +0 -46
- data/schemas/lib/base-statement.json +0 -22
- data/schemas/lib/snippets/financial-payment-base.json +0 -4
- data/schemas/lib/snippets/licence-base.json +0 -4
- data/schemas/schemas/company-schema.json +0 -183
- data/schemas/schemas/financial-payment-schema.json +0 -27
- data/schemas/schemas/includes/address.json +0 -23
- data/schemas/schemas/includes/alternative_name.json +0 -13
- data/schemas/schemas/includes/company.json +0 -16
- data/schemas/schemas/includes/filing.json +0 -20
- data/schemas/schemas/includes/financial-payment-data-object.json +0 -60
- data/schemas/schemas/includes/industry_code.json +0 -16
- data/schemas/schemas/includes/licence-data-object.json +0 -36
- data/schemas/schemas/includes/officer.json +0 -50
- data/schemas/schemas/includes/previous_name.json +0 -14
- data/schemas/schemas/includes/share-parcel-data.json +0 -67
- data/schemas/schemas/includes/share-parcel.json +0 -63
- data/schemas/schemas/includes/subsidiary-relationship-data.json +0 -47
- data/schemas/schemas/includes/total-shares.json +0 -10
- data/schemas/schemas/licence-schema.json +0 -27
- data/schemas/schemas/primary-data-schema.json +0 -17
- data/schemas/schemas/share-parcel-schema.json +0 -21
- data/schemas/schemas/simple-financial-payment-schema.json +0 -85
- data/schemas/schemas/simple-licence-schema.json +0 -59
- data/schemas/schemas/simple-subsidiary-schema.json +0 -68
- data/schemas/schemas/subsidiary-relationship-schema.json +0 -27
- data/schemas/spec/sample-data/invalid/company-01.json +0 -4
- data/schemas/spec/sample-data/invalid/company-02.json +0 -5
- data/schemas/spec/sample-data/invalid/company-03.json +0 -5
- data/schemas/spec/sample-data/invalid/company-04.json +0 -5
- data/schemas/spec/sample-data/invalid/company-05.json +0 -5
- data/schemas/spec/sample-data/invalid/company-06.json +0 -6
- data/schemas/spec/sample-data/invalid/company-07.json +0 -8
- data/schemas/spec/sample-data/invalid/company-08.json +0 -7
- data/schemas/spec/sample-data/invalid/company-09.json +0 -9
- data/schemas/spec/sample-data/invalid/company-10.json +0 -7
- data/schemas/spec/sample-data/invalid/company-11.json +0 -9
- data/schemas/spec/sample-data/invalid/company-12.json +0 -11
- data/schemas/spec/sample-data/invalid/company-13.json +0 -11
- data/schemas/spec/sample-data/invalid/company-14.json +0 -7
- data/schemas/spec/sample-data/invalid/company-15.json +0 -7
- data/schemas/spec/sample-data/invalid/company-16.json +0 -7
- data/schemas/spec/sample-data/invalid/company-17.json +0 -9
- data/schemas/spec/sample-data/invalid/company-18.json +0 -9
- data/schemas/spec/sample-data/invalid/company-19.json +0 -9
- data/schemas/spec/sample-data/invalid/company-20.json +0 -9
- data/schemas/spec/sample-data/invalid/company-21.json +0 -11
- data/schemas/spec/sample-data/invalid/company-22.json +0 -11
- data/schemas/spec/sample-data/invalid/company-23.json +0 -7
- data/schemas/spec/sample-data/invalid/company-24.json +0 -12
- data/schemas/spec/sample-data/invalid/company-25.json +0 -9
- data/schemas/spec/sample-data/invalid/company-26.json +0 -11
- data/schemas/spec/sample-data/invalid/company-27.json +0 -7
- data/schemas/spec/sample-data/invalid/company-28.json +0 -9
- data/schemas/spec/sample-data/invalid/company-29.json +0 -12
- data/schemas/spec/sample-data/invalid/company-30.json +0 -16
- data/schemas/spec/sample-data/invalid/company-31.json +0 -14
- data/schemas/spec/sample-data/invalid/company-32.json +0 -11
- data/schemas/spec/sample-data/invalid/company-33.json +0 -7
- data/schemas/spec/sample-data/invalid/company-34.json +0 -9
- data/schemas/spec/sample-data/invalid/company-35.json +0 -9
- data/schemas/spec/sample-data/invalid/company-36.json +0 -10
- data/schemas/spec/sample-data/invalid/company-37.json +0 -7
- data/schemas/spec/sample-data/invalid/company-38.json +0 -9
- data/schemas/spec/sample-data/invalid/company-39.json +0 -11
- data/schemas/spec/sample-data/invalid/company-40.json +0 -12
- data/schemas/spec/sample-data/invalid/company-41.json +0 -12
- data/schemas/spec/sample-data/invalid/company-42.json +0 -7
- data/schemas/spec/sample-data/invalid/company-43.json +0 -9
- data/schemas/spec/sample-data/invalid/company-44.json +0 -11
- data/schemas/spec/sample-data/invalid/company-45.json +0 -11
- data/schemas/spec/sample-data/invalid/company-46.json +0 -7
- data/schemas/spec/sample-data/invalid/company-47.json +0 -9
- data/schemas/spec/sample-data/invalid/company-48.json +0 -9
- data/schemas/spec/sample-data/invalid/company-49.json +0 -9
- data/schemas/spec/sample-data/invalid/company-50.json +0 -9
- data/schemas/spec/sample-data/invalid/company-51.json +0 -9
- data/schemas/spec/sample-data/invalid/company-52.json +0 -9
- data/schemas/spec/sample-data/invalid/company-53.json +0 -10
- data/schemas/spec/sample-data/invalid/company-54.json +0 -9
- data/schemas/spec/sample-data/invalid/company-55.json +0 -9
- data/schemas/spec/sample-data/invalid/company-56.json +0 -7
- data/schemas/spec/sample-data/invalid/company-57.json +0 -7
- data/schemas/spec/sample-data/invalid/company-58.json +0 -7
- data/schemas/spec/sample-data/invalid/company-59.json +0 -13
- data/schemas/spec/sample-data/invalid/company-60.json +0 -7
- data/schemas/spec/sample-data/invalid/company-61.json +0 -7
- data/schemas/spec/sample-data/invalid/company-62.json +0 -9
- data/schemas/spec/sample-data/invalid/company-63.json +0 -12
- data/schemas/spec/sample-data/invalid/company-64.json +0 -14
- data/schemas/spec/sample-data/invalid/company-65.json +0 -14
- data/schemas/spec/sample-data/invalid/company-66.json +0 -13
- data/schemas/spec/sample-data/invalid/company-67.json +0 -14
- data/schemas/spec/sample-data/invalid/company-68.json +0 -12
- data/schemas/spec/sample-data/invalid/company-69.json +0 -12
- data/schemas/spec/sample-data/invalid/company-70.json +0 -14
- data/schemas/spec/sample-data/invalid/financial-payment-01.json +0 -24
- data/schemas/spec/sample-data/invalid/licence-01.json +0 -18
- data/schemas/spec/sample-data/invalid/licence-02.json +0 -18
- data/schemas/spec/sample-data/invalid/licence-03.json +0 -12
- data/schemas/spec/sample-data/invalid/licence-04.json +0 -18
- data/schemas/spec/sample-data/invalid/licence-05.json +0 -18
- data/schemas/spec/sample-data/invalid/licence-06.json +0 -18
- data/schemas/spec/sample-data/invalid/licence-07.json +0 -20
- data/schemas/spec/sample-data/invalid/licence-08.json +0 -21
- data/schemas/spec/sample-data/invalid/primary-data-01.json +0 -4
- data/schemas/spec/sample-data/invalid/primary-data-02.json +0 -4
- data/schemas/spec/sample-data/invalid/simple-licence-01.json +0 -9
- data/schemas/spec/sample-data/invalid/simple-licence-02.json +0 -8
- data/schemas/spec/sample-data/invalid/simple-licence-03.json +0 -9
- data/schemas/spec/sample-data/invalid/simple-licence-04.json +0 -10
- data/schemas/spec/sample-data/invalid/simple-licence-05.json +0 -10
- data/schemas/spec/sample-data/invalid/simple-licence-06.json +0 -10
- data/schemas/spec/sample-data/invalid/simple-subsidiary-01.json +0 -13
- data/schemas/spec/sample-data/invalid/simple-subsidiary-02.json +0 -13
- data/schemas/spec/sample-data/licence-schema.json.old +0 -21
- data/schemas/spec/sample-data/valid/company-01.json +0 -6
- data/schemas/spec/sample-data/valid/company-02.json +0 -7
- data/schemas/spec/sample-data/valid/company-03.json +0 -8
- data/schemas/spec/sample-data/valid/company-04.json +0 -7
- data/schemas/spec/sample-data/valid/company-05.json +0 -7
- data/schemas/spec/sample-data/valid/company-06.json +0 -12
- data/schemas/spec/sample-data/valid/company-07.json +0 -9
- data/schemas/spec/sample-data/valid/company-08.json +0 -9
- data/schemas/spec/sample-data/valid/company-09.json +0 -20
- data/schemas/spec/sample-data/valid/company-10.json +0 -9
- data/schemas/spec/sample-data/valid/company-11.json +0 -7
- data/schemas/spec/sample-data/valid/company-12.json +0 -7
- data/schemas/spec/sample-data/valid/company-13.json +0 -7
- data/schemas/spec/sample-data/valid/company-14.json +0 -15
- data/schemas/spec/sample-data/valid/company-15.json +0 -8
- data/schemas/spec/sample-data/valid/company-16.json +0 -9
- data/schemas/spec/sample-data/valid/company-17.json +0 -9
- data/schemas/spec/sample-data/valid/company-18.json +0 -9
- data/schemas/spec/sample-data/valid/company-19.json +0 -37
- data/schemas/spec/sample-data/valid/company-20.json +0 -9
- data/schemas/spec/sample-data/valid/company-21.json +0 -26
- data/schemas/spec/sample-data/valid/company-22.json +0 -20
- data/schemas/spec/sample-data/valid/company-23.json +0 -9
- data/schemas/spec/sample-data/valid/company-24.json +0 -12
- data/schemas/spec/sample-data/valid/company-25.json +0 -12
- data/schemas/spec/sample-data/valid/company-26.json +0 -12
- data/schemas/spec/sample-data/valid/company-27.json +0 -28
- data/schemas/spec/sample-data/valid/company-28.json +0 -9
- data/schemas/spec/sample-data/valid/company-29.json +0 -10
- data/schemas/spec/sample-data/valid/company-30.json +0 -9
- data/schemas/spec/sample-data/valid/company-31.json +0 -17
- data/schemas/spec/sample-data/valid/company-32.json +0 -9
- data/schemas/spec/sample-data/valid/company-33.json +0 -29
- data/schemas/spec/sample-data/valid/company-34.json +0 -9
- data/schemas/spec/sample-data/valid/company-35.json +0 -9
- data/schemas/spec/sample-data/valid/company-36.json +0 -9
- data/schemas/spec/sample-data/valid/company-37.json +0 -9
- data/schemas/spec/sample-data/valid/company-38.json +0 -9
- data/schemas/spec/sample-data/valid/company-39.json +0 -9
- data/schemas/spec/sample-data/valid/company-40.json +0 -9
- data/schemas/spec/sample-data/valid/company-41.json +0 -9
- data/schemas/spec/sample-data/valid/company-42.json +0 -10
- data/schemas/spec/sample-data/valid/company-43.json +0 -7
- data/schemas/spec/sample-data/valid/company-44.json +0 -7
- data/schemas/spec/sample-data/valid/company-45.json +0 -23
- data/schemas/spec/sample-data/valid/company-46.json +0 -7
- data/schemas/spec/sample-data/valid/company-47.json +0 -12
- data/schemas/spec/sample-data/valid/company-48.json +0 -7
- data/schemas/spec/sample-data/valid/company-49.json +0 -14
- data/schemas/spec/sample-data/valid/company-50.json +0 -13
- data/schemas/spec/sample-data/valid/company-51.json +0 -14
- data/schemas/spec/sample-data/valid/company-52.json +0 -12
- data/schemas/spec/sample-data/valid/company-53.json +0 -9
- data/schemas/spec/sample-data/valid/financial-payment-01.json +0 -25
- data/schemas/spec/sample-data/valid/financial-payment-02.json +0 -29
- data/schemas/spec/sample-data/valid/licence-01.json +0 -19
- data/schemas/spec/sample-data/valid/licence-02.json +0 -21
- data/schemas/spec/sample-data/valid/licence-03.json +0 -21
- data/schemas/spec/sample-data/valid/licence-04.json +0 -26
- data/schemas/spec/sample-data/valid/primary-data-01.json +0 -4
- data/schemas/spec/sample-data/valid/primary-data-02.json +0 -5
- data/schemas/spec/sample-data/valid/simple-licence-01.json +0 -10
- data/schemas/spec/sample-data/valid/simple-licence-02.json +0 -10
- data/schemas/spec/sample-data/valid/simple-licence-03.json +0 -12
- data/schemas/spec/sample-data/valid/simple-subsidiary-01.json +0 -13
- data/schemas/spec/sample-data/valid/simple-subsidiary-02.json +0 -13
- data/schemas/spec/sample-data/valid/subsidiary-relationship-01.json +0 -23
- data/schemas/spec/spec_helper.rb +0 -78
- data/schemas/spec/validation_spec.rb +0 -39
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
MTQ0OGM2ZWZjOWYwNzQ5MGQ3Y2YxZDRiOGYyM2FiY2Y4MzBjNDIzZQ==
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 1db3c143f46fc934729ee27c6cc5b4047fb2a5c5
|
4
|
+
data.tar.gz: 4c2de3f8f0ecc62f77689386dc6e50ed26290714
|
7
5
|
SHA512:
|
8
|
-
metadata.gz:
|
9
|
-
|
10
|
-
ZGUxNDc2ZTZkNzlkOTYzOWFlYTRkZDM5ZDgwMTRmYmE0ZTM3ZTAxMWIwNGFm
|
11
|
-
YTA4MTgxNDc4OGI2OWRhZTk0NmQ3ODc4MmY5NWE4YmE4YzRlMjc=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
MjQ2NmFhOGY4NDFjZDE2NGJjNTlkNGUwNWJjMzUyMDJjNGM4YjBiMGYxMzMw
|
14
|
-
YTE4ZmIxZmE0YTU0N2Y1NWE0NDU4ZGUzZjc1ODExZmZmZDAxNmZmZWMzY2Qx
|
15
|
-
MWQyNWU4NmEzYmQ5MjdiYzIxYTFlYTkyZjMzMWZjYWY0NjkwYWM=
|
6
|
+
metadata.gz: 6046b31e46416716606c0540ab60cc3b4c20d53043b7ec5701103dde77e47ccf6d0abb77e284bae0928dcaf66471d025c83bb51fbc6591c882ea401263cd4ae7
|
7
|
+
data.tar.gz: 22bd41bdd6639ea13f9d4787204fc7f13321d0c08ddf48748354b89a88bef8c0a73dda93f47e73cc20589332a7f6940400efa4998cc17c5ca7ebe8445222e65f
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
source 'https://rubygems.org'
|
2
2
|
gem "sqlite_magic", :git => 'https://github.com/openc/sqlite_magic.git'
|
3
|
+
|
3
4
|
gem "pry", :group => [:development,:test]
|
4
5
|
# Specify your gem's dependencies in openc_bot.gemspec
|
5
6
|
gemspec
|
6
7
|
|
8
|
+
|
7
9
|
# we need to do pull request and bump version
|
8
10
|
# gem 'scraperwiki', '>=3.0.2', :git => 'https://github.com/openc/scraperwiki-ruby.git'
|
data/lib/openc_bot.rb
CHANGED
@@ -80,10 +80,25 @@ module OpencBot
|
|
80
80
|
end
|
81
81
|
end
|
82
82
|
|
83
|
+
def db_location
|
84
|
+
File.expand_path(File.join(@@app_directory, 'db', db_name))
|
85
|
+
end
|
86
|
+
|
83
87
|
# Override default in ScraperWiki gem
|
84
88
|
def sqlite_magic_connection
|
85
89
|
db = @config ? @config[:db] : File.expand_path(File.join(@@app_directory, 'db', db_name))
|
86
|
-
|
90
|
+
options = sqlite_busy_timeout ? {:busy_timeout => sqlite_busy_timeout} : {:busy_timeout => 10000}
|
91
|
+
@sqlite_magic_connection ||= SqliteMagic::Connection.new(db, options)
|
92
|
+
end
|
93
|
+
|
94
|
+
def sqlite_busy_timeout
|
95
|
+
self.const_defined?('SQLITE_BUSY_TIMEOUT') && self.const_get('SQLITE_BUSY_TIMEOUT')
|
96
|
+
end
|
97
|
+
|
98
|
+
def table_summary
|
99
|
+
field_names = sqlite_magic_connection.execute('PRAGMA table_info(ocdata)').collect{|c| c['name']}
|
100
|
+
select_sql = "COUNT(1) Total, " + field_names.collect{ |fn| "COUNT(#{fn}) #{fn}_not_null" }.join(', ') + " FROM ocdata"
|
101
|
+
select(select_sql).first
|
87
102
|
end
|
88
103
|
|
89
104
|
end
|
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'openc_bot'
|
2
2
|
require 'openc_bot/helpers/incremental_search'
|
3
3
|
require 'openc_bot/helpers/alpha_search'
|
4
|
+
# require 'openc_bot/asana_notifier'
|
5
|
+
require 'mail'
|
4
6
|
|
5
7
|
|
6
8
|
module OpencBot
|
@@ -9,6 +11,8 @@ module OpencBot
|
|
9
11
|
include OpencBot::Helpers::IncrementalSearch
|
10
12
|
include OpencBot::Helpers::AlphaSearch
|
11
13
|
|
14
|
+
STDOUT.sync = true
|
15
|
+
STDERR.sync = true
|
12
16
|
# This is called by #update_datum
|
13
17
|
def fetch_datum(company_number)
|
14
18
|
company_page = fetch_registry_page(company_number)
|
@@ -42,5 +46,51 @@ module OpencBot
|
|
42
46
|
super || 'company-schema'
|
43
47
|
end
|
44
48
|
|
49
|
+
def update_data(options={})
|
50
|
+
fetch_data
|
51
|
+
update_stale
|
52
|
+
send_run_report
|
53
|
+
rescue Exception => e
|
54
|
+
send_error_report(e)
|
55
|
+
raise e
|
56
|
+
end
|
57
|
+
|
58
|
+
private
|
59
|
+
def mark_bot_as_failing_on_asana(exception)
|
60
|
+
# error_description = "Code for this bot: https://github.com/openc/external_bots/tree/master/#{inferred_jurisdiction_code}_companies_fetcher\nError details: #{exception.inspect}.\nBacktrace:\n#{exception.backtrace}"
|
61
|
+
# params = {
|
62
|
+
# :tag => inferred_jurisdiction_code,
|
63
|
+
# :asana_api_key => ENV['ASANA_API_KEY'],
|
64
|
+
# :workspace => ENV['ASANA_WORKSPACE'],
|
65
|
+
# :title => exception.message,
|
66
|
+
# :description => error_description
|
67
|
+
# }
|
68
|
+
# AsanaNotifier.create_failed_bot_task(params)
|
69
|
+
end
|
70
|
+
|
71
|
+
def send_error_report(e)
|
72
|
+
subject = "Error running #{self.name}: #{e}"
|
73
|
+
body = "Error details: #{e.inspect}.\nBacktrace:\n#{e.backtrace}"
|
74
|
+
mark_bot_as_failing_on_asana(e) if ENV['CREATE_ASANA_TASKS_FOR_BOT_FAILURES']
|
75
|
+
send_report(:subject => subject, :body => body)
|
76
|
+
end
|
77
|
+
|
78
|
+
def send_run_report
|
79
|
+
subject = "#{self.name} successfully ran"
|
80
|
+
db_filesize = File.size?(db_location)
|
81
|
+
body = "No problems to report. db is #{db_location}, #{db_filesize} bytes. Last modified: #{File.stat(db_location).mtime}"
|
82
|
+
send_report(:subject => subject, :body => body)
|
83
|
+
end
|
84
|
+
|
85
|
+
def send_report(params)
|
86
|
+
Mail.deliver do
|
87
|
+
from 'admin@opencorporates.com'
|
88
|
+
to 'bots@opencorporates.com'
|
89
|
+
subject params[:subject]
|
90
|
+
body params[:body]
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
|
45
95
|
end
|
46
96
|
end
|
@@ -16,6 +16,10 @@ module OpencBot
|
|
16
16
|
!!select("ocdata.#{primary_key_name} FROM ocdata WHERE #{primary_key_name} = ? LIMIT 1", uid).first
|
17
17
|
end
|
18
18
|
|
19
|
+
def default_stale_count
|
20
|
+
self.const_defined?('STALE_COUNT') ? self.const_get('STALE_COUNT') : 1000
|
21
|
+
end
|
22
|
+
|
19
23
|
# fetches and saves data. By default assumes an incremental search, or an alpha search
|
20
24
|
# if USE_ALPHA_SEARCH is set. This method should be overridden if you are going to do a
|
21
25
|
# different type of data import, e.g from a CSV file.
|
@@ -35,24 +39,25 @@ module OpencBot
|
|
35
39
|
end
|
36
40
|
|
37
41
|
def fetch_registry_page(company_number)
|
42
|
+
sleep_before_http_req
|
38
43
|
_client.get_content(registry_url(company_number))
|
39
44
|
end
|
40
45
|
|
41
46
|
def prepare_and_save_data(all_data,options={})
|
42
47
|
data_to_be_saved = prepare_for_saving(all_data)
|
43
|
-
fail_count, retry_interval = 0, 5
|
48
|
+
# fail_count, retry_interval = 0, 5
|
44
49
|
begin
|
45
50
|
insert_or_update([primary_key_name], data_to_be_saved)
|
46
51
|
rescue SQLite3::BusyException => e
|
47
|
-
fail_count += 1
|
48
|
-
if fail_count <= MAX_BUSY_RETRIES
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
else
|
54
|
-
|
55
|
-
end
|
52
|
+
# fail_count += 1
|
53
|
+
# if fail_count <= MAX_BUSY_RETRIES
|
54
|
+
puts "#{e.inspect} raised saving:\n#{all_data}\n\n" if verbose?
|
55
|
+
# sleep retry_interval
|
56
|
+
# retry_interval = retry_interval * 2
|
57
|
+
# retry
|
58
|
+
# else
|
59
|
+
raise e
|
60
|
+
# end
|
56
61
|
end
|
57
62
|
|
58
63
|
end
|
@@ -61,6 +66,10 @@ module OpencBot
|
|
61
66
|
self.const_defined?('PRIMARY_KEY_NAME') ? self.const_get('PRIMARY_KEY_NAME') : :uid
|
62
67
|
end
|
63
68
|
|
69
|
+
def raise_when_saving_invalid_record
|
70
|
+
!!self.const_defined?('RAISE_WHEN_SAVING_INVALID_RECORD')
|
71
|
+
end
|
72
|
+
|
64
73
|
# sensible default. Either uses computed version or registry_url in db
|
65
74
|
def registry_url(uid)
|
66
75
|
computed_registry_url(uid) || registry_url_from_db(uid)
|
@@ -94,7 +103,7 @@ module OpencBot
|
|
94
103
|
end
|
95
104
|
|
96
105
|
def stale_entry_uids(stale_count=nil)
|
97
|
-
stale_count ||=
|
106
|
+
stale_count ||= default_stale_count
|
98
107
|
sql_query = "ocdata.* from ocdata WHERE retrieved_at IS NULL OR strftime('%s', retrieved_at) < strftime('%s', '#{Date.today - 30}') LIMIT #{stale_count.to_i}"
|
99
108
|
raw_data = select(sql_query).each do |res|
|
100
109
|
yield res[primary_key_name.to_s]
|
@@ -108,6 +117,24 @@ module OpencBot
|
|
108
117
|
end
|
109
118
|
end
|
110
119
|
|
120
|
+
def get_raw_data(uid, format=nil)
|
121
|
+
file_location = raw_data_file_location(uid, format)
|
122
|
+
File.read(file_location) if File.exist?(file_location)
|
123
|
+
end
|
124
|
+
|
125
|
+
def save_raw_data(raw_data, uid, format=nil)
|
126
|
+
file_location = raw_data_file_location(uid, format)
|
127
|
+
File.open(file_location, 'w') { |f| f.print raw_data }
|
128
|
+
end
|
129
|
+
|
130
|
+
def raw_data_file_location(uid, format=nil)
|
131
|
+
normalised_uid = uid.gsub(/[^[[:alnum:]]]/,'')
|
132
|
+
directory = File.join(*([root_directory,'data',normalised_uid.gsub(/^0+/,'').split(//).first(5)].flatten))
|
133
|
+
FileUtils.mkdir_p(directory) unless Dir.exist?(directory)
|
134
|
+
filename = format ? "#{normalised_uid}.#{format}" : normalised_uid
|
135
|
+
File.join(directory, filename)
|
136
|
+
end
|
137
|
+
|
111
138
|
def update_data(options={})
|
112
139
|
fetch_data
|
113
140
|
update_stale
|
@@ -130,13 +157,14 @@ module OpencBot
|
|
130
157
|
# or, if output_as_json is requested then the validation error is included
|
131
158
|
# in the JSON error message
|
132
159
|
def update_datum(uid, output_as_json=false,replace_existing_data=false)
|
160
|
+
# XXX here we refuse to run depending on run algorithm
|
133
161
|
return unless raw_data = fetch_datum(uid)
|
134
162
|
default_options = {primary_key_name => uid, :retrieved_at => Time.now}
|
135
163
|
return unless base_processed_data = process_datum(raw_data)
|
136
164
|
processed_data = default_options.merge(base_processed_data)
|
137
165
|
# prepare the data for saving (converting Arrays, Hashes to json) and
|
138
166
|
# save the original data too, as we may not extracting everything from it yet
|
139
|
-
save_entity(processed_data.merge(:data => raw_data))
|
167
|
+
raise_when_saving_invalid_record ? save_entity!(processed_data.merge(:data => raw_data)) : save_entity(processed_data.merge(:data => raw_data))
|
140
168
|
if output_as_json
|
141
169
|
puts processed_data.to_json
|
142
170
|
else
|
@@ -152,11 +180,18 @@ module OpencBot
|
|
152
180
|
end
|
153
181
|
end
|
154
182
|
|
183
|
+
# at a rate of 1.16 companies per second, and allowing 12 hours
|
184
|
+
# running per day. a 3m register would be updated in 2 months:
|
185
|
+
MAX_STALE_COUNT = 100_000
|
155
186
|
def update_stale(stale_count=nil)
|
156
|
-
|
157
|
-
|
187
|
+
# XXX here set an arbitrarily large number and then rely on the system to stop
|
188
|
+
# XXX wrap this with timings to work out per-record rate
|
189
|
+
rate_limiter do |limiter|
|
190
|
+
stale_entry_uids(MAX_STALE_COUNT) do |stale_entry_uid|
|
191
|
+
update_datum(stale_entry_uid)
|
192
|
+
limiter.checkpoint
|
193
|
+
end
|
158
194
|
end
|
159
|
-
|
160
195
|
end
|
161
196
|
|
162
197
|
def validate_datum(record)
|
@@ -196,6 +231,15 @@ module OpencBot
|
|
196
231
|
prepared_data
|
197
232
|
end
|
198
233
|
|
234
|
+
def sleep_before_http_req
|
235
|
+
if self.const_defined?('SLEEP_BEFORE_HTTP_REQ')
|
236
|
+
sleep_time = self.const_get('SLEEP_BEFORE_HTTP_REQ')
|
237
|
+
puts "#{self.name} about to sleep for #{sleep_time} before fetching data. Time now: #{Time.now}" if verbose?
|
238
|
+
sleep(sleep_time)
|
239
|
+
puts "#{self.name} slept for #{sleep_time}: Time now #{Time.now}" if verbose?
|
240
|
+
end
|
241
|
+
end
|
242
|
+
|
199
243
|
def _client(options={})
|
200
244
|
return @client if @client
|
201
245
|
@client = HTTPClient.new(options.delete(:proxy))
|
data/lib/openc_bot/tasks.rb
CHANGED
@@ -3,6 +3,8 @@ require 'optparse'
|
|
3
3
|
require 'json'
|
4
4
|
require 'fileutils'
|
5
5
|
|
6
|
+
PID_DIR = "/oc/pids"
|
7
|
+
|
6
8
|
namespace :bot do
|
7
9
|
desc "create a skeleton bot that can be used in OpenCorporates"
|
8
10
|
task :create do
|
@@ -134,6 +136,17 @@ namespace :bot do
|
|
134
136
|
end
|
135
137
|
end
|
136
138
|
|
139
|
+
desc 'Lists count of non-null values in each field in ocdata table'
|
140
|
+
task :table_summary do
|
141
|
+
only_process_running('table_summary') do
|
142
|
+
bot_name = get_bot_name
|
143
|
+
require_relative File.join(Dir.pwd,'lib', bot_name)
|
144
|
+
runner = callable_from_file_name(bot_name)
|
145
|
+
res = runner.table_summary
|
146
|
+
res.each {|k,v| puts "#{k}:\t#{v}"}
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
137
150
|
desc 'Summarise data for quality checking (only works for licences at the moment)'
|
138
151
|
task :summarise_data do
|
139
152
|
def as_sorted_hash(name, data)
|
@@ -327,7 +340,7 @@ EOF
|
|
327
340
|
puts "Created #{new_file}"
|
328
341
|
end
|
329
342
|
end
|
330
|
-
|
343
|
+
|
331
344
|
#Add rspec debugger to gemfile
|
332
345
|
File.open(File.join(working_dir,'Gemfile'),'a') do |file|
|
333
346
|
file.puts "group :test do\n gem 'rspec'\n gem 'debugger'\nend"
|
@@ -341,7 +354,7 @@ EOF
|
|
341
354
|
end
|
342
355
|
|
343
356
|
def only_process_running(task_name)
|
344
|
-
pid_path = File.join(
|
357
|
+
pid_path = File.join(PID_DIR, 'pids', task_name)
|
345
358
|
|
346
359
|
raise_if_already_running(pid_path)
|
347
360
|
write_pid_file(pid_path)
|
data/lib/openc_bot/version.rb
CHANGED
data/openc_bot.gemspec
CHANGED
@@ -35,14 +35,17 @@ Gem::Specification.new do |gem|
|
|
35
35
|
gem.add_dependency "rake"
|
36
36
|
gem.add_dependency "activesupport", "4.1.4"
|
37
37
|
gem.add_dependency "nokogiri"
|
38
|
-
|
38
|
+
gem.add_dependency "sqlite_magic", "0.0.6"
|
39
39
|
gem.add_dependency "json"
|
40
40
|
gem.add_dependency "json-schema"
|
41
41
|
gem.add_dependency "httpclient"
|
42
42
|
gem.add_dependency "backports"
|
43
43
|
gem.add_dependency "scraperwiki", "3.0.2"
|
44
|
+
gem.add_dependency "mail"
|
45
|
+
# gem.add_dependency "openc-asana" unless RUBY_VERSION < '2.0'
|
44
46
|
|
45
|
-
gem.add_development_dependency "perftools.rb"
|
46
|
-
gem.add_development_dependency "
|
47
|
+
# gem.add_development_dependency "perftools.rb"
|
48
|
+
gem.add_development_dependency "byebug" unless RUBY_VERSION < '2.0'
|
49
|
+
gem.add_development_dependency "debugger" if RUBY_VERSION < '2.0'
|
47
50
|
gem.add_development_dependency "rspec"
|
48
51
|
end
|
@@ -3,6 +3,10 @@ require_relative '../spec_helper'
|
|
3
3
|
require 'openc_bot'
|
4
4
|
require 'openc_bot/company_fetcher_bot'
|
5
5
|
|
6
|
+
Mail.defaults do
|
7
|
+
delivery_method :test # no, don't send emails when testing
|
8
|
+
end
|
9
|
+
|
6
10
|
module TestCompaniesFetcher
|
7
11
|
extend OpencBot::CompanyFetcherBot
|
8
12
|
end
|
@@ -121,4 +125,23 @@ describe "A module that extends CompanyFetcherBot" do
|
|
121
125
|
end
|
122
126
|
end
|
123
127
|
end
|
128
|
+
|
129
|
+
describe '#update_data' do
|
130
|
+
|
131
|
+
before do
|
132
|
+
TestCompaniesFetcher.stub(:fetch_data_via_incremental_search)
|
133
|
+
TestCompaniesFetcher.stub(:update_stale)
|
134
|
+
#this can be any file that we can stat
|
135
|
+
TestCompaniesFetcher.stub(:db_location).
|
136
|
+
and_return(File.join(File.dirname(__FILE__),"company_fetcher_bot_spec.rb"))
|
137
|
+
|
138
|
+
Mail::TestMailer.deliveries.clear
|
139
|
+
TestCompaniesFetcher.update_data
|
140
|
+
end
|
141
|
+
|
142
|
+
it 'should send success email' do
|
143
|
+
Mail::TestMailer.deliveries.first.subject.should match /successfully ran/
|
144
|
+
end
|
145
|
+
|
146
|
+
end
|
124
147
|
end
|
@@ -8,6 +8,8 @@ module ModuleThatIncludesRegisterMethods
|
|
8
8
|
extend OpencBot::Helpers::RegisterMethods
|
9
9
|
PRIMARY_KEY_NAME = :custom_uid
|
10
10
|
SCHEMA_NAME = 'company-schema'
|
11
|
+
SLEEP_BEFORE_HTTP_REQ = 2
|
12
|
+
RAISE_WHEN_SAVING_INVALID_RECORD = true
|
11
13
|
end
|
12
14
|
|
13
15
|
module ModuleWithNoCustomPrimaryKey
|
@@ -29,7 +31,6 @@ describe 'a module that includes RegisterMethods' do
|
|
29
31
|
describe "#datum_exists?" do
|
30
32
|
before do
|
31
33
|
ModuleThatIncludesRegisterMethods.stub(:select).and_return([])
|
32
|
-
|
33
34
|
end
|
34
35
|
|
35
36
|
it "should select_data from database" do
|
@@ -233,10 +234,12 @@ describe 'a module that includes RegisterMethods' do
|
|
233
234
|
|
234
235
|
context 'and SQLite3::BusyException raised' do
|
235
236
|
it 'should retry up to 3 times' do
|
237
|
+
pending "deciding whether to allow this in some circumstances"
|
236
238
|
ModuleThatIncludesRegisterMethods.should_receive(:insert_or_update).exactly(4).times.and_raise(SQLite3::BusyException)
|
237
239
|
lambda { ModuleThatIncludesRegisterMethods.prepare_and_save_data(@params) }.should raise_error(SQLite3::BusyException)
|
238
240
|
end
|
239
241
|
it 'should not raise error if successful before limit' do
|
242
|
+
pending "deciding whether to allow this in some circumstances"
|
240
243
|
ModuleThatIncludesRegisterMethods.should_receive(:insert_or_update).exactly(3).times.ordered.and_raise(SQLite3::BusyException)
|
241
244
|
ModuleThatIncludesRegisterMethods.should_receive(:insert_or_update).ordered
|
242
245
|
lambda { ModuleThatIncludesRegisterMethods.prepare_and_save_data(@params) }.should_not raise_error
|
@@ -372,6 +375,15 @@ describe 'a module that includes RegisterMethods' do
|
|
372
375
|
end
|
373
376
|
end
|
374
377
|
|
378
|
+
context 'and errors returned validating data' do
|
379
|
+
it "should validate processed data" do
|
380
|
+
ModuleThatIncludesRegisterMethods.stub(:validate_datum).and_return([{:failed_attribute => 'foo', :message => 'Something not right'}])
|
381
|
+
lambda { ModuleThatIncludesRegisterMethods.update_datum(@uid)}.should raise_error
|
382
|
+
end
|
383
|
+
|
384
|
+
|
385
|
+
end
|
386
|
+
|
375
387
|
context 'and process_datum returns nil' do
|
376
388
|
before do
|
377
389
|
ModuleThatIncludesRegisterMethods.stub(:process_datum).and_return(nil)
|
@@ -405,11 +417,12 @@ describe 'a module that includes RegisterMethods' do
|
|
405
417
|
end
|
406
418
|
end
|
407
419
|
|
408
|
-
describe "#fetch_registry_page for
|
420
|
+
describe "#fetch_registry_page for uid" do
|
409
421
|
before do
|
410
422
|
@dummy_client = double('http_client', :get_content => nil)
|
411
423
|
ModuleThatIncludesRegisterMethods.stub(:_client).and_return(@dummy_client)
|
412
424
|
ModuleThatIncludesRegisterMethods.stub(:registry_url).and_return('http://some.registry.url')
|
425
|
+
@dummy_client.stub(:get_content).and_return(:registry_page_html)
|
413
426
|
end
|
414
427
|
|
415
428
|
it "should GET registry_page for registry_url for company_number" do
|
@@ -423,6 +436,24 @@ describe 'a module that includes RegisterMethods' do
|
|
423
436
|
@dummy_client.stub(:get_content).and_return(:registry_page_html)
|
424
437
|
ModuleThatIncludesRegisterMethods.fetch_registry_page('76543').should == :registry_page_html
|
425
438
|
end
|
439
|
+
|
440
|
+
context 'and SLEEP_BEFORE_HTTP_REQ is set' do
|
441
|
+
it 'should sleep for given period' do
|
442
|
+
ModuleThatIncludesRegisterMethods.should_receive(:sleep).with(2)
|
443
|
+
ModuleThatIncludesRegisterMethods.fetch_registry_page('76543')
|
444
|
+
end
|
445
|
+
end
|
446
|
+
|
447
|
+
context 'and SLEEP_BEFORE_HTTP_REQ is not set' do
|
448
|
+
before do
|
449
|
+
ModuleWithNoCustomPrimaryKey.stub(:_client).and_return(@dummy_client)
|
450
|
+
end
|
451
|
+
|
452
|
+
it 'should sleep for given period' do
|
453
|
+
ModuleWithNoCustomPrimaryKey.should_not_receive(:sleep)
|
454
|
+
ModuleWithNoCustomPrimaryKey.fetch_registry_page('76543')
|
455
|
+
end
|
456
|
+
end
|
426
457
|
end
|
427
458
|
|
428
459
|
describe "#validate_datum" do
|
@@ -594,4 +625,91 @@ describe 'a module that includes RegisterMethods' do
|
|
594
625
|
end
|
595
626
|
end
|
596
627
|
|
628
|
+
describe 'raise_when_saving_invalid_record' do
|
629
|
+
describe '#primary_key_name' do
|
630
|
+
it 'should return false if RAISE_WHEN_SAVING_INVALID_RECORD not set' do
|
631
|
+
ModuleWithNoCustomPrimaryKey.send(:raise_when_saving_invalid_record).should == false
|
632
|
+
end
|
633
|
+
|
634
|
+
it 'should return true if RAISE_WHEN_SAVING_INVALID_RECORD set' do
|
635
|
+
ModuleThatIncludesRegisterMethods.send(:raise_when_saving_invalid_record).should == true
|
636
|
+
end
|
637
|
+
end
|
638
|
+
end
|
639
|
+
|
640
|
+
describe '#raw_data_file_location for a uid' do
|
641
|
+
before do
|
642
|
+
@dummy_root_directory = File.join(File.dirname(__FILE__),'..','..','tmp')
|
643
|
+
Dir.mkdir(@dummy_root_directory) unless Dir.exist?(@dummy_root_directory)
|
644
|
+
|
645
|
+
ModuleThatIncludesRegisterMethods.stub(:root_directory).and_return(@dummy_root_directory)
|
646
|
+
end
|
647
|
+
|
648
|
+
after do
|
649
|
+
FileUtils.rmdir(File.join(@dummy_root_directory, 'data'))
|
650
|
+
end
|
651
|
+
|
652
|
+
it 'should return directory built from uid inside root data directory' do
|
653
|
+
ModuleThatIncludesRegisterMethods.raw_data_file_location('123456', 'html').should == File.join(@dummy_root_directory, 'data', '1','2','3','4','5', '123456.html')
|
654
|
+
end
|
655
|
+
|
656
|
+
it 'should create directory structure if it doesnt exist' do
|
657
|
+
ModuleThatIncludesRegisterMethods.raw_data_file_location('123456', 'html')
|
658
|
+
Dir.exist?(File.join(@dummy_root_directory, 'data', '1','2','3','4','5')).should == true
|
659
|
+
end
|
660
|
+
|
661
|
+
it 'should ignore leading zeroes when building directory' do
|
662
|
+
ModuleThatIncludesRegisterMethods.raw_data_file_location('001234', 'html').should == File.join(@dummy_root_directory, 'data', '1','2','3','4', '001234.html')
|
663
|
+
end
|
664
|
+
|
665
|
+
it 'should ignore non alphanum chars when building directory' do
|
666
|
+
ModuleThatIncludesRegisterMethods.raw_data_file_location('12a-b/3456', 'html').should == File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456.html')
|
667
|
+
end
|
668
|
+
|
669
|
+
it 'should allow format to be missing' do
|
670
|
+
ModuleThatIncludesRegisterMethods.raw_data_file_location('12a-b/3456').should == File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456')
|
671
|
+
end
|
672
|
+
|
673
|
+
it 'should allow format to be nil' do
|
674
|
+
ModuleThatIncludesRegisterMethods.raw_data_file_location('12a-b/3456', nil).should == File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456')
|
675
|
+
end
|
676
|
+
end
|
677
|
+
|
678
|
+
describe '#save_raw_data' do
|
679
|
+
before do
|
680
|
+
@dummy_root_directory = File.join(File.dirname(__FILE__),'..','..','tmp')
|
681
|
+
Dir.mkdir(@dummy_root_directory) unless Dir.exist?(@dummy_root_directory)
|
682
|
+
|
683
|
+
ModuleThatIncludesRegisterMethods.stub(:root_directory).and_return(@dummy_root_directory)
|
684
|
+
end
|
685
|
+
|
686
|
+
it 'should save raw data as in computed raw_data_file_location' do
|
687
|
+
ModuleThatIncludesRegisterMethods.save_raw_data('foo bar', '12a-b/3456', 'html')
|
688
|
+
File.read(File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456.html')).should == 'foo bar'
|
689
|
+
end
|
690
|
+
|
691
|
+
it 'should allow format to be missing' do
|
692
|
+
ModuleThatIncludesRegisterMethods.save_raw_data('foo bar', '12a-b/3456')
|
693
|
+
File.read(File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456')).should == 'foo bar'
|
694
|
+
end
|
695
|
+
end
|
696
|
+
|
697
|
+
describe '#get_raw_data' do
|
698
|
+
before do
|
699
|
+
@dummy_root_directory = File.join(File.dirname(__FILE__),'..','..','tmp')
|
700
|
+
Dir.mkdir(@dummy_root_directory) unless Dir.exist?(@dummy_root_directory)
|
701
|
+
|
702
|
+
ModuleThatIncludesRegisterMethods.stub(:root_directory).and_return(@dummy_root_directory)
|
703
|
+
end
|
704
|
+
|
705
|
+
it 'should read raw data in computed raw_data_file_location' do
|
706
|
+
File.open(File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456.html'),'w') { |f| f.print 'foo bar' }
|
707
|
+
ModuleThatIncludesRegisterMethods.get_raw_data('12a-b/3456', 'html').should == 'foo bar'
|
708
|
+
end
|
709
|
+
|
710
|
+
it 'should allow format to be missing' do
|
711
|
+
File.open(File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456'),'w') { |f| f.print 'foo bar' }
|
712
|
+
ModuleThatIncludesRegisterMethods.get_raw_data('12a-b/3456').should == 'foo bar'
|
713
|
+
end
|
714
|
+
end
|
597
715
|
end
|