openc_bot 0.0.27 → 0.0.46
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -13
- data/.travis.yml +1 -0
- data/Gemfile +2 -0
- data/lib/openc_bot.rb +16 -1
- data/lib/openc_bot/company_fetcher_bot.rb +50 -0
- data/lib/openc_bot/helpers/register_methods.rb +59 -15
- data/lib/openc_bot/tasks.rb +15 -2
- data/lib/openc_bot/templates/spec/spec_helper.rb +4 -0
- data/lib/openc_bot/version.rb +1 -1
- data/openc_bot.gemspec +6 -3
- data/spec/lib/company_fetcher_bot_spec.rb +23 -0
- data/spec/lib/helpers/register_methods_spec.rb +120 -2
- data/spec/lib/openc_bot_spec.rb +14 -1
- data/spec/simple_openc_bot_spec.rb +1 -0
- data/spec/spec_helper.rb +3 -1
- metadata +43 -220
- data/schemas/.gitignore +0 -13
- data/schemas/.travis.yml +0 -5
- data/schemas/Gemfile +0 -9
- data/schemas/README.md +0 -2
- data/schemas/Rakefile +0 -46
- data/schemas/lib/base-statement.json +0 -22
- data/schemas/lib/snippets/financial-payment-base.json +0 -4
- data/schemas/lib/snippets/licence-base.json +0 -4
- data/schemas/schemas/company-schema.json +0 -183
- data/schemas/schemas/financial-payment-schema.json +0 -27
- data/schemas/schemas/includes/address.json +0 -23
- data/schemas/schemas/includes/alternative_name.json +0 -13
- data/schemas/schemas/includes/company.json +0 -16
- data/schemas/schemas/includes/filing.json +0 -20
- data/schemas/schemas/includes/financial-payment-data-object.json +0 -60
- data/schemas/schemas/includes/industry_code.json +0 -16
- data/schemas/schemas/includes/licence-data-object.json +0 -36
- data/schemas/schemas/includes/officer.json +0 -50
- data/schemas/schemas/includes/previous_name.json +0 -14
- data/schemas/schemas/includes/share-parcel-data.json +0 -67
- data/schemas/schemas/includes/share-parcel.json +0 -63
- data/schemas/schemas/includes/subsidiary-relationship-data.json +0 -47
- data/schemas/schemas/includes/total-shares.json +0 -10
- data/schemas/schemas/licence-schema.json +0 -27
- data/schemas/schemas/primary-data-schema.json +0 -17
- data/schemas/schemas/share-parcel-schema.json +0 -21
- data/schemas/schemas/simple-financial-payment-schema.json +0 -85
- data/schemas/schemas/simple-licence-schema.json +0 -59
- data/schemas/schemas/simple-subsidiary-schema.json +0 -68
- data/schemas/schemas/subsidiary-relationship-schema.json +0 -27
- data/schemas/spec/sample-data/invalid/company-01.json +0 -4
- data/schemas/spec/sample-data/invalid/company-02.json +0 -5
- data/schemas/spec/sample-data/invalid/company-03.json +0 -5
- data/schemas/spec/sample-data/invalid/company-04.json +0 -5
- data/schemas/spec/sample-data/invalid/company-05.json +0 -5
- data/schemas/spec/sample-data/invalid/company-06.json +0 -6
- data/schemas/spec/sample-data/invalid/company-07.json +0 -8
- data/schemas/spec/sample-data/invalid/company-08.json +0 -7
- data/schemas/spec/sample-data/invalid/company-09.json +0 -9
- data/schemas/spec/sample-data/invalid/company-10.json +0 -7
- data/schemas/spec/sample-data/invalid/company-11.json +0 -9
- data/schemas/spec/sample-data/invalid/company-12.json +0 -11
- data/schemas/spec/sample-data/invalid/company-13.json +0 -11
- data/schemas/spec/sample-data/invalid/company-14.json +0 -7
- data/schemas/spec/sample-data/invalid/company-15.json +0 -7
- data/schemas/spec/sample-data/invalid/company-16.json +0 -7
- data/schemas/spec/sample-data/invalid/company-17.json +0 -9
- data/schemas/spec/sample-data/invalid/company-18.json +0 -9
- data/schemas/spec/sample-data/invalid/company-19.json +0 -9
- data/schemas/spec/sample-data/invalid/company-20.json +0 -9
- data/schemas/spec/sample-data/invalid/company-21.json +0 -11
- data/schemas/spec/sample-data/invalid/company-22.json +0 -11
- data/schemas/spec/sample-data/invalid/company-23.json +0 -7
- data/schemas/spec/sample-data/invalid/company-24.json +0 -12
- data/schemas/spec/sample-data/invalid/company-25.json +0 -9
- data/schemas/spec/sample-data/invalid/company-26.json +0 -11
- data/schemas/spec/sample-data/invalid/company-27.json +0 -7
- data/schemas/spec/sample-data/invalid/company-28.json +0 -9
- data/schemas/spec/sample-data/invalid/company-29.json +0 -12
- data/schemas/spec/sample-data/invalid/company-30.json +0 -16
- data/schemas/spec/sample-data/invalid/company-31.json +0 -14
- data/schemas/spec/sample-data/invalid/company-32.json +0 -11
- data/schemas/spec/sample-data/invalid/company-33.json +0 -7
- data/schemas/spec/sample-data/invalid/company-34.json +0 -9
- data/schemas/spec/sample-data/invalid/company-35.json +0 -9
- data/schemas/spec/sample-data/invalid/company-36.json +0 -10
- data/schemas/spec/sample-data/invalid/company-37.json +0 -7
- data/schemas/spec/sample-data/invalid/company-38.json +0 -9
- data/schemas/spec/sample-data/invalid/company-39.json +0 -11
- data/schemas/spec/sample-data/invalid/company-40.json +0 -12
- data/schemas/spec/sample-data/invalid/company-41.json +0 -12
- data/schemas/spec/sample-data/invalid/company-42.json +0 -7
- data/schemas/spec/sample-data/invalid/company-43.json +0 -9
- data/schemas/spec/sample-data/invalid/company-44.json +0 -11
- data/schemas/spec/sample-data/invalid/company-45.json +0 -11
- data/schemas/spec/sample-data/invalid/company-46.json +0 -7
- data/schemas/spec/sample-data/invalid/company-47.json +0 -9
- data/schemas/spec/sample-data/invalid/company-48.json +0 -9
- data/schemas/spec/sample-data/invalid/company-49.json +0 -9
- data/schemas/spec/sample-data/invalid/company-50.json +0 -9
- data/schemas/spec/sample-data/invalid/company-51.json +0 -9
- data/schemas/spec/sample-data/invalid/company-52.json +0 -9
- data/schemas/spec/sample-data/invalid/company-53.json +0 -10
- data/schemas/spec/sample-data/invalid/company-54.json +0 -9
- data/schemas/spec/sample-data/invalid/company-55.json +0 -9
- data/schemas/spec/sample-data/invalid/company-56.json +0 -7
- data/schemas/spec/sample-data/invalid/company-57.json +0 -7
- data/schemas/spec/sample-data/invalid/company-58.json +0 -7
- data/schemas/spec/sample-data/invalid/company-59.json +0 -13
- data/schemas/spec/sample-data/invalid/company-60.json +0 -7
- data/schemas/spec/sample-data/invalid/company-61.json +0 -7
- data/schemas/spec/sample-data/invalid/company-62.json +0 -9
- data/schemas/spec/sample-data/invalid/company-63.json +0 -12
- data/schemas/spec/sample-data/invalid/company-64.json +0 -14
- data/schemas/spec/sample-data/invalid/company-65.json +0 -14
- data/schemas/spec/sample-data/invalid/company-66.json +0 -13
- data/schemas/spec/sample-data/invalid/company-67.json +0 -14
- data/schemas/spec/sample-data/invalid/company-68.json +0 -12
- data/schemas/spec/sample-data/invalid/company-69.json +0 -12
- data/schemas/spec/sample-data/invalid/company-70.json +0 -14
- data/schemas/spec/sample-data/invalid/financial-payment-01.json +0 -24
- data/schemas/spec/sample-data/invalid/licence-01.json +0 -18
- data/schemas/spec/sample-data/invalid/licence-02.json +0 -18
- data/schemas/spec/sample-data/invalid/licence-03.json +0 -12
- data/schemas/spec/sample-data/invalid/licence-04.json +0 -18
- data/schemas/spec/sample-data/invalid/licence-05.json +0 -18
- data/schemas/spec/sample-data/invalid/licence-06.json +0 -18
- data/schemas/spec/sample-data/invalid/licence-07.json +0 -20
- data/schemas/spec/sample-data/invalid/licence-08.json +0 -21
- data/schemas/spec/sample-data/invalid/primary-data-01.json +0 -4
- data/schemas/spec/sample-data/invalid/primary-data-02.json +0 -4
- data/schemas/spec/sample-data/invalid/simple-licence-01.json +0 -9
- data/schemas/spec/sample-data/invalid/simple-licence-02.json +0 -8
- data/schemas/spec/sample-data/invalid/simple-licence-03.json +0 -9
- data/schemas/spec/sample-data/invalid/simple-licence-04.json +0 -10
- data/schemas/spec/sample-data/invalid/simple-licence-05.json +0 -10
- data/schemas/spec/sample-data/invalid/simple-licence-06.json +0 -10
- data/schemas/spec/sample-data/invalid/simple-subsidiary-01.json +0 -13
- data/schemas/spec/sample-data/invalid/simple-subsidiary-02.json +0 -13
- data/schemas/spec/sample-data/licence-schema.json.old +0 -21
- data/schemas/spec/sample-data/valid/company-01.json +0 -6
- data/schemas/spec/sample-data/valid/company-02.json +0 -7
- data/schemas/spec/sample-data/valid/company-03.json +0 -8
- data/schemas/spec/sample-data/valid/company-04.json +0 -7
- data/schemas/spec/sample-data/valid/company-05.json +0 -7
- data/schemas/spec/sample-data/valid/company-06.json +0 -12
- data/schemas/spec/sample-data/valid/company-07.json +0 -9
- data/schemas/spec/sample-data/valid/company-08.json +0 -9
- data/schemas/spec/sample-data/valid/company-09.json +0 -20
- data/schemas/spec/sample-data/valid/company-10.json +0 -9
- data/schemas/spec/sample-data/valid/company-11.json +0 -7
- data/schemas/spec/sample-data/valid/company-12.json +0 -7
- data/schemas/spec/sample-data/valid/company-13.json +0 -7
- data/schemas/spec/sample-data/valid/company-14.json +0 -15
- data/schemas/spec/sample-data/valid/company-15.json +0 -8
- data/schemas/spec/sample-data/valid/company-16.json +0 -9
- data/schemas/spec/sample-data/valid/company-17.json +0 -9
- data/schemas/spec/sample-data/valid/company-18.json +0 -9
- data/schemas/spec/sample-data/valid/company-19.json +0 -37
- data/schemas/spec/sample-data/valid/company-20.json +0 -9
- data/schemas/spec/sample-data/valid/company-21.json +0 -26
- data/schemas/spec/sample-data/valid/company-22.json +0 -20
- data/schemas/spec/sample-data/valid/company-23.json +0 -9
- data/schemas/spec/sample-data/valid/company-24.json +0 -12
- data/schemas/spec/sample-data/valid/company-25.json +0 -12
- data/schemas/spec/sample-data/valid/company-26.json +0 -12
- data/schemas/spec/sample-data/valid/company-27.json +0 -28
- data/schemas/spec/sample-data/valid/company-28.json +0 -9
- data/schemas/spec/sample-data/valid/company-29.json +0 -10
- data/schemas/spec/sample-data/valid/company-30.json +0 -9
- data/schemas/spec/sample-data/valid/company-31.json +0 -17
- data/schemas/spec/sample-data/valid/company-32.json +0 -9
- data/schemas/spec/sample-data/valid/company-33.json +0 -29
- data/schemas/spec/sample-data/valid/company-34.json +0 -9
- data/schemas/spec/sample-data/valid/company-35.json +0 -9
- data/schemas/spec/sample-data/valid/company-36.json +0 -9
- data/schemas/spec/sample-data/valid/company-37.json +0 -9
- data/schemas/spec/sample-data/valid/company-38.json +0 -9
- data/schemas/spec/sample-data/valid/company-39.json +0 -9
- data/schemas/spec/sample-data/valid/company-40.json +0 -9
- data/schemas/spec/sample-data/valid/company-41.json +0 -9
- data/schemas/spec/sample-data/valid/company-42.json +0 -10
- data/schemas/spec/sample-data/valid/company-43.json +0 -7
- data/schemas/spec/sample-data/valid/company-44.json +0 -7
- data/schemas/spec/sample-data/valid/company-45.json +0 -23
- data/schemas/spec/sample-data/valid/company-46.json +0 -7
- data/schemas/spec/sample-data/valid/company-47.json +0 -12
- data/schemas/spec/sample-data/valid/company-48.json +0 -7
- data/schemas/spec/sample-data/valid/company-49.json +0 -14
- data/schemas/spec/sample-data/valid/company-50.json +0 -13
- data/schemas/spec/sample-data/valid/company-51.json +0 -14
- data/schemas/spec/sample-data/valid/company-52.json +0 -12
- data/schemas/spec/sample-data/valid/company-53.json +0 -9
- data/schemas/spec/sample-data/valid/financial-payment-01.json +0 -25
- data/schemas/spec/sample-data/valid/financial-payment-02.json +0 -29
- data/schemas/spec/sample-data/valid/licence-01.json +0 -19
- data/schemas/spec/sample-data/valid/licence-02.json +0 -21
- data/schemas/spec/sample-data/valid/licence-03.json +0 -21
- data/schemas/spec/sample-data/valid/licence-04.json +0 -26
- data/schemas/spec/sample-data/valid/primary-data-01.json +0 -4
- data/schemas/spec/sample-data/valid/primary-data-02.json +0 -5
- data/schemas/spec/sample-data/valid/simple-licence-01.json +0 -10
- data/schemas/spec/sample-data/valid/simple-licence-02.json +0 -10
- data/schemas/spec/sample-data/valid/simple-licence-03.json +0 -12
- data/schemas/spec/sample-data/valid/simple-subsidiary-01.json +0 -13
- data/schemas/spec/sample-data/valid/simple-subsidiary-02.json +0 -13
- data/schemas/spec/sample-data/valid/subsidiary-relationship-01.json +0 -23
- data/schemas/spec/spec_helper.rb +0 -78
- data/schemas/spec/validation_spec.rb +0 -39
checksums.yaml
CHANGED
|
@@ -1,15 +1,7 @@
|
|
|
1
1
|
---
|
|
2
|
-
|
|
3
|
-
metadata.gz:
|
|
4
|
-
|
|
5
|
-
data.tar.gz: !binary |-
|
|
6
|
-
MTQ0OGM2ZWZjOWYwNzQ5MGQ3Y2YxZDRiOGYyM2FiY2Y4MzBjNDIzZQ==
|
|
2
|
+
SHA1:
|
|
3
|
+
metadata.gz: 1db3c143f46fc934729ee27c6cc5b4047fb2a5c5
|
|
4
|
+
data.tar.gz: 4c2de3f8f0ecc62f77689386dc6e50ed26290714
|
|
7
5
|
SHA512:
|
|
8
|
-
metadata.gz:
|
|
9
|
-
|
|
10
|
-
ZGUxNDc2ZTZkNzlkOTYzOWFlYTRkZDM5ZDgwMTRmYmE0ZTM3ZTAxMWIwNGFm
|
|
11
|
-
YTA4MTgxNDc4OGI2OWRhZTk0NmQ3ODc4MmY5NWE4YmE4YzRlMjc=
|
|
12
|
-
data.tar.gz: !binary |-
|
|
13
|
-
MjQ2NmFhOGY4NDFjZDE2NGJjNTlkNGUwNWJjMzUyMDJjNGM4YjBiMGYxMzMw
|
|
14
|
-
YTE4ZmIxZmE0YTU0N2Y1NWE0NDU4ZGUzZjc1ODExZmZmZDAxNmZmZWMzY2Qx
|
|
15
|
-
MWQyNWU4NmEzYmQ5MjdiYzIxYTFlYTkyZjMzMWZjYWY0NjkwYWM=
|
|
6
|
+
metadata.gz: 6046b31e46416716606c0540ab60cc3b4c20d53043b7ec5701103dde77e47ccf6d0abb77e284bae0928dcaf66471d025c83bb51fbc6591c882ea401263cd4ae7
|
|
7
|
+
data.tar.gz: 22bd41bdd6639ea13f9d4787204fc7f13321d0c08ddf48748354b89a88bef8c0a73dda93f47e73cc20589332a7f6940400efa4998cc17c5ca7ebe8445222e65f
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
source 'https://rubygems.org'
|
|
2
2
|
gem "sqlite_magic", :git => 'https://github.com/openc/sqlite_magic.git'
|
|
3
|
+
|
|
3
4
|
gem "pry", :group => [:development,:test]
|
|
4
5
|
# Specify your gem's dependencies in openc_bot.gemspec
|
|
5
6
|
gemspec
|
|
6
7
|
|
|
8
|
+
|
|
7
9
|
# we need to do pull request and bump version
|
|
8
10
|
# gem 'scraperwiki', '>=3.0.2', :git => 'https://github.com/openc/scraperwiki-ruby.git'
|
data/lib/openc_bot.rb
CHANGED
|
@@ -80,10 +80,25 @@ module OpencBot
|
|
|
80
80
|
end
|
|
81
81
|
end
|
|
82
82
|
|
|
83
|
+
def db_location
|
|
84
|
+
File.expand_path(File.join(@@app_directory, 'db', db_name))
|
|
85
|
+
end
|
|
86
|
+
|
|
83
87
|
# Override default in ScraperWiki gem
|
|
84
88
|
def sqlite_magic_connection
|
|
85
89
|
db = @config ? @config[:db] : File.expand_path(File.join(@@app_directory, 'db', db_name))
|
|
86
|
-
|
|
90
|
+
options = sqlite_busy_timeout ? {:busy_timeout => sqlite_busy_timeout} : {:busy_timeout => 10000}
|
|
91
|
+
@sqlite_magic_connection ||= SqliteMagic::Connection.new(db, options)
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
def sqlite_busy_timeout
|
|
95
|
+
self.const_defined?('SQLITE_BUSY_TIMEOUT') && self.const_get('SQLITE_BUSY_TIMEOUT')
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
def table_summary
|
|
99
|
+
field_names = sqlite_magic_connection.execute('PRAGMA table_info(ocdata)').collect{|c| c['name']}
|
|
100
|
+
select_sql = "COUNT(1) Total, " + field_names.collect{ |fn| "COUNT(#{fn}) #{fn}_not_null" }.join(', ') + " FROM ocdata"
|
|
101
|
+
select(select_sql).first
|
|
87
102
|
end
|
|
88
103
|
|
|
89
104
|
end
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
require 'openc_bot'
|
|
2
2
|
require 'openc_bot/helpers/incremental_search'
|
|
3
3
|
require 'openc_bot/helpers/alpha_search'
|
|
4
|
+
# require 'openc_bot/asana_notifier'
|
|
5
|
+
require 'mail'
|
|
4
6
|
|
|
5
7
|
|
|
6
8
|
module OpencBot
|
|
@@ -9,6 +11,8 @@ module OpencBot
|
|
|
9
11
|
include OpencBot::Helpers::IncrementalSearch
|
|
10
12
|
include OpencBot::Helpers::AlphaSearch
|
|
11
13
|
|
|
14
|
+
STDOUT.sync = true
|
|
15
|
+
STDERR.sync = true
|
|
12
16
|
# This is called by #update_datum
|
|
13
17
|
def fetch_datum(company_number)
|
|
14
18
|
company_page = fetch_registry_page(company_number)
|
|
@@ -42,5 +46,51 @@ module OpencBot
|
|
|
42
46
|
super || 'company-schema'
|
|
43
47
|
end
|
|
44
48
|
|
|
49
|
+
def update_data(options={})
|
|
50
|
+
fetch_data
|
|
51
|
+
update_stale
|
|
52
|
+
send_run_report
|
|
53
|
+
rescue Exception => e
|
|
54
|
+
send_error_report(e)
|
|
55
|
+
raise e
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
private
|
|
59
|
+
def mark_bot_as_failing_on_asana(exception)
|
|
60
|
+
# error_description = "Code for this bot: https://github.com/openc/external_bots/tree/master/#{inferred_jurisdiction_code}_companies_fetcher\nError details: #{exception.inspect}.\nBacktrace:\n#{exception.backtrace}"
|
|
61
|
+
# params = {
|
|
62
|
+
# :tag => inferred_jurisdiction_code,
|
|
63
|
+
# :asana_api_key => ENV['ASANA_API_KEY'],
|
|
64
|
+
# :workspace => ENV['ASANA_WORKSPACE'],
|
|
65
|
+
# :title => exception.message,
|
|
66
|
+
# :description => error_description
|
|
67
|
+
# }
|
|
68
|
+
# AsanaNotifier.create_failed_bot_task(params)
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
def send_error_report(e)
|
|
72
|
+
subject = "Error running #{self.name}: #{e}"
|
|
73
|
+
body = "Error details: #{e.inspect}.\nBacktrace:\n#{e.backtrace}"
|
|
74
|
+
mark_bot_as_failing_on_asana(e) if ENV['CREATE_ASANA_TASKS_FOR_BOT_FAILURES']
|
|
75
|
+
send_report(:subject => subject, :body => body)
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
def send_run_report
|
|
79
|
+
subject = "#{self.name} successfully ran"
|
|
80
|
+
db_filesize = File.size?(db_location)
|
|
81
|
+
body = "No problems to report. db is #{db_location}, #{db_filesize} bytes. Last modified: #{File.stat(db_location).mtime}"
|
|
82
|
+
send_report(:subject => subject, :body => body)
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def send_report(params)
|
|
86
|
+
Mail.deliver do
|
|
87
|
+
from 'admin@opencorporates.com'
|
|
88
|
+
to 'bots@opencorporates.com'
|
|
89
|
+
subject params[:subject]
|
|
90
|
+
body params[:body]
|
|
91
|
+
end
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
|
|
45
95
|
end
|
|
46
96
|
end
|
|
@@ -16,6 +16,10 @@ module OpencBot
|
|
|
16
16
|
!!select("ocdata.#{primary_key_name} FROM ocdata WHERE #{primary_key_name} = ? LIMIT 1", uid).first
|
|
17
17
|
end
|
|
18
18
|
|
|
19
|
+
def default_stale_count
|
|
20
|
+
self.const_defined?('STALE_COUNT') ? self.const_get('STALE_COUNT') : 1000
|
|
21
|
+
end
|
|
22
|
+
|
|
19
23
|
# fetches and saves data. By default assumes an incremental search, or an alpha search
|
|
20
24
|
# if USE_ALPHA_SEARCH is set. This method should be overridden if you are going to do a
|
|
21
25
|
# different type of data import, e.g from a CSV file.
|
|
@@ -35,24 +39,25 @@ module OpencBot
|
|
|
35
39
|
end
|
|
36
40
|
|
|
37
41
|
def fetch_registry_page(company_number)
|
|
42
|
+
sleep_before_http_req
|
|
38
43
|
_client.get_content(registry_url(company_number))
|
|
39
44
|
end
|
|
40
45
|
|
|
41
46
|
def prepare_and_save_data(all_data,options={})
|
|
42
47
|
data_to_be_saved = prepare_for_saving(all_data)
|
|
43
|
-
fail_count, retry_interval = 0, 5
|
|
48
|
+
# fail_count, retry_interval = 0, 5
|
|
44
49
|
begin
|
|
45
50
|
insert_or_update([primary_key_name], data_to_be_saved)
|
|
46
51
|
rescue SQLite3::BusyException => e
|
|
47
|
-
fail_count += 1
|
|
48
|
-
if fail_count <= MAX_BUSY_RETRIES
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
else
|
|
54
|
-
|
|
55
|
-
end
|
|
52
|
+
# fail_count += 1
|
|
53
|
+
# if fail_count <= MAX_BUSY_RETRIES
|
|
54
|
+
puts "#{e.inspect} raised saving:\n#{all_data}\n\n" if verbose?
|
|
55
|
+
# sleep retry_interval
|
|
56
|
+
# retry_interval = retry_interval * 2
|
|
57
|
+
# retry
|
|
58
|
+
# else
|
|
59
|
+
raise e
|
|
60
|
+
# end
|
|
56
61
|
end
|
|
57
62
|
|
|
58
63
|
end
|
|
@@ -61,6 +66,10 @@ module OpencBot
|
|
|
61
66
|
self.const_defined?('PRIMARY_KEY_NAME') ? self.const_get('PRIMARY_KEY_NAME') : :uid
|
|
62
67
|
end
|
|
63
68
|
|
|
69
|
+
def raise_when_saving_invalid_record
|
|
70
|
+
!!self.const_defined?('RAISE_WHEN_SAVING_INVALID_RECORD')
|
|
71
|
+
end
|
|
72
|
+
|
|
64
73
|
# sensible default. Either uses computed version or registry_url in db
|
|
65
74
|
def registry_url(uid)
|
|
66
75
|
computed_registry_url(uid) || registry_url_from_db(uid)
|
|
@@ -94,7 +103,7 @@ module OpencBot
|
|
|
94
103
|
end
|
|
95
104
|
|
|
96
105
|
def stale_entry_uids(stale_count=nil)
|
|
97
|
-
stale_count ||=
|
|
106
|
+
stale_count ||= default_stale_count
|
|
98
107
|
sql_query = "ocdata.* from ocdata WHERE retrieved_at IS NULL OR strftime('%s', retrieved_at) < strftime('%s', '#{Date.today - 30}') LIMIT #{stale_count.to_i}"
|
|
99
108
|
raw_data = select(sql_query).each do |res|
|
|
100
109
|
yield res[primary_key_name.to_s]
|
|
@@ -108,6 +117,24 @@ module OpencBot
|
|
|
108
117
|
end
|
|
109
118
|
end
|
|
110
119
|
|
|
120
|
+
def get_raw_data(uid, format=nil)
|
|
121
|
+
file_location = raw_data_file_location(uid, format)
|
|
122
|
+
File.read(file_location) if File.exist?(file_location)
|
|
123
|
+
end
|
|
124
|
+
|
|
125
|
+
def save_raw_data(raw_data, uid, format=nil)
|
|
126
|
+
file_location = raw_data_file_location(uid, format)
|
|
127
|
+
File.open(file_location, 'w') { |f| f.print raw_data }
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
def raw_data_file_location(uid, format=nil)
|
|
131
|
+
normalised_uid = uid.gsub(/[^[[:alnum:]]]/,'')
|
|
132
|
+
directory = File.join(*([root_directory,'data',normalised_uid.gsub(/^0+/,'').split(//).first(5)].flatten))
|
|
133
|
+
FileUtils.mkdir_p(directory) unless Dir.exist?(directory)
|
|
134
|
+
filename = format ? "#{normalised_uid}.#{format}" : normalised_uid
|
|
135
|
+
File.join(directory, filename)
|
|
136
|
+
end
|
|
137
|
+
|
|
111
138
|
def update_data(options={})
|
|
112
139
|
fetch_data
|
|
113
140
|
update_stale
|
|
@@ -130,13 +157,14 @@ module OpencBot
|
|
|
130
157
|
# or, if output_as_json is requested then the validation error is included
|
|
131
158
|
# in the JSON error message
|
|
132
159
|
def update_datum(uid, output_as_json=false,replace_existing_data=false)
|
|
160
|
+
# XXX here we refuse to run depending on run algorithm
|
|
133
161
|
return unless raw_data = fetch_datum(uid)
|
|
134
162
|
default_options = {primary_key_name => uid, :retrieved_at => Time.now}
|
|
135
163
|
return unless base_processed_data = process_datum(raw_data)
|
|
136
164
|
processed_data = default_options.merge(base_processed_data)
|
|
137
165
|
# prepare the data for saving (converting Arrays, Hashes to json) and
|
|
138
166
|
# save the original data too, as we may not extracting everything from it yet
|
|
139
|
-
save_entity(processed_data.merge(:data => raw_data))
|
|
167
|
+
raise_when_saving_invalid_record ? save_entity!(processed_data.merge(:data => raw_data)) : save_entity(processed_data.merge(:data => raw_data))
|
|
140
168
|
if output_as_json
|
|
141
169
|
puts processed_data.to_json
|
|
142
170
|
else
|
|
@@ -152,11 +180,18 @@ module OpencBot
|
|
|
152
180
|
end
|
|
153
181
|
end
|
|
154
182
|
|
|
183
|
+
# at a rate of 1.16 companies per second, and allowing 12 hours
|
|
184
|
+
# running per day. a 3m register would be updated in 2 months:
|
|
185
|
+
MAX_STALE_COUNT = 100_000
|
|
155
186
|
def update_stale(stale_count=nil)
|
|
156
|
-
|
|
157
|
-
|
|
187
|
+
# XXX here set an arbitrarily large number and then rely on the system to stop
|
|
188
|
+
# XXX wrap this with timings to work out per-record rate
|
|
189
|
+
rate_limiter do |limiter|
|
|
190
|
+
stale_entry_uids(MAX_STALE_COUNT) do |stale_entry_uid|
|
|
191
|
+
update_datum(stale_entry_uid)
|
|
192
|
+
limiter.checkpoint
|
|
193
|
+
end
|
|
158
194
|
end
|
|
159
|
-
|
|
160
195
|
end
|
|
161
196
|
|
|
162
197
|
def validate_datum(record)
|
|
@@ -196,6 +231,15 @@ module OpencBot
|
|
|
196
231
|
prepared_data
|
|
197
232
|
end
|
|
198
233
|
|
|
234
|
+
def sleep_before_http_req
|
|
235
|
+
if self.const_defined?('SLEEP_BEFORE_HTTP_REQ')
|
|
236
|
+
sleep_time = self.const_get('SLEEP_BEFORE_HTTP_REQ')
|
|
237
|
+
puts "#{self.name} about to sleep for #{sleep_time} before fetching data. Time now: #{Time.now}" if verbose?
|
|
238
|
+
sleep(sleep_time)
|
|
239
|
+
puts "#{self.name} slept for #{sleep_time}: Time now #{Time.now}" if verbose?
|
|
240
|
+
end
|
|
241
|
+
end
|
|
242
|
+
|
|
199
243
|
def _client(options={})
|
|
200
244
|
return @client if @client
|
|
201
245
|
@client = HTTPClient.new(options.delete(:proxy))
|
data/lib/openc_bot/tasks.rb
CHANGED
|
@@ -3,6 +3,8 @@ require 'optparse'
|
|
|
3
3
|
require 'json'
|
|
4
4
|
require 'fileutils'
|
|
5
5
|
|
|
6
|
+
PID_DIR = "/oc/pids"
|
|
7
|
+
|
|
6
8
|
namespace :bot do
|
|
7
9
|
desc "create a skeleton bot that can be used in OpenCorporates"
|
|
8
10
|
task :create do
|
|
@@ -134,6 +136,17 @@ namespace :bot do
|
|
|
134
136
|
end
|
|
135
137
|
end
|
|
136
138
|
|
|
139
|
+
desc 'Lists count of non-null values in each field in ocdata table'
|
|
140
|
+
task :table_summary do
|
|
141
|
+
only_process_running('table_summary') do
|
|
142
|
+
bot_name = get_bot_name
|
|
143
|
+
require_relative File.join(Dir.pwd,'lib', bot_name)
|
|
144
|
+
runner = callable_from_file_name(bot_name)
|
|
145
|
+
res = runner.table_summary
|
|
146
|
+
res.each {|k,v| puts "#{k}:\t#{v}"}
|
|
147
|
+
end
|
|
148
|
+
end
|
|
149
|
+
|
|
137
150
|
desc 'Summarise data for quality checking (only works for licences at the moment)'
|
|
138
151
|
task :summarise_data do
|
|
139
152
|
def as_sorted_hash(name, data)
|
|
@@ -327,7 +340,7 @@ EOF
|
|
|
327
340
|
puts "Created #{new_file}"
|
|
328
341
|
end
|
|
329
342
|
end
|
|
330
|
-
|
|
343
|
+
|
|
331
344
|
#Add rspec debugger to gemfile
|
|
332
345
|
File.open(File.join(working_dir,'Gemfile'),'a') do |file|
|
|
333
346
|
file.puts "group :test do\n gem 'rspec'\n gem 'debugger'\nend"
|
|
@@ -341,7 +354,7 @@ EOF
|
|
|
341
354
|
end
|
|
342
355
|
|
|
343
356
|
def only_process_running(task_name)
|
|
344
|
-
pid_path = File.join(
|
|
357
|
+
pid_path = File.join(PID_DIR, 'pids', task_name)
|
|
345
358
|
|
|
346
359
|
raise_if_already_running(pid_path)
|
|
347
360
|
write_pid_file(pid_path)
|
data/lib/openc_bot/version.rb
CHANGED
data/openc_bot.gemspec
CHANGED
|
@@ -35,14 +35,17 @@ Gem::Specification.new do |gem|
|
|
|
35
35
|
gem.add_dependency "rake"
|
|
36
36
|
gem.add_dependency "activesupport", "4.1.4"
|
|
37
37
|
gem.add_dependency "nokogiri"
|
|
38
|
-
|
|
38
|
+
gem.add_dependency "sqlite_magic", "0.0.6"
|
|
39
39
|
gem.add_dependency "json"
|
|
40
40
|
gem.add_dependency "json-schema"
|
|
41
41
|
gem.add_dependency "httpclient"
|
|
42
42
|
gem.add_dependency "backports"
|
|
43
43
|
gem.add_dependency "scraperwiki", "3.0.2"
|
|
44
|
+
gem.add_dependency "mail"
|
|
45
|
+
# gem.add_dependency "openc-asana" unless RUBY_VERSION < '2.0'
|
|
44
46
|
|
|
45
|
-
gem.add_development_dependency "perftools.rb"
|
|
46
|
-
gem.add_development_dependency "
|
|
47
|
+
# gem.add_development_dependency "perftools.rb"
|
|
48
|
+
gem.add_development_dependency "byebug" unless RUBY_VERSION < '2.0'
|
|
49
|
+
gem.add_development_dependency "debugger" if RUBY_VERSION < '2.0'
|
|
47
50
|
gem.add_development_dependency "rspec"
|
|
48
51
|
end
|
|
@@ -3,6 +3,10 @@ require_relative '../spec_helper'
|
|
|
3
3
|
require 'openc_bot'
|
|
4
4
|
require 'openc_bot/company_fetcher_bot'
|
|
5
5
|
|
|
6
|
+
Mail.defaults do
|
|
7
|
+
delivery_method :test # no, don't send emails when testing
|
|
8
|
+
end
|
|
9
|
+
|
|
6
10
|
module TestCompaniesFetcher
|
|
7
11
|
extend OpencBot::CompanyFetcherBot
|
|
8
12
|
end
|
|
@@ -121,4 +125,23 @@ describe "A module that extends CompanyFetcherBot" do
|
|
|
121
125
|
end
|
|
122
126
|
end
|
|
123
127
|
end
|
|
128
|
+
|
|
129
|
+
describe '#update_data' do
|
|
130
|
+
|
|
131
|
+
before do
|
|
132
|
+
TestCompaniesFetcher.stub(:fetch_data_via_incremental_search)
|
|
133
|
+
TestCompaniesFetcher.stub(:update_stale)
|
|
134
|
+
#this can be any file that we can stat
|
|
135
|
+
TestCompaniesFetcher.stub(:db_location).
|
|
136
|
+
and_return(File.join(File.dirname(__FILE__),"company_fetcher_bot_spec.rb"))
|
|
137
|
+
|
|
138
|
+
Mail::TestMailer.deliveries.clear
|
|
139
|
+
TestCompaniesFetcher.update_data
|
|
140
|
+
end
|
|
141
|
+
|
|
142
|
+
it 'should send success email' do
|
|
143
|
+
Mail::TestMailer.deliveries.first.subject.should match /successfully ran/
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
end
|
|
124
147
|
end
|
|
@@ -8,6 +8,8 @@ module ModuleThatIncludesRegisterMethods
|
|
|
8
8
|
extend OpencBot::Helpers::RegisterMethods
|
|
9
9
|
PRIMARY_KEY_NAME = :custom_uid
|
|
10
10
|
SCHEMA_NAME = 'company-schema'
|
|
11
|
+
SLEEP_BEFORE_HTTP_REQ = 2
|
|
12
|
+
RAISE_WHEN_SAVING_INVALID_RECORD = true
|
|
11
13
|
end
|
|
12
14
|
|
|
13
15
|
module ModuleWithNoCustomPrimaryKey
|
|
@@ -29,7 +31,6 @@ describe 'a module that includes RegisterMethods' do
|
|
|
29
31
|
describe "#datum_exists?" do
|
|
30
32
|
before do
|
|
31
33
|
ModuleThatIncludesRegisterMethods.stub(:select).and_return([])
|
|
32
|
-
|
|
33
34
|
end
|
|
34
35
|
|
|
35
36
|
it "should select_data from database" do
|
|
@@ -233,10 +234,12 @@ describe 'a module that includes RegisterMethods' do
|
|
|
233
234
|
|
|
234
235
|
context 'and SQLite3::BusyException raised' do
|
|
235
236
|
it 'should retry up to 3 times' do
|
|
237
|
+
pending "deciding whether to allow this in some circumstances"
|
|
236
238
|
ModuleThatIncludesRegisterMethods.should_receive(:insert_or_update).exactly(4).times.and_raise(SQLite3::BusyException)
|
|
237
239
|
lambda { ModuleThatIncludesRegisterMethods.prepare_and_save_data(@params) }.should raise_error(SQLite3::BusyException)
|
|
238
240
|
end
|
|
239
241
|
it 'should not raise error if successful before limit' do
|
|
242
|
+
pending "deciding whether to allow this in some circumstances"
|
|
240
243
|
ModuleThatIncludesRegisterMethods.should_receive(:insert_or_update).exactly(3).times.ordered.and_raise(SQLite3::BusyException)
|
|
241
244
|
ModuleThatIncludesRegisterMethods.should_receive(:insert_or_update).ordered
|
|
242
245
|
lambda { ModuleThatIncludesRegisterMethods.prepare_and_save_data(@params) }.should_not raise_error
|
|
@@ -372,6 +375,15 @@ describe 'a module that includes RegisterMethods' do
|
|
|
372
375
|
end
|
|
373
376
|
end
|
|
374
377
|
|
|
378
|
+
context 'and errors returned validating data' do
|
|
379
|
+
it "should validate processed data" do
|
|
380
|
+
ModuleThatIncludesRegisterMethods.stub(:validate_datum).and_return([{:failed_attribute => 'foo', :message => 'Something not right'}])
|
|
381
|
+
lambda { ModuleThatIncludesRegisterMethods.update_datum(@uid)}.should raise_error
|
|
382
|
+
end
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
end
|
|
386
|
+
|
|
375
387
|
context 'and process_datum returns nil' do
|
|
376
388
|
before do
|
|
377
389
|
ModuleThatIncludesRegisterMethods.stub(:process_datum).and_return(nil)
|
|
@@ -405,11 +417,12 @@ describe 'a module that includes RegisterMethods' do
|
|
|
405
417
|
end
|
|
406
418
|
end
|
|
407
419
|
|
|
408
|
-
describe "#fetch_registry_page for
|
|
420
|
+
describe "#fetch_registry_page for uid" do
|
|
409
421
|
before do
|
|
410
422
|
@dummy_client = double('http_client', :get_content => nil)
|
|
411
423
|
ModuleThatIncludesRegisterMethods.stub(:_client).and_return(@dummy_client)
|
|
412
424
|
ModuleThatIncludesRegisterMethods.stub(:registry_url).and_return('http://some.registry.url')
|
|
425
|
+
@dummy_client.stub(:get_content).and_return(:registry_page_html)
|
|
413
426
|
end
|
|
414
427
|
|
|
415
428
|
it "should GET registry_page for registry_url for company_number" do
|
|
@@ -423,6 +436,24 @@ describe 'a module that includes RegisterMethods' do
|
|
|
423
436
|
@dummy_client.stub(:get_content).and_return(:registry_page_html)
|
|
424
437
|
ModuleThatIncludesRegisterMethods.fetch_registry_page('76543').should == :registry_page_html
|
|
425
438
|
end
|
|
439
|
+
|
|
440
|
+
context 'and SLEEP_BEFORE_HTTP_REQ is set' do
|
|
441
|
+
it 'should sleep for given period' do
|
|
442
|
+
ModuleThatIncludesRegisterMethods.should_receive(:sleep).with(2)
|
|
443
|
+
ModuleThatIncludesRegisterMethods.fetch_registry_page('76543')
|
|
444
|
+
end
|
|
445
|
+
end
|
|
446
|
+
|
|
447
|
+
context 'and SLEEP_BEFORE_HTTP_REQ is not set' do
|
|
448
|
+
before do
|
|
449
|
+
ModuleWithNoCustomPrimaryKey.stub(:_client).and_return(@dummy_client)
|
|
450
|
+
end
|
|
451
|
+
|
|
452
|
+
it 'should sleep for given period' do
|
|
453
|
+
ModuleWithNoCustomPrimaryKey.should_not_receive(:sleep)
|
|
454
|
+
ModuleWithNoCustomPrimaryKey.fetch_registry_page('76543')
|
|
455
|
+
end
|
|
456
|
+
end
|
|
426
457
|
end
|
|
427
458
|
|
|
428
459
|
describe "#validate_datum" do
|
|
@@ -594,4 +625,91 @@ describe 'a module that includes RegisterMethods' do
|
|
|
594
625
|
end
|
|
595
626
|
end
|
|
596
627
|
|
|
628
|
+
describe 'raise_when_saving_invalid_record' do
|
|
629
|
+
describe '#primary_key_name' do
|
|
630
|
+
it 'should return false if RAISE_WHEN_SAVING_INVALID_RECORD not set' do
|
|
631
|
+
ModuleWithNoCustomPrimaryKey.send(:raise_when_saving_invalid_record).should == false
|
|
632
|
+
end
|
|
633
|
+
|
|
634
|
+
it 'should return true if RAISE_WHEN_SAVING_INVALID_RECORD set' do
|
|
635
|
+
ModuleThatIncludesRegisterMethods.send(:raise_when_saving_invalid_record).should == true
|
|
636
|
+
end
|
|
637
|
+
end
|
|
638
|
+
end
|
|
639
|
+
|
|
640
|
+
describe '#raw_data_file_location for a uid' do
|
|
641
|
+
before do
|
|
642
|
+
@dummy_root_directory = File.join(File.dirname(__FILE__),'..','..','tmp')
|
|
643
|
+
Dir.mkdir(@dummy_root_directory) unless Dir.exist?(@dummy_root_directory)
|
|
644
|
+
|
|
645
|
+
ModuleThatIncludesRegisterMethods.stub(:root_directory).and_return(@dummy_root_directory)
|
|
646
|
+
end
|
|
647
|
+
|
|
648
|
+
after do
|
|
649
|
+
FileUtils.rmdir(File.join(@dummy_root_directory, 'data'))
|
|
650
|
+
end
|
|
651
|
+
|
|
652
|
+
it 'should return directory built from uid inside root data directory' do
|
|
653
|
+
ModuleThatIncludesRegisterMethods.raw_data_file_location('123456', 'html').should == File.join(@dummy_root_directory, 'data', '1','2','3','4','5', '123456.html')
|
|
654
|
+
end
|
|
655
|
+
|
|
656
|
+
it 'should create directory structure if it doesnt exist' do
|
|
657
|
+
ModuleThatIncludesRegisterMethods.raw_data_file_location('123456', 'html')
|
|
658
|
+
Dir.exist?(File.join(@dummy_root_directory, 'data', '1','2','3','4','5')).should == true
|
|
659
|
+
end
|
|
660
|
+
|
|
661
|
+
it 'should ignore leading zeroes when building directory' do
|
|
662
|
+
ModuleThatIncludesRegisterMethods.raw_data_file_location('001234', 'html').should == File.join(@dummy_root_directory, 'data', '1','2','3','4', '001234.html')
|
|
663
|
+
end
|
|
664
|
+
|
|
665
|
+
it 'should ignore non alphanum chars when building directory' do
|
|
666
|
+
ModuleThatIncludesRegisterMethods.raw_data_file_location('12a-b/3456', 'html').should == File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456.html')
|
|
667
|
+
end
|
|
668
|
+
|
|
669
|
+
it 'should allow format to be missing' do
|
|
670
|
+
ModuleThatIncludesRegisterMethods.raw_data_file_location('12a-b/3456').should == File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456')
|
|
671
|
+
end
|
|
672
|
+
|
|
673
|
+
it 'should allow format to be nil' do
|
|
674
|
+
ModuleThatIncludesRegisterMethods.raw_data_file_location('12a-b/3456', nil).should == File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456')
|
|
675
|
+
end
|
|
676
|
+
end
|
|
677
|
+
|
|
678
|
+
describe '#save_raw_data' do
|
|
679
|
+
before do
|
|
680
|
+
@dummy_root_directory = File.join(File.dirname(__FILE__),'..','..','tmp')
|
|
681
|
+
Dir.mkdir(@dummy_root_directory) unless Dir.exist?(@dummy_root_directory)
|
|
682
|
+
|
|
683
|
+
ModuleThatIncludesRegisterMethods.stub(:root_directory).and_return(@dummy_root_directory)
|
|
684
|
+
end
|
|
685
|
+
|
|
686
|
+
it 'should save raw data as in computed raw_data_file_location' do
|
|
687
|
+
ModuleThatIncludesRegisterMethods.save_raw_data('foo bar', '12a-b/3456', 'html')
|
|
688
|
+
File.read(File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456.html')).should == 'foo bar'
|
|
689
|
+
end
|
|
690
|
+
|
|
691
|
+
it 'should allow format to be missing' do
|
|
692
|
+
ModuleThatIncludesRegisterMethods.save_raw_data('foo bar', '12a-b/3456')
|
|
693
|
+
File.read(File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456')).should == 'foo bar'
|
|
694
|
+
end
|
|
695
|
+
end
|
|
696
|
+
|
|
697
|
+
describe '#get_raw_data' do
|
|
698
|
+
before do
|
|
699
|
+
@dummy_root_directory = File.join(File.dirname(__FILE__),'..','..','tmp')
|
|
700
|
+
Dir.mkdir(@dummy_root_directory) unless Dir.exist?(@dummy_root_directory)
|
|
701
|
+
|
|
702
|
+
ModuleThatIncludesRegisterMethods.stub(:root_directory).and_return(@dummy_root_directory)
|
|
703
|
+
end
|
|
704
|
+
|
|
705
|
+
it 'should read raw data in computed raw_data_file_location' do
|
|
706
|
+
File.open(File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456.html'),'w') { |f| f.print 'foo bar' }
|
|
707
|
+
ModuleThatIncludesRegisterMethods.get_raw_data('12a-b/3456', 'html').should == 'foo bar'
|
|
708
|
+
end
|
|
709
|
+
|
|
710
|
+
it 'should allow format to be missing' do
|
|
711
|
+
File.open(File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456'),'w') { |f| f.print 'foo bar' }
|
|
712
|
+
ModuleThatIncludesRegisterMethods.get_raw_data('12a-b/3456').should == 'foo bar'
|
|
713
|
+
end
|
|
714
|
+
end
|
|
597
715
|
end
|