openc_bot 0.0.27 → 0.0.46

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205) hide show
  1. checksums.yaml +5 -13
  2. data/.travis.yml +1 -0
  3. data/Gemfile +2 -0
  4. data/lib/openc_bot.rb +16 -1
  5. data/lib/openc_bot/company_fetcher_bot.rb +50 -0
  6. data/lib/openc_bot/helpers/register_methods.rb +59 -15
  7. data/lib/openc_bot/tasks.rb +15 -2
  8. data/lib/openc_bot/templates/spec/spec_helper.rb +4 -0
  9. data/lib/openc_bot/version.rb +1 -1
  10. data/openc_bot.gemspec +6 -3
  11. data/spec/lib/company_fetcher_bot_spec.rb +23 -0
  12. data/spec/lib/helpers/register_methods_spec.rb +120 -2
  13. data/spec/lib/openc_bot_spec.rb +14 -1
  14. data/spec/simple_openc_bot_spec.rb +1 -0
  15. data/spec/spec_helper.rb +3 -1
  16. metadata +43 -220
  17. data/schemas/.gitignore +0 -13
  18. data/schemas/.travis.yml +0 -5
  19. data/schemas/Gemfile +0 -9
  20. data/schemas/README.md +0 -2
  21. data/schemas/Rakefile +0 -46
  22. data/schemas/lib/base-statement.json +0 -22
  23. data/schemas/lib/snippets/financial-payment-base.json +0 -4
  24. data/schemas/lib/snippets/licence-base.json +0 -4
  25. data/schemas/schemas/company-schema.json +0 -183
  26. data/schemas/schemas/financial-payment-schema.json +0 -27
  27. data/schemas/schemas/includes/address.json +0 -23
  28. data/schemas/schemas/includes/alternative_name.json +0 -13
  29. data/schemas/schemas/includes/company.json +0 -16
  30. data/schemas/schemas/includes/filing.json +0 -20
  31. data/schemas/schemas/includes/financial-payment-data-object.json +0 -60
  32. data/schemas/schemas/includes/industry_code.json +0 -16
  33. data/schemas/schemas/includes/licence-data-object.json +0 -36
  34. data/schemas/schemas/includes/officer.json +0 -50
  35. data/schemas/schemas/includes/previous_name.json +0 -14
  36. data/schemas/schemas/includes/share-parcel-data.json +0 -67
  37. data/schemas/schemas/includes/share-parcel.json +0 -63
  38. data/schemas/schemas/includes/subsidiary-relationship-data.json +0 -47
  39. data/schemas/schemas/includes/total-shares.json +0 -10
  40. data/schemas/schemas/licence-schema.json +0 -27
  41. data/schemas/schemas/primary-data-schema.json +0 -17
  42. data/schemas/schemas/share-parcel-schema.json +0 -21
  43. data/schemas/schemas/simple-financial-payment-schema.json +0 -85
  44. data/schemas/schemas/simple-licence-schema.json +0 -59
  45. data/schemas/schemas/simple-subsidiary-schema.json +0 -68
  46. data/schemas/schemas/subsidiary-relationship-schema.json +0 -27
  47. data/schemas/spec/sample-data/invalid/company-01.json +0 -4
  48. data/schemas/spec/sample-data/invalid/company-02.json +0 -5
  49. data/schemas/spec/sample-data/invalid/company-03.json +0 -5
  50. data/schemas/spec/sample-data/invalid/company-04.json +0 -5
  51. data/schemas/spec/sample-data/invalid/company-05.json +0 -5
  52. data/schemas/spec/sample-data/invalid/company-06.json +0 -6
  53. data/schemas/spec/sample-data/invalid/company-07.json +0 -8
  54. data/schemas/spec/sample-data/invalid/company-08.json +0 -7
  55. data/schemas/spec/sample-data/invalid/company-09.json +0 -9
  56. data/schemas/spec/sample-data/invalid/company-10.json +0 -7
  57. data/schemas/spec/sample-data/invalid/company-11.json +0 -9
  58. data/schemas/spec/sample-data/invalid/company-12.json +0 -11
  59. data/schemas/spec/sample-data/invalid/company-13.json +0 -11
  60. data/schemas/spec/sample-data/invalid/company-14.json +0 -7
  61. data/schemas/spec/sample-data/invalid/company-15.json +0 -7
  62. data/schemas/spec/sample-data/invalid/company-16.json +0 -7
  63. data/schemas/spec/sample-data/invalid/company-17.json +0 -9
  64. data/schemas/spec/sample-data/invalid/company-18.json +0 -9
  65. data/schemas/spec/sample-data/invalid/company-19.json +0 -9
  66. data/schemas/spec/sample-data/invalid/company-20.json +0 -9
  67. data/schemas/spec/sample-data/invalid/company-21.json +0 -11
  68. data/schemas/spec/sample-data/invalid/company-22.json +0 -11
  69. data/schemas/spec/sample-data/invalid/company-23.json +0 -7
  70. data/schemas/spec/sample-data/invalid/company-24.json +0 -12
  71. data/schemas/spec/sample-data/invalid/company-25.json +0 -9
  72. data/schemas/spec/sample-data/invalid/company-26.json +0 -11
  73. data/schemas/spec/sample-data/invalid/company-27.json +0 -7
  74. data/schemas/spec/sample-data/invalid/company-28.json +0 -9
  75. data/schemas/spec/sample-data/invalid/company-29.json +0 -12
  76. data/schemas/spec/sample-data/invalid/company-30.json +0 -16
  77. data/schemas/spec/sample-data/invalid/company-31.json +0 -14
  78. data/schemas/spec/sample-data/invalid/company-32.json +0 -11
  79. data/schemas/spec/sample-data/invalid/company-33.json +0 -7
  80. data/schemas/spec/sample-data/invalid/company-34.json +0 -9
  81. data/schemas/spec/sample-data/invalid/company-35.json +0 -9
  82. data/schemas/spec/sample-data/invalid/company-36.json +0 -10
  83. data/schemas/spec/sample-data/invalid/company-37.json +0 -7
  84. data/schemas/spec/sample-data/invalid/company-38.json +0 -9
  85. data/schemas/spec/sample-data/invalid/company-39.json +0 -11
  86. data/schemas/spec/sample-data/invalid/company-40.json +0 -12
  87. data/schemas/spec/sample-data/invalid/company-41.json +0 -12
  88. data/schemas/spec/sample-data/invalid/company-42.json +0 -7
  89. data/schemas/spec/sample-data/invalid/company-43.json +0 -9
  90. data/schemas/spec/sample-data/invalid/company-44.json +0 -11
  91. data/schemas/spec/sample-data/invalid/company-45.json +0 -11
  92. data/schemas/spec/sample-data/invalid/company-46.json +0 -7
  93. data/schemas/spec/sample-data/invalid/company-47.json +0 -9
  94. data/schemas/spec/sample-data/invalid/company-48.json +0 -9
  95. data/schemas/spec/sample-data/invalid/company-49.json +0 -9
  96. data/schemas/spec/sample-data/invalid/company-50.json +0 -9
  97. data/schemas/spec/sample-data/invalid/company-51.json +0 -9
  98. data/schemas/spec/sample-data/invalid/company-52.json +0 -9
  99. data/schemas/spec/sample-data/invalid/company-53.json +0 -10
  100. data/schemas/spec/sample-data/invalid/company-54.json +0 -9
  101. data/schemas/spec/sample-data/invalid/company-55.json +0 -9
  102. data/schemas/spec/sample-data/invalid/company-56.json +0 -7
  103. data/schemas/spec/sample-data/invalid/company-57.json +0 -7
  104. data/schemas/spec/sample-data/invalid/company-58.json +0 -7
  105. data/schemas/spec/sample-data/invalid/company-59.json +0 -13
  106. data/schemas/spec/sample-data/invalid/company-60.json +0 -7
  107. data/schemas/spec/sample-data/invalid/company-61.json +0 -7
  108. data/schemas/spec/sample-data/invalid/company-62.json +0 -9
  109. data/schemas/spec/sample-data/invalid/company-63.json +0 -12
  110. data/schemas/spec/sample-data/invalid/company-64.json +0 -14
  111. data/schemas/spec/sample-data/invalid/company-65.json +0 -14
  112. data/schemas/spec/sample-data/invalid/company-66.json +0 -13
  113. data/schemas/spec/sample-data/invalid/company-67.json +0 -14
  114. data/schemas/spec/sample-data/invalid/company-68.json +0 -12
  115. data/schemas/spec/sample-data/invalid/company-69.json +0 -12
  116. data/schemas/spec/sample-data/invalid/company-70.json +0 -14
  117. data/schemas/spec/sample-data/invalid/financial-payment-01.json +0 -24
  118. data/schemas/spec/sample-data/invalid/licence-01.json +0 -18
  119. data/schemas/spec/sample-data/invalid/licence-02.json +0 -18
  120. data/schemas/spec/sample-data/invalid/licence-03.json +0 -12
  121. data/schemas/spec/sample-data/invalid/licence-04.json +0 -18
  122. data/schemas/spec/sample-data/invalid/licence-05.json +0 -18
  123. data/schemas/spec/sample-data/invalid/licence-06.json +0 -18
  124. data/schemas/spec/sample-data/invalid/licence-07.json +0 -20
  125. data/schemas/spec/sample-data/invalid/licence-08.json +0 -21
  126. data/schemas/spec/sample-data/invalid/primary-data-01.json +0 -4
  127. data/schemas/spec/sample-data/invalid/primary-data-02.json +0 -4
  128. data/schemas/spec/sample-data/invalid/simple-licence-01.json +0 -9
  129. data/schemas/spec/sample-data/invalid/simple-licence-02.json +0 -8
  130. data/schemas/spec/sample-data/invalid/simple-licence-03.json +0 -9
  131. data/schemas/spec/sample-data/invalid/simple-licence-04.json +0 -10
  132. data/schemas/spec/sample-data/invalid/simple-licence-05.json +0 -10
  133. data/schemas/spec/sample-data/invalid/simple-licence-06.json +0 -10
  134. data/schemas/spec/sample-data/invalid/simple-subsidiary-01.json +0 -13
  135. data/schemas/spec/sample-data/invalid/simple-subsidiary-02.json +0 -13
  136. data/schemas/spec/sample-data/licence-schema.json.old +0 -21
  137. data/schemas/spec/sample-data/valid/company-01.json +0 -6
  138. data/schemas/spec/sample-data/valid/company-02.json +0 -7
  139. data/schemas/spec/sample-data/valid/company-03.json +0 -8
  140. data/schemas/spec/sample-data/valid/company-04.json +0 -7
  141. data/schemas/spec/sample-data/valid/company-05.json +0 -7
  142. data/schemas/spec/sample-data/valid/company-06.json +0 -12
  143. data/schemas/spec/sample-data/valid/company-07.json +0 -9
  144. data/schemas/spec/sample-data/valid/company-08.json +0 -9
  145. data/schemas/spec/sample-data/valid/company-09.json +0 -20
  146. data/schemas/spec/sample-data/valid/company-10.json +0 -9
  147. data/schemas/spec/sample-data/valid/company-11.json +0 -7
  148. data/schemas/spec/sample-data/valid/company-12.json +0 -7
  149. data/schemas/spec/sample-data/valid/company-13.json +0 -7
  150. data/schemas/spec/sample-data/valid/company-14.json +0 -15
  151. data/schemas/spec/sample-data/valid/company-15.json +0 -8
  152. data/schemas/spec/sample-data/valid/company-16.json +0 -9
  153. data/schemas/spec/sample-data/valid/company-17.json +0 -9
  154. data/schemas/spec/sample-data/valid/company-18.json +0 -9
  155. data/schemas/spec/sample-data/valid/company-19.json +0 -37
  156. data/schemas/spec/sample-data/valid/company-20.json +0 -9
  157. data/schemas/spec/sample-data/valid/company-21.json +0 -26
  158. data/schemas/spec/sample-data/valid/company-22.json +0 -20
  159. data/schemas/spec/sample-data/valid/company-23.json +0 -9
  160. data/schemas/spec/sample-data/valid/company-24.json +0 -12
  161. data/schemas/spec/sample-data/valid/company-25.json +0 -12
  162. data/schemas/spec/sample-data/valid/company-26.json +0 -12
  163. data/schemas/spec/sample-data/valid/company-27.json +0 -28
  164. data/schemas/spec/sample-data/valid/company-28.json +0 -9
  165. data/schemas/spec/sample-data/valid/company-29.json +0 -10
  166. data/schemas/spec/sample-data/valid/company-30.json +0 -9
  167. data/schemas/spec/sample-data/valid/company-31.json +0 -17
  168. data/schemas/spec/sample-data/valid/company-32.json +0 -9
  169. data/schemas/spec/sample-data/valid/company-33.json +0 -29
  170. data/schemas/spec/sample-data/valid/company-34.json +0 -9
  171. data/schemas/spec/sample-data/valid/company-35.json +0 -9
  172. data/schemas/spec/sample-data/valid/company-36.json +0 -9
  173. data/schemas/spec/sample-data/valid/company-37.json +0 -9
  174. data/schemas/spec/sample-data/valid/company-38.json +0 -9
  175. data/schemas/spec/sample-data/valid/company-39.json +0 -9
  176. data/schemas/spec/sample-data/valid/company-40.json +0 -9
  177. data/schemas/spec/sample-data/valid/company-41.json +0 -9
  178. data/schemas/spec/sample-data/valid/company-42.json +0 -10
  179. data/schemas/spec/sample-data/valid/company-43.json +0 -7
  180. data/schemas/spec/sample-data/valid/company-44.json +0 -7
  181. data/schemas/spec/sample-data/valid/company-45.json +0 -23
  182. data/schemas/spec/sample-data/valid/company-46.json +0 -7
  183. data/schemas/spec/sample-data/valid/company-47.json +0 -12
  184. data/schemas/spec/sample-data/valid/company-48.json +0 -7
  185. data/schemas/spec/sample-data/valid/company-49.json +0 -14
  186. data/schemas/spec/sample-data/valid/company-50.json +0 -13
  187. data/schemas/spec/sample-data/valid/company-51.json +0 -14
  188. data/schemas/spec/sample-data/valid/company-52.json +0 -12
  189. data/schemas/spec/sample-data/valid/company-53.json +0 -9
  190. data/schemas/spec/sample-data/valid/financial-payment-01.json +0 -25
  191. data/schemas/spec/sample-data/valid/financial-payment-02.json +0 -29
  192. data/schemas/spec/sample-data/valid/licence-01.json +0 -19
  193. data/schemas/spec/sample-data/valid/licence-02.json +0 -21
  194. data/schemas/spec/sample-data/valid/licence-03.json +0 -21
  195. data/schemas/spec/sample-data/valid/licence-04.json +0 -26
  196. data/schemas/spec/sample-data/valid/primary-data-01.json +0 -4
  197. data/schemas/spec/sample-data/valid/primary-data-02.json +0 -5
  198. data/schemas/spec/sample-data/valid/simple-licence-01.json +0 -10
  199. data/schemas/spec/sample-data/valid/simple-licence-02.json +0 -10
  200. data/schemas/spec/sample-data/valid/simple-licence-03.json +0 -12
  201. data/schemas/spec/sample-data/valid/simple-subsidiary-01.json +0 -13
  202. data/schemas/spec/sample-data/valid/simple-subsidiary-02.json +0 -13
  203. data/schemas/spec/sample-data/valid/subsidiary-relationship-01.json +0 -23
  204. data/schemas/spec/spec_helper.rb +0 -78
  205. data/schemas/spec/validation_spec.rb +0 -39
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- MGI3OTY2YWNlZWFlNDk4M2EwOTVhOTUwYTAwNjViODZmZDVlYjY2MQ==
5
- data.tar.gz: !binary |-
6
- MTQ0OGM2ZWZjOWYwNzQ5MGQ3Y2YxZDRiOGYyM2FiY2Y4MzBjNDIzZQ==
2
+ SHA1:
3
+ metadata.gz: 1db3c143f46fc934729ee27c6cc5b4047fb2a5c5
4
+ data.tar.gz: 4c2de3f8f0ecc62f77689386dc6e50ed26290714
7
5
  SHA512:
8
- metadata.gz: !binary |-
9
- ZTFkYmVlMWFmZmIwMGNiYmIxZTA1ZDQzMTY5YWE0MjEyZWFlYmZiNTUxMjZj
10
- ZGUxNDc2ZTZkNzlkOTYzOWFlYTRkZDM5ZDgwMTRmYmE0ZTM3ZTAxMWIwNGFm
11
- YTA4MTgxNDc4OGI2OWRhZTk0NmQ3ODc4MmY5NWE4YmE4YzRlMjc=
12
- data.tar.gz: !binary |-
13
- MjQ2NmFhOGY4NDFjZDE2NGJjNTlkNGUwNWJjMzUyMDJjNGM4YjBiMGYxMzMw
14
- YTE4ZmIxZmE0YTU0N2Y1NWE0NDU4ZGUzZjc1ODExZmZmZDAxNmZmZWMzY2Qx
15
- MWQyNWU4NmEzYmQ5MjdiYzIxYTFlYTkyZjMzMWZjYWY0NjkwYWM=
6
+ metadata.gz: 6046b31e46416716606c0540ab60cc3b4c20d53043b7ec5701103dde77e47ccf6d0abb77e284bae0928dcaf66471d025c83bb51fbc6591c882ea401263cd4ae7
7
+ data.tar.gz: 22bd41bdd6639ea13f9d4787204fc7f13321d0c08ddf48748354b89a88bef8c0a73dda93f47e73cc20589332a7f6940400efa4998cc17c5ca7ebe8445222e65f
data/.travis.yml CHANGED
@@ -4,6 +4,7 @@ rvm:
4
4
  - "1.9.3"
5
5
  - "2.0.0"
6
6
  - "2.1.0"
7
+ - "2.2.0"
7
8
  # - jruby-18mode # JRuby in 1.8 mode
8
9
  # - jruby-19mode # JRuby in 1.9 mode
9
10
  # - rbx
data/Gemfile CHANGED
@@ -1,8 +1,10 @@
1
1
  source 'https://rubygems.org'
2
2
  gem "sqlite_magic", :git => 'https://github.com/openc/sqlite_magic.git'
3
+
3
4
  gem "pry", :group => [:development,:test]
4
5
  # Specify your gem's dependencies in openc_bot.gemspec
5
6
  gemspec
6
7
 
8
+
7
9
  # we need to do pull request and bump version
8
10
  # gem 'scraperwiki', '>=3.0.2', :git => 'https://github.com/openc/scraperwiki-ruby.git'
data/lib/openc_bot.rb CHANGED
@@ -80,10 +80,25 @@ module OpencBot
80
80
  end
81
81
  end
82
82
 
83
+ def db_location
84
+ File.expand_path(File.join(@@app_directory, 'db', db_name))
85
+ end
86
+
83
87
  # Override default in ScraperWiki gem
84
88
  def sqlite_magic_connection
85
89
  db = @config ? @config[:db] : File.expand_path(File.join(@@app_directory, 'db', db_name))
86
- @sqlite_magic_connection ||= SqliteMagic::Connection.new(db)
90
+ options = sqlite_busy_timeout ? {:busy_timeout => sqlite_busy_timeout} : {:busy_timeout => 10000}
91
+ @sqlite_magic_connection ||= SqliteMagic::Connection.new(db, options)
92
+ end
93
+
94
+ def sqlite_busy_timeout
95
+ self.const_defined?('SQLITE_BUSY_TIMEOUT') && self.const_get('SQLITE_BUSY_TIMEOUT')
96
+ end
97
+
98
+ def table_summary
99
+ field_names = sqlite_magic_connection.execute('PRAGMA table_info(ocdata)').collect{|c| c['name']}
100
+ select_sql = "COUNT(1) Total, " + field_names.collect{ |fn| "COUNT(#{fn}) #{fn}_not_null" }.join(', ') + " FROM ocdata"
101
+ select(select_sql).first
87
102
  end
88
103
 
89
104
  end
@@ -1,6 +1,8 @@
1
1
  require 'openc_bot'
2
2
  require 'openc_bot/helpers/incremental_search'
3
3
  require 'openc_bot/helpers/alpha_search'
4
+ # require 'openc_bot/asana_notifier'
5
+ require 'mail'
4
6
 
5
7
 
6
8
  module OpencBot
@@ -9,6 +11,8 @@ module OpencBot
9
11
  include OpencBot::Helpers::IncrementalSearch
10
12
  include OpencBot::Helpers::AlphaSearch
11
13
 
14
+ STDOUT.sync = true
15
+ STDERR.sync = true
12
16
  # This is called by #update_datum
13
17
  def fetch_datum(company_number)
14
18
  company_page = fetch_registry_page(company_number)
@@ -42,5 +46,51 @@ module OpencBot
42
46
  super || 'company-schema'
43
47
  end
44
48
 
49
+ def update_data(options={})
50
+ fetch_data
51
+ update_stale
52
+ send_run_report
53
+ rescue Exception => e
54
+ send_error_report(e)
55
+ raise e
56
+ end
57
+
58
+ private
59
+ def mark_bot_as_failing_on_asana(exception)
60
+ # error_description = "Code for this bot: https://github.com/openc/external_bots/tree/master/#{inferred_jurisdiction_code}_companies_fetcher\nError details: #{exception.inspect}.\nBacktrace:\n#{exception.backtrace}"
61
+ # params = {
62
+ # :tag => inferred_jurisdiction_code,
63
+ # :asana_api_key => ENV['ASANA_API_KEY'],
64
+ # :workspace => ENV['ASANA_WORKSPACE'],
65
+ # :title => exception.message,
66
+ # :description => error_description
67
+ # }
68
+ # AsanaNotifier.create_failed_bot_task(params)
69
+ end
70
+
71
+ def send_error_report(e)
72
+ subject = "Error running #{self.name}: #{e}"
73
+ body = "Error details: #{e.inspect}.\nBacktrace:\n#{e.backtrace}"
74
+ mark_bot_as_failing_on_asana(e) if ENV['CREATE_ASANA_TASKS_FOR_BOT_FAILURES']
75
+ send_report(:subject => subject, :body => body)
76
+ end
77
+
78
+ def send_run_report
79
+ subject = "#{self.name} successfully ran"
80
+ db_filesize = File.size?(db_location)
81
+ body = "No problems to report. db is #{db_location}, #{db_filesize} bytes. Last modified: #{File.stat(db_location).mtime}"
82
+ send_report(:subject => subject, :body => body)
83
+ end
84
+
85
+ def send_report(params)
86
+ Mail.deliver do
87
+ from 'admin@opencorporates.com'
88
+ to 'bots@opencorporates.com'
89
+ subject params[:subject]
90
+ body params[:body]
91
+ end
92
+ end
93
+
94
+
45
95
  end
46
96
  end
@@ -16,6 +16,10 @@ module OpencBot
16
16
  !!select("ocdata.#{primary_key_name} FROM ocdata WHERE #{primary_key_name} = ? LIMIT 1", uid).first
17
17
  end
18
18
 
19
+ def default_stale_count
20
+ self.const_defined?('STALE_COUNT') ? self.const_get('STALE_COUNT') : 1000
21
+ end
22
+
19
23
  # fetches and saves data. By default assumes an incremental search, or an alpha search
20
24
  # if USE_ALPHA_SEARCH is set. This method should be overridden if you are going to do a
21
25
  # different type of data import, e.g from a CSV file.
@@ -35,24 +39,25 @@ module OpencBot
35
39
  end
36
40
 
37
41
  def fetch_registry_page(company_number)
42
+ sleep_before_http_req
38
43
  _client.get_content(registry_url(company_number))
39
44
  end
40
45
 
41
46
  def prepare_and_save_data(all_data,options={})
42
47
  data_to_be_saved = prepare_for_saving(all_data)
43
- fail_count, retry_interval = 0, 5
48
+ # fail_count, retry_interval = 0, 5
44
49
  begin
45
50
  insert_or_update([primary_key_name], data_to_be_saved)
46
51
  rescue SQLite3::BusyException => e
47
- fail_count += 1
48
- if fail_count <= MAX_BUSY_RETRIES
49
- puts "#{e.inspect} raised #{fail_count} times saving:\n#{all_data}\n\nNow retrying in #{retry_interval} seconds" if verbose?
50
- sleep retry_interval
51
- retry_interval = retry_interval * 2
52
- retry
53
- else
54
- raise e
55
- end
52
+ # fail_count += 1
53
+ # if fail_count <= MAX_BUSY_RETRIES
54
+ puts "#{e.inspect} raised saving:\n#{all_data}\n\n" if verbose?
55
+ # sleep retry_interval
56
+ # retry_interval = retry_interval * 2
57
+ # retry
58
+ # else
59
+ raise e
60
+ # end
56
61
  end
57
62
 
58
63
  end
@@ -61,6 +66,10 @@ module OpencBot
61
66
  self.const_defined?('PRIMARY_KEY_NAME') ? self.const_get('PRIMARY_KEY_NAME') : :uid
62
67
  end
63
68
 
69
+ def raise_when_saving_invalid_record
70
+ !!self.const_defined?('RAISE_WHEN_SAVING_INVALID_RECORD')
71
+ end
72
+
64
73
  # sensible default. Either uses computed version or registry_url in db
65
74
  def registry_url(uid)
66
75
  computed_registry_url(uid) || registry_url_from_db(uid)
@@ -94,7 +103,7 @@ module OpencBot
94
103
  end
95
104
 
96
105
  def stale_entry_uids(stale_count=nil)
97
- stale_count ||= 1000
106
+ stale_count ||= default_stale_count
98
107
  sql_query = "ocdata.* from ocdata WHERE retrieved_at IS NULL OR strftime('%s', retrieved_at) < strftime('%s', '#{Date.today - 30}') LIMIT #{stale_count.to_i}"
99
108
  raw_data = select(sql_query).each do |res|
100
109
  yield res[primary_key_name.to_s]
@@ -108,6 +117,24 @@ module OpencBot
108
117
  end
109
118
  end
110
119
 
120
+ def get_raw_data(uid, format=nil)
121
+ file_location = raw_data_file_location(uid, format)
122
+ File.read(file_location) if File.exist?(file_location)
123
+ end
124
+
125
+ def save_raw_data(raw_data, uid, format=nil)
126
+ file_location = raw_data_file_location(uid, format)
127
+ File.open(file_location, 'w') { |f| f.print raw_data }
128
+ end
129
+
130
+ def raw_data_file_location(uid, format=nil)
131
+ normalised_uid = uid.gsub(/[^[[:alnum:]]]/,'')
132
+ directory = File.join(*([root_directory,'data',normalised_uid.gsub(/^0+/,'').split(//).first(5)].flatten))
133
+ FileUtils.mkdir_p(directory) unless Dir.exist?(directory)
134
+ filename = format ? "#{normalised_uid}.#{format}" : normalised_uid
135
+ File.join(directory, filename)
136
+ end
137
+
111
138
  def update_data(options={})
112
139
  fetch_data
113
140
  update_stale
@@ -130,13 +157,14 @@ module OpencBot
130
157
  # or, if output_as_json is requested then the validation error is included
131
158
  # in the JSON error message
132
159
  def update_datum(uid, output_as_json=false,replace_existing_data=false)
160
+ # XXX here we refuse to run depending on run algorithm
133
161
  return unless raw_data = fetch_datum(uid)
134
162
  default_options = {primary_key_name => uid, :retrieved_at => Time.now}
135
163
  return unless base_processed_data = process_datum(raw_data)
136
164
  processed_data = default_options.merge(base_processed_data)
137
165
  # prepare the data for saving (converting Arrays, Hashes to json) and
138
166
  # save the original data too, as we may not extracting everything from it yet
139
- save_entity(processed_data.merge(:data => raw_data))
167
+ raise_when_saving_invalid_record ? save_entity!(processed_data.merge(:data => raw_data)) : save_entity(processed_data.merge(:data => raw_data))
140
168
  if output_as_json
141
169
  puts processed_data.to_json
142
170
  else
@@ -152,11 +180,18 @@ module OpencBot
152
180
  end
153
181
  end
154
182
 
183
+ # at a rate of 1.16 companies per second, and allowing 12 hours
184
+ # running per day. a 3m register would be updated in 2 months:
185
+ MAX_STALE_COUNT = 100_000
155
186
  def update_stale(stale_count=nil)
156
- stale_entry_uids(stale_count) do |stale_entry_uid|
157
- update_datum(stale_entry_uid)
187
+ # XXX here set an arbitrarily large number and then rely on the system to stop
188
+ # XXX wrap this with timings to work out per-record rate
189
+ rate_limiter do |limiter|
190
+ stale_entry_uids(MAX_STALE_COUNT) do |stale_entry_uid|
191
+ update_datum(stale_entry_uid)
192
+ limiter.checkpoint
193
+ end
158
194
  end
159
-
160
195
  end
161
196
 
162
197
  def validate_datum(record)
@@ -196,6 +231,15 @@ module OpencBot
196
231
  prepared_data
197
232
  end
198
233
 
234
+ def sleep_before_http_req
235
+ if self.const_defined?('SLEEP_BEFORE_HTTP_REQ')
236
+ sleep_time = self.const_get('SLEEP_BEFORE_HTTP_REQ')
237
+ puts "#{self.name} about to sleep for #{sleep_time} before fetching data. Time now: #{Time.now}" if verbose?
238
+ sleep(sleep_time)
239
+ puts "#{self.name} slept for #{sleep_time}: Time now #{Time.now}" if verbose?
240
+ end
241
+ end
242
+
199
243
  def _client(options={})
200
244
  return @client if @client
201
245
  @client = HTTPClient.new(options.delete(:proxy))
@@ -3,6 +3,8 @@ require 'optparse'
3
3
  require 'json'
4
4
  require 'fileutils'
5
5
 
6
+ PID_DIR = "/oc/pids"
7
+
6
8
  namespace :bot do
7
9
  desc "create a skeleton bot that can be used in OpenCorporates"
8
10
  task :create do
@@ -134,6 +136,17 @@ namespace :bot do
134
136
  end
135
137
  end
136
138
 
139
+ desc 'Lists count of non-null values in each field in ocdata table'
140
+ task :table_summary do
141
+ only_process_running('table_summary') do
142
+ bot_name = get_bot_name
143
+ require_relative File.join(Dir.pwd,'lib', bot_name)
144
+ runner = callable_from_file_name(bot_name)
145
+ res = runner.table_summary
146
+ res.each {|k,v| puts "#{k}:\t#{v}"}
147
+ end
148
+ end
149
+
137
150
  desc 'Summarise data for quality checking (only works for licences at the moment)'
138
151
  task :summarise_data do
139
152
  def as_sorted_hash(name, data)
@@ -327,7 +340,7 @@ EOF
327
340
  puts "Created #{new_file}"
328
341
  end
329
342
  end
330
-
343
+
331
344
  #Add rspec debugger to gemfile
332
345
  File.open(File.join(working_dir,'Gemfile'),'a') do |file|
333
346
  file.puts "group :test do\n gem 'rspec'\n gem 'debugger'\nend"
@@ -341,7 +354,7 @@ EOF
341
354
  end
342
355
 
343
356
  def only_process_running(task_name)
344
- pid_path = File.join(Dir.pwd, 'pids', task_name)
357
+ pid_path = File.join(PID_DIR, 'pids', task_name)
345
358
 
346
359
  raise_if_already_running(pid_path)
347
360
  write_pid_file(pid_path)
@@ -11,3 +11,7 @@ end
11
11
  def dummy_response(response_name, options={})
12
12
  IO.read(File.join(File.dirname(__FILE__),"dummy_responses",response_name.to_s), options)
13
13
  end
14
+
15
+ Mail.defaults do
16
+ delivery_method :test # no, don't send emails when testing,
17
+ end
@@ -1,3 +1,3 @@
1
1
  module OpencBot
2
- VERSION = "0.0.27"
2
+ VERSION = "0.0.46"
3
3
  end
data/openc_bot.gemspec CHANGED
@@ -35,14 +35,17 @@ Gem::Specification.new do |gem|
35
35
  gem.add_dependency "rake"
36
36
  gem.add_dependency "activesupport", "4.1.4"
37
37
  gem.add_dependency "nokogiri"
38
- # gem.add_dependency "sqlite3"
38
+ gem.add_dependency "sqlite_magic", "0.0.6"
39
39
  gem.add_dependency "json"
40
40
  gem.add_dependency "json-schema"
41
41
  gem.add_dependency "httpclient"
42
42
  gem.add_dependency "backports"
43
43
  gem.add_dependency "scraperwiki", "3.0.2"
44
+ gem.add_dependency "mail"
45
+ # gem.add_dependency "openc-asana" unless RUBY_VERSION < '2.0'
44
46
 
45
- gem.add_development_dependency "perftools.rb"
46
- gem.add_development_dependency "debugger"
47
+ # gem.add_development_dependency "perftools.rb"
48
+ gem.add_development_dependency "byebug" unless RUBY_VERSION < '2.0'
49
+ gem.add_development_dependency "debugger" if RUBY_VERSION < '2.0'
47
50
  gem.add_development_dependency "rspec"
48
51
  end
@@ -3,6 +3,10 @@ require_relative '../spec_helper'
3
3
  require 'openc_bot'
4
4
  require 'openc_bot/company_fetcher_bot'
5
5
 
6
+ Mail.defaults do
7
+ delivery_method :test # no, don't send emails when testing
8
+ end
9
+
6
10
  module TestCompaniesFetcher
7
11
  extend OpencBot::CompanyFetcherBot
8
12
  end
@@ -121,4 +125,23 @@ describe "A module that extends CompanyFetcherBot" do
121
125
  end
122
126
  end
123
127
  end
128
+
129
+ describe '#update_data' do
130
+
131
+ before do
132
+ TestCompaniesFetcher.stub(:fetch_data_via_incremental_search)
133
+ TestCompaniesFetcher.stub(:update_stale)
134
+ #this can be any file that we can stat
135
+ TestCompaniesFetcher.stub(:db_location).
136
+ and_return(File.join(File.dirname(__FILE__),"company_fetcher_bot_spec.rb"))
137
+
138
+ Mail::TestMailer.deliveries.clear
139
+ TestCompaniesFetcher.update_data
140
+ end
141
+
142
+ it 'should send success email' do
143
+ Mail::TestMailer.deliveries.first.subject.should match /successfully ran/
144
+ end
145
+
146
+ end
124
147
  end
@@ -8,6 +8,8 @@ module ModuleThatIncludesRegisterMethods
8
8
  extend OpencBot::Helpers::RegisterMethods
9
9
  PRIMARY_KEY_NAME = :custom_uid
10
10
  SCHEMA_NAME = 'company-schema'
11
+ SLEEP_BEFORE_HTTP_REQ = 2
12
+ RAISE_WHEN_SAVING_INVALID_RECORD = true
11
13
  end
12
14
 
13
15
  module ModuleWithNoCustomPrimaryKey
@@ -29,7 +31,6 @@ describe 'a module that includes RegisterMethods' do
29
31
  describe "#datum_exists?" do
30
32
  before do
31
33
  ModuleThatIncludesRegisterMethods.stub(:select).and_return([])
32
-
33
34
  end
34
35
 
35
36
  it "should select_data from database" do
@@ -233,10 +234,12 @@ describe 'a module that includes RegisterMethods' do
233
234
 
234
235
  context 'and SQLite3::BusyException raised' do
235
236
  it 'should retry up to 3 times' do
237
+ pending "deciding whether to allow this in some circumstances"
236
238
  ModuleThatIncludesRegisterMethods.should_receive(:insert_or_update).exactly(4).times.and_raise(SQLite3::BusyException)
237
239
  lambda { ModuleThatIncludesRegisterMethods.prepare_and_save_data(@params) }.should raise_error(SQLite3::BusyException)
238
240
  end
239
241
  it 'should not raise error if successful before limit' do
242
+ pending "deciding whether to allow this in some circumstances"
240
243
  ModuleThatIncludesRegisterMethods.should_receive(:insert_or_update).exactly(3).times.ordered.and_raise(SQLite3::BusyException)
241
244
  ModuleThatIncludesRegisterMethods.should_receive(:insert_or_update).ordered
242
245
  lambda { ModuleThatIncludesRegisterMethods.prepare_and_save_data(@params) }.should_not raise_error
@@ -372,6 +375,15 @@ describe 'a module that includes RegisterMethods' do
372
375
  end
373
376
  end
374
377
 
378
+ context 'and errors returned validating data' do
379
+ it "should validate processed data" do
380
+ ModuleThatIncludesRegisterMethods.stub(:validate_datum).and_return([{:failed_attribute => 'foo', :message => 'Something not right'}])
381
+ lambda { ModuleThatIncludesRegisterMethods.update_datum(@uid)}.should raise_error
382
+ end
383
+
384
+
385
+ end
386
+
375
387
  context 'and process_datum returns nil' do
376
388
  before do
377
389
  ModuleThatIncludesRegisterMethods.stub(:process_datum).and_return(nil)
@@ -405,11 +417,12 @@ describe 'a module that includes RegisterMethods' do
405
417
  end
406
418
  end
407
419
 
408
- describe "#fetch_registry_page for company_number" do
420
+ describe "#fetch_registry_page for uid" do
409
421
  before do
410
422
  @dummy_client = double('http_client', :get_content => nil)
411
423
  ModuleThatIncludesRegisterMethods.stub(:_client).and_return(@dummy_client)
412
424
  ModuleThatIncludesRegisterMethods.stub(:registry_url).and_return('http://some.registry.url')
425
+ @dummy_client.stub(:get_content).and_return(:registry_page_html)
413
426
  end
414
427
 
415
428
  it "should GET registry_page for registry_url for company_number" do
@@ -423,6 +436,24 @@ describe 'a module that includes RegisterMethods' do
423
436
  @dummy_client.stub(:get_content).and_return(:registry_page_html)
424
437
  ModuleThatIncludesRegisterMethods.fetch_registry_page('76543').should == :registry_page_html
425
438
  end
439
+
440
+ context 'and SLEEP_BEFORE_HTTP_REQ is set' do
441
+ it 'should sleep for given period' do
442
+ ModuleThatIncludesRegisterMethods.should_receive(:sleep).with(2)
443
+ ModuleThatIncludesRegisterMethods.fetch_registry_page('76543')
444
+ end
445
+ end
446
+
447
+ context 'and SLEEP_BEFORE_HTTP_REQ is not set' do
448
+ before do
449
+ ModuleWithNoCustomPrimaryKey.stub(:_client).and_return(@dummy_client)
450
+ end
451
+
452
+ it 'should sleep for given period' do
453
+ ModuleWithNoCustomPrimaryKey.should_not_receive(:sleep)
454
+ ModuleWithNoCustomPrimaryKey.fetch_registry_page('76543')
455
+ end
456
+ end
426
457
  end
427
458
 
428
459
  describe "#validate_datum" do
@@ -594,4 +625,91 @@ describe 'a module that includes RegisterMethods' do
594
625
  end
595
626
  end
596
627
 
628
+ describe 'raise_when_saving_invalid_record' do
629
+ describe '#primary_key_name' do
630
+ it 'should return false if RAISE_WHEN_SAVING_INVALID_RECORD not set' do
631
+ ModuleWithNoCustomPrimaryKey.send(:raise_when_saving_invalid_record).should == false
632
+ end
633
+
634
+ it 'should return true if RAISE_WHEN_SAVING_INVALID_RECORD set' do
635
+ ModuleThatIncludesRegisterMethods.send(:raise_when_saving_invalid_record).should == true
636
+ end
637
+ end
638
+ end
639
+
640
+ describe '#raw_data_file_location for a uid' do
641
+ before do
642
+ @dummy_root_directory = File.join(File.dirname(__FILE__),'..','..','tmp')
643
+ Dir.mkdir(@dummy_root_directory) unless Dir.exist?(@dummy_root_directory)
644
+
645
+ ModuleThatIncludesRegisterMethods.stub(:root_directory).and_return(@dummy_root_directory)
646
+ end
647
+
648
+ after do
649
+ FileUtils.rmdir(File.join(@dummy_root_directory, 'data'))
650
+ end
651
+
652
+ it 'should return directory built from uid inside root data directory' do
653
+ ModuleThatIncludesRegisterMethods.raw_data_file_location('123456', 'html').should == File.join(@dummy_root_directory, 'data', '1','2','3','4','5', '123456.html')
654
+ end
655
+
656
+ it 'should create directory structure if it doesnt exist' do
657
+ ModuleThatIncludesRegisterMethods.raw_data_file_location('123456', 'html')
658
+ Dir.exist?(File.join(@dummy_root_directory, 'data', '1','2','3','4','5')).should == true
659
+ end
660
+
661
+ it 'should ignore leading zeroes when building directory' do
662
+ ModuleThatIncludesRegisterMethods.raw_data_file_location('001234', 'html').should == File.join(@dummy_root_directory, 'data', '1','2','3','4', '001234.html')
663
+ end
664
+
665
+ it 'should ignore non alphanum chars when building directory' do
666
+ ModuleThatIncludesRegisterMethods.raw_data_file_location('12a-b/3456', 'html').should == File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456.html')
667
+ end
668
+
669
+ it 'should allow format to be missing' do
670
+ ModuleThatIncludesRegisterMethods.raw_data_file_location('12a-b/3456').should == File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456')
671
+ end
672
+
673
+ it 'should allow format to be nil' do
674
+ ModuleThatIncludesRegisterMethods.raw_data_file_location('12a-b/3456', nil).should == File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456')
675
+ end
676
+ end
677
+
678
+ describe '#save_raw_data' do # NOTE(review): no after hook in this group, so files written under tmp/data are left on disk — confirm that is intended
679
+ before do
680
+ @dummy_root_directory = File.join(File.dirname(__FILE__),'..','..','tmp') # scratch root used in place of the bot's real root_directory
681
+ Dir.mkdir(@dummy_root_directory) unless Dir.exist?(@dummy_root_directory)
682
+
683
+ ModuleThatIncludesRegisterMethods.stub(:root_directory).and_return(@dummy_root_directory)
684
+ end
685
+
686
+ it 'should save raw data as in computed raw_data_file_location' do
687
+ ModuleThatIncludesRegisterMethods.save_raw_data('foo bar', '12a-b/3456', 'html')
688
+ File.read(File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456.html')).should == 'foo bar' # same path the raw_data_file_location examples expect for this uid
689
+ end
690
+
691
+ it 'should allow format to be missing' do
692
+ ModuleThatIncludesRegisterMethods.save_raw_data('foo bar', '12a-b/3456')
693
+ File.read(File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456')).should == 'foo bar' # no extension when format is omitted
694
+ end
695
+ end
696
+
697
+ describe '#get_raw_data' do
698
+ before do
699
+ @dummy_root_directory = File.join(File.dirname(__FILE__),'..','..','tmp')
700
+ Dir.mkdir(@dummy_root_directory) unless Dir.exist?(@dummy_root_directory)
701
+
702
+ ModuleThatIncludesRegisterMethods.stub(:root_directory).and_return(@dummy_root_directory)
703
+ end
704
+
705
+ it 'should read raw data in computed raw_data_file_location' do
706
+ File.open(File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456.html'),'w') { |f| f.print 'foo bar' }
707
+ ModuleThatIncludesRegisterMethods.get_raw_data('12a-b/3456', 'html').should == 'foo bar'
708
+ end
709
+
710
+ it 'should allow format to be missing' do
711
+ File.open(File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456'),'w') { |f| f.print 'foo bar' }
712
+ ModuleThatIncludesRegisterMethods.get_raw_data('12a-b/3456').should == 'foo bar'
713
+ end
714
+ end
597
715
  end