openc_bot 0.0.27 → 0.0.46

Sign up to get free protection for your applications and to get access to all the features.
Files changed (205) hide show
  1. checksums.yaml +5 -13
  2. data/.travis.yml +1 -0
  3. data/Gemfile +2 -0
  4. data/lib/openc_bot.rb +16 -1
  5. data/lib/openc_bot/company_fetcher_bot.rb +50 -0
  6. data/lib/openc_bot/helpers/register_methods.rb +59 -15
  7. data/lib/openc_bot/tasks.rb +15 -2
  8. data/lib/openc_bot/templates/spec/spec_helper.rb +4 -0
  9. data/lib/openc_bot/version.rb +1 -1
  10. data/openc_bot.gemspec +6 -3
  11. data/spec/lib/company_fetcher_bot_spec.rb +23 -0
  12. data/spec/lib/helpers/register_methods_spec.rb +120 -2
  13. data/spec/lib/openc_bot_spec.rb +14 -1
  14. data/spec/simple_openc_bot_spec.rb +1 -0
  15. data/spec/spec_helper.rb +3 -1
  16. metadata +43 -220
  17. data/schemas/.gitignore +0 -13
  18. data/schemas/.travis.yml +0 -5
  19. data/schemas/Gemfile +0 -9
  20. data/schemas/README.md +0 -2
  21. data/schemas/Rakefile +0 -46
  22. data/schemas/lib/base-statement.json +0 -22
  23. data/schemas/lib/snippets/financial-payment-base.json +0 -4
  24. data/schemas/lib/snippets/licence-base.json +0 -4
  25. data/schemas/schemas/company-schema.json +0 -183
  26. data/schemas/schemas/financial-payment-schema.json +0 -27
  27. data/schemas/schemas/includes/address.json +0 -23
  28. data/schemas/schemas/includes/alternative_name.json +0 -13
  29. data/schemas/schemas/includes/company.json +0 -16
  30. data/schemas/schemas/includes/filing.json +0 -20
  31. data/schemas/schemas/includes/financial-payment-data-object.json +0 -60
  32. data/schemas/schemas/includes/industry_code.json +0 -16
  33. data/schemas/schemas/includes/licence-data-object.json +0 -36
  34. data/schemas/schemas/includes/officer.json +0 -50
  35. data/schemas/schemas/includes/previous_name.json +0 -14
  36. data/schemas/schemas/includes/share-parcel-data.json +0 -67
  37. data/schemas/schemas/includes/share-parcel.json +0 -63
  38. data/schemas/schemas/includes/subsidiary-relationship-data.json +0 -47
  39. data/schemas/schemas/includes/total-shares.json +0 -10
  40. data/schemas/schemas/licence-schema.json +0 -27
  41. data/schemas/schemas/primary-data-schema.json +0 -17
  42. data/schemas/schemas/share-parcel-schema.json +0 -21
  43. data/schemas/schemas/simple-financial-payment-schema.json +0 -85
  44. data/schemas/schemas/simple-licence-schema.json +0 -59
  45. data/schemas/schemas/simple-subsidiary-schema.json +0 -68
  46. data/schemas/schemas/subsidiary-relationship-schema.json +0 -27
  47. data/schemas/spec/sample-data/invalid/company-01.json +0 -4
  48. data/schemas/spec/sample-data/invalid/company-02.json +0 -5
  49. data/schemas/spec/sample-data/invalid/company-03.json +0 -5
  50. data/schemas/spec/sample-data/invalid/company-04.json +0 -5
  51. data/schemas/spec/sample-data/invalid/company-05.json +0 -5
  52. data/schemas/spec/sample-data/invalid/company-06.json +0 -6
  53. data/schemas/spec/sample-data/invalid/company-07.json +0 -8
  54. data/schemas/spec/sample-data/invalid/company-08.json +0 -7
  55. data/schemas/spec/sample-data/invalid/company-09.json +0 -9
  56. data/schemas/spec/sample-data/invalid/company-10.json +0 -7
  57. data/schemas/spec/sample-data/invalid/company-11.json +0 -9
  58. data/schemas/spec/sample-data/invalid/company-12.json +0 -11
  59. data/schemas/spec/sample-data/invalid/company-13.json +0 -11
  60. data/schemas/spec/sample-data/invalid/company-14.json +0 -7
  61. data/schemas/spec/sample-data/invalid/company-15.json +0 -7
  62. data/schemas/spec/sample-data/invalid/company-16.json +0 -7
  63. data/schemas/spec/sample-data/invalid/company-17.json +0 -9
  64. data/schemas/spec/sample-data/invalid/company-18.json +0 -9
  65. data/schemas/spec/sample-data/invalid/company-19.json +0 -9
  66. data/schemas/spec/sample-data/invalid/company-20.json +0 -9
  67. data/schemas/spec/sample-data/invalid/company-21.json +0 -11
  68. data/schemas/spec/sample-data/invalid/company-22.json +0 -11
  69. data/schemas/spec/sample-data/invalid/company-23.json +0 -7
  70. data/schemas/spec/sample-data/invalid/company-24.json +0 -12
  71. data/schemas/spec/sample-data/invalid/company-25.json +0 -9
  72. data/schemas/spec/sample-data/invalid/company-26.json +0 -11
  73. data/schemas/spec/sample-data/invalid/company-27.json +0 -7
  74. data/schemas/spec/sample-data/invalid/company-28.json +0 -9
  75. data/schemas/spec/sample-data/invalid/company-29.json +0 -12
  76. data/schemas/spec/sample-data/invalid/company-30.json +0 -16
  77. data/schemas/spec/sample-data/invalid/company-31.json +0 -14
  78. data/schemas/spec/sample-data/invalid/company-32.json +0 -11
  79. data/schemas/spec/sample-data/invalid/company-33.json +0 -7
  80. data/schemas/spec/sample-data/invalid/company-34.json +0 -9
  81. data/schemas/spec/sample-data/invalid/company-35.json +0 -9
  82. data/schemas/spec/sample-data/invalid/company-36.json +0 -10
  83. data/schemas/spec/sample-data/invalid/company-37.json +0 -7
  84. data/schemas/spec/sample-data/invalid/company-38.json +0 -9
  85. data/schemas/spec/sample-data/invalid/company-39.json +0 -11
  86. data/schemas/spec/sample-data/invalid/company-40.json +0 -12
  87. data/schemas/spec/sample-data/invalid/company-41.json +0 -12
  88. data/schemas/spec/sample-data/invalid/company-42.json +0 -7
  89. data/schemas/spec/sample-data/invalid/company-43.json +0 -9
  90. data/schemas/spec/sample-data/invalid/company-44.json +0 -11
  91. data/schemas/spec/sample-data/invalid/company-45.json +0 -11
  92. data/schemas/spec/sample-data/invalid/company-46.json +0 -7
  93. data/schemas/spec/sample-data/invalid/company-47.json +0 -9
  94. data/schemas/spec/sample-data/invalid/company-48.json +0 -9
  95. data/schemas/spec/sample-data/invalid/company-49.json +0 -9
  96. data/schemas/spec/sample-data/invalid/company-50.json +0 -9
  97. data/schemas/spec/sample-data/invalid/company-51.json +0 -9
  98. data/schemas/spec/sample-data/invalid/company-52.json +0 -9
  99. data/schemas/spec/sample-data/invalid/company-53.json +0 -10
  100. data/schemas/spec/sample-data/invalid/company-54.json +0 -9
  101. data/schemas/spec/sample-data/invalid/company-55.json +0 -9
  102. data/schemas/spec/sample-data/invalid/company-56.json +0 -7
  103. data/schemas/spec/sample-data/invalid/company-57.json +0 -7
  104. data/schemas/spec/sample-data/invalid/company-58.json +0 -7
  105. data/schemas/spec/sample-data/invalid/company-59.json +0 -13
  106. data/schemas/spec/sample-data/invalid/company-60.json +0 -7
  107. data/schemas/spec/sample-data/invalid/company-61.json +0 -7
  108. data/schemas/spec/sample-data/invalid/company-62.json +0 -9
  109. data/schemas/spec/sample-data/invalid/company-63.json +0 -12
  110. data/schemas/spec/sample-data/invalid/company-64.json +0 -14
  111. data/schemas/spec/sample-data/invalid/company-65.json +0 -14
  112. data/schemas/spec/sample-data/invalid/company-66.json +0 -13
  113. data/schemas/spec/sample-data/invalid/company-67.json +0 -14
  114. data/schemas/spec/sample-data/invalid/company-68.json +0 -12
  115. data/schemas/spec/sample-data/invalid/company-69.json +0 -12
  116. data/schemas/spec/sample-data/invalid/company-70.json +0 -14
  117. data/schemas/spec/sample-data/invalid/financial-payment-01.json +0 -24
  118. data/schemas/spec/sample-data/invalid/licence-01.json +0 -18
  119. data/schemas/spec/sample-data/invalid/licence-02.json +0 -18
  120. data/schemas/spec/sample-data/invalid/licence-03.json +0 -12
  121. data/schemas/spec/sample-data/invalid/licence-04.json +0 -18
  122. data/schemas/spec/sample-data/invalid/licence-05.json +0 -18
  123. data/schemas/spec/sample-data/invalid/licence-06.json +0 -18
  124. data/schemas/spec/sample-data/invalid/licence-07.json +0 -20
  125. data/schemas/spec/sample-data/invalid/licence-08.json +0 -21
  126. data/schemas/spec/sample-data/invalid/primary-data-01.json +0 -4
  127. data/schemas/spec/sample-data/invalid/primary-data-02.json +0 -4
  128. data/schemas/spec/sample-data/invalid/simple-licence-01.json +0 -9
  129. data/schemas/spec/sample-data/invalid/simple-licence-02.json +0 -8
  130. data/schemas/spec/sample-data/invalid/simple-licence-03.json +0 -9
  131. data/schemas/spec/sample-data/invalid/simple-licence-04.json +0 -10
  132. data/schemas/spec/sample-data/invalid/simple-licence-05.json +0 -10
  133. data/schemas/spec/sample-data/invalid/simple-licence-06.json +0 -10
  134. data/schemas/spec/sample-data/invalid/simple-subsidiary-01.json +0 -13
  135. data/schemas/spec/sample-data/invalid/simple-subsidiary-02.json +0 -13
  136. data/schemas/spec/sample-data/licence-schema.json.old +0 -21
  137. data/schemas/spec/sample-data/valid/company-01.json +0 -6
  138. data/schemas/spec/sample-data/valid/company-02.json +0 -7
  139. data/schemas/spec/sample-data/valid/company-03.json +0 -8
  140. data/schemas/spec/sample-data/valid/company-04.json +0 -7
  141. data/schemas/spec/sample-data/valid/company-05.json +0 -7
  142. data/schemas/spec/sample-data/valid/company-06.json +0 -12
  143. data/schemas/spec/sample-data/valid/company-07.json +0 -9
  144. data/schemas/spec/sample-data/valid/company-08.json +0 -9
  145. data/schemas/spec/sample-data/valid/company-09.json +0 -20
  146. data/schemas/spec/sample-data/valid/company-10.json +0 -9
  147. data/schemas/spec/sample-data/valid/company-11.json +0 -7
  148. data/schemas/spec/sample-data/valid/company-12.json +0 -7
  149. data/schemas/spec/sample-data/valid/company-13.json +0 -7
  150. data/schemas/spec/sample-data/valid/company-14.json +0 -15
  151. data/schemas/spec/sample-data/valid/company-15.json +0 -8
  152. data/schemas/spec/sample-data/valid/company-16.json +0 -9
  153. data/schemas/spec/sample-data/valid/company-17.json +0 -9
  154. data/schemas/spec/sample-data/valid/company-18.json +0 -9
  155. data/schemas/spec/sample-data/valid/company-19.json +0 -37
  156. data/schemas/spec/sample-data/valid/company-20.json +0 -9
  157. data/schemas/spec/sample-data/valid/company-21.json +0 -26
  158. data/schemas/spec/sample-data/valid/company-22.json +0 -20
  159. data/schemas/spec/sample-data/valid/company-23.json +0 -9
  160. data/schemas/spec/sample-data/valid/company-24.json +0 -12
  161. data/schemas/spec/sample-data/valid/company-25.json +0 -12
  162. data/schemas/spec/sample-data/valid/company-26.json +0 -12
  163. data/schemas/spec/sample-data/valid/company-27.json +0 -28
  164. data/schemas/spec/sample-data/valid/company-28.json +0 -9
  165. data/schemas/spec/sample-data/valid/company-29.json +0 -10
  166. data/schemas/spec/sample-data/valid/company-30.json +0 -9
  167. data/schemas/spec/sample-data/valid/company-31.json +0 -17
  168. data/schemas/spec/sample-data/valid/company-32.json +0 -9
  169. data/schemas/spec/sample-data/valid/company-33.json +0 -29
  170. data/schemas/spec/sample-data/valid/company-34.json +0 -9
  171. data/schemas/spec/sample-data/valid/company-35.json +0 -9
  172. data/schemas/spec/sample-data/valid/company-36.json +0 -9
  173. data/schemas/spec/sample-data/valid/company-37.json +0 -9
  174. data/schemas/spec/sample-data/valid/company-38.json +0 -9
  175. data/schemas/spec/sample-data/valid/company-39.json +0 -9
  176. data/schemas/spec/sample-data/valid/company-40.json +0 -9
  177. data/schemas/spec/sample-data/valid/company-41.json +0 -9
  178. data/schemas/spec/sample-data/valid/company-42.json +0 -10
  179. data/schemas/spec/sample-data/valid/company-43.json +0 -7
  180. data/schemas/spec/sample-data/valid/company-44.json +0 -7
  181. data/schemas/spec/sample-data/valid/company-45.json +0 -23
  182. data/schemas/spec/sample-data/valid/company-46.json +0 -7
  183. data/schemas/spec/sample-data/valid/company-47.json +0 -12
  184. data/schemas/spec/sample-data/valid/company-48.json +0 -7
  185. data/schemas/spec/sample-data/valid/company-49.json +0 -14
  186. data/schemas/spec/sample-data/valid/company-50.json +0 -13
  187. data/schemas/spec/sample-data/valid/company-51.json +0 -14
  188. data/schemas/spec/sample-data/valid/company-52.json +0 -12
  189. data/schemas/spec/sample-data/valid/company-53.json +0 -9
  190. data/schemas/spec/sample-data/valid/financial-payment-01.json +0 -25
  191. data/schemas/spec/sample-data/valid/financial-payment-02.json +0 -29
  192. data/schemas/spec/sample-data/valid/licence-01.json +0 -19
  193. data/schemas/spec/sample-data/valid/licence-02.json +0 -21
  194. data/schemas/spec/sample-data/valid/licence-03.json +0 -21
  195. data/schemas/spec/sample-data/valid/licence-04.json +0 -26
  196. data/schemas/spec/sample-data/valid/primary-data-01.json +0 -4
  197. data/schemas/spec/sample-data/valid/primary-data-02.json +0 -5
  198. data/schemas/spec/sample-data/valid/simple-licence-01.json +0 -10
  199. data/schemas/spec/sample-data/valid/simple-licence-02.json +0 -10
  200. data/schemas/spec/sample-data/valid/simple-licence-03.json +0 -12
  201. data/schemas/spec/sample-data/valid/simple-subsidiary-01.json +0 -13
  202. data/schemas/spec/sample-data/valid/simple-subsidiary-02.json +0 -13
  203. data/schemas/spec/sample-data/valid/subsidiary-relationship-01.json +0 -23
  204. data/schemas/spec/spec_helper.rb +0 -78
  205. data/schemas/spec/validation_spec.rb +0 -39
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- MGI3OTY2YWNlZWFlNDk4M2EwOTVhOTUwYTAwNjViODZmZDVlYjY2MQ==
5
- data.tar.gz: !binary |-
6
- MTQ0OGM2ZWZjOWYwNzQ5MGQ3Y2YxZDRiOGYyM2FiY2Y4MzBjNDIzZQ==
2
+ SHA1:
3
+ metadata.gz: 1db3c143f46fc934729ee27c6cc5b4047fb2a5c5
4
+ data.tar.gz: 4c2de3f8f0ecc62f77689386dc6e50ed26290714
7
5
  SHA512:
8
- metadata.gz: !binary |-
9
- ZTFkYmVlMWFmZmIwMGNiYmIxZTA1ZDQzMTY5YWE0MjEyZWFlYmZiNTUxMjZj
10
- ZGUxNDc2ZTZkNzlkOTYzOWFlYTRkZDM5ZDgwMTRmYmE0ZTM3ZTAxMWIwNGFm
11
- YTA4MTgxNDc4OGI2OWRhZTk0NmQ3ODc4MmY5NWE4YmE4YzRlMjc=
12
- data.tar.gz: !binary |-
13
- MjQ2NmFhOGY4NDFjZDE2NGJjNTlkNGUwNWJjMzUyMDJjNGM4YjBiMGYxMzMw
14
- YTE4ZmIxZmE0YTU0N2Y1NWE0NDU4ZGUzZjc1ODExZmZmZDAxNmZmZWMzY2Qx
15
- MWQyNWU4NmEzYmQ5MjdiYzIxYTFlYTkyZjMzMWZjYWY0NjkwYWM=
6
+ metadata.gz: 6046b31e46416716606c0540ab60cc3b4c20d53043b7ec5701103dde77e47ccf6d0abb77e284bae0928dcaf66471d025c83bb51fbc6591c882ea401263cd4ae7
7
+ data.tar.gz: 22bd41bdd6639ea13f9d4787204fc7f13321d0c08ddf48748354b89a88bef8c0a73dda93f47e73cc20589332a7f6940400efa4998cc17c5ca7ebe8445222e65f
data/.travis.yml CHANGED
@@ -4,6 +4,7 @@ rvm:
4
4
  - "1.9.3"
5
5
  - "2.0.0"
6
6
  - "2.1.0"
7
+ - "2.2.0"
7
8
  # - jruby-18mode # JRuby in 1.8 mode
8
9
  # - jruby-19mode # JRuby in 1.9 mode
9
10
  # - rbx
data/Gemfile CHANGED
@@ -1,8 +1,10 @@
1
1
  source 'https://rubygems.org'
2
2
  gem "sqlite_magic", :git => 'https://github.com/openc/sqlite_magic.git'
3
+
3
4
  gem "pry", :group => [:development,:test]
4
5
  # Specify your gem's dependencies in openc_bot.gemspec
5
6
  gemspec
6
7
 
8
+
7
9
  # we need to do pull request and bump version
8
10
  # gem 'scraperwiki', '>=3.0.2', :git => 'https://github.com/openc/scraperwiki-ruby.git'
data/lib/openc_bot.rb CHANGED
@@ -80,10 +80,25 @@ module OpencBot
80
80
  end
81
81
  end
82
82
 
83
+ def db_location
84
+ File.expand_path(File.join(@@app_directory, 'db', db_name))
85
+ end
86
+
83
87
  # Override default in ScraperWiki gem
84
88
  def sqlite_magic_connection
85
89
  db = @config ? @config[:db] : File.expand_path(File.join(@@app_directory, 'db', db_name))
86
- @sqlite_magic_connection ||= SqliteMagic::Connection.new(db)
90
+ options = sqlite_busy_timeout ? {:busy_timeout => sqlite_busy_timeout} : {:busy_timeout => 10000}
91
+ @sqlite_magic_connection ||= SqliteMagic::Connection.new(db, options)
92
+ end
93
+
94
+ def sqlite_busy_timeout
95
+ self.const_defined?('SQLITE_BUSY_TIMEOUT') && self.const_get('SQLITE_BUSY_TIMEOUT')
96
+ end
97
+
98
+ def table_summary
99
+ field_names = sqlite_magic_connection.execute('PRAGMA table_info(ocdata)').collect{|c| c['name']}
100
+ select_sql = "COUNT(1) Total, " + field_names.collect{ |fn| "COUNT(#{fn}) #{fn}_not_null" }.join(', ') + " FROM ocdata"
101
+ select(select_sql).first
87
102
  end
88
103
 
89
104
  end
@@ -1,6 +1,8 @@
1
1
  require 'openc_bot'
2
2
  require 'openc_bot/helpers/incremental_search'
3
3
  require 'openc_bot/helpers/alpha_search'
4
+ # require 'openc_bot/asana_notifier'
5
+ require 'mail'
4
6
 
5
7
 
6
8
  module OpencBot
@@ -9,6 +11,8 @@ module OpencBot
9
11
  include OpencBot::Helpers::IncrementalSearch
10
12
  include OpencBot::Helpers::AlphaSearch
11
13
 
14
+ STDOUT.sync = true
15
+ STDERR.sync = true
12
16
  # This is called by #update_datum
13
17
  def fetch_datum(company_number)
14
18
  company_page = fetch_registry_page(company_number)
@@ -42,5 +46,51 @@ module OpencBot
42
46
  super || 'company-schema'
43
47
  end
44
48
 
49
+ def update_data(options={})
50
+ fetch_data
51
+ update_stale
52
+ send_run_report
53
+ rescue Exception => e
54
+ send_error_report(e)
55
+ raise e
56
+ end
57
+
58
+ private
59
+ def mark_bot_as_failing_on_asana(exception)
60
+ # error_description = "Code for this bot: https://github.com/openc/external_bots/tree/master/#{inferred_jurisdiction_code}_companies_fetcher\nError details: #{exception.inspect}.\nBacktrace:\n#{exception.backtrace}"
61
+ # params = {
62
+ # :tag => inferred_jurisdiction_code,
63
+ # :asana_api_key => ENV['ASANA_API_KEY'],
64
+ # :workspace => ENV['ASANA_WORKSPACE'],
65
+ # :title => exception.message,
66
+ # :description => error_description
67
+ # }
68
+ # AsanaNotifier.create_failed_bot_task(params)
69
+ end
70
+
71
+ def send_error_report(e)
72
+ subject = "Error running #{self.name}: #{e}"
73
+ body = "Error details: #{e.inspect}.\nBacktrace:\n#{e.backtrace}"
74
+ mark_bot_as_failing_on_asana(e) if ENV['CREATE_ASANA_TASKS_FOR_BOT_FAILURES']
75
+ send_report(:subject => subject, :body => body)
76
+ end
77
+
78
+ def send_run_report
79
+ subject = "#{self.name} successfully ran"
80
+ db_filesize = File.size?(db_location)
81
+ body = "No problems to report. db is #{db_location}, #{db_filesize} bytes. Last modified: #{File.stat(db_location).mtime}"
82
+ send_report(:subject => subject, :body => body)
83
+ end
84
+
85
+ def send_report(params)
86
+ Mail.deliver do
87
+ from 'admin@opencorporates.com'
88
+ to 'bots@opencorporates.com'
89
+ subject params[:subject]
90
+ body params[:body]
91
+ end
92
+ end
93
+
94
+
45
95
  end
46
96
  end
@@ -16,6 +16,10 @@ module OpencBot
16
16
  !!select("ocdata.#{primary_key_name} FROM ocdata WHERE #{primary_key_name} = ? LIMIT 1", uid).first
17
17
  end
18
18
 
19
+ def default_stale_count
20
+ self.const_defined?('STALE_COUNT') ? self.const_get('STALE_COUNT') : 1000
21
+ end
22
+
19
23
  # fetches and saves data. By default assumes an incremental search, or an alpha search
20
24
  # if USE_ALPHA_SEARCH is set. This method should be overridden if you are going to do a
21
25
  # different type of data import, e.g from a CSV file.
@@ -35,24 +39,25 @@ module OpencBot
35
39
  end
36
40
 
37
41
  def fetch_registry_page(company_number)
42
+ sleep_before_http_req
38
43
  _client.get_content(registry_url(company_number))
39
44
  end
40
45
 
41
46
  def prepare_and_save_data(all_data,options={})
42
47
  data_to_be_saved = prepare_for_saving(all_data)
43
- fail_count, retry_interval = 0, 5
48
+ # fail_count, retry_interval = 0, 5
44
49
  begin
45
50
  insert_or_update([primary_key_name], data_to_be_saved)
46
51
  rescue SQLite3::BusyException => e
47
- fail_count += 1
48
- if fail_count <= MAX_BUSY_RETRIES
49
- puts "#{e.inspect} raised #{fail_count} times saving:\n#{all_data}\n\nNow retrying in #{retry_interval} seconds" if verbose?
50
- sleep retry_interval
51
- retry_interval = retry_interval * 2
52
- retry
53
- else
54
- raise e
55
- end
52
+ # fail_count += 1
53
+ # if fail_count <= MAX_BUSY_RETRIES
54
+ puts "#{e.inspect} raised saving:\n#{all_data}\n\n" if verbose?
55
+ # sleep retry_interval
56
+ # retry_interval = retry_interval * 2
57
+ # retry
58
+ # else
59
+ raise e
60
+ # end
56
61
  end
57
62
 
58
63
  end
@@ -61,6 +66,10 @@ module OpencBot
61
66
  self.const_defined?('PRIMARY_KEY_NAME') ? self.const_get('PRIMARY_KEY_NAME') : :uid
62
67
  end
63
68
 
69
+ def raise_when_saving_invalid_record
70
+ !!self.const_defined?('RAISE_WHEN_SAVING_INVALID_RECORD')
71
+ end
72
+
64
73
  # sensible default. Either uses computed version or registry_url in db
65
74
  def registry_url(uid)
66
75
  computed_registry_url(uid) || registry_url_from_db(uid)
@@ -94,7 +103,7 @@ module OpencBot
94
103
  end
95
104
 
96
105
  def stale_entry_uids(stale_count=nil)
97
- stale_count ||= 1000
106
+ stale_count ||= default_stale_count
98
107
  sql_query = "ocdata.* from ocdata WHERE retrieved_at IS NULL OR strftime('%s', retrieved_at) < strftime('%s', '#{Date.today - 30}') LIMIT #{stale_count.to_i}"
99
108
  raw_data = select(sql_query).each do |res|
100
109
  yield res[primary_key_name.to_s]
@@ -108,6 +117,24 @@ module OpencBot
108
117
  end
109
118
  end
110
119
 
120
+ def get_raw_data(uid, format=nil)
121
+ file_location = raw_data_file_location(uid, format)
122
+ File.read(file_location) if File.exist?(file_location)
123
+ end
124
+
125
+ def save_raw_data(raw_data, uid, format=nil)
126
+ file_location = raw_data_file_location(uid, format)
127
+ File.open(file_location, 'w') { |f| f.print raw_data }
128
+ end
129
+
130
+ def raw_data_file_location(uid, format=nil)
131
+ normalised_uid = uid.gsub(/[^[[:alnum:]]]/,'')
132
+ directory = File.join(*([root_directory,'data',normalised_uid.gsub(/^0+/,'').split(//).first(5)].flatten))
133
+ FileUtils.mkdir_p(directory) unless Dir.exist?(directory)
134
+ filename = format ? "#{normalised_uid}.#{format}" : normalised_uid
135
+ File.join(directory, filename)
136
+ end
137
+
111
138
  def update_data(options={})
112
139
  fetch_data
113
140
  update_stale
@@ -130,13 +157,14 @@ module OpencBot
130
157
  # or, if output_as_json is requested then the validation error is included
131
158
  # in the JSON error message
132
159
  def update_datum(uid, output_as_json=false,replace_existing_data=false)
160
+ # XXX here we refuse to run depending on run algorithm
133
161
  return unless raw_data = fetch_datum(uid)
134
162
  default_options = {primary_key_name => uid, :retrieved_at => Time.now}
135
163
  return unless base_processed_data = process_datum(raw_data)
136
164
  processed_data = default_options.merge(base_processed_data)
137
165
  # prepare the data for saving (converting Arrays, Hashes to json) and
138
166
  # save the original data too, as we may not extracting everything from it yet
139
- save_entity(processed_data.merge(:data => raw_data))
167
+ raise_when_saving_invalid_record ? save_entity!(processed_data.merge(:data => raw_data)) : save_entity(processed_data.merge(:data => raw_data))
140
168
  if output_as_json
141
169
  puts processed_data.to_json
142
170
  else
@@ -152,11 +180,18 @@ module OpencBot
152
180
  end
153
181
  end
154
182
 
183
+ # at a rate of 1.16 companies per second, and allowing 12 hours
184
+ # running per day. a 3m register would be updated in 2 months:
185
+ MAX_STALE_COUNT = 100_000
155
186
  def update_stale(stale_count=nil)
156
- stale_entry_uids(stale_count) do |stale_entry_uid|
157
- update_datum(stale_entry_uid)
187
+ # XXX here set an arbitrarily large number and then rely on the system to stop
188
+ # XXX wrap this with timings to work out per-record rate
189
+ rate_limiter do |limiter|
190
+ stale_entry_uids(MAX_STALE_COUNT) do |stale_entry_uid|
191
+ update_datum(stale_entry_uid)
192
+ limiter.checkpoint
193
+ end
158
194
  end
159
-
160
195
  end
161
196
 
162
197
  def validate_datum(record)
@@ -196,6 +231,15 @@ module OpencBot
196
231
  prepared_data
197
232
  end
198
233
 
234
+ def sleep_before_http_req
235
+ if self.const_defined?('SLEEP_BEFORE_HTTP_REQ')
236
+ sleep_time = self.const_get('SLEEP_BEFORE_HTTP_REQ')
237
+ puts "#{self.name} about to sleep for #{sleep_time} before fetching data. Time now: #{Time.now}" if verbose?
238
+ sleep(sleep_time)
239
+ puts "#{self.name} slept for #{sleep_time}: Time now #{Time.now}" if verbose?
240
+ end
241
+ end
242
+
199
243
  def _client(options={})
200
244
  return @client if @client
201
245
  @client = HTTPClient.new(options.delete(:proxy))
@@ -3,6 +3,8 @@ require 'optparse'
3
3
  require 'json'
4
4
  require 'fileutils'
5
5
 
6
+ PID_DIR = "/oc/pids"
7
+
6
8
  namespace :bot do
7
9
  desc "create a skeleton bot that can be used in OpenCorporates"
8
10
  task :create do
@@ -134,6 +136,17 @@ namespace :bot do
134
136
  end
135
137
  end
136
138
 
139
+ desc 'Lists count of non-null values in each field in ocdata table'
140
+ task :table_summary do
141
+ only_process_running('table_summary') do
142
+ bot_name = get_bot_name
143
+ require_relative File.join(Dir.pwd,'lib', bot_name)
144
+ runner = callable_from_file_name(bot_name)
145
+ res = runner.table_summary
146
+ res.each {|k,v| puts "#{k}:\t#{v}"}
147
+ end
148
+ end
149
+
137
150
  desc 'Summarise data for quality checking (only works for licences at the moment)'
138
151
  task :summarise_data do
139
152
  def as_sorted_hash(name, data)
@@ -327,7 +340,7 @@ EOF
327
340
  puts "Created #{new_file}"
328
341
  end
329
342
  end
330
-
343
+
331
344
  #Add rspec debugger to gemfile
332
345
  File.open(File.join(working_dir,'Gemfile'),'a') do |file|
333
346
  file.puts "group :test do\n gem 'rspec'\n gem 'debugger'\nend"
@@ -341,7 +354,7 @@ EOF
341
354
  end
342
355
 
343
356
  def only_process_running(task_name)
344
- pid_path = File.join(Dir.pwd, 'pids', task_name)
357
+ pid_path = File.join(PID_DIR, 'pids', task_name)
345
358
 
346
359
  raise_if_already_running(pid_path)
347
360
  write_pid_file(pid_path)
@@ -11,3 +11,7 @@ end
11
11
  def dummy_response(response_name, options={})
12
12
  IO.read(File.join(File.dirname(__FILE__),"dummy_responses",response_name.to_s), options)
13
13
  end
14
+
15
+ Mail.defaults do
16
+ delivery_method :test # no, don't send emails when testing,
17
+ end
@@ -1,3 +1,3 @@
1
1
  module OpencBot
2
- VERSION = "0.0.27"
2
+ VERSION = "0.0.46"
3
3
  end
data/openc_bot.gemspec CHANGED
@@ -35,14 +35,17 @@ Gem::Specification.new do |gem|
35
35
  gem.add_dependency "rake"
36
36
  gem.add_dependency "activesupport", "4.1.4"
37
37
  gem.add_dependency "nokogiri"
38
- # gem.add_dependency "sqlite3"
38
+ gem.add_dependency "sqlite_magic", "0.0.6"
39
39
  gem.add_dependency "json"
40
40
  gem.add_dependency "json-schema"
41
41
  gem.add_dependency "httpclient"
42
42
  gem.add_dependency "backports"
43
43
  gem.add_dependency "scraperwiki", "3.0.2"
44
+ gem.add_dependency "mail"
45
+ # gem.add_dependency "openc-asana" unless RUBY_VERSION < '2.0'
44
46
 
45
- gem.add_development_dependency "perftools.rb"
46
- gem.add_development_dependency "debugger"
47
+ # gem.add_development_dependency "perftools.rb"
48
+ gem.add_development_dependency "byebug" unless RUBY_VERSION < '2.0'
49
+ gem.add_development_dependency "debugger" if RUBY_VERSION < '2.0'
47
50
  gem.add_development_dependency "rspec"
48
51
  end
@@ -3,6 +3,10 @@ require_relative '../spec_helper'
3
3
  require 'openc_bot'
4
4
  require 'openc_bot/company_fetcher_bot'
5
5
 
6
+ Mail.defaults do
7
+ delivery_method :test # no, don't send emails when testing
8
+ end
9
+
6
10
  module TestCompaniesFetcher
7
11
  extend OpencBot::CompanyFetcherBot
8
12
  end
@@ -121,4 +125,23 @@ describe "A module that extends CompanyFetcherBot" do
121
125
  end
122
126
  end
123
127
  end
128
+
129
+ describe '#update_data' do
130
+
131
+ before do
132
+ TestCompaniesFetcher.stub(:fetch_data_via_incremental_search)
133
+ TestCompaniesFetcher.stub(:update_stale)
134
+ #this can be any file that we can stat
135
+ TestCompaniesFetcher.stub(:db_location).
136
+ and_return(File.join(File.dirname(__FILE__),"company_fetcher_bot_spec.rb"))
137
+
138
+ Mail::TestMailer.deliveries.clear
139
+ TestCompaniesFetcher.update_data
140
+ end
141
+
142
+ it 'should send success email' do
143
+ Mail::TestMailer.deliveries.first.subject.should match /successfully ran/
144
+ end
145
+
146
+ end
124
147
  end
@@ -8,6 +8,8 @@ module ModuleThatIncludesRegisterMethods
8
8
  extend OpencBot::Helpers::RegisterMethods
9
9
  PRIMARY_KEY_NAME = :custom_uid
10
10
  SCHEMA_NAME = 'company-schema'
11
+ SLEEP_BEFORE_HTTP_REQ = 2
12
+ RAISE_WHEN_SAVING_INVALID_RECORD = true
11
13
  end
12
14
 
13
15
  module ModuleWithNoCustomPrimaryKey
@@ -29,7 +31,6 @@ describe 'a module that includes RegisterMethods' do
29
31
  describe "#datum_exists?" do
30
32
  before do
31
33
  ModuleThatIncludesRegisterMethods.stub(:select).and_return([])
32
-
33
34
  end
34
35
 
35
36
  it "should select_data from database" do
@@ -233,10 +234,12 @@ describe 'a module that includes RegisterMethods' do
233
234
 
234
235
  context 'and SQLite3::BusyException raised' do
235
236
  it 'should retry up to 3 times' do
237
+ pending "deciding whether to allow this in some circumstances"
236
238
  ModuleThatIncludesRegisterMethods.should_receive(:insert_or_update).exactly(4).times.and_raise(SQLite3::BusyException)
237
239
  lambda { ModuleThatIncludesRegisterMethods.prepare_and_save_data(@params) }.should raise_error(SQLite3::BusyException)
238
240
  end
239
241
  it 'should not raise error if successful before limit' do
242
+ pending "deciding whether to allow this in some circumstances"
240
243
  ModuleThatIncludesRegisterMethods.should_receive(:insert_or_update).exactly(3).times.ordered.and_raise(SQLite3::BusyException)
241
244
  ModuleThatIncludesRegisterMethods.should_receive(:insert_or_update).ordered
242
245
  lambda { ModuleThatIncludesRegisterMethods.prepare_and_save_data(@params) }.should_not raise_error
@@ -372,6 +375,15 @@ describe 'a module that includes RegisterMethods' do
372
375
  end
373
376
  end
374
377
 
378
+ context 'and errors returned validating data' do
379
+ it "should validate processed data" do
380
+ ModuleThatIncludesRegisterMethods.stub(:validate_datum).and_return([{:failed_attribute => 'foo', :message => 'Something not right'}])
381
+ lambda { ModuleThatIncludesRegisterMethods.update_datum(@uid)}.should raise_error
382
+ end
383
+
384
+
385
+ end
386
+
375
387
  context 'and process_datum returns nil' do
376
388
  before do
377
389
  ModuleThatIncludesRegisterMethods.stub(:process_datum).and_return(nil)
@@ -405,11 +417,12 @@ describe 'a module that includes RegisterMethods' do
405
417
  end
406
418
  end
407
419
 
408
- describe "#fetch_registry_page for company_number" do
420
+ describe "#fetch_registry_page for uid" do
409
421
  before do
410
422
  @dummy_client = double('http_client', :get_content => nil)
411
423
  ModuleThatIncludesRegisterMethods.stub(:_client).and_return(@dummy_client)
412
424
  ModuleThatIncludesRegisterMethods.stub(:registry_url).and_return('http://some.registry.url')
425
+ @dummy_client.stub(:get_content).and_return(:registry_page_html)
413
426
  end
414
427
 
415
428
  it "should GET registry_page for registry_url for company_number" do
@@ -423,6 +436,24 @@ describe 'a module that includes RegisterMethods' do
423
436
  @dummy_client.stub(:get_content).and_return(:registry_page_html)
424
437
  ModuleThatIncludesRegisterMethods.fetch_registry_page('76543').should == :registry_page_html
425
438
  end
439
+
440
+ context 'and SLEEP_BEFORE_HTTP_REQ is set' do
441
+ it 'should sleep for given period' do
442
+ ModuleThatIncludesRegisterMethods.should_receive(:sleep).with(2)
443
+ ModuleThatIncludesRegisterMethods.fetch_registry_page('76543')
444
+ end
445
+ end
446
+
447
+ context 'and SLEEP_BEFORE_HTTP_REQ is not set' do
448
+ before do
449
+ ModuleWithNoCustomPrimaryKey.stub(:_client).and_return(@dummy_client)
450
+ end
451
+
452
+ it 'should sleep for given period' do
453
+ ModuleWithNoCustomPrimaryKey.should_not_receive(:sleep)
454
+ ModuleWithNoCustomPrimaryKey.fetch_registry_page('76543')
455
+ end
456
+ end
426
457
  end
427
458
 
428
459
  describe "#validate_datum" do
@@ -594,4 +625,91 @@ describe 'a module that includes RegisterMethods' do
594
625
  end
595
626
  end
596
627
 
628
+ describe 'raise_when_saving_invalid_record' do
629
+ describe '#primary_key_name' do
630
+ it 'should return false if RAISE_WHEN_SAVING_INVALID_RECORD not set' do
631
+ ModuleWithNoCustomPrimaryKey.send(:raise_when_saving_invalid_record).should == false
632
+ end
633
+
634
+ it 'should return true if RAISE_WHEN_SAVING_INVALID_RECORD set' do
635
+ ModuleThatIncludesRegisterMethods.send(:raise_when_saving_invalid_record).should == true
636
+ end
637
+ end
638
+ end
639
+
640
+ describe '#raw_data_file_location for a uid' do
641
+ before do
642
+ @dummy_root_directory = File.join(File.dirname(__FILE__),'..','..','tmp')
643
+ Dir.mkdir(@dummy_root_directory) unless Dir.exist?(@dummy_root_directory)
644
+
645
+ ModuleThatIncludesRegisterMethods.stub(:root_directory).and_return(@dummy_root_directory)
646
+ end
647
+
648
+ after do
649
+ FileUtils.rmdir(File.join(@dummy_root_directory, 'data'))
650
+ end
651
+
652
+ it 'should return directory built from uid inside root data directory' do
653
+ ModuleThatIncludesRegisterMethods.raw_data_file_location('123456', 'html').should == File.join(@dummy_root_directory, 'data', '1','2','3','4','5', '123456.html')
654
+ end
655
+
656
+ it 'should create directory structure if it doesnt exist' do
657
+ ModuleThatIncludesRegisterMethods.raw_data_file_location('123456', 'html')
658
+ Dir.exist?(File.join(@dummy_root_directory, 'data', '1','2','3','4','5')).should == true
659
+ end
660
+
661
+ it 'should ignore leading zeroes when building directory' do
662
+ ModuleThatIncludesRegisterMethods.raw_data_file_location('001234', 'html').should == File.join(@dummy_root_directory, 'data', '1','2','3','4', '001234.html')
663
+ end
664
+
665
+ it 'should ignore non alphanum chars when building directory' do
666
+ ModuleThatIncludesRegisterMethods.raw_data_file_location('12a-b/3456', 'html').should == File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456.html')
667
+ end
668
+
669
+ it 'should allow format to be missing' do
670
+ ModuleThatIncludesRegisterMethods.raw_data_file_location('12a-b/3456').should == File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456')
671
+ end
672
+
673
+ it 'should allow format to be nil' do
674
+ ModuleThatIncludesRegisterMethods.raw_data_file_location('12a-b/3456', nil).should == File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456')
675
+ end
676
+ end
677
+
678
+ describe '#save_raw_data' do
679
+ before do
680
+ @dummy_root_directory = File.join(File.dirname(__FILE__),'..','..','tmp')
681
+ Dir.mkdir(@dummy_root_directory) unless Dir.exist?(@dummy_root_directory)
682
+
683
+ ModuleThatIncludesRegisterMethods.stub(:root_directory).and_return(@dummy_root_directory)
684
+ end
685
+
686
+ it 'should save raw data as in computed raw_data_file_location' do
687
+ ModuleThatIncludesRegisterMethods.save_raw_data('foo bar', '12a-b/3456', 'html')
688
+ File.read(File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456.html')).should == 'foo bar'
689
+ end
690
+
691
+ it 'should allow format to be missing' do
692
+ ModuleThatIncludesRegisterMethods.save_raw_data('foo bar', '12a-b/3456')
693
+ File.read(File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456')).should == 'foo bar'
694
+ end
695
+ end
696
+
697
+ describe '#get_raw_data' do
698
+ before do
699
+ @dummy_root_directory = File.join(File.dirname(__FILE__),'..','..','tmp')
700
+ Dir.mkdir(@dummy_root_directory) unless Dir.exist?(@dummy_root_directory)
701
+
702
+ ModuleThatIncludesRegisterMethods.stub(:root_directory).and_return(@dummy_root_directory)
703
+ end
704
+
705
+ it 'should read raw data in computed raw_data_file_location' do
706
+ File.open(File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456.html'),'w') { |f| f.print 'foo bar' }
707
+ ModuleThatIncludesRegisterMethods.get_raw_data('12a-b/3456', 'html').should == 'foo bar'
708
+ end
709
+
710
+ it 'should allow format to be missing' do
711
+ File.open(File.join(@dummy_root_directory, 'data', '1','2','a','b','3', '12ab3456'),'w') { |f| f.print 'foo bar' }
712
+ ModuleThatIncludesRegisterMethods.get_raw_data('12a-b/3456').should == 'foo bar'
713
+ end
714
+ end
597
715
  end