openc_bot 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.travis.yml +8 -0
  4. data/CHANGELOG.md +2 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +253 -0
  8. data/Rakefile +14 -0
  9. data/bin/openc_bot +13 -0
  10. data/create_bot.sh +30 -0
  11. data/create_company_bot.sh +16 -0
  12. data/create_simple_licence_bot.sh +31 -0
  13. data/db/.gitkeep +0 -0
  14. data/examples/basic/.gitignore +3 -0
  15. data/examples/basic/Gemfile +7 -0
  16. data/examples/basic/config.yml +21 -0
  17. data/examples/basic/lib/basic.rb +88 -0
  18. data/examples/basic_with_proxy/Gemfile +7 -0
  19. data/examples/basic_with_proxy/config.yml +21 -0
  20. data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
  21. data/examples/bot_with_simple_iterator/Gemfile +6 -0
  22. data/examples/bot_with_simple_iterator/config.yml +21 -0
  23. data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
  24. data/examples/company_fetchers/basic.rb +49 -0
  25. data/lib/monkey_patches/mechanize.rb +53 -0
  26. data/lib/openc_bot.rb +89 -0
  27. data/lib/openc_bot/bot_data_validator.rb +18 -0
  28. data/lib/openc_bot/company_fetcher_bot.rb +40 -0
  29. data/lib/openc_bot/exceptions.rb +17 -0
  30. data/lib/openc_bot/helpers/_csv.rb +10 -0
  31. data/lib/openc_bot/helpers/alpha_search.rb +73 -0
  32. data/lib/openc_bot/helpers/dates.rb +33 -0
  33. data/lib/openc_bot/helpers/html.rb +8 -0
  34. data/lib/openc_bot/helpers/incremental_search.rb +106 -0
  35. data/lib/openc_bot/helpers/register_methods.rb +205 -0
  36. data/lib/openc_bot/helpers/text.rb +18 -0
  37. data/lib/openc_bot/incrementers.rb +2 -0
  38. data/lib/openc_bot/incrementers/base.rb +214 -0
  39. data/lib/openc_bot/incrementers/common.rb +47 -0
  40. data/lib/openc_bot/tasks.rb +385 -0
  41. data/lib/openc_bot/templates/README.md +35 -0
  42. data/lib/openc_bot/templates/bin/export_data +28 -0
  43. data/lib/openc_bot/templates/bin/fetch_data +23 -0
  44. data/lib/openc_bot/templates/bin/verify_data +1 -0
  45. data/lib/openc_bot/templates/config.yml +21 -0
  46. data/lib/openc_bot/templates/lib/bot.rb +43 -0
  47. data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
  48. data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
  49. data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
  50. data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
  51. data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
  52. data/lib/openc_bot/version.rb +3 -0
  53. data/lib/simple_openc_bot.rb +289 -0
  54. data/openc_bot.gemspec +35 -0
  55. data/schemas/company-schema.json +112 -0
  56. data/schemas/includes/address.json +23 -0
  57. data/schemas/includes/base-statement.json +27 -0
  58. data/schemas/includes/company.json +14 -0
  59. data/schemas/includes/filing.json +20 -0
  60. data/schemas/includes/license-data.json +27 -0
  61. data/schemas/includes/officer.json +14 -0
  62. data/schemas/includes/previous_name.json +11 -0
  63. data/schemas/includes/share-parcel-data.json +67 -0
  64. data/schemas/includes/share-parcel.json +60 -0
  65. data/schemas/includes/subsidiary-relationship-data.json +52 -0
  66. data/schemas/includes/total-shares.json +10 -0
  67. data/schemas/licence-schema.json +21 -0
  68. data/schemas/share-parcel-schema.json +21 -0
  69. data/schemas/subsidiary-relationship-schema.json +19 -0
  70. data/spec/dummy_classes/foo_bot.rb +4 -0
  71. data/spec/lib/bot_data_validator_spec.rb +69 -0
  72. data/spec/lib/company_fetcher_bot_spec.rb +93 -0
  73. data/spec/lib/exceptions_spec.rb +25 -0
  74. data/spec/lib/helpers/alpha_search_spec.rb +173 -0
  75. data/spec/lib/helpers/dates_spec.rb +65 -0
  76. data/spec/lib/helpers/incremental_search_spec.rb +471 -0
  77. data/spec/lib/helpers/register_methods_spec.rb +558 -0
  78. data/spec/lib/helpers/text_spec.rb +50 -0
  79. data/spec/lib/openc_bot/db/.gitkeep +0 -0
  80. data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
  81. data/spec/lib/openc_bot_spec.rb +116 -0
  82. data/spec/schemas/company-schema_spec.rb +676 -0
  83. data/spec/simple_openc_bot_spec.rb +302 -0
  84. data/spec/spec_helper.rb +19 -0
  85. metadata +300 -0
@@ -0,0 +1,385 @@
1
+ require 'simple_openc_bot'
2
+ require 'optparse'
3
+ require 'json'
4
+ require 'fileutils'
5
+
6
+ namespace :bot do
7
# bot:create -- scaffold a standard bot in the current working directory by
# calling create_bot with its default template.
desc "create a skeleton bot that can be used in OpenCorporates"
task(:create) { create_bot }
11
+
12
# bot:create_company_bot -- scaffold a bot in the current working directory
# via create_bot('company').
#
# The description is worded differently from bot:create so that the two tasks
# can be told apart in `rake -T` output.
#
# NOTE(review): create_bot turns this argument into the template path
# "lib/company.rb" -- confirm that a template with that name ships with the gem.
desc "create a skeleton company bot that can be used in OpenCorporates"
task :create_company_bot do
  create_bot('company')
end
16
+
17
# bot:create_simple_bot -- scaffold a simple bot in the current working
# directory.
#
# Steps:
#   1. Derive the bot name from the directory name (get_bot_name) and a
#      CamelCase module name from it.
#   2. Create the standard project directories if they are missing.
#   3. Copy each template from the `templates` directory next to this file,
#      substituting the placeholder names. Existing files are never
#      overwritten; they are reported as skipped.
#   4. Make everything under bin/ executable.
#   5. Append a :test group (rspec, debugger) to the Gemfile.
desc "create a skeleton simple_bot that can be used in OpenCorporates"
task :create_simple_bot do
  working_dir = Dir.pwd
  bot_name = get_bot_name
  new_module_name = bot_name.split('_').collect(&:capitalize).join

  %w(bin db data lib spec spec/dummy_responses tmp pids).each do |new_dir|
    new_dir_path = File.join(working_dir, new_dir)
    Dir.mkdir(new_dir_path) unless Dir.exist?(new_dir_path)
  end

  templates = ['spec/spec_helper.rb','spec/simple_bot_spec.rb','lib/simple_bot.rb', 'README.md', 'config.yml', 'bin/export_data', 'bin/fetch_data', 'bin/verify_data']
  templates.each do |template_location|
    # File.read closes the handle; File.open(...).read left it to the GC.
    template = File.read(File.join(File.dirname(__FILE__), 'templates', template_location))
    template.gsub!('MyLicence', new_module_name)
    # The shared README template uses the 'MyModule' placeholder, so it has to
    # be substituted here as well or it ends up verbatim in the generated file.
    template.gsub!('MyModule', new_module_name)
    template.gsub!('my_module', bot_name)

    new_file = File.join(working_dir, template_location.sub(/template/, '').sub(/simple_bot/, bot_name))
    begin
      # File::EXCL makes the open fail rather than clobber an existing file.
      File.open(new_file, File::WRONLY|File::CREAT|File::EXCL) { |f| f.puts template }
      puts "Created #{new_file}"
    rescue Errno::EEXIST
      puts "Skipped creating #{new_file} as it already exists"
    end
  end

  # One pass over bin/ after all templates are written is enough.
  FileUtils.chmod(0755, Dir.glob("#{working_dir}/bin/*"))

  #Add rspec debugger to gemfile
  File.open(File.join(working_dir,'Gemfile'),'a') do |file|
    file.puts "group :test do\n gem 'rspec'\n gem 'debugger'\nend"
    puts "Added rspec and debugger to Gemfile at #{file.path}"
  end
  puts "Please run 'bundle install'"
end
46
+
47
# bot:run -- fetch data with the bot that lives in the current directory.
#
# Command-line switches (parsed from ARGV by OptionParser#parse!):
#   -i / --identifier VAL   collect VAL into options[:specific_ids] (repeatable)
#   -t / --test-mode        set options[:test_mode]
#   -r / --reset            set options[:reset_iterator]
#   -m / --max-iterations N set options[:max_iterations] to N.to_i
#
# The whole run is wrapped in only_process_running, keyed on the task name.
# The options hash is handed to the bot's update_data and the returned count
# is printed.
desc 'Get data from target'
task :run do |t, args|
  only_process_running(t.name) do
    options = { specific_ids: [], reset_iterator: false }

    parser = OptionParser.new(args) do |opts|
      opts.banner = "Usage: rake #{t.name} -- [options]"
      opts.on("-i", "--identifier UNIQUE_FIELD_VAL",
              "Identifier of specific record to fetch",
              " (may specify multiple times; refer to bot for its unique_fields)") { |id| options[:specific_ids] << id }
      opts.on("-t", "--test-mode",
              "Pass 'test' flag to bot") { options[:test_mode] = true }
      opts.on("-r", "--reset",
              "Don't resume incremental bots; reset and start from the beginning") { options[:reset_iterator] = true }
      opts.on("-m", "--max-iterations MAX_ITERATIONS",
              "Exit all iterators after MAX_ITERATIONS iterations. Useful for debugging.") { |limit| options[:max_iterations] = limit.to_i }
    end
    parser.parse!

    bot_name = get_bot_name
    require_relative File.join(Dir.pwd,'lib', bot_name)
    count = callable_from_file_name(bot_name).update_data(options)
    puts "Got #{count} records"
  end
end
80
+
81
# bot:update_stale -- load the bot for the current directory and call its
# update_stale method, guarded by the 'update_stale' pid file.
desc 'Update stale data from target'
task :update_stale do
  only_process_running('update_stale') do
    name = get_bot_name
    require_relative File.join(Dir.pwd,'lib', name)
    callable_from_file_name(name).update_stale
  end
end
90
+
91
# bot:run_for_uid[uid] -- load the bot for the current directory and call its
# update_datum for the single record identified by the :uid task argument,
# guarded by the 'run_for_uid' pid file.
desc 'Run bot, but just for record with given uid'
task :run_for_uid, :uid do |t, args|
  only_process_running('run_for_uid') do
    name = get_bot_name
    require_relative File.join(Dir.pwd,'lib', name)
    bot = callable_from_file_name(name)
    # Passing true as the second argument should make the bot print the
    # updated JSON for this uid to STDOUT as well as updating the local
    # database.
    bot.update_datum(args[:uid], true)
  end
end
102
+
103
# bot:export -- ask the bot in the current directory to export its data.
#
# Command-line switches (parsed from ARGV by OptionParser#parse!):
#   -i / --identifier VAL   collect VAL into options[:specific_ids] (repeatable)
#   -a / --all              set options[:all]
#
# The run is wrapped in only_process_running, keyed on the task name, and the
# options hash is passed to the bot's export method.
desc 'Export data to stdout'
task :export do |t, args|
  only_process_running(t.name) do
    options = { specific_ids: [] }

    parser = OptionParser.new(args) do |opts|
      opts.banner = "Usage: rake #{t.name} -- [options]"
      opts.on("-i", "--identifier UNIQUE_FIELD_VAL",
              "Identifier of specific record to export",
              " (may specify multiple times; refer to bot for its unique_fields)") { |id| options[:specific_ids] << id }
      opts.on("-a", "--all",
              "Export everything (default is only to export data that has changed since last export)") { options[:all] = true }
    end
    parser.parse!

    bot_name = get_bot_name
    require_relative File.join(Dir.pwd,'lib', bot_name)
    callable_from_file_name(bot_name).export(options)
  end
end
126
+
127
+ desc 'Export 5 records to stdout for manual checking'
128
+ task :spotcheck do
129
+ only_process_running('spotcheck') do
130
+ bot_name = get_bot_name
131
+ require_relative File.join(Dir.pwd,'lib', bot_name)
132
+ runner = callable_from_file_name(bot_name)
133
+ runner.spotcheck
134
+ end
135
+ end
136
+
137
+ desc 'Summarise data for quality checking (only works for licences at the moment)'
138
+ task :summarise_data do
139
# Print a frequency table for the values in +data+ to standard output.
#
# name - label used in the "<name> counts:" heading.
# data - Array of values; array-valued entries are shown joined with ", ".
#
# Rows are ordered by ascending count. Returns nil.
def as_sorted_hash(name, data)
  heading = "#{name} counts:"
  puts heading
  puts "-" * heading.length

  # Label => number of occurrences, keyed by the printable form of the value.
  counts = {}
  data.group_by { |value| value }.each do |value, occurrences|
    counts[Array(value).join(", ")] = occurrences.count
  end

  counts.sort_by { |_label, count| count }.each do |label, count|
    printf("%-60s %10s\n", label, count)
  end
  puts
end
152
+
153
# Print the five shortest and the five longest entries of +data+ to standard
# output, each list under its own underlined heading.
#
# name - label used in the "shortest 5 <name>" / "longest 5 <name>" headings.
# data - Array of objects responding to #length; nil entries are ignored.
#
# When fewer than five entries are available, all of them are listed.
# Returns nil.
def as_longest_and_shortest(name, data)
  sorted = data.compact.sort_by(&:length)

  puts
  title = "shortest 5 #{name}"
  puts title
  puts "-" * title.length
  # first(5) yields exactly five entries; the range 0..5 yielded six.
  puts sorted.first(5)
  puts

  title = "longest 5 #{name}"
  puts title
  puts "-" * title.length
  # last(5) still works for short lists; sorted[-5..-1] is nil below five entries.
  puts sorted.last(5)
  puts
end
169
+
170
# Run a full export (`bot:export -- -a`) in a subshell, parse its output as
# one JSON document per line, and print summary tables for quality checking:
# jurisdiction counts, shortest/longest company names, earliest/latest
# start, end and sample dates, shortest/longest licence numbers, and
# jurisdiction_classification counts.
#
# Each record is expected to carry "company", "start_date", "end_date",
# "sample_date" and a "data" array whose first element has "properties".
# Raises whatever JSON.parse raises if a line is not valid JSON.
def main
  # For debugging against a saved export instead of a live run:
  #result = open("foo", "r").read
  result = `bundle exec openc_bot rake bot:export -- -a`
  jurisdictions = []
  names = []
  start_dates = []
  end_dates = []
  sample_dates = []
  licence_numbers = []
  jurisdiction_classifications = []
  result.split(/\r?\n/).each do |raw_line|
    record = JSON.parse(raw_line)
    jurisdictions << record["company"]["jurisdiction"]
    names << record["company"]["name"]
    start_dates << record["start_date"]
    end_dates << record["end_date"]
    sample_dates << record["sample_date"]
    licence_numbers << record["data"][0]["properties"]["licence_number"]
    jurisdiction_classifications << record["data"][0]["properties"]["jurisdiction_classification"]
  end

  # This could be a histogram:
  as_sorted_hash("[company][jurisdiction]", jurisdictions)

  # This could be a list of the longest and shortest names:
  as_longest_and_shortest("[company][name]s", names)

  # earliest start date and latest start date and sample dates
  start_dates = start_dates.compact.sort
  end_dates = end_dates.compact.sort
  sample_dates = sample_dates.compact.sort
  puts
  puts "Dates"
  puts "-----"
  printf("%-22s %10s\n", "Earliest start_date:", start_dates.first)
  printf("%-22s %10s\n", "Earliest end_date:", end_dates.first)
  # Reports the sample dates here; this row used to repeat the earliest end_date.
  printf("%-22s %10s\n", "Earliest sample_date:", sample_dates.first)
  printf("%-22s %10s\n", "Latest start_date:", start_dates.last)
  printf("%-22s %10s\n", "Latest end_date:", end_dates.last)
  printf("%-22s %10s\n", "Latest sample_date:", sample_dates.last)

  as_longest_and_shortest("licence numbers", licence_numbers)
  as_sorted_hash("jurisdiction_classifications", jurisdiction_classifications)
end
214
+
215
+ main()
216
+
217
+ end
218
+
219
# bot:lint -- check the bot in the current directory for old-style
# constructs and print a numbered list of problems (or "No problems!").
#
# Checks performed:
#   * export_data must take an argument (arity must not be 0).
#   * For SimpleOpencBot instances:
#       - the bot must respond to fetch_all_records;
#       - lib/<bot_name>.rb must contain an indented `yields` call;
#       - the body of fetch_all_records must contain an indented `yield`
#         before the next indented `def`.
#
# The source scans use regexp literals. A double-quoted "^\s+..." string
# turns \s into a plain space before the regexp is built, so tab-indented
# code was never matched.
desc 'Lint old-style bots'
task :lint do
  bot_name = get_bot_name
  require_relative File.join(Dir.pwd,'lib', bot_name)
  runner = callable_from_file_name(bot_name)
  messages = []
  if runner.method(:export_data).arity == 0
    messages << "export_data method must accept a hash as a single argument (e.g. `export_data(opts={})`"
  end

  if runner.is_a? SimpleOpencBot
    unless runner.respond_to? "fetch_all_records"
      messages << "You must rename fetch_records -> fetch_all_records."
    end

    # File.read closes the handle once the content has been read.
    full_source = File.read(File.join(Dir.pwd,'lib', bot_name) + ".rb")
    unless full_source =~ /^[ \t]+yields/
      messages << <<EOF
You must call the `yields` class method with the class
of the Records you're returning. For example:

class FooLicenses < SimpleOpencBot
yields FooLicensesRecord

EOF
    end

    # fetch_all_methods must yield rather than return
    if runner.respond_to? "fetch_all_records"
      source, line = runner.method(:fetch_all_records).source_location
      count = 0
      found = false
      File.foreach(source, "\n") do |l|
        count += 1
        # Skip everything up to and including the `def fetch_all_records` line.
        next if count < line + 1

        if l =~ /^[ \t]+yield/
          found = true
          break
        end
        # Reaching the next method definition means the body had no yield.
        break if l =~ /^[ \t]+def/
      end
      messages << "fetch_all_records must `yield` single records (rather than returning an array)" unless found
    end
  end

  messages.each_with_index do |m, i|
    puts "#{i + 1}:"
    puts m
    puts "------------"
  end
  puts "No problems!" if messages.empty?
end
271
+
272
# bot:test -- validate the data produced by the bot in the current directory.
#
# If the bot responds to validate_data, its result must be empty; otherwise
# every datum returned by export_data must pass
# OpencBot::BotDataValidator.validate. Raises OpencBot::InvalidDataError on
# the first failure and prints a congratulation message when nothing failed.
task :test do
  bot_name = get_bot_name
  require_relative File.join(Dir.pwd,'lib', bot_name)
  runner = callable_from_file_name(bot_name)

  if runner.respond_to?(:validate_data)
    problems = runner.validate_data
    raise OpencBot::InvalidDataError.new(problems) unless problems.empty?
  else
    runner.export_data.each do |datum|
      next if OpencBot::BotDataValidator.validate(datum)
      raise OpencBot::InvalidDataError.new("This datum is invalid: #{datum.inspect}")
    end
  end

  puts "Congratulations! This data appears to be valid"
end
290
+
291
# Resolve an underscored file name to the top-level constant with the
# matching CamelCase name (e.g. "foo_bot" -> FooBot).
#
# Raises NameError if no such constant is defined.
def klass_from_file_name(underscore_file_name)
  constant_name = underscore_file_name.split('_').map(&:capitalize).join
  Object.const_get(constant_name)
end
295
+
296
# Return the object that rake tasks should call bot methods on.
#
# There are currently two kinds of bot: simple bots, which are classes and
# must be instantiated, and plain bots, which are modules exposing class
# methods. Anything that responds to :new is instantiated; everything else is
# returned as-is.
def callable_from_file_name(underscore_file_name)
  bot_klass = klass_from_file_name(underscore_file_name)
  bot_klass.respond_to?(:new) ? bot_klass.new : bot_klass
end
307
+
308
# Scaffold a bot project in the current working directory.
#
# template_name - base name of the bot template under templates/lib
#                 (default 'bot', i.e. templates/lib/bot.rb).
#
# Creates the standard project directories, copies the spec, README, config
# and bot templates (substituting the 'MyModule' and 'my_module' placeholders
# with names derived from the directory name), and appends a :test group to
# the Gemfile. Files that already exist are left untouched.
def create_bot(template_name='bot')
  working_dir = Dir.pwd
  bot_name = get_bot_name
  new_module_name = bot_name.split('_').collect(&:capitalize).join

  %w(bin db data lib spec spec/dummy_responses tmp pids).each do |new_dir|
    FileUtils.mkdir_p(File.join(working_dir, new_dir))
  end

  bot_template = "lib/#{template_name}.rb"
  templates = ['spec/spec_helper.rb','spec/bot_spec.rb', 'README.md', 'config.yml', bot_template]
  templates.each do |template_location|
    # File.read closes the handle; File.open(...).read left it to the GC.
    template = File.read(File.join(File.dirname(__FILE__), 'templates', template_location))
    template.gsub!('MyModule', new_module_name)
    template.gsub!('my_module', bot_name)

    # The first "bot" in the template path becomes the bot's own name,
    # e.g. lib/bot.rb -> lib/<bot_name>.rb.
    new_file = File.join(working_dir, template_location.sub(/template/, '').sub(/bot/, bot_name))
    # File.exist? is the supported spelling; File.exists? was removed in Ruby 3.2.
    next if File.exist?(new_file)

    File.open(new_file, File::WRONLY|File::CREAT|File::EXCL) { |f| f.puts template }
    puts "Created #{new_file}"
  end

  #Add rspec debugger to gemfile
  File.open(File.join(working_dir,'Gemfile'),'a') do |file|
    file.puts "group :test do\n gem 'rspec'\n gem 'debugger'\nend"
    # Interpolate the path; the File object itself prints as #<File:...>.
    puts "Added rspec and debugger to Gemfile at #{file.path}"
  end
  puts "Please run 'bundle install'"
end
338
+
339
# Name of the bot, taken from the last path component of the current working
# directory.
def get_bot_name
  # The former `bot_name ||=` assigned to a fresh local on every call and so
  # memoised nothing; the expression is returned directly instead.
  Dir.pwd.split('/').last
end
342
+
343
# Run the given block while holding a pid file for +task_name+ under
# ./pids, so that a second invocation of the same task can be refused.
#
# Raises (via raise_if_already_running) before running the block when the pid
# file points at a live process. Returns the block's value.
def only_process_running(task_name)
  lock_path = File.join(Dir.pwd, 'pids', task_name)

  # These two calls sit outside the protected region: if the check raises,
  # the pid file belongs to another process and is not removed here.
  raise_if_already_running(lock_path)
  write_pid_file(lock_path)

  begin
    yield
  ensure
    remove_pid_file(lock_path)
  end
end
355
+
356
# Raise if the pid file at +pid_path+ refers to a process that is still alive.
#
# Returns nil when the pid file is missing, holds no usable pid, or names a
# process that no longer exists. Raises RuntimeError ('Already running')
# otherwise.
def raise_if_already_running(pid_path)
  begin
    # File.read closes the handle once the content has been read.
    pid = File.read(pid_path).to_i
  rescue Errno::ENOENT
    # PID file doesn't exist
    return
  end

  # An empty or garbled pid file parses to 0, and Process.getpgid(0) queries
  # the *current* process, which would always look "already running".
  return unless pid > 0

  begin
    Process.getpgid(pid)
  rescue Errno::ESRCH
    # Process with PID doesn't exist
    # TODO Log this
    return
  end

  # Process with PID does exist
  # TODO Log this
  raise 'Already running'
end
376
+
377
# Record the current process id in the file at +pid_path+, replacing any
# previous content.
def write_pid_file(pid_path)
  File.write(pid_path, Process.pid)
end
380
+
381
# Delete the pid file at +pid_path+. Raises Errno::ENOENT if it is not there.
def remove_pid_file(pid_path)
  File.unlink(pid_path)
end
384
+
385
+ end
@@ -0,0 +1,35 @@
1
+ # MyModule Bot
2
+
3
+ ## About the data publisher
4
+
5
+ Describe the source. Specifically:
6
+
7
+ * Who is behind it?
8
+ * What gives it regulatory power (thus justifying it being in
9
+ OpenCorporates)?
10
+ * How often is it updated?
11
+
12
+ ## About the data
13
+
14
+ * Give a sample URL showing a typical page in the source, or
15
+ instructions on how to find one.
16
+ * Provide references, if possible, to where the meanings of the fields
17
+ are defined.
18
+ * Found any interesting bits of data while debugging? Mention them here!
19
+
20
+ ### Data exported
21
+
22
+ * List the fields that this bot currently exports to OpenCorporates,
23
+ along with their meaning within this jurisdiction.
24
+
25
+ ### Data not currently exported, but stored
26
+
27
+ * List the fields that might be of interest to OpenCorporates in the
28
+ future. For example, the source might contain address data, but the
29
+ scraper might currently only focus on licenses.
30
+
31
+ ## Assumptions
32
+
33
+ * What assumptions have you made in preparing the data for export?
34
+ E.g. you might be inferring that a field refers to a company (rather
35
+ than a person) based on a regular expression.
@@ -0,0 +1,28 @@
1
#!/usr/bin/env ruby
require 'open3'
require 'fileutils'

# Runs `bot:export` from the bot's root directory (the parent of this
# script's directory) and, when it produces output, saves that output to
# data/<YYYY-MM-DD>/export-<n>.json, where <n> is one more than the number of
# export files already present for that day.
#
# Exits 0 on success (whether or not there was anything to export). On
# failure the command's stderr is echoed to STDERR and the script exits 1.

command = "bundle exec openc_bot rake bot:export"
options = { chdir: File.join(File.dirname(__FILE__), "..") }

# capture3 drains stdout and stderr while the child runs. Waiting for the
# child to exit before reading its pipes (as with popen3 + wait_thread.value)
# hangs as soon as the export output exceeds the pipe buffer.
data, errors, status = Open3.capture3(command, options)

if status.success?
  if !data.strip.empty?
    dir = "data/#{Time.now.strftime('%Y-%m-%d')}"
    FileUtils.mkdir_p(dir)
    export_number = Dir.glob("#{dir}/export-*.json").count + 1
    dest = "#{dir}/export-#{export_number}.json"
    File.open(dest, "w") do |f|
      f.write(data)
    end
    puts "Written data to #{dest}"
  else
    puts "No new data to export"
  end
  exit 0
else
  STDERR.puts errors
  exit 1
end