openc_bot 0.0.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (85) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.travis.yml +8 -0
  4. data/CHANGELOG.md +2 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +253 -0
  8. data/Rakefile +14 -0
  9. data/bin/openc_bot +13 -0
  10. data/create_bot.sh +30 -0
  11. data/create_company_bot.sh +16 -0
  12. data/create_simple_licence_bot.sh +31 -0
  13. data/db/.gitkeep +0 -0
  14. data/examples/basic/.gitignore +3 -0
  15. data/examples/basic/Gemfile +7 -0
  16. data/examples/basic/config.yml +21 -0
  17. data/examples/basic/lib/basic.rb +88 -0
  18. data/examples/basic_with_proxy/Gemfile +7 -0
  19. data/examples/basic_with_proxy/config.yml +21 -0
  20. data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
  21. data/examples/bot_with_simple_iterator/Gemfile +6 -0
  22. data/examples/bot_with_simple_iterator/config.yml +21 -0
  23. data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
  24. data/examples/company_fetchers/basic.rb +49 -0
  25. data/lib/monkey_patches/mechanize.rb +53 -0
  26. data/lib/openc_bot.rb +89 -0
  27. data/lib/openc_bot/bot_data_validator.rb +18 -0
  28. data/lib/openc_bot/company_fetcher_bot.rb +40 -0
  29. data/lib/openc_bot/exceptions.rb +17 -0
  30. data/lib/openc_bot/helpers/_csv.rb +10 -0
  31. data/lib/openc_bot/helpers/alpha_search.rb +73 -0
  32. data/lib/openc_bot/helpers/dates.rb +33 -0
  33. data/lib/openc_bot/helpers/html.rb +8 -0
  34. data/lib/openc_bot/helpers/incremental_search.rb +106 -0
  35. data/lib/openc_bot/helpers/register_methods.rb +205 -0
  36. data/lib/openc_bot/helpers/text.rb +18 -0
  37. data/lib/openc_bot/incrementers.rb +2 -0
  38. data/lib/openc_bot/incrementers/base.rb +214 -0
  39. data/lib/openc_bot/incrementers/common.rb +47 -0
  40. data/lib/openc_bot/tasks.rb +385 -0
  41. data/lib/openc_bot/templates/README.md +35 -0
  42. data/lib/openc_bot/templates/bin/export_data +28 -0
  43. data/lib/openc_bot/templates/bin/fetch_data +23 -0
  44. data/lib/openc_bot/templates/bin/verify_data +1 -0
  45. data/lib/openc_bot/templates/config.yml +21 -0
  46. data/lib/openc_bot/templates/lib/bot.rb +43 -0
  47. data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
  48. data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
  49. data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
  50. data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
  51. data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
  52. data/lib/openc_bot/version.rb +3 -0
  53. data/lib/simple_openc_bot.rb +289 -0
  54. data/openc_bot.gemspec +35 -0
  55. data/schemas/company-schema.json +112 -0
  56. data/schemas/includes/address.json +23 -0
  57. data/schemas/includes/base-statement.json +27 -0
  58. data/schemas/includes/company.json +14 -0
  59. data/schemas/includes/filing.json +20 -0
  60. data/schemas/includes/license-data.json +27 -0
  61. data/schemas/includes/officer.json +14 -0
  62. data/schemas/includes/previous_name.json +11 -0
  63. data/schemas/includes/share-parcel-data.json +67 -0
  64. data/schemas/includes/share-parcel.json +60 -0
  65. data/schemas/includes/subsidiary-relationship-data.json +52 -0
  66. data/schemas/includes/total-shares.json +10 -0
  67. data/schemas/licence-schema.json +21 -0
  68. data/schemas/share-parcel-schema.json +21 -0
  69. data/schemas/subsidiary-relationship-schema.json +19 -0
  70. data/spec/dummy_classes/foo_bot.rb +4 -0
  71. data/spec/lib/bot_data_validator_spec.rb +69 -0
  72. data/spec/lib/company_fetcher_bot_spec.rb +93 -0
  73. data/spec/lib/exceptions_spec.rb +25 -0
  74. data/spec/lib/helpers/alpha_search_spec.rb +173 -0
  75. data/spec/lib/helpers/dates_spec.rb +65 -0
  76. data/spec/lib/helpers/incremental_search_spec.rb +471 -0
  77. data/spec/lib/helpers/register_methods_spec.rb +558 -0
  78. data/spec/lib/helpers/text_spec.rb +50 -0
  79. data/spec/lib/openc_bot/db/.gitkeep +0 -0
  80. data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
  81. data/spec/lib/openc_bot_spec.rb +116 -0
  82. data/spec/schemas/company-schema_spec.rb +676 -0
  83. data/spec/simple_openc_bot_spec.rb +302 -0
  84. data/spec/spec_helper.rb +19 -0
  85. metadata +300 -0
@@ -0,0 +1,385 @@
1
+ require 'simple_openc_bot'
2
+ require 'optparse'
3
+ require 'json'
4
+ require 'fileutils'
5
+
6
+ namespace :bot do
7
# Scaffolds a bot in the current directory by calling create_bot with
# its default template.
desc "create a skeleton bot that can be used in OpenCorporates"
task(:create) { create_bot }
11
+
12
# Scaffolds a bot in the current directory by calling create_bot with the
# 'company' template name. The description is deliberately distinct from
# the plain :create task so the two can be told apart in task listings.
desc "create a skeleton company fetcher bot that can be used in OpenCorporates"
task :create_company_bot do
  create_bot('company')
end
16
+
17
# Scaffolds a "simple bot" in the current working directory:
#   * creates the standard directory layout,
#   * copies each template, substituting the bot's module name for
#     'MyLicence' and the directory name for 'my_module',
#   * makes everything in bin/ executable,
#   * appends a :test group to the Gemfile.
# Files that already exist are never overwritten.
desc "create a skeleton simple_bot that can be used in OpenCorporates"
task :create_simple_bot do
  working_dir = Dir.pwd
  bot_name = get_bot_name
  new_module_name = bot_name.split('_').map(&:capitalize).join

  # mkdir_p is a no-op for directories that are already there.
  %w(bin db data lib spec spec/dummy_responses tmp pids).each do |new_dir|
    FileUtils.mkdir_p(File.join(working_dir, new_dir))
  end

  templates_root = File.join(File.dirname(__FILE__), 'templates')
  templates = ['spec/spec_helper.rb','spec/simple_bot_spec.rb','lib/simple_bot.rb', 'README.md', 'config.yml', 'bin/export_data', 'bin/fetch_data', 'bin/verify_data']
  templates.each do |template_location|
    # File.read closes the handle, unlike File.open(...).read.
    template = File.read(File.join(templates_root, template_location))
    template = template.gsub('MyLicence', new_module_name).gsub('my_module', bot_name)
    new_file = File.join(working_dir, template_location.sub(/template/, '').sub(/simple_bot/, bot_name))
    begin
      # File::EXCL makes the open fail rather than clobber an existing file.
      File.open(new_file, File::WRONLY|File::CREAT|File::EXCL) { |f| f.puts template }
      puts "Created #{new_file}"
    rescue Errno::EEXIST
      puts "Skipped creating #{new_file} as it already exists"
    end
  end

  # Done once, after every bin/ script has been written.
  FileUtils.chmod(0755, Dir.glob("#{working_dir}/bin/*"))

  # Add rspec and debugger to the Gemfile
  File.open(File.join(working_dir, 'Gemfile'), 'a') do |file|
    file.puts "group :test do\n gem 'rspec'\n gem 'debugger'\nend"
    puts "Added rspec and debugger to Gemfile at #{file.path}"
  end
  puts "Please run 'bundle install'"
end
46
+
47
# Runs the bot found in lib/<bot_name>.rb and prints how many records
# update_data reported. Command-line switches are read from ARGV
# (OptionParser#parse! with no argument) and passed to update_data as a
# hash with these keys:
#   :specific_ids   - values given via -i/--identifier (repeatable)
#   :test_mode      - true when -t/--test-mode is given
#   :reset_iterator - true when -r/--reset is given (default false)
#   :max_iterations - integer from -m/--max-iterations
# The task is wrapped in only_process_running, keyed on the task name.
desc 'Get data from target'
task :run do |t|
  only_process_running(t.name) do
    options = { :specific_ids => [], :reset_iterator => false }

    # OptionParser.new's first argument is the banner text, so the rake
    # task arguments are not passed to it; the banner is set explicitly.
    parser = OptionParser.new do |opts|
      opts.banner = "Usage: rake #{t.name} -- [options]"
      opts.on("-i", "--identifier UNIQUE_FIELD_VAL",
              "Identifier of specific record to fetch",
              " (may specify multiple times; refer to bot for its unique_fields)") do |val|
        options[:specific_ids] << val
      end
      opts.on("-t", "--test-mode",
              "Pass 'test' flag to bot") do
        options[:test_mode] = true
      end
      opts.on("-r", "--reset",
              "Don't resume incremental bots; reset and start from the beginning") do
        options[:reset_iterator] = true
      end
      opts.on("-m", "--max-iterations MAX_ITERATIONS",
              "Exit all iterators after MAX_ITERATIONS iterations. Useful for debugging.") do |val|
        options[:max_iterations] = val.to_i
      end
    end
    parser.parse!

    bot_name = get_bot_name
    require_relative File.join(Dir.pwd, 'lib', bot_name)
    runner = callable_from_file_name(bot_name)
    count = runner.update_data(options)
    puts "Got #{count} records"
  end
end
80
+
81
# Loads lib/<bot_name>.rb and calls update_stale on the bot, wrapped in
# only_process_running under the name 'update_stale'.
desc 'Update stale data from target'
task :update_stale do
  only_process_running('update_stale') do
    bot_file = get_bot_name
    require_relative File.join(Dir.pwd, 'lib', bot_file)
    callable_from_file_name(bot_file).update_stale
  end
end
90
+
91
# Loads lib/<bot_name>.rb and calls update_datum for the :uid task
# argument, wrapped in only_process_running under the name 'run_for_uid'.
desc 'Run bot, but just for record with given uid'
task :run_for_uid, :uid do |_task, args|
  only_process_running('run_for_uid') do
    bot_file = get_bot_name
    require_relative File.join(Dir.pwd, 'lib', bot_file)
    bot = callable_from_file_name(bot_file)
    # When passed true as second argument, this should output the updated
    # json data for the given uid to STDOUT, as well as updating the
    # local database.
    bot.update_datum(args[:uid], true)
  end
end
102
+
103
# Loads lib/<bot_name>.rb and calls export on the bot. Command-line
# switches are read from ARGV (OptionParser#parse! with no argument) and
# passed to export as a hash with these keys:
#   :specific_ids - values given via -i/--identifier (repeatable)
#   :all          - true when -a/--all is given
# The task is wrapped in only_process_running, keyed on the task name.
desc 'Export data to stdout'
task :export do |t|
  only_process_running(t.name) do
    options = { :specific_ids => [] }

    # OptionParser.new's first argument is the banner text, so the rake
    # task arguments are not passed to it; the banner is set explicitly.
    parser = OptionParser.new do |opts|
      opts.banner = "Usage: rake #{t.name} -- [options]"
      opts.on("-i", "--identifier UNIQUE_FIELD_VAL",
              "Identifier of specific record to export",
              " (may specify multiple times; refer to bot for its unique_fields)") do |val|
        options[:specific_ids] << val
      end
      opts.on("-a", "--all",
              "Export everything (default is only to export data that has changed since last export)") do
        options[:all] = true
      end
    end
    parser.parse!

    bot_name = get_bot_name
    require_relative File.join(Dir.pwd, 'lib', bot_name)
    runner = callable_from_file_name(bot_name)
    runner.export(options)
  end
end
126
+
127
# Loads lib/<bot_name>.rb and calls spotcheck on the bot, wrapped in
# only_process_running under the name 'spotcheck'.
desc 'Export 5 records to stdout for manual checking'
task :spotcheck do
  only_process_running('spotcheck') do
    bot_file = get_bot_name
    require_relative File.join(Dir.pwd, 'lib', bot_file)
    callable_from_file_name(bot_file).spotcheck
  end
end
136
+
137
# Prints a frequency table for +data+ under the heading "<name> counts:",
# least frequent value first. Array values are shown joined with ", ";
# nil is shown as an empty label.
def as_sorted_hash(name, data)
  title = "#{name} counts:"
  puts title
  puts "-" * title.length
  # Count directly into a hash rather than splatting a flattened array
  # into Hash[], which passes one argument per element.
  counts = Hash.new(0)
  data.each { |value| counts[Array(value).join(", ")] += 1 }
  counts.sort_by { |_label, count| count }.each do |label, count|
    printf("%-60s %10s\n", label, count)
  end
  puts
end

# Prints the five shortest and the five longest non-nil entries of +data+
# (by length) under "shortest 5 <name>" / "longest 5 <name>" headings.
# With fewer than five entries, all of them are listed in both sections.
def as_longest_and_shortest(name, data)
  sorted = data.compact.sort_by { |entry| entry.length }
  puts
  title = "shortest 5 #{name}"
  puts title
  puts "-" * title.length
  puts sorted.first(5)
  puts
  title = "longest 5 #{name}"
  puts title
  puts "-" * title.length
  puts sorted.last(5)
  puts
end

# Prints the quality summary for +export_output+, which is expected to
# hold one JSON document per line. Each document is read for
# ["company"]["jurisdiction"], ["company"]["name"], "start_date",
# "end_date", "sample_date" and, from ["data"][0]["properties"],
# "licence_number" and "jurisdiction_classification".
# Blank lines are skipped; a malformed line raises JSON::ParserError.
def summarise_licence_export(export_output)
  records = export_output.split(/\r?\n/).
    reject { |line| line.strip.empty? }.
    map { |line| JSON.parse(line) }

  jurisdictions = records.map { |record| record["company"]["jurisdiction"] }
  names = records.map { |record| record["company"]["name"] }
  start_dates = records.map { |record| record["start_date"] }.compact.sort
  end_dates = records.map { |record| record["end_date"] }.compact.sort
  sample_dates = records.map { |record| record["sample_date"] }.compact.sort
  properties = records.map { |record| record["data"][0]["properties"] }
  licence_numbers = properties.map { |props| props["licence_number"] }
  jurisdiction_classifications = properties.map { |props| props["jurisdiction_classification"] }

  # This could be a histogram:
  as_sorted_hash("[company][jurisdiction]", jurisdictions)

  # This could be a list of the longest and shortest names:
  as_longest_and_shortest("[company][name]s", names)

  # earliest and latest start, end and sample dates
  puts
  puts "Dates"
  puts "-----"
  printf("%-22s %10s\n", "Earliest start_date:", start_dates.first)
  printf("%-22s %10s\n", "Earliest end_date:", end_dates.first)
  printf("%-22s %10s\n", "Earliest sample_date:", sample_dates.first)
  printf("%-22s %10s\n", "Latest start_date:", start_dates.last)
  printf("%-22s %10s\n", "Latest end_date:", end_dates.last)
  printf("%-22s %10s\n", "Latest sample_date:", sample_dates.last)

  as_longest_and_shortest("licence numbers", licence_numbers)
  as_sorted_hash("jurisdiction_classifications", jurisdiction_classifications)
end

# Runs a full export in a subprocess and prints a summary of its output.
desc 'Summarise data for quality checking (only works for licences at the moment)'
task :summarise_data do
  summarise_licence_export(`bundle exec openc_bot rake bot:export -- -a`)
end
218
+
219
# Checks the bot in lib/<bot_name>.rb for leftovers of the old bot API and
# prints a numbered list of problems, or "No problems!". Checks made:
#   * export_data must take an argument (arity other than 0);
#   * for SimpleOpencBot instances: fetch_all_records must exist, the
#     source file must contain an indented `yields` call, and the lines
#     after fetch_all_records' definition must contain an indented `yield`
#     before the next indented `def`.
# The source checks are line-based text matches, not a parse of the code.
desc 'Lint old-style bots'
task :lint do
  bot_name = get_bot_name
  require_relative File.join(Dir.pwd, 'lib', bot_name)
  runner = callable_from_file_name(bot_name)
  messages = []
  if runner.method(:export_data).arity == 0
    messages << "export_data method must accept a hash as a single argument (e.g. `export_data(opts={})`)"
  end

  if runner.is_a? SimpleOpencBot
    unless runner.respond_to? "fetch_all_records"
      messages << "You must rename fetch_records -> fetch_all_records."
    end

    full_source = File.read(File.join(Dir.pwd, 'lib', bot_name) + ".rb")
    # Regexp literals are used on purpose: inside a double-quoted string
    # "\s" is a literal space, so "^\s+yields" would miss tab indentation.
    if full_source !~ /^\s+yields/
      messages << <<-EOF
You must call the `yields` class method with the class
of the Records you're returning. For example:

    class FooLicenses < SimpleOpencBot
      yields FooLicensesRecord

      EOF
    end

    # fetch_all_records must yield rather than return
    if runner.respond_to? "fetch_all_records"
      source, line = runner.method(:fetch_all_records).source_location
      found = false
      File.foreach(source).with_index(1) do |l, number|
        # Skip everything up to and including the `def` line itself.
        next if number <= line

        if l =~ /^\s+yield/
          found = true
          break
        end
        # Reaching the next method definition ends the search.
        break if l =~ /^\s+def/
      end
      messages << "fetch_all_records must `yield` single records (rather than returning an array)" unless found
    end
  end

  messages.each_with_index do |m, i|
    puts "#{i + 1}:"
    puts m
    puts "------------"
  end
  puts "No problems!" if messages.empty?
end
271
+
272
# Validates the bot's data and raises OpencBot::InvalidDataError on the
# first problem. Bots that respond to validate_data are asked directly;
# otherwise every datum from export_data is checked with
# OpencBot::BotDataValidator.
task :test do
  bot_file = get_bot_name
  require_relative File.join(Dir.pwd, 'lib', bot_file)
  bot = callable_from_file_name(bot_file)

  if bot.respond_to?(:validate_data)
    failures = bot.validate_data
    raise OpencBot::InvalidDataError.new(failures) unless failures.empty?
  else
    bot.export_data.each do |datum|
      next if OpencBot::BotDataValidator.validate(datum)
      raise OpencBot::InvalidDataError.new("This datum is invalid: #{datum.inspect}")
    end
  end

  puts "Congratulations! This data appears to be valid"
end
290
+
291
# Looks up the top-level constant named after +underscore_file_name+,
# e.g. 'foo_bot' -> FooBot. Raises NameError when no such constant exists.
def klass_from_file_name(underscore_file_name)
  constant_name = underscore_file_name.split('_').map(&:capitalize).join
  Object.const_get(constant_name)
end
295
+
296
# At the moment, we have simple bots and bots; the former expect to
# be instances, the latter modules with class methods.
#
# Returns a new instance when the constant found for +underscore_file_name+
# responds to :new, otherwise the constant itself.
def callable_from_file_name(underscore_file_name)
  bot = klass_from_file_name(underscore_file_name)
  bot.respond_to?(:new) ? bot.new : bot
end
307
+
308
# Scaffolds a bot in the current working directory: creates the standard
# directory layout, copies the templates (substituting the bot's module
# name for 'MyModule' and the directory name for 'my_module') and appends
# a :test group to the Gemfile. Existing files are never overwritten.
#
# template_name - which bot template under templates/lib to use.
#                 'bot' (the default) uses lib/bot.rb. 'company' is treated
#                 as shorthand for lib/company_fetcher_bot.rb, because the
#                 gem ships no lib/company.rb template.
def create_bot(template_name='bot')
  working_dir = Dir.pwd
  bot_name = get_bot_name
  new_module_name = bot_name.split('_').map(&:capitalize).join

  %w(bin db data lib spec spec/dummy_responses tmp pids).each do |new_dir|
    FileUtils.mkdir_p(File.join(working_dir, new_dir))
  end

  template_file = template_name == 'company' ? 'company_fetcher_bot' : template_name
  bot_template = "lib/#{template_file}.rb"
  templates_root = File.join(File.dirname(__FILE__), 'templates')
  templates = ['spec/spec_helper.rb','spec/bot_spec.rb', 'README.md', 'config.yml', bot_template]
  templates.each do |template_location|
    # File.read closes the handle, unlike File.open(...).read.
    template = File.read(File.join(templates_root, template_location))
    template = template.gsub('MyModule', new_module_name).gsub('my_module', bot_name)
    # 'bot' (or the whole 'company_fetcher_bot') in the template's file
    # name is replaced by the bot's own name, e.g. lib/bot.rb and
    # lib/company_fetcher_bot.rb both become lib/<bot_name>.rb.
    relative_path = template_location.sub(/template/, '').sub(/(company_fetcher_)?bot/, bot_name)
    new_file = File.join(working_dir, relative_path)
    # File.exist? rather than File.exists?, which Ruby 3.2 removed.
    unless File.exist?(new_file)
      File.open(new_file, File::WRONLY|File::CREAT|File::EXCL) { |f| f.puts template }
      puts "Created #{new_file}"
    end
  end

  # Add rspec and debugger to the Gemfile
  File.open(File.join(working_dir, 'Gemfile'), 'a') do |file|
    file.puts "group :test do\n gem 'rspec'\n gem 'debugger'\nend"
    puts "Added rspec and debugger to Gemfile at #{file.path}"
  end
  puts "Please run 'bundle install'"
end
338
+
339
# Returns the bot's name, which is the name of the current working
# directory (its last path component).
def get_bot_name
  File.basename(Dir.pwd)
end
342
+
343
# Runs the given block guarded by the PID file pids/<task_name> under the
# current directory: raise_if_already_running is consulted first, then the
# PID file is written, and it is removed again once the block finishes,
# whether it returned or raised.
def only_process_running(task_name)
  lock_file = File.join(Dir.pwd, 'pids', task_name)

  # These two run outside the begin/ensure on purpose: when the guard
  # raises, the PID file must be left untouched.
  raise_if_already_running(lock_file)
  write_pid_file(lock_file)

  begin
    yield
  ensure
    remove_pid_file(lock_file)
  end
end
355
+
356
# Raises RuntimeError ('Already running') when +pid_path+ names a PID file
# whose process still exists. Returns nil when the file is missing, holds
# no usable PID, or refers to a process that no longer exists.
def raise_if_already_running(pid_path)
  begin
    pid = File.read(pid_path).to_i
  rescue Errno::ENOENT
    # PID file doesn't exist
    return
  end

  # An empty or corrupt file yields 0 from to_i, and Process.getpgid(0)
  # answers for the *current* process, which would wrongly report the
  # task as already running. Treat such a file as stale instead.
  return unless pid > 0

  begin
    Process.getpgid(pid)
  rescue Errno::ESRCH
    # Process with PID doesn't exist
    # TODO Log this
    return
  end

  # Process with PID does exist
  # TODO Log this
  raise 'Already running'
end
376
+
377
# Writes the current process id to +pid_path+, replacing any previous
# content. The containing directory is created first so that bots without
# a pids/ directory do not fail with Errno::ENOENT.
def write_pid_file(pid_path)
  FileUtils.mkdir_p(File.dirname(pid_path))
  File.open(pid_path, 'w') { |file| file.write(Process.pid) }
end
380
+
381
# Deletes the PID file at +pid_path+. A file that is already gone is not
# an error: this runs from an ensure clause, where raising Errno::ENOENT
# would replace the exception that actually ended the task.
def remove_pid_file(pid_path)
  File.delete(pid_path)
rescue Errno::ENOENT
  nil
end
384
+
385
+ end
@@ -0,0 +1,35 @@
1
+ # MyModule Bot
2
+
3
+ ## About the data publisher
4
+
5
+ Describe the source. Specifically:
6
+
7
+ * Who is behind it?
8
+ * What gives it regulatory power (thus justifying it being in
9
+ OpenCorporates)?
10
+ * How often is it updated?
11
+
12
+ ## About the data
13
+
14
+ * Give a sample URL showing a typical page in the source, or
15
+ instructions on how to find one.
16
+ * Provide references, if possible, to where the meanings of the fields
17
+ are defined.
18
+ * Found any interesting bits of data while debugging? Mention them here!
19
+
20
+ ### Data exported
21
+
22
+ * List the fields that this bot currently exports to OpenCorporates,
23
+ along with their meaning within this jurisdiction.
24
+
25
+ ### Data not currently exported, but stored
26
+
27
+ * List the fields that might be of interest to OpenCorporates in the
28
+ future. For example, the source might contain address data, but the
29
+ scraper might currently only focus on licenses.
30
+
31
+ ## Assumptions
32
+
33
+ * What assumptions have you made in preparing the data for export?
34
+ E.g. you might be inferring that a field refers to a company (rather
35
+ than a person) based on a regular expression.
@@ -0,0 +1,28 @@
1
#!/usr/bin/env ruby
# Runs the bot's export task from the bot's root directory and, when it
# produced any output, saves it as data/<YYYY-MM-DD>/export-<n>.json
# (relative to the directory this script is invoked from), where <n> is
# one more than the number of exports already saved for that day.
# Exits 0 on success; on failure prints the task's stderr and exits 1.
require 'open3'
require 'fileutils'

command = "bundle exec openc_bot rake bot:export"
options = { chdir: File.join(File.dirname(__FILE__), "..") }
# capture3 reads stdout and stderr while the command runs and closes the
# pipes afterwards. Waiting for the exit status before reading (as with a
# bare popen3) stalls both processes once the output fills a pipe buffer.
data, errors, result = Open3.capture3(command, options)

if result.success?
  if data.strip.empty?
    puts "No new data to export"
  else
    dir = "data/#{Time.now.strftime('%Y-%m-%d')}"
    FileUtils.mkdir_p(dir)
    export_number = Dir.glob("#{dir}/export-*.json").count + 1
    dest = "#{dir}/export-#{export_number}.json"
    File.open(dest, "w") do |f|
      f.write(data)
    end
    puts "Written data to #{dest}"
  end
  exit 0
else
  STDERR.puts errors
  exit 1
end