openc_bot 0.0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.travis.yml +8 -0
- data/CHANGELOG.md +2 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +22 -0
- data/README.md +253 -0
- data/Rakefile +14 -0
- data/bin/openc_bot +13 -0
- data/create_bot.sh +30 -0
- data/create_company_bot.sh +16 -0
- data/create_simple_licence_bot.sh +31 -0
- data/db/.gitkeep +0 -0
- data/examples/basic/.gitignore +3 -0
- data/examples/basic/Gemfile +7 -0
- data/examples/basic/config.yml +21 -0
- data/examples/basic/lib/basic.rb +88 -0
- data/examples/basic_with_proxy/Gemfile +7 -0
- data/examples/basic_with_proxy/config.yml +21 -0
- data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
- data/examples/bot_with_simple_iterator/Gemfile +6 -0
- data/examples/bot_with_simple_iterator/config.yml +21 -0
- data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
- data/examples/company_fetchers/basic.rb +49 -0
- data/lib/monkey_patches/mechanize.rb +53 -0
- data/lib/openc_bot.rb +89 -0
- data/lib/openc_bot/bot_data_validator.rb +18 -0
- data/lib/openc_bot/company_fetcher_bot.rb +40 -0
- data/lib/openc_bot/exceptions.rb +17 -0
- data/lib/openc_bot/helpers/_csv.rb +10 -0
- data/lib/openc_bot/helpers/alpha_search.rb +73 -0
- data/lib/openc_bot/helpers/dates.rb +33 -0
- data/lib/openc_bot/helpers/html.rb +8 -0
- data/lib/openc_bot/helpers/incremental_search.rb +106 -0
- data/lib/openc_bot/helpers/register_methods.rb +205 -0
- data/lib/openc_bot/helpers/text.rb +18 -0
- data/lib/openc_bot/incrementers.rb +2 -0
- data/lib/openc_bot/incrementers/base.rb +214 -0
- data/lib/openc_bot/incrementers/common.rb +47 -0
- data/lib/openc_bot/tasks.rb +385 -0
- data/lib/openc_bot/templates/README.md +35 -0
- data/lib/openc_bot/templates/bin/export_data +28 -0
- data/lib/openc_bot/templates/bin/fetch_data +23 -0
- data/lib/openc_bot/templates/bin/verify_data +1 -0
- data/lib/openc_bot/templates/config.yml +21 -0
- data/lib/openc_bot/templates/lib/bot.rb +43 -0
- data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
- data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
- data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
- data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
- data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
- data/lib/openc_bot/version.rb +3 -0
- data/lib/simple_openc_bot.rb +289 -0
- data/openc_bot.gemspec +35 -0
- data/schemas/company-schema.json +112 -0
- data/schemas/includes/address.json +23 -0
- data/schemas/includes/base-statement.json +27 -0
- data/schemas/includes/company.json +14 -0
- data/schemas/includes/filing.json +20 -0
- data/schemas/includes/license-data.json +27 -0
- data/schemas/includes/officer.json +14 -0
- data/schemas/includes/previous_name.json +11 -0
- data/schemas/includes/share-parcel-data.json +67 -0
- data/schemas/includes/share-parcel.json +60 -0
- data/schemas/includes/subsidiary-relationship-data.json +52 -0
- data/schemas/includes/total-shares.json +10 -0
- data/schemas/licence-schema.json +21 -0
- data/schemas/share-parcel-schema.json +21 -0
- data/schemas/subsidiary-relationship-schema.json +19 -0
- data/spec/dummy_classes/foo_bot.rb +4 -0
- data/spec/lib/bot_data_validator_spec.rb +69 -0
- data/spec/lib/company_fetcher_bot_spec.rb +93 -0
- data/spec/lib/exceptions_spec.rb +25 -0
- data/spec/lib/helpers/alpha_search_spec.rb +173 -0
- data/spec/lib/helpers/dates_spec.rb +65 -0
- data/spec/lib/helpers/incremental_search_spec.rb +471 -0
- data/spec/lib/helpers/register_methods_spec.rb +558 -0
- data/spec/lib/helpers/text_spec.rb +50 -0
- data/spec/lib/openc_bot/db/.gitkeep +0 -0
- data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
- data/spec/lib/openc_bot_spec.rb +116 -0
- data/spec/schemas/company-schema_spec.rb +676 -0
- data/spec/simple_openc_bot_spec.rb +302 -0
- data/spec/spec_helper.rb +19 -0
- metadata +300 -0
@@ -0,0 +1,385 @@
|
|
1
|
+
require 'simple_openc_bot'
|
2
|
+
require 'optparse'
|
3
|
+
require 'json'
|
4
|
+
require 'fileutils'
|
5
|
+
|
6
|
+
namespace :bot do
|
7
|
+
# Rake task that scaffolds a bot in the current directory by calling
# create_bot with its default template.
desc "create a skeleton bot that can be used in OpenCorporates"
task(:create) { create_bot }
|
11
|
+
|
12
|
+
# Rake task that scaffolds a bot by calling create_bot('company').
# The description is distinct from the plain :create task so the two can be
# told apart in `rake -T` output.
# NOTE(review): create_bot resolves its bot template as
# "lib/#{template_name}.rb" -- confirm that a lib/company.rb template actually
# ships with the gem, otherwise this task cannot find its template.
desc "create a skeleton company bot that can be used in OpenCorporates"
task :create_company_bot do
  create_bot('company')
end
|
16
|
+
|
17
|
+
# Rake task that scaffolds a "simple" bot in the current directory.
#
# Steps, in order:
#   1. create the standard directory layout (existing directories are kept);
#   2. copy each template, replacing the 'MyLicence' / 'my_module'
#      placeholders with names derived from the directory name;
#   3. make everything in bin/ executable;
#   4. append a :test group to the Gemfile.
# Files that already exist are never overwritten.
desc "create a skeleton simple_bot that can be used in OpenCorporates"
task :create_simple_bot do
  working_dir = Dir.pwd
  bot_name = get_bot_name
  new_module_name = bot_name.split('_').collect(&:capitalize).join

  %w(bin db data lib spec spec/dummy_responses tmp pids).each do |new_dir|
    FileUtils.mkdir_p(File.join(working_dir, new_dir))
  end

  templates = ['spec/spec_helper.rb','spec/simple_bot_spec.rb','lib/simple_bot.rb', 'README.md', 'config.yml', 'bin/export_data', 'bin/fetch_data', 'bin/verify_data']
  templates.each do |template_location|
    # File.read closes the handle; File.open(...).read left it open.
    template = File.read(File.join(File.dirname(__FILE__), 'templates', template_location))
    template.gsub!('MyLicence', new_module_name)
    template.gsub!('my_module', bot_name)
    new_file = File.join(working_dir, template_location.sub(/simple_bot/, bot_name))
    begin
      # EXCL makes the open fail rather than clobber an existing file.
      File.open(new_file, File::WRONLY|File::CREAT|File::EXCL) { |f| f.puts template }
      puts "Created #{new_file}"
    rescue Errno::EEXIST
      puts "Skipped creating #{new_file} as it already exists"
    end
  end

  # One pass after all templates are written is enough.
  FileUtils.chmod(0755, Dir.glob("#{working_dir}/bin/*"))

  # Add rspec debugger to gemfile
  File.open(File.join(working_dir,'Gemfile'),'a') do |file|
    file.puts "group :test do\n gem 'rspec'\n gem 'debugger'\nend"
    puts "Added rspec and debugger to Gemfile at #{file.path}"
  end
  puts "Please run 'bundle install'"
end
|
46
|
+
|
47
|
+
# Rake task that runs the bot found in lib/<directory name>.rb.
# Command-line switches are collected into an options hash which is handed to
# the bot's update_data; the value it returns is reported as a record count.
# The whole task runs inside only_process_running, keyed on the task name.
desc 'Get data from target'
task :run do |t, args|
  only_process_running(t.name) do
    options = { :specific_ids => [], :reset_iterator => false }

    parser = OptionParser.new(args) do |opts|
      opts.banner = "Usage: rake #{t.name} -- [options]"
      opts.on("-i", "--identifier UNIQUE_FIELD_VAL",
              "Identifier of specific record to fetch",
              " (may specify multiple times; refer to bot for its unique_fields)") do |identifier|
        options[:specific_ids] << identifier
      end
      opts.on("-t", "--test-mode",
              "Pass 'test' flag to bot") do
        options[:test_mode] = true
      end
      opts.on("-r", "--reset",
              "Don't resume incremental bots; reset and start from the beginning") do
        options[:reset_iterator] = true
      end
      opts.on("-m", "--max-iterations MAX_ITERATIONS",
              "Exit all iterators after MAX_ITERATIONS iterations. Useful for debugging.") do |limit|
        options[:max_iterations] = limit.to_i
      end
    end
    parser.parse!

    name = get_bot_name
    require_relative File.join(Dir.pwd, 'lib', name)
    fetched = callable_from_file_name(name).update_data(options)
    puts "Got #{fetched} records"
  end
end
|
80
|
+
|
81
|
+
# Rake task that loads the bot from lib/<directory name>.rb and calls its
# update_stale, guarded by only_process_running.
desc 'Update stale data from target'
task :update_stale do
  only_process_running('update_stale') do
    name = get_bot_name
    require_relative File.join(Dir.pwd, 'lib', name)
    callable_from_file_name(name).update_stale
  end
end
|
90
|
+
|
91
|
+
# Rake task taking a single :uid argument; loads the bot from
# lib/<directory name>.rb and calls update_datum for that uid, guarded by
# only_process_running.
desc 'Run bot, but just for record with given uid'
task :run_for_uid, :uid do |t, args|
  only_process_running('run_for_uid') do
    name = get_bot_name
    require_relative File.join(Dir.pwd, 'lib', name)
    bot = callable_from_file_name(name)
    # Passing true as the second argument should make the bot print the
    # updated JSON for this uid to STDOUT as well as update the local database.
    bot.update_datum(args[:uid], true)
  end
end
|
102
|
+
|
103
|
+
# Rake task that loads the bot from lib/<directory name>.rb and calls its
# export with an options hash built from the command-line switches.
# Runs inside only_process_running, keyed on the task name.
desc 'Export data to stdout'
task :export do |t, args|
  only_process_running(t.name) do
    options = { :specific_ids => [] }

    parser = OptionParser.new(args) do |opts|
      opts.banner = "Usage: rake #{t.name} -- [options]"
      opts.on("-i", "--identifier UNIQUE_FIELD_VAL",
              "Identifier of specific record to export",
              " (may specify multiple times; refer to bot for its unique_fields)") do |identifier|
        options[:specific_ids] << identifier
      end
      opts.on("-a", "--all",
              "Export everything (default is only to export data that has changed since last export)") do
        options[:all] = true
      end
    end
    parser.parse!

    name = get_bot_name
    require_relative File.join(Dir.pwd, 'lib', name)
    callable_from_file_name(name).export(options)
  end
end
|
126
|
+
|
127
|
+
# Rake task that loads the bot from lib/<directory name>.rb and calls its
# spotcheck, guarded by only_process_running.
desc 'Export 5 records to stdout for manual checking'
task :spotcheck do
  only_process_running('spotcheck') do
    name = get_bot_name
    require_relative File.join(Dir.pwd, 'lib', name)
    callable_from_file_name(name).spotcheck
  end
end
|
136
|
+
|
137
|
+
desc 'Summarise data for quality checking (only works for licences at the moment)'
|
138
|
+
task :summarise_data do
|
139
|
+
# Prints a frequency table for +data+ under the heading "<name> counts:".
#
# Each distinct value becomes one row; array values are shown joined with
# ", ". Rows are ordered by ascending count and the table ends with a blank
# line.
def as_sorted_hash(name, data)
  heading = "#{name} counts:"
  puts heading
  puts "-" * heading.length

  counts = {}
  data.group_by { |item| item }.each do |value, occurrences|
    counts[Array(value).join(", ")] = occurrences.count
  end

  counts.sort_by { |_label, total| total }.each do |label, total|
    printf("%-60s %10s\n", label, total)
  end
  puts
end
|
152
|
+
|
153
|
+
# Prints the five shortest and the five longest entries of +data+, each list
# under its own underlined heading.
#
# nil entries are ignored. Entries are ordered by their #length. When fewer
# than five entries exist, each list simply shows all of them.
def as_longest_and_shortest(name, data)
  sorted = data.compact.sort_by { |entry| entry.length }

  puts
  title = "shortest 5 #{name}"
  puts title
  puts "-" * title.length
  # first(5), not [0..5]: the inclusive range printed six entries.
  puts sorted.first(5)
  puts

  title = "longest 5 #{name}"
  puts title
  puts "-" * title.length
  # last(5) copes with short lists, where [-5..-1] evaluates to nil.
  puts sorted.last(5)
  puts
end
|
169
|
+
|
170
|
+
# Runs a full export (`bot:export -- -a`) in a subshell and prints a
# quality-check summary of its output.
#
# Every line of the export is parsed as one JSON document. The summary covers
# company jurisdictions, company names, start/end/sample dates, licence
# numbers and jurisdiction classifications; the last two are read from the
# "properties" of the first element of each record's "data" array.
def main
  result = `bundle exec openc_bot rake bot:export -- -a`
  jurisdictions = []
  names = []
  start_dates = []
  end_dates = []
  sample_dates = []
  licence_numbers = []
  jurisdiction_classifications = []

  result.split(/\r?\n/).each do |line|
    record = JSON.parse(line)
    properties = record["data"][0]["properties"]
    jurisdictions << record["company"]["jurisdiction"]
    names << record["company"]["name"]
    start_dates << record["start_date"]
    end_dates << record["end_date"]
    sample_dates << record["sample_date"]
    licence_numbers << properties["licence_number"]
    jurisdiction_classifications << properties["jurisdiction_classification"]
  end

  # This could be a histogram:
  as_sorted_hash("[company][jurisdiction]", jurisdictions)

  # This could be a list of the longest and shortest names:
  as_longest_and_shortest("[company][name]s", names)

  # Earliest and latest of each kind of date; missing dates are ignored.
  start_dates = start_dates.compact.sort
  end_dates = end_dates.compact.sort
  sample_dates = sample_dates.compact.sort
  puts
  puts "Dates"
  puts "-----"
  printf("%-22s %10s\n", "Earliest start_date:", start_dates.first)
  printf("%-22s %10s\n", "Earliest end_date:", end_dates.first)
  # Reports sample_dates here; this row previously repeated end_dates.first.
  printf("%-22s %10s\n", "Earliest sample_date:", sample_dates.first)
  printf("%-22s %10s\n", "Latest start_date:", start_dates.last)
  printf("%-22s %10s\n", "Latest end_date:", end_dates.last)
  printf("%-22s %10s\n", "Latest sample_date:", sample_dates.last)

  as_longest_and_shortest("licence numbers", licence_numbers)
  as_sorted_hash("jurisdiction_classifications", jurisdiction_classifications)
end
|
214
|
+
|
215
|
+
main()
|
216
|
+
|
217
|
+
end
|
218
|
+
|
219
|
+
# Rake task that checks a bot for old-style conventions and prints a numbered
# list of problems, or "No problems!" when none are found.
#
# Checks performed:
#   * export_data must take an argument (arity 0 is reported);
#   * for SimpleOpencBot instances only:
#       - fetch_all_records must exist;
#       - the bot's source file must contain an indented `yields` call;
#       - the body of fetch_all_records must contain an indented `yield`
#         before the next indented `def`.
desc 'Lint old-style bots'
task :lint do
  bot_name = get_bot_name
  require_relative File.join(Dir.pwd,'lib', bot_name)
  runner = callable_from_file_name(bot_name)
  messages = []
  if runner.method(:export_data).arity == 0
    messages << "export_data method must accept a hash as a single argument (e.g. `export_data(opts={})`)"
  end

  if runner.is_a? SimpleOpencBot
    if !runner.respond_to? "fetch_all_records"
      messages << "You must rename fetch_records -> fetch_all_records."
    end

    # File.read closes the handle; File.open(...).read left it open.
    full_source = File.read(File.join(Dir.pwd,'lib', bot_name) + ".rb")
    # Regexp literal on purpose: inside a double-quoted String "\s" is a plain
    # space, so the old String patterns never matched tab-indented code.
    # [ \t] rather than \s so the match cannot run across a line break.
    if full_source !~ /^[ \t]+yields/
      messages << <<EOF
You must call the `yields` class method with the class
of the Records you're returning. For example:

class FooLicenses < SimpleOpencBot
yields FooLicensesRecord

EOF
    end

    # fetch_all_records must yield rather than return
    if runner.respond_to? "fetch_all_records"
      source, line = runner.method(:fetch_all_records).source_location
      count = 0
      found = false
      File.foreach(source, "\n") do |l|
        count += 1
        # Skip everything up to and including the `def` line itself.
        next if count < line + 1

        if l =~ /^[ \t]+yield/
          found = true
          break
        end
        # The next method definition marks the end of the region searched.
        break if l =~ /^[ \t]+def/
      end
      messages << "fetch_all_records must `yield` single records (rather than returning an array)" if !found
    end
  end

  messages.each_with_index do |m, i|
    puts "#{i + 1}:"
    puts m
    puts "------------"
  end
  puts "No problems!" if messages.empty?
end
|
271
|
+
|
272
|
+
# Rake task that validates the bot's data.
# Bots responding to validate_data are asked for their own list of problems;
# otherwise every datum returned by export_data is checked with
# OpencBot::BotDataValidator. Any failure raises OpencBot::InvalidDataError.
task :test do
  name = get_bot_name
  require_relative File.join(Dir.pwd, 'lib', name)
  bot = callable_from_file_name(name)

  if bot.respond_to?(:validate_data)
    problems = bot.validate_data
    raise OpencBot::InvalidDataError.new(problems) unless problems.empty?
  else
    bot.export_data.each do |datum|
      next if OpencBot::BotDataValidator.validate(datum)
      raise OpencBot::InvalidDataError.new("This datum is invalid: #{datum.inspect}")
    end
  end

  puts "Congratulations! This data appears to be valid"
end
|
290
|
+
|
291
|
+
# Looks up the top-level constant named after an underscored file name,
# e.g. "foo_bar" resolves the constant FooBar.
# Raises NameError when no such constant is defined.
def klass_from_file_name(underscore_file_name)
  constant_name = underscore_file_name.split('_').map(&:capitalize).join
  Object.const_get(constant_name)
end
|
295
|
+
|
296
|
+
# There are currently two flavours of bot: simple bots, which are used as
# instances, and plain bots, which are modules exposing class-level methods.
#
# Returns a fresh instance when the resolved constant responds to +new+,
# otherwise the constant itself.
def callable_from_file_name(underscore_file_name)
  bot_klass = klass_from_file_name(underscore_file_name)
  bot_klass.respond_to?(:new) ? bot_klass.new : bot_klass
end
|
307
|
+
|
308
|
+
# Scaffolds a bot in the current directory.
#
# template_name - base name of the bot template; the template read is
#                 templates/lib/<template_name>.rb next to this file
#                 (default: 'bot').
#
# Creates the standard directory layout, copies the templates with the
# 'MyModule' / 'my_module' placeholders replaced by names derived from the
# directory name, and appends a :test group to the Gemfile. Files that already
# exist are left untouched.
def create_bot(template_name='bot')
  working_dir = Dir.pwd
  bot_name = get_bot_name
  new_module_name = bot_name.split('_').collect(&:capitalize).join

  %w(bin db data lib spec spec/dummy_responses tmp pids).each do |new_dir|
    FileUtils.mkdir_p(File.join(working_dir, new_dir))
  end

  bot_template = "lib/#{template_name}.rb"
  templates = ['spec/spec_helper.rb','spec/bot_spec.rb', 'README.md', 'config.yml', bot_template]
  templates.each do |template_location|
    # File.read closes the handle; File.open(...).read left it open.
    template = File.read(File.join(File.dirname(__FILE__), 'templates', template_location))
    template.gsub!('MyModule', new_module_name)
    template.gsub!('my_module', bot_name)
    new_file = File.join(working_dir, template_location.sub(/template/,'').sub(/bot/, bot_name))
    # File.exist? -- the File.exists? alias was removed in Ruby 3.2.
    next if File.exist?(new_file)

    File.open(new_file, File::WRONLY|File::CREAT|File::EXCL) { |f| f.puts template }
    puts "Created #{new_file}"
  end

  # Add rspec debugger to gemfile
  File.open(File.join(working_dir,'Gemfile'),'a') do |file|
    file.puts "group :test do\n gem 'rspec'\n gem 'debugger'\nend"
    # file.path, not the File object, so the message shows the actual path.
    puts "Added rspec and debugger to Gemfile at #{file.path}"
  end
  puts "Please run 'bundle install'"
end
|
338
|
+
|
339
|
+
# Returns the bot's name, which is the last '/'-separated segment of the
# current working directory.
def get_bot_name
  path_segments = Dir.pwd.split('/')
  path_segments.last
end
|
342
|
+
|
343
|
+
# Runs the given block while holding a pid file at pids/<task_name> under the
# current directory.
#
# Raises (via raise_if_already_running) before doing anything else when the
# pid file points at a live process. The pid file is removed once the block
# finishes, whether it returns or raises. Returns the block's value.
def only_process_running(task_name)
  lock_file = File.join(Dir.pwd, 'pids', task_name)

  raise_if_already_running(lock_file)
  write_pid_file(lock_file)

  # The ensure deliberately covers only the block: a failed "already running"
  # check must not delete the pid file.
  begin
    yield
  ensure
    remove_pid_file(lock_file)
  end
end
|
355
|
+
|
356
|
+
# Raises RuntimeError ('Already running') when the pid file at +pid_path+
# names a process that currently exists; otherwise returns nil.
#
# A missing pid file, a pid file with no usable pid, or a pid that no longer
# exists all count as "not running".
def raise_if_already_running(pid_path)
  begin
    # File.read closes the handle; File.open(...).read left it open.
    pid = File.read(pid_path).to_i
  rescue Errno::ENOENT
    # PID file doesn't exist
    return
  end

  # An empty or corrupt pid file parses to 0 (or less). Process.getpgid(0)
  # would query the *current* process and so always report "running".
  return unless pid > 0

  begin
    Process.getpgid(pid)
  rescue Errno::ESRCH
    # Process with PID doesn't exist
    # TODO Log this
    return
  end

  # Process with PID does exist
  # TODO Log this
  raise 'Already running'
end
|
376
|
+
|
377
|
+
# Writes the current process id to +pid_path+, replacing any previous content.
def write_pid_file(pid_path)
  File.open(pid_path, 'w') do |handle|
    handle.write(Process.pid)
  end
end
|
380
|
+
|
381
|
+
# Deletes the pid file at +pid_path+. Raises Errno::ENOENT if it is missing.
def remove_pid_file(pid_path)
  File.unlink(pid_path)
end
|
384
|
+
|
385
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# MyModule Bot
|
2
|
+
|
3
|
+
## About the data publisher
|
4
|
+
|
5
|
+
Describe the source. Specifically:
|
6
|
+
|
7
|
+
* Who is behind it?
|
8
|
+
* What gives it regulatory power (thus justifying it being in
|
9
|
+
OpenCorporates)?
|
10
|
+
* How often is it updated?
|
11
|
+
|
12
|
+
## About the data
|
13
|
+
|
14
|
+
* Give a sample URL showing a typical page in the source, or
|
15
|
+
instructions on how to find one.
|
16
|
+
* Provide references, if possible, to where the meanings of the fields
|
17
|
+
are defined.
|
18
|
+
* Found any interesting bits of data while debugging? Mention them here!
|
19
|
+
|
20
|
+
### Data exported
|
21
|
+
|
22
|
+
* List the fields that this bot currently exports to OpenCorporates,
|
23
|
+
along with their meaning within this jurisdiction.
|
24
|
+
|
25
|
+
### Data not currently exported, but stored
|
26
|
+
|
27
|
+
* List the fields that might be of interest to OpenCorporates in the
|
28
|
+
future. For example, the source might contain address data, but the
|
29
|
+
scraper might currently only focus on licenses.
|
30
|
+
|
31
|
+
## Assumptions
|
32
|
+
|
33
|
+
* What assumptions have you made in preparing the data for export?
|
34
|
+
E.g. you might be inferring that a field refers to a company (rather
|
35
|
+
than a person) based on a regular expression.
|
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Runs `bot:export` from the bot's root directory (the parent of this
# script's directory) and, when it produces output, saves that output to
# data/<YYYY-MM-DD>/export-<n>.json, where <n> is one more than the number of
# export files already in that day's directory.
#
# Exit status: 0 when the export command succeeds (whether or not there was
# anything to write); 1 when it fails, after echoing its stderr.
require 'open3'
require 'fileutils'

command = "bundle exec openc_bot rake bot:export"
options = { chdir: File.join(File.dirname(__FILE__), "..") }
# capture3 drains stdout and stderr while the child runs and closes the pipes
# afterwards. Waiting for the child to exit before reading (as popen3 +
# wait_thread.value did) deadlocks once the export fills the pipe buffer.
data, errors, result = Open3.capture3(command, options)

if result.success?
  if data.strip.empty?
    puts "No new data to export"
  else
    dir = "data/#{Time.now.strftime('%Y-%m-%d')}"
    FileUtils.mkdir_p(dir)
    export_number = Dir.glob("#{dir}/export-*.json").count + 1
    dest = "#{dir}/export-#{export_number}.json"
    File.open(dest, "w") do |f|
      f.write(data)
    end
    puts "Written data to #{dest}"
  end
  exit 0
else
  STDERR.puts errors
  exit 1
end
|