openc_bot 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.travis.yml +8 -0
- data/CHANGELOG.md +2 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +22 -0
- data/README.md +253 -0
- data/Rakefile +14 -0
- data/bin/openc_bot +13 -0
- data/create_bot.sh +30 -0
- data/create_company_bot.sh +16 -0
- data/create_simple_licence_bot.sh +31 -0
- data/db/.gitkeep +0 -0
- data/examples/basic/.gitignore +3 -0
- data/examples/basic/Gemfile +7 -0
- data/examples/basic/config.yml +21 -0
- data/examples/basic/lib/basic.rb +88 -0
- data/examples/basic_with_proxy/Gemfile +7 -0
- data/examples/basic_with_proxy/config.yml +21 -0
- data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
- data/examples/bot_with_simple_iterator/Gemfile +6 -0
- data/examples/bot_with_simple_iterator/config.yml +21 -0
- data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
- data/examples/company_fetchers/basic.rb +49 -0
- data/lib/monkey_patches/mechanize.rb +53 -0
- data/lib/openc_bot.rb +89 -0
- data/lib/openc_bot/bot_data_validator.rb +18 -0
- data/lib/openc_bot/company_fetcher_bot.rb +40 -0
- data/lib/openc_bot/exceptions.rb +17 -0
- data/lib/openc_bot/helpers/_csv.rb +10 -0
- data/lib/openc_bot/helpers/alpha_search.rb +73 -0
- data/lib/openc_bot/helpers/dates.rb +33 -0
- data/lib/openc_bot/helpers/html.rb +8 -0
- data/lib/openc_bot/helpers/incremental_search.rb +106 -0
- data/lib/openc_bot/helpers/register_methods.rb +205 -0
- data/lib/openc_bot/helpers/text.rb +18 -0
- data/lib/openc_bot/incrementers.rb +2 -0
- data/lib/openc_bot/incrementers/base.rb +214 -0
- data/lib/openc_bot/incrementers/common.rb +47 -0
- data/lib/openc_bot/tasks.rb +385 -0
- data/lib/openc_bot/templates/README.md +35 -0
- data/lib/openc_bot/templates/bin/export_data +28 -0
- data/lib/openc_bot/templates/bin/fetch_data +23 -0
- data/lib/openc_bot/templates/bin/verify_data +1 -0
- data/lib/openc_bot/templates/config.yml +21 -0
- data/lib/openc_bot/templates/lib/bot.rb +43 -0
- data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
- data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
- data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
- data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
- data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
- data/lib/openc_bot/version.rb +3 -0
- data/lib/simple_openc_bot.rb +289 -0
- data/openc_bot.gemspec +35 -0
- data/schemas/company-schema.json +112 -0
- data/schemas/includes/address.json +23 -0
- data/schemas/includes/base-statement.json +27 -0
- data/schemas/includes/company.json +14 -0
- data/schemas/includes/filing.json +20 -0
- data/schemas/includes/license-data.json +27 -0
- data/schemas/includes/officer.json +14 -0
- data/schemas/includes/previous_name.json +11 -0
- data/schemas/includes/share-parcel-data.json +67 -0
- data/schemas/includes/share-parcel.json +60 -0
- data/schemas/includes/subsidiary-relationship-data.json +52 -0
- data/schemas/includes/total-shares.json +10 -0
- data/schemas/licence-schema.json +21 -0
- data/schemas/share-parcel-schema.json +21 -0
- data/schemas/subsidiary-relationship-schema.json +19 -0
- data/spec/dummy_classes/foo_bot.rb +4 -0
- data/spec/lib/bot_data_validator_spec.rb +69 -0
- data/spec/lib/company_fetcher_bot_spec.rb +93 -0
- data/spec/lib/exceptions_spec.rb +25 -0
- data/spec/lib/helpers/alpha_search_spec.rb +173 -0
- data/spec/lib/helpers/dates_spec.rb +65 -0
- data/spec/lib/helpers/incremental_search_spec.rb +471 -0
- data/spec/lib/helpers/register_methods_spec.rb +558 -0
- data/spec/lib/helpers/text_spec.rb +50 -0
- data/spec/lib/openc_bot/db/.gitkeep +0 -0
- data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
- data/spec/lib/openc_bot_spec.rb +116 -0
- data/spec/schemas/company-schema_spec.rb +676 -0
- data/spec/simple_openc_bot_spec.rb +302 -0
- data/spec/spec_helper.rb +19 -0
- metadata +300 -0
@@ -0,0 +1,385 @@
|
|
1
|
+
require 'simple_openc_bot'
|
2
|
+
require 'optparse'
|
3
|
+
require 'json'
|
4
|
+
require 'fileutils'
|
5
|
+
|
6
|
+
namespace :bot do
|
7
|
+
# bot:create — scaffolds a bot in the current working directory by
# delegating to create_bot with its default template.
desc "create a skeleton bot that can be used in OpenCorporates"
task(:create) { create_bot }
|
11
|
+
|
12
|
+
# bot:create_company_bot — scaffolds a bot in the current working directory
# using the 'company' template name.
# The description is distinct from bot:create's so the two tasks can be
# told apart in `rake -T` output.
# NOTE(review): create_bot('company') resolves to templates/lib/company.rb —
# confirm that this template actually ships with the gem.
desc "create a skeleton company bot that can be used in OpenCorporates"
task :create_company_bot do
  create_bot('company')
end
|
16
|
+
|
17
|
+
# bot:create_simple_bot — scaffolds a simple bot in the current working
# directory: creates the standard directories, copies every template with
# the MyLicence / my_module placeholders replaced by names derived from the
# directory name, makes the bin/ scripts executable and appends a test
# group to the Gemfile. Files that already exist are never overwritten.
desc "create a skeleton simple_bot that can be used in OpenCorporates"
task :create_simple_bot do
  working_dir = Dir.pwd
  bot_name = get_bot_name
  new_module_name = bot_name.split('_').collect(&:capitalize).join

  # mkdir_p is a no-op for directories that already exist.
  %w(bin db data lib spec spec/dummy_responses tmp pids).each do |new_dir|
    FileUtils.mkdir_p(File.join(working_dir, new_dir))
  end

  templates = ['spec/spec_helper.rb','spec/simple_bot_spec.rb','lib/simple_bot.rb', 'README.md', 'config.yml', 'bin/export_data', 'bin/fetch_data', 'bin/verify_data']
  templates.each do |template_location|
    # File.read closes the template handle as soon as it has been read.
    template = File.read(File.join(File.dirname(__FILE__), 'templates', template_location))
    template.gsub!('MyLicence', new_module_name)
    template.gsub!('my_module', bot_name)
    new_file = File.join(working_dir, "#{template_location.sub(/template/,'').sub(/simple_bot/,bot_name)}")
    begin
      # EXCL makes creation fail instead of clobbering an existing file.
      File.open(new_file, File::WRONLY|File::CREAT|File::EXCL) { |f| f.puts template }
      puts "Created #{new_file}"
    rescue Errno::EEXIST
      puts "Skipped creating #{new_file} as it already exists"
    end
  end

  # Mark the scripts executable once, after all of them have been written.
  FileUtils.chmod(0755, Dir.glob("#{working_dir}/bin/*"))

  # Add rspec and debugger to the Gemfile
  File.open(File.join(working_dir,'Gemfile'),'a') do |file|
    file.puts "group :test do\n gem 'rspec'\n gem 'debugger'\nend"
    puts "Added rspec and debugger to Gemfile at #{file.path}"
  end
  puts "Please run 'bundle install'"
end
|
46
|
+
|
47
|
+
# bot:run — loads the bot from lib/<directory name>.rb and calls its
# update_data with the options parsed from the command line, then prints
# the number of records it reports. The work is wrapped in
# only_process_running under the task's own name.
desc 'Get data from target'
task :run do |t|
  only_process_running(t.name) do
    options = { specific_ids: [], reset_iterator: false }
    # OptionParser.new takes a banner string as its first argument, so the
    # rake task arguments are not passed to it; parse! reads ARGV.
    OptionParser.new do |opts|
      opts.banner = "Usage: rake #{t.name} -- [options]"
      opts.on("-i", "--identifier UNIQUE_FIELD_VAL",
              "Identifier of specific record to fetch",
              " (may specify multiple times; refer to bot for its unique_fields)") do |val|
        options[:specific_ids] << val
      end
      opts.on("-t", "--test-mode",
              "Pass 'test' flag to bot") do
        options[:test_mode] = true
      end
      opts.on("-r", "--reset",
              "Don't resume incremental bots; reset and start from the beginning") do
        options[:reset_iterator] = true
      end
      opts.on("-m", "--max-iterations MAX_ITERATIONS",
              "Exit all iterators after MAX_ITERATIONS iterations. Useful for debugging.") do |val|
        options[:max_iterations] = val.to_i
      end
    end.parse!
    bot_name = get_bot_name
    require_relative File.join(Dir.pwd,'lib', bot_name)
    runner = callable_from_file_name(bot_name)
    count = runner.update_data(options)
    puts "Got #{count} records"
  end
end
|
80
|
+
|
81
|
+
# bot:update_stale — loads the bot from lib/<directory name>.rb and calls
# its update_stale, wrapped in only_process_running.
desc 'Update stale data from target'
task :update_stale do
  only_process_running('update_stale') do
    name = get_bot_name
    require_relative File.join(Dir.pwd, 'lib', name)
    callable_from_file_name(name).update_stale
  end
end
|
90
|
+
|
91
|
+
# bot:run_for_uid[uid] — loads the bot and calls update_datum for the single
# record identified by the :uid task argument.
desc 'Run bot, but just for record with given uid'
task :run_for_uid, :uid do |_task, args|
  only_process_running('run_for_uid') do
    name = get_bot_name
    require_relative File.join(Dir.pwd, 'lib', name)
    bot = callable_from_file_name(name)
    # Passing true as the second argument should make the bot print the
    # updated JSON for the uid to STDOUT as well as updating the local
    # database.
    bot.update_datum(args[:uid], true)
  end
end
|
102
|
+
|
103
|
+
# bot:export — loads the bot from lib/<directory name>.rb and calls its
# export with the options parsed from the command line. The work is wrapped
# in only_process_running under the task's own name.
desc 'Export data to stdout'
task :export do |t|
  only_process_running(t.name) do
    options = { specific_ids: [] }
    # OptionParser.new takes a banner string as its first argument, so the
    # rake task arguments are not passed to it; parse! reads ARGV.
    OptionParser.new do |opts|
      opts.banner = "Usage: rake #{t.name} -- [options]"
      opts.on("-i", "--identifier UNIQUE_FIELD_VAL",
              "Identifier of specific record to export",
              " (may specify multiple times; refer to bot for its unique_fields)") do |val|
        options[:specific_ids] << val
      end
      opts.on("-a", "--all",
              "Export everything (default is only to export data that has changed since last export)") do
        options[:all] = true
      end
    end.parse!
    bot_name = get_bot_name
    require_relative File.join(Dir.pwd,'lib', bot_name)
    runner = callable_from_file_name(bot_name)
    runner.export(options)
  end
end
|
126
|
+
|
127
|
+
# bot:spotcheck — loads the bot from lib/<directory name>.rb and calls its
# spotcheck, wrapped in only_process_running.
desc 'Export 5 records to stdout for manual checking'
task :spotcheck do
  only_process_running('spotcheck') do
    name = get_bot_name
    require_relative File.join(Dir.pwd, 'lib', name)
    callable_from_file_name(name).spotcheck
  end
end
|
136
|
+
|
137
|
+
# bot:summarise_data — runs a full export (`bot:export -- -a`) in a
# subprocess, parses each output line as JSON and prints summary tables:
# value counts, shortest/longest values and earliest/latest dates.
desc 'Summarise data for quality checking (only works for licences at the moment)'
task :summarise_data do
  # Prints how often each distinct value occurs in data, least frequent
  # first. Array values are shown joined with ", ".
  def as_sorted_hash(name, data)
    title = "#{name} counts:"
    puts title
    puts "-" * title.length
    pairs = data.group_by { |i| i }.map { |k, v| [Array(k).join(", "), v.count] }
    Hash[pairs].sort_by { |_, count| count }.each do |label, count|
      printf("%-60s %10s\n", label, count)
    end
    puts
  end

  # Prints the five shortest and the five longest non-nil values in data.
  def as_longest_and_shortest(name, data)
    sorted = data.compact.sort_by { |n| n.length }
    puts
    title = "shortest 5 #{name}"
    puts title
    puts "-" * title.length
    # first/last return at most five items and never nil, even when there
    # are fewer than five values.
    puts sorted.first(5)
    puts
    title = "longest 5 #{name}"
    puts title
    puts "-" * title.length
    puts sorted.last(5)
    puts
  end

  # Collects the fields of interest from the export and prints each summary.
  def main
    result = `bundle exec openc_bot rake bot:export -- -a`
    jurisdictions = []
    names = []
    start_dates = []
    end_dates = []
    sample_dates = []
    licence_numbers = []
    jurisdiction_classifications = []
    result.split(/\r?\n/).each do |line|
      record = JSON.parse(line)
      jurisdictions << record["company"]["jurisdiction"]
      names << record["company"]["name"]
      start_dates << record["start_date"]
      end_dates << record["end_date"]
      sample_dates << record["sample_date"]
      properties = record["data"][0]["properties"]
      licence_numbers << properties["licence_number"]
      jurisdiction_classifications << properties["jurisdiction_classification"]
    end

    as_sorted_hash("[company][jurisdiction]", jurisdictions)

    as_longest_and_shortest("[company][name]s", names)

    # Dates are compared as the strings found in the export.
    start_dates = start_dates.compact.sort
    end_dates = end_dates.compact.sort
    sample_dates = sample_dates.compact.sort
    puts
    puts "Dates"
    puts "-----"
    printf("%-22s %10s\n", "Earliest start_date:", start_dates.first)
    printf("%-22s %10s\n", "Earliest end_date:", end_dates.first)
    printf("%-22s %10s\n", "Earliest sample_date:", sample_dates.first)
    printf("%-22s %10s\n", "Latest start_date:", start_dates.last)
    printf("%-22s %10s\n", "Latest end_date:", end_dates.last)
    printf("%-22s %10s\n", "Latest sample_date:", sample_dates.last)

    as_longest_and_shortest("licence numbers", licence_numbers)
    as_sorted_hash("jurisdiction_classifications", jurisdiction_classifications)
  end

  main()
end
|
218
|
+
|
219
|
+
# bot:lint — loads the bot and prints a numbered list of problems found by
# a few static checks (export_data arity and, for SimpleOpencBot instances,
# the presence of fetch_all_records, a `yields` declaration and a `yield`
# inside fetch_all_records). Prints "No problems!" when nothing is found.
desc 'Lint old-style bots'
task :lint do
  bot_name = get_bot_name
  require_relative File.join(Dir.pwd,'lib', bot_name)
  runner = callable_from_file_name(bot_name)
  messages = []
  if runner.method(:export_data).arity == 0
    messages << "export_data method must accept a hash as a single argument (e.g. `export_data(opts={})`)"
  end

  if runner.is_a? SimpleOpencBot
    if !runner.respond_to? "fetch_all_records"
      messages << "You must rename fetch_records -> fetch_all_records."
    end

    full_source = File.read(File.join(Dir.pwd,'lib', bot_name) + ".rb")
    # Regexp literals are used because "\s" inside a double-quoted string
    # is just a space; [ \t] keeps the match on a single line.
    if full_source !~ /^[ \t]+yields\b/
      messages << <<EOF
You must call the `yields` class method with the class
of the Records you're returning. For example:

    class FooLicenses < SimpleOpencBot
      yields FooLicensesRecord

EOF
    end

    # fetch_all_records must yield rather than return
    if runner.respond_to? "fetch_all_records"
      source, line = runner.method(:fetch_all_records).source_location
      found = false
      File.foreach(source).with_index(1) do |l, lineno|
        # Skip everything up to and including the `def` line itself.
        next if lineno <= line

        if l =~ /^[ \t]+yield\b/
          found = true
          break
        end
        # Stop at the next method definition; \b keeps identifiers such as
        # `default` from ending the scan early.
        break if l =~ /^[ \t]+def\b/
      end
      messages << "fetch_all_records must `yield` single records (rather than returning an array)" if !found
    end
  end
  messages.each_with_index do |m, i|
    puts "#{i + 1}:"
    puts m
    puts "------------"
  end
  puts "No problems!" if messages.empty?
end
|
271
|
+
|
272
|
+
# bot:test — loads the bot and validates its data. Bots that respond to
# validate_data are asked for their list of problems; otherwise every datum
# returned by export_data is checked with OpencBot::BotDataValidator.
# Raises OpencBot::InvalidDataError on the first failure.
task :test do
  name = get_bot_name
  require_relative File.join(Dir.pwd, 'lib', name)
  bot = callable_from_file_name(name)
  if bot.respond_to?(:validate_data)
    problems = bot.validate_data
    raise OpencBot::InvalidDataError.new(problems) unless problems.empty?
  else
    bot.export_data.each do |datum|
      next if OpencBot::BotDataValidator.validate(datum)
      raise OpencBot::InvalidDataError.new("This datum is invalid: #{datum.inspect}")
    end
  end
  puts "Congratulations! This data appears to be valid"
end
|
290
|
+
|
291
|
+
# Looks up the top-level constant named after a snake_case file name,
# e.g. 'foo_bar' -> FooBar. Each underscore-separated part is capitalized
# (first letter upper-cased, the rest lower-cased) and the parts are joined.
# Raises NameError when no such constant exists.
def klass_from_file_name(underscore_file_name)
  constant_name = underscore_file_name.split('_').inject('') do |acc, part|
    acc + part.capitalize
  end
  Object.const_get(constant_name)
end
|
295
|
+
|
296
|
+
# Returns the object the tasks should call bot methods on.
# At the moment there are simple bots and bots: the former are expected to
# be instances, the latter modules with class methods. So anything that
# responds to :new is instantiated, and anything else is returned as is.
def callable_from_file_name(underscore_file_name)
  bot_klass = klass_from_file_name(underscore_file_name)
  bot_klass.respond_to?(:new) ? bot_klass.new : bot_klass
end
|
307
|
+
|
308
|
+
# Scaffolds a bot project in the current working directory.
#
# template_name - base name of the bot template; the file read is
#                 templates/lib/<template_name>.rb (default 'bot').
#
# Creates the standard directories, copies each template with the
# MyModule / my_module placeholders replaced by names derived from the
# directory name, and appends a test group to the Gemfile. Files that
# already exist are left untouched.
# NOTE(review): with 'company' this looks for templates/lib/company.rb —
# confirm that template ships with the gem.
def create_bot(template_name='bot')
  working_dir = Dir.pwd
  bot_name = get_bot_name
  new_module_name = bot_name.split('_').collect(&:capitalize).join

  %w(bin db data lib spec spec/dummy_responses tmp pids).each do |new_dir|
    FileUtils.mkdir_p(File.join(working_dir, new_dir))
  end

  bot_template = "lib/#{template_name}.rb"
  templates = ['spec/spec_helper.rb','spec/bot_spec.rb', 'README.md', 'config.yml', bot_template]
  templates.each do |template_location|
    # File.read closes the template handle as soon as it has been read.
    template = File.read(File.join(File.dirname(__FILE__), 'templates', template_location))
    template.gsub!('MyModule', new_module_name)
    template.gsub!('my_module', bot_name)
    new_file = File.join(working_dir, "#{template_location.sub(/template/,'').sub(/bot/,bot_name)}")
    # File.exist? is used because File.exists? is deprecated and no longer
    # available in Ruby 3.2+.
    next if File.exist?(new_file)
    File.open(new_file, File::WRONLY|File::CREAT|File::EXCL) { |f| f.puts template }
    puts "Created #{new_file}"
  end

  # Add rspec and debugger to the Gemfile
  File.open(File.join(working_dir,'Gemfile'),'a') do |file|
    file.puts "group :test do\n gem 'rspec'\n gem 'debugger'\nend"
    # file.path, so the message shows the location rather than #<File:...>
    puts "Added rspec and debugger to Gemfile at #{file.path}"
  end
  puts "Please run 'bundle install'"
end
|
338
|
+
|
339
|
+
# Returns the bot's name, which is the last path component of the current
# working directory.
def get_bot_name
  Dir.pwd.split('/').last
end
|
342
|
+
|
343
|
+
# Runs the given block while holding a pid file at pids/<task_name> under
# the current working directory.
#
# task_name - name used for the pid file.
#
# Raises (via raise_if_already_running) when the pid file names a live
# process. The pid file is removed when the block finishes, whether it
# returns or raises. Returns the block's value.
def only_process_running(task_name)
  pid_path = File.join(Dir.pwd, 'pids', task_name)
  # The pids directory may be missing (for example in a fresh checkout);
  # create it so that writing the pid file cannot fail with ENOENT.
  FileUtils.mkdir_p(File.dirname(pid_path))

  raise_if_already_running(pid_path)
  write_pid_file(pid_path)

  begin
    yield
  ensure
    remove_pid_file(pid_path)
  end
end
|
355
|
+
|
356
|
+
# Raises RuntimeError ('Already running') when the pid file at pid_path
# names a process that currently exists. Returns nil when the file is
# missing, holds no usable pid, or names a process that no longer exists.
def raise_if_already_running(pid_path)
  begin
    pid = File.read(pid_path).to_i
  rescue Errno::ENOENT
    # PID file doesn't exist
    return
  end

  # to_i gives 0 for an empty or garbled file, and getpgid(0) would query
  # the current process, so a stale file would look like a running bot.
  return unless pid > 0

  begin
    Process.getpgid(pid)
  rescue Errno::ESRCH
    # Process with PID doesn't exist
    # TODO Log this
    return
  end

  # Process with PID does exist
  # TODO Log this
  raise 'Already running'
end
|
376
|
+
|
377
|
+
# Writes the current process id to pid_path, replacing any existing
# content. Returns the number of bytes written.
def write_pid_file(pid_path)
  File.write(pid_path, Process.pid)
end
|
380
|
+
|
381
|
+
# Deletes the pid file at pid_path. Raises Errno::ENOENT when the file
# does not exist.
def remove_pid_file(pid_path)
  File.unlink(pid_path)
end
|
384
|
+
|
385
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# MyModule Bot
|
2
|
+
|
3
|
+
## About the data publisher
|
4
|
+
|
5
|
+
Describe the source. Specifically:
|
6
|
+
|
7
|
+
* Who is behind it?
|
8
|
+
* What gives it regulatory power (thus justifying it being in
|
9
|
+
OpenCorporates)?
|
10
|
+
* How often is it updated?
|
11
|
+
|
12
|
+
## About the data
|
13
|
+
|
14
|
+
* Give a sample URL showing a typical page in the source, or
|
15
|
+
instructions on how to find one.
|
16
|
+
* Provide references, if possible, to where the meanings of the fields
|
17
|
+
are defined.
|
18
|
+
* Found any interesting bits of data while debugging? Mention them here!
|
19
|
+
|
20
|
+
### Data exported
|
21
|
+
|
22
|
+
* List the fields that this bot currently exports to OpenCorporates,
|
23
|
+
along with their meaning within this jurisdiction.
|
24
|
+
|
25
|
+
### Data not currently exported, but stored
|
26
|
+
|
27
|
+
* List the fields that might be of interest to OpenCorporates in the
|
28
|
+
future. For example, the source might contain address data, but the
|
29
|
+
scraper might currently only focus on licenses.
|
30
|
+
|
31
|
+
## Assumptions
|
32
|
+
|
33
|
+
* What assumptions have you made in preparing the data for export?
|
34
|
+
E.g. you might be inferring that a field refers to a company (rather
|
35
|
+
than a person) based on a regular expression.
|
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Runs the bot's export task from the project root (the parent of this
# script's directory) and, when it prints anything, saves the output to
# data/<YYYY-MM-DD>/export-<n>.json. Exits 0 on success; on failure the
# task's stderr is echoed and the script exits 1.
require 'open3'
require 'fileutils'

command = "bundle exec openc_bot rake bot:export"
options = { chdir: File.join(File.dirname(__FILE__), "..") }
# capture3 reads stdout and stderr while the child is running. Waiting for
# the child to exit before reading would deadlock as soon as the export
# grew larger than the pipe buffer.
data, errors, result = Open3.capture3(command, options)

if result.success?
  if data.strip.empty?
    puts "No new data to export"
  else
    dir = "data/#{Time.now.strftime('%Y-%m-%d')}"
    FileUtils.mkdir_p(dir)
    # Number exports within a day: export-1.json, export-2.json, ...
    export_number = Dir.glob("#{dir}/export-*.json").count + 1
    dest = "#{dir}/export-#{export_number}.json"
    File.open(dest, "w") do |f|
      f.write(data)
    end
    puts "Written data to #{dest}"
  end
  exit 0
else
  STDERR.puts errors
  exit 1
end
|