openc_bot 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.travis.yml +8 -0
- data/CHANGELOG.md +2 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +22 -0
- data/README.md +253 -0
- data/Rakefile +14 -0
- data/bin/openc_bot +13 -0
- data/create_bot.sh +30 -0
- data/create_company_bot.sh +16 -0
- data/create_simple_licence_bot.sh +31 -0
- data/db/.gitkeep +0 -0
- data/examples/basic/.gitignore +3 -0
- data/examples/basic/Gemfile +7 -0
- data/examples/basic/config.yml +21 -0
- data/examples/basic/lib/basic.rb +88 -0
- data/examples/basic_with_proxy/Gemfile +7 -0
- data/examples/basic_with_proxy/config.yml +21 -0
- data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
- data/examples/bot_with_simple_iterator/Gemfile +6 -0
- data/examples/bot_with_simple_iterator/config.yml +21 -0
- data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
- data/examples/company_fetchers/basic.rb +49 -0
- data/lib/monkey_patches/mechanize.rb +53 -0
- data/lib/openc_bot.rb +89 -0
- data/lib/openc_bot/bot_data_validator.rb +18 -0
- data/lib/openc_bot/company_fetcher_bot.rb +40 -0
- data/lib/openc_bot/exceptions.rb +17 -0
- data/lib/openc_bot/helpers/_csv.rb +10 -0
- data/lib/openc_bot/helpers/alpha_search.rb +73 -0
- data/lib/openc_bot/helpers/dates.rb +33 -0
- data/lib/openc_bot/helpers/html.rb +8 -0
- data/lib/openc_bot/helpers/incremental_search.rb +106 -0
- data/lib/openc_bot/helpers/register_methods.rb +205 -0
- data/lib/openc_bot/helpers/text.rb +18 -0
- data/lib/openc_bot/incrementers.rb +2 -0
- data/lib/openc_bot/incrementers/base.rb +214 -0
- data/lib/openc_bot/incrementers/common.rb +47 -0
- data/lib/openc_bot/tasks.rb +385 -0
- data/lib/openc_bot/templates/README.md +35 -0
- data/lib/openc_bot/templates/bin/export_data +28 -0
- data/lib/openc_bot/templates/bin/fetch_data +23 -0
- data/lib/openc_bot/templates/bin/verify_data +1 -0
- data/lib/openc_bot/templates/config.yml +21 -0
- data/lib/openc_bot/templates/lib/bot.rb +43 -0
- data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
- data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
- data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
- data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
- data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
- data/lib/openc_bot/version.rb +3 -0
- data/lib/simple_openc_bot.rb +289 -0
- data/openc_bot.gemspec +35 -0
- data/schemas/company-schema.json +112 -0
- data/schemas/includes/address.json +23 -0
- data/schemas/includes/base-statement.json +27 -0
- data/schemas/includes/company.json +14 -0
- data/schemas/includes/filing.json +20 -0
- data/schemas/includes/license-data.json +27 -0
- data/schemas/includes/officer.json +14 -0
- data/schemas/includes/previous_name.json +11 -0
- data/schemas/includes/share-parcel-data.json +67 -0
- data/schemas/includes/share-parcel.json +60 -0
- data/schemas/includes/subsidiary-relationship-data.json +52 -0
- data/schemas/includes/total-shares.json +10 -0
- data/schemas/licence-schema.json +21 -0
- data/schemas/share-parcel-schema.json +21 -0
- data/schemas/subsidiary-relationship-schema.json +19 -0
- data/spec/dummy_classes/foo_bot.rb +4 -0
- data/spec/lib/bot_data_validator_spec.rb +69 -0
- data/spec/lib/company_fetcher_bot_spec.rb +93 -0
- data/spec/lib/exceptions_spec.rb +25 -0
- data/spec/lib/helpers/alpha_search_spec.rb +173 -0
- data/spec/lib/helpers/dates_spec.rb +65 -0
- data/spec/lib/helpers/incremental_search_spec.rb +471 -0
- data/spec/lib/helpers/register_methods_spec.rb +558 -0
- data/spec/lib/helpers/text_spec.rb +50 -0
- data/spec/lib/openc_bot/db/.gitkeep +0 -0
- data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
- data/spec/lib/openc_bot_spec.rb +116 -0
- data/spec/schemas/company-schema_spec.rb +676 -0
- data/spec/simple_openc_bot_spec.rb +302 -0
- data/spec/spec_helper.rb +19 -0
- metadata +300 -0
#!/usr/bin/env ruby
# Wrapper executable: runs this bot's rake run task in a child process
# (from the bot's root directory) and relays its output and exit status.
require 'trollop'
require 'open3'

opts = Trollop::options do
  opt :test, "Run in test mode", :short => 't'
end

command = "bundle exec openc_bot rake bot:run[#{opts[:test]||''}]"
options = { chdir: File.join(File.dirname(__FILE__), "..") }

# Drain stdout and stderr concurrently while the child is still running.
# The previous version called wait_thread.value first and only read the
# pipes afterwards, which deadlocks as soon as the child writes more than
# a pipe buffer's worth of output.
out = nil
err = nil
result = Open3.popen3(command, options) do |_stdin, stdout, stderr, wait_thread|
  out_reader = Thread.new { stdout.read }
  err_reader = Thread.new { stderr.read }
  out = out_reader.value
  err = err_reader.value
  wait_thread.value # Process::Status of the child
end

puts "Running in test mode" if opts[:test]

if result.success?
  puts out
  exit 0
else
  STDERR.puts err
  exit 1
end
|
# Entry point used to validate the bot's fetched data: delegates to the
# bot:test rake task provided by the openc_bot gem.
bundle exec openc_bot rake bot:test
|
# A short description of the kind of data the bot handles.
description: ''

# Your confidence, between 0 and 100, in the accuracy of the data
# provided by the data source. It is suggested that you do not change this
# without first discussing with OpenCorporates.
default_confidence: 80

# Don't change this. OpenCorporates will bump this version number when the
# bot is running in production.
version: 0

# The frequency at which updates may happen at the source. One of: daily,
# weekly, monthly or yearly.
frequency: monthly

# If this flag is true, our deployment script will set up the bot to
# be run on OpenCorporates servers (specifically, deploy the bot and
# allow it to be executed from the data pipeline). Please don't change
# this; we'll change it when we're ready to test the bot.
enabled: false
|
# encoding: UTF-8
require 'openc_bot'

# Require any additional libraries your bot needs here, e.g.
# require 'nokogiri'

module MyModule
  extend OpencBot
  extend self # expose the methods below directly on the module

  # Exports the locally stored data in the form OpenCorporates expects.
  # By default everything saved via save_data lives in the ocdata table,
  # but the table can have any name, and the query can be narrowed (for
  # example to return only the most recent results).
  def export_data
    sql_query = "ocdata.* from ocdata"
    select(sql_query).map do |row|
      # Each row is a Hash keyed by field name (symbols). Convert it into
      # the export format with #prepare_for_export.
      prepare_for_export(row)
    end
  end

  # Stub: transform a raw database row into the structure required by the
  # export. Implement this for your bot.
  def prepare_for_export(raw_data)
  end

  # Stub: fetch data from the source you are scraping and store it
  # locally, e.g. with:
  #   save_data([:uid, :date], my_data, sometablename)
  # See https://github.com/openc/openc_bot README for details.
  def update_data
    # Always record the outcome of the run (plus any other applicable data).
    save_run_report(:status => 'success')
  end
end
|
# encoding: UTF-8
require 'openc_bot'
require 'openc_bot/company_fetcher_bot'

# Require any additional libraries your bot needs here, e.g.
# require 'nokogiri'

# Uncomment (together with the extend line further down) for the Date helper
# methods. Csv and text helpers are also available.
# require 'openc_bot/helpers/dates'

module MyModule
  extend OpencBot
  # Mixes in the CompanyFetcherBot behaviour
  extend OpencBot::CompanyFetcherBot
  # Uncomment for the Date helper methods
  # extend OpencBot::Helpers::Dates
  extend self # expose the methods below directly on the module

  # Uncomment to use alpha search; the default is incremental search.
  # USE_ALPHA_SEARCH = true

  # Number of characters used for alpha-search terms. Defaults to 1
  # (i.e. 'A', 'B', ...).
  # NUMBER_OF_CHARS_IN_SEARCH = 3

  # If the register exposes a GET'able URL keyed on the company_number,
  # return it here — then #fetch_datum should 'just work'.
  def computed_registry_url(company_number)
    # e.g.
    # "http://some,register.com/path/to/#{company_number}"
  end

  # #fetch_data is the primary way companies are pulled from the register;
  # it is invoked when the bot is run (bundle exec openc_bot rake bot:run
  # calls #update_data, which calls this). The default is an incremental
  # search over :company_number identifiers, or — when USE_ALPHA_SEARCH is
  # set — an alpha search ('AA', 'AB', ...). Define it locally if companies
  # are obtained some other way (e.g. by parsing a CSV file).
  # def fetch_data
  # end

  # Called by #update_datum (from the IncrementalSearch helper) to refresh a
  # single company_number — e.g. via the 'Update from Register' button on
  # OpenCorporates — and also by #fetch_data during incremental searches.
  # By default it fetches the registry page for the number and wraps the
  # result in a hash under :company_page; that hash is then parsed by
  # #process_datum and saved by #update_datum. The hash may carry extra
  # material (filings, shareholdings, ...); it is serialised to JSON and
  # stored under the :data key on the company's row for later reuse, e.g.
  # {:company_page => company_page_html, :filings_page => filings_page_html}
  # def fetch_datum(company_number)
  # end

  # Must be implemented by any bot that fetches and processes individual
  # records (incremental searchers, or per-company pages in an alpha
  # search). Where that isn't possible — the data only comes as a CSV, or
  # there is no per-company page — leave it as a stub. The return value
  # must be a hash conforming to the company-schema; it is validated
  # against that schema before being saved.
  def process_datum(datum_hash)
    # Parse the fetched company pages/data here.
  end

  # Standard alpha-search driver: iterates search terms A-Z0-9. Increase the
  # term length via NUMBER_OF_CHARS_IN_SEARCH (above). Define this locally
  # if you need different behaviour to this.
  # def fetch_data_via_alpha_search(options={})
  #   starting_term = options[:starting_term]||get_var('starting_term')
  #   each_search_term(starting_term) do |term|
  #     save_var('starting_term', term)
  #     search_for_entities_for_term(term, options) do |entity_datum|
  #       save_entity(entity_datum)
  #     end
  #   end
  #   # reset pointer
  #   save_var('starting_term',nil)
  # end

  # Called by #fetch_data_via_alpha_search (from the AlphaSearch helper)
  # with each search term (e.g. 'AB', 'AC', ...). Should yield hashes of
  # company data that validate against the company-schema.
  def search_for_entities_for_term(term, options={})
    # Search all result pages for the term and yield company hashes.
  end
end
|
# encoding: UTF-8
require 'simple_openc_bot'

# Require any additional libraries here, e.g.
# require 'nokogiri'
# require 'mechanize'

class MyLicenceRecord < SimpleOpencBot::BaseLicenceRecord
  # JSON schema used to validate records; corresponds to a file in
  # `schema/*-schema.json`.
  schema :licence

  # These fields are persisted to a local database whenever
  # 'fetch_records' (see the bot class) runs.
  store_fields :name, :type, :reporting_date

  # The field(s) that uniquely identify a record — the equivalent of a
  # database primary key.
  unique_fields :name

  # Required: an ISO8601 timestamp whose value changes whenever the record
  # changes. It need not be a method; a `store_fields` member works too.
  def last_updated_at
    reporting_date
  end

  # Required: the record in pipeline format. Check the output with
  # `bin/verify_data`, which validates fetched data against the relevant
  # schema; see `doc/SCHEMA.md` for documentation.
  def to_pipeline
    {
      sample_date: last_updated_at,
      company: {
        name: name,
        jurisdiction: "xx",
      },
      source_url: "xx",
      data: [{
        data_type: :licence,
        properties: {
          jurisdiction_code: "xx",
          category: 'Financial',
          jurisdiction_classification: [type],
        }
      }]
    }
  end
end
class MyLicence < SimpleOpencBot
  # The record class yielded by `fetch_records`. Must be declared.
  yields MyLicenceRecord

  # Required: yields Record instances, one per licence found at the source.
  def fetch_all_records(opts={})
    data = [{:name => "A", :type => "B"}]
    data.each do |datum|
      record = MyLicenceRecord.new(datum.merge(:reporting_date => Time.now.iso8601(2)))
      yield record
    end
  end
end
require 'rspec/autorun'
# The debugger gem is optional (and no longer installable on modern
# Rubies); don't let its absence abort the whole spec suite.
begin
  require 'debugger'
rescue LoadError
end

RSpec.configure do |config|
end

# Utility method to load sample html pages, csv files, json or whatever.
# Expects the files to be stored in a 'dummy_responses' folder in the spec
# directory. `options` is passed straight through to IO.read (e.g. encoding).
def dummy_response(response_name, options={})
  IO.read(File.join(File.dirname(__FILE__), "dummy_responses", response_name.to_s), options)
end
require 'active_support/core_ext'
require 'openc_bot'
require 'json-schema'
require 'openc_bot/incrementers'

# Base class for 'simple' bots: subclasses yield BaseLicenceRecord
# instances from #fetch_all_records; this class handles persistence to a
# local sqlite table (ocdata), incremental export, and schema validation.
class SimpleOpencBot
  include OpencBot

  class_attribute :_yields

  # Declares the single Record class the bot yields.
  def self.yields(*fields)
    raise "We currently only support one Record type per bot" if fields.count > 1
    self._yields = fields
  end

  # Records the directory of the subclass's source file so the sqlite db
  # can be located relative to the bot, not to this gem.
  def self.inherited(obj)
    path, = caller[0].partition(":")
    path = File.expand_path(File.join(File.dirname(path), ".."))
    @@simple_app_directory = path
  end

  # Override default in ScraperWiki gem: open the db under the bot's own
  # db/ directory (or wherever @config[:db] points, when set).
  def sqlite_magic_connection
    db = @config ? @config[:db] : File.expand_path(File.join(@@simple_app_directory, 'db', db_name))
    @sqlite_magic_connection ||= SqliteMagic::Connection.new(db)
  end

  # Fetches records (all of them, or just opts[:specific_ids]) from the
  # source and upserts them into ocdata in batches. Returns the number of
  # records saved.
  def update_data(opts={})
    if opts[:specific_ids].nil? || opts[:specific_ids].empty?
      # fetch everything
      record_enumerator = Enumerator.new do |yielder|
        fetch_all_records(opts) do |result|
          yielder.yield(result)
        end
      end
    else
      # fetch records with specified ids
      record_enumerator = Enumerator.new do |yielder|
        fetch_specific_records(opts) do |result|
          yielder.yield(result)
        end
      end
    end
    saves_count = 0
    batch_size = opts[:test_mode] ? 1 : 500
    record_enumerator.each_slice(batch_size) do |records|
      begin
        sqlite_magic_connection.execute("BEGIN TRANSACTION")
        records.each do |record|
          insert_or_update(record.class.unique_fields,
                           record.to_hash)
          saves_count += 1
          if saves_count == 1
            # TODO: move this validation to somewhere more explicit
            raise "Bot must specify what record type it will yield" if _yields.nil?
            check_unique_index(_yields[0])
          end
          STDOUT.print(".")
          STDOUT.flush
        end
      ensure
        sqlite_magic_connection.execute("COMMIT") if sqlite_magic_connection.database.transaction_active?
      end
    end
    # NB #save_run_report saves a timestamp called :run_at, so :completed_at is redundant.
    # Probably should either remove or replace with another datapoint, e.g. started_at
    save_run_report(:status => 'success', :completed_at => Time.now)
    saves_count
  end

  # Verifies that the record class's unique_fields match the unique
  # index(es) sqlite auto-created on ocdata; raises (after rolling back any
  # open transaction) on mismatch.
  # should this be a method in sqlite_magic gem?
  def check_unique_index(record_class)
    indexes = sqlite_magic_connection.execute("PRAGMA INDEX_LIST('ocdata')")
    db_unique_fields = indexes.map do |i|
      next if i["unique"] != 1
      next unless i["name"] =~ /autoindex/
      info = sqlite_magic_connection.execute("PRAGMA INDEX_INFO('#{i["name"]}')")
      info.map{|x| x["name"]}
    end.compact.flatten
    record_unique_fields = record_class.unique_fields.map(&:to_s)
    if db_unique_fields != record_unique_fields
      sqlite_magic_connection.execute("ROLLBACK")
      error = "Unique fields #{record_unique_fields} do not match the unique index(es) in `ocdata` table!"
      error += "\nThis is usually because the value of unique_field has changed since the table was automatically created."
      error += "\nUnique fields in `ocdata`: #{db_unique_fields.flatten}; in record #{record_class.name}: #{record_unique_fields}"
      raise error
    end
  end

  # Total number of stored records; 0 if the table doesn't exist yet.
  def count_stored_records
    begin
      all_stored_records(:count => true).first["count"]
    rescue SqliteMagic::NoSuchTable
      0
    end
  end

  # Builds and runs a SELECT over ocdata. Supported opts: :select, :table,
  # :where, :order, :limit, :batch, :count, :debug, :only_unexported,
  # :specific_ids. NOTE: opts values are interpolated straight into SQL, so
  # they must come from trusted (bot-internal) callers only.
  def all_stored_records(opts={})
    if opts[:only_unexported]
      opts[:limit] ||= opts[:batch]
    end

    select = opts[:select] || "ocdata.*"
    table = opts[:table] || "ocdata"
    where = (opts[:where] ? "\nWHERE #{opts[:where]}\n" : "\nWHERE 1 \n")
    order = (opts[:order] ? "\nORDER BY #{opts[:order]}\n" : "")
    limit = (opts[:limit] ? "\nLIMIT #{opts[:limit]}\n" : "")

    if opts[:only_unexported]
      where += " AND (_last_exported_at IS NULL "\
        "OR _last_exported_at < _last_updated_at)"

      if !opts[:specific_ids].blank?
        ids = opts[:specific_ids].map{|id| "'#{id}'"}.join(",")
        # BUGFIX: was `_yields[0].unique_field` (singular), which is not
        # defined on BaseLicenceRecord and raised NoMethodError. An IN
        # clause only supports a single column, so use the first (in
        # practice only) unique field.
        where += " AND #{_yields[0].unique_fields.first} IN (#{ids})"
      end
    end

    if opts[:count]
      sql = "COUNT(*) AS count from #{table} #{where}"
      puts sql if opts[:debug]
      select(sql)
    else
      sql = "#{select} from #{table} #{where} #{order} #{limit}"
      puts sql if opts[:debug]
      select_records(sql)
    end
  end

  # Records not yet exported (or updated since their last export).
  def unexported_stored_records(opts={})
    # non-destructive merge: don't mutate the caller's opts hash
    all_stored_records(opts.merge(:only_unexported => true))
  end

  # A small random sample of stored records, for spot-checking.
  def spotcheck_records(limit = 5)
    all_stored_records(:order => "RANDOM()", :limit => limit)
  end

  # Runs sql and rehydrates each row into its record class (stored in the
  # row's _type column).
  def select_records(sql)
    select(sql).map { |record| record['_type'].constantize.new(record) }
  end

  # Returns an Enumerator of pipeline-format hashes for unexported records
  # (or all records when opts[:all]), stamping _last_exported_at as it goes
  # unless opts[:all] is set.
  def export_data(opts={})
    begin
      sqlite_magic_connection.add_columns(
        'ocdata', [:_last_exported_at, :_last_updated_at])
    rescue SQLite3::SQLException
      # columns already exist; nothing to do
    end
    Enumerator.new do |yielder|
      b = 1
      loop do
        if opts[:all]
          break if b > 1
          batch = all_stored_records(opts)
        else
          batch = unexported_stored_records(:batch => 100, :specific_ids => opts[:specific_ids])
        end
        break if batch.empty?
        updates = {}
        batch.each do |record|
          pipeline_data = record.to_pipeline
          next if pipeline_data.nil?
          updates[record.class.name] ||= []
          # opts[:all] is currently called in the bot:test rake task.
          # That has the unfortunate side effect of updating the
          # _last_exported_at time when running the validation task, so
          # the timestamp is only written when not exporting everything.
          if !opts[:all]
            updates[record.class.name] << record.to_hash.merge(
              :_last_exported_at => Time.now.iso8601(2))
          else
            updates[record.class.name] << record.to_hash
          end
          yielder << pipeline_data
        end
        sqlite_magic_connection.execute("BEGIN TRANSACTION")
        if b == 1
          check_unique_index(_yields[0])
        end
        updates.each do |k, v|
          save_data(k.constantize.unique_fields, v)
        end
        sqlite_magic_connection.execute("COMMIT")
        b += 1
      end
    end
  end

  # Pipeline-format output for a random sample of records.
  def spotcheck_data
    batch = spotcheck_records
    batch.collect do |record|
      record.to_pipeline
    end
  end

  # Validates up to opts[:limit] (default 1000) stored records against
  # their schema; returns the array of error structures.
  def validate_data(opts={})
    opts = {:limit => 1000}.merge(opts)
    errors = all_stored_records(opts).map do |record|
      record.errors
    end.compact
    total = count_stored_records
    selected = [opts[:limit], total].min
    puts "NOTICE: only validated first #{selected} of #{total} records"
    errors
  end


  # Base class for the records a simple bot yields. Subclasses declare
  # schema, store_fields and unique_fields, and implement #last_updated_at
  # and #to_pipeline.
  class BaseLicenceRecord
    class_attribute :_store_fields, :_type, :_schema, :_unique_fields

    # Declares the fields persisted to the local database; also defines
    # accessors for them (plus the internal export/update timestamps).
    def self.store_fields(*fields)
      self._store_fields ||= []
      self._store_fields.concat(fields)
      fields << :_last_exported_at unless _store_fields.include?(:_last_exported_at)
      fields << :_last_updated_at unless _store_fields.include?(:_last_updated_at)
      fields.each do |field|
        attr_accessor field
      end
    end

    # With args: declares the unique (primary-key-like) fields.
    # Without args: returns them.
    def self.unique_fields(*fields)
      self._unique_fields = fields unless fields.empty?
      self._unique_fields
    end

    # Resolves a schema name (e.g. :licence) to the bundled
    # schemas/<name>-schema.json path.
    def self.schema(schema)
      hyphenated_name = schema.to_s.gsub("_", "-")
      self._schema = File.expand_path("../../schemas/#{hyphenated_name}-schema.json", __FILE__)
    end

    def initialize(attrs={})
      validate_instance!
      attrs = attrs.with_indifferent_access
      self._type = self.class.name
      self._store_fields.each do |k|
        send("#{k}=", attrs[k])
      end
    end

    # Raises unless the subclass has declared the required methods
    # (#last_updated_at, #to_pipeline) and class-level fields.
    def validate_instance!
      all_errors = []
      required_functions = [:last_updated_at, :to_pipeline]
      func_errors = []
      required_functions.each do |func|
        if !respond_to?(func)
          func_errors << func
        end
      end
      if !func_errors.empty?
        all_errors << "You must define the following functions in your record class: #{func_errors.join(', ')}"
      end
      field_errors = []
      required_fields = [:_store_fields, :_unique_fields, :_schema]
      required_fields.each do |f|
        if !send(f)
          field_errors << f.to_s[1..-1]
        end
      end
      if !field_errors.empty?
        all_errors << "You must define the following fields on your record class: #{field_errors.join(', ')}"
      end
      # BUGFIX: was join('\n') — single quotes made that a literal
      # backslash-n, so multiple errors were glued onto one line.
      raise all_errors.join("\n") unless all_errors.empty?
    end

    # The record as stored in the database row.
    def to_hash
      hsh = Hash[_store_fields.map{|field| [field, send(field)]}]
      hsh[:_type] = self.class.name
      hsh[:_last_updated_at] = last_updated_at
      hsh
    end

    # return a structure including errors if invalid; otherwise return nil
    def errors
      data = self.to_pipeline
      if data
        if !self._schema
          # backwards compatibility
          self._schema = File.expand_path("../../schemas/licence-schema.json", __FILE__)
        end
        errors = JSON::Validator.fully_validate(
          self._schema,
          data.to_json,
          {:errors_as_objects => true})
        if !errors.empty?
          data[:errors] = errors
          data
        end
      end
    end
  end
end