openc_bot 0.0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.travis.yml +8 -0
- data/CHANGELOG.md +2 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +22 -0
- data/README.md +253 -0
- data/Rakefile +14 -0
- data/bin/openc_bot +13 -0
- data/create_bot.sh +30 -0
- data/create_company_bot.sh +16 -0
- data/create_simple_licence_bot.sh +31 -0
- data/db/.gitkeep +0 -0
- data/examples/basic/.gitignore +3 -0
- data/examples/basic/Gemfile +7 -0
- data/examples/basic/config.yml +21 -0
- data/examples/basic/lib/basic.rb +88 -0
- data/examples/basic_with_proxy/Gemfile +7 -0
- data/examples/basic_with_proxy/config.yml +21 -0
- data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
- data/examples/bot_with_simple_iterator/Gemfile +6 -0
- data/examples/bot_with_simple_iterator/config.yml +21 -0
- data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
- data/examples/company_fetchers/basic.rb +49 -0
- data/lib/monkey_patches/mechanize.rb +53 -0
- data/lib/openc_bot.rb +89 -0
- data/lib/openc_bot/bot_data_validator.rb +18 -0
- data/lib/openc_bot/company_fetcher_bot.rb +40 -0
- data/lib/openc_bot/exceptions.rb +17 -0
- data/lib/openc_bot/helpers/_csv.rb +10 -0
- data/lib/openc_bot/helpers/alpha_search.rb +73 -0
- data/lib/openc_bot/helpers/dates.rb +33 -0
- data/lib/openc_bot/helpers/html.rb +8 -0
- data/lib/openc_bot/helpers/incremental_search.rb +106 -0
- data/lib/openc_bot/helpers/register_methods.rb +205 -0
- data/lib/openc_bot/helpers/text.rb +18 -0
- data/lib/openc_bot/incrementers.rb +2 -0
- data/lib/openc_bot/incrementers/base.rb +214 -0
- data/lib/openc_bot/incrementers/common.rb +47 -0
- data/lib/openc_bot/tasks.rb +385 -0
- data/lib/openc_bot/templates/README.md +35 -0
- data/lib/openc_bot/templates/bin/export_data +28 -0
- data/lib/openc_bot/templates/bin/fetch_data +23 -0
- data/lib/openc_bot/templates/bin/verify_data +1 -0
- data/lib/openc_bot/templates/config.yml +21 -0
- data/lib/openc_bot/templates/lib/bot.rb +43 -0
- data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
- data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
- data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
- data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
- data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
- data/lib/openc_bot/version.rb +3 -0
- data/lib/simple_openc_bot.rb +289 -0
- data/openc_bot.gemspec +35 -0
- data/schemas/company-schema.json +112 -0
- data/schemas/includes/address.json +23 -0
- data/schemas/includes/base-statement.json +27 -0
- data/schemas/includes/company.json +14 -0
- data/schemas/includes/filing.json +20 -0
- data/schemas/includes/license-data.json +27 -0
- data/schemas/includes/officer.json +14 -0
- data/schemas/includes/previous_name.json +11 -0
- data/schemas/includes/share-parcel-data.json +67 -0
- data/schemas/includes/share-parcel.json +60 -0
- data/schemas/includes/subsidiary-relationship-data.json +52 -0
- data/schemas/includes/total-shares.json +10 -0
- data/schemas/licence-schema.json +21 -0
- data/schemas/share-parcel-schema.json +21 -0
- data/schemas/subsidiary-relationship-schema.json +19 -0
- data/spec/dummy_classes/foo_bot.rb +4 -0
- data/spec/lib/bot_data_validator_spec.rb +69 -0
- data/spec/lib/company_fetcher_bot_spec.rb +93 -0
- data/spec/lib/exceptions_spec.rb +25 -0
- data/spec/lib/helpers/alpha_search_spec.rb +173 -0
- data/spec/lib/helpers/dates_spec.rb +65 -0
- data/spec/lib/helpers/incremental_search_spec.rb +471 -0
- data/spec/lib/helpers/register_methods_spec.rb +558 -0
- data/spec/lib/helpers/text_spec.rb +50 -0
- data/spec/lib/openc_bot/db/.gitkeep +0 -0
- data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
- data/spec/lib/openc_bot_spec.rb +116 -0
- data/spec/schemas/company-schema_spec.rb +676 -0
- data/spec/simple_openc_bot_spec.rb +302 -0
- data/spec/spec_helper.rb +19 -0
- metadata +300 -0
#!/usr/bin/env ruby
# Runs the bot's `bot:run` rake task from the bot's root directory (one level
# above this script), forwarding the --test flag, and exits with the child
# process's success status. Child stdout is echoed on success; stderr on failure.
require 'trollop'
require 'open3'

opts = Trollop::options do
  opt :test, "Run in test mode", :short => 't'
end

puts "Running in test mode" if opts[:test]

command = "bundle exec openc_bot rake bot:run[#{opts[:test] || ''}]"
options = { chdir: File.join(File.dirname(__FILE__), "..") }

# NOTE: capture3 (rather than popen3 + wait_thread.value) drains stdout and
# stderr concurrently while waiting. The previous popen3 version waited for
# the child to exit before reading either pipe, which deadlocks as soon as a
# chatty child fills an OS pipe buffer.
stdout, stderr, result = Open3.capture3(command, options)

if result.success?
  puts stdout
  exit 0
else
  STDERR.puts stderr
  exit 1
end
# Validate any fetched data against the relevant JSON schema via the gem's
# bot:test rake task (see the openc_bot README for details).
bundle exec openc_bot rake bot:test
# This is a short description of the kind of data the bot handles.
description: ''

# This is your confidence, between 0 and 100, in the accuracy of the data
# provided by the data source. It is suggested that you do not change this
# without first discussing with OpenCorporates.
default_confidence: 80

# Don't change this. OpenCorporates will bump this version number when the bot
# is running in production.
version: 0

# The frequency at which updates may happen at the source. One of: daily,
# weekly, monthly or yearly.
frequency: monthly

# If this flag is true, our deployment script will set up the bot to
# be run on OpenCorporates servers (specifically, deploy the bot and
# allow it to be executed from the data pipeline). Please don't change
# this; we'll change it when we're ready to test the bot.
enabled: false
# encoding: UTF-8
require 'openc_bot'

# you may need to require other libraries here
#
# require 'nokogiri'

module MyModule
  extend OpencBot
  extend self # expose these as module-level methods rather than instance methods

  # Reads every stored row out of the local database and converts each one
  # into the format needed for importing into OpenCorporates.
  # By default the data table (the one save_data writes to) is called ocdata,
  # but it can be named anything, and the query can be more complex —
  # returning, for example, only the most recent results.
  def export_data
    sql_query = "ocdata.* from ocdata"
    rows = select(sql_query)
    rows.map { |row| prepare_for_export(row) }
  end

  # Receives a Hash for a single stored row (field names as symbol keys,
  # field values as values) and should return it re-shaped into the
  # structure required by the export. Fill in the conversion here.
  def prepare_for_export(raw_data)
  end

  # Entry point for refreshing the local database with data from the source
  # being scraped or fetched from. Write that code here (using other methods
  # if necessary); see https://github.com/openc/openc_bot README, e.g.
  #
  #   save_data([:uid, :date], my_data, sometablename)
  #
  # After updating the data, save_run_report records the status (and other
  # data, if applicable).
  def update_data
    save_run_report(:status => 'success')
  end

end
# encoding: UTF-8
require 'openc_bot'
require 'openc_bot/company_fetcher_bot'

# you may need to require other libraries here
#
# require 'nokogiri'

# uncomment (and the extend line further down) to get Date helper methods.
# (csv and text helpers are also available)
# require 'openc_bot/helpers/dates'

module MyModule
  extend OpencBot
  # This adds the CompanyFetcherBot functionality
  extend OpencBot::CompanyFetcherBot
  # uncomment to get Date helper methods
  # extend OpencBot::Helpers::Dates
  extend self # make these methods as Module methods, rather than instance ones


  # Uncomment to use alpha search – default is incremental search
  # USE_ALPHA_SEARCH = true

  # Default number of characters used for search terms in alpha search. Default is 1 (i.e. 'A','B'...)
  # NUMBER_OF_CHARS_IN_SEARCH = 3


  # If the register has a GET'able URL based on the company_number define it here. This should mean that
  # #fetch_datum 'just works'.
  def computed_registry_url(company_number)
    # e.g.
    # "http://some,register.com/path/to/#{company_number}"
  end

  # #fetch_data is the primary method for getting companies from the register, and by default is
  # called when the bot is 'run' (e.g. via bundle exec openc_bot rake bot:run, which calls
  # #update_data, which in turn calls this)
  # By default this uses an incremental search (which increments through :company_number identifiers),
  # or if USE_ALPHA_SEARCH has been set, an alpha search (e.g. searching for entities using 'AA', 'AB')
  # Define this locally if a different method for getting companies is going to be done (e.g.
  # parsing a CSV file)
  # def fetch_data
  # end

  # This is called by #update_datum (defined in the IncrementalSearch helper module), which updates the
  # information for a given company_number. This allows the individual records to be updated, for example,
  # via the 'Update from Register' button on the company page on OpenCorporates. This method is also called
  # by the #fetch_data method in the case of incremental_searches.
  # By default it calls #fetch_registry_page with the company_number and returns the result in a hash,
  # with :company_page as a key. This will then be processed or parsed by the #process_datum method,
  # and the result will be saved by #update_datum, and also returned in a form that can be used by the
  # main OpenCorporates system
  #
  # This hash can contain other data, such as a page of filings or shareholdings. The hash will be
  # converted to json, and stored in the database in the row for that company number, under the
  # :data key, so that it can be reused or referred to in the future.
  # {:company_page => company_page_html, :filings_page => filings_page_html}
  # def fetch_datum(company_number)
  # end

  # This method must be defined for all bots that can fetch and process individual records, e.g.
  # incremental searchers, or individual company pages in an alpha search.
  # Where the bot cannot do this (e.g. where the underlying data is
  # only available as a CSV file, or there are no individual pages for each company, it can be
  # left as a stub method)
  # It should return a hash that conforms to the company-schema schema (and it will be checked
  # against this schema before saving)
  def process_datum(datum_hash)
    # write your code to parse what is in the company pages/data
  end

  # This is the standard method for alpha searches e.g. where you are searching a series of terms,
  # from A-Z0-9. You can increase the number of characters in the search term by setting the
  # NUMBER_OF_CHARS_IN_SEARCH constant (see above). Define this method locally if you need
  # different behaviour to this.
  # def fetch_data_via_alpha_search(options={})
  #   starting_term = options[:starting_term]||get_var('starting_term')
  #   each_search_term(starting_term) do |term|
  #     save_var('starting_term', term)
  #     search_for_entities_for_term(term, options) do |entity_datum|
  #       save_entity(entity_datum)
  #     end
  #   end
  #   # reset pointer
  #   save_var('starting_term',nil)
  # end

  # This method is called by #fetch_data_via_alpha_search (defined in AlphaSearch helper),
  # and is passed a search term, typically a search term of a number of characters (e.g. 'AB', 'AC'...).
  # This method should yield a hash of company data which can be validated to the company-schema
  def search_for_entities_for_term(term, options={})
    # write your code to search all the pages for the given term, and yield a series of company hashes
  end

end
# encoding: UTF-8
require 'simple_openc_bot'

# you may need to require other libraries here
# require 'nokogiri'
# require 'mechanize'

class MyLicenceRecord < SimpleOpencBot::BaseLicenceRecord
  # The JSON schema used to validate records; corresponds with a file
  # in `schema/*-schema.json`
  schema :licence

  # Fields declared here are persisted to the local database when
  # 'fetch_records' (see below) runs.
  store_fields :name, :type, :reporting_date

  # The field(s) which uniquely identify a record (think primary key
  # in a database).
  unique_fields :name

  # Must be defined, returning an ISO8601 timestamp whose value changes
  # whenever something about the record changes. It can be a method (as
  # here) or simply one of the `store_fields` above.
  def last_updated_at
    reporting_date
  end

  # Must be defined: converts a stored record into the pipeline format.
  # You can check your output with `bin/verify_data`, which validates
  # fetched data against the relevant schema. See `doc/SCHEMA.md`.
  def to_pipeline
    licence_properties = {
      jurisdiction_code: "xx",
      category: 'Financial',
      jurisdiction_classification: [type],
    }
    {
      sample_date: last_updated_at,
      company: {
        name: name,
        jurisdiction: "xx",
      },
      source_url: "xx",
      data: [{
        data_type: :licence,
        properties: licence_properties,
      }],
    }
  end
end

class MyLicence < SimpleOpencBot
  # The Record class that `fetch_records` yields. Must be declared.
  yields MyLicenceRecord

  # Must be defined; yields Record instances.
  def fetch_all_records(opts={})
    seed_rows = [{ :name => "A", :type => "B" }]
    seed_rows.each do |attrs|
      record_attrs = attrs.merge(:reporting_date => Time.now.iso8601(2))
      yield MyLicenceRecord.new(record_attrs)
    end
  end
end
require 'rspec/autorun'
require 'debugger'

RSpec.configure do |config|
end

# Loads a canned response fixture — a sample html page, csv file, json, or
# whatever — for use in specs. Fixtures are expected to live in a
# 'dummy_responses' directory alongside this file; `options` is passed
# straight through to IO.read.
def dummy_response(response_name, options={})
  fixture_path = File.join(File.dirname(__FILE__), "dummy_responses", response_name.to_s)
  IO.read(fixture_path, options)
end
require 'active_support/core_ext'
require 'openc_bot'
require 'json-schema'
require 'openc_bot/incrementers'

# Base class for "simple" bots: subclasses declare a Record class (via
# `yields`), implement #fetch_all_records, and this class handles persisting
# records to a local sqlite db, exporting them in pipeline format, and
# validating them against a JSON schema.
class SimpleOpencBot
  include OpencBot

  class_attribute :_yields

  # Declares the Record class this bot's #fetch_all_records will yield.
  # Currently limited to exactly one Record type per bot.
  def self.yields(*fields)
    raise "We currently only support one Record type per bot" if fields.count > 1
    self._yields = fields
  end

  # Remembers the root directory of the inheriting bot (one level above the
  # file that defined the subclass) so the sqlite db can be located at
  # <bot_root>/db. NOTE(review): class variable is shared across all
  # subclasses; the last subclass defined wins.
  def self.inherited(obj)
    path, = caller[0].partition(":")
    path = File.expand_path(File.join(File.dirname(path), ".."))
    @@simple_app_directory = path
  end

  # Override default in ScraperWiki gem: use @config[:db] if supplied,
  # otherwise the bot's own db directory.
  def sqlite_magic_connection
    db = @config ? @config[:db] : File.expand_path(File.join(@@simple_app_directory, 'db', db_name))
    @sqlite_magic_connection ||= SqliteMagic::Connection.new(db)
  end

  # Fetches records — all of them, or only those named in
  # opts[:specific_ids] — and saves them to the local db in batches inside
  # transactions. Returns the number of records saved.
  def update_data(opts={})
    if opts[:specific_ids].nil? || opts[:specific_ids].empty?
      # fetch everything
      record_enumerator = Enumerator.new do |yielder|
        fetch_all_records(opts) do |result|
          yielder.yield(result)
        end
      end
    else
      # fetch records with specified ids
      record_enumerator = Enumerator.new do |yielder|
        fetch_specific_records(opts) do |result|
          yielder.yield(result)
        end
      end
    end
    saves_count = 0
    # batch size of 1 in test mode so a single record round-trips immediately
    batch_size = opts[:test_mode] ? 1 : 500
    record_enumerator.each_slice(batch_size) do |records|
      begin
        sqlite_magic_connection.execute("BEGIN TRANSACTION")
        records.each do |record|
          insert_or_update(record.class.unique_fields,
                           record.to_hash)
          saves_count += 1
          if saves_count == 1
            # TODO: move this validation to somewhere more explicit
            raise "Bot must specify what record type it will yield" if _yields.nil?
            check_unique_index(_yields[0])
          end
          # progress indicator
          STDOUT.print(".")
          STDOUT.flush
        end
      ensure
        sqlite_magic_connection.execute("COMMIT") if sqlite_magic_connection.database.transaction_active?
      end
    end
    # NB #save_run_report saves a timestamp called :run_at, so :completed_at is redundant.
    # Probably should either remove or replace with another datapoint, e.g. started_at
    save_run_report(:status => 'success', :completed_at => Time.now)
    saves_count
  end

  # Verifies that the unique index(es) on the `ocdata` table match the
  # record class's declared unique_fields; raises (after ROLLBACK) if not.
  # should this be a method in sqlite_magic gem?
  def check_unique_index(record_class)
    indexes = sqlite_magic_connection.execute("PRAGMA INDEX_LIST('ocdata')")
    db_unique_fields = indexes.map do |i|
      next if i["unique"] != 1
      next unless i["name"] =~ /autoindex/
      info = sqlite_magic_connection.execute("PRAGMA INDEX_INFO('#{i["name"]}')")
      info.map{|x| x["name"]}
    end.compact.flatten
    record_unique_fields = record_class.unique_fields.map(&:to_s)
    if db_unique_fields != record_unique_fields
      sqlite_magic_connection.execute("ROLLBACK")
      error = "Unique fields #{record_unique_fields} do not match the unique index(es) in `ocdata` table!"
      error += "\nThis is usually because the value of unique_field has changed since the table was automatically created."
      error += "\nUnique fields in `ocdata`: #{db_unique_fields.flatten}; in record #{record_class.name}: #{record_unique_fields}"
      raise error
    end
  end

  # Number of records currently stored; 0 if the table doesn't exist yet.
  def count_stored_records
    begin
      all_stored_records(:count => true).first["count"]
    rescue SqliteMagic::NoSuchTable
      0
    end
  end

  # Builds and runs a SELECT over the stored records. Supported opts:
  # :select, :table, :where, :order, :limit, :count, :debug,
  # :only_unexported (restrict to records not exported since last update),
  # :specific_ids (with :only_unexported, restrict to these ids),
  # :batch (used as :limit when :only_unexported).
  def all_stored_records(opts={})
    if opts[:only_unexported]
      opts[:limit] ||= opts[:batch]
    end

    select = opts[:select] || "ocdata.*"
    table = opts[:table] || "ocdata"
    where = (opts[:where] ? "\nWHERE #{opts[:where]}\n" : "\nWHERE 1 \n")
    order = (opts[:order] ? "\nORDER BY #{opts[:order]}\n" : "")
    limit = (opts[:limit] ? "\nLIMIT #{opts[:limit]}\n" : "")

    if opts[:only_unexported]
      where += " AND (_last_exported_at IS NULL "\
               "OR _last_exported_at < _last_updated_at)"

      if !opts[:specific_ids].blank?
        # Quote each id, doubling embedded single quotes so an id containing
        # a quote cannot break (or inject into) the SQL statement.
        ids = opts[:specific_ids].map{|id| "'#{id.to_s.gsub("'", "''")}'"}.join(",")
        # BUGFIX: record classes declare their key via .unique_fields (an
        # Array) — there is no .unique_field method, so the previous
        # `_yields[0].unique_field` raised NoMethodError on this path.
        where += " AND #{_yields[0].unique_fields.first} IN (#{ids})"
      end
    end

    if opts[:count]
      sql = "COUNT(*) AS count from #{table} #{where}"
      puts sql if opts[:debug]
      select(sql)
    else
      sql = "#{select} from #{table} #{where} #{order} #{limit}"
      puts sql if opts[:debug]
      select_records(sql)
    end
  end

  # Records not yet exported (or updated since their last export).
  # Uses a non-destructive merge so the caller's opts hash is not mutated
  # (the previous merge! leaked :only_unexported back to the caller).
  def unexported_stored_records(opts={})
    all_stored_records(opts.merge(:only_unexported => true))
  end

  # A small random sample of stored records, for spot-checking.
  def spotcheck_records(limit = 5)
    all_stored_records(:order => "RANDOM()", :limit => limit)
  end

  # Runs the given SQL and rehydrates each row into its record class
  # (stored in the row's _type column).
  def select_records(sql)
    select(sql).map { |record| record['_type'].constantize.new(record) }
  end

  # Returns an Enumerator of pipeline-format hashes for unexported records
  # (or all records if opts[:all]), marking each batch as exported as it
  # goes — except under opts[:all], which must not touch _last_exported_at.
  def export_data(opts={})
    begin
      sqlite_magic_connection.add_columns(
        'ocdata', [:_last_exported_at, :_last_updated_at])
    rescue SQLite3::SQLException
      # columns already exist — nothing to do
    end
    Enumerator.new do |yielder|
      b = 1
      loop do
        if opts[:all]
          break if b > 1
          batch = all_stored_records(opts)
        else
          batch = unexported_stored_records(:batch => 100, :specific_ids => opts[:specific_ids])
        end
        break if batch.empty?
        updates = {}
        batch.map do |record|
          pipeline_data = record.to_pipeline
          next if pipeline_data.nil?
          updates[record.class.name] ||= []
          # opts[:all] is currently called in the bot:test rake task
          # This has the unfortunate side effect of updating the _last_exported_at
          # time when running the validation task, so I've added the following conditional
          if !opts[:all]
            updates[record.class.name] << record.to_hash.merge(
              :_last_exported_at => Time.now.iso8601(2))
          else
            updates[record.class.name] << record.to_hash
          end
          yielder << pipeline_data
        end
        sqlite_magic_connection.execute("BEGIN TRANSACTION")
        if b == 1
          check_unique_index(_yields[0])
        end
        updates.each do |k, v|
          save_data(k.constantize.unique_fields, v)
        end
        sqlite_magic_connection.execute("COMMIT")
        b += 1
      end
    end
  end

  # Pipeline-format output for a random sample of records.
  def spotcheck_data
    batch = spotcheck_records
    batch.collect do |record|
      record.to_pipeline
    end
  end

  # Validates up to opts[:limit] (default 1000) stored records against the
  # schema; returns the list of error structures (see Record#errors).
  def validate_data(opts={})
    opts = {:limit => 1000}.merge(opts)
    errors = all_stored_records(opts).map do |record|
      record.errors
    end.compact
    total = count_stored_records
    selected = [opts[:limit], total].min
    # Only warn when the sample really was truncated; previously this notice
    # printed even when every record had been validated.
    puts "NOTICE: only validated first #{selected} of #{total} records" if selected < total
    errors
  end


  # Base class for the records a SimpleOpencBot yields. Subclasses declare
  # schema, store_fields and unique_fields, and implement #last_updated_at
  # and #to_pipeline.
  class BaseLicenceRecord
    class_attribute :_store_fields, :_type, :_schema, :_unique_fields

    # Declares the persisted fields and defines accessors for them.
    # NOTE(review): the bookkeeping timestamps are appended to `fields`
    # (after the concat), so they get accessors but are deliberately NOT in
    # _store_fields — export_data merges _last_exported_at in explicitly.
    # Left exactly as-is; the surrounding code depends on this distinction.
    def self.store_fields(*fields)
      self._store_fields ||= []
      self._store_fields.concat(fields)
      fields << :_last_exported_at unless _store_fields.include?(:_last_exported_at)
      fields << :_last_updated_at unless _store_fields.include?(:_last_updated_at)
      fields.each do |field|
        attr_accessor field
      end
    end

    # Getter/setter: with arguments, declares the unique key field(s);
    # without, returns them.
    def self.unique_fields(*fields)
      self._unique_fields = fields unless fields.empty?
      self._unique_fields
    end

    # Resolves a schema name (e.g. :licence) to the bundled
    # schemas/<name>-schema.json path.
    def self.schema(schema)
      hyphenated_name = schema.to_s.gsub("_", "-")
      self._schema = File.expand_path("../../schemas/#{hyphenated_name}-schema.json", __FILE__)
    end

    # Builds a record from an attribute hash, after checking the subclass
    # has declared everything it must.
    def initialize(attrs={})
      validate_instance!
      attrs = attrs.with_indifferent_access
      self._type = self.class.name
      self._store_fields.each do |k|
        send("#{k}=", attrs[k])
      end
    end

    # Raises with a combined message if the subclass is missing required
    # methods (#last_updated_at, #to_pipeline) or declarations
    # (store_fields, unique_fields, schema).
    def validate_instance!
      all_errors = []
      required_functions = [:last_updated_at, :to_pipeline]
      func_errors = []
      required_functions.each do |func|
        if !respond_to?(func)
          func_errors << func
        end
      end
      if !func_errors.empty?
        all_errors << "You must define the following functions in your record class: #{func_errors.join(', ')}"
      end
      field_errors = []
      required_fields = [:_store_fields, :_unique_fields, :_schema]
      required_fields.each do |f|
        if !send(f)
          field_errors << f.to_s[1..-1]
        end
      end
      if !field_errors.empty?
        all_errors << "You must define the following fields on your record class: #{field_errors.join(', ')}"
      end
      raise all_errors.join('\n') unless all_errors.empty?
    end

    # Hash of stored field values plus the _type and _last_updated_at
    # bookkeeping entries; this is what gets persisted.
    def to_hash
      hsh = Hash[_store_fields.map{|field| [field, send(field)]}]
      hsh[:_type] = self.class.name
      hsh[:_last_updated_at] = last_updated_at
      hsh
    end

    # return a structure including errors if invalid; otherwise return nil
    def errors
      data = self.to_pipeline
      if data
        if !self._schema
          # backwards compatibility
          self._schema = File.expand_path("../../schemas/licence-schema.json", __FILE__)
        end
        errors = JSON::Validator.fully_validate(
          self._schema,
          data.to_json,
          {:errors_as_objects => true})
        if !errors.empty?
          data[:errors] = errors
          data
        end
      end
    end
  end
end