openc_bot 0.0.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (85) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.travis.yml +8 -0
  4. data/CHANGELOG.md +2 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +253 -0
  8. data/Rakefile +14 -0
  9. data/bin/openc_bot +13 -0
  10. data/create_bot.sh +30 -0
  11. data/create_company_bot.sh +16 -0
  12. data/create_simple_licence_bot.sh +31 -0
  13. data/db/.gitkeep +0 -0
  14. data/examples/basic/.gitignore +3 -0
  15. data/examples/basic/Gemfile +7 -0
  16. data/examples/basic/config.yml +21 -0
  17. data/examples/basic/lib/basic.rb +88 -0
  18. data/examples/basic_with_proxy/Gemfile +7 -0
  19. data/examples/basic_with_proxy/config.yml +21 -0
  20. data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
  21. data/examples/bot_with_simple_iterator/Gemfile +6 -0
  22. data/examples/bot_with_simple_iterator/config.yml +21 -0
  23. data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
  24. data/examples/company_fetchers/basic.rb +49 -0
  25. data/lib/monkey_patches/mechanize.rb +53 -0
  26. data/lib/openc_bot.rb +89 -0
  27. data/lib/openc_bot/bot_data_validator.rb +18 -0
  28. data/lib/openc_bot/company_fetcher_bot.rb +40 -0
  29. data/lib/openc_bot/exceptions.rb +17 -0
  30. data/lib/openc_bot/helpers/_csv.rb +10 -0
  31. data/lib/openc_bot/helpers/alpha_search.rb +73 -0
  32. data/lib/openc_bot/helpers/dates.rb +33 -0
  33. data/lib/openc_bot/helpers/html.rb +8 -0
  34. data/lib/openc_bot/helpers/incremental_search.rb +106 -0
  35. data/lib/openc_bot/helpers/register_methods.rb +205 -0
  36. data/lib/openc_bot/helpers/text.rb +18 -0
  37. data/lib/openc_bot/incrementers.rb +2 -0
  38. data/lib/openc_bot/incrementers/base.rb +214 -0
  39. data/lib/openc_bot/incrementers/common.rb +47 -0
  40. data/lib/openc_bot/tasks.rb +385 -0
  41. data/lib/openc_bot/templates/README.md +35 -0
  42. data/lib/openc_bot/templates/bin/export_data +28 -0
  43. data/lib/openc_bot/templates/bin/fetch_data +23 -0
  44. data/lib/openc_bot/templates/bin/verify_data +1 -0
  45. data/lib/openc_bot/templates/config.yml +21 -0
  46. data/lib/openc_bot/templates/lib/bot.rb +43 -0
  47. data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
  48. data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
  49. data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
  50. data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
  51. data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
  52. data/lib/openc_bot/version.rb +3 -0
  53. data/lib/simple_openc_bot.rb +289 -0
  54. data/openc_bot.gemspec +35 -0
  55. data/schemas/company-schema.json +112 -0
  56. data/schemas/includes/address.json +23 -0
  57. data/schemas/includes/base-statement.json +27 -0
  58. data/schemas/includes/company.json +14 -0
  59. data/schemas/includes/filing.json +20 -0
  60. data/schemas/includes/license-data.json +27 -0
  61. data/schemas/includes/officer.json +14 -0
  62. data/schemas/includes/previous_name.json +11 -0
  63. data/schemas/includes/share-parcel-data.json +67 -0
  64. data/schemas/includes/share-parcel.json +60 -0
  65. data/schemas/includes/subsidiary-relationship-data.json +52 -0
  66. data/schemas/includes/total-shares.json +10 -0
  67. data/schemas/licence-schema.json +21 -0
  68. data/schemas/share-parcel-schema.json +21 -0
  69. data/schemas/subsidiary-relationship-schema.json +19 -0
  70. data/spec/dummy_classes/foo_bot.rb +4 -0
  71. data/spec/lib/bot_data_validator_spec.rb +69 -0
  72. data/spec/lib/company_fetcher_bot_spec.rb +93 -0
  73. data/spec/lib/exceptions_spec.rb +25 -0
  74. data/spec/lib/helpers/alpha_search_spec.rb +173 -0
  75. data/spec/lib/helpers/dates_spec.rb +65 -0
  76. data/spec/lib/helpers/incremental_search_spec.rb +471 -0
  77. data/spec/lib/helpers/register_methods_spec.rb +558 -0
  78. data/spec/lib/helpers/text_spec.rb +50 -0
  79. data/spec/lib/openc_bot/db/.gitkeep +0 -0
  80. data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
  81. data/spec/lib/openc_bot_spec.rb +116 -0
  82. data/spec/schemas/company-schema_spec.rb +676 -0
  83. data/spec/simple_openc_bot_spec.rb +302 -0
  84. data/spec/spec_helper.rb +19 -0
  85. metadata +300 -0
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env ruby
2
+ require 'trollop'
3
+ require 'open3'
4
+
5
+ opts = Trollop::options do
6
+ opt :test, "Run in test mode", :short => 't'
7
+
8
+ end
9
+
10
+ command = "bundle exec openc_bot rake bot:run[#{opts[:test]||''}]"
11
+ options = { chdir: File.join(File.dirname(__FILE__), "..") }
12
+ _, stdout, stderr, wait_thread = Open3::popen3(command, options)
13
+ result = wait_thread.value
14
+
15
+ puts "Running in test mode" if opts[:test]
16
+
17
+ if result.success?
18
+ puts stdout.read
19
+ exit 0
20
+ else
21
+ STDERR.puts stderr.read
22
+ exit 1
23
+ end
@@ -0,0 +1 @@
1
+ bundle exec openc_bot rake bot:test
@@ -0,0 +1,21 @@
1
+ # This is a short description of the kind of data the bot handles.
2
+ description: ''
3
+
4
+ # This is your confidence, between 0 and 100, in the accuracy of the data
5
+ # provided by the data source. It is suggested that you do not change this
6
+ # without first discussing with OpenCorporates.
7
+ default_confidence: 80
8
+
9
+ # Don't change this. OpenCorporates will bump this version number when the bot
10
+ # is running in production.
11
+ version: 0
12
+
13
+ # the frequency that updates may happen at the source. One of daily,
14
+ # weekly, monthly or yearly
15
+ frequency: monthly
16
+
17
+ # If this flag is true, our deployment script will set up the bot to
18
+ # be run on OpenCorporates servers (specifically, deploy the bot and
19
+ # allow it to be executed from the data pipeline). Please don't change
20
+ # this; we'll change it when we're ready to test the bot.
21
+ enabled: false
@@ -0,0 +1,43 @@
1
+ # encoding: UTF-8
2
+ require 'openc_bot'
3
+
4
+ # you may need to require other libraries here
5
+ #
6
+ # require 'nokogiri'
7
+
8
+ module MyModule
9
+ extend OpencBot
10
+ extend self # make these methods as Module methods, rather than instance ones
11
+
12
+ def export_data
13
+ # This is the basic functionality for exporting the data from the database. By default the data
14
+ # table (what is created when you save_data) is called ocdata, but it can be called anything else,
15
+ # and the query can be more complex, returning, for example, only the most recent results.
16
+ sql_query = "ocdata.* from ocdata"
17
+ select(sql_query).collect do |raw_datum|
18
+ # raw_datum will be a Hash of field names (as symbols) for the keys and the values for each field.
19
+ # It should be converted to the format necessary for importing into OpenCorporates by using a
20
+ # prepare_for_export method.
21
+ prepare_for_export(raw_datum)
22
+ end
23
+ end
24
+
25
+ def prepare_for_export(raw_data)
26
+ # do something here to convert the raw data from the database (if you are using one) into
27
+ # the form required by the export.
28
+ end
29
+
30
+ def update_data
31
+ # write code here (using other methods if necessary) for
32
+ # updating your local database with data from the source
33
+ # that you are scraping or fetching from
34
+ #
35
+ # # See https://github.com/openc/openc_bot README for details
36
+ # save_data([:uid,:date], my_data, sometablename)
37
+ #
38
+ # After updating the data you should run save_run_report, which
39
+ # saves the status (and other data, if applicable)
40
+ save_run_report(:status => 'success')
41
+ end
42
+
43
+ end
@@ -0,0 +1,95 @@
1
+ # encoding: UTF-8
2
+ require 'openc_bot'
3
+ require 'openc_bot/company_fetcher_bot'
4
+
5
+ # you may need to require other libraries here
6
+ #
7
+ # require 'nokogiri'
8
+
9
+ # Uncomment this (and the matching line further down) to get Date helper methods. (CSV and text helpers are also available.)
10
+ # require 'openc_bot/helpers/dates'
11
+
12
+ module MyModule
13
+ extend OpencBot
14
+ # This adds the CompanyFetcherBot functionality
15
+ extend OpencBot::CompanyFetcherBot
16
+ # uncomment to get Date helper methods
17
+ # extend OpencBot::Helpers::Dates
18
+ extend self # make these methods as Module methods, rather than instance ones
19
+
20
+
21
+ # Uncomment to use alpha search – default is incremental search
22
+ # USE_ALPHA_SEARCH = true
23
+
24
+ # Default number of characters used for search terms in alpha search. Default is 1 (i.e. 'A','B'...)
25
+ # NUMBER_OF_CHARS_IN_SEARCH = 3
26
+
27
+
28
+ # If the register has a GET'able URL based on the company_number define it here. This should mean that
29
+ # #fetch_datum 'just works'.
30
+ def computed_registry_url(company_number)
31
+ # e.g.
32
+ # "http://some.register.com/path/to/#{company_number}"
33
+ end
34
+
35
+ # #fetch_data is the primary method for getting companies from the register, and by default is
36
+ # called when the bot is 'run' (e.g. via bundle exec openc_bot rake bot:run, which calls
37
+ # #update_data, which in turn calls this)
38
+ # By default this uses an incremental search (which increments through :company_number identifiers),
39
+ # or if USE_ALPHA_SEARCH has been set, an alpha search (e.g. searching for entities using 'AA', 'AB')
40
+ # Define this locally if a different method for getting companies is going to be used (e.g.
41
+ # parsing a CSV file)
42
+ # def fetch_data
43
+ # end
44
+
45
+ # This is called by #update_datum (defined in the IncrementalSearch helper module), which updates the
46
+ # information for a given company_number. This allows the individual records to be updated, for example,
47
+ # via the 'Update from Register' button on the company page on OpenCorporates. This method is also called
48
+ # by the #fetch_data method in the case of incremental_searches.
49
+ # By default it calls #fetch_registry_page with the company_number and returns the result in a hash,
50
+ # with :company_page as a key. This will then be processed or parsed by the #process_datum method,
51
+ # and the result will be saved by #update_datum, and also returned in a form that can be used by the
52
+ # main OpenCorporates system
53
+ #
54
+ # This hash can contain other data, such as a page of filings or shareholdings. The hash will be
55
+ # converted to json, and stored in the database in the row for that company number, under the
56
+ # :data key, so that it can be reused or referred to in the future.
57
+ # {:company_page => company_page_html, :filings_page => filings_page_html}
58
+ # def fetch_datum(company_number)
59
+ # end
60
+
61
+ # This method must be defined for all bots that can fetch and process individual records, e.g.
62
+ # incremental searchers, or individual company pages in an alpha search.
63
+ # Where the bot cannot do this (e.g. where the underlying data is
64
+ # only available as a CSV file, or there are no individual pages for each company), it can be
65
+ # left as a stub method.
66
+ # It should return a hash that conforms to the company-schema schema (and it will be checked
67
+ # against this schema before saving)
68
+ def process_datum(datum_hash)
69
+ # write your code to parse what is in the company pages/data
70
+ end
71
+
72
+ # This is the standard method for alpha searches e.g. where you are searching a series of terms,
73
+ # from A-Z0-9. You can increase the number of characters in the search term by setting the
74
+ # NUMBER_OF_CHARS_IN_SEARCH constant (see above). Define this method locally if you need different
75
+ # behaviour to this
76
+ # def fetch_data_via_alpha_search(options={})
77
+ # starting_term = options[:starting_term]||get_var('starting_term')
78
+ # each_search_term(starting_term) do |term|
79
+ # save_var('starting_term', term)
80
+ # search_for_entities_for_term(term, options) do |entity_datum|
81
+ # save_entity(entity_datum)
82
+ # end
83
+ # end
84
+ # # reset pointer
85
+ # save_var('starting_term',nil)
86
+ # end
87
+
88
+ # This method is called by #fetch_data_via_alpha_search (defined in AlphaSearch helper),
89
+ # and is passed a search term, typically a search term of a number of characters (e.g. 'AB', 'AC'...).
90
+ # This method should yield a hash of company data which can be validated to the company-schema
91
+ def search_for_entities_for_term(term, options={})
92
+ # write your code to search all the pages for the given term, and yield a series of company hashes
93
+ end
94
+
95
+ end
@@ -0,0 +1,67 @@
1
+ # encoding: UTF-8
2
+ require 'simple_openc_bot'
3
+
4
+ # you may need to require other libraries here
5
+ # require 'nokogiri'
6
+ # require 'mechanize'
7
+
8
+ class MyLicenceRecord < SimpleOpencBot::BaseLicenceRecord
9
+ # The JSON schema to use to validate records; correspond with files
10
+ # in `schema/*-schema.json`
11
+ schema :licence
12
+
13
+ # Fields you define here will be persisted to a local database when
14
+ # 'fetch_records' (see below) is run.
15
+ store_fields :name, :type, :reporting_date
16
+
17
+ # This is the field(s) which will uniquely define a record (think
18
+ # primary key in a database).
19
+ unique_fields :name
20
+
21
+ # This must be defined, and should return a timestamp in ISO8601
22
+ # format. Its value should change when something about the record
23
+ # has changed. It doesn't have to be a method - it can also be a
24
+ # member of `store_fields`, above.
25
+ def last_updated_at
26
+ reporting_date
27
+ end
28
+
29
+ # This method must be defined. You can test that you're outputting
30
+ # in the right format with `bin/verify_data`, which will validate
31
+ # any data you've fetched against the relevant schema. See
32
+ # `doc/SCHEMA.md` for documentation.
33
+ def to_pipeline
34
+ {
35
+ sample_date: last_updated_at,
36
+ company: {
37
+ name: name,
38
+ jurisdiction: "xx",
39
+ },
40
+ source_url: "xx",
41
+ data: [{
42
+ data_type: :licence,
43
+ properties: {
44
+ jurisdiction_code: "xx",
45
+ category: 'Financial',
46
+ jurisdiction_classification: [type],
47
+ }
48
+ }]
49
+ }
50
+ end
51
+
52
+ end
53
+
54
+ class MyLicence < SimpleOpencBot
55
+
56
+ # the class that `fetch_records` yields. Must be defined.
57
+ yields MyLicenceRecord
58
+
59
+ # This method should yield Records. It must be defined.
60
+ def fetch_all_records(opts={})
61
+ data = [{:name => "A", :type => "B"}]
62
+ data.each do |datum|
63
+ yield MyLicenceRecord.new(
64
+ datum.merge(:reporting_date => Time.now.iso8601(2)))
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,11 @@
1
+ # encoding: UTF-8
2
+ require_relative 'spec_helper'
3
+ require_relative '../lib/my_module'
4
+
5
+ describe MyModule do
6
+
7
+ it "should extend with OpencBot methods" do
8
+ MyModule.should respond_to :save_data
9
+ end
10
+
11
+ end
@@ -0,0 +1,11 @@
1
+ # encoding: UTF-8
2
+ require_relative 'spec_helper'
3
+ require_relative '../lib/my_module'
4
+
5
+ describe MyModule do
6
+
7
+ it "should extend with OpencBot methods" do
8
+ MyModule.should respond_to :save_data
9
+ end
10
+
11
+ end
@@ -0,0 +1,13 @@
1
+ require 'rspec/autorun'
2
+ require 'debugger'
3
+
4
+ RSpec.configure do |config|
5
+
6
+ end
7
+
8
+ # Utility method to allow sample html pages, csv files, json or whatever.
9
+ # Expects the files to be stored in a 'dummy_responses' folder in the spec directory
10
+ #
11
+ def dummy_response(response_name, options={})
12
+ IO.read(File.join(File.dirname(__FILE__),"dummy_responses",response_name.to_s), options)
13
+ end
@@ -0,0 +1,3 @@
1
+ module OpencBot
2
+ VERSION = "0.0.11"
3
+ end
@@ -0,0 +1,289 @@
1
+ require 'active_support/core_ext'
2
+ require 'openc_bot'
3
+ require 'json-schema'
4
+ require 'openc_bot/incrementers'
5
+
6
+ class SimpleOpencBot
7
+ include OpencBot
8
+
9
+ class_attribute :_yields
10
+
11
+ def self.yields(*fields)
12
+ raise "We currently only support one Record type per bot" if fields.count > 1
13
+ self._yields = fields
14
+ end
15
+
16
+ def self.inherited(obj)
17
+ path, = caller[0].partition(":")
18
+ path = File.expand_path(File.join(File.dirname(path), ".."))
19
+ @@simple_app_directory = path
20
+ end
21
+
22
+ # Override default in ScraperWiki gem
23
+ def sqlite_magic_connection
24
+ db = @config ? @config[:db] : File.expand_path(File.join(@@simple_app_directory, 'db', db_name))
25
+ @sqlite_magic_connection ||= SqliteMagic::Connection.new(db)
26
+ end
27
+
28
+ def update_data(opts={})
29
+ if opts[:specific_ids].nil? || opts[:specific_ids].empty?
30
+ # fetch everything
31
+ record_enumerator = Enumerator.new do |yielder|
32
+ fetch_all_records(opts) do |result|
33
+ yielder.yield(result)
34
+ end
35
+ end
36
+ else
37
+ # fetch records with specified ids
38
+ record_enumerator = Enumerator.new do |yielder|
39
+ fetch_specific_records(opts) do |result|
40
+ yielder.yield(result)
41
+ end
42
+ end
43
+ end
44
+ saves_count = 0
45
+ batch_size = opts[:test_mode] ? 1 : 500
46
+ record_enumerator.each_slice(batch_size) do |records|
47
+ begin
48
+ sqlite_magic_connection.execute("BEGIN TRANSACTION")
49
+ records.each do |record|
50
+ insert_or_update(record.class.unique_fields,
51
+ record.to_hash)
52
+ saves_count += 1
53
+ if saves_count == 1
54
+ # TODO: move this validation to somewhere more explicit
55
+ raise "Bot must specify what record type it will yield" if _yields.nil?
56
+ check_unique_index(_yields[0])
57
+ end
58
+ STDOUT.print(".")
59
+ STDOUT.flush
60
+ end
61
+ ensure
62
+ sqlite_magic_connection.execute("COMMIT") if sqlite_magic_connection.database.transaction_active?
63
+ end
64
+ end
65
+ # NB #save_run_report saves a timestamp called :run_at, so :completed_at is redundant.
66
+ # Probably should either remove or replace with another datapoint, e.g. started_at
67
+ save_run_report(:status => 'success', :completed_at => Time.now)
68
+ saves_count
69
+ end
70
+
71
+ # should this be a method in sqlite_magic gem?
72
+ def check_unique_index(record_class)
73
+ indexes = sqlite_magic_connection.execute("PRAGMA INDEX_LIST('ocdata')")
74
+ db_unique_fields = indexes.map do |i|
75
+ next if i["unique"] != 1
76
+ next unless i["name"] =~ /autoindex/
77
+ info = sqlite_magic_connection.execute("PRAGMA INDEX_INFO('#{i["name"]}')")
78
+ info.map{|x| x["name"]}
79
+ end.compact.flatten
80
+ record_unique_fields = record_class.unique_fields.map(&:to_s)
81
+ if db_unique_fields != record_unique_fields
82
+ sqlite_magic_connection.execute("ROLLBACK")
83
+ error = "Unique fields #{record_unique_fields} do not match the unique index(es) in `ocdata` table!"
84
+ error += "\nThis is usually because the value of unique_field has changed since the table was automatically created."
85
+ error += "\nUnique fields in `ocdata`: #{db_unique_fields.flatten}; in record #{record_class.name}: #{record_unique_fields}"
86
+ raise error
87
+ end
88
+ end
89
+
90
+ def count_stored_records
91
+ begin
92
+ all_stored_records(:count => true).first["count"]
93
+ rescue SqliteMagic::NoSuchTable
94
+ 0
95
+ end
96
+ end
97
+
98
+ def all_stored_records(opts={})
99
+ if opts[:only_unexported]
100
+ opts[:limit] ||= opts[:batch]
101
+ end
102
+
103
+ select = opts[:select] || "ocdata.*"
104
+ table = opts[:table] || "ocdata"
105
+ where = (opts[:where] ? "\nWHERE #{opts[:where]}\n" : "\nWHERE 1 \n")
106
+ order = (opts[:order] ? "\nORDER BY #{opts[:order]}\n" : "")
107
+ limit = (opts[:limit] ? "\nLIMIT #{opts[:limit]}\n" : "")
108
+
109
+ if opts[:only_unexported]
110
+ where += " AND (_last_exported_at IS NULL "\
111
+ "OR _last_exported_at < _last_updated_at)"
112
+
113
+ if !opts[:specific_ids].blank?
114
+ ids = opts[:specific_ids].map{|id| "'#{id}'"}.join(",")
115
+ where += " AND #{_yields[0].unique_field} IN (#{ids})"
116
+ end
117
+ end
118
+
119
+ if opts[:count]
120
+ sql = "COUNT(*) AS count from #{table} #{where}"
121
+ puts sql if opts[:debug]
122
+ select(sql)
123
+ else
124
+ sql = "#{select} from #{table} #{where} #{order} #{limit}"
125
+ puts sql if opts[:debug]
126
+ select_records(sql)
127
+ end
128
+ end
129
+
130
+ def unexported_stored_records(opts={})
131
+ all_stored_records(opts.merge!(:only_unexported => true))
132
+ end
133
+
134
+ def spotcheck_records(limit = 5)
135
+ all_stored_records(:order => "RANDOM()", :limit => limit)
136
+ end
137
+
138
+ def select_records(sql)
139
+ select(sql).map { |record| record['_type'].constantize.new(record) }
140
+ end
141
+
142
+ def export_data(opts={})
143
+ begin
144
+ sqlite_magic_connection.add_columns(
145
+ 'ocdata', [:_last_exported_at, :_last_updated_at])
146
+ rescue SQLite3::SQLException
147
+ end
148
+ Enumerator.new do |yielder|
149
+ b = 1
150
+ loop do
151
+ if opts[:all]
152
+ break if b > 1
153
+ batch = all_stored_records(opts)
154
+ else
155
+ batch = unexported_stored_records(:batch => 100, :specific_ids => opts[:specific_ids])
156
+ end
157
+ break if batch.empty?
158
+ updates = {}
159
+ batch.map do |record|
160
+ pipeline_data = record.to_pipeline
161
+ next if pipeline_data.nil?
162
+ updates[record.class.name] ||= []
163
+ # opts[:all] is currently called in the bot:test rake task
164
+ # This has the unfortunate side effect of updating the _last_exported_at
165
+ # time when running the validation task, so I've added the following conditional
166
+ if !opts[:all]
167
+ updates[record.class.name] << record.to_hash.merge(
168
+ :_last_exported_at => Time.now.iso8601(2))
169
+ else
170
+ updates[record.class.name] << record.to_hash
171
+ end
172
+ yielder << pipeline_data
173
+ end
174
+ sqlite_magic_connection.execute("BEGIN TRANSACTION")
175
+ if b == 1
176
+ check_unique_index(_yields[0])
177
+ end
178
+ updates.each do |k, v|
179
+ save_data(k.constantize.unique_fields, v)
180
+ end
181
+ sqlite_magic_connection.execute("COMMIT")
182
+ b += 1
183
+ end
184
+ end
185
+ end
186
+
187
+ def spotcheck_data
188
+ batch = spotcheck_records
189
+ batch.collect do |record|
190
+ record.to_pipeline
191
+ end
192
+ end
193
+
194
+ def validate_data(opts={})
195
+ opts = {:limit => 1000}.merge(opts)
196
+ errors = all_stored_records(opts).map do |record|
197
+ record.errors
198
+ end.compact
199
+ total = count_stored_records
200
+ selected = [opts[:limit], total].min
201
+ puts "NOTICE: only validated first #{selected} of #{total} records"
202
+ errors
203
+ end
204
+
205
+
206
+ class BaseLicenceRecord
207
+ class_attribute :_store_fields, :_type, :_schema, :_unique_fields
208
+
209
+ def self.store_fields(*fields)
210
+ self._store_fields ||= []
211
+ self._store_fields.concat(fields)
212
+ fields << :_last_exported_at unless _store_fields.include?(:_last_exported_at)
213
+ fields << :_last_updated_at unless _store_fields.include?(:_last_updated_at)
214
+ fields.each do |field|
215
+ attr_accessor field
216
+ end
217
+ end
218
+
219
+ def self.unique_fields(*fields)
220
+ self._unique_fields = fields unless fields.empty?
221
+ self._unique_fields
222
+ end
223
+
224
+ def self.schema(schema)
225
+ hyphenated_name = schema.to_s.gsub("_", "-")
226
+ self._schema = File.expand_path("../../schemas/#{hyphenated_name}-schema.json", __FILE__)
227
+ end
228
+
229
+ def initialize(attrs={})
230
+ validate_instance!
231
+ attrs = attrs.with_indifferent_access
232
+ self._type = self.class.name
233
+ self._store_fields.each do |k|
234
+ send("#{k}=", attrs[k])
235
+ end
236
+ end
237
+
238
+ def validate_instance!
239
+ all_errors = []
240
+ required_functions = [:last_updated_at, :to_pipeline]
241
+ func_errors = []
242
+ required_functions.each do |func|
243
+ if !respond_to?(func)
244
+ func_errors << func
245
+ end
246
+ end
247
+ if !func_errors.empty?
248
+ all_errors << "You must define the following functions in your record class: #{func_errors.join(', ')}"
249
+ end
250
+ field_errors = []
251
+ required_fields = [:_store_fields, :_unique_fields, :_schema]
252
+ required_fields.each do |f|
253
+ if !send(f)
254
+ field_errors << f.to_s[1..-1]
255
+ end
256
+ end
257
+ if !field_errors.empty?
258
+ all_errors << "You must define the following fields on your record class: #{field_errors.join(', ')}"
259
+ end
260
+ raise all_errors.join('\n') unless all_errors.empty?
261
+ end
262
+
263
+ def to_hash
264
+ hsh = Hash[_store_fields.map{|field| [field, send(field)]}]
265
+ hsh[:_type] = self.class.name
266
+ hsh[:_last_updated_at] = last_updated_at
267
+ hsh
268
+ end
269
+
270
+ # return a structure including errors if invalid; otherwise return nil
271
+ def errors
272
+ data = self.to_pipeline
273
+ if data
274
+ if !self._schema
275
+ # backwards compatibility
276
+ self._schema = File.expand_path("../../schemas/licence-schema.json", __FILE__)
277
+ end
278
+ errors = JSON::Validator.fully_validate(
279
+ self._schema,
280
+ data.to_json,
281
+ {:errors_as_objects => true})
282
+ if !errors.empty?
283
+ data[:errors] = errors
284
+ data
285
+ end
286
+ end
287
+ end
288
+ end
289
+ end