openc_bot 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.travis.yml +8 -0
  4. data/CHANGELOG.md +2 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +253 -0
  8. data/Rakefile +14 -0
  9. data/bin/openc_bot +13 -0
  10. data/create_bot.sh +30 -0
  11. data/create_company_bot.sh +16 -0
  12. data/create_simple_licence_bot.sh +31 -0
  13. data/db/.gitkeep +0 -0
  14. data/examples/basic/.gitignore +3 -0
  15. data/examples/basic/Gemfile +7 -0
  16. data/examples/basic/config.yml +21 -0
  17. data/examples/basic/lib/basic.rb +88 -0
  18. data/examples/basic_with_proxy/Gemfile +7 -0
  19. data/examples/basic_with_proxy/config.yml +21 -0
  20. data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
  21. data/examples/bot_with_simple_iterator/Gemfile +6 -0
  22. data/examples/bot_with_simple_iterator/config.yml +21 -0
  23. data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
  24. data/examples/company_fetchers/basic.rb +49 -0
  25. data/lib/monkey_patches/mechanize.rb +53 -0
  26. data/lib/openc_bot.rb +89 -0
  27. data/lib/openc_bot/bot_data_validator.rb +18 -0
  28. data/lib/openc_bot/company_fetcher_bot.rb +40 -0
  29. data/lib/openc_bot/exceptions.rb +17 -0
  30. data/lib/openc_bot/helpers/_csv.rb +10 -0
  31. data/lib/openc_bot/helpers/alpha_search.rb +73 -0
  32. data/lib/openc_bot/helpers/dates.rb +33 -0
  33. data/lib/openc_bot/helpers/html.rb +8 -0
  34. data/lib/openc_bot/helpers/incremental_search.rb +106 -0
  35. data/lib/openc_bot/helpers/register_methods.rb +205 -0
  36. data/lib/openc_bot/helpers/text.rb +18 -0
  37. data/lib/openc_bot/incrementers.rb +2 -0
  38. data/lib/openc_bot/incrementers/base.rb +214 -0
  39. data/lib/openc_bot/incrementers/common.rb +47 -0
  40. data/lib/openc_bot/tasks.rb +385 -0
  41. data/lib/openc_bot/templates/README.md +35 -0
  42. data/lib/openc_bot/templates/bin/export_data +28 -0
  43. data/lib/openc_bot/templates/bin/fetch_data +23 -0
  44. data/lib/openc_bot/templates/bin/verify_data +1 -0
  45. data/lib/openc_bot/templates/config.yml +21 -0
  46. data/lib/openc_bot/templates/lib/bot.rb +43 -0
  47. data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
  48. data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
  49. data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
  50. data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
  51. data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
  52. data/lib/openc_bot/version.rb +3 -0
  53. data/lib/simple_openc_bot.rb +289 -0
  54. data/openc_bot.gemspec +35 -0
  55. data/schemas/company-schema.json +112 -0
  56. data/schemas/includes/address.json +23 -0
  57. data/schemas/includes/base-statement.json +27 -0
  58. data/schemas/includes/company.json +14 -0
  59. data/schemas/includes/filing.json +20 -0
  60. data/schemas/includes/license-data.json +27 -0
  61. data/schemas/includes/officer.json +14 -0
  62. data/schemas/includes/previous_name.json +11 -0
  63. data/schemas/includes/share-parcel-data.json +67 -0
  64. data/schemas/includes/share-parcel.json +60 -0
  65. data/schemas/includes/subsidiary-relationship-data.json +52 -0
  66. data/schemas/includes/total-shares.json +10 -0
  67. data/schemas/licence-schema.json +21 -0
  68. data/schemas/share-parcel-schema.json +21 -0
  69. data/schemas/subsidiary-relationship-schema.json +19 -0
  70. data/spec/dummy_classes/foo_bot.rb +4 -0
  71. data/spec/lib/bot_data_validator_spec.rb +69 -0
  72. data/spec/lib/company_fetcher_bot_spec.rb +93 -0
  73. data/spec/lib/exceptions_spec.rb +25 -0
  74. data/spec/lib/helpers/alpha_search_spec.rb +173 -0
  75. data/spec/lib/helpers/dates_spec.rb +65 -0
  76. data/spec/lib/helpers/incremental_search_spec.rb +471 -0
  77. data/spec/lib/helpers/register_methods_spec.rb +558 -0
  78. data/spec/lib/helpers/text_spec.rb +50 -0
  79. data/spec/lib/openc_bot/db/.gitkeep +0 -0
  80. data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
  81. data/spec/lib/openc_bot_spec.rb +116 -0
  82. data/spec/schemas/company-schema_spec.rb +676 -0
  83. data/spec/simple_openc_bot_spec.rb +302 -0
  84. data/spec/spec_helper.rb +19 -0
  85. metadata +300 -0
@@ -0,0 +1,23 @@
1
#!/usr/bin/env ruby
# Runs this bot's `bot:run` rake task from the bot's root directory (the parent
# of the directory containing this script) and relays the outcome:
#   * on success the task's stdout is printed and the script exits 0
#   * on failure the task's stderr is printed to STDERR and the script exits 1
#
# Options:
#   -t, --test   run the task in test mode
require 'trollop'
require 'open3'

opts = Trollop::options do
  opt :test, "Run in test mode", :short => 't'
end

# When --test is not given the rake argument is left empty
command = "bundle exec openc_bot rake bot:run[#{opts[:test]||''}]"
options = { chdir: File.join(File.dirname(__FILE__), "..") }

# capture3 reads stdout and stderr while the child is running. Waiting for the
# child to exit before reading its pipes (popen3 + wait_thread.value) can block
# forever once the child has written enough output to fill a pipe buffer.
stdout, stderr, result = Open3.capture3(command, options)

puts "Running in test mode" if opts[:test]

if result.success?
  puts stdout
  exit 0
else
  STDERR.puts stderr
  exit 1
end
@@ -0,0 +1 @@
1
+ bundle exec openc_bot rake bot:test
@@ -0,0 +1,21 @@
1
+ # This is a short description of the kind of data the bot handles.
2
+ description: ''
3
+
4
+ # This is your confidence, between 0 and 100, in the accuracy of the data
5
+ # provided by the data source. It is suggested that you do not change this
6
+ # without first discussing with OpenCorporates.
7
+ default_confidence: 80
8
+
9
+ # Don't change this. OpenCorporates will bump this version number when the bot
10
+ # is running in production.
11
+ version: 0
12
+
13
+ # the frequency that updates may happen at the source. One of daily,
14
+ # weekly, monthly or yearly
15
+ frequency: monthly
16
+
17
+ # If this flag is true, our deployment script will set up the bot to
18
+ # be run on OpenCorporates servers (specifically, deploy the bot and
19
+ # allow it to be executed from the data pipeline). Please don't change
20
+ # this; we'll change it when we're ready to test the bot.
21
+ enabled: false
@@ -0,0 +1,43 @@
1
+ # encoding: UTF-8
2
+ require 'openc_bot'
3
+
4
+ # you may need to require other libraries here
5
+ #
6
+ # require 'nokogiri'
7
+
8
+ module MyModule
9
+ extend OpencBot
10
+ extend self # make these methods as Module methods, rather than instance ones
11
+
12
+ def export_data
13
+ # This is the basic functionality for exporting the data from the database. By default the data
14
+ # table (what is created when you save_data) is called ocdata, but it can be called anything else,
15
+ # and the query can be more complex, returning, for example, only the most recent results.
16
+ sql_query = "ocdata.* from ocdata"
17
+ select(sql_query).collect do |raw_datum|
18
+ # raw_datum will be a Hash of field names (as symbols) for the keys and the values for each field.
19
+ # It should be converted to the format necessary for importing into OpenCorporates by using a
20
+ # prepare_for_export method.
21
+ prepare_for_export(raw_datum)
22
+ end
23
+ end
24
+
25
+ def prepare_for_export(raw_data)
26
+ # do something here to convert the raw data from the database (if you are using one) into
27
+ # the form required by the export.
28
+ end
29
+
30
+ def update_data
31
+ # write code here (using other methods if necessary) for
32
+ # updating your local database with data from the source
33
+ # that you are scraping or fetching from
34
+ #
35
+ # # See https://github.com/openc/openc_bot README for details
36
+ # save_data([:uid,:date], my_data, sometablename)
37
+ #
38
+ # After updating the data you should run save_run_report, which
39
+ # saves the status (and other data, if applicable)
40
+ save_run_report(:status => 'success')
41
+ end
42
+
43
+ end
@@ -0,0 +1,95 @@
1
+ # encoding: UTF-8
2
+ require 'openc_bot'
3
+ require 'openc_bot/company_fetcher_bot'
4
+
5
+ # you may need to require other libraries here
6
+ #
7
+ # require 'nokogiri'
8
+
9
+ # uncomment (and line further down) to get Date helper methods. (Also available csv and text helpers)
10
+ # require 'openc_bot/helpers/dates'
11
+
12
+ module MyModule
13
+ extend OpencBot
14
+ # This adds the CompanyFetcherBot functionality
15
+ extend OpencBot::CompanyFetcherBot
16
+ # uncomment to get Date helper methods
17
+ # extend OpencBot::Helpers::Dates
18
+ extend self # make these methods as Module methods, rather than instance ones
19
+
20
+
21
+ # Uncomment to use alpha search – default is incremental search
22
+ # USE_ALPHA_SEARCH = true
23
+
24
+ # Default number of characters used for search terms in alpha search. Default is 1 (i.e. 'A','B'...)
25
+ # NUMBER_OF_CHARS_IN_SEARCH = 3
26
+
27
+
28
+ # If the register has a GET'able URL based on the company_number define it here. This should mean that
29
+ # #fetch_datum 'just works'.
30
+ def computed_registry_url(company_number)
31
+ # e.g.
32
+ # "http://some.register.com/path/to/#{company_number}"
33
+ end
34
+
35
+ # #fetch_data is the primary method for getting companies from the register, and by default is
36
+ # called when the bot is 'run' (e.g. via bundle exec openc_bot rake bot:run, which calls
37
+ # #update_data, which in turn calls this)
38
+ # By default this uses an incremental search (which increments through :company_number identifiers),
39
+ # or if USE_ALPHA_SEARCH has been set, an alpha search (e.g. searching for entities using 'AA', 'AB')
40
+ # Define this locally if a different method for getting companies is going to be used (e.g.
41
+ # parsing a CSV file)
42
+ # def fetch_data
43
+ # end
44
+
45
+ # This is called by #update_datum (defined in the IncrementalSearch helper module), which updates the
46
+ # information for a given company_number. This allows the individual records to be updated, for example,
47
+ # via the 'Update from Register' button on the company page on OpenCorporates. This method is also called
48
+ # by the #fetch_data method in the case of incremental_searches.
49
+ # By default it calls #fetch_registry_page with the company_number and returns the result in a hash,
50
+ # with :company_page as a key. This will then be processed or parsed by the #process_datum method,
51
+ # and the result will be saved by #update_datum, and also returned in a form that can be used by the
52
+ # main OpenCorporates system
53
+ #
54
+ # This hash can contain other data, such as a page of filings or shareholdings. The hash will be
55
+ # converted to json, and stored in the database in the row for that company number, under the
56
+ # :data key, so that it can be reused or referred to in the future.
57
+ # {:company_page => company_page_html, :filings_page => filings_page_html}
58
+ # def fetch_datum(company_number)
59
+ # end
60
+
61
+ # This method must be defined for all bots that can fetch and process individual records, e.g.
62
+ # incremental searchers, or individual company pages in an alpha search.
63
+ # Where the bot cannot do this (e.g. where the underlying data is
64
+ # only available as a CSV file, or there are no individual pages for each company, it can be
65
+ # left as a stub method)
66
+ # It should return a hash that conforms to the company-schema schema (and it will be checked
67
+ # against this schema before saving)
68
+ def process_datum(datum_hash)
69
+ # write your code to parse what is in the company pages/data
70
+ end
71
+
72
+ # This is the standard method for alpha searches e.g. where you are searching a series of terms,
73
+ # from A-Z0-9. You can increase the number of characters in the search term by setting the
74
+ # NUMBER_OF_CHARS_IN_SEARCH constant (see above). Define this method locally if you need different
75
+ # behaviour to this
76
+ # def fetch_data_via_alpha_search(options={})
77
+ # starting_term = options[:starting_term]||get_var('starting_term')
78
+ # each_search_term(starting_term) do |term|
79
+ # save_var('starting_term', term)
80
+ # search_for_entities_for_term(term, options) do |entity_datum|
81
+ # save_entity(entity_datum)
82
+ # end
83
+ # end
84
+ # # reset pointer
85
+ # save_var('starting_term',nil)
86
+ # end
87
+
88
+ # This method is called by #fetch_data_via_alpha_search (defined in AlphaSearch helper),
89
+ # and is passed a search term, typically a search term of a number of characters (e.g. 'AB', 'AC'...).
90
+ # This method should yield a hash of company data which can be validated to the company-schema
91
+ def search_for_entities_for_term(term, options={})
92
+ # write your code to search all the pages for the given term, and yield a series of company hashes
93
+ end
94
+
95
+ end
@@ -0,0 +1,67 @@
1
+ # encoding: UTF-8
2
+ require 'simple_openc_bot'
3
+
4
+ # you may need to require other libraries here
5
+ # require 'nokogiri'
6
+ # require 'mechanize'
7
+
8
+ class MyLicenceRecord < SimpleOpencBot::BaseLicenceRecord
9
+ # The JSON schema to use to validate records; correspond with files
10
+ # in `schema/*-schema.json`
11
+ schema :licence
12
+
13
+ # Fields you define here will be persisted to a local database when
14
+ # 'fetch_all_records' (see below) is run.
15
+ store_fields :name, :type, :reporting_date
16
+
17
+ # This is the field(s) which will uniquely define a record (think
18
+ # primary key in a database).
19
+ unique_fields :name
20
+
21
+ # This must be defined, and should return a timestamp in ISO8601
22
+ # format. Its value should change when something about the record
23
+ # has changed. It doesn't have to be a method - it can also be a
24
+ # member of `store_fields`, above.
25
+ def last_updated_at
26
+ reporting_date
27
+ end
28
+
29
+ # This method must be defined. You can test that you're outputting
30
+ # in the right format with `bin/verify_data`, which will validate
31
+ # any data you've fetched against the relevant schema. See
32
+ # `doc/SCHEMA.md` for documentation.
33
+ def to_pipeline
34
+ {
35
+ sample_date: last_updated_at,
36
+ company: {
37
+ name: name,
38
+ jurisdiction: "xx",
39
+ },
40
+ source_url: "xx",
41
+ data: [{
42
+ data_type: :licence,
43
+ properties: {
44
+ jurisdiction_code: "xx",
45
+ category: 'Financial',
46
+ jurisdiction_classification: [type],
47
+ }
48
+ }]
49
+ }
50
+ end
51
+
52
+ end
53
+
54
+ class MyLicence < SimpleOpencBot
55
+
56
+ # the class that `fetch_all_records` yields. Must be defined.
57
+ yields MyLicenceRecord
58
+
59
+ # This method should yield Records. It must be defined.
60
+ def fetch_all_records(opts={})
61
+ data = [{:name => "A", :type => "B"}]
62
+ data.each do |datum|
63
+ yield MyLicenceRecord.new(
64
+ datum.merge(:reporting_date => Time.now.iso8601(2)))
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,11 @@
1
+ # encoding: UTF-8
2
+ require_relative 'spec_helper'
3
+ require_relative '../lib/my_module'
4
+
5
+ describe MyModule do
6
+
7
+ it "should extend with OpencBot methods" do
8
+ MyModule.should respond_to :save_data
9
+ end
10
+
11
+ end
@@ -0,0 +1,11 @@
1
+ # encoding: UTF-8
2
+ require_relative 'spec_helper'
3
+ require_relative '../lib/my_module'
4
+
5
+ describe MyModule do
6
+
7
+ it "should extend with OpencBot methods" do
8
+ MyModule.should respond_to :save_data
9
+ end
10
+
11
+ end
@@ -0,0 +1,13 @@
1
+ require 'rspec/autorun'
2
+ require 'debugger'
3
+
4
+ RSpec.configure do |config|
5
+
6
+ end
7
+
8
+ # Utility method to allow sample html pages, csv files, json or whatever.
9
+ # Expects the files to be stored in a 'dummy_responses' folder in the spec directory
10
+ #
11
# Reads a canned response (sample html page, csv file, json or whatever) from
# the 'dummy_responses' folder that sits next to this file.
#
# response_name - name of the file inside 'dummy_responses' (converted with to_s,
#                 so a Symbol works too)
# options       - Hash of read options (e.g. :encoding, :mode), forwarded to
#                 File.read as keyword arguments
#
# Returns the file contents as a String.
def dummy_response(response_name, options={})
  path = File.join(File.dirname(__FILE__), "dummy_responses", response_name.to_s)
  # Options are splatted as keywords: passing the Hash positionally makes newer
  # Rubies treat it as the `length` argument. File.read (rather than IO.read)
  # also guarantees the name is only ever treated as a file path.
  File.read(path, **options)
end
@@ -0,0 +1,3 @@
1
+ module OpencBot
2
+ VERSION = "0.0.11"
3
+ end
@@ -0,0 +1,289 @@
1
+ require 'active_support/core_ext'
2
+ require 'openc_bot'
3
+ require 'json-schema'
4
+ require 'openc_bot/incrementers'
5
+
6
+ class SimpleOpencBot
7
+ include OpencBot
8
+
9
+ class_attribute :_yields
10
+
11
+ def self.yields(*fields)
12
+ raise "We currently only support one Record type per bot" if fields.count > 1
13
+ self._yields = fields
14
+ end
15
+
16
+ def self.inherited(obj)
17
+ path, = caller[0].partition(":")
18
+ path = File.expand_path(File.join(File.dirname(path), ".."))
19
+ @@simple_app_directory = path
20
+ end
21
+
22
+ # Override default in ScraperWiki gem
23
+ def sqlite_magic_connection
24
+ db = @config ? @config[:db] : File.expand_path(File.join(@@simple_app_directory, 'db', db_name))
25
+ @sqlite_magic_connection ||= SqliteMagic::Connection.new(db)
26
+ end
27
+
28
+ def update_data(opts={})
29
+ if opts[:specific_ids].nil? || opts[:specific_ids].empty?
30
+ # fetch everything
31
+ record_enumerator = Enumerator.new do |yielder|
32
+ fetch_all_records(opts) do |result|
33
+ yielder.yield(result)
34
+ end
35
+ end
36
+ else
37
+ # fetch records with specified ids
38
+ record_enumerator = Enumerator.new do |yielder|
39
+ fetch_specific_records(opts) do |result|
40
+ yielder.yield(result)
41
+ end
42
+ end
43
+ end
44
+ saves_count = 0
45
+ batch_size = opts[:test_mode] ? 1 : 500
46
+ record_enumerator.each_slice(batch_size) do |records|
47
+ begin
48
+ sqlite_magic_connection.execute("BEGIN TRANSACTION")
49
+ records.each do |record|
50
+ insert_or_update(record.class.unique_fields,
51
+ record.to_hash)
52
+ saves_count += 1
53
+ if saves_count == 1
54
+ # TODO: move this validation to somewhere more explicit
55
+ raise "Bot must specify what record type it will yield" if _yields.nil?
56
+ check_unique_index(_yields[0])
57
+ end
58
+ STDOUT.print(".")
59
+ STDOUT.flush
60
+ end
61
+ ensure
62
+ sqlite_magic_connection.execute("COMMIT") if sqlite_magic_connection.database.transaction_active?
63
+ end
64
+ end
65
+ # NB #save_run_report saves a timestamp called :run_at, so :completed_at is redundant.
66
+ # Probably should either remove or replace with another datapoint, e.g. started_at
67
+ save_run_report(:status => 'success', :completed_at => Time.now)
68
+ saves_count
69
+ end
70
+
71
+ # should this be a method in sqlite_magic gem?
72
+ def check_unique_index(record_class)
73
+ indexes = sqlite_magic_connection.execute("PRAGMA INDEX_LIST('ocdata')")
74
+ db_unique_fields = indexes.map do |i|
75
+ next if i["unique"] != 1
76
+ next unless i["name"] =~ /autoindex/
77
+ info = sqlite_magic_connection.execute("PRAGMA INDEX_INFO('#{i["name"]}')")
78
+ info.map{|x| x["name"]}
79
+ end.compact.flatten
80
+ record_unique_fields = record_class.unique_fields.map(&:to_s)
81
+ if db_unique_fields != record_unique_fields
82
+ sqlite_magic_connection.execute("ROLLBACK")
83
+ error = "Unique fields #{record_unique_fields} do not match the unique index(es) in `ocdata` table!"
84
+ error += "\nThis is usually because the value of unique_field has changed since the table was automatically created."
85
+ error += "\nUnique fields in `ocdata`: #{db_unique_fields.flatten}; in record #{record_class.name}: #{record_unique_fields}"
86
+ raise error
87
+ end
88
+ end
89
+
90
+ def count_stored_records
91
+ begin
92
+ all_stored_records(:count => true).first["count"]
93
+ rescue SqliteMagic::NoSuchTable
94
+ 0
95
+ end
96
+ end
97
+
98
# Builds and runs a query against the stored records.
#
# opts - Hash of options (the Hash itself is not modified):
#   :select          - column list (default "ocdata.*")
#   :table           - table name (default "ocdata")
#   :where           - SQL condition (default "1", i.e. everything)
#   :order           - ORDER BY expression
#   :limit           - maximum number of rows
#   :only_unexported - restrict to rows never exported, or exported before
#                      their last update; :batch is then used as the limit
#                      when :limit is not given
#   :specific_ids    - with :only_unexported, restrict to rows whose first
#                      unique field has one of these values
#   :count           - run a COUNT(*) query instead of fetching records
#   :debug           - print the generated SQL
#
# NB :select, :table, :where and :order are interpolated into the SQL as given,
# so they must come from trusted code.
#
# Returns the result of #select for a :count query, otherwise the result of
# #select_records.
def all_stored_records(opts={})
  only_unexported = opts[:only_unexported]

  # Use a local for the effective limit so the caller's hash is left untouched
  limit_value = opts[:limit]
  limit_value ||= opts[:batch] if only_unexported

  columns = opts[:select] || "ocdata.*"
  table = opts[:table] || "ocdata"
  where = (opts[:where] ? "\nWHERE #{opts[:where]}\n" : "\nWHERE 1 \n")
  order = (opts[:order] ? "\nORDER BY #{opts[:order]}\n" : "")
  limit = (limit_value ? "\nLIMIT #{limit_value}\n" : "")

  if only_unexported
    where += " AND (_last_exported_at IS NULL "\
      "OR _last_exported_at < _last_updated_at)"

    specific_ids = opts[:specific_ids]
    if specific_ids && !specific_ids.empty?
      # Double any single quote so an id cannot terminate the SQL string literal
      ids = specific_ids.map { |id| "'" + id.to_s.gsub("'", "''") + "'" }.join(",")
      # Record classes expose #unique_fields (plural); an IN clause needs a
      # single column, so the first unique field is used as the id column.
      id_column = Array(_yields[0].unique_fields).first
      where += " AND #{id_column} IN (#{ids})"
    end
  end

  if opts[:count]
    sql = "COUNT(*) AS count from #{table} #{where}"
    puts sql if opts[:debug]
    select(sql)
  else
    sql = "#{columns} from #{table} #{where} #{order} #{limit}"
    puts sql if opts[:debug]
    select_records(sql)
  end
end
129
+
130
# Convenience wrapper around #all_stored_records that adds
# :only_unexported => true to the given options.
#
# opts - Hash of options passed on to #all_stored_records
#
# Returns whatever #all_stored_records returns.
def unexported_stored_records(opts={})
  # Non-destructive merge: the caller's options hash must not gain the
  # :only_unexported flag as a side effect of calling this method.
  all_stored_records(opts.merge(:only_unexported => true))
end
133
+
134
+ def spotcheck_records(limit = 5)
135
+ all_stored_records(:order => "RANDOM()", :limit => limit)
136
+ end
137
+
138
+ def select_records(sql)
139
+ select(sql).map { |record| record['_type'].constantize.new(record) }
140
+ end
141
+
142
+ def export_data(opts={})
143
+ begin
144
+ sqlite_magic_connection.add_columns(
145
+ 'ocdata', [:_last_exported_at, :_last_updated_at])
146
+ rescue SQLite3::SQLException
147
+ end
148
+ Enumerator.new do |yielder|
149
+ b = 1
150
+ loop do
151
+ if opts[:all]
152
+ break if b > 1
153
+ batch = all_stored_records(opts)
154
+ else
155
+ batch = unexported_stored_records(:batch => 100, :specific_ids => opts[:specific_ids])
156
+ end
157
+ break if batch.empty?
158
+ updates = {}
159
+ batch.map do |record|
160
+ pipeline_data = record.to_pipeline
161
+ next if pipeline_data.nil?
162
+ updates[record.class.name] ||= []
163
+ # opts[:all] is currently called in the bot:test rake task
164
+ # This has the unfortunate side effect of updating the _last_exported_at
165
+ # time when running the validation task, so I've added the following conditional
166
+ if !opts[:all]
167
+ updates[record.class.name] << record.to_hash.merge(
168
+ :_last_exported_at => Time.now.iso8601(2))
169
+ else
170
+ updates[record.class.name] << record.to_hash
171
+ end
172
+ yielder << pipeline_data
173
+ end
174
+ sqlite_magic_connection.execute("BEGIN TRANSACTION")
175
+ if b == 1
176
+ check_unique_index(_yields[0])
177
+ end
178
+ updates.each do |k, v|
179
+ save_data(k.constantize.unique_fields, v)
180
+ end
181
+ sqlite_magic_connection.execute("COMMIT")
182
+ b += 1
183
+ end
184
+ end
185
+ end
186
+
187
+ def spotcheck_data
188
+ batch = spotcheck_records
189
+ batch.collect do |record|
190
+ record.to_pipeline
191
+ end
192
+ end
193
+
194
# Validates stored records and collects the failures.
#
# opts - Hash of options passed to #all_stored_records; :limit defaults to
#        1000 (pass :limit => nil for no limit)
#
# Prints a notice when the limit is lower than the number of stored records.
#
# Returns an Array with the non-nil results of calling #errors on each record.
def validate_data(opts={})
  opts = {:limit => 1000}.merge(opts)
  errors = all_stored_records(opts).map do |record|
    record.errors
  end.compact
  total = count_stored_records
  limit = opts[:limit]
  # Only warn when the limit is below the stored total; a nil limit means
  # nothing was cut off.
  if limit && total > limit
    puts "NOTICE: only validated first #{limit} of #{total} records"
  end
  errors
end
204
+
205
+
206
+ class BaseLicenceRecord
207
+ class_attribute :_store_fields, :_type, :_schema, :_unique_fields
208
+
209
# Declares the fields of this record class that are persisted, and creates
# an accessor for each of them. Accessors for the bookkeeping fields
# :_last_exported_at and :_last_updated_at are created as well (they are not
# added to the declared store fields unless listed explicitly).
#
# fields - Symbols naming the fields to store; may be called more than once,
#          later calls add to the earlier ones
#
# Returns the Array of field names for which accessors were defined.
def self.store_fields(*fields)
  # _store_fields is a class_attribute, so a subclass starts out seeing its
  # parent's array. Assign a new array rather than appending in place, so
  # fields declared on a subclass do not leak into the parent class.
  self._store_fields = (_store_fields || []) + fields

  accessors = fields.dup
  [:_last_exported_at, :_last_updated_at].each do |bookkeeping_field|
    accessors << bookkeeping_field unless _store_fields.include?(bookkeeping_field)
  end
  accessors.each do |field|
    attr_accessor field
  end
end
218
+
219
+ def self.unique_fields(*fields)
220
+ self._unique_fields = fields unless fields.empty?
221
+ self._unique_fields
222
+ end
223
+
224
+ def self.schema(schema)
225
+ hyphenated_name = schema.to_s.gsub("_", "-")
226
+ self._schema = File.expand_path("../../schemas/#{hyphenated_name}-schema.json", __FILE__)
227
+ end
228
+
229
+ def initialize(attrs={})
230
+ validate_instance!
231
+ attrs = attrs.with_indifferent_access
232
+ self._type = self.class.name
233
+ self._store_fields.each do |k|
234
+ send("#{k}=", attrs[k])
235
+ end
236
+ end
237
+
238
# Checks that the record class has everything a record needs:
#   * the methods #last_updated_at and #to_pipeline
#   * values for _store_fields, _unique_fields and _schema
#
# Returns nil when nothing is missing.
# Raises RuntimeError listing everything that is missing, one problem per line.
def validate_instance!
  missing_functions = [:last_updated_at, :to_pipeline].reject do |func|
    respond_to?(func)
  end
  # The settings are stored with a leading underscore; drop it so the message
  # shows the name of the declaration the author has to add.
  missing_fields = [:_store_fields, :_unique_fields, :_schema].reject do |f|
    send(f)
  end.map { |f| f.to_s[1..-1] }

  all_errors = []
  unless missing_functions.empty?
    all_errors << "You must define the following functions in your record class: #{missing_functions.join(', ')}"
  end
  unless missing_fields.empty?
    all_errors << "You must define the following fields on your record class: #{missing_fields.join(', ')}"
  end
  # Double quotes are required here: '\n' in single quotes is a literal
  # backslash followed by "n", not a line break.
  raise all_errors.join("\n") unless all_errors.empty?
end
262
+
263
+ def to_hash
264
+ hsh = Hash[_store_fields.map{|field| [field, send(field)]}]
265
+ hsh[:_type] = self.class.name
266
+ hsh[:_last_updated_at] = last_updated_at
267
+ hsh
268
+ end
269
+
270
+ # return a structure including errors if invalid; otherwise return nil
271
+ def errors
272
+ data = self.to_pipeline
273
+ if data
274
+ if !self._schema
275
+ # backwards compatibility
276
+ self._schema = File.expand_path("../../schemas/licence-schema.json", __FILE__)
277
+ end
278
+ errors = JSON::Validator.fully_validate(
279
+ self._schema,
280
+ data.to_json,
281
+ {:errors_as_objects => true})
282
+ if !errors.empty?
283
+ data[:errors] = errors
284
+ data
285
+ end
286
+ end
287
+ end
288
+ end
289
+ end