openc_bot 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.travis.yml +8 -0
  4. data/CHANGELOG.md +2 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +253 -0
  8. data/Rakefile +14 -0
  9. data/bin/openc_bot +13 -0
  10. data/create_bot.sh +30 -0
  11. data/create_company_bot.sh +16 -0
  12. data/create_simple_licence_bot.sh +31 -0
  13. data/db/.gitkeep +0 -0
  14. data/examples/basic/.gitignore +3 -0
  15. data/examples/basic/Gemfile +7 -0
  16. data/examples/basic/config.yml +21 -0
  17. data/examples/basic/lib/basic.rb +88 -0
  18. data/examples/basic_with_proxy/Gemfile +7 -0
  19. data/examples/basic_with_proxy/config.yml +21 -0
  20. data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
  21. data/examples/bot_with_simple_iterator/Gemfile +6 -0
  22. data/examples/bot_with_simple_iterator/config.yml +21 -0
  23. data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
  24. data/examples/company_fetchers/basic.rb +49 -0
  25. data/lib/monkey_patches/mechanize.rb +53 -0
  26. data/lib/openc_bot.rb +89 -0
  27. data/lib/openc_bot/bot_data_validator.rb +18 -0
  28. data/lib/openc_bot/company_fetcher_bot.rb +40 -0
  29. data/lib/openc_bot/exceptions.rb +17 -0
  30. data/lib/openc_bot/helpers/_csv.rb +10 -0
  31. data/lib/openc_bot/helpers/alpha_search.rb +73 -0
  32. data/lib/openc_bot/helpers/dates.rb +33 -0
  33. data/lib/openc_bot/helpers/html.rb +8 -0
  34. data/lib/openc_bot/helpers/incremental_search.rb +106 -0
  35. data/lib/openc_bot/helpers/register_methods.rb +205 -0
  36. data/lib/openc_bot/helpers/text.rb +18 -0
  37. data/lib/openc_bot/incrementers.rb +2 -0
  38. data/lib/openc_bot/incrementers/base.rb +214 -0
  39. data/lib/openc_bot/incrementers/common.rb +47 -0
  40. data/lib/openc_bot/tasks.rb +385 -0
  41. data/lib/openc_bot/templates/README.md +35 -0
  42. data/lib/openc_bot/templates/bin/export_data +28 -0
  43. data/lib/openc_bot/templates/bin/fetch_data +23 -0
  44. data/lib/openc_bot/templates/bin/verify_data +1 -0
  45. data/lib/openc_bot/templates/config.yml +21 -0
  46. data/lib/openc_bot/templates/lib/bot.rb +43 -0
  47. data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
  48. data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
  49. data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
  50. data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
  51. data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
  52. data/lib/openc_bot/version.rb +3 -0
  53. data/lib/simple_openc_bot.rb +289 -0
  54. data/openc_bot.gemspec +35 -0
  55. data/schemas/company-schema.json +112 -0
  56. data/schemas/includes/address.json +23 -0
  57. data/schemas/includes/base-statement.json +27 -0
  58. data/schemas/includes/company.json +14 -0
  59. data/schemas/includes/filing.json +20 -0
  60. data/schemas/includes/license-data.json +27 -0
  61. data/schemas/includes/officer.json +14 -0
  62. data/schemas/includes/previous_name.json +11 -0
  63. data/schemas/includes/share-parcel-data.json +67 -0
  64. data/schemas/includes/share-parcel.json +60 -0
  65. data/schemas/includes/subsidiary-relationship-data.json +52 -0
  66. data/schemas/includes/total-shares.json +10 -0
  67. data/schemas/licence-schema.json +21 -0
  68. data/schemas/share-parcel-schema.json +21 -0
  69. data/schemas/subsidiary-relationship-schema.json +19 -0
  70. data/spec/dummy_classes/foo_bot.rb +4 -0
  71. data/spec/lib/bot_data_validator_spec.rb +69 -0
  72. data/spec/lib/company_fetcher_bot_spec.rb +93 -0
  73. data/spec/lib/exceptions_spec.rb +25 -0
  74. data/spec/lib/helpers/alpha_search_spec.rb +173 -0
  75. data/spec/lib/helpers/dates_spec.rb +65 -0
  76. data/spec/lib/helpers/incremental_search_spec.rb +471 -0
  77. data/spec/lib/helpers/register_methods_spec.rb +558 -0
  78. data/spec/lib/helpers/text_spec.rb +50 -0
  79. data/spec/lib/openc_bot/db/.gitkeep +0 -0
  80. data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
  81. data/spec/lib/openc_bot_spec.rb +116 -0
  82. data/spec/schemas/company-schema_spec.rb +676 -0
  83. data/spec/simple_openc_bot_spec.rb +302 -0
  84. data/spec/spec_helper.rb +19 -0
  85. metadata +300 -0
@@ -0,0 +1,205 @@
1
+ # encoding: UTF-8
2
+ require 'json-schema'
3
+ require 'active_support/core_ext'
4
+
5
+ module OpencBot
6
+ module Helpers
7
module RegisterMethods

  # Returns a truthy value when the receiver defines a truthy
  # USE_ALPHA_SEARCH constant, and a falsy value otherwise.
  def use_alpha_search
    self.const_defined?('USE_ALPHA_SEARCH') && self.const_get('USE_ALPHA_SEARCH')
  end

  # Returns true if the ocdata table already holds a row whose primary key
  # column (see #primary_key_name) equals +uid+, false otherwise.
  def datum_exists?(uid)
    !!select("ocdata.#{primary_key_name} FROM ocdata WHERE #{primary_key_name} = ? LIMIT 1", uid).first
  end

  # Fetches and saves data. By default assumes an incremental search, or an
  # alpha search if USE_ALPHA_SEARCH is set. This method should be overridden
  # if you are going to do a different type of data import, e.g. from a CSV
  # file.
  def fetch_data
    if use_alpha_search
      fetch_data_via_alpha_search
    else
      fetch_data_via_incremental_search
    end
  end

  # Yields every row of the ocdata table, one at a time, after passing it
  # through #post_process with null-valued fields removed.
  def export_data
    sql_query = "ocdata.* from ocdata"
    select(sql_query).each do |res|
      yield post_process(res, true)
    end
  end

  # Downloads and returns the content found at the registry URL of the given
  # company number (see #registry_url).
  def fetch_registry_page(company_number)
    _client.get_content(registry_url(company_number))
  end

  # Serialises +all_data+ (see #prepare_for_saving) and inserts or updates
  # it, keyed on the primary key column. +options+ is accepted but not used.
  def prepare_and_save_data(all_data,options={})
    data_to_be_saved = prepare_for_saving(all_data)
    insert_or_update([primary_key_name], data_to_be_saved)
  end

  # Name of the primary key column: the receiver's PRIMARY_KEY_NAME constant
  # if it defines one, otherwise :uid.
  def primary_key_name
    self.const_defined?('PRIMARY_KEY_NAME') ? self.const_get('PRIMARY_KEY_NAME') : :uid
  end

  # Sensible default. Either uses the computed version or the registry_url
  # stored in the db.
  def registry_url(uid)
    computed_registry_url(uid) || registry_url_from_db(uid)
  end

  # Stub method (returns nil). Override in the including module if the URL
  # can be computed from the uid.
  def computed_registry_url(uid)
  end

  # Stub method (returns nil). Override in the including module if the URL
  # can be pulled from the db (i.e. it is stored there).
  def registry_url_from_db(uid)
  end

  # Validates +entity_datum+ (ignoring its :data entry) and saves it when
  # there are no validation errors. Returns nil without saving when the
  # datum is invalid.
  def save_entity(entity_datum)
    validation_errors = validate_datum(entity_datum.except(:data))
    return unless validation_errors.blank?
    prepare_and_save_data(entity_datum)
  end

  # Behaves like +save_entity+ but raises a RecordInvalid exception if the
  # record is not valid (validation errors are available through the
  # exception's +validation_errors+ method).
  def save_entity!(entity_datum)
    validation_errors = validate_datum(entity_datum.except(:data))
    raise OpencBot::RecordInvalid.new(validation_errors) unless validation_errors.blank?
    prepare_and_save_data(entity_datum)
  end

  # Name of the JSON schema used by #validate_datum: the receiver's
  # SCHEMA_NAME constant if it defines one, otherwise nil.
  def schema_name
    self.const_defined?('SCHEMA_NAME') ? self.const_get('SCHEMA_NAME') : nil
  end

  # Yields the primary key of each stale row in ocdata, i.e. a row whose
  # retrieved_at is NULL or more than 30 days old. At most +stale_count+
  # rows (default 1000) are considered.
  #
  # If the query fails because the retrieved_at column does not exist yet,
  # the column is added and the query is run again -- but only once, so a
  # repair that does not help can no longer cause an endless retry loop.
  # Any other SQLite3::SQLException is re-raised untouched.
  def stale_entry_uids(stale_count=nil)
    stale_count ||= 1000
    sql_query = "ocdata.* from ocdata WHERE retrieved_at IS NULL OR strftime('%s', retrieved_at) < strftime('%s', '#{Date.today - 30}') LIMIT #{stale_count.to_i}"
    column_added = false
    begin
      select(sql_query).each do |res|
        yield res[primary_key_name.to_s]
      end
    rescue SQLite3::SQLException => e
      raise if column_added || !e.message[/no such column: retrieved_at/]
      sqlite_magic_connection.add_columns('ocdata', ['retrieved_at'])
      column_added = true
      retry
    end
  end

  # Fetches new data, refreshes stale entries and then records a successful
  # run report. +options+ is accepted but not used.
  def update_data(options={})
    fetch_data
    update_stale
    save_run_report(:status => 'success')
  end

  # This method updates a datum given by a uid (e.g. a company_number), by
  # fetching new data, processing it and then saving it. It assumes the
  # methods for doing this (#fetch_datum and #process_datum) are implemented
  # in the module that includes this method.
  #
  # If no second argument is passed to this method, or false is passed, the
  # method will return the processed data hash.
  # If true is passed as the second argument, the method will output the
  # updated result as json to STDOUT, which can then be consumed by, say,
  # something which triggered this method, for example if it was called by
  # a rake task, which in turn might have been called by the main
  # OpenCorporates application.
  #
  # Returns nil when #fetch_datum returns nothing. Errors raised while
  # fetching, processing or saving are deliberately not propagated: they are
  # reported as json on STDOUT when +output_as_json+ is true and otherwise
  # dropped, so that one bad record does not abort a batch. Only
  # StandardError is handled this way; Interrupt, SystemExit and other
  # non-standard exceptions propagate to the caller.
  #
  # +replace_existing_data+ is accepted but not used.
  def update_datum(uid, output_as_json=false,replace_existing_data=false)
    raw_data = fetch_datum(uid)
    return unless raw_data
    default_options = {primary_key_name => uid, :retrieved_at => Time.now}
    processed_data = default_options.merge(process_datum(raw_data))
    # prepare the data for saving (converting Arrays, Hashes to json) and
    # save the original data too, as we may not be extracting everything
    # from it yet
    save_entity(processed_data.merge(:data => raw_data))
    if output_as_json
      puts processed_data.to_json
    else
      processed_data
    end
  rescue StandardError => e
    output_json_error_message(e) if output_as_json
  end

  # Calls #update_datum for every stale entry (see #stale_entry_uids).
  def update_stale(stale_count=nil)
    stale_entry_uids(stale_count) do |stale_entry_uid|
      update_datum(stale_entry_uid)
    end
  end

  # Validates +record+ (serialised to json) against the schema file named
  # after #schema_name in the gem's schemas directory. Returns the
  # validation errors as reported by JSON::Validator.fully_validate with
  # :errors_as_objects enabled.
  def validate_datum(record)
    schema = File.expand_path("../../../../schemas/#{schema_name}.json", __FILE__)
    JSON::Validator.fully_validate(
      schema,
      record.to_json,
      {:errors_as_objects => true})
  end

  # Converts a stored row back into ruby objects. Many of the fields will be
  # serialized json and so we convert them to ruby objects; the :data entry
  # is left out. With +skip_nulls+ set, nil-valued fields are removed.
  def post_process(row_hash, skip_nulls=false)
    convert_json_to_ruby(row_hash.except(:data), skip_nulls)
  end

  private
  # This is a utility method for outputting an error message as json to
  # STDOUT (which can then be handled by the importer).
  def output_json_error_message(err_obj)
    err_msg = {'error' => {'klass' => err_obj.class.to_s, 'message' => err_obj.message, 'backtrace' => err_obj.backtrace}}
    puts err_msg.to_json
  end

  # Returns a deep copy of +raw_data_hash+ in which every value that is an
  # Array or Hash has been turned into json, and every Date, Time or
  # DateTime into an ISO 8601 string, so that it can be saved as a string in
  # sqlite. The hash passed in is not modified.
  def prepare_for_saving(raw_data_hash)
    prepared_data = deep_clone_hash(raw_data_hash)
    prepared_data.each do |k,v|
      case v
      when Array, Hash
        prepared_data[k] = v.to_json
      when Date, Time, DateTime
        prepared_data[k] = v.iso8601
      end
    end
    prepared_data
  end

  # Returns the memoized HTTPClient, building it on the first call.
  #
  # Recognised options (each one is removed from +options+ as it is read):
  # :proxy, :skip_ssl_verification, :user_agent, :ssl_version and
  # :ssl_certificate. Because the client is memoized, options passed on any
  # later call are ignored.
  def _client(options={})
    return @client if @client
    @client = HTTPClient.new(options.delete(:proxy))
    @client.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE if options.delete(:skip_ssl_verification)
    @client.agent_name = options.delete(:user_agent)
    @client.ssl_config.ssl_version = options.delete(:ssl_version) if options[:ssl_version]
    if ssl_certificate = options.delete(:ssl_certificate)
      @client.ssl_config.add_trust_ca(ssl_certificate) # Above cert
    end
    @client
  end

  # Deep-copies +given_hash+ by round-tripping it through Marshal, so the
  # hash must only contain objects that Marshal can dump.
  def deep_clone_hash(given_hash)
    Marshal.load( Marshal.dump(given_hash) )
  end

  # Replaces, in place, every string value of +data_hash+ that looks like a
  # serialised json array or object with the parsed ruby object (hashes get
  # indifferent access). Strings that look like json but fail to parse are
  # left as they are. With +skip_nulls+ set, keys whose value is nil are
  # removed. Returns +data_hash+.
  def convert_json_to_ruby(data_hash, skip_nulls=false)
    # Walk a snapshot of the keys so that removing entries does not happen
    # while the hash itself is being iterated.
    data_hash.keys.each do |k|
      v = data_hash[k]
      parsed_data =
        if v.is_a?(String) && v[/^[\{\[]+\"|^\[\]$|^{}$/]
          begin
            JSON.parse(v)
          rescue StandardError
            nil # not valid json after all: keep the original string
          end
        end
      case parsed_data
      when Hash
        parsed_data = parsed_data.with_indifferent_access
      when Array
        parsed_data.collect!{ |e| e.is_a?(Hash) ? e.with_indifferent_access : e }
      end
      if skip_nulls && v.nil?
        data_hash.delete(k)
      else
        data_hash[k] = parsed_data if parsed_data
      end
    end
    data_hash
  end

end
204
+ end
205
+ end
@@ -0,0 +1,18 @@
1
+ # encoding: UTF-8
2
+ module OpencBot
3
+ module Helpers
4
module Text
  extend self

  # Swaps every UTF-8 non-breaking space in +raw_text+ for an ordinary
  # space. A nil (or false) argument is handed straight back.
  def normalise_utf8_spaces(raw_text)
    return raw_text unless raw_text
    raw_text.gsub(/\xC2\xA0/, ' ')
  end

  # Normalises non-breaking spaces, trims leading and trailing whitespace
  # and squeezes every remaining run of whitespace down to a single space.
  # A nil (or false) argument is handed straight back.
  def strip_all_spaces(text)
    return text unless text
    normalised = normalise_utf8_spaces(text)
    trimmed = normalised.strip
    trimmed.gsub(/\s+/,' ')
  end

end
17
+ end
18
+ end
@@ -0,0 +1,2 @@
1
+ require 'openc_bot/incrementers/base'
2
+ require 'openc_bot/incrementers/common'
@@ -0,0 +1,214 @@
1
+ require 'backports/2.0.0/enumerable/lazy'
2
+ require 'json'
3
+ module OpencBot
4
# Base class for resumable iterators. Subclasses supply #increment_yielder,
# which must yield each item in turn. The position of the iteration is
# written to a file after every item so that an interrupted run can be
# picked up again with #resumable.
class BaseIncrementer

  # name - used in progress messages and, through #db_name, in the name of
  #        the position file.
  # opts - :expected_count  total number of items, used for progress output
  #        :app_path        directory that contains the db/ directory
  #        :show_progress   set to false to silence progress output
  #                         (anything else, including nil, leaves it on)
  #        :reset_iterator  when truthy, #resumable ignores a saved position
  #        :max_iterations  upper limit on the items #resumable hands out
  def initialize(name, opts={})
    @name = name
    @expected_count = opts[:expected_count]
    @count = 0
    @app_path = opts[:app_path]
    @show_progress = opts[:show_progress].nil? ? true : opts[:show_progress]
    @reset_iterator = opts[:reset_iterator]
    @max_iterations = opts[:max_iterations]
    @opts = opts
  end

  # Wraps the default constructor so that, when no :app_path option is
  # given, it defaults to the parent of the directory holding the file that
  # called +new+. The caller's options hash is never modified: the default
  # is merged into a copy.
  def self.new(*args)
    # caller[0] looks like "path:LINE" or "path:LINE:in ...". Only that
    # trailing part is stripped, so paths which themselves contain a colon
    # (e.g. Windows drive letters) are kept whole.
    path = caller[0].sub(/:\d+(?::in .*)?\z/, '')
    path = File.expand_path(File.join(File.dirname(path), ".."))
    args << {} if args.count == 1
    args[1] = args[1].merge(:app_path => path) if args[1] && !args[1][:app_path]
    super(*args)
  end

  # Prints a progress line for this iterator unless progress output is off.
  def log_progress(percent)
    puts "Iterator #{@name} progress: " + (percent.to_s + "%") if @show_progress
  end

  # Percentage (rounded to two decimals) of the expected items seen so far,
  # or nil when no expected count is known.
  def progress_percent
    (@count.to_f / @expected_count * 100).round(2) if @expected_count
  end

  # Returns a lazy enumerator over every item produced by
  # #increment_yielder. For each item the position file is updated before
  # the item is handed out; afterwards the counter is bumped and progress is
  # logged. Once the items are exhausted the position file is cleared.
  def each
    Enumerator.new do |yielder|
      increment_yielder do |result|
        write_current(position_string(result))
        yielder.yield(result)
        @count += 1
        log_progress(progress_percent)
      end
      reset_current
    end.lazy
  end

  # Like #each, but starts from the saved position (unless :reset_iterator
  # was given) and stops after :max_iterations items (if given).
  def resumable
    enum = each
    enum = resuming_enum(enum) unless @reset_iterator
    enum = enum.take(@max_iterations) if @max_iterations
    enum
  end

  # Returns +enum+ with every item before the saved position dropped; the
  # item at the saved position is the first one handed out. Progress output
  # is muted while skipping and restored to its configured value once the
  # start point has been reached. When there is no saved position, +enum+ is
  # returned as it is and progress output is left alone.
  def resuming_enum(enum)
    start_from = read_current
    return enum if start_from.nil? || start_from == ""
    preset_show_progress = @show_progress
    @show_progress = false
    enum.drop_while do |x|
      # compare against the same textual form that #each stores
      found_start_point = (position_string(x) == start_from)
      @show_progress = preset_show_progress && found_start_point
      !found_start_point
    end
  end

  # Path of the file in which the current position is stored.
  def position_file_name
    "#{@app_path}/db/#{db_name}-iterator-position.txt"
  end

  # Name used to build file names for this iterator.
  def db_name
    @name
  end

  # Clears the saved position.
  # This is done with a file, rather than SQL, for speed reasons.
  def reset_current
    File.open(position_file_name, "w") do |f|
      f.write("")
    end
  end

  # Stores +val+ (as a string) as the current position.
  def write_current(val)
    File.open(position_file_name, "w") do |f|
      f.write(val.to_s)
    end
  end

  # Returns the saved position, or nil when there is no position file.
  def read_current
    begin
      File.open(position_file_name, "r") do |f|
        f.read
      end
    rescue Errno::ENOENT
      nil
    end
  end

  private

  # Textual form under which an item is stored in the position file: json
  # for hashes, +to_s+ for everything else.
  def position_string(item)
    item.is_a?(Hash) ? item.to_json : item.to_s
  end
end
102
+
103
# Incrementer whose items are rows that the caller has stored beforehand
# (see #add_row) in an sqlite table. Rows are handed out in _id order, in
# batches of 100.
#
# NOTE(review): +select+ and +save_sqlite+ are not defined in this class;
# presumably they come from the included ScraperWiki module -- confirm.
class ManualIncrementer < OpencBot::BaseIncrementer

  include ScraperWiki

  ITEMS_TABLE = "items"

  # Runs the given block (which receives this incrementer) inside a single
  # database transaction. The transaction is committed when the block
  # returns and rolled back when it raises, so a failure no longer leaves a
  # transaction open on the connection. The exception itself is re-raised.
  def single_transaction
    sqlite_magic_connection.execute("BEGIN TRANSACTION")
    committed = false
    begin
      yield(self)
      result = sqlite_magic_connection.execute("COMMIT")
      committed = true
      result
    ensure
      sqlite_magic_connection.execute("ROLLBACK") unless committed
    end
  end

  # Besides the options of the superclass, opts[:fields] (an array of column
  # names) is required. Creates the items table and a unique index over the
  # given fields if they do not exist yet.
  def initialize(name, opts={})
    super(name, opts)
    raise "Fields must be defined for this Record" if opts[:fields].nil?
    query = "CREATE TABLE IF NOT EXISTS #{ITEMS_TABLE} (#{opts[:fields].join(',')}, _id INTEGER PRIMARY KEY)"
    sqlite_magic_connection.execute query
    query = "CREATE UNIQUE INDEX IF NOT EXISTS #{opts[:fields].join('_')} " +
      "ON #{ITEMS_TABLE} (#{opts[:fields].join(',')})"
    sqlite_magic_connection.execute query
  end

  # Override default in ScraperWiki gem: the database lives in the db/
  # directory of the app and is named after this incrementer.
  def sqlite_magic_connection
    db = File.expand_path(File.join(@app_path, 'db', "#{db_name}.db"))
    @sqlite_magic_connection ||= SqliteMagic::Connection.new(db)
  end

  # Yields every stored row, batch by batch, beginning at the _id of
  # +start_row+ (or at the first row when +start_row+ is nil). Also sets the
  # expected count to the number of stored rows and the running count to the
  # number of rows that precede the starting point.
  def increment_yielder(start_row=nil)
    start_id = start_row && start_row["_id"].to_i
    @expected_count = count_all_items
    @count = count_processed_items(start_id)
    loop do
      batch = read_batch(start_id)
      break if batch.empty?
      batch.each do |row|
        yield row
        start_id = row["_id"].to_i + 1
      end
    end
  end

  # Whether the items table has been flagged as populated. Returns nil when
  # the misc table does not exist or holds no row.
  def populated
    begin
      row = select("populated FROM misc").first
      result = row && row['populated']
      result && result == "true"
    rescue SqliteMagic::NoSuchTable
      nil
    end
  end

  # Flags the items table as populated when +val+ is true or "true"; any
  # other value is ignored.
  def populated=(val)
    if val == "true" || val == true
      save_sqlite([:populated], {:populated => "true"}, "misc")
    end
  end

  # Flags the table as populated and returns the enumerator from #each.
  # Arguments are accepted but not used.
  def enum(*args)
    self.populated = true
    each
  end

  # Inserts +val+ (a hash of column => value) into the items table, or
  # updates the existing row with the same unique keys.
  def add_row(val)
    sqlite_magic_connection.insert_or_update(
      val.keys, val, ITEMS_TABLE, :update_unique_keys => true)
  end

  # Number of rows whose _id is lower than +start_id+; 0 when +start_id+ is
  # nil or the table does not exist.
  def count_processed_items(start_id)
    if start_id
      begin
        result = select("count(*) as count FROM #{ITEMS_TABLE} WHERE _id < #{start_id}").first
        result && result['count']
      rescue SqliteMagic::NoSuchTable
        0
      end
    else
      0
    end
  end

  # Total number of stored rows, or nil when the table does not exist.
  def count_all_items
    begin
      select("count(*) as count FROM #{ITEMS_TABLE}").first['count']
    rescue SqliteMagic::NoSuchTable
      nil
    end
  end

  # Returns up to 100 rows, starting at +start_id+ when given. The rows are
  # explicitly ordered by _id: #increment_yielder derives the start of the
  # next batch from the last _id it saw, which is only correct if rows
  # arrive in _id order, and SQL gives no ordering guarantee without an
  # ORDER BY clause.
  def read_batch(start_id=nil)
    sql = "* FROM #{ITEMS_TABLE}"
    if start_id
      sql += " WHERE _id >= #{start_id}"
    end
    sql += " ORDER BY _id LIMIT 100"
    select(sql)
  end

  # Override superclass definition for a more efficient version: rather than
  # skipping over items one by one, iteration starts directly at the row
  # recorded in the position file. When there is no saved position, +enum+
  # is returned unchanged.
  def resuming_enum(enum)
    saved_position = read_current
    current_row = saved_position && saved_position != "" && JSON.parse(saved_position)
    if current_row
      enum = Enumerator.new do |yielder|
        increment_yielder(current_row) do |result|
          write_current(result.to_json)
          yielder.yield(result)
          @count += 1
          log_progress(progress_percent)
        end
        reset_current
      end.lazy
    end
    enum
  end
end
214
+ end
@@ -0,0 +1,47 @@
1
+ module OpencBot
2
# Incrementer that counts upwards from :start_val (default 0) to :end_val,
# both inclusive, in steps of one.
class NumericIncrementer < OpencBot::BaseIncrementer
  # Besides the options of the superclass, opts[:end_val] is required and
  # opts[:start_val] is optional.
  #
  # Raises a RuntimeError when no :end_val is given.
  def initialize(name, opts={})
    raise "You must specify an end_val for a NumericIncrementer" if ! opts[:end_val]
    @start_val = opts[:start_val] || 0
    @end_val = opts[:end_val]
    super(name, opts)
  end

  # Yields each number from the start value up to and including the end
  # value. Nothing is yielded when the start value is above the end value.
  def increment_yielder
    # The range is inclusive at both ends, so this is the number of items
    # that will be yielded; using the end value alone gave wrong progress
    # figures (above 100% at the end, and off whenever start_val was not 0).
    @expected_count = @end_val - @start_val + 1
    i = @start_val
    loop do
      break if i > @end_val
      yield i
      i += 1
    end
  end
end
22
+
23
# Incrementer that yields every string of a fixed length (:size, default 3)
# made up of the characters 0-9 and a-z, starting with all zeros.
class AsciiIncrementer < OpencBot::BaseIncrementer
  # Besides the options of the superclass, opts[:size] sets the length of
  # the generated strings.
  def initialize(name, opts={})
    @size = opts[:size] || 3
    super(name, opts)
  end

  # Yields each string of @size characters in turn.
  def increment_yielder
    alnum = (0...36).map{|i|i.to_s 36} # 0...z
    # Every position can hold any of the characters, so there are
    # alnum.size ** @size strings. This matches the figures that used to be
    # hard-coded for sizes 1 to 4 and also covers every other size.
    @expected_count = alnum.size ** @size
    alnum.repeated_permutation(@size).each do |perm|
      yield perm.join
    end
  end
end
47
+ end