openc_bot 0.0.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (85) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.travis.yml +8 -0
  4. data/CHANGELOG.md +2 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +253 -0
  8. data/Rakefile +14 -0
  9. data/bin/openc_bot +13 -0
  10. data/create_bot.sh +30 -0
  11. data/create_company_bot.sh +16 -0
  12. data/create_simple_licence_bot.sh +31 -0
  13. data/db/.gitkeep +0 -0
  14. data/examples/basic/.gitignore +3 -0
  15. data/examples/basic/Gemfile +7 -0
  16. data/examples/basic/config.yml +21 -0
  17. data/examples/basic/lib/basic.rb +88 -0
  18. data/examples/basic_with_proxy/Gemfile +7 -0
  19. data/examples/basic_with_proxy/config.yml +21 -0
  20. data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
  21. data/examples/bot_with_simple_iterator/Gemfile +6 -0
  22. data/examples/bot_with_simple_iterator/config.yml +21 -0
  23. data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
  24. data/examples/company_fetchers/basic.rb +49 -0
  25. data/lib/monkey_patches/mechanize.rb +53 -0
  26. data/lib/openc_bot.rb +89 -0
  27. data/lib/openc_bot/bot_data_validator.rb +18 -0
  28. data/lib/openc_bot/company_fetcher_bot.rb +40 -0
  29. data/lib/openc_bot/exceptions.rb +17 -0
  30. data/lib/openc_bot/helpers/_csv.rb +10 -0
  31. data/lib/openc_bot/helpers/alpha_search.rb +73 -0
  32. data/lib/openc_bot/helpers/dates.rb +33 -0
  33. data/lib/openc_bot/helpers/html.rb +8 -0
  34. data/lib/openc_bot/helpers/incremental_search.rb +106 -0
  35. data/lib/openc_bot/helpers/register_methods.rb +205 -0
  36. data/lib/openc_bot/helpers/text.rb +18 -0
  37. data/lib/openc_bot/incrementers.rb +2 -0
  38. data/lib/openc_bot/incrementers/base.rb +214 -0
  39. data/lib/openc_bot/incrementers/common.rb +47 -0
  40. data/lib/openc_bot/tasks.rb +385 -0
  41. data/lib/openc_bot/templates/README.md +35 -0
  42. data/lib/openc_bot/templates/bin/export_data +28 -0
  43. data/lib/openc_bot/templates/bin/fetch_data +23 -0
  44. data/lib/openc_bot/templates/bin/verify_data +1 -0
  45. data/lib/openc_bot/templates/config.yml +21 -0
  46. data/lib/openc_bot/templates/lib/bot.rb +43 -0
  47. data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
  48. data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
  49. data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
  50. data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
  51. data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
  52. data/lib/openc_bot/version.rb +3 -0
  53. data/lib/simple_openc_bot.rb +289 -0
  54. data/openc_bot.gemspec +35 -0
  55. data/schemas/company-schema.json +112 -0
  56. data/schemas/includes/address.json +23 -0
  57. data/schemas/includes/base-statement.json +27 -0
  58. data/schemas/includes/company.json +14 -0
  59. data/schemas/includes/filing.json +20 -0
  60. data/schemas/includes/license-data.json +27 -0
  61. data/schemas/includes/officer.json +14 -0
  62. data/schemas/includes/previous_name.json +11 -0
  63. data/schemas/includes/share-parcel-data.json +67 -0
  64. data/schemas/includes/share-parcel.json +60 -0
  65. data/schemas/includes/subsidiary-relationship-data.json +52 -0
  66. data/schemas/includes/total-shares.json +10 -0
  67. data/schemas/licence-schema.json +21 -0
  68. data/schemas/share-parcel-schema.json +21 -0
  69. data/schemas/subsidiary-relationship-schema.json +19 -0
  70. data/spec/dummy_classes/foo_bot.rb +4 -0
  71. data/spec/lib/bot_data_validator_spec.rb +69 -0
  72. data/spec/lib/company_fetcher_bot_spec.rb +93 -0
  73. data/spec/lib/exceptions_spec.rb +25 -0
  74. data/spec/lib/helpers/alpha_search_spec.rb +173 -0
  75. data/spec/lib/helpers/dates_spec.rb +65 -0
  76. data/spec/lib/helpers/incremental_search_spec.rb +471 -0
  77. data/spec/lib/helpers/register_methods_spec.rb +558 -0
  78. data/spec/lib/helpers/text_spec.rb +50 -0
  79. data/spec/lib/openc_bot/db/.gitkeep +0 -0
  80. data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
  81. data/spec/lib/openc_bot_spec.rb +116 -0
  82. data/spec/schemas/company-schema_spec.rb +676 -0
  83. data/spec/simple_openc_bot_spec.rb +302 -0
  84. data/spec/spec_helper.rb +19 -0
  85. metadata +300 -0
@@ -0,0 +1,205 @@
1
+ # encoding: UTF-8
2
+ require 'json-schema'
3
+ require 'active_support/core_ext'
4
+
5
+ module OpencBot
6
+ module Helpers
7
+ module RegisterMethods
8
+
9
# Tells whether the receiver has opted into alpha search.
# Returns the value of the USE_ALPHA_SEARCH constant when the receiver
# defines it, and false when the constant is not defined.
def use_alpha_search
  return false unless const_defined?('USE_ALPHA_SEARCH')
  const_get('USE_ALPHA_SEARCH')
end
12
+
13
# Returns true when the ocdata table holds a row whose primary key column
# equals +uid+, false otherwise. The uid is passed as a bound parameter.
def datum_exists?(uid)
  key_column = primary_key_name
  matches = select("ocdata.#{key_column} FROM ocdata WHERE #{key_column} = ? LIMIT 1", uid)
  matches.first ? true : false
end
16
+
17
+ # fetches and saves data. By default assumes an incremental search, or an alpha search
18
+ # if USE_ALPHA_SEARCH is set. This method should be overridden if you are going to do a
19
+ # different type of data import, e.g. from a CSV file.
20
def fetch_data
  # Alpha search takes precedence when the receiver enables it; the
  # incremental search is the fallback.
  return fetch_data_via_alpha_search if use_alpha_search
  fetch_data_via_incremental_search
end
27
+
28
# Selects every row of the ocdata table and yields each one to the given
# block after passing it through post_process with skip_nulls enabled.
def export_data
  rows = select("ocdata.* from ocdata")
  rows.each { |row| yield post_process(row, true) }
end
34
+
35
# Fetches the content found at the registry URL for +company_number+,
# using the memoized HTTP client returned by _client.
def fetch_registry_page(company_number)
  http = _client
  http.get_content(registry_url(company_number))
end
38
+
39
# Converts +all_data+ into a saveable form via prepare_for_saving and hands
# it to insert_or_update, keyed on the primary key column.
# +options+ is accepted but not used by this implementation.
def prepare_and_save_data(all_data,options={})
  saveable = prepare_for_saving(all_data)
  unique_keys = [primary_key_name]
  insert_or_update(unique_keys, saveable)
end
43
+
44
# Name of the primary key column: the receiver's PRIMARY_KEY_NAME constant
# when it is defined, otherwise :uid.
def primary_key_name
  return :uid unless const_defined?('PRIMARY_KEY_NAME')
  const_get('PRIMARY_KEY_NAME')
end
47
+
48
+ # sensible default. Either uses computed version or registry_url in db
49
def registry_url(uid)
  # Prefer the computed URL; the database lookup is only consulted when the
  # computed one is nil or false.
  computed = computed_registry_url(uid)
  return computed if computed
  registry_url_from_db(uid)
end
52
+
53
+ # stub method. Override in including module if this can be computed from uid
54
def computed_registry_url(uid)
  # Default stub: no URL can be computed, so nil is returned.
  nil
end
56
+
57
+ # stub method. Override in including module if this can be pulled from db (i.e. it is stored there)
58
def registry_url_from_db(uid)
  # Default stub: nothing is looked up, so nil is returned.
  nil
end
60
+
61
# Validates +entity_datum+ (ignoring its :data entry) and saves the whole
# datum when validation reports no errors. Returns nil without saving when
# there are validation errors.
def save_entity(entity_datum)
  problems = validate_datum(entity_datum.except(:data))
  prepare_and_save_data(entity_datum) if problems.blank?
end
66
+
67
+ # Behaves like +save_entity+ but raises RecordInvalid exception if
68
+ # record is not valid (validation errors are available in the
69
+ # exception's +validation_errors+ method)
70
def save_entity!(entity_datum)
  # The :data entry is left out of validation; the full datum is saved.
  problems = validate_datum(entity_datum.except(:data))
  if problems.blank?
    prepare_and_save_data(entity_datum)
  else
    raise OpencBot::RecordInvalid.new(problems)
  end
end
75
+
76
# Value of the receiver's SCHEMA_NAME constant, or nil when the constant is
# not defined.
def schema_name
  const_get('SCHEMA_NAME') if const_defined?('SCHEMA_NAME')
end
79
+
80
# Yields the primary key of each ocdata row that is stale: rows whose
# retrieved_at is NULL or earlier than 30 days before today.
#
# stale_count - maximum number of rows to select (defaults to 1000).
#
# If the query fails because the retrieved_at column does not exist, the
# column is added and the query is retried once. Any other
# SQLite3::SQLException, or the same failure after the column has been
# added, is re-raised.
def stale_entry_uids(stale_count=nil)
  stale_count ||= 1000
  sql_query = "ocdata.* from ocdata WHERE retrieved_at IS NULL OR strftime('%s', retrieved_at) < strftime('%s', '#{Date.today - 30}') LIMIT #{stale_count.to_i}"
  column_added = false
  begin
    select(sql_query).each do |res|
      yield res[primary_key_name.to_s]
    end
  rescue SQLite3::SQLException => e
    # Retry at most once so a persistent failure cannot loop forever.
    raise if column_added || !e.message[/no such column: retrieved_at/]
    sqlite_magic_connection.add_columns('ocdata', ['retrieved_at'])
    column_added = true
    retry
  end
end
94
+
95
# Runs a full update: fetches new data, refreshes stale entries, then
# records a run report with a 'success' status.
# +options+ is accepted but not used by this implementation.
def update_data(options={})
  fetch_data
  update_stale
  report = {:status => 'success'}
  save_run_report(report)
end
100
+
101
+ # This method updates a datum given by a uid (e.g. a company_number), by fetching new data, processing it
102
+ # and then saving it. It assumes the methods for doing this (#fetch_datum and #process_datum) are implemented
103
+ # in the module that includes this method.
104
+ #
105
+ # If no second argument is passed to this method, or false is passed, the
106
+ # method will return the processed data hash
107
+ # If true is passed as the second argument, the method will output the
108
+ # updated result as json to STDOUT, which can then be consumed by, say,
109
+ # something which triggered this method, for example if it was called by
110
+ # a rake task, which in turn might have been called by the main
111
+ # OpenCorporates application
112
def update_datum(uid, output_as_json=false,replace_existing_data=false)
  # replace_existing_data is accepted but not used by this implementation.
  raw_data = fetch_datum(uid)
  return unless raw_data
  default_options = {primary_key_name => uid, :retrieved_at => Time.now}
  processed_data = default_options.merge(process_datum(raw_data))
  # Save the processed data together with the original raw data, as we may
  # not be extracting everything from it yet. save_entity silently skips
  # the save when the datum fails validation.
  save_entity(processed_data.merge(:data => raw_data))
  if output_as_json
    puts processed_data.to_json
  else
    processed_data
  end
rescue StandardError => e
  # Only StandardError is rescued so that Interrupt, SystemExit and similar
  # still propagate. In JSON mode the error is reported on STDOUT for the
  # consumer; otherwise it is noted on STDERR and nil is returned.
  if output_as_json
    output_json_error_message(e)
  else
    warn "update_datum(#{uid.inspect}) failed: #{e.class}: #{e.message}"
  end
end
127
+
128
# Calls update_datum for every uid yielded by stale_entry_uids.
# stale_count is passed straight through to stale_entry_uids.
def update_stale(stale_count=nil)
  stale_entry_uids(stale_count) { |uid| update_datum(uid) }
end
134
+
135
# Validates +record+ (serialised to JSON) against the schema file named
# after schema_name, located in the schemas directory four levels above
# this file. Returns whatever JSON::Validator.fully_validate returns when
# asked for errors as objects.
def validate_datum(record)
  schema_path = File.expand_path("../../../../schemas/#{schema_name}.json", __FILE__)
  validator_options = {:errors_as_objects => true}
  JSON::Validator.fully_validate(schema_path, record.to_json, validator_options)
end
142
+
143
# Prepares a database row for consumers: drops the :data entry and turns
# serialised JSON field values back into Ruby objects.
def post_process(row_hash, skip_nulls=false)
  without_raw_data = row_hash.except(:data)
  convert_json_to_ruby(without_raw_data, skip_nulls)
end
147
+
148
+ private
149
+ # This is a utility method for outputting an error message as json to STDOUT
150
+ # (which can then be handled by the importer)
151
def output_json_error_message(err_obj)
  # Class name, message and backtrace are nested under a single 'error' key.
  details = {
    'klass' => err_obj.class.to_s,
    'message' => err_obj.message,
    'backtrace' => err_obj.backtrace
  }
  puts({'error' => details}.to_json)
end
155
+
156
# Returns a deep copy of +raw_data_hash+ in which Array and Hash values are
# serialised to JSON strings and Date, Time and DateTime values are
# converted to ISO 8601 strings, so that each can be stored as a string in
# sqlite. Other values are left as they are; the argument is not modified.
def prepare_for_saving(raw_data_hash)
  copy = deep_clone_hash(raw_data_hash)
  copy.keys.each do |key|
    value = copy[key]
    if value.is_a?(Array) || value.is_a?(Hash)
      copy[key] = value.to_json
    elsif value.is_a?(Date) || value.is_a?(Time) || value.is_a?(DateTime)
      copy[key] = value.iso8601
    end
  end
  copy
end
169
+
170
# Returns the memoized HTTPClient, building it on first use.
#
# options are only consulted on the call that builds the client; later calls
# return the existing client unchanged. Recognised keys:
#   :proxy                 - passed to HTTPClient.new
#   :skip_ssl_verification - when truthy, sets verify_mode to VERIFY_NONE
#   :user_agent            - assigned to agent_name (nil when omitted)
#   :ssl_version           - assigned to ssl_config.ssl_version when given
#   :ssl_certificate       - added as a trusted CA when given
#
# The options hash is read, never modified, so callers can reuse it.
def _client(options={})
  return @client if @client
  @client = HTTPClient.new(options[:proxy])
  @client.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE if options[:skip_ssl_verification]
  @client.agent_name = options[:user_agent]
  @client.ssl_config.ssl_version = options[:ssl_version] if options[:ssl_version]
  ssl_certificate = options[:ssl_certificate]
  @client.ssl_config.add_trust_ca(ssl_certificate) if ssl_certificate
  @client
end
181
+
182
# Returns a deep copy of +given_hash+ made by a Marshal dump/load round-trip,
# so nested objects in the copy are independent of the original.
def deep_clone_hash(given_hash)
  serialized = Marshal.dump(given_hash)
  Marshal.load(serialized)
end
185
+
186
# Converts, in place, every value of +data_hash+ that looks like serialised
# JSON back into a Ruby object, and returns +data_hash+.
#
# A String is treated as JSON when it starts with '{' or '[' characters
# followed by a double quote, or is exactly "[]" or "{}". Parsed hashes
# (including hashes inside a parsed array) are given indifferent access.
# Strings that fail to parse are left as they are.
#
# skip_nulls - when true, entries whose value is nil are removed.
def convert_json_to_ruby(data_hash, skip_nulls=false)
  # Walk a snapshot of the keys so that removing entries never modifies the
  # hash while it is being traversed.
  data_hash.keys.each do |k|
    v = data_hash[k]
    if skip_nulls && v.nil?
      data_hash.delete(k)
      next
    end
    parsed_data = begin
      JSON.parse(v) if v.is_a?(String) && v[/^[\{\[]+\"|^\[\]$|^{}$/]
    rescue StandardError
      nil # only looked like JSON; keep the original string
    end
    case parsed_data
    when Hash
      parsed_data = parsed_data.with_indifferent_access
    when Array
      parsed_data.collect!{ |e| e.is_a?(Hash) ? e.with_indifferent_access : e }
    end
    data_hash[k] = parsed_data if parsed_data
  end
  data_hash
end
202
+
203
+ end
204
+ end
205
+ end
@@ -0,0 +1,18 @@
1
+ # encoding: UTF-8
2
module OpencBot
  module Helpers
    # Whitespace clean-up helpers. The module extends itself, so the methods
    # can be called directly on the module as well as mixed in.
    module Text
      extend self

      # Replaces every UTF-8 non-breaking space (bytes C2 A0) in +raw_text+
      # with an ordinary space. A nil or false argument is returned as is.
      def normalise_utf8_spaces(raw_text)
        return raw_text unless raw_text
        raw_text.gsub(/\xC2\xA0/, ' ')
      end

      # Normalises non-breaking spaces, strips leading and trailing
      # whitespace and collapses each remaining run of whitespace into a
      # single space. A nil or false argument is returned as is.
      def strip_all_spaces(text)
        return text unless text
        normalised = normalise_utf8_spaces(text)
        normalised.strip.gsub(/\s+/,' ')
      end

    end
  end
end
@@ -0,0 +1,2 @@
1
+ require 'openc_bot/incrementers/base'
2
+ require 'openc_bot/incrementers/common'
@@ -0,0 +1,214 @@
1
+ require 'backports/2.0.0/enumerable/lazy'
2
+ require 'json'
3
+ module OpencBot
4
# Base class for iterators whose current position is written to a file so
# that an interrupted run can be resumed later. Subclasses must implement
# +increment_yielder+, which yields each value of the iteration in turn.
class BaseIncrementer

  # name - used in progress messages and, via db_name, in the position
  #        file name.
  # opts - :expected_count, :app_path, :show_progress (treated as true when
  #        omitted), :reset_iterator and :max_iterations are read here; the
  #        whole hash is kept in @opts.
  def initialize(name, opts={})
    @name = name
    @expected_count = opts[:expected_count]
    @count = 0
    @app_path = opts[:app_path]
    @show_progress = opts[:show_progress].nil? || opts[:show_progress]
    @reset_iterator = opts[:reset_iterator]
    @max_iterations = opts[:max_iterations]
    @opts = opts
  end

  # Fills in a default :app_path (the parent of the directory containing the
  # file that called .new) when the caller did not supply one. The caller's
  # own options hash is left untouched.
  def self.new(*args)
    path, = caller[0].partition(":")
    path = File.expand_path(File.join(File.dirname(path), ".."))
    args << {} if args.count == 1
    args[1] = args[1].merge(:app_path => path) unless args[1][:app_path]
    super(*args)
  end

  # Prints a progress line when progress display is enabled.
  def log_progress(percent)
    puts "Iterator #{@name} progress: #{percent}%" if @show_progress
  end

  # Percentage of @expected_count iterated so far, rounded to two decimal
  # places; nil when no expected count is known.
  def progress_percent
    (@count.to_f / @expected_count * 100).round(2) if @expected_count
  end

  # Returns a lazy enumerator over the values from increment_yielder. Each
  # value is written to the position file before it is yielded; the
  # position file is cleared once the iteration runs to completion.
  def each
    Enumerator.new do |yielder|
      increment_yielder do |result|
        write_current(position_marker(result))
        yielder.yield(result)
        @count += 1
        log_progress(progress_percent)
      end
      reset_current
    end.lazy
  end

  # Like +each+, but resumes from the saved position unless :reset_iterator
  # was given, and stops after :max_iterations values when that was given.
  def resumable
    enum = each
    enum = resuming_enum(enum) unless @reset_iterator
    enum = enum.take(@max_iterations) if @max_iterations
    enum
  end

  # Skips values until the one recorded in the position file is reached.
  # Progress output is suppressed while skipping and restored to its
  # configured setting once the start point is found. When there is no
  # saved position the enumerator is returned untouched, so progress
  # display keeps its configured setting.
  def resuming_enum(enum)
    start_from = read_current
    return enum if start_from.nil? || start_from == ""
    preset_show_progress = @show_progress
    @show_progress = false
    enum.drop_while do |x|
      # Compare using the same representation that was written to the file
      # (JSON for hashes), otherwise hash values could never match.
      found_start_point = (position_marker(x) == start_from)
      @show_progress = preset_show_progress && found_start_point
      !found_start_point
    end
  end

  def position_file_name
    "#{@app_path}/db/#{db_name}-iterator-position.txt"
  end

  def db_name
    @name
  end

  # this is done with a file, rather than SQL, for speed reasons
  def reset_current
    File.open(position_file_name, "w") { |f| f.write("") }
  end

  def write_current(val)
    File.open(position_file_name, "w") { |f| f.write(val.to_s) }
  end

  # Contents of the position file, or nil when the file does not exist.
  def read_current
    File.open(position_file_name, "r") { |f| f.read }
  rescue Errno::ENOENT
    nil
  end

  private

  # String form in which a value is stored in the position file: JSON for
  # hashes, to_s for everything else.
  def position_marker(value)
    (value.is_a?(Hash) ? value.to_json : value).to_s
  end
end
102
+
103
# Incrementer whose items are rows stored in an sqlite table (ITEMS_TABLE)
# that the bot fills via +add_row+. Iteration walks that table in _id order
# in batches and can resume from the last row written to the position file.
class ManualIncrementer < OpencBot::BaseIncrementer

  include ScraperWiki

  ITEMS_TABLE = "items"

  # Runs the given block inside a single database transaction. The
  # transaction is committed when the block returns and rolled back when it
  # raises, after which the error is re-raised.
  def single_transaction
    sqlite_magic_connection.execute("BEGIN TRANSACTION")
    begin
      yield(self)
    rescue Exception
      # Never leave the transaction open; the error is re-raised untouched.
      sqlite_magic_connection.execute("ROLLBACK")
      raise
    end
    sqlite_magic_connection.execute("COMMIT")
  end

  # opts[:fields] is required: it lists the columns of the items table. The
  # table, and a unique index across all the fields, are created if missing.
  def initialize(name, opts={})
    super(name, opts)
    raise "Fields must be defined for this Record" if opts[:fields].nil?
    query = "CREATE TABLE IF NOT EXISTS #{ITEMS_TABLE} (#{opts[:fields].join(',')}, _id INTEGER PRIMARY KEY)"
    sqlite_magic_connection.execute query
    query = "CREATE UNIQUE INDEX IF NOT EXISTS #{opts[:fields].join('_')} " +
      "ON #{ITEMS_TABLE} (#{opts[:fields].join(',')})"
    sqlite_magic_connection.execute query
  end

  # Override default in ScraperWiki gem: the database lives in
  # <app_path>/db/<db_name>.db and the connection is memoized.
  def sqlite_magic_connection
    @sqlite_magic_connection ||= SqliteMagic::Connection.new(
      File.expand_path(File.join(@app_path, 'db', "#{db_name}.db")))
  end

  # Yields each row of the items table in batches, starting at the _id of
  # +start_row+ when one is given. Stops when a batch comes back empty.
  def increment_yielder(start_row=nil)
    start_id = start_row && start_row["_id"].to_i
    @expected_count = count_all_items
    @count = count_processed_items(start_id)
    loop do
      result = read_batch(start_id).each do |row|
        yield row
        start_id = row["_id"].to_i + 1
      end
      raise StopIteration if result.empty?
    end
  end

  # True when the misc table records that the items table has been
  # populated. Returns nil when the misc table is missing or empty.
  def populated
    row = select("populated FROM misc").first
    result = row && row['populated']
    result && result == "true"
  rescue SqliteMagic::NoSuchTable
    nil
  end

  # Records the populated flag; only true or "true" has any effect.
  def populated=(val)
    if val == "true" || val == true
      save_sqlite([:populated], {:populated => "true"}, "misc")
    end
  end

  # Marks the table as populated and returns the lazy enumerator from +each+.
  def enum(*args)
    self.populated = true
    each
  end

  # Inserts +val+ (a hash of column => value) into the items table.
  def add_row(val)
    sqlite_magic_connection.insert_or_update(
      val.keys, val, ITEMS_TABLE, :update_unique_keys => true)
  end

  # Number of rows whose _id is below +start_id+; 0 when no start_id is
  # given or the table does not exist.
  def count_processed_items(start_id)
    return 0 unless start_id
    result = select("count(*) as count FROM #{ITEMS_TABLE} WHERE _id < #{start_id.to_i}").first
    result && result['count']
  rescue SqliteMagic::NoSuchTable
    0
  end

  # Total number of rows in the items table; nil when the table is missing.
  def count_all_items
    select("count(*) as count FROM #{ITEMS_TABLE}").first['count']
  rescue SqliteMagic::NoSuchTable
    nil
  end

  # Reads up to 100 rows, starting at +start_id+ when given. Rows are
  # explicitly ordered by _id because increment_yielder derives the next
  # start_id from the last row it saw; without ORDER BY the row order is
  # not guaranteed and rows could be skipped.
  def read_batch(start_id=nil)
    sql = "* FROM #{ITEMS_TABLE}"
    sql += " WHERE _id >= #{start_id.to_i}" if start_id
    sql += " ORDER BY _id LIMIT 100"
    select(sql)
  end

  # override superclass definition for more efficient version: instead of
  # skipping values one by one, restart the query at the saved row.
  def resuming_enum(enum)
    saved_position = read_current # read the position file only once
    return enum if saved_position.nil? || saved_position == ""
    current_row = JSON.parse(saved_position)
    return enum unless current_row
    Enumerator.new do |yielder|
      increment_yielder(current_row) do |result|
        write_current(result.to_json)
        yielder.yield(result)
        @count += 1
        log_progress(progress_percent)
      end
      reset_current
    end.lazy
  end
end
214
+ end
@@ -0,0 +1,47 @@
1
+ module OpencBot
2
# Incrementer that yields every number from :start_val (default 0) up to and
# including :end_val, in steps of one.
class NumericIncrementer < OpencBot::BaseIncrementer
  # opts[:end_val] is required; opts[:start_val] defaults to 0.
  def initialize(name, opts={})
    raise "You must specify an end_val for a NumericIncrementer" unless opts[:end_val]
    @start_val = opts[:start_val] || 0
    @end_val = opts[:end_val]
    super(name, opts)
  end

  def increment_yielder
    # Both ends are inclusive, so the number of values is end - start + 1.
    # This keeps the reported progress at 100% on the final value whatever
    # the start value is.
    @expected_count = @end_val - @start_val + 1
    i = @start_val
    while i <= @end_val
      yield i
      i += 1
    end
  end
end
22
+
23
# Incrementer that yields every string of length :size (default 3) made from
# the 36 characters 0-9 and a-z, in repeated-permutation order.
class AsciiIncrementer < OpencBot::BaseIncrementer
  def initialize(name, opts={})
    @size = opts[:size] || 3
    super(name, opts)
  end

  def increment_yielder
    alnum = (0...36).map { |i| i.to_s(36) } # "0".."9" then "a".."z"
    # There are 36 ** size strings of the given length, so the expected
    # count is computed rather than listed for a few sizes only.
    @expected_count = alnum.size ** @size
    alnum.repeated_permutation(@size) { |perm| yield perm.join }
  end
end
47
+ end