openc_bot 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.travis.yml +8 -0
- data/CHANGELOG.md +2 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +22 -0
- data/README.md +253 -0
- data/Rakefile +14 -0
- data/bin/openc_bot +13 -0
- data/create_bot.sh +30 -0
- data/create_company_bot.sh +16 -0
- data/create_simple_licence_bot.sh +31 -0
- data/db/.gitkeep +0 -0
- data/examples/basic/.gitignore +3 -0
- data/examples/basic/Gemfile +7 -0
- data/examples/basic/config.yml +21 -0
- data/examples/basic/lib/basic.rb +88 -0
- data/examples/basic_with_proxy/Gemfile +7 -0
- data/examples/basic_with_proxy/config.yml +21 -0
- data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
- data/examples/bot_with_simple_iterator/Gemfile +6 -0
- data/examples/bot_with_simple_iterator/config.yml +21 -0
- data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
- data/examples/company_fetchers/basic.rb +49 -0
- data/lib/monkey_patches/mechanize.rb +53 -0
- data/lib/openc_bot.rb +89 -0
- data/lib/openc_bot/bot_data_validator.rb +18 -0
- data/lib/openc_bot/company_fetcher_bot.rb +40 -0
- data/lib/openc_bot/exceptions.rb +17 -0
- data/lib/openc_bot/helpers/_csv.rb +10 -0
- data/lib/openc_bot/helpers/alpha_search.rb +73 -0
- data/lib/openc_bot/helpers/dates.rb +33 -0
- data/lib/openc_bot/helpers/html.rb +8 -0
- data/lib/openc_bot/helpers/incremental_search.rb +106 -0
- data/lib/openc_bot/helpers/register_methods.rb +205 -0
- data/lib/openc_bot/helpers/text.rb +18 -0
- data/lib/openc_bot/incrementers.rb +2 -0
- data/lib/openc_bot/incrementers/base.rb +214 -0
- data/lib/openc_bot/incrementers/common.rb +47 -0
- data/lib/openc_bot/tasks.rb +385 -0
- data/lib/openc_bot/templates/README.md +35 -0
- data/lib/openc_bot/templates/bin/export_data +28 -0
- data/lib/openc_bot/templates/bin/fetch_data +23 -0
- data/lib/openc_bot/templates/bin/verify_data +1 -0
- data/lib/openc_bot/templates/config.yml +21 -0
- data/lib/openc_bot/templates/lib/bot.rb +43 -0
- data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
- data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
- data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
- data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
- data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
- data/lib/openc_bot/version.rb +3 -0
- data/lib/simple_openc_bot.rb +289 -0
- data/openc_bot.gemspec +35 -0
- data/schemas/company-schema.json +112 -0
- data/schemas/includes/address.json +23 -0
- data/schemas/includes/base-statement.json +27 -0
- data/schemas/includes/company.json +14 -0
- data/schemas/includes/filing.json +20 -0
- data/schemas/includes/license-data.json +27 -0
- data/schemas/includes/officer.json +14 -0
- data/schemas/includes/previous_name.json +11 -0
- data/schemas/includes/share-parcel-data.json +67 -0
- data/schemas/includes/share-parcel.json +60 -0
- data/schemas/includes/subsidiary-relationship-data.json +52 -0
- data/schemas/includes/total-shares.json +10 -0
- data/schemas/licence-schema.json +21 -0
- data/schemas/share-parcel-schema.json +21 -0
- data/schemas/subsidiary-relationship-schema.json +19 -0
- data/spec/dummy_classes/foo_bot.rb +4 -0
- data/spec/lib/bot_data_validator_spec.rb +69 -0
- data/spec/lib/company_fetcher_bot_spec.rb +93 -0
- data/spec/lib/exceptions_spec.rb +25 -0
- data/spec/lib/helpers/alpha_search_spec.rb +173 -0
- data/spec/lib/helpers/dates_spec.rb +65 -0
- data/spec/lib/helpers/incremental_search_spec.rb +471 -0
- data/spec/lib/helpers/register_methods_spec.rb +558 -0
- data/spec/lib/helpers/text_spec.rb +50 -0
- data/spec/lib/openc_bot/db/.gitkeep +0 -0
- data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
- data/spec/lib/openc_bot_spec.rb +116 -0
- data/spec/schemas/company-schema_spec.rb +676 -0
- data/spec/simple_openc_bot_spec.rb +302 -0
- data/spec/spec_helper.rb +19 -0
- metadata +300 -0
data/db/.gitkeep
ADDED
File without changes
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# This is a short description of the kind of data the bot handles.
|
2
|
+
description: ''
|
3
|
+
|
4
|
+
# This is your confidence, between 0 and 100, in the accuracy of the data
|
5
|
+
# provided by the data source. It is suggested that you do not change this
|
6
|
+
# without first discussing with OpenCorporates.
|
7
|
+
default_confidence: 80
|
8
|
+
|
9
|
+
# Don't change this. OpenCorporates will bump this version number when the bot
|
10
|
+
# is running in production.
|
11
|
+
version: 0
|
12
|
+
|
13
|
+
# The frequency at which updates may happen at the source. One of daily,
|
14
|
+
# weekly, monthly or yearly
|
15
|
+
frequency: monthly
|
16
|
+
|
17
|
+
# If this flag is true, our deployment script will set up the bot to
|
18
|
+
# be run on OpenCorporates servers (specifically, deploy the bot and
|
19
|
+
# allow it to be executed from the data pipeline). Please don't change
|
20
|
+
# this; we'll change it when we're ready to test the bot.
|
21
|
+
enabled: false
|
@@ -0,0 +1,88 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'simple_openc_bot'
|
3
|
+
require 'mechanize'
|
4
|
+
|
5
|
+
# you may need to require other libraries here
|
6
|
+
# require 'nokogiri'
|
7
|
+
|
8
|
+
class BasicRecord < SimpleOpencBot::BaseLicenceRecord
  # Name of the JSON schema used to validate records; it corresponds
  # with one of the files in `schema/*-schema.json`.
  schema :licence

  # Fields listed here are persisted to a local database when
  # 'fetch_records' (see the bot class below) is run.
  store_fields :name, :type, :reporting_date

  # The field(s) that uniquely identify a record (think primary key
  # in a database).
  unique_fields :name

  # Example constants and methods used by `to_pipeline`, below.
  JURISDICTION = "uk"
  URL = "http://foo.com"

  # The stored `type` doubles as the jurisdiction classification.
  def jurisdiction_classification
    type
  end

  # Required. Should return a timestamp in ISO8601 format whose value
  # changes whenever something about the record has changed. It does
  # not have to be a method - it can also be a member of
  # `store_fields`, above.
  def last_updated_at
    reporting_date
  end

  # Required. Builds the hash that is emitted for this record. You can
  # check that the output is in the right format with
  # `bin/verify_data`, which validates any data you've fetched against
  # the relevant schema. See `doc/SCHEMA.md` for documentation.
  def to_pipeline
    sampled_at = last_updated_at
    company_details = {
      name: name,
      jurisdiction: JURISDICTION,
    }
    licence_properties = {
      jurisdiction_code: JURISDICTION,
      category: 'Financial',
      jurisdiction_classification: [jurisdiction_classification],
    }

    {
      sample_date: sampled_at,
      company: company_details,
      source_url: URL,
      data: [{ data_type: :licence, properties: licence_properties }]
    }
  end
end
|
62
|
+
|
63
|
+
class Basic < SimpleOpencBot

  # The class that `fetch_all_records` yields. Must be defined.
  yields BasicRecord

  # This method should yield Records. It must be defined.
  #
  # Fetches the sample page, and yields one BasicRecord for every
  # `<li>` whose text has the form "name:type". List items without a
  # colon are skipped rather than raising NoMethodError on a nil type.
  def fetch_all_records(opts={})
    # You can use any client here, e.g. HTTPClient, open-uri, etc.
    agent = Mechanize.new

    # This is a live page on our website - have a look to see what's going on.
    page = agent.get("http://assets.opencorporates.com/test_bot_page.html")

    # We tend to use Nokogiri to parse responses, but again this is up
    # to you.
    doc = Nokogiri::HTML(page.body)

    # `map` is kept (rather than `each`) so that the method's return
    # value is unchanged for well-formed pages.
    doc.xpath("//li").map do |li|
      # Limit the split to two parts so a type that itself contains a
      # colon is kept whole instead of being truncated.
      name, type = li.content.split(":", 2)
      next if name.nil? || type.nil?

      yield BasicRecord.new(
        :name => name.strip,
        :type => type.strip,
        :reporting_date => Time.now.iso8601(2))
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# This is a short description of the kind of data the bot handles.
|
2
|
+
description: ''
|
3
|
+
|
4
|
+
# This is your confidence, between 0 and 100, in the accuracy of the data
|
5
|
+
# provided by the data source. It is suggested that you do not change this
|
6
|
+
# without first discussing with OpenCorporates.
|
7
|
+
default_confidence: 80
|
8
|
+
|
9
|
+
# Don't change this. OpenCorporates will bump this version number when the bot
|
10
|
+
# is running in production.
|
11
|
+
version: 0
|
12
|
+
|
13
|
+
# The frequency at which updates may happen at the source. One of daily,
|
14
|
+
# weekly, monthly or yearly
|
15
|
+
frequency: monthly
|
16
|
+
|
17
|
+
# If this flag is true, our deployment script will set up the bot to
|
18
|
+
# be run on OpenCorporates servers (specifically, deploy the bot and
|
19
|
+
# allow it to be executed from the data pipeline). Please don't change
|
20
|
+
# this; we'll change it when we're ready to test the bot.
|
21
|
+
enabled: false
|
@@ -0,0 +1,103 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'simple_openc_bot'
|
3
|
+
require 'mechanize'
|
4
|
+
|
5
|
+
# you may need to require other libraries here
|
6
|
+
# require 'nokogiri'
|
7
|
+
|
8
|
+
class BasicWithProxyRecord < SimpleOpencBot::BaseLicenceRecord
  # Name of the JSON schema used to validate records; it corresponds
  # with one of the files in `schema/*-schema.json`.
  schema :licence

  # Fields listed here are persisted to a local database when
  # 'fetch_records' (see the bot class below) is run.
  store_fields :name, :type, :reporting_date

  # The field(s) that uniquely identify a record (think primary key
  # in a database).
  unique_fields :name

  # Example constants and methods used by `to_pipeline`, below.
  JURISDICTION = "uk"
  URL = "http://foo.com"

  # The stored `type` doubles as the jurisdiction classification.
  def jurisdiction_classification
    type
  end

  # Required. Should return a timestamp in ISO8601 format whose value
  # changes whenever something about the record has changed. It does
  # not have to be a method - it can also be a member of
  # `store_fields`, above.
  def last_updated_at
    reporting_date
  end

  # Required. Builds the hash that is emitted for this record. You can
  # check that the output is in the right format with
  # `bin/verify_data`, which validates any data you've fetched against
  # the relevant schema. See `doc/SCHEMA.md` for documentation.
  def to_pipeline
    sampled_at = last_updated_at
    company_details = {
      name: name,
      jurisdiction: JURISDICTION,
    }
    licence_properties = {
      jurisdiction_code: JURISDICTION,
      category: 'Financial',
      jurisdiction_classification: [jurisdiction_classification],
    }

    {
      sample_date: sampled_at,
      company: company_details,
      source_url: URL,
      data: [{ data_type: :licence, properties: licence_properties }]
    }
  end
end
|
62
|
+
|
63
|
+
class BasicWithProxy < SimpleOpencBot

  # The class that `fetch_all_records` yields. Must be defined.
  yields BasicWithProxyRecord

  # This method should yield Records. It must be defined.
  #
  # Fetches the sample page (through a local proxy when
  # `opts[:test_mode]` is set), and yields one BasicWithProxyRecord for
  # every `<li>` whose text has the form "name:type". List items
  # without a colon are skipped rather than raising NoMethodError on a
  # nil type.
  def fetch_all_records(opts={})
    # You can use any client here, e.g. HTTPClient, open-uri, etc.
    agent = Mechanize.new

    # This option is set to true when the rake task is called with a
    # --test switch
    if opts[:test_mode]
      # It is recommended to set up a proxy on your computer when
      # developing and debugging bots. It can greatly speed things up
      # by removing the network time from the equation (though things
      # like POSTs won't be cached, anyway)

      # Different agents have different ways of setting a proxy. This
      # is how Mechanize does it:
      agent.set_proxy 'localhost', 8123
    end

    # This is a live page on our website - have a look to see what's
    # going on. If you have a proxy set up on your computer, the
    # second time you run this bot, the website won't get hit.
    page = agent.get("http://assets.opencorporates.com/test_bot_page.html")

    # We tend to use Nokogiri to parse responses, but again this is up
    # to you.
    doc = Nokogiri::HTML(page.body)

    # `map` is kept (rather than `each`) so that the method's return
    # value is unchanged for well-formed pages.
    doc.xpath("//li").map do |li|
      # Limit the split to two parts so a type that itself contains a
      # colon is kept whole instead of being truncated.
      name, type = li.content.split(":", 2)
      next if name.nil? || type.nil?

      yield BasicWithProxyRecord.new(
        :name => name.strip,
        :type => type.strip,
        :reporting_date => Time.now.iso8601(2))
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# This is a short description of the kind of data the bot handles.
|
2
|
+
description: ''
|
3
|
+
|
4
|
+
# This is your confidence, between 0 and 100, in the accuracy of the data
|
5
|
+
# provided by the data source. It is suggested that you do not change this
|
6
|
+
# without first discussing with OpenCorporates.
|
7
|
+
default_confidence: 80
|
8
|
+
|
9
|
+
# Don't change this. OpenCorporates will bump this version number when the bot
|
10
|
+
# is running in production.
|
11
|
+
version: 0
|
12
|
+
|
13
|
+
# The frequency at which updates may happen at the source. One of daily,
|
14
|
+
# weekly, monthly or yearly
|
15
|
+
frequency: monthly
|
16
|
+
|
17
|
+
# If this flag is true, our deployment script will set up the bot to
|
18
|
+
# be run on OpenCorporates servers (specifically, deploy the bot and
|
19
|
+
# allow it to be executed from the data pipeline). Please don't change
|
20
|
+
# this; we'll change it when we're ready to test the bot.
|
21
|
+
enabled: false
|
@@ -0,0 +1,112 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'simple_openc_bot'
|
3
|
+
require 'mechanize'
|
4
|
+
|
5
|
+
class BotWithSimpleIterator < SimpleOpencBot

  yields Object

  # This method must be defined. In this example it does not produce
  # any records: it only runs the four incrementer demonstrations
  # below, each of which prints the URLs it would visit.
  def fetch_all_records(opts={})

    # The following methods illustrate four common incrementer
    # patterns.

    # If a run is interrupted, it will resume where it left off --
    # unless you pass the reset flag (`bundle exec openc_bot rake
    # bot:run -- --reset`), or a full iteration has previously
    # completed (in which case it will start again)

    # Try running `bundle exec openc_bot rake bot:run`, using CTRL-C
    # to interrupt, and then try resuming.

    increment_over_ascii(opts)
    increment_over_number(opts)
    increment_over_manual(opts)
    combine_incrementers(opts)
  end


  # Demonstrates iterating over character combinations.
  def increment_over_ascii(opts)
    # Create the incrementer
    ascii_incrementer = OpencBot::AsciiIncrementer.new(
      :ascii_incrementer,
      opts.merge(:size => 2))

    ascii_incrementer.resumable.each do |letters|
      # This will iterate over all two-character combinations of 0-9
      # and a-z.
      puts "http://assets.opencorporates.com/test_bot_page_#{letters}.html"
    end
  end

  # Demonstrates iterating over a numeric range.
  def increment_over_number(opts)
    # Create the incrementer. The class is referenced through the
    # OpencBot namespace, like the other incrementers in this file.
    numeric_incrementer = OpencBot::NumericIncrementer.new(
      :numeric_incrementer,
      opts.merge(
        :start_val => 0,
        :end_val => 20))

    numeric_incrementer.resumable.each do |number|
      # This will iterate over numbers 0 - 20
      puts "http://assets.opencorporates.com/test_bot_page_#{number}.html"
    end
  end

  # Demonstrates iterating over rows that the bot has added itself.
  def increment_over_manual(opts)
    # Create the incrementer
    manual_incrementer = OpencBot::ManualIncrementer.new(
      :manual_incrementer,
      opts.merge(:fields => [:name]))

    if !manual_incrementer.populated
      # Populate it, if it's not been done before
      manual_incrementer.add_row({"name" => "Bob"})
      manual_incrementer.add_row({"name" => "Sue"})
    end

    # Mark populating as complete. The `populated` flag is not
    # necessary, but it's useful when debugging to skip slow
    # population steps.
    # NOTE(review): this is the same call as the check above; confirm
    # against ManualIncrementer that it also sets the flag.
    manual_incrementer.populated

    manual_incrementer.resumable.each do |row|
      # This will iterate over all the rows added previously.
      puts "http://assets.opencorporates.com/test_bot_page_#{row["name"]}.html"
    end
  end


  # Often you will need to use an iterator to build a list of pages to
  # get, using another iterator.
  def combine_incrementers(opts)
    ascii_incrementer = OpencBot::AsciiIncrementer.new(
      :ascii_incrementer_2,
      opts.merge(:size => 1))

    manual_incrementer = OpencBot::ManualIncrementer.new(
      :manual_incrementer_2,
      opts.merge(:fields => [:url]))

    # First pass: collect the URLs for every character into the manual
    # incrementer.
    ascii_incrementer.resumable.each do |letters|
      get_urls_for_letter_combination(letters).each do |url|
        manual_incrementer.add_row({"url" => url})
      end
    end

    # Second pass: iterate over the collected URLs.
    manual_incrementer.resumable.each do |row|
      puts row["url"]
    end

  end

  # Returns the list of URLs to visit for the given character
  # combination.
  def get_urls_for_letter_combination(letters)
    # This method might do something like:
    # page = http_client.get("http://somewhere.com/?q=#{letters}")
    # urls = page.xpath("//a/@href")

    # However, for demonstration purposes, we just return:
    ["http://foo.com/#{letters}/1", "http://foo.com/#{letters}/2"]
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'openc_bot'
|
2
|
+
require 'openc_bot/company_fetcher_bot'
|
3
|
+
# We tend to use Nokogiri to parse HTML/XML, but this is optional
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'open-uri'
|
6
|
+
|
7
|
+
module XyCompaniesFetcher
  extend OpencBot
  # This adds the CompanyFetcherBot functionality
  extend OpencBot::CompanyFetcherBot
  extend self # expose the methods below as module methods, rather than instance ones


  # The update_data module method is called when the bot is run. It is the only method a bot is required
  # to define, and the only requirement it needs to satisfy is that it saves each company as a Hash that
  # conforms to the company-schema (https://github.com/openc/openc_bot/blob/master/schemas/company-schema.json)
  # using the #save_entity method. That method validates the hash and saves it in the database, adding the
  # ISO-3166-2 jurisdiction_code inferred from the name of the module (in this case xy).
  #
  # There are various helpers that we've found useful (see https://github.com/openc/openc_bot/tree/master/lib/openc_bot/helpers)
  # but none of them are required. For example, if you are doing an alpha search ('AA','AB',...) there are
  # intelligent defaults for doing such a search, and in fact you don't even need to write the #update_data
  # method. Similarly for incremental searches (where you are iterating through a series of increasing uids).
  # There are also helpers for normalising dates and text.
  def update_data
    # This code is actually for the Bermuda company register
    #
    # Fetch the index page and collect the links to every page of companies...
    index_page = Nokogiri.HTML(open( 'https://www.roc.gov.bm/roc/rocweb.nsf/ReviewPublicRegA-Z?OpenForm'))

    # ...then visit each of them in turn.
    index_page.search('a').each do |link|
      listing = Nokogiri.HTML(open('https://www.roc.gov.bm' + link[:href]))

      # Each row of the second table describes one company.
      listing.search('//table[2]//tr').each do |row|
        # Any lookup that fails (e.g. a missing cell) leaves the value as nil.
        name = row.at_xpath('.//td[2]//a').inner_text.strip rescue nil
        company_number = row.at('td a').inner_text.strip rescue nil
        incorporation_date = row.at('.//td[3]//a').inner_text.to_date.to_s rescue nil

        # Skip rows from which nothing at all could be extracted.
        next unless name || company_number || incorporation_date

        # Save the entity hash in the local database using the #save_entity helper method, which
        # validates it against the company schema first.
        save_entity(
          :name => name,
          :company_number => company_number,
          :incorporation_date => incorporation_date,
          :retrieved_at => Time.now.to_s
        )
      end
    end

  end


end
|