openc_bot 0.0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.travis.yml +8 -0
- data/CHANGELOG.md +2 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +22 -0
- data/README.md +253 -0
- data/Rakefile +14 -0
- data/bin/openc_bot +13 -0
- data/create_bot.sh +30 -0
- data/create_company_bot.sh +16 -0
- data/create_simple_licence_bot.sh +31 -0
- data/db/.gitkeep +0 -0
- data/examples/basic/.gitignore +3 -0
- data/examples/basic/Gemfile +7 -0
- data/examples/basic/config.yml +21 -0
- data/examples/basic/lib/basic.rb +88 -0
- data/examples/basic_with_proxy/Gemfile +7 -0
- data/examples/basic_with_proxy/config.yml +21 -0
- data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
- data/examples/bot_with_simple_iterator/Gemfile +6 -0
- data/examples/bot_with_simple_iterator/config.yml +21 -0
- data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
- data/examples/company_fetchers/basic.rb +49 -0
- data/lib/monkey_patches/mechanize.rb +53 -0
- data/lib/openc_bot.rb +89 -0
- data/lib/openc_bot/bot_data_validator.rb +18 -0
- data/lib/openc_bot/company_fetcher_bot.rb +40 -0
- data/lib/openc_bot/exceptions.rb +17 -0
- data/lib/openc_bot/helpers/_csv.rb +10 -0
- data/lib/openc_bot/helpers/alpha_search.rb +73 -0
- data/lib/openc_bot/helpers/dates.rb +33 -0
- data/lib/openc_bot/helpers/html.rb +8 -0
- data/lib/openc_bot/helpers/incremental_search.rb +106 -0
- data/lib/openc_bot/helpers/register_methods.rb +205 -0
- data/lib/openc_bot/helpers/text.rb +18 -0
- data/lib/openc_bot/incrementers.rb +2 -0
- data/lib/openc_bot/incrementers/base.rb +214 -0
- data/lib/openc_bot/incrementers/common.rb +47 -0
- data/lib/openc_bot/tasks.rb +385 -0
- data/lib/openc_bot/templates/README.md +35 -0
- data/lib/openc_bot/templates/bin/export_data +28 -0
- data/lib/openc_bot/templates/bin/fetch_data +23 -0
- data/lib/openc_bot/templates/bin/verify_data +1 -0
- data/lib/openc_bot/templates/config.yml +21 -0
- data/lib/openc_bot/templates/lib/bot.rb +43 -0
- data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
- data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
- data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
- data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
- data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
- data/lib/openc_bot/version.rb +3 -0
- data/lib/simple_openc_bot.rb +289 -0
- data/openc_bot.gemspec +35 -0
- data/schemas/company-schema.json +112 -0
- data/schemas/includes/address.json +23 -0
- data/schemas/includes/base-statement.json +27 -0
- data/schemas/includes/company.json +14 -0
- data/schemas/includes/filing.json +20 -0
- data/schemas/includes/license-data.json +27 -0
- data/schemas/includes/officer.json +14 -0
- data/schemas/includes/previous_name.json +11 -0
- data/schemas/includes/share-parcel-data.json +67 -0
- data/schemas/includes/share-parcel.json +60 -0
- data/schemas/includes/subsidiary-relationship-data.json +52 -0
- data/schemas/includes/total-shares.json +10 -0
- data/schemas/licence-schema.json +21 -0
- data/schemas/share-parcel-schema.json +21 -0
- data/schemas/subsidiary-relationship-schema.json +19 -0
- data/spec/dummy_classes/foo_bot.rb +4 -0
- data/spec/lib/bot_data_validator_spec.rb +69 -0
- data/spec/lib/company_fetcher_bot_spec.rb +93 -0
- data/spec/lib/exceptions_spec.rb +25 -0
- data/spec/lib/helpers/alpha_search_spec.rb +173 -0
- data/spec/lib/helpers/dates_spec.rb +65 -0
- data/spec/lib/helpers/incremental_search_spec.rb +471 -0
- data/spec/lib/helpers/register_methods_spec.rb +558 -0
- data/spec/lib/helpers/text_spec.rb +50 -0
- data/spec/lib/openc_bot/db/.gitkeep +0 -0
- data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
- data/spec/lib/openc_bot_spec.rb +116 -0
- data/spec/schemas/company-schema_spec.rb +676 -0
- data/spec/simple_openc_bot_spec.rb +302 -0
- data/spec/spec_helper.rb +19 -0
- metadata +300 -0
data/db/.gitkeep
ADDED
File without changes
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# This is a short description of the kind of data the bot handles.
|
2
|
+
description: ''
|
3
|
+
|
4
|
+
# This is your confidence, between 0 and 100, in the accuracy of the data
|
5
|
+
# provided by the data source. It is suggested that you do not change this
|
6
|
+
# without first discussing with OpenCorporates.
|
7
|
+
default_confidence: 80
|
8
|
+
|
9
|
+
# Don't change this. OpenCorporates will bump this version number when the bot
|
10
|
+
# is running in production.
|
11
|
+
version: 0
|
12
|
+
|
13
|
+
# The frequency at which the source data may be updated. One of daily,
|
14
|
+
# weekly, monthly or yearly
|
15
|
+
frequency: monthly
|
16
|
+
|
17
|
+
# If this flag is true, our deployment script will set up the bot to
|
18
|
+
# be run on OpenCorporates servers (specifically, deploy the bot and
|
19
|
+
# allow it to be executed from the data pipeline). Please don't change
|
20
|
+
# this; we'll change it when we're ready to test the bot.
|
21
|
+
enabled: false
|
@@ -0,0 +1,88 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'simple_openc_bot'
|
3
|
+
require 'mechanize'
|
4
|
+
|
5
|
+
# you may need to require other libraries here
|
6
|
+
# require 'nokogiri'
|
7
|
+
|
8
|
+
class BasicRecord < SimpleOpencBot::BaseLicenceRecord
  # Name of the JSON schema used to validate records; it corresponds to
  # one of the `schemas/*-schema.json` files.
  schema :licence

  # Fields declared here are persisted to a local database when the
  # bot fetches records (see `fetch_all_records` in the bot class).
  store_fields :name, :type, :reporting_date

  # The field(s) that uniquely identify a record (think primary key in
  # a database).
  unique_fields :name

  # Example constants, used only by `to_pipeline`.
  JURISDICTION = "uk"
  URL = "http://foo.com"

  # Example helper for `to_pipeline`: the classification is simply the
  # stored `type` field.
  def jurisdiction_classification
    type
  end

  # Required. Should return an ISO8601 timestamp whose value changes
  # whenever something about the record changes. It need not be a
  # method; it may instead be one of the `store_fields`.
  def last_updated_at
    reporting_date
  end

  # Required. Returns the record as a Hash in the licence format. Use
  # `bin/verify_data` to validate fetched data against the relevant
  # schema. See `doc/SCHEMA.md` for documentation.
  def to_pipeline
    sampled_at = last_updated_at
    company_details = { name: name, jurisdiction: JURISDICTION }
    licence_properties = {
      jurisdiction_code: JURISDICTION,
      category: 'Financial',
      jurisdiction_classification: [jurisdiction_classification]
    }
    licence_entry = { data_type: :licence, properties: licence_properties }

    {
      sample_date: sampled_at,
      company: company_details,
      source_url: URL,
      data: [licence_entry]
    }
  end
end
|
62
|
+
|
63
|
+
class Basic < SimpleOpencBot

  # The record class that `fetch_all_records` yields. Must be defined.
  yields BasicRecord

  # Required. Fetches the sample page and yields one BasicRecord for
  # every list item of the form "name:type".
  #
  # opts - options Hash; not used by this example.
  def fetch_all_records(opts={})
    # You can use any client here, e.g. HTTPClient, open-uri, etc.
    agent = Mechanize.new

    # This is a live page on our website - have a look to see what's going on.
    page = agent.get("http://assets.opencorporates.com/test_bot_page.html")

    # We tend to use Nokogiri to parse responses, but again this is up
    # to you.
    doc = Nokogiri::HTML(page.body)

    # `each` rather than `map`: the block is run for its side effect
    # (yielding records), not to build a collection.
    doc.xpath("//li").each do |li|
      name, type = li.content.split(":")

      # Skip items that are not in "name:type" form; calling `strip` on
      # the missing part would otherwise raise NoMethodError.
      next if name.nil? || type.nil?

      yield BasicRecord.new(
        :name => name.strip,
        :type => type.strip,
        :reporting_date => Time.now.iso8601(2))
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# This is a short description of the kind of data the bot handles.
|
2
|
+
description: ''
|
3
|
+
|
4
|
+
# This is your confidence, between 0 and 100, in the accuracy of the data
|
5
|
+
# provided by the data source. It is suggested that you do not change this
|
6
|
+
# without first discussing with OpenCorporates.
|
7
|
+
default_confidence: 80
|
8
|
+
|
9
|
+
# Don't change this. OpenCorporates will bump this version number when the bot
|
10
|
+
# is running in production.
|
11
|
+
version: 0
|
12
|
+
|
13
|
+
# The frequency at which the source data may be updated. One of daily,
|
14
|
+
# weekly, monthly or yearly
|
15
|
+
frequency: monthly
|
16
|
+
|
17
|
+
# If this flag is true, our deployment script will set up the bot to
|
18
|
+
# be run on OpenCorporates servers (specifically, deploy the bot and
|
19
|
+
# allow it to be executed from the data pipeline). Please don't change
|
20
|
+
# this; we'll change it when we're ready to test the bot.
|
21
|
+
enabled: false
|
@@ -0,0 +1,103 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'simple_openc_bot'
|
3
|
+
require 'mechanize'
|
4
|
+
|
5
|
+
# you may need to require other libraries here
|
6
|
+
# require 'nokogiri'
|
7
|
+
|
8
|
+
class BasicWithProxyRecord < SimpleOpencBot::BaseLicenceRecord
  # Name of the JSON schema used to validate records; it corresponds to
  # one of the `schemas/*-schema.json` files.
  schema :licence

  # Fields declared here are persisted to a local database when the
  # bot fetches records (see `fetch_all_records` in the bot class).
  store_fields :name, :type, :reporting_date

  # The field(s) that uniquely identify a record (think primary key in
  # a database).
  unique_fields :name

  # Example constants, used only by `to_pipeline`.
  JURISDICTION = "uk"
  URL = "http://foo.com"

  # Example helper for `to_pipeline`: the classification is simply the
  # stored `type` field.
  def jurisdiction_classification
    type
  end

  # Required. Should return an ISO8601 timestamp whose value changes
  # whenever something about the record changes. It need not be a
  # method; it may instead be one of the `store_fields`.
  def last_updated_at
    reporting_date
  end

  # Required. Returns the record as a Hash in the licence format. Use
  # `bin/verify_data` to validate fetched data against the relevant
  # schema. See `doc/SCHEMA.md` for documentation.
  def to_pipeline
    sampled_at = last_updated_at
    company_details = { name: name, jurisdiction: JURISDICTION }
    licence_properties = {
      jurisdiction_code: JURISDICTION,
      category: 'Financial',
      jurisdiction_classification: [jurisdiction_classification]
    }
    licence_entry = { data_type: :licence, properties: licence_properties }

    {
      sample_date: sampled_at,
      company: company_details,
      source_url: URL,
      data: [licence_entry]
    }
  end
end
|
62
|
+
|
63
|
+
class BasicWithProxy < SimpleOpencBot

  # The record class that `fetch_all_records` yields. Must be defined.
  yields BasicWithProxyRecord

  # Required. Fetches the sample page (through a local proxy when in
  # test mode) and yields one BasicWithProxyRecord for every list item
  # of the form "name:type".
  #
  # opts - options Hash. `:test_mode` is set to true when the rake task
  #        is called with a --test switch.
  def fetch_all_records(opts={})
    # You can use any client here, e.g. HTTPClient, open-uri, etc.
    agent = Mechanize.new

    if opts[:test_mode]
      # It is recommended to set up a proxy on your computer when
      # developing and debugging bots. It can greatly speed things up
      # by removing the network time from the equation (though things
      # like POSTs won't be cached, anyway).

      # Different agents have different ways of setting a proxy. This
      # is how Mechanize does it:
      agent.set_proxy 'localhost', 8123
    end

    # This is a live page on our website - have a look to see what's
    # going on. If you have a proxy set up on your computer, the
    # second time you run this bot, the website won't get hit.
    page = agent.get("http://assets.opencorporates.com/test_bot_page.html")

    # We tend to use Nokogiri to parse responses, but again this is up
    # to you.
    doc = Nokogiri::HTML(page.body)

    # `each` rather than `map`: the block is run for its side effect
    # (yielding records), not to build a collection.
    doc.xpath("//li").each do |li|
      name, type = li.content.split(":")

      # Skip items that are not in "name:type" form; calling `strip` on
      # the missing part would otherwise raise NoMethodError.
      next if name.nil? || type.nil?

      yield BasicWithProxyRecord.new(
        :name => name.strip,
        :type => type.strip,
        :reporting_date => Time.now.iso8601(2))
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# This is a short description of the kind of data the bot handles.
|
2
|
+
description: ''
|
3
|
+
|
4
|
+
# This is your confidence, between 0 and 100, in the accuracy of the data
|
5
|
+
# provided by the data source. It is suggested that you do not change this
|
6
|
+
# without first discussing with OpenCorporates.
|
7
|
+
default_confidence: 80
|
8
|
+
|
9
|
+
# Don't change this. OpenCorporates will bump this version number when the bot
|
10
|
+
# is running in production.
|
11
|
+
version: 0
|
12
|
+
|
13
|
+
# The frequency at which the source data may be updated. One of daily,
|
14
|
+
# weekly, monthly or yearly
|
15
|
+
frequency: monthly
|
16
|
+
|
17
|
+
# If this flag is true, our deployment script will set up the bot to
|
18
|
+
# be run on OpenCorporates servers (specifically, deploy the bot and
|
19
|
+
# allow it to be executed from the data pipeline). Please don't change
|
20
|
+
# this; we'll change it when we're ready to test the bot.
|
21
|
+
enabled: false
|
@@ -0,0 +1,112 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'simple_openc_bot'
|
3
|
+
require 'mechanize'
|
4
|
+
|
5
|
+
class BotWithSimpleIterator < SimpleOpencBot

  yields Object

  # Required. In this example it yields no records; it only runs the
  # incrementer demonstrations below, each of which prints URLs.
  #
  # opts - options Hash, merged into the options of every incrementer.
  def fetch_all_records(opts={})

    # The following methods illustrate four common incrementer
    # patterns.

    # If a run is interrupted, it will resume where it left off --
    # unless you pass the reset flag (`bundle exec openc_bot rake
    # bot:run -- --reset`), or a full iteration has previously
    # completed (in which case it will start again).

    # Try running `bundle exec openc_bot rake bot:run`, using CTRL-C
    # to interrupt, and then try resuming.

    increment_over_ascii(opts)
    increment_over_number(opts)
    increment_over_manual(opts)
    combine_incrementers(opts)
  end


  # Prints one URL per value produced by an ASCII incrementer of size 2.
  def increment_over_ascii(opts)
    # Create the incrementer
    ascii_incrementer = OpencBot::AsciiIncrementer.new(
      :ascii_incrementer,
      opts.merge(:size => 2))

    ascii_incrementer.resumable.each do |letters|
      # This will iterate over all two-digit combinations of 0-9 and
      # a-z.
      puts "http://assets.opencorporates.com/test_bot_page_#{letters}.html"
    end
  end

  # Prints one URL per value produced by a numeric incrementer.
  def increment_over_number(opts)
    # Create the incrementer. The class is referenced through the
    # OpencBot namespace, like the other incrementers in this file.
    numeric_incrementer = OpencBot::NumericIncrementer.new(
      :numeric_incrementer,
      opts.merge(
        :start_val => 0,
        :end_val => 20))

    numeric_incrementer.resumable.each do |number|
      # This will iterate over numbers 0 - 20
      puts "http://assets.opencorporates.com/test_bot_page_#{number}.html"
    end
  end

  # Prints one URL per row of a manually populated incrementer.
  def increment_over_manual(opts)
    # Create the incrementer
    manual_incrementer = OpencBot::ManualIncrementer.new(
      :manual_incrementer,
      opts.merge(:fields => [:name]))

    if !manual_incrementer.populated
      # Populate it, if it's not been done before
      manual_incrementer.add_row({"name" => "Bob"})
      manual_incrementer.add_row({"name" => "Sue"})
    end

    # Mark populating as complete.. the `populated` flag is not
    # necessary, but it's useful when debugging to skip slow
    # population steps.
    # NOTE(review): the same `populated` call is used above as the
    # check and here to mark completion -- confirm against
    # ManualIncrementer that this is the intended API.
    manual_incrementer.populated

    manual_incrementer.resumable.each do |row|
      # This will iterate over all the rows added previously.
      puts "http://assets.opencorporates.com/test_bot_page_#{row["name"]}.html"
    end
  end


  # Often you will need to use an iterator to build a list of pages to
  # get, using another iterator. Here an ASCII incrementer supplies the
  # letters, the URLs built for each are stored in a manual
  # incrementer, and the stored URLs are then printed.
  def combine_incrementers(opts)
    ascii_incrementer = OpencBot::AsciiIncrementer.new(
      :ascii_incrementer_2,
      opts.merge(:size => 1))

    manual_incrementer = OpencBot::ManualIncrementer.new(
      :manual_incrementer_2,
      opts.merge(:fields => [:url]))

    ascii_incrementer.resumable.each do |letters|
      get_urls_for_letter_combination(letters).each do |url|
        manual_incrementer.add_row({"url" => url})
      end
    end

    manual_incrementer.resumable.each do |row|
      puts row["url"]
    end

  end

  # Returns an Array of two demonstration URLs for the given letters.
  def get_urls_for_letter_combination(letters)
    # This method might do something like:
    # page = http_client.get("http://somewhere.com/?q=#{letters}")
    # urls = page.xpath("//a/@href")

    # However, for demonstration purposes, we just return:
    ["http://foo.com/#{letters}/1", "http://foo.com/#{letters}/2"]
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'openc_bot'
|
2
|
+
require 'openc_bot/company_fetcher_bot'
|
3
|
+
# We tend to use Nokogiri to parse HTML/XML, but this is optional
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'open-uri'
|
6
|
+
|
7
|
+
module XyCompaniesFetcher
  extend OpencBot
  # This adds the CompanyFetcherBot functionality
  extend OpencBot::CompanyFetcherBot
  extend self # expose these methods as module methods, rather than instance ones


  # Called when the bot is run. This is the only method a bot is
  # required to define, and its only obligation is to save each company
  # as a Hash that conforms to the company-schema
  # (https://github.com/openc/openc_bot/blob/master/schemas/company-schema.json)
  # using the #save_entity method. That method validates the hash and
  # saves it in the database, adding the ISO-3166-2 jurisdiction_code
  # inferred from the name of the module (in this case xy).
  #
  # There are various helpers that we've found useful
  # (see https://github.com/openc/openc_bot/tree/master/lib/openc_bot/helpers)
  # but none of them are required. For example, if you are doing an
  # alpha search ('AA','AB',...) there are intelligent defaults for
  # doing such a search, and in fact you don't even need to write the
  # #update_data method. Similarly for incremental searches (where you
  # are iterating through a series of increasing uids). There are also
  # helpers for normalising dates and text.
  def update_data
    # This code is actually for the Bermuda company register.
    #
    # Fetch the index page and collect the links to every page of companies...
    index_page = Nokogiri.HTML(open( 'https://www.roc.gov.bm/roc/rocweb.nsf/ReviewPublicRegA-Z?OpenForm'))
    a_z_links = index_page.search('a')

    # ...then visit each of them in turn.
    a_z_links.each do |link|
      listing = Nokogiri.HTML(open('https://www.roc.gov.bm' + link[:href]))

      # Every row of the second table is a candidate company.
      listing.search('//table[2]//tr').each do |row|
        # Extract the information; any field that cannot be read is left nil.
        name = row.at_xpath('.//td[2]//a').inner_text.strip rescue nil
        company_number = row.at('td a').inner_text.strip rescue nil
        incorporation_date = row.at('.//td[3]//a').inner_text.to_date.to_s rescue nil

        # Ignore rows from which nothing at all could be extracted.
        next unless name || company_number || incorporation_date

        # Save the entity hash in the local database using the
        # #save_entity helper method, which validates it against the
        # company schema first.
        save_entity(:name => name, :company_number => company_number, :incorporation_date => incorporation_date, :retrieved_at => Time.now.to_s)
      end
    end

  end


end
|