openc_bot 0.0.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (85) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.travis.yml +8 -0
  4. data/CHANGELOG.md +2 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +253 -0
  8. data/Rakefile +14 -0
  9. data/bin/openc_bot +13 -0
  10. data/create_bot.sh +30 -0
  11. data/create_company_bot.sh +16 -0
  12. data/create_simple_licence_bot.sh +31 -0
  13. data/db/.gitkeep +0 -0
  14. data/examples/basic/.gitignore +3 -0
  15. data/examples/basic/Gemfile +7 -0
  16. data/examples/basic/config.yml +21 -0
  17. data/examples/basic/lib/basic.rb +88 -0
  18. data/examples/basic_with_proxy/Gemfile +7 -0
  19. data/examples/basic_with_proxy/config.yml +21 -0
  20. data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
  21. data/examples/bot_with_simple_iterator/Gemfile +6 -0
  22. data/examples/bot_with_simple_iterator/config.yml +21 -0
  23. data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
  24. data/examples/company_fetchers/basic.rb +49 -0
  25. data/lib/monkey_patches/mechanize.rb +53 -0
  26. data/lib/openc_bot.rb +89 -0
  27. data/lib/openc_bot/bot_data_validator.rb +18 -0
  28. data/lib/openc_bot/company_fetcher_bot.rb +40 -0
  29. data/lib/openc_bot/exceptions.rb +17 -0
  30. data/lib/openc_bot/helpers/_csv.rb +10 -0
  31. data/lib/openc_bot/helpers/alpha_search.rb +73 -0
  32. data/lib/openc_bot/helpers/dates.rb +33 -0
  33. data/lib/openc_bot/helpers/html.rb +8 -0
  34. data/lib/openc_bot/helpers/incremental_search.rb +106 -0
  35. data/lib/openc_bot/helpers/register_methods.rb +205 -0
  36. data/lib/openc_bot/helpers/text.rb +18 -0
  37. data/lib/openc_bot/incrementers.rb +2 -0
  38. data/lib/openc_bot/incrementers/base.rb +214 -0
  39. data/lib/openc_bot/incrementers/common.rb +47 -0
  40. data/lib/openc_bot/tasks.rb +385 -0
  41. data/lib/openc_bot/templates/README.md +35 -0
  42. data/lib/openc_bot/templates/bin/export_data +28 -0
  43. data/lib/openc_bot/templates/bin/fetch_data +23 -0
  44. data/lib/openc_bot/templates/bin/verify_data +1 -0
  45. data/lib/openc_bot/templates/config.yml +21 -0
  46. data/lib/openc_bot/templates/lib/bot.rb +43 -0
  47. data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
  48. data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
  49. data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
  50. data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
  51. data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
  52. data/lib/openc_bot/version.rb +3 -0
  53. data/lib/simple_openc_bot.rb +289 -0
  54. data/openc_bot.gemspec +35 -0
  55. data/schemas/company-schema.json +112 -0
  56. data/schemas/includes/address.json +23 -0
  57. data/schemas/includes/base-statement.json +27 -0
  58. data/schemas/includes/company.json +14 -0
  59. data/schemas/includes/filing.json +20 -0
  60. data/schemas/includes/license-data.json +27 -0
  61. data/schemas/includes/officer.json +14 -0
  62. data/schemas/includes/previous_name.json +11 -0
  63. data/schemas/includes/share-parcel-data.json +67 -0
  64. data/schemas/includes/share-parcel.json +60 -0
  65. data/schemas/includes/subsidiary-relationship-data.json +52 -0
  66. data/schemas/includes/total-shares.json +10 -0
  67. data/schemas/licence-schema.json +21 -0
  68. data/schemas/share-parcel-schema.json +21 -0
  69. data/schemas/subsidiary-relationship-schema.json +19 -0
  70. data/spec/dummy_classes/foo_bot.rb +4 -0
  71. data/spec/lib/bot_data_validator_spec.rb +69 -0
  72. data/spec/lib/company_fetcher_bot_spec.rb +93 -0
  73. data/spec/lib/exceptions_spec.rb +25 -0
  74. data/spec/lib/helpers/alpha_search_spec.rb +173 -0
  75. data/spec/lib/helpers/dates_spec.rb +65 -0
  76. data/spec/lib/helpers/incremental_search_spec.rb +471 -0
  77. data/spec/lib/helpers/register_methods_spec.rb +558 -0
  78. data/spec/lib/helpers/text_spec.rb +50 -0
  79. data/spec/lib/openc_bot/db/.gitkeep +0 -0
  80. data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
  81. data/spec/lib/openc_bot_spec.rb +116 -0
  82. data/spec/schemas/company-schema_spec.rb +676 -0
  83. data/spec/simple_openc_bot_spec.rb +302 -0
  84. data/spec/spec_helper.rb +19 -0
  85. metadata +300 -0
data/db/.gitkeep ADDED
File without changes
@@ -0,0 +1,3 @@
1
+ /db
2
+ /data
3
+ /tmp
@@ -0,0 +1,7 @@
1
+ source 'https://rubygems.org'
2
+ gem 'openc_bot', :git => 'https://github.com/openc/openc_bot.git'
3
+ gem 'mechanize'
4
+ group :test do
5
+ gem 'rspec'
6
+ gem 'debugger'
7
+ end
@@ -0,0 +1,21 @@
1
+ # This is a short description of the kind of data the bot handles.
2
+ description: ''
3
+
4
+ # This is your confidence, between 0 and 100, in the accuracy of the data
5
+ # provided by the data source. It is suggested that you do not change this
6
+ # without first discussing with OpenCorporates.
7
+ default_confidence: 80
8
+
9
+ # Don't change this. OpenCorporates will bump this version number when the bot
10
+ # is running in production.
11
+ version: 0
12
+
13
+ # The frequency at which updates may happen at the source. One of daily,
14
+ # weekly, monthly or yearly
15
+ frequency: monthly
16
+
17
+ # If this flag is true, our deployment script will set up the bot to
18
+ # be run on OpenCorporates servers (specifically, deploy the bot and
19
+ # allow it to be executed from the data pipeline). Please don't change
20
+ # this; we'll change it when we're ready to test the bot.
21
+ enabled: false
@@ -0,0 +1,88 @@
1
+ # encoding: UTF-8
2
+ require 'simple_openc_bot'
3
+ require 'mechanize'
4
+
5
+ # you may need to require other libraries here
6
+ # require 'nokogiri'
7
+
8
+ class BasicRecord < SimpleOpencBot::BaseLicenceRecord
9
+ # The JSON schema to use to validate records; corresponds to a file
10
+ # in `schemas/*-schema.json`
11
+ schema :licence
12
+
13
+ # Fields you define here will be persisted to a local database when
14
+ # 'fetch_records' (see below) is run.
15
+ store_fields :name, :type, :reporting_date
16
+
17
+ # This is the field(s) which will uniquely define a record (think
18
+ # primary key in a database).
19
+ unique_fields :name
20
+
21
+ # These are just example methods and constants used by
22
+ # `to_pipeline`, below
23
+ JURISDICTION = "uk"
24
+ URL = "http://foo.com"
25
+
26
+ def jurisdiction_classification
27
+ type
28
+ end
29
+
30
+ # This must be defined, and should return a timestamp in ISO8601
31
+ # format. Its value should change when something about the record
32
+ # has changed. It doesn't have to be a method - it can also be a
33
+ # member of `store_fields`, above.
34
+ def last_updated_at
35
+ reporting_date
36
+ end
37
+
38
+ # This method must be defined. You can test that you're outputting
39
+ # in the right format with `bin/verify_data`, which will validate
40
+ # any data you've fetched against the relevant schema. See
41
+ # `doc/SCHEMA.md` for documentation.
42
+ def to_pipeline
43
+ {
44
+ sample_date: last_updated_at,
45
+ company: {
46
+ name: name,
47
+ jurisdiction: JURISDICTION,
48
+ },
49
+ source_url: URL,
50
+ data: [{
51
+ data_type: :licence,
52
+ properties: {
53
+ jurisdiction_code: JURISDICTION,
54
+ category: 'Financial',
55
+ jurisdiction_classification: [jurisdiction_classification],
56
+ }
57
+ }]
58
+ }
59
+ end
60
+
61
+ end
62
+
63
+ class Basic < SimpleOpencBot
64
+
65
+ # the class that `fetch_records` yields. Must be defined.
66
+ yields BasicRecord
67
+
68
+ # This method should yield Records. It must be defined.
69
+ def fetch_all_records(opts={})
70
+
71
+ # you can use any client here, e.g. HTTPClient, open-uri, etc.
72
+ agent = Mechanize.new
73
+
74
+ # This is a live page on our website - have a look to see what's going on.
75
+ page = agent.get("http://assets.opencorporates.com/test_bot_page.html")
76
+
77
+ # We tend to use Nokogiri to parse responses, but again this is up
78
+ # to you.
79
+ doc = Nokogiri::HTML(page.body)
80
+ doc.xpath("//li").map do |li|
81
+ name, type = li.content.split(":")
82
+ yield BasicRecord.new(
83
+ :name => name.strip,
84
+ :type => type.strip,
85
+ :reporting_date => Time.now.iso8601(2))
86
+ end
87
+ end
88
+ end
@@ -0,0 +1,7 @@
1
+ source 'https://rubygems.org'
2
+ gem 'openc_bot', :git => 'https://github.com/openc/openc_bot.git'
3
+ gem 'mechanize'
4
+ group :test do
5
+ gem 'rspec'
6
+ gem 'debugger'
7
+ end
@@ -0,0 +1,21 @@
1
+ # This is a short description of the kind of data the bot handles.
2
+ description: ''
3
+
4
+ # This is your confidence, between 0 and 100, in the accuracy of the data
5
+ # provided by the data source. It is suggested that you do not change this
6
+ # without first discussing with OpenCorporates.
7
+ default_confidence: 80
8
+
9
+ # Don't change this. OpenCorporates will bump this version number when the bot
10
+ # is running in production.
11
+ version: 0
12
+
13
+ # The frequency at which updates may happen at the source. One of daily,
14
+ # weekly, monthly or yearly
15
+ frequency: monthly
16
+
17
+ # If this flag is true, our deployment script will set up the bot to
18
+ # be run on OpenCorporates servers (specifically, deploy the bot and
19
+ # allow it to be executed from the data pipeline). Please don't change
20
+ # this; we'll change it when we're ready to test the bot.
21
+ enabled: false
@@ -0,0 +1,103 @@
1
+ # encoding: UTF-8
2
+ require 'simple_openc_bot'
3
+ require 'mechanize'
4
+
5
+ # you may need to require other libraries here
6
+ # require 'nokogiri'
7
+
8
+ class BasicWithProxyRecord < SimpleOpencBot::BaseLicenceRecord
9
+ # The JSON schema to use to validate records; corresponds to a file
10
+ # in `schemas/*-schema.json`
11
+ schema :licence
12
+
13
+ # Fields you define here will be persisted to a local database when
14
+ # 'fetch_records' (see below) is run.
15
+ store_fields :name, :type, :reporting_date
16
+
17
+ # This is the field(s) which will uniquely define a record (think
18
+ # primary key in a database).
19
+ unique_fields :name
20
+
21
+ # These are just example methods and constants used by
22
+ # `to_pipeline`, below
23
+ JURISDICTION = "uk"
24
+ URL = "http://foo.com"
25
+
26
+ def jurisdiction_classification
27
+ type
28
+ end
29
+
30
+ # This must be defined, and should return a timestamp in ISO8601
31
+ # format. Its value should change when something about the record
32
+ # has changed. It doesn't have to be a method - it can also be a
33
+ # member of `store_fields`, above.
34
+ def last_updated_at
35
+ reporting_date
36
+ end
37
+
38
+ # This method must be defined. You can test that you're outputting
39
+ # in the right format with `bin/verify_data`, which will validate
40
+ # any data you've fetched against the relevant schema. See
41
+ # `doc/SCHEMA.md` for documentation.
42
+ def to_pipeline
43
+ {
44
+ sample_date: last_updated_at,
45
+ company: {
46
+ name: name,
47
+ jurisdiction: JURISDICTION,
48
+ },
49
+ source_url: URL,
50
+ data: [{
51
+ data_type: :licence,
52
+ properties: {
53
+ jurisdiction_code: JURISDICTION,
54
+ category: 'Financial',
55
+ jurisdiction_classification: [jurisdiction_classification],
56
+ }
57
+ }]
58
+ }
59
+ end
60
+
61
+ end
62
+
63
+ class BasicWithProxy < SimpleOpencBot
64
+
65
+ # the class that `fetch_records` yields. Must be defined.
66
+ yields BasicWithProxyRecord
67
+
68
+ # This method should yield Records. It must be defined.
69
+ def fetch_all_records(opts={})
70
+
71
+ # you can use any client here, e.g. HTTPClient, open-uri, etc.
72
+ agent = Mechanize.new
73
+
74
+ # This option is set to true when the rake task is called with a
75
+ # --test switch
76
+ if opts[:test_mode]
77
+ # It is recommended to set up a proxy on your computer when
78
+ # developing and debugging bots. It can greatly speed things up
79
+ # by removing the network time from the equation (though things
80
+ # like POSTs won't be cached, anyway)
81
+
82
+ # Different agents have different ways of setting a proxy. This
83
+ # is how Mechanize does it:
84
+ agent.set_proxy 'localhost', 8123
85
+ end
86
+
87
+ # This is a live page on our website - have a look to see what's
88
+ # going on. If you have a proxy set up on your computer, the
89
+ # second time you run this bot, the website won't get hit.
90
+ page = agent.get("http://assets.opencorporates.com/test_bot_page.html")
91
+
92
+ # We tend to use Nokogiri to parse responses, but again this is up
93
+ # to you.
94
+ doc = Nokogiri::HTML(page.body)
95
+ doc.xpath("//li").map do |li|
96
+ name, type = li.content.split(":")
97
+ yield BasicWithProxyRecord.new(
98
+ :name => name.strip,
99
+ :type => type.strip,
100
+ :reporting_date => Time.now.iso8601(2))
101
+ end
102
+ end
103
+ end
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+ gem 'openc_bot', :git => 'https://github.com/openc/openc_bot.git'
3
+ group :test do
4
+ gem 'rspec'
5
+ gem 'debugger'
6
+ end
@@ -0,0 +1,21 @@
1
+ # This is a short description of the kind of data the bot handles.
2
+ description: ''
3
+
4
+ # This is your confidence, between 0 and 100, in the accuracy of the data
5
+ # provided by the data source. It is suggested that you do not change this
6
+ # without first discussing with OpenCorporates.
7
+ default_confidence: 80
8
+
9
+ # Don't change this. OpenCorporates will bump this version number when the bot
10
+ # is running in production.
11
+ version: 0
12
+
13
+ # The frequency at which updates may happen at the source. One of daily,
14
+ # weekly, monthly or yearly
15
+ frequency: monthly
16
+
17
+ # If this flag is true, our deployment script will set up the bot to
18
+ # be run on OpenCorporates servers (specifically, deploy the bot and
19
+ # allow it to be executed from the data pipeline). Please don't change
20
+ # this; we'll change it when we're ready to test the bot.
21
+ enabled: false
@@ -0,0 +1,112 @@
1
+ # encoding: UTF-8
2
+ require 'simple_openc_bot'
3
+ require 'mechanize'
4
+
5
+ class BotWithSimpleIterator < SimpleOpencBot
6
+
7
+ yields Object
8
+
9
+ # This method should return an array of Records. It must be defined.
10
+ def fetch_all_records(opts={})
11
+
12
+ # The following methods illustrate four common incrementer
13
+ # patterns.
14
+
15
+ # If a run is interrupted, it will resume where it left off --
16
+ # unless you pass the reset flag (`bundle exec openc_bot rake
17
+ # bot:run -- --reset`), or a full iteration has previously
18
+ # completed (in which case it will start again)
19
+
20
+ # Try running `bundle exec openc_bot rake bot:run`, using CTRL-C
21
+ # to interrupt, and then try resuming.
22
+
23
+ increment_over_ascii(opts)
24
+ increment_over_number(opts)
25
+ increment_over_manual(opts)
26
+ combine_incrementers(opts)
27
+ end
28
+
29
+
30
+ def increment_over_ascii(opts)
31
+ # Create the incrementer
32
+ ascii_incrementer = OpencBot::AsciiIncrementer.new(
33
+ :ascii_incrementer,
34
+ opts.merge(:size => 2))
35
+
36
+ ascii_incrementer.resumable.each do |letters|
37
+ # This will iterate over all two-character combinations of 0-9 and
38
+ # a-z.
39
+ puts "http://assets.opencorporates.com/test_bot_page_#{letters}.html"
40
+ end
41
+ end
42
+
43
+ def increment_over_number(opts)
44
+ # Create the incrementer
45
+ numeric_incrementer = NumericIncrementer.new(
46
+ :numeric_incrementer,
47
+ opts.merge(
48
+ :start_val => 0,
49
+ :end_val => 20))
50
+
51
+ numeric_incrementer.resumable.each do |number|
52
+ # This will iterate over numbers 0 - 20
53
+ puts "http://assets.opencorporates.com/test_bot_page_#{number}.html"
54
+ end
55
+ end
56
+
57
+ def increment_over_manual(opts)
58
+ # Create the incrementer
59
+ manual_incrementer = OpencBot::ManualIncrementer.new(
60
+ :manual_incrementer,
61
+ opts.merge(:fields => [:name]))
62
+
63
+ if !manual_incrementer.populated
64
+ # Populate it, if it's not been done before
65
+ manual_incrementer.add_row({"name" => "Bob"})
66
+ manual_incrementer.add_row({"name" => "Sue"})
67
+ end
68
+
69
+ # Mark populating as complete. The `populated` flag is not
70
+ # necessary, but it's useful when debugging to skip slow
71
+ # population steps.
72
+ manual_incrementer.populated
73
+
74
+ manual_incrementer.resumable.each do |row|
75
+ # This will iterate over all the rows added previously.
76
+ puts "http://assets.opencorporates.com/test_bot_page_#{row["name"]}.html"
77
+ end
78
+ end
79
+
80
+
81
+ # Often you will need to use one incrementer to build the list of pages
82
+ # that a second incrementer then iterates over.
83
+ def combine_incrementers(opts)
84
+ ascii_incrementer = OpencBot::AsciiIncrementer.new(
85
+ :ascii_incrementer_2,
86
+ opts.merge(:size => 1))
87
+
88
+ manual_incrementer = OpencBot::ManualIncrementer.new(
89
+ :manual_incrementer_2,
90
+ opts.merge(:fields => [:url]))
91
+
92
+ ascii_incrementer.resumable.each do |letters|
93
+ get_urls_for_letter_combination(letters).each do |url|
94
+ manual_incrementer.add_row({"url" => url})
95
+ end
96
+ end
97
+
98
+ manual_incrementer.resumable.each do |row|
99
+ puts row["url"]
100
+ end
101
+
102
+ end
103
+
104
+ def get_urls_for_letter_combination(letters)
105
+ # This method might do something like:
106
+ # page = http_client.get("http://somewhere.com/?q=#{letters}")
107
+ # urls = page.xpath("//a/@href")
108
+
109
+ # However, for demonstration purposes, we just return:
110
+ ["http://foo.com/#{letters}/1", "http://foo.com/#{letters}/2"]
111
+ end
112
+ end
@@ -0,0 +1,49 @@
1
+ require 'openc_bot'
2
+ require 'openc_bot/company_fetcher_bot'
3
+ # We tend to use Nokogiri to parse HTML/XML but this is optional
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+
7
+ module XyCompaniesFetcher
8
+ extend OpencBot
9
+ # This adds the CompanyFetcherBot functionality
10
+ extend OpencBot::CompanyFetcherBot
11
+ extend self # make these methods as Module methods, rather than instance ones
12
+
13
+
14
+ # The update_data module method is called when the bot is run. This is the only required method a bot needs,
15
+ # and the only requirement that it needs to satisfy is that it should save a company as a Hash that
16
+ # conforms to the company-schema (https://github.com/openc/openc_bot/blob/master/schemas/company-schema.json)
17
+ # using the #save_entity method. This method validates the hash and saves it in the database, adding the
18
+ # ISO-3166-2 jurisdiction_code inferred from the name of the module (in this case xy)
19
+ #
20
+ # There are various helpers that we've found useful (see https://github.com/openc/openc_bot/tree/master/lib/openc_bot/helpers)
21
+ # but none of them are required. For example, if you are doing an alpha search ('AA','AB',...) there are
22
+ # intelligent defaults for doing such a search, and in fact you don't even need to write the #update_data
23
+ # method. Similarly for incremental searches (where you are iterating through a series of increasing uids).
24
+ # There are also helpers for normalising dates and text.
25
+ def update_data
26
+ # This code is actually for the Bermuda company register
27
+ #
28
+ # Get all the pages containing companies...
29
+ a_z_links = Nokogiri.HTML(open( 'https://www.roc.gov.bm/roc/rocweb.nsf/ReviewPublicRegA-Z?OpenForm')).search('a')
30
+ # iterate through them...
31
+ a_z_links.each do |link|
32
+ page = Nokogiri.HTML(open('https://www.roc.gov.bm' + link[:href]))
33
+ # find all the companies in the table...
34
+ page.search('//table[2]//tr').each do |tr|
35
+ # extract the information
36
+ name = tr.at_xpath('.//td[2]//a').inner_text.strip rescue nil
37
+ company_number = tr.at('td a').inner_text.strip rescue nil
38
+ incorporation_date = tr.at('.//td[3]//a').inner_text.to_date.to_s rescue nil
39
+ next if !name&&!company_number&&!incorporation_date
40
+ # save the entity hash in the local database, using the #save_entity helper method, which
41
+ # validates it against the company schema first
42
+ save_entity(:name => name, :company_number => company_number, :incorporation_date => incorporation_date, :retrieved_at => Time.now.to_s)
43
+ end
44
+ end
45
+
46
+ end
47
+
48
+
49
+ end