openc_bot 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.travis.yml +8 -0
  4. data/CHANGELOG.md +2 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +253 -0
  8. data/Rakefile +14 -0
  9. data/bin/openc_bot +13 -0
  10. data/create_bot.sh +30 -0
  11. data/create_company_bot.sh +16 -0
  12. data/create_simple_licence_bot.sh +31 -0
  13. data/db/.gitkeep +0 -0
  14. data/examples/basic/.gitignore +3 -0
  15. data/examples/basic/Gemfile +7 -0
  16. data/examples/basic/config.yml +21 -0
  17. data/examples/basic/lib/basic.rb +88 -0
  18. data/examples/basic_with_proxy/Gemfile +7 -0
  19. data/examples/basic_with_proxy/config.yml +21 -0
  20. data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
  21. data/examples/bot_with_simple_iterator/Gemfile +6 -0
  22. data/examples/bot_with_simple_iterator/config.yml +21 -0
  23. data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
  24. data/examples/company_fetchers/basic.rb +49 -0
  25. data/lib/monkey_patches/mechanize.rb +53 -0
  26. data/lib/openc_bot.rb +89 -0
  27. data/lib/openc_bot/bot_data_validator.rb +18 -0
  28. data/lib/openc_bot/company_fetcher_bot.rb +40 -0
  29. data/lib/openc_bot/exceptions.rb +17 -0
  30. data/lib/openc_bot/helpers/_csv.rb +10 -0
  31. data/lib/openc_bot/helpers/alpha_search.rb +73 -0
  32. data/lib/openc_bot/helpers/dates.rb +33 -0
  33. data/lib/openc_bot/helpers/html.rb +8 -0
  34. data/lib/openc_bot/helpers/incremental_search.rb +106 -0
  35. data/lib/openc_bot/helpers/register_methods.rb +205 -0
  36. data/lib/openc_bot/helpers/text.rb +18 -0
  37. data/lib/openc_bot/incrementers.rb +2 -0
  38. data/lib/openc_bot/incrementers/base.rb +214 -0
  39. data/lib/openc_bot/incrementers/common.rb +47 -0
  40. data/lib/openc_bot/tasks.rb +385 -0
  41. data/lib/openc_bot/templates/README.md +35 -0
  42. data/lib/openc_bot/templates/bin/export_data +28 -0
  43. data/lib/openc_bot/templates/bin/fetch_data +23 -0
  44. data/lib/openc_bot/templates/bin/verify_data +1 -0
  45. data/lib/openc_bot/templates/config.yml +21 -0
  46. data/lib/openc_bot/templates/lib/bot.rb +43 -0
  47. data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
  48. data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
  49. data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
  50. data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
  51. data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
  52. data/lib/openc_bot/version.rb +3 -0
  53. data/lib/simple_openc_bot.rb +289 -0
  54. data/openc_bot.gemspec +35 -0
  55. data/schemas/company-schema.json +112 -0
  56. data/schemas/includes/address.json +23 -0
  57. data/schemas/includes/base-statement.json +27 -0
  58. data/schemas/includes/company.json +14 -0
  59. data/schemas/includes/filing.json +20 -0
  60. data/schemas/includes/license-data.json +27 -0
  61. data/schemas/includes/officer.json +14 -0
  62. data/schemas/includes/previous_name.json +11 -0
  63. data/schemas/includes/share-parcel-data.json +67 -0
  64. data/schemas/includes/share-parcel.json +60 -0
  65. data/schemas/includes/subsidiary-relationship-data.json +52 -0
  66. data/schemas/includes/total-shares.json +10 -0
  67. data/schemas/licence-schema.json +21 -0
  68. data/schemas/share-parcel-schema.json +21 -0
  69. data/schemas/subsidiary-relationship-schema.json +19 -0
  70. data/spec/dummy_classes/foo_bot.rb +4 -0
  71. data/spec/lib/bot_data_validator_spec.rb +69 -0
  72. data/spec/lib/company_fetcher_bot_spec.rb +93 -0
  73. data/spec/lib/exceptions_spec.rb +25 -0
  74. data/spec/lib/helpers/alpha_search_spec.rb +173 -0
  75. data/spec/lib/helpers/dates_spec.rb +65 -0
  76. data/spec/lib/helpers/incremental_search_spec.rb +471 -0
  77. data/spec/lib/helpers/register_methods_spec.rb +558 -0
  78. data/spec/lib/helpers/text_spec.rb +50 -0
  79. data/spec/lib/openc_bot/db/.gitkeep +0 -0
  80. data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
  81. data/spec/lib/openc_bot_spec.rb +116 -0
  82. data/spec/schemas/company-schema_spec.rb +676 -0
  83. data/spec/simple_openc_bot_spec.rb +302 -0
  84. data/spec/spec_helper.rb +19 -0
  85. metadata +300 -0
data/db/.gitkeep ADDED
File without changes
@@ -0,0 +1,3 @@
1
+ /db
2
+ /data
3
+ /tmp
@@ -0,0 +1,7 @@
1
+ source 'https://rubygems.org'
2
+ gem 'openc_bot', :git => 'https://github.com/openc/openc_bot.git'
3
+ gem 'mechanize'
4
+ group :test do
5
+ gem 'rspec'
6
+ gem 'debugger'
7
+ end
@@ -0,0 +1,21 @@
1
+ # This is a short description of the kind of data the bot handles.
2
+ description: ''
3
+
4
+ # This is your confidence, between 0 and 100, in the accuracy of the data
5
+ # provided by the data source. It is suggested that you do not change this
6
+ # without first discussing with OpenCorporates.
7
+ default_confidence: 80
8
+
9
+ # Don't change this. OpenCorporates will bump this version number when the bot
10
+ # is running in production.
11
+ version: 0
12
+
13
+ # The frequency at which updates may happen at the source. One of daily,
14
+ # weekly, monthly or yearly
15
+ frequency: monthly
16
+
17
+ # If this flag is true, our deployment script will set up the bot to
18
+ # be run on OpenCorporates servers (specifically, deploy the bot and
19
+ # allow it to be executed from the data pipeline). Please don't change
20
+ # this; we'll change it when we're ready to test the bot.
21
+ enabled: false
@@ -0,0 +1,88 @@
1
+ # encoding: UTF-8
2
+ require 'simple_openc_bot'
3
+ require 'mechanize'
4
+
5
+ # you may need to require other libraries here
6
+ # require 'nokogiri'
7
+
8
+ class BasicRecord < SimpleOpencBot::BaseLicenceRecord
9
+ # The JSON schema to use to validate records; corresponds to the files
10
+ # in `schemas/*-schema.json`
11
+ schema :licence
12
+
13
+ # Fields you define here will be persisted to a local database when
14
+ # 'fetch_records' (see below) is run.
15
+ store_fields :name, :type, :reporting_date
16
+
17
+ # This is the field(s) which will uniquely define a record (think
18
+ # primary key in a database).
19
+ unique_fields :name
20
+
21
+ # These are just example methods and constants used by
22
+ # `to_pipeline`, below
23
+ JURISDICTION = "uk"
24
+ URL = "http://foo.com"
25
+
26
+ def jurisdiction_classification
27
+ type
28
+ end
29
+
30
+ # This must be defined, and should return a timestamp in ISO8601
31
+ # format. Its value should change when something about the record
32
+ # has changed. It doesn't have to be a method - it can also be a
33
+ # member of `store_fields`, above.
34
+ def last_updated_at
35
+ reporting_date
36
+ end
37
+
38
+ # This method must be defined. You can test that you're outputting
39
+ # in the right format with `bin/verify_data`, which will validate
40
+ # any data you've fetched against the relevant schema. See
41
+ # `doc/SCHEMA.md` for documentation.
42
+ def to_pipeline
43
+ {
44
+ sample_date: last_updated_at,
45
+ company: {
46
+ name: name,
47
+ jurisdiction: JURISDICTION,
48
+ },
49
+ source_url: URL,
50
+ data: [{
51
+ data_type: :licence,
52
+ properties: {
53
+ jurisdiction_code: JURISDICTION,
54
+ category: 'Financial',
55
+ jurisdiction_classification: [jurisdiction_classification],
56
+ }
57
+ }]
58
+ }
59
+ end
60
+
61
+ end
62
+
63
+ class Basic < SimpleOpencBot
64
+
65
+ # the class that `fetch_records` yields. Must be defined.
66
+ yields BasicRecord
67
+
68
+ # This method should yield Records. It must be defined.
69
+ def fetch_all_records(opts={})
70
+
71
+ # you can use any client here, e.g. HTTPClient, open-uri, etc.
72
+ agent = Mechanize.new
73
+
74
+ # This is a live page on our website - have a look to see what's going on.
75
+ page = agent.get("http://assets.opencorporates.com/test_bot_page.html")
76
+
77
+ # We tend to use Nokogiri to parse responses, but again this is up
78
+ # to you.
79
+ doc = Nokogiri::HTML(page.body)
80
+ doc.xpath("//li").map do |li|
81
+ name, type = li.content.split(":")
82
+ yield BasicRecord.new(
83
+ :name => name.strip,
84
+ :type => type.strip,
85
+ :reporting_date => Time.now.iso8601(2))
86
+ end
87
+ end
88
+ end
@@ -0,0 +1,7 @@
1
+ source 'https://rubygems.org'
2
+ gem 'openc_bot', :git => 'https://github.com/openc/openc_bot.git'
3
+ gem 'mechanize'
4
+ group :test do
5
+ gem 'rspec'
6
+ gem 'debugger'
7
+ end
@@ -0,0 +1,21 @@
1
+ # This is a short description of the kind of data the bot handles.
2
+ description: ''
3
+
4
+ # This is your confidence, between 0 and 100, in the accuracy of the data
5
+ # provided by the data source. It is suggested that you do not change this
6
+ # without first discussing with OpenCorporates.
7
+ default_confidence: 80
8
+
9
+ # Don't change this. OpenCorporates will bump this version number when the bot
10
+ # is running in production.
11
+ version: 0
12
+
13
+ # The frequency at which updates may happen at the source. One of daily,
14
+ # weekly, monthly or yearly
15
+ frequency: monthly
16
+
17
+ # If this flag is true, our deployment script will set up the bot to
18
+ # be run on OpenCorporates servers (specifically, deploy the bot and
19
+ # allow it to be executed from the data pipeline). Please don't change
20
+ # this; we'll change it when we're ready to test the bot.
21
+ enabled: false
@@ -0,0 +1,103 @@
1
+ # encoding: UTF-8
2
+ require 'simple_openc_bot'
3
+ require 'mechanize'
4
+
5
+ # you may need to require other libraries here
6
+ # require 'nokogiri'
7
+
8
+ class BasicWithProxyRecord < SimpleOpencBot::BaseLicenceRecord
9
+ # The JSON schema to use to validate records; corresponds to the files
10
+ # in `schemas/*-schema.json`
11
+ schema :licence
12
+
13
+ # Fields you define here will be persisted to a local database when
14
+ # 'fetch_records' (see below) is run.
15
+ store_fields :name, :type, :reporting_date
16
+
17
+ # This is the field(s) which will uniquely define a record (think
18
+ # primary key in a database).
19
+ unique_fields :name
20
+
21
+ # These are just example methods and constants used by
22
+ # `to_pipeline`, below
23
+ JURISDICTION = "uk"
24
+ URL = "http://foo.com"
25
+
26
+ def jurisdiction_classification
27
+ type
28
+ end
29
+
30
+ # This must be defined, and should return a timestamp in ISO8601
31
+ # format. Its value should change when something about the record
32
+ # has changed. It doesn't have to be a method - it can also be a
33
+ # member of `store_fields`, above.
34
+ def last_updated_at
35
+ reporting_date
36
+ end
37
+
38
+ # This method must be defined. You can test that you're outputting
39
+ # in the right format with `bin/verify_data`, which will validate
40
+ # any data you've fetched against the relevant schema. See
41
+ # `doc/SCHEMA.md` for documentation.
42
+ def to_pipeline
43
+ {
44
+ sample_date: last_updated_at,
45
+ company: {
46
+ name: name,
47
+ jurisdiction: JURISDICTION,
48
+ },
49
+ source_url: URL,
50
+ data: [{
51
+ data_type: :licence,
52
+ properties: {
53
+ jurisdiction_code: JURISDICTION,
54
+ category: 'Financial',
55
+ jurisdiction_classification: [jurisdiction_classification],
56
+ }
57
+ }]
58
+ }
59
+ end
60
+
61
+ end
62
+
63
+ class BasicWithProxy < SimpleOpencBot
64
+
65
+ # the class that `fetch_records` yields. Must be defined.
66
+ yields BasicWithProxyRecord
67
+
68
+ # This method should yield Records. It must be defined.
69
+ def fetch_all_records(opts={})
70
+
71
+ # you can use any client here, e.g. HTTPClient, open-uri, etc.
72
+ agent = Mechanize.new
73
+
74
+ # This option is set to true when the rake task is called with a
75
+ # --test switch
76
+ if opts[:test_mode]
77
+ # It is recommended to set up a proxy on your computer when
78
+ # developing and debugging bots. It can greatly speed things up
79
+ # by removing the network time from the equation (though things
80
+ # like POSTs won't be cached, anyway)
81
+
82
+ # Different agents have different ways of setting a proxy. This
83
+ # is how Mechanize does it:
84
+ agent.set_proxy 'localhost', 8123
85
+ end
86
+
87
+ # This is a live page on our website - have a look to see what's
88
+ # going on. If you have a proxy set up on your computer, the
89
+ # second time you run this bot, the website won't get hit.
90
+ page = agent.get("http://assets.opencorporates.com/test_bot_page.html")
91
+
92
+ # We tend to use Nokogiri to parse responses, but again this is up
93
+ # to you.
94
+ doc = Nokogiri::HTML(page.body)
95
+ doc.xpath("//li").map do |li|
96
+ name, type = li.content.split(":")
97
+ yield BasicWithProxyRecord.new(
98
+ :name => name.strip,
99
+ :type => type.strip,
100
+ :reporting_date => Time.now.iso8601(2))
101
+ end
102
+ end
103
+ end
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+ gem 'openc_bot', :git => 'https://github.com/openc/openc_bot.git'
3
+ group :test do
4
+ gem 'rspec'
5
+ gem 'debugger'
6
+ end
@@ -0,0 +1,21 @@
1
+ # This is a short description of the kind of data the bot handles.
2
+ description: ''
3
+
4
+ # This is your confidence, between 0 and 100, in the accuracy of the data
5
+ # provided by the data source. It is suggested that you do not change this
6
+ # without first discussing with OpenCorporates.
7
+ default_confidence: 80
8
+
9
+ # Don't change this. OpenCorporates will bump this version number when the bot
10
+ # is running in production.
11
+ version: 0
12
+
13
+ # The frequency at which updates may happen at the source. One of daily,
14
+ # weekly, monthly or yearly
15
+ frequency: monthly
16
+
17
+ # If this flag is true, our deployment script will set up the bot to
18
+ # be run on OpenCorporates servers (specifically, deploy the bot and
19
+ # allow it to be executed from the data pipeline). Please don't change
20
+ # this; we'll change it when we're ready to test the bot.
21
+ enabled: false
@@ -0,0 +1,112 @@
1
+ # encoding: UTF-8
2
+ require 'simple_openc_bot'
3
+ require 'mechanize'
4
+
5
+ class BotWithSimpleIterator < SimpleOpencBot
6
+
7
+ yields Object
8
+
9
+ # This method should return an array of Records. It must be defined.
10
+ def fetch_all_records(opts={})
11
+
12
+ # The following methods illustrate four common incrementer
13
+ # patterns.
14
+
15
+ # If a run is interrupted, it will resume where it left off --
16
+ # unless you pass the reset flag (`bundle exec openc_bot rake
17
+ # bot:run -- --reset`), or a full iteration has previously
18
+ # completed (in which case it will start again)
19
+
20
+ # Try running `bundle exec openc_bot rake bot:run`, using CTRL-C
21
+ # to interrupt, and then try resuming.
22
+
23
+ increment_over_ascii(opts)
24
+ increment_over_number(opts)
25
+ increment_over_manual(opts)
26
+ combine_incrementers(opts)
27
+ end
28
+
29
+
30
+ def increment_over_ascii(opts)
31
+ # Create the incrementer
32
+ ascii_incrementer = OpencBot::AsciiIncrementer.new(
33
+ :ascii_incrementer,
34
+ opts.merge(:size => 2))
35
+
36
+ ascii_incrementer.resumable.each do |letters|
37
+ # This will iterate over all two-character combinations of 0-9 and
38
+ # a-z.
39
+ puts "http://assets.opencorporates.com/test_bot_page_#{letters}.html"
40
+ end
41
+ end
42
+
43
+ def increment_over_number(opts)
44
+ # Create the incrementer
45
+ numeric_incrementer = NumericIncrementer.new(
46
+ :numeric_incrementer,
47
+ opts.merge(
48
+ :start_val => 0,
49
+ :end_val => 20))
50
+
51
+ numeric_incrementer.resumable.each do |number|
52
+ # This will iterate over numbers 0 - 20
53
+ puts "http://assets.opencorporates.com/test_bot_page_#{number}.html"
54
+ end
55
+ end
56
+
57
+ def increment_over_manual(opts)
58
+ # Create the incrementer
59
+ manual_incrementer = OpencBot::ManualIncrementer.new(
60
+ :manual_incrementer,
61
+ opts.merge(:fields => [:name]))
62
+
63
+ if !manual_incrementer.populated
64
+ # Populate it, if it's not been done before
65
+ manual_incrementer.add_row({"name" => "Bob"})
66
+ manual_incrementer.add_row({"name" => "Sue"})
67
+ end
68
+
69
+ # Mark populating as complete. The `populated` flag is not
70
+ # necessary, but it's useful when debugging to skip slow
71
+ # population steps.
72
+ manual_incrementer.populated
73
+
74
+ manual_incrementer.resumable.each do |row|
75
+ # This will iterate over all the rows added previously.
76
+ puts "http://assets.opencorporates.com/test_bot_page_#{row["name"]}.html"
77
+ end
78
+ end
79
+
80
+
81
+ # Often you will need to use an iterator to build a list of pages to
82
+ # get, using another iterator.
83
+ def combine_incrementers(opts)
84
+ ascii_incrementer = OpencBot::AsciiIncrementer.new(
85
+ :ascii_incrementer_2,
86
+ opts.merge(:size => 1))
87
+
88
+ manual_incrementer = OpencBot::ManualIncrementer.new(
89
+ :manual_incrementer_2,
90
+ opts.merge(:fields => [:url]))
91
+
92
+ ascii_incrementer.resumable.each do |letters|
93
+ get_urls_for_letter_combination(letters).each do |url|
94
+ manual_incrementer.add_row({"url" => url})
95
+ end
96
+ end
97
+
98
+ manual_incrementer.resumable.each do |row|
99
+ puts row["url"]
100
+ end
101
+
102
+ end
103
+
104
+ def get_urls_for_letter_combination(letters)
105
+ # This method might do something like:
106
+ # page = http_client.get("http://somewhere.com/?q=#{letters}")
107
+ # urls = page.xpath("//a/@href")
108
+
109
+ # However, for demonstration purposes, we just return:
110
+ ["http://foo.com/#{letters}/1", "http://foo.com/#{letters}/2"]
111
+ end
112
+ end
@@ -0,0 +1,49 @@
1
+ require 'openc_bot'
2
+ require 'openc_bot/company_fetcher_bot'
3
+ # We tend to use Nokogiri to parse HTML/XML, but this is optional
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+
7
+ module XyCompaniesFetcher
8
+ extend OpencBot
9
+ # This adds the CompanyFetcherBot functionality
10
+ extend OpencBot::CompanyFetcherBot
11
+ extend self # make these methods as Module methods, rather than instance ones
12
+
13
+
14
+ # The update_data module method is called when the bot is run. This is the only required method a bot needs,
15
+ # and the only requirement that it needs to satisfy is that it should save a company as a Hash that
16
+ # conforms to the company-schema (https://github.com/openc/openc_bot/blob/master/schemas/company-schema.json)
17
+ # using the #save_entity method. This method validates the hash, and saves in the database, adding the
18
+ # ISO-3166-2 jurisdiction_code inferred from the name of the module (in this case xy)
19
+ #
20
+ # There are various helpers that we've found useful (see https://github.com/openc/openc_bot/tree/master/lib/openc_bot/helpers)
21
+ # but none of them are required. For example, if you are doing an alpha search ('AA','AB',...) there are
22
+ # intelligent defaults for doing such a search, and in fact you don't even need to write the #update_data
23
+ # method. Similarly for incremental searches (where you are iterating through a series of increasing uids).
24
+ # There are also helpers for normalising dates and text.
25
+ def update_data
26
+ # This code is actually for the Bermuda company register
27
+ #
28
+ # Get all the pages containing companies...
29
+ a_z_links = Nokogiri.HTML(open( 'https://www.roc.gov.bm/roc/rocweb.nsf/ReviewPublicRegA-Z?OpenForm')).search('a')
30
+ # iterate through them...
31
+ a_z_links.each do |link|
32
+ page = Nokogiri.HTML(open('https://www.roc.gov.bm' + link[:href]))
33
+ # find all the companies in the table...
34
+ page.search('//table[2]//tr').each do |tr|
35
+ # extract the information
36
+ name = tr.at_xpath('.//td[2]//a').inner_text.strip rescue nil
37
+ company_number = tr.at('td a').inner_text.strip rescue nil
38
+ incorporation_date = tr.at('.//td[3]//a').inner_text.to_date.to_s rescue nil
39
+ next if !name&&!company_number&&!incorporation_date
40
+ # save the entity hash in the local database, using the #save_entity helper method, which
41
+ # validates it against the company schema first
42
+ save_entity(:name => name, :company_number => company_number, :incorporation_date => incorporation_date, :retrieved_at => Time.now.to_s)
43
+ end
44
+ end
45
+
46
+ end
47
+
48
+
49
+ end