openc_bot 0.0.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (85) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.travis.yml +8 -0
  4. data/CHANGELOG.md +2 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +253 -0
  8. data/Rakefile +14 -0
  9. data/bin/openc_bot +13 -0
  10. data/create_bot.sh +30 -0
  11. data/create_company_bot.sh +16 -0
  12. data/create_simple_licence_bot.sh +31 -0
  13. data/db/.gitkeep +0 -0
  14. data/examples/basic/.gitignore +3 -0
  15. data/examples/basic/Gemfile +7 -0
  16. data/examples/basic/config.yml +21 -0
  17. data/examples/basic/lib/basic.rb +88 -0
  18. data/examples/basic_with_proxy/Gemfile +7 -0
  19. data/examples/basic_with_proxy/config.yml +21 -0
  20. data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
  21. data/examples/bot_with_simple_iterator/Gemfile +6 -0
  22. data/examples/bot_with_simple_iterator/config.yml +21 -0
  23. data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
  24. data/examples/company_fetchers/basic.rb +49 -0
  25. data/lib/monkey_patches/mechanize.rb +53 -0
  26. data/lib/openc_bot.rb +89 -0
  27. data/lib/openc_bot/bot_data_validator.rb +18 -0
  28. data/lib/openc_bot/company_fetcher_bot.rb +40 -0
  29. data/lib/openc_bot/exceptions.rb +17 -0
  30. data/lib/openc_bot/helpers/_csv.rb +10 -0
  31. data/lib/openc_bot/helpers/alpha_search.rb +73 -0
  32. data/lib/openc_bot/helpers/dates.rb +33 -0
  33. data/lib/openc_bot/helpers/html.rb +8 -0
  34. data/lib/openc_bot/helpers/incremental_search.rb +106 -0
  35. data/lib/openc_bot/helpers/register_methods.rb +205 -0
  36. data/lib/openc_bot/helpers/text.rb +18 -0
  37. data/lib/openc_bot/incrementers.rb +2 -0
  38. data/lib/openc_bot/incrementers/base.rb +214 -0
  39. data/lib/openc_bot/incrementers/common.rb +47 -0
  40. data/lib/openc_bot/tasks.rb +385 -0
  41. data/lib/openc_bot/templates/README.md +35 -0
  42. data/lib/openc_bot/templates/bin/export_data +28 -0
  43. data/lib/openc_bot/templates/bin/fetch_data +23 -0
  44. data/lib/openc_bot/templates/bin/verify_data +1 -0
  45. data/lib/openc_bot/templates/config.yml +21 -0
  46. data/lib/openc_bot/templates/lib/bot.rb +43 -0
  47. data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
  48. data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
  49. data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
  50. data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
  51. data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
  52. data/lib/openc_bot/version.rb +3 -0
  53. data/lib/simple_openc_bot.rb +289 -0
  54. data/openc_bot.gemspec +35 -0
  55. data/schemas/company-schema.json +112 -0
  56. data/schemas/includes/address.json +23 -0
  57. data/schemas/includes/base-statement.json +27 -0
  58. data/schemas/includes/company.json +14 -0
  59. data/schemas/includes/filing.json +20 -0
  60. data/schemas/includes/license-data.json +27 -0
  61. data/schemas/includes/officer.json +14 -0
  62. data/schemas/includes/previous_name.json +11 -0
  63. data/schemas/includes/share-parcel-data.json +67 -0
  64. data/schemas/includes/share-parcel.json +60 -0
  65. data/schemas/includes/subsidiary-relationship-data.json +52 -0
  66. data/schemas/includes/total-shares.json +10 -0
  67. data/schemas/licence-schema.json +21 -0
  68. data/schemas/share-parcel-schema.json +21 -0
  69. data/schemas/subsidiary-relationship-schema.json +19 -0
  70. data/spec/dummy_classes/foo_bot.rb +4 -0
  71. data/spec/lib/bot_data_validator_spec.rb +69 -0
  72. data/spec/lib/company_fetcher_bot_spec.rb +93 -0
  73. data/spec/lib/exceptions_spec.rb +25 -0
  74. data/spec/lib/helpers/alpha_search_spec.rb +173 -0
  75. data/spec/lib/helpers/dates_spec.rb +65 -0
  76. data/spec/lib/helpers/incremental_search_spec.rb +471 -0
  77. data/spec/lib/helpers/register_methods_spec.rb +558 -0
  78. data/spec/lib/helpers/text_spec.rb +50 -0
  79. data/spec/lib/openc_bot/db/.gitkeep +0 -0
  80. data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
  81. data/spec/lib/openc_bot_spec.rb +116 -0
  82. data/spec/schemas/company-schema_spec.rb +676 -0
  83. data/spec/simple_openc_bot_spec.rb +302 -0
  84. data/spec/spec_helper.rb +19 -0
  85. metadata +300 -0
data/openc_bot.gemspec ADDED
@@ -0,0 +1,35 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'openc_bot/version'
5
+
6
+ Gem::Specification.new do |gem|
7
+ gem.name = "openc_bot"
8
+ gem.version = OpencBot::VERSION
9
+ gem.authors = ["Chris Taggart"]
10
+ gem.email = ["info@opencorporates.com"]
11
+ gem.description = %q{This gem is to make the writing and running of bots for OpenCorporates quick and easy}
12
+ gem.summary = %q{Helper gem for writing external bots for OpenCorporates}
13
+ gem.homepage = ""
14
+
15
+ gem.files = `git ls-files`.split($/)
16
+
17
+ gem.executables = ['openc_bot']
18
+
19
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
20
+ gem.require_paths = ["lib",'lib/openc_bot/helpers']
21
+
22
+ gem.add_dependency "rake"
23
+ gem.add_dependency "activesupport", "3.2.17"
24
+ gem.add_dependency "nokogiri"
25
+ # gem.add_dependency "sqlite3"
26
+ gem.add_dependency "json"
27
+ gem.add_dependency "json-schema"
28
+ gem.add_dependency "httpclient"
29
+ gem.add_dependency "backports"
30
+ gem.add_dependency "scraperwiki", "3.0.2"
31
+
32
+ gem.add_development_dependency "perftools.rb"
33
+ gem.add_development_dependency "debugger"
34
+ gem.add_development_dependency "rspec"
35
+ end
@@ -0,0 +1,112 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-04/schema#",
3
+ "title": "Company Schema",
4
+ "type": "object",
5
+ "description": "A company in OpenCorporates",
6
+ "properties": {
7
+ "company_number": {
8
+ "type": "string",
9
+ "minLength": 1
10
+ },
11
+ "name": {
12
+ "type": "string",
13
+ "minLength": 1
14
+ },
15
+ "jurisdiction_code": {
16
+ "type": "string",
17
+ "minLength": 2,
18
+ "maxLength": 5
19
+ },
20
+ "incorporation_date": {
21
+ "type": "date"
22
+ },
23
+ "dissolution_date": {
24
+ "type": "date"
25
+ },
26
+ "retrieved_at": {
27
+ "type": "date-time"
28
+ },
29
+ "current_status": {
30
+ "type": "string"
31
+ },
32
+ "company_type": {
33
+ "type": "string"
34
+ },
35
+ "registry_url": {
36
+ "type": "string"
37
+ },
38
+ "registered_address": {
39
+ "$ref": "includes/address.json"
40
+ },
41
+ "officers": {
42
+ "type": "array",
43
+ "items": {
44
+ "$ref": "includes/officer.json"
45
+ }
46
+ },
47
+ "share_parcels": {
48
+ "type": "array",
49
+ "items": {
50
+ "$ref": "includes/share-parcel.json"
51
+ }
52
+ },
53
+ "total_shares": {
54
+ "$ref": "includes/total-shares.json"
55
+ },
56
+ "filings": {
57
+ "type": "array",
58
+ "items": {
59
+ "$ref": "includes/filing.json"
60
+ }
61
+ },
62
+ "previous_names": {
63
+ "type": "array",
64
+ "items": {
65
+ "$ref": "includes/previous_name.json"
66
+ }
67
+ },
68
+ "branch": {
69
+ "type": ["string","null"],
70
+ "description": "A flag to denote whether a company is a branch entity. This should only be set if the company is a type of branch (otherwise should be null). In general the only option here is 'F' for a 'Foreign' branch, i.e. an out-of-jurisdiction entity that has registered as having a presence in the jurisdiction. In the US this is sometimes called a Foreign Corporation",
71
+ "enum": ["F", "L",null]
72
+ },
73
+ "all_attributes": {
74
+ "type": "object",
75
+ "description": "Other arbitrary attributes for a given company",
76
+ "properties": {
77
+ "jurisdiction_of_origin": {
78
+ "type": ["string","null"],
79
+ "description": "The jurisdiction of the 'home' company if this is a branch",
80
+ "minLength": 1
81
+ },
82
+ "home_company_number": {
83
+ "type": ["string","null"],
84
+ "description": "If the entity is a 'branch', this is the company_number of the 'home' company in the home company's jurisdiction",
85
+ "minLength": 1
86
+ },
87
+ "home_legal_name": {
88
+ "type": ["string","null"],
89
+ "description": "The legal name of the 'home' company in its jurisdiction if this is a branch, and the name is different from the legal name of the branch",
90
+ "minLength": 1
91
+ },
92
+ "registered_agent_address": {
93
+ "type": ["string","null"],
94
+ "description": "The address of the 'Agent', a public address to which legal papers can be served",
95
+ "minLength": 1
96
+ },
97
+ "registered_agent_name": {
98
+ "type": ["string","null"],
99
+ "description": "The 'Agent' of the company \u2013 a person or entity that is empowered to accept service for the company",
100
+ "minLength": 1
101
+ },
102
+ "number_of_employees": {
103
+ "type": "number",
104
+ "description": "The number of employees",
105
+ "minimum": 0
106
+ }
107
+ }
108
+ }
109
+ },
110
+ "additionalProperties": false,
111
+ "required": ["company_number", "name", "jurisdiction_code"]
112
+ }
@@ -0,0 +1,23 @@
1
+ { "oneOf":
2
+ [
3
+ {
4
+ "type": ["string","null"],
5
+ "minLength": 2
6
+ },
7
+ {
8
+ "name": "Address",
9
+ "description": "An address object",
10
+ "type": "object",
11
+ "properties": { "street_address": { "type": "string" },
12
+ "locality": { "type": "string" },
13
+ "region": { "type": "string" },
14
+ "postal_code": { "type": "string" },
15
+ "country": { "type": "string" }
16
+ },
17
+ "anyOf": [ { "required": ["street_address"] },
18
+ { "required": ["postal_code"] }
19
+ ]
20
+ }
21
+ ]
22
+ }
23
+
@@ -0,0 +1,27 @@
1
+ {
2
+ "name": "Base statement",
3
+ "description": "A base statement which must be extended by a schema to be used in OpenCorporates",
4
+ "type": "object",
5
+ "properties": {
6
+ "source_url": {
7
+ "description": "Place where this fact can be verified",
8
+ "type": "string"
9
+ },
10
+ "source_jurisdiction": {
11
+ "description": "Jurisdiction of the source of the data",
12
+ "type": "string"
13
+ },
14
+ "sample_date": {
15
+ "description": "Date on which this fact was known to be true",
16
+ "type": "string"
17
+ },
18
+ "company": {
19
+ "$ref" : "../includes/company.json"
20
+ },
21
+ "data": {
22
+ "type": "array",
23
+ "additionalItems": false
24
+ }
25
+ },
26
+ "required": ["source_url", "data", "sample_date", "company"]
27
+ }
@@ -0,0 +1,14 @@
1
+ {
2
+ "name": "Company",
3
+ "description": "A company which is the subject of a statement",
4
+ "type": "object",
5
+ "properties": {
6
+ "name": { "type": "string" },
7
+ "jurisdiction": { "type": "string" },
8
+ "identifier": {
9
+ "type": "string",
10
+ "pattern": "^[^/]+/[^/]+$"
11
+ }
12
+ },
13
+ "required": [ "name", "jurisdiction" ]
14
+ }
@@ -0,0 +1,20 @@
1
+ {
2
+ "name": "Filing",
3
+ "description": "A statutory filing",
4
+ "type": "object",
5
+ "properties": { "title": { "type": "string" },
6
+ "date": { "type": "date" },
7
+ "description": { "type": "string" },
8
+ "uid": { "type": "string" },
9
+ "url": { "type": "string" },
10
+ "filing_type_code": { "type": "string" },
11
+ "filing_type_name": { "type": "string" },
12
+ "other_attributes": { "type": "object" }
13
+ },
14
+ "required": [ "date" ],
15
+ "anyOf": [ { "required": ["title"] },
16
+ { "required": ["description"] },
17
+ { "required": ["filing_type_name"] }
18
+ ]
19
+ }
20
+
@@ -0,0 +1,27 @@
1
+ {
2
+ "name": "License Data",
3
+ "type": "object",
4
+ "properties": {
5
+ "data_type": { "enum": [ "licence" ] },
6
+ "properties": {
7
+ "type": "object",
8
+ "properties": {
9
+ "regulator": {
10
+ "description": "The regulating body that issued the licence",
11
+ "type": "string"},
12
+ "jurisdiction_code": {
13
+ "description": "The jurisdiction for which licence was issued",
14
+ "type": "string"},
15
+ "licence_number": {
16
+ "type": "string"},
17
+ "jurisdiction_classification": {
18
+ "type": "array",
19
+ "minItems": 1},
20
+ "oc_classification": {
21
+ "type": "array"}
22
+ },
23
+ "required": [ "jurisdiction_code", "jurisdiction_classification" ]
24
+ }
25
+ },
26
+ "required": [ "data_type", "properties" ]
27
+ }
@@ -0,0 +1,14 @@
1
+ {
2
+ "name": "Officer",
3
+ "description": "An officer (director, senior executive) of a company",
4
+ "type": "object",
5
+ "properties": { "name": { "type": "string", "minLength": 1 },
6
+ "start_date": { "type": "date" },
7
+ "end_date": { "type": "date" },
8
+ "position": { "type": "string" },
9
+ "other_attributes": { "type": "object" },
10
+ "uid": { "type": "string" }
11
+ },
12
+ "required": [ "name" ]
13
+ }
14
+
@@ -0,0 +1,11 @@
1
+ {
2
+ "name": "PreviousName",
3
+ "description": "A previous name of a company",
4
+ "type": "object",
5
+ "properties": { "company_name": { "type": "string", "minLength": 1 },
6
+ "con_date": { "type": "date" },
7
+ "start_date": { "type": "date" }
8
+ },
9
+ "required": [ "company_name" ]
10
+ }
11
+
@@ -0,0 +1,67 @@
1
+ {
2
+ "name": "Share Parcel Data",
3
+ "type": "object",
4
+ "properties": {
5
+ "data_type": { "enum": [ "share_parcel" ] },
6
+ "properties": {
7
+ "type": "object",
8
+ "required": ["shareholders"],
9
+ "properties": {
10
+ "number_of_shares": {
11
+ "description": "Number of shares, if known",
12
+ "type": "integer"},
13
+ "percentage_of_shares": {
14
+ "description": "Percentage of shares, if known",
15
+ "type": "number",
16
+ "maximum": 100,
17
+ "minimum": 0
18
+ },
19
+ "shareholders": {
20
+ "description": "Legal persons who own this share parcel",
21
+ "type": "array",
22
+ "minItems": 1,
23
+ "items": {
24
+ "anyOf": [
25
+ {"type": "object",
26
+ "name": "person",
27
+ "required": ["name"],
28
+ "properties": {
29
+ "name": {
30
+ "description": "Name of natural person or company",
31
+ "type": "string"
32
+ },
33
+ "jurisdiction": {
34
+ "description": "Jurisdiction of registration, if company",
35
+ "type": "string"
36
+ },
37
+ "company_number": {
38
+ "description": "Company number, if company and known",
39
+ "type": "string"
40
+ },
41
+ "identifier": {
42
+ "description": "Unique identifier of person",
43
+ "type": "string"
44
+ },
45
+ "type": {
46
+ "description": "Type of person if known (company or natural person)",
47
+ "enum" : ["Company", "Person"]
48
+ },
49
+ "address": {
50
+ "description": "Address given for owner of parcel",
51
+ "type": "string"
52
+ },
53
+ "address_country": {
54
+ "description": "Country part of owner's address",
55
+ "type": "string"
56
+ }
57
+
58
+ }
59
+ }
60
+ ]
61
+ }
62
+ }
63
+ }
64
+ }
65
+ },
66
+ "required": [ "data_type", "properties" ]
67
+ }
@@ -0,0 +1,60 @@
1
+ {
2
+ "name": "Share Parcel",
3
+ "type": "object",
4
+ "description": "A parcel of shares in a company",
5
+ "required": ["shareholders"],
6
+ "properties": {
7
+ "number_of_shares": {
8
+ "description": "Number of shares, if known",
9
+ "type": "integer"},
10
+ "percentage_of_shares": {
11
+ "description": "Percentage of shares, if known",
12
+ "type": "number",
13
+ "maximum": 100,
14
+ "minimum": 0
15
+ },
16
+ "shareholders": {
17
+ "description": "Legal persons who own this share parcel",
18
+ "type": "array",
19
+ "minItems": 1,
20
+ "items": {
21
+ "type": "object",
22
+ "name": "shareholder",
23
+ "required": ["name"],
24
+ "properties": {
25
+ "name": {
26
+ "description": "Name of natural person or company",
27
+ "type": "string",
28
+ "minLength": 1
29
+ },
30
+ "jurisdiction": {
31
+ "description": "Jurisdiction of registration, if company",
32
+ "type": "string"
33
+ },
34
+ "company_number": {
35
+ "description": "Company number, if company and known",
36
+ "type": "string"
37
+ },
38
+ "identifier": {
39
+ "description": "Unique identifier of person",
40
+ "type": "string"
41
+ },
42
+ "type": {
43
+ "description": "Type of person if known (company or natural person)",
44
+ "enum" : ["Company", "Person"]
45
+ },
46
+ "address": {
47
+ "description": "Address given for owner of parcel",
48
+ "type": "string"
49
+ },
50
+ "address_country": {
51
+ "description": "Country part of owner's address",
52
+ "type": "string"
53
+ }
54
+
55
+ }
56
+ }
57
+ }
58
+ },
59
+ "required": ["shareholders"]
60
+ }
@@ -0,0 +1,52 @@
1
+ {
2
+ "name": "Subsidiary Relationship Data",
3
+ "type": "object",
4
+ "properties": {
5
+ "data_type": { "enum": [ "subsidiary_relationship" ] },
6
+ "properties": {
7
+ "type": "object",
8
+ "required": ["subsidiary"],
9
+ "additionalProperties": false,
10
+ "properties": {
11
+ "direct": {
12
+ "description": "If the control is direct (if via an intermediary, this value should be false; if unknown, left blank)",
13
+ "type": "boolean"},
14
+ "significant": {
15
+ "description": "Does the source define the control as somehow significant?",
16
+ "type": "boolean"},
17
+ "subsidiary": {
18
+ "description": "Company that is controlled",
19
+ "type": "object",
20
+ "required": ["name"],
21
+ "additionalProperties": false,
22
+ "properties": {
23
+ "name": {
24
+ "description": "Name of company",
25
+ "type": "string"
26
+ },
27
+ "jurisdiction": {
28
+ "description": "Jurisdiction of registration",
29
+ "type": "string"
30
+ },
31
+ "company_number": {
32
+ "description": "Company number, if company and known",
33
+ "type": "string"
34
+ },
35
+ "identifier": {
36
+ "description": "Unique identifier of company",
37
+ "type": "string"
38
+ },
39
+ "address": {
40
+ "description": "Address given for the subsidiary",
41
+ "type": "string"
42
+ },
43
+ "address_country": {
44
+ "description": "Country part of the subsidiary's address",
45
+ "type": "string"
46
+ }
47
+ }
48
+ }
49
+ }
50
+ }
51
+ }
52
+ }