openc_bot 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.travis.yml +8 -0
  4. data/CHANGELOG.md +2 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +253 -0
  8. data/Rakefile +14 -0
  9. data/bin/openc_bot +13 -0
  10. data/create_bot.sh +30 -0
  11. data/create_company_bot.sh +16 -0
  12. data/create_simple_licence_bot.sh +31 -0
  13. data/db/.gitkeep +0 -0
  14. data/examples/basic/.gitignore +3 -0
  15. data/examples/basic/Gemfile +7 -0
  16. data/examples/basic/config.yml +21 -0
  17. data/examples/basic/lib/basic.rb +88 -0
  18. data/examples/basic_with_proxy/Gemfile +7 -0
  19. data/examples/basic_with_proxy/config.yml +21 -0
  20. data/examples/basic_with_proxy/lib/basic_with_proxy.rb +103 -0
  21. data/examples/bot_with_simple_iterator/Gemfile +6 -0
  22. data/examples/bot_with_simple_iterator/config.yml +21 -0
  23. data/examples/bot_with_simple_iterator/lib/bot_with_simple_iterator.rb +112 -0
  24. data/examples/company_fetchers/basic.rb +49 -0
  25. data/lib/monkey_patches/mechanize.rb +53 -0
  26. data/lib/openc_bot.rb +89 -0
  27. data/lib/openc_bot/bot_data_validator.rb +18 -0
  28. data/lib/openc_bot/company_fetcher_bot.rb +40 -0
  29. data/lib/openc_bot/exceptions.rb +17 -0
  30. data/lib/openc_bot/helpers/_csv.rb +10 -0
  31. data/lib/openc_bot/helpers/alpha_search.rb +73 -0
  32. data/lib/openc_bot/helpers/dates.rb +33 -0
  33. data/lib/openc_bot/helpers/html.rb +8 -0
  34. data/lib/openc_bot/helpers/incremental_search.rb +106 -0
  35. data/lib/openc_bot/helpers/register_methods.rb +205 -0
  36. data/lib/openc_bot/helpers/text.rb +18 -0
  37. data/lib/openc_bot/incrementers.rb +2 -0
  38. data/lib/openc_bot/incrementers/base.rb +214 -0
  39. data/lib/openc_bot/incrementers/common.rb +47 -0
  40. data/lib/openc_bot/tasks.rb +385 -0
  41. data/lib/openc_bot/templates/README.md +35 -0
  42. data/lib/openc_bot/templates/bin/export_data +28 -0
  43. data/lib/openc_bot/templates/bin/fetch_data +23 -0
  44. data/lib/openc_bot/templates/bin/verify_data +1 -0
  45. data/lib/openc_bot/templates/config.yml +21 -0
  46. data/lib/openc_bot/templates/lib/bot.rb +43 -0
  47. data/lib/openc_bot/templates/lib/company_fetcher_bot.rb +95 -0
  48. data/lib/openc_bot/templates/lib/simple_bot.rb +67 -0
  49. data/lib/openc_bot/templates/spec/bot_spec.rb +11 -0
  50. data/lib/openc_bot/templates/spec/simple_bot_spec.rb +11 -0
  51. data/lib/openc_bot/templates/spec/spec_helper.rb +13 -0
  52. data/lib/openc_bot/version.rb +3 -0
  53. data/lib/simple_openc_bot.rb +289 -0
  54. data/openc_bot.gemspec +35 -0
  55. data/schemas/company-schema.json +112 -0
  56. data/schemas/includes/address.json +23 -0
  57. data/schemas/includes/base-statement.json +27 -0
  58. data/schemas/includes/company.json +14 -0
  59. data/schemas/includes/filing.json +20 -0
  60. data/schemas/includes/license-data.json +27 -0
  61. data/schemas/includes/officer.json +14 -0
  62. data/schemas/includes/previous_name.json +11 -0
  63. data/schemas/includes/share-parcel-data.json +67 -0
  64. data/schemas/includes/share-parcel.json +60 -0
  65. data/schemas/includes/subsidiary-relationship-data.json +52 -0
  66. data/schemas/includes/total-shares.json +10 -0
  67. data/schemas/licence-schema.json +21 -0
  68. data/schemas/share-parcel-schema.json +21 -0
  69. data/schemas/subsidiary-relationship-schema.json +19 -0
  70. data/spec/dummy_classes/foo_bot.rb +4 -0
  71. data/spec/lib/bot_data_validator_spec.rb +69 -0
  72. data/spec/lib/company_fetcher_bot_spec.rb +93 -0
  73. data/spec/lib/exceptions_spec.rb +25 -0
  74. data/spec/lib/helpers/alpha_search_spec.rb +173 -0
  75. data/spec/lib/helpers/dates_spec.rb +65 -0
  76. data/spec/lib/helpers/incremental_search_spec.rb +471 -0
  77. data/spec/lib/helpers/register_methods_spec.rb +558 -0
  78. data/spec/lib/helpers/text_spec.rb +50 -0
  79. data/spec/lib/openc_bot/db/.gitkeep +0 -0
  80. data/spec/lib/openc_bot/incrementers/common_spec.rb +83 -0
  81. data/spec/lib/openc_bot_spec.rb +116 -0
  82. data/spec/schemas/company-schema_spec.rb +676 -0
  83. data/spec/simple_openc_bot_spec.rb +302 -0
  84. data/spec/spec_helper.rb +19 -0
  85. metadata +300 -0
data/openc_bot.gemspec ADDED
@@ -0,0 +1,35 @@
# -*- encoding: utf-8 -*-
#
# Gem specification for openc_bot.
#
# Makes sure the lib/ directory next to this file is on the load path so
# that the version constant can be loaded, then declares the gem's metadata,
# packaged files, executable, require paths and dependencies.
lib_dir = File.expand_path('../lib', __FILE__)
unless $LOAD_PATH.include?(lib_dir)
  $LOAD_PATH.unshift(lib_dir)
end
require 'openc_bot/version'

Gem::Specification.new do |spec|
  # Identity and descriptive metadata. The homepage is left as an empty string.
  spec.name        = 'openc_bot'
  spec.version     = OpencBot::VERSION
  spec.authors     = ['Chris Taggart']
  spec.email       = ['info@opencorporates.com']
  spec.summary     = 'Helper gem for writing external bots for OpenCorporates'
  spec.description = 'This gem is to make the writing and running of bots for OpenCorporates quick and easy'
  spec.homepage    = ''

  # The packaged file list is whatever `git ls-files` prints when this spec
  # is evaluated, split on the input record separator ($/).
  spec.files = `git ls-files`.split($/)

  spec.executables = %w[openc_bot]

  # Test files are the packaged files that live under test/, spec/ or features/.
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})

  # The helpers directory is a require path in its own right, in addition to lib/.
  spec.require_paths = %w[lib lib/openc_bot/helpers]

  # Runtime dependencies, registered in the order listed. Each entry is a gem
  # name optionally followed by a version requirement.
  [
    ['rake'],
    ['activesupport', '3.2.17'],
    ['nokogiri'],
    # ['sqlite3'],
    ['json'],
    ['json-schema'],
    ['httpclient'],
    ['backports'],
    ['scraperwiki', '3.0.2']
  ].each { |requirement| spec.add_dependency(*requirement) }

  # Development-only dependencies.
  %w[perftools.rb debugger rspec].each do |dev_gem|
    spec.add_development_dependency(dev_gem)
  end
end
@@ -0,0 +1,112 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-04/schema#",
3
+ "title": "Company Schema",
4
+ "type": "object",
5
+ "description": "A company in OpenCorporates",
6
+ "properties": {
7
+ "company_number": {
8
+ "type": "string",
9
+ "minLength": 1
10
+ },
11
+ "name": {
12
+ "type": "string",
13
+ "minLength": 1
14
+ },
15
+ "jurisdiction_code": {
16
+ "type": "string",
17
+ "minLength": 2,
18
+ "maxLength": 5
19
+ },
20
+ "incorporation_date": {
21
+ "type": "date"
22
+ },
23
+ "dissolution_date": {
24
+ "type": "date"
25
+ },
26
+ "retrieved_at": {
27
+ "type": "date-time"
28
+ },
29
+ "current_status": {
30
+ "type": "string"
31
+ },
32
+ "company_type": {
33
+ "type": "string"
34
+ },
35
+ "registry_url": {
36
+ "type": "string"
37
+ },
38
+ "registered_address": {
39
+ "$ref": "includes/address.json"
40
+ },
41
+ "officers": {
42
+ "type": "array",
43
+ "items": {
44
+ "$ref": "includes/officer.json"
45
+ }
46
+ },
47
+ "share_parcels": {
48
+ "type": "array",
49
+ "items": {
50
+ "$ref": "includes/share-parcel.json"
51
+ }
52
+ },
53
+ "total_shares": {
54
+ "$ref": "includes/total-shares.json"
55
+ },
56
+ "filings": {
57
+ "type": "array",
58
+ "items": {
59
+ "$ref": "includes/filing.json"
60
+ }
61
+ },
62
+ "previous_names": {
63
+ "type": "array",
64
+ "items": {
65
+ "$ref": "includes/previous_name.json"
66
+ }
67
+ },
68
+ "branch": {
69
+ "type": ["string","null"],
70
+ "description": "A flag to denote whether a company is a branch entity. This should only be set if the company is a type of branch (otherwise should be null). In general the only option here is 'F' for a 'Foreign' branch, i.e. an out-of-jurisdiction entity that has registered as having a presence in the jurisdiction. In the US this is sometimes called a Foreign Corporation",
71
+ "enum": ["F", "L",null]
72
+ },
73
+ "all_attributes": {
74
+ "type": "object",
75
+ "description": "Other arbitrary attributes for a given company",
76
+ "properties": {
77
+ "jurisdiction_of_origin": {
78
+ "type": ["string","null"],
79
+ "description": "The jurisdiction of the 'home' company if this is a branch",
80
+ "minLength": 1
81
+ },
82
+ "home_company_number": {
83
+ "type": ["string","null"],
84
+ "description": "If the entity is a 'branch', this is the company_number of the 'home' company in the home company's jurisdiction",
85
+ "minLength": 1
86
+ },
87
+ "home_legal_name": {
88
+ "type": ["string","null"],
89
+ "description": "The legal name of the 'home' company in its jurisdiction if this is a branch, and the name is different from the legal name of the branch",
90
+ "minLength": 1
91
+ },
92
+ "registered_agent_address": {
93
+ "type": ["string","null"],
94
+ "description": "The address of the 'Agent', a public address to which legal papers can be served",
95
+ "minLength": 1
96
+ },
97
+ "registered_agent_name": {
98
+ "type": ["string","null"],
99
+ "description": "The 'Agent' of the company \u2013 a person or entity that is empowered to accept service for the company",
100
+ "minLength": 1
101
+ },
102
+ "number_of_employees": {
103
+ "type": "number",
104
+ "description": "The number of employees",
105
+ "minimum": 0
106
+ }
107
+ }
108
+ }
109
+ },
110
+ "additionalProperties": false,
111
+ "required": ["company_number", "name", "jurisdiction_code"]
112
+ }
@@ -0,0 +1,23 @@
1
+ { "oneOf":
2
+ [
3
+ {
4
+ "type": ["string","null"],
5
+ "minLength": 2
6
+ },
7
+ {
8
+ "name": "Address",
9
+ "description": "An address object",
10
+ "type": "object",
11
+ "properties": { "street_address": { "type": "string" },
12
+ "locality": { "type": "string" },
13
+ "region": { "type": "string" },
14
+ "postal_code": { "type": "string" },
15
+ "country": { "type": "string" }
16
+ },
17
+ "anyOf": [ { "required": ["street_address"] },
18
+ { "required": ["postal_code"] }
19
+ ]
20
+ }
21
+ ]
22
+ }
23
+
@@ -0,0 +1,27 @@
1
+ {
2
+ "name": "Base statement",
3
+ "description": "A base statement which must be extended by a schema to be used in OpenCorporates",
4
+ "type": "object",
5
+ "properties": {
6
+ "source_url": {
7
+ "description": "Place where this fact can be verified",
8
+ "type": "string"
9
+ },
10
+ "source_jurisdiction": {
11
+ "description": "Jurisdiction of the source of the data",
12
+ "type": "string"
13
+ },
14
+ "sample_date": {
15
+ "description": "Date on which this fact was known to be true",
16
+ "type": "string"
17
+ },
18
+ "company": {
19
+ "$ref" : "../includes/company.json"
20
+ },
21
+ "data": {
22
+ "type": "array",
23
+ "additionalItems": false
24
+ }
25
+ },
26
+ "required": ["source_url", "data", "sample_date", "company"]
27
+ }
@@ -0,0 +1,14 @@
1
+ {
2
+ "name": "Company",
3
+ "description": "A company which is the subject of a statement",
4
+ "type": "object",
5
+ "properties": {
6
+ "name": { "type": "string" },
7
+ "jurisdiction": { "type": "string" },
8
+ "identifier": {
9
+ "type": "string",
10
+ "pattern": "^[^/]+/[^/]+$"
11
+ }
12
+ },
13
+ "required": [ "name", "jurisdiction" ]
14
+ }
@@ -0,0 +1,20 @@
1
+ {
2
+ "name": "Filing",
3
+ "description": "A statutory filing",
4
+ "type": "object",
5
+ "properties": { "title": { "type": "string" },
6
+ "date": { "type": "date" },
7
+ "description": { "type": "string" },
8
+ "uid": { "type": "string" },
9
+ "url": { "type": "string" },
10
+ "filing_type_code": { "type": "string" },
11
+ "filing_type_name": { "type": "string" },
12
+ "other_attributes": { "type": "object" }
13
+ },
14
+ "required": [ "date" ],
15
+ "anyOf": [ { "required": ["title"] },
16
+ { "required": ["description"] },
17
+ { "required": ["filing_type_name"] }
18
+ ]
19
+ }
20
+
@@ -0,0 +1,27 @@
1
+ {
2
+ "name": "License Data",
3
+ "type": "object",
4
+ "properties": {
5
+ "data_type": { "enum": [ "licence" ] },
6
+ "properties": {
7
+ "type": "object",
8
+ "properties": {
9
+ "regulator": {
10
+ "description": "The regulating body that issued the licence",
11
+ "type": "string"},
12
+ "jurisdiction_code": {
13
+ "description": "The jurisdiction for which licence was issued",
14
+ "type": "string"},
15
+ "licence_number": {
16
+ "type": "string"},
17
+ "jurisdiction_classification": {
18
+ "type": "array",
19
+ "minItems": 1},
20
+ "oc_classification": {
21
+ "type": "array"}
22
+ },
23
+ "required": [ "jurisdiction_code", "jurisdiction_classification" ]
24
+ }
25
+ },
26
+ "required": [ "data_type", "properties" ]
27
+ }
@@ -0,0 +1,14 @@
1
+ {
2
+ "name": "Officer",
3
+ "description": "An officer (director, senior executive) of a company",
4
+ "type": "object",
5
+ "properties": { "name": { "type": "string", "minLength": 1 },
6
+ "start_date": { "type": "date" },
7
+ "end_date": { "type": "date" },
8
+ "position": { "type": "string" },
9
+ "other_attributes": { "type": "object" },
10
+ "uid": { "type": "string" }
11
+ },
12
+ "required": [ "name" ]
13
+ }
14
+
@@ -0,0 +1,11 @@
1
+ {
2
+ "name": "PreviousName",
3
+ "description": "A previous name of a company",
4
+ "type": "object",
5
+ "properties": { "company_name": { "type": "string", "minLength": 1 },
6
+ "con_date": { "type": "date" },
7
+ "start_date": { "type": "date" }
8
+ },
9
+ "required": [ "company_name" ]
10
+ }
11
+
@@ -0,0 +1,67 @@
1
+ {
2
+ "name": "Share Parcel Data",
3
+ "type": "object",
4
+ "properties": {
5
+ "data_type": { "enum": [ "share_parcel" ] },
6
+ "properties": {
7
+ "type": "object",
8
+ "required": ["shareholders"],
9
+ "properties": {
10
+ "number_of_shares": {
11
+ "description": "Number of shares, if known",
12
+ "type": "integer"},
13
+ "percentage_of_shares": {
14
+ "description": "Percentage of shares, if known",
15
+ "type": "number",
16
+ "maximum": 100,
17
+ "minimum": 0
18
+ },
19
+ "shareholders": {
20
+ "description": "Legal persons who own this share parcel",
21
+ "type": "array",
22
+ "minItems": 1,
23
+ "items": {
24
+ "anyOf": [
25
+ {"type": "object",
26
+ "name": "person",
27
+ "required": ["name"],
28
+ "properties": {
29
+ "name": {
30
+ "description": "Name of natural person or company",
31
+ "type": "string"
32
+ },
33
+ "jurisdiction": {
34
+ "description": "Jurisdiction of registration, if company",
35
+ "type": "string"
36
+ },
37
+ "company_number": {
38
+ "description": "Company number, if company and known",
39
+ "type": "string"
40
+ },
41
+ "identifier": {
42
+ "description": "Unique identifier of person",
43
+ "type": "string"
44
+ },
45
+ "type": {
46
+ "description": "Type of person if known (company or natural person)",
47
+ "enum" : ["Company", "Person"]
48
+ },
49
+ "address": {
50
+ "description": "Address given for owner of parcel",
51
+ "type": "string"
52
+ },
53
+ "address_country": {
54
+ "description": "Country part of owner's address",
55
+ "type": "string"
56
+ }
57
+
58
+ }
59
+ }
60
+ ]
61
+ }
62
+ }
63
+ }
64
+ }
65
+ },
66
+ "required": [ "data_type", "properties" ]
67
+ }
@@ -0,0 +1,60 @@
1
+ {
2
+ "name": "Share Parcel",
3
+ "type": "object",
4
+ "description": "A parcel of shares in a company",
5
+ "required": ["shareholders"],
6
+ "properties": {
7
+ "number_of_shares": {
8
+ "description": "Number of shares, if known",
9
+ "type": "integer"},
10
+ "percentage_of_shares": {
11
+ "description": "Percentage of shares, if known",
12
+ "type": "number",
13
+ "maximum": 100,
14
+ "minimum": 0
15
+ },
16
+ "shareholders": {
17
+ "description": "Legal persons who own this share parcel",
18
+ "type": "array",
19
+ "minItems": 1,
20
+ "items": {
21
+ "type": "object",
22
+ "name": "shareholder",
23
+ "required": ["name"],
24
+ "properties": {
25
+ "name": {
26
+ "description": "Name of natural person or company",
27
+ "type": "string",
28
+ "minLength": 1
29
+ },
30
+ "jurisdiction": {
31
+ "description": "Jurisdiction of registration, if company",
32
+ "type": "string"
33
+ },
34
+ "company_number": {
35
+ "description": "Company number, if company and known",
36
+ "type": "string"
37
+ },
38
+ "identifier": {
39
+ "description": "Unique identifier of person",
40
+ "type": "string"
41
+ },
42
+ "type": {
43
+ "description": "Type of person if known (company or natural person)",
44
+ "enum" : ["Company", "Person"]
45
+ },
46
+ "address": {
47
+ "description": "Address given for owner of parcel",
48
+ "type": "string"
49
+ },
50
+ "address_country": {
51
+ "description": "Country part of owner's address",
52
+ "type": "string"
53
+ }
54
+
55
+ }
56
+ }
57
+ }
58
+ },
59
+ "required": [ ]
60
+ }
@@ -0,0 +1,52 @@
1
+ {
2
+ "name": "Subsidiary Relationship Data",
3
+ "type": "object",
4
+ "properties": {
5
+ "data_type": { "enum": [ "subsidiary_relationship" ] },
6
+ "properties": {
7
+ "type": "object",
8
+ "required": ["subsidiary"],
9
+ "additionalProperties": false,
10
+ "properties": {
11
+ "direct": {
12
+ "description": "If the control is direct (if via an intermediary, this value should be false; if unknown, left blank)",
13
+ "type": "boolean"},
14
+ "significant": {
15
+ "description": "Does the source define the control as somehow significant?",
16
+ "type": "boolean"},
17
+ "subsidiary": {
18
+ "description": "Company that is controlled",
19
+ "type": "object",
20
+ "required": ["name"],
21
+ "additionalProperties": false,
22
+ "properties": {
23
+ "name": {
24
+ "description": "Name of company",
25
+ "type": "string"
26
+ },
27
+ "jurisdiction": {
28
+ "description": "Jurisdiction of registration",
29
+ "type": "string"
30
+ },
31
+ "company_number": {
32
+ "description": "Company number, if company and known",
33
+ "type": "string"
34
+ },
35
+ "identifier": {
36
+ "description": "Unique identifier of company",
37
+ "type": "string"
38
+ },
39
+ "address": {
40
+ "description": "Address given for subsidiary",
41
+ "type": "string"
42
+ },
43
+ "address_country": {
44
+ "description": "Country part of subsidiary's address",
45
+ "type": "string"
46
+ }
47
+ }
48
+ }
49
+ }
50
+ }
51
+ }
52
+ }