turbot-runner 0.1.23 → 0.1.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. checksums.yaml +8 -8
  2. data/lib/turbot_runner.rb +0 -2
  3. data/lib/turbot_runner/processor.rb +7 -70
  4. data/lib/turbot_runner/version.rb +1 -1
  5. data/schema/schemas/company-schema.json +69 -22
  6. data/schema/schemas/financial-payment-schema.json +12 -5
  7. data/schema/schemas/includes/address.json +36 -13
  8. data/schema/schemas/includes/alternative_name.json +32 -9
  9. data/schema/schemas/includes/company-for-nesting.json +70 -23
  10. data/schema/schemas/includes/company.json +15 -6
  11. data/schema/schemas/includes/filing.json +48 -16
  12. data/schema/schemas/includes/financial-payment-data-object.json +73 -21
  13. data/schema/schemas/includes/identifier.json +14 -8
  14. data/schema/schemas/includes/industry_code.json +25 -12
  15. data/schema/schemas/includes/licence-data-object.json +40 -13
  16. data/schema/schemas/includes/officer.json +66 -46
  17. data/schema/schemas/includes/organisation.json +6 -5
  18. data/schema/schemas/includes/permission.json +36 -17
  19. data/schema/schemas/includes/person.json +14 -13
  20. data/schema/schemas/includes/person_name.json +36 -12
  21. data/schema/schemas/includes/previous_name.json +19 -9
  22. data/schema/schemas/includes/share-parcel-data.json +55 -40
  23. data/schema/schemas/includes/share-parcel.json +71 -56
  24. data/schema/schemas/includes/subsidiary-relationship-data.json +19 -8
  25. data/schema/schemas/includes/total-shares.json +13 -6
  26. data/schema/schemas/includes/unknown_entity_type.json +6 -5
  27. data/schema/schemas/licence-schema.json +157 -17
  28. data/schema/schemas/primary-data-schema.json +18 -16
  29. data/schema/schemas/share-parcel-schema.json +8 -7
  30. data/schema/schemas/simple-financial-payment-schema.json +47 -11
  31. data/schema/schemas/simple-licence-schema.json +21 -6
  32. data/schema/schemas/simple-subsidiary-schema.json +15 -4
  33. data/schema/schemas/subsidiary-relationship-schema.json +26 -7
  34. data/spec/lib/processor_spec.rb +3 -75
  35. metadata +5 -10
  36. data/lib/turbot_runner/validator.rb +0 -77
  37. data/lib/turbot_runner/validators.rb +0 -12
  38. data/schema/schemas/rich-licence-schema.json +0 -103
  39. data/spec/lib/validator_spec.rb +0 -235
  40. data/spec/lib/validators_spec.rb +0 -48
@@ -1,21 +1,22 @@
1
1
  {
2
2
  "$schema": "http://json-schema.org/draft-04/schema#",
3
- "title": "Share Parcel Schema",
4
3
  "type": "object",
5
- "allOf" : [
6
- // The following is a basic statement with sample_date, etc
7
- { "$ref": "includes/base-statement.json" },
8
- // And this overrides it to provide data-type-specific information
4
+ "allOf": [
5
+ {
6
+ "$ref": "includes/base-statement.json"
7
+ },
9
8
  {
10
9
  "properties": {
11
10
  "data": {
12
11
  "items": {
13
12
  "allOf": [
14
- { "$ref": "includes/share-parcel-data.json" }
13
+ {
14
+ "$ref": "includes/share-parcel-data.json"
15
+ }
15
16
  ]
16
17
  }
17
18
  }
18
19
  }
19
20
  }
20
21
  ]
21
- }
22
+ }
@@ -1,6 +1,5 @@
1
1
  {
2
2
  "$schema": "http://json-schema.org/draft-04/schema#",
3
- "title": "Simple Financial Payment",
4
3
  "description": "A Financial Payment is a payment from government to a recipient",
5
4
  "type": "object",
6
5
  "properties": {
@@ -16,7 +15,11 @@
16
15
  "confidence": {
17
16
  "description": "Confidence in accuracy of data",
18
17
  "type": "string",
19
- "enum": ["HIGH", "MEDIUM", "LOW"]
18
+ "enum": [
19
+ "HIGH",
20
+ "MEDIUM",
21
+ "LOW"
22
+ ]
20
23
  },
21
24
  "company_name": {
22
25
  "description": "Name of the company that received the money",
@@ -52,19 +55,32 @@
52
55
  },
53
56
  "more_details_url": {
54
57
  "description": "A url from which more details can be seen (may be the same as the source_url)",
55
- "type": ["string",null]
58
+ "type": [
59
+ "string",
60
+ null
61
+ ]
56
62
  },
57
63
  "description": {
58
64
  "description": "The description of the transaction as given in the raw data",
59
- "type": ["string",null]
65
+ "type": [
66
+ "string",
67
+ null
68
+ ]
60
69
  },
61
70
  "expense_type": {
62
71
  "description": "The type of expense -- can be either capital, revenue (i.e. current expenditure) or null",
63
- "enum": ["capital","revenue",null]
72
+ "enum": [
73
+ "capital",
74
+ "revenue",
75
+ null
76
+ ]
64
77
  },
65
78
  "expense_area": {
66
79
  "description": "category (in words) of the expenditure",
67
- "type": ["string",null]
80
+ "type": [
81
+ "string",
82
+ null
83
+ ]
68
84
  },
69
85
  "entity_name": {
70
86
  "description": "The name of the government entity that made the payment, e.g. Environment Agency",
@@ -72,15 +88,35 @@
72
88
  },
73
89
  "entity_uri": {
74
90
  "description": "A unique URL (ideally a dereferencable URI) for the government entity",
75
- "type": ["string",null]
91
+ "type": [
92
+ "string",
93
+ null
94
+ ]
76
95
  },
77
96
  "department_name": {
78
97
  "description": "The name of the government department which the entity belongs to (if relevant), e.g. Department of Health",
79
- "type": ["string",null]
98
+ "type": [
99
+ "string",
100
+ null
101
+ ]
80
102
  },
81
103
  "csv_line_number": {
82
104
  "description": "If the source for the data is a CSV file, you can optionally include the line number of the CSV from which this data was retrieved",
83
- "type": ["string",null]}
105
+ "type": [
106
+ "string",
107
+ null
108
+ ]
109
+ }
84
110
  },
85
- "required": ["source_url", "sample_date", "company_name", "company_jurisdiction", "value", "payee_name", "date", "currency"]
86
- }
111
+ "additionalProperties": false,
112
+ "required": [
113
+ "source_url",
114
+ "sample_date",
115
+ "company_name",
116
+ "company_jurisdiction",
117
+ "value",
118
+ "payee_name",
119
+ "date",
120
+ "currency"
121
+ ]
122
+ }
@@ -1,6 +1,5 @@
1
1
  {
2
2
  "$schema": "http://json-schema.org/draft-04/schema#",
3
- "title": "Simple Licence Schema",
4
3
  "type": "object",
5
4
  "properties": {
6
5
  "source_url": {
@@ -16,7 +15,11 @@
16
15
  "confidence": {
17
16
  "description": "Confidence in accuracy of data",
18
17
  "type": "string",
19
- "enum": ["HIGH", "MEDIUM", "LOW"]
18
+ "enum": [
19
+ "HIGH",
20
+ "MEDIUM",
21
+ "LOW"
22
+ ]
20
23
  },
21
24
  "company_name": {
22
25
  "description": "Name of the company holding the licence",
@@ -34,7 +37,10 @@
34
37
  },
35
38
  "jurisdiction_classification": {
36
39
  "description": "Description of how regulator classifies licence",
37
- "type": ["string", "array"]
40
+ "type": [
41
+ "string",
42
+ "array"
43
+ ]
38
44
  },
39
45
  "regulator": {
40
46
  "description": "The regulating body that issued the licence",
@@ -52,8 +58,17 @@
52
58
  "category": {
53
59
  "description": "Category of licence",
54
60
  "type": "string",
55
- "enum": ["Financial", "Business"]
61
+ "enum": [
62
+ "Financial",
63
+ "Business"
64
+ ]
56
65
  }
57
66
  },
58
- "required": ["source_url", "sample_date", "company_name", "company_jurisdiction"]
59
- }
67
+ "additionalProperties": false,
68
+ "required": [
69
+ "source_url",
70
+ "sample_date",
71
+ "company_name",
72
+ "company_jurisdiction"
73
+ ]
74
+ }
@@ -1,6 +1,5 @@
1
1
  {
2
2
  "$schema": "http://json-schema.org/draft-04/schema#",
3
- "title": "Simple Subsidiary Schema",
4
3
  "type": "object",
5
4
  "properties": {
6
5
  "source_url": {
@@ -26,7 +25,11 @@
26
25
  "confidence": {
27
26
  "description": "Confidence in accuracy of data",
28
27
  "type": "string",
29
- "enum": ["HIGH", "MEDIUM", "LOW"]
28
+ "enum": [
29
+ "HIGH",
30
+ "MEDIUM",
31
+ "LOW"
32
+ ]
30
33
  },
31
34
  "parent_name": {
32
35
  "description": "Name of the controlling company",
@@ -66,5 +69,13 @@
66
69
  "type": "string"
67
70
  }
68
71
  },
69
- "required": ["source_url", "sample_date", "parent_name", "parent_jurisdiction", "subsidiary_name", "subsidiary_jurisdiction"]
70
- }
72
+ "additionalProperties": false,
73
+ "required": [
74
+ "source_url",
75
+ "sample_date",
76
+ "parent_name",
77
+ "parent_jurisdiction",
78
+ "subsidiary_name",
79
+ "subsidiary_jurisdiction"
80
+ ]
81
+ }
@@ -1,12 +1,26 @@
1
1
  {
2
- "title": "Subsidiary Relationship",
3
- "description": "A relationship of control between two companies",
4
2
  "$schema": "http://json-schema.org/draft-04/schema#",
3
+ "description": "A relationship of control between two companies",
5
4
  "type": "object",
6
5
  "properties": {
7
- "sample_date": {"type": "string", "format": "date"},
8
- "start_date": {"type": "string", "format": "date"},
9
- "end_date": {"type": "string", "format": "date"},
6
+ "sample_date": {
7
+ "type": "string",
8
+ "format": "date"
9
+ },
10
+ "start_date": {
11
+ "type": "string",
12
+ "format": "date"
13
+ },
14
+ "start_date_type": {
15
+ "type": "string"
16
+ },
17
+ "end_date": {
18
+ "type": "string",
19
+ "format": "date"
20
+ },
21
+ "end_date_type": {
22
+ "type": "string"
23
+ },
10
24
  "source_jurisdiction": {
11
25
  "description": "Jurisdiction of the source of the data",
12
26
  "type": "string"
@@ -23,5 +37,10 @@
23
37
  "additionalItems": false
24
38
  }
25
39
  },
26
- "required": ["company", "data", "sample_date"]
27
- }
40
+ "additionalProperties": false,
41
+ "required": [
42
+ "company",
43
+ "data",
44
+ "sample_date"
45
+ ]
46
+ }
@@ -134,7 +134,7 @@ describe TurbotRunner::Processor do
134
134
  'number' => 123
135
135
  }
136
136
 
137
- expected_error = 'Missing required property: sample_date'
137
+ expected_error = 'Property not of expected format: sample_date (must be of format yyyy-mm-dd)'
138
138
  expect(@handler).to receive(:handle_invalid_record).
139
139
  with(record, @data_type, expected_error)
140
140
  @processor.process(record.to_json)
@@ -149,7 +149,7 @@ describe TurbotRunner::Processor do
149
149
  'number' => 123
150
150
  }
151
151
 
152
- expected_error = 'Property not a valid date: sample_date'
152
+ expected_error = 'Property not of expected format: sample_date (must be of format yyyy-mm-dd)'
153
153
  expect(@handler).to receive(:handle_invalid_record).
154
154
  with(record, @data_type, expected_error)
155
155
  @processor.process(record.to_json)
@@ -160,7 +160,7 @@ describe TurbotRunner::Processor do
160
160
  it 'can handle schemas with $refs' do
161
161
  handler = TurbotRunner::BaseHandler.new
162
162
  script_config = {
163
- :data_type => 'rich-licence',
163
+ :data_type => 'licence',
164
164
  :identifying_fields => ['licence_number']
165
165
  }
166
166
 
@@ -188,76 +188,4 @@ describe TurbotRunner::Processor do
188
188
  processor.process(record.to_json)
189
189
  end
190
190
  end
191
-
192
- describe '#convert_record' do
193
- before do
194
- schema = {
195
- '$schema' => 'http://json-schema.org/draft-04/schema#',
196
- 'type' => 'object',
197
- 'properties' => {
198
- 'aaa' => {'format' => 'date'},
199
- 'bbb' => {'format' => 'not-date'},
200
- }
201
- }
202
-
203
- @processor = TurbotRunner::Processor.new(nil, {}, nil)
204
- allow(@processor).to receive(:schema).and_return(schema)
205
- end
206
-
207
- context 'when date field is YYYY-MM-DD' do
208
- it 'leaves date field alone' do
209
- record = {'aaa' => '2015-01-26', 'bbb' => 'cabbage'}
210
- expect(@processor.convert_record(record)).to eq({'aaa' => '2015-01-26', 'bbb' => 'cabbage'})
211
- end
212
- end
213
-
214
- context 'when date field with YYYY-MM-DD HH:MM:SS' do
215
- it 'replaces value with YYYY-MM-DD' do
216
- record = {'aaa' => '2015-01-26 12:34:56', 'bbb' => 'cabbage'}
217
- expect(@processor.convert_record(record)).to eq({'aaa' => '2015-01-26', 'bbb' => 'cabbage'})
218
- end
219
- end
220
-
221
- context 'when date field is empty string' do
222
- it 'replaces removes field' do
223
- record = {'aaa' => '', 'bbb' => 'cabbage'}
224
- expect(@processor.convert_record(record)).to eq({'bbb' => 'cabbage'})
225
- end
226
- end
227
-
228
- context 'when date field is invalid date' do
229
- it 'rasies ConversionError' do
230
- record = {'aaa' => 'cabbage', 'bbb' => 'cabbage'}
231
- expect{@processor.convert_record(record)}.to raise_error(TurbotRunner::Processor::ConversionError)
232
- end
233
- end
234
- end
235
-
236
- specify '#get_date_paths' do
237
- schema = {
238
- '$schema' => 'http://json-schema.org/draft-04/schema#',
239
- 'type' => 'object',
240
- 'properties' => {
241
- 'aaa' => {'format' => 'date'},
242
- 'bbb' => {'format' => 'not-date'},
243
- 'ccc' => {
244
- 'type' => 'object',
245
- 'properties' => {
246
- 'ddd' => {'format' => 'date'},
247
- 'eee' => {'format' => 'not-date'},
248
- 'fff' => {
249
- 'type' => 'object',
250
- 'properties' => {
251
- 'ggg' => {'format' => 'date'},
252
- 'hhh' => {'format' => 'not-date'},
253
- }
254
- }
255
- }
256
- }
257
- }
258
- }
259
-
260
- processor = TurbotRunner::Processor.new(nil, {}, nil)
261
- expect(processor.get_date_paths(schema['properties'])).to eq([['aaa'], ['ccc', 'ddd'], ['ccc', 'fff', 'ggg']])
262
- end
263
191
  end
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: turbot-runner
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.23
4
+ version: 0.1.24
5
5
  platform: ruby
6
6
  authors:
7
7
  - OpenCorporates
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-29 00:00:00.000000000 Z
11
+ date: 2015-02-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: json-schema
14
+ name: openc-json_schema
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - '='
18
18
  - !ruby/object:Gem::Version
19
- version: 2.5.0
19
+ version: 0.0.5
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - '='
25
25
  - !ruby/object:Gem::Version
26
- version: 2.5.0
26
+ version: 0.0.5
27
27
  description:
28
28
  email: bots@opencorporates.com
29
29
  executables: []
@@ -39,8 +39,6 @@ files:
39
39
  - lib/turbot_runner/runner.rb
40
40
  - lib/turbot_runner/script_runner.rb
41
41
  - lib/turbot_runner/utils.rb
42
- - lib/turbot_runner/validator.rb
43
- - lib/turbot_runner/validators.rb
44
42
  - lib/turbot_runner/version.rb
45
43
  - schema/schemas/company-schema.json
46
44
  - schema/schemas/financial-payment-schema.json
@@ -66,7 +64,6 @@ files:
66
64
  - schema/schemas/includes/unknown_entity_type.json
67
65
  - schema/schemas/licence-schema.json
68
66
  - schema/schemas/primary-data-schema.json
69
- - schema/schemas/rich-licence-schema.json
70
67
  - schema/schemas/share-parcel-schema.json
71
68
  - schema/schemas/simple-financial-payment-schema.json
72
69
  - schema/schemas/simple-licence-schema.json
@@ -112,8 +109,6 @@ files:
112
109
  - spec/bots/slow-bot/scraper.rb
113
110
  - spec/lib/processor_spec.rb
114
111
  - spec/lib/runner_spec.rb
115
- - spec/lib/validator_spec.rb
116
- - spec/lib/validators_spec.rb
117
112
  - spec/manual_spec.rb
118
113
  - spec/outputs/full-scraper.out
119
114
  - spec/outputs/full-transformer.out
@@ -1,77 +0,0 @@
1
- require 'json-schema'
2
-
3
- module TurbotRunner
4
- module Validator
5
- extend self
6
-
7
- def validate(schema, record)
8
- # We must change directory for the relative paths in schemas to make sense.
9
- errors = Dir.chdir(SCHEMAS_PATH) do
10
- JSON::Validator.fully_validate(schema, record, :errors_as_objects => true)
11
- end
12
-
13
- # For now, we just handle the first error.
14
- error = errors[0]
15
- return if error.nil?
16
-
17
- case error[:failed_attribute]
18
- when 'Required'
19
- match = error[:message].match(/required property of '(.*)'/)
20
- missing_property = match[1]
21
- path = fragment_to_path("#{error[:fragment]}/#{missing_property}")
22
-
23
- {:type => :missing, :path => path}
24
- when 'OneOf'
25
- if error[:message].match(/did not match any/)
26
- path_elements = fragment_to_path(error[:fragment]).split('.')
27
-
28
- raise "Deeply nested OneOf error at: #{error[:fragment]}" unless path_elements.size == 1
29
-
30
- record_fragment = record[path_elements[0]]
31
- schema_fragments = schema['properties'][path_elements[0]]['oneOf']
32
-
33
- schema_fragments.each do |s|
34
- s['properties'].each do |k, v|
35
- next if v['enum'].nil?
36
-
37
- if v['enum'].include?(record_fragment[k])
38
- error1 = validate(s, record_fragment)
39
- return error1.merge(:path => "#{path_elements[0]}.#{error1[:path]}")
40
- end
41
- end
42
- end
43
-
44
- {:type => :one_of_no_matches, :path => fragment_to_path(error[:fragment])}
45
- else
46
- {:type => :one_of_many_matches, :path => fragment_to_path(error[:fragment])}
47
- end
48
- when 'MinLength'
49
- match = error[:message].match(/minimum string length of (\d+) in/)
50
- min_length = match[1].to_i
51
- {:type => :too_short, :path => fragment_to_path(error[:fragment]), :length => min_length}
52
- when 'MaxLength'
53
- match = error[:message].match(/maximum string length of (\d+) in/)
54
- max_length = match[1].to_i
55
- {:type => :too_long, :path => fragment_to_path(error[:fragment]), :length => max_length}
56
- when 'TypeV4'
57
- match = error[:message].match(/the following types?: ([\w\s,]+) in schema/)
58
- allowed_types = match[1].split(',').map(&:strip)
59
- {:type => :type_mismatch, :path => fragment_to_path(error[:fragment]), :allowed_types => allowed_types}
60
- when 'Enum'
61
- match = error[:message].match(/the following values: ([\w\s,]+) in schema/)
62
- allowed_values = match[1].split(',').map(&:strip)
63
- {:type => :enum_mismatch, :path => fragment_to_path(error[:fragment]), :allowed_values => allowed_values}
64
- else
65
- if error[:message].match(/must be of format yyyy-mm-dd/)
66
- {:type => :format_mismatch, :path => fragment_to_path(error[:fragment]), :expected_format => 'yyyy-mm-dd'}
67
- else
68
- {:type => :unknown, :path => fragment_to_path(error[:fragment]), :failed_attribute => error[:failed_attribute], :message => error[:message]}
69
- end
70
- end
71
- end
72
-
73
- def fragment_to_path(fragment)
74
- fragment.sub(/^#?\/*/, '').gsub('/', '.')
75
- end
76
- end
77
- end