turbot-runner 0.1.23 → 0.1.24

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40)
  1. checksums.yaml +8 -8
  2. data/lib/turbot_runner.rb +0 -2
  3. data/lib/turbot_runner/processor.rb +7 -70
  4. data/lib/turbot_runner/version.rb +1 -1
  5. data/schema/schemas/company-schema.json +69 -22
  6. data/schema/schemas/financial-payment-schema.json +12 -5
  7. data/schema/schemas/includes/address.json +36 -13
  8. data/schema/schemas/includes/alternative_name.json +32 -9
  9. data/schema/schemas/includes/company-for-nesting.json +70 -23
  10. data/schema/schemas/includes/company.json +15 -6
  11. data/schema/schemas/includes/filing.json +48 -16
  12. data/schema/schemas/includes/financial-payment-data-object.json +73 -21
  13. data/schema/schemas/includes/identifier.json +14 -8
  14. data/schema/schemas/includes/industry_code.json +25 -12
  15. data/schema/schemas/includes/licence-data-object.json +40 -13
  16. data/schema/schemas/includes/officer.json +66 -46
  17. data/schema/schemas/includes/organisation.json +6 -5
  18. data/schema/schemas/includes/permission.json +36 -17
  19. data/schema/schemas/includes/person.json +14 -13
  20. data/schema/schemas/includes/person_name.json +36 -12
  21. data/schema/schemas/includes/previous_name.json +19 -9
  22. data/schema/schemas/includes/share-parcel-data.json +55 -40
  23. data/schema/schemas/includes/share-parcel.json +71 -56
  24. data/schema/schemas/includes/subsidiary-relationship-data.json +19 -8
  25. data/schema/schemas/includes/total-shares.json +13 -6
  26. data/schema/schemas/includes/unknown_entity_type.json +6 -5
  27. data/schema/schemas/licence-schema.json +157 -17
  28. data/schema/schemas/primary-data-schema.json +18 -16
  29. data/schema/schemas/share-parcel-schema.json +8 -7
  30. data/schema/schemas/simple-financial-payment-schema.json +47 -11
  31. data/schema/schemas/simple-licence-schema.json +21 -6
  32. data/schema/schemas/simple-subsidiary-schema.json +15 -4
  33. data/schema/schemas/subsidiary-relationship-schema.json +26 -7
  34. data/spec/lib/processor_spec.rb +3 -75
  35. metadata +5 -10
  36. data/lib/turbot_runner/validator.rb +0 -77
  37. data/lib/turbot_runner/validators.rb +0 -12
  38. data/schema/schemas/rich-licence-schema.json +0 -103
  39. data/spec/lib/validator_spec.rb +0 -235
  40. data/spec/lib/validators_spec.rb +0 -48
@@ -1,21 +1,22 @@
1
1
  {
2
2
  "$schema": "http://json-schema.org/draft-04/schema#",
3
- "title": "Share Parcel Schema",
4
3
  "type": "object",
5
- "allOf" : [
6
- // The following is a basic statement with sample_date, etc
7
- { "$ref": "includes/base-statement.json" },
8
- // And this overrides it to provide data-type-specific information
4
+ "allOf": [
5
+ {
6
+ "$ref": "includes/base-statement.json"
7
+ },
9
8
  {
10
9
  "properties": {
11
10
  "data": {
12
11
  "items": {
13
12
  "allOf": [
14
- { "$ref": "includes/share-parcel-data.json" }
13
+ {
14
+ "$ref": "includes/share-parcel-data.json"
15
+ }
15
16
  ]
16
17
  }
17
18
  }
18
19
  }
19
20
  }
20
21
  ]
21
- }
22
+ }
@@ -1,6 +1,5 @@
1
1
  {
2
2
  "$schema": "http://json-schema.org/draft-04/schema#",
3
- "title": "Simple Financial Payment",
4
3
  "description": "A Financial Payment is a payment from government to a recipient",
5
4
  "type": "object",
6
5
  "properties": {
@@ -16,7 +15,11 @@
16
15
  "confidence": {
17
16
  "description": "Confidence in accuracy of data",
18
17
  "type": "string",
19
- "enum": ["HIGH", "MEDIUM", "LOW"]
18
+ "enum": [
19
+ "HIGH",
20
+ "MEDIUM",
21
+ "LOW"
22
+ ]
20
23
  },
21
24
  "company_name": {
22
25
  "description": "Name of the company that received the money",
@@ -52,19 +55,32 @@
52
55
  },
53
56
  "more_details_url": {
54
57
  "description": "A url from which more details can be seen (may be the same as the source_url)",
55
- "type": ["string",null]
58
+ "type": [
59
+ "string",
60
+ null
61
+ ]
56
62
  },
57
63
  "description": {
58
64
  "description": "The description of the transaction as given in the raw data",
59
- "type": ["string",null]
65
+ "type": [
66
+ "string",
67
+ null
68
+ ]
60
69
  },
61
70
  "expense_type": {
62
71
  "description": "The type of expense -- can be either capital, revenue (i.e. current expenditure) or null",
63
- "enum": ["capital","revenue",null]
72
+ "enum": [
73
+ "capital",
74
+ "revenue",
75
+ null
76
+ ]
64
77
  },
65
78
  "expense_area": {
66
79
  "description": "category (in words) of the expenditure",
67
- "type": ["string",null]
80
+ "type": [
81
+ "string",
82
+ null
83
+ ]
68
84
  },
69
85
  "entity_name": {
70
86
  "description": "The name of the government entity that made the payment, e.g. Environment Agency",
@@ -72,15 +88,35 @@
72
88
  },
73
89
  "entity_uri": {
74
90
  "description": "A unique URL (ideally a dereferencable URI) for the government entity",
75
- "type": ["string",null]
91
+ "type": [
92
+ "string",
93
+ null
94
+ ]
76
95
  },
77
96
  "department_name": {
78
97
  "description": "The name of the government department which the entity belongs to (if relevant), e.g. Department of Health",
79
- "type": ["string",null]
98
+ "type": [
99
+ "string",
100
+ null
101
+ ]
80
102
  },
81
103
  "csv_line_number": {
82
104
  "description": "If the source for the data is a CSV file, you can optionally include the line number of the CSV from which this data was retrieved",
83
- "type": ["string",null]}
105
+ "type": [
106
+ "string",
107
+ null
108
+ ]
109
+ }
84
110
  },
85
- "required": ["source_url", "sample_date", "company_name", "company_jurisdiction", "value", "payee_name", "date", "currency"]
86
- }
111
+ "additionalProperties": false,
112
+ "required": [
113
+ "source_url",
114
+ "sample_date",
115
+ "company_name",
116
+ "company_jurisdiction",
117
+ "value",
118
+ "payee_name",
119
+ "date",
120
+ "currency"
121
+ ]
122
+ }
@@ -1,6 +1,5 @@
1
1
  {
2
2
  "$schema": "http://json-schema.org/draft-04/schema#",
3
- "title": "Simple Licence Schema",
4
3
  "type": "object",
5
4
  "properties": {
6
5
  "source_url": {
@@ -16,7 +15,11 @@
16
15
  "confidence": {
17
16
  "description": "Confidence in accuracy of data",
18
17
  "type": "string",
19
- "enum": ["HIGH", "MEDIUM", "LOW"]
18
+ "enum": [
19
+ "HIGH",
20
+ "MEDIUM",
21
+ "LOW"
22
+ ]
20
23
  },
21
24
  "company_name": {
22
25
  "description": "Name of the company holding the licence",
@@ -34,7 +37,10 @@
34
37
  },
35
38
  "jurisdiction_classification": {
36
39
  "description": "Description of how regulator classifies licence",
37
- "type": ["string", "array"]
40
+ "type": [
41
+ "string",
42
+ "array"
43
+ ]
38
44
  },
39
45
  "regulator": {
40
46
  "description": "The regulating body that issued the licence",
@@ -52,8 +58,17 @@
52
58
  "category": {
53
59
  "description": "Category of licence",
54
60
  "type": "string",
55
- "enum": ["Financial", "Business"]
61
+ "enum": [
62
+ "Financial",
63
+ "Business"
64
+ ]
56
65
  }
57
66
  },
58
- "required": ["source_url", "sample_date", "company_name", "company_jurisdiction"]
59
- }
67
+ "additionalProperties": false,
68
+ "required": [
69
+ "source_url",
70
+ "sample_date",
71
+ "company_name",
72
+ "company_jurisdiction"
73
+ ]
74
+ }
@@ -1,6 +1,5 @@
1
1
  {
2
2
  "$schema": "http://json-schema.org/draft-04/schema#",
3
- "title": "Simple Subsidiary Schema",
4
3
  "type": "object",
5
4
  "properties": {
6
5
  "source_url": {
@@ -26,7 +25,11 @@
26
25
  "confidence": {
27
26
  "description": "Confidence in accuracy of data",
28
27
  "type": "string",
29
- "enum": ["HIGH", "MEDIUM", "LOW"]
28
+ "enum": [
29
+ "HIGH",
30
+ "MEDIUM",
31
+ "LOW"
32
+ ]
30
33
  },
31
34
  "parent_name": {
32
35
  "description": "Name of the controlling company",
@@ -66,5 +69,13 @@
66
69
  "type": "string"
67
70
  }
68
71
  },
69
- "required": ["source_url", "sample_date", "parent_name", "parent_jurisdiction", "subsidiary_name", "subsidiary_jurisdiction"]
70
- }
72
+ "additionalProperties": false,
73
+ "required": [
74
+ "source_url",
75
+ "sample_date",
76
+ "parent_name",
77
+ "parent_jurisdiction",
78
+ "subsidiary_name",
79
+ "subsidiary_jurisdiction"
80
+ ]
81
+ }
@@ -1,12 +1,26 @@
1
1
  {
2
- "title": "Subsidiary Relationship",
3
- "description": "A relationship of control between two companies",
4
2
  "$schema": "http://json-schema.org/draft-04/schema#",
3
+ "description": "A relationship of control between two companies",
5
4
  "type": "object",
6
5
  "properties": {
7
- "sample_date": {"type": "string", "format": "date"},
8
- "start_date": {"type": "string", "format": "date"},
9
- "end_date": {"type": "string", "format": "date"},
6
+ "sample_date": {
7
+ "type": "string",
8
+ "format": "date"
9
+ },
10
+ "start_date": {
11
+ "type": "string",
12
+ "format": "date"
13
+ },
14
+ "start_date_type": {
15
+ "type": "string"
16
+ },
17
+ "end_date": {
18
+ "type": "string",
19
+ "format": "date"
20
+ },
21
+ "end_date_type": {
22
+ "type": "string"
23
+ },
10
24
  "source_jurisdiction": {
11
25
  "description": "Jurisdiction of the source of the data",
12
26
  "type": "string"
@@ -23,5 +37,10 @@
23
37
  "additionalItems": false
24
38
  }
25
39
  },
26
- "required": ["company", "data", "sample_date"]
27
- }
40
+ "additionalProperties": false,
41
+ "required": [
42
+ "company",
43
+ "data",
44
+ "sample_date"
45
+ ]
46
+ }
@@ -134,7 +134,7 @@ describe TurbotRunner::Processor do
134
134
  'number' => 123
135
135
  }
136
136
 
137
- expected_error = 'Missing required property: sample_date'
137
+ expected_error = 'Property not of expected format: sample_date (must be of format yyyy-mm-dd)'
138
138
  expect(@handler).to receive(:handle_invalid_record).
139
139
  with(record, @data_type, expected_error)
140
140
  @processor.process(record.to_json)
@@ -149,7 +149,7 @@ describe TurbotRunner::Processor do
149
149
  'number' => 123
150
150
  }
151
151
 
152
- expected_error = 'Property not a valid date: sample_date'
152
+ expected_error = 'Property not of expected format: sample_date (must be of format yyyy-mm-dd)'
153
153
  expect(@handler).to receive(:handle_invalid_record).
154
154
  with(record, @data_type, expected_error)
155
155
  @processor.process(record.to_json)
@@ -160,7 +160,7 @@ describe TurbotRunner::Processor do
160
160
  it 'can handle schemas with $refs' do
161
161
  handler = TurbotRunner::BaseHandler.new
162
162
  script_config = {
163
- :data_type => 'rich-licence',
163
+ :data_type => 'licence',
164
164
  :identifying_fields => ['licence_number']
165
165
  }
166
166
 
@@ -188,76 +188,4 @@ describe TurbotRunner::Processor do
188
188
  processor.process(record.to_json)
189
189
  end
190
190
  end
191
-
192
- describe '#convert_record' do
193
- before do
194
- schema = {
195
- '$schema' => 'http://json-schema.org/draft-04/schema#',
196
- 'type' => 'object',
197
- 'properties' => {
198
- 'aaa' => {'format' => 'date'},
199
- 'bbb' => {'format' => 'not-date'},
200
- }
201
- }
202
-
203
- @processor = TurbotRunner::Processor.new(nil, {}, nil)
204
- allow(@processor).to receive(:schema).and_return(schema)
205
- end
206
-
207
- context 'when date field is YYYY-MM-DD' do
208
- it 'leaves date field alone' do
209
- record = {'aaa' => '2015-01-26', 'bbb' => 'cabbage'}
210
- expect(@processor.convert_record(record)).to eq({'aaa' => '2015-01-26', 'bbb' => 'cabbage'})
211
- end
212
- end
213
-
214
- context 'when date field with YYYY-MM-DD HH:MM:SS' do
215
- it 'replaces value with YYYY-MM-DD' do
216
- record = {'aaa' => '2015-01-26 12:34:56', 'bbb' => 'cabbage'}
217
- expect(@processor.convert_record(record)).to eq({'aaa' => '2015-01-26', 'bbb' => 'cabbage'})
218
- end
219
- end
220
-
221
- context 'when date field is empty string' do
222
- it 'replaces removes field' do
223
- record = {'aaa' => '', 'bbb' => 'cabbage'}
224
- expect(@processor.convert_record(record)).to eq({'bbb' => 'cabbage'})
225
- end
226
- end
227
-
228
- context 'when date field is invalid date' do
229
- it 'rasies ConversionError' do
230
- record = {'aaa' => 'cabbage', 'bbb' => 'cabbage'}
231
- expect{@processor.convert_record(record)}.to raise_error(TurbotRunner::Processor::ConversionError)
232
- end
233
- end
234
- end
235
-
236
- specify '#get_date_paths' do
237
- schema = {
238
- '$schema' => 'http://json-schema.org/draft-04/schema#',
239
- 'type' => 'object',
240
- 'properties' => {
241
- 'aaa' => {'format' => 'date'},
242
- 'bbb' => {'format' => 'not-date'},
243
- 'ccc' => {
244
- 'type' => 'object',
245
- 'properties' => {
246
- 'ddd' => {'format' => 'date'},
247
- 'eee' => {'format' => 'not-date'},
248
- 'fff' => {
249
- 'type' => 'object',
250
- 'properties' => {
251
- 'ggg' => {'format' => 'date'},
252
- 'hhh' => {'format' => 'not-date'},
253
- }
254
- }
255
- }
256
- }
257
- }
258
- }
259
-
260
- processor = TurbotRunner::Processor.new(nil, {}, nil)
261
- expect(processor.get_date_paths(schema['properties'])).to eq([['aaa'], ['ccc', 'ddd'], ['ccc', 'fff', 'ggg']])
262
- end
263
191
  end
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: turbot-runner
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.23
4
+ version: 0.1.24
5
5
  platform: ruby
6
6
  authors:
7
7
  - OpenCorporates
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-29 00:00:00.000000000 Z
11
+ date: 2015-02-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: json-schema
14
+ name: openc-json_schema
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - '='
18
18
  - !ruby/object:Gem::Version
19
- version: 2.5.0
19
+ version: 0.0.5
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - '='
25
25
  - !ruby/object:Gem::Version
26
- version: 2.5.0
26
+ version: 0.0.5
27
27
  description:
28
28
  email: bots@opencorporates.com
29
29
  executables: []
@@ -39,8 +39,6 @@ files:
39
39
  - lib/turbot_runner/runner.rb
40
40
  - lib/turbot_runner/script_runner.rb
41
41
  - lib/turbot_runner/utils.rb
42
- - lib/turbot_runner/validator.rb
43
- - lib/turbot_runner/validators.rb
44
42
  - lib/turbot_runner/version.rb
45
43
  - schema/schemas/company-schema.json
46
44
  - schema/schemas/financial-payment-schema.json
@@ -66,7 +64,6 @@ files:
66
64
  - schema/schemas/includes/unknown_entity_type.json
67
65
  - schema/schemas/licence-schema.json
68
66
  - schema/schemas/primary-data-schema.json
69
- - schema/schemas/rich-licence-schema.json
70
67
  - schema/schemas/share-parcel-schema.json
71
68
  - schema/schemas/simple-financial-payment-schema.json
72
69
  - schema/schemas/simple-licence-schema.json
@@ -112,8 +109,6 @@ files:
112
109
  - spec/bots/slow-bot/scraper.rb
113
110
  - spec/lib/processor_spec.rb
114
111
  - spec/lib/runner_spec.rb
115
- - spec/lib/validator_spec.rb
116
- - spec/lib/validators_spec.rb
117
112
  - spec/manual_spec.rb
118
113
  - spec/outputs/full-scraper.out
119
114
  - spec/outputs/full-transformer.out
@@ -1,77 +0,0 @@
1
- require 'json-schema'
2
-
3
- module TurbotRunner
4
- module Validator
5
- extend self
6
-
7
- def validate(schema, record)
8
- # We must change directory for the relative paths in schemas to make sense.
9
- errors = Dir.chdir(SCHEMAS_PATH) do
10
- JSON::Validator.fully_validate(schema, record, :errors_as_objects => true)
11
- end
12
-
13
- # For now, we just handle the first error.
14
- error = errors[0]
15
- return if error.nil?
16
-
17
- case error[:failed_attribute]
18
- when 'Required'
19
- match = error[:message].match(/required property of '(.*)'/)
20
- missing_property = match[1]
21
- path = fragment_to_path("#{error[:fragment]}/#{missing_property}")
22
-
23
- {:type => :missing, :path => path}
24
- when 'OneOf'
25
- if error[:message].match(/did not match any/)
26
- path_elements = fragment_to_path(error[:fragment]).split('.')
27
-
28
- raise "Deeply nested OneOf error at: #{error[:fragment]}" unless path_elements.size == 1
29
-
30
- record_fragment = record[path_elements[0]]
31
- schema_fragments = schema['properties'][path_elements[0]]['oneOf']
32
-
33
- schema_fragments.each do |s|
34
- s['properties'].each do |k, v|
35
- next if v['enum'].nil?
36
-
37
- if v['enum'].include?(record_fragment[k])
38
- error1 = validate(s, record_fragment)
39
- return error1.merge(:path => "#{path_elements[0]}.#{error1[:path]}")
40
- end
41
- end
42
- end
43
-
44
- {:type => :one_of_no_matches, :path => fragment_to_path(error[:fragment])}
45
- else
46
- {:type => :one_of_many_matches, :path => fragment_to_path(error[:fragment])}
47
- end
48
- when 'MinLength'
49
- match = error[:message].match(/minimum string length of (\d+) in/)
50
- min_length = match[1].to_i
51
- {:type => :too_short, :path => fragment_to_path(error[:fragment]), :length => min_length}
52
- when 'MaxLength'
53
- match = error[:message].match(/maximum string length of (\d+) in/)
54
- max_length = match[1].to_i
55
- {:type => :too_long, :path => fragment_to_path(error[:fragment]), :length => max_length}
56
- when 'TypeV4'
57
- match = error[:message].match(/the following types?: ([\w\s,]+) in schema/)
58
- allowed_types = match[1].split(',').map(&:strip)
59
- {:type => :type_mismatch, :path => fragment_to_path(error[:fragment]), :allowed_types => allowed_types}
60
- when 'Enum'
61
- match = error[:message].match(/the following values: ([\w\s,]+) in schema/)
62
- allowed_values = match[1].split(',').map(&:strip)
63
- {:type => :enum_mismatch, :path => fragment_to_path(error[:fragment]), :allowed_values => allowed_values}
64
- else
65
- if error[:message].match(/must be of format yyyy-mm-dd/)
66
- {:type => :format_mismatch, :path => fragment_to_path(error[:fragment]), :expected_format => 'yyyy-mm-dd'}
67
- else
68
- {:type => :unknown, :path => fragment_to_path(error[:fragment]), :failed_attribute => error[:failed_attribute], :message => error[:message]}
69
- end
70
- end
71
- end
72
-
73
- def fragment_to_path(fragment)
74
- fragment.sub(/^#?\/*/, '').gsub('/', '.')
75
- end
76
- end
77
- end