datapackage 1.0.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/bin/console +0 -0
- data/bin/datapackage +0 -0
- data/lib/datapackage.rb +1 -0
- data/lib/datapackage/interpreter.rb +66 -0
- data/lib/datapackage/package.rb +18 -0
- data/lib/datapackage/resource.rb +28 -0
- data/lib/datapackage/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: c75abeabc9ffe3f12589976c2790dec6c2e4a748b97a99d196408fd02689d432
|
4
|
+
data.tar.gz: 94deaf207fd46c81e154b0617cbfcd2074f10dce2243eff888bb414618f756b8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2a54d0a226c49d9a38db69f0a48f5e56220a8636ff5c27601a56d5d4325f3a0efbcb5999347631c4e0238bc4429dcf532391160decc3a4179703c05afb6c343b
|
7
|
+
data.tar.gz: 6390e3c8018ef3c6b8bf2c654b32e51dee41bca12762c6bc45ee5002547aa4080786c4235a331e0794458b855b884bd991522eb0621cd6fe6f1fc6e8d55dd5f5
|
data/bin/console
CHANGED
File without changes
|
data/bin/datapackage
CHANGED
File without changes
|
data/lib/datapackage.rb
CHANGED
data/lib/datapackage/interpreter.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
module DataPackage
  # Guesses a field's 'type' and 'format' for a column of a parsed CSV by
  # matching sampled cell values against a small set of patterns.
  #
  # The +csv+ object only needs to respond to +length+ and to
  # +values_at(header)+ (returning the column's values, possibly nested in
  # per-row arrays, as they are flattened before inspection).
  class Interpreter
    # Upper bound on how many values must be inspected before a type may be
    # chosen; tables shorter than this use their own length instead.
    INFER_THRESHOLD = 10
    # Minimum share (0.0..1.0) of the inspected values that must agree on a
    # type for that type to be chosen.
    INFER_CONFIDENCE = 0.75
    # Only applied to 4-character values (see #inspect_value).
    YEAR_PATTERN = /[12]\d{3}/
    # NOTE: DATE_PATTERN and DATETIME_PATTERN are unanchored, so they match a
    # date-like fragment anywhere inside the value.
    DATE_PATTERN = /(\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4})|(\d{4}[-\/]\d{1,2}[-\/]\d{1,2})/
    DATETIME_PATTERN = /(\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4}|\d{4}[-\/]\d{1,2}[-\/]\d{1,2}).\d{1,2}:\d{2}/
    # \A and \z anchor the whole string; ^ and $ would anchor each line, so a
    # multi-line cell such as "abc\n123" would be classified as a number.
    TIME_PATTERN = /\A\d{1,2}((:\d{1,2})|(am|pm|AM|PM))\z/
    INTEGER_PATTERN = /\A\d+\z/
    # Fallback returned when no pattern matches or no type is confident enough.
    DEFAULT_TYPE_FORMAT = {'type' => 'any', 'format' => 'default'}

    attr_reader :csv, :threshold

    # @param csv [#length, #values_at] parsed table to sample from
    def initialize(csv)
      @csv = csv
      @threshold = [csv.length, INFER_THRESHOLD].min
    end

    # Infers the type/format of the column named +header+.
    #
    # Values are inspected in order. Once at least +threshold+ values have
    # been seen, the first value whose inferred type accounts for at least
    # INFER_CONFIDENCE of everything inspected so far decides the result.
    #
    # @param header [Object] column identifier passed to +csv.values_at+
    # @return [Hash] hash with 'type' and 'format' keys; DEFAULT_TYPE_FORMAT
    #   when no type reaches the required confidence
    def type_and_format_at(header)
      values = csv.values_at(header).flatten
      counter = Hash.new(0)

      values.each_with_index do |value, i|
        inspection_count = i + 1
        inspection = inspect_value(value)
        counter[inspection] += 1
        next if inspection_count < threshold

        # fdiv keeps the ratio fractional; Integer#/ truncates it to 0 unless
        # every inspected value has the same type.
        return inspection if counter[inspection].fdiv(inspection_count) >= INFER_CONFIDENCE
      end

      DEFAULT_TYPE_FORMAT
    end

    # Classifies a single cell value.
    #
    # Checks run from most to least specific: year, datetime, date, time,
    # integer. Non-String values (e.g. nil) yield DEFAULT_TYPE_FORMAT.
    #
    # @param value [Object] raw cell value
    # @return [Hash] hash with 'type' and 'format' keys
    def inspect_value(value)
      return DEFAULT_TYPE_FORMAT unless value.is_a?(String)

      type =
        if value.length == 4 && value =~ YEAR_PATTERN
          'year'
        elsif value =~ DATETIME_PATTERN
          'datetime'
        elsif value =~ DATE_PATTERN
          'date'
        elsif value =~ TIME_PATTERN
          'time'
        elsif value =~ INTEGER_PATTERN
          'integer'
        end

      type ? { 'type' => type, 'format' => 'default' } : DEFAULT_TYPE_FORMAT
    end
  end
end
|
data/lib/datapackage/package.rb
CHANGED
@@ -119,6 +119,24 @@ module DataPackage
|
|
119
119
|
self[property] || default
|
120
120
|
end
|
121
121
|
|
122
|
+
# Populates this package from the CSV files found directly inside
# +directory+ (resolved against +base_path+).
#
# Each matching file is passed to Resource.infer and the result is added via
# add_resource. When the package ends up with at least one resource, its
# 'profile' is set to 'tabular-data-package'.
#
# @param base_path [String] root path the directory is joined onto
# @param directory [String] directory, relative to +base_path+, to scan
# @return the value of +descriptor+ after inference
# @raise [PackageException] if +base_path+ or +directory+ is missing
def infer(base_path: nil, directory: nil)
  raise PackageException, 'Base path is required for infer' unless base_path
  raise PackageException, 'Directory is required for infer' unless directory

  dir_path = File.join(base_path, directory)
  Dir.glob("#{dir_path}/*.csv").each do |csv_file|
    add_resource(Resource.infer(csv_file))
  end

  # Any resource at all means we treat this as a tabular data package
  self['profile'] = 'tabular-data-package' if resources.count > 0

  descriptor
end
|
139
|
+
|
122
140
|
# Private
|
123
141
|
|
124
142
|
private
|
data/lib/datapackage/resource.rb
CHANGED
@@ -6,6 +6,33 @@ module DataPackage
|
|
6
6
|
|
7
7
|
attr_reader :errors, :profile, :name, :source
|
8
8
|
|
9
|
+
# Builds a resource describing the CSV file at +filepath+.
#
# The file is read with its first row as headers, and one schema field is
# created per header. Each field starts as type 'string' and is then
# overlaid with whatever DataPackage::Interpreter infers for that column.
#
# @param filepath [String] path to a file whose name ends in '.csv'
# @return a new instance built from the inferred descriptor
# @raise [ResourceException] if the file name does not end in '.csv'
def self.infer(filepath)
  basename = File.basename(filepath)
  unless basename.end_with?('.csv')
    raise ResourceException.new('Inferrable resource must have .csv extension')
  end

  table = CSV.read(filepath, headers: true)
  interpreter = DataPackage::Interpreter.new(table)
  fields = table.headers.map do |header|
    { 'name' => header, 'type' => 'string' }.merge(interpreter.type_and_format_at(header))
  end

  inferred = {
    'format' => 'csv',
    'mediatype' => 'text/csv',
    # Resource name is the file name without its '.csv' suffix
    'name' => basename.chomp('.csv'),
    'path' => filepath,
    'schema' => { 'fields' => fields }
  }

  new(inferred)
end
|
35
|
+
|
9
36
|
def initialize(resource, base_path = '')
|
10
37
|
self.merge! dereference_descriptor(resource, base_path: base_path,
|
11
38
|
reference_fields: ['schema', 'dialect'])
|
@@ -132,6 +159,7 @@ module DataPackage
|
|
132
159
|
end
|
133
160
|
|
134
161
|
def apply_table_defaults!
|
162
|
+
self['profile'] = DataPackage::DEFAULTS[:resource][:tabular_profile]
|
135
163
|
if self.fetch('schema', nil)
|
136
164
|
self['schema']['missingValues'] = DataPackage::DEFAULTS[:schema][:missing_values]
|
137
165
|
self['schema'].fetch('fields', []).each do |field_descriptor|
|
data/lib/datapackage/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datapackage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Leigh Dodds
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2019-11-21 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: json-schema
|
@@ -183,6 +183,7 @@ files:
|
|
183
183
|
- lib/datapackage/defaults.rb
|
184
184
|
- lib/datapackage/exceptions.rb
|
185
185
|
- lib/datapackage/helpers.rb
|
186
|
+
- lib/datapackage/interpreter.rb
|
186
187
|
- lib/datapackage/package.rb
|
187
188
|
- lib/datapackage/profile.rb
|
188
189
|
- lib/datapackage/registry.rb
|
@@ -215,7 +216,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
215
216
|
version: '0'
|
216
217
|
requirements: []
|
217
218
|
rubyforge_project:
|
218
|
-
rubygems_version: 2.
|
219
|
+
rubygems_version: 2.7.7
|
219
220
|
signing_key:
|
220
221
|
specification_version: 4
|
221
222
|
summary: Library for working with data packages
|