datapackage 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/bin/console +0 -0
- data/bin/datapackage +0 -0
- data/lib/datapackage.rb +1 -0
- data/lib/datapackage/interpreter.rb +66 -0
- data/lib/datapackage/package.rb +18 -0
- data/lib/datapackage/resource.rb +28 -0
- data/lib/datapackage/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: c75abeabc9ffe3f12589976c2790dec6c2e4a748b97a99d196408fd02689d432
|
4
|
+
data.tar.gz: 94deaf207fd46c81e154b0617cbfcd2074f10dce2243eff888bb414618f756b8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2a54d0a226c49d9a38db69f0a48f5e56220a8636ff5c27601a56d5d4325f3a0efbcb5999347631c4e0238bc4429dcf532391160decc3a4179703c05afb6c343b
|
7
|
+
data.tar.gz: 6390e3c8018ef3c6b8bf2c654b32e51dee41bca12762c6bc45ee5002547aa4080786c4235a331e0794458b855b884bd991522eb0621cd6fe6f1fc6e8d55dd5f5
|
data/bin/console
CHANGED
File without changes
|
data/bin/datapackage
CHANGED
File without changes
|
data/lib/datapackage.rb
CHANGED
|
data/lib/datapackage/interpreter.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
module DataPackage
|
2
|
+
class Interpreter
|
3
|
+
INFER_THRESHOLD = 10
|
4
|
+
INFER_CONFIDENCE = 0.75
|
5
|
+
YEAR_PATTERN = /[12]\d{3}/
|
6
|
+
DATE_PATTERN = /(\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4})|(\d{4}[-\/]\d{1,2}[-\/]\d{1,2})/
|
7
|
+
DATETIME_PATTERN = /(\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4}|\d{4}[-\/]\d{1,2}[-\/]\d{1,2}).\d{1,2}:\d{2}/
|
8
|
+
TIME_PATTERN = /^\d{1,2}((:\d{1,2})|(am|pm|AM|PM))$/
|
9
|
+
INTEGER_PATTERN = /^\d+$/
|
10
|
+
DEFAULT_TYPE_FORMAT = {'type' => 'any', 'format' => 'default'}
|
11
|
+
|
12
|
+
attr_reader :csv, :threshold
|
13
|
+
|
14
|
+
def initialize(csv)
|
15
|
+
@csv = csv
|
16
|
+
@threshold = [csv.length, INFER_THRESHOLD].min
|
17
|
+
end
|
18
|
+
|
19
|
+
def type_and_format_at(header)
|
20
|
+
values = csv.values_at(header).flatten
|
21
|
+
counter = {}
|
22
|
+
type_and_format = DEFAULT_TYPE_FORMAT
|
23
|
+
|
24
|
+
values.each_with_index do |value, i|
|
25
|
+
inspection_count = i + 1
|
26
|
+
|
27
|
+
inspection = inspect_value(value)
|
28
|
+
counter[inspection] = (counter[inspection] || 0) + 1
|
29
|
+
if inspection_count >= threshold
|
30
|
+
if counter[inspection] / inspection_count >= INFER_CONFIDENCE
|
31
|
+
type_and_format = inspection
|
32
|
+
break
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
type_and_format
|
38
|
+
end
|
39
|
+
|
40
|
+
def inspect_value(value)
|
41
|
+
return DEFAULT_TYPE_FORMAT unless value.is_a?(String)
|
42
|
+
|
43
|
+
if value.length == 4 && value.match(YEAR_PATTERN)
|
44
|
+
return { 'type' => 'year', 'format' => 'default' }
|
45
|
+
end
|
46
|
+
|
47
|
+
if value.match(DATETIME_PATTERN)
|
48
|
+
return { 'type' => 'datetime', 'format' => 'default' }
|
49
|
+
end
|
50
|
+
|
51
|
+
if value.match(DATE_PATTERN)
|
52
|
+
return { 'type' => 'date', 'format' => 'default' }
|
53
|
+
end
|
54
|
+
|
55
|
+
if value.match(TIME_PATTERN)
|
56
|
+
return { 'type' => 'time', 'format' => 'default' }
|
57
|
+
end
|
58
|
+
|
59
|
+
if value.match(INTEGER_PATTERN)
|
60
|
+
return { 'type' => 'integer', 'format' => 'default' }
|
61
|
+
end
|
62
|
+
|
63
|
+
DEFAULT_TYPE_FORMAT
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
data/lib/datapackage/package.rb
CHANGED
@@ -119,6 +119,24 @@ module DataPackage
|
|
119
119
|
self[property] || default
|
120
120
|
end
|
121
121
|
|
122
|
+
def infer(base_path: nil, directory: nil)
|
123
|
+
raise PackageException.new('Base path is required for infer') unless base_path
|
124
|
+
raise PackageException.new('Directory is required for infer') unless directory
|
125
|
+
|
126
|
+
dir_path = File.join(base_path, directory)
|
127
|
+
Dir.glob("#{dir_path}/*.csv") do |filename|
|
128
|
+
resource = Resource.infer(filename)
|
129
|
+
add_resource(resource)
|
130
|
+
end
|
131
|
+
|
132
|
+
# If there were CSVs, this is a tabular data package
|
133
|
+
if resources.count > 0
|
134
|
+
self['profile'] = 'tabular-data-package'
|
135
|
+
end
|
136
|
+
|
137
|
+
descriptor
|
138
|
+
end
|
139
|
+
|
122
140
|
# Private
|
123
141
|
|
124
142
|
private
|
data/lib/datapackage/resource.rb
CHANGED
@@ -6,6 +6,33 @@ module DataPackage
|
|
6
6
|
|
7
7
|
attr_reader :errors, :profile, :name, :source
|
8
8
|
|
9
|
+
def self.infer(filepath)
|
10
|
+
name = File.basename(filepath)
|
11
|
+
if name[-4..-1] != '.csv'
|
12
|
+
raise ResourceException.new('Inferrable resource must have .csv extension')
|
13
|
+
end
|
14
|
+
|
15
|
+
descr = {
|
16
|
+
'format' => 'csv',
|
17
|
+
'mediatype' => 'text/csv',
|
18
|
+
'name' => name[0...-4],
|
19
|
+
'path' => filepath,
|
20
|
+
'schema' => {
|
21
|
+
'fields' => []
|
22
|
+
},
|
23
|
+
}
|
24
|
+
|
25
|
+
csv = CSV.read(filepath, headers: true)
|
26
|
+
interpreter = DataPackage::Interpreter.new(csv)
|
27
|
+
csv.headers.each do |header|
|
28
|
+
field = { 'name' => header, 'type' => 'string'}
|
29
|
+
field.merge! interpreter.type_and_format_at(header)
|
30
|
+
descr['schema']['fields'] << field
|
31
|
+
end
|
32
|
+
|
33
|
+
new(descr)
|
34
|
+
end
|
35
|
+
|
9
36
|
def initialize(resource, base_path = '')
|
10
37
|
self.merge! dereference_descriptor(resource, base_path: base_path,
|
11
38
|
reference_fields: ['schema', 'dialect'])
|
@@ -132,6 +159,7 @@ module DataPackage
|
|
132
159
|
end
|
133
160
|
|
134
161
|
def apply_table_defaults!
|
162
|
+
self['profile'] = DataPackage::DEFAULTS[:resource][:tabular_profile]
|
135
163
|
if self.fetch('schema', nil)
|
136
164
|
self['schema']['missingValues'] = DataPackage::DEFAULTS[:schema][:missing_values]
|
137
165
|
self['schema'].fetch('fields', []).each do |field_descriptor|
|
data/lib/datapackage/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datapackage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Leigh Dodds
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2019-11-21 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: json-schema
|
@@ -183,6 +183,7 @@ files:
|
|
183
183
|
- lib/datapackage/defaults.rb
|
184
184
|
- lib/datapackage/exceptions.rb
|
185
185
|
- lib/datapackage/helpers.rb
|
186
|
+
- lib/datapackage/interpreter.rb
|
186
187
|
- lib/datapackage/package.rb
|
187
188
|
- lib/datapackage/profile.rb
|
188
189
|
- lib/datapackage/registry.rb
|
@@ -215,7 +216,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
215
216
|
version: '0'
|
216
217
|
requirements: []
|
217
218
|
rubyforge_project:
|
218
|
-
rubygems_version: 2.
|
219
|
+
rubygems_version: 2.7.7
|
219
220
|
signing_key:
|
220
221
|
specification_version: 4
|
221
222
|
summary: Library for working with data packages
|