datapackage 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: d07ade783e277a0c0f4559c64435201036560e30
4
- data.tar.gz: dc64a0a4b573ccbbd376e1f2c8d47bcceb1b0475
2
+ SHA256:
3
+ metadata.gz: c75abeabc9ffe3f12589976c2790dec6c2e4a748b97a99d196408fd02689d432
4
+ data.tar.gz: 94deaf207fd46c81e154b0617cbfcd2074f10dce2243eff888bb414618f756b8
5
5
  SHA512:
6
- metadata.gz: ba0539cd939c96123bd11da407171f07a720a9dd9a2de96e86f77e8e62a243b0272a04609838aaeb895bb421dbe04303110bfa416aacd0b5cf69b03badad0858
7
- data.tar.gz: c93372165ca7b145843912115c1d98eeceb9ea8dd8826b85e2702e2f5cd343fc5cfc4300ce30716b1110aee987f753adb1eaae3837011495e985064f0a9fd3b1
6
+ metadata.gz: 2a54d0a226c49d9a38db69f0a48f5e56220a8636ff5c27601a56d5d4325f3a0efbcb5999347631c4e0238bc4429dcf532391160decc3a4179703c05afb6c343b
7
+ data.tar.gz: 6390e3c8018ef3c6b8bf2c654b32e51dee41bca12762c6bc45ee5002547aa4080786c4235a331e0794458b855b884bd991522eb0621cd6fe6f1fc6e8d55dd5f5
data/bin/console CHANGED
File without changes
data/bin/datapackage CHANGED
File without changes
data/lib/datapackage.rb CHANGED
@@ -16,3 +16,4 @@ require 'datapackage/profile'
16
16
  require 'datapackage/resource'
17
17
  require 'datapackage/package'
18
18
  require 'datapackage/registry'
19
+ require 'datapackage/interpreter'
@@ -0,0 +1,66 @@
1
module DataPackage
  # Guesses the type and format of a CSV column by classifying its values
  # with a small set of regular expressions and picking the first
  # classification that reaches the required confidence.
  class Interpreter
    # Upper bound on the number of values that must be inspected before a
    # classification may be accepted.
    INFER_THRESHOLD = 10
    # Minimum share (0.0..1.0) of inspected values that must agree.
    INFER_CONFIDENCE = 0.75
    YEAR_PATTERN = /[12]\d{3}/
    DATE_PATTERN = /(\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4})|(\d{4}[-\/]\d{1,2}[-\/]\d{1,2})/
    DATETIME_PATTERN = /(\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4}|\d{4}[-\/]\d{1,2}[-\/]\d{1,2}).\d{1,2}:\d{2}/
    # \A and \z anchor the whole string; ^ and $ would anchor each line and
    # let a multi-line cell such as "12\nabc" pass as a time or an integer.
    TIME_PATTERN = /\A\d{1,2}((:\d{1,2})|(am|pm|AM|PM))\z/
    INTEGER_PATTERN = /\A\d+\z/
    # Frozen so that no caller can alter the fallback for everyone else.
    DEFAULT_TYPE_FORMAT = { 'type' => 'any', 'format' => 'default' }.freeze

    attr_reader :csv, :threshold

    # @param csv an object responding to +length+ and +values_at(header)+
    #   (Resource.infer passes the result of CSV.read with headers: true)
    def initialize(csv)
      @csv = csv
      # Never wait for more values than the table can provide.
      @threshold = [csv.length, INFER_THRESHOLD].min
    end

    # Infers the type and format of the column named +header+.
    #
    # Values are classified one by one. Once at least +threshold+ values have
    # been seen, the classification of the current value is accepted as soon
    # as its share of all values seen so far reaches INFER_CONFIDENCE.
    #
    # @param header the column header passed on to +csv.values_at+
    # @return [Hash] a hash with 'type' and 'format' keys; a copy of
    #   DEFAULT_TYPE_FORMAT when no classification is confident enough
    def type_and_format_at(header)
      values = csv.values_at(header).flatten
      counter = Hash.new(0)

      values.each_with_index do |value, i|
        inspection_count = i + 1
        inspection = inspect_value(value)
        counter[inspection] += 1
        next if inspection_count < threshold

        # fdiv keeps the ratio fractional; Integer#/ would truncate it to 0
        # for anything short of a unanimous column.
        return inspection if counter[inspection].fdiv(inspection_count) >= INFER_CONFIDENCE
      end

      default_type_format
    end

    # Classifies a single value.
    #
    # Checks run from most to least specific: year, datetime, date, time,
    # integer. Anything that is not a String, or matches none of the
    # patterns, gets the default 'any' type.
    #
    # @param value the raw cell value
    # @return [Hash] a hash with 'type' and 'format' keys
    def inspect_value(value)
      return default_type_format unless value.is_a?(String)

      # The length check makes the unanchored YEAR_PATTERN a full match.
      return typed('year') if value.length == 4 && value =~ YEAR_PATTERN
      # Datetime goes before date because every datetime also contains a date.
      return typed('datetime') if value =~ DATETIME_PATTERN
      return typed('date') if value =~ DATE_PATTERN
      return typed('time') if value =~ TIME_PATTERN
      return typed('integer') if value =~ INTEGER_PATTERN

      default_type_format
    end

    private

    # Builds the result hash for +type+ with the default format.
    def typed(type)
      { 'type' => type, 'format' => 'default' }
    end

    # Returns an unfrozen copy of the fallback so callers may modify it.
    def default_type_format
      DEFAULT_TYPE_FORMAT.dup
    end
  end
end
@@ -119,6 +119,24 @@ module DataPackage
119
119
  self[property] || default
120
120
  end
121
121
 
122
# Adds one inferred resource for every CSV file directly inside
# +directory+ (resolved relative to +base_path+) and returns the descriptor.
#
# @param base_path [String] root path that +directory+ is joined onto
# @param directory [String] directory, relative to +base_path+, to scan
# @return the value of +descriptor+ after the resources have been added
# @raise [PackageException] when +base_path+ or +directory+ is missing
def infer(base_path: nil, directory: nil)
  raise PackageException.new('Base path is required for infer') unless base_path
  raise PackageException.new('Directory is required for infer') unless directory

  dir_path = File.join(base_path, directory)
  # Sort the matches: before Ruby 3.0 Dir.glob returns them in
  # filesystem order, which would make the resource order vary by machine.
  Dir.glob("#{dir_path}/*.csv").sort.each do |filename|
    add_resource(Resource.infer(filename))
  end

  # Any resource on the package (not only ones added above) marks it as a
  # tabular data package.
  self['profile'] = 'tabular-data-package' if resources.count > 0

  descriptor
end
139
+
122
140
  # Private
123
141
 
124
142
  private
@@ -6,6 +6,33 @@ module DataPackage
6
6
 
7
7
  attr_reader :errors, :profile, :name, :source
8
8
 
9
# Builds a resource for the CSV file at +filepath+, with a schema whose
# field types and formats are inferred from the file's contents.
#
# Every field starts as type 'string'; the hash returned by
# DataPackage::Interpreter#type_and_format_at for that column is then
# merged over it.
#
# @param filepath [String] path to a file whose name ends in '.csv'
# @return a new instance built from the inferred descriptor
# @raise [ResourceException] when the file name does not end in '.csv'
def self.infer(filepath)
  filename = File.basename(filepath)
  unless filename.end_with?('.csv')
    raise ResourceException.new('Inferrable resource must have .csv extension')
  end

  table = CSV.read(filepath, headers: true)
  interpreter = DataPackage::Interpreter.new(table)
  fields = table.headers.map do |header|
    { 'name' => header, 'type' => 'string' }.merge(interpreter.type_and_format_at(header))
  end

  descriptor = {
    'format' => 'csv',
    'mediatype' => 'text/csv',
    # The resource name is the file name without its '.csv' suffix.
    'name' => filename.chomp('.csv'),
    'path' => filepath,
    'schema' => { 'fields' => fields }
  }

  new(descriptor)
end
35
+
9
36
  def initialize(resource, base_path = '')
10
37
  self.merge! dereference_descriptor(resource, base_path: base_path,
11
38
  reference_fields: ['schema', 'dialect'])
@@ -132,6 +159,7 @@ module DataPackage
132
159
  end
133
160
 
134
161
  def apply_table_defaults!
162
+ self['profile'] = DataPackage::DEFAULTS[:resource][:tabular_profile]
135
163
  if self.fetch('schema', nil)
136
164
  self['schema']['missingValues'] = DataPackage::DEFAULTS[:schema][:missing_values]
137
165
  self['schema'].fetch('fields', []).each do |field_descriptor|
@@ -1,3 +1,3 @@
1
1
  module DataPackage
2
- VERSION = "1.0.0"
2
+ VERSION = "1.1.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datapackage
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Leigh Dodds
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2017-09-04 00:00:00.000000000 Z
13
+ date: 2019-11-21 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: json-schema
@@ -183,6 +183,7 @@ files:
183
183
  - lib/datapackage/defaults.rb
184
184
  - lib/datapackage/exceptions.rb
185
185
  - lib/datapackage/helpers.rb
186
+ - lib/datapackage/interpreter.rb
186
187
  - lib/datapackage/package.rb
187
188
  - lib/datapackage/profile.rb
188
189
  - lib/datapackage/registry.rb
@@ -215,7 +216,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
215
216
  version: '0'
216
217
  requirements: []
217
218
  rubyforge_project:
218
- rubygems_version: 2.6.11
219
+ rubygems_version: 2.7.7
219
220
  signing_key:
220
221
  specification_version: 4
221
222
  summary: Library for working with data packages