datapackage 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: d07ade783e277a0c0f4559c64435201036560e30
4
- data.tar.gz: dc64a0a4b573ccbbd376e1f2c8d47bcceb1b0475
2
+ SHA256:
3
+ metadata.gz: c75abeabc9ffe3f12589976c2790dec6c2e4a748b97a99d196408fd02689d432
4
+ data.tar.gz: 94deaf207fd46c81e154b0617cbfcd2074f10dce2243eff888bb414618f756b8
5
5
  SHA512:
6
- metadata.gz: ba0539cd939c96123bd11da407171f07a720a9dd9a2de96e86f77e8e62a243b0272a04609838aaeb895bb421dbe04303110bfa416aacd0b5cf69b03badad0858
7
- data.tar.gz: c93372165ca7b145843912115c1d98eeceb9ea8dd8826b85e2702e2f5cd343fc5cfc4300ce30716b1110aee987f753adb1eaae3837011495e985064f0a9fd3b1
6
+ metadata.gz: 2a54d0a226c49d9a38db69f0a48f5e56220a8636ff5c27601a56d5d4325f3a0efbcb5999347631c4e0238bc4429dcf532391160decc3a4179703c05afb6c343b
7
+ data.tar.gz: 6390e3c8018ef3c6b8bf2c654b32e51dee41bca12762c6bc45ee5002547aa4080786c4235a331e0794458b855b884bd991522eb0621cd6fe6f1fc6e8d55dd5f5
data/bin/console CHANGED
File without changes
data/bin/datapackage CHANGED
File without changes
data/lib/datapackage.rb CHANGED
@@ -16,3 +16,4 @@ require 'datapackage/profile'
16
16
  require 'datapackage/resource'
17
17
  require 'datapackage/package'
18
18
  require 'datapackage/registry'
19
+ require 'datapackage/interpreter'
@@ -0,0 +1,66 @@
1
module DataPackage
  # Infers a type/format pair for the columns of a parsed CSV by pattern
  # matching the string values found in each column.
  class Interpreter
    # Upper bound on the number of values that must be inspected before a
    # type may be inferred for a column.
    INFER_THRESHOLD = 10
    # Minimum fraction of the inspected values that must share a type for
    # that type to be chosen.
    INFER_CONFIDENCE = 0.75
    YEAR_PATTERN = /[12]\d{3}/
    DATE_PATTERN = /(\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4})|(\d{4}[-\/]\d{1,2}[-\/]\d{1,2})/
    DATETIME_PATTERN = /(\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4}|\d{4}[-\/]\d{1,2}[-\/]\d{1,2}).\d{1,2}:\d{2}/
    # \A and \z anchor the whole value; ^ and $ only anchor a single line, so
    # a multi-line cell such as "42\nabc" would otherwise be accepted.
    TIME_PATTERN = /\A\d{1,2}((:\d{1,2})|(am|pm|AM|PM))\z/
    INTEGER_PATTERN = /\A\d+\z/
    DEFAULT_TYPE_FORMAT = {'type' => 'any', 'format' => 'default'}

    attr_reader :csv, :threshold

    # csv - an object responding to `length` and `values_at(header)`
    #       (Resource.infer passes the result of CSV.read with headers).
    #
    # The threshold is the smaller of the CSV length and INFER_THRESHOLD, so
    # short files can still reach it.
    def initialize(csv)
      @csv = csv
      @threshold = [csv.length, INFER_THRESHOLD].min
    end

    # Infers the type and format of the column named by `header`.
    #
    # Values are inspected in order. Once at least `threshold` values have
    # been seen, the type of the value just inspected is chosen as soon as it
    # accounts for at least INFER_CONFIDENCE of everything inspected so far.
    #
    # Returns a Hash with 'type' and 'format' keys; DEFAULT_TYPE_FORMAT when
    # no type reaches the required confidence.
    def type_and_format_at(header)
      values = csv.values_at(header).flatten
      counter = Hash.new(0)

      values.each.with_index(1) do |value, inspection_count|
        inspection = inspect_value(value)
        counter[inspection] += 1
        next if inspection_count < threshold

        # fdiv forces a Float ratio. Integer#/ truncates, which made every
        # ratio below 1 collapse to 0 and turned the 0.75 confidence into an
        # all-or-nothing check.
        confidence = counter[inspection].fdiv(inspection_count)
        return inspection if confidence >= INFER_CONFIDENCE
      end

      DEFAULT_TYPE_FORMAT
    end

    # Classifies a single value.
    #
    # Non-String values (including nil) map to DEFAULT_TYPE_FORMAT. Strings
    # are tested in this order: year, datetime, date, time, integer. The
    # datetime test runs before the date test because a datetime string also
    # contains a date.
    #
    # Returns a Hash with 'type' and 'format' keys.
    def inspect_value(value)
      return DEFAULT_TYPE_FORMAT unless value.is_a?(String)

      if value.length == 4 && value.match(YEAR_PATTERN)
        return { 'type' => 'year', 'format' => 'default' }
      end

      if value.match(DATETIME_PATTERN)
        return { 'type' => 'datetime', 'format' => 'default' }
      end

      if value.match(DATE_PATTERN)
        return { 'type' => 'date', 'format' => 'default' }
      end

      if value.match(TIME_PATTERN)
        return { 'type' => 'time', 'format' => 'default' }
      end

      if value.match(INTEGER_PATTERN)
        return { 'type' => 'integer', 'format' => 'default' }
      end

      DEFAULT_TYPE_FORMAT
    end
  end
end
@@ -119,6 +119,24 @@ module DataPackage
119
119
  self[property] || default
120
120
  end
121
121
 
122
# Adds a resource for every CSV file found in a directory and returns the
# package descriptor.
#
# base_path - root path; joined with `directory` to locate the files.
# directory - directory under `base_path` that is searched for "*.csv".
#
# Each matching file is passed to Resource.infer and the result to
# add_resource. The 'profile' is set to 'tabular-data-package' when the
# package holds at least one resource afterwards.
#
# Raises PackageException when either keyword argument is missing.
def infer(base_path: nil, directory: nil)
  raise PackageException.new('Base path is required for infer') if !base_path
  raise PackageException.new('Directory is required for infer') if !directory

  csv_pattern = "#{File.join(base_path, directory)}/*.csv"
  Dir.glob(csv_pattern) do |csv_file|
    add_resource(Resource.infer(csv_file))
  end

  # Any resource at all marks this as a tabular data package
  self['profile'] = 'tabular-data-package' unless resources.count.zero?

  descriptor
end
139
+
122
140
  # Private
123
141
 
124
142
  private
@@ -6,6 +6,33 @@ module DataPackage
6
6
 
7
7
  attr_reader :errors, :profile, :name, :source
8
8
 
9
# Builds a resource whose descriptor is inferred from a CSV file.
#
# filepath - path to a file whose name ends in ".csv".
#
# The resource name is the file name without the ".csv" suffix. One schema
# field is produced per CSV header; its type and format come from
# DataPackage::Interpreter, whose result overrides the initial 'string' type.
#
# Raises ResourceException when the file name does not end in ".csv".
def self.infer(filepath)
  file_name = File.basename(filepath)
  unless file_name.end_with?('.csv')
    raise ResourceException.new('Inferrable resource must have .csv extension')
  end

  table = CSV.read(filepath, headers: true)
  interpreter = DataPackage::Interpreter.new(table)
  fields = table.headers.map do |header|
    { 'name' => header, 'type' => 'string' }.merge(interpreter.type_and_format_at(header))
  end

  new(
    'format' => 'csv',
    'mediatype' => 'text/csv',
    'name' => file_name.chomp('.csv'),
    'path' => filepath,
    'schema' => { 'fields' => fields }
  )
end
35
+
9
36
  def initialize(resource, base_path = '')
10
37
  self.merge! dereference_descriptor(resource, base_path: base_path,
11
38
  reference_fields: ['schema', 'dialect'])
@@ -132,6 +159,7 @@ module DataPackage
132
159
  end
133
160
 
134
161
  def apply_table_defaults!
162
+ self['profile'] = DataPackage::DEFAULTS[:resource][:tabular_profile]
135
163
  if self.fetch('schema', nil)
136
164
  self['schema']['missingValues'] = DataPackage::DEFAULTS[:schema][:missing_values]
137
165
  self['schema'].fetch('fields', []).each do |field_descriptor|
@@ -1,3 +1,3 @@
1
1
  module DataPackage
2
- VERSION = "1.0.0"
2
+ VERSION = "1.1.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datapackage
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Leigh Dodds
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2017-09-04 00:00:00.000000000 Z
13
+ date: 2019-11-21 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: json-schema
@@ -183,6 +183,7 @@ files:
183
183
  - lib/datapackage/defaults.rb
184
184
  - lib/datapackage/exceptions.rb
185
185
  - lib/datapackage/helpers.rb
186
+ - lib/datapackage/interpreter.rb
186
187
  - lib/datapackage/package.rb
187
188
  - lib/datapackage/profile.rb
188
189
  - lib/datapackage/registry.rb
@@ -215,7 +216,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
215
216
  version: '0'
216
217
  requirements: []
217
218
  rubyforge_project:
218
- rubygems_version: 2.6.11
219
+ rubygems_version: 2.7.7
219
220
  signing_key:
220
221
  specification_version: 4
221
222
  summary: Library for working with data packages