tableschema 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +21 -0
- data/.travis.yml +15 -1
- data/README.md +164 -129
- data/Rakefile +10 -1
- data/bin/console +2 -6
- data/{etc/schemas → lib/profiles}/geojson.json +0 -1
- data/lib/profiles/table-schema.json +1625 -0
- data/lib/profiles/topojson.json +311 -0
- data/lib/tableschema.rb +5 -3
- data/lib/tableschema/constraints/constraints.rb +12 -24
- data/lib/tableschema/constraints/enum.rb +6 -2
- data/lib/tableschema/constraints/max_length.rb +6 -2
- data/lib/tableschema/constraints/maximum.rb +12 -2
- data/lib/tableschema/constraints/min_length.rb +6 -2
- data/lib/tableschema/constraints/minimum.rb +12 -2
- data/lib/tableschema/constraints/pattern.rb +9 -2
- data/lib/tableschema/constraints/required.rb +6 -15
- data/lib/tableschema/constraints/unique.rb +12 -0
- data/lib/tableschema/defaults.rb +9 -0
- data/lib/tableschema/exceptions.rb +15 -2
- data/lib/tableschema/field.rb +39 -20
- data/lib/tableschema/helpers.rb +32 -15
- data/lib/tableschema/infer.rb +31 -28
- data/lib/tableschema/model.rb +57 -34
- data/lib/tableschema/schema.rb +40 -6
- data/lib/tableschema/table.rb +75 -26
- data/lib/tableschema/types/any.rb +1 -0
- data/lib/tableschema/types/array.rb +2 -1
- data/lib/tableschema/types/base.rb +9 -21
- data/lib/tableschema/types/date.rb +1 -0
- data/lib/tableschema/types/datetime.rb +1 -0
- data/lib/tableschema/types/duration.rb +31 -0
- data/lib/tableschema/types/geojson.rb +27 -5
- data/lib/tableschema/types/geopoint.rb +4 -3
- data/lib/tableschema/types/integer.rb +1 -0
- data/lib/tableschema/types/number.rb +40 -25
- data/lib/tableschema/types/object.rb +2 -1
- data/lib/tableschema/types/string.rb +8 -0
- data/lib/tableschema/types/time.rb +1 -0
- data/lib/tableschema/types/year.rb +34 -0
- data/lib/tableschema/types/yearmonth.rb +52 -0
- data/lib/tableschema/validate.rb +45 -29
- data/lib/tableschema/version.rb +1 -1
- data/tableschema.gemspec +2 -1
- metadata +31 -12
- data/etc/schemas/json-table-schema.json +0 -102
- data/lib/tableschema/data.rb +0 -60
- data/lib/tableschema/types/null.rb +0 -37
data/lib/tableschema/model.rb
CHANGED
@@ -1,73 +1,96 @@
|
|
1
|
+
require 'tableschema/defaults'
|
2
|
+
|
1
3
|
module TableSchema
|
2
4
|
module Model
|
3
5
|
|
4
|
-
DEFAULTS = {
|
5
|
-
'format' => 'default',
|
6
|
-
'type' => 'string'
|
7
|
-
}
|
8
|
-
|
9
6
|
def headers
|
10
|
-
fields.map { |f| transform(f[
|
7
|
+
fields.map { |f| transform(f[:name]) }
|
11
8
|
rescue NoMethodError
|
12
9
|
[]
|
13
10
|
end
|
14
11
|
|
12
|
+
alias :field_names :headers
|
13
|
+
|
15
14
|
def fields
|
16
|
-
self[
|
15
|
+
self[:fields]
|
17
16
|
end
|
18
17
|
|
19
18
|
def primary_keys
|
20
|
-
[self[
|
19
|
+
[self[:primaryKey]].flatten.reject { |k| k.nil? }
|
21
20
|
end
|
22
21
|
|
23
22
|
def foreign_keys
|
24
|
-
self[
|
23
|
+
self[:foreignKeys] || []
|
24
|
+
end
|
25
|
+
|
26
|
+
def missing_values
|
27
|
+
self.fetch(:missingValues, TableSchema::DEFAULTS[:missing_values])
|
25
28
|
end
|
26
29
|
|
27
|
-
def get_type(
|
28
|
-
get_field(
|
30
|
+
def get_type(field_name)
|
31
|
+
get_field(field_name)[:type]
|
29
32
|
end
|
30
33
|
|
31
|
-
def get_constraints(
|
32
|
-
get_field(
|
34
|
+
def get_constraints(field_name)
|
35
|
+
get_field(field_name)[:constraints] || {}
|
33
36
|
end
|
34
37
|
|
35
38
|
def required_headers
|
36
|
-
fields.select { |f| f
|
37
|
-
.map { |f| transform(f[
|
38
|
-
|
39
|
-
|
39
|
+
fields.select { |f| f.fetch(:constraints, {}).fetch(:required, nil).to_s == 'true' }
|
40
|
+
.map { |f| transform(f[:name]) }
|
41
|
+
end
|
42
|
+
|
43
|
+
def unique_headers
|
44
|
+
fields.select { |f| f.fetch(:constraints, {}).fetch(:unique, nil).to_s == 'true' }
|
45
|
+
.map { |f| transform(f[:name]) }
|
40
46
|
end
|
41
47
|
|
42
|
-
def has_field?(
|
43
|
-
get_field(
|
48
|
+
def has_field?(field_name)
|
49
|
+
get_field(field_name) != nil
|
44
50
|
end
|
45
51
|
|
46
|
-
def get_field(
|
47
|
-
fields.find { |f| f[
|
52
|
+
def get_field(field_name)
|
53
|
+
fields.find { |f| f[:name] == field_name }
|
48
54
|
end
|
49
55
|
|
50
56
|
def get_fields_by_type(type)
|
51
|
-
fields.select { |f| f[
|
57
|
+
fields.select { |f| f[:type] == type }
|
58
|
+
end
|
59
|
+
|
60
|
+
def add_field(descriptor)
|
61
|
+
self[:fields].push(descriptor)
|
62
|
+
validate!
|
63
|
+
descriptor
|
64
|
+
rescue TableSchema::SchemaException => e
|
65
|
+
self[:fields].pop
|
66
|
+
raise e if @strict
|
67
|
+
nil
|
68
|
+
end
|
69
|
+
|
70
|
+
def remove_field(field_name)
|
71
|
+
field = get_field(field_name)
|
72
|
+
self[:fields].reject!{ |f| f.name == field_name }
|
73
|
+
validate
|
74
|
+
field
|
52
75
|
end
|
53
76
|
|
54
77
|
private
|
55
78
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
79
|
+
def transform(name)
|
80
|
+
name.downcase! if @case_insensitive_headers == true
|
81
|
+
name
|
82
|
+
end
|
60
83
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
end
|
84
|
+
def expand!
|
85
|
+
(self[:fields] || []).each do |f|
|
86
|
+
f[:type] = TableSchema::DEFAULTS[:type] if f[:type] == nil
|
87
|
+
f[:format] = TableSchema::DEFAULTS[:format] if f[:format] == nil
|
66
88
|
end
|
89
|
+
end
|
67
90
|
|
68
|
-
|
69
|
-
|
70
|
-
|
91
|
+
def load_fields!
|
92
|
+
self[:fields] = (self[:fields] || []).map { |f| TableSchema::Field.new(f, missing_values) }
|
93
|
+
end
|
71
94
|
|
72
95
|
end
|
73
96
|
end
|
data/lib/tableschema/schema.rb
CHANGED
@@ -2,16 +2,23 @@ module TableSchema
|
|
2
2
|
class Schema < Hash
|
3
3
|
include TableSchema::Validate
|
4
4
|
include TableSchema::Model
|
5
|
-
include TableSchema::Data
|
6
5
|
include TableSchema::Helpers
|
7
6
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
7
|
+
attr_reader :errors
|
8
|
+
|
9
|
+
def initialize(descriptor, case_insensitive_headers: false, strict: false)
|
10
|
+
self.merge! deep_symbolize_keys(parse_schema(descriptor))
|
11
|
+
@case_insensitive_headers = case_insensitive_headers
|
12
|
+
@strict = strict
|
12
13
|
load_fields!
|
13
14
|
load_validator!
|
14
15
|
expand!
|
16
|
+
@strict == true ? validate! : validate
|
17
|
+
self
|
18
|
+
end
|
19
|
+
|
20
|
+
def descriptor
|
21
|
+
self.to_h
|
15
22
|
end
|
16
23
|
|
17
24
|
def parse_schema(descriptor)
|
@@ -19,7 +26,7 @@ module TableSchema
|
|
19
26
|
descriptor
|
20
27
|
elsif descriptor.class == String
|
21
28
|
begin
|
22
|
-
JSON.parse
|
29
|
+
JSON.parse(open(descriptor).read, symbolize_names: true)
|
23
30
|
rescue Errno::ENOENT
|
24
31
|
raise SchemaException.new("File not found at `#{descriptor}`")
|
25
32
|
rescue OpenURI::HTTPError => e
|
@@ -32,5 +39,32 @@ module TableSchema
|
|
32
39
|
end
|
33
40
|
end
|
34
41
|
|
42
|
+
def cast_row(row, fail_fast: true)
|
43
|
+
errors = Set.new
|
44
|
+
handle_error = lambda { |e| fail_fast == true ? raise(e) : errors << e }
|
45
|
+
row = row.fields if row.class == CSV::Row
|
46
|
+
if row.count != self.fields.count
|
47
|
+
handle_error.call(TableSchema::ConversionError.new("The number of items to convert (#{row.count}) does not match the number of headers in the schema (#{self.fields.count})"))
|
48
|
+
end
|
49
|
+
|
50
|
+
self.fields.each_with_index do |field, i|
|
51
|
+
begin
|
52
|
+
row[i] = field.cast_value(row[i])
|
53
|
+
rescue TableSchema::Exception => e
|
54
|
+
handle_error.call(e)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
unless errors.empty?
|
59
|
+
raise(TableSchema::MultipleInvalid.new("There were errors parsing the data", errors))
|
60
|
+
end
|
61
|
+
row
|
62
|
+
end
|
63
|
+
|
64
|
+
def save(target)
|
65
|
+
File.open(target, "w") { |file| file << JSON.pretty_generate(self.descriptor) }
|
66
|
+
true
|
67
|
+
end
|
68
|
+
|
35
69
|
end
|
36
70
|
end
|
data/lib/tableschema/table.rb
CHANGED
@@ -1,51 +1,100 @@
|
|
1
1
|
module TableSchema
|
2
2
|
class Table
|
3
3
|
|
4
|
-
attr_reader :schema
|
4
|
+
attr_reader :schema, :headers
|
5
5
|
|
6
|
-
def self.infer_schema(csv,
|
7
|
-
TableSchema::Table.new(csv, nil,
|
6
|
+
def self.infer_schema(csv, csv_options: {})
|
7
|
+
TableSchema::Table.new(csv, nil, csv_options)
|
8
8
|
end
|
9
9
|
|
10
|
-
def initialize(csv, descriptor,
|
11
|
-
@
|
10
|
+
def initialize(csv, descriptor, csv_options: {})
|
11
|
+
@csv_options = csv_options.merge(headers: true)
|
12
12
|
@csv = parse_csv(csv)
|
13
|
-
@
|
13
|
+
@headers = initialize_headers
|
14
|
+
@schema = descriptor.nil? ? infer_schema : TableSchema::Schema.new(descriptor)
|
15
|
+
initialize_unique_colums
|
14
16
|
end
|
15
17
|
|
18
|
+
def iter(row_limit: nil, cast: true, keyed: false)
|
19
|
+
unless block_given?
|
20
|
+
return enum_for(:iter, row_limit: row_limit, cast: cast, keyed: keyed)
|
21
|
+
end
|
22
|
+
|
23
|
+
@csv.each_with_index do |row, i|
|
24
|
+
break if row_limit && (row_limit <= i)
|
25
|
+
if cast == true
|
26
|
+
cast_values = @schema.cast_row(row)
|
27
|
+
row = CSV::Row.new(@headers, cast_values)
|
28
|
+
check_unique_fields(row, i)
|
29
|
+
end
|
30
|
+
if keyed == true
|
31
|
+
yield row.to_h
|
32
|
+
else
|
33
|
+
yield row.fields
|
34
|
+
end
|
35
|
+
collect_unique_fields(row, i)
|
36
|
+
end
|
37
|
+
|
38
|
+
@csv.rewind
|
39
|
+
end
|
40
|
+
|
41
|
+
def read(row_limit: nil, cast: true, keyed: false)
|
42
|
+
iterator = self.iter(row_limit: row_limit, cast: cast, keyed: keyed)
|
43
|
+
iterator.to_a
|
44
|
+
end
|
45
|
+
|
46
|
+
def save(target)
|
47
|
+
CSV.open(target, "wb", @csv_options) do |csv|
|
48
|
+
csv << @headers
|
49
|
+
self.iter{ |row| csv << row }
|
50
|
+
end
|
51
|
+
true
|
52
|
+
end
|
53
|
+
|
54
|
+
private
|
55
|
+
|
16
56
|
def parse_csv(csv)
|
17
57
|
csv = csv.is_a?(Array) ? StringIO.new(array_to_csv csv) : open(csv)
|
18
|
-
CSV.new(csv, csv_options)
|
58
|
+
CSV.new(csv, @csv_options)
|
19
59
|
end
|
20
60
|
|
21
|
-
def
|
22
|
-
(
|
61
|
+
def array_to_csv(array)
|
62
|
+
array.map { |row| row.to_csv(row_sep: nil) }.join("\r\n")
|
23
63
|
end
|
24
64
|
|
25
|
-
def
|
26
|
-
|
27
|
-
|
28
|
-
|
65
|
+
def infer_schema
|
66
|
+
inferer = TableSchema::Infer.new(@headers, @csv)
|
67
|
+
@csv.rewind
|
68
|
+
inferer.schema
|
29
69
|
end
|
30
70
|
|
31
|
-
|
71
|
+
def initialize_headers
|
72
|
+
headers = @csv.first.to_h.keys
|
73
|
+
@csv.rewind
|
74
|
+
headers
|
75
|
+
end
|
32
76
|
|
33
|
-
|
34
|
-
|
77
|
+
def initialize_unique_colums
|
78
|
+
@unique_columns = {}
|
79
|
+
unless @schema.unique_headers.empty?
|
80
|
+
@schema.unique_headers.each{ |header| @unique_columns[header] = [] }
|
35
81
|
end
|
82
|
+
end
|
36
83
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
end
|
41
|
-
end
|
84
|
+
def collect_unique_fields(row, row_number)
|
85
|
+
@unique_columns.each { |col_name, values| values[row_number] = row[col_name] }
|
86
|
+
end
|
42
87
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
88
|
+
def check_unique_fields(row, row_number)
|
89
|
+
@unique_columns.each do |col_name, values|
|
90
|
+
row_value = row[col_name]
|
91
|
+
previous_values = values[0..row_number-1]
|
92
|
+
previous_values.map!{|value| @schema.get_field(col_name).cast_type(value)}
|
93
|
+
if previous_values.include?(row_value)
|
94
|
+
raise TableSchema::ConstraintError.new("The values for the field `#{col_name}` should be unique but value `#{row_value}` is repeated")
|
95
|
+
end
|
48
96
|
end
|
97
|
+
end
|
49
98
|
|
50
99
|
end
|
51
100
|
end
|
@@ -9,6 +9,7 @@ module TableSchema
|
|
9
9
|
def self.supported_constraints
|
10
10
|
[
|
11
11
|
'required',
|
12
|
+
'unique',
|
12
13
|
'pattern',
|
13
14
|
'enum',
|
14
15
|
'minLength',
|
@@ -22,7 +23,7 @@ module TableSchema
|
|
22
23
|
|
23
24
|
def cast_default(value)
|
24
25
|
return value if value.is_a?(type)
|
25
|
-
parsed = JSON.parse(value)
|
26
|
+
parsed = JSON.parse(value, symbolize_names: true)
|
26
27
|
if parsed.is_a?(type)
|
27
28
|
return parsed
|
28
29
|
else
|
@@ -1,20 +1,16 @@
|
|
1
|
+
require 'tableschema/defaults'
|
2
|
+
|
1
3
|
module TableSchema
|
2
4
|
module Types
|
3
5
|
class Base
|
4
6
|
include TableSchema::Helpers
|
5
7
|
|
6
|
-
|
7
8
|
def initialize(field)
|
8
9
|
@field = field
|
9
|
-
@constraints = field['constraints'] || {}
|
10
|
-
@required = ['true', true].include?(@constraints['required'])
|
11
|
-
@type = @field['type']
|
12
10
|
set_format
|
13
11
|
end
|
14
12
|
|
15
|
-
def cast(value
|
16
|
-
TableSchema::Constraints.new(@field, value).validate! unless skip_constraints
|
17
|
-
return nil if is_null?(value)
|
13
|
+
def cast(value)
|
18
14
|
send("cast_#{@format}", value)
|
19
15
|
rescue NoMethodError => e
|
20
16
|
if e.message.start_with?('undefined method `cast_')
|
@@ -25,30 +21,22 @@ module TableSchema
|
|
25
21
|
end
|
26
22
|
|
27
23
|
def test(value)
|
28
|
-
cast(value
|
24
|
+
cast(value)
|
29
25
|
true
|
30
26
|
rescue TableSchema::Exception
|
31
27
|
false
|
32
28
|
end
|
33
29
|
|
30
|
+
private
|
31
|
+
|
34
32
|
def set_format
|
35
|
-
if (@field[
|
36
|
-
@format, @format_string = *@field[
|
33
|
+
if (@field[:format] || '').start_with?('fmt:')
|
34
|
+
@format, @format_string = *@field[:format].split(':', 2)
|
37
35
|
else
|
38
|
-
@format = @field[
|
36
|
+
@format = @field[:format] || TableSchema::DEFAULTS[:format]
|
39
37
|
end
|
40
38
|
end
|
41
39
|
|
42
|
-
private
|
43
|
-
|
44
|
-
def is_null?(value)
|
45
|
-
null_values.include?(value) && @required == false
|
46
|
-
end
|
47
|
-
|
48
|
-
def null_values
|
49
|
-
['null', 'none', 'nil', 'nan', '-', '']
|
50
|
-
end
|
51
|
-
|
52
40
|
end
|
53
41
|
end
|
54
42
|
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module TableSchema
|
2
|
+
module Types
|
3
|
+
class Duration < Base
|
4
|
+
|
5
|
+
def name
|
6
|
+
'duration'
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.supported_constraints
|
10
|
+
[
|
11
|
+
'required',
|
12
|
+
'unique',
|
13
|
+
'enum',
|
14
|
+
'minimum',
|
15
|
+
'maximum',
|
16
|
+
]
|
17
|
+
end
|
18
|
+
|
19
|
+
def type
|
20
|
+
ActiveSupport::Duration
|
21
|
+
end
|
22
|
+
|
23
|
+
def cast_default(value)
|
24
|
+
ActiveSupport::Duration.parse(value)
|
25
|
+
rescue ActiveSupport::Duration::ISO8601Parser::ParsingError, TypeError
|
26
|
+
raise TableSchema::InvalidDurationType.new("#{value} is not a valid duration")
|
27
|
+
end
|
28
|
+
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|