tableschema 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +21 -0
- data/.travis.yml +15 -1
- data/README.md +164 -129
- data/Rakefile +10 -1
- data/bin/console +2 -6
- data/{etc/schemas → lib/profiles}/geojson.json +0 -1
- data/lib/profiles/table-schema.json +1625 -0
- data/lib/profiles/topojson.json +311 -0
- data/lib/tableschema.rb +5 -3
- data/lib/tableschema/constraints/constraints.rb +12 -24
- data/lib/tableschema/constraints/enum.rb +6 -2
- data/lib/tableschema/constraints/max_length.rb +6 -2
- data/lib/tableschema/constraints/maximum.rb +12 -2
- data/lib/tableschema/constraints/min_length.rb +6 -2
- data/lib/tableschema/constraints/minimum.rb +12 -2
- data/lib/tableschema/constraints/pattern.rb +9 -2
- data/lib/tableschema/constraints/required.rb +6 -15
- data/lib/tableschema/constraints/unique.rb +12 -0
- data/lib/tableschema/defaults.rb +9 -0
- data/lib/tableschema/exceptions.rb +15 -2
- data/lib/tableschema/field.rb +39 -20
- data/lib/tableschema/helpers.rb +32 -15
- data/lib/tableschema/infer.rb +31 -28
- data/lib/tableschema/model.rb +57 -34
- data/lib/tableschema/schema.rb +40 -6
- data/lib/tableschema/table.rb +75 -26
- data/lib/tableschema/types/any.rb +1 -0
- data/lib/tableschema/types/array.rb +2 -1
- data/lib/tableschema/types/base.rb +9 -21
- data/lib/tableschema/types/date.rb +1 -0
- data/lib/tableschema/types/datetime.rb +1 -0
- data/lib/tableschema/types/duration.rb +31 -0
- data/lib/tableschema/types/geojson.rb +27 -5
- data/lib/tableschema/types/geopoint.rb +4 -3
- data/lib/tableschema/types/integer.rb +1 -0
- data/lib/tableschema/types/number.rb +40 -25
- data/lib/tableschema/types/object.rb +2 -1
- data/lib/tableschema/types/string.rb +8 -0
- data/lib/tableschema/types/time.rb +1 -0
- data/lib/tableschema/types/year.rb +34 -0
- data/lib/tableschema/types/yearmonth.rb +52 -0
- data/lib/tableschema/validate.rb +45 -29
- data/lib/tableschema/version.rb +1 -1
- data/tableschema.gemspec +2 -1
- metadata +31 -12
- data/etc/schemas/json-table-schema.json +0 -102
- data/lib/tableschema/data.rb +0 -60
- data/lib/tableschema/types/null.rb +0 -37
data/lib/tableschema/model.rb
CHANGED
@@ -1,73 +1,96 @@
|
|
1
|
+
require 'tableschema/defaults'
|
2
|
+
|
1
3
|
module TableSchema
|
2
4
|
module Model
|
3
5
|
|
4
|
-
DEFAULTS = {
|
5
|
-
'format' => 'default',
|
6
|
-
'type' => 'string'
|
7
|
-
}
|
8
|
-
|
9
6
|
def headers
|
10
|
-
fields.map { |f| transform(f[
|
7
|
+
fields.map { |f| transform(f[:name]) }
|
11
8
|
rescue NoMethodError
|
12
9
|
[]
|
13
10
|
end
|
14
11
|
|
12
|
+
alias :field_names :headers
|
13
|
+
|
15
14
|
def fields
|
16
|
-
self[
|
15
|
+
self[:fields]
|
17
16
|
end
|
18
17
|
|
19
18
|
# Lists the schema's primary key names as a flat array.
#
# The :primaryKey entry is wrapped and flattened, so a single name and an
# array of names are handled alike; nil entries (including a missing
# :primaryKey) are dropped, giving [] when nothing is declared.
def primary_keys
  declared = [self[:primaryKey]].flatten
  declared.compact
end
|
22
21
|
|
23
22
|
def foreign_keys
|
24
|
-
self[
|
23
|
+
self[:foreignKeys] || []
|
24
|
+
end
|
25
|
+
|
26
|
+
def missing_values
|
27
|
+
self.fetch(:missingValues, TableSchema::DEFAULTS[:missing_values])
|
25
28
|
end
|
26
29
|
|
27
|
-
def get_type(
|
28
|
-
get_field(
|
30
|
+
def get_type(field_name)
|
31
|
+
get_field(field_name)[:type]
|
29
32
|
end
|
30
33
|
|
31
|
-
def get_constraints(
|
32
|
-
get_field(
|
34
|
+
def get_constraints(field_name)
|
35
|
+
get_field(field_name)[:constraints] || {}
|
33
36
|
end
|
34
37
|
|
35
38
|
def required_headers
|
36
|
-
fields.select { |f| f
|
37
|
-
.map { |f| transform(f[
|
38
|
-
|
39
|
-
|
39
|
+
fields.select { |f| f.fetch(:constraints, {}).fetch(:required, nil).to_s == 'true' }
|
40
|
+
.map { |f| transform(f[:name]) }
|
41
|
+
end
|
42
|
+
|
43
|
+
def unique_headers
|
44
|
+
fields.select { |f| f.fetch(:constraints, {}).fetch(:unique, nil).to_s == 'true' }
|
45
|
+
.map { |f| transform(f[:name]) }
|
40
46
|
end
|
41
47
|
|
42
|
-
def has_field?(
|
43
|
-
get_field(
|
48
|
+
# Tells whether the schema declares a field called +field_name+.
#
# Delegates the lookup to get_field and returns true when it finds an
# entry, false otherwise.
def has_field?(field_name)
  found = get_field(field_name)
  !found.nil?
end
|
45
51
|
|
46
|
-
def get_field(
|
47
|
-
fields.find { |f| f[
|
52
|
+
def get_field(field_name)
|
53
|
+
fields.find { |f| f[:name] == field_name }
|
48
54
|
end
|
49
55
|
|
50
56
|
def get_fields_by_type(type)
|
51
|
-
fields.select { |f| f[
|
57
|
+
fields.select { |f| f[:type] == type }
|
58
|
+
end
|
59
|
+
|
60
|
+
def add_field(descriptor)
|
61
|
+
self[:fields].push(descriptor)
|
62
|
+
validate!
|
63
|
+
descriptor
|
64
|
+
rescue TableSchema::SchemaException => e
|
65
|
+
self[:fields].pop
|
66
|
+
raise e if @strict
|
67
|
+
nil
|
68
|
+
end
|
69
|
+
|
70
|
+
# Removes the field named +field_name+ from the schema and re-validates it.
#
# Entries are matched with hash-style access (f[:name]), the same lookup
# get_field uses. The previous f.name call only worked for entries that
# respond to #name and raised NoMethodError for plain descriptor hashes,
# such as the ones add_field pushes onto self[:fields].
#
# Returns the entry get_field found before removal, or nil when no field
# has that name.
def remove_field(field_name)
  field = get_field(field_name)
  self[:fields].reject! { |f| f[:name] == field_name }
  validate
  field
end
|
53
76
|
|
54
77
|
private
|
55
78
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
79
|
+
# Normalises a header name according to the case-sensitivity setting.
#
# When @case_insensitive_headers is exactly true, a lower-cased copy of
# +name+ is returned; otherwise +name+ is returned as given.
#
# The copy is deliberate: the earlier downcase! rewrote the caller's string
# in place, so merely reading the headers altered the field names stored in
# the descriptor, and a frozen name raised an error.
# NOTE(review): any caller that relied on names being lower-cased in place
# after a headers call should be checked.
def transform(name)
  return name unless @case_insensitive_headers == true

  name.downcase
end
|
60
83
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
end
|
84
|
+
# Fills in the default :type and :format on every field that lacks them.
#
# Iterates self[:fields] (or an empty list when the key is absent) and, for
# each of the two keys, assigns the matching TableSchema::DEFAULTS value
# only when the field's current value is nil.
def expand!
  field_list = self[:fields] || []
  field_list.each do |field|
    %i[type format].each do |key|
      field[key] = TableSchema::DEFAULTS[key] if field[key].nil?
    end
  end
end
|
67
90
|
|
68
|
-
|
69
|
-
|
70
|
-
|
91
|
+
def load_fields!
|
92
|
+
self[:fields] = (self[:fields] || []).map { |f| TableSchema::Field.new(f, missing_values) }
|
93
|
+
end
|
71
94
|
|
72
95
|
end
|
73
96
|
end
|
data/lib/tableschema/schema.rb
CHANGED
@@ -2,16 +2,23 @@ module TableSchema
|
|
2
2
|
class Schema < Hash
|
3
3
|
include TableSchema::Validate
|
4
4
|
include TableSchema::Model
|
5
|
-
include TableSchema::Data
|
6
5
|
include TableSchema::Helpers
|
7
6
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
7
|
+
attr_reader :errors
|
8
|
+
|
9
|
+
def initialize(descriptor, case_insensitive_headers: false, strict: false)
|
10
|
+
self.merge! deep_symbolize_keys(parse_schema(descriptor))
|
11
|
+
@case_insensitive_headers = case_insensitive_headers
|
12
|
+
@strict = strict
|
12
13
|
load_fields!
|
13
14
|
load_validator!
|
14
15
|
expand!
|
16
|
+
@strict == true ? validate! : validate
|
17
|
+
self
|
18
|
+
end
|
19
|
+
|
20
|
+
def descriptor
|
21
|
+
self.to_h
|
15
22
|
end
|
16
23
|
|
17
24
|
def parse_schema(descriptor)
|
@@ -19,7 +26,7 @@ module TableSchema
|
|
19
26
|
descriptor
|
20
27
|
elsif descriptor.class == String
|
21
28
|
begin
|
22
|
-
JSON.parse
|
29
|
+
JSON.parse(open(descriptor).read, symbolize_names: true)
|
23
30
|
rescue Errno::ENOENT
|
24
31
|
raise SchemaException.new("File not found at `#{descriptor}`")
|
25
32
|
rescue OpenURI::HTTPError => e
|
@@ -32,5 +39,32 @@ module TableSchema
|
|
32
39
|
end
|
33
40
|
end
|
34
41
|
|
42
|
+
# Casts every value of +row+ with the schema field at the same position.
#
# row       - a CSV::Row or an array-like list of raw values.
# fail_fast - when true (the default) the first error is raised at once;
#             otherwise errors are collected and raised together at the end.
#
# Returns a new array of cast values. The input is never modified: a
# CSV::Row is converted with #fields and any other row is duplicated first.
# Previously an Array argument was overwritten in place with the cast values.
#
# Raises TableSchema::ConversionError when the number of values differs from
# the number of fields, whatever error a field's cast_value raises (in
# fail-fast mode), or TableSchema::MultipleInvalid carrying the collected
# errors (when fail_fast is not true).
def cast_row(row, fail_fast: true)
  errors = Set.new
  handle_error = lambda { |e| fail_fast == true ? raise(e) : errors << e }

  # Work on a private copy so the caller's row keeps its raw values.
  row = row.is_a?(CSV::Row) ? row.fields : row.dup

  if row.count != fields.count
    handle_error.call(TableSchema::ConversionError.new("The number of items to convert (#{row.count}) does not match the number of headers in the schema (#{fields.count})"))
  end

  fields.each_with_index do |field, i|
    begin
      row[i] = field.cast_value(row[i])
    rescue TableSchema::Exception => e
      handle_error.call(e)
    end
  end

  unless errors.empty?
    raise(TableSchema::MultipleInvalid.new("There were errors parsing the data", errors))
  end

  row
end
|
63
|
+
|
64
|
+
def save(target)
|
65
|
+
File.open(target, "w") { |file| file << JSON.pretty_generate(self.descriptor) }
|
66
|
+
true
|
67
|
+
end
|
68
|
+
|
35
69
|
end
|
36
70
|
end
|
data/lib/tableschema/table.rb
CHANGED
@@ -1,51 +1,100 @@
|
|
1
1
|
module TableSchema
|
2
2
|
class Table
|
3
3
|
|
4
|
-
attr_reader :schema
|
4
|
+
attr_reader :schema, :headers
|
5
5
|
|
6
|
-
def self.infer_schema(csv,
|
7
|
-
TableSchema::Table.new(csv, nil,
|
6
|
+
# Builds a Table for +csv+ without a descriptor, so the constructor infers
# the schema from the data.
#
# csv         - the CSV source, handed to Table.new unchanged.
# csv_options - options hash forwarded to Table.new.
#
# The options are forwarded under the csv_options: keyword that the
# constructor declares. Passing the hash positionally raises ArgumentError
# on Ruby 3, and on Ruby 2 it is read as keyword arguments, so a key such as
# :col_sep is rejected as an unknown keyword.
#
# Returns the new TableSchema::Table.
def self.infer_schema(csv, csv_options: {})
  TableSchema::Table.new(csv, nil, csv_options: csv_options)
end
|
9
9
|
|
10
|
-
def initialize(csv, descriptor,
|
11
|
-
@
|
10
|
+
def initialize(csv, descriptor, csv_options: {})
|
11
|
+
@csv_options = csv_options.merge(headers: true)
|
12
12
|
@csv = parse_csv(csv)
|
13
|
-
@
|
13
|
+
@headers = initialize_headers
|
14
|
+
@schema = descriptor.nil? ? infer_schema : TableSchema::Schema.new(descriptor)
|
15
|
+
initialize_unique_colums
|
14
16
|
end
|
15
17
|
|
18
|
+
# Iterates over the rows of the underlying CSV, yielding each one.
#
# row_limit - when set, iteration stops once this many rows have been read.
# cast      - when true, each row is cast through @schema.cast_row, rebuilt
#             as a CSV::Row keyed by @headers, and checked against the
#             unique-column values collected so far.
# keyed     - when true, rows are yielded as hashes (row.to_h); otherwise as
#             arrays (row.fields).
#
# Without a block, returns an enumerator over the same iteration.
#
# The CSV is rewound in an ensure clause, so an exception from casting, the
# uniqueness check or the caller's block no longer leaves the reader
# mid-stream for the next pass. On normal completion the method still
# returns the value of @csv.rewind.
def iter(row_limit: nil, cast: true, keyed: false)
  unless block_given?
    return enum_for(:iter, row_limit: row_limit, cast: cast, keyed: keyed)
  end

  rewound = nil
  begin
    @csv.each_with_index do |row, i|
      break if row_limit && (row_limit <= i)

      if cast == true
        cast_values = @schema.cast_row(row)
        row = CSV::Row.new(@headers, cast_values)
        check_unique_fields(row, i)
      end

      if keyed == true
        yield row.to_h
      else
        yield row.fields
      end

      # Record this row's values only after it has been accepted and yielded.
      collect_unique_fields(row, i)
    end
  ensure
    rewound = @csv.rewind
  end
  rewound
end
|
40
|
+
|
41
|
+
def read(row_limit: nil, cast: true, keyed: false)
|
42
|
+
iterator = self.iter(row_limit: row_limit, cast: cast, keyed: keyed)
|
43
|
+
iterator.to_a
|
44
|
+
end
|
45
|
+
|
46
|
+
# Writes the table to +target+ as CSV.
#
# The header row (@headers) is written first, followed by every row that
# iter yields with its default arguments.
#
# @csv_options is splatted into keyword arguments, which is how CSV.open
# takes its options; passing the hash as a third positional argument raises
# ArgumentError on Ruby 3.
#
# Returns true.
def save(target)
  CSV.open(target, "wb", **@csv_options) do |csv|
    csv << @headers
    iter { |row| csv << row }
  end
  true
end
|
53
|
+
|
54
|
+
private
|
55
|
+
|
16
56
|
# Wraps the given source in a CSV reader configured with @csv_options.
#
# csv - either an Array of rows, which is serialised with array_to_csv and
#       read from an in-memory StringIO, or anything else, which is handed
#       to open.
#
# @csv_options is splatted into keyword arguments, which is how CSV.new
# takes its options; passing the hash positionally raises ArgumentError on
# Ruby 3.
#
# NOTE(review): open is Kernel#open here, which treats a leading "|" as a
# command to run; confirm the source is trusted or switch to
# File.open / URI.open as appropriate.
#
# Returns the CSV instance.
def parse_csv(csv)
  source = csv.is_a?(Array) ? StringIO.new(array_to_csv(csv)) : open(csv)
  CSV.new(source, **@csv_options)
end
|
20
60
|
|
21
|
-
def
|
22
|
-
(
|
61
|
+
def array_to_csv(array)
|
62
|
+
array.map { |row| row.to_csv(row_sep: nil) }.join("\r\n")
|
23
63
|
end
|
24
64
|
|
25
|
-
def
|
26
|
-
|
27
|
-
|
28
|
-
|
65
|
+
def infer_schema
|
66
|
+
inferer = TableSchema::Infer.new(@headers, @csv)
|
67
|
+
@csv.rewind
|
68
|
+
inferer.schema
|
29
69
|
end
|
30
70
|
|
31
|
-
|
71
|
+
def initialize_headers
|
72
|
+
headers = @csv.first.to_h.keys
|
73
|
+
@csv.rewind
|
74
|
+
headers
|
75
|
+
end
|
32
76
|
|
33
|
-
|
34
|
-
|
77
|
+
# Resets @unique_columns and gives every unique header of the schema its
# own empty array, ready to collect the values seen for that column.
#
# Leaves @unique_columns as an empty hash (and returns nil) when the schema
# declares no unique headers.
def initialize_unique_colums
  @unique_columns = {}
  unique = @schema.unique_headers
  return if unique.empty?

  unique.each { |header| @unique_columns[header] = [] }
end
|
36
83
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
end
|
41
|
-
end
|
84
|
+
def collect_unique_fields(row, row_number)
|
85
|
+
@unique_columns.each { |col_name, values| values[row_number] = row[col_name] }
|
86
|
+
end
|
42
87
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
88
|
+
# Verifies that +row+ does not repeat a value already recorded for any
# unique column in the rows before +row_number+.
#
# For each column in @unique_columns, the values recorded for earlier rows
# are cast with the column's field (cast_type) and compared with the row's
# value for that column.
#
# Only the first row_number recorded values are considered. The previous
# slice, values[0..row_number-1], became values[0..-1] (the whole array) for
# row 0, so on a second pass over the table the first row was compared
# against its own recorded value and reported as a duplicate.
#
# Raises TableSchema::ConstraintError on the first repeated value.
def check_unique_fields(row, row_number)
  @unique_columns.each do |col_name, values|
    row_value = row[col_name]
    # Look the field up once per column rather than once per earlier value.
    field = @schema.get_field(col_name)
    previous_values = values.first(row_number).map { |value| field.cast_type(value) }
    next unless previous_values.include?(row_value)

    raise TableSchema::ConstraintError.new("The values for the field `#{col_name}` should be unique but value `#{row_value}` is repeated")
  end
end
|
49
98
|
|
50
99
|
end
|
51
100
|
end
|
@@ -9,6 +9,7 @@ module TableSchema
|
|
9
9
|
def self.supported_constraints
|
10
10
|
[
|
11
11
|
'required',
|
12
|
+
'unique',
|
12
13
|
'pattern',
|
13
14
|
'enum',
|
14
15
|
'minLength',
|
@@ -22,7 +23,7 @@ module TableSchema
|
|
22
23
|
|
23
24
|
def cast_default(value)
|
24
25
|
return value if value.is_a?(type)
|
25
|
-
parsed = JSON.parse(value)
|
26
|
+
parsed = JSON.parse(value, symbolize_names: true)
|
26
27
|
if parsed.is_a?(type)
|
27
28
|
return parsed
|
28
29
|
else
|
@@ -1,20 +1,16 @@
|
|
1
|
+
require 'tableschema/defaults'
|
2
|
+
|
1
3
|
module TableSchema
|
2
4
|
module Types
|
3
5
|
class Base
|
4
6
|
include TableSchema::Helpers
|
5
7
|
|
6
|
-
|
7
8
|
def initialize(field)
|
8
9
|
@field = field
|
9
|
-
@constraints = field['constraints'] || {}
|
10
|
-
@required = ['true', true].include?(@constraints['required'])
|
11
|
-
@type = @field['type']
|
12
10
|
set_format
|
13
11
|
end
|
14
12
|
|
15
|
-
def cast(value
|
16
|
-
TableSchema::Constraints.new(@field, value).validate! unless skip_constraints
|
17
|
-
return nil if is_null?(value)
|
13
|
+
def cast(value)
|
18
14
|
send("cast_#{@format}", value)
|
19
15
|
rescue NoMethodError => e
|
20
16
|
if e.message.start_with?('undefined method `cast_')
|
@@ -25,30 +21,22 @@ module TableSchema
|
|
25
21
|
end
|
26
22
|
|
27
23
|
def test(value)
|
28
|
-
cast(value
|
24
|
+
cast(value)
|
29
25
|
true
|
30
26
|
rescue TableSchema::Exception
|
31
27
|
false
|
32
28
|
end
|
33
29
|
|
30
|
+
private
|
31
|
+
|
34
32
|
def set_format
|
35
|
-
if (@field[
|
36
|
-
@format, @format_string = *@field[
|
33
|
+
if (@field[:format] || '').start_with?('fmt:')
|
34
|
+
@format, @format_string = *@field[:format].split(':', 2)
|
37
35
|
else
|
38
|
-
@format = @field[
|
36
|
+
@format = @field[:format] || TableSchema::DEFAULTS[:format]
|
39
37
|
end
|
40
38
|
end
|
41
39
|
|
42
|
-
private
|
43
|
-
|
44
|
-
def is_null?(value)
|
45
|
-
null_values.include?(value) && @required == false
|
46
|
-
end
|
47
|
-
|
48
|
-
def null_values
|
49
|
-
['null', 'none', 'nil', 'nan', '-', '']
|
50
|
-
end
|
51
|
-
|
52
40
|
end
|
53
41
|
end
|
54
42
|
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module TableSchema
  module Types
    # Field type for duration values, parsed with ActiveSupport::Duration.
    class Duration < Base

      # Constraint names that may be declared on a duration field.
      def self.supported_constraints
        %w[required unique enum minimum maximum]
      end

      # The type name as used in a schema descriptor.
      def name
        'duration'
      end

      # The Ruby class a cast value is an instance of.
      def type
        ActiveSupport::Duration
      end

      # Parses +value+ with ActiveSupport::Duration.parse.
      #
      # Raises TableSchema::InvalidDurationType when parsing fails with an
      # ISO 8601 parsing error or a TypeError.
      def cast_default(value)
        begin
          ActiveSupport::Duration.parse(value)
        rescue ActiveSupport::Duration::ISO8601Parser::ParsingError, TypeError
          raise TableSchema::InvalidDurationType, "#{value} is not a valid duration"
        end
      end

    end
  end
end
|