tableschema 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +21 -0
  3. data/.travis.yml +15 -1
  4. data/README.md +164 -129
  5. data/Rakefile +10 -1
  6. data/bin/console +2 -6
  7. data/{etc/schemas → lib/profiles}/geojson.json +0 -1
  8. data/lib/profiles/table-schema.json +1625 -0
  9. data/lib/profiles/topojson.json +311 -0
  10. data/lib/tableschema.rb +5 -3
  11. data/lib/tableschema/constraints/constraints.rb +12 -24
  12. data/lib/tableschema/constraints/enum.rb +6 -2
  13. data/lib/tableschema/constraints/max_length.rb +6 -2
  14. data/lib/tableschema/constraints/maximum.rb +12 -2
  15. data/lib/tableschema/constraints/min_length.rb +6 -2
  16. data/lib/tableschema/constraints/minimum.rb +12 -2
  17. data/lib/tableschema/constraints/pattern.rb +9 -2
  18. data/lib/tableschema/constraints/required.rb +6 -15
  19. data/lib/tableschema/constraints/unique.rb +12 -0
  20. data/lib/tableschema/defaults.rb +9 -0
  21. data/lib/tableschema/exceptions.rb +15 -2
  22. data/lib/tableschema/field.rb +39 -20
  23. data/lib/tableschema/helpers.rb +32 -15
  24. data/lib/tableschema/infer.rb +31 -28
  25. data/lib/tableschema/model.rb +57 -34
  26. data/lib/tableschema/schema.rb +40 -6
  27. data/lib/tableschema/table.rb +75 -26
  28. data/lib/tableschema/types/any.rb +1 -0
  29. data/lib/tableschema/types/array.rb +2 -1
  30. data/lib/tableschema/types/base.rb +9 -21
  31. data/lib/tableschema/types/date.rb +1 -0
  32. data/lib/tableschema/types/datetime.rb +1 -0
  33. data/lib/tableschema/types/duration.rb +31 -0
  34. data/lib/tableschema/types/geojson.rb +27 -5
  35. data/lib/tableschema/types/geopoint.rb +4 -3
  36. data/lib/tableschema/types/integer.rb +1 -0
  37. data/lib/tableschema/types/number.rb +40 -25
  38. data/lib/tableschema/types/object.rb +2 -1
  39. data/lib/tableschema/types/string.rb +8 -0
  40. data/lib/tableschema/types/time.rb +1 -0
  41. data/lib/tableschema/types/year.rb +34 -0
  42. data/lib/tableschema/types/yearmonth.rb +52 -0
  43. data/lib/tableschema/validate.rb +45 -29
  44. data/lib/tableschema/version.rb +1 -1
  45. data/tableschema.gemspec +2 -1
  46. metadata +31 -12
  47. data/etc/schemas/json-table-schema.json +0 -102
  48. data/lib/tableschema/data.rb +0 -60
  49. data/lib/tableschema/types/null.rb +0 -37
@@ -1,73 +1,96 @@
1
+ require 'tableschema/defaults'
2
+
1
3
  module TableSchema
2
4
  module Model
3
5
 
4
- DEFAULTS = {
5
- 'format' => 'default',
6
- 'type' => 'string'
7
- }
8
-
9
6
  def headers
10
- fields.map { |f| transform(f['name']) }
7
+ fields.map { |f| transform(f[:name]) }
11
8
  rescue NoMethodError
12
9
  []
13
10
  end
14
11
 
12
+ alias :field_names :headers
13
+
15
14
  def fields
16
- self['fields']
15
+ self[:fields]
17
16
  end
18
17
 
19
18
  def primary_keys
20
- [self['primaryKey']].flatten.reject { |k| k.nil? }
19
+ [self[:primaryKey]].flatten.reject { |k| k.nil? }
21
20
  end
22
21
 
23
22
  def foreign_keys
24
- self['foreignKeys'] || []
23
+ self[:foreignKeys] || []
24
+ end
25
+
26
+ def missing_values
27
+ self.fetch(:missingValues, TableSchema::DEFAULTS[:missing_values])
25
28
  end
26
29
 
27
- def get_type(key)
28
- get_field(key)['type']
30
+ def get_type(field_name)
31
+ get_field(field_name)[:type]
29
32
  end
30
33
 
31
- def get_constraints(key)
32
- get_field(key)['constraints'] || {}
34
+ def get_constraints(field_name)
35
+ get_field(field_name)[:constraints] || {}
33
36
  end
34
37
 
35
38
  def required_headers
36
- fields.select { |f| f['constraints']!= nil && f['constraints']['required'] == true }
37
- .map { |f| transform(f['name']) }
38
- rescue NoMethodError
39
- []
39
+ fields.select { |f| f.fetch(:constraints, {}).fetch(:required, nil).to_s == 'true' }
40
+ .map { |f| transform(f[:name]) }
41
+ end
42
+
43
+ def unique_headers
44
+ fields.select { |f| f.fetch(:constraints, {}).fetch(:unique, nil).to_s == 'true' }
45
+ .map { |f| transform(f[:name]) }
40
46
  end
41
47
 
42
- def has_field?(key)
43
- get_field(key) != nil
48
+ def has_field?(field_name)
49
+ get_field(field_name) != nil
44
50
  end
45
51
 
46
- def get_field(key)
47
- fields.find { |f| f['name'] == key }
52
+ def get_field(field_name)
53
+ fields.find { |f| f[:name] == field_name }
48
54
  end
49
55
 
50
56
  def get_fields_by_type(type)
51
- fields.select { |f| f['type'] == type }
57
+ fields.select { |f| f[:type] == type }
58
+ end
59
+
60
+ def add_field(descriptor)
61
+ self[:fields].push(descriptor)
62
+ validate!
63
+ descriptor
64
+ rescue TableSchema::SchemaException => e
65
+ self[:fields].pop
66
+ raise e if @strict
67
+ nil
68
+ end
69
+
70
+ def remove_field(field_name)
71
+ field = get_field(field_name)
72
+ self[:fields].reject!{ |f| f.name == field_name }
73
+ validate
74
+ field
52
75
  end
53
76
 
54
77
  private
55
78
 
56
- def transform(name)
57
- name.downcase! if @opts[:case_insensitive_headers]
58
- name
59
- end
79
+ def transform(name)
80
+ name.downcase! if @case_insensitive_headers == true
81
+ name
82
+ end
60
83
 
61
- def expand!
62
- (self['fields'] || []).each do |f|
63
- f['type'] = DEFAULTS['type'] if f['type'] == nil
64
- f['format'] = DEFAULTS['format'] if f['format'] == nil
65
- end
84
+ def expand!
85
+ (self[:fields] || []).each do |f|
86
+ f[:type] = TableSchema::DEFAULTS[:type] if f[:type] == nil
87
+ f[:format] = TableSchema::DEFAULTS[:format] if f[:format] == nil
66
88
  end
89
+ end
67
90
 
68
- def load_fields!
69
- self['fields'] = (self['fields'] || []).map { |f| TableSchema::Field.new(f) }
70
- end
91
+ def load_fields!
92
+ self[:fields] = (self[:fields] || []).map { |f| TableSchema::Field.new(f, missing_values) }
93
+ end
71
94
 
72
95
  end
73
96
  end
@@ -2,16 +2,23 @@ module TableSchema
2
2
  class Schema < Hash
3
3
  include TableSchema::Validate
4
4
  include TableSchema::Model
5
- include TableSchema::Data
6
5
  include TableSchema::Helpers
7
6
 
8
- def initialize(descriptor, opts = {})
9
- self.merge! parse_schema(descriptor)
10
- @messages = []
11
- @opts = opts
7
+ attr_reader :errors
8
+
9
+ def initialize(descriptor, case_insensitive_headers: false, strict: false)
10
+ self.merge! deep_symbolize_keys(parse_schema(descriptor))
11
+ @case_insensitive_headers = case_insensitive_headers
12
+ @strict = strict
12
13
  load_fields!
13
14
  load_validator!
14
15
  expand!
16
+ @strict == true ? validate! : validate
17
+ self
18
+ end
19
+
20
+ def descriptor
21
+ self.to_h
15
22
  end
16
23
 
17
24
  def parse_schema(descriptor)
@@ -19,7 +26,7 @@ module TableSchema
19
26
  descriptor
20
27
  elsif descriptor.class == String
21
28
  begin
22
- JSON.parse open(descriptor).read
29
+ JSON.parse(open(descriptor).read, symbolize_names: true)
23
30
  rescue Errno::ENOENT
24
31
  raise SchemaException.new("File not found at `#{descriptor}`")
25
32
  rescue OpenURI::HTTPError => e
@@ -32,5 +39,32 @@ module TableSchema
32
39
  end
33
40
  end
34
41
 
42
+ def cast_row(row, fail_fast: true)
43
+ errors = Set.new
44
+ handle_error = lambda { |e| fail_fast == true ? raise(e) : errors << e }
45
+ row = row.fields if row.class == CSV::Row
46
+ if row.count != self.fields.count
47
+ handle_error.call(TableSchema::ConversionError.new("The number of items to convert (#{row.count}) does not match the number of headers in the schema (#{self.fields.count})"))
48
+ end
49
+
50
+ self.fields.each_with_index do |field, i|
51
+ begin
52
+ row[i] = field.cast_value(row[i])
53
+ rescue TableSchema::Exception => e
54
+ handle_error.call(e)
55
+ end
56
+ end
57
+
58
+ unless errors.empty?
59
+ raise(TableSchema::MultipleInvalid.new("There were errors parsing the data", errors))
60
+ end
61
+ row
62
+ end
63
+
64
+ def save(target)
65
+ File.open(target, "w") { |file| file << JSON.pretty_generate(self.descriptor) }
66
+ true
67
+ end
68
+
35
69
  end
36
70
  end
@@ -1,51 +1,100 @@
1
1
  module TableSchema
2
2
  class Table
3
3
 
4
- attr_reader :schema
4
+ attr_reader :schema, :headers
5
5
 
6
- def self.infer_schema(csv, opts = {})
7
- TableSchema::Table.new(csv, nil, opts)
6
+ def self.infer_schema(csv, csv_options: {})
7
+ TableSchema::Table.new(csv, nil, csv_options)
8
8
  end
9
9
 
10
- def initialize(csv, descriptor, opts = {})
11
- @opts = opts
10
+ def initialize(csv, descriptor, csv_options: {})
11
+ @csv_options = csv_options.merge(headers: true)
12
12
  @csv = parse_csv(csv)
13
- @schema = descriptor.nil? ? infer_schema(@csv) : TableSchema::Schema.new(descriptor)
13
+ @headers = initialize_headers
14
+ @schema = descriptor.nil? ? infer_schema : TableSchema::Schema.new(descriptor)
15
+ initialize_unique_colums
14
16
  end
15
17
 
18
+ def iter(row_limit: nil, cast: true, keyed: false)
19
+ unless block_given?
20
+ return enum_for(:iter, row_limit: row_limit, cast: cast, keyed: keyed)
21
+ end
22
+
23
+ @csv.each_with_index do |row, i|
24
+ break if row_limit && (row_limit <= i)
25
+ if cast == true
26
+ cast_values = @schema.cast_row(row)
27
+ row = CSV::Row.new(@headers, cast_values)
28
+ check_unique_fields(row, i)
29
+ end
30
+ if keyed == true
31
+ yield row.to_h
32
+ else
33
+ yield row.fields
34
+ end
35
+ collect_unique_fields(row, i)
36
+ end
37
+
38
+ @csv.rewind
39
+ end
40
+
41
+ def read(row_limit: nil, cast: true, keyed: false)
42
+ iterator = self.iter(row_limit: row_limit, cast: cast, keyed: keyed)
43
+ iterator.to_a
44
+ end
45
+
46
+ def save(target)
47
+ CSV.open(target, "wb", @csv_options) do |csv|
48
+ csv << @headers
49
+ self.iter{ |row| csv << row }
50
+ end
51
+ true
52
+ end
53
+
54
+ private
55
+
16
56
  def parse_csv(csv)
17
57
  csv = csv.is_a?(Array) ? StringIO.new(array_to_csv csv) : open(csv)
18
- CSV.new(csv, csv_options)
58
+ CSV.new(csv, @csv_options)
19
59
  end
20
60
 
21
- def csv_options
22
- (@opts[:csv_options] || {}).merge(headers: true)
61
+ def array_to_csv(array)
62
+ array.map { |row| row.to_csv(row_sep: nil) }.join("\r\n")
23
63
  end
24
64
 
25
- def rows(opts = {})
26
- fail_fast = opts[:fail_fast] || opts[:fail_fast].nil?
27
- converted = @schema.cast_rows(@csv, fail_fast, opts[:limit])
28
- opts[:keyed] ? coverted_to_hash(@csv.headers, converted) : converted
65
+ def infer_schema
66
+ inferer = TableSchema::Infer.new(@headers, @csv)
67
+ @csv.rewind
68
+ inferer.schema
29
69
  end
30
70
 
31
- private
71
+ def initialize_headers
72
+ headers = @csv.first.to_h.keys
73
+ @csv.rewind
74
+ headers
75
+ end
32
76
 
33
- def array_to_csv(array)
34
- array.map { |row| row.to_csv(row_sep: nil) }.join("\r\n")
77
+ def initialize_unique_colums
78
+ @unique_columns = {}
79
+ unless @schema.unique_headers.empty?
80
+ @schema.unique_headers.each{ |header| @unique_columns[header] = [] }
35
81
  end
82
+ end
36
83
 
37
- def coverted_to_hash(headers, array)
38
- array.map do |row|
39
- Hash[row.map.with_index { |col, i| [headers[i], col] }]
40
- end
41
- end
84
+ def collect_unique_fields(row, row_number)
85
+ @unique_columns.each { |col_name, values| values[row_number] = row[col_name] }
86
+ end
42
87
 
43
- def infer_schema(csv)
44
- headers = csv.first.to_h.keys
45
- csv.rewind
46
- inferer = TableSchema::Infer.new(headers, csv)
47
- inferer.schema
88
+ def check_unique_fields(row, row_number)
89
+ @unique_columns.each do |col_name, values|
90
+ row_value = row[col_name]
91
+ previous_values = values[0..row_number-1]
92
+ previous_values.map!{|value| @schema.get_field(col_name).cast_type(value)}
93
+ if previous_values.include?(row_value)
94
+ raise TableSchema::ConstraintError.new("The values for the field `#{col_name}` should be unique but value `#{row_value}` is repeated")
95
+ end
48
96
  end
97
+ end
49
98
 
50
99
  end
51
100
  end
@@ -9,6 +9,7 @@ module TableSchema
9
9
  def self.supported_constraints
10
10
  [
11
11
  'required',
12
+ 'unique',
12
13
  'pattern',
13
14
  'enum'
14
15
  ]
@@ -9,6 +9,7 @@ module TableSchema
9
9
  def self.supported_constraints
10
10
  [
11
11
  'required',
12
+ 'unique',
12
13
  'pattern',
13
14
  'enum',
14
15
  'minLength',
@@ -22,7 +23,7 @@ module TableSchema
22
23
 
23
24
  def cast_default(value)
24
25
  return value if value.is_a?(type)
25
- parsed = JSON.parse(value)
26
+ parsed = JSON.parse(value, symbolize_names: true)
26
27
  if parsed.is_a?(type)
27
28
  return parsed
28
29
  else
@@ -1,20 +1,16 @@
1
+ require 'tableschema/defaults'
2
+
1
3
  module TableSchema
2
4
  module Types
3
5
  class Base
4
6
  include TableSchema::Helpers
5
7
 
6
-
7
8
  def initialize(field)
8
9
  @field = field
9
- @constraints = field['constraints'] || {}
10
- @required = ['true', true].include?(@constraints['required'])
11
- @type = @field['type']
12
10
  set_format
13
11
  end
14
12
 
15
- def cast(value, skip_constraints = false)
16
- TableSchema::Constraints.new(@field, value).validate! unless skip_constraints
17
- return nil if is_null?(value)
13
+ def cast(value)
18
14
  send("cast_#{@format}", value)
19
15
  rescue NoMethodError => e
20
16
  if e.message.start_with?('undefined method `cast_')
@@ -25,30 +21,22 @@ module TableSchema
25
21
  end
26
22
 
27
23
  def test(value)
28
- cast(value, true)
24
+ cast(value)
29
25
  true
30
26
  rescue TableSchema::Exception
31
27
  false
32
28
  end
33
29
 
30
+ private
31
+
34
32
  def set_format
35
- if (@field['format'] || '').start_with?('fmt:')
36
- @format, @format_string = *@field['format'].split(':', 2)
33
+ if (@field[:format] || '').start_with?('fmt:')
34
+ @format, @format_string = *@field[:format].split(':', 2)
37
35
  else
38
- @format = @field['format'] || 'default'
36
+ @format = @field[:format] || TableSchema::DEFAULTS[:format]
39
37
  end
40
38
  end
41
39
 
42
- private
43
-
44
- def is_null?(value)
45
- null_values.include?(value) && @required == false
46
- end
47
-
48
- def null_values
49
- ['null', 'none', 'nil', 'nan', '-', '']
50
- end
51
-
52
40
  end
53
41
  end
54
42
  end
@@ -9,6 +9,7 @@ module TableSchema
9
9
  def self.supported_constraints
10
10
  [
11
11
  'required',
12
+ 'unique',
12
13
  'pattern',
13
14
  'enum',
14
15
  'minimum',
@@ -9,6 +9,7 @@ module TableSchema
9
9
  def self.supported_constraints
10
10
  [
11
11
  'required',
12
+ 'unique',
12
13
  'pattern',
13
14
  'enum',
14
15
  'minimum',
@@ -0,0 +1,31 @@
1
+ module TableSchema
2
+ module Types
3
+ class Duration < Base
4
+
5
+ def name
6
+ 'duration'
7
+ end
8
+
9
+ def self.supported_constraints
10
+ [
11
+ 'required',
12
+ 'unique',
13
+ 'enum',
14
+ 'minimum',
15
+ 'maximum',
16
+ ]
17
+ end
18
+
19
+ def type
20
+ ActiveSupport::Duration
21
+ end
22
+
23
+ def cast_default(value)
24
+ ActiveSupport::Duration.parse(value)
25
+ rescue ActiveSupport::Duration::ISO8601Parser::ParsingError, TypeError
26
+ raise TableSchema::InvalidDurationType.new("#{value} is not a valid duration")
27
+ end
28
+
29
+ end
30
+ end
31
+ end