tableschema 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +21 -0
  3. data/.travis.yml +15 -1
  4. data/README.md +164 -129
  5. data/Rakefile +10 -1
  6. data/bin/console +2 -6
  7. data/{etc/schemas → lib/profiles}/geojson.json +0 -1
  8. data/lib/profiles/table-schema.json +1625 -0
  9. data/lib/profiles/topojson.json +311 -0
  10. data/lib/tableschema.rb +5 -3
  11. data/lib/tableschema/constraints/constraints.rb +12 -24
  12. data/lib/tableschema/constraints/enum.rb +6 -2
  13. data/lib/tableschema/constraints/max_length.rb +6 -2
  14. data/lib/tableschema/constraints/maximum.rb +12 -2
  15. data/lib/tableschema/constraints/min_length.rb +6 -2
  16. data/lib/tableschema/constraints/minimum.rb +12 -2
  17. data/lib/tableschema/constraints/pattern.rb +9 -2
  18. data/lib/tableschema/constraints/required.rb +6 -15
  19. data/lib/tableschema/constraints/unique.rb +12 -0
  20. data/lib/tableschema/defaults.rb +9 -0
  21. data/lib/tableschema/exceptions.rb +15 -2
  22. data/lib/tableschema/field.rb +39 -20
  23. data/lib/tableschema/helpers.rb +32 -15
  24. data/lib/tableschema/infer.rb +31 -28
  25. data/lib/tableschema/model.rb +57 -34
  26. data/lib/tableschema/schema.rb +40 -6
  27. data/lib/tableschema/table.rb +75 -26
  28. data/lib/tableschema/types/any.rb +1 -0
  29. data/lib/tableschema/types/array.rb +2 -1
  30. data/lib/tableschema/types/base.rb +9 -21
  31. data/lib/tableschema/types/date.rb +1 -0
  32. data/lib/tableschema/types/datetime.rb +1 -0
  33. data/lib/tableschema/types/duration.rb +31 -0
  34. data/lib/tableschema/types/geojson.rb +27 -5
  35. data/lib/tableschema/types/geopoint.rb +4 -3
  36. data/lib/tableschema/types/integer.rb +1 -0
  37. data/lib/tableschema/types/number.rb +40 -25
  38. data/lib/tableschema/types/object.rb +2 -1
  39. data/lib/tableschema/types/string.rb +8 -0
  40. data/lib/tableschema/types/time.rb +1 -0
  41. data/lib/tableschema/types/year.rb +34 -0
  42. data/lib/tableschema/types/yearmonth.rb +52 -0
  43. data/lib/tableschema/validate.rb +45 -29
  44. data/lib/tableschema/version.rb +1 -1
  45. data/tableschema.gemspec +2 -1
  46. metadata +31 -12
  47. data/etc/schemas/json-table-schema.json +0 -102
  48. data/lib/tableschema/data.rb +0 -60
  49. data/lib/tableschema/types/null.rb +0 -37
data/lib/tableschema/model.rb
@@ -1,73 +1,96 @@
1
+ require 'tableschema/defaults'
2
+
1
3
  module TableSchema
2
4
  module Model
3
5
 
4
- DEFAULTS = {
5
- 'format' => 'default',
6
- 'type' => 'string'
7
- }
8
-
9
6
  def headers
10
- fields.map { |f| transform(f['name']) }
7
+ fields.map { |f| transform(f[:name]) }
11
8
  rescue NoMethodError
12
9
  []
13
10
  end
14
11
 
12
+ alias :field_names :headers
13
+
15
14
  def fields
16
- self['fields']
15
+ self[:fields]
17
16
  end
18
17
 
19
18
  def primary_keys
20
- [self['primaryKey']].flatten.reject { |k| k.nil? }
19
+ [self[:primaryKey]].flatten.reject { |k| k.nil? }
21
20
  end
22
21
 
23
22
  def foreign_keys
24
- self['foreignKeys'] || []
23
+ self[:foreignKeys] || []
24
+ end
25
+
26
+ def missing_values
27
+ self.fetch(:missingValues, TableSchema::DEFAULTS[:missing_values])
25
28
  end
26
29
 
27
- def get_type(key)
28
- get_field(key)['type']
30
+ def get_type(field_name)
31
+ get_field(field_name)[:type]
29
32
  end
30
33
 
31
- def get_constraints(key)
32
- get_field(key)['constraints'] || {}
34
+ def get_constraints(field_name)
35
+ get_field(field_name)[:constraints] || {}
33
36
  end
34
37
 
35
38
  def required_headers
36
- fields.select { |f| f['constraints']!= nil && f['constraints']['required'] == true }
37
- .map { |f| transform(f['name']) }
38
- rescue NoMethodError
39
- []
39
+ fields.select { |f| f.fetch(:constraints, {}).fetch(:required, nil).to_s == 'true' }
40
+ .map { |f| transform(f[:name]) }
41
+ end
42
+
43
+ def unique_headers
44
+ fields.select { |f| f.fetch(:constraints, {}).fetch(:unique, nil).to_s == 'true' }
45
+ .map { |f| transform(f[:name]) }
40
46
  end
41
47
 
42
- def has_field?(key)
43
- get_field(key) != nil
48
+ def has_field?(field_name)
49
+ get_field(field_name) != nil
44
50
  end
45
51
 
46
- def get_field(key)
47
- fields.find { |f| f['name'] == key }
52
+ def get_field(field_name)
53
+ fields.find { |f| f[:name] == field_name }
48
54
  end
49
55
 
50
56
  def get_fields_by_type(type)
51
- fields.select { |f| f['type'] == type }
57
+ fields.select { |f| f[:type] == type }
58
+ end
59
+
60
+ def add_field(descriptor)
61
+ self[:fields].push(descriptor)
62
+ validate!
63
+ descriptor
64
+ rescue TableSchema::SchemaException => e
65
+ self[:fields].pop
66
+ raise e if @strict
67
+ nil
68
+ end
69
+
70
+ def remove_field(field_name)
71
+ field = get_field(field_name)
72
+ self[:fields].reject!{ |f| f.name == field_name }
73
+ validate
74
+ field
52
75
  end
53
76
 
54
77
  private
55
78
 
56
- def transform(name)
57
- name.downcase! if @opts[:case_insensitive_headers]
58
- name
59
- end
79
+ def transform(name)
80
+ name.downcase! if @case_insensitive_headers == true
81
+ name
82
+ end
60
83
 
61
- def expand!
62
- (self['fields'] || []).each do |f|
63
- f['type'] = DEFAULTS['type'] if f['type'] == nil
64
- f['format'] = DEFAULTS['format'] if f['format'] == nil
65
- end
84
+ def expand!
85
+ (self[:fields] || []).each do |f|
86
+ f[:type] = TableSchema::DEFAULTS[:type] if f[:type] == nil
87
+ f[:format] = TableSchema::DEFAULTS[:format] if f[:format] == nil
66
88
  end
89
+ end
67
90
 
68
- def load_fields!
69
- self['fields'] = (self['fields'] || []).map { |f| TableSchema::Field.new(f) }
70
- end
91
+ def load_fields!
92
+ self[:fields] = (self[:fields] || []).map { |f| TableSchema::Field.new(f, missing_values) }
93
+ end
71
94
 
72
95
  end
73
96
  end
data/lib/tableschema/schema.rb
@@ -2,16 +2,23 @@ module TableSchema
2
2
  class Schema < Hash
3
3
  include TableSchema::Validate
4
4
  include TableSchema::Model
5
- include TableSchema::Data
6
5
  include TableSchema::Helpers
7
6
 
8
- def initialize(descriptor, opts = {})
9
- self.merge! parse_schema(descriptor)
10
- @messages = []
11
- @opts = opts
7
+ attr_reader :errors
8
+
9
+ def initialize(descriptor, case_insensitive_headers: false, strict: false)
10
+ self.merge! deep_symbolize_keys(parse_schema(descriptor))
11
+ @case_insensitive_headers = case_insensitive_headers
12
+ @strict = strict
12
13
  load_fields!
13
14
  load_validator!
14
15
  expand!
16
+ @strict == true ? validate! : validate
17
+ self
18
+ end
19
+
20
+ def descriptor
21
+ self.to_h
15
22
  end
16
23
 
17
24
  def parse_schema(descriptor)
@@ -19,7 +26,7 @@ module TableSchema
19
26
  descriptor
20
27
  elsif descriptor.class == String
21
28
  begin
22
- JSON.parse open(descriptor).read
29
+ JSON.parse(open(descriptor).read, symbolize_names: true)
23
30
  rescue Errno::ENOENT
24
31
  raise SchemaException.new("File not found at `#{descriptor}`")
25
32
  rescue OpenURI::HTTPError => e
@@ -32,5 +39,32 @@ module TableSchema
32
39
  end
33
40
  end
34
41
 
42
+ def cast_row(row, fail_fast: true)
43
+ errors = Set.new
44
+ handle_error = lambda { |e| fail_fast == true ? raise(e) : errors << e }
45
+ row = row.fields if row.class == CSV::Row
46
+ if row.count != self.fields.count
47
+ handle_error.call(TableSchema::ConversionError.new("The number of items to convert (#{row.count}) does not match the number of headers in the schema (#{self.fields.count})"))
48
+ end
49
+
50
+ self.fields.each_with_index do |field, i|
51
+ begin
52
+ row[i] = field.cast_value(row[i])
53
+ rescue TableSchema::Exception => e
54
+ handle_error.call(e)
55
+ end
56
+ end
57
+
58
+ unless errors.empty?
59
+ raise(TableSchema::MultipleInvalid.new("There were errors parsing the data", errors))
60
+ end
61
+ row
62
+ end
63
+
64
+ def save(target)
65
+ File.open(target, "w") { |file| file << JSON.pretty_generate(self.descriptor) }
66
+ true
67
+ end
68
+
35
69
  end
36
70
  end
data/lib/tableschema/table.rb
@@ -1,51 +1,100 @@
1
1
  module TableSchema
2
2
  class Table
3
3
 
4
- attr_reader :schema
4
+ attr_reader :schema, :headers
5
5
 
6
- def self.infer_schema(csv, opts = {})
7
- TableSchema::Table.new(csv, nil, opts)
6
+ def self.infer_schema(csv, csv_options: {})
7
+ TableSchema::Table.new(csv, nil, csv_options)
8
8
  end
9
9
 
10
- def initialize(csv, descriptor, opts = {})
11
- @opts = opts
10
+ def initialize(csv, descriptor, csv_options: {})
11
+ @csv_options = csv_options.merge(headers: true)
12
12
  @csv = parse_csv(csv)
13
- @schema = descriptor.nil? ? infer_schema(@csv) : TableSchema::Schema.new(descriptor)
13
+ @headers = initialize_headers
14
+ @schema = descriptor.nil? ? infer_schema : TableSchema::Schema.new(descriptor)
15
+ initialize_unique_colums
14
16
  end
15
17
 
18
+ def iter(row_limit: nil, cast: true, keyed: false)
19
+ unless block_given?
20
+ return enum_for(:iter, row_limit: row_limit, cast: cast, keyed: keyed)
21
+ end
22
+
23
+ @csv.each_with_index do |row, i|
24
+ break if row_limit && (row_limit <= i)
25
+ if cast == true
26
+ cast_values = @schema.cast_row(row)
27
+ row = CSV::Row.new(@headers, cast_values)
28
+ check_unique_fields(row, i)
29
+ end
30
+ if keyed == true
31
+ yield row.to_h
32
+ else
33
+ yield row.fields
34
+ end
35
+ collect_unique_fields(row, i)
36
+ end
37
+
38
+ @csv.rewind
39
+ end
40
+
41
+ def read(row_limit: nil, cast: true, keyed: false)
42
+ iterator = self.iter(row_limit: row_limit, cast: cast, keyed: keyed)
43
+ iterator.to_a
44
+ end
45
+
46
+ def save(target)
47
+ CSV.open(target, "wb", @csv_options) do |csv|
48
+ csv << @headers
49
+ self.iter{ |row| csv << row }
50
+ end
51
+ true
52
+ end
53
+
54
+ private
55
+
16
56
  def parse_csv(csv)
17
57
  csv = csv.is_a?(Array) ? StringIO.new(array_to_csv csv) : open(csv)
18
- CSV.new(csv, csv_options)
58
+ CSV.new(csv, @csv_options)
19
59
  end
20
60
 
21
- def csv_options
22
- (@opts[:csv_options] || {}).merge(headers: true)
61
+ def array_to_csv(array)
62
+ array.map { |row| row.to_csv(row_sep: nil) }.join("\r\n")
23
63
  end
24
64
 
25
- def rows(opts = {})
26
- fail_fast = opts[:fail_fast] || opts[:fail_fast].nil?
27
- converted = @schema.cast_rows(@csv, fail_fast, opts[:limit])
28
- opts[:keyed] ? coverted_to_hash(@csv.headers, converted) : converted
65
+ def infer_schema
66
+ inferer = TableSchema::Infer.new(@headers, @csv)
67
+ @csv.rewind
68
+ inferer.schema
29
69
  end
30
70
 
31
- private
71
+ def initialize_headers
72
+ headers = @csv.first.to_h.keys
73
+ @csv.rewind
74
+ headers
75
+ end
32
76
 
33
- def array_to_csv(array)
34
- array.map { |row| row.to_csv(row_sep: nil) }.join("\r\n")
77
+ def initialize_unique_colums
78
+ @unique_columns = {}
79
+ unless @schema.unique_headers.empty?
80
+ @schema.unique_headers.each{ |header| @unique_columns[header] = [] }
35
81
  end
82
+ end
36
83
 
37
- def coverted_to_hash(headers, array)
38
- array.map do |row|
39
- Hash[row.map.with_index { |col, i| [headers[i], col] }]
40
- end
41
- end
84
+ def collect_unique_fields(row, row_number)
85
+ @unique_columns.each { |col_name, values| values[row_number] = row[col_name] }
86
+ end
42
87
 
43
- def infer_schema(csv)
44
- headers = csv.first.to_h.keys
45
- csv.rewind
46
- inferer = TableSchema::Infer.new(headers, csv)
47
- inferer.schema
88
+ def check_unique_fields(row, row_number)
89
+ @unique_columns.each do |col_name, values|
90
+ row_value = row[col_name]
91
+ previous_values = values[0..row_number-1]
92
+ previous_values.map!{|value| @schema.get_field(col_name).cast_type(value)}
93
+ if previous_values.include?(row_value)
94
+ raise TableSchema::ConstraintError.new("The values for the field `#{col_name}` should be unique but value `#{row_value}` is repeated")
95
+ end
48
96
  end
97
+ end
49
98
 
50
99
  end
51
100
  end
data/lib/tableschema/types/any.rb
@@ -9,6 +9,7 @@ module TableSchema
9
9
  def self.supported_constraints
10
10
  [
11
11
  'required',
12
+ 'unique',
12
13
  'pattern',
13
14
  'enum'
14
15
  ]
data/lib/tableschema/types/array.rb
@@ -9,6 +9,7 @@ module TableSchema
9
9
  def self.supported_constraints
10
10
  [
11
11
  'required',
12
+ 'unique',
12
13
  'pattern',
13
14
  'enum',
14
15
  'minLength',
@@ -22,7 +23,7 @@ module TableSchema
22
23
 
23
24
  def cast_default(value)
24
25
  return value if value.is_a?(type)
25
- parsed = JSON.parse(value)
26
+ parsed = JSON.parse(value, symbolize_names: true)
26
27
  if parsed.is_a?(type)
27
28
  return parsed
28
29
  else
data/lib/tableschema/types/base.rb
@@ -1,20 +1,16 @@
1
+ require 'tableschema/defaults'
2
+
1
3
  module TableSchema
2
4
  module Types
3
5
  class Base
4
6
  include TableSchema::Helpers
5
7
 
6
-
7
8
  def initialize(field)
8
9
  @field = field
9
- @constraints = field['constraints'] || {}
10
- @required = ['true', true].include?(@constraints['required'])
11
- @type = @field['type']
12
10
  set_format
13
11
  end
14
12
 
15
- def cast(value, skip_constraints = false)
16
- TableSchema::Constraints.new(@field, value).validate! unless skip_constraints
17
- return nil if is_null?(value)
13
+ def cast(value)
18
14
  send("cast_#{@format}", value)
19
15
  rescue NoMethodError => e
20
16
  if e.message.start_with?('undefined method `cast_')
@@ -25,30 +21,22 @@ module TableSchema
25
21
  end
26
22
 
27
23
  def test(value)
28
- cast(value, true)
24
+ cast(value)
29
25
  true
30
26
  rescue TableSchema::Exception
31
27
  false
32
28
  end
33
29
 
30
+ private
31
+
34
32
  def set_format
35
- if (@field['format'] || '').start_with?('fmt:')
36
- @format, @format_string = *@field['format'].split(':', 2)
33
+ if (@field[:format] || '').start_with?('fmt:')
34
+ @format, @format_string = *@field[:format].split(':', 2)
37
35
  else
38
- @format = @field['format'] || 'default'
36
+ @format = @field[:format] || TableSchema::DEFAULTS[:format]
39
37
  end
40
38
  end
41
39
 
42
- private
43
-
44
- def is_null?(value)
45
- null_values.include?(value) && @required == false
46
- end
47
-
48
- def null_values
49
- ['null', 'none', 'nil', 'nan', '-', '']
50
- end
51
-
52
40
  end
53
41
  end
54
42
  end
data/lib/tableschema/types/date.rb
@@ -9,6 +9,7 @@ module TableSchema
9
9
  def self.supported_constraints
10
10
  [
11
11
  'required',
12
+ 'unique',
12
13
  'pattern',
13
14
  'enum',
14
15
  'minimum',
data/lib/tableschema/types/datetime.rb
@@ -9,6 +9,7 @@ module TableSchema
9
9
  def self.supported_constraints
10
10
  [
11
11
  'required',
12
+ 'unique',
12
13
  'pattern',
13
14
  'enum',
14
15
  'minimum',
data/lib/tableschema/types/duration.rb
@@ -0,0 +1,31 @@
1
+ module TableSchema
2
+ module Types
3
+ class Duration < Base
4
+
5
+ def name
6
+ 'duration'
7
+ end
8
+
9
+ def self.supported_constraints
10
+ [
11
+ 'required',
12
+ 'unique',
13
+ 'enum',
14
+ 'minimum',
15
+ 'maximum',
16
+ ]
17
+ end
18
+
19
+ def type
20
+ ActiveSupport::Duration
21
+ end
22
+
23
+ def cast_default(value)
24
+ ActiveSupport::Duration.parse(value)
25
+ rescue ActiveSupport::Duration::ISO8601Parser::ParsingError, TypeError
26
+ raise TableSchema::InvalidDurationType.new("#{value} is not a valid duration")
27
+ end
28
+
29
+ end
30
+ end
31
+ end