tableschema 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +14 -0
  5. data/CHANGELOG.md +31 -0
  6. data/CODE_OF_CONDUCT.md +49 -0
  7. data/Gemfile +4 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +274 -0
  10. data/Rakefile +6 -0
  11. data/bin/console +14 -0
  12. data/bin/setup +8 -0
  13. data/etc/schemas/geojson.json +209 -0
  14. data/etc/schemas/json-table-schema.json +102 -0
  15. data/lib/tableschema.rb +42 -0
  16. data/lib/tableschema/constraints/constraints.rb +76 -0
  17. data/lib/tableschema/constraints/enum.rb +14 -0
  18. data/lib/tableschema/constraints/max_length.rb +15 -0
  19. data/lib/tableschema/constraints/maximum.rb +14 -0
  20. data/lib/tableschema/constraints/min_length.rb +15 -0
  21. data/lib/tableschema/constraints/minimum.rb +14 -0
  22. data/lib/tableschema/constraints/pattern.rb +14 -0
  23. data/lib/tableschema/constraints/required.rb +32 -0
  24. data/lib/tableschema/data.rb +60 -0
  25. data/lib/tableschema/exceptions.rb +28 -0
  26. data/lib/tableschema/field.rb +41 -0
  27. data/lib/tableschema/helpers.rb +48 -0
  28. data/lib/tableschema/infer.rb +143 -0
  29. data/lib/tableschema/model.rb +73 -0
  30. data/lib/tableschema/schema.rb +36 -0
  31. data/lib/tableschema/table.rb +51 -0
  32. data/lib/tableschema/types/any.rb +23 -0
  33. data/lib/tableschema/types/array.rb +37 -0
  34. data/lib/tableschema/types/base.rb +54 -0
  35. data/lib/tableschema/types/boolean.rb +35 -0
  36. data/lib/tableschema/types/date.rb +56 -0
  37. data/lib/tableschema/types/datetime.rb +63 -0
  38. data/lib/tableschema/types/geojson.rb +38 -0
  39. data/lib/tableschema/types/geopoint.rb +56 -0
  40. data/lib/tableschema/types/integer.rb +35 -0
  41. data/lib/tableschema/types/null.rb +37 -0
  42. data/lib/tableschema/types/number.rb +60 -0
  43. data/lib/tableschema/types/object.rb +37 -0
  44. data/lib/tableschema/types/string.rb +64 -0
  45. data/lib/tableschema/types/time.rb +55 -0
  46. data/lib/tableschema/validate.rb +54 -0
  47. data/lib/tableschema/version.rb +3 -0
  48. data/tableschema.gemspec +32 -0
  49. metadata +231 -0
@@ -0,0 +1,15 @@
module TableSchema
  class Constraints
    # Mixin providing the `maxLength` constraint check.
    #
    # It reads `@value`, `@constraints` and `@field` from the object it is
    # mixed into; none of them are set here.
    module MaxLength

      # Verifies that `@value` is not longer than the `maxLength` constraint.
      #
      # A nil value has no length to measure and is treated as passing. It
      # returns `true` (not `nil`) so that every successful path of this check
      # yields the same truthy result.
      #
      # @return [true] when the value is nil or within the limit
      # @raise [TableSchema::ConstraintError] when the value is too long
      def check_max_length
        return true if @value.nil?

        limit = @constraints['maxLength'].to_i
        if @value.length > limit
          raise TableSchema::ConstraintError, "The field `#{@field['name']}` must have a maximum length of #{@constraints['maxLength']}"
        end

        true
      end

    end
  end
end
@@ -0,0 +1,14 @@
module TableSchema
  class Constraints
    # Mixin providing the `maximum` constraint check.
    #
    # It reads `@value`, `@constraints` and `@field` from the including object
    # and calls `parse_constraint`, which is not defined in this module and
    # must be supplied by the including object.
    module Maximum

      # Verifies that `@value` does not exceed the `maximum` constraint.
      #
      # A nil value cannot be compared and is treated as passing, mirroring
      # the nil guard used by the length checks.
      #
      # @return [true] when the value is nil or not above the maximum
      # @raise [TableSchema::ConstraintError] when the value is too large
      def check_maximum
        return true if @value.nil?

        if @value > parse_constraint(@constraints['maximum'])
          raise TableSchema::ConstraintError, "The field `#{@field['name']}` must not be more than #{@constraints['maximum']}"
        end

        true
      end

    end
  end
end
@@ -0,0 +1,15 @@
module TableSchema
  class Constraints
    # Mixin providing the `minLength` constraint check.
    #
    # It reads `@value`, `@constraints` and `@field` from the object it is
    # mixed into; none of them are set here.
    module MinLength

      # Verifies that `@value` is at least as long as the `minLength`
      # constraint.
      #
      # A nil value has no length to measure and is treated as passing. It
      # returns `true` (not `nil`) so that every successful path of this check
      # yields the same truthy result.
      #
      # @return [true] when the value is nil or long enough
      # @raise [TableSchema::ConstraintError] when the value is too short
      def check_min_length
        return true if @value.nil?

        limit = @constraints['minLength'].to_i
        if @value.length < limit
          raise TableSchema::ConstraintError, "The field `#{@field['name']}` must have a minimum length of #{@constraints['minLength']}"
        end

        true
      end

    end
  end
end
@@ -0,0 +1,14 @@
module TableSchema
  class Constraints
    # Mixin providing the `minimum` constraint check.
    #
    # It reads `@value`, `@constraints` and `@field` from the including object
    # and calls `parse_constraint`, which is not defined in this module and
    # must be supplied by the including object.
    module Minimum

      # Verifies that `@value` is not below the `minimum` constraint.
      #
      # A nil value cannot be compared and is treated as passing, mirroring
      # the nil guard used by the length checks.
      #
      # @return [true] when the value is nil or not below the minimum
      # @raise [TableSchema::ConstraintError] when the value is too small
      def check_minimum
        return true if @value.nil?

        if @value < parse_constraint(@constraints['minimum'])
          raise TableSchema::ConstraintError, "The field `#{@field['name']}` must not be less than #{@constraints['minimum']}"
        end

        true
      end

    end
  end
end
@@ -0,0 +1,14 @@
module TableSchema
  class Constraints
    # Mixin providing the `pattern` constraint check.
    #
    # It reads `@value`, `@constraints` and `@field` from the object it is
    # mixed into.
    module Pattern

      # Verifies that the JSON serialisation of `@value` matches the regular
      # expression given by the `pattern` constraint.
      #
      # NOTE(review): the subject of the match is `@value.to_json`, so a
      # String value is matched together with its surrounding double quotes.
      # A pattern anchored with ^ / $ therefore will not match a bare string.
      # Confirm this is intended before changing the subject to the raw value.
      #
      # @return [true] when the pattern matches
      # @raise [TableSchema::ConstraintError] when it does not match
      def check_pattern
        # Parenthesised call: `match /.../` without parentheses is parsed
        # ambiguously and triggers a Ruby warning.
        unless @value.to_json.match(/#{@constraints['pattern']}/)
          raise TableSchema::ConstraintError, "The value for the field `#{@field['name']}` must match the pattern"
        end

        true
      end

    end
  end
end
@@ -0,0 +1,32 @@
module TableSchema
  class Constraints
    # Mixin providing the `required` constraint check.
    #
    # It reads `@value`, `@constraints` and `@field` from the object it is
    # mixed into.
    module Required

      # Verifies that a required field actually carries a value.
      #
      # @return [true] when the field is not required or has a value
      # @raise [TableSchema::ConstraintError] when a required field is empty
      def check_required
        if required? && is_empty?
          raise TableSchema::ConstraintError, "The field `#{@field['name']}` requires a value"
        end

        true
      end

      private

      # The constraint only applies when `required` is set and the field is
      # not of type `null`.
      def required?
        required && @field['type'] != 'null'
      end

      # A value is empty when it is nil or one of the textual null markers.
      # The explicit nil test matters: the marker list only holds strings, so
      # a real nil would otherwise slip through as "present".
      def is_empty?
        @value.nil? || null_values.include?(@value)
      end

      # True when the `required` constraint is `true` or the string 'true'.
      def required
        @constraints['required'].to_s == 'true'
      end

      # Strings treated as "no value". The comparison is exact, so it is
      # case-sensitive.
      def null_values
        ['null', 'none', 'nil', 'nan', '-', '']
      end

    end
  end
end
@@ -0,0 +1,60 @@
module TableSchema
  # Mixin that casts raw rows against the including object's `fields`.
  #
  # The including object must respond to `fields`, returning one object per
  # column that responds to `cast_value`.
  #
  # NOTE(review): `@errors` is only ever initialised with `||=` and appended
  # to in this module, so errors collected by one non-fail-fast call are still
  # present on the next call. Confirm whether callers rely on that.
  module Data

    attr_reader :errors

    # Casts every row in `rows`.
    #
    # @param rows [Enumerable] rows as arrays (CSV::Row instances are
    #   converted with `fields`)
    # @param fail_fast [Boolean] when exactly `true`, the first error is
    #   re-raised; otherwise errors are collected in `errors`
    # @param limit [Integer, nil] maximum number of rows to cast
    # @return [Array<Array>] the cast rows
    # @raise [TableSchema::MultipleInvalid] when any error has been collected
    def cast_rows(rows, fail_fast = true, limit = nil)
      @errors ||= []
      parsed_rows = []
      rows.each_with_index do |r, i|
        begin
          break if limit && (limit <= i)
          r = r.fields if r.instance_of?(CSV::Row)
          parsed_rows << cast_row(r, fail_fast)
        rescue MultipleInvalid, ConversionError => e
          # Only the literal `true` selects fail-fast mode.
          raise e if fail_fast == true
          # MultipleInvalid merely signals that errors were already recorded;
          # only conversion errors are new information.
          @errors << e if e.is_a?(ConversionError)
        end
      end
      check_for_errors
      parsed_rows
    end

    alias_method :convert, :cast_rows

    # Casts a single row in place, column by column.
    #
    # @param row [Array] raw values; must have as many items as `fields`
    # @param fail_fast [Boolean] see #cast_rows
    # @return [Array] the same array with its items replaced by cast values
    # @raise [TableSchema::ConversionError] on a column-count mismatch
    # @raise [TableSchema::MultipleInvalid] when any error has been collected
    def cast_row(row, fail_fast = true)
      @errors ||= []
      raise_header_error(row) if row.count != fields.count
      fields.each_with_index do |field, i|
        row[i] = cast_column(field, row[i], fail_fast)
      end
      check_for_errors
      row
    end

    alias_method :convert_row, :cast_row

    private

    def raise_header_error(row)
      raise TableSchema::ConversionError, "The number of items to convert (#{row.count}) does not match the number of headers in the schema (#{fields.count})"
    end

    def check_for_errors
      raise TableSchema::MultipleInvalid, "There were errors parsing the data" if @errors.count > 0
    end

    # Casts one value with its field, either re-raising or recording failure.
    #
    # The rescue names TableSchema::Exception explicitly. A bare `Exception`
    # inside this namespace resolves to the same library class, but reads
    # like a catch-all for ::Exception.
    def cast_column(field, col, fail_fast)
      field.cast_value(col)
    rescue TableSchema::Exception => e
      if fail_fast == true
        raise e
      else
        @errors << e
      end
    end

    alias_method :convert_column, :cast_column

  end
end
@@ -0,0 +1,28 @@
module TableSchema
  # Base class for every error raised by this library.
  #
  # It derives from StandardError (not ::Exception) so that a plain `rescue`
  # catches library errors, and so that rescuing this class never sits in the
  # same branch of the hierarchy as signals or SystemExit.
  class Exception < ::StandardError ; end

  # Error that carries the message it was constructed with.
  class SchemaException < Exception
    attr_reader :message

    # @param message [Object] the error description
    def initialize(message)
      # Pass the message to the superclass so `to_s`/`inspect` report it too.
      super(message)
      @message = message
    end
  end

  class InvalidFormat < Exception ; end
  class InvalidCast < Exception ; end
  class InvalidEmail < Exception ; end
  class InvalidURI < Exception ; end
  class InvalidUUID < Exception ; end
  class InvalidObjectType < Exception ; end
  class InvalidArrayType < Exception ; end
  class InvalidDateType < Exception ; end
  class InvalidTimeType < Exception ; end
  class InvalidDateTimeType < Exception ; end
  class InvalidGeoJSONType < Exception ; end
  class InvalidGeoPointType < Exception ; end
  class ConstraintError < Exception ; end
  class ConstraintNotSupported < Exception ; end
  class ConversionError < Exception ; end
  class MultipleInvalid < Exception ; end
end
@@ -0,0 +1,41 @@
module TableSchema
  # A field descriptor stored as a Hash, with readers for the common keys and
  # a cast entry point that delegates to the matching type class.
  class Field < Hash
    include TableSchema::Helpers

    # The type class resolved from the descriptor when the field was built.
    attr_reader :type_class

    # @param descriptor [Hash] field descriptor; its pairs are copied into
    #   this Hash
    def initialize(descriptor)
      merge!(descriptor)
      @type_class = get_type
    end

    # @return [Object, nil] the value stored under 'name'
    def name
      self['name']
    end

    # @return [Object] the value stored under 'type', or 'string' when unset
    def type
      declared = self['type']
      declared || 'string'
    end

    # @return [Object] the value stored under 'format', or 'default' when unset
    def format
      declared = self['format']
      declared || 'default'
    end

    # @return [Object] the value stored under 'constraints', or an empty Hash
    def constraints
      declared = self['constraints']
      declared || {}
    end

    # Casts a raw value with a fresh instance of this field's type class.
    #
    # The class is looked up from the current `type` on every call.
    #
    # @param col [Object] the raw value
    # @return [Object] whatever the type's `cast` returns
    def cast_value(col)
      type_name = get_class_for_type(type)
      Kernel.const_get(type_name).new(self).cast(col)
    end

    private

    # Resolves the class constant for the current `type`.
    def get_type
      Object.const_get(get_class_for_type(type))
    end

  end
end
@@ -0,0 +1,48 @@
module TableSchema
  # Small helpers shared by the classes that include this module.
  module Helpers

    # Converts a value to a boolean using the accepted textual spellings.
    #
    # Real booleans are detected with TrueClass/FalseClass. Ruby has no core
    # `Boolean` constant, so testing against one would depend on some other
    # library defining it.
    #
    # @param value [Object] the raw value
    # @return [true, false, nil] the boolean, or nil when not recognised
    def convert_to_boolean(value)
      return value if value.is_a?(TrueClass) || value.is_a?(FalseClass)

      text = value.to_s.downcase
      if true_values.include?(text)
        true
      elsif false_values.include?(text)
        false
      end
    end

    # @return [Array<String>] lower-case spellings accepted as true
    def true_values
      ['yes', 'y', 'true', 't', '1']
    end

    # @return [Array<String>] lower-case spellings accepted as false
    def false_values
      ['no', 'n', 'false', 'f', '0']
    end

    # Builds the fully qualified type class name for a type string.
    #
    # @param type [String] a type name such as 'integer'
    # @return [String] the class name; unknown types map to the String type
    def get_class_for_type(type)
      "TableSchema::Types::#{type_class_lookup[type] || 'String'}"
    end

    # @return [Hash{String=>String}] type name to type class name
    def type_class_lookup
      {
        'any' => 'Any',
        'array' => 'Array',
        'base' => 'Base',
        'boolean' => 'Boolean',
        'date' => 'Date',
        'datetime' => 'DateTime',
        'geojson' => 'GeoJSON',
        'geopoint' => 'GeoPoint',
        'integer' => 'Integer',
        'null' => 'Null',
        'number' => 'Number',
        'object' => 'Object',
        'string' => 'String',
        'time' => 'Time',
      }
    end

  end
end
@@ -0,0 +1,143 @@
module TableSchema
  # Infers a schema from a list of headers and sample rows.
  #
  # Construction runs the inference immediately; the result is available via
  # `schema`.
  class Infer

    include TableSchema::Helpers

    attr_reader :schema

    # @param headers [Array] column names
    # @param rows [Enumerable] sample rows (arrays or CSV::Row instances)
    # @param opts [Hash] :explicit, :primary_key and :row_limit
    def initialize(headers, rows, opts = {})
      @headers = headers
      @rows = rows
      @explicit = opts[:explicit]
      @primary_key = opts[:primary_key]
      @row_limit = opts[:row_limit]

      @schema = {
        'fields' => fields
      }
      @schema['primaryKey'] = @primary_key if @primary_key
      infer!
    end

    # Builds one bare descriptor per header.
    #
    # Unless the :explicit option is exactly `true`, constraints whose value
    # is false are dropped, and an empty constraints hash is omitted.
    #
    # @return [Array<Hash>] field descriptors
    def fields
      explicit = (@explicit == true)

      @headers.map do |header|
        descriptor = {
          'name' => header,
          'title' => '',
          'description' => '',
        }

        constraints = {
          'required' => explicit,
          'unique' => (header == @primary_key)
        }
        constraints.delete_if { |_key, flag| flag == false } unless explicit
        descriptor['constraints'] = constraints if constraints.count > 0
        descriptor
      end
    end

    # Guesses a type for every cell of the sampled rows, resolves one type per
    # column and replaces `@schema` with a TableSchema::Schema.
    def infer!
      type_matches = []
      width = @headers.count

      @rows.each_with_index do |row, row_index|
        # `>=` so that exactly @row_limit rows are sampled.
        break if @row_limit && row_index >= @row_limit
        row = row.fields if row.class == CSV::Row

        fit_row(row, width).each_with_index do |cell, column|
          type_matches[column] ||= []
          type_matches[column] << guess_type(cell, column)
        end
      end

      resolve_types(type_matches)
      @schema = TableSchema::Schema.new(@schema)
    end

    # Guesses type and format for one cell.
    #
    # Types are tried from the end of `available_types` to the start; the
    # first converter whose `test` returns true wins. The fallback is
    # 'string' / 'default'.
    #
    # @param col [Object] the cell value
    # @param index [Integer] the column index
    # @return [Hash] with 'type' and 'format' keys
    def guess_type(col, index)
      guessed_type = 'string'
      guessed_format = 'default'

      available_types.reverse_each do |type|
        klass = get_class_for_type(type)
        converter = Kernel.const_get(klass).new(@schema['fields'][index])
        if converter.test(col) == true
          guessed_type = type
          guessed_format = guess_format(converter, col)
          break
        end
      end

      {
        'type' => guessed_type,
        'format' => guessed_format
      }
    end

    # Finds the first non-default `cast_*` method of the converter that
    # accepts the value without raising a TableSchema::Exception.
    #
    # @return [String] the format name, or 'default' when none applies
    def guess_format(converter, col)
      guessed_format = 'default'
      converter.class.instance_methods.grep(/cast_/).each do |method|
        begin
          # Strip the first 'cast_' to obtain the format name.
          format = method.to_s.sub('cast_', '')
          next if format == 'default'
          converter.send(method, col)
          guessed_format = format
          break
        rescue TableSchema::Exception
          # This format does not fit the value; try the next one.
        end
      end
      guessed_format
    end

    # Picks, for every column, the guess that occurred most often and merges
    # it into the matching field descriptor. Ties go to the guess seen first.
    #
    # @param results [Array<Array<Hash>>] per-column lists of guesses
    def resolve_types(results)
      results.each_with_index do |guesses, column|
        next if guesses.nil? || guesses.empty?

        counts = Hash.new(0)
        guesses.each { |guess| counts[guess] += 1 }
        winner, _count = counts.max_by { |_guess, count| count }

        @schema['fields'][column].merge!(winner)
      end
    end

    # @return [Array<String>] candidate types; later entries are tried first
    def available_types
      [
        'any',
        'string',
        'boolean',
        'number',
        'integer',
        'null',
        'date',
        'time',
        'datetime',
        'array',
        'object',
        'geopoint',
        'geojson'
      ]
    end

    private

    # Returns a row with exactly `width` cells: extra cells are cut off and
    # missing ones are filled with empty strings. The given row is not
    # modified.
    def fit_row(row, width)
      size = row.count
      if size > width
        row.first(width)
      elsif size < width
        row + [''] * (width - size)
      else
        row
      end
    end

  end
end
@@ -0,0 +1,73 @@
module TableSchema
  # Read helpers for a Hash-like schema descriptor.
  #
  # The including object must respond to `[]` / `[]=` and set `@opts` to a
  # Hash-like object before `headers` or `required_headers` is used.
  module Model

    DEFAULTS = {
      'format' => 'default',
      'type' => 'string'
    }

    # @return [Array] field names (lower-cased when the
    #   :case_insensitive_headers option is set); an empty array when a
    #   NoMethodError occurs, e.g. when there are no fields
    def headers
      fields.map { |f| transform(f['name']) }
    rescue NoMethodError
      []
    end

    # @return [Object, nil] the value stored under 'fields'
    def fields
      self['fields']
    end

    # @return [Array] the primary key(s) as a flat array without nils
    def primary_keys
      [self['primaryKey']].flatten.compact
    end

    # @return [Object] the value stored under 'foreignKeys', or an empty array
    def foreign_keys
      self['foreignKeys'] || []
    end

    # @return [Object, nil] the 'type' of the field named `key`
    def get_type(key)
      get_field(key)['type']
    end

    # @return [Object] the 'constraints' of the field named `key`, or {}
    def get_constraints(key)
      get_field(key)['constraints'] || {}
    end

    # @return [Array] names of fields whose 'required' constraint is exactly
    #   true; an empty array when a NoMethodError occurs
    def required_headers
      fields.select { |f| !f['constraints'].nil? && f['constraints']['required'] == true }
            .map { |f| transform(f['name']) }
    rescue NoMethodError
      []
    end

    # @return [Boolean] whether a field named `key` exists
    def has_field?(key)
      !get_field(key).nil?
    end

    # @return [Object, nil] the first field whose 'name' equals `key`
    def get_field(key)
      fields.find { |f| f['name'] == key }
    end

    # @return [Array] fields whose 'type' equals `type`
    def get_fields_by_type(type)
      fields.select { |f| f['type'] == type }
    end

    private

    # Returns the header name, lower-cased when :case_insensitive_headers is
    # set. It uses the non-destructive `downcase` so that reading headers
    # neither rewrites the names stored in the descriptor nor fails on frozen
    # strings.
    def transform(name)
      @opts[:case_insensitive_headers] ? name.downcase : name
    end

    # Fills in the default type and format on fields that lack them.
    def expand!
      (self['fields'] || []).each do |f|
        f['type'] = DEFAULTS['type'] if f['type'].nil?
        f['format'] = DEFAULTS['format'] if f['format'].nil?
      end
    end

    # Replaces every field descriptor with a TableSchema::Field.
    def load_fields!
      self['fields'] = (self['fields'] || []).map { |f| TableSchema::Field.new(f) }
    end

  end
end