tableschema 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +14 -0
  5. data/CHANGELOG.md +31 -0
  6. data/CODE_OF_CONDUCT.md +49 -0
  7. data/Gemfile +4 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +274 -0
  10. data/Rakefile +6 -0
  11. data/bin/console +14 -0
  12. data/bin/setup +8 -0
  13. data/etc/schemas/geojson.json +209 -0
  14. data/etc/schemas/json-table-schema.json +102 -0
  15. data/lib/tableschema.rb +42 -0
  16. data/lib/tableschema/constraints/constraints.rb +76 -0
  17. data/lib/tableschema/constraints/enum.rb +14 -0
  18. data/lib/tableschema/constraints/max_length.rb +15 -0
  19. data/lib/tableschema/constraints/maximum.rb +14 -0
  20. data/lib/tableschema/constraints/min_length.rb +15 -0
  21. data/lib/tableschema/constraints/minimum.rb +14 -0
  22. data/lib/tableschema/constraints/pattern.rb +14 -0
  23. data/lib/tableschema/constraints/required.rb +32 -0
  24. data/lib/tableschema/data.rb +60 -0
  25. data/lib/tableschema/exceptions.rb +28 -0
  26. data/lib/tableschema/field.rb +41 -0
  27. data/lib/tableschema/helpers.rb +48 -0
  28. data/lib/tableschema/infer.rb +143 -0
  29. data/lib/tableschema/model.rb +73 -0
  30. data/lib/tableschema/schema.rb +36 -0
  31. data/lib/tableschema/table.rb +51 -0
  32. data/lib/tableschema/types/any.rb +23 -0
  33. data/lib/tableschema/types/array.rb +37 -0
  34. data/lib/tableschema/types/base.rb +54 -0
  35. data/lib/tableschema/types/boolean.rb +35 -0
  36. data/lib/tableschema/types/date.rb +56 -0
  37. data/lib/tableschema/types/datetime.rb +63 -0
  38. data/lib/tableschema/types/geojson.rb +38 -0
  39. data/lib/tableschema/types/geopoint.rb +56 -0
  40. data/lib/tableschema/types/integer.rb +35 -0
  41. data/lib/tableschema/types/null.rb +37 -0
  42. data/lib/tableschema/types/number.rb +60 -0
  43. data/lib/tableschema/types/object.rb +37 -0
  44. data/lib/tableschema/types/string.rb +64 -0
  45. data/lib/tableschema/types/time.rb +55 -0
  46. data/lib/tableschema/validate.rb +54 -0
  47. data/lib/tableschema/version.rb +3 -0
  48. data/tableschema.gemspec +32 -0
  49. metadata +231 -0
@@ -0,0 +1,36 @@
1
module TableSchema
  # A schema descriptor held as a Hash. The descriptor may be supplied
  # directly as a Hash, or as a String naming a local file or a URL whose
  # contents are JSON.
  class Schema < Hash
    include TableSchema::Validate
    include TableSchema::Model
    include TableSchema::Data
    include TableSchema::Helpers

    # @param descriptor [Hash, String] the schema itself, or a path/URL to a
    #   JSON document containing it
    # @param opts [Hash] stored in @opts for use by the included modules
    # @raise [SchemaException] when the descriptor cannot be loaded
    def initialize(descriptor, opts = {})
      self.merge! parse_schema(descriptor)
      @messages = []
      @opts = opts
      load_fields!
      load_validator!
      expand!
    end

    # Turns the caller-supplied descriptor into a Hash.
    #
    # A String is read explicitly as either a URL or a file. The original
    # implementation used Kernel#open, which executes a shell command when
    # the string starts with "|" and never closed the handle it opened.
    #
    # @param descriptor [Hash, String]
    # @return [Hash] the descriptor (the parsed JSON for String input)
    # @raise [SchemaException] for a missing file, an HTTP error, invalid
    #   JSON, or a descriptor that is neither a Hash nor a String
    def parse_schema(descriptor)
      case descriptor
      when Hash
        descriptor
      when String
        begin
          # Same "looks like scheme://" test open-uri applies before it
          # treats a name as a URL; anything else is read as a local file.
          raw = if descriptor =~ %r{\A[A-Za-z][A-Za-z0-9+\-.]*://} &&
                   (uri = URI.parse(descriptor)).respond_to?(:open)
                  uri.open(&:read)
                else
                  File.read(descriptor)
                end
          JSON.parse raw
        rescue Errno::ENOENT
          raise SchemaException.new("File not found at `#{descriptor}`")
        rescue OpenURI::HTTPError => e
          raise SchemaException.new("URL `#{descriptor}` returned #{e.message}")
        rescue JSON::ParserError
          raise SchemaException.new("File at `#{descriptor}` is not valid JSON")
        end
      else
        raise SchemaException.new("A schema must be a hash, path or URL")
      end
    end

  end
end
@@ -0,0 +1,51 @@
1
module TableSchema
  # Pairs CSV data with a schema. The schema is either built from a supplied
  # descriptor or inferred from the CSV when the descriptor is nil.
  class Table

    attr_reader :schema

    # Builds a Table with a nil descriptor, which makes #initialize infer the
    # schema from the data. Note that the return value is the Table; the
    # inferred schema is available through #schema.
    #
    # @param csv [Array<Array>, String] rows, or a path/URL to a CSV
    # @param opts [Hash] see #initialize
    # @return [TableSchema::Table]
    def self.infer_schema(csv, opts = {})
      TableSchema::Table.new(csv, nil, opts)
    end

    # @param csv [Array<Array>, String] rows (first row is the header), or a
    #   path/URL to a CSV
    # @param descriptor [Hash, String, nil] passed to TableSchema::Schema.new;
    #   nil means "infer the schema from the CSV"
    # @param opts [Hash] :csv_options is merged into the options given to CSV
    def initialize(csv, descriptor, opts = {})
      @opts = opts
      @csv = parse_csv(csv)
      @schema = descriptor.nil? ? infer_schema(@csv) : TableSchema::Schema.new(descriptor)
    end

    # Wraps the input in a CSV reader.
    #
    # The options are splatted as keywords because CSV.new no longer accepts
    # a positional options Hash on Ruby 3.
    #
    # @param csv [Array<Array>, String]
    # @return [CSV]
    def parse_csv(csv)
      io = csv.is_a?(Array) ? StringIO.new(array_to_csv(csv)) : open_source(csv)
      CSV.new(io, **csv_options)
    end

    # @return [Hash] caller-supplied CSV options with headers forced on
    def csv_options
      (@opts[:csv_options] || {}).merge(headers: true)
    end

    # Casts the CSV rows through the schema.
    #
    # @param opts [Hash] :fail_fast (defaults to true when absent or nil),
    #   :limit (forwarded to Schema#cast_rows), :keyed (return Hashes keyed
    #   by header instead of Arrays)
    # @return [Array] cast rows
    def rows(opts = {})
      fail_fast = opts[:fail_fast].nil? || opts[:fail_fast]
      converted = @schema.cast_rows(@csv, fail_fast, opts[:limit])
      opts[:keyed] ? converted_to_hash(@csv.headers, converted) : converted
    end

    private

    # Opens a path or URL for reading without going through Kernel#open,
    # which would run a shell command for input starting with "|".
    def open_source(source)
      # Objects that know how to open themselves (e.g. URI, Pathname).
      return source.open if !source.is_a?(String) && source.respond_to?(:open)

      # Same "looks like scheme://" test open-uri applies to string names.
      if source =~ %r{\A[A-Za-z][A-Za-z0-9+\-.]*://} &&
         (uri = URI.parse(source)).respond_to?(:open)
        uri.open
      else
        File.open(source)
      end
    end

    # Serialises an Array of rows into CSV text, one "\r\n"-joined line per row.
    def array_to_csv(array)
      array.map { |row| row.to_csv(row_sep: nil) }.join("\r\n")
    end

    # Pairs every value with the header at the same position.
    def converted_to_hash(headers, array)
      array.map do |row|
        Hash[row.map.with_index { |col, i| [headers[i], col] }]
      end
    end

    # Reads the header names from the first row, rewinds the reader and hands
    # both to TableSchema::Infer.
    def infer_schema(csv)
      headers = csv.first.to_h.keys
      csv.rewind
      inferer = TableSchema::Infer.new(headers, csv)
      inferer.schema
    end

  end
end
@@ -0,0 +1,23 @@
1
module TableSchema
  module Types
    # Field type whose default cast returns the value untouched.
    class Any < Base

      # @return [Array<String>] constraint names this type supports
      def self.supported_constraints
        %w[required pattern enum]
      end

      # @return [String] the type name
      def name
        'any'
      end

      # @param value [Object]
      # @return [Object] the same value, unchanged
      def cast_default(value)
        value
      end

    end
  end
end
@@ -0,0 +1,37 @@
1
module TableSchema
  module Types
    # Field type for arrays, given either as a Ruby Array or as JSON text.
    class Array < Base

      # @return [Array<String>] constraint names this type supports
      def self.supported_constraints
        %w[required pattern enum minLength maxLength]
      end

      # @return [String] the type name
      def name
        'array'
      end

      # @return [Class] the Ruby class cast values belong to
      def type
        ::Array
      end

      # Returns an Array as-is; anything else is parsed as JSON and must
      # decode to an Array.
      #
      # @param value [::Array, String]
      # @return [::Array]
      # @raise [TableSchema::InvalidArrayType] when parsing fails or the
      #   parsed document is not an Array
      def cast_default(value)
        return value if value.is_a?(type)

        decoded = JSON.parse(value)
        return decoded if decoded.is_a?(type)

        raise TableSchema::InvalidArrayType.new("#{value} is not a valid array")
      rescue StandardError
        # Every failure, including the one raised just above, is reported
        # with the same message.
        raise TableSchema::InvalidArrayType.new("#{value} is not a valid array")
      end

    end
  end
end
@@ -0,0 +1,54 @@
1
module TableSchema
  module Types
    # Shared behaviour for field types: reads the field's constraints, type
    # and format, and dispatches #cast to a `cast_<format>` method that the
    # concrete type is expected to define.
    class Base
      include TableSchema::Helpers

      # @param field [Hash] field descriptor; the 'constraints', 'type' and
      #   'format' keys are read
      def initialize(field)
        @field = field
        @constraints = field['constraints'] || {}
        @required = ['true', true].include?(@constraints['required'])
        @type = @field['type']
        set_format
      end

      # Validates the constraints (unless skipped), maps null-like values to
      # nil, then calls the `cast_<format>` method for this field's format.
      #
      # Support for the format is checked with respond_to? rather than by
      # rescuing NoMethodError and matching its message: the message wording
      # differs between Ruby versions (Ruby 3.4 changed the opening quote),
      # and a NoMethodError raised deeper inside a cast method should not be
      # mistaken for an unsupported format.
      #
      # @param value [Object] the raw value
      # @param skip_constraints [Boolean] when true, constraints are not checked
      # @return [Object, nil] the cast value, or nil for a null-like value on
      #   a non-required field
      # @raise [TableSchema::InvalidFormat] when no `cast_<format>` method exists
      def cast(value, skip_constraints = false)
        TableSchema::Constraints.new(@field, value).validate! unless skip_constraints
        return nil if is_null?(value)

        caster = "cast_#{@format}"
        # `true` includes private methods, matching what `send` can reach.
        unless respond_to?(caster, true)
          raise(TableSchema::InvalidFormat.new("The format `#{@format}` is not supported by the type `#{@type}`"))
        end

        send(caster, value)
      end

      # @param value [Object]
      # @return [Boolean] true when the value casts (constraints skipped)
      #   without raising a TableSchema::Exception
      def test(value)
        cast(value, true)
        true
      rescue TableSchema::Exception
        false
      end

      # Sets @format (and @format_string for "fmt:<pattern>" formats) from the
      # field descriptor; the format falls back to 'default'.
      def set_format
        if (@field['format'] || '').start_with?('fmt:')
          @format, @format_string = *@field['format'].split(':', 2)
        else
          @format = @field['format'] || 'default'
        end
      end

      private

      # A value counts as null only when the field is not required.
      def is_null?(value)
        null_values.include?(value) && @required == false
      end

      # Strings treated as "no value".
      def null_values
        ['null', 'none', 'nil', 'nan', '-', '']
      end

    end
  end
end
@@ -0,0 +1,35 @@
1
# Hack to check against one type from http://stackoverflow.com/a/3028378/452684
# because Ruby doesn't have a single boolean class
module Boolean; end
class TrueClass; include Boolean; end
class FalseClass; include Boolean; end

module TableSchema
  module Types
    # Field type for boolean values.
    class Boolean < Base

      # @return [String] the type name
      def name
        'boolean'
      end

      # @return [Array<String>] constraint names this type supports
      def self.supported_constraints
        [
          'required',
          'pattern',
          'enum',
        ]
      end

      # @return [Module] the marker module mixed into TrueClass and FalseClass
      def type
        ::Boolean
      end

      # Converts the value with `convert_to_boolean`; a nil result is treated
      # as "not a boolean".
      #
      # The result is kept in its own local so the error message can report
      # the offending input. Previously the input was overwritten with nil
      # before the message was built.
      #
      # @param value [Object]
      # @return [Boolean]
      # @raise [TableSchema::InvalidCast] when the conversion yields nil
      def cast_default(value)
        converted = convert_to_boolean(value)
        raise TableSchema::InvalidCast.new("#{value} is not a #{name}") if converted.nil?
        converted
      end

    end
  end
end
@@ -0,0 +1,56 @@
1
module TableSchema
  module Types
    # Field type for calendar dates.
    class Date < Base

      # @return [String] the type name
      def name
        'date'
      end

      # @return [Array<String>] constraint names this type supports
      def self.supported_constraints
        [
          'required',
          'pattern',
          'enum',
          'minimum',
          'maximum',
        ]
      end

      # @return [Class] the Ruby class cast values belong to
      def type
        ::Date
      end

      # @return [String] strptime pattern used by the default format
      def iso8601
        '%Y-%m-%d'
      end

      # Casts using the ISO 8601 pattern. Note that this overwrites
      # @format_string before delegating to #cast_fmt.
      #
      # @param value [::Date, String]
      # @return [::Date]
      # @raise [TableSchema::InvalidDateType]
      def cast_default(value)
        @format_string = iso8601
        cast_fmt(value)
      end

      # Free-form parsing: the text must yield exactly three parsed
      # components before it is handed to Date.parse.
      #
      # Date.parse itself can still reject the text (for example an
      # impossible day such as "2015-02-30"), and non-string input makes the
      # parser raise TypeError; both are reported as InvalidDateType instead
      # of escaping as raw Ruby errors.
      #
      # @param value [::Date, String]
      # @return [::Date]
      # @raise [TableSchema::InvalidDateType]
      def cast_any(value)
        return value if value.is_a?(type)

        begin
          parts = ::Date._parse(value)
          if parts.values.count == 3
            ::Date.parse(value)
          else
            raise TableSchema::InvalidDateType.new("#{value} is not a valid date")
          end
        rescue ArgumentError, TypeError
          raise TableSchema::InvalidDateType.new("#{value} is not a valid date")
        end
      end

      # Parses the value with the strptime pattern held in @format_string.
      #
      # @param value [::Date, String]
      # @return [::Date]
      # @raise [TableSchema::InvalidDateType] when the text does not match
      #   the pattern or the value is not a string
      def cast_fmt(value)
        return value if value.is_a?(type)

        begin
          return ::Date.strptime(value, @format_string)
        rescue ArgumentError, TypeError
          raise TableSchema::InvalidDateType.new("#{value} is not a valid date")
        end
      end

    end
  end
end
@@ -0,0 +1,63 @@
1
module TableSchema
  module Types
    # Field type for date-time values.
    class DateTime < Base

      # @return [String] the type name
      def name
        'datetime'
      end

      # @return [Array<String>] constraint names this type supports
      def self.supported_constraints
        [
          'required',
          'pattern',
          'enum',
          'minimum',
          'maximum'
        ]
      end

      # @return [Class] the Ruby class cast values belong to
      def type
        ::DateTime
      end

      # @return [String] strptime pattern used by the default format
      def iso8601
        '%Y-%m-%dT%H:%M:%SZ'
      end

      # Unused notes kept from the original source (they are not Ruby code):
      # raw_formats = ['DD/MM/YYYYThh/mm/ss']
      # py_formats = ['%Y/%m/%dT%H:%M:%S']
      # format_map = dict(zip(raw_formats, py_formats))

      # Casts using the ISO 8601 pattern. Note that this overwrites
      # @format_string before delegating to #cast_fmt.
      #
      # @param value [::DateTime, String]
      # @return [::DateTime]
      # @raise [TableSchema::InvalidDateTimeType]
      def cast_default(value)
        @format_string = iso8601
        cast_fmt(value)
      end

      # Free-form parsing: the text must yield at least four parsed
      # components before it is handed to DateTime.parse.
      #
      # @param value [::DateTime, String]
      # @return [::DateTime]
      # @raise [TableSchema::InvalidDateTimeType] when the text cannot be
      #   parsed or the value is not a string
      def cast_any(value)
        return value if value.is_a?(type)

        begin
          parts = ::DateTime._parse(value)
          if parts.values.count >= 4
            ::DateTime.parse(value)
          else
            raise TableSchema::InvalidDateTimeType.new("#{value} is not a valid datetime")
          end
        rescue ArgumentError, TypeError
          raise TableSchema::InvalidDateTimeType.new("#{value} is not a valid datetime")
        end
      end

      # Parses the value with the strptime pattern held in @format_string.
      #
      # @param value [::DateTime, String]
      # @return [::DateTime]
      # @raise [TableSchema::InvalidDateTimeType] when the text does not
      #   match the pattern or the value is not a string
      def cast_fmt(value)
        return value if value.is_a?(type)

        begin
          return ::DateTime.strptime(value, @format_string)
        rescue ArgumentError, TypeError
          # The message now says "datetime", matching #cast_any.
          raise TableSchema::InvalidDateTimeType.new("#{value} is not a valid datetime")
        end
      end

    end
  end
end
@@ -0,0 +1,38 @@
1
module TableSchema
  module Types
    # Field type for GeoJSON documents, validated against the JSON schema
    # stored in etc/schemas/geojson.json.
    class GeoJSON < Base

      # @return [String] the type name
      def name
        'geojson'
      end

      # @return [Array<String>] constraint names this type supports
      def self.supported_constraints
        [
          'required',
          'pattern',
          'enum'
        ]
      end

      # @return [Class] the Ruby class cast values belong to
      def type
        ::Hash
      end

      # Parses the value as JSON unless it is already a Hash, then validates
      # it against the GeoJSON schema.
      #
      # TypeError is rescued as well: JSON.parse raises it for input that is
      # neither a String nor a Hash (nil, for instance), and that previously
      # escaped as a raw Ruby error.
      #
      # @param value [::Hash, String]
      # @return [::Hash] the (parsed) document
      # @raise [TableSchema::InvalidGeoJSONType] when parsing or validation fails
      def cast_default(value)
        value = JSON.parse(value) unless value.is_a?(type)
        JSON::Validator.validate!(geojson_schema, value)
        value
      rescue JSON::Schema::ValidationError, JSON::ParserError, TypeError
        raise TableSchema::InvalidGeoJSONType.new("#{value} is not valid GeoJSON")
      end

      private

      # Loads the schema file once per instance; the path is only built on
      # the first call.
      def geojson_schema
        @geojson_schema ||= begin
          path = File.join(File.dirname(__FILE__), "..", "..", "..", "etc", "schemas", "geojson.json")
          JSON.parse File.read(path)
        end
      end

    end
  end
end
@@ -0,0 +1,56 @@
1
module TableSchema
  module Types
    # Field type for geographic points. Every format is reduced to a
    # two-element [longitude, latitude] Array of Floats.
    class GeoPoint < Base

      # @return [String] the type name
      def name
        'geopoint'
      end

      # @return [Array<String>] constraint names this type supports
      def self.supported_constraints
        [
          'required',
          'pattern',
          'enum'
        ]
      end

      # @return [Array<Class>] Ruby classes listed for this type
      def types
        [::String, ::Array, ::Hash]
      end

      # Default format: a "longitude,latitude" string.
      #
      # @param value [String]
      # @return [Array<Float>] [longitude, latitude]
      # @raise [TableSchema::InvalidGeoPointType]
      def cast_default(value)
        longitude, latitude = value.split(',', 2)
        cast_array([longitude, latitude])
      end

      # Object format: a Hash (or its JSON text) with 'longitude' and
      # 'latitude' keys.
      #
      # @param value [::Hash, String]
      # @return [Array<Float>] [longitude, latitude]
      # @raise [TableSchema::InvalidGeoPointType] when the JSON is invalid or
      #   the value is not a Hash
      def cast_object(value)
        value = JSON.parse(value) if value.is_a?(::String)
        # Guard against nil, Arrays and other values whose `[]` would raise
        # or return something meaningless for a String key.
        unless value.is_a?(::Hash)
          raise TableSchema::InvalidGeoPointType.new("#{value} is not a valid geopoint")
        end
        cast_array([value['longitude'], value['latitude']])
      rescue JSON::ParserError
        raise TableSchema::InvalidGeoPointType.new("#{value} is not a valid geopoint")
      end

      # Array format: a two-element Array (or its JSON text) holding
      # longitude then latitude.
      #
      # @param value [::Array, String]
      # @return [Array<Float>] [longitude, latitude]
      # @raise [TableSchema::InvalidGeoPointType] when the value is not a
      #   two-element Array of numeric values within range
      def cast_array(value)
        value = JSON.parse(value) if value.is_a?(::String)
        # Without this guard nil raised NoMethodError and an Integer was
        # silently indexed bit by bit.
        unless value.is_a?(::Array) && value.size == 2
          raise TableSchema::InvalidGeoPointType.new("#{value} is not a valid geopoint")
        end
        value = [Float(value[0]), Float(value[1])]
        check_latlng_range(value)
        value
      rescue JSON::ParserError, ArgumentError, TypeError
        raise TableSchema::InvalidGeoPointType.new("#{value} is not a valid geopoint")
      end

      private

      # Checks both coordinates against their inclusive ranges. The boundary
      # values are accepted (a latitude of 90 is the North Pole), and each
      # message names the coordinate that is actually out of range.
      def check_latlng_range(geopoint)
        longitude = geopoint[0]
        latitude = geopoint[1]
        if longitude > 180 || longitude < -180
          raise TableSchema::InvalidGeoPointType.new("longitude should be between -180 and 180, found `#{longitude}`")
        elsif latitude > 90 || latitude < -90
          raise TableSchema::InvalidGeoPointType.new("latitude should be between -90 and 90, found `#{latitude}`")
        end
      end

    end
  end
end