csvlint 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +8 -8
  2. data/.gitignore +7 -1
  3. data/CHANGELOG.md +19 -1
  4. data/README.md +93 -36
  5. data/bin/csvlint +68 -27
  6. data/csvlint.gemspec +2 -0
  7. data/features/csvw_schema_validation.feature +127 -0
  8. data/features/fixtures/spreadsheet.xlsx +0 -0
  9. data/features/sources.feature +3 -4
  10. data/features/step_definitions/parse_csv_steps.rb +13 -1
  11. data/features/step_definitions/schema_validation_steps.rb +27 -1
  12. data/features/step_definitions/sources_steps.rb +1 -1
  13. data/features/step_definitions/validation_errors_steps.rb +48 -1
  14. data/features/step_definitions/validation_info_steps.rb +5 -1
  15. data/features/step_definitions/validation_warnings_steps.rb +15 -1
  16. data/features/support/load_tests.rb +114 -0
  17. data/features/validation_errors.feature +12 -24
  18. data/features/validation_warnings.feature +18 -6
  19. data/lib/csvlint.rb +10 -0
  20. data/lib/csvlint/csvw/column.rb +359 -0
  21. data/lib/csvlint/csvw/date_format.rb +182 -0
  22. data/lib/csvlint/csvw/metadata_error.rb +13 -0
  23. data/lib/csvlint/csvw/number_format.rb +211 -0
  24. data/lib/csvlint/csvw/property_checker.rb +761 -0
  25. data/lib/csvlint/csvw/table.rb +204 -0
  26. data/lib/csvlint/csvw/table_group.rb +165 -0
  27. data/lib/csvlint/schema.rb +40 -23
  28. data/lib/csvlint/validate.rb +142 -19
  29. data/lib/csvlint/version.rb +1 -1
  30. data/spec/csvw/column_spec.rb +112 -0
  31. data/spec/csvw/date_format_spec.rb +49 -0
  32. data/spec/csvw/number_format_spec.rb +403 -0
  33. data/spec/csvw/table_group_spec.rb +143 -0
  34. data/spec/csvw/table_spec.rb +90 -0
  35. data/spec/schema_spec.rb +27 -1
  36. data/spec/spec_helper.rb +0 -1
  37. data/spec/validator_spec.rb +16 -10
  38. metadata +53 -2
@@ -0,0 +1,204 @@
1
+ module Csvlint
2
+ module Csvw
3
+ class Table
4
+
5
+ include Csvlint::ErrorCollector
6
+
7
+ attr_reader :columns, :dialect, :table_direction, :foreign_keys, :foreign_key_references, :id, :notes, :primary_key, :schema, :suppress_output, :transformations, :url, :annotations
8
+
9
# Stores the table description and prepares the bookkeeping used while
# validating rows.
#
# url is kept as given and every keyword argument is stored in the instance
# variable of the same name. Three empty hashes are created to remember the
# primary key values, outgoing foreign key values and referenced values seen
# during validation, plus an empty list of foreign keys that reference this
# table.
#
# After +reset+, the supplied +warnings+ and whatever errors and warnings the
# columns already hold are copied onto this table.
def initialize(url, columns: [], dialect: {}, table_direction: :auto, foreign_keys: [], id: nil, notes: [], primary_key: nil, schema: nil, suppress_output: false, transformations: [], annotations: [], warnings: [])
  # descriptive properties
  @url = url
  @id = id
  @schema = schema
  @notes = notes
  @annotations = annotations
  @dialect = dialect
  @table_direction = table_direction
  @suppress_output = suppress_output
  @transformations = transformations

  # structure
  @columns = columns
  @primary_key = primary_key
  @foreign_keys = foreign_keys
  @foreign_key_references = []

  # values recorded row by row during validation
  @primary_key_values = {}
  @foreign_key_values = {}
  @foreign_key_reference_values = {}

  reset
  @warnings += warnings
  @errors += columns.map(&:errors).flatten
  @warnings += columns.map(&:warnings).flatten
end
31
+
32
# Checks each header cell against the column description at the same
# position.
#
# Errors and warnings reported by the columns are gathered on this table. A
# header cell with no column description at its position is reported as
# :malformed_header on row 1. Nothing is checked when the table has no
# column descriptions at all.
#
# Returns the result of +valid?+.
def validate_header(headers)
  reset
  unless columns.empty?
    headers.each_with_index do |header, position|
      column = columns[position]
      unless column
        build_errors(:malformed_header, :schema, 1, nil, header, nil)
        next
      end
      column.validate_header(header)
      @errors += column.errors
      @warnings += column.warnings
    end
  end
  valid?
end
45
+
46
# Validates one row of cell values and records the key values it contains.
#
# values - the cells of the row, in column order
# row    - the row number passed through to columns and error reports
#
# Each value is validated by the column at the same position; a value with
# no column description is reported as :too_many_values (skipped entirely
# when the table has no column descriptions). When a primary key is set, a
# repeated key is reported as :duplicate_key together with the row that
# first used it. The row's values are also remembered for the foreign key
# checks that run once all tables have been read.
#
# Returns the result of +valid?+.
def validate_row(values, row=nil)
  reset

  # parses the cells belonging to the given columns into a composite key
  key_for = lambda do |key_columns|
    key_columns.map { |column| column.parse(values[column.number - 1], row) }
  end

  unless columns.empty?
    values.each_with_index do |value, position|
      column = columns[position]
      unless column
        build_errors(:too_many_values, :schema, row, nil, value, nil)
        next
      end
      column.validate(value, row)
      @errors += column.errors
      @warnings += column.warnings
    end
  end

  unless @primary_key.nil?
    key = key_for.call(@primary_key)
    if @primary_key_values.include?(key)
      build_errors(:duplicate_key, :schema, row, nil, key.join(","), @primary_key_values[key])
    end
    @primary_key_values[key] = row
  end

  # values in this table that foreign keys from other tables point at,
  # mapped to the rows they appear on
  @foreign_key_references.each do |foreign_key|
    key = key_for.call(foreign_key["referenced_columns"])
    rows_by_key = (@foreign_key_reference_values[foreign_key] ||= {})
    (rows_by_key[key] ||= []) << row
  end

  # values this row uses to point at other tables; they cannot be checked
  # yet because those tables might not have been parsed
  @foreign_keys.each do |foreign_key|
    key = key_for.call(foreign_key["referencing_columns"])
    seen_keys = (@foreign_key_values[foreign_key] ||= [])
    seen_keys << key unless seen_keys.include?(key)
  end

  valid?
end
83
+
84
# Checks every foreign key declared on this table against the table it
# references.
#
# For each foreign key, the values recorded by validate_row are handed to the
# referenced table's validate_foreign_key_references, and the errors and
# warnings it reports are collected on this table.
#
# A table for which no row was validated has no recorded values; it is
# treated as having no references to check rather than passing nil on.
#
# Results are gathered in local arrays and assigned at the end, because a
# foreign key may reference this very table: in that case the callee's
# +reset+ clears @errors/@warnings, which would otherwise discard what
# earlier foreign keys in this loop reported.
#
# Returns the result of +valid?+.
def validate_foreign_keys
  reset
  collected_errors = []
  collected_warnings = []
  @foreign_keys.each do |foreign_key|
    local = @foreign_key_values[foreign_key] || []
    remote_table = foreign_key["referenced_table"]
    remote_table.validate_foreign_key_references(foreign_key, @url, local)
    collected_errors += remote_table.errors
    collected_warnings += remote_table.warnings
  end
  @errors = collected_errors
  @warnings = collected_warnings
  valid?
end
95
+
96
# Checks that each value another table uses as a foreign key matches exactly
# one row of this table.
#
# foreign_key - the foreign key description; its "columnReference" and
#               "reference"/"columnReference" entries are copied into the
#               error context
# remote_url  - URL of the referencing table
# remote      - the key values collected from the referencing table
#
# A value matching no row is reported as :unmatched_foreign_key_reference; a
# value matching more than one row as :multiple_matched_rows. The context
# attached to each error names both tables by the last path segment of their
# URLs.
#
# If no row of this table was validated there are no recorded values, so
# every remote value is reported as unmatched. A nil +remote+ is treated as
# an empty list.
#
# Returns the result of +valid?+.
def validate_foreign_key_references(foreign_key, remote_url, remote)
  reset
  local = @foreign_key_reference_values[foreign_key] || {}
  context = { "from" => { "url" => remote_url.to_s.split("/")[-1], "columns" => foreign_key["columnReference"] }, "to" => { "url" => @url.to_s.split("/")[-1], "columns" => foreign_key["reference"]["columnReference"] }}
  (remote || []).each do |r|
    matching_rows = local[r]
    if matching_rows
      build_errors(:multiple_matched_rows, :schema, nil, nil, r, context) if matching_rows.length > 1
    else
      build_errors(:unmatched_foreign_key_reference, :schema, nil, nil, r, context)
    end
  end
  valid?
end
109
+
110
# Builds a Table from one table description of a metadata document.
#
# table_desc           - Hash holding the table description
# base_url, lang       - passed through to the property checker and to
#                        Column.from_json
# inherited_properties - properties inherited from the enclosing description;
#                        a copy is extended here, the argument is not modified
#
# Every property except "@type" and "notes" is run through
# PropertyChecker.check_property and sorted by the type it reports:
# annotations, table/common properties, or (anything else) properties to be
# inherited by the columns. Column-only properties are reported as
# :invalid_property warnings.
#
# Primary key, foreign key and rowTitles references are resolved by column
# name. Unknown primaryKey references only produce a warning (and leave the
# table without a primary key); the other problems raise.
#
# NOTE: each foreign key Hash in the schema gains a "referencing_columns"
# entry, i.e. the input description is modified in place.
#
# Raises Csvlint::Csvw::MetadataError when "@type" is not 'Table', when the
# schema's columns are not an array, when a non-virtual column follows a
# virtual one, when two columns share a name, or when a foreign key or
# rowTitles entry names a column that does not exist.
#
# Returns the new Table.
def self.from_json(table_desc, base_url=nil, lang="und", inherited_properties={})
  annotations = {}
  warnings = []
  table_properties = {}
  columns = []
  notes = []
  inherited_properties = inherited_properties.clone
  table_path = "$.tables[?(@.url = '#{table_desc["url"]}')]"

  table_desc.each do |property, value|
    if property == "@type"
      raise Csvlint::Csvw::MetadataError.new("#{table_path}.@type"), "@type of table is not 'Table'" unless value == 'Table'
    elsif property == "notes"
      notes = value
    else
      v, warning, type = Csvlint::Csvw::PropertyChecker.check_property(property, value, base_url, lang)
      unless warning.nil? || warning.empty?
        warnings += Array(warning).map { |w| Csvlint::ErrorMessage.new(w, :metadata, nil, nil, "#{property}: #{value}", nil) }
      end
      case type
      when :annotation
        annotations[property] = v
      when :table, :common
        table_properties[property] = v
      when :column
        warnings << Csvlint::ErrorMessage.new(:invalid_property, :metadata, nil, nil, "#{property}", nil)
      else
        inherited_properties[property] = v
      end
    end
  end

  table_schema = table_properties["tableSchema"] || inherited_properties["tableSchema"]
  # Named columns keyed by name. Unnamed columns are kept in +columns+ only,
  # so references must be resolved through this hash rather than by position.
  columns_by_name = {}
  foreign_keys = []
  primary_key_columns = []
  primary_key_valid = true

  if table_schema
    schema_path = "#{table_path}.tableSchema"
    raise Csvlint::Csvw::MetadataError.new("#{schema_path}.columns"), "schema columns is not an array" unless table_schema["columns"].instance_of? Array

    virtual_columns = false
    table_schema["columns"].each_with_index do |column_desc, i|
      if column_desc.instance_of? Hash
        column = Csvlint::Csvw::Column.from_json(i+1, column_desc, base_url, lang, inherited_properties)
        raise Csvlint::Csvw::MetadataError.new("#{schema_path}.columns[#{i}].virtual"), "virtual columns before non-virtual column #{column.name || i}" if virtual_columns && !column.virtual
        virtual_columns ||= column.virtual
        raise Csvlint::Csvw::MetadataError.new("#{schema_path}.columns"), "multiple columns named #{column.name}" if columns_by_name.key?(column.name)
        columns_by_name[column.name] = column unless column.name.nil?
        columns << column
      else
        warnings << Csvlint::ErrorMessage.new(:invalid_column_description, :metadata, nil, nil, "#{column_desc}", nil)
      end
    end

    Array(table_schema["primaryKey"]).each do |reference|
      column = columns_by_name[reference]
      if column
        primary_key_columns << column
      else
        warnings << Csvlint::ErrorMessage.new(:invalid_column_reference, :metadata, nil, nil, "primaryKey: #{reference}", nil)
        primary_key_valid = false
      end
    end

    foreign_keys = table_schema["foreignKeys"] || []
    foreign_keys.each_with_index do |foreign_key, fk_index|
      referencing_columns = []
      # Array() accepts a single column name as well as a list of names
      Array(foreign_key["columnReference"]).each do |reference|
        column = columns_by_name[reference]
        raise Csvlint::Csvw::MetadataError.new("#{schema_path}.foreignKeys[#{fk_index}].columnReference"), "foreignKey references non-existant column" if column.nil?
        referencing_columns << column
      end
      foreign_key["referencing_columns"] = referencing_columns
    end

    Array(table_schema["rowTitles"]).each_with_index do |row_title, i|
      raise Csvlint::Csvw::MetadataError.new("#{schema_path}.rowTitles[#{i}]"), "rowTitles references non-existant column" unless columns_by_name.key?(row_title)
    end
  end

  # a primary key is only set when every reference resolved and there is at
  # least one
  primary_key = (primary_key_valid && !primary_key_columns.empty?) ? primary_key_columns : nil

  new(table_properties["url"],
    id: table_properties["@id"],
    columns: columns,
    dialect: table_properties["dialect"],
    foreign_keys: foreign_keys,
    notes: notes,
    primary_key: primary_key,
    schema: table_schema ? table_schema["@id"] : nil,
    annotations: annotations,
    warnings: warnings
  )
end
201
+
202
+ end
203
+ end
204
+ end
@@ -0,0 +1,165 @@
1
+ module Csvlint
2
+ module Csvw
3
+ class TableGroup
4
+
5
+ include Csvlint::ErrorCollector
6
+
7
+ attr_reader :url, :id, :tables, :notes, :annotations
8
+
9
# Stores the group's properties and its tables.
#
# tables maps each table URL to its table object. Every URL starts out
# flagged as not yet validated. After +reset+, the supplied +warnings+ and
# the errors and warnings already held by the tables are copied onto the
# group.
def initialize(url, id: nil, tables: {}, notes: [], annotations: {}, warnings: [])
  @url, @id = url, id
  @notes = notes
  @annotations = annotations
  @tables = tables

  # no table has been validated yet
  @validated_tables = {}
  @tables.each { |table_url, _table| @validated_tables[table_url] = false }

  reset
  @warnings += warnings
  table_errors = @tables.map { |_table_url, table| table.errors }
  table_warnings = @tables.map { |_table_url, table| table.warnings }
  @errors += table_errors.flatten
  @warnings += table_warnings.flatten
end
22
+
23
# Validates a header row against the table registered for table_url.
#
# header    - the header cells
# table_url - key of the table in +tables+; a File instance is first
#             converted to "file:" followed by its absolute path
#
# The table's errors and warnings are copied onto the group.
#
# NOTE(review): when no table is registered under table_url the lookup
# yields nil and the call below raises NoMethodError.
#
# Returns the result of +valid?+.
def validate_header(header, table_url)
  reset
  if table_url.instance_of? File
    table_url = "file:#{File.absolute_path(table_url)}"
  end
  described_table = tables[table_url]
  described_table.validate_header(header)
  @errors += described_table.errors
  @warnings += described_table.warnings
  valid?
end
32
+
33
# Validates one row against the table registered for table_url and marks
# that table as validated.
#
# values     - the cells of the row
# row        - row number, passed on to the table
# all_errors - accepted but not used here
# table_url  - key of the table in +tables+; a File instance is first
#              converted to "file:" followed by its absolute path
#
# The table's errors and warnings are copied onto the group.
#
# Returns the result of +valid?+.
def validate_row(values, row=nil, all_errors=[], table_url)
  reset
  if table_url.instance_of? File
    table_url = "file:#{File.absolute_path(table_url)}"
  end
  @validated_tables[table_url] = true
  described_table = tables[table_url]
  described_table.validate_row(values, row)
  @errors += described_table.errors
  @warnings += described_table.warnings
  valid?
end
43
+
44
# Runs the foreign key checks of every table, but only once all tables in
# the group have been marked as validated; until then nothing is checked.
#
# Errors and warnings reported by the tables are copied onto the group.
#
# Returns the result of +valid?+.
def validate_foreign_keys
  reset
  all_validated = !@validated_tables.has_value?(false)
  if all_validated
    @tables.each do |_table_url, table|
      table.validate_foreign_keys
      @errors += table.errors
      @warnings += table.warnings
    end
  end
  valid?
end
55
+
56
# Builds a TableGroup from a parsed metadata document.
#
# url  - location of the metadata document; used as the base URL unless the
#        context supplies "@base"
# json - the parsed document (a Hash). A document describing a single table
#        (it has "url" but no "tables") is wrapped into a one-table group.
#
# The second entry of an array "@context" may set "@base" and "@language".
# Every top-level property other than 'tables', 'notes' and '@type' is run
# through PropertyChecker.check_property and sorted by the type it reports.
# Each table description is turned into a Table, keyed by its URL resolved
# against the base URL. Finally every foreign key is linked to the table and
# columns it references.
#
# NOTE: the input is modified in place: "@context" is removed, each table's
# "url" is replaced by the resolved URL, and each foreign key Hash gains
# "referenced_table" and "referenced_columns" entries.
#
# Raises Csvlint::Csvw::MetadataError when the context holds anything other
# than "@base"/"@language", when "@type" is not 'TableGroup', when "tables"
# is missing, empty or not an array, or when a foreign key references a
# table, schema or column that does not exist.
#
# Returns the new TableGroup.
def self.from_json(url, json)
  warnings = []
  tables = {}
  annotations = {}
  inherited_properties = {}
  common_properties = {}
  base_url = URI(url.to_s.strip)
  lang = "und"

  context = json["@context"]
  if context.instance_of?(Array) && context[1]
    context[1].each do |property, value|
      v, warning, type = Csvlint::Csvw::PropertyChecker.check_property(property, value, base_url, lang)
      if warning.nil? || warning.empty?
        raise Csvlint::Csvw::MetadataError.new("$.@context"), "@context contains properties other than @base or @language (#{property})" unless type == :context
        base_url = v if property == "@base"
        lang = v if property == "@language"
      else
        raise Csvlint::Csvw::MetadataError.new("$.@context"), "@context contains properties other than @base or @language (#{property})" unless ["@base", "@language"].include?(property)
        warnings += Array(warning).map { |w| Csvlint::ErrorMessage.new(w, :metadata, nil, nil, "@context: #{property}: #{value}", nil) }
      end
    end
  end
  json.delete("@context")

  # a lone table description is treated as a group containing that table
  json = { "tables" => [json] } if !json["tables"] && json["url"]

  json.each do |property, value|
    next if VALID_PROPERTIES.include? property
    v, warning, type = Csvlint::Csvw::PropertyChecker.check_property(property, value, base_url, lang)
    unless warning.nil? || warning.empty?
      warnings += Array(warning).map { |w| Csvlint::ErrorMessage.new(w, :metadata, nil, nil, "#{property}: #{value}", nil) }
    end
    case type
    when :annotation
      annotations[property] = v
    when :common
      common_properties[property] = v
    when :column
      warnings << Csvlint::ErrorMessage.new(:invalid_property, :metadata, nil, nil, "#{property}", nil)
    else
      inherited_properties[property] = v
    end
  end

  id = common_properties["@id"]

  raise Csvlint::Csvw::MetadataError.new("$.@type"), "@type of table group is not 'TableGroup'" if json["@type"] && json["@type"] != 'TableGroup'

  table_descs = json["tables"]
  raise Csvlint::Csvw::MetadataError.new("$"), "no tables property" unless table_descs
  # respond_to? guard: values such as numbers have no empty? and must fall
  # through to the "not an array" error below instead of raising NoMethodError
  raise Csvlint::Csvw::MetadataError.new("$.tables"), "empty tables property" if table_descs.respond_to?(:empty?) && table_descs.empty?
  raise Csvlint::Csvw::MetadataError.new("$.tables"), "tables property is not an array" unless table_descs.instance_of? Array

  table_descs.each do |table_desc|
    if table_desc.instance_of? Hash
      table_url = table_desc["url"]
      unless table_url.instance_of? String
        warnings << Csvlint::ErrorMessage.new(:invalid_url, :metadata, nil, nil, "url: #{table_url}", nil)
        table_url = ""
      end
      table_url = URI.join(base_url, table_url).to_s
      table_desc["url"] = table_url
      tables[table_url] = Csvlint::Csvw::Table.from_json(table_desc, base_url, lang, inherited_properties)
    else
      warnings << Csvlint::ErrorMessage.new(:invalid_table_description, :metadata, nil, nil, "#{table_desc}", nil)
    end
  end

  tables.each do |table_url, table|
    table.foreign_keys.each_with_index do |foreign_key, i|
      # path of this foreign key within the metadata document
      foreign_key_path = "$.tables[?(@.url = '#{table_url}')].tableSchema.foreignKeys[#{i}]"
      reference = foreign_key["reference"]
      if reference["resource"]
        resource = URI.join(base_url, reference["resource"]).to_s
        referenced_table = tables[resource]
        raise Csvlint::Csvw::MetadataError.new("#{foreign_key_path}.reference.resource"), "foreign key references table that does not exist (#{resource})" if referenced_table.nil?
        referenced_name = resource
      else
        schema_url = URI.join(base_url, reference["schemaReference"]).to_s
        # the first table using the referenced schema is the target
        referenced_table = tables.values.find { |candidate| candidate.schema == schema_url }
        raise Csvlint::Csvw::MetadataError.new("#{foreign_key_path}.reference.schemaReference"), "foreign key references schema that is not used (#{schema_url})" if referenced_table.nil?
        referenced_name = referenced_table.url
      end
      foreign_key["referenced_table"] = referenced_table

      table_columns = {}
      referenced_table.columns.each do |column|
        table_columns[column.name] = column if column.name
      end

      referenced_columns = []
      Array(reference["columnReference"]).each do |column_reference|
        column = table_columns[column_reference]
        raise Csvlint::Csvw::MetadataError.new("#{foreign_key_path}.reference.columnReference"), "column named #{column_reference} does not exist in #{referenced_name}" if column.nil?
        referenced_columns << column
      end
      foreign_key["referenced_columns"] = referenced_columns
      referenced_table.foreign_key_references << foreign_key
    end
  end

  new(base_url, id: id, tables: tables, notes: json["notes"] || [], annotations: annotations, warnings: warnings)
end
159
+
160
private

# Top-level metadata properties that are read directly instead of being run
# through the property checker. Frozen so the shared list cannot be altered
# at runtime.
# Note: +private+ only affects methods; the constant itself stays reachable
# from outside the class.
VALID_PROPERTIES = ['tables', 'notes', '@type'].freeze
162
+
163
+ end
164
+ end
165
+ end
@@ -14,7 +14,45 @@ module Csvlint
14
14
  reset
15
15
  end
16
16
 
17
- def validate_header(header)
17
+ class << self
18
+
19
# Builds a Schema from a parsed JSON table schema.
#
# uri  - location the schema came from, stored on the Schema
# json - Hash that may hold "fields", "title" and "description"; each field
#        description contributes its "name", "constraints", "title" and
#        "description" to a Csvlint::Field
#
# A document without "fields" yields a Schema with no fields.
def from_json_table(uri, json)
  field_descriptions = json["fields"] || []
  fields = field_descriptions.map do |description|
    Csvlint::Field.new(description["name"], description["constraints"],
                       description["title"], description["description"])
  end
  Schema.new(uri, fields, json["title"], json["description"])
end
27
+
28
# Hands a parsed CSVW metadata document to Csvlint::Csvw::TableGroup.from_json
# and returns whatever it builds.
def from_csvw_metadata(uri, json)
  Csvlint::Csvw::TableGroup.from_json(uri, json)
end
31
+
32
# Reads a JSON schema document and builds the matching schema object.
#
# uri - path or URL of the document
#
# A document with an "@context" entry is handed to Schema.from_csvw_metadata;
# anything that is not an http(s) URL is first rewritten as "file:" followed
# by its expanded path. Every other document goes to Schema.from_json_table.
#
# Csvlint::Csvw::MetadataError and OpenURI::HTTPError are passed on to the
# caller. Any other failure (unreadable file, malformed JSON, ...) is
# reported on STDERR and answered with a placeholder Schema titled
# "malformed".
#
# NOTE(review): Kernel#open runs a command when its argument starts with
# "|"; callers must not pass untrusted input here.
def load_from_json(uri)
  # block form so the handle is closed once the content has been read
  json = JSON.parse(open(uri) { |io| io.read })
  return Schema.from_json_table(uri, json) unless json["@context"]

  # Only a real http(s) URL is left alone; a local path that merely starts
  # with "http" is still a file.
  uri = "file:#{File.expand_path(uri)}" unless uri.to_s =~ %r{\Ahttps?://}i
  Schema.from_csvw_metadata(uri, json)
rescue Csvlint::Csvw::MetadataError, OpenURI::HTTPError
  raise
rescue => e
  STDERR.puts e.class
  STDERR.puts e.message
  STDERR.puts e.backtrace
  Schema.new(nil, [], "malformed", "malformed")
end
52
+
53
+ end
54
+
55
+ def validate_header(header, source_url=nil)
18
56
  reset
19
57
 
20
58
  found_header = header.to_csv(:row_sep => '')
@@ -25,7 +63,7 @@ module Csvlint
25
63
  return valid?
26
64
  end
27
65
 
28
- def validate_row(values, row=nil, all_errors=[])
66
+ def validate_row(values, row=nil, all_errors=[], source_url=nil)
29
67
  reset
30
68
  if values.length < fields.length
31
69
  fields[values.size..-1].each_with_index do |field, i|
@@ -48,26 +86,5 @@ module Csvlint
48
86
  return valid?
49
87
  end
50
88
 
51
- def Schema.from_json_table(uri, json)
52
- fields = []
53
- json["fields"].each do |field_desc|
54
- fields << Csvlint::Field.new( field_desc["name"] , field_desc["constraints"],
55
- field_desc["title"], field_desc["description"] )
56
- end if json["fields"]
57
- return Schema.new( uri , fields, json["title"], json["description"] )
58
- end
59
-
60
- # Difference in functionality between from_json_table and load_from_json_table
61
- # needs to be specified
62
-
63
- def Schema.load_from_json_table(uri)
64
- begin
65
- json = JSON.parse( open(uri).read )
66
- return Schema.from_json_table(uri,json)
67
- rescue
68
- return Schema.new(nil, [], "malformed", "malformed")
69
- end
70
- end
71
-
72
89
  end
73
90
  end