csvlint 0.1.4 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. checksums.yaml +8 -8
  2. data/.gitignore +7 -1
  3. data/CHANGELOG.md +19 -1
  4. data/README.md +93 -36
  5. data/bin/csvlint +68 -27
  6. data/csvlint.gemspec +2 -0
  7. data/features/csvw_schema_validation.feature +127 -0
  8. data/features/fixtures/spreadsheet.xlsx +0 -0
  9. data/features/sources.feature +3 -4
  10. data/features/step_definitions/parse_csv_steps.rb +13 -1
  11. data/features/step_definitions/schema_validation_steps.rb +27 -1
  12. data/features/step_definitions/sources_steps.rb +1 -1
  13. data/features/step_definitions/validation_errors_steps.rb +48 -1
  14. data/features/step_definitions/validation_info_steps.rb +5 -1
  15. data/features/step_definitions/validation_warnings_steps.rb +15 -1
  16. data/features/support/load_tests.rb +114 -0
  17. data/features/validation_errors.feature +12 -24
  18. data/features/validation_warnings.feature +18 -6
  19. data/lib/csvlint.rb +10 -0
  20. data/lib/csvlint/csvw/column.rb +359 -0
  21. data/lib/csvlint/csvw/date_format.rb +182 -0
  22. data/lib/csvlint/csvw/metadata_error.rb +13 -0
  23. data/lib/csvlint/csvw/number_format.rb +211 -0
  24. data/lib/csvlint/csvw/property_checker.rb +761 -0
  25. data/lib/csvlint/csvw/table.rb +204 -0
  26. data/lib/csvlint/csvw/table_group.rb +165 -0
  27. data/lib/csvlint/schema.rb +40 -23
  28. data/lib/csvlint/validate.rb +142 -19
  29. data/lib/csvlint/version.rb +1 -1
  30. data/spec/csvw/column_spec.rb +112 -0
  31. data/spec/csvw/date_format_spec.rb +49 -0
  32. data/spec/csvw/number_format_spec.rb +403 -0
  33. data/spec/csvw/table_group_spec.rb +143 -0
  34. data/spec/csvw/table_spec.rb +90 -0
  35. data/spec/schema_spec.rb +27 -1
  36. data/spec/spec_helper.rb +0 -1
  37. data/spec/validator_spec.rb +16 -10
  38. metadata +53 -2
@@ -0,0 +1,204 @@
1
module Csvlint
  module Csvw
    # One table described by a CSVW metadata document.
    #
    # Holds the table's column descriptions plus the bookkeeping needed to
    # check primary keys and foreign keys while rows are streamed through
    # #validate_row. Errors and warnings are accumulated through the
    # Csvlint::ErrorCollector mixin (`reset`, `build_errors`, `valid?`,
    # `errors`, `warnings`); that module is defined elsewhere, so its exact
    # semantics should be confirmed there.
    class Table

      include Csvlint::ErrorCollector

      attr_reader :columns, :dialect, :table_direction, :foreign_keys, :foreign_key_references, :id, :notes, :primary_key, :schema, :suppress_output, :transformations, :url, :annotations

      # @param url the table's URL
      # @param columns [Array] column objects; their existing errors and
      #   warnings are copied into this table
      # @param foreign_keys [Array<Hash>] foreign key descriptions; each is
      #   expected to carry "referencing_columns" and "referenced_table"
      #   entries by the time rows are validated
      # @param primary_key [Array, nil] the columns that make up the primary key
      # @param warnings [Array] warnings raised while the metadata was parsed
      def initialize(url, columns: [], dialect: {}, table_direction: :auto, foreign_keys: [], id: nil, notes: [], primary_key: nil, schema: nil, suppress_output: false, transformations: [], annotations: [], warnings: [])
        @url = url
        @columns = columns
        @dialect = dialect
        @table_direction = table_direction
        @foreign_keys = foreign_keys
        @foreign_key_values = {}
        @foreign_key_references = []
        @foreign_key_reference_values = {}
        @id = id
        @notes = notes
        @primary_key = primary_key
        @primary_key_values = {}
        @schema = schema
        @suppress_output = suppress_output
        @transformations = transformations
        @annotations = annotations
        reset
        @warnings += warnings
        @errors += columns.map{|c| c.errors}.flatten
        @warnings += columns.map{|c| c.warnings}.flatten
      end

      # Checks each header cell against the column description at the same
      # position. A header cell with no matching column description is
      # reported as :malformed_header. Nothing is checked when the table has
      # no column descriptions.
      #
      # @param headers [Array] the header cells, in order
      # @return the result of `valid?`
      def validate_header(headers)
        reset
        headers.each_with_index do |header,i|
          if columns[i]
            columns[i].validate_header(header)
            @errors += columns[i].errors
            @warnings += columns[i].warnings
          else
            build_errors(:malformed_header, :schema, 1, nil, header, nil)
          end
        end unless columns.empty?
        return valid?
      end

      # Validates one row of values and records its key values.
      #
      # Besides per-column validation this records:
      # * the primary key of the row (reporting :duplicate_key on a repeat),
      # * the values other tables' foreign keys may point at, and
      # * the values this row's own foreign keys point to.
      # Foreign keys themselves are only checked later, in
      # #validate_foreign_keys, because the other tables may not have been
      # read yet.
      #
      # @param values [Array] the cell values of the row
      # @param row the row number, used in error reports
      # @return the result of `valid?`
      def validate_row(values, row=nil)
        reset
        values.each_with_index do |value,i|
          column = columns[i]
          if column
            column.validate(value, row)
            @errors += column.errors
            @warnings += column.warnings
          else
            build_errors(:too_many_values, :schema, row, nil, value, nil)
          end
        end unless columns.empty?
        unless @primary_key.nil?
          # column.number is 1-based (see from_json), values is 0-based
          key = @primary_key.map { |column| column.parse(values[column.number - 1], row) }
          build_errors(:duplicate_key, :schema, row, nil, key.join(","), @primary_key_values[key]) if @primary_key_values.include?(key)
          @primary_key_values[key] = row
        end
        # build a record of the unique values that are referenced by foreign keys from other tables
        # so that later we can check whether those foreign keys reference these values
        @foreign_key_references.each do |foreign_key|
          referenced_columns = foreign_key["referenced_columns"]
          key = referenced_columns.map{ |column| column.parse(values[column.number - 1], row) }
          known_values = @foreign_key_reference_values[foreign_key] = @foreign_key_reference_values[foreign_key] || {}
          known_values[key] = known_values[key] || []
          known_values[key] << row
        end
        # build a record of the references from this row to other tables
        # we can't check yet whether these exist in the other tables because
        # we might not have parsed those other tables
        @foreign_keys.each do |foreign_key|
          referencing_columns = foreign_key["referencing_columns"]
          key = referencing_columns.map{ |column| column.parse(values[column.number - 1], row) }
          known_values = @foreign_key_values[foreign_key] = @foreign_key_values[foreign_key] || []
          known_values << key unless known_values.include?(key)
        end
        return valid?
      end

      # Checks every foreign key of this table against the table it
      # references, using the values recorded by #validate_row.
      #
      # The referenced table's #validate_foreign_key_references calls `reset`
      # on itself. When a foreign key references this same table that would
      # discard what earlier foreign keys had reported, so results are
      # gathered in local arrays and only stored on this table at the end.
      #
      # @return the result of `valid?`
      def validate_foreign_keys
        collected_errors = []
        collected_warnings = []
        @foreign_keys.each do |foreign_key|
          # no rows validated yet means there is nothing to look up
          local = @foreign_key_values[foreign_key] || []
          remote_table = foreign_key["referenced_table"]
          remote_table.validate_foreign_key_references(foreign_key, @url, local)
          collected_errors += remote_table.errors
          collected_warnings += remote_table.warnings
        end
        reset
        @errors += collected_errors
        @warnings += collected_warnings
        return valid?
      end

      # Checks the key values another table holds for +foreign_key+ against
      # the values recorded for this table.
      #
      # Reports :unmatched_foreign_key_reference for a key with no row in this
      # table and :multiple_matched_rows for a key matching more than one row.
      #
      # @param foreign_key [Hash] the foreign key description
      # @param remote_url the URL of the referencing table
      # @param remote [Array, nil] the key values used by the referencing table
      # @return the result of `valid?`
      def validate_foreign_key_references(foreign_key, remote_url, remote)
        reset
        # either side may have recorded nothing if no rows were validated
        local = @foreign_key_reference_values[foreign_key] || {}
        context = { "from" => { "url" => remote_url.to_s.split("/")[-1], "columns" => foreign_key["columnReference"] }, "to" => { "url" => @url.to_s.split("/")[-1], "columns" => foreign_key["reference"]["columnReference"] }}
        (remote || []).each do |r|
          if local[r]
            build_errors(:multiple_matched_rows, :schema, nil, nil, r, context) if local[r].length > 1
          else
            build_errors(:unmatched_foreign_key_reference, :schema, nil, nil, r, context)
          end
        end
        return valid?
      end

      # Builds a Table from the table description object of a metadata
      # document.
      #
      # Note that resolved column objects are stored back into each foreign
      # key hash under "referencing_columns", so +table_desc+'s schema is
      # modified.
      #
      # @param table_desc [Hash] the table description
      # @param base_url base URL handed to the property checker
      # @param lang [String] default language handed to the property checker
      # @param inherited_properties [Hash] properties inherited from the
      #   enclosing description; cloned before being extended
      # @return [Table]
      # @raise [Csvlint::Csvw::MetadataError] when the description is
      #   structurally invalid (wrong @type, columns not an array, duplicate
      #   column names, references to unknown columns, ...)
      def self.from_json(table_desc, base_url=nil, lang="und", inherited_properties={})
        annotations = {}
        warnings = []
        table_properties = {}
        columns = []
        notes = []
        inherited_properties = inherited_properties.clone
        # JSON-path style prefix shared by every MetadataError raised below
        table_path = "$.tables[?(@.url = '#{table_desc["url"]}')]"

        table_desc.each do |property,value|
          if property =="@type"
            raise Csvlint::Csvw::MetadataError.new("#{table_path}.@type"), "@type of table is not 'Table'" unless value == 'Table'
          elsif property == "notes"
            notes = value
          else
            v, warning, type = Csvw::PropertyChecker.check_property(property, value, base_url, lang)
            warnings += Array(warning).map{ |w| Csvlint::ErrorMessage.new(w, :metadata, nil, nil, "#{property}: #{value}", nil) } unless warning.nil? || warning.empty?
            if type == :annotation
              annotations[property] = v
            elsif type == :table || type == :common
              table_properties[property] = v
            elsif type == :column
              warnings << Csvlint::ErrorMessage.new(:invalid_property, :metadata, nil, nil, "#{property}", nil)
            else
              inherited_properties[property] = v
            end
          end
        end

        table_schema = table_properties["tableSchema"] || inherited_properties["tableSchema"]
        # Named columns only. Looking columns up by name (instead of reusing
        # a position in a list of names as an index into `columns`) keeps
        # references correct when some columns have no name.
        columns_by_name = {}
        foreign_keys = []
        primary_key = nil
        primary_key_columns = []
        primary_key_valid = true
        if table_schema
          raise Csvlint::Csvw::MetadataError.new("#{table_path}.tableSchema.columns"), "schema columns is not an array" unless table_schema["columns"].instance_of? Array
          virtual_columns = false
          table_schema["columns"].each_with_index do |column_desc,i|
            if column_desc.instance_of? Hash
              column = Csvlint::Csvw::Column.from_json(i+1, column_desc, base_url, lang, inherited_properties)
              raise Csvlint::Csvw::MetadataError.new("#{table_path}.tableSchema.columns[#{i}].virtual"), "virtual columns before non-virtual column #{column.name || i}" if virtual_columns && !column.virtual
              virtual_columns = virtual_columns || column.virtual
              raise Csvlint::Csvw::MetadataError.new("#{table_path}.tableSchema.columns"), "multiple columns named #{column.name}" if columns_by_name.key? column.name
              columns_by_name[column.name] = column unless column.name.nil?
              columns << column
            else
              warnings << Csvlint::ErrorMessage.new(:invalid_column_description, :metadata, nil, nil, "#{column_desc}", nil)
            end
          end

          primary_key = table_schema["primaryKey"]
          primary_key.each do |reference|
            column = columns_by_name[reference]
            if column
              primary_key_columns << column
            else
              warnings << Csvlint::ErrorMessage.new(:invalid_column_reference, :metadata, nil, nil, "primaryKey: #{reference}", nil)
              primary_key_valid = false
            end
          end if primary_key

          foreign_keys = table_schema["foreignKeys"]
          foreign_keys.each_with_index do |foreign_key, i|
            foreign_key_columns = []
            foreign_key["columnReference"].each do |reference|
              # keep `i` as the foreign key's index so the error path is right
              column = columns_by_name[reference]
              raise Csvlint::Csvw::MetadataError.new("#{table_path}.tableSchema.foreignKeys[#{i}].columnReference"), "foreignKey references non-existant column" unless column
              foreign_key_columns << column
            end
            foreign_key["referencing_columns"] = foreign_key_columns
          end if foreign_keys

          row_titles = table_schema["rowTitles"]
          row_titles.each_with_index do |row_title,i|
            raise Csvlint::Csvw::MetadataError.new("#{table_path}.tableSchema.rowTitles[#{i}]"), "rowTitles references non-existant column" unless columns_by_name.key? row_title
          end if row_titles

        end

        return self.new(table_properties["url"],
          id: table_properties["@id"],
          columns: columns,
          dialect: table_properties["dialect"],
          foreign_keys: foreign_keys || [],
          notes: notes,
          # a primary key is only enforced when every reference resolved
          primary_key: primary_key_valid && !primary_key_columns.empty? ? primary_key_columns : nil,
          schema: table_schema ? table_schema["@id"] : nil,
          annotations: annotations,
          warnings: warnings
        )
      end

    end
  end
end
@@ -0,0 +1,165 @@
1
module Csvlint
  module Csvw
    # A group of tables described by one CSVW metadata document.
    #
    # Validation calls are routed to the Table registered under the URL of
    # the file being validated. Errors and warnings are accumulated through
    # the Csvlint::ErrorCollector mixin (`reset`, `valid?`, `errors`,
    # `warnings`), which is defined elsewhere.
    class TableGroup

      include Csvlint::ErrorCollector

      attr_reader :url, :id, :tables, :notes, :annotations

      # @param url the base URL of the metadata document
      # @param tables [Hash] Table objects keyed by table URL; their existing
      #   errors and warnings are copied into the group
      # @param warnings [Array] warnings raised while the metadata was parsed
      def initialize(url, id: nil, tables: {}, notes: [], annotations: {}, warnings: [])
        @url = url
        @id = id
        @tables = tables
        @notes = notes
        @annotations = annotations
        # tracks which tables have had at least one row validated
        @validated_tables = {}
        @tables.each { |t,v| @validated_tables[t] = false }
        reset
        @warnings += warnings
        @errors += @tables.map{|url,table| table.errors}.flatten
        @warnings += @tables.map{|url,table| table.warnings}.flatten
      end

      # Validates a header row against the table registered for +table_url+.
      #
      # @param header [Array] the header cells
      # @param table_url [String, File] URL of the table; a File is converted
      #   to a "file:" URL built from its absolute path
      # @return the result of `valid?`
      def validate_header(header, table_url)
        reset
        table_url = "file:#{File.absolute_path(table_url)}" if table_url.instance_of? File
        table = tables[table_url]
        table.validate_header(header)
        @errors += table.errors
        @warnings += table.warnings
        return valid?
      end

      # Validates one row against the table registered for +table_url+ and
      # marks that table as validated.
      #
      # @param values [Array] the cell values of the row
      # @param row the row number, used in error reports
      # @param all_errors unused here; NOTE(review): presumably kept so the
      #   signature lines up with Schema#validate_row - confirm
      # @param table_url [String, File] URL of the table, as in #validate_header
      # @return the result of `valid?`
      def validate_row(values, row=nil, all_errors=[], table_url)
        reset
        table_url = "file:#{File.absolute_path(table_url)}" if table_url.instance_of? File
        @validated_tables[table_url] = true
        table = tables[table_url]
        table.validate_row(values, row)
        @errors += table.errors
        @warnings += table.warnings
        return valid?
      end

      # Checks the foreign keys of every table, but only once every table in
      # the group has had a row validated; until then nothing is checked.
      #
      # @return the result of `valid?`
      def validate_foreign_keys
        reset
        unless @validated_tables.has_value?(false)
          @tables.each do |table_url,table|
            table.validate_foreign_keys
            @errors += table.errors
            @warnings += table.warnings
          end
        end
        return valid?
      end

      # Builds a TableGroup from a parsed metadata document.
      #
      # A document that describes a single table (it has "url" but no
      # "tables") is wrapped into a one-table group. Note that +json+ is
      # modified: "@context" is removed and each table description's "url"
      # is replaced by its resolved form.
      #
      # @param url the location of the metadata document, used as base URL
      # @param json [Hash] the parsed metadata document
      # @return [TableGroup]
      # @raise [Csvlint::Csvw::MetadataError] when the document is
      #   structurally invalid
      def self.from_json(url, json)
        warnings = []
        tables = {}
        annotations = {}
        inherited_properties = {}
        common_properties = {}
        base_url = URI(url.to_s.strip)
        lang = "und"

        context = json["@context"]
        if context.instance_of?(Array) && context[1]
          context[1].each do |property,value|
            v, warning, type = Csvw::PropertyChecker.check_property(property, value, base_url, lang)
            if warning.nil? || warning.empty?
              if type == :context
                base_url = v if property == "@base"
                lang = v if property == "@language"
              else
                raise Csvlint::Csvw::MetadataError.new("$.@context"), "@context contains properties other than @base or @language (#{property})"
              end
            else
              raise Csvlint::Csvw::MetadataError.new("$.@context"), "@context contains properties other than @base or @language (#{property})" unless ["@base", "@language"].include?(property)
              warnings += Array(warning).map{ |w| Csvlint::ErrorMessage.new(w, :metadata, nil, nil, "@context: #{property}: #{value}", nil) }
            end
          end
        end
        json.delete("@context")

        # a lone table description is treated as a group of one table
        if json["url"]
          json = { "tables" => [ json ] }
        end unless json["tables"]

        json.each do |property,value|
          unless VALID_PROPERTIES.include? property
            v, warning, type = Csvw::PropertyChecker.check_property(property, value, base_url, lang)
            warnings += Array(warning).map{ |w| Csvlint::ErrorMessage.new(w, :metadata, nil, nil, "#{property}: #{value}", nil) } unless warning.nil? || warning.empty?
            if type == :annotation
              annotations[property] = v
            elsif type == :common
              common_properties[property] = v
            elsif type == :column
              warnings << Csvlint::ErrorMessage.new(:invalid_property, :metadata, nil, nil, "#{property}", nil)
            else
              inherited_properties[property] = v
            end
          end
        end

        id = common_properties["@id"]

        raise Csvlint::Csvw::MetadataError.new("$.@type"), "@type of table group is not 'TableGroup'" if json["@type"] && json["@type"] != 'TableGroup'

        raise Csvlint::Csvw::MetadataError.new("$"), "no tables property" unless json["tables"]
        # check the type before calling Array methods, so a scalar value is
        # reported as a metadata error instead of raising NoMethodError
        raise Csvlint::Csvw::MetadataError.new("$.tables"), "tables property is not an array" unless json["tables"].instance_of? Array
        raise Csvlint::Csvw::MetadataError.new("$.tables"), "empty tables property" if json["tables"].empty?

        json["tables"].each do |table_desc|
          if table_desc.instance_of? Hash
            table_url = table_desc["url"]
            unless table_url.instance_of? String
              warnings << Csvlint::ErrorMessage.new(:invalid_url, :metadata, nil, nil, "url: #{table_url}", nil)
              table_url = ""
            end
            table_url = URI.join(base_url, table_url).to_s
            table_desc["url"] = table_url
            table = Csvlint::Csvw::Table.from_json(table_desc, base_url, lang, inherited_properties)
            tables[table_url] = table
          else
            warnings << Csvlint::ErrorMessage.new(:invalid_table_description, :metadata, nil, nil, "#{table_desc}", nil)
          end
        end

        # second pass: now that every table exists, link each foreign key to
        # the table and columns it references
        tables.each do |table_url, table|
          table.foreign_keys.each_with_index do |foreign_key,i|
            reference_path = "$.tables[?(@.url = '#{table_url}')].tableSchema.foreign_keys[#{i}].reference"
            reference = foreign_key["reference"]
            if reference["resource"]
              target = URI.join(base_url, reference["resource"]).to_s
              referenced_table = tables[target]
              raise Csvlint::Csvw::MetadataError.new("#{reference_path}.resource"), "foreign key references table that does not exist (#{target})" if referenced_table.nil?
            else
              target = URI.join(base_url, reference["schemaReference"]).to_s
              # the first table using the referenced schema is taken
              referenced_table = tables.values.select{ |candidate| candidate.schema == target }[0]
              raise Csvlint::Csvw::MetadataError.new("#{reference_path}.schemaReference"), "foreign key references schema that is not used (#{target})" if referenced_table.nil?
            end
            foreign_key["referenced_table"] = referenced_table
            table_columns = {}
            referenced_table.columns.each do |column|
              table_columns[column.name] = column if column.name
            end
            referenced_columns = []
            Array(reference["columnReference"]).each do |column_reference|
              column = table_columns[column_reference]
              # `target` is the resource or schema URL, whichever was given
              raise Csvlint::Csvw::MetadataError.new("#{reference_path}.columnReference"), "column named #{column_reference} does not exist in #{target}" if column.nil?
              referenced_columns << column
            end
            foreign_key["referenced_columns"] = referenced_columns
            referenced_table.foreign_key_references << foreign_key
          end
        end

        return self.new(base_url, id: id, tables: tables, notes: json["notes"] || [], annotations: annotations, warnings: warnings)
      end

      private
        # Top-level properties handled explicitly by from_json rather than
        # passed to the property checker. Note that `private` has no effect
        # on constants, so this remains publicly readable.
        VALID_PROPERTIES = ['tables', 'notes', '@type'].freeze

    end
  end
end
@@ -14,7 +14,45 @@ module Csvlint
14
14
  reset
15
15
  end
16
16
 
17
- def validate_header(header)
17
class << self

  # Builds a Schema from a parsed JSON Table Schema document.
  #
  # @param uri the location the schema was loaded from
  # @param json [Hash] the parsed document; a missing "fields" entry gives
  #   a schema without fields
  # @return [Schema]
  def from_json_table(uri, json)
    fields = (json["fields"] || []).map do |field_desc|
      Csvlint::Field.new( field_desc["name"] , field_desc["constraints"],
        field_desc["title"], field_desc["description"] )
    end
    return Schema.new( uri , fields, json["title"], json["description"] )
  end

  # Builds a table group from a parsed CSVW metadata document.
  #
  # @param uri the location the metadata was loaded from
  # @param json [Hash] the parsed document
  # @return [Csvlint::Csvw::TableGroup]
  def from_csvw_metadata(uri, json)
    return Csvlint::Csvw::TableGroup.from_json(uri, json)
  end

  # Loads a schema from +uri+. A document with an "@context" entry is
  # treated as CSVW metadata, anything else as JSON Table Schema.
  #
  # Metadata errors and HTTP errors are passed on to the caller. Any other
  # failure (unreadable file, invalid JSON, ...) is written to STDERR and
  # answered with a placeholder schema titled "malformed".
  #
  # @param uri a local path or an http(s) URL
  # @return [Schema, Csvlint::Csvw::TableGroup]
  # @raise [Csvlint::Csvw::MetadataError, OpenURI::HTTPError]
  def load_from_json(uri)
    # NOTE(review): Kernel#open runs a command when given a string starting
    # with "|"; make sure +uri+ never comes from untrusted input.
    # The block form closes the handle as soon as the content is read.
    json = JSON.parse( open(uri) { |io| io.read } )
    if json["@context"]
      # \A anchors at the start of the string; ^ would also match after a newline
      uri = "file:#{File.expand_path(uri)}" unless uri.to_s =~ /\Ahttp(s)?/
      return Schema.from_csvw_metadata(uri,json)
    else
      return Schema.from_json_table(uri,json)
    end
  rescue Csvlint::Csvw::MetadataError, OpenURI::HTTPError
    raise
  rescue => e
    STDERR.puts e.class
    STDERR.puts e.message
    STDERR.puts e.backtrace
    return Schema.new(nil, [], "malformed", "malformed")
  end

end
54
+
55
+ def validate_header(header, source_url=nil)
18
56
  reset
19
57
 
20
58
  found_header = header.to_csv(:row_sep => '')
@@ -25,7 +63,7 @@ module Csvlint
25
63
  return valid?
26
64
  end
27
65
 
28
- def validate_row(values, row=nil, all_errors=[])
66
+ def validate_row(values, row=nil, all_errors=[], source_url=nil)
29
67
  reset
30
68
  if values.length < fields.length
31
69
  fields[values.size..-1].each_with_index do |field, i|
@@ -48,26 +86,5 @@ module Csvlint
48
86
  return valid?
49
87
  end
50
88
 
51
- def Schema.from_json_table(uri, json)
52
- fields = []
53
- json["fields"].each do |field_desc|
54
- fields << Csvlint::Field.new( field_desc["name"] , field_desc["constraints"],
55
- field_desc["title"], field_desc["description"] )
56
- end if json["fields"]
57
- return Schema.new( uri , fields, json["title"], json["description"] )
58
- end
59
-
60
- # Difference in functionality between from_json_table and load_from_json_table
61
- # needs to be specified
62
-
63
- def Schema.load_from_json_table(uri)
64
- begin
65
- json = JSON.parse( open(uri).read )
66
- return Schema.from_json_table(uri,json)
67
- rescue
68
- return Schema.new(nil, [], "malformed", "malformed")
69
- end
70
- end
71
-
72
89
  end
73
90
  end