csvlint 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +1 -0
  3. data/.gitignore +22 -0
  4. data/.travis.yml +10 -0
  5. data/Gemfile +7 -0
  6. data/LICENSE.md +22 -0
  7. data/README.md +214 -0
  8. data/Rakefile +17 -0
  9. data/bin/create_schema +32 -0
  10. data/bin/csvlint +52 -0
  11. data/csvlint.gemspec +39 -0
  12. data/features/check_format.feature +46 -0
  13. data/features/csv_options.feature +35 -0
  14. data/features/fixtures/cr-line-endings.csv +1 -0
  15. data/features/fixtures/crlf-line-endings.csv +3 -0
  16. data/features/fixtures/inconsistent-line-endings.csv +2 -0
  17. data/features/fixtures/invalid-byte-sequence.csv +24 -0
  18. data/features/fixtures/lf-line-endings.csv +3 -0
  19. data/features/fixtures/spreadsheet.xls +0 -0
  20. data/features/fixtures/title-row.csv +4 -0
  21. data/features/fixtures/valid.csv +3 -0
  22. data/features/fixtures/windows-line-endings.csv +2 -0
  23. data/features/information.feature +22 -0
  24. data/features/parse_csv.feature +90 -0
  25. data/features/schema_validation.feature +63 -0
  26. data/features/sources.feature +18 -0
  27. data/features/step_definitions/csv_options_steps.rb +19 -0
  28. data/features/step_definitions/information_steps.rb +13 -0
  29. data/features/step_definitions/parse_csv_steps.rb +30 -0
  30. data/features/step_definitions/schema_validation_steps.rb +7 -0
  31. data/features/step_definitions/sources_steps.rb +7 -0
  32. data/features/step_definitions/validation_errors_steps.rb +43 -0
  33. data/features/step_definitions/validation_info_steps.rb +18 -0
  34. data/features/step_definitions/validation_warnings_steps.rb +46 -0
  35. data/features/support/env.rb +30 -0
  36. data/features/support/webmock.rb +1 -0
  37. data/features/validation_errors.feature +151 -0
  38. data/features/validation_info.feature +24 -0
  39. data/features/validation_warnings.feature +74 -0
  40. data/lib/csvlint.rb +13 -0
  41. data/lib/csvlint/error_collector.rb +43 -0
  42. data/lib/csvlint/error_message.rb +15 -0
  43. data/lib/csvlint/field.rb +102 -0
  44. data/lib/csvlint/schema.rb +69 -0
  45. data/lib/csvlint/types.rb +113 -0
  46. data/lib/csvlint/validate.rb +253 -0
  47. data/lib/csvlint/version.rb +3 -0
  48. data/lib/csvlint/wrapped_io.rb +39 -0
  49. data/spec/field_spec.rb +247 -0
  50. data/spec/schema_spec.rb +149 -0
  51. data/spec/spec_helper.rb +20 -0
  52. data/spec/validator_spec.rb +279 -0
  53. metadata +367 -0
@@ -0,0 +1,69 @@
1
+ require "set"
2
+
3
module Csvlint

  # A table schema: an ordered list of field definitions that a CSV header
  # row and CSV data rows can be checked against.
  #
  # Problems are recorded through the Csvlint::ErrorCollector mixin; this
  # class relies on it for `reset`, `build_warnings`, `valid?` and the
  # `@errors` / `@warnings` collections.
  class Schema

    include Csvlint::ErrorCollector

    attr_reader :uri, :fields, :title, :description

    # uri         - identifier the schema was loaded from (stored as given).
    # fields      - ordered list of field objects; each must respond to
    #               `name`, `validate_column`, `errors` and `warnings`.
    # title       - optional schema title.
    # description - optional schema description.
    def initialize(uri, fields=[], title=nil, description=nil)
      @uri = uri
      @fields = fields
      @title = title
      @description = description
      reset
    end

    # Compares each header cell with the name of the schema field in the same
    # position and records a :header_name warning for every mismatch.
    #
    # A header cell that has no schema field at its position is reported as a
    # mismatch too (previously this raised NoMethodError on nil).
    #
    # Returns the result of `valid?`.
    def validate_header(header)
      reset
      header.each_with_index do |name, i|
        field = fields[i]
        build_warnings(:header_name, :schema, nil, i+1, name) if field.nil? || field.name != name
      end
      valid?
    end

    # Validates one data row against the schema.
    #
    # values - the cells of the row, in column order.
    # row    - the row number passed through to the recorded messages.
    #
    # Records a :missing_column warning for each schema field without a cell,
    # an :extra_column warning for each cell without a schema field, then
    # runs every field's own column validation and merges its errors and
    # warnings into this schema's collections.
    #
    # Returns the result of `valid?`.
    def validate_row(values, row=nil)
      reset

      if values.length < fields.length
        (values.size + 1).upto(fields.size) do |column|
          build_warnings(:missing_column, :schema, row, column)
        end
      end

      if values.length > fields.length
        (fields.size + 1).upto(values.size) do |column|
          build_warnings(:extra_column, :schema, row, column)
        end
      end

      fields.each_with_index do |field, i|
        # A missing cell is validated as an empty string.
        value = values[i] || ""
        field.validate_column(value, row, i+1)
        @errors += field.errors
        @warnings += field.warnings
      end

      valid?
    end

    # Builds a Schema from an already-parsed JSON table description.
    #
    # json - a Hash with optional "fields", "title" and "description" keys;
    #        each entry of "fields" supplies "name", "constraints", "title"
    #        and "description" for one Csvlint::Field.
    def self.from_json_table(uri, json)
      fields = (json["fields"] || []).map do |field_desc|
        Csvlint::Field.new( field_desc["name"], field_desc["constraints"],
          field_desc["title"], field_desc["description"] )
      end
      Schema.new( uri, fields, json["title"], json["description"] )
    end

    # Opens `uri`, parses its content as JSON and builds a Schema from it.
    #
    # Returns nil when anything goes wrong (unreadable source, invalid JSON,
    # unexpected structure); this best-effort contract is kept from the
    # original implementation.
    def self.load_from_json_table(uri)
      # Block form so the opened handle is closed once it has been read.
      json = JSON.parse( open(uri) { |io| io.read } )
      Schema.from_json_table(uri, json)
    rescue
      nil
    end

  end
end
@@ -0,0 +1,113 @@
1
+ require 'set'
2
+ require 'date'
3
+ require 'active_support/core_ext/date/conversions'
4
+ require 'active_support/core_ext/time/conversions'
5
+
6
module Csvlint

  # Value checkers used to recognise and validate cell contents.
  #
  # Every checker is a lambda taking (value, constraints). A checker signals
  # "not this type" by raising; the ones defined here raise ArgumentError
  # themselves, and the parsing calls they use may raise their own errors.
  module Types

    # Loose formats keyed by a short name. Insertion order is significant to
    # code that walks this hash, so it is kept as string, numeric, uri.
    SIMPLE_FORMATS = {
      'string' => lambda { |value, constraints| value },
      'numeric' => lambda do |value, constraints|
        begin
          Integer(value)
        rescue ArgumentError
          # Not an integer literal: fall back to a float parse.
          Float(value)
        end
      end,
      'uri' => lambda do |value, constraints|
        parsed = URI.parse(value)
        # Only http and https URIs are accepted.
        raise ArgumentError unless [URI::HTTP, URI::HTTPS].any? { |scheme| parsed.kind_of?(scheme) }
        parsed
      end
    }

    # Parses `value` with the named format of `klass::DATE_FORMATS` and raises
    # ArgumentError unless formatting the result with the same named format
    # reproduces `value` exactly. Returns nil on success.
    def self.date_format(klass = DateTime, value, type)
      parsed = klass.strptime(value, klass::DATE_FORMATS[type])
      round_trip = parsed.to_formatted_s(type)
      raise ArgumentError if round_trip != value
    end

    # Hook run when the module is mixed in: registers two extra Time formats
    # and adds one SIMPLE_FORMATS entry per named Date format ("date_<name>")
    # and per named Time format ("dateTime_<name>").
    def self.included(base)
      Time::DATE_FORMATS[:iso8601] = "%Y-%m-%dT%H:%M:%SZ"
      Time::DATE_FORMATS[:hms] = "%H:%M:%S"

      # Date entries are registered before Time entries.
      [["date", Date], ["dateTime", Time]].each do |prefix, klass|
        klass::DATE_FORMATS.each_key do |format_name|
          SIMPLE_FORMATS["#{prefix}_#{format_name}"] = lambda do |value, constraints|
            date_format(klass, value, format_name)
          end
        end
      end
    end

    # Factory: integer parse with no further restriction.
    whole_number = lambda { lambda { |value, constraints| Integer(value) } }

    # Factory: float parse with no further restriction.
    decimal = lambda { lambda { |value, constraints| Float(value) } }

    # Factory: integer parse that must also satisfy `i <operator> 0`.
    integer_where = lambda do |operator|
      lambda do |value, constraints|
        i = Integer(value)
        raise ArgumentError unless i.send(operator, 0)
        i
      end
    end

    # Factory: strptime with constraints["datePattern"] (or the given default)
    # that must format back to exactly the input text.
    round_trip_date = lambda do |klass, default_pattern|
      lambda do |value, constraints|
        date_pattern = constraints["datePattern"] || default_pattern
        d = klass.strptime(value, date_pattern)
        raise ArgumentError unless d.strftime(date_pattern) == value
        d
      end
    end

    # Strict checkers keyed by XML Schema datatype URI.
    TYPE_VALIDATIONS = {
      'http://www.w3.org/2001/XMLSchema#string' => SIMPLE_FORMATS['string'],
      'http://www.w3.org/2001/XMLSchema#int' => whole_number.call,
      'http://www.w3.org/2001/XMLSchema#integer' => whole_number.call,
      'http://www.w3.org/2001/XMLSchema#float' => decimal.call,
      'http://www.w3.org/2001/XMLSchema#double' => decimal.call,
      'http://www.w3.org/2001/XMLSchema#anyURI' => SIMPLE_FORMATS['uri'],
      'http://www.w3.org/2001/XMLSchema#boolean' => lambda do |value, constraints|
        case value
        when 'true', '1' then true
        when 'false', '0' then false
        else raise ArgumentError
        end
      end,
      'http://www.w3.org/2001/XMLSchema#nonPositiveInteger' => integer_where.call(:<=),
      'http://www.w3.org/2001/XMLSchema#negativeInteger' => integer_where.call(:<),
      'http://www.w3.org/2001/XMLSchema#nonNegativeInteger' => integer_where.call(:>=),
      'http://www.w3.org/2001/XMLSchema#positiveInteger' => integer_where.call(:>),
      'http://www.w3.org/2001/XMLSchema#dateTime' => round_trip_date.call(DateTime, "%Y-%m-%dT%H:%M:%SZ"),
      'http://www.w3.org/2001/XMLSchema#date' => round_trip_date.call(Date, "%Y-%m-%d"),
      'http://www.w3.org/2001/XMLSchema#time' => round_trip_date.call(DateTime, "%H:%M:%S"),
      'http://www.w3.org/2001/XMLSchema#gYear' => round_trip_date.call(Date, "%Y"),
      'http://www.w3.org/2001/XMLSchema#gYearMonth' => round_trip_date.call(Date, "%Y-%m")
    }
  end

end
@@ -0,0 +1,253 @@
1
+ require "open_uri_redirections"
2
+
3
+ module Csvlint
4
+
5
+ class Validator
6
+
7
+ include Csvlint::ErrorCollector
8
+ include Csvlint::Types
9
+
10
+ attr_reader :encoding, :content_type, :extension, :headers, :line_breaks, :dialect, :csv_header, :schema, :data
11
+
12
+ ERROR_MATCHERS = {
13
+ "Missing or stray quote" => :stray_quote,
14
+ "Illegal quoting" => :whitespace,
15
+ "Unclosed quoted field" => :unclosed_quote,
16
+ }
17
+
18
# Builds a validator and immediately runs the validation.
#
# source  - an object responding to `gets`, or something `open` accepts.
# dialect - optional Hash of CSV dialect settings; its keys override the
#           defaults below.
# schema  - optional schema object rows and headers are checked against.
def initialize(source, dialect = nil, schema = nil)
  @source = source
  @formats = []
  @schema = schema

  # A header row is only "assumed" while no dialect was given.
  @supplied_dialect = !dialect.nil?
  @assumed_header = !dialect

  defaults = {
    "header" => true,
    "delimiter" => ",",
    "skipInitialSpace" => true,
    "lineTerminator" => :auto,
    "quoteChar" => '"'
  }
  @dialect = defaults.merge(dialect || {})

  @csv_header = @dialect["header"]
  @csv_options = dialect_to_csv_options(@dialect)
  @extension = parse_extension(source)

  reset
  validate
end
45
+
46
# Runs the full validation pass over @source.
#
# Uses @source directly when it responds to `gets`, otherwise opens it
# (following redirects). Checks the metadata, parses the CSV, then adds the
# structural warnings that need the whole file: a first row narrower than
# the average row (:title_row), a single-column result (:check_options) and
# per-column consistency.
#
# An HTTP error or missing file is recorded as a :not_found error. The IO is
# closed afterwards whenever it responds to `close`.
def validate
  io = @source.respond_to?(:gets) ? @source : open(@source, :allow_redirections=>:all)
  validate_metadata(io)
  parse_csv(io)

  total = @col_counts.inject(:+)
  unless total.nil?
    # Integer mean of the non-blank cell counts collected per row.
    mean = total / @col_counts.count
    build_warnings(:title_row, :structure) if @col_counts.first < mean
  end

  build_warnings(:check_options, :structure) if @expected_columns == 1
  check_consistency
rescue OpenURI::HTTPError, Errno::ENOENT
  build_errors(:not_found)
ensure
  # `io` is nil here when opening the source itself failed.
  io.close if io && io.respond_to?(:close)
end
64
+
65
# Reads transport metadata from `io` and records context messages.
#
# `charset`, `content_type` and `meta` are read from `io` when available;
# any failure to read one of them leaves the corresponding value nil.
# When metadata headers exist, the Content-Type header may declare
# header=present|absent (which overrides the header setting), and warnings
# or errors are recorded for a missing charset, a non-utf-8 encoding, a
# missing or non-CSV content type, a likely Excel file and an undeclared
# header row. An :assumed_header info message is added while the header row
# is still only assumed.
def validate_metadata(io)
  @encoding     = (io.charset rescue nil)
  @content_type = (io.content_type rescue nil)
  @headers      = (io.meta rescue nil)

  if @headers
    declared = @headers["content-type"]

    if declared =~ /header=(present|absent)/
      # The pattern only matches these two words, so this is a plain boolean.
      @csv_header = (Regexp.last_match(1) == "present")
      @assumed_header = false
    end

    if declared !~ /charset=/
      build_warnings(:no_encoding, :context)
    elsif @encoding != "utf-8"
      build_warnings(:encoding, :context)
    end

    if @content_type == nil
      build_warnings(:no_content_type, :context)
      build_warnings(:excel, :context) if @extension =~ /.xls(x)?/
    end

    is_csv = @content_type && @content_type =~ /text\/csv/
    build_errors(:wrong_content_type, :context) unless is_csv

    header_undeclared = @content_type == nil || @headers["content-type"] !~ /header=(present|absent)/
    if @assumed_header && !@supplied_dialect && header_undeclared
      build_errors(:undeclared_header, :structure)
    end
  end

  build_info_messages(:assumed_header, :structure) if @assumed_header
end
91
+
92
# Parses `io` row by row and records structural problems.
#
# Side effects: fills @data with every parsed row (followed by the final
# nil that marks the end of input), @col_counts with the number of non-blank
# cells per row, @expected_columns with the cell count of the first
# non-header row, and @line_breaks with the row separator the CSV parser
# settled on.
#
# Per row it records :blank_rows, then either the schema's findings or, with
# no schema, :ragged_rows. A CSV::MalformedCSVError is turned into the
# matching structure error and parsing continues with the next row. An
# ArgumentError aborts parsing and is recorded as :invalid_encoding.
def parse_csv(io)
  @expected_columns = 0
  current_line = 0
  @col_counts = []

  @csv_options[:encoding] = @encoding

  begin
    # The wrapper remembers the raw text consumed for the current row so it
    # can be attached to error messages.
    wrapper = WrappedIO.new( io )
    csv = CSV.new( wrapper, @csv_options )
    @data = []
    @line_breaks = csv.row_sep
    if @line_breaks != "\r\n"
      build_info_messages(:nonrfc_line_breaks, :structure)
    end
    row = nil
    loop do
      current_line = current_line + 1
      begin
        row = csv.shift
        @data << row
        wrapper.finished
        break unless row

        if header? && current_line == 1
          row = row.reject {|r| r.blank? }
          validate_header(row)
          @col_counts << row.count
        else
          build_formats(row, current_line)
          @col_counts << row.reject {|r| r.blank? }.count
          @expected_columns = row.count unless @expected_columns != 0

          build_errors(:blank_rows, :structure, current_line, nil, wrapper.line) if row.all? { |c| c.nil? || c.empty? }

          if @schema
            @schema.validate_row(row, current_line)
            @errors += @schema.errors
            @warnings += @schema.warnings
          else
            build_errors(:ragged_rows, :structure, current_line, nil, wrapper.line) if !row.empty? && row.count != @expected_columns
          end
        end
      rescue CSV::MalformedCSVError => e
        wrapper.finished
        type = fetch_error(e)
        # A stray quote on text that never contains the row separator is
        # reported as a line-break problem instead.
        if type == :stray_quote && !wrapper.line.match(csv.row_sep)
          build_errors(:line_breaks, :structure)
        else
          build_errors(type, :structure, current_line, nil, wrapper.line)
        end
      end
    end
  rescue ArgumentError
    wrapper.finished
    # Same argument layout as every other call here: row, column (none),
    # content. The raw line was previously passed in the column position.
    build_errors(:invalid_encoding, :structure, current_line, nil, wrapper.line)
  end
end
155
+
156
# Checks the header row for empty and repeated column names, then lets the
# schema (when present) check it too and merges the schema's findings.
#
# header - the header cells in column order.
#
# Returns the result of `valid?`.
def validate_header(header)
  seen = Set.new

  header.each_with_index do |name, i|
    column = i + 1
    build_warnings(:empty_column_name, :schema, nil, column) if name == ""
    # Set#add? returns nil when the name was already recorded.
    build_warnings(:duplicate_column_name, :schema, nil, column) unless seen.add?(name)
  end

  if @schema
    @schema.validate_header(header)
    @errors += @schema.errors
    @warnings += @schema.warnings
  end

  valid?
end
173
+
174
# Whether the first row is treated as a header row (the current value of
# @csv_header, returned as-is).
def header?
  @csv_header
end
177
+
178
# Maps a parser exception to one of the ERROR_MATCHERS symbols.
#
# The exception message is expected to look like
# "<description> in line <n>." (or "on line"); the description part is
# looked up in ERROR_MATCHERS. Anything else yields :unknown_error.
def fetch_error(error)
  matched = error.message.match(/^([a-z ]+) (i|o)n line ([0-9]+)\.?$/i)
  description = matched ? matched[1] : nil
  ERROR_MATCHERS.fetch(description, :unknown_error)
end
183
+
184
# Translates a dialect Hash into the option Hash handed to the CSV parser.
#
# Reads "delimiter", "lineTerminator" and "quoteChar" from `dialect`; blank
# rows are never skipped.
def dialect_to_csv_options(dialect)
  # NOTE(review): `|| true` makes this flag truthy whatever the dialect
  # says, so the delimiter adjustment below never runs. Left as-is; confirm
  # the intended meaning of "skipInitialSpace" => false before changing it.
  skip_initial_space = dialect["skipInitialSpace"] || true

  col_sep = dialect["delimiter"]
  col_sep = col_sep + " " unless skip_initial_space

  {
    :col_sep => col_sep,
    :row_sep => dialect["lineTerminator"],
    :quote_char => dialect["quoteChar"],
    :skip_blanks => false
  }
end
195
+
196
# Records, for every non-blank cell of `row`, which SIMPLE_FORMATS entry it
# matches, appending the format name to @formats[column_index].
#
# A format matches when its checker runs without raising. Every checker is
# tried, so the last matching entry in SIMPLE_FORMATS order wins; nil is
# recorded when none matches.
#
# row  - the cells of one parsed row.
# line - the row number (accepted for the caller's convenience; not used).
def build_formats(row, line)
  row.each_with_index do |col, i|
    next if col.blank?
    @formats[i] ||= []

    # Local per-cell result: the previous implementation kept this in an
    # instance variable, so a cell with no match silently inherited the
    # format of the cell examined before it.
    matched = nil
    SIMPLE_FORMATS.each do |type, check|
      begin
        check.call(col, {})
        matched = type
      rescue StandardError
        # Not this format; try the next one.
        nil
      end
    end

    @formats[i] << matched
  end
end
213
+
214
# Warns about columns whose values do not share a dominant format.
#
# For each column with recorded formats, computes the share of cells that
# matched each SIMPLE_FORMATS name; when the largest share is below 0.9 an
# :inconsistent_values warning is recorded for that (1-based) column.
def check_consistency
  shares = []

  SIMPLE_FORMATS.keys.each do |type|
    @formats.each_with_index do |seen, i|
      shares[i] ||= {}
      next if seen.nil?
      shares[i][type] = seen.grep(/^#{type}$/).count.to_f / seen.count.to_f
    end
  end

  shares.each_with_index do |by_type, i|
    ratios = by_type.values
    # Columns that never had a non-blank cell have nothing to compare.
    next if ratios.empty?
    build_warnings(:inconsistent_values, :schema, nil, i + 1) if ratios.max < 0.9
  end
end
233
+
234
+ private
235
+
236
# Works out the file extension of the source being validated.
#
# A File yields the extension of its path; any other IO, a StringIO or a
# Tempfile yields ""; everything else is parsed as a URI and the extension
# of its path is returned.
def parse_extension(source)
  case source
  when File
    File.extname(source.path)
  when IO, StringIO, Tempfile
    ""
  else
    File.extname(URI.parse(source).path)
  end
end
251
+
252
+ end
253
+ end
@@ -0,0 +1,3 @@
1
module Csvlint
  # Gem version string. Frozen so the shared constant cannot be mutated in
  # place by accident.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,39 @@
1
module Csvlint
  # Wraps an IO-like object and keeps a copy of the raw text read through
  # `gets(separator)`, so the text consumed for the current record can be
  # reported alongside a problem. Every other call is passed straight to
  # the wrapped object.
  class WrappedIO
    # io - the object to read from; must respond to `gets` and `eof?`.
    def initialize(io)
      @io = io
      @line = ""
      # Set by #finished; when truthy the next separator read starts a
      # fresh buffer.
      @new_line = false
    end

    # Reads from the wrapped object.
    #
    # Only calls with exactly one String argument (the separator) are
    # recorded in the buffer; any other argument shape is delegated
    # untouched. Returns whatever the wrapped `gets` returned.
    def gets(*args)
      return @io.gets(*args) unless args.count == 1 && args[0].is_a?(String)

      # NOTE(review): @new_line is never cleared, so after the first
      # #finished every read starts a fresh buffer. Kept as-is; confirm
      # whether multi-read records are meant to accumulate.
      @line = "" if @new_line
      chunk = @io.gets(args[0])
      @line << chunk unless chunk.nil?
      chunk
    end

    # True when the wrapped object reports end of input.
    def eof?
      @io.eof?
    end

    # Marks the current record as complete.
    def finished
      @new_line = true
    end

    # The raw text currently held in the buffer.
    def line
      @line
    end

    # Forwards any other call, including its block, to the wrapped object.
    # (The block was previously dropped, so iterators such as `each_line`
    # never yielded.)
    #
    # NOTE(review): no `respond_to_missing?` is defined, so `respond_to?`
    # stays false for forwarded methods; left unchanged because callers that
    # probe with `respond_to?` would start behaving differently.
    def method_missing(method, *args, &block)
      @io.send(method, *args, &block)
    end

  end
end