csvlint 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.coveralls.yml +1 -0
- data/.gitignore +22 -0
- data/.travis.yml +10 -0
- data/Gemfile +7 -0
- data/LICENSE.md +22 -0
- data/README.md +214 -0
- data/Rakefile +17 -0
- data/bin/create_schema +32 -0
- data/bin/csvlint +52 -0
- data/csvlint.gemspec +39 -0
- data/features/check_format.feature +46 -0
- data/features/csv_options.feature +35 -0
- data/features/fixtures/cr-line-endings.csv +1 -0
- data/features/fixtures/crlf-line-endings.csv +3 -0
- data/features/fixtures/inconsistent-line-endings.csv +2 -0
- data/features/fixtures/invalid-byte-sequence.csv +24 -0
- data/features/fixtures/lf-line-endings.csv +3 -0
- data/features/fixtures/spreadsheet.xls +0 -0
- data/features/fixtures/title-row.csv +4 -0
- data/features/fixtures/valid.csv +3 -0
- data/features/fixtures/windows-line-endings.csv +2 -0
- data/features/information.feature +22 -0
- data/features/parse_csv.feature +90 -0
- data/features/schema_validation.feature +63 -0
- data/features/sources.feature +18 -0
- data/features/step_definitions/csv_options_steps.rb +19 -0
- data/features/step_definitions/information_steps.rb +13 -0
- data/features/step_definitions/parse_csv_steps.rb +30 -0
- data/features/step_definitions/schema_validation_steps.rb +7 -0
- data/features/step_definitions/sources_steps.rb +7 -0
- data/features/step_definitions/validation_errors_steps.rb +43 -0
- data/features/step_definitions/validation_info_steps.rb +18 -0
- data/features/step_definitions/validation_warnings_steps.rb +46 -0
- data/features/support/env.rb +30 -0
- data/features/support/webmock.rb +1 -0
- data/features/validation_errors.feature +151 -0
- data/features/validation_info.feature +24 -0
- data/features/validation_warnings.feature +74 -0
- data/lib/csvlint.rb +13 -0
- data/lib/csvlint/error_collector.rb +43 -0
- data/lib/csvlint/error_message.rb +15 -0
- data/lib/csvlint/field.rb +102 -0
- data/lib/csvlint/schema.rb +69 -0
- data/lib/csvlint/types.rb +113 -0
- data/lib/csvlint/validate.rb +253 -0
- data/lib/csvlint/version.rb +3 -0
- data/lib/csvlint/wrapped_io.rb +39 -0
- data/spec/field_spec.rb +247 -0
- data/spec/schema_spec.rb +149 -0
- data/spec/spec_helper.rb +20 -0
- data/spec/validator_spec.rb +279 -0
- metadata +367 -0
@@ -0,0 +1,69 @@
|
|
1
|
+
require "set"
|
2
|
+
|
3
|
+
module Csvlint
|
4
|
+
|
5
|
+
# Describes the columns a CSV file is expected to have and checks header and
# data rows against that description.
#
# Problems are recorded through Csvlint::ErrorCollector. That mixin is not
# defined in this file; it is presumed to supply reset, build_warnings,
# valid?, errors and warnings, plus the @errors/@warnings arrays appended to
# in validate_row.
class Schema

  include Csvlint::ErrorCollector

  attr_reader :uri, :fields, :title, :description

  # uri         - identifier the schema was loaded from (stored as given)
  # fields      - ordered list of field objects; each is expected to respond
  #               to name, validate_column, errors and warnings
  # title       - optional title
  # description - optional description
  def initialize(uri, fields=[], title=nil, description=nil)
    @uri = uri
    @fields = fields
    @title = title
    @description = description
    reset
  end

  # Compares each header cell with the name of the field at the same
  # position and records a :header_name warning for every mismatch.
  #
  # Returns the result of valid?.
  def validate_header(header)
    reset
    header.each_with_index do |name, i|
      field = fields[i]
      # A header cell beyond the last schema field has no name to match, so
      # it is reported as a mismatch instead of raising NoMethodError on nil.
      build_warnings(:header_name, :schema, nil, i+1, name) if field.nil? || field.name != name
    end
    valid?
  end

  # Validates one data row.
  #
  # values - the cells of the row, in column order
  # row    - the row number, passed through to the recorded messages
  #
  # Records :missing_column for every schema field without a cell and
  # :extra_column for every cell without a schema field, then runs each
  # field's own validation (a missing cell is validated as "").
  #
  # Returns the result of valid?.
  def validate_row(values, row=nil)
    reset

    # Both ranges are empty when the row has exactly as many cells as fields.
    (values.size...fields.size).each do |i|
      build_warnings(:missing_column, :schema, row, i+1)
    end
    (fields.size...values.size).each do |i|
      build_warnings(:extra_column, :schema, row, i+1)
    end

    fields.each_with_index do |field, i|
      field.validate_column(values[i] || "", row, i+1)
      @errors += field.errors
      @warnings += field.warnings
    end

    valid?
  end

  # Builds a Schema from an already-parsed JSON table description.
  #
  # uri  - identifier to store on the schema
  # json - Hash with optional "fields", "title" and "description" keys; each
  #        entry of "fields" is read for "name", "constraints", "title" and
  #        "description"
  def self.from_json_table(uri, json)
    fields = (json["fields"] || []).map do |field_desc|
      Csvlint::Field.new( field_desc["name"], field_desc["constraints"],
                          field_desc["title"], field_desc["description"] )
    end
    Schema.new( uri, fields, json["title"], json["description"] )
  end

  # Reads and parses the JSON document at uri and builds a Schema from it.
  #
  # Returns nil when the document cannot be opened, read or parsed; this
  # best-effort behaviour is kept from the original implementation.
  #
  # NOTE(review): Kernel#open is used as before; confirm that uri never comes
  # from untrusted input, since Kernel#open accepts more than plain paths.
  def self.load_from_json_table(uri)
    # Block form so the handle is closed as soon as the body has been read.
    json = JSON.parse( open(uri) { |io| io.read } )
    Schema.from_json_table(uri, json)
  rescue
    nil
  end

end
|
69
|
+
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
require 'set'
|
2
|
+
require 'date'
|
3
|
+
require 'active_support/core_ext/date/conversions'
|
4
|
+
require 'active_support/core_ext/time/conversions'
|
5
|
+
|
6
|
+
module Csvlint
|
7
|
+
|
8
|
+
# Value validators keyed by format name (SIMPLE_FORMATS) and by XML Schema
# datatype URI (TYPE_VALIDATIONS).
#
# Every validator is a two-argument lambda |value, constraints| that returns
# the converted value and raises when the value does not conform.
module Types

  SIMPLE_FORMATS = {
    'string'  => lambda { |value, constraints| value },
    'numeric' => lambda do |value, constraints|
      begin
        Integer value
      rescue ArgumentError
        Float value
      end
    end,
    'uri' => lambda do |value, constraints|
      u = URI.parse value
      # URI::HTTPS is a subclass of URI::HTTP, so one check covers both.
      raise ArgumentError unless u.kind_of?(URI::HTTP)
      u
    end
  }

  # Builds a validator that parses value with klass.strptime and accepts it
  # only when formatting the result reproduces the input exactly.
  #
  # The pattern comes from constraints["datePattern"] and falls back to
  # default_pattern; a nil constraints argument is treated as "no constraints".
  def self.strict_date_validator(klass, default_pattern)
    lambda do |value, constraints|
      date_pattern = (constraints || {})["datePattern"] || default_pattern
      d = klass.strptime(value, date_pattern)
      raise ArgumentError unless d.strftime(date_pattern) == value
      d
    end
  end
  private_class_method :strict_date_validator

  # Builds a validator that converts value with Integer() and accepts it only
  # when the given block returns true for the converted number.
  def self.constrained_integer_validator(&accept)
    lambda do |value, constraints|
      i = Integer value
      raise ArgumentError unless accept.call(i)
      i
    end
  end
  private_class_method :constrained_integer_validator

  # Parses value using the named entry of klass::DATE_FORMATS and checks that
  # formatting the result with the same name reproduces value.
  #
  # Returns the parsed object (previously nil), matching the other validators.
  # Raises ArgumentError when the round trip does not reproduce value.
  def self.date_format(klass = DateTime, value, type)
    date = klass.strptime(value, klass::DATE_FORMATS[type])
    raise ArgumentError unless date.to_formatted_s(type) == value
    date
  end

  # Hook run when the module is included: adds :iso8601 and :hms to
  # Time::DATE_FORMATS, then registers one "date_<name>" and one
  # "dateTime_<name>" entry in SIMPLE_FORMATS per named Date / Time format.
  def self.included(base)
    Time::DATE_FORMATS[:iso8601] = "%Y-%m-%dT%H:%M:%SZ"
    Time::DATE_FORMATS[:hms] = "%H:%M:%S"

    Date::DATE_FORMATS.each_key do |name|
      SIMPLE_FORMATS["date_#{name}"] = lambda do |value, constraints|
        date_format(Date, value, name)
      end
    end

    Time::DATE_FORMATS.each_key do |name|
      SIMPLE_FORMATS["dateTime_#{name}"] = lambda do |value, constraints|
        date_format(Time, value, name)
      end
    end
  end

  TYPE_VALIDATIONS = {
    'http://www.w3.org/2001/XMLSchema#string'  => SIMPLE_FORMATS['string'],
    'http://www.w3.org/2001/XMLSchema#int'     => lambda { |value, constraints| Integer value },
    'http://www.w3.org/2001/XMLSchema#integer' => lambda { |value, constraints| Integer value },
    'http://www.w3.org/2001/XMLSchema#float'   => lambda { |value, constraints| Float value },
    'http://www.w3.org/2001/XMLSchema#double'  => lambda { |value, constraints| Float value },
    'http://www.w3.org/2001/XMLSchema#anyURI'  => SIMPLE_FORMATS['uri'],
    'http://www.w3.org/2001/XMLSchema#boolean' => lambda do |value, constraints|
      return true if ['true', '1'].include? value
      return false if ['false', '0'].include? value
      raise ArgumentError
    end,
    'http://www.w3.org/2001/XMLSchema#nonPositiveInteger' => constrained_integer_validator { |i| i <= 0 },
    'http://www.w3.org/2001/XMLSchema#negativeInteger'    => constrained_integer_validator { |i| i < 0 },
    'http://www.w3.org/2001/XMLSchema#nonNegativeInteger' => constrained_integer_validator { |i| i >= 0 },
    'http://www.w3.org/2001/XMLSchema#positiveInteger'    => constrained_integer_validator { |i| i > 0 },
    'http://www.w3.org/2001/XMLSchema#dateTime'   => strict_date_validator(DateTime, "%Y-%m-%dT%H:%M:%SZ"),
    'http://www.w3.org/2001/XMLSchema#date'       => strict_date_validator(Date, "%Y-%m-%d"),
    'http://www.w3.org/2001/XMLSchema#time'       => strict_date_validator(DateTime, "%H:%M:%S"),
    'http://www.w3.org/2001/XMLSchema#gYear'      => strict_date_validator(Date, "%Y"),
    'http://www.w3.org/2001/XMLSchema#gYearMonth' => strict_date_validator(Date, "%Y-%m")
  }
end
|
112
|
+
|
113
|
+
end
|
@@ -0,0 +1,253 @@
|
|
1
|
+
require "open_uri_redirections"
|
2
|
+
|
3
|
+
module Csvlint
|
4
|
+
|
5
|
+
# Validates a CSV source (an IO-like object, a path or a URL) and collects
# errors, warnings and info messages about its transport metadata, structure
# and, when a schema is supplied, its content.
#
# Messages are recorded through Csvlint::ErrorCollector, which is not defined
# in this file; it is presumed to supply reset, valid?, build_errors,
# build_warnings, build_info_messages and the @errors/@warnings arrays.
# The calls in this class pass build_errors/build_warnings their arguments as
# (type, category, row, column, content).
class Validator

  include Csvlint::ErrorCollector
  include Csvlint::Types

  attr_reader :encoding, :content_type, :extension, :headers, :line_breaks, :dialect, :csv_header, :schema, :data

  # Maps the leading text of a CSV::MalformedCSVError message to an error type.
  ERROR_MATCHERS = {
    "Missing or stray quote" => :stray_quote,
    "Illegal quoting" => :whitespace,
    "Unclosed quoted field" => :unclosed_quote,
  }

  # source  - an object responding to gets, or a String path/URL to open
  # dialect - optional Hash overriding the default dialect keys "header",
  #           "delimiter", "skipInitialSpace", "lineTerminator", "quoteChar"
  # schema  - optional schema object used to validate the header and each row
  #
  # Validation runs immediately as part of construction.
  def initialize(source, dialect = nil, schema = nil)
    @source = source
    @formats = []
    @schema = schema

    # The header is only "assumed" when the caller gave no dialect.
    @assumed_header = !dialect
    @supplied_dialect = dialect != nil

    @dialect = {
      "header" => true,
      "delimiter" => ",",
      "skipInitialSpace" => true,
      "lineTerminator" => :auto,
      "quoteChar" => '"'
    }.merge(dialect || {})

    @csv_header = @dialect["header"]

    @csv_options = dialect_to_csv_options(@dialect)
    @extension = parse_extension(source)
    reset
    validate
  end

  # Opens the source if necessary, then checks metadata, parses every row and
  # runs the whole-file checks. A missing file or an HTTP error is recorded as
  # a :not_found error. The IO is closed afterwards, including one that was
  # passed in by the caller.
  def validate
    io = nil
    io = @source.respond_to?(:gets) ? @source : open(@source, :allow_redirections=>:all)
    validate_metadata(io)
    parse_csv(io)
    unless @col_counts.empty?
      # Integer average of non-blank cells per row; a noticeably shorter first
      # row is reported as a probable title row.
      average = @col_counts.inject(:+) / @col_counts.count
      build_warnings(:title_row, :structure) if @col_counts.first < average
    end
    build_warnings(:check_options, :structure) if @expected_columns == 1
    check_consistency
  rescue OpenURI::HTTPError, Errno::ENOENT
    build_errors(:not_found)
  ensure
    io.close if io && io.respond_to?(:close)
  end

  # Reads charset, content type and response headers from io when it offers
  # them (each lookup falls back to nil on any error) and records the
  # context-level warnings and errors derived from them.
  def validate_metadata(io)
    @encoding = io.charset rescue nil
    @content_type = io.content_type rescue nil
    @headers = io.meta rescue nil
    if @headers
      declared_type = @headers["content-type"]
      if declared_type =~ /header=(present|absent)/
        @csv_header = Regexp.last_match(1) == "present"
        @assumed_header = false
      end
      if declared_type !~ /charset=/
        build_warnings(:no_encoding, :context)
      else
        build_warnings(:encoding, :context) if @encoding != "utf-8"
      end
      build_warnings(:no_content_type, :context) if @content_type == nil
      # The dot is escaped so only a real ".xls"/".xlsx" extension matches.
      build_warnings(:excel, :context) if @content_type == nil && @extension =~ /\.xls(x)?/
      build_errors(:wrong_content_type, :context) unless (@content_type && @content_type =~ /text\/csv/)

      # @assumed_header can only still be true here when the content type did
      # not carry a header= parameter, so that condition needs no re-test.
      if @assumed_header && !@supplied_dialect
        build_errors(:undeclared_header, :structure)
      end
    end
    build_info_messages(:assumed_header, :structure) if @assumed_header
  end

  # Parses io row by row, recording structural problems as it goes.
  #
  # Sets @data (every value returned by csv.shift, including the final nil),
  # @line_breaks, @col_counts and @expected_columns.
  def parse_csv(io)
    @expected_columns = 0
    current_line = 0
    @col_counts = []

    @csv_options[:encoding] = @encoding

    begin
      wrapper = WrappedIO.new( io )
      csv = CSV.new( wrapper, @csv_options )
      @data = []
      @line_breaks = csv.row_sep
      build_info_messages(:nonrfc_line_breaks, :structure) if @line_breaks != "\r\n"

      loop do
        current_line += 1
        begin
          row = csv.shift
          @data << row
          wrapper.finished
          break unless row

          if header? && current_line == 1
            names = row.reject { |cell| cell.blank? }
            validate_header(names)
            @col_counts << names.count
          else
            check_data_row(row, current_line, wrapper.line)
          end
        rescue CSV::MalformedCSVError => e
          # Record the problem and carry on with the next row.
          wrapper.finished
          type = fetch_error(e)
          if type == :stray_quote && !wrapper.line.match(csv.row_sep)
            build_errors(:line_breaks, :structure)
          else
            build_errors(type, :structure, current_line, nil, wrapper.line)
          end
        end
      end
    rescue ArgumentError
      # An ArgumentError raised while parsing is reported as an encoding
      # problem and ends parsing. The raw line goes in the content slot, with
      # nil for the column, like every other build_errors call in this method.
      wrapper.finished
      build_errors(:invalid_encoding, :structure, current_line, nil, wrapper.line)
    end
  end

  # Records warnings for empty or repeated column names and, when a schema is
  # present, merges in the schema's own header findings.
  #
  # Returns the result of valid?.
  def validate_header(header)
    names = Set.new
    header.each_with_index do |name, i|
      build_warnings(:empty_column_name, :schema, nil, i+1) if name == ""
      # Set#add? returns nil when the name was already present.
      build_warnings(:duplicate_column_name, :schema, nil, i+1) unless names.add?(name)
    end
    if @schema
      @schema.validate_header(header)
      @errors += @schema.errors
      @warnings += @schema.warnings
    end
    valid?
  end

  # Whether the first row is treated as a header row.
  def header?
    @csv_header
  end

  # Translates a CSV::MalformedCSVError into one of the ERROR_MATCHERS types,
  # or :unknown_error when the message has an unexpected shape.
  def fetch_error(error)
    matched = error.message.match(/^([a-z ]+) (i|o)n line ([0-9]+)\.?$/i)
    ERROR_MATCHERS.fetch(matched && matched[1], :unknown_error)
  end

  # Converts a dialect Hash into the option Hash handed to CSV.new.
  def dialect_to_csv_options(dialect)
    # NOTE(review): `value || true` is always truthy, so a dialect with
    # "skipInitialSpace" => false has no effect and the delimiter below is
    # never extended. Left unchanged on purpose: appending a space when
    # initial spaces are NOT skipped looks inverted, so the intended behaviour
    # should be confirmed before the false case is made reachable.
    skipinitialspace = dialect["skipInitialSpace"] || true
    delimiter = dialect["delimiter"]
    delimiter = delimiter + " " if !skipinitialspace
    {
      :col_sep => delimiter,
      :row_sep => dialect["lineTerminator"],
      :quote_char => dialect["quoteChar"],
      :skip_blanks => false
    }
  end

  # Records, per column, which SIMPLE_FORMATS entry each non-blank cell of
  # row matches. When several formats accept a cell, the one that comes last
  # in SIMPLE_FORMATS is recorded. The line argument is not used.
  def build_formats(row, line)
    row.each_with_index do |col, i|
      next if col.blank?
      @formats[i] ||= []

      # Kept in a local so a result can never carry over from another cell.
      detected = nil
      SIMPLE_FORMATS.each do |type, check|
        begin
          check.call(col, {})
          detected = type
        rescue
          # This format does not accept the value; try the next one.
          nil
        end
      end

      @formats[i] << detected
    end
  end

  # Records an :inconsistent_values warning for every column in which no
  # single format accounts for at least 90% of the recorded cells.
  def check_consistency
    @formats.each_with_index do |seen, i|
      next if seen.nil?
      shares = SIMPLE_FORMATS.keys.map { |type| seen.count(type).to_f / seen.count.to_f }
      next if shares.empty?
      build_warnings(:inconsistent_values, :schema, nil, i+1) if shares.max < 0.9
    end
  end

  private

  # Checks one data row: records its formats and non-blank cell count, fixes
  # the expected column count from the first data row, reports blank rows and
  # then either delegates to the schema or reports ragged rows.
  def check_data_row(row, current_line, raw_line)
    build_formats(row, current_line)
    @col_counts << row.reject { |cell| cell.blank? }.count
    @expected_columns = row.count if @expected_columns == 0

    build_errors(:blank_rows, :structure, current_line, nil, raw_line) if row.all? { |c| c.nil? || c.empty? }

    if @schema
      @schema.validate_row(row, current_line)
      @errors += @schema.errors
      @warnings += @schema.warnings
    elsif !row.empty? && row.count != @expected_columns
      build_errors(:ragged_rows, :structure, current_line, nil, raw_line)
    end
  end

  # Returns the file extension of source: taken from the path of a File,
  # "" for other IO-like objects, and from the URI path for anything else.
  def parse_extension(source)
    case source
    when File
      File.extname( source.path )
    when IO, StringIO, Tempfile
      ""
    else
      File.extname( URI.parse(source).path )
    end
  end

end
|
253
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Csvlint
|
2
|
+
# Wraps an IO-like object and remembers the raw text read through
# gets(separator), so a caller can report the source text of the record it
# was working on.
#
# Calls not defined here are forwarded to the wrapped object.
class WrappedIO
  # io - the object to wrap; gets and eof? are called on it directly.
  def initialize(io)
    @io = io
    @line = "".dup
    @new_line = false
  end

  # Reads from the wrapped object.
  #
  # With exactly one String argument (a separator) the chunk read is also
  # appended to the buffer returned by #line. Any other argument list is
  # passed straight through and leaves the buffer untouched.
  #
  # Returns whatever the wrapped object's gets returns (nil at end of input).
  def gets(*args)
    return @io.gets(*args) unless args.count == 1 && args[0].is_a?(String)

    if @new_line
      # First read after #finished: start a fresh buffer. The flag is cleared
      # so that further reads before the next #finished are appended rather
      # than each one wiping the buffer again.
      @line = "".dup
      @new_line = false
    end

    chunk = @io.gets(args[0])
    @line << chunk unless chunk.nil?
    chunk
  end

  # Whether the wrapped object is at end of input.
  def eof?
    @io.eof?
  end

  # Marks the current record as complete; the next gets(separator) call
  # starts a new buffer. The current buffer stays readable until then.
  def finished
    @new_line = true
  end

  # The text accumulated by gets(separator) since the buffer was last reset.
  def line
    @line
  end

  # Forwards every other call, including any block, to the wrapped object.
  #
  # NOTE(review): respond_to_missing? is intentionally not added here, since
  # it would change what respond_to? reports to code that probes this
  # wrapper; confirm with the CSV integration before adding it.
  def method_missing(method, *args, &block)
    @io.send(method, *args, &block)
  end

end
|
39
|
+
end
|