wjordan213-csvlint 0.2.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.coveralls.yml +1 -0
- data/.gitattributes +2 -0
- data/.gitignore +28 -0
- data/.ruby-version +1 -0
- data/.travis.yml +32 -0
- data/CHANGELOG.md +361 -0
- data/Gemfile +7 -0
- data/LICENSE.md +22 -0
- data/README.md +328 -0
- data/Rakefile +17 -0
- data/bin/create_schema +32 -0
- data/bin/csvlint +10 -0
- data/features/check_format.feature +46 -0
- data/features/cli.feature +210 -0
- data/features/csv_options.feature +35 -0
- data/features/csvupload.feature +145 -0
- data/features/csvw_schema_validation.feature +127 -0
- data/features/fixtures/cr-line-endings.csv +0 -0
- data/features/fixtures/crlf-line-endings.csv +0 -0
- data/features/fixtures/inconsistent-line-endings-unquoted.csv +0 -0
- data/features/fixtures/inconsistent-line-endings.csv +0 -0
- data/features/fixtures/invalid-byte-sequence.csv +0 -0
- data/features/fixtures/invalid_many_rows.csv +0 -0
- data/features/fixtures/lf-line-endings.csv +0 -0
- data/features/fixtures/spreadsheet.xls +0 -0
- data/features/fixtures/spreadsheet.xlsx +0 -0
- data/features/fixtures/title-row.csv +0 -0
- data/features/fixtures/valid.csv +0 -0
- data/features/fixtures/valid_many_rows.csv +0 -0
- data/features/fixtures/windows-line-endings.csv +0 -0
- data/features/information.feature +22 -0
- data/features/parse_csv.feature +90 -0
- data/features/schema_validation.feature +105 -0
- data/features/sources.feature +17 -0
- data/features/step_definitions/cli_steps.rb +11 -0
- data/features/step_definitions/csv_options_steps.rb +24 -0
- data/features/step_definitions/information_steps.rb +13 -0
- data/features/step_definitions/parse_csv_steps.rb +42 -0
- data/features/step_definitions/schema_validation_steps.rb +33 -0
- data/features/step_definitions/sources_steps.rb +7 -0
- data/features/step_definitions/validation_errors_steps.rb +90 -0
- data/features/step_definitions/validation_info_steps.rb +22 -0
- data/features/step_definitions/validation_warnings_steps.rb +60 -0
- data/features/support/aruba.rb +56 -0
- data/features/support/env.rb +26 -0
- data/features/support/load_tests.rb +114 -0
- data/features/support/webmock.rb +1 -0
- data/features/validation_errors.feature +147 -0
- data/features/validation_info.feature +16 -0
- data/features/validation_warnings.feature +86 -0
- data/lib/csvlint.rb +27 -0
- data/lib/csvlint/cli.rb +165 -0
- data/lib/csvlint/csvw/column.rb +359 -0
- data/lib/csvlint/csvw/date_format.rb +182 -0
- data/lib/csvlint/csvw/metadata_error.rb +13 -0
- data/lib/csvlint/csvw/number_format.rb +211 -0
- data/lib/csvlint/csvw/property_checker.rb +761 -0
- data/lib/csvlint/csvw/table.rb +204 -0
- data/lib/csvlint/csvw/table_group.rb +165 -0
- data/lib/csvlint/error_collector.rb +27 -0
- data/lib/csvlint/error_message.rb +15 -0
- data/lib/csvlint/field.rb +196 -0
- data/lib/csvlint/schema.rb +92 -0
- data/lib/csvlint/validate.rb +599 -0
- data/lib/csvlint/version.rb +3 -0
- data/spec/csvw/column_spec.rb +112 -0
- data/spec/csvw/date_format_spec.rb +49 -0
- data/spec/csvw/number_format_spec.rb +417 -0
- data/spec/csvw/table_group_spec.rb +143 -0
- data/spec/csvw/table_spec.rb +90 -0
- data/spec/field_spec.rb +252 -0
- data/spec/schema_spec.rb +211 -0
- data/spec/spec_helper.rb +17 -0
- data/spec/validator_spec.rb +619 -0
- data/wjordan213_csvlint.gemspec +46 -0
- metadata +490 -0
@@ -0,0 +1,92 @@
|
|
1
|
+
module Csvlint
|
2
|
+
|
3
|
+
class Schema
|
4
|
+
|
5
|
+
include Csvlint::ErrorCollector
|
6
|
+
|
7
|
+
attr_reader :uri, :fields, :title, :description
|
8
|
+
|
9
|
+
# Stores the schema attributes and clears any collected messages.
#
# @param uri [Object] location of the schema, stored as given
# @param fields [Array] field definitions, in column order
# @param title [Object, nil] schema title
# @param description [Object, nil] schema description
def initialize(uri, fields=[], title=nil, description=nil)
  @uri, @fields = uri, fields
  @title, @description = title, description
  # `reset` is not defined in this class body; presumably supplied by ErrorCollector.
  reset
end
|
16
|
+
|
17
|
+
class << self
|
18
|
+
|
19
|
+
# Builds a Schema from a parsed JSON table description.
#
# @param uri [Object] passed straight through to Schema.new
# @param json [Hash] parsed JSON; a missing "fields" entry yields no fields
# @return [Schema]
def from_json_table(uri, json)
  fields = (json["fields"] || []).map do |desc|
    Csvlint::Field.new(desc["name"], desc["constraints"], desc["title"], desc["description"])
  end
  Schema.new(uri, fields, json["title"], json["description"])
end
|
27
|
+
|
28
|
+
# Delegates CSVW metadata documents to Csvlint::Csvw::TableGroup.from_json.
#
# @param uri [Object] location of the metadata document
# @param json [Object] parsed metadata
# @return whatever TableGroup.from_json returns
def from_csvw_metadata(uri, json)
  Csvlint::Csvw::TableGroup.from_json(uri, json)
end
|
31
|
+
|
32
|
+
# Reads a JSON schema document and builds the matching schema object.
#
# Documents with an "@context" key go to Schema.from_csvw_metadata, everything
# else to Schema.from_json_table.
#
# @param uri [Object] anything `open` accepts (path, File, URL)
# @param output_errors [Boolean] print unexpected exceptions to STDERR when exactly true
# @return the object built by the chosen factory, or a Schema titled "malformed"
#   when the document cannot be read or parsed
# @raise [Csvlint::Csvw::MetadataError, OpenURI::HTTPError, Errno::ENOENT] re-raised unchanged
def load_from_json(uri, output_errors = true)
  # Block form closes the handle; the previous `open(uri).read` left it open.
  # NOTE(review): Kernel#open runs a command for strings starting with "|" -
  # confirm `uri` never comes from untrusted input.
  json = JSON.parse(open(uri) { |io| io.read })
  if json["@context"]
    # Anchor at the start of the string and require the colon, so a local file
    # whose name merely begins with "http" still gets the file: prefix.
    uri = "file:#{File.expand_path(uri)}" unless uri.to_s =~ /\Ahttps?:/
    Schema.from_csvw_metadata(uri, json)
  else
    Schema.from_json_table(uri, json)
  end
rescue Csvlint::Csvw::MetadataError, OpenURI::HTTPError, Errno::ENOENT
  raise
rescue => e
  if output_errors == true
    STDERR.puts e.class
    STDERR.puts e.message
    STDERR.puts e.backtrace
  end
  Schema.new(nil, [], "malformed", "malformed")
end
|
54
|
+
|
55
|
+
end
|
56
|
+
|
57
|
+
# Compares a header row with the schema's field names, both serialized as CSV.
#
# @param header [Array] header cells found in the file
# @param source_url [Object, nil] accepted but not used here
# @return the result of `valid?` after recording a :malformed_header warning on mismatch
def validate_header(header, source_url=nil)
  reset

  actual = header.to_csv(:row_sep => '')
  wanted = @fields.map(&:name).to_csv(:row_sep => '')
  unless actual == wanted
    build_warnings(:malformed_header, :schema, 1, nil, actual, "expectedHeader" => wanted)
  end
  valid?
end
|
67
|
+
|
68
|
+
# Validates one data row against the schema's fields.
#
# @param values [Array] cell values of the row
# @param row [Object, nil] row number passed on to warnings and field checks
# @param all_errors [Array] handed to each field's validate_column
# @param source_url [Object, nil] accepted but not used here
# @return the result of `valid?`
def validate_row(values, row=nil, all_errors=[], source_url=nil)
  reset
  given = values.length
  wanted = fields.length

  # One warning per schema field the row has no cell for (1-based column number).
  (given...wanted).each { |idx| build_warnings(:missing_column, :schema, row, idx + 1) }
  # One warning per cell beyond the last schema field.
  (wanted...given).each { |idx| build_warnings(:extra_column, :schema, row, idx + 1) }

  fields.each_with_index do |field, idx|
    # Missing cells are validated as empty strings.
    field.validate_column(values[idx] || "", row, idx + 1, all_errors)
    @errors += field.errors
    @warnings += field.warnings
  end

  valid?
end
|
90
|
+
|
91
|
+
end
|
92
|
+
end
|
@@ -0,0 +1,599 @@
|
|
1
|
+
module Csvlint
|
2
|
+
|
3
|
+
class Validator
|
4
|
+
# CSV subclass that memoizes the regexp/string helper methods in process-wide
# caches and turns the converters feature off.
class LineCSV < CSV
  # pattern source => compiled Regexp
  ENCODE_RE = Hash.new { |cache, source| cache[source] = Regexp.new(source) }

  # encoding name => { array of chunks => chunks encoded and joined }
  ENCODE_STR = Hash.new do |by_encoding, encoding_name|
    by_encoding[encoding_name] = Hash.new do |joined, chunks|
      joined[chunks] = chunks.map { |chunk| chunk.encode(encoding_name) }.join('')
    end
  end

  # re_chars => { re_esc => { string => escaped string } }
  ESCAPE_RE = Hash.new do |by_chars, re_chars|
    by_chars[re_chars] = Hash.new do |by_esc, re_esc|
      by_esc[re_esc] = Hash.new do |escaped, str|
        escaped[str] = str.gsub(re_chars) { |c| re_esc + c }
      end
    end
  end

  # Optimization: Memoize `encode_re`.
  # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2273
  def encode_re(*chunks)
    ENCODE_RE[encode_str(*chunks)]
  end

  # Optimization: Memoize `encode_str`.
  # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2281
  def encode_str(*chunks)
    ENCODE_STR[@encoding.name][chunks]
  end

  # Optimization: Memoize `escape_re`.
  # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2265
  def escape_re(str)
    ESCAPE_RE[@re_chars][@re_esc][str]
  end

  # Optimization: Disable the CSV library's converters feature.
  # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2100
  def init_converters(options, field_name = :converters)
    @converters = []
    @header_converters = []
    options.delete(:unconverted_fields)
    options.delete(field_name)
  end
end
|
50
|
+
|
51
|
+
include Csvlint::ErrorCollector
|
52
|
+
|
53
|
+
attr_reader :encoding, :content_type, :extension, :headers, :link_headers, :dialect, :csv_header, :schema, :data, :current_line
|
54
|
+
|
55
|
+
# Maps the leading text of a CSV parse error message (as extracted by
# fetch_error) to the error type reported for it. Frozen, like FORMATS,
# so the lookup table cannot be modified at runtime.
ERROR_MATCHERS = {
  "Missing or stray quote" => :stray_quote,
  "Illegal quoting" => :whitespace,
  "Unclosed quoted field" => :unclosed_quote,
  "Unquoted fields do not allow \\r or \\n" => :line_breaks,
}.freeze
|
61
|
+
|
62
|
+
# Sets up validator state and immediately runs the validation.
#
# @param source [Object] the CSV to validate (String sources are treated as URLs by `validate`)
# @param dialect [Hash] dialect overrides
# @param schema [Object, nil] schema to validate against
# @param options [Hash] reads :lambda (called after each validated line) and :limit_lines
def initialize(source, dialect = {}, schema = nil, options = {})
  reset

  @source, @dialect, @schema = source, dialect, schema
  @lambda = options[:lambda]
  @limit_lines = options[:limit_lines]

  @formats = []
  @csv_header = true
  @headers = {}
  @leading = ""
  @extension = parse_extension(source) unless @source.nil?

  @expected_columns = 0
  @col_counts = []
  @line_breaks = []

  # Start from whatever messages the schema already carries.
  unless @schema.nil?
    @errors += @schema.errors
    @warnings += @schema.warnings
  end

  @data = [] # it may be advisable to flush this on init?

  validate
end
|
87
|
+
|
88
|
+
# Runs the whole validation: Excel short-circuit, schema lookup, dialect setup,
# then URL or stream processing, and finally the summary checks in `finish`.
def validate
  # The extension is matched as a whole, with the dot escaped and ignoring case.
  # The previous /.xls(x)?/ matched any extension merely containing "xls"
  # after some character and missed upper-case extensions.
  if @extension =~ /\A\.xls[xmb]?\z/i
    build_warnings(:excel, :context)
    return
  end
  locate_schema unless @schema.instance_of?(Csvlint::Schema)
  set_dialect

  if @source.instance_of?(String)
    # Plain strings are fetched as URLs.
    validate_url
  else
    validate_metadata
    validate_stream
  end
  finish
end
|
104
|
+
|
105
|
+
# Feeds every line of @source to parse_line until the line limit is reached,
# then validates whatever partial line is still buffered in @leading.
def validate_stream
  @current_line = 1
  @source.each_line do |chunk|
    break if line_limit_reached?
    parse_line(chunk)
  end
  validate_line(@leading, @current_line) if @leading != ""
end
|
113
|
+
|
114
|
+
# Fetches @source over HTTP with Typhoeus and validates the body as it arrives.
def validate_url
  @current_line = 1
  request = Typhoeus::Request.new(@source, followlocation: true)

  request.on_headers do |response|
    @headers = response.headers || {}
    @content_type = response.headers["content-type"] rescue nil
    @response_code = response.code
    # NOTE(review): `return` inside this block exits validate_url itself, not just the callback.
    return build_errors(:not_found) if response.code == 404
    validate_metadata
  end

  request.on_body do |chunk|
    # A chunk may end mid-line; parse_line carries the remainder over in @leading.
    StringIO.new(chunk).each_line do |line|
      break if line_limit_reached?
      parse_line(line)
    end
  end

  request.run
  # Validate the last line too
  validate_line(@leading, @current_line) if @leading != ""
end
|
135
|
+
|
136
|
+
# Accumulates input until a complete CSV record is available, then validates it.
#
# A record is complete when the buffered text ends in "\n" and contains an
# even number of quote characters (an odd count means the newline is inside quotes).
#
# @param line [String] next piece of input; prefixed with the pending @leading text
def parse_line(line)
  line = @leading + line
  complete = line[-1, 1].include?("\n") && line.count(@dialect["quoteChar"]).even?

  if complete
    validate_line(line, @current_line)
    @leading = ""
    @current_line += 1
  else
    # Keep the fragment so it is prepended to the next piece of input.
    @leading = line
  end
rescue ArgumentError
  # Reported only once per validation run.
  build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
  @current_line += 1
  @reported_invalid_encoding = true
end
|
157
|
+
|
158
|
+
# Validates a single complete record.
#
# @param input [String] raw text of the record
# @param index [Integer, nil] line number; 0 is used when it is not present
def validate_line(input = nil, index = nil)
  @input = input
  line_no = index.present? ? index : 0
  @encoding = input.encoding.to_s

  report_line_breaks(line_no)
  parse_contents(input, line_no)
  @lambda.call(self) unless @lambda.nil?
rescue ArgumentError
  # Reported only once per validation run.
  build_errors(:invalid_encoding, :structure, @current_line, nil, index) unless @reported_invalid_encoding
  @reported_invalid_encoding = true
end
|
170
|
+
|
171
|
+
# Parses one record and builds the errors, warnings and info messages for it.
#
# The parsed row (nil when parsing failed) is always appended to @data. A row
# on line 1 is treated as the header when @csv_header is set; every other row
# feeds the format statistics and the schema / ragged-row checks.
#
# @param stream [String] raw text of a single record
# @param line [Integer, nil] line number; 1 is used when it is not present
def parse_contents(stream, line = nil)
  current_line = line.present? ? line : 1
  all_errors = []

  @csv_options[:encoding] = @encoding

  begin
    # Options are passed as keywords: on Ruby 3 a positional hash raises
    # ArgumentError, which validate_line would then report as :invalid_encoding.
    row = LineCSV.parse_line(stream, **@csv_options)
  rescue LineCSV::MalformedCSVError => e
    build_exception_messages(e, stream, current_line)
  end

  @data << row
  return unless row

  if current_line <= 1 && @csv_header
    # this conditional should be refactored somewhere
    row = row.reject { |col| col.nil? || col.empty? }
    validate_header(row)
    @col_counts << row.size
    return
  end

  build_formats(row)
  filled = row.reject { |col| col.nil? || col.empty? }.size
  @col_counts << filled
  # The first data row fixes the expected column count.
  @expected_columns = row.size unless @expected_columns != 0
  build_errors(:blank_rows, :structure, current_line, nil, stream.to_s) if filled == 0

  # Builds errors and warnings related to the provided schema file
  if @schema
    @schema.validate_row(row, current_line, all_errors, @source)
    @errors += @schema.errors
    @warnings += @schema.warnings
  elsif !row.empty? && row.size != @expected_columns
    build_errors(:ragged_rows, :structure, current_line, nil, stream.to_s)
  end
end
|
209
|
+
|
210
|
+
# Runs the whole-file checks once every line has been processed.
def finish
  total = @col_counts.inject(:+)
  if total
    # A first row with fewer filled cells than the average suggests a title row.
    average = total / @col_counts.size.to_f
    build_warnings(:title_row, :structure) if @col_counts.first < average
  end
  # return expected_columns to calling class
  build_warnings(:check_options, :structure) if @expected_columns == 1
  check_consistency
  check_foreign_keys
  check_mixed_linebreaks
  validate_encoding
end
|
222
|
+
|
223
|
+
# Inspects the response headers: content type, header presence, and any
# Link headers that point at a metadata document.
def validate_metadata
  assumed_header = !@supplied_dialect
  unless @headers.empty?
    if @headers["content-type"] =~ /text\/csv/
      @csv_header = @csv_header && true
      assumed_header = @assumed_header.present?
    end
    if @headers["content-type"] =~ /header=(present|absent)/
      @csv_header = true if $1 == "present"
      @csv_header = false if $1 == "absent"
      assumed_header = false
    end
    build_warnings(:no_content_type, :context) if @content_type == nil
    build_errors(:wrong_content_type, :context) unless (@content_type && @content_type =~ /text\/csv/)
  end
  @header_processed = true
  build_info_messages(:assumed_header, :structure) if assumed_header

  link = @headers["link"]
  @link_headers = link.respond_to?(:split) ? link.split(",") : nil
  return unless @link_headers

  unquote = lambda { |text| text && text.gsub(/(^\"|\"$)/, "") }
  @link_headers.each do |link_header|
    match = LINK_HEADER_REGEXP.match(link_header)
    # A Link header that does not match used to raise NoMethodError on match["param"].
    next if match.nil?

    uri = match["uri"]
    uri = uri.gsub(/(^\<|\>$)/, "") if uri
    rel = unquote.call(match["rel-relationship"])
    param = match["param"]
    param_value = unquote.call(match["param-value"])
    next unless rel == "describedby" && param == "type" &&
                ["application/csvm+json", "application/ld+json", "application/json"].include?(param_value)

    begin
      url = URI.join(@source_url, uri)
      schema = Schema.load_from_json(url)
      # NOTE(review): a matching table group is not used here; the original
      # only assigned it to a local that was never read.
      if schema.instance_of?(Csvlint::Csvw::TableGroup) && !schema.tables[@source_url]
        build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
      end
    rescue OpenURI::HTTPError
    end
  end
end
|
265
|
+
|
266
|
+
# Whether a header row is expected: needs both @csv_header and the dialect's "header" entry.
def header?
  return @csv_header unless @csv_header
  @dialect["header"]
end
|
269
|
+
|
270
|
+
# Records the line break used by the current input and emits a single
# :nonrfc_line_breaks info message the first time it is not "\r\n".
#
# @param line_no [Integer, nil] line number attached to the info message
def report_line_breaks(line_no=nil)
  # No trailing newline - i.e. we're on the last line - nothing to record.
  return unless @input[-1, 1].include?("\n")

  line_break = get_line_break(@input)
  @line_breaks << line_break
  return if line_breaks_reported? || line_break == "\r\n"

  build_info_messages(:nonrfc_line_breaks, :structure, line_no)
  @line_breaks_reported = true
end
|
281
|
+
|
282
|
+
# True only once the non-RFC line break message has been emitted.
def line_breaks_reported?
  @line_breaks_reported == true
end
|
285
|
+
|
286
|
+
# Merges the built-in defaults, the schema's dialect and the supplied dialect
# (later sources win), then derives the CSV parser options.
def set_dialect
  @assumed_header = @dialect["header"].nil?
  @supplied_dialect = @dialect != {}

  # Any failure while looking up the schema's dialect means "no schema dialect".
  schema_dialect =
    begin
      @schema.tables[@source_url].dialect || {}
    rescue
      {}
    end

  defaults = {
    "header" => true,
    "delimiter" => ",",
    "skipInitialSpace" => true,
    "lineTerminator" => :auto,
    "quoteChar" => '"',
    "trim" => :true
  }
  @dialect = defaults.merge(schema_dialect).merge(@dialect || {})

  @csv_header = @csv_header && @dialect["header"]
  @csv_options = dialect_to_csv_options(@dialect)
end
|
307
|
+
|
308
|
+
# Warns when the content-type header has no charset or a non-UTF-8 one, and
# when the encoding seen on the data is not "UTF-8".
def validate_encoding
  content_type = @headers["content-type"]
  if content_type
    if content_type !~ /charset=/
      build_warnings(:no_encoding, :context)
    elsif content_type !~ /charset=utf-8/i
      build_warnings(:encoding, :context)
    end
  end
  build_warnings(:encoding, :context) unless @encoding == "UTF-8"
end
|
318
|
+
|
319
|
+
# Raises the line break error when more than one kind of line break was seen.
def check_mixed_linebreaks
  kinds = @line_breaks.uniq
  build_linebreak_error if kinds.count > 1
end
|
322
|
+
|
323
|
+
# @return [:mixed, String, nil] :mixed when several kinds of line break were
#   recorded, otherwise the single kind seen (nil when none was recorded)
def line_breaks
  kinds = @line_breaks.uniq
  kinds.count > 1 ? :mixed : kinds.first
end
|
330
|
+
|
331
|
+
# Number of entries collected in `data`.
def row_count
  data.size
end
|
334
|
+
|
335
|
+
# Converts a CSV parser exception into a validation error.
#
# @param csvException [Exception] error raised by the CSV parser
# @param errChars [String] text of the offending record
# @param lineNo [Integer] line number the error is reported on
def build_exception_messages(csvException, errChars, lineNo)
  #TODO 1 - this is a change in logic, rather than straight refactor of previous error building, however original logic is bonkers
  #TODO 2 - using .kind_of? is a very ugly fix here and it meant to work around instances where :auto symbol is preserved in @csv_options
  type = fetch_error(csvException)
  row_sep = @csv_options[:row_sep]

  # A quoting error on input that lacks the configured row separator is
  # reported as a line break problem instead.
  separator_missing = !row_sep.kind_of?(Symbol) &&
                      [:unclosed_quote, :stray_quote].include?(type) &&
                      !@input.match(row_sep)
  if separator_missing
    build_linebreak_error
  else
    build_errors(type, :structure, lineNo, nil, errChars)
  end
end
|
345
|
+
|
346
|
+
# Adds a :line_breaks structure error unless one has already been recorded.
def build_linebreak_error
  return if @errors.any? { |err| err.type == :line_breaks }
  build_errors(:line_breaks, :structure)
end
|
349
|
+
|
350
|
+
# Checks the header row for empty and duplicate names and, when a schema is
# set, lets the schema validate it as well.
#
# @param header [Array<String>] header cells; stripped in place when the dialect's "trim" is :true
# @return the result of `valid?`
def validate_header(header)
  seen = Set.new
  header.each(&:strip!) if @dialect["trim"] == :true

  header.each_with_index do |name, idx|
    column = idx + 1
    build_warnings(:empty_column_name, :schema, nil, column) if name == ""
    # Set#add? returns nil when the name was already present.
    build_warnings(:duplicate_column_name, :schema, nil, column) unless seen.add?(name)
  end

  if @schema
    @schema.validate_header(header, @source)
    @errors += @schema.errors
    @warnings += @schema.warnings
  end
  valid?
end
|
368
|
+
|
369
|
+
# Maps a CSV parser exception to an error type via ERROR_MATCHERS.
#
# The message is expected to end in a "line N" suffix; the text before it is
# the lookup key. Anything unrecognized becomes :unknown_error.
#
# @param error [Exception]
# @return [Symbol]
def fetch_error(error)
  matched = error.message.match(/^(.+?)(?: [io]n)? \(?line \d+\)?\.?$/i)
  summary = matched ? matched[1] : nil
  ERROR_MATCHERS.fetch(summary, :unknown_error)
end
|
374
|
+
|
375
|
+
# Translates a dialect hash into the options hash handed to the CSV parser.
#
# @param dialect [Hash] reads "skipInitialSpace", "delimiter", "lineTerminator", "quoteChar"
# @return [Hash] :col_sep, :row_sep, :quote_char and :skip_blanks (always false)
def dialect_to_csv_options(dialect)
  # NOTE(review): `|| true` makes this truthy even when the dialect says
  # false, so the delimiter is never extended below - confirm the intent.
  skip_initial_space = dialect["skipInitialSpace"] || true
  col_sep = dialect["delimiter"]
  col_sep = col_sep + " " unless skip_initial_space

  {
    :col_sep => col_sep,
    :row_sep => dialect["lineTerminator"],
    :quote_char => dialect["quoteChar"],
    :skip_blanks => false
  }
end
|
386
|
+
|
387
|
+
# Tallies, per column, which format each non-empty cell of the row looks like.
#
# @param row [Array<String, nil>] parsed cells; nil and empty cells are skipped
def build_formats(row)
  row.each_with_index do |cell, idx|
    next if cell.nil? || cell.empty?

    tally = (@formats[idx] ||= Hash.new(0))

    # First matching test wins: numeric, then URI, then date-like, else string.
    kind =
      if cell.strip[FORMATS[:numeric]] then :numeric
      elsif uri?(cell) then :uri
      elsif possible_date?(cell) then date_formats(cell)
      else :string
      end

    tally[kind] += 1
  end
end
|
406
|
+
|
407
|
+
# Warns about every column in which no single format accounts for at least
# 90% of the tallied cells.
def check_consistency
  @formats.each_with_index do |counts, idx|
    next unless counts

    total = counts.values.reduce(:+).to_f
    dominant = counts.any? { |_, seen| seen / total >= 0.9 }
    build_warnings(:inconsistent_values, :schema, nil, idx + 1) unless dominant
  end
end
|
417
|
+
|
418
|
+
# Lets a CSVW table group validate its foreign keys and collects the
# resulting messages. Does nothing for any other kind of schema.
def check_foreign_keys
  return unless @schema.instance_of? Csvlint::Csvw::TableGroup

  @schema.validate_foreign_keys
  @errors += @schema.errors
  @warnings += @schema.warnings
end
|
425
|
+
|
426
|
+
# Tries to find a CSVW metadata document describing @source.
#
# Sets @source_url, keeps an already supplied schema when it describes that
# URL, and otherwise probes "<url>-metadata.json" and "csv-metadata.json".
# The first table group that contains @source_url becomes @schema; when none
# is found @schema ends up nil.
def locate_schema
  @source_url = nil
  warn_if_unsuccessful = false
  case @source
  when StringIO
    return
  when File
    @source_url = "file:#{File.expand_path(@source)}"
  else
    @source_url = @source
  end

  unless @schema.nil?
    return if @schema.tables[@source_url]
    @schema = nil
  end

  # to_s keeps this safe for non-String sources that do not implement =~.
  if @source_url.to_s =~ /\Ahttps?:/
    begin
      well_known_uri = URI.join(@source_url, "/.well-known/csvm")
      # TODO: the fetched document is not interpreted yet; the block form at
      # least closes the handle.
      open(well_known_uri) { |io| io.read }
    rescue OpenURI::HTTPError, URI::BadURIError
    end
  end

  ["{+url}-metadata.json", "csv-metadata.json"].each do |template|
    begin
      path = URITemplate.new(template).expand('url' => @source_url)
      url = URI.join(@source_url, path)
      url = File.new(url.to_s.sub(/^file:/, "")) if url.to_s =~ /^file:/
      schema = Schema.load_from_json(url)
      next unless schema.instance_of? Csvlint::Csvw::TableGroup

      if schema.tables[@source_url]
        @schema = schema
        # Stop here: without this return the match was overwritten by the
        # `@schema = nil` at the end of the method.
        return
      end
      warn_if_unsuccessful = true
      build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
    rescue Errno::ENOENT
    rescue OpenURI::HTTPError, URI::BadURIError, ArgumentError
    rescue => e
      STDERR.puts e.class
      STDERR.puts e.message
      STDERR.puts e.backtrace
      raise
    end
  end

  # NOTE(review): this repeats the warning already emitted inside the loop (kept as in the original).
  build_warnings(:schema_mismatch, :context, nil, nil, @source_url, @schema) if warn_if_unsuccessful
  @schema = nil
end
|
485
|
+
|
486
|
+
private
|
487
|
+
|
488
|
+
# Determines the file extension of the source.
#
# @param source [File, IO, StringIO, Tempfile, String] the validator's source
# @return [String] extension including the dot for File objects and parseable
#   URIs; "" for other streams, unparseable URIs and URIs without a path
def parse_extension(source)
  case source
  when File
    File.extname(source.path)
  when IO, StringIO, Tempfile
    # Tempfile is triggered when the revalidate dialect use case happens
    ""
  else
    begin
      # Opaque URIs have a nil path; to_s keeps File.extname from raising TypeError.
      File.extname(URI.parse(source).path.to_s)
    rescue URI::InvalidURIError
      ""
    end
  end
end
|
509
|
+
|
510
|
+
# Whether the value is an http(s) URI.
#
# @param value [String]
# @return [Boolean, nil] nil when the stripped value does not match FORMATS[:uri],
#   false when it cannot be parsed as a URI
def uri?(value)
  return unless value.strip[FORMATS[:uri]]

  parsed = URI.parse(value)
  parsed.kind_of?(URI::HTTP) || parsed.kind_of?(URI::HTTPS)
rescue URI::InvalidURIError
  false
end
|
518
|
+
|
519
|
+
# Cheap pre-check before the date formats are tried.
#
# @param col [String]
# @return [String, nil] the part matched by POSSIBLE_DATE_REGEXP, or nil
def possible_date?(col)
  col.slice(POSSIBLE_DATE_REGEXP)
end
|
522
|
+
|
523
|
+
# Names the first date/time format the value conforms to.
#
# A candidate is accepted when the value matches its FORMATS regexp and
# survives a strptime/strftime round trip (see date_format?). Candidates are
# tried in the order listed.
#
# @param col [String]
# @return [Symbol] the format name, or :string when nothing fits
def date_formats(col)
  candidates = [
    [:date_db,          Date, '%Y-%m-%d'],
    [:date_short,       Date, '%e %b'],
    [:date_rfc822,      Date, '%e %b %Y'],
    [:date_long,        Date, '%B %e, %Y'],
    [:dateTime_time,    Time, '%H:%M'],
    [:dateTime_hms,     Time, '%H:%M:%S'],
    [:dateTime_db,      Time, '%Y-%m-%d %H:%M:%S'],
    [:dateTime_iso8601, Time, '%Y-%m-%dT%H:%M:%SZ'],
    [:dateTime_short,   Time, '%d %b %H:%M'],
    [:dateTime_long,    Time, '%B %d, %Y %H:%M'],
  ]
  found = candidates.find do |name, klass, pattern|
    col[FORMATS[name]] && date_format?(klass, col, pattern)
  end
  found ? found.first : :string
end
|
548
|
+
|
549
|
+
# Whether the value round-trips through strptime/strftime with the given format.
#
# @param klass [Class] object responding to strptime (Date and Time are used by date_formats)
# @param value [String]
# @param format [String] strptime/strftime pattern
# @return [Boolean] false when parsing raises ArgumentError (invalid date)
def date_format?(klass, value, format)
  parsed = klass.strptime(value, format)
  parsed.strftime(format) == value
rescue ArgumentError # invalid date
  false
end
|
554
|
+
|
555
|
+
# Whether a line limit is configured and the current line is past it.
def line_limit_reached?
  return false unless @limit_lines.present?
  @current_line > @limit_lines
end
|
558
|
+
|
559
|
+
# Classifies the line ending by looking at the last two characters of the line.
#
# @param line [String]
# @return [String] "\r\n" when the first of the last (up to) two characters is "\r", otherwise "\n"
def get_line_break(line)
  # With fewer than two characters the only character is the one inspected.
  marker = line.length >= 2 ? line[-2] : line[-1]
  marker == "\r" ? "\r\n" : "\n"
end
|
567
|
+
|
568
|
+
# Regexps used to classify cell values, keyed by format name.
# :string has no pattern; it is the fallback classification.
FORMATS = {
  string: nil,
  numeric: /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
  uri: /\Ahttps?:/,

  # Dates
  date_db: /\A\d{4,}-\d\d-\d\d\z/,                                                     # "12345-01-01"
  date_long: /\A(?:#{Date::MONTHNAMES.join('|')}) [ \d]\d, \d{4,}\z/,                  # "January  1, 12345"
  date_rfc822: /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d{4,}\z/,            # " 1 Jan 12345"
  date_short: /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')})\z/,                    # " 1 Jan"

  # Dates with times, and bare times
  dateTime_db: /\A\d{4,}-\d\d-\d\d \d\d:\d\d:\d\d\z/,                                  # "12345-01-01 00:00:00"
  dateTime_hms: /\A\d\d:\d\d:\d\d\z/,                                                  # "00:00:00"
  dateTime_iso8601: /\A\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\dZ\z/,                            # "12345-01-01T00:00:00Z"
  dateTime_long: /\A(?:#{Date::MONTHNAMES.join('|')}) \d\d, \d{4,} \d\d:\d\d\z/,       # "January 01, 12345 00:00"
  dateTime_short: /\A\d\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d\d:\d\d\z/,         # "01 Jan 00:00"
  dateTime_time: /\A\d\d:\d\d\z/,                                                      # "00:00"
}.freeze
|
583
|
+
|
584
|
+
# Building blocks for parsing HTTP Link headers (used by validate_metadata
# through LINK_HEADER_REGEXP).
URI_REGEXP = /(?<uri>.*?)/
TOKEN_REGEXP = /([^\(\)\<\>@,;:\\"\/\[\]\?=\{\} \t]+)/
QUOTED_STRING_REGEXP = /("[^"]*")/
SGML_NAME_REGEXP = /([A-Za-z][-A-Za-z0-9\.]*)/

# Individual link parameters: rel, rev, title, anchor, and generic extensions.
RELATIONSHIP_REGEXP = Regexp.new("(?<relationship>#{SGML_NAME_REGEXP}|(\"#{SGML_NAME_REGEXP}(\\s+#{SGML_NAME_REGEXP})*\"))")
REL_REGEXP = Regexp.new("(?<rel>\\s*rel\\s*=\\s*(?<rel-relationship>#{RELATIONSHIP_REGEXP}))")
REV_REGEXP = Regexp.new("(?<rev>\\s*rev\\s*=\\s*#{RELATIONSHIP_REGEXP})")
TITLE_REGEXP = Regexp.new("(?<title>\\s*title\\s*=\\s*#{QUOTED_STRING_REGEXP})")
ANCHOR_REGEXP = Regexp.new("(?<anchor>\\s*anchor\\s*=\\s*\\<#{URI_REGEXP}\\>)")
LINK_EXTENSION_REGEXP = Regexp.new("(?<link-extension>(?<param>#{TOKEN_REGEXP})(\\s*=\\s*(?<param-value>#{TOKEN_REGEXP}|#{QUOTED_STRING_REGEXP}))?)")
LINK_PARAM_REGEXP = Regexp.new("(#{REL_REGEXP}|#{REV_REGEXP}|#{TITLE_REGEXP}|#{ANCHOR_REGEXP}|#{LINK_EXTENSION_REGEXP})")

# A complete Link header value: "<uri>" followed by any number of ";"-separated parameters.
LINK_HEADER_REGEXP = Regexp.new("\<#{URI_REGEXP}\>(\\s*;\\s*#{LINK_PARAM_REGEXP})*")

# Cheap pre-filter for values that might be dates (see possible_date?).
POSSIBLE_DATE_REGEXP = Regexp.new("\\A(\\d|\\s\\d#{Date::ABBR_MONTHNAMES.join('|')}#{Date::MONTHNAMES.join('|')})")
|
597
|
+
|
598
|
+
end
|
599
|
+
end
|