wjordan213-csvlint 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.coveralls.yml +1 -0
- data/.gitattributes +2 -0
- data/.gitignore +28 -0
- data/.ruby-version +1 -0
- data/.travis.yml +32 -0
- data/CHANGELOG.md +361 -0
- data/Gemfile +7 -0
- data/LICENSE.md +22 -0
- data/README.md +328 -0
- data/Rakefile +17 -0
- data/bin/create_schema +32 -0
- data/bin/csvlint +10 -0
- data/features/check_format.feature +46 -0
- data/features/cli.feature +210 -0
- data/features/csv_options.feature +35 -0
- data/features/csvupload.feature +145 -0
- data/features/csvw_schema_validation.feature +127 -0
- data/features/fixtures/cr-line-endings.csv +0 -0
- data/features/fixtures/crlf-line-endings.csv +0 -0
- data/features/fixtures/inconsistent-line-endings-unquoted.csv +0 -0
- data/features/fixtures/inconsistent-line-endings.csv +0 -0
- data/features/fixtures/invalid-byte-sequence.csv +0 -0
- data/features/fixtures/invalid_many_rows.csv +0 -0
- data/features/fixtures/lf-line-endings.csv +0 -0
- data/features/fixtures/spreadsheet.xls +0 -0
- data/features/fixtures/spreadsheet.xlsx +0 -0
- data/features/fixtures/title-row.csv +0 -0
- data/features/fixtures/valid.csv +0 -0
- data/features/fixtures/valid_many_rows.csv +0 -0
- data/features/fixtures/windows-line-endings.csv +0 -0
- data/features/information.feature +22 -0
- data/features/parse_csv.feature +90 -0
- data/features/schema_validation.feature +105 -0
- data/features/sources.feature +17 -0
- data/features/step_definitions/cli_steps.rb +11 -0
- data/features/step_definitions/csv_options_steps.rb +24 -0
- data/features/step_definitions/information_steps.rb +13 -0
- data/features/step_definitions/parse_csv_steps.rb +42 -0
- data/features/step_definitions/schema_validation_steps.rb +33 -0
- data/features/step_definitions/sources_steps.rb +7 -0
- data/features/step_definitions/validation_errors_steps.rb +90 -0
- data/features/step_definitions/validation_info_steps.rb +22 -0
- data/features/step_definitions/validation_warnings_steps.rb +60 -0
- data/features/support/aruba.rb +56 -0
- data/features/support/env.rb +26 -0
- data/features/support/load_tests.rb +114 -0
- data/features/support/webmock.rb +1 -0
- data/features/validation_errors.feature +147 -0
- data/features/validation_info.feature +16 -0
- data/features/validation_warnings.feature +86 -0
- data/lib/csvlint.rb +27 -0
- data/lib/csvlint/cli.rb +165 -0
- data/lib/csvlint/csvw/column.rb +359 -0
- data/lib/csvlint/csvw/date_format.rb +182 -0
- data/lib/csvlint/csvw/metadata_error.rb +13 -0
- data/lib/csvlint/csvw/number_format.rb +211 -0
- data/lib/csvlint/csvw/property_checker.rb +761 -0
- data/lib/csvlint/csvw/table.rb +204 -0
- data/lib/csvlint/csvw/table_group.rb +165 -0
- data/lib/csvlint/error_collector.rb +27 -0
- data/lib/csvlint/error_message.rb +15 -0
- data/lib/csvlint/field.rb +196 -0
- data/lib/csvlint/schema.rb +92 -0
- data/lib/csvlint/validate.rb +599 -0
- data/lib/csvlint/version.rb +3 -0
- data/spec/csvw/column_spec.rb +112 -0
- data/spec/csvw/date_format_spec.rb +49 -0
- data/spec/csvw/number_format_spec.rb +417 -0
- data/spec/csvw/table_group_spec.rb +143 -0
- data/spec/csvw/table_spec.rb +90 -0
- data/spec/field_spec.rb +252 -0
- data/spec/schema_spec.rb +211 -0
- data/spec/spec_helper.rb +17 -0
- data/spec/validator_spec.rb +619 -0
- data/wjordan213_csvlint.gemspec +46 -0
- metadata +490 -0
@@ -0,0 +1,92 @@
|
|
1
|
+
module Csvlint

  # A JSON Table Schema: an ordered list of fields that a CSV header and its
  # rows are checked against. Problems are recorded through the methods mixed
  # in from Csvlint::ErrorCollector (reset, build_warnings, valid?, ...).
  class Schema

    include Csvlint::ErrorCollector

    attr_reader :uri, :fields, :title, :description

    # uri         - where the schema came from (may be nil)
    # fields      - ordered field objects, one per expected column
    # title       - optional schema title
    # description - optional schema description
    def initialize(uri, fields=[], title=nil, description=nil)
      @uri = uri
      @fields = fields
      @title = title
      @description = description
      reset
    end

    class << self

      # Builds a Schema from parsed JSON Table Schema data. A missing
      # "fields" entry yields a schema with no fields.
      def from_json_table(uri, json)
        fields = (json["fields"] || []).map do |field_desc|
          Csvlint::Field.new(field_desc["name"], field_desc["constraints"],
                             field_desc["title"], field_desc["description"])
        end
        Schema.new(uri, fields, json["title"], json["description"])
      end

      # Delegates CSVW metadata documents to Csvlint::Csvw::TableGroup.
      def from_csvw_metadata(uri, json)
        Csvlint::Csvw::TableGroup.from_json(uri, json)
      end

      # Reads and parses the JSON document at +uri+. Documents carrying an
      # "@context" key are treated as CSVW metadata, everything else as a
      # JSON Table Schema.
      #
      # Csvlint::Csvw::MetadataError, OpenURI::HTTPError and Errno::ENOENT are
      # re-raised. Any other StandardError is optionally dumped to STDERR
      # (when +output_errors+ is exactly true) and a placeholder Schema titled
      # "malformed" is returned instead.
      def load_from_json(uri, output_errors = true)
        begin
          # Block form so the underlying file/HTTP handle is closed once read.
          # NOTE(review): Kernel#open interprets strings starting with "|" as
          # commands - confirm +uri+ never carries untrusted input.
          json = JSON.parse( open(uri) { |io| io.read } )
          if json["@context"]
            uri = "file:#{File.expand_path(uri)}" unless uri.to_s =~ /^http(s)?/
            return Schema.from_csvw_metadata(uri,json)
          else
            return Schema.from_json_table(uri,json)
          end
        rescue Csvlint::Csvw::MetadataError
          raise
        rescue OpenURI::HTTPError, Errno::ENOENT
          raise
        rescue => e
          if output_errors === true
            STDERR.puts e.class
            STDERR.puts e.message
            STDERR.puts e.backtrace
          end
          return Schema.new(nil, [], "malformed", "malformed")
        end
      end

    end

    # Compares the CSV header row with the schema's field names and records a
    # :malformed_header warning when they differ. Returns valid?.
    # +source_url+ is accepted but not used here.
    def validate_header(header, source_url=nil)
      reset

      found_header = header.to_csv(:row_sep => '')
      expected_header = @fields.map{ |f| f.name }.to_csv(:row_sep => '')
      if found_header != expected_header
        build_warnings(:malformed_header, :schema, 1, nil, found_header, "expectedHeader" => expected_header)
      end
      valid?
    end

    # Validates one data row: warns about missing or extra columns, then runs
    # every field's validate_column and collects the field's errors and
    # warnings. Values missing from a short row are validated as "".
    # Returns valid?. +source_url+ is accepted but not used here.
    def validate_row(values, row=nil, all_errors=[], source_url=nil)
      reset
      if values.length < fields.length
        fields[values.size..-1].each_with_index do |_field, i|
          build_warnings(:missing_column, :schema, row, values.size+i+1)
        end
      end
      if values.length > fields.length
        values[fields.size..-1].each_with_index do |_data_column, i|
          build_warnings(:extra_column, :schema, row, fields.size+i+1)
        end
      end

      fields.each_with_index do |field,i|
        value = values[i] || ""
        field.validate_column(value, row, i+1, all_errors)
        @errors += field.errors
        @warnings += field.warnings
      end

      valid?
    end

  end
end
|
@@ -0,0 +1,599 @@
|
|
1
|
+
module Csvlint
|
2
|
+
|
3
|
+
class Validator
|
4
|
+
# CSV subclass that memoizes a few helpers of the stdlib CSV implementation
# and switches off its converters.
class LineCSV < CSV
  # pattern string => compiled Regexp
  ENCODE_RE = Hash.new { |cache, pattern| cache[pattern] = Regexp.new(pattern) }

  # encoding name => { array of chunks => chunks encoded and concatenated }
  ENCODE_STR = Hash.new do |by_encoding, encoding_name|
    by_encoding[encoding_name] = Hash.new do |by_chunks, chunks|
      by_chunks[chunks] = chunks.map { |chunk| chunk.encode(encoding_name) }.join('')
    end
  end

  # chars regexp => { escape prefix => { string => escaped string } }
  ESCAPE_RE = Hash.new do |by_chars, re_chars|
    by_chars[re_chars] = Hash.new do |by_escape, re_esc|
      by_escape[re_esc] = Hash.new do |by_string, str|
        by_string[str] = str.gsub(re_chars) { |c| re_esc + c }
      end
    end
  end

  # Optimization: Memoize `encode_re`.
  # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2273
  def encode_re(*chunks)
    pattern = encode_str(*chunks)
    ENCODE_RE[pattern]
  end

  # Optimization: Memoize `encode_str`.
  # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2281
  def encode_str(*chunks)
    per_encoding = ENCODE_STR[@encoding.name]
    per_encoding[chunks]
  end

  # Optimization: Memoize `escape_re`.
  # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2265
  def escape_re(str)
    ESCAPE_RE[@re_chars][@re_esc][str]
  end

  # Optimization: Disable the CSV library's converters feature.
  # Both converter lists are left empty and the related keys are removed
  # from +options+.
  # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2100
  def init_converters(options, field_name = :converters)
    @converters = []
    @header_converters = []
    options.delete(:unconverted_fields)
    options.delete(field_name)
  end
end
|
50
|
+
|
51
|
+
include Csvlint::ErrorCollector
|
52
|
+
|
53
|
+
attr_reader :encoding, :content_type, :extension, :headers, :link_headers, :dialect, :csv_header, :schema, :data, :current_line
|
54
|
+
|
55
|
+
# Maps the leading text of a malformed-CSV exception message (as extracted
# by fetch_error) to the csvlint error type reported for it.
ERROR_MATCHERS = {
  "Missing or stray quote"                  => :stray_quote,
  "Illegal quoting"                         => :whitespace,
  "Unclosed quoted field"                   => :unclosed_quote,
  "Unquoted fields do not allow \\r or \\n" => :line_breaks
}
|
61
|
+
|
62
|
+
# Sets up validator state and immediately runs validation.
#
# source  - the CSV to validate; a String is fetched as a URL, anything
#           else is read with each_line (see #validate)
# dialect - hash of dialect options (see #set_dialect for the defaults)
# schema  - optional schema whose errors/warnings are carried over
# options - :lambda (called with self after each validated line) and
#           :limit_lines (stop reading once that many lines were handled)
def initialize(source, dialect = {}, schema = nil, options = {})
  reset

  # inputs
  @source  = source
  @dialect = dialect
  @schema  = schema
  @lambda      = options[:lambda]
  @limit_lines = options[:limit_lines]

  # parsing state
  @csv_header = true
  @headers    = {}
  @leading    = ""
  @formats    = []
  @expected_columns = 0
  @col_counts  = []
  @line_breaks = []
  @data = [] # it may be advisable to flush this on init?

  @extension = parse_extension(source) unless @source.nil?

  unless @schema.nil?
    @errors   += @schema.errors
    @warnings += @schema.warnings
  end

  validate
end
|
87
|
+
|
88
|
+
# Runs the whole validation: bails out with an :excel warning for
# spreadsheet extensions, otherwise locates a schema, resolves the dialect,
# reads the source (URL or stream) and finishes with the summary checks.
def validate
  # NOTE(review): the pattern is unanchored and the dot is unescaped, so it
  # matches more than ".xls"/".xlsx" - confirm whether that is intended.
  if @extension =~ /.xls(x)?/
    build_warnings(:excel, :context)
    return
  end

  locate_schema unless @schema.instance_of?(Csvlint::Schema)
  set_dialect

  if @source.class != String
    validate_metadata
    validate_stream
  else
    validate_url
  end

  finish
end
|
104
|
+
|
105
|
+
# Feeds every line of @source to parse_line (stopping at the line limit)
# and then validates whatever is still buffered in @leading.
def validate_stream
  @current_line = 1
  @source.each_line do |chunk|
    break if line_limit_reached?
    parse_line(chunk)
  end
  validate_line(@leading, @current_line) if @leading != ""
end
|
113
|
+
|
114
|
+
# Streams @source over HTTP with Typhoeus (following redirects). Response
# headers are recorded and run through validate_metadata; each body chunk
# is split into lines and handed to parse_line.
def validate_url
  @current_line = 1
  request = Typhoeus::Request.new(@source, followlocation: true)

  request.on_headers do |response|
    @headers = response.headers || {}
    @content_type =
      begin
        response.headers["content-type"]
      rescue StandardError
        nil
      end
    @response_code = response.code
    # `return` inside this block leaves validate_url itself on a 404.
    return build_errors(:not_found) if response.code == 404
    validate_metadata
  end

  request.on_body do |chunk|
    StringIO.new(chunk).each_line do |line|
      break if line_limit_reached?
      parse_line(line)
    end
  end

  request.run

  # Validate the last line too
  validate_line(@leading, @current_line) if @leading != ""
end
|
135
|
+
|
136
|
+
# Appends +line+ to the buffered text in @leading. The buffer is validated
# as one CSV line only when it ends with "\n" and contains an even number
# of quote characters (an odd count means the line break sits inside a
# quoted field); otherwise it is kept in @leading for the next call.
# An ArgumentError is reported once as :invalid_encoding.
def parse_line(line)
  buffered = @leading + line
  if buffered[-1, 1].include?("\n") && buffered.count(@dialect["quoteChar"]).even?
    validate_line(buffered, @current_line)
    @leading = ""
    @current_line += 1
  else
    # Incomplete line or open quote: prepend it to the next chunk.
    @leading = buffered
  end
rescue ArgumentError
  build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
  @current_line += 1
  @reported_invalid_encoding = true
end
|
157
|
+
|
158
|
+
# Validates a single complete CSV line: records its encoding, reports its
# line break, parses it and finally invokes the optional per-line lambda.
# An ArgumentError is reported once as :invalid_encoding.
def validate_line(input = nil, index = nil)
  @input = input
  line_number = index.present? ? index : 0
  @encoding = input.encoding.to_s
  report_line_breaks(line_number)
  parse_contents(input, line_number)
  @lambda.call(self) unless @lambda.nil?
rescue ArgumentError
  build_errors(:invalid_encoding, :structure, @current_line, nil, index) unless @reported_invalid_encoding
  @reported_invalid_encoding = true
end
|
170
|
+
|
171
|
+
# Parses one CSV line and builds the errors, warnings and info messages
# for it. The first line is treated as the header when @csv_header is set;
# every other line updates the format statistics and column counts and is
# validated against the schema or, without one, checked for ragged rows.
# The parsed row (nil when parsing failed) is always appended to @data.
def parse_contents(stream, line = nil)
  current_line = line.present? ? line : 1
  all_errors = []

  @csv_options[:encoding] = @encoding

  row = nil
  begin
    row = LineCSV.parse_line(stream, @csv_options)
  rescue LineCSV::MalformedCSVError => malformed
    build_exception_messages(malformed, stream, current_line)
  end

  @data << row
  return unless row

  if current_line <= 1 && @csv_header
    # this conditional should be refactored somewhere
    header = row.reject { |col| col.nil? || col.empty? }
    validate_header(header)
    @col_counts << header.size
  else
    build_formats(row)
    filled = row.reject { |col| col.nil? || col.empty? }.size
    @col_counts << filled
    @expected_columns = row.size if @expected_columns == 0
    build_errors(:blank_rows, :structure, current_line, nil, stream.to_s) if filled == 0
    # Builds errors and warnings related to the provided schema file
    if @schema
      @schema.validate_row(row, current_line, all_errors, @source)
      @errors += @schema.errors
      all_errors += @schema.errors
      @warnings += @schema.warnings
    else
      build_errors(:ragged_rows, :structure, current_line, nil, stream.to_s) if !row.empty? && row.size != @expected_columns
    end
  end
end
|
209
|
+
|
210
|
+
# Summary checks run after all lines were read: a :title_row warning when
# the first row has fewer non-empty cells than the average row, a
# :check_options warning when only one column was detected, followed by the
# consistency, foreign-key, line-break and encoding checks.
def finish
  total_cells = @col_counts.inject(:+)
  if total_cells
    average = total_cells / @col_counts.size.to_f
    build_warnings(:title_row, :structure) if @col_counts.first < average
  end

  # return expected_columns to calling class
  build_warnings(:check_options, :structure) if @expected_columns == 1

  check_consistency
  check_foreign_keys
  check_mixed_linebreaks
  validate_encoding
end
|
222
|
+
|
223
|
+
# Inspects the HTTP response headers gathered for the source:
#   * derives @csv_header from the content-type ("header=present|absent"),
#   * reports a missing or non-CSV content type,
#   * emits the :assumed_header info message when the header was assumed,
#   * follows `Link: <...>; rel="describedby"` headers pointing at JSON
#     metadata and warns (:schema_mismatch) when the linked CSVW metadata
#     does not describe this source.
#
# Link header entries that do not match LINK_HEADER_REGEXP are skipped
# instead of raising NoMethodError on the nil match.
def validate_metadata
  assumed_header = !@supplied_dialect
  unless @headers.empty?
    if @headers["content-type"] =~ /text\/csv/
      @csv_header = @csv_header && true
      assumed_header = @assumed_header.present?
    end
    if @headers["content-type"] =~ /header=(present|absent)/
      @csv_header = true if $1 == "present"
      @csv_header = false if $1 == "absent"
      assumed_header = false
    end
    build_warnings(:no_content_type, :context) if @content_type == nil
    build_errors(:wrong_content_type, :context) unless (@content_type && @content_type =~ /text\/csv/)
  end
  @header_processed = true
  build_info_messages(:assumed_header, :structure) if assumed_header

  @link_headers = @headers["link"].split(",") rescue nil
  @link_headers.each do |link_header|
    match = LINK_HEADER_REGEXP.match(link_header)
    next if match.nil? # malformed entry: nothing to inspect

    uri = match["uri"].gsub(/(^\<|\>$)/, "") rescue nil
    rel = match["rel-relationship"].gsub(/(^\"|\"$)/, "") rescue nil
    param = match["param"]
    param_value = match["param-value"].gsub(/(^\"|\"$)/, "") rescue nil
    if rel == "describedby" && param == "type" && ["application/csvm+json", "application/ld+json", "application/json"].include?(param_value)
      begin
        url = URI.join(@source_url, uri)
        schema = Schema.load_from_json(url)
        # NOTE(review): a linked schema that DOES describe this source is
        # loaded but never assigned to @schema (the original only stored it
        # in an unused local) - confirm whether it should be applied.
        if schema.instance_of?(Csvlint::Csvw::TableGroup) && !schema.tables[@source_url]
          build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
        end
      rescue OpenURI::HTTPError
        # Linked metadata is optional; an HTTP failure is ignored.
      end
    end
  end if @link_headers
end
|
265
|
+
|
266
|
+
# Whether the CSV is treated as having a header row: @csv_header when that
# is falsy, otherwise the dialect's "header" setting.
def header?
  return @csv_header unless @csv_header
  @dialect["header"]
end
|
269
|
+
|
270
|
+
# Records the line break of the current @input and, the first time a break
# other than "\r\n" is seen, emits a :nonrfc_line_breaks info message.
def report_line_breaks(line_no=nil)
  # No trailing newline - i.e. we're on the last line: nothing to record.
  return unless @input[-1, 1].include?("\n")

  line_break = get_line_break(@input)
  @line_breaks << line_break
  return if line_breaks_reported? || line_break == "\r\n"

  build_info_messages(:nonrfc_line_breaks, :structure, line_no)
  @line_breaks_reported = true
end
|
281
|
+
|
282
|
+
# True once the :nonrfc_line_breaks message has been emitted.
def line_breaks_reported?
  @line_breaks_reported == true
end
|
285
|
+
|
286
|
+
# Resolves the effective dialect: built-in defaults, overridden by the
# dialect of the schema table for this source (when one can be read),
# overridden by the dialect supplied by the caller. Also remembers whether
# a header setting / any dialect was supplied and derives @csv_options.
def set_dialect
  @assumed_header = @dialect["header"].nil?
  @supplied_dialect = @dialect != {}

  schema_dialect =
    begin
      @schema.tables[@source_url].dialect || {}
    rescue
      # No schema, or a schema without a table/dialect for this source.
      {}
    end

  defaults = {
    "header" => true,
    "delimiter" => ",",
    "skipInitialSpace" => true,
    "lineTerminator" => :auto,
    "quoteChar" => '"',
    "trim" => :true
  }
  @dialect = defaults.merge(schema_dialect).merge(@dialect || {})

  @csv_header = @csv_header && @dialect["header"]
  @csv_options = dialect_to_csv_options(@dialect)
end
|
307
|
+
|
308
|
+
# Warns when the content-type header carries no charset (:no_encoding) or
# a charset other than UTF-8 (:encoding), and when the detected encoding
# of the data itself is not "UTF-8" (:encoding).
def validate_encoding
  content_type = @headers["content-type"]
  if content_type
    if content_type !~ /charset=/
      build_warnings(:no_encoding, :context)
    elsif content_type !~ /charset=utf-8/i
      build_warnings(:encoding, :context)
    end
  end
  build_warnings(:encoding, :context) unless @encoding == "UTF-8"
end
|
318
|
+
|
319
|
+
# Reports a line-break error when more than one kind of line break was seen.
def check_mixed_linebreaks
  distinct = @line_breaks.uniq.count
  build_linebreak_error if distinct > 1
end
|
322
|
+
|
323
|
+
# The line break used by the file: :mixed when several kinds were seen,
# otherwise the single kind recorded (nil when none was recorded).
def line_breaks
  kinds = @line_breaks.uniq
  kinds.count > 1 ? :mixed : kinds.first
end
|
330
|
+
|
331
|
+
# Number of entries collected in #data (one per parsed line).
def row_count
  rows = data
  rows.count
end
|
334
|
+
|
335
|
+
# Translates a malformed-CSV exception into a csvlint error. Quote errors
# on input that does not contain the configured (non-Symbol) row separator
# are reported as a line-break problem instead.
def build_exception_messages(csvException, errChars, lineNo)
  #TODO 1 - this is a change in logic, rather than straight refactor of previous error building, however original logic is bonkers
  #TODO 2 - using .kind_of? is a very ugly fix here and it meant to work around instances where :auto symbol is preserved in @csv_options
  type = fetch_error(csvException)
  row_sep = @csv_options[:row_sep]
  quote_problem = [:unclosed_quote, :stray_quote].include?(type)

  if !row_sep.kind_of?(Symbol) && quote_problem && !@input.match(row_sep)
    build_linebreak_error
  else
    build_errors(type, :structure, lineNo, nil, errChars)
  end
end
|
345
|
+
|
346
|
+
# Records a :line_breaks error unless one is already present in @errors.
def build_linebreak_error
  return if @errors.any? { |e| e.type == :line_breaks }
  build_errors(:line_breaks, :structure)
end
|
349
|
+
|
350
|
+
# Checks the header row: strips the names in place when the dialect's
# "trim" is :true, warns about empty and duplicate column names, and
# delegates to the schema's header validation when a schema is present.
# Returns valid?.
def validate_header(header)
  header.each { |h| h.strip! } if @dialect["trim"] == :true

  seen = Set.new
  header.each_with_index do |name, i|
    column = i + 1
    build_warnings(:empty_column_name, :schema, nil, column) if name == ""
    # Set#add? returns nil when the name was already present.
    build_warnings(:duplicate_column_name, :schema, nil, column) unless seen.add?(name)
  end

  if @schema
    @schema.validate_header(header, @source)
    @errors += @schema.errors
    @warnings += @schema.warnings
  end

  valid?
end
|
368
|
+
|
369
|
+
# Maps a malformed-CSV exception to an error type by stripping the trailing
# "(in|on) line N" part from its message and looking the remainder up in
# ERROR_MATCHERS. Unrecognised messages map to :unknown_error.
def fetch_error(error)
  parsed = error.message.match(/^(.+?)(?: [io]n)? \(?line \d+\)?\.?$/i)
  message = parsed ? parsed[1] : nil
  ERROR_MATCHERS.fetch(message, :unknown_error)
end
|
374
|
+
|
375
|
+
# Converts a dialect hash into the options hash handed to the CSV parser.
def dialect_to_csv_options(dialect)
  # NOTE(review): `|| true` makes this truthy even for an explicit
  # "skipInitialSpace" => false, so the delimiter adjustment below never
  # runs. Behaviour is kept as-is; confirm the intent before changing it.
  skipinitialspace = dialect["skipInitialSpace"] || true
  delimiter = dialect["delimiter"]
  delimiter = delimiter + " " unless skipinitialspace

  {
    :col_sep     => delimiter,
    :row_sep     => dialect["lineTerminator"],
    :quote_char  => dialect["quoteChar"],
    :skip_blanks => false
  }
end
|
386
|
+
|
387
|
+
# Classifies every non-empty cell of +row+ (numeric, uri, one of the date
# formats, or string) and increments the per-column counter in @formats.
def build_formats(row)
  row.each_with_index do |col, i|
    next if col.nil? || col.empty?

    counts = (@formats[i] ||= Hash.new(0))

    format = if col.strip[FORMATS[:numeric]]
               :numeric
             elsif uri?(col)
               :uri
             elsif possible_date?(col)
               date_formats(col)
             else
               :string
             end

    counts[format] += 1
  end
end
|
406
|
+
|
407
|
+
# Warns (:inconsistent_values) for every column in which no single detected
# format accounts for at least 90% of the classified cells.
def check_consistency
  @formats.each_with_index do |format, i|
    next unless format

    total = format.values.reduce(:+).to_f
    dominant = format.any? { |_, count| count / total >= 0.9 }
    build_warnings(:inconsistent_values, :schema, nil, i + 1) unless dominant
  end
end
|
417
|
+
|
418
|
+
# When the schema is a CSVW table group, validates its foreign keys and
# collects the resulting errors and warnings.
def check_foreign_keys
  return unless @schema.instance_of? Csvlint::Csvw::TableGroup

  @schema.validate_foreign_keys
  @errors += @schema.errors
  @warnings += @schema.warnings
end
|
425
|
+
|
426
|
+
# Works out @source_url and tries to discover CSVW metadata for it.
#
# * StringIO sources have no location: nothing is done.
# * A supplied schema is kept when it has a table for @source_url and
#   dropped otherwise.
# * The default metadata locations ("<url>-metadata.json" and
#   "csv-metadata.json" next to the source) are then tried in order. The
#   first table group that describes @source_url becomes @schema and the
#   search stops. Table groups that do not describe the source produce a
#   :schema_mismatch warning.
#
# When nothing matches, @schema is nil on return. (Previously a matching
# schema was assigned and then unconditionally reset to nil.)
def locate_schema

  @source_url = nil
  warn_if_unsuccessful = false
  case @source
  when StringIO
    return
  when File
    @source_url = "file:#{File.expand_path(@source)}"
  else
    @source_url = @source
  end
  unless @schema.nil?
    if @schema.tables[@source_url]
      return
    else
      @schema = nil
    end
  end

  paths = []
  if @source_url =~ /^http(s)?/
    begin
      well_known_uri = URI.join(@source_url, "/.well-known/csvm")
      # The response is fetched but not interpreted yet; block form closes
      # the handle.
      open(well_known_uri) { |io| io.read }
      # TODO
    rescue OpenURI::HTTPError, URI::BadURIError
    end
  end
  paths = ["{+url}-metadata.json", "csv-metadata.json"] if paths.empty?
  paths.each do |template|
    begin
      template = URITemplate.new(template)
      path = template.expand('url' => @source_url)
      url = URI.join(@source_url, path)
      url = File.new(url.to_s.sub(/^file:/, "")) if url.to_s =~ /^file:/
      schema = Schema.load_from_json(url)
      if schema.instance_of? Csvlint::Csvw::TableGroup
        if schema.tables[@source_url]
          @schema = schema
          return # found: keep it instead of falling through to the reset below
        else
          warn_if_unsuccessful = true
          build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
        end
      end
    rescue Errno::ENOENT
      # metadata file does not exist at this location: try the next one
    rescue OpenURI::HTTPError, URI::BadURIError, ArgumentError
      # unreachable or unusable location: try the next one
    rescue => e
      STDERR.puts e.class
      STDERR.puts e.message
      STDERR.puts e.backtrace
      raise e
    end
  end
  # `schema` here resolves to the attr_reader (the block-local of the same
  # name is out of scope).
  build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema) if warn_if_unsuccessful
  @schema = nil
end
|
485
|
+
|
486
|
+
private
|
487
|
+
|
488
|
+
# Returns the file extension of +source+: taken from the path for File
# objects, "" for other IO, StringIO and Tempfile objects (the Tempfile
# case is triggered when the revalidate dialect use case happens), and
# from the URI path for everything else ("" when the URI is invalid).
def parse_extension(source)
  case source
  when File
    File.extname(source.path)
  when IO, StringIO, Tempfile
    ""
  else
    begin
      File.extname(URI.parse(source).path)
    rescue URI::InvalidURIError
      ""
    end
  end
end
|
509
|
+
|
510
|
+
# Truthy when +value+ looks like an http(s) URL and parses as one.
# Returns nil when the value does not start with "http:"/"https:" and
# false when it cannot be parsed as a URI.
def uri?(value)
  return unless value.strip[FORMATS[:uri]]

  parsed = URI.parse(value)
  [URI::HTTP, URI::HTTPS].any? { |klass| parsed.kind_of?(klass) }
rescue URI::InvalidURIError
  false
end
|
518
|
+
|
519
|
+
# Cheap pre-check before the date format tests: returns the part of +col+
# matched by POSSIBLE_DATE_REGEXP, or nil when it does not match.
def possible_date?(col)
  col.slice(POSSIBLE_DATE_REGEXP)
end
|
522
|
+
|
523
|
+
# Returns the name of the first date/time format that +col+ both matches
# (FORMATS pattern) and round-trips through strptime/strftime; :string
# when none applies. The formats are tried in the order listed.
def date_formats(col)
  candidates = [
    [:date_db,          Date, '%Y-%m-%d'],
    [:date_short,       Date, '%e %b'],
    [:date_rfc822,      Date, '%e %b %Y'],
    [:date_long,        Date, '%B %e, %Y'],
    [:dateTime_time,    Time, '%H:%M'],
    [:dateTime_hms,     Time, '%H:%M:%S'],
    [:dateTime_db,      Time, '%Y-%m-%d %H:%M:%S'],
    [:dateTime_iso8601, Time, '%Y-%m-%dT%H:%M:%SZ'],
    [:dateTime_short,   Time, '%d %b %H:%M'],
    [:dateTime_long,    Time, '%B %d, %Y %H:%M']
  ]

  found = candidates.find do |name, klass, pattern|
    col[FORMATS[name]] && date_format?(klass, col, pattern)
  end

  found ? found.first : :string
end
|
548
|
+
|
549
|
+
# True when +value+ parses with +format+ and formats back to exactly the
# same text; false when it differs or when parsing raises ArgumentError.
def date_format?(klass, value, format)
  parsed = klass.strptime(value, format)
  parsed.strftime(format) == value
rescue ArgumentError # invalid date
  false
end
|
554
|
+
|
555
|
+
# True when a line limit is configured and the current line number has
# gone past it; false when no limit is present.
def line_limit_reached?
  return false unless @limit_lines.present?
  @current_line > @limit_lines
end
|
558
|
+
|
559
|
+
# Returns "\r\n" when the second-to-last character of +line+ is a carriage
# return, otherwise "\n". Callers pass lines that end with "\n".
def get_line_break(line)
  tail = line.chars.last(2)
  tail.first == "\r" ? "\r\n" : "\n"
end
|
567
|
+
|
568
|
+
FORMATS = {
|
569
|
+
:string => nil,
|
570
|
+
:numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
|
571
|
+
:uri => /\Ahttps?:/,
|
572
|
+
:date_db => /\A\d{4,}-\d\d-\d\d\z/, # "12345-01-01"
|
573
|
+
:date_long => /\A(?:#{Date::MONTHNAMES.join('|')}) [ \d]\d, \d{4,}\z/, # "January 1, 12345"
|
574
|
+
:date_rfc822 => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d{4,}\z/, # " 1 Jan 12345"
|
575
|
+
:date_short => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')})\z/, # "1 Jan"
|
576
|
+
:dateTime_db => /\A\d{4,}-\d\d-\d\d \d\d:\d\d:\d\d\z/, # "12345-01-01 00:00:00"
|
577
|
+
:dateTime_hms => /\A\d\d:\d\d:\d\d\z/, # "00:00:00"
|
578
|
+
:dateTime_iso8601 => /\A\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\dZ\z/, # "12345-01-01T00:00:00Z"
|
579
|
+
:dateTime_long => /\A(?:#{Date::MONTHNAMES.join('|')}) \d\d, \d{4,} \d\d:\d\d\z/, # "January 01, 12345 00:00"
|
580
|
+
:dateTime_short => /\A\d\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d\d:\d\d\z/, # "01 Jan 00:00"
|
581
|
+
:dateTime_time => /\A\d\d:\d\d\z/, # "00:00"
|
582
|
+
}.freeze
|
583
|
+
|
584
|
+
# Building blocks for parsing HTTP Link headers.
URI_REGEXP = /(?<uri>.*?)/
TOKEN_REGEXP = /([^\(\)\<\>@,;:\\"\/\[\]\?=\{\} \t]+)/
QUOTED_STRING_REGEXP = /("[^"]*")/
SGML_NAME_REGEXP = /([A-Za-z][-A-Za-z0-9\.]*)/

# rel / rev values: a single name or a quoted, whitespace-separated list.
RELATIONSHIP_REGEXP = Regexp.new("(?<relationship>#{SGML_NAME_REGEXP}|(\"#{SGML_NAME_REGEXP}(\\s+#{SGML_NAME_REGEXP})*\"))")

# Individual link parameters.
REL_REGEXP = Regexp.new("(?<rel>\\s*rel\\s*=\\s*(?<rel-relationship>#{RELATIONSHIP_REGEXP}))")
REV_REGEXP = Regexp.new("(?<rev>\\s*rev\\s*=\\s*#{RELATIONSHIP_REGEXP})")
TITLE_REGEXP = Regexp.new("(?<title>\\s*title\\s*=\\s*#{QUOTED_STRING_REGEXP})")
ANCHOR_REGEXP = Regexp.new("(?<anchor>\\s*anchor\\s*=\\s*\\<#{URI_REGEXP}\\>)")
LINK_EXTENSION_REGEXP = Regexp.new("(?<link-extension>(?<param>#{TOKEN_REGEXP})(\\s*=\\s*(?<param-value>#{TOKEN_REGEXP}|#{QUOTED_STRING_REGEXP}))?)")
LINK_PARAM_REGEXP = Regexp.new("(#{REL_REGEXP}|#{REV_REGEXP}|#{TITLE_REGEXP}|#{ANCHOR_REGEXP}|#{LINK_EXTENSION_REGEXP})")

# A complete link value: "<uri>" followed by any number of ";"-separated parameters.
LINK_HEADER_REGEXP = Regexp.new("\<#{URI_REGEXP}\>(\\s*;\\s*#{LINK_PARAM_REGEXP})*")

# Cheap pre-filter for date detection: a value starting with a digit, with
# whitespace plus a digit, or with a month name / abbreviation.
POSSIBLE_DATE_REGEXP = Regexp.new("\\A(\\d|\\s\\d#{Date::ABBR_MONTHNAMES.join('|')}#{Date::MONTHNAMES.join('|')})")
|
597
|
+
|
598
|
+
end
|
599
|
+
end
|