wjordan213-csvlint 0.2.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (77) hide show
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +1 -0
  3. data/.gitattributes +2 -0
  4. data/.gitignore +28 -0
  5. data/.ruby-version +1 -0
  6. data/.travis.yml +32 -0
  7. data/CHANGELOG.md +361 -0
  8. data/Gemfile +7 -0
  9. data/LICENSE.md +22 -0
  10. data/README.md +328 -0
  11. data/Rakefile +17 -0
  12. data/bin/create_schema +32 -0
  13. data/bin/csvlint +10 -0
  14. data/features/check_format.feature +46 -0
  15. data/features/cli.feature +210 -0
  16. data/features/csv_options.feature +35 -0
  17. data/features/csvupload.feature +145 -0
  18. data/features/csvw_schema_validation.feature +127 -0
  19. data/features/fixtures/cr-line-endings.csv +0 -0
  20. data/features/fixtures/crlf-line-endings.csv +0 -0
  21. data/features/fixtures/inconsistent-line-endings-unquoted.csv +0 -0
  22. data/features/fixtures/inconsistent-line-endings.csv +0 -0
  23. data/features/fixtures/invalid-byte-sequence.csv +0 -0
  24. data/features/fixtures/invalid_many_rows.csv +0 -0
  25. data/features/fixtures/lf-line-endings.csv +0 -0
  26. data/features/fixtures/spreadsheet.xls +0 -0
  27. data/features/fixtures/spreadsheet.xlsx +0 -0
  28. data/features/fixtures/title-row.csv +0 -0
  29. data/features/fixtures/valid.csv +0 -0
  30. data/features/fixtures/valid_many_rows.csv +0 -0
  31. data/features/fixtures/windows-line-endings.csv +0 -0
  32. data/features/information.feature +22 -0
  33. data/features/parse_csv.feature +90 -0
  34. data/features/schema_validation.feature +105 -0
  35. data/features/sources.feature +17 -0
  36. data/features/step_definitions/cli_steps.rb +11 -0
  37. data/features/step_definitions/csv_options_steps.rb +24 -0
  38. data/features/step_definitions/information_steps.rb +13 -0
  39. data/features/step_definitions/parse_csv_steps.rb +42 -0
  40. data/features/step_definitions/schema_validation_steps.rb +33 -0
  41. data/features/step_definitions/sources_steps.rb +7 -0
  42. data/features/step_definitions/validation_errors_steps.rb +90 -0
  43. data/features/step_definitions/validation_info_steps.rb +22 -0
  44. data/features/step_definitions/validation_warnings_steps.rb +60 -0
  45. data/features/support/aruba.rb +56 -0
  46. data/features/support/env.rb +26 -0
  47. data/features/support/load_tests.rb +114 -0
  48. data/features/support/webmock.rb +1 -0
  49. data/features/validation_errors.feature +147 -0
  50. data/features/validation_info.feature +16 -0
  51. data/features/validation_warnings.feature +86 -0
  52. data/lib/csvlint.rb +27 -0
  53. data/lib/csvlint/cli.rb +165 -0
  54. data/lib/csvlint/csvw/column.rb +359 -0
  55. data/lib/csvlint/csvw/date_format.rb +182 -0
  56. data/lib/csvlint/csvw/metadata_error.rb +13 -0
  57. data/lib/csvlint/csvw/number_format.rb +211 -0
  58. data/lib/csvlint/csvw/property_checker.rb +761 -0
  59. data/lib/csvlint/csvw/table.rb +204 -0
  60. data/lib/csvlint/csvw/table_group.rb +165 -0
  61. data/lib/csvlint/error_collector.rb +27 -0
  62. data/lib/csvlint/error_message.rb +15 -0
  63. data/lib/csvlint/field.rb +196 -0
  64. data/lib/csvlint/schema.rb +92 -0
  65. data/lib/csvlint/validate.rb +599 -0
  66. data/lib/csvlint/version.rb +3 -0
  67. data/spec/csvw/column_spec.rb +112 -0
  68. data/spec/csvw/date_format_spec.rb +49 -0
  69. data/spec/csvw/number_format_spec.rb +417 -0
  70. data/spec/csvw/table_group_spec.rb +143 -0
  71. data/spec/csvw/table_spec.rb +90 -0
  72. data/spec/field_spec.rb +252 -0
  73. data/spec/schema_spec.rb +211 -0
  74. data/spec/spec_helper.rb +17 -0
  75. data/spec/validator_spec.rb +619 -0
  76. data/wjordan213_csvlint.gemspec +46 -0
  77. metadata +490 -0
@@ -0,0 +1,92 @@
1
+ module Csvlint
2
+
3
+ class Schema
4
+
5
+ include Csvlint::ErrorCollector
6
+
7
+ attr_reader :uri, :fields, :title, :description
8
+
9
# Creates a schema description.
#
# @param uri [Object] identifier/location of the schema document
# @param fields [Array] field definitions (used positionally by validate_row)
# @param title [String, nil] optional schema title
# @param description [String, nil] optional schema description
def initialize(uri, fields=[], title=nil, description=nil)
  @uri, @fields = uri, fields
  @title, @description = title, description
  # `reset` is not defined in this class; presumably supplied by the
  # included Csvlint::ErrorCollector - confirm.
  reset
end
16
+
17
+ class << self
18
+
19
# Builds a Schema from a parsed JSON Table Schema document.
#
# @param uri [Object] where the document came from
# @param json [Hash] parsed JSON; "fields", "title" and "description" are read
# @return [Schema]
def from_json_table(uri, json)
  # A missing "fields" key simply yields a schema without fields.
  fields = (json["fields"] || []).map do |field_desc|
    Csvlint::Field.new(
      field_desc["name"],
      field_desc["constraints"],
      field_desc["title"],
      field_desc["description"]
    )
  end
  Schema.new(uri, fields, json["title"], json["description"])
end
27
+
28
# Delegates CSVW metadata documents to Csvlint::Csvw::TableGroup.
#
# @param uri [Object] location of the metadata document
# @param json [Hash] parsed metadata
# @return [Object] whatever TableGroup.from_json returns
def from_csvw_metadata(uri, json)
  Csvlint::Csvw::TableGroup.from_json(uri, json)
end
31
+
32
# Loads a schema document and dispatches on its flavour: documents with an
# "@context" key are handed to from_csvw_metadata, everything else to
# from_json_table.
#
# @param uri [Object] anything `open` accepts (path, File, URI)
# @param output_errors [Boolean] when exactly `true`, unexpected failures are
#   written to STDERR before the "malformed" placeholder is returned
# @return [Object] the loaded schema, or a placeholder Schema titled
#   "malformed" when the document could not be read or parsed
# @raise [Csvlint::Csvw::MetadataError, OpenURI::HTTPError, Errno::ENOENT]
#   these are deliberately passed through to the caller
def load_from_json(uri, output_errors = true)
  # Block form so the handle opened here is closed as soon as it is read.
  json = JSON.parse(open(uri) { |io| io.read })
  if json["@context"]
    # Non-HTTP sources are normalised to an absolute file: URI.
    uri = "file:#{File.expand_path(uri)}" unless uri.to_s =~ /^http(s)?/
    Schema.from_csvw_metadata(uri, json)
  else
    Schema.from_json_table(uri, json)
  end
rescue Csvlint::Csvw::MetadataError, OpenURI::HTTPError, Errno::ENOENT
  raise
rescue => e
  if output_errors === true
    STDERR.puts e.class
    STDERR.puts e.message
    STDERR.puts e.backtrace
  end
  Schema.new(nil, [], "malformed", "malformed")
end
54
+
55
+ end
56
+
57
# Compares the header row found in the data with the field names declared
# by this schema and records a :malformed_header warning on mismatch.
#
# @param header [Array<String>] header cells as read from the CSV
# @param source_url [Object, nil] accepted but not used here
# @return [Object] the result of `valid?`
def validate_header(header, source_url=nil)
  reset

  # Both sides are serialised to a single CSV line so they compare as text.
  found_header = header.to_csv(:row_sep => '')
  expected_header = @fields.map(&:name).to_csv(:row_sep => '')
  unless found_header == expected_header
    build_warnings(:malformed_header, :schema, 1, nil, found_header, "expectedHeader" => expected_header)
  end
  valid?
end
67
+
68
# Validates one data row against the schema's fields.
#
# Warns once per missing or surplus column, then lets every field validate
# its own cell and folds the field's errors and warnings into this schema's.
#
# @param values [Array] cells of the row
# @param row [Integer, nil] row number passed on to warnings and fields
# @param all_errors [Array] forwarded untouched to each field
# @param source_url [Object, nil] accepted but not used here
# @return [Object] the result of `valid?`
def validate_row(values, row=nil, all_errors=[], source_url=nil)
  reset
  value_count = values.length
  field_count = fields.length

  if value_count < field_count
    (value_count...field_count).each { |idx| build_warnings(:missing_column, :schema, row, idx + 1) }
  end
  if value_count > field_count
    (field_count...value_count).each { |idx| build_warnings(:extra_column, :schema, row, idx + 1) }
  end

  fields.each_with_index do |field, idx|
    # Missing cells are validated as the empty string.
    field.validate_column(values[idx] || "", row, idx + 1, all_errors)
    @errors += field.errors
    @warnings += field.warnings
  end

  valid?
end
90
+
91
+ end
92
+ end
@@ -0,0 +1,599 @@
1
+ module Csvlint
2
+
3
+ class Validator
4
# CSV subclass that memoizes the encoding/escaping helpers and switches the
# converters feature off.
class LineCSV < CSV
  # Compiled regexps keyed by their source string.
  ENCODE_RE = Hash.new { |cache, source| cache[source] = Regexp.new(source) }

  # encoding name => list of chunks => chunks transcoded and concatenated.
  ENCODE_STR = Hash.new do |by_encoding, encoding_name|
    by_encoding[encoding_name] = Hash.new do |by_chunks, chunks|
      by_chunks[chunks] = chunks.map { |chunk| chunk.encode(encoding_name) }.join('')
    end
  end

  # characters-to-escape => escape prefix => input string => escaped string.
  ESCAPE_RE = Hash.new do |by_chars, re_chars|
    by_chars[re_chars] = Hash.new do |by_esc, re_esc|
      by_esc[re_esc] = Hash.new do |by_str, str|
        by_str[str] = str.gsub(re_chars) { |c| re_esc + c }
      end
    end
  end

  # Optimization: Memoize `encode_re`.
  # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2273
  def encode_re(*chunks)
    ENCODE_RE[encode_str(*chunks)]
  end

  # Optimization: Memoize `encode_str`.
  # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2281
  def encode_str(*chunks)
    ENCODE_STR[@encoding.name][chunks]
  end

  # Optimization: Memoize `escape_re`.
  # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2265
  def escape_re(str)
    ESCAPE_RE[@re_chars][@re_esc][str]
  end

  # Optimization: Disable the CSV library's converters feature.
  # Both converter lists are left empty and the related options are dropped.
  # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2100
  def init_converters(options, field_name = :converters)
    @converters = []
    @header_converters = []
    options.delete(:unconverted_fields)
    options.delete(field_name)
  end
end
50
+
51
+ include Csvlint::ErrorCollector
52
+
53
+ attr_reader :encoding, :content_type, :extension, :headers, :link_headers, :dialect, :csv_header, :schema, :data, :current_line
54
+
55
# Maps the leading text of a CSV parser error message to the error type
# reported by the validator (looked up in fetch_error). Frozen so the shared
# table cannot be mutated at runtime, matching FORMATS.
ERROR_MATCHERS = {
  "Missing or stray quote" => :stray_quote,
  "Illegal quoting" => :whitespace,
  "Unclosed quoted field" => :unclosed_quote,
  "Unquoted fields do not allow \\r or \\n" => :line_breaks,
}.freeze
61
+
62
# Sets up validator state and immediately runs the validation.
#
# @param source [Object] the CSV to validate (the rest of the class handles
#   String, File and StringIO sources)
# @param dialect [Hash] CSV dialect overrides
# @param schema [Object, nil] optional schema to validate against
# @param options [Hash] reads :lambda (called after each validated line)
#   and :limit_lines
def initialize(source, dialect = {}, schema = nil, options = {})
  reset
  @source  = source
  @schema  = schema
  @dialect = dialect
  @lambda      = options[:lambda]
  @limit_lines = options[:limit_lines]

  @formats = []
  @col_counts = []
  @line_breaks = []
  @headers = {}
  @csv_header = true
  @leading = ""
  @expected_columns = 0
  @extension = parse_extension(source) unless @source.nil?

  # Carry over anything the schema already reported.
  unless @schema.nil?
    @errors += @schema.errors
    @warnings += @schema.warnings
  end

  @data = [] # it may be advisable to flush this on init?

  validate
end
87
+
88
# Runs the whole validation: rejects Excel files, resolves schema and
# dialect, validates the source and finishes with the aggregate checks.
def validate
  # Anchored, with the dot escaped: the previous /.xls(x)?/ matched any
  # character followed by "xls" anywhere in the extension (e.g. ".exls").
  if @extension =~ /\A\.xls/
    build_warnings(:excel, :context)
    return
  end
  locate_schema unless @schema.instance_of?(Csvlint::Schema)
  set_dialect

  if @source.class == String
    # Strings are fetched and validated by validate_url.
    validate_url
  else
    validate_metadata
    validate_stream
  end
  finish
end
104
+
105
# Feeds the source to parse_line one line at a time, stopping once the line
# limit is reached, then validates whatever is still buffered in @leading.
def validate_stream
  @current_line = 1
  lines = @source.each_line
  lines.each do |raw_line|
    break if line_limit_reached?
    parse_line(raw_line)
  end
  # A non-empty @leading is a final line that never saw its line break.
  validate_line(@leading, @current_line) unless @leading == ""
end
113
+
114
# Downloads @source with Typhoeus and validates it while it streams in.
def validate_url
  @current_line = 1
  request = Typhoeus::Request.new(@source, followlocation: true)

  request.on_headers do |response|
    @headers = response.headers || {}
    @content_type = response.headers["content-type"] rescue nil
    @response_code = response.code
    # `return` inside this block is a non-local return out of validate_url.
    return build_errors(:not_found) if response.code == 404
    validate_metadata
  end

  request.on_body do |chunk|
    # A chunk may end mid-line; parse_line keeps the partial line in @leading.
    StringIO.new(chunk).each_line do |line|
      break if line_limit_reached?
      parse_line(line)
    end
  end

  request.run
  # Validate the last line too
  validate_line(@leading, @current_line) unless @leading == ""
end
135
+
136
# Accumulates raw input until a complete logical CSV line is available and
# then hands it to validate_line.
#
# A line is complete when it ends in "\n" and contains an even number of
# quote characters; otherwise it is kept in @leading and prefixed to the
# next piece of input.
def parse_line(line)
  line = @leading + line
  ends_with_newline = line[-1, 1].include?("\n")
  # An odd quote count means the line break sits inside a quoted field.
  if ends_with_newline && !line.count(@dialect["quoteChar"]).odd?
    validate_line(line, @current_line)
    @leading = ""
    @current_line = @current_line+1
  else
    # Not a full line yet: keep it for the next chunk.
    @leading = line
  end
rescue ArgumentError => ae
  # Only the first invalid-encoding problem is reported.
  build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
  @current_line = @current_line+1
  @reported_invalid_encoding = true
end
157
+
158
# Validates one complete line: records line-break style, parses the content
# and finally invokes the optional per-line callback.
#
# @param input [String] the raw line
# @param index [Integer, nil] line number; 0 is used when not present
def validate_line(input = nil, index = nil)
  @input = input
  line_no = index.present? ? index : 0
  @encoding = input.encoding.to_s
  report_line_breaks(line_no)
  parse_contents(input, line_no)
  @lambda.call(self) unless @lambda.nil?
rescue ArgumentError => ae
  # Only the first invalid-encoding problem is reported.
  build_errors(:invalid_encoding, :structure, @current_line, nil, index) unless @reported_invalid_encoding
  @reported_invalid_encoding = true
end
170
+
171
+ # analyses the provided csv and builds errors, warnings and info messages
172
# Parses a single line and applies header, format and schema checks to it.
#
# @param stream [String] the raw line to parse
# @param line [Integer, nil] line number; 1 is assumed when not present
def parse_contents(stream, line = nil)
  current_line = line.present? ? line : 1
  all_errors = []

  @csv_options[:encoding] = @encoding

  # `row` stays nil when the line is malformed; it is still appended to @data.
  row = nil
  begin
    row = LineCSV.parse_line(stream, @csv_options)
  rescue LineCSV::MalformedCSVError => e
    build_exception_messages(e, stream, current_line)
  end

  @data << row
  return unless row

  if current_line <= 1 && @csv_header
    # this conditional should be refactored somewhere
    header = row.reject { |col| col.nil? || col.empty? }
    validate_header(header)
    @col_counts << header.size
  else
    build_formats(row)
    filled = row.reject { |col| col.nil? || col.empty? }.size
    @col_counts << filled
    # The first data row fixes the expected column count.
    @expected_columns = row.size unless @expected_columns != 0
    build_errors(:blank_rows, :structure, current_line, nil, stream.to_s) if filled == 0
    # Builds errors and warnings related to the provided schema file
    if @schema
      @schema.validate_row(row, current_line, all_errors, @source)
      @errors += @schema.errors
      all_errors += @schema.errors
      @warnings += @schema.warnings
    else
      build_errors(:ragged_rows, :structure, current_line, nil, stream.to_s) if !row.empty? && row.size != @expected_columns
    end
  end
end
209
+
210
# Runs the checks that need the whole file to have been read.
def finish
  total = @col_counts.inject(:+)
  # A first row with fewer filled cells than the average row suggests a title row.
  if total && @col_counts.first < (total / @col_counts.size.to_f)
    build_warnings(:title_row, :structure)
  end
  # return expected_columns to calling class
  build_warnings(:check_options, :structure) if @expected_columns == 1
  check_consistency
  check_foreign_keys
  check_mixed_linebreaks
  validate_encoding
end
222
+
223
# Inspects the HTTP response headers: works out whether a header row is
# present, checks the content type and examines any `Link` headers that
# point at a schema describing this CSV.
def validate_metadata
  assumed_header = !@supplied_dialect
  unless @headers.empty?
    if @headers["content-type"] =~ /text\/csv/
      @csv_header = @csv_header && true
      assumed_header = @assumed_header.present?
    end
    if @headers["content-type"] =~ /header=(present|absent)/
      @csv_header = true if $1 == "present"
      @csv_header = false if $1 == "absent"
      # An explicit header= parameter means nothing had to be assumed.
      assumed_header = false
    end
    build_warnings(:no_content_type, :context) if @content_type == nil
    build_errors(:wrong_content_type, :context) unless (@content_type && @content_type =~ /text\/csv/)
  end
  @header_processed = true
  build_info_messages(:assumed_header, :structure) if assumed_header

  @link_headers = @headers["link"].split(",") rescue nil
  @link_headers.each do |link_header|
    match = LINK_HEADER_REGEXP.match(link_header)
    # A Link value that does not parse carries nothing to inspect; without
    # this guard `match["param"]` raised NoMethodError on nil.
    next if match.nil?
    uri = match["uri"].gsub(/(^\<|\>$)/, "") rescue nil
    rel = match["rel-relationship"].gsub(/(^\"|\"$)/, "") rescue nil
    param = match["param"]
    param_value = match["param-value"].gsub(/(^\"|\"$)/, "") rescue nil
    if rel == "describedby" && param == "type" && ["application/csvm+json", "application/ld+json", "application/json"].include?(param_value)
      begin
        url = URI.join(@source_url, uri)
        schema = Schema.load_from_json(url)
        # NOTE(review): a linked schema that does describe @source_url is only
        # checked here, never stored - confirm whether it should become @schema.
        if schema.instance_of?(Csvlint::Csvw::TableGroup) && !schema.tables[@source_url]
          build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
        end
      rescue OpenURI::HTTPError
      end
    end
  end if @link_headers
end
265
+
266
# Whether the file is treated as having a header row: both the @csv_header
# flag and the dialect's "header" entry must be truthy.
def header?
  @csv_header ? @dialect["header"] : @csv_header
end
269
+
270
# Records the line break of @input and, the first time a break other than
# "\r\n" is seen, emits a :nonrfc_line_breaks info message.
#
# @param line_no [Integer, nil] line number attached to the info message
def report_line_breaks(line_no=nil)
  # No trailing newline means we are on the last line: nothing to record.
  return unless @input[-1, 1].include?("\n")
  line_break = get_line_break(@input)
  @line_breaks << line_break
  return if line_breaks_reported? || line_break == "\r\n"
  build_info_messages(:nonrfc_line_breaks, :structure, line_no)
  @line_breaks_reported = true
end
281
+
282
# True once the non-RFC line break info message has been emitted.
def line_breaks_reported?
  @line_breaks_reported == true
end
285
+
286
# Resolves the effective dialect: built-in defaults, overridden by the
# schema's dialect for this source (if any), overridden by the dialect the
# caller supplied. Also derives the CSV parser options from it.
def set_dialect
  @assumed_header = @dialect["header"].nil?
  @supplied_dialect = @dialect != {}

  # Any failure to reach the schema's dialect (no schema, no matching table)
  # simply means there is nothing to merge in.
  schema_dialect =
    begin
      @schema.tables[@source_url].dialect || {}
    rescue
      {}
    end

  defaults = {
    "header" => true,
    "delimiter" => ",",
    "skipInitialSpace" => true,
    "lineTerminator" => :auto,
    "quoteChar" => '"',
    "trim" => :true
  }
  @dialect = defaults.merge(schema_dialect).merge(@dialect || {})

  @csv_header = @csv_header && @dialect["header"]
  @csv_options = dialect_to_csv_options(@dialect)
end
307
+
308
# Warns when the declared charset is missing or not UTF-8, and when the
# encoding recorded for the data itself is not "UTF-8".
def validate_encoding
  content_type = @headers["content-type"]
  if content_type
    if content_type !~ /charset=/
      build_warnings(:no_encoding, :context)
    elsif content_type !~ /charset=utf-8/i
      build_warnings(:encoding, :context)
    end
  end
  build_warnings(:encoding, :context) unless @encoding == "UTF-8"
end
318
+
319
# Reports a line break error when more than one kind of line break was seen.
def check_mixed_linebreaks
  distinct = @line_breaks.uniq
  build_linebreak_error if distinct.count > 1
end
322
+
323
# The single line break style seen in the file, :mixed when several were
# seen, or nil when none was recorded.
def line_breaks
  kinds = @line_breaks.uniq
  kinds.count > 1 ? :mixed : kinds.first
end
330
+
331
# Number of entries collected in `data`.
def row_count
  rows = data
  rows.count
end
334
+
335
# Turns a CSV parser exception into a validation error.
#
# Quote errors on a line that does not contain the configured row separator
# are reported as a line break problem instead.
#
# @param csvException [Exception] the parser error
# @param errChars [String] the offending input, attached to the error
# @param lineNo [Integer] line number attached to the error
def build_exception_messages(csvException, errChars, lineNo)
  #TODO 1 - this is a change in logic, rather than straight refactor of previous error building, however original logic is bonkers
  #TODO 2 - using .kind_of? is a very ugly fix here and it meant to work around instances where :auto symbol is preserved in @csv_options
  type = fetch_error(csvException)
  row_sep = @csv_options[:row_sep]
  quote_problem = [:unclosed_quote,:stray_quote].include?(type)
  if !row_sep.kind_of?(Symbol) && quote_problem && !@input.match(row_sep)
    build_linebreak_error
  else
    build_errors(type, :structure, lineNo, nil, errChars)
  end
end
345
+
346
# Adds a :line_breaks error unless one has already been recorded.
def build_linebreak_error
  return if @errors.any? { |e| e.type == :line_breaks }
  build_errors(:line_breaks, :structure)
end
349
+
350
# Checks the header row for empty and duplicate column names and, when a
# schema is present, lets the schema validate it as well.
#
# @param header [Array<String>] header cells; stripped in place when the
#   dialect's "trim" is :true
# @return [Object] the result of `valid?`
def validate_header(header)
  names = Set.new
  header.each(&:strip!) if @dialect["trim"] == :true
  header.each_with_index do |name, i|
    build_warnings(:empty_column_name, :schema, nil, i+1) if name == ""
    # Set#add? returns nil when the name was already in the set.
    build_warnings(:duplicate_column_name, :schema, nil, i+1) unless names.add?(name)
  end
  if @schema
    @schema.validate_header(header, @source)
    @errors += @schema.errors
    @warnings += @schema.warnings
  end
  valid?
end
368
+
369
# Maps a CSV parser exception to a validator error type by stripping the
# trailing "on/in line N" part of its message and looking the remainder up
# in ERROR_MATCHERS.
#
# @param error [Exception]
# @return [Symbol] the mapped type, or :unknown_error
def fetch_error(error)
  parts = error.message.match(/^(.+?)(?: [io]n)? \(?line \d+\)?\.?$/i)
  message = parts ? parts[1] : nil
  ERROR_MATCHERS.fetch(message, :unknown_error)
end
374
+
375
# Translates a CSVW-style dialect hash into options for the CSV parser.
#
# @param dialect [Hash] reads "delimiter", "lineTerminator", "quoteChar"
#   and "skipInitialSpace"
# @return [Hash] :col_sep, :row_sep, :quote_char and :skip_blanks
def dialect_to_csv_options(dialect)
  # NOTE(review): `|| true` makes this truthy even for an explicit false, so
  # the space is never appended below. Kept as-is; confirm intended behaviour.
  skipinitialspace = dialect["skipInitialSpace"] || true
  delimiter = dialect["delimiter"]
  delimiter = delimiter + " " unless skipinitialspace
  {
    :col_sep => delimiter,
    :row_sep => dialect["lineTerminator"],
    :quote_char => dialect["quoteChar"],
    :skip_blanks => false
  }
end
386
+
387
# Tallies, per column, which value format each non-blank cell looks like.
# The tallies are later used by check_consistency.
#
# @param row [Array<String, nil>] cells of one data row
def build_formats(row)
  row.each_with_index do |cell, column|
    next if cell.nil? || cell.empty?
    @formats[column] ||= Hash.new(0)

    # First matching category wins: numeric, URI, date-like, else string.
    detected =
      if cell.strip[FORMATS[:numeric]] then :numeric
      elsif uri?(cell) then :uri
      elsif possible_date?(cell) then date_formats(cell)
      else :string
      end

    @formats[column][detected] += 1
  end
end
406
+
407
# Warns about every column in which no single format accounts for at least
# 90% of the tallied values.
def check_consistency
  @formats.each_with_index do |tally, column|
    next unless tally
    total = tally.values.reduce(:+).to_f
    dominant = tally.any? { |_, count| count / total >= 0.9 }
    build_warnings(:inconsistent_values, :schema, nil, column + 1) unless dominant
  end
end
417
+
418
# When validating against a CSVW table group, runs its foreign key checks
# and folds the resulting errors and warnings into the validator's.
def check_foreign_keys
  return unless @schema.instance_of? Csvlint::Csvw::TableGroup
  @schema.validate_foreign_keys
  @errors += @schema.errors
  @warnings += @schema.warnings
end
425
+
426
# Tries to find CSVW metadata describing the source when no usable schema
# was supplied.
#
# Sets @source_url, keeps a supplied schema that already has a table for
# that URL, and otherwise probes the default metadata locations
# ("<url>-metadata.json" and "csv-metadata.json"). The first table group
# that describes the source becomes @schema; if none does, @schema is nil.
def locate_schema
  @source_url = nil
  warn_if_unsuccessful = false
  case @source
  when StringIO
    # An in-memory source has no location to look next to.
    return
  when File
    @source_url = "file:#{File.expand_path(@source)}"
  else
    @source_url = @source
  end
  unless @schema.nil?
    if @schema.tables[@source_url]
      return
    else
      @schema = nil
    end
  end

  paths = []
  if @source_url =~ /^http(s)?/
    begin
      well_known_uri = URI.join(@source_url, "/.well-known/csvm")
      # Block form so the response stream is closed; the content is not used yet.
      open(well_known_uri) { |io| io.read }
      # TODO
    rescue OpenURI::HTTPError, URI::BadURIError
    end
  end
  paths = ["{+url}-metadata.json", "csv-metadata.json"] if paths.empty?
  paths.each do |template|
    begin
      template = URITemplate.new(template)
      path = template.expand('url' => @source_url)
      url = URI.join(@source_url, path)
      url = File.new(url.to_s.sub(/^file:/, "")) if url.to_s =~ /^file:/
      schema = Schema.load_from_json(url)
      if schema.instance_of? Csvlint::Csvw::TableGroup
        if schema.tables[@source_url]
          @schema = schema
          # Stop here: falling through used to reach the `@schema = nil` at
          # the end of the method and throw the located schema away.
          return
        else
          warn_if_unsuccessful = true
          build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
        end
      end
    rescue Errno::ENOENT
    rescue OpenURI::HTTPError, URI::BadURIError, ArgumentError
    rescue => e
      STDERR.puts e.class
      STDERR.puts e.message
      STDERR.puts e.backtrace
      raise e
    end
  end
  # `schema` here is the attribute reader, not the block-local variable above.
  build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema) if warn_if_unsuccessful
  @schema = nil
end
485
+
486
+ private
487
+
488
# Works out the file extension of the source.
#
# @param source [File, IO, StringIO, Tempfile, String] the CSV source
# @return [String] the extension including the dot, or "" when the source
#   is a stream or not a parseable URI
def parse_extension(source)
  case source
  when File
    File.extname(source.path)
  when IO, StringIO, Tempfile
    # Tempfile: this is triggered when the revalidate dialect use case happens
    ""
  else
    begin
      File.extname(URI.parse(source).path)
    rescue URI::InvalidURIError
      ""
    end
  end
end
509
+
510
# Whether a cell value is an HTTP(S) URI.
#
# @param value [String]
# @return [Boolean, nil] nil when the value does not even look like a URI,
#   false when it cannot be parsed
def uri?(value)
  return nil unless value.strip[FORMATS[:uri]]
  parsed = URI.parse(value)
  parsed.kind_of?(URI::HTTP) || parsed.kind_of?(URI::HTTPS)
rescue URI::InvalidURIError
  false
end
518
+
519
# Cheap pre-check before the detailed date detection: returns the part of
# the value matched by POSSIBLE_DATE_REGEXP, or nil when there is no match.
def possible_date?(col)
  col.slice(POSSIBLE_DATE_REGEXP)
end
522
+
523
# Identifies which date/time format a value is written in.
#
# Candidates are tried in order; a candidate matches when the value fits
# its FORMATS regexp and survives a strptime/strftime round trip.
#
# @param col [String] the cell value
# @return [Symbol] the FORMATS key of the first match, or :string
def date_formats(col)
  candidates = [
    [:date_db,          Date, '%Y-%m-%d'],
    [:date_short,       Date, '%e %b'],
    [:date_rfc822,      Date, '%e %b %Y'],
    [:date_long,        Date, '%B %e, %Y'],
    [:dateTime_time,    Time, '%H:%M'],
    [:dateTime_hms,     Time, '%H:%M:%S'],
    [:dateTime_db,      Time, '%Y-%m-%d %H:%M:%S'],
    [:dateTime_iso8601, Time, '%Y-%m-%dT%H:%M:%SZ'],
    [:dateTime_short,   Time, '%d %b %H:%M'],
    [:dateTime_long,    Time, '%B %d, %Y %H:%M'],
  ]
  hit = candidates.find do |name, klass, pattern|
    col[FORMATS[name]] && date_format?(klass, col, pattern)
  end
  hit ? hit.first : :string
end
548
+
549
# True when `value` parses with `format` and formats back to exactly the
# same text.
#
# @param klass [Class] parser to use (called with Date or Time in this file)
# @param value [String]
# @param format [String] strptime/strftime pattern
def date_format?(klass, value, format)
  parsed = klass.strptime(value, format)
  parsed.strftime(format) == value
rescue ArgumentError # invalid date
  false
end
554
+
555
# True when a line limit is configured and the current line is beyond it.
def line_limit_reached?
  limit = @limit_lines
  limit.present? && @current_line > limit
end
558
+
559
# Classifies the line break of a line by looking at its last two characters:
# "\r\n" when the first of them is a carriage return, "\n" otherwise.
#
# @param line [String]
# @return [String] "\r\n" or "\n"
def get_line_break(line)
  # Lines shorter than two characters are inspected as a whole.
  tail = line[-2, 2] || line
  tail[0] == "\r" ? "\r\n" : "\n"
end
567
+
568
# Regexps describing the value formats the validator recognises, keyed by
# format name (:string has no pattern).
#
# Date::MONTHNAMES and Date::ABBR_MONTHNAMES start with a nil entry; it is
# dropped with `compact` so the month alternation has no empty branch that
# would accept a missing month name.
FORMATS = {
  :string => nil,
  :numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
  :uri => /\Ahttps?:/,
  :date_db => /\A\d{4,}-\d\d-\d\d\z/, # "12345-01-01"
  :date_long => /\A(?:#{Date::MONTHNAMES.compact.join('|')}) [ \d]\d, \d{4,}\z/, # "January 1, 12345"
  :date_rfc822 => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.compact.join('|')}) \d{4,}\z/, # " 1 Jan 12345"
  :date_short => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.compact.join('|')})\z/, # "1 Jan"
  :dateTime_db => /\A\d{4,}-\d\d-\d\d \d\d:\d\d:\d\d\z/, # "12345-01-01 00:00:00"
  :dateTime_hms => /\A\d\d:\d\d:\d\d\z/, # "00:00:00"
  :dateTime_iso8601 => /\A\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\dZ\z/, # "12345-01-01T00:00:00Z"
  :dateTime_long => /\A(?:#{Date::MONTHNAMES.compact.join('|')}) \d\d, \d{4,} \d\d:\d\d\z/, # "January 01, 12345 00:00"
  :dateTime_short => /\A\d\d (?:#{Date::ABBR_MONTHNAMES.compact.join('|')}) \d\d:\d\d\z/, # "01 Jan 00:00"
  :dateTime_time => /\A\d\d:\d\d\z/, # "00:00"
}.freeze
583
+
584
# --- Building blocks for parsing HTTP `Link` headers ---------------------

# Lazily captures the URI text as the named group "uri".
URI_REGEXP = /(?<uri>.*?)/
# One run of characters excluding separators, space and tab.
TOKEN_REGEXP = /([^\(\)\<\>@,;:\\"\/\[\]\?=\{\} \t]+)/
# A double-quoted string without embedded quotes.
QUOTED_STRING_REGEXP = /("[^"]*")/
# A name: a letter followed by letters, digits, "-" or ".".
SGML_NAME_REGEXP = /([A-Za-z][-A-Za-z0-9\.]*)/

# A relationship is a bare name or a quoted, whitespace-separated name list.
RELATIONSHIP_REGEXP = Regexp.new("(?<relationship>#{SGML_NAME_REGEXP}|(\"#{SGML_NAME_REGEXP}(\\s+#{SGML_NAME_REGEXP})*\"))")

# Individual link parameters: rel=, rev=, title=, anchor= and generic ones.
REL_REGEXP = Regexp.new("(?<rel>\\s*rel\\s*=\\s*(?<rel-relationship>#{RELATIONSHIP_REGEXP}))")
REV_REGEXP = Regexp.new("(?<rev>\\s*rev\\s*=\\s*#{RELATIONSHIP_REGEXP})")
TITLE_REGEXP = Regexp.new("(?<title>\\s*title\\s*=\\s*#{QUOTED_STRING_REGEXP})")
ANCHOR_REGEXP = Regexp.new("(?<anchor>\\s*anchor\\s*=\\s*\\<#{URI_REGEXP}\\>)")
LINK_EXTENSION_REGEXP = Regexp.new("(?<link-extension>(?<param>#{TOKEN_REGEXP})(\\s*=\\s*(?<param-value>#{TOKEN_REGEXP}|#{QUOTED_STRING_REGEXP}))?)")

# Any one parameter, then the full header value: <uri> followed by ";"-separated parameters.
LINK_PARAM_REGEXP = Regexp.new("(#{REL_REGEXP}|#{REV_REGEXP}|#{TITLE_REGEXP}|#{ANCHOR_REGEXP}|#{LINK_EXTENSION_REGEXP})")
LINK_HEADER_REGEXP = Regexp.new("\<#{URI_REGEXP}\>(\\s*;\\s*#{LINK_PARAM_REGEXP})*")

# Quick test for values that might be dates: a leading digit, whitespace plus
# digit, or a month name. The month lists begin with a nil entry, so each
# join starts with "|" - that is what separates the alternatives here.
POSSIBLE_DATE_REGEXP = Regexp.new("\\A(\\d|\\s\\d#{Date::ABBR_MONTHNAMES.join('|')}#{Date::MONTHNAMES.join('|')})")
597
+
598
+ end
599
+ end