wjordan213-csvlint 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +1 -0
  3. data/.gitattributes +2 -0
  4. data/.gitignore +28 -0
  5. data/.ruby-version +1 -0
  6. data/.travis.yml +32 -0
  7. data/CHANGELOG.md +361 -0
  8. data/Gemfile +7 -0
  9. data/LICENSE.md +22 -0
  10. data/README.md +328 -0
  11. data/Rakefile +17 -0
  12. data/bin/create_schema +32 -0
  13. data/bin/csvlint +10 -0
  14. data/features/check_format.feature +46 -0
  15. data/features/cli.feature +210 -0
  16. data/features/csv_options.feature +35 -0
  17. data/features/csvupload.feature +145 -0
  18. data/features/csvw_schema_validation.feature +127 -0
  19. data/features/fixtures/cr-line-endings.csv +0 -0
  20. data/features/fixtures/crlf-line-endings.csv +0 -0
  21. data/features/fixtures/inconsistent-line-endings-unquoted.csv +0 -0
  22. data/features/fixtures/inconsistent-line-endings.csv +0 -0
  23. data/features/fixtures/invalid-byte-sequence.csv +0 -0
  24. data/features/fixtures/invalid_many_rows.csv +0 -0
  25. data/features/fixtures/lf-line-endings.csv +0 -0
  26. data/features/fixtures/spreadsheet.xls +0 -0
  27. data/features/fixtures/spreadsheet.xlsx +0 -0
  28. data/features/fixtures/title-row.csv +0 -0
  29. data/features/fixtures/valid.csv +0 -0
  30. data/features/fixtures/valid_many_rows.csv +0 -0
  31. data/features/fixtures/windows-line-endings.csv +0 -0
  32. data/features/information.feature +22 -0
  33. data/features/parse_csv.feature +90 -0
  34. data/features/schema_validation.feature +105 -0
  35. data/features/sources.feature +17 -0
  36. data/features/step_definitions/cli_steps.rb +11 -0
  37. data/features/step_definitions/csv_options_steps.rb +24 -0
  38. data/features/step_definitions/information_steps.rb +13 -0
  39. data/features/step_definitions/parse_csv_steps.rb +42 -0
  40. data/features/step_definitions/schema_validation_steps.rb +33 -0
  41. data/features/step_definitions/sources_steps.rb +7 -0
  42. data/features/step_definitions/validation_errors_steps.rb +90 -0
  43. data/features/step_definitions/validation_info_steps.rb +22 -0
  44. data/features/step_definitions/validation_warnings_steps.rb +60 -0
  45. data/features/support/aruba.rb +56 -0
  46. data/features/support/env.rb +26 -0
  47. data/features/support/load_tests.rb +114 -0
  48. data/features/support/webmock.rb +1 -0
  49. data/features/validation_errors.feature +147 -0
  50. data/features/validation_info.feature +16 -0
  51. data/features/validation_warnings.feature +86 -0
  52. data/lib/csvlint.rb +27 -0
  53. data/lib/csvlint/cli.rb +165 -0
  54. data/lib/csvlint/csvw/column.rb +359 -0
  55. data/lib/csvlint/csvw/date_format.rb +182 -0
  56. data/lib/csvlint/csvw/metadata_error.rb +13 -0
  57. data/lib/csvlint/csvw/number_format.rb +211 -0
  58. data/lib/csvlint/csvw/property_checker.rb +761 -0
  59. data/lib/csvlint/csvw/table.rb +204 -0
  60. data/lib/csvlint/csvw/table_group.rb +165 -0
  61. data/lib/csvlint/error_collector.rb +27 -0
  62. data/lib/csvlint/error_message.rb +15 -0
  63. data/lib/csvlint/field.rb +196 -0
  64. data/lib/csvlint/schema.rb +92 -0
  65. data/lib/csvlint/validate.rb +599 -0
  66. data/lib/csvlint/version.rb +3 -0
  67. data/spec/csvw/column_spec.rb +112 -0
  68. data/spec/csvw/date_format_spec.rb +49 -0
  69. data/spec/csvw/number_format_spec.rb +417 -0
  70. data/spec/csvw/table_group_spec.rb +143 -0
  71. data/spec/csvw/table_spec.rb +90 -0
  72. data/spec/field_spec.rb +252 -0
  73. data/spec/schema_spec.rb +211 -0
  74. data/spec/spec_helper.rb +17 -0
  75. data/spec/validator_spec.rb +619 -0
  76. data/wjordan213_csvlint.gemspec +46 -0
  77. metadata +490 -0
@@ -0,0 +1,92 @@
1
+ module Csvlint
2
+
3
+ class Schema
4
+
5
+ include Csvlint::ErrorCollector
6
+
7
+ attr_reader :uri, :fields, :title, :description
8
+
9
# Stores the schema's origin and contents, then clears collected messages.
#
# @param uri [Object] where the schema came from (stored untouched)
# @param fields [Array] field definitions in column order
# @param title [String, nil] optional schema title
# @param description [String, nil] optional schema description
def initialize(uri, fields=[], title=nil, description=nil)
  @uri, @fields = uri, fields
  @title, @description = title, description
  reset
end
16
+
17
+ class << self
18
+
19
# Builds a Schema from a parsed JSON Table Schema document.
#
# @param uri [Object] origin of the document, handed to Schema.new as-is
# @param json [Hash] parsed JSON; "fields", "title" and "description" are read
# @return [Schema] schema with one Csvlint::Field per entry in json["fields"]
def from_json_table(uri, json)
  descriptions = json["fields"] || []
  fields = descriptions.map do |desc|
    Csvlint::Field.new(desc["name"], desc["constraints"], desc["title"], desc["description"])
  end
  Schema.new(uri, fields, json["title"], json["description"])
end
27
+
28
# Hands parsed CSVW metadata to Csvlint::Csvw::TableGroup.from_json and
# returns whatever that call produces.
def from_csvw_metadata(uri, json)
  Csvlint::Csvw::TableGroup.from_json(uri, json)
end
31
+
32
# Reads a JSON schema document and builds the matching schema object.
#
# Documents carrying an "@context" key are handed to
# Schema.from_csvw_metadata; everything else goes to Schema.from_json_table.
#
# NOTE(review): the document is opened with Kernel#open, which treats a
# String starting with "|" as a command to run - confirm callers never pass
# untrusted strings here.
#
# @param uri [Object] anything Kernel#open accepts
# @param output_errors [Boolean] when exactly true, unexpected errors are
#   printed to STDERR before the placeholder schema is returned
# @return [Object] the loaded schema, or a placeholder Schema titled
#   "malformed" when loading fails for any other reason than those re-raised
# @raise [Csvlint::Csvw::MetadataError, OpenURI::HTTPError, Errno::ENOENT]
def load_from_json(uri, output_errors = true)
  # Block form: the handle is closed as soon as the body has been read
  # instead of being left for the garbage collector.
  json = JSON.parse(open(uri) { |io| io.read })
  if json["@context"]
    uri = "file:#{File.expand_path(uri)}" unless uri.to_s =~ /^http(s)?/
    Schema.from_csvw_metadata(uri, json)
  else
    Schema.from_json_table(uri, json)
  end
rescue Csvlint::Csvw::MetadataError, OpenURI::HTTPError, Errno::ENOENT
  # These are meaningful to callers, so let them through untouched.
  raise
rescue => e
  if output_errors === true
    STDERR.puts e.class
    STDERR.puts e.message
    STDERR.puts e.backtrace
  end
  Schema.new(nil, [], "malformed", "malformed")
end
54
+
55
+ end
56
+
57
# Compares a CSV header row against the schema's field names.
#
# Both sides are rendered as a single CSV line and compared as strings; a
# difference is recorded as a :malformed_header warning on line 1.
#
# @param header [Array<String>] header cells as read from the file
# @param source_url [Object, nil] accepted but not used by this method
# @return the result of valid? after the check
def validate_header(header, source_url=nil)
  reset

  found_header = header.to_csv(:row_sep => '')
  expected_header = @fields.map(&:name).to_csv(:row_sep => '')
  unless found_header == expected_header
    build_warnings(:malformed_header, :schema, 1, nil, found_header, "expectedHeader" => expected_header)
  end
  valid?
end
67
+
68
# Validates one data row against the schema's fields.
#
# Warns about columns the row lacks or has in excess, then lets every field
# validate its own cell (missing cells are validated as "") and gathers the
# fields' errors and warnings.
#
# @param values [Array] cell values of the row
# @param row [Integer, nil] row number used in the messages
# @param all_errors [Array] passed through to each field's validate_column
# @param source_url [Object, nil] accepted but not used by this method
# @return the result of valid? after the checks
def validate_row(values, row=nil, all_errors=[], source_url=nil)
  reset

  value_count = values.length
  field_count = fields.length
  if value_count < field_count
    # 1-based positions of the columns the schema expects but the row lacks.
    (value_count + 1).upto(field_count) do |column|
      build_warnings(:missing_column, :schema, row, column)
    end
  elsif value_count > field_count
    # 1-based positions of the cells that have no matching field.
    (field_count + 1).upto(value_count) do |column|
      build_warnings(:extra_column, :schema, row, column)
    end
  end

  fields.each_with_index do |field, index|
    field.validate_column(values[index] || "", row, index + 1, all_errors)
    @errors += field.errors
    @warnings += field.warnings
  end

  valid?
end
90
+
91
+ end
92
+ end
@@ -0,0 +1,599 @@
1
+ module Csvlint
2
+
3
+ class Validator
4
# CSV subclass used for parsing single lines. It memoizes the results of
# `encode_re`, `encode_str` and `escape_re` in class-level caches and turns
# the converters feature off.
class LineCSV < CSV
  # Compiled regexps, keyed by their source string.
  ENCODE_RE = Hash.new { |cache, source| cache[source] = Regexp.new(source) }

  # Joined, re-encoded strings: encoding name => chunk list => string.
  ENCODE_STR = Hash.new do |by_encoding, encoding_name|
    by_encoding[encoding_name] = Hash.new do |by_chunks, chunks|
      by_chunks[chunks] = chunks.map { |chunk| chunk.encode(encoding_name) }.join('')
    end
  end

  # Escaped strings: characters-to-escape regexp => escape prefix => input => result.
  ESCAPE_RE = Hash.new do |by_chars, re_chars|
    by_chars[re_chars] = Hash.new do |by_prefix, re_esc|
      by_prefix[re_esc] = Hash.new do |by_input, str|
        by_input[str] = str.gsub(re_chars) { |char| re_esc + char }
      end
    end
  end

  # Optimization: Memoize `encode_re`.
  # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2273
  def encode_re(*chunks)
    source = encode_str(*chunks)
    ENCODE_RE[source]
  end

  # Optimization: Memoize `encode_str`.
  # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2281
  def encode_str(*chunks)
    per_encoding = ENCODE_STR[@encoding.name]
    per_encoding[chunks]
  end

  # Optimization: Memoize `escape_re`.
  # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2265
  def escape_re(str)
    per_prefix = ESCAPE_RE[@re_chars][@re_esc]
    per_prefix[str]
  end

  # Optimization: Disable the CSV library's converters feature.
  # Both converter lists are left empty and the related options are removed
  # from the passed-in Hash.
  # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2100
  def init_converters(options, field_name = :converters)
    @converters = []
    @header_converters = []
    options.delete(:unconverted_fields)
    options.delete(field_name)
  end
end
50
+
51
+ include Csvlint::ErrorCollector
52
+
53
+ attr_reader :encoding, :content_type, :extension, :headers, :link_headers, :dialect, :csv_header, :schema, :data, :current_line
54
+
55
# Maps the leading text of a CSV parser error message to the error type
# reported for it (looked up by fetch_error). Frozen, like FORMATS, so the
# shared table cannot be modified at runtime.
ERROR_MATCHERS = {
  "Missing or stray quote" => :stray_quote,
  "Illegal quoting" => :whitespace,
  "Unclosed quoted field" => :unclosed_quote,
  "Unquoted fields do not allow \\r or \\n" => :line_breaks,
}.freeze
61
+
62
# Sets up the validator's state and immediately runs the validation.
#
# @param source [Object, nil] what to validate; validate treats a String as
#   a URL and anything else as a stream
# @param dialect [Hash] dialect settings supplied by the caller
# @param schema [Object, nil] schema to validate against; its existing
#   errors and warnings are copied into this validator
# @param options [Hash] :lambda is called with the validator after each
#   validated line; :limit_lines caps how many lines are read
def initialize(source, dialect = {}, schema = nil, options = {})
  reset

  # Caller-supplied inputs.
  @source = source
  @schema = schema
  @dialect = dialect
  @lambda = options[:lambda]
  @limit_lines = options[:limit_lines]

  # Bookkeeping filled in while lines are processed.
  @formats = []
  @csv_header = true
  @headers = {}
  @leading = ""
  @expected_columns = 0
  @col_counts = []
  @line_breaks = []
  @data = [] # it may be advisable to flush this on init?

  @extension = parse_extension(source) unless @source.nil?

  unless @schema.nil?
    @errors += @schema.errors
    @warnings += @schema.warnings
  end

  validate
end
87
+
88
# Runs the whole validation: bails out early with an :excel warning for
# spreadsheet extensions, otherwise locates a schema if needed, resolves the
# dialect, validates the source (as URL when it is a String, as a stream
# otherwise) and runs the end-of-file checks.
def validate
  # Escaped dot plus anchor: only extensions that really start with ".xls"
  # count as Excel. The unanchored /.xls(x)?/ also matched e.g. ".axls".
  if @extension =~ /\A\.xls/
    build_warnings(:excel, :context)
    return
  end
  locate_schema unless @schema.instance_of?(Csvlint::Schema)
  set_dialect

  if @source.class == String
    validate_url
  else
    validate_metadata
    validate_stream
  end
  finish
end
104
+
105
# Feeds the source to parse_line one line at a time, stopping once the line
# limit is reached, then validates whatever text is still buffered in
# @leading.
def validate_stream
  @current_line = 1
  @source.each_line do |raw_line|
    break if line_limit_reached?
    parse_line(raw_line)
  end
  validate_line(@leading, @current_line) if @leading != ""
end
113
+
114
# Fetches @source over HTTP with Typhoeus and validates the body as it
# arrives.
#
# Response headers are captured first; a 404 records a :not_found error and
# ends the method, any other status goes on to validate_metadata. Each body
# chunk is split into lines for parse_line, and text still buffered in
# @leading once the request has run is validated as the final line.
def validate_url
  @current_line = 1
  request = Typhoeus::Request.new(@source, followlocation: true)

  request.on_headers do |response|
    @headers = response.headers || {}
    @content_type = response.headers["content-type"] rescue nil
    @response_code = response.code
    # `return` leaves validate_url itself, not just this callback.
    return build_errors(:not_found) if response.code == 404
    validate_metadata
  end

  request.on_body do |chunk|
    StringIO.new(chunk).each_line do |line|
      break if line_limit_reached?
      parse_line(line)
    end
  end

  request.run
  # Validate the last line too
  validate_line(@leading, @current_line) if @leading != ""
end
135
+
136
# Accumulates incoming text until it forms a complete CSV line, then
# validates it.
#
# Text is complete when it ends in a newline and holds an even number of
# quote characters; otherwise it is kept in @leading and prefixed to the next
# call. An ArgumentError raised along the way is reported once as
# :invalid_encoding.
#
# @param line [String] next piece of input (a line or part of one)
def parse_line(line)
  buffered = @leading + line
  # A trailing newline only ends the line when it is not inside quotes,
  # i.e. when the number of quote characters seen so far is even.
  complete = buffered[-1, 1].include?("\n") && !buffered.count(@dialect["quoteChar"]).odd?
  return @leading = buffered unless complete

  validate_line(buffered, @current_line)
  @leading = ""
  @current_line = @current_line + 1
rescue ArgumentError => ae
  build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
  @current_line = @current_line + 1
  @reported_invalid_encoding = true
end
157
+
158
# Validates one complete line: records its encoding, checks its line break,
# parses its contents and finally calls the optional per-line callback.
#
# An ArgumentError raised along the way is reported once as
# :invalid_encoding.
#
# @param input [String] the line's text
# @param index [Integer, nil] line number; 0 is used when blank
def validate_line(input = nil, index = nil)
  @input = input
  line_number = index.present? ? index : 0
  @encoding = input.encoding.to_s
  report_line_breaks(line_number)
  parse_contents(input, line_number)
  @lambda.call(self) unless @lambda.nil?
rescue ArgumentError => ae
  build_errors(:invalid_encoding, :structure, @current_line, nil, index) unless @reported_invalid_encoding
  @reported_invalid_encoding = true
end
170
+
171
# Analyses one line of CSV and builds errors, warnings and info messages.
#
# The first line is handled as the header when @csv_header is set. Every
# other line feeds format detection and column counting, is checked for
# blank rows, and is validated against the schema when there is one (ragged
# rows are reported otherwise).
#
# @param stream [String] text of the line to parse
# @param line [Integer, nil] line number; 1 is used when blank
def parse_contents(stream, line = nil)
  current_line = line.present? ? line : 1
  all_errors = []

  @csv_options[:encoding] = @encoding

  begin
    # The options must be splatted into keywords: on Ruby 3 a positional
    # Hash makes CSV.parse_line raise ArgumentError, which validate_line
    # would then report as an encoding problem.
    row = LineCSV.parse_line(stream, **@csv_options)
  rescue LineCSV::MalformedCSVError => e
    build_exception_messages(e, stream, current_line)
  end

  # A line that could not be parsed is still recorded, as nil.
  @data << row
  if row
    if current_line <= 1 && @csv_header
      # this conditional should be refactored somewhere
      row = row.reject { |col| col.nil? || col.empty? }
      validate_header(row)
      @col_counts << row.size
    else
      build_formats(row)
      filled_cells = row.reject { |col| col.nil? || col.empty? }.size
      @col_counts << filled_cells
      # The first data row decides how many columns later rows must have.
      @expected_columns = row.size unless @expected_columns != 0
      build_errors(:blank_rows, :structure, current_line, nil, stream.to_s) if filled_cells == 0
      # Builds errors and warnings related to the provided schema file
      if @schema
        @schema.validate_row(row, current_line, all_errors, @source)
        @errors += @schema.errors
        @warnings += @schema.warnings
      else
        build_errors(:ragged_rows, :structure, current_line, nil, stream.to_s) if !row.empty? && row.size != @expected_columns
      end
    end
  end
end
209
+
210
# Runs the checks that need the whole file to have been read.
#
# Warns about a probable title row when the first row has fewer filled cells
# than the average row, warns to check the options when only one column was
# detected, then runs the consistency, foreign key, line break and encoding
# checks.
def finish
  total_cells = @col_counts.inject(:+)
  unless total_cells.nil?
    average = total_cells / @col_counts.size.to_f
    build_warnings(:title_row, :structure) if @col_counts.first < average
  end
  # return expected_columns to calling class
  build_warnings(:check_options, :structure) if @expected_columns == 1
  check_consistency
  check_foreign_keys
  check_mixed_linebreaks
  validate_encoding
end
222
+
223
# Interprets the response headers of the source.
#
# Works out from the content type whether a header row is present or merely
# assumed, reports a missing or non-CSV content type, and inspects the Link
# headers for "describedby" metadata documents.
#
# NOTE(review): a linked schema that does describe @source_url is loaded but
# not stored anywhere by this method; only a mismatch is acted on (with a
# warning). Confirm whether the matching schema was meant to be used.
def validate_metadata
  assumed_header = !@supplied_dialect
  unless @headers.empty?
    if @headers["content-type"] =~ /text\/csv/
      @csv_header = @csv_header && true
      assumed_header = @assumed_header.present?
    end
    if @headers["content-type"] =~ /header=(present|absent)/
      @csv_header = true if $1 == "present"
      @csv_header = false if $1 == "absent"
      assumed_header = false
    end
    build_warnings(:no_content_type, :context) if @content_type == nil
    build_errors(:wrong_content_type, :context) unless (@content_type && @content_type =~ /text\/csv/)
  end
  @header_processed = true
  build_info_messages(:assumed_header, :structure) if assumed_header

  @link_headers = @headers["link"].split(",") rescue nil
  @link_headers.each do |link_header|
    match = LINK_HEADER_REGEXP.match(link_header)
    # A value that is not a well-formed link has nothing to inspect; without
    # this guard `match["param"]` below raised NoMethodError on nil.
    next if match.nil?

    uri = match["uri"].gsub(/(^\<|\>$)/, "") rescue nil
    rel = match["rel-relationship"].gsub(/(^\"|\"$)/, "") rescue nil
    param = match["param"]
    param_value = match["param-value"].gsub(/(^\"|\"$)/, "") rescue nil
    if rel == "describedby" && param == "type" && ["application/csvm+json", "application/ld+json", "application/json"].include?(param_value)
      begin
        url = URI.join(@source_url, uri)
        schema = Schema.load_from_json(url)
        if schema.instance_of? Csvlint::Csvw::TableGroup
          unless schema.tables[@source_url]
            build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
          end
        end
      rescue OpenURI::HTTPError
      end
    end
  end if @link_headers
end
265
+
266
# Whether a header row is in effect: @csv_header when that is falsy,
# otherwise the dialect's "header" setting.
def header?
  return @csv_header unless @csv_header
  @dialect["header"]
end
269
+
270
# Records the line break of the current input line and, the first time a
# break other than "\r\n" shows up, adds a :nonrfc_line_breaks info message.
#
# @param line_no [Integer, nil] line number attached to the info message
def report_line_breaks(line_no=nil)
  # No trailing newline means this is the last line: nothing to record.
  return unless @input[-1, 1].include?("\n")

  line_break = get_line_break(@input)
  @line_breaks << line_break
  return if line_breaks_reported? || line_break == "\r\n"

  build_info_messages(:nonrfc_line_breaks, :structure, line_no)
  @line_breaks_reported = true
end
281
+
282
# True once the non-RFC line break message has been issued, i.e. once
# @line_breaks_reported has been set to true.
def line_breaks_reported?
  reported = @line_breaks_reported
  reported === true
end
285
+
286
# Resolves the effective dialect and derives the CSV parser options from it.
#
# Built-in defaults are overridden by the dialect of the schema's table for
# @source_url (when that can be looked up), which in turn is overridden by
# the dialect supplied to the validator. Also notes whether a dialect was
# supplied at all and whether the header setting was left unspecified.
def set_dialect
  @assumed_header = @dialect["header"].nil?
  @supplied_dialect = @dialect != {}

  schema_dialect =
    begin
      @schema.tables[@source_url].dialect || {}
    rescue
      # No schema, no table for this source, or no dialect accessor.
      {}
    end

  defaults = {
    "header" => true,
    "delimiter" => ",",
    "skipInitialSpace" => true,
    "lineTerminator" => :auto,
    "quoteChar" => '"',
    "trim" => :true
  }
  @dialect = defaults.merge(schema_dialect).merge(@dialect || {})

  @csv_header = @csv_header && @dialect["header"]
  @csv_options = dialect_to_csv_options(@dialect)
end
307
+
308
# Warns about encoding problems: a content type without a charset
# (:no_encoding), a charset other than UTF-8 (:encoding), and a detected
# line encoding other than "UTF-8" (:encoding).
def validate_encoding
  content_type = @headers["content-type"]
  if content_type
    if content_type !~ /charset=/
      build_warnings(:no_encoding, :context)
    elsif content_type !~ /charset=utf-8/i
      build_warnings(:encoding, :context)
    end
  end
  build_warnings(:encoding, :context) unless @encoding == "UTF-8"
end
318
+
319
# Reports a line break error when more than one kind of line break was
# recorded.
def check_mixed_linebreaks
  distinct_breaks = @line_breaks.uniq
  build_linebreak_error if distinct_breaks.count > 1
end
322
+
323
# The kind of line break used by the file: :mixed when several kinds were
# recorded, otherwise the single recorded kind (nil when none was).
def line_breaks
  distinct_breaks = @line_breaks.uniq
  distinct_breaks.count > 1 ? :mixed : distinct_breaks.first
end
330
+
331
# Number of entries recorded in `data` (one per parsed line).
def row_count
  data.length
end
334
+
335
# Turns a CSV parsing exception into an error entry.
#
# A quote-related error on input that does not contain the configured row
# separator is reported as a line break problem; everything else is reported
# under the type derived from the exception.
#
# @param csvException [Exception] the parser's exception
# @param errChars [String] the text that failed to parse
# @param lineNo [Integer] line number for the error
def build_exception_messages(csvException, errChars, lineNo)
  #TODO 1 - this is a change in logic, rather than straight refactor of previous error building, however original logic is bonkers
  #TODO 2 - using .kind_of? is a very ugly fix here and it meant to work around instances where :auto symbol is preserved in @csv_options
  type = fetch_error(csvException)
  row_sep = @csv_options[:row_sep]
  separator_missing = !row_sep.kind_of?(Symbol) &&
                      [:unclosed_quote, :stray_quote].include?(type) &&
                      !@input.match(row_sep)
  if separator_missing
    build_linebreak_error
  else
    build_errors(type, :structure, lineNo, nil, errChars)
  end
end
345
+
346
# Adds a :line_breaks structure error unless one has been recorded already.
def build_linebreak_error
  return if @errors.any? { |error| error.type == :line_breaks }
  build_errors(:line_breaks, :structure)
end
349
+
350
# Checks the header row: warns about empty and duplicate column names and,
# when a schema is set, lets the schema validate the header as well.
#
# Header cells are stripped in place when the dialect's "trim" is :true.
#
# @param header [Array<String>] header cells
# @return the result of valid? after the checks
def validate_header(header)
  seen = Set.new
  header.each(&:strip!) if @dialect["trim"] == :true
  header.each_with_index do |name, index|
    column = index + 1
    build_warnings(:empty_column_name, :schema, nil, column) if name == ""
    # Set#add? yields nil when the name was already present.
    build_warnings(:duplicate_column_name, :schema, nil, column) unless seen.add?(name)
  end
  if @schema
    @schema.validate_header(header, @source)
    @errors += @schema.errors
    @warnings += @schema.warnings
  end
  valid?
end
368
+
369
# Maps a CSV parser exception to an error type.
#
# The message is stripped of its trailing "in/on line N" part and looked up
# in ERROR_MATCHERS; unknown or unparseable messages give :unknown_error.
#
# @param error [Exception] the parser's exception
# @return [Symbol] the error type
def fetch_error(error)
  matched = error.message.match(/^(.+?)(?: [io]n)? \(?line \d+\)?\.?$/i)
  message = matched ? matched[1] : nil
  ERROR_MATCHERS.fetch(message, :unknown_error)
end
374
+
375
# Translates a dialect Hash into the options Hash given to the CSV parser.
#
# NOTE(review): `|| true` makes skip_initial_space truthy for every input,
# including an explicit false, so the delimiter is never extended below.
# Confirm the intended meaning of "skipInitialSpace" before changing this.
#
# @param dialect [Hash] reads "skipInitialSpace", "delimiter",
#   "lineTerminator" and "quoteChar"
# @return [Hash] :col_sep, :row_sep, :quote_char and :skip_blanks
def dialect_to_csv_options(dialect)
  skip_initial_space = dialect["skipInitialSpace"] || true
  col_sep = dialect["delimiter"]
  col_sep += " " unless skip_initial_space
  {
    :col_sep => col_sep,
    :row_sep => dialect["lineTerminator"],
    :quote_char => dialect["quoteChar"],
    :skip_blanks => false
  }
end
386
+
387
# Tallies, per column, which format each non-blank cell of the row looks
# like (:numeric, :uri, one of the date formats, or :string). The tallies are
# kept in @formats, indexed by column position.
#
# @param row [Array<String, nil>] cells of one data row
def build_formats(row)
  row.each_with_index do |col, i|
    next if col.nil? || col.empty?

    @formats[i] ||= Hash.new(0)

    # The first matching test wins; :string is the fallback.
    detected = :string
    if col.strip[FORMATS[:numeric]]
      detected = :numeric
    elsif uri?(col)
      detected = :uri
    elsif possible_date?(col)
      detected = date_formats(col)
    end

    @formats[i][detected] += 1
  end
end
406
+
407
# Warns (:inconsistent_values) about every column in which no single format
# accounts for at least 90% of the tallied cells.
def check_consistency
  @formats.each_with_index do |tallies, index|
    next unless tallies

    total = tallies.values.reduce(:+).to_f
    dominant = tallies.any? { |_, count| count / total >= 0.9 }
    build_warnings(:inconsistent_values, :schema, nil, index + 1) unless dominant
  end
end
417
+
418
# When the schema is a CSVW table group, has it validate its foreign keys
# and collects the resulting errors and warnings. Does nothing otherwise.
def check_foreign_keys
  return unless @schema.instance_of? Csvlint::Csvw::TableGroup

  @schema.validate_foreign_keys
  @errors += @schema.errors
  @warnings += @schema.warnings
end
425
+
426
# Tries to find CSVW metadata describing the source.
#
# Nothing is looked up for StringIO sources. A schema that was passed in is
# kept when it has a table for the source and dropped otherwise. The default
# locations ("{+url}-metadata.json" and "csv-metadata.json", relative to the
# source) are then tried in order; the first table group that describes the
# source becomes @schema. When none does, @schema ends up nil and, if a
# candidate was found that did not mention the source, a :schema_mismatch
# warning is added.
def locate_schema
  @source_url = nil
  warn_if_unsuccessful = false
  case @source
  when StringIO
    return
  when File
    @source_url = "file:#{File.expand_path(@source)}"
  else
    @source_url = @source
  end
  unless @schema.nil?
    if @schema.tables[@source_url]
      return
    else
      @schema = nil
    end
  end

  paths = []
  if @source_url =~ /^http(s)?/
    begin
      well_known_uri = URI.join(@source_url, "/.well-known/csvm")
      # The response is fetched but not used yet.
      well_known = open(well_known_uri).read
      # TODO
    rescue OpenURI::HTTPError, URI::BadURIError
    end
  end
  paths = ["{+url}-metadata.json", "csv-metadata.json"] if paths.empty?
  paths.each do |template|
    begin
      template = URITemplate.new(template)
      path = template.expand('url' => @source_url)
      url = URI.join(@source_url, path)
      url = File.new(url.to_s.sub(/^file:/, "")) if url.to_s =~ /^file:/
      schema = Schema.load_from_json(url)
      if schema.instance_of? Csvlint::Csvw::TableGroup
        if schema.tables[@source_url]
          @schema = schema
          # Stop here: falling through would reach the `@schema = nil` at
          # the end of the method and throw the located schema away.
          return
        else
          warn_if_unsuccessful = true
          build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
        end
      end
    rescue Errno::ENOENT
    rescue OpenURI::HTTPError, URI::BadURIError, ArgumentError
    rescue => e
      STDERR.puts e.class
      STDERR.puts e.message
      STDERR.puts e.backtrace
      raise e
    end
  end
  # `schema` here is the attribute reader: the local of the same name only
  # exists inside the block above.
  build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema) if warn_if_unsuccessful
  @schema = nil
end
485
+
486
+ private
487
+
488
# Works out the file extension of the source.
#
# Files give the extension of their path. Other IO objects, StringIO and
# Tempfile (the latter shows up when a dialect is revalidated) give "".
# Anything else is parsed as a URI and gives the extension of its path, or
# "" when it is not a valid URI.
#
# @param source [Object] the source handed to the validator
# @return [String] the extension including its leading dot, or ""
def parse_extension(source)
  case source
  when File
    File.extname(source.path)
  when IO, StringIO, Tempfile
    ""
  else
    begin
      File.extname(URI.parse(source).path)
    rescue URI::InvalidURIError
      ""
    end
  end
end
509
+
510
# Whether the value is an HTTP or HTTPS URI.
#
# @param value [String] cell value
# @return [Boolean, nil] nil when the value does not even look like one
#   (FORMATS[:uri]), false when it fails to parse, otherwise whether the
#   parsed URI is an HTTP(S) one
def uri?(value)
  return nil unless value.strip[FORMATS[:uri]]

  parsed = URI.parse(value)
  parsed.kind_of?(URI::HTTP) || parsed.kind_of?(URI::HTTPS)
rescue URI::InvalidURIError
  false
end
518
+
519
# Cheap pre-check for date detection: the part of the value matched by
# POSSIBLE_DATE_REGEXP, or nil when there is no match.
def possible_date?(col)
  col.slice(POSSIBLE_DATE_REGEXP)
end
522
+
523
# Names the date/time format a value is written in.
#
# Candidates are tried in a fixed order; a candidate applies when the value
# matches its FORMATS regexp and survives a parse/format round trip
# (date_format?). :string is returned when none applies.
#
# @param col [String] cell value
# @return [Symbol] the format name, or :string
def date_formats(col)
  candidates = [
    [:date_db,          Date, '%Y-%m-%d'],
    [:date_short,       Date, '%e %b'],
    [:date_rfc822,      Date, '%e %b %Y'],
    [:date_long,        Date, '%B %e, %Y'],
    [:dateTime_time,    Time, '%H:%M'],
    [:dateTime_hms,     Time, '%H:%M:%S'],
    [:dateTime_db,      Time, '%Y-%m-%d %H:%M:%S'],
    [:dateTime_iso8601, Time, '%Y-%m-%dT%H:%M:%SZ'],
    [:dateTime_short,   Time, '%d %b %H:%M'],
    [:dateTime_long,    Time, '%B %d, %Y %H:%M']
  ]
  found = candidates.find do |name, klass, pattern|
    col[FORMATS[name]] && date_format?(klass, col, pattern)
  end
  found ? found.first : :string
end
548
+
549
# Whether the value is written exactly in the given strptime format: it must
# parse with klass.strptime and format back to the very same text.
#
# @param klass [Class] class providing strptime (Date or Time in this file)
# @param value [String] text to check
# @param format [String] strptime/strftime format
# @return [Boolean]
def date_format?(klass, value, format)
  parsed = klass.strptime(value, format)
  parsed.strftime(format) == value
rescue ArgumentError # invalid date
  false
end
554
+
555
# Whether a line limit is set and the current line number is beyond it.
def line_limit_reached?
  return false unless @limit_lines.present?
  @current_line > @limit_lines
end
558
+
559
# Classifies the line ending of a line: "\r\n" when the first of its last
# two characters is a carriage return, "\n" otherwise.
#
# @param line [String] a line of input
# @return [String] "\r\n" or "\n"
def get_line_break(line)
  marker = line.chars.last(2).first
  marker == "\r" ? "\r\n" : "\n"
end
567
+
568
# Regexps used to recognise the format of a cell value. :string has no
# pattern of its own.
#
# NOTE: Date::MONTHNAMES and Date::ABBR_MONTHNAMES start with nil, so the
# joined alternations below begin with an empty alternative.
FORMATS = {
  string: nil,
  numeric: /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
  uri: /\Ahttps?:/,
  date_db: /\A\d{4,}-\d\d-\d\d\z/,                                                     # "12345-01-01"
  date_long: /\A(?:#{Date::MONTHNAMES.join('|')}) [ \d]\d, \d{4,}\z/,                  # "January  1, 12345"
  date_rfc822: /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d{4,}\z/,            # " 1 Jan 12345"
  date_short: /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')})\z/,                    # "1 Jan"
  dateTime_db: /\A\d{4,}-\d\d-\d\d \d\d:\d\d:\d\d\z/,                                  # "12345-01-01 00:00:00"
  dateTime_hms: /\A\d\d:\d\d:\d\d\z/,                                                  # "00:00:00"
  dateTime_iso8601: /\A\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\dZ\z/,                            # "12345-01-01T00:00:00Z"
  dateTime_long: /\A(?:#{Date::MONTHNAMES.join('|')}) \d\d, \d{4,} \d\d:\d\d\z/,       # "January 01, 12345 00:00"
  dateTime_short: /\A\d\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d\d:\d\d\z/,         # "01 Jan 00:00"
  dateTime_time: /\A\d\d:\d\d\z/,                                                      # "00:00"
}.freeze

# Building blocks for LINK_HEADER_REGEXP, which is matched against the
# individual values of the "link" response header.
URI_REGEXP = /(?<uri>.*?)/
TOKEN_REGEXP = /([^\(\)\<\>@,;:\\"\/\[\]\?=\{\} \t]+)/
QUOTED_STRING_REGEXP = /("[^"]*")/
SGML_NAME_REGEXP = /([A-Za-z][-A-Za-z0-9\.]*)/

# One relationship name, or a quoted, space-separated list of names.
RELATIONSHIP_REGEXP = Regexp.new("(?<relationship>#{SGML_NAME_REGEXP}|(\"#{SGML_NAME_REGEXP}(\\s+#{SGML_NAME_REGEXP})*\"))")

# The individual kinds of link parameter.
REL_REGEXP = Regexp.new("(?<rel>\\s*rel\\s*=\\s*(?<rel-relationship>#{RELATIONSHIP_REGEXP}))")
REV_REGEXP = Regexp.new("(?<rev>\\s*rev\\s*=\\s*#{RELATIONSHIP_REGEXP})")
TITLE_REGEXP = Regexp.new("(?<title>\\s*title\\s*=\\s*#{QUOTED_STRING_REGEXP})")
ANCHOR_REGEXP = Regexp.new("(?<anchor>\\s*anchor\\s*=\\s*\\<#{URI_REGEXP}\\>)")
LINK_EXTENSION_REGEXP = Regexp.new("(?<link-extension>(?<param>#{TOKEN_REGEXP})(\\s*=\\s*(?<param-value>#{TOKEN_REGEXP}|#{QUOTED_STRING_REGEXP}))?)")

# Any one parameter, and the complete link value: "<uri>" followed by
# ";"-separated parameters.
LINK_PARAM_REGEXP = Regexp.new("(#{REL_REGEXP}|#{REV_REGEXP}|#{TITLE_REGEXP}|#{ANCHOR_REGEXP}|#{LINK_EXTENSION_REGEXP})")
LINK_HEADER_REGEXP = Regexp.new("\<#{URI_REGEXP}\>(\\s*;\\s*#{LINK_PARAM_REGEXP})*")

# Quick test for values worth checking against the date formats: they start
# with a digit, with whitespace and a digit, or with a month name.
POSSIBLE_DATE_REGEXP = Regexp.new("\\A(\\d|\\s\\d#{Date::ABBR_MONTHNAMES.join('|')}#{Date::MONTHNAMES.join('|')})")
597
+
598
+ end
599
+ end