csvlint 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- YjlmZmFlNGZjOWQ5MmNlNDZiOTUxMWY0NGExYTRkYjhhNzdlNjAyNA==
4
+ MmUxZTY5NThhMmU1ZmVlM2M0OWJiMzQ5MGY2NGRiMzk5NGEyYzEyYQ==
5
5
  data.tar.gz: !binary |-
6
- ODFjZmJkZmI0Nzg2NmMzN2ViOGNiNDlmODA0NDcxMzM0Zjk4NTgwOQ==
6
+ NTllMTYzYjUyYTk0ZTcwZmY5NDJkZjVlMGQzNzM4YWNkYWU2NjFjMg==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- ZTIyMGVkYjIyMjc2ZWViNTBhYmZkMWIxN2E1OTU0OTFhNGMxNzBlYzg0OTI4
10
- NDRkMzY2YzgxNmQwZGZiZDE5M2M2NzYwMzk3ZWZjMDc3YWM0YzQ0NTczY2U3
11
- MGZjNTUwMGI2MzgzZDQxYzkzMzBiNzI3NmJkZTIxYjZiYjc5MDA=
9
+ NTc2NTdhMzI4ZGI5NzFiMzgwZWYwM2E1YWVhMzE2ZmY5ZDUyNzdkODU1MTkw
10
+ OTgyZGM1ZGFhODMxNGVmNDkwNjY3ZjY5NDEyM2YzYWJjZDQ3NThiODRiOWY1
11
+ OTU1NGM4NGQ0NzQ3ZmRiYmM2MDM1YWM5YWJlMDRiN2MyNWI0YmI=
12
12
  data.tar.gz: !binary |-
13
- NTI1M2I5Yzc3NGNhOTg3Y2VkMmM3ZGM1ZTdiZWNmMzM0ZTY5ODljODNmNWYy
14
- MDA0NGVlMGFhNDQ2ZjZjYjI0Nzc2OTdhMWRmODI5YTEzMGRmNTQxZjAyOTA5
15
- YjVmMjk4NDIyOWEzMzIxMTBlYjQ4YTgwZmE4MWZlYTQ4MjMzZmE=
13
+ OTQ2NDNkN2RjNDlhZDNlYTI3NmU5NmQ4YTIxOTYxMjQyMTg2MWNhODFkZWQ2
14
+ ZDYyYWUyNzJjZGNkYzFkYWU0YjI2NzkwZTI1OGNkODFmNTZhNzhjNjE5OGY4
15
+ MmQzMzFkMTIxYzNkODM5NDFkNzc4ZDYwMjc2YTE2ZmZkZDgxZWY=
@@ -0,0 +1,2 @@
1
+ # Don't fuck with my CSV files
2
+ *.csv binary
@@ -2,7 +2,25 @@
2
2
 
3
3
  ## [Unreleased](https://github.com/theodi/csvlint.rb/tree/HEAD)
4
4
 
5
- [Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.1.4...HEAD)
5
+ [Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.0...HEAD)
6
+
7
+ **Implemented enhancements:**
8
+
9
+ - Get total rows number about the CSV file that was validated [\#143](https://github.com/theodi/csvlint.rb/issues/143)
10
+
11
+ **Closed issues:**
12
+
13
+ - Optimization: Stream CSV [\#122](https://github.com/theodi/csvlint.rb/issues/122)
14
+
15
+ **Merged pull requests:**
16
+
17
+ - Add `row\_count` method [\#153](https://github.com/theodi/csvlint.rb/pull/153) ([pezholio](https://github.com/pezholio))
18
+
19
+ - Streaming validation [\#146](https://github.com/theodi/csvlint.rb/pull/146) ([pezholio](https://github.com/pezholio))
20
+
21
+ ## [0.2.0](https://github.com/theodi/csvlint.rb/tree/0.2.0) (2015-10-05)
22
+
23
+ [Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.1.4...0.2.0)
6
24
 
7
25
  **Closed issues:**
8
26
 
data/README.md CHANGED
@@ -77,6 +77,7 @@ best practices
77
77
  validator.encoding
78
78
  validator.content_type
79
79
  validator.extension
80
+ validator.row_count
80
81
 
81
82
  #retrieve HTTP headers from request
82
83
  validator.headers
@@ -128,7 +129,6 @@ The following types of error can be reported:
128
129
  * `:unclosed_quote` -- unclosed quoted field
129
130
  * `:whitespace` -- a quoted column has leading or trailing whitespace
130
131
  * `:line_breaks` -- line breaks were inconsistent or incorrectly specified
131
- * `:undeclared_header` -- if there is no machine-readable description of whether a header is present (e.g. in a dialect or `Content-Type` header)
132
132
 
133
133
  ## Warnings
134
134
 
@@ -271,6 +271,20 @@ options = {
271
271
  validator = Csvlint::Validator.new( "http://example.org/data.csv", nil, nil, options )
272
272
  ```
273
273
 
274
+ * :lambda -- Pass a lambda to be called as each line is validated; it is given access to the `Validator` object. For example, this will print the current line number for every line validated:
275
+
276
+ ```
277
+ options = {
278
+ lambda: ->(validator) { puts validator.current_line }
279
+ }
280
+ validator = Csvlint::Validator.new( "http://example.org/data.csv", nil, nil, options )
281
+ => 1
282
+ 2
283
+ 3
284
+ 4
285
+ .....
286
+ ```
287
+
274
288
  ## Contributing
275
289
 
276
290
  1. Fork it
@@ -58,12 +58,22 @@ def print_error(index, error, dump, color)
58
58
  end
59
59
 
60
60
  def validate_csv(source, schema, dump)
61
- validator = Csvlint::Validator.new( source, nil, schema )
61
+ @error_count = 0
62
+ report_lines = lambda do |row|
63
+ new_errors = row.errors.count
64
+ if new_errors > @error_count
65
+ print "!".red
66
+ else
67
+ print ".".green
68
+ end
69
+ @error_count = new_errors
70
+ end
71
+ validator = Csvlint::Validator.new( source, {}, schema, { lambda: report_lines } )
62
72
 
63
73
  if $stdout.tty?
64
- puts "#{source.path || source || "CSV"} is #{validator.valid? ? "VALID".green : "INVALID".red}"
74
+ puts "\r\n#{source.path || source || "CSV"} is #{validator.valid? ? "VALID".green : "INVALID".red}"
65
75
  else
66
- puts "#{source.path || source || "CSV"} is #{validator.valid? ? "VALID" : "INVALID"}"
76
+ puts "\r\n#{source.path || source || "CSV"} is #{validator.valid? ? "VALID" : "INVALID"}"
67
77
  end
68
78
 
69
79
  if validator.errors.size > 0
@@ -23,6 +23,7 @@ Gem::Specification.new do |spec|
23
23
  spec.add_dependency "open_uri_redirections"
24
24
  spec.add_dependency "activesupport"
25
25
  spec.add_dependency "addressable"
26
+ spec.add_dependency "typhoeus"
26
27
  spec.add_dependency "escape_utils"
27
28
  spec.add_dependency "uri_template"
28
29
 
@@ -14,7 +14,7 @@ Feature: Collect all the tests that should trigger dialect check related errors
14
14
  And it is stored at the url "http://example.com/example1.csv"
15
15
  And I set header to "true"
16
16
  And I ask if there are info messages
17
- Then there should be 2 info messages
17
+ Then there should be 1 info message
18
18
  And one of the messages should have the type "nonrfc_line_breaks"
19
19
 
20
20
  Scenario: CR line endings in file give an info message of type :nonrfc_line_breaks
@@ -22,7 +22,7 @@ Feature: Collect all the tests that should trigger dialect check related errors
22
22
  And it is stored at the url "http://example.com/example1.csv"
23
23
  And I set header to "true"
24
24
  And I ask if there are info messages
25
- Then there should be 2 info messages
25
+ Then there should be 1 info message
26
26
  And one of the messages should have the type "nonrfc_line_breaks"
27
27
 
28
28
  Scenario: CRLF line endings in file produces no info messages of type :nonrfc_line_breaks
@@ -30,13 +30,13 @@ Feature: Collect all the tests that should trigger dialect check related errors
30
30
  And it is stored at the url "http://example.com/example1.csv"
31
31
  And I set header to "true"
32
32
  And I ask if there are info messages
33
- Then there should be 1 info message
33
+ Then there should be 0 info messages
34
34
 
35
35
  # :line_breaks
36
36
 
37
37
  Scenario: Incorrect line endings specified in settings
38
- Given I have a CSV file called "cr-line-endings.csv"
39
- And I set the line endings to linefeed
38
+ Given I have a CSV file called "lf-line-endings.csv"
39
+ And I set the line endings to carriage return
40
40
  And it is stored at the url "http://example.com/example1.csv"
41
41
  And I ask if there are errors
42
42
  Then there should be 1 error
@@ -10,13 +10,13 @@ Feature: Return information
10
10
  And it is stored at the url "http://example.com/example1.csv?query=true"
11
11
 
12
12
  Scenario: Return encoding
13
- Then the "encoding" should be "utf-8"
14
-
13
+ Then the "encoding" should be "UTF-8"
14
+
15
15
  Scenario: Return content type
16
- Then the "content_type" should be "text/csv"
16
+ Then the "content_type" should be "text/csv; charset=utf-8"
17
17
 
18
18
  Scenario: Return extension
19
19
  Then the "extension" should be ".csv"
20
-
20
+
21
21
  Scenario: Return meta
22
22
  Then the metadata content type should be "text/csv; charset=utf-8"
@@ -13,6 +13,11 @@ Given(/^I set the line endings to linefeed$/) do
13
13
  @csv_options["lineTerminator"] = "\n"
14
14
  end
15
15
 
16
+ Given(/^I set the line endings to carriage return$/) do
17
+ @csv_options ||= default_csv_options
18
+ @csv_options["lineTerminator"] = "\r"
19
+ end
20
+
16
21
  Given(/^I set header to "(.*?)"$/) do |boolean|
17
22
  @csv_options ||= default_csv_options
18
23
  @csv_options["header"] = boolean == "true"
@@ -27,7 +27,7 @@ Feature: Get validation errors
27
27
  And that error should have the row "2"
28
28
  And that error should have the content ""Foo","Bar","Baz"
29
29
 
30
- Scenario: Successfully report a CSV with incorrect whitespace
30
+ Scenario: Successfully report a CSV with incorrect whitespace
31
31
  Given I have a CSV with the following content:
32
32
  """
33
33
  "col1","col2","col3"
@@ -3,22 +3,22 @@ Feature: Get validation information messages
3
3
  Scenario: LF line endings in file give an info message
4
4
  Given I have a CSV file called "lf-line-endings.csv"
5
5
  And it is stored at the url "http://example.com/example1.csv"
6
- And I set header to "true"
6
+ And I set header to "true"
7
7
  And I ask if there are info messages
8
- Then there should be 2 info messages
8
+ Then there should be 1 info messages
9
9
  And one of the messages should have the type "nonrfc_line_breaks"
10
10
 
11
11
  Scenario: CR line endings in file give an info message
12
12
  Given I have a CSV file called "cr-line-endings.csv"
13
13
  And it is stored at the url "http://example.com/example1.csv"
14
- And I set header to "true"
14
+ And I set header to "true"
15
15
  And I ask if there are info messages
16
- Then there should be 2 info messages
16
+ Then there should be 1 info messages
17
17
  And one of the messages should have the type "nonrfc_line_breaks"
18
18
 
19
19
  Scenario: CRLF line endings in file produces no info messages
20
20
  Given I have a CSV file called "crlf-line-endings.csv"
21
21
  And it is stored at the url "http://example.com/example1.csv"
22
- And I set header to "true"
22
+ And I set header to "true"
23
23
  And I ask if there are info messages
24
- Then there should be 1 info message
24
+ Then there should be 0 info messages
@@ -3,6 +3,7 @@ require 'date'
3
3
  require 'open-uri'
4
4
  require 'set'
5
5
  require 'tempfile'
6
+ require 'typhoeus'
6
7
 
7
8
  require 'active_support/core_ext/date/conversions'
8
9
  require 'active_support/core_ext/time/conversions'
@@ -13,7 +14,6 @@ require 'uri_template'
13
14
  require 'csvlint/error_message'
14
15
  require 'csvlint/error_collector'
15
16
  require 'csvlint/validate'
16
- require 'csvlint/wrapped_io'
17
17
  require 'csvlint/field'
18
18
 
19
19
  require 'csvlint/csvw/metadata_error'
@@ -134,7 +134,7 @@ module Csvlint
134
134
  fractional_regexp += "(#{Regexp.escape(@grouping_separator)}[0-9]{1,#{max_fraction_digits % @fractional_grouping_size}})?" if max_fraction_digits % @fractional_grouping_size > 0
135
135
  else
136
136
  fractional_regexp += "([0-9]{#{@fractional_grouping_size}}#{Regexp.escape(@grouping_separator)}){0,#{(max_fraction_digits / @fractional_grouping_size) - 1}}" if max_fraction_digits > @fractional_grouping_size
137
- fractional_regexp += "[0-9]{#{@fractional_grouping_size}}"
137
+ fractional_regexp += "[0-9]{1,#{@fractional_grouping_size}}"
138
138
  end
139
139
  fractional_regexp = "#{Regexp.escape(@decimal_separator)}#{fractional_regexp}"
140
140
  fractional_regexp = "(#{fractional_regexp})?" if min_fraction_digits == 0
@@ -17,7 +17,7 @@ module Csvlint
17
17
  def validate_column(value, row=nil, column=nil, all_errors=[])
18
18
  reset
19
19
  unless all_errors.any?{|error| ((error.type == :invalid_regex) && (error.column == column))}
20
- validate_regex(value, row, column)
20
+ validate_regex(value, row, column, all_errors)
21
21
  end
22
22
  validate_length(value, row, column)
23
23
  validate_values(value, row, column)
@@ -42,7 +42,7 @@ module Csvlint
42
42
  end
43
43
  end
44
44
 
45
- def validate_regex(value, row, column)
45
+ def validate_regex(value, row, column, all_errors)
46
46
  pattern = constraints["pattern"]
47
47
  if pattern
48
48
  begin
@@ -50,12 +50,18 @@ module Csvlint
50
50
  build_errors(:pattern, :schema, row, column, value,
51
51
  { "pattern" => constraints["pattern"] } ) if !value.nil? && !value.match( constraints["pattern"] )
52
52
  rescue RegexpError
53
- build_errors(:invalid_regex, :schema, nil, column, ("#{name}: Constraints: Pattern: #{pattern}"),
54
- { "pattern" => constraints["pattern"] })
53
+ build_regex_error(value, row, column, pattern, all_errors)
55
54
  end
56
55
  end
57
56
  end
58
57
 
58
+ def build_regex_error(value, row, column, pattern, all_errors)
59
+ return if @regex_error_exists
60
+ build_errors(:invalid_regex, :schema, nil, column, ("#{name}: Constraints: Pattern: #{pattern}"),
61
+ { "pattern" => constraints["pattern"] })
62
+ @regex_error_exists = true
63
+ end
64
+
59
65
  def validate_values(value, row, column)
60
66
  # If a pattern exists, raise an invalid regex error if it is not in
61
67
  # valid regex form, else, if the value of the relevant field in the csv
@@ -4,181 +4,308 @@ module Csvlint
4
4
 
5
5
  include Csvlint::ErrorCollector
6
6
 
7
- attr_reader :encoding, :content_type, :extension, :headers, :link_headers, :line_breaks, :dialect, :csv_header, :schema, :data
7
+ attr_reader :encoding, :content_type, :extension, :headers, :link_headers, :dialect, :csv_header, :schema, :data, :current_line
8
8
 
9
9
  ERROR_MATCHERS = {
10
- "Missing or stray quote" => :stray_quote,
11
- "Illegal quoting" => :whitespace,
12
- "Unclosed quoted field" => :unclosed_quote,
13
- "Unquoted fields do not allow \\r or \\n" => :line_breaks,
10
+ "Missing or stray quote" => :stray_quote,
11
+ "Illegal quoting" => :whitespace,
12
+ "Unclosed quoted field" => :unclosed_quote,
13
+ "Unquoted fields do not allow \\r or \\n" => :line_breaks,
14
14
  }
15
15
 
16
- def initialize(source, dialect = nil, schema = nil, options = {})
16
+ def initialize(source, dialect = {}, schema = nil, options = {})
17
17
  reset
18
18
  @source = source
19
19
  @formats = []
20
20
  @schema = schema
21
-
22
- @supplied_dialect = dialect != nil
21
+ @dialect = dialect
22
+ @csv_header = true
23
+ @headers = {}
24
+ @lambda = options[:lambda] || lambda { |a| nil }
25
+ @leading = ""
23
26
 
24
27
  @limit_lines = options[:limit_lines]
25
28
  @extension = parse_extension(source) unless @source.nil?
29
+
30
+ @expected_columns = 0
31
+ @col_counts = []
32
+ @line_breaks = []
33
+
26
34
  @errors += @schema.errors unless @schema.nil?
27
35
  @warnings += @schema.warnings unless @schema.nil?
28
- validate(dialect)
29
36
 
37
+ @data = [] # it may be advisable to flush this on init?
38
+
39
+ validate
30
40
  end
31
41
 
32
- def validate(dialect = nil)
33
- single_col = false
34
- io = nil
42
+ def validate
43
+ if @extension =~ /.xls(x)?/
44
+ build_warnings(:excel, :context)
45
+ return
46
+ end
47
+ locate_schema unless @schema.instance_of?(Csvlint::Schema)
48
+ set_dialect
49
+
50
+ if @source.class == String
51
+ validate_url
52
+ else
53
+ validate_metadata
54
+ validate_stream
55
+ end
56
+ finish
57
+ end
58
+
59
+ def validate_stream
60
+ @current_line = 1
61
+ @source.each_line do |line|
62
+ break if line_limit_reached?
63
+ parse_line(line)
64
+ end
65
+ validate_line(@leading, @current_line) unless @leading == ""
66
+ end
67
+
68
+ def validate_url
69
+ @current_line = 1
35
70
  begin
36
- if @extension =~ /.xls(x)?/
37
- build_warnings(:excel, :context)
38
- return
71
+ request = Typhoeus::Request.new(@source, followlocation: true)
72
+ request.on_headers do |response|
73
+ @headers = response.headers || {}
74
+ @content_type = response.headers["content-type"] rescue nil
75
+ @response_code = response.code
76
+ return build_errors(:not_found) if response.code == 404
77
+ validate_metadata
39
78
  end
40
- io = @source.respond_to?(:gets) ? @source : open(@source, :allow_redirections=>:all)
41
- validate_metadata(io)
42
- locate_schema unless @schema.instance_of?(Csvlint::Schema)
43
- set_dialect(dialect)
44
- parse_csv(io)
45
- sum = @col_counts.inject(:+)
46
- unless sum.nil?
47
- build_warnings(:title_row, :structure) if @col_counts.first < (sum / @col_counts.size.to_f)
79
+ request.on_body do |chunk|
80
+ io = StringIO.new(@leading + chunk)
81
+ io.each_line do |line|
82
+ break if line_limit_reached?
83
+ parse_line(line)
84
+ end
48
85
  end
49
- build_warnings(:check_options, :structure) if @expected_columns == 1
50
- check_consistency
51
- check_foreign_keys
52
- rescue OpenURI::HTTPError, Errno::ENOENT
53
- build_errors(:not_found, nil, nil, nil, @source)
54
- ensure
55
- io.close if io && io.respond_to?(:close)
86
+ request.run
87
+ # Validate the last line too
88
+ validate_line(@leading, @current_line) unless @leading == ""
89
+ rescue ArgumentError => ae
90
+ build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
91
+ @reported_invalid_encoding = true
56
92
  end
57
93
  end
58
94
 
59
- def validate_metadata(io)
60
- @csv_header = true
61
- @encoding = io.charset rescue nil
62
- @content_type = io.content_type rescue nil
63
- @headers = io.meta rescue nil
64
- @link_headers = @headers["link"].split(",") rescue nil
65
- assumed_header = undeclared_header = !@supplied_dialect
66
- if @headers
95
+ def parse_line(line)
96
+ line = @leading + line
97
+ # Check if the last character is a line break - in which case it's a full line
98
+ if line[-1, 1].include?("\n")
99
+ # If the number of quotes is odd, the linebreak is inside some quotes
100
+ if line.count(@dialect["quoteChar"]).odd?
101
+ @leading = line
102
+ else
103
+ validate_line(line, @current_line)
104
+ @leading = ""
105
+ @current_line = @current_line+1
106
+ end
107
+ else
108
+ # If it's not a full line, then prepare to add it to the beginning of the next chunk
109
+ @leading = line
110
+ end
111
+ end
112
+
113
+ def validate_line(input = nil, index = nil)
114
+ @input = input
115
+ single_col = false
116
+ line = index.present? ? index : 0
117
+ @encoding = input.encoding.to_s
118
+ report_line_breaks(line)
119
+ parse_contents(input, line)
120
+ @lambda.call(self)
121
+ rescue ArgumentError => ae
122
+ build_errors(:invalid_encoding, :structure, @current_line, nil, index) unless @reported_invalid_encoding
123
+ @reported_invalid_encoding = true
124
+ end
125
+
126
+ # analyses the provided csv and builds errors, warnings and info messages
127
+ def parse_contents(stream, line = nil)
128
+ # parse_contents will parse one line and apply headers, formats methods and error handle as appropriate
129
+ current_line = line.present? ? line : 1
130
+ all_errors = []
131
+
132
+ @csv_options[:encoding] = @encoding
133
+
134
+ begin
135
+ row = CSV.parse_line(stream, @csv_options)
136
+ # this is a one line substitute for CSV.new followed by row = CSV.shift. a CSV Row class is required
137
+ # CSV.parse will return an array of arrays which breaks subsequent each_with_index invocations
138
+ # TODO investigate if above would be a drag on memory
139
+
140
+ rescue CSV::MalformedCSVError => e
141
+ build_exception_messages(e, stream, current_line)
142
+ end
143
+
144
+ @data << row
145
+ if row
146
+ if current_line <= 1 && @csv_header
147
+ # this conditional should be refactored somewhere
148
+ row = row.reject { |col| col.nil? || col.empty? }
149
+ validate_header(row)
150
+ @col_counts << row.size
151
+ else
152
+ build_formats(row)
153
+ @col_counts << row.reject { |col| col.nil? || col.empty? }.size
154
+ @expected_columns = row.size unless @expected_columns != 0
155
+ build_errors(:blank_rows, :structure, current_line, nil, stream.to_s) if row.reject { |c| c.nil? || c.empty? }.size == 0
156
+ # Builds errors and warnings related to the provided schema file
157
+ if @schema
158
+ @schema.validate_row(row, current_line, all_errors, @source)
159
+ @errors += @schema.errors
160
+ all_errors += @schema.errors
161
+ @warnings += @schema.warnings
162
+ else
163
+ build_errors(:ragged_rows, :structure, current_line, nil, stream.to_s) if !row.empty? && row.size != @expected_columns
164
+ end
165
+ end
166
+ end
167
+ end
168
+
169
+ def finish
170
+ sum = @col_counts.inject(:+)
171
+ unless sum.nil?
172
+ build_warnings(:title_row, :structure) if @col_counts.first < (sum / @col_counts.size.to_f)
173
+ end
174
+ # return expected_columns to calling class
175
+ build_warnings(:check_options, :structure) if @expected_columns == 1
176
+ check_consistency
177
+ check_foreign_keys
178
+ check_mixed_linebreaks
179
+ validate_encoding
180
+ end
181
+
182
+ def validate_metadata
183
+ assumed_header = !@supplied_dialect
184
+ unless @headers.empty?
67
185
  if @headers["content-type"] =~ /text\/csv/
68
- @csv_header = true
69
- undeclared_header = false
70
- assumed_header = true
186
+ @csv_header = @csv_header && true
187
+ assumed_header = @assumed_header.present?
71
188
  end
72
189
  if @headers["content-type"] =~ /header=(present|absent)/
73
190
  @csv_header = true if $1 == "present"
74
191
  @csv_header = false if $1 == "absent"
75
- undeclared_header = false
76
192
  assumed_header = false
77
193
  end
78
- if @headers["content-type"] !~ /charset=/
79
- build_warnings(:no_encoding, :context)
80
- else
81
- build_warnings(:encoding, :context) if @encoding != "utf-8"
82
- end
83
194
  build_warnings(:no_content_type, :context) if @content_type == nil
84
195
  build_errors(:wrong_content_type, :context) unless (@content_type && @content_type =~ /text\/csv/)
196
+ end
197
+ @header_processed = true
198
+ build_info_messages(:assumed_header, :structure) if assumed_header
85
199
 
86
- if undeclared_header
87
- build_errors(:undeclared_header, :structure)
88
- assumed_header = false
200
+ @link_headers = @headers["link"].split(",") rescue nil
201
+ @link_headers.each do |link_header|
202
+ match = LINK_HEADER_REGEXP.match(link_header)
203
+ uri = match["uri"].gsub(/(^\<|\>$)/, "") rescue nil
204
+ rel = match["rel-relationship"].gsub(/(^\"|\"$)/, "") rescue nil
205
+ param = match["param"]
206
+ param_value = match["param-value"].gsub(/(^\"|\"$)/, "") rescue nil
207
+ if rel == "describedby" && param == "type" && ["application/csvm+json", "application/ld+json", "application/json"].include?(param_value)
208
+ begin
209
+ url = URI.join(@source_url, uri)
210
+ schema = Schema.load_from_json(url)
211
+ if schema.instance_of? Csvlint::Csvw::TableGroup
212
+ if schema.tables[@source_url]
213
+ link_schema = schema
214
+ else
215
+ warn_if_unsuccessful = true
216
+ build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
217
+ end
218
+ end
219
+ rescue OpenURI::HTTPError
220
+ end
89
221
  end
222
+ end if @link_headers
223
+ end
224
+
225
+ def header?
226
+ @csv_header && @dialect["header"]
227
+ end
90
228
 
229
+ def report_line_breaks(line_no=nil)
230
+ return if @input !~ /[\r|\n]/ # Return straight away if there's no newline character - i.e. we're on the last line
231
+ line_break = CSV.new(@input).row_sep
232
+ @line_breaks << line_break
233
+ unless line_breaks_reported?
234
+ if line_break != "\r\n"
235
+ build_info_messages(:nonrfc_line_breaks, :structure, line_no)
236
+ @line_breaks_reported = true
237
+ end
91
238
  end
92
- build_info_messages(:assumed_header, :structure) if assumed_header
93
239
  end
94
240
 
95
- def set_dialect(dialect)
241
+ def line_breaks_reported?
242
+ @line_breaks_reported === true
243
+ end
244
+
245
+ def set_dialect
246
+ @assumed_header = @dialect["header"].nil?
247
+ @supplied_dialect = @dialect != {}
248
+
96
249
  begin
97
250
  schema_dialect = @schema.tables[@source_url].dialect || {}
98
251
  rescue
99
252
  schema_dialect = {}
100
253
  end
101
254
  @dialect = {
102
- "header" => true,
103
- "delimiter" => ",",
104
- "skipInitialSpace" => true,
105
- "lineTerminator" => :auto,
106
- "quoteChar" => '"',
107
- "trim" => :true
108
- }.merge(schema_dialect).merge(dialect || {})
255
+ "header" => true,
256
+ "delimiter" => ",",
257
+ "skipInitialSpace" => true,
258
+ "lineTerminator" => :auto,
259
+ "quoteChar" => '"',
260
+ "trim" => :true
261
+ }.merge(schema_dialect).merge(@dialect || {})
109
262
 
110
263
  @csv_header = @csv_header && @dialect["header"]
111
264
  @csv_options = dialect_to_csv_options(@dialect)
112
265
  end
113
266
 
114
- # analyses the provided csv and builds errors, warnings and info messages
115
- def parse_csv(io)
116
- @expected_columns = 0
117
- current_line = 0
118
- reported_invalid_encoding = false
119
- all_errors = []
120
- @col_counts = []
267
+ def validate_encoding
268
+ if @headers["content-type"]
269
+ if @headers["content-type"] !~ /charset=/
270
+ build_warnings(:no_encoding, :context)
271
+ elsif @headers["content-type"] !~ /charset=utf-8/i
272
+ build_warnings(:encoding, :context)
273
+ end
274
+ end
275
+ build_warnings(:encoding, :context) if @encoding != "UTF-8"
276
+ end
121
277
 
122
- @csv_options[:encoding] = @encoding
278
+ def check_mixed_linebreaks
279
+ build_linebreak_error if @line_breaks.uniq.count > 1
280
+ end
123
281
 
124
- begin
125
- wrapper = WrappedIO.new( io )
126
- csv = CSV.new( wrapper, @csv_options )
127
- @data = []
128
- @line_breaks = csv.row_sep
129
- if @line_breaks != "\r\n"
130
- build_info_messages(:nonrfc_line_breaks, :structure)
131
- end
132
- row = nil
133
- loop do
134
- current_line += 1
135
- if @limit_lines && current_line > @limit_lines
136
- break
137
- end
138
- begin
139
- wrapper.reset_line
140
- row = csv.shift
141
- @data << row
142
- if row
143
- if current_line == 1 && header?
144
- row = row.reject{|col| col.nil? || col.empty?}
145
- validate_header(row)
146
- @col_counts << row.size
147
- else
148
- build_formats(row)
149
- @col_counts << row.reject{|col| col.nil? || col.empty?}.size
150
- @expected_columns = row.size unless @expected_columns != 0
151
-
152
- build_errors(:blank_rows, :structure, current_line, nil, wrapper.line) if row.reject{ |c| c.nil? || c.empty? }.size == 0
153
- # Builds errors and warnings related to the provided schema file
154
- if @schema
155
- @schema.validate_row(row, current_line, all_errors, @source)
156
- @errors += @schema.errors
157
- all_errors += @schema.errors
158
- @warnings += @schema.warnings
159
- else
160
- build_errors(:ragged_rows, :structure, current_line, nil, wrapper.line) if !row.empty? && row.size != @expected_columns
161
- end
162
-
163
- end
164
- else
165
- break
166
- end
167
- rescue CSV::MalformedCSVError => e
168
- type = fetch_error(e)
169
- if type == :stray_quote && !wrapper.line.match(csv.row_sep)
170
- build_errors(:line_breaks, :structure)
171
- else
172
- build_errors(type, :structure, current_line, nil, wrapper.line)
173
- end
174
- end
282
+ def line_breaks
283
+ if @line_breaks.uniq.count > 1
284
+ :mixed
285
+ else
286
+ @line_breaks.uniq.first
175
287
  end
176
- rescue ArgumentError => ae
177
- build_errors(:invalid_encoding, :structure, current_line, nil, wrapper.line) unless reported_invalid_encoding
178
- reported_invalid_encoding = true
288
+ end
289
+
290
+ def row_count
291
+ data.count
292
+ end
293
+
294
+ def build_exception_messages(csvException, errChars, lineNo)
295
+ #TODO 1 - this is a change in logic, rather than straight refactor of previous error building, however original logic is bonkers
296
+ #TODO 2 - using .kind_of? is a very ugly fix here and is meant to work around instances where the :auto symbol is preserved in @csv_options
297
+ type = fetch_error(csvException)
298
+ if !@csv_options[:row_sep].kind_of?(Symbol) && [:unclosed_quote,:stray_quote].include?(type) && !@input.match(@csv_options[:row_sep])
299
+ build_linebreak_error
300
+ else
301
+ build_errors(type, :structure, lineNo, nil, errChars)
179
302
  end
180
303
  end
181
304
 
305
+ def build_linebreak_error
306
+ build_errors(:line_breaks, :structure) unless @errors.any? { |e| e.type == :line_breaks }
307
+ end
308
+
182
309
  def validate_header(header)
183
310
  names = Set.new
184
311
  header.map{|h| h.strip! } if @dialect["trim"] == :true
@@ -198,10 +325,6 @@ module Csvlint
198
325
  return valid?
199
326
  end
200
327
 
201
- def header?
202
- @csv_header
203
- end
204
-
205
328
  def fetch_error(error)
206
329
  e = error.message.match(/^(.+?)(?: [io]n)? \(?line \d+\)?\.?$/i)
207
330
  message = e[1] rescue nil
@@ -209,15 +332,15 @@ module Csvlint
209
332
  end
210
333
 
211
334
  def dialect_to_csv_options(dialect)
212
- skipinitialspace = dialect["skipInitialSpace"] || true
213
- delimiter = dialect["delimiter"]
214
- delimiter = delimiter + " " if !skipinitialspace
215
- return {
216
- :col_sep => delimiter,
217
- :row_sep => dialect["lineTerminator"],
218
- :quote_char => dialect["quoteChar"],
219
- :skip_blanks => false
220
- }
335
+ skipinitialspace = dialect["skipInitialSpace"] || true
336
+ delimiter = dialect["delimiter"]
337
+ delimiter = delimiter + " " if !skipinitialspace
338
+ return {
339
+ :col_sep => delimiter,
340
+ :row_sep => dialect["lineTerminator"],
341
+ :quote_char => dialect["quoteChar"],
342
+ :skip_blanks => false
343
+ }
221
344
  end
222
345
 
223
346
  def build_formats(row)
@@ -225,33 +348,34 @@ module Csvlint
225
348
  next if col.nil? || col.empty?
226
349
  @formats[i] ||= Hash.new(0)
227
350
 
228
- format = if col.strip[FORMATS[:numeric]]
229
- :numeric
230
- elsif uri?(col)
231
- :uri
232
- elsif col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d')
233
- :date_db
234
- elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
235
- :date_short
236
- elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
237
- :date_rfc822
238
- elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
239
- :date_long
240
- elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
241
- :dateTime_time
242
- elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
243
- :dateTime_hms
244
- elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
245
- :dateTime_db
246
- elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
247
- :dateTime_iso8601
248
- elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
249
- :dateTime_short
250
- elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
251
- :dateTime_long
252
- else
253
- :string
254
- end
351
+ format =
352
+ if col.strip[FORMATS[:numeric]]
353
+ :numeric
354
+ elsif uri?(col)
355
+ :uri
356
+ elsif col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d')
357
+ :date_db
358
+ elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
359
+ :date_short
360
+ elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
361
+ :date_rfc822
362
+ elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
363
+ :date_long
364
+ elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
365
+ :dateTime_time
366
+ elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
367
+ :dateTime_hms
368
+ elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
369
+ :dateTime_db
370
+ elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
371
+ :dateTime_iso8601
372
+ elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
373
+ :dateTime_short
374
+ elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
375
+ :dateTime_long
376
+ else
377
+ :string
378
+ end
255
379
 
256
380
  @formats[i][format] += 1
257
381
  end
@@ -277,15 +401,16 @@ module Csvlint
277
401
  end
278
402
 
279
403
  def locate_schema
404
+
280
405
  @source_url = nil
281
406
  warn_if_unsuccessful = false
282
407
  case @source
283
- when StringIO
284
- return
285
- when File
286
- @source_url = "file:#{File.expand_path(@source)}"
287
- else
288
- @source_url = @source
408
+ when StringIO
409
+ return
410
+ when File
411
+ @source_url = "file:#{File.expand_path(@source)}"
412
+ else
413
+ @source_url = @source
289
414
  end
290
415
  unless @schema.nil?
291
416
  if @schema.tables[@source_url]
@@ -295,28 +420,6 @@ module Csvlint
295
420
  end
296
421
  end
297
422
  link_schema = nil
298
- @link_headers.each do |link_header|
299
- match = LINK_HEADER_REGEXP.match(link_header)
300
- uri = match["uri"].gsub(/(^\<|\>$)/, "") rescue nil
301
- rel = match["rel-relationship"].gsub(/(^\"|\"$)/, "") rescue nil
302
- param = match["param"]
303
- param_value = match["param-value"].gsub(/(^\"|\"$)/, "") rescue nil
304
- if rel == "describedby" && param == "type" && ["application/csvm+json", "application/ld+json", "application/json"].include?(param_value)
305
- begin
306
- url = URI.join(@source_url, uri)
307
- schema = Schema.load_from_json(url)
308
- if schema.instance_of? Csvlint::Csvw::TableGroup
309
- if schema.tables[@source_url]
310
- link_schema = schema
311
- else
312
- warn_if_unsuccessful = true
313
- build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
314
- end
315
- end
316
- rescue OpenURI::HTTPError
317
- end
318
- end
319
- end if @link_headers
320
423
  @schema = link_schema if link_schema
321
424
 
322
425
  paths = []
@@ -324,8 +427,8 @@ module Csvlint
324
427
  begin
325
428
  well_known_uri = URI.join(@source_url, "/.well-known/csvm")
326
429
  well_known = open(well_known_uri).read
327
- # TODO
328
- rescue OpenURI::HTTPError
430
+ # TODO
431
+ rescue OpenURI::HTTPError, URI::BadURIError
329
432
  end
330
433
  end
331
434
  paths = ["{+url}-metadata.json", "csv-metadata.json"] if paths.empty?
@@ -345,8 +448,7 @@ module Csvlint
345
448
  end
346
449
  end
347
450
  rescue Errno::ENOENT
348
- rescue OpenURI::HTTPError
349
- rescue ArgumentError
451
+ rescue OpenURI::HTTPError, URI::BadURIError, ArgumentError
350
452
  rescue => e
351
453
  STDERR.puts e.class
352
454
  STDERR.puts e.message
@@ -361,23 +463,24 @@ module Csvlint
361
463
  private
362
464
 
363
465
  def parse_extension(source)
466
+
364
467
  case source
365
- when File
366
- return File.extname( source.path )
367
- when IO
368
- return ""
369
- when StringIO
370
- return ""
468
+ when File
469
+ return File.extname( source.path )
470
+ when IO
471
+ return ""
472
+ when StringIO
473
+ return ""
371
474
  when Tempfile
372
475
  # this is triggered when the revalidate dialect use case happens
373
- return ""
374
- else
375
- begin
376
- parsed = URI.parse(source)
377
- File.extname(parsed.path)
378
- rescue URI::InvalidURIError
379
476
  return ""
380
- end
477
+ else
478
+ begin
479
+ parsed = URI.parse(source)
480
+ File.extname(parsed.path)
481
+ rescue URI::InvalidURIError
482
+ return ""
483
+ end
381
484
  end
382
485
  end
383
486
 
@@ -396,20 +499,24 @@ module Csvlint
396
499
  false
397
500
  end
398
501
 
502
+ def line_limit_reached?
503
+ @limit_lines.present? && @current_line > @limit_lines
504
+ end
505
+
399
506
  FORMATS = {
400
- :string => nil,
401
- :numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
402
- :uri => /\Ahttps?:/,
403
- :date_db => /\A\d{4,}-\d\d-\d\d\z/, # "12345-01-01"
404
- :date_long => /\A(?:#{Date::MONTHNAMES.join('|')}) [ \d]\d, \d{4,}\z/, # "January 1, 12345"
405
- :date_rfc822 => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d{4,}\z/, # " 1 Jan 12345"
406
- :date_short => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')})\z/, # "1 Jan"
407
- :dateTime_db => /\A\d{4,}-\d\d-\d\d \d\d:\d\d:\d\d\z/, # "12345-01-01 00:00:00"
408
- :dateTime_hms => /\A\d\d:\d\d:\d\d\z/, # "00:00:00"
409
- :dateTime_iso8601 => /\A\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\dZ\z/, # "12345-01-01T00:00:00Z"
410
- :dateTime_long => /\A(?:#{Date::MONTHNAMES.join('|')}) \d\d, \d{4,} \d\d:\d\d\z/, # "January 01, 12345 00:00"
411
- :dateTime_short => /\A\d\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d\d:\d\d\z/, # "01 Jan 00:00"
412
- :dateTime_time => /\A\d\d:\d\d\z/, # "00:00"
507
+ :string => nil,
508
+ :numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
509
+ :uri => /\Ahttps?:/,
510
+ :date_db => /\A\d{4,}-\d\d-\d\d\z/, # "12345-01-01"
511
+ :date_long => /\A(?:#{Date::MONTHNAMES.join('|')}) [ \d]\d, \d{4,}\z/, # "January 1, 12345"
512
+ :date_rfc822 => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d{4,}\z/, # " 1 Jan 12345"
513
+ :date_short => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')})\z/, # "1 Jan"
514
+ :dateTime_db => /\A\d{4,}-\d\d-\d\d \d\d:\d\d:\d\d\z/, # "12345-01-01 00:00:00"
515
+ :dateTime_hms => /\A\d\d:\d\d:\d\d\z/, # "00:00:00"
516
+ :dateTime_iso8601 => /\A\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\dZ\z/, # "12345-01-01T00:00:00Z"
517
+ :dateTime_long => /\A(?:#{Date::MONTHNAMES.join('|')}) \d\d, \d{4,} \d\d:\d\d\z/, # "January 01, 12345 00:00"
518
+ :dateTime_short => /\A\d\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d\d:\d\d\z/, # "01 Jan 00:00"
519
+ :dateTime_time => /\A\d\d:\d\d\z/, # "00:00"
413
520
  }.freeze
414
521
 
415
522
  URI_REGEXP = /(?<uri>.*?)/