csvlint 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- YjlmZmFlNGZjOWQ5MmNlNDZiOTUxMWY0NGExYTRkYjhhNzdlNjAyNA==
4
+ MmUxZTY5NThhMmU1ZmVlM2M0OWJiMzQ5MGY2NGRiMzk5NGEyYzEyYQ==
5
5
  data.tar.gz: !binary |-
6
- ODFjZmJkZmI0Nzg2NmMzN2ViOGNiNDlmODA0NDcxMzM0Zjk4NTgwOQ==
6
+ NTllMTYzYjUyYTk0ZTcwZmY5NDJkZjVlMGQzNzM4YWNkYWU2NjFjMg==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- ZTIyMGVkYjIyMjc2ZWViNTBhYmZkMWIxN2E1OTU0OTFhNGMxNzBlYzg0OTI4
10
- NDRkMzY2YzgxNmQwZGZiZDE5M2M2NzYwMzk3ZWZjMDc3YWM0YzQ0NTczY2U3
11
- MGZjNTUwMGI2MzgzZDQxYzkzMzBiNzI3NmJkZTIxYjZiYjc5MDA=
9
+ NTc2NTdhMzI4ZGI5NzFiMzgwZWYwM2E1YWVhMzE2ZmY5ZDUyNzdkODU1MTkw
10
+ OTgyZGM1ZGFhODMxNGVmNDkwNjY3ZjY5NDEyM2YzYWJjZDQ3NThiODRiOWY1
11
+ OTU1NGM4NGQ0NzQ3ZmRiYmM2MDM1YWM5YWJlMDRiN2MyNWI0YmI=
12
12
  data.tar.gz: !binary |-
13
- NTI1M2I5Yzc3NGNhOTg3Y2VkMmM3ZGM1ZTdiZWNmMzM0ZTY5ODljODNmNWYy
14
- MDA0NGVlMGFhNDQ2ZjZjYjI0Nzc2OTdhMWRmODI5YTEzMGRmNTQxZjAyOTA5
15
- YjVmMjk4NDIyOWEzMzIxMTBlYjQ4YTgwZmE4MWZlYTQ4MjMzZmE=
13
+ OTQ2NDNkN2RjNDlhZDNlYTI3NmU5NmQ4YTIxOTYxMjQyMTg2MWNhODFkZWQ2
14
+ ZDYyYWUyNzJjZGNkYzFkYWU0YjI2NzkwZTI1OGNkODFmNTZhNzhjNjE5OGY4
15
+ MmQzMzFkMTIxYzNkODM5NDFkNzc4ZDYwMjc2YTE2ZmZkZDgxZWY=
@@ -0,0 +1,2 @@
1
+ # Treat CSV files as binary so Git performs no line-ending conversion or text diffing on them
2
+ *.csv binary
@@ -2,7 +2,25 @@
2
2
 
3
3
  ## [Unreleased](https://github.com/theodi/csvlint.rb/tree/HEAD)
4
4
 
5
- [Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.1.4...HEAD)
5
+ [Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.0...HEAD)
6
+
7
+ **Implemented enhancements:**
8
+
9
+ - Get total rows number about the CSV file that was validated [\#143](https://github.com/theodi/csvlint.rb/issues/143)
10
+
11
+ **Closed issues:**
12
+
13
+ - Optimization: Stream CSV [\#122](https://github.com/theodi/csvlint.rb/issues/122)
14
+
15
+ **Merged pull requests:**
16
+
17
+ - Add `row\_count` method [\#153](https://github.com/theodi/csvlint.rb/pull/153) ([pezholio](https://github.com/pezholio))
18
+
19
+ - Streaming validation [\#146](https://github.com/theodi/csvlint.rb/pull/146) ([pezholio](https://github.com/pezholio))
20
+
21
+ ## [0.2.0](https://github.com/theodi/csvlint.rb/tree/0.2.0) (2015-10-05)
22
+
23
+ [Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.1.4...0.2.0)
6
24
 
7
25
  **Closed issues:**
8
26
 
data/README.md CHANGED
@@ -77,6 +77,7 @@ best practices
77
77
  validator.encoding
78
78
  validator.content_type
79
79
  validator.extension
80
+ validator.row_count
80
81
 
81
82
  #retrieve HTTP headers from request
82
83
  validator.headers
@@ -128,7 +129,6 @@ The following types of error can be reported:
128
129
  * `:unclosed_quote` -- unclosed quoted field
129
130
  * `:whitespace` -- a quoted column has leading or trailing whitespace
130
131
  * `:line_breaks` -- line breaks were inconsistent or incorrectly specified
131
- * `:undeclared_header` -- if there is no machine-readable description of whether a header is present (e.g. in a dialect or `Content-Type` header)
132
132
 
133
133
  ## Warnings
134
134
 
@@ -271,6 +271,20 @@ options = {
271
271
  validator = Csvlint::Validator.new( "http://example.org/data.csv", nil, nil, options )
272
272
  ```
273
273
 
274
+ * :lambda -- Pass a block of code (a lambda) to be called each time a line is validated; it is given the `Validator` object. For example, this will print the current line number for every line validated:
275
+
276
+ ```
277
+ options = {
278
+ lambda: ->(validator) { puts validator.current_line }
279
+ }
280
+ validator = Csvlint::Validator.new( "http://example.org/data.csv", nil, nil, options )
281
+ => 1
282
+ 2
283
+ 3
284
+ 4
285
+ .....
286
+ ```
287
+
274
288
  ## Contributing
275
289
 
276
290
  1. Fork it
@@ -58,12 +58,22 @@ def print_error(index, error, dump, color)
58
58
  end
59
59
 
60
60
  def validate_csv(source, schema, dump)
61
- validator = Csvlint::Validator.new( source, nil, schema )
61
+ @error_count = 0
62
+ report_lines = lambda do |row|
63
+ new_errors = row.errors.count
64
+ if new_errors > @error_count
65
+ print "!".red
66
+ else
67
+ print ".".green
68
+ end
69
+ @error_count = new_errors
70
+ end
71
+ validator = Csvlint::Validator.new( source, {}, schema, { lambda: report_lines } )
62
72
 
63
73
  if $stdout.tty?
64
- puts "#{source.path || source || "CSV"} is #{validator.valid? ? "VALID".green : "INVALID".red}"
74
+ puts "\r\n#{source.path || source || "CSV"} is #{validator.valid? ? "VALID".green : "INVALID".red}"
65
75
  else
66
- puts "#{source.path || source || "CSV"} is #{validator.valid? ? "VALID" : "INVALID"}"
76
+ puts "\r\n#{source.path || source || "CSV"} is #{validator.valid? ? "VALID" : "INVALID"}"
67
77
  end
68
78
 
69
79
  if validator.errors.size > 0
@@ -23,6 +23,7 @@ Gem::Specification.new do |spec|
23
23
  spec.add_dependency "open_uri_redirections"
24
24
  spec.add_dependency "activesupport"
25
25
  spec.add_dependency "addressable"
26
+ spec.add_dependency "typhoeus"
26
27
  spec.add_dependency "escape_utils"
27
28
  spec.add_dependency "uri_template"
28
29
 
@@ -14,7 +14,7 @@ Feature: Collect all the tests that should trigger dialect check related errors
14
14
  And it is stored at the url "http://example.com/example1.csv"
15
15
  And I set header to "true"
16
16
  And I ask if there are info messages
17
- Then there should be 2 info messages
17
+ Then there should be 1 info message
18
18
  And one of the messages should have the type "nonrfc_line_breaks"
19
19
 
20
20
  Scenario: CR line endings in file give an info message of type :nonrfc_line_breaks
@@ -22,7 +22,7 @@ Feature: Collect all the tests that should trigger dialect check related errors
22
22
  And it is stored at the url "http://example.com/example1.csv"
23
23
  And I set header to "true"
24
24
  And I ask if there are info messages
25
- Then there should be 2 info messages
25
+ Then there should be 1 info message
26
26
  And one of the messages should have the type "nonrfc_line_breaks"
27
27
 
28
28
  Scenario: CRLF line endings in file produces no info messages of type :nonrfc_line_breaks
@@ -30,13 +30,13 @@ Feature: Collect all the tests that should trigger dialect check related errors
30
30
  And it is stored at the url "http://example.com/example1.csv"
31
31
  And I set header to "true"
32
32
  And I ask if there are info messages
33
- Then there should be 1 info message
33
+ Then there should be 0 info messages
34
34
 
35
35
  # :line_breaks
36
36
 
37
37
  Scenario: Incorrect line endings specified in settings
38
- Given I have a CSV file called "cr-line-endings.csv"
39
- And I set the line endings to linefeed
38
+ Given I have a CSV file called "lf-line-endings.csv"
39
+ And I set the line endings to carriage return
40
40
  And it is stored at the url "http://example.com/example1.csv"
41
41
  And I ask if there are errors
42
42
  Then there should be 1 error
@@ -10,13 +10,13 @@ Feature: Return information
10
10
  And it is stored at the url "http://example.com/example1.csv?query=true"
11
11
 
12
12
  Scenario: Return encoding
13
- Then the "encoding" should be "utf-8"
14
-
13
+ Then the "encoding" should be "UTF-8"
14
+
15
15
  Scenario: Return content type
16
- Then the "content_type" should be "text/csv"
16
+ Then the "content_type" should be "text/csv; charset=utf-8"
17
17
 
18
18
  Scenario: Return extension
19
19
  Then the "extension" should be ".csv"
20
-
20
+
21
21
  Scenario: Return meta
22
22
  Then the metadata content type should be "text/csv; charset=utf-8"
@@ -13,6 +13,11 @@ Given(/^I set the line endings to linefeed$/) do
13
13
  @csv_options["lineTerminator"] = "\n"
14
14
  end
15
15
 
16
+ Given(/^I set the line endings to carriage return$/) do
17
+ @csv_options ||= default_csv_options
18
+ @csv_options["lineTerminator"] = "\r"
19
+ end
20
+
16
21
  Given(/^I set header to "(.*?)"$/) do |boolean|
17
22
  @csv_options ||= default_csv_options
18
23
  @csv_options["header"] = boolean == "true"
@@ -27,7 +27,7 @@ Feature: Get validation errors
27
27
  And that error should have the row "2"
28
28
  And that error should have the content ""Foo","Bar","Baz"
29
29
 
30
- Scenario: Successfully report a CSV with incorrect whitespace
30
+ Scenario: Successfully report a CSV with incorrect whitespace
31
31
  Given I have a CSV with the following content:
32
32
  """
33
33
  "col1","col2","col3"
@@ -3,22 +3,22 @@ Feature: Get validation information messages
3
3
  Scenario: LF line endings in file give an info message
4
4
  Given I have a CSV file called "lf-line-endings.csv"
5
5
  And it is stored at the url "http://example.com/example1.csv"
6
- And I set header to "true"
6
+ And I set header to "true"
7
7
  And I ask if there are info messages
8
- Then there should be 2 info messages
8
+ Then there should be 1 info message
9
9
  And one of the messages should have the type "nonrfc_line_breaks"
10
10
 
11
11
  Scenario: CR line endings in file give an info message
12
12
  Given I have a CSV file called "cr-line-endings.csv"
13
13
  And it is stored at the url "http://example.com/example1.csv"
14
- And I set header to "true"
14
+ And I set header to "true"
15
15
  And I ask if there are info messages
16
- Then there should be 2 info messages
16
+ Then there should be 1 info message
17
17
  And one of the messages should have the type "nonrfc_line_breaks"
18
18
 
19
19
  Scenario: CRLF line endings in file produces no info messages
20
20
  Given I have a CSV file called "crlf-line-endings.csv"
21
21
  And it is stored at the url "http://example.com/example1.csv"
22
- And I set header to "true"
22
+ And I set header to "true"
23
23
  And I ask if there are info messages
24
- Then there should be 1 info message
24
+ Then there should be 0 info messages
@@ -3,6 +3,7 @@ require 'date'
3
3
  require 'open-uri'
4
4
  require 'set'
5
5
  require 'tempfile'
6
+ require 'typhoeus'
6
7
 
7
8
  require 'active_support/core_ext/date/conversions'
8
9
  require 'active_support/core_ext/time/conversions'
@@ -13,7 +14,6 @@ require 'uri_template'
13
14
  require 'csvlint/error_message'
14
15
  require 'csvlint/error_collector'
15
16
  require 'csvlint/validate'
16
- require 'csvlint/wrapped_io'
17
17
  require 'csvlint/field'
18
18
 
19
19
  require 'csvlint/csvw/metadata_error'
@@ -134,7 +134,7 @@ module Csvlint
134
134
  fractional_regexp += "(#{Regexp.escape(@grouping_separator)}[0-9]{1,#{max_fraction_digits % @fractional_grouping_size}})?" if max_fraction_digits % @fractional_grouping_size > 0
135
135
  else
136
136
  fractional_regexp += "([0-9]{#{@fractional_grouping_size}}#{Regexp.escape(@grouping_separator)}){0,#{(max_fraction_digits / @fractional_grouping_size) - 1}}" if max_fraction_digits > @fractional_grouping_size
137
- fractional_regexp += "[0-9]{#{@fractional_grouping_size}}"
137
+ fractional_regexp += "[0-9]{1,#{@fractional_grouping_size}}"
138
138
  end
139
139
  fractional_regexp = "#{Regexp.escape(@decimal_separator)}#{fractional_regexp}"
140
140
  fractional_regexp = "(#{fractional_regexp})?" if min_fraction_digits == 0
@@ -17,7 +17,7 @@ module Csvlint
17
17
  def validate_column(value, row=nil, column=nil, all_errors=[])
18
18
  reset
19
19
  unless all_errors.any?{|error| ((error.type == :invalid_regex) && (error.column == column))}
20
- validate_regex(value, row, column)
20
+ validate_regex(value, row, column, all_errors)
21
21
  end
22
22
  validate_length(value, row, column)
23
23
  validate_values(value, row, column)
@@ -42,7 +42,7 @@ module Csvlint
42
42
  end
43
43
  end
44
44
 
45
- def validate_regex(value, row, column)
45
+ def validate_regex(value, row, column, all_errors)
46
46
  pattern = constraints["pattern"]
47
47
  if pattern
48
48
  begin
@@ -50,12 +50,18 @@ module Csvlint
50
50
  build_errors(:pattern, :schema, row, column, value,
51
51
  { "pattern" => constraints["pattern"] } ) if !value.nil? && !value.match( constraints["pattern"] )
52
52
  rescue RegexpError
53
- build_errors(:invalid_regex, :schema, nil, column, ("#{name}: Constraints: Pattern: #{pattern}"),
54
- { "pattern" => constraints["pattern"] })
53
+ build_regex_error(value, row, column, pattern, all_errors)
55
54
  end
56
55
  end
57
56
  end
58
57
 
58
+ def build_regex_error(value, row, column, pattern, all_errors)
59
+ return if @regex_error_exists
60
+ build_errors(:invalid_regex, :schema, nil, column, ("#{name}: Constraints: Pattern: #{pattern}"),
61
+ { "pattern" => constraints["pattern"] })
62
+ @regex_error_exists = true
63
+ end
64
+
59
65
  def validate_values(value, row, column)
60
66
  # If a pattern exists, raise an invalid regex error if it is not in
61
67
  # valid regex form, else, if the value of the relevant field in the csv
@@ -4,181 +4,308 @@ module Csvlint
4
4
 
5
5
  include Csvlint::ErrorCollector
6
6
 
7
- attr_reader :encoding, :content_type, :extension, :headers, :link_headers, :line_breaks, :dialect, :csv_header, :schema, :data
7
+ attr_reader :encoding, :content_type, :extension, :headers, :link_headers, :dialect, :csv_header, :schema, :data, :current_line
8
8
 
9
9
  ERROR_MATCHERS = {
10
- "Missing or stray quote" => :stray_quote,
11
- "Illegal quoting" => :whitespace,
12
- "Unclosed quoted field" => :unclosed_quote,
13
- "Unquoted fields do not allow \\r or \\n" => :line_breaks,
10
+ "Missing or stray quote" => :stray_quote,
11
+ "Illegal quoting" => :whitespace,
12
+ "Unclosed quoted field" => :unclosed_quote,
13
+ "Unquoted fields do not allow \\r or \\n" => :line_breaks,
14
14
  }
15
15
 
16
- def initialize(source, dialect = nil, schema = nil, options = {})
16
+ def initialize(source, dialect = {}, schema = nil, options = {})
17
17
  reset
18
18
  @source = source
19
19
  @formats = []
20
20
  @schema = schema
21
-
22
- @supplied_dialect = dialect != nil
21
+ @dialect = dialect
22
+ @csv_header = true
23
+ @headers = {}
24
+ @lambda = options[:lambda] || lambda { |a| nil }
25
+ @leading = ""
23
26
 
24
27
  @limit_lines = options[:limit_lines]
25
28
  @extension = parse_extension(source) unless @source.nil?
29
+
30
+ @expected_columns = 0
31
+ @col_counts = []
32
+ @line_breaks = []
33
+
26
34
  @errors += @schema.errors unless @schema.nil?
27
35
  @warnings += @schema.warnings unless @schema.nil?
28
- validate(dialect)
29
36
 
37
+ @data = [] # it may be advisable to flush this on init?
38
+
39
+ validate
30
40
  end
31
41
 
32
- def validate(dialect = nil)
33
- single_col = false
34
- io = nil
42
+ def validate
43
+ if @extension =~ /.xls(x)?/
44
+ build_warnings(:excel, :context)
45
+ return
46
+ end
47
+ locate_schema unless @schema.instance_of?(Csvlint::Schema)
48
+ set_dialect
49
+
50
+ if @source.class == String
51
+ validate_url
52
+ else
53
+ validate_metadata
54
+ validate_stream
55
+ end
56
+ finish
57
+ end
58
+
59
+ def validate_stream
60
+ @current_line = 1
61
+ @source.each_line do |line|
62
+ break if line_limit_reached?
63
+ parse_line(line)
64
+ end
65
+ validate_line(@leading, @current_line) unless @leading == ""
66
+ end
67
+
68
+ def validate_url
69
+ @current_line = 1
35
70
  begin
36
- if @extension =~ /.xls(x)?/
37
- build_warnings(:excel, :context)
38
- return
71
+ request = Typhoeus::Request.new(@source, followlocation: true)
72
+ request.on_headers do |response|
73
+ @headers = response.headers || {}
74
+ @content_type = response.headers["content-type"] rescue nil
75
+ @response_code = response.code
76
+ return build_errors(:not_found) if response.code == 404
77
+ validate_metadata
39
78
  end
40
- io = @source.respond_to?(:gets) ? @source : open(@source, :allow_redirections=>:all)
41
- validate_metadata(io)
42
- locate_schema unless @schema.instance_of?(Csvlint::Schema)
43
- set_dialect(dialect)
44
- parse_csv(io)
45
- sum = @col_counts.inject(:+)
46
- unless sum.nil?
47
- build_warnings(:title_row, :structure) if @col_counts.first < (sum / @col_counts.size.to_f)
79
+ request.on_body do |chunk|
80
+ io = StringIO.new(@leading + chunk)
81
+ io.each_line do |line|
82
+ break if line_limit_reached?
83
+ parse_line(line)
84
+ end
48
85
  end
49
- build_warnings(:check_options, :structure) if @expected_columns == 1
50
- check_consistency
51
- check_foreign_keys
52
- rescue OpenURI::HTTPError, Errno::ENOENT
53
- build_errors(:not_found, nil, nil, nil, @source)
54
- ensure
55
- io.close if io && io.respond_to?(:close)
86
+ request.run
87
+ # Validate the last line too
88
+ validate_line(@leading, @current_line) unless @leading == ""
89
+ rescue ArgumentError => ae
90
+ build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
91
+ @reported_invalid_encoding = true
56
92
  end
57
93
  end
58
94
 
59
- def validate_metadata(io)
60
- @csv_header = true
61
- @encoding = io.charset rescue nil
62
- @content_type = io.content_type rescue nil
63
- @headers = io.meta rescue nil
64
- @link_headers = @headers["link"].split(",") rescue nil
65
- assumed_header = undeclared_header = !@supplied_dialect
66
- if @headers
95
+ def parse_line(line)
96
+ line = @leading + line
97
+ # Check if the last character is a line break - in which case it's a full line
98
+ if line[-1, 1].include?("\n")
99
+ # If the number of quotes is odd, the linebreak is inside some quotes
100
+ if line.count(@dialect["quoteChar"]).odd?
101
+ @leading = line
102
+ else
103
+ validate_line(line, @current_line)
104
+ @leading = ""
105
+ @current_line = @current_line+1
106
+ end
107
+ else
108
+ # If it's not a full line, then prepare to add it to the beginning of the next chunk
109
+ @leading = line
110
+ end
111
+ end
112
+
113
+ def validate_line(input = nil, index = nil)
114
+ @input = input
115
+ single_col = false
116
+ line = index.present? ? index : 0
117
+ @encoding = input.encoding.to_s
118
+ report_line_breaks(line)
119
+ parse_contents(input, line)
120
+ @lambda.call(self)
121
+ rescue ArgumentError => ae
122
+ build_errors(:invalid_encoding, :structure, @current_line, nil, index) unless @reported_invalid_encoding
123
+ @reported_invalid_encoding = true
124
+ end
125
+
126
+ # analyses the provided csv and builds errors, warnings and info messages
127
+ def parse_contents(stream, line = nil)
128
+ # parse_contents will parse one line and apply headers, formats methods and error handle as appropriate
129
+ current_line = line.present? ? line : 1
130
+ all_errors = []
131
+
132
+ @csv_options[:encoding] = @encoding
133
+
134
+ begin
135
+ row = CSV.parse_line(stream, @csv_options)
136
+ # this is a one line substitute for CSV.new followed by row = CSV.shift. a CSV Row class is required
137
+ # CSV.parse will return an array of arrays which breaks subsequent each_with_index invocations
138
+ # TODO investigate if above would be a drag on memory
139
+
140
+ rescue CSV::MalformedCSVError => e
141
+ build_exception_messages(e, stream, current_line)
142
+ end
143
+
144
+ @data << row
145
+ if row
146
+ if current_line <= 1 && @csv_header
147
+ # this conditional should be refactored somewhere
148
+ row = row.reject { |col| col.nil? || col.empty? }
149
+ validate_header(row)
150
+ @col_counts << row.size
151
+ else
152
+ build_formats(row)
153
+ @col_counts << row.reject { |col| col.nil? || col.empty? }.size
154
+ @expected_columns = row.size unless @expected_columns != 0
155
+ build_errors(:blank_rows, :structure, current_line, nil, stream.to_s) if row.reject { |c| c.nil? || c.empty? }.size == 0
156
+ # Builds errors and warnings related to the provided schema file
157
+ if @schema
158
+ @schema.validate_row(row, current_line, all_errors, @source)
159
+ @errors += @schema.errors
160
+ all_errors += @schema.errors
161
+ @warnings += @schema.warnings
162
+ else
163
+ build_errors(:ragged_rows, :structure, current_line, nil, stream.to_s) if !row.empty? && row.size != @expected_columns
164
+ end
165
+ end
166
+ end
167
+ end
168
+
169
+ def finish
170
+ sum = @col_counts.inject(:+)
171
+ unless sum.nil?
172
+ build_warnings(:title_row, :structure) if @col_counts.first < (sum / @col_counts.size.to_f)
173
+ end
174
+ # return expected_columns to calling class
175
+ build_warnings(:check_options, :structure) if @expected_columns == 1
176
+ check_consistency
177
+ check_foreign_keys
178
+ check_mixed_linebreaks
179
+ validate_encoding
180
+ end
181
+
182
+ def validate_metadata
183
+ assumed_header = !@supplied_dialect
184
+ unless @headers.empty?
67
185
  if @headers["content-type"] =~ /text\/csv/
68
- @csv_header = true
69
- undeclared_header = false
70
- assumed_header = true
186
+ @csv_header = @csv_header && true
187
+ assumed_header = @assumed_header.present?
71
188
  end
72
189
  if @headers["content-type"] =~ /header=(present|absent)/
73
190
  @csv_header = true if $1 == "present"
74
191
  @csv_header = false if $1 == "absent"
75
- undeclared_header = false
76
192
  assumed_header = false
77
193
  end
78
- if @headers["content-type"] !~ /charset=/
79
- build_warnings(:no_encoding, :context)
80
- else
81
- build_warnings(:encoding, :context) if @encoding != "utf-8"
82
- end
83
194
  build_warnings(:no_content_type, :context) if @content_type == nil
84
195
  build_errors(:wrong_content_type, :context) unless (@content_type && @content_type =~ /text\/csv/)
196
+ end
197
+ @header_processed = true
198
+ build_info_messages(:assumed_header, :structure) if assumed_header
85
199
 
86
- if undeclared_header
87
- build_errors(:undeclared_header, :structure)
88
- assumed_header = false
200
+ @link_headers = @headers["link"].split(",") rescue nil
201
+ @link_headers.each do |link_header|
202
+ match = LINK_HEADER_REGEXP.match(link_header)
203
+ uri = match["uri"].gsub(/(^\<|\>$)/, "") rescue nil
204
+ rel = match["rel-relationship"].gsub(/(^\"|\"$)/, "") rescue nil
205
+ param = match["param"]
206
+ param_value = match["param-value"].gsub(/(^\"|\"$)/, "") rescue nil
207
+ if rel == "describedby" && param == "type" && ["application/csvm+json", "application/ld+json", "application/json"].include?(param_value)
208
+ begin
209
+ url = URI.join(@source_url, uri)
210
+ schema = Schema.load_from_json(url)
211
+ if schema.instance_of? Csvlint::Csvw::TableGroup
212
+ if schema.tables[@source_url]
213
+ link_schema = schema
214
+ else
215
+ warn_if_unsuccessful = true
216
+ build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
217
+ end
218
+ end
219
+ rescue OpenURI::HTTPError
220
+ end
89
221
  end
222
+ end if @link_headers
223
+ end
224
+
225
+ def header?
226
+ @csv_header && @dialect["header"]
227
+ end
90
228
 
229
+ def report_line_breaks(line_no=nil)
230
+ return if @input !~ /[\r|\n]/ # Return straight away if there's no newline character - i.e. we're on the last line
231
+ line_break = CSV.new(@input).row_sep
232
+ @line_breaks << line_break
233
+ unless line_breaks_reported?
234
+ if line_break != "\r\n"
235
+ build_info_messages(:nonrfc_line_breaks, :structure, line_no)
236
+ @line_breaks_reported = true
237
+ end
91
238
  end
92
- build_info_messages(:assumed_header, :structure) if assumed_header
93
239
  end
94
240
 
95
- def set_dialect(dialect)
241
+ def line_breaks_reported?
242
+ @line_breaks_reported === true
243
+ end
244
+
245
+ def set_dialect
246
+ @assumed_header = @dialect["header"].nil?
247
+ @supplied_dialect = @dialect != {}
248
+
96
249
  begin
97
250
  schema_dialect = @schema.tables[@source_url].dialect || {}
98
251
  rescue
99
252
  schema_dialect = {}
100
253
  end
101
254
  @dialect = {
102
- "header" => true,
103
- "delimiter" => ",",
104
- "skipInitialSpace" => true,
105
- "lineTerminator" => :auto,
106
- "quoteChar" => '"',
107
- "trim" => :true
108
- }.merge(schema_dialect).merge(dialect || {})
255
+ "header" => true,
256
+ "delimiter" => ",",
257
+ "skipInitialSpace" => true,
258
+ "lineTerminator" => :auto,
259
+ "quoteChar" => '"',
260
+ "trim" => :true
261
+ }.merge(schema_dialect).merge(@dialect || {})
109
262
 
110
263
  @csv_header = @csv_header && @dialect["header"]
111
264
  @csv_options = dialect_to_csv_options(@dialect)
112
265
  end
113
266
 
114
- # analyses the provided csv and builds errors, warnings and info messages
115
- def parse_csv(io)
116
- @expected_columns = 0
117
- current_line = 0
118
- reported_invalid_encoding = false
119
- all_errors = []
120
- @col_counts = []
267
+ def validate_encoding
268
+ if @headers["content-type"]
269
+ if @headers["content-type"] !~ /charset=/
270
+ build_warnings(:no_encoding, :context)
271
+ elsif @headers["content-type"] !~ /charset=utf-8/i
272
+ build_warnings(:encoding, :context)
273
+ end
274
+ end
275
+ build_warnings(:encoding, :context) if @encoding != "UTF-8"
276
+ end
121
277
 
122
- @csv_options[:encoding] = @encoding
278
+ def check_mixed_linebreaks
279
+ build_linebreak_error if @line_breaks.uniq.count > 1
280
+ end
123
281
 
124
- begin
125
- wrapper = WrappedIO.new( io )
126
- csv = CSV.new( wrapper, @csv_options )
127
- @data = []
128
- @line_breaks = csv.row_sep
129
- if @line_breaks != "\r\n"
130
- build_info_messages(:nonrfc_line_breaks, :structure)
131
- end
132
- row = nil
133
- loop do
134
- current_line += 1
135
- if @limit_lines && current_line > @limit_lines
136
- break
137
- end
138
- begin
139
- wrapper.reset_line
140
- row = csv.shift
141
- @data << row
142
- if row
143
- if current_line == 1 && header?
144
- row = row.reject{|col| col.nil? || col.empty?}
145
- validate_header(row)
146
- @col_counts << row.size
147
- else
148
- build_formats(row)
149
- @col_counts << row.reject{|col| col.nil? || col.empty?}.size
150
- @expected_columns = row.size unless @expected_columns != 0
151
-
152
- build_errors(:blank_rows, :structure, current_line, nil, wrapper.line) if row.reject{ |c| c.nil? || c.empty? }.size == 0
153
- # Builds errors and warnings related to the provided schema file
154
- if @schema
155
- @schema.validate_row(row, current_line, all_errors, @source)
156
- @errors += @schema.errors
157
- all_errors += @schema.errors
158
- @warnings += @schema.warnings
159
- else
160
- build_errors(:ragged_rows, :structure, current_line, nil, wrapper.line) if !row.empty? && row.size != @expected_columns
161
- end
162
-
163
- end
164
- else
165
- break
166
- end
167
- rescue CSV::MalformedCSVError => e
168
- type = fetch_error(e)
169
- if type == :stray_quote && !wrapper.line.match(csv.row_sep)
170
- build_errors(:line_breaks, :structure)
171
- else
172
- build_errors(type, :structure, current_line, nil, wrapper.line)
173
- end
174
- end
282
+ def line_breaks
283
+ if @line_breaks.uniq.count > 1
284
+ :mixed
285
+ else
286
+ @line_breaks.uniq.first
175
287
  end
176
- rescue ArgumentError => ae
177
- build_errors(:invalid_encoding, :structure, current_line, nil, wrapper.line) unless reported_invalid_encoding
178
- reported_invalid_encoding = true
288
+ end
289
+
290
+ def row_count
291
+ data.count
292
+ end
293
+
294
+ def build_exception_messages(csvException, errChars, lineNo)
295
+ #TODO 1 - this is a change in logic, rather than straight refactor of previous error building, however original logic is bonkers
296
+ #TODO 2 - using .kind_of? is a very ugly fix here; it is meant to work around instances where the :auto symbol is preserved in @csv_options
297
+ type = fetch_error(csvException)
298
+ if !@csv_options[:row_sep].kind_of?(Symbol) && [:unclosed_quote,:stray_quote].include?(type) && !@input.match(@csv_options[:row_sep])
299
+ build_linebreak_error
300
+ else
301
+ build_errors(type, :structure, lineNo, nil, errChars)
179
302
  end
180
303
  end
181
304
 
305
+ def build_linebreak_error
306
+ build_errors(:line_breaks, :structure) unless @errors.any? { |e| e.type == :line_breaks }
307
+ end
308
+
182
309
  def validate_header(header)
183
310
  names = Set.new
184
311
  header.map{|h| h.strip! } if @dialect["trim"] == :true
@@ -198,10 +325,6 @@ module Csvlint
198
325
  return valid?
199
326
  end
200
327
 
201
- def header?
202
- @csv_header
203
- end
204
-
205
328
  def fetch_error(error)
206
329
  e = error.message.match(/^(.+?)(?: [io]n)? \(?line \d+\)?\.?$/i)
207
330
  message = e[1] rescue nil
@@ -209,15 +332,15 @@ module Csvlint
209
332
  end
210
333
 
211
334
  def dialect_to_csv_options(dialect)
212
- skipinitialspace = dialect["skipInitialSpace"] || true
213
- delimiter = dialect["delimiter"]
214
- delimiter = delimiter + " " if !skipinitialspace
215
- return {
216
- :col_sep => delimiter,
217
- :row_sep => dialect["lineTerminator"],
218
- :quote_char => dialect["quoteChar"],
219
- :skip_blanks => false
220
- }
335
+ skipinitialspace = dialect["skipInitialSpace"] || true
336
+ delimiter = dialect["delimiter"]
337
+ delimiter = delimiter + " " if !skipinitialspace
338
+ return {
339
+ :col_sep => delimiter,
340
+ :row_sep => dialect["lineTerminator"],
341
+ :quote_char => dialect["quoteChar"],
342
+ :skip_blanks => false
343
+ }
221
344
  end
222
345
 
223
346
  def build_formats(row)
@@ -225,33 +348,34 @@ module Csvlint
225
348
  next if col.nil? || col.empty?
226
349
  @formats[i] ||= Hash.new(0)
227
350
 
228
- format = if col.strip[FORMATS[:numeric]]
229
- :numeric
230
- elsif uri?(col)
231
- :uri
232
- elsif col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d')
233
- :date_db
234
- elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
235
- :date_short
236
- elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
237
- :date_rfc822
238
- elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
239
- :date_long
240
- elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
241
- :dateTime_time
242
- elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
243
- :dateTime_hms
244
- elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
245
- :dateTime_db
246
- elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
247
- :dateTime_iso8601
248
- elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
249
- :dateTime_short
250
- elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
251
- :dateTime_long
252
- else
253
- :string
254
- end
351
+ format =
352
+ if col.strip[FORMATS[:numeric]]
353
+ :numeric
354
+ elsif uri?(col)
355
+ :uri
356
+ elsif col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d')
357
+ :date_db
358
+ elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
359
+ :date_short
360
+ elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
361
+ :date_rfc822
362
+ elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
363
+ :date_long
364
+ elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
365
+ :dateTime_time
366
+ elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
367
+ :dateTime_hms
368
+ elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
369
+ :dateTime_db
370
+ elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
371
+ :dateTime_iso8601
372
+ elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
373
+ :dateTime_short
374
+ elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
375
+ :dateTime_long
376
+ else
377
+ :string
378
+ end
255
379
 
256
380
  @formats[i][format] += 1
257
381
  end
@@ -277,15 +401,16 @@ module Csvlint
277
401
  end
278
402
 
279
403
  def locate_schema
404
+
280
405
  @source_url = nil
281
406
  warn_if_unsuccessful = false
282
407
  case @source
283
- when StringIO
284
- return
285
- when File
286
- @source_url = "file:#{File.expand_path(@source)}"
287
- else
288
- @source_url = @source
408
+ when StringIO
409
+ return
410
+ when File
411
+ @source_url = "file:#{File.expand_path(@source)}"
412
+ else
413
+ @source_url = @source
289
414
  end
290
415
  unless @schema.nil?
291
416
  if @schema.tables[@source_url]
@@ -295,28 +420,6 @@ module Csvlint
295
420
  end
296
421
  end
297
422
  link_schema = nil
298
- @link_headers.each do |link_header|
299
- match = LINK_HEADER_REGEXP.match(link_header)
300
- uri = match["uri"].gsub(/(^\<|\>$)/, "") rescue nil
301
- rel = match["rel-relationship"].gsub(/(^\"|\"$)/, "") rescue nil
302
- param = match["param"]
303
- param_value = match["param-value"].gsub(/(^\"|\"$)/, "") rescue nil
304
- if rel == "describedby" && param == "type" && ["application/csvm+json", "application/ld+json", "application/json"].include?(param_value)
305
- begin
306
- url = URI.join(@source_url, uri)
307
- schema = Schema.load_from_json(url)
308
- if schema.instance_of? Csvlint::Csvw::TableGroup
309
- if schema.tables[@source_url]
310
- link_schema = schema
311
- else
312
- warn_if_unsuccessful = true
313
- build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
314
- end
315
- end
316
- rescue OpenURI::HTTPError
317
- end
318
- end
319
- end if @link_headers
320
423
  @schema = link_schema if link_schema
321
424
 
322
425
  paths = []
@@ -324,8 +427,8 @@ module Csvlint
324
427
  begin
325
428
  well_known_uri = URI.join(@source_url, "/.well-known/csvm")
326
429
  well_known = open(well_known_uri).read
327
- # TODO
328
- rescue OpenURI::HTTPError
430
+ # TODO
431
+ rescue OpenURI::HTTPError, URI::BadURIError
329
432
  end
330
433
  end
331
434
  paths = ["{+url}-metadata.json", "csv-metadata.json"] if paths.empty?
@@ -345,8 +448,7 @@ module Csvlint
345
448
  end
346
449
  end
347
450
  rescue Errno::ENOENT
348
- rescue OpenURI::HTTPError
349
- rescue ArgumentError
451
+ rescue OpenURI::HTTPError, URI::BadURIError, ArgumentError
350
452
  rescue => e
351
453
  STDERR.puts e.class
352
454
  STDERR.puts e.message
@@ -361,23 +463,24 @@ module Csvlint
361
463
  private
362
464
 
363
465
  def parse_extension(source)
466
+
364
467
  case source
365
- when File
366
- return File.extname( source.path )
367
- when IO
368
- return ""
369
- when StringIO
370
- return ""
468
+ when File
469
+ return File.extname( source.path )
470
+ when IO
471
+ return ""
472
+ when StringIO
473
+ return ""
371
474
  when Tempfile
372
475
  # this is triggered when the revalidate dialect use case happens
373
- return ""
374
- else
375
- begin
376
- parsed = URI.parse(source)
377
- File.extname(parsed.path)
378
- rescue URI::InvalidURIError
379
476
  return ""
380
- end
477
+ else
478
+ begin
479
+ parsed = URI.parse(source)
480
+ File.extname(parsed.path)
481
+ rescue URI::InvalidURIError
482
+ return ""
483
+ end
381
484
  end
382
485
  end
383
486
 
@@ -396,20 +499,24 @@ module Csvlint
396
499
  false
397
500
  end
398
501
 
502
+ def line_limit_reached?
503
+ @limit_lines.present? && @current_line > @limit_lines
504
+ end
505
+
399
506
  FORMATS = {
400
- :string => nil,
401
- :numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
402
- :uri => /\Ahttps?:/,
403
- :date_db => /\A\d{4,}-\d\d-\d\d\z/, # "12345-01-01"
404
- :date_long => /\A(?:#{Date::MONTHNAMES.join('|')}) [ \d]\d, \d{4,}\z/, # "January 1, 12345"
405
- :date_rfc822 => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d{4,}\z/, # " 1 Jan 12345"
406
- :date_short => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')})\z/, # "1 Jan"
407
- :dateTime_db => /\A\d{4,}-\d\d-\d\d \d\d:\d\d:\d\d\z/, # "12345-01-01 00:00:00"
408
- :dateTime_hms => /\A\d\d:\d\d:\d\d\z/, # "00:00:00"
409
- :dateTime_iso8601 => /\A\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\dZ\z/, # "12345-01-01T00:00:00Z"
410
- :dateTime_long => /\A(?:#{Date::MONTHNAMES.join('|')}) \d\d, \d{4,} \d\d:\d\d\z/, # "January 01, 12345 00:00"
411
- :dateTime_short => /\A\d\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d\d:\d\d\z/, # "01 Jan 00:00"
412
- :dateTime_time => /\A\d\d:\d\d\z/, # "00:00"
507
+ :string => nil,
508
+ :numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
509
+ :uri => /\Ahttps?:/,
510
+ :date_db => /\A\d{4,}-\d\d-\d\d\z/, # "12345-01-01"
511
+ :date_long => /\A(?:#{Date::MONTHNAMES.join('|')}) [ \d]\d, \d{4,}\z/, # "January 1, 12345"
512
+ :date_rfc822 => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d{4,}\z/, # " 1 Jan 12345"
513
+ :date_short => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')})\z/, # "1 Jan"
514
+ :dateTime_db => /\A\d{4,}-\d\d-\d\d \d\d:\d\d:\d\d\z/, # "12345-01-01 00:00:00"
515
+ :dateTime_hms => /\A\d\d:\d\d:\d\d\z/, # "00:00:00"
516
+ :dateTime_iso8601 => /\A\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\dZ\z/, # "12345-01-01T00:00:00Z"
517
+ :dateTime_long => /\A(?:#{Date::MONTHNAMES.join('|')}) \d\d, \d{4,} \d\d:\d\d\z/, # "January 01, 12345 00:00"
518
+ :dateTime_short => /\A\d\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d\d:\d\d\z/, # "01 Jan 00:00"
519
+ :dateTime_time => /\A\d\d:\d\d\z/, # "00:00"
413
520
  }.freeze
414
521
 
415
522
  URI_REGEXP = /(?<uri>.*?)/