csvlint 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.gitattributes +2 -0
- data/CHANGELOG.md +19 -1
- data/README.md +15 -1
- data/bin/csvlint +13 -3
- data/csvlint.gemspec +1 -0
- data/features/csvupload.feature +5 -5
- data/features/fixtures/inconsistent-line-endings-unquoted.csv +0 -0
- data/features/fixtures/inconsistent-line-endings.csv +0 -0
- data/features/fixtures/invalid_many_rows.csv +0 -0
- data/features/fixtures/valid_many_rows.csv +0 -0
- data/features/information.feature +4 -4
- data/features/step_definitions/csv_options_steps.rb +5 -0
- data/features/validation_errors.feature +1 -1
- data/features/validation_info.feature +6 -6
- data/lib/csvlint.rb +1 -1
- data/lib/csvlint/csvw/number_format.rb +1 -1
- data/lib/csvlint/field.rb +10 -4
- data/lib/csvlint/validate.rb +326 -219
- data/lib/csvlint/version.rb +1 -1
- data/spec/csvw/number_format_spec.rb +14 -0
- data/spec/validator_spec.rb +450 -146
- metadata +21 -3
- data/lib/csvlint/wrapped_io.rb +0 -21
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MmUxZTY5NThhMmU1ZmVlM2M0OWJiMzQ5MGY2NGRiMzk5NGEyYzEyYQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NTllMTYzYjUyYTk0ZTcwZmY5NDJkZjVlMGQzNzM4YWNkYWU2NjFjMg==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
NTc2NTdhMzI4ZGI5NzFiMzgwZWYwM2E1YWVhMzE2ZmY5ZDUyNzdkODU1MTkw
|
10
|
+
OTgyZGM1ZGFhODMxNGVmNDkwNjY3ZjY5NDEyM2YzYWJjZDQ3NThiODRiOWY1
|
11
|
+
OTU1NGM4NGQ0NzQ3ZmRiYmM2MDM1YWM5YWJlMDRiN2MyNWI0YmI=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
OTQ2NDNkN2RjNDlhZDNlYTI3NmU5NmQ4YTIxOTYxMjQyMTg2MWNhODFkZWQ2
|
14
|
+
ZDYyYWUyNzJjZGNkYzFkYWU0YjI2NzkwZTI1OGNkODFmNTZhNzhjNjE5OGY4
|
15
|
+
MmQzMzFkMTIxYzNkODM5NDFkNzc4ZDYwMjc2YTE2ZmZkZDgxZWY=
|
data/.gitattributes
ADDED
data/CHANGELOG.md
CHANGED
@@ -2,7 +2,25 @@
|
|
2
2
|
|
3
3
|
## [Unreleased](https://github.com/theodi/csvlint.rb/tree/HEAD)
|
4
4
|
|
5
|
-
[Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.
|
5
|
+
[Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.0...HEAD)
|
6
|
+
|
7
|
+
**Implemented enhancements:**
|
8
|
+
|
9
|
+
- Get total rows number about the CSV file that was validated [\#143](https://github.com/theodi/csvlint.rb/issues/143)
|
10
|
+
|
11
|
+
**Closed issues:**
|
12
|
+
|
13
|
+
- Optimization: Stream CSV [\#122](https://github.com/theodi/csvlint.rb/issues/122)
|
14
|
+
|
15
|
+
**Merged pull requests:**
|
16
|
+
|
17
|
+
- Add `row\_count` method [\#153](https://github.com/theodi/csvlint.rb/pull/153) ([pezholio](https://github.com/pezholio))
|
18
|
+
|
19
|
+
- Streaming validation [\#146](https://github.com/theodi/csvlint.rb/pull/146) ([pezholio](https://github.com/pezholio))
|
20
|
+
|
21
|
+
## [0.2.0](https://github.com/theodi/csvlint.rb/tree/0.2.0) (2015-10-05)
|
22
|
+
|
23
|
+
[Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.1.4...0.2.0)
|
6
24
|
|
7
25
|
**Closed issues:**
|
8
26
|
|
data/README.md
CHANGED
@@ -77,6 +77,7 @@ best practices
|
|
77
77
|
validator.encoding
|
78
78
|
validator.content_type
|
79
79
|
validator.extension
|
80
|
+
validator.row_count
|
80
81
|
|
81
82
|
#retrieve HTTP headers from request
|
82
83
|
validator.headers
|
@@ -128,7 +129,6 @@ The following types of error can be reported:
|
|
128
129
|
* `:unclosed_quote` -- unclosed quoted field
|
129
130
|
* `:whitespace` -- a quoted column has leading or trailing whitespace
|
130
131
|
* `:line_breaks` -- line breaks were inconsistent or incorrectly specified
|
131
|
-
* `:undeclared_header` -- if there is no machine-readable description of whether a header is present (e.g. in a dialect or `Content-Type` header)
|
132
132
|
|
133
133
|
## Warnings
|
134
134
|
|
@@ -271,6 +271,20 @@ options = {
|
|
271
271
|
validator = Csvlint::Validator.new( "http://example.org/data.csv", nil, nil, options )
|
272
272
|
```
|
273
273
|
|
274
|
+
* :lambda -- Pass a lambda to be called each time a line is validated; it receives the `Validator` object as its argument. For example, this prints the current line number for every line validated:
|
275
|
+
|
276
|
+
```
|
277
|
+
options = {
|
278
|
+
lambda: ->(validator) { puts validator.current_line }
|
279
|
+
}
|
280
|
+
validator = Csvlint::Validator.new( "http://example.org/data.csv", nil, nil, options )
|
281
|
+
=> 1
|
282
|
+
2
|
283
|
+
3
|
284
|
+
4
|
285
|
+
.....
|
286
|
+
```
|
287
|
+
|
274
288
|
## Contributing
|
275
289
|
|
276
290
|
1. Fork it
|
data/bin/csvlint
CHANGED
@@ -58,12 +58,22 @@ def print_error(index, error, dump, color)
|
|
58
58
|
end
|
59
59
|
|
60
60
|
def validate_csv(source, schema, dump)
|
61
|
-
|
61
|
+
@error_count = 0
|
62
|
+
report_lines = lambda do |row|
|
63
|
+
new_errors = row.errors.count
|
64
|
+
if new_errors > @error_count
|
65
|
+
print "!".red
|
66
|
+
else
|
67
|
+
print ".".green
|
68
|
+
end
|
69
|
+
@error_count = new_errors
|
70
|
+
end
|
71
|
+
validator = Csvlint::Validator.new( source, {}, schema, { lambda: report_lines } )
|
62
72
|
|
63
73
|
if $stdout.tty?
|
64
|
-
puts "#{source.path || source || "CSV"} is #{validator.valid? ? "VALID".green : "INVALID".red}"
|
74
|
+
puts "\r\n#{source.path || source || "CSV"} is #{validator.valid? ? "VALID".green : "INVALID".red}"
|
65
75
|
else
|
66
|
-
puts "#{source.path || source || "CSV"} is #{validator.valid? ? "VALID" : "INVALID"}"
|
76
|
+
puts "\r\n#{source.path || source || "CSV"} is #{validator.valid? ? "VALID" : "INVALID"}"
|
67
77
|
end
|
68
78
|
|
69
79
|
if validator.errors.size > 0
|
data/csvlint.gemspec
CHANGED
@@ -23,6 +23,7 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.add_dependency "open_uri_redirections"
|
24
24
|
spec.add_dependency "activesupport"
|
25
25
|
spec.add_dependency "addressable"
|
26
|
+
spec.add_dependency "typhoeus"
|
26
27
|
spec.add_dependency "escape_utils"
|
27
28
|
spec.add_dependency "uri_template"
|
28
29
|
|
data/features/csvupload.feature
CHANGED
@@ -14,7 +14,7 @@ Feature: Collect all the tests that should trigger dialect check related errors
|
|
14
14
|
And it is stored at the url "http://example.com/example1.csv"
|
15
15
|
And I set header to "true"
|
16
16
|
And I ask if there are info messages
|
17
|
-
Then there should be
|
17
|
+
Then there should be 1 info message
|
18
18
|
And one of the messages should have the type "nonrfc_line_breaks"
|
19
19
|
|
20
20
|
Scenario: CR line endings in file give an info message of type :nonrfc_line_breaks
|
@@ -22,7 +22,7 @@ Feature: Collect all the tests that should trigger dialect check related errors
|
|
22
22
|
And it is stored at the url "http://example.com/example1.csv"
|
23
23
|
And I set header to "true"
|
24
24
|
And I ask if there are info messages
|
25
|
-
Then there should be
|
25
|
+
Then there should be 1 info message
|
26
26
|
And one of the messages should have the type "nonrfc_line_breaks"
|
27
27
|
|
28
28
|
Scenario: CRLF line endings in file produces no info messages of type :nonrfc_line_breaks
|
@@ -30,13 +30,13 @@ Feature: Collect all the tests that should trigger dialect check related errors
|
|
30
30
|
And it is stored at the url "http://example.com/example1.csv"
|
31
31
|
And I set header to "true"
|
32
32
|
And I ask if there are info messages
|
33
|
-
Then there should be
|
33
|
+
Then there should be 0 info messages
|
34
34
|
|
35
35
|
# :line_breaks
|
36
36
|
|
37
37
|
Scenario: Incorrect line endings specified in settings
|
38
|
-
Given I have a CSV file called "
|
39
|
-
And I set the line endings to
|
38
|
+
Given I have a CSV file called "lf-line-endings.csv"
|
39
|
+
And I set the line endings to carriage return
|
40
40
|
And it is stored at the url "http://example.com/example1.csv"
|
41
41
|
And I ask if there are errors
|
42
42
|
Then there should be 1 error
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -10,13 +10,13 @@ Feature: Return information
|
|
10
10
|
And it is stored at the url "http://example.com/example1.csv?query=true"
|
11
11
|
|
12
12
|
Scenario: Return encoding
|
13
|
-
Then the "encoding" should be "
|
14
|
-
|
13
|
+
Then the "encoding" should be "UTF-8"
|
14
|
+
|
15
15
|
Scenario: Return content type
|
16
|
-
Then the "content_type" should be "text/csv"
|
16
|
+
Then the "content_type" should be "text/csv; charset=utf-8"
|
17
17
|
|
18
18
|
Scenario: Return extension
|
19
19
|
Then the "extension" should be ".csv"
|
20
|
-
|
20
|
+
|
21
21
|
Scenario: Return meta
|
22
22
|
Then the metadata content type should be "text/csv; charset=utf-8"
|
@@ -13,6 +13,11 @@ Given(/^I set the line endings to linefeed$/) do
|
|
13
13
|
@csv_options["lineTerminator"] = "\n"
|
14
14
|
end
|
15
15
|
|
16
|
+
Given(/^I set the line endings to carriage return$/) do
|
17
|
+
@csv_options ||= default_csv_options
|
18
|
+
@csv_options["lineTerminator"] = "\r"
|
19
|
+
end
|
20
|
+
|
16
21
|
Given(/^I set header to "(.*?)"$/) do |boolean|
|
17
22
|
@csv_options ||= default_csv_options
|
18
23
|
@csv_options["header"] = boolean == "true"
|
@@ -27,7 +27,7 @@ Feature: Get validation errors
|
|
27
27
|
And that error should have the row "2"
|
28
28
|
And that error should have the content ""Foo","Bar","Baz"
|
29
29
|
|
30
|
-
|
30
|
+
Scenario: Successfully report a CSV with incorrect whitespace
|
31
31
|
Given I have a CSV with the following content:
|
32
32
|
"""
|
33
33
|
"col1","col2","col3"
|
@@ -3,22 +3,22 @@ Feature: Get validation information messages
|
|
3
3
|
Scenario: LF line endings in file give an info message
|
4
4
|
Given I have a CSV file called "lf-line-endings.csv"
|
5
5
|
And it is stored at the url "http://example.com/example1.csv"
|
6
|
-
And I set header to "true"
|
6
|
+
And I set header to "true"
|
7
7
|
And I ask if there are info messages
|
8
|
-
Then there should be
|
8
|
+
Then there should be 1 info messages
|
9
9
|
And one of the messages should have the type "nonrfc_line_breaks"
|
10
10
|
|
11
11
|
Scenario: CR line endings in file give an info message
|
12
12
|
Given I have a CSV file called "cr-line-endings.csv"
|
13
13
|
And it is stored at the url "http://example.com/example1.csv"
|
14
|
-
And I set header to "true"
|
14
|
+
And I set header to "true"
|
15
15
|
And I ask if there are info messages
|
16
|
-
Then there should be
|
16
|
+
Then there should be 1 info messages
|
17
17
|
And one of the messages should have the type "nonrfc_line_breaks"
|
18
18
|
|
19
19
|
Scenario: CRLF line endings in file produces no info messages
|
20
20
|
Given I have a CSV file called "crlf-line-endings.csv"
|
21
21
|
And it is stored at the url "http://example.com/example1.csv"
|
22
|
-
And I set header to "true"
|
22
|
+
And I set header to "true"
|
23
23
|
And I ask if there are info messages
|
24
|
-
Then there should be
|
24
|
+
Then there should be 0 info messages
|
data/lib/csvlint.rb
CHANGED
@@ -3,6 +3,7 @@ require 'date'
|
|
3
3
|
require 'open-uri'
|
4
4
|
require 'set'
|
5
5
|
require 'tempfile'
|
6
|
+
require 'typhoeus'
|
6
7
|
|
7
8
|
require 'active_support/core_ext/date/conversions'
|
8
9
|
require 'active_support/core_ext/time/conversions'
|
@@ -13,7 +14,6 @@ require 'uri_template'
|
|
13
14
|
require 'csvlint/error_message'
|
14
15
|
require 'csvlint/error_collector'
|
15
16
|
require 'csvlint/validate'
|
16
|
-
require 'csvlint/wrapped_io'
|
17
17
|
require 'csvlint/field'
|
18
18
|
|
19
19
|
require 'csvlint/csvw/metadata_error'
|
@@ -134,7 +134,7 @@ module Csvlint
|
|
134
134
|
fractional_regexp += "(#{Regexp.escape(@grouping_separator)}[0-9]{1,#{max_fraction_digits % @fractional_grouping_size}})?" if max_fraction_digits % @fractional_grouping_size > 0
|
135
135
|
else
|
136
136
|
fractional_regexp += "([0-9]{#{@fractional_grouping_size}}#{Regexp.escape(@grouping_separator)}){0,#{(max_fraction_digits / @fractional_grouping_size) - 1}}" if max_fraction_digits > @fractional_grouping_size
|
137
|
-
fractional_regexp += "[0-9]{
|
137
|
+
fractional_regexp += "[0-9]{1,#{@fractional_grouping_size}}"
|
138
138
|
end
|
139
139
|
fractional_regexp = "#{Regexp.escape(@decimal_separator)}#{fractional_regexp}"
|
140
140
|
fractional_regexp = "(#{fractional_regexp})?" if min_fraction_digits == 0
|
data/lib/csvlint/field.rb
CHANGED
@@ -17,7 +17,7 @@ module Csvlint
|
|
17
17
|
def validate_column(value, row=nil, column=nil, all_errors=[])
|
18
18
|
reset
|
19
19
|
unless all_errors.any?{|error| ((error.type == :invalid_regex) && (error.column == column))}
|
20
|
-
validate_regex(value, row, column)
|
20
|
+
validate_regex(value, row, column, all_errors)
|
21
21
|
end
|
22
22
|
validate_length(value, row, column)
|
23
23
|
validate_values(value, row, column)
|
@@ -42,7 +42,7 @@ module Csvlint
|
|
42
42
|
end
|
43
43
|
end
|
44
44
|
|
45
|
-
def validate_regex(value, row, column)
|
45
|
+
def validate_regex(value, row, column, all_errors)
|
46
46
|
pattern = constraints["pattern"]
|
47
47
|
if pattern
|
48
48
|
begin
|
@@ -50,12 +50,18 @@ module Csvlint
|
|
50
50
|
build_errors(:pattern, :schema, row, column, value,
|
51
51
|
{ "pattern" => constraints["pattern"] } ) if !value.nil? && !value.match( constraints["pattern"] )
|
52
52
|
rescue RegexpError
|
53
|
-
|
54
|
-
{ "pattern" => constraints["pattern"] })
|
53
|
+
build_regex_error(value, row, column, pattern, all_errors)
|
55
54
|
end
|
56
55
|
end
|
57
56
|
end
|
58
57
|
|
58
|
+
def build_regex_error(value, row, column, pattern, all_errors)
|
59
|
+
return if @regex_error_exists
|
60
|
+
build_errors(:invalid_regex, :schema, nil, column, ("#{name}: Constraints: Pattern: #{pattern}"),
|
61
|
+
{ "pattern" => constraints["pattern"] })
|
62
|
+
@regex_error_exists = true
|
63
|
+
end
|
64
|
+
|
59
65
|
def validate_values(value, row, column)
|
60
66
|
# If a pattern exists, raise an invalid regex error if it is not in
|
61
67
|
# valid regex form, else, if the value of the relevant field in the csv
|
data/lib/csvlint/validate.rb
CHANGED
@@ -4,181 +4,308 @@ module Csvlint
|
|
4
4
|
|
5
5
|
include Csvlint::ErrorCollector
|
6
6
|
|
7
|
-
attr_reader :encoding, :content_type, :extension, :headers, :link_headers, :
|
7
|
+
attr_reader :encoding, :content_type, :extension, :headers, :link_headers, :dialect, :csv_header, :schema, :data, :current_line
|
8
8
|
|
9
9
|
ERROR_MATCHERS = {
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
"Missing or stray quote" => :stray_quote,
|
11
|
+
"Illegal quoting" => :whitespace,
|
12
|
+
"Unclosed quoted field" => :unclosed_quote,
|
13
|
+
"Unquoted fields do not allow \\r or \\n" => :line_breaks,
|
14
14
|
}
|
15
15
|
|
16
|
-
def initialize(source, dialect =
|
16
|
+
def initialize(source, dialect = {}, schema = nil, options = {})
|
17
17
|
reset
|
18
18
|
@source = source
|
19
19
|
@formats = []
|
20
20
|
@schema = schema
|
21
|
-
|
22
|
-
@
|
21
|
+
@dialect = dialect
|
22
|
+
@csv_header = true
|
23
|
+
@headers = {}
|
24
|
+
@lambda = options[:lambda] || lambda { |a| nil }
|
25
|
+
@leading = ""
|
23
26
|
|
24
27
|
@limit_lines = options[:limit_lines]
|
25
28
|
@extension = parse_extension(source) unless @source.nil?
|
29
|
+
|
30
|
+
@expected_columns = 0
|
31
|
+
@col_counts = []
|
32
|
+
@line_breaks = []
|
33
|
+
|
26
34
|
@errors += @schema.errors unless @schema.nil?
|
27
35
|
@warnings += @schema.warnings unless @schema.nil?
|
28
|
-
validate(dialect)
|
29
36
|
|
37
|
+
@data = [] # it may be advisable to flush this on init?
|
38
|
+
|
39
|
+
validate
|
30
40
|
end
|
31
41
|
|
32
|
-
def validate
|
33
|
-
|
34
|
-
|
42
|
+
def validate
|
43
|
+
if @extension =~ /.xls(x)?/
|
44
|
+
build_warnings(:excel, :context)
|
45
|
+
return
|
46
|
+
end
|
47
|
+
locate_schema unless @schema.instance_of?(Csvlint::Schema)
|
48
|
+
set_dialect
|
49
|
+
|
50
|
+
if @source.class == String
|
51
|
+
validate_url
|
52
|
+
else
|
53
|
+
validate_metadata
|
54
|
+
validate_stream
|
55
|
+
end
|
56
|
+
finish
|
57
|
+
end
|
58
|
+
|
59
|
+
def validate_stream
|
60
|
+
@current_line = 1
|
61
|
+
@source.each_line do |line|
|
62
|
+
break if line_limit_reached?
|
63
|
+
parse_line(line)
|
64
|
+
end
|
65
|
+
validate_line(@leading, @current_line) unless @leading == ""
|
66
|
+
end
|
67
|
+
|
68
|
+
def validate_url
|
69
|
+
@current_line = 1
|
35
70
|
begin
|
36
|
-
|
37
|
-
|
38
|
-
|
71
|
+
request = Typhoeus::Request.new(@source, followlocation: true)
|
72
|
+
request.on_headers do |response|
|
73
|
+
@headers = response.headers || {}
|
74
|
+
@content_type = response.headers["content-type"] rescue nil
|
75
|
+
@response_code = response.code
|
76
|
+
return build_errors(:not_found) if response.code == 404
|
77
|
+
validate_metadata
|
39
78
|
end
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
unless sum.nil?
|
47
|
-
build_warnings(:title_row, :structure) if @col_counts.first < (sum / @col_counts.size.to_f)
|
79
|
+
request.on_body do |chunk|
|
80
|
+
io = StringIO.new(@leading + chunk)
|
81
|
+
io.each_line do |line|
|
82
|
+
break if line_limit_reached?
|
83
|
+
parse_line(line)
|
84
|
+
end
|
48
85
|
end
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
rescue
|
53
|
-
build_errors(:
|
54
|
-
|
55
|
-
io.close if io && io.respond_to?(:close)
|
86
|
+
request.run
|
87
|
+
# Validate the last line too
|
88
|
+
validate_line(@leading, @current_line) unless @leading == ""
|
89
|
+
rescue ArgumentError => ae
|
90
|
+
build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
|
91
|
+
@reported_invalid_encoding = true
|
56
92
|
end
|
57
93
|
end
|
58
94
|
|
59
|
-
def
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
95
|
+
def parse_line(line)
|
96
|
+
line = @leading + line
|
97
|
+
# Check if the last character is a line break - in which case it's a full line
|
98
|
+
if line[-1, 1].include?("\n")
|
99
|
+
# If the number of quotes is odd, the linebreak is inside some quotes
|
100
|
+
if line.count(@dialect["quoteChar"]).odd?
|
101
|
+
@leading = line
|
102
|
+
else
|
103
|
+
validate_line(line, @current_line)
|
104
|
+
@leading = ""
|
105
|
+
@current_line = @current_line+1
|
106
|
+
end
|
107
|
+
else
|
108
|
+
# If it's not a full line, then prepare to add it to the beginning of the next chunk
|
109
|
+
@leading = line
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
def validate_line(input = nil, index = nil)
|
114
|
+
@input = input
|
115
|
+
single_col = false
|
116
|
+
line = index.present? ? index : 0
|
117
|
+
@encoding = input.encoding.to_s
|
118
|
+
report_line_breaks(line)
|
119
|
+
parse_contents(input, line)
|
120
|
+
@lambda.call(self)
|
121
|
+
rescue ArgumentError => ae
|
122
|
+
build_errors(:invalid_encoding, :structure, @current_line, nil, index) unless @reported_invalid_encoding
|
123
|
+
@reported_invalid_encoding = true
|
124
|
+
end
|
125
|
+
|
126
|
+
# analyses the provided csv and builds errors, warnings and info messages
|
127
|
+
def parse_contents(stream, line = nil)
|
128
|
+
# parse_contents will parse one line and apply headers, formats methods and error handle as appropriate
|
129
|
+
current_line = line.present? ? line : 1
|
130
|
+
all_errors = []
|
131
|
+
|
132
|
+
@csv_options[:encoding] = @encoding
|
133
|
+
|
134
|
+
begin
|
135
|
+
row = CSV.parse_line(stream, @csv_options)
|
136
|
+
# this is a one line substitute for CSV.new followed by row = CSV.shift. a CSV Row class is required
|
137
|
+
# CSV.parse will return an array of arrays which breaks subsequent each_with_index invocations
|
138
|
+
# TODO investigate if above would be a drag on memory
|
139
|
+
|
140
|
+
rescue CSV::MalformedCSVError => e
|
141
|
+
build_exception_messages(e, stream, current_line)
|
142
|
+
end
|
143
|
+
|
144
|
+
@data << row
|
145
|
+
if row
|
146
|
+
if current_line <= 1 && @csv_header
|
147
|
+
# this conditional should be refactored somewhere
|
148
|
+
row = row.reject { |col| col.nil? || col.empty? }
|
149
|
+
validate_header(row)
|
150
|
+
@col_counts << row.size
|
151
|
+
else
|
152
|
+
build_formats(row)
|
153
|
+
@col_counts << row.reject { |col| col.nil? || col.empty? }.size
|
154
|
+
@expected_columns = row.size unless @expected_columns != 0
|
155
|
+
build_errors(:blank_rows, :structure, current_line, nil, stream.to_s) if row.reject { |c| c.nil? || c.empty? }.size == 0
|
156
|
+
# Builds errors and warnings related to the provided schema file
|
157
|
+
if @schema
|
158
|
+
@schema.validate_row(row, current_line, all_errors, @source)
|
159
|
+
@errors += @schema.errors
|
160
|
+
all_errors += @schema.errors
|
161
|
+
@warnings += @schema.warnings
|
162
|
+
else
|
163
|
+
build_errors(:ragged_rows, :structure, current_line, nil, stream.to_s) if !row.empty? && row.size != @expected_columns
|
164
|
+
end
|
165
|
+
end
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
169
|
+
def finish
|
170
|
+
sum = @col_counts.inject(:+)
|
171
|
+
unless sum.nil?
|
172
|
+
build_warnings(:title_row, :structure) if @col_counts.first < (sum / @col_counts.size.to_f)
|
173
|
+
end
|
174
|
+
# return expected_columns to calling class
|
175
|
+
build_warnings(:check_options, :structure) if @expected_columns == 1
|
176
|
+
check_consistency
|
177
|
+
check_foreign_keys
|
178
|
+
check_mixed_linebreaks
|
179
|
+
validate_encoding
|
180
|
+
end
|
181
|
+
|
182
|
+
def validate_metadata
|
183
|
+
assumed_header = !@supplied_dialect
|
184
|
+
unless @headers.empty?
|
67
185
|
if @headers["content-type"] =~ /text\/csv/
|
68
|
-
@csv_header = true
|
69
|
-
|
70
|
-
assumed_header = true
|
186
|
+
@csv_header = @csv_header && true
|
187
|
+
assumed_header = @assumed_header.present?
|
71
188
|
end
|
72
189
|
if @headers["content-type"] =~ /header=(present|absent)/
|
73
190
|
@csv_header = true if $1 == "present"
|
74
191
|
@csv_header = false if $1 == "absent"
|
75
|
-
undeclared_header = false
|
76
192
|
assumed_header = false
|
77
193
|
end
|
78
|
-
if @headers["content-type"] !~ /charset=/
|
79
|
-
build_warnings(:no_encoding, :context)
|
80
|
-
else
|
81
|
-
build_warnings(:encoding, :context) if @encoding != "utf-8"
|
82
|
-
end
|
83
194
|
build_warnings(:no_content_type, :context) if @content_type == nil
|
84
195
|
build_errors(:wrong_content_type, :context) unless (@content_type && @content_type =~ /text\/csv/)
|
196
|
+
end
|
197
|
+
@header_processed = true
|
198
|
+
build_info_messages(:assumed_header, :structure) if assumed_header
|
85
199
|
|
86
|
-
|
87
|
-
|
88
|
-
|
200
|
+
@link_headers = @headers["link"].split(",") rescue nil
|
201
|
+
@link_headers.each do |link_header|
|
202
|
+
match = LINK_HEADER_REGEXP.match(link_header)
|
203
|
+
uri = match["uri"].gsub(/(^\<|\>$)/, "") rescue nil
|
204
|
+
rel = match["rel-relationship"].gsub(/(^\"|\"$)/, "") rescue nil
|
205
|
+
param = match["param"]
|
206
|
+
param_value = match["param-value"].gsub(/(^\"|\"$)/, "") rescue nil
|
207
|
+
if rel == "describedby" && param == "type" && ["application/csvm+json", "application/ld+json", "application/json"].include?(param_value)
|
208
|
+
begin
|
209
|
+
url = URI.join(@source_url, uri)
|
210
|
+
schema = Schema.load_from_json(url)
|
211
|
+
if schema.instance_of? Csvlint::Csvw::TableGroup
|
212
|
+
if schema.tables[@source_url]
|
213
|
+
link_schema = schema
|
214
|
+
else
|
215
|
+
warn_if_unsuccessful = true
|
216
|
+
build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
|
217
|
+
end
|
218
|
+
end
|
219
|
+
rescue OpenURI::HTTPError
|
220
|
+
end
|
89
221
|
end
|
222
|
+
end if @link_headers
|
223
|
+
end
|
224
|
+
|
225
|
+
def header?
|
226
|
+
@csv_header && @dialect["header"]
|
227
|
+
end
|
90
228
|
|
229
|
+
def report_line_breaks(line_no=nil)
|
230
|
+
return if @input !~ /[\r|\n]/ # Return straight away if there's no newline character - i.e. we're on the last line
|
231
|
+
line_break = CSV.new(@input).row_sep
|
232
|
+
@line_breaks << line_break
|
233
|
+
unless line_breaks_reported?
|
234
|
+
if line_break != "\r\n"
|
235
|
+
build_info_messages(:nonrfc_line_breaks, :structure, line_no)
|
236
|
+
@line_breaks_reported = true
|
237
|
+
end
|
91
238
|
end
|
92
|
-
build_info_messages(:assumed_header, :structure) if assumed_header
|
93
239
|
end
|
94
240
|
|
95
|
-
def
|
241
|
+
def line_breaks_reported?
|
242
|
+
@line_breaks_reported === true
|
243
|
+
end
|
244
|
+
|
245
|
+
def set_dialect
|
246
|
+
@assumed_header = @dialect["header"].nil?
|
247
|
+
@supplied_dialect = @dialect != {}
|
248
|
+
|
96
249
|
begin
|
97
250
|
schema_dialect = @schema.tables[@source_url].dialect || {}
|
98
251
|
rescue
|
99
252
|
schema_dialect = {}
|
100
253
|
end
|
101
254
|
@dialect = {
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
}.merge(schema_dialect).merge(dialect || {})
|
255
|
+
"header" => true,
|
256
|
+
"delimiter" => ",",
|
257
|
+
"skipInitialSpace" => true,
|
258
|
+
"lineTerminator" => :auto,
|
259
|
+
"quoteChar" => '"',
|
260
|
+
"trim" => :true
|
261
|
+
}.merge(schema_dialect).merge(@dialect || {})
|
109
262
|
|
110
263
|
@csv_header = @csv_header && @dialect["header"]
|
111
264
|
@csv_options = dialect_to_csv_options(@dialect)
|
112
265
|
end
|
113
266
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
267
|
+
def validate_encoding
|
268
|
+
if @headers["content-type"]
|
269
|
+
if @headers["content-type"] !~ /charset=/
|
270
|
+
build_warnings(:no_encoding, :context)
|
271
|
+
elsif @headers["content-type"] !~ /charset=utf-8/i
|
272
|
+
build_warnings(:encoding, :context)
|
273
|
+
end
|
274
|
+
end
|
275
|
+
build_warnings(:encoding, :context) if @encoding != "UTF-8"
|
276
|
+
end
|
121
277
|
|
122
|
-
|
278
|
+
def check_mixed_linebreaks
|
279
|
+
build_linebreak_error if @line_breaks.uniq.count > 1
|
280
|
+
end
|
123
281
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
@line_breaks
|
129
|
-
if @line_breaks != "\r\n"
|
130
|
-
build_info_messages(:nonrfc_line_breaks, :structure)
|
131
|
-
end
|
132
|
-
row = nil
|
133
|
-
loop do
|
134
|
-
current_line += 1
|
135
|
-
if @limit_lines && current_line > @limit_lines
|
136
|
-
break
|
137
|
-
end
|
138
|
-
begin
|
139
|
-
wrapper.reset_line
|
140
|
-
row = csv.shift
|
141
|
-
@data << row
|
142
|
-
if row
|
143
|
-
if current_line == 1 && header?
|
144
|
-
row = row.reject{|col| col.nil? || col.empty?}
|
145
|
-
validate_header(row)
|
146
|
-
@col_counts << row.size
|
147
|
-
else
|
148
|
-
build_formats(row)
|
149
|
-
@col_counts << row.reject{|col| col.nil? || col.empty?}.size
|
150
|
-
@expected_columns = row.size unless @expected_columns != 0
|
151
|
-
|
152
|
-
build_errors(:blank_rows, :structure, current_line, nil, wrapper.line) if row.reject{ |c| c.nil? || c.empty? }.size == 0
|
153
|
-
# Builds errors and warnings related to the provided schema file
|
154
|
-
if @schema
|
155
|
-
@schema.validate_row(row, current_line, all_errors, @source)
|
156
|
-
@errors += @schema.errors
|
157
|
-
all_errors += @schema.errors
|
158
|
-
@warnings += @schema.warnings
|
159
|
-
else
|
160
|
-
build_errors(:ragged_rows, :structure, current_line, nil, wrapper.line) if !row.empty? && row.size != @expected_columns
|
161
|
-
end
|
162
|
-
|
163
|
-
end
|
164
|
-
else
|
165
|
-
break
|
166
|
-
end
|
167
|
-
rescue CSV::MalformedCSVError => e
|
168
|
-
type = fetch_error(e)
|
169
|
-
if type == :stray_quote && !wrapper.line.match(csv.row_sep)
|
170
|
-
build_errors(:line_breaks, :structure)
|
171
|
-
else
|
172
|
-
build_errors(type, :structure, current_line, nil, wrapper.line)
|
173
|
-
end
|
174
|
-
end
|
282
|
+
def line_breaks
|
283
|
+
if @line_breaks.uniq.count > 1
|
284
|
+
:mixed
|
285
|
+
else
|
286
|
+
@line_breaks.uniq.first
|
175
287
|
end
|
176
|
-
|
177
|
-
|
178
|
-
|
288
|
+
end
|
289
|
+
|
290
|
+
def row_count
|
291
|
+
data.count
|
292
|
+
end
|
293
|
+
|
294
|
+
def build_exception_messages(csvException, errChars, lineNo)
|
295
|
+
#TODO 1 - this is a change in logic, rather than straight refactor of previous error building, however original logic is bonkers
|
296
|
+
#TODO 2 - using .kind_of? is a very ugly fix here; it is meant to work around instances where the :auto symbol is preserved in @csv_options
|
297
|
+
type = fetch_error(csvException)
|
298
|
+
if !@csv_options[:row_sep].kind_of?(Symbol) && [:unclosed_quote,:stray_quote].include?(type) && !@input.match(@csv_options[:row_sep])
|
299
|
+
build_linebreak_error
|
300
|
+
else
|
301
|
+
build_errors(type, :structure, lineNo, nil, errChars)
|
179
302
|
end
|
180
303
|
end
|
181
304
|
|
305
|
+
def build_linebreak_error
|
306
|
+
build_errors(:line_breaks, :structure) unless @errors.any? { |e| e.type == :line_breaks }
|
307
|
+
end
|
308
|
+
|
182
309
|
def validate_header(header)
|
183
310
|
names = Set.new
|
184
311
|
header.map{|h| h.strip! } if @dialect["trim"] == :true
|
@@ -198,10 +325,6 @@ module Csvlint
|
|
198
325
|
return valid?
|
199
326
|
end
|
200
327
|
|
201
|
-
def header?
|
202
|
-
@csv_header
|
203
|
-
end
|
204
|
-
|
205
328
|
def fetch_error(error)
|
206
329
|
e = error.message.match(/^(.+?)(?: [io]n)? \(?line \d+\)?\.?$/i)
|
207
330
|
message = e[1] rescue nil
|
@@ -209,15 +332,15 @@ module Csvlint
|
|
209
332
|
end
|
210
333
|
|
211
334
|
def dialect_to_csv_options(dialect)
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
335
|
+
skipinitialspace = dialect["skipInitialSpace"] || true
|
336
|
+
delimiter = dialect["delimiter"]
|
337
|
+
delimiter = delimiter + " " if !skipinitialspace
|
338
|
+
return {
|
339
|
+
:col_sep => delimiter,
|
340
|
+
:row_sep => dialect["lineTerminator"],
|
341
|
+
:quote_char => dialect["quoteChar"],
|
342
|
+
:skip_blanks => false
|
343
|
+
}
|
221
344
|
end
|
222
345
|
|
223
346
|
def build_formats(row)
|
@@ -225,33 +348,34 @@ module Csvlint
|
|
225
348
|
next if col.nil? || col.empty?
|
226
349
|
@formats[i] ||= Hash.new(0)
|
227
350
|
|
228
|
-
format =
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
351
|
+
format =
|
352
|
+
if col.strip[FORMATS[:numeric]]
|
353
|
+
:numeric
|
354
|
+
elsif uri?(col)
|
355
|
+
:uri
|
356
|
+
elsif col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d')
|
357
|
+
:date_db
|
358
|
+
elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
|
359
|
+
:date_short
|
360
|
+
elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
|
361
|
+
:date_rfc822
|
362
|
+
elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
|
363
|
+
:date_long
|
364
|
+
elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
|
365
|
+
:dateTime_time
|
366
|
+
elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
|
367
|
+
:dateTime_hms
|
368
|
+
elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
|
369
|
+
:dateTime_db
|
370
|
+
elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
|
371
|
+
:dateTime_iso8601
|
372
|
+
elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
|
373
|
+
:dateTime_short
|
374
|
+
elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
|
375
|
+
:dateTime_long
|
376
|
+
else
|
377
|
+
:string
|
378
|
+
end
|
255
379
|
|
256
380
|
@formats[i][format] += 1
|
257
381
|
end
|
@@ -277,15 +401,16 @@ module Csvlint
|
|
277
401
|
end
|
278
402
|
|
279
403
|
def locate_schema
|
404
|
+
|
280
405
|
@source_url = nil
|
281
406
|
warn_if_unsuccessful = false
|
282
407
|
case @source
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
408
|
+
when StringIO
|
409
|
+
return
|
410
|
+
when File
|
411
|
+
@source_url = "file:#{File.expand_path(@source)}"
|
412
|
+
else
|
413
|
+
@source_url = @source
|
289
414
|
end
|
290
415
|
unless @schema.nil?
|
291
416
|
if @schema.tables[@source_url]
|
@@ -295,28 +420,6 @@ module Csvlint
|
|
295
420
|
end
|
296
421
|
end
|
297
422
|
link_schema = nil
|
298
|
-
@link_headers.each do |link_header|
|
299
|
-
match = LINK_HEADER_REGEXP.match(link_header)
|
300
|
-
uri = match["uri"].gsub(/(^\<|\>$)/, "") rescue nil
|
301
|
-
rel = match["rel-relationship"].gsub(/(^\"|\"$)/, "") rescue nil
|
302
|
-
param = match["param"]
|
303
|
-
param_value = match["param-value"].gsub(/(^\"|\"$)/, "") rescue nil
|
304
|
-
if rel == "describedby" && param == "type" && ["application/csvm+json", "application/ld+json", "application/json"].include?(param_value)
|
305
|
-
begin
|
306
|
-
url = URI.join(@source_url, uri)
|
307
|
-
schema = Schema.load_from_json(url)
|
308
|
-
if schema.instance_of? Csvlint::Csvw::TableGroup
|
309
|
-
if schema.tables[@source_url]
|
310
|
-
link_schema = schema
|
311
|
-
else
|
312
|
-
warn_if_unsuccessful = true
|
313
|
-
build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
|
314
|
-
end
|
315
|
-
end
|
316
|
-
rescue OpenURI::HTTPError
|
317
|
-
end
|
318
|
-
end
|
319
|
-
end if @link_headers
|
320
423
|
@schema = link_schema if link_schema
|
321
424
|
|
322
425
|
paths = []
|
@@ -324,8 +427,8 @@ module Csvlint
|
|
324
427
|
begin
|
325
428
|
well_known_uri = URI.join(@source_url, "/.well-known/csvm")
|
326
429
|
well_known = open(well_known_uri).read
|
327
|
-
|
328
|
-
rescue OpenURI::HTTPError
|
430
|
+
# TODO
|
431
|
+
rescue OpenURI::HTTPError, URI::BadURIError
|
329
432
|
end
|
330
433
|
end
|
331
434
|
paths = ["{+url}-metadata.json", "csv-metadata.json"] if paths.empty?
|
@@ -345,8 +448,7 @@ module Csvlint
|
|
345
448
|
end
|
346
449
|
end
|
347
450
|
rescue Errno::ENOENT
|
348
|
-
rescue OpenURI::HTTPError
|
349
|
-
rescue ArgumentError
|
451
|
+
rescue OpenURI::HTTPError, URI::BadURIError, ArgumentError
|
350
452
|
rescue => e
|
351
453
|
STDERR.puts e.class
|
352
454
|
STDERR.puts e.message
|
@@ -361,23 +463,24 @@ module Csvlint
|
|
361
463
|
private
|
362
464
|
|
363
465
|
def parse_extension(source)
|
466
|
+
|
364
467
|
case source
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
468
|
+
when File
|
469
|
+
return File.extname( source.path )
|
470
|
+
when IO
|
471
|
+
return ""
|
472
|
+
when StringIO
|
473
|
+
return ""
|
371
474
|
when Tempfile
|
372
475
|
# this is triggered when the revalidate dialect use case happens
|
373
|
-
return ""
|
374
|
-
else
|
375
|
-
begin
|
376
|
-
parsed = URI.parse(source)
|
377
|
-
File.extname(parsed.path)
|
378
|
-
rescue URI::InvalidURIError
|
379
476
|
return ""
|
380
|
-
|
477
|
+
else
|
478
|
+
begin
|
479
|
+
parsed = URI.parse(source)
|
480
|
+
File.extname(parsed.path)
|
481
|
+
rescue URI::InvalidURIError
|
482
|
+
return ""
|
483
|
+
end
|
381
484
|
end
|
382
485
|
end
|
383
486
|
|
@@ -396,20 +499,24 @@ module Csvlint
|
|
396
499
|
false
|
397
500
|
end
|
398
501
|
|
502
|
+
def line_limit_reached?
|
503
|
+
@limit_lines.present? && @current_line > @limit_lines
|
504
|
+
end
|
505
|
+
|
399
506
|
FORMATS = {
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
507
|
+
:string => nil,
|
508
|
+
:numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
|
509
|
+
:uri => /\Ahttps?:/,
|
510
|
+
:date_db => /\A\d{4,}-\d\d-\d\d\z/, # "12345-01-01"
|
511
|
+
:date_long => /\A(?:#{Date::MONTHNAMES.join('|')}) [ \d]\d, \d{4,}\z/, # "January 1, 12345"
|
512
|
+
:date_rfc822 => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d{4,}\z/, # " 1 Jan 12345"
|
513
|
+
:date_short => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')})\z/, # "1 Jan"
|
514
|
+
:dateTime_db => /\A\d{4,}-\d\d-\d\d \d\d:\d\d:\d\d\z/, # "12345-01-01 00:00:00"
|
515
|
+
:dateTime_hms => /\A\d\d:\d\d:\d\d\z/, # "00:00:00"
|
516
|
+
:dateTime_iso8601 => /\A\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\dZ\z/, # "12345-01-01T00:00:00Z"
|
517
|
+
:dateTime_long => /\A(?:#{Date::MONTHNAMES.join('|')}) \d\d, \d{4,} \d\d:\d\d\z/, # "January 01, 12345 00:00"
|
518
|
+
:dateTime_short => /\A\d\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d\d:\d\d\z/, # "01 Jan 00:00"
|
519
|
+
:dateTime_time => /\A\d\d:\d\d\z/, # "00:00"
|
413
520
|
}.freeze
|
414
521
|
|
415
522
|
URI_REGEXP = /(?<uri>.*?)/
|