csvlint 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.gitattributes +2 -0
- data/CHANGELOG.md +19 -1
- data/README.md +15 -1
- data/bin/csvlint +13 -3
- data/csvlint.gemspec +1 -0
- data/features/csvupload.feature +5 -5
- data/features/fixtures/inconsistent-line-endings-unquoted.csv +0 -0
- data/features/fixtures/inconsistent-line-endings.csv +0 -0
- data/features/fixtures/invalid_many_rows.csv +0 -0
- data/features/fixtures/valid_many_rows.csv +0 -0
- data/features/information.feature +4 -4
- data/features/step_definitions/csv_options_steps.rb +5 -0
- data/features/validation_errors.feature +1 -1
- data/features/validation_info.feature +6 -6
- data/lib/csvlint.rb +1 -1
- data/lib/csvlint/csvw/number_format.rb +1 -1
- data/lib/csvlint/field.rb +10 -4
- data/lib/csvlint/validate.rb +326 -219
- data/lib/csvlint/version.rb +1 -1
- data/spec/csvw/number_format_spec.rb +14 -0
- data/spec/validator_spec.rb +450 -146
- metadata +21 -3
- data/lib/csvlint/wrapped_io.rb +0 -21
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MmUxZTY5NThhMmU1ZmVlM2M0OWJiMzQ5MGY2NGRiMzk5NGEyYzEyYQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NTllMTYzYjUyYTk0ZTcwZmY5NDJkZjVlMGQzNzM4YWNkYWU2NjFjMg==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
NTc2NTdhMzI4ZGI5NzFiMzgwZWYwM2E1YWVhMzE2ZmY5ZDUyNzdkODU1MTkw
|
10
|
+
OTgyZGM1ZGFhODMxNGVmNDkwNjY3ZjY5NDEyM2YzYWJjZDQ3NThiODRiOWY1
|
11
|
+
OTU1NGM4NGQ0NzQ3ZmRiYmM2MDM1YWM5YWJlMDRiN2MyNWI0YmI=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
OTQ2NDNkN2RjNDlhZDNlYTI3NmU5NmQ4YTIxOTYxMjQyMTg2MWNhODFkZWQ2
|
14
|
+
ZDYyYWUyNzJjZGNkYzFkYWU0YjI2NzkwZTI1OGNkODFmNTZhNzhjNjE5OGY4
|
15
|
+
MmQzMzFkMTIxYzNkODM5NDFkNzc4ZDYwMjc2YTE2ZmZkZDgxZWY=
|
data/.gitattributes
ADDED
data/CHANGELOG.md
CHANGED
@@ -2,7 +2,25 @@
|
|
2
2
|
|
3
3
|
## [Unreleased](https://github.com/theodi/csvlint.rb/tree/HEAD)
|
4
4
|
|
5
|
-
[Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.
|
5
|
+
[Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.0...HEAD)
|
6
|
+
|
7
|
+
**Implemented enhancements:**
|
8
|
+
|
9
|
+
- Get total rows number about the CSV file that was validated [\#143](https://github.com/theodi/csvlint.rb/issues/143)
|
10
|
+
|
11
|
+
**Closed issues:**
|
12
|
+
|
13
|
+
- Optimization: Stream CSV [\#122](https://github.com/theodi/csvlint.rb/issues/122)
|
14
|
+
|
15
|
+
**Merged pull requests:**
|
16
|
+
|
17
|
+
- Add `row\_count` method [\#153](https://github.com/theodi/csvlint.rb/pull/153) ([pezholio](https://github.com/pezholio))
|
18
|
+
|
19
|
+
- Streaming validation [\#146](https://github.com/theodi/csvlint.rb/pull/146) ([pezholio](https://github.com/pezholio))
|
20
|
+
|
21
|
+
## [0.2.0](https://github.com/theodi/csvlint.rb/tree/0.2.0) (2015-10-05)
|
22
|
+
|
23
|
+
[Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.1.4...0.2.0)
|
6
24
|
|
7
25
|
**Closed issues:**
|
8
26
|
|
data/README.md
CHANGED
@@ -77,6 +77,7 @@ best practices
|
|
77
77
|
validator.encoding
|
78
78
|
validator.content_type
|
79
79
|
validator.extension
|
80
|
+
validator.row_count
|
80
81
|
|
81
82
|
#retrieve HTTP headers from request
|
82
83
|
validator.headers
|
@@ -128,7 +129,6 @@ The following types of error can be reported:
|
|
128
129
|
* `:unclosed_quote` -- unclosed quoted field
|
129
130
|
* `:whitespace` -- a quoted column has leading or trailing whitespace
|
130
131
|
* `:line_breaks` -- line breaks were inconsistent or incorrectly specified
|
131
|
-
* `:undeclared_header` -- if there is no machine-readable description of whether a header is present (e.g. in a dialect or `Content-Type` header)
|
132
132
|
|
133
133
|
## Warnings
|
134
134
|
|
@@ -271,6 +271,20 @@ options = {
|
|
271
271
|
validator = Csvlint::Validator.new( "http://example.org/data.csv", nil, nil, options )
|
272
272
|
```
|
273
273
|
|
274
|
+
* :lambda -- Pass a lambda to be called each time a line is validated; it receives the `Validator` object as its argument. For example, this prints the current line number for every line validated:
|
275
|
+
|
276
|
+
```
|
277
|
+
options = {
|
278
|
+
lambda: ->(validator) { puts validator.current_line }
|
279
|
+
}
|
280
|
+
validator = Csvlint::Validator.new( "http://example.org/data.csv", nil, nil, options )
|
281
|
+
=> 1
|
282
|
+
2
|
283
|
+
3
|
284
|
+
4
|
285
|
+
.....
|
286
|
+
```
|
287
|
+
|
274
288
|
## Contributing
|
275
289
|
|
276
290
|
1. Fork it
|
data/bin/csvlint
CHANGED
@@ -58,12 +58,22 @@ def print_error(index, error, dump, color)
|
|
58
58
|
end
|
59
59
|
|
60
60
|
def validate_csv(source, schema, dump)
|
61
|
-
|
61
|
+
@error_count = 0
|
62
|
+
report_lines = lambda do |row|
|
63
|
+
new_errors = row.errors.count
|
64
|
+
if new_errors > @error_count
|
65
|
+
print "!".red
|
66
|
+
else
|
67
|
+
print ".".green
|
68
|
+
end
|
69
|
+
@error_count = new_errors
|
70
|
+
end
|
71
|
+
validator = Csvlint::Validator.new( source, {}, schema, { lambda: report_lines } )
|
62
72
|
|
63
73
|
if $stdout.tty?
|
64
|
-
puts "#{source.path || source || "CSV"} is #{validator.valid? ? "VALID".green : "INVALID".red}"
|
74
|
+
puts "\r\n#{source.path || source || "CSV"} is #{validator.valid? ? "VALID".green : "INVALID".red}"
|
65
75
|
else
|
66
|
-
puts "#{source.path || source || "CSV"} is #{validator.valid? ? "VALID" : "INVALID"}"
|
76
|
+
puts "\r\n#{source.path || source || "CSV"} is #{validator.valid? ? "VALID" : "INVALID"}"
|
67
77
|
end
|
68
78
|
|
69
79
|
if validator.errors.size > 0
|
data/csvlint.gemspec
CHANGED
@@ -23,6 +23,7 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.add_dependency "open_uri_redirections"
|
24
24
|
spec.add_dependency "activesupport"
|
25
25
|
spec.add_dependency "addressable"
|
26
|
+
spec.add_dependency "typhoeus"
|
26
27
|
spec.add_dependency "escape_utils"
|
27
28
|
spec.add_dependency "uri_template"
|
28
29
|
|
data/features/csvupload.feature
CHANGED
@@ -14,7 +14,7 @@ Feature: Collect all the tests that should trigger dialect check related errors
|
|
14
14
|
And it is stored at the url "http://example.com/example1.csv"
|
15
15
|
And I set header to "true"
|
16
16
|
And I ask if there are info messages
|
17
|
-
Then there should be
|
17
|
+
Then there should be 1 info message
|
18
18
|
And one of the messages should have the type "nonrfc_line_breaks"
|
19
19
|
|
20
20
|
Scenario: CR line endings in file give an info message of type :nonrfc_line_breaks
|
@@ -22,7 +22,7 @@ Feature: Collect all the tests that should trigger dialect check related errors
|
|
22
22
|
And it is stored at the url "http://example.com/example1.csv"
|
23
23
|
And I set header to "true"
|
24
24
|
And I ask if there are info messages
|
25
|
-
Then there should be
|
25
|
+
Then there should be 1 info message
|
26
26
|
And one of the messages should have the type "nonrfc_line_breaks"
|
27
27
|
|
28
28
|
Scenario: CRLF line endings in file produces no info messages of type :nonrfc_line_breaks
|
@@ -30,13 +30,13 @@ Feature: Collect all the tests that should trigger dialect check related errors
|
|
30
30
|
And it is stored at the url "http://example.com/example1.csv"
|
31
31
|
And I set header to "true"
|
32
32
|
And I ask if there are info messages
|
33
|
-
Then there should be
|
33
|
+
Then there should be 0 info messages
|
34
34
|
|
35
35
|
# :line_breaks
|
36
36
|
|
37
37
|
Scenario: Incorrect line endings specified in settings
|
38
|
-
Given I have a CSV file called "
|
39
|
-
And I set the line endings to
|
38
|
+
Given I have a CSV file called "lf-line-endings.csv"
|
39
|
+
And I set the line endings to carriage return
|
40
40
|
And it is stored at the url "http://example.com/example1.csv"
|
41
41
|
And I ask if there are errors
|
42
42
|
Then there should be 1 error
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -10,13 +10,13 @@ Feature: Return information
|
|
10
10
|
And it is stored at the url "http://example.com/example1.csv?query=true"
|
11
11
|
|
12
12
|
Scenario: Return encoding
|
13
|
-
Then the "encoding" should be "
|
14
|
-
|
13
|
+
Then the "encoding" should be "UTF-8"
|
14
|
+
|
15
15
|
Scenario: Return content type
|
16
|
-
Then the "content_type" should be "text/csv"
|
16
|
+
Then the "content_type" should be "text/csv; charset=utf-8"
|
17
17
|
|
18
18
|
Scenario: Return extension
|
19
19
|
Then the "extension" should be ".csv"
|
20
|
-
|
20
|
+
|
21
21
|
Scenario: Return meta
|
22
22
|
Then the metadata content type should be "text/csv; charset=utf-8"
|
@@ -13,6 +13,11 @@ Given(/^I set the line endings to linefeed$/) do
|
|
13
13
|
@csv_options["lineTerminator"] = "\n"
|
14
14
|
end
|
15
15
|
|
16
|
+
Given(/^I set the line endings to carriage return$/) do
|
17
|
+
@csv_options ||= default_csv_options
|
18
|
+
@csv_options["lineTerminator"] = "\r"
|
19
|
+
end
|
20
|
+
|
16
21
|
Given(/^I set header to "(.*?)"$/) do |boolean|
|
17
22
|
@csv_options ||= default_csv_options
|
18
23
|
@csv_options["header"] = boolean == "true"
|
@@ -27,7 +27,7 @@ Feature: Get validation errors
|
|
27
27
|
And that error should have the row "2"
|
28
28
|
And that error should have the content ""Foo","Bar","Baz"
|
29
29
|
|
30
|
-
|
30
|
+
Scenario: Successfully report a CSV with incorrect whitespace
|
31
31
|
Given I have a CSV with the following content:
|
32
32
|
"""
|
33
33
|
"col1","col2","col3"
|
@@ -3,22 +3,22 @@ Feature: Get validation information messages
|
|
3
3
|
Scenario: LF line endings in file give an info message
|
4
4
|
Given I have a CSV file called "lf-line-endings.csv"
|
5
5
|
And it is stored at the url "http://example.com/example1.csv"
|
6
|
-
And I set header to "true"
|
6
|
+
And I set header to "true"
|
7
7
|
And I ask if there are info messages
|
8
|
-
Then there should be
|
8
|
+
Then there should be 1 info messages
|
9
9
|
And one of the messages should have the type "nonrfc_line_breaks"
|
10
10
|
|
11
11
|
Scenario: CR line endings in file give an info message
|
12
12
|
Given I have a CSV file called "cr-line-endings.csv"
|
13
13
|
And it is stored at the url "http://example.com/example1.csv"
|
14
|
-
And I set header to "true"
|
14
|
+
And I set header to "true"
|
15
15
|
And I ask if there are info messages
|
16
|
-
Then there should be
|
16
|
+
Then there should be 1 info messages
|
17
17
|
And one of the messages should have the type "nonrfc_line_breaks"
|
18
18
|
|
19
19
|
Scenario: CRLF line endings in file produces no info messages
|
20
20
|
Given I have a CSV file called "crlf-line-endings.csv"
|
21
21
|
And it is stored at the url "http://example.com/example1.csv"
|
22
|
-
And I set header to "true"
|
22
|
+
And I set header to "true"
|
23
23
|
And I ask if there are info messages
|
24
|
-
Then there should be
|
24
|
+
Then there should be 0 info messages
|
data/lib/csvlint.rb
CHANGED
@@ -3,6 +3,7 @@ require 'date'
|
|
3
3
|
require 'open-uri'
|
4
4
|
require 'set'
|
5
5
|
require 'tempfile'
|
6
|
+
require 'typhoeus'
|
6
7
|
|
7
8
|
require 'active_support/core_ext/date/conversions'
|
8
9
|
require 'active_support/core_ext/time/conversions'
|
@@ -13,7 +14,6 @@ require 'uri_template'
|
|
13
14
|
require 'csvlint/error_message'
|
14
15
|
require 'csvlint/error_collector'
|
15
16
|
require 'csvlint/validate'
|
16
|
-
require 'csvlint/wrapped_io'
|
17
17
|
require 'csvlint/field'
|
18
18
|
|
19
19
|
require 'csvlint/csvw/metadata_error'
|
@@ -134,7 +134,7 @@ module Csvlint
|
|
134
134
|
fractional_regexp += "(#{Regexp.escape(@grouping_separator)}[0-9]{1,#{max_fraction_digits % @fractional_grouping_size}})?" if max_fraction_digits % @fractional_grouping_size > 0
|
135
135
|
else
|
136
136
|
fractional_regexp += "([0-9]{#{@fractional_grouping_size}}#{Regexp.escape(@grouping_separator)}){0,#{(max_fraction_digits / @fractional_grouping_size) - 1}}" if max_fraction_digits > @fractional_grouping_size
|
137
|
-
fractional_regexp += "[0-9]{
|
137
|
+
fractional_regexp += "[0-9]{1,#{@fractional_grouping_size}}"
|
138
138
|
end
|
139
139
|
fractional_regexp = "#{Regexp.escape(@decimal_separator)}#{fractional_regexp}"
|
140
140
|
fractional_regexp = "(#{fractional_regexp})?" if min_fraction_digits == 0
|
data/lib/csvlint/field.rb
CHANGED
@@ -17,7 +17,7 @@ module Csvlint
|
|
17
17
|
def validate_column(value, row=nil, column=nil, all_errors=[])
|
18
18
|
reset
|
19
19
|
unless all_errors.any?{|error| ((error.type == :invalid_regex) && (error.column == column))}
|
20
|
-
validate_regex(value, row, column)
|
20
|
+
validate_regex(value, row, column, all_errors)
|
21
21
|
end
|
22
22
|
validate_length(value, row, column)
|
23
23
|
validate_values(value, row, column)
|
@@ -42,7 +42,7 @@ module Csvlint
|
|
42
42
|
end
|
43
43
|
end
|
44
44
|
|
45
|
-
def validate_regex(value, row, column)
|
45
|
+
def validate_regex(value, row, column, all_errors)
|
46
46
|
pattern = constraints["pattern"]
|
47
47
|
if pattern
|
48
48
|
begin
|
@@ -50,12 +50,18 @@ module Csvlint
|
|
50
50
|
build_errors(:pattern, :schema, row, column, value,
|
51
51
|
{ "pattern" => constraints["pattern"] } ) if !value.nil? && !value.match( constraints["pattern"] )
|
52
52
|
rescue RegexpError
|
53
|
-
|
54
|
-
{ "pattern" => constraints["pattern"] })
|
53
|
+
build_regex_error(value, row, column, pattern, all_errors)
|
55
54
|
end
|
56
55
|
end
|
57
56
|
end
|
58
57
|
|
58
|
+
def build_regex_error(value, row, column, pattern, all_errors)
|
59
|
+
return if @regex_error_exists
|
60
|
+
build_errors(:invalid_regex, :schema, nil, column, ("#{name}: Constraints: Pattern: #{pattern}"),
|
61
|
+
{ "pattern" => constraints["pattern"] })
|
62
|
+
@regex_error_exists = true
|
63
|
+
end
|
64
|
+
|
59
65
|
def validate_values(value, row, column)
|
60
66
|
# If a pattern exists, raise an invalid regex error if it is not in
|
61
67
|
# valid regex form, else, if the value of the relevant field in the csv
|
data/lib/csvlint/validate.rb
CHANGED
@@ -4,181 +4,308 @@ module Csvlint
|
|
4
4
|
|
5
5
|
include Csvlint::ErrorCollector
|
6
6
|
|
7
|
-
attr_reader :encoding, :content_type, :extension, :headers, :link_headers, :
|
7
|
+
attr_reader :encoding, :content_type, :extension, :headers, :link_headers, :dialect, :csv_header, :schema, :data, :current_line
|
8
8
|
|
9
9
|
ERROR_MATCHERS = {
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
"Missing or stray quote" => :stray_quote,
|
11
|
+
"Illegal quoting" => :whitespace,
|
12
|
+
"Unclosed quoted field" => :unclosed_quote,
|
13
|
+
"Unquoted fields do not allow \\r or \\n" => :line_breaks,
|
14
14
|
}
|
15
15
|
|
16
|
-
def initialize(source, dialect =
|
16
|
+
def initialize(source, dialect = {}, schema = nil, options = {})
|
17
17
|
reset
|
18
18
|
@source = source
|
19
19
|
@formats = []
|
20
20
|
@schema = schema
|
21
|
-
|
22
|
-
@
|
21
|
+
@dialect = dialect
|
22
|
+
@csv_header = true
|
23
|
+
@headers = {}
|
24
|
+
@lambda = options[:lambda] || lambda { |a| nil }
|
25
|
+
@leading = ""
|
23
26
|
|
24
27
|
@limit_lines = options[:limit_lines]
|
25
28
|
@extension = parse_extension(source) unless @source.nil?
|
29
|
+
|
30
|
+
@expected_columns = 0
|
31
|
+
@col_counts = []
|
32
|
+
@line_breaks = []
|
33
|
+
|
26
34
|
@errors += @schema.errors unless @schema.nil?
|
27
35
|
@warnings += @schema.warnings unless @schema.nil?
|
28
|
-
validate(dialect)
|
29
36
|
|
37
|
+
@data = [] # it may be advisable to flush this on init?
|
38
|
+
|
39
|
+
validate
|
30
40
|
end
|
31
41
|
|
32
|
-
def validate
|
33
|
-
|
34
|
-
|
42
|
+
def validate
|
43
|
+
if @extension =~ /.xls(x)?/
|
44
|
+
build_warnings(:excel, :context)
|
45
|
+
return
|
46
|
+
end
|
47
|
+
locate_schema unless @schema.instance_of?(Csvlint::Schema)
|
48
|
+
set_dialect
|
49
|
+
|
50
|
+
if @source.class == String
|
51
|
+
validate_url
|
52
|
+
else
|
53
|
+
validate_metadata
|
54
|
+
validate_stream
|
55
|
+
end
|
56
|
+
finish
|
57
|
+
end
|
58
|
+
|
59
|
+
def validate_stream
|
60
|
+
@current_line = 1
|
61
|
+
@source.each_line do |line|
|
62
|
+
break if line_limit_reached?
|
63
|
+
parse_line(line)
|
64
|
+
end
|
65
|
+
validate_line(@leading, @current_line) unless @leading == ""
|
66
|
+
end
|
67
|
+
|
68
|
+
def validate_url
|
69
|
+
@current_line = 1
|
35
70
|
begin
|
36
|
-
|
37
|
-
|
38
|
-
|
71
|
+
request = Typhoeus::Request.new(@source, followlocation: true)
|
72
|
+
request.on_headers do |response|
|
73
|
+
@headers = response.headers || {}
|
74
|
+
@content_type = response.headers["content-type"] rescue nil
|
75
|
+
@response_code = response.code
|
76
|
+
return build_errors(:not_found) if response.code == 404
|
77
|
+
validate_metadata
|
39
78
|
end
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
unless sum.nil?
|
47
|
-
build_warnings(:title_row, :structure) if @col_counts.first < (sum / @col_counts.size.to_f)
|
79
|
+
request.on_body do |chunk|
|
80
|
+
io = StringIO.new(@leading + chunk)
|
81
|
+
io.each_line do |line|
|
82
|
+
break if line_limit_reached?
|
83
|
+
parse_line(line)
|
84
|
+
end
|
48
85
|
end
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
rescue
|
53
|
-
build_errors(:
|
54
|
-
|
55
|
-
io.close if io && io.respond_to?(:close)
|
86
|
+
request.run
|
87
|
+
# Validate the last line too
|
88
|
+
validate_line(@leading, @current_line) unless @leading == ""
|
89
|
+
rescue ArgumentError => ae
|
90
|
+
build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
|
91
|
+
@reported_invalid_encoding = true
|
56
92
|
end
|
57
93
|
end
|
58
94
|
|
59
|
-
def
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
95
|
+
def parse_line(line)
|
96
|
+
line = @leading + line
|
97
|
+
# Check if the last character is a line break - in which case it's a full line
|
98
|
+
if line[-1, 1].include?("\n")
|
99
|
+
# If the number of quotes is odd, the linebreak is inside some quotes
|
100
|
+
if line.count(@dialect["quoteChar"]).odd?
|
101
|
+
@leading = line
|
102
|
+
else
|
103
|
+
validate_line(line, @current_line)
|
104
|
+
@leading = ""
|
105
|
+
@current_line = @current_line+1
|
106
|
+
end
|
107
|
+
else
|
108
|
+
# If it's not a full line, then prepare to add it to the beginning of the next chunk
|
109
|
+
@leading = line
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
def validate_line(input = nil, index = nil)
|
114
|
+
@input = input
|
115
|
+
single_col = false
|
116
|
+
line = index.present? ? index : 0
|
117
|
+
@encoding = input.encoding.to_s
|
118
|
+
report_line_breaks(line)
|
119
|
+
parse_contents(input, line)
|
120
|
+
@lambda.call(self)
|
121
|
+
rescue ArgumentError => ae
|
122
|
+
build_errors(:invalid_encoding, :structure, @current_line, nil, index) unless @reported_invalid_encoding
|
123
|
+
@reported_invalid_encoding = true
|
124
|
+
end
|
125
|
+
|
126
|
+
# analyses the provided csv and builds errors, warnings and info messages
|
127
|
+
def parse_contents(stream, line = nil)
|
128
|
+
# parse_contents will parse one line and apply headers, formats methods and error handle as appropriate
|
129
|
+
current_line = line.present? ? line : 1
|
130
|
+
all_errors = []
|
131
|
+
|
132
|
+
@csv_options[:encoding] = @encoding
|
133
|
+
|
134
|
+
begin
|
135
|
+
row = CSV.parse_line(stream, @csv_options)
|
136
|
+
# this is a one line substitute for CSV.new followed by row = CSV.shift. a CSV Row class is required
|
137
|
+
# CSV.parse will return an array of arrays which breaks subsequent each_with_index invocations
|
138
|
+
# TODO investigate if above would be a drag on memory
|
139
|
+
|
140
|
+
rescue CSV::MalformedCSVError => e
|
141
|
+
build_exception_messages(e, stream, current_line)
|
142
|
+
end
|
143
|
+
|
144
|
+
@data << row
|
145
|
+
if row
|
146
|
+
if current_line <= 1 && @csv_header
|
147
|
+
# this conditional should be refactored somewhere
|
148
|
+
row = row.reject { |col| col.nil? || col.empty? }
|
149
|
+
validate_header(row)
|
150
|
+
@col_counts << row.size
|
151
|
+
else
|
152
|
+
build_formats(row)
|
153
|
+
@col_counts << row.reject { |col| col.nil? || col.empty? }.size
|
154
|
+
@expected_columns = row.size unless @expected_columns != 0
|
155
|
+
build_errors(:blank_rows, :structure, current_line, nil, stream.to_s) if row.reject { |c| c.nil? || c.empty? }.size == 0
|
156
|
+
# Builds errors and warnings related to the provided schema file
|
157
|
+
if @schema
|
158
|
+
@schema.validate_row(row, current_line, all_errors, @source)
|
159
|
+
@errors += @schema.errors
|
160
|
+
all_errors += @schema.errors
|
161
|
+
@warnings += @schema.warnings
|
162
|
+
else
|
163
|
+
build_errors(:ragged_rows, :structure, current_line, nil, stream.to_s) if !row.empty? && row.size != @expected_columns
|
164
|
+
end
|
165
|
+
end
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
169
|
+
def finish
|
170
|
+
sum = @col_counts.inject(:+)
|
171
|
+
unless sum.nil?
|
172
|
+
build_warnings(:title_row, :structure) if @col_counts.first < (sum / @col_counts.size.to_f)
|
173
|
+
end
|
174
|
+
# return expected_columns to calling class
|
175
|
+
build_warnings(:check_options, :structure) if @expected_columns == 1
|
176
|
+
check_consistency
|
177
|
+
check_foreign_keys
|
178
|
+
check_mixed_linebreaks
|
179
|
+
validate_encoding
|
180
|
+
end
|
181
|
+
|
182
|
+
def validate_metadata
|
183
|
+
assumed_header = !@supplied_dialect
|
184
|
+
unless @headers.empty?
|
67
185
|
if @headers["content-type"] =~ /text\/csv/
|
68
|
-
@csv_header = true
|
69
|
-
|
70
|
-
assumed_header = true
|
186
|
+
@csv_header = @csv_header && true
|
187
|
+
assumed_header = @assumed_header.present?
|
71
188
|
end
|
72
189
|
if @headers["content-type"] =~ /header=(present|absent)/
|
73
190
|
@csv_header = true if $1 == "present"
|
74
191
|
@csv_header = false if $1 == "absent"
|
75
|
-
undeclared_header = false
|
76
192
|
assumed_header = false
|
77
193
|
end
|
78
|
-
if @headers["content-type"] !~ /charset=/
|
79
|
-
build_warnings(:no_encoding, :context)
|
80
|
-
else
|
81
|
-
build_warnings(:encoding, :context) if @encoding != "utf-8"
|
82
|
-
end
|
83
194
|
build_warnings(:no_content_type, :context) if @content_type == nil
|
84
195
|
build_errors(:wrong_content_type, :context) unless (@content_type && @content_type =~ /text\/csv/)
|
196
|
+
end
|
197
|
+
@header_processed = true
|
198
|
+
build_info_messages(:assumed_header, :structure) if assumed_header
|
85
199
|
|
86
|
-
|
87
|
-
|
88
|
-
|
200
|
+
@link_headers = @headers["link"].split(",") rescue nil
|
201
|
+
@link_headers.each do |link_header|
|
202
|
+
match = LINK_HEADER_REGEXP.match(link_header)
|
203
|
+
uri = match["uri"].gsub(/(^\<|\>$)/, "") rescue nil
|
204
|
+
rel = match["rel-relationship"].gsub(/(^\"|\"$)/, "") rescue nil
|
205
|
+
param = match["param"]
|
206
|
+
param_value = match["param-value"].gsub(/(^\"|\"$)/, "") rescue nil
|
207
|
+
if rel == "describedby" && param == "type" && ["application/csvm+json", "application/ld+json", "application/json"].include?(param_value)
|
208
|
+
begin
|
209
|
+
url = URI.join(@source_url, uri)
|
210
|
+
schema = Schema.load_from_json(url)
|
211
|
+
if schema.instance_of? Csvlint::Csvw::TableGroup
|
212
|
+
if schema.tables[@source_url]
|
213
|
+
link_schema = schema
|
214
|
+
else
|
215
|
+
warn_if_unsuccessful = true
|
216
|
+
build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
|
217
|
+
end
|
218
|
+
end
|
219
|
+
rescue OpenURI::HTTPError
|
220
|
+
end
|
89
221
|
end
|
222
|
+
end if @link_headers
|
223
|
+
end
|
224
|
+
|
225
|
+
def header?
|
226
|
+
@csv_header && @dialect["header"]
|
227
|
+
end
|
90
228
|
|
229
|
+
def report_line_breaks(line_no=nil)
|
230
|
+
return if @input !~ /[\r|\n]/ # Return straight away if there's no newline character - i.e. we're on the last line
|
231
|
+
line_break = CSV.new(@input).row_sep
|
232
|
+
@line_breaks << line_break
|
233
|
+
unless line_breaks_reported?
|
234
|
+
if line_break != "\r\n"
|
235
|
+
build_info_messages(:nonrfc_line_breaks, :structure, line_no)
|
236
|
+
@line_breaks_reported = true
|
237
|
+
end
|
91
238
|
end
|
92
|
-
build_info_messages(:assumed_header, :structure) if assumed_header
|
93
239
|
end
|
94
240
|
|
95
|
-
def
|
241
|
+
def line_breaks_reported?
|
242
|
+
@line_breaks_reported === true
|
243
|
+
end
|
244
|
+
|
245
|
+
def set_dialect
|
246
|
+
@assumed_header = @dialect["header"].nil?
|
247
|
+
@supplied_dialect = @dialect != {}
|
248
|
+
|
96
249
|
begin
|
97
250
|
schema_dialect = @schema.tables[@source_url].dialect || {}
|
98
251
|
rescue
|
99
252
|
schema_dialect = {}
|
100
253
|
end
|
101
254
|
@dialect = {
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
}.merge(schema_dialect).merge(dialect || {})
|
255
|
+
"header" => true,
|
256
|
+
"delimiter" => ",",
|
257
|
+
"skipInitialSpace" => true,
|
258
|
+
"lineTerminator" => :auto,
|
259
|
+
"quoteChar" => '"',
|
260
|
+
"trim" => :true
|
261
|
+
}.merge(schema_dialect).merge(@dialect || {})
|
109
262
|
|
110
263
|
@csv_header = @csv_header && @dialect["header"]
|
111
264
|
@csv_options = dialect_to_csv_options(@dialect)
|
112
265
|
end
|
113
266
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
267
|
+
def validate_encoding
|
268
|
+
if @headers["content-type"]
|
269
|
+
if @headers["content-type"] !~ /charset=/
|
270
|
+
build_warnings(:no_encoding, :context)
|
271
|
+
elsif @headers["content-type"] !~ /charset=utf-8/i
|
272
|
+
build_warnings(:encoding, :context)
|
273
|
+
end
|
274
|
+
end
|
275
|
+
build_warnings(:encoding, :context) if @encoding != "UTF-8"
|
276
|
+
end
|
121
277
|
|
122
|
-
|
278
|
+
def check_mixed_linebreaks
|
279
|
+
build_linebreak_error if @line_breaks.uniq.count > 1
|
280
|
+
end
|
123
281
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
@line_breaks
|
129
|
-
if @line_breaks != "\r\n"
|
130
|
-
build_info_messages(:nonrfc_line_breaks, :structure)
|
131
|
-
end
|
132
|
-
row = nil
|
133
|
-
loop do
|
134
|
-
current_line += 1
|
135
|
-
if @limit_lines && current_line > @limit_lines
|
136
|
-
break
|
137
|
-
end
|
138
|
-
begin
|
139
|
-
wrapper.reset_line
|
140
|
-
row = csv.shift
|
141
|
-
@data << row
|
142
|
-
if row
|
143
|
-
if current_line == 1 && header?
|
144
|
-
row = row.reject{|col| col.nil? || col.empty?}
|
145
|
-
validate_header(row)
|
146
|
-
@col_counts << row.size
|
147
|
-
else
|
148
|
-
build_formats(row)
|
149
|
-
@col_counts << row.reject{|col| col.nil? || col.empty?}.size
|
150
|
-
@expected_columns = row.size unless @expected_columns != 0
|
151
|
-
|
152
|
-
build_errors(:blank_rows, :structure, current_line, nil, wrapper.line) if row.reject{ |c| c.nil? || c.empty? }.size == 0
|
153
|
-
# Builds errors and warnings related to the provided schema file
|
154
|
-
if @schema
|
155
|
-
@schema.validate_row(row, current_line, all_errors, @source)
|
156
|
-
@errors += @schema.errors
|
157
|
-
all_errors += @schema.errors
|
158
|
-
@warnings += @schema.warnings
|
159
|
-
else
|
160
|
-
build_errors(:ragged_rows, :structure, current_line, nil, wrapper.line) if !row.empty? && row.size != @expected_columns
|
161
|
-
end
|
162
|
-
|
163
|
-
end
|
164
|
-
else
|
165
|
-
break
|
166
|
-
end
|
167
|
-
rescue CSV::MalformedCSVError => e
|
168
|
-
type = fetch_error(e)
|
169
|
-
if type == :stray_quote && !wrapper.line.match(csv.row_sep)
|
170
|
-
build_errors(:line_breaks, :structure)
|
171
|
-
else
|
172
|
-
build_errors(type, :structure, current_line, nil, wrapper.line)
|
173
|
-
end
|
174
|
-
end
|
282
|
+
def line_breaks
|
283
|
+
if @line_breaks.uniq.count > 1
|
284
|
+
:mixed
|
285
|
+
else
|
286
|
+
@line_breaks.uniq.first
|
175
287
|
end
|
176
|
-
|
177
|
-
|
178
|
-
|
288
|
+
end
|
289
|
+
|
290
|
+
def row_count
|
291
|
+
data.count
|
292
|
+
end
|
293
|
+
|
294
|
+
def build_exception_messages(csvException, errChars, lineNo)
|
295
|
+
#TODO 1 - this is a change in logic, rather than straight refactor of previous error building, however original logic is bonkers
|
296
|
+
#TODO 2 - using .kind_of? is a very ugly fix here and is meant to work around instances where :auto symbol is preserved in @csv_options
|
297
|
+
type = fetch_error(csvException)
|
298
|
+
if !@csv_options[:row_sep].kind_of?(Symbol) && [:unclosed_quote,:stray_quote].include?(type) && !@input.match(@csv_options[:row_sep])
|
299
|
+
build_linebreak_error
|
300
|
+
else
|
301
|
+
build_errors(type, :structure, lineNo, nil, errChars)
|
179
302
|
end
|
180
303
|
end
|
181
304
|
|
305
|
+
def build_linebreak_error
|
306
|
+
build_errors(:line_breaks, :structure) unless @errors.any? { |e| e.type == :line_breaks }
|
307
|
+
end
|
308
|
+
|
182
309
|
def validate_header(header)
|
183
310
|
names = Set.new
|
184
311
|
header.map{|h| h.strip! } if @dialect["trim"] == :true
|
@@ -198,10 +325,6 @@ module Csvlint
|
|
198
325
|
return valid?
|
199
326
|
end
|
200
327
|
|
201
|
-
def header?
|
202
|
-
@csv_header
|
203
|
-
end
|
204
|
-
|
205
328
|
def fetch_error(error)
|
206
329
|
e = error.message.match(/^(.+?)(?: [io]n)? \(?line \d+\)?\.?$/i)
|
207
330
|
message = e[1] rescue nil
|
@@ -209,15 +332,15 @@ module Csvlint
|
|
209
332
|
end
|
210
333
|
|
211
334
|
def dialect_to_csv_options(dialect)
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
335
|
+
skipinitialspace = dialect["skipInitialSpace"] || true
|
336
|
+
delimiter = dialect["delimiter"]
|
337
|
+
delimiter = delimiter + " " if !skipinitialspace
|
338
|
+
return {
|
339
|
+
:col_sep => delimiter,
|
340
|
+
:row_sep => dialect["lineTerminator"],
|
341
|
+
:quote_char => dialect["quoteChar"],
|
342
|
+
:skip_blanks => false
|
343
|
+
}
|
221
344
|
end
|
222
345
|
|
223
346
|
def build_formats(row)
|
@@ -225,33 +348,34 @@ module Csvlint
|
|
225
348
|
next if col.nil? || col.empty?
|
226
349
|
@formats[i] ||= Hash.new(0)
|
227
350
|
|
228
|
-
format =
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
351
|
+
format =
|
352
|
+
if col.strip[FORMATS[:numeric]]
|
353
|
+
:numeric
|
354
|
+
elsif uri?(col)
|
355
|
+
:uri
|
356
|
+
elsif col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d')
|
357
|
+
:date_db
|
358
|
+
elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
|
359
|
+
:date_short
|
360
|
+
elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
|
361
|
+
:date_rfc822
|
362
|
+
elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
|
363
|
+
:date_long
|
364
|
+
elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
|
365
|
+
:dateTime_time
|
366
|
+
elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
|
367
|
+
:dateTime_hms
|
368
|
+
elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
|
369
|
+
:dateTime_db
|
370
|
+
elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
|
371
|
+
:dateTime_iso8601
|
372
|
+
elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
|
373
|
+
:dateTime_short
|
374
|
+
elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
|
375
|
+
:dateTime_long
|
376
|
+
else
|
377
|
+
:string
|
378
|
+
end
|
255
379
|
|
256
380
|
@formats[i][format] += 1
|
257
381
|
end
|
@@ -277,15 +401,16 @@ module Csvlint
|
|
277
401
|
end
|
278
402
|
|
279
403
|
def locate_schema
|
404
|
+
|
280
405
|
@source_url = nil
|
281
406
|
warn_if_unsuccessful = false
|
282
407
|
case @source
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
408
|
+
when StringIO
|
409
|
+
return
|
410
|
+
when File
|
411
|
+
@source_url = "file:#{File.expand_path(@source)}"
|
412
|
+
else
|
413
|
+
@source_url = @source
|
289
414
|
end
|
290
415
|
unless @schema.nil?
|
291
416
|
if @schema.tables[@source_url]
|
@@ -295,28 +420,6 @@ module Csvlint
|
|
295
420
|
end
|
296
421
|
end
|
297
422
|
link_schema = nil
|
298
|
-
@link_headers.each do |link_header|
|
299
|
-
match = LINK_HEADER_REGEXP.match(link_header)
|
300
|
-
uri = match["uri"].gsub(/(^\<|\>$)/, "") rescue nil
|
301
|
-
rel = match["rel-relationship"].gsub(/(^\"|\"$)/, "") rescue nil
|
302
|
-
param = match["param"]
|
303
|
-
param_value = match["param-value"].gsub(/(^\"|\"$)/, "") rescue nil
|
304
|
-
if rel == "describedby" && param == "type" && ["application/csvm+json", "application/ld+json", "application/json"].include?(param_value)
|
305
|
-
begin
|
306
|
-
url = URI.join(@source_url, uri)
|
307
|
-
schema = Schema.load_from_json(url)
|
308
|
-
if schema.instance_of? Csvlint::Csvw::TableGroup
|
309
|
-
if schema.tables[@source_url]
|
310
|
-
link_schema = schema
|
311
|
-
else
|
312
|
-
warn_if_unsuccessful = true
|
313
|
-
build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
|
314
|
-
end
|
315
|
-
end
|
316
|
-
rescue OpenURI::HTTPError
|
317
|
-
end
|
318
|
-
end
|
319
|
-
end if @link_headers
|
320
423
|
@schema = link_schema if link_schema
|
321
424
|
|
322
425
|
paths = []
|
@@ -324,8 +427,8 @@ module Csvlint
|
|
324
427
|
begin
|
325
428
|
well_known_uri = URI.join(@source_url, "/.well-known/csvm")
|
326
429
|
well_known = open(well_known_uri).read
|
327
|
-
|
328
|
-
rescue OpenURI::HTTPError
|
430
|
+
# TODO
|
431
|
+
rescue OpenURI::HTTPError, URI::BadURIError
|
329
432
|
end
|
330
433
|
end
|
331
434
|
paths = ["{+url}-metadata.json", "csv-metadata.json"] if paths.empty?
|
@@ -345,8 +448,7 @@ module Csvlint
|
|
345
448
|
end
|
346
449
|
end
|
347
450
|
rescue Errno::ENOENT
|
348
|
-
rescue OpenURI::HTTPError
|
349
|
-
rescue ArgumentError
|
451
|
+
rescue OpenURI::HTTPError, URI::BadURIError, ArgumentError
|
350
452
|
rescue => e
|
351
453
|
STDERR.puts e.class
|
352
454
|
STDERR.puts e.message
|
@@ -361,23 +463,24 @@ module Csvlint
|
|
361
463
|
private
|
362
464
|
|
363
465
|
def parse_extension(source)
|
466
|
+
|
364
467
|
case source
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
468
|
+
when File
|
469
|
+
return File.extname( source.path )
|
470
|
+
when IO
|
471
|
+
return ""
|
472
|
+
when StringIO
|
473
|
+
return ""
|
371
474
|
when Tempfile
|
372
475
|
# this is triggered when the revalidate dialect use case happens
|
373
|
-
return ""
|
374
|
-
else
|
375
|
-
begin
|
376
|
-
parsed = URI.parse(source)
|
377
|
-
File.extname(parsed.path)
|
378
|
-
rescue URI::InvalidURIError
|
379
476
|
return ""
|
380
|
-
|
477
|
+
else
|
478
|
+
begin
|
479
|
+
parsed = URI.parse(source)
|
480
|
+
File.extname(parsed.path)
|
481
|
+
rescue URI::InvalidURIError
|
482
|
+
return ""
|
483
|
+
end
|
381
484
|
end
|
382
485
|
end
|
383
486
|
|
@@ -396,20 +499,24 @@ module Csvlint
|
|
396
499
|
false
|
397
500
|
end
|
398
501
|
|
502
|
+
def line_limit_reached?
|
503
|
+
@limit_lines.present? && @current_line > @limit_lines
|
504
|
+
end
|
505
|
+
|
399
506
|
FORMATS = {
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
507
|
+
:string => nil,
|
508
|
+
:numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
|
509
|
+
:uri => /\Ahttps?:/,
|
510
|
+
:date_db => /\A\d{4,}-\d\d-\d\d\z/, # "12345-01-01"
|
511
|
+
:date_long => /\A(?:#{Date::MONTHNAMES.join('|')}) [ \d]\d, \d{4,}\z/, # "January 1, 12345"
|
512
|
+
:date_rfc822 => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d{4,}\z/, # " 1 Jan 12345"
|
513
|
+
:date_short => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')})\z/, # "1 Jan"
|
514
|
+
:dateTime_db => /\A\d{4,}-\d\d-\d\d \d\d:\d\d:\d\d\z/, # "12345-01-01 00:00:00"
|
515
|
+
:dateTime_hms => /\A\d\d:\d\d:\d\d\z/, # "00:00:00"
|
516
|
+
:dateTime_iso8601 => /\A\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\dZ\z/, # "12345-01-01T00:00:00Z"
|
517
|
+
:dateTime_long => /\A(?:#{Date::MONTHNAMES.join('|')}) \d\d, \d{4,} \d\d:\d\d\z/, # "January 01, 12345 00:00"
|
518
|
+
:dateTime_short => /\A\d\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d\d:\d\d\z/, # "01 Jan 00:00"
|
519
|
+
:dateTime_time => /\A\d\d:\d\d\z/, # "00:00"
|
413
520
|
}.freeze
|
414
521
|
|
415
522
|
URI_REGEXP = /(?<uri>.*?)/
|