csvlint 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/CHANGELOG.md +21 -1
- data/features/csvupload.feature +0 -8
- data/features/validation_info.feature +0 -8
- data/lib/csvlint/validate.rb +113 -51
- data/lib/csvlint/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
NzVlNGUzMDczMzhmZmJiNDIzZDRiYmJmZTBhZGNjMzcwMzlmZjU5Yw==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
MjdiMmM3ZjVmOTIxOTYzYTk5NGFiNDY3Y2MxMmY0NWRlZjViZGM2OA==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ZTM5MTMwYWEzMTYyNzFmMGNlYjMyMWFlZjRlMDQ2YTA5ZjczM2Q4NzJiNDIy
|
10
|
+
NzYwYmM0MjQ3ZGMxNzZjN2NlNzA0NDAxNmZlYTQxMTZkNzhiYWIzMTRhOGFi
|
11
|
+
ZTIwM2NkYzgwMzcyOGM0YTE3NWZlYWRmYzdjMThjZjEzYzBhOGY=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
YTBjNDc4MjI5ZTcyNWUyYjQwMzQ2NWQyYTBjMDI1ODc4Njc4NDllNWQ4YzE0
|
14
|
+
MzA4ODJmOTIxYmIwMWE5YjAxYWViZTE1OGY4NDIyNTM2OGU0OTg5NWY2NGRj
|
15
|
+
ZmJhYzFhNTBiMzM0Yzg5Y2UxYjQxMzJlMzhmZTc0ZTU1MTg3MTE=
|
data/CHANGELOG.md
CHANGED
@@ -2,7 +2,27 @@
|
|
2
2
|
|
3
3
|
## [Unreleased](https://github.com/theodi/csvlint.rb/tree/HEAD)
|
4
4
|
|
5
|
-
[Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.0...HEAD)
|
5
|
+
[Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.1...HEAD)
|
6
|
+
|
7
|
+
**Closed issues:**
|
8
|
+
|
9
|
+
- Eliminate some date and time formats \(for speed\) [\#105](https://github.com/theodi/csvlint.rb/issues/105)
|
10
|
+
|
11
|
+
**Merged pull requests:**
|
12
|
+
|
13
|
+
- Check characters in validate\_line method [\#160](https://github.com/theodi/csvlint.rb/pull/160) ([pezholio](https://github.com/pezholio))
|
14
|
+
|
15
|
+
- Further optimisations [\#159](https://github.com/theodi/csvlint.rb/pull/159) ([pezholio](https://github.com/pezholio))
|
16
|
+
|
17
|
+
- More optimizations after \#157 [\#158](https://github.com/theodi/csvlint.rb/pull/158) ([jpmckinney](https://github.com/jpmckinney))
|
18
|
+
|
19
|
+
- Memoize the result of CSV\#encode\_re [\#157](https://github.com/theodi/csvlint.rb/pull/157) ([jpmckinney](https://github.com/jpmckinney))
|
20
|
+
|
21
|
+
- Don't pass leading string to parse\_line [\#155](https://github.com/theodi/csvlint.rb/pull/155) ([pezholio](https://github.com/pezholio))
|
22
|
+
|
23
|
+
## [0.2.1](https://github.com/theodi/csvlint.rb/tree/0.2.1) (2015-10-07)
|
24
|
+
|
25
|
+
[Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.0...0.2.1)
|
6
26
|
|
7
27
|
**Implemented enhancements:**
|
8
28
|
|
data/features/csvupload.feature
CHANGED
@@ -17,14 +17,6 @@ Feature: Collect all the tests that should trigger dialect check related errors
|
|
17
17
|
Then there should be 1 info message
|
18
18
|
And one of the messages should have the type "nonrfc_line_breaks"
|
19
19
|
|
20
|
-
Scenario: CR line endings in file give an info message of type :nonrfc_line_breaks
|
21
|
-
Given I have a CSV file called "cr-line-endings.csv"
|
22
|
-
And it is stored at the url "http://example.com/example1.csv"
|
23
|
-
And I set header to "true"
|
24
|
-
And I ask if there are info messages
|
25
|
-
Then there should be 1 info message
|
26
|
-
And one of the messages should have the type "nonrfc_line_breaks"
|
27
|
-
|
28
20
|
Scenario: CRLF line endings in file produces no info messages of type :nonrfc_line_breaks
|
29
21
|
Given I have a CSV file called "crlf-line-endings.csv"
|
30
22
|
And it is stored at the url "http://example.com/example1.csv"
|
data/features/validation_info.feature
CHANGED
@@ -8,14 +8,6 @@ Feature: Get validation information messages
|
|
8
8
|
Then there should be 1 info messages
|
9
9
|
And one of the messages should have the type "nonrfc_line_breaks"
|
10
10
|
|
11
|
-
Scenario: CR line endings in file give an info message
|
12
|
-
Given I have a CSV file called "cr-line-endings.csv"
|
13
|
-
And it is stored at the url "http://example.com/example1.csv"
|
14
|
-
And I set header to "true"
|
15
|
-
And I ask if there are info messages
|
16
|
-
Then there should be 1 info messages
|
17
|
-
And one of the messages should have the type "nonrfc_line_breaks"
|
18
|
-
|
19
11
|
Scenario: CRLF line endings in file produces no info messages
|
20
12
|
Given I have a CSV file called "crlf-line-endings.csv"
|
21
13
|
And it is stored at the url "http://example.com/example1.csv"
|
data/lib/csvlint/validate.rb
CHANGED
@@ -1,6 +1,52 @@
|
|
1
1
|
module Csvlint
|
2
2
|
|
3
3
|
class Validator
|
4
|
+
class LineCSV < CSV
|
5
|
+
ENCODE_RE = Hash.new do |h,str|
|
6
|
+
h[str] = Regexp.new(str)
|
7
|
+
end
|
8
|
+
|
9
|
+
ENCODE_STR = Hash.new do |h,encoding_name|
|
10
|
+
h[encoding_name] = Hash.new do |h,chunks|
|
11
|
+
h[chunks] = chunks.map { |chunk| chunk.encode(encoding_name) }.join('')
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
ESCAPE_RE = Hash.new do |h,re_chars|
|
16
|
+
h[re_chars] = Hash.new do |h,re_esc|
|
17
|
+
h[re_esc] = Hash.new do |h,str|
|
18
|
+
h[str] = str.gsub(re_chars) {|c| re_esc + c}
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
# Optimization: Memoize `encode_re`.
|
24
|
+
# @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2273
|
25
|
+
def encode_re(*chunks)
|
26
|
+
ENCODE_RE[encode_str(*chunks)]
|
27
|
+
end
|
28
|
+
|
29
|
+
# Optimization: Memoize `encode_str`.
|
30
|
+
# @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2281
|
31
|
+
def encode_str(*chunks)
|
32
|
+
ENCODE_STR[@encoding.name][chunks]
|
33
|
+
end
|
34
|
+
|
35
|
+
# Optimization: Memoize `escape_re`.
|
36
|
+
# @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2265
|
37
|
+
def escape_re(str)
|
38
|
+
ESCAPE_RE[@re_chars][@re_esc][str]
|
39
|
+
end
|
40
|
+
|
41
|
+
# Optimization: Disable the CSV library's converters feature.
|
42
|
+
# @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2100
|
43
|
+
def init_converters(options, field_name = :converters)
|
44
|
+
@converters = []
|
45
|
+
@header_converters = []
|
46
|
+
options.delete(:unconverted_fields)
|
47
|
+
options.delete(field_name)
|
48
|
+
end
|
49
|
+
end
|
4
50
|
|
5
51
|
include Csvlint::ErrorCollector
|
6
52
|
|
@@ -21,7 +67,7 @@ module Csvlint
|
|
21
67
|
@dialect = dialect
|
22
68
|
@csv_header = true
|
23
69
|
@headers = {}
|
24
|
-
@lambda = options[:lambda]
|
70
|
+
@lambda = options[:lambda]
|
25
71
|
@leading = ""
|
26
72
|
|
27
73
|
@limit_lines = options[:limit_lines]
|
@@ -67,29 +113,24 @@ module Csvlint
|
|
67
113
|
|
68
114
|
def validate_url
|
69
115
|
@current_line = 1
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
parse_line(line)
|
84
|
-
end
|
116
|
+
request = Typhoeus::Request.new(@source, followlocation: true)
|
117
|
+
request.on_headers do |response|
|
118
|
+
@headers = response.headers || {}
|
119
|
+
@content_type = response.headers["content-type"] rescue nil
|
120
|
+
@response_code = response.code
|
121
|
+
return build_errors(:not_found) if response.code == 404
|
122
|
+
validate_metadata
|
123
|
+
end
|
124
|
+
request.on_body do |chunk|
|
125
|
+
io = StringIO.new(chunk)
|
126
|
+
io.each_line do |line|
|
127
|
+
break if line_limit_reached?
|
128
|
+
parse_line(line)
|
85
129
|
end
|
86
|
-
request.run
|
87
|
-
# Validate the last line too
|
88
|
-
validate_line(@leading, @current_line) unless @leading == ""
|
89
|
-
rescue ArgumentError => ae
|
90
|
-
build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
|
91
|
-
@reported_invalid_encoding = true
|
92
130
|
end
|
131
|
+
request.run
|
132
|
+
# Validate the last line too
|
133
|
+
validate_line(@leading, @current_line) unless @leading == ""
|
93
134
|
end
|
94
135
|
|
95
136
|
def parse_line(line)
|
@@ -108,6 +149,9 @@ module Csvlint
|
|
108
149
|
# If it's not a full line, then prepare to add it to the beginning of the next chunk
|
109
150
|
@leading = line
|
110
151
|
end
|
152
|
+
rescue ArgumentError => ae
|
153
|
+
build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
|
154
|
+
@reported_invalid_encoding = true
|
111
155
|
end
|
112
156
|
|
113
157
|
def validate_line(input = nil, index = nil)
|
@@ -117,7 +161,7 @@ module Csvlint
|
|
117
161
|
@encoding = input.encoding.to_s
|
118
162
|
report_line_breaks(line)
|
119
163
|
parse_contents(input, line)
|
120
|
-
@lambda.call(self)
|
164
|
+
@lambda.call(self) unless @lambda.nil?
|
121
165
|
rescue ArgumentError => ae
|
122
166
|
build_errors(:invalid_encoding, :structure, @current_line, nil, index) unless @reported_invalid_encoding
|
123
167
|
@reported_invalid_encoding = true
|
@@ -132,12 +176,8 @@ module Csvlint
|
|
132
176
|
@csv_options[:encoding] = @encoding
|
133
177
|
|
134
178
|
begin
|
135
|
-
row =
|
136
|
-
|
137
|
-
# CSV.parse will return an array of arrays which breaks subsequent each_with_index invocations
|
138
|
-
# TODO investigate if above would be a drag on memory
|
139
|
-
|
140
|
-
rescue CSV::MalformedCSVError => e
|
179
|
+
row = LineCSV.parse_line(stream, @csv_options)
|
180
|
+
rescue LineCSV::MalformedCSVError => e
|
141
181
|
build_exception_messages(e, stream, current_line)
|
142
182
|
end
|
143
183
|
|
@@ -227,8 +267,8 @@ module Csvlint
|
|
227
267
|
end
|
228
268
|
|
229
269
|
def report_line_breaks(line_no=nil)
|
230
|
-
return
|
231
|
-
line_break =
|
270
|
+
return unless @input[-1, 1].include?("\n") # Return straight away if there's no newline character - i.e. we're on the last line
|
271
|
+
line_break = get_line_break(@input)
|
232
272
|
@line_breaks << line_break
|
233
273
|
unless line_breaks_reported?
|
234
274
|
if line_break != "\r\n"
|
@@ -353,26 +393,8 @@ module Csvlint
|
|
353
393
|
:numeric
|
354
394
|
elsif uri?(col)
|
355
395
|
:uri
|
356
|
-
elsif
|
357
|
-
|
358
|
-
elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
|
359
|
-
:date_short
|
360
|
-
elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
|
361
|
-
:date_rfc822
|
362
|
-
elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
|
363
|
-
:date_long
|
364
|
-
elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
|
365
|
-
:dateTime_time
|
366
|
-
elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
|
367
|
-
:dateTime_hms
|
368
|
-
elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
|
369
|
-
:dateTime_db
|
370
|
-
elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
|
371
|
-
:dateTime_iso8601
|
372
|
-
elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
|
373
|
-
:dateTime_short
|
374
|
-
elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
|
375
|
-
:dateTime_long
|
396
|
+
elsif possible_date?(col)
|
397
|
+
date_formats(col)
|
376
398
|
else
|
377
399
|
:string
|
378
400
|
end
|
@@ -493,6 +515,36 @@ module Csvlint
|
|
493
515
|
false
|
494
516
|
end
|
495
517
|
|
518
|
+
def possible_date?(col)
|
519
|
+
col[POSSIBLE_DATE_REGEXP]
|
520
|
+
end
|
521
|
+
|
522
|
+
def date_formats(col)
|
523
|
+
if col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d')
|
524
|
+
:date_db
|
525
|
+
elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
|
526
|
+
:date_short
|
527
|
+
elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
|
528
|
+
:date_rfc822
|
529
|
+
elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
|
530
|
+
:date_long
|
531
|
+
elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
|
532
|
+
:dateTime_time
|
533
|
+
elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
|
534
|
+
:dateTime_hms
|
535
|
+
elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
|
536
|
+
:dateTime_db
|
537
|
+
elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
|
538
|
+
:dateTime_iso8601
|
539
|
+
elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
|
540
|
+
:dateTime_short
|
541
|
+
elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
|
542
|
+
:dateTime_long
|
543
|
+
else
|
544
|
+
:string
|
545
|
+
end
|
546
|
+
end
|
547
|
+
|
496
548
|
def date_format?(klass, value, format)
|
497
549
|
klass.strptime(value, format).strftime(format) == value
|
498
550
|
rescue ArgumentError # invalid date
|
@@ -503,6 +555,15 @@ module Csvlint
|
|
503
555
|
@limit_lines.present? && @current_line > @limit_lines
|
504
556
|
end
|
505
557
|
|
558
|
+
def get_line_break(line)
|
559
|
+
eol = line.chars.last(2)
|
560
|
+
if eol.first == "\r"
|
561
|
+
"\r\n"
|
562
|
+
else
|
563
|
+
"\n"
|
564
|
+
end
|
565
|
+
end
|
566
|
+
|
506
567
|
FORMATS = {
|
507
568
|
:string => nil,
|
508
569
|
:numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
|
@@ -531,6 +592,7 @@ module Csvlint
|
|
531
592
|
LINK_EXTENSION_REGEXP = Regexp.new("(?<link-extension>(?<param>#{TOKEN_REGEXP})(\\s*=\\s*(?<param-value>#{TOKEN_REGEXP}|#{QUOTED_STRING_REGEXP}))?)")
|
532
593
|
LINK_PARAM_REGEXP = Regexp.new("(#{REL_REGEXP}|#{REV_REGEXP}|#{TITLE_REGEXP}|#{ANCHOR_REGEXP}|#{LINK_EXTENSION_REGEXP})")
|
533
594
|
LINK_HEADER_REGEXP = Regexp.new("\<#{URI_REGEXP}\>(\\s*;\\s*#{LINK_PARAM_REGEXP})*")
|
595
|
+
POSSIBLE_DATE_REGEXP = Regexp.new("\\A(\\d|\\s\\d#{Date::ABBR_MONTHNAMES.join('|')}#{Date::MONTHNAMES.join('|')})")
|
534
596
|
|
535
597
|
end
|
536
598
|
end
|
data/lib/csvlint/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csvlint
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- pezholio
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-10-
|
11
|
+
date: 2015-10-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mime-types
|