csvlint 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/CHANGELOG.md +21 -1
- data/features/csvupload.feature +0 -8
- data/features/validation_info.feature +0 -8
- data/lib/csvlint/validate.rb +113 -51
- data/lib/csvlint/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
NzVlNGUzMDczMzhmZmJiNDIzZDRiYmJmZTBhZGNjMzcwMzlmZjU5Yw==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
MjdiMmM3ZjVmOTIxOTYzYTk5NGFiNDY3Y2MxMmY0NWRlZjViZGM2OA==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ZTM5MTMwYWEzMTYyNzFmMGNlYjMyMWFlZjRlMDQ2YTA5ZjczM2Q4NzJiNDIy
|
10
|
+
NzYwYmM0MjQ3ZGMxNzZjN2NlNzA0NDAxNmZlYTQxMTZkNzhiYWIzMTRhOGFi
|
11
|
+
ZTIwM2NkYzgwMzcyOGM0YTE3NWZlYWRmYzdjMThjZjEzYzBhOGY=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
YTBjNDc4MjI5ZTcyNWUyYjQwMzQ2NWQyYTBjMDI1ODc4Njc4NDllNWQ4YzE0
|
14
|
+
MzA4ODJmOTIxYmIwMWE5YjAxYWViZTE1OGY4NDIyNTM2OGU0OTg5NWY2NGRj
|
15
|
+
ZmJhYzFhNTBiMzM0Yzg5Y2UxYjQxMzJlMzhmZTc0ZTU1MTg3MTE=
|
data/CHANGELOG.md
CHANGED
@@ -2,7 +2,27 @@
|
|
2
2
|
|
3
3
|
## [Unreleased](https://github.com/theodi/csvlint.rb/tree/HEAD)
|
4
4
|
|
5
|
-
[Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.0...HEAD)
|
5
|
+
[Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.1...HEAD)
|
6
|
+
|
7
|
+
**Closed issues:**
|
8
|
+
|
9
|
+
- Eliminate some date and time formats \(for speed\) [\#105](https://github.com/theodi/csvlint.rb/issues/105)
|
10
|
+
|
11
|
+
**Merged pull requests:**
|
12
|
+
|
13
|
+
- Check characters in validate\_line method [\#160](https://github.com/theodi/csvlint.rb/pull/160) ([pezholio](https://github.com/pezholio))
|
14
|
+
|
15
|
+
- Further optimisations [\#159](https://github.com/theodi/csvlint.rb/pull/159) ([pezholio](https://github.com/pezholio))
|
16
|
+
|
17
|
+
- More optimizations after \#157 [\#158](https://github.com/theodi/csvlint.rb/pull/158) ([jpmckinney](https://github.com/jpmckinney))
|
18
|
+
|
19
|
+
- Memoize the result of CSV\#encode\_re [\#157](https://github.com/theodi/csvlint.rb/pull/157) ([jpmckinney](https://github.com/jpmckinney))
|
20
|
+
|
21
|
+
- Don't pass leading string to parse\_line [\#155](https://github.com/theodi/csvlint.rb/pull/155) ([pezholio](https://github.com/pezholio))
|
22
|
+
|
23
|
+
## [0.2.1](https://github.com/theodi/csvlint.rb/tree/0.2.1) (2015-10-07)
|
24
|
+
|
25
|
+
[Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.0...0.2.1)
|
6
26
|
|
7
27
|
**Implemented enhancements:**
|
8
28
|
|
data/features/csvupload.feature
CHANGED
@@ -17,14 +17,6 @@ Feature: Collect all the tests that should trigger dialect check related errors
|
|
17
17
|
Then there should be 1 info message
|
18
18
|
And one of the messages should have the type "nonrfc_line_breaks"
|
19
19
|
|
20
|
-
Scenario: CR line endings in file give an info message of type :nonrfc_line_breaks
|
21
|
-
Given I have a CSV file called "cr-line-endings.csv"
|
22
|
-
And it is stored at the url "http://example.com/example1.csv"
|
23
|
-
And I set header to "true"
|
24
|
-
And I ask if there are info messages
|
25
|
-
Then there should be 1 info message
|
26
|
-
And one of the messages should have the type "nonrfc_line_breaks"
|
27
|
-
|
28
20
|
Scenario: CRLF line endings in file produces no info messages of type :nonrfc_line_breaks
|
29
21
|
Given I have a CSV file called "crlf-line-endings.csv"
|
30
22
|
And it is stored at the url "http://example.com/example1.csv"
|
data/features/validation_info.feature
CHANGED
@@ -8,14 +8,6 @@ Feature: Get validation information messages
|
|
8
8
|
Then there should be 1 info messages
|
9
9
|
And one of the messages should have the type "nonrfc_line_breaks"
|
10
10
|
|
11
|
-
Scenario: CR line endings in file give an info message
|
12
|
-
Given I have a CSV file called "cr-line-endings.csv"
|
13
|
-
And it is stored at the url "http://example.com/example1.csv"
|
14
|
-
And I set header to "true"
|
15
|
-
And I ask if there are info messages
|
16
|
-
Then there should be 1 info messages
|
17
|
-
And one of the messages should have the type "nonrfc_line_breaks"
|
18
|
-
|
19
11
|
Scenario: CRLF line endings in file produces no info messages
|
20
12
|
Given I have a CSV file called "crlf-line-endings.csv"
|
21
13
|
And it is stored at the url "http://example.com/example1.csv"
|
data/lib/csvlint/validate.rb
CHANGED
@@ -1,6 +1,52 @@
|
|
1
1
|
module Csvlint
|
2
2
|
|
3
3
|
class Validator
|
4
|
+
class LineCSV < CSV
|
5
|
+
ENCODE_RE = Hash.new do |h,str|
|
6
|
+
h[str] = Regexp.new(str)
|
7
|
+
end
|
8
|
+
|
9
|
+
ENCODE_STR = Hash.new do |h,encoding_name|
|
10
|
+
h[encoding_name] = Hash.new do |h,chunks|
|
11
|
+
h[chunks] = chunks.map { |chunk| chunk.encode(encoding_name) }.join('')
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
ESCAPE_RE = Hash.new do |h,re_chars|
|
16
|
+
h[re_chars] = Hash.new do |h,re_esc|
|
17
|
+
h[re_esc] = Hash.new do |h,str|
|
18
|
+
h[str] = str.gsub(re_chars) {|c| re_esc + c}
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
# Optimization: Memoize `encode_re`.
|
24
|
+
# @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2273
|
25
|
+
def encode_re(*chunks)
|
26
|
+
ENCODE_RE[encode_str(*chunks)]
|
27
|
+
end
|
28
|
+
|
29
|
+
# Optimization: Memoize `encode_str`.
|
30
|
+
# @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2281
|
31
|
+
def encode_str(*chunks)
|
32
|
+
ENCODE_STR[@encoding.name][chunks]
|
33
|
+
end
|
34
|
+
|
35
|
+
# Optimization: Memoize `escape_re`.
|
36
|
+
# @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2265
|
37
|
+
def escape_re(str)
|
38
|
+
ESCAPE_RE[@re_chars][@re_esc][str]
|
39
|
+
end
|
40
|
+
|
41
|
+
# Optimization: Disable the CSV library's converters feature.
|
42
|
+
# @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2100
|
43
|
+
def init_converters(options, field_name = :converters)
|
44
|
+
@converters = []
|
45
|
+
@header_converters = []
|
46
|
+
options.delete(:unconverted_fields)
|
47
|
+
options.delete(field_name)
|
48
|
+
end
|
49
|
+
end
|
4
50
|
|
5
51
|
include Csvlint::ErrorCollector
|
6
52
|
|
@@ -21,7 +67,7 @@ module Csvlint
|
|
21
67
|
@dialect = dialect
|
22
68
|
@csv_header = true
|
23
69
|
@headers = {}
|
24
|
-
@lambda = options[:lambda]
|
70
|
+
@lambda = options[:lambda]
|
25
71
|
@leading = ""
|
26
72
|
|
27
73
|
@limit_lines = options[:limit_lines]
|
@@ -67,29 +113,24 @@ module Csvlint
|
|
67
113
|
|
68
114
|
def validate_url
|
69
115
|
@current_line = 1
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
parse_line(line)
|
84
|
-
end
|
116
|
+
request = Typhoeus::Request.new(@source, followlocation: true)
|
117
|
+
request.on_headers do |response|
|
118
|
+
@headers = response.headers || {}
|
119
|
+
@content_type = response.headers["content-type"] rescue nil
|
120
|
+
@response_code = response.code
|
121
|
+
return build_errors(:not_found) if response.code == 404
|
122
|
+
validate_metadata
|
123
|
+
end
|
124
|
+
request.on_body do |chunk|
|
125
|
+
io = StringIO.new(chunk)
|
126
|
+
io.each_line do |line|
|
127
|
+
break if line_limit_reached?
|
128
|
+
parse_line(line)
|
85
129
|
end
|
86
|
-
request.run
|
87
|
-
# Validate the last line too
|
88
|
-
validate_line(@leading, @current_line) unless @leading == ""
|
89
|
-
rescue ArgumentError => ae
|
90
|
-
build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
|
91
|
-
@reported_invalid_encoding = true
|
92
130
|
end
|
131
|
+
request.run
|
132
|
+
# Validate the last line too
|
133
|
+
validate_line(@leading, @current_line) unless @leading == ""
|
93
134
|
end
|
94
135
|
|
95
136
|
def parse_line(line)
|
@@ -108,6 +149,9 @@ module Csvlint
|
|
108
149
|
# If it's not a full line, then prepare to add it to the beginning of the next chunk
|
109
150
|
@leading = line
|
110
151
|
end
|
152
|
+
rescue ArgumentError => ae
|
153
|
+
build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
|
154
|
+
@reported_invalid_encoding = true
|
111
155
|
end
|
112
156
|
|
113
157
|
def validate_line(input = nil, index = nil)
|
@@ -117,7 +161,7 @@ module Csvlint
|
|
117
161
|
@encoding = input.encoding.to_s
|
118
162
|
report_line_breaks(line)
|
119
163
|
parse_contents(input, line)
|
120
|
-
@lambda.call(self)
|
164
|
+
@lambda.call(self) unless @lambda.nil?
|
121
165
|
rescue ArgumentError => ae
|
122
166
|
build_errors(:invalid_encoding, :structure, @current_line, nil, index) unless @reported_invalid_encoding
|
123
167
|
@reported_invalid_encoding = true
|
@@ -132,12 +176,8 @@ module Csvlint
|
|
132
176
|
@csv_options[:encoding] = @encoding
|
133
177
|
|
134
178
|
begin
|
135
|
-
row =
|
136
|
-
|
137
|
-
# CSV.parse will return an array of arrays which breaks subsequent each_with_index invocations
|
138
|
-
# TODO investigate if above would be a drag on memory
|
139
|
-
|
140
|
-
rescue CSV::MalformedCSVError => e
|
179
|
+
row = LineCSV.parse_line(stream, @csv_options)
|
180
|
+
rescue LineCSV::MalformedCSVError => e
|
141
181
|
build_exception_messages(e, stream, current_line)
|
142
182
|
end
|
143
183
|
|
@@ -227,8 +267,8 @@ module Csvlint
|
|
227
267
|
end
|
228
268
|
|
229
269
|
def report_line_breaks(line_no=nil)
|
230
|
-
return
|
231
|
-
line_break =
|
270
|
+
return unless @input[-1, 1].include?("\n") # Return straight away if there's no newline character - i.e. we're on the last line
|
271
|
+
line_break = get_line_break(@input)
|
232
272
|
@line_breaks << line_break
|
233
273
|
unless line_breaks_reported?
|
234
274
|
if line_break != "\r\n"
|
@@ -353,26 +393,8 @@ module Csvlint
|
|
353
393
|
:numeric
|
354
394
|
elsif uri?(col)
|
355
395
|
:uri
|
356
|
-
elsif
|
357
|
-
|
358
|
-
elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
|
359
|
-
:date_short
|
360
|
-
elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
|
361
|
-
:date_rfc822
|
362
|
-
elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
|
363
|
-
:date_long
|
364
|
-
elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
|
365
|
-
:dateTime_time
|
366
|
-
elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
|
367
|
-
:dateTime_hms
|
368
|
-
elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
|
369
|
-
:dateTime_db
|
370
|
-
elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
|
371
|
-
:dateTime_iso8601
|
372
|
-
elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
|
373
|
-
:dateTime_short
|
374
|
-
elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
|
375
|
-
:dateTime_long
|
396
|
+
elsif possible_date?(col)
|
397
|
+
date_formats(col)
|
376
398
|
else
|
377
399
|
:string
|
378
400
|
end
|
@@ -493,6 +515,36 @@ module Csvlint
|
|
493
515
|
false
|
494
516
|
end
|
495
517
|
|
518
|
+
def possible_date?(col)
|
519
|
+
col[POSSIBLE_DATE_REGEXP]
|
520
|
+
end
|
521
|
+
|
522
|
+
def date_formats(col)
|
523
|
+
if col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d')
|
524
|
+
:date_db
|
525
|
+
elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
|
526
|
+
:date_short
|
527
|
+
elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
|
528
|
+
:date_rfc822
|
529
|
+
elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
|
530
|
+
:date_long
|
531
|
+
elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
|
532
|
+
:dateTime_time
|
533
|
+
elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
|
534
|
+
:dateTime_hms
|
535
|
+
elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
|
536
|
+
:dateTime_db
|
537
|
+
elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
|
538
|
+
:dateTime_iso8601
|
539
|
+
elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
|
540
|
+
:dateTime_short
|
541
|
+
elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
|
542
|
+
:dateTime_long
|
543
|
+
else
|
544
|
+
:string
|
545
|
+
end
|
546
|
+
end
|
547
|
+
|
496
548
|
def date_format?(klass, value, format)
|
497
549
|
klass.strptime(value, format).strftime(format) == value
|
498
550
|
rescue ArgumentError # invalid date
|
@@ -503,6 +555,15 @@ module Csvlint
|
|
503
555
|
@limit_lines.present? && @current_line > @limit_lines
|
504
556
|
end
|
505
557
|
|
558
|
+
def get_line_break(line)
|
559
|
+
eol = line.chars.last(2)
|
560
|
+
if eol.first == "\r"
|
561
|
+
"\r\n"
|
562
|
+
else
|
563
|
+
"\n"
|
564
|
+
end
|
565
|
+
end
|
566
|
+
|
506
567
|
FORMATS = {
|
507
568
|
:string => nil,
|
508
569
|
:numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
|
@@ -531,6 +592,7 @@ module Csvlint
|
|
531
592
|
LINK_EXTENSION_REGEXP = Regexp.new("(?<link-extension>(?<param>#{TOKEN_REGEXP})(\\s*=\\s*(?<param-value>#{TOKEN_REGEXP}|#{QUOTED_STRING_REGEXP}))?)")
|
532
593
|
LINK_PARAM_REGEXP = Regexp.new("(#{REL_REGEXP}|#{REV_REGEXP}|#{TITLE_REGEXP}|#{ANCHOR_REGEXP}|#{LINK_EXTENSION_REGEXP})")
|
533
594
|
LINK_HEADER_REGEXP = Regexp.new("\<#{URI_REGEXP}\>(\\s*;\\s*#{LINK_PARAM_REGEXP})*")
|
595
|
+
POSSIBLE_DATE_REGEXP = Regexp.new("\\A(\\d|\\s\\d#{Date::ABBR_MONTHNAMES.join('|')}#{Date::MONTHNAMES.join('|')})")
|
534
596
|
|
535
597
|
end
|
536
598
|
end
|
data/lib/csvlint/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csvlint
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- pezholio
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-10-
|
11
|
+
date: 2015-10-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mime-types
|