csvlint 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MmUxZTY5NThhMmU1ZmVlM2M0OWJiMzQ5MGY2NGRiMzk5NGEyYzEyYQ==
4
+ NzVlNGUzMDczMzhmZmJiNDIzZDRiYmJmZTBhZGNjMzcwMzlmZjU5Yw==
5
5
  data.tar.gz: !binary |-
6
- NTllMTYzYjUyYTk0ZTcwZmY5NDJkZjVlMGQzNzM4YWNkYWU2NjFjMg==
6
+ MjdiMmM3ZjVmOTIxOTYzYTk5NGFiNDY3Y2MxMmY0NWRlZjViZGM2OA==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- NTc2NTdhMzI4ZGI5NzFiMzgwZWYwM2E1YWVhMzE2ZmY5ZDUyNzdkODU1MTkw
10
- OTgyZGM1ZGFhODMxNGVmNDkwNjY3ZjY5NDEyM2YzYWJjZDQ3NThiODRiOWY1
11
- OTU1NGM4NGQ0NzQ3ZmRiYmM2MDM1YWM5YWJlMDRiN2MyNWI0YmI=
9
+ ZTM5MTMwYWEzMTYyNzFmMGNlYjMyMWFlZjRlMDQ2YTA5ZjczM2Q4NzJiNDIy
10
+ NzYwYmM0MjQ3ZGMxNzZjN2NlNzA0NDAxNmZlYTQxMTZkNzhiYWIzMTRhOGFi
11
+ ZTIwM2NkYzgwMzcyOGM0YTE3NWZlYWRmYzdjMThjZjEzYzBhOGY=
12
12
  data.tar.gz: !binary |-
13
- OTQ2NDNkN2RjNDlhZDNlYTI3NmU5NmQ4YTIxOTYxMjQyMTg2MWNhODFkZWQ2
14
- ZDYyYWUyNzJjZGNkYzFkYWU0YjI2NzkwZTI1OGNkODFmNTZhNzhjNjE5OGY4
15
- MmQzMzFkMTIxYzNkODM5NDFkNzc4ZDYwMjc2YTE2ZmZkZDgxZWY=
13
+ YTBjNDc4MjI5ZTcyNWUyYjQwMzQ2NWQyYTBjMDI1ODc4Njc4NDllNWQ4YzE0
14
+ MzA4ODJmOTIxYmIwMWE5YjAxYWViZTE1OGY4NDIyNTM2OGU0OTg5NWY2NGRj
15
+ ZmJhYzFhNTBiMzM0Yzg5Y2UxYjQxMzJlMzhmZTc0ZTU1MTg3MTE=
@@ -2,7 +2,27 @@
2
2
 
3
3
  ## [Unreleased](https://github.com/theodi/csvlint.rb/tree/HEAD)
4
4
 
5
- [Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.0...HEAD)
5
+ [Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.1...HEAD)
6
+
7
+ **Closed issues:**
8
+
9
+ - Eliminate some date and time formats \(for speed\) [\#105](https://github.com/theodi/csvlint.rb/issues/105)
10
+
11
+ **Merged pull requests:**
12
+
13
+ - Check characters in validate\_line method [\#160](https://github.com/theodi/csvlint.rb/pull/160) ([pezholio](https://github.com/pezholio))
14
+
15
+ - Further optimisations [\#159](https://github.com/theodi/csvlint.rb/pull/159) ([pezholio](https://github.com/pezholio))
16
+
17
+ - More optimizations after \#157 [\#158](https://github.com/theodi/csvlint.rb/pull/158) ([jpmckinney](https://github.com/jpmckinney))
18
+
19
+ - Memoize the result of CSV\#encode\_re [\#157](https://github.com/theodi/csvlint.rb/pull/157) ([jpmckinney](https://github.com/jpmckinney))
20
+
21
+ - Don't pass leading string to parse\_line [\#155](https://github.com/theodi/csvlint.rb/pull/155) ([pezholio](https://github.com/pezholio))
22
+
23
+ ## [0.2.1](https://github.com/theodi/csvlint.rb/tree/0.2.1) (2015-10-07)
24
+
25
+ [Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.0...0.2.1)
6
26
 
7
27
  **Implemented enhancements:**
8
28
 
@@ -17,14 +17,6 @@ Feature: Collect all the tests that should trigger dialect check related errors
17
17
  Then there should be 1 info message
18
18
  And one of the messages should have the type "nonrfc_line_breaks"
19
19
 
20
- Scenario: CR line endings in file give an info message of type :nonrfc_line_breaks
21
- Given I have a CSV file called "cr-line-endings.csv"
22
- And it is stored at the url "http://example.com/example1.csv"
23
- And I set header to "true"
24
- And I ask if there are info messages
25
- Then there should be 1 info message
26
- And one of the messages should have the type "nonrfc_line_breaks"
27
-
28
20
  Scenario: CRLF line endings in file produces no info messages of type :nonrfc_line_breaks
29
21
  Given I have a CSV file called "crlf-line-endings.csv"
30
22
  And it is stored at the url "http://example.com/example1.csv"
@@ -8,14 +8,6 @@ Feature: Get validation information messages
8
8
  Then there should be 1 info messages
9
9
  And one of the messages should have the type "nonrfc_line_breaks"
10
10
 
11
- Scenario: CR line endings in file give an info message
12
- Given I have a CSV file called "cr-line-endings.csv"
13
- And it is stored at the url "http://example.com/example1.csv"
14
- And I set header to "true"
15
- And I ask if there are info messages
16
- Then there should be 1 info messages
17
- And one of the messages should have the type "nonrfc_line_breaks"
18
-
19
11
  Scenario: CRLF line endings in file produces no info messages
20
12
  Given I have a CSV file called "crlf-line-endings.csv"
21
13
  And it is stored at the url "http://example.com/example1.csv"
@@ -1,6 +1,52 @@
1
1
  module Csvlint
2
2
 
3
3
  class Validator
# LineCSV (added in 0.2.2) is a CSV subclass that replaces four of the parent's
# helper methods (encode_re, encode_str, escape_re, init_converters) with
# versions that look results up in constant-level hashes or skip work entirely.
# The caches are constants, so they are shared by every LineCSV instance; no
# eviction is visible in this hunk.
4
+ class LineCSV < CSV
# ENCODE_RE: pattern string => Regexp built from it (filled on first lookup).
5
+ ENCODE_RE = Hash.new do |h,str|
6
+ h[str] = Regexp.new(str)
7
+ end
8
+
# ENCODE_STR: encoding name => (array of chunks => the chunks each encoded to
# that encoding and joined into one string).
9
+ ENCODE_STR = Hash.new do |h,encoding_name|
10
+ h[encoding_name] = Hash.new do |h,chunks|
11
+ h[chunks] = chunks.map { |chunk| chunk.encode(encoding_name) }.join('')
12
+ end
13
+ end
14
+
# ESCAPE_RE: re_chars => re_esc => str => str with every match of re_chars
# prefixed by re_esc.
# NOTE(review): the nested block parameters all reuse the name `h`, shadowing
# the outer one; each block only ever writes to its own innermost hash.
15
+ ESCAPE_RE = Hash.new do |h,re_chars|
16
+ h[re_chars] = Hash.new do |h,re_esc|
17
+ h[re_esc] = Hash.new do |h,str|
18
+ h[str] = str.gsub(re_chars) {|c| re_esc + c}
19
+ end
20
+ end
21
+ end
22
+
23
+ # Optimization: Memoize `encode_re`.
24
+ # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2273
# Returns the cached Regexp for the string produced by encode_str(*chunks).
25
+ def encode_re(*chunks)
26
+ ENCODE_RE[encode_str(*chunks)]
27
+ end
28
+
29
+ # Optimization: Memoize `encode_str`.
30
+ # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2281
# Looks the chunks up in the cache for this instance's @encoding name.
31
+ def encode_str(*chunks)
32
+ ENCODE_STR[@encoding.name][chunks]
33
+ end
34
+
35
+ # Optimization: Memoize `escape_re`.
36
+ # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2265
# The cache is keyed by this instance's @re_chars and @re_esc, then by str.
37
+ def escape_re(str)
38
+ ESCAPE_RE[@re_chars][@re_esc][str]
39
+ end
40
+
41
+ # Optimization: Disable the CSV library's converters feature.
42
+ # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2100
# Sets both converter lists to empty arrays and removes the
# :unconverted_fields and field_name keys from the options hash (mutating it).
43
+ def init_converters(options, field_name = :converters)
44
+ @converters = []
45
+ @header_converters = []
46
+ options.delete(:unconverted_fields)
47
+ options.delete(field_name)
48
+ end
49
+ end
4
50
 
5
51
  include Csvlint::ErrorCollector
6
52
 
@@ -21,7 +67,7 @@ module Csvlint
21
67
  @dialect = dialect
22
68
  @csv_header = true
23
69
  @headers = {}
24
- @lambda = options[:lambda] || lambda { |a| nil }
70
+ @lambda = options[:lambda]
25
71
  @leading = ""
26
72
 
27
73
  @limit_lines = options[:limit_lines]
@@ -67,29 +113,24 @@ module Csvlint
67
113
 
# validate_url -- this hunk shows the removed 0.2.1 body ("-" lines) and the
# added 0.2.2 body ("+" lines) side by side.
#
# Both versions reset @current_line to 1, build a Typhoeus::Request for @source
# (followlocation: true) and register two callbacks before calling request.run:
#   * on_headers stores the response headers, the "content-type" header and the
#     response code; on a 404 it returns build_errors(:not_found), otherwise it
#     calls validate_metadata.
#   * on_body wraps the chunk in a StringIO and hands each line to parse_line,
#     stopping once line_limit_reached? is true.
# After the request has run, any text left in @leading is validated as the
# final line.
#
# What changes in 0.2.2: the surrounding begin/rescue ArgumentError is removed
# from this method, and the StringIO is built from the chunk alone rather than
# from @leading + chunk.
68
114
  def validate_url
69
115
  @current_line = 1
70
- begin
71
- request = Typhoeus::Request.new(@source, followlocation: true)
72
- request.on_headers do |response|
73
- @headers = response.headers || {}
74
- @content_type = response.headers["content-type"] rescue nil
75
- @response_code = response.code
76
- return build_errors(:not_found) if response.code == 404
77
- validate_metadata
78
- end
79
- request.on_body do |chunk|
80
- io = StringIO.new(@leading + chunk)
81
- io.each_line do |line|
82
- break if line_limit_reached?
83
- parse_line(line)
84
- end
116
+ request = Typhoeus::Request.new(@source, followlocation: true)
117
+ request.on_headers do |response|
118
+ @headers = response.headers || {}
119
+ @content_type = response.headers["content-type"] rescue nil
120
+ @response_code = response.code
121
+ return build_errors(:not_found) if response.code == 404
122
+ validate_metadata
123
+ end
124
+ request.on_body do |chunk|
125
+ io = StringIO.new(chunk)
126
+ io.each_line do |line|
127
+ break if line_limit_reached?
128
+ parse_line(line)
85
129
  end
86
- request.run
87
- # Validate the last line too
88
- validate_line(@leading, @current_line) unless @leading == ""
89
- rescue ArgumentError => ae
90
- build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
91
- @reported_invalid_encoding = true
92
130
  end
131
+ request.run
132
+ # Validate the last line too
133
+ validate_line(@leading, @current_line) unless @leading == ""
93
134
  end
94
135
 
95
136
  def parse_line(line)
@@ -108,6 +149,9 @@ module Csvlint
108
149
  # If it's not a full line, then prepare to add it to the beginning of the next chunk
109
150
  @leading = line
110
151
  end
152
+ rescue ArgumentError => ae
153
+ build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
154
+ @reported_invalid_encoding = true
111
155
  end
112
156
 
113
157
  def validate_line(input = nil, index = nil)
@@ -117,7 +161,7 @@ module Csvlint
117
161
  @encoding = input.encoding.to_s
118
162
  report_line_breaks(line)
119
163
  parse_contents(input, line)
120
- @lambda.call(self)
164
+ @lambda.call(self) unless @lambda.nil?
121
165
  rescue ArgumentError => ae
122
166
  build_errors(:invalid_encoding, :structure, @current_line, nil, index) unless @reported_invalid_encoding
123
167
  @reported_invalid_encoding = true
@@ -132,12 +176,8 @@ module Csvlint
132
176
  @csv_options[:encoding] = @encoding
133
177
 
134
178
  begin
135
- row = CSV.parse_line(stream, @csv_options)
136
- # this is a one line substitute for CSV.new followed by row = CSV.shift. a CSV Row class is required
137
- # CSV.parse will return an array of arrays which breaks subsequent each_with_index invocations
138
- # TODO investigate if above would be a drag on memory
139
-
140
- rescue CSV::MalformedCSVError => e
179
+ row = LineCSV.parse_line(stream, @csv_options)
180
+ rescue LineCSV::MalformedCSVError => e
141
181
  build_exception_messages(e, stream, current_line)
142
182
  end
143
183
 
@@ -227,8 +267,8 @@ module Csvlint
227
267
  end
228
268
 
229
269
  def report_line_breaks(line_no=nil)
230
- return if @input !~ /[\r|\n]/ # Return straight away if there's no newline character - i.e. we're on the last line
231
- line_break = CSV.new(@input).row_sep
270
+ return unless @input[-1, 1].include?("\n") # Return straight away if there's no newline character - i.e. we're on the last line
271
+ line_break = get_line_break(@input)
232
272
  @line_breaks << line_break
233
273
  unless line_breaks_reported?
234
274
  if line_break != "\r\n"
@@ -353,26 +393,8 @@ module Csvlint
353
393
  :numeric
354
394
  elsif uri?(col)
355
395
  :uri
356
- elsif col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d')
357
- :date_db
358
- elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
359
- :date_short
360
- elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
361
- :date_rfc822
362
- elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
363
- :date_long
364
- elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
365
- :dateTime_time
366
- elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
367
- :dateTime_hms
368
- elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
369
- :dateTime_db
370
- elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
371
- :dateTime_iso8601
372
- elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
373
- :dateTime_short
374
- elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
375
- :dateTime_long
396
+ elsif possible_date?(col)
397
+ date_formats(col)
376
398
  else
377
399
  :string
378
400
  end
@@ -493,6 +515,36 @@ module Csvlint
493
515
  false
494
516
  end
495
517
 
# Quick pre-check run before the detailed date/time format tests.
#
# @param col [String] the cell value to inspect
# @return [String, nil] the part of +col+ matched by POSSIBLE_DATE_REGEXP
#   (truthy), or nil when the value does not match
def possible_date?(col)
  col.slice(POSSIBLE_DATE_REGEXP)
end
521
+
# Classify a date-like value as one specific date/time type.
#
# A candidate type is accepted only when the value matches that type's
# FORMATS pattern and also passes date_format? for the given class and
# strptime pattern. Candidates are tried in a fixed order; the first hit wins.
#
# @param col [String] the cell value to classify
# @return [Symbol] the matching date/time type, or :string when none applies
def date_formats(col)
  return :date_db          if col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d')
  return :date_short       if col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
  return :date_rfc822      if col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
  return :date_long        if col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
  return :dateTime_time    if col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
  return :dateTime_hms     if col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
  return :dateTime_db      if col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
  return :dateTime_iso8601 if col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
  return :dateTime_short   if col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
  return :dateTime_long    if col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')

  # Nothing matched: treat the value as plain text.
  :string
end
547
+
496
548
  def date_format?(klass, value, format)
497
549
  klass.strptime(value, format).strftime(format) == value
498
550
  rescue ArgumentError # invalid date
@@ -503,6 +555,15 @@ module Csvlint
503
555
  @limit_lines.present? && @current_line > @limit_lines
504
556
  end
505
557
 
# Determine which line terminator ends +line+.
#
# Only the tail of the string is inspected, so no per-character array is
# allocated for long lines, and "\r\n" is reported only when the line really
# ends with that two-character sequence (a lone "\r", or a "\r" followed by
# some other character, is not mistaken for CRLF).
#
# @param line [String] a single line of input, including its terminator
# @return [String] "\r\n" when the line ends with CRLF, otherwise "\n"
def get_line_break(line)
  line.end_with?("\r\n") ? "\r\n" : "\n"
end
566
+
506
567
  FORMATS = {
507
568
  :string => nil,
508
569
  :numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
@@ -531,6 +592,7 @@ module Csvlint
531
592
  LINK_EXTENSION_REGEXP = Regexp.new("(?<link-extension>(?<param>#{TOKEN_REGEXP})(\\s*=\\s*(?<param-value>#{TOKEN_REGEXP}|#{QUOTED_STRING_REGEXP}))?)")
532
593
  LINK_PARAM_REGEXP = Regexp.new("(#{REL_REGEXP}|#{REV_REGEXP}|#{TITLE_REGEXP}|#{ANCHOR_REGEXP}|#{LINK_EXTENSION_REGEXP})")
533
594
  LINK_HEADER_REGEXP = Regexp.new("\<#{URI_REGEXP}\>(\\s*;\\s*#{LINK_PARAM_REGEXP})*")
# Cheap pre-filter for date/time detection: matches values that start with a
# digit, with one whitespace character followed by a digit, or with a month
# name from Date::ABBR_MONTHNAMES / Date::MONTHNAMES. Both arrays begin with a
# nil placeholder, so it is dropped with `compact` and every alternative is
# joined with an explicit "|" instead of relying on that nil to supply the
# separators. The resulting pattern is unchanged.
POSSIBLE_DATE_REGEXP = Regexp.new("\\A(\\d|\\s\\d|#{(Date::ABBR_MONTHNAMES + Date::MONTHNAMES).compact.join('|')})")
534
596
 
535
597
  end
536
598
  end
@@ -1,3 +1,3 @@
# Csvlint::VERSION holds the gem's version string; this hunk bumps it from
# "0.2.1" ("-" line) to "0.2.2" ("+" line).
1
1
  module Csvlint
2
- VERSION = "0.2.1"
2
+ VERSION = "0.2.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csvlint
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - pezholio
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-07 00:00:00.000000000 Z
11
+ date: 2015-10-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mime-types