csvlint 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MmUxZTY5NThhMmU1ZmVlM2M0OWJiMzQ5MGY2NGRiMzk5NGEyYzEyYQ==
4
+ NzVlNGUzMDczMzhmZmJiNDIzZDRiYmJmZTBhZGNjMzcwMzlmZjU5Yw==
5
5
  data.tar.gz: !binary |-
6
- NTllMTYzYjUyYTk0ZTcwZmY5NDJkZjVlMGQzNzM4YWNkYWU2NjFjMg==
6
+ MjdiMmM3ZjVmOTIxOTYzYTk5NGFiNDY3Y2MxMmY0NWRlZjViZGM2OA==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- NTc2NTdhMzI4ZGI5NzFiMzgwZWYwM2E1YWVhMzE2ZmY5ZDUyNzdkODU1MTkw
10
- OTgyZGM1ZGFhODMxNGVmNDkwNjY3ZjY5NDEyM2YzYWJjZDQ3NThiODRiOWY1
11
- OTU1NGM4NGQ0NzQ3ZmRiYmM2MDM1YWM5YWJlMDRiN2MyNWI0YmI=
9
+ ZTM5MTMwYWEzMTYyNzFmMGNlYjMyMWFlZjRlMDQ2YTA5ZjczM2Q4NzJiNDIy
10
+ NzYwYmM0MjQ3ZGMxNzZjN2NlNzA0NDAxNmZlYTQxMTZkNzhiYWIzMTRhOGFi
11
+ ZTIwM2NkYzgwMzcyOGM0YTE3NWZlYWRmYzdjMThjZjEzYzBhOGY=
12
12
  data.tar.gz: !binary |-
13
- OTQ2NDNkN2RjNDlhZDNlYTI3NmU5NmQ4YTIxOTYxMjQyMTg2MWNhODFkZWQ2
14
- ZDYyYWUyNzJjZGNkYzFkYWU0YjI2NzkwZTI1OGNkODFmNTZhNzhjNjE5OGY4
15
- MmQzMzFkMTIxYzNkODM5NDFkNzc4ZDYwMjc2YTE2ZmZkZDgxZWY=
13
+ YTBjNDc4MjI5ZTcyNWUyYjQwMzQ2NWQyYTBjMDI1ODc4Njc4NDllNWQ4YzE0
14
+ MzA4ODJmOTIxYmIwMWE5YjAxYWViZTE1OGY4NDIyNTM2OGU0OTg5NWY2NGRj
15
+ ZmJhYzFhNTBiMzM0Yzg5Y2UxYjQxMzJlMzhmZTc0ZTU1MTg3MTE=
@@ -2,7 +2,27 @@
2
2
 
3
3
  ## [Unreleased](https://github.com/theodi/csvlint.rb/tree/HEAD)
4
4
 
5
- [Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.0...HEAD)
5
+ [Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.1...HEAD)
6
+
7
+ **Closed issues:**
8
+
9
+ - Eliminate some date and time formats \(for speed\) [\#105](https://github.com/theodi/csvlint.rb/issues/105)
10
+
11
+ **Merged pull requests:**
12
+
13
+ - Check characters in validate\_line method [\#160](https://github.com/theodi/csvlint.rb/pull/160) ([pezholio](https://github.com/pezholio))
14
+
15
+ - Further optimisations [\#159](https://github.com/theodi/csvlint.rb/pull/159) ([pezholio](https://github.com/pezholio))
16
+
17
+ - More optimizations after \#157 [\#158](https://github.com/theodi/csvlint.rb/pull/158) ([jpmckinney](https://github.com/jpmckinney))
18
+
19
+ - Memoize the result of CSV\#encode\_re [\#157](https://github.com/theodi/csvlint.rb/pull/157) ([jpmckinney](https://github.com/jpmckinney))
20
+
21
+ - Don't pass leading string to parse\_line [\#155](https://github.com/theodi/csvlint.rb/pull/155) ([pezholio](https://github.com/pezholio))
22
+
23
+ ## [0.2.1](https://github.com/theodi/csvlint.rb/tree/0.2.1) (2015-10-07)
24
+
25
+ [Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.0...0.2.1)
6
26
 
7
27
  **Implemented enhancements:**
8
28
 
@@ -17,14 +17,6 @@ Feature: Collect all the tests that should trigger dialect check related errors
17
17
  Then there should be 1 info message
18
18
  And one of the messages should have the type "nonrfc_line_breaks"
19
19
 
20
- Scenario: CR line endings in file give an info message of type :nonrfc_line_breaks
21
- Given I have a CSV file called "cr-line-endings.csv"
22
- And it is stored at the url "http://example.com/example1.csv"
23
- And I set header to "true"
24
- And I ask if there are info messages
25
- Then there should be 1 info message
26
- And one of the messages should have the type "nonrfc_line_breaks"
27
-
28
20
  Scenario: CRLF line endings in file produces no info messages of type :nonrfc_line_breaks
29
21
  Given I have a CSV file called "crlf-line-endings.csv"
30
22
  And it is stored at the url "http://example.com/example1.csv"
@@ -8,14 +8,6 @@ Feature: Get validation information messages
8
8
  Then there should be 1 info messages
9
9
  And one of the messages should have the type "nonrfc_line_breaks"
10
10
 
11
- Scenario: CR line endings in file give an info message
12
- Given I have a CSV file called "cr-line-endings.csv"
13
- And it is stored at the url "http://example.com/example1.csv"
14
- And I set header to "true"
15
- And I ask if there are info messages
16
- Then there should be 1 info messages
17
- And one of the messages should have the type "nonrfc_line_breaks"
18
-
19
11
  Scenario: CRLF line endings in file produces no info messages
20
12
  Given I have a CSV file called "crlf-line-endings.csv"
21
13
  And it is stored at the url "http://example.com/example1.csv"
@@ -1,6 +1,52 @@
1
1
  module Csvlint
2
2
 
3
3
  class Validator
4
+ class LineCSV < CSV
5
+ ENCODE_RE = Hash.new do |h,str|
6
+ h[str] = Regexp.new(str)
7
+ end
8
+
9
+ ENCODE_STR = Hash.new do |h,encoding_name|
10
+ h[encoding_name] = Hash.new do |h,chunks|
11
+ h[chunks] = chunks.map { |chunk| chunk.encode(encoding_name) }.join('')
12
+ end
13
+ end
14
+
15
+ ESCAPE_RE = Hash.new do |h,re_chars|
16
+ h[re_chars] = Hash.new do |h,re_esc|
17
+ h[re_esc] = Hash.new do |h,str|
18
+ h[str] = str.gsub(re_chars) {|c| re_esc + c}
19
+ end
20
+ end
21
+ end
22
+
23
+ # Optimization: Memoize `encode_re`.
24
+ # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2273
25
+ def encode_re(*chunks)
26
+ ENCODE_RE[encode_str(*chunks)]
27
+ end
28
+
29
+ # Optimization: Memoize `encode_str`.
30
+ # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2281
31
+ def encode_str(*chunks)
32
+ ENCODE_STR[@encoding.name][chunks]
33
+ end
34
+
35
+ # Optimization: Memoize `escape_re`.
36
+ # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2265
37
+ def escape_re(str)
38
+ ESCAPE_RE[@re_chars][@re_esc][str]
39
+ end
40
+
41
+ # Optimization: Disable the CSV library's converters feature.
42
+ # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2100
43
+ def init_converters(options, field_name = :converters)
44
+ @converters = []
45
+ @header_converters = []
46
+ options.delete(:unconverted_fields)
47
+ options.delete(field_name)
48
+ end
49
+ end
4
50
 
5
51
  include Csvlint::ErrorCollector
6
52
 
@@ -21,7 +67,7 @@ module Csvlint
21
67
  @dialect = dialect
22
68
  @csv_header = true
23
69
  @headers = {}
24
- @lambda = options[:lambda] || lambda { |a| nil }
70
+ @lambda = options[:lambda]
25
71
  @leading = ""
26
72
 
27
73
  @limit_lines = options[:limit_lines]
@@ -67,29 +113,24 @@ module Csvlint
67
113
 
68
114
  def validate_url
69
115
  @current_line = 1
70
- begin
71
- request = Typhoeus::Request.new(@source, followlocation: true)
72
- request.on_headers do |response|
73
- @headers = response.headers || {}
74
- @content_type = response.headers["content-type"] rescue nil
75
- @response_code = response.code
76
- return build_errors(:not_found) if response.code == 404
77
- validate_metadata
78
- end
79
- request.on_body do |chunk|
80
- io = StringIO.new(@leading + chunk)
81
- io.each_line do |line|
82
- break if line_limit_reached?
83
- parse_line(line)
84
- end
116
+ request = Typhoeus::Request.new(@source, followlocation: true)
117
+ request.on_headers do |response|
118
+ @headers = response.headers || {}
119
+ @content_type = response.headers["content-type"] rescue nil
120
+ @response_code = response.code
121
+ return build_errors(:not_found) if response.code == 404
122
+ validate_metadata
123
+ end
124
+ request.on_body do |chunk|
125
+ io = StringIO.new(chunk)
126
+ io.each_line do |line|
127
+ break if line_limit_reached?
128
+ parse_line(line)
85
129
  end
86
- request.run
87
- # Validate the last line too
88
- validate_line(@leading, @current_line) unless @leading == ""
89
- rescue ArgumentError => ae
90
- build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
91
- @reported_invalid_encoding = true
92
130
  end
131
+ request.run
132
+ # Validate the last line too
133
+ validate_line(@leading, @current_line) unless @leading == ""
93
134
  end
94
135
 
95
136
  def parse_line(line)
@@ -108,6 +149,9 @@ module Csvlint
108
149
  # If it's not a full line, then prepare to add it to the beginning of the next chunk
109
150
  @leading = line
110
151
  end
152
+ rescue ArgumentError => ae
153
+ build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
154
+ @reported_invalid_encoding = true
111
155
  end
112
156
 
113
157
  def validate_line(input = nil, index = nil)
@@ -117,7 +161,7 @@ module Csvlint
117
161
  @encoding = input.encoding.to_s
118
162
  report_line_breaks(line)
119
163
  parse_contents(input, line)
120
- @lambda.call(self)
164
+ @lambda.call(self) unless @lambda.nil?
121
165
  rescue ArgumentError => ae
122
166
  build_errors(:invalid_encoding, :structure, @current_line, nil, index) unless @reported_invalid_encoding
123
167
  @reported_invalid_encoding = true
@@ -132,12 +176,8 @@ module Csvlint
132
176
  @csv_options[:encoding] = @encoding
133
177
 
134
178
  begin
135
- row = CSV.parse_line(stream, @csv_options)
136
- # this is a one line substitute for CSV.new followed by row = CSV.shift. a CSV Row class is required
137
- # CSV.parse will return an array of arrays which breaks subsequent each_with_index invocations
138
- # TODO investigate if above would be a drag on memory
139
-
140
- rescue CSV::MalformedCSVError => e
179
+ row = LineCSV.parse_line(stream, @csv_options)
180
+ rescue LineCSV::MalformedCSVError => e
141
181
  build_exception_messages(e, stream, current_line)
142
182
  end
143
183
 
@@ -227,8 +267,8 @@ module Csvlint
227
267
  end
228
268
 
229
269
  def report_line_breaks(line_no=nil)
230
- return if @input !~ /[\r|\n]/ # Return straight away if there's no newline character - i.e. we're on the last line
231
- line_break = CSV.new(@input).row_sep
270
+ return unless @input[-1, 1].include?("\n") # Return straight away if there's no newline character - i.e. we're on the last line
271
+ line_break = get_line_break(@input)
232
272
  @line_breaks << line_break
233
273
  unless line_breaks_reported?
234
274
  if line_break != "\r\n"
@@ -353,26 +393,8 @@ module Csvlint
353
393
  :numeric
354
394
  elsif uri?(col)
355
395
  :uri
356
- elsif col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d')
357
- :date_db
358
- elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
359
- :date_short
360
- elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
361
- :date_rfc822
362
- elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
363
- :date_long
364
- elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
365
- :dateTime_time
366
- elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
367
- :dateTime_hms
368
- elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
369
- :dateTime_db
370
- elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
371
- :dateTime_iso8601
372
- elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
373
- :dateTime_short
374
- elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
375
- :dateTime_long
396
+ elsif possible_date?(col)
397
+ date_formats(col)
376
398
  else
377
399
  :string
378
400
  end
@@ -493,6 +515,36 @@ module Csvlint
493
515
  false
494
516
  end
495
517
 
518
+ def possible_date?(col)
519
+ col[POSSIBLE_DATE_REGEXP]
520
+ end
521
+
522
+ def date_formats(col)
523
+ if col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d')
524
+ :date_db
525
+ elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
526
+ :date_short
527
+ elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
528
+ :date_rfc822
529
+ elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
530
+ :date_long
531
+ elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
532
+ :dateTime_time
533
+ elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
534
+ :dateTime_hms
535
+ elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
536
+ :dateTime_db
537
+ elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
538
+ :dateTime_iso8601
539
+ elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
540
+ :dateTime_short
541
+ elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
542
+ :dateTime_long
543
+ else
544
+ :string
545
+ end
546
+ end
547
+
496
548
  def date_format?(klass, value, format)
497
549
  klass.strptime(value, format).strftime(format) == value
498
550
  rescue ArgumentError # invalid date
@@ -503,6 +555,15 @@ module Csvlint
503
555
  @limit_lines.present? && @current_line > @limit_lines
504
556
  end
505
557
 
558
+ def get_line_break(line)
559
+ eol = line.chars.last(2)
560
+ if eol.first == "\r"
561
+ "\r\n"
562
+ else
563
+ "\n"
564
+ end
565
+ end
566
+
506
567
  FORMATS = {
507
568
  :string => nil,
508
569
  :numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
@@ -531,6 +592,7 @@ module Csvlint
531
592
  LINK_EXTENSION_REGEXP = Regexp.new("(?<link-extension>(?<param>#{TOKEN_REGEXP})(\\s*=\\s*(?<param-value>#{TOKEN_REGEXP}|#{QUOTED_STRING_REGEXP}))?)")
532
593
  LINK_PARAM_REGEXP = Regexp.new("(#{REL_REGEXP}|#{REV_REGEXP}|#{TITLE_REGEXP}|#{ANCHOR_REGEXP}|#{LINK_EXTENSION_REGEXP})")
533
594
  LINK_HEADER_REGEXP = Regexp.new("\<#{URI_REGEXP}\>(\\s*;\\s*#{LINK_PARAM_REGEXP})*")
595
+ POSSIBLE_DATE_REGEXP = Regexp.new("\\A(\\d|\\s\\d#{Date::ABBR_MONTHNAMES.join('|')}#{Date::MONTHNAMES.join('|')})")
534
596
 
535
597
  end
536
598
  end
@@ -1,3 +1,3 @@
1
1
  module Csvlint
2
- VERSION = "0.2.1"
2
+ VERSION = "0.2.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csvlint
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - pezholio
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-07 00:00:00.000000000 Z
11
+ date: 2015-10-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mime-types