RubyGems - csvlint - Versions diffs - 0.2.1 → 0.2.2 - Mend

csvlint 0.2.1 → 0.2.2

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.

Files changed (7)

checksums.yaml +8 -8
data/CHANGELOG.md +21 -1
data/features/csvupload.feature +0 -8
data/features/validation_info.feature +0 -8
data/lib/csvlint/validate.rb +113 -51
data/lib/csvlint/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED

@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    MmUxZTY5NThhMmU1ZmVlM2M0OWJiMzQ5MGY2NGRiMzk5NGEyYzEyYQ==
+    NzVlNGUzMDczMzhmZmJiNDIzZDRiYmJmZTBhZGNjMzcwMzlmZjU5Yw==
   data.tar.gz: !binary |-
-    NTllMTYzYjUyYTk0ZTcwZmY5NDJkZjVlMGQzNzM4YWNkYWU2NjFjMg==
+    MjdiMmM3ZjVmOTIxOTYzYTk5NGFiNDY3Y2MxMmY0NWRlZjViZGM2OA==
 SHA512:
   metadata.gz: !binary |-
-    NTc2NTdhMzI4ZGI5NzFiMzgwZWYwM2E1YWVhMzE2ZmY5ZDUyNzdkODU1MTkw
-    OTgyZGM1ZGFhODMxNGVmNDkwNjY3ZjY5NDEyM2YzYWJjZDQ3NThiODRiOWY1
-    OTU1NGM4NGQ0NzQ3ZmRiYmM2MDM1YWM5YWJlMDRiN2MyNWI0YmI=
+    ZTM5MTMwYWEzMTYyNzFmMGNlYjMyMWFlZjRlMDQ2YTA5ZjczM2Q4NzJiNDIy
+    NzYwYmM0MjQ3ZGMxNzZjN2NlNzA0NDAxNmZlYTQxMTZkNzhiYWIzMTRhOGFi
+    ZTIwM2NkYzgwMzcyOGM0YTE3NWZlYWRmYzdjMThjZjEzYzBhOGY=
   data.tar.gz: !binary |-
-    OTQ2NDNkN2RjNDlhZDNlYTI3NmU5NmQ4YTIxOTYxMjQyMTg2MWNhODFkZWQ2
-    ZDYyYWUyNzJjZGNkYzFkYWU0YjI2NzkwZTI1OGNkODFmNTZhNzhjNjE5OGY4
-    MmQzMzFkMTIxYzNkODM5NDFkNzc4ZDYwMjc2YTE2ZmZkZDgxZWY=
+    YTBjNDc4MjI5ZTcyNWUyYjQwMzQ2NWQyYTBjMDI1ODc4Njc4NDllNWQ4YzE0
+    MzA4ODJmOTIxYmIwMWE5YjAxYWViZTE1OGY4NDIyNTM2OGU0OTg5NWY2NGRj
+    ZmJhYzFhNTBiMzM0Yzg5Y2UxYjQxMzJlMzhmZTc0ZTU1MTg3MTE=

data/CHANGELOG.md CHANGED

@@ -2,7 +2,27 @@
 ## [Unreleased](https://github.com/theodi/csvlint.rb/tree/HEAD)
-[Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.0...HEAD)
+[Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.1...HEAD)
+**Closed issues:**
+- Eliminate some date and time formats \(for speed\) [\#105](https://github.com/theodi/csvlint.rb/issues/105)
+**Merged pull requests:**
+- Check characters in validate\_line method [\#160](https://github.com/theodi/csvlint.rb/pull/160) ([pezholio](https://github.com/pezholio))
+- Further optimisations [\#159](https://github.com/theodi/csvlint.rb/pull/159) ([pezholio](https://github.com/pezholio))
+- More optimizations after \#157 [\#158](https://github.com/theodi/csvlint.rb/pull/158) ([jpmckinney](https://github.com/jpmckinney))
+- Memoize the result of CSV\#encode\_re [\#157](https://github.com/theodi/csvlint.rb/pull/157) ([jpmckinney](https://github.com/jpmckinney))
+- Don't pass leading string to parse\_line [\#155](https://github.com/theodi/csvlint.rb/pull/155) ([pezholio](https://github.com/pezholio))
+## [0.2.1](https://github.com/theodi/csvlint.rb/tree/0.2.1) (2015-10-07)
+[Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.0...0.2.1)
 **Implemented enhancements:**

data/features/csvupload.feature CHANGED

@@ -17,14 +17,6 @@ Feature: Collect all the tests that should trigger dialect check related errors
     Then there should be 1 info message
     And one of the messages should have the type "nonrfc_line_breaks"
-  Scenario: CR line endings in file give an info message of type :nonrfc_line_breaks
-    Given I have a CSV file called "cr-line-endings.csv"
-    And it is stored at the url "http://example.com/example1.csv"
-    And I set header to "true"
-    And I ask if there are info messages
-    Then there should be 1 info message
-    And one of the messages should have the type "nonrfc_line_breaks"
   Scenario: CRLF line endings in file produces no info messages of type :nonrfc_line_breaks
     Given I have a CSV file called "crlf-line-endings.csv"
     And it is stored at the url "http://example.com/example1.csv"

data/features/validation_info.feature CHANGED

@@ -8,14 +8,6 @@ Feature: Get validation information messages
     Then there should be 1 info messages
     And one of the messages should have the type "nonrfc_line_breaks"
-  Scenario: CR line endings in file give an info message
-    Given I have a CSV file called "cr-line-endings.csv"
-    And it is stored at the url "http://example.com/example1.csv"
-    And I set header to "true"
-    And I ask if there are info messages
-    Then there should be 1 info messages
-    And one of the messages should have the type "nonrfc_line_breaks"
   Scenario: CRLF line endings in file produces no info messages
     Given I have a CSV file called "crlf-line-endings.csv"
     And it is stored at the url "http://example.com/example1.csv"

data/lib/csvlint/validate.rb CHANGED

@@ -1,6 +1,52 @@
 module Csvlint
   class Validator
+    class LineCSV < CSV
+      ENCODE_RE = Hash.new do |h,str|
+        h[str] = Regexp.new(str)
+      end
+      ENCODE_STR = Hash.new do |h,encoding_name|
+        h[encoding_name] = Hash.new do |h,chunks|
+          h[chunks] = chunks.map { |chunk| chunk.encode(encoding_name) }.join('')
+        end
+      end
+      ESCAPE_RE = Hash.new do |h,re_chars|
+        h[re_chars] = Hash.new do |h,re_esc|
+          h[re_esc] = Hash.new do |h,str|
+            h[str] = str.gsub(re_chars) {|c| re_esc + c}
+          end
+        end
+      end
+      # Optimization: Memoize `encode_re`.
+      # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2273
+      def encode_re(*chunks)
+        ENCODE_RE[encode_str(*chunks)]
+      end
+      # Optimization: Memoize `encode_str`.
+      # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2281
+      def encode_str(*chunks)
+        ENCODE_STR[@encoding.name][chunks]
+      end
+      # Optimization: Memoize `escape_re`.
+      # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2265
+      def escape_re(str)
+        ESCAPE_RE[@re_chars][@re_esc][str]
+      end
+      # Optimization: Disable the CSV library's converters feature.
+      # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2100
+      def init_converters(options, field_name = :converters)
+        @converters = []
+        @header_converters = []
+        options.delete(:unconverted_fields)
+        options.delete(field_name)
+      end
+    end
     include Csvlint::ErrorCollector
@@ -21,7 +67,7 @@ module Csvlint
       @dialect = dialect
       @csv_header = true
       @headers = {}
-      @lambda = options[:lambda] || lambda { |a| nil }
+      @lambda = options[:lambda]
       @leading = ""
       @limit_lines = options[:limit_lines]
@@ -67,29 +113,24 @@ module Csvlint
     def validate_url
       @current_line = 1
-      begin
-        request = Typhoeus::Request.new(@source, followlocation: true)
-        request.on_headers do |response|
-          @headers = response.headers || {}
-          @content_type = response.headers["content-type"] rescue nil
-          @response_code = response.code
-          return build_errors(:not_found) if response.code == 404
-          validate_metadata
-        end
-        request.on_body do |chunk|
-          io = StringIO.new(@leading + chunk)
-          io.each_line do |line|
-            break if line_limit_reached?
-            parse_line(line)
-          end
+      request = Typhoeus::Request.new(@source, followlocation: true)
+      request.on_headers do |response|
+        @headers = response.headers || {}
+        @content_type = response.headers["content-type"] rescue nil
+        @response_code = response.code
+        return build_errors(:not_found) if response.code == 404
+        validate_metadata
+      end
+      request.on_body do |chunk|
+        io = StringIO.new(chunk)
+        io.each_line do |line|
+          break if line_limit_reached?
+          parse_line(line)
         end
-        request.run
-        # Validate the last line too
-        validate_line(@leading, @current_line) unless @leading == ""
-      rescue ArgumentError => ae
-        build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
-        @reported_invalid_encoding = true
       end
+      request.run
+      # Validate the last line too
+      validate_line(@leading, @current_line) unless @leading == ""
     end
     def parse_line(line)
@@ -108,6 +149,9 @@ module Csvlint
         # If it's not a full line, then prepare to add it to the beginning of the next chunk
         @leading = line
       end
+    rescue ArgumentError => ae
+      build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
+      @reported_invalid_encoding = true
     end
     def validate_line(input = nil, index = nil)
@@ -117,7 +161,7 @@ module Csvlint
       @encoding = input.encoding.to_s
       report_line_breaks(line)
       parse_contents(input, line)
-      @lambda.call(self)
+      @lambda.call(self) unless @lambda.nil?
     rescue ArgumentError => ae
       build_errors(:invalid_encoding, :structure, @current_line, nil, index) unless @reported_invalid_encoding
       @reported_invalid_encoding = true
@@ -132,12 +176,8 @@ module Csvlint
       @csv_options[:encoding] = @encoding
       begin
-        row = CSV.parse_line(stream, @csv_options)
-          # this is a one line substitute for CSV.new followed by row = CSV.shift. a CSV Row class is required
-          # CSV.parse will return an array of arrays which breaks subsequent each_with_index invocations
-          # TODO investigate if above would be a drag on memory
-      rescue CSV::MalformedCSVError => e
+        row = LineCSV.parse_line(stream, @csv_options)
+      rescue LineCSV::MalformedCSVError => e
         build_exception_messages(e, stream, current_line)
       end
@@ -227,8 +267,8 @@ module Csvlint
     end
     def report_line_breaks(line_no=nil)
-      return if @input !~ /[\r|\n]/ # Return straight away if there's no newline character - i.e. we're on the last line
-      line_break = CSV.new(@input).row_sep
+      return unless @input[-1, 1].include?("\n") # Return straight away if there's no newline character - i.e. we're on the last line
+      line_break = get_line_break(@input)
       @line_breaks << line_break
       unless line_breaks_reported?
         if line_break != "\r\n"
@@ -353,26 +393,8 @@ module Csvlint
               :numeric
             elsif uri?(col)
               :uri
-            elsif col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d')
-              :date_db
-            elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
-              :date_short
-            elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
-              :date_rfc822
-            elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
-              :date_long
-            elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
-              :dateTime_time
-            elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
-              :dateTime_hms
-            elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
-              :dateTime_db
-            elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
-              :dateTime_iso8601
-            elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
-              :dateTime_short
-            elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
-              :dateTime_long
+            elsif possible_date?(col)
+              date_formats(col)
             else
               :string
             end
@@ -493,6 +515,36 @@ module Csvlint
       false
     end
+    def possible_date?(col)
+      col[POSSIBLE_DATE_REGEXP]
+    end
+    def date_formats(col)
+      if col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d')
+        :date_db
+      elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
+        :date_short
+      elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
+        :date_rfc822
+      elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
+        :date_long
+      elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
+        :dateTime_time
+      elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
+        :dateTime_hms
+      elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
+        :dateTime_db
+      elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
+        :dateTime_iso8601
+      elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
+        :dateTime_short
+      elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
+        :dateTime_long
+      else
+        :string
+      end
+    end
     def date_format?(klass, value, format)
       klass.strptime(value, format).strftime(format) == value
     rescue ArgumentError # invalid date
@@ -503,6 +555,15 @@ module Csvlint
       @limit_lines.present? && @current_line > @limit_lines
     end
+    def get_line_break(line)
+      eol = line.chars.last(2)
+      if eol.first == "\r"
+        "\r\n"
+      else
+        "\n"
+      end
+    end
     FORMATS = {
         :string => nil,
         :numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
@@ -531,6 +592,7 @@ module Csvlint
     LINK_EXTENSION_REGEXP = Regexp.new("(?<link-extension>(?<param>#{TOKEN_REGEXP})(\\s*=\\s*(?<param-value>#{TOKEN_REGEXP}|#{QUOTED_STRING_REGEXP}))?)")
     LINK_PARAM_REGEXP = Regexp.new("(#{REL_REGEXP}|#{REV_REGEXP}|#{TITLE_REGEXP}|#{ANCHOR_REGEXP}|#{LINK_EXTENSION_REGEXP})")
     LINK_HEADER_REGEXP = Regexp.new("\<#{URI_REGEXP}\>(\\s*;\\s*#{LINK_PARAM_REGEXP})*")
+    POSSIBLE_DATE_REGEXP = Regexp.new("\\A(\\d|\\s\\d#{Date::ABBR_MONTHNAMES.join('|')}#{Date::MONTHNAMES.join('|')})")
   end
 end

data/lib/csvlint/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Csvlint
-  VERSION = "0.2.1"
+  VERSION = "0.2.2"
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: csvlint
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.2
 platform: ruby
 authors:
 - pezholio
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-10-07 00:00:00.000000000 Z
+date: 2015-10-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mime-types