RubyGems - csvlint - Versions diffs - 0.2.0 → 0.2.1 - Mend

csvlint 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

checksums.yaml +8 -8
data/.gitattributes +2 -0
data/CHANGELOG.md +19 -1
data/README.md +15 -1
data/bin/csvlint +13 -3
data/csvlint.gemspec +1 -0
data/features/csvupload.feature +5 -5
data/features/fixtures/inconsistent-line-endings-unquoted.csv +0 -0
data/features/fixtures/inconsistent-line-endings.csv +0 -0
data/features/fixtures/invalid_many_rows.csv +0 -0
data/features/fixtures/valid_many_rows.csv +0 -0
data/features/information.feature +4 -4
data/features/step_definitions/csv_options_steps.rb +5 -0
data/features/validation_errors.feature +1 -1
data/features/validation_info.feature +6 -6
data/lib/csvlint.rb +1 -1
data/lib/csvlint/csvw/number_format.rb +1 -1
data/lib/csvlint/field.rb +10 -4
data/lib/csvlint/validate.rb +326 -219
data/lib/csvlint/version.rb +1 -1
data/spec/csvw/number_format_spec.rb +14 -0
data/spec/validator_spec.rb +450 -146
metadata +21 -3
data/lib/csvlint/wrapped_io.rb +0 -21

checksums.yaml CHANGED

@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    YjlmZmFlNGZjOWQ5MmNlNDZiOTUxMWY0NGExYTRkYjhhNzdlNjAyNA==
+    MmUxZTY5NThhMmU1ZmVlM2M0OWJiMzQ5MGY2NGRiMzk5NGEyYzEyYQ==
   data.tar.gz: !binary |-
-    ODFjZmJkZmI0Nzg2NmMzN2ViOGNiNDlmODA0NDcxMzM0Zjk4NTgwOQ==
+    NTllMTYzYjUyYTk0ZTcwZmY5NDJkZjVlMGQzNzM4YWNkYWU2NjFjMg==
 SHA512:
   metadata.gz: !binary |-
-    ZTIyMGVkYjIyMjc2ZWViNTBhYmZkMWIxN2E1OTU0OTFhNGMxNzBlYzg0OTI4
-    NDRkMzY2YzgxNmQwZGZiZDE5M2M2NzYwMzk3ZWZjMDc3YWM0YzQ0NTczY2U3
-    MGZjNTUwMGI2MzgzZDQxYzkzMzBiNzI3NmJkZTIxYjZiYjc5MDA=
+    NTc2NTdhMzI4ZGI5NzFiMzgwZWYwM2E1YWVhMzE2ZmY5ZDUyNzdkODU1MTkw
+    OTgyZGM1ZGFhODMxNGVmNDkwNjY3ZjY5NDEyM2YzYWJjZDQ3NThiODRiOWY1
+    OTU1NGM4NGQ0NzQ3ZmRiYmM2MDM1YWM5YWJlMDRiN2MyNWI0YmI=
   data.tar.gz: !binary |-
-    NTI1M2I5Yzc3NGNhOTg3Y2VkMmM3ZGM1ZTdiZWNmMzM0ZTY5ODljODNmNWYy
-    MDA0NGVlMGFhNDQ2ZjZjYjI0Nzc2OTdhMWRmODI5YTEzMGRmNTQxZjAyOTA5
-    YjVmMjk4NDIyOWEzMzIxMTBlYjQ4YTgwZmE4MWZlYTQ4MjMzZmE=
+    OTQ2NDNkN2RjNDlhZDNlYTI3NmU5NmQ4YTIxOTYxMjQyMTg2MWNhODFkZWQ2
+    ZDYyYWUyNzJjZGNkYzFkYWU0YjI2NzkwZTI1OGNkODFmNTZhNzhjNjE5OGY4
+    MmQzMzFkMTIxYzNkODM5NDFkNzc4ZDYwMjc2YTE2ZmZkZDgxZWY=

data/.gitattributes ADDED

	@@ -0,0 +1,2 @@
1	+ # Don't fuck with my CSV files
2	+ *.csv binary

data/CHANGELOG.md CHANGED

@@ -2,7 +2,25 @@
 ## [Unreleased](https://github.com/theodi/csvlint.rb/tree/HEAD)
-[Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.1.4...HEAD)
+[Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.2.0...HEAD)
+**Implemented enhancements:**
+- Get total rows number about the CSV file that was validated [\#143](https://github.com/theodi/csvlint.rb/issues/143)
+**Closed issues:**
+- Optimization: Stream CSV [\#122](https://github.com/theodi/csvlint.rb/issues/122)
+**Merged pull requests:**
+- Add `row\_count` method [\#153](https://github.com/theodi/csvlint.rb/pull/153) ([pezholio](https://github.com/pezholio))
+- Streaming validation [\#146](https://github.com/theodi/csvlint.rb/pull/146) ([pezholio](https://github.com/pezholio))
+## [0.2.0](https://github.com/theodi/csvlint.rb/tree/0.2.0) (2015-10-05)
+[Full Changelog](https://github.com/theodi/csvlint.rb/compare/0.1.4...0.2.0)
 **Closed issues:**

data/README.md CHANGED

@@ -77,6 +77,7 @@ best practices
 	validator.encoding
 	validator.content_type
 	validator.extension
+	validator.row_count
 	#retrieve HTTP headers from request
 	validator.headers
@@ -128,7 +129,6 @@ The following types of error can be reported:
 * `:unclosed_quote` -- unclosed quoted field
 * `:whitespace` -- a quoted column has leading or trailing whitespace
 * `:line_breaks` -- line breaks were inconsistent or incorrectly specified
-* `:undeclared_header` -- if there is no machine-readable description of whether a header is present (e.g. in a dialect or `Content-Type` header)
 ## Warnings
@@ -271,6 +271,20 @@ options = {
 validator = Csvlint::Validator.new( "http://example.org/data.csv", nil, nil, options )
 ```
+* :lambda -- Pass a lambda to be called after each line is validated. The lambda receives the `Validator` object. For example, this will print the current line number for every line validated:
+```
+options = {
+  lambda: ->(validator) { puts validator.current_line }
+}
+validator = Csvlint::Validator.new( "http://example.org/data.csv", nil, nil, options )
+=> 1
+2
+3
+4
+.....
+```
 ## Contributing
 1. Fork it

data/bin/csvlint CHANGED

@@ -58,12 +58,22 @@ def print_error(index, error, dump, color)
 end
 def validate_csv(source, schema, dump)
-  validator = Csvlint::Validator.new( source, nil, schema )
+  @error_count = 0
+  report_lines = lambda do |row|
+    new_errors = row.errors.count
+    if new_errors > @error_count
+      print "!".red
+    else
+      print ".".green
+    end
+    @error_count = new_errors
+  end
+  validator = Csvlint::Validator.new( source, {}, schema, { lambda: report_lines } )
   if $stdout.tty?
-    puts "#{source.path || source || "CSV"} is #{validator.valid? ? "VALID".green : "INVALID".red}"
+    puts "\r\n#{source.path || source || "CSV"} is #{validator.valid? ? "VALID".green : "INVALID".red}"
   else
-    puts "#{source.path || source || "CSV"} is #{validator.valid? ? "VALID" : "INVALID"}"
+    puts "\r\n#{source.path || source || "CSV"} is #{validator.valid? ? "VALID" : "INVALID"}"
   end
   if validator.errors.size > 0

data/csvlint.gemspec CHANGED

@@ -23,6 +23,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency "open_uri_redirections"
   spec.add_dependency "activesupport"
   spec.add_dependency "addressable"
+  spec.add_dependency "typhoeus"
   spec.add_dependency "escape_utils"
   spec.add_dependency "uri_template"

data/features/csvupload.feature CHANGED

@@ -14,7 +14,7 @@ Feature: Collect all the tests that should trigger dialect check related errors
     And it is stored at the url "http://example.com/example1.csv"
     And I set header to "true"
     And I ask if there are info messages
-    Then there should be 2 info messages
+    Then there should be 1 info message
     And one of the messages should have the type "nonrfc_line_breaks"
   Scenario: CR line endings in file give an info message of type :nonrfc_line_breaks
@@ -22,7 +22,7 @@ Feature: Collect all the tests that should trigger dialect check related errors
     And it is stored at the url "http://example.com/example1.csv"
     And I set header to "true"
     And I ask if there are info messages
-    Then there should be 2 info messages
+    Then there should be 1 info message
     And one of the messages should have the type "nonrfc_line_breaks"
   Scenario: CRLF line endings in file produces no info messages of type :nonrfc_line_breaks
@@ -30,13 +30,13 @@ Feature: Collect all the tests that should trigger dialect check related errors
     And it is stored at the url "http://example.com/example1.csv"
     And I set header to "true"
     And I ask if there are info messages
-    Then there should be 1 info message
+    Then there should be 0 info messages
 #  :line_breaks
   Scenario: Incorrect line endings specified in settings
-    Given I have a CSV file called "cr-line-endings.csv"
-    And I set the line endings to linefeed
+    Given I have a CSV file called "lf-line-endings.csv"
+    And I set the line endings to carriage return
     And it is stored at the url "http://example.com/example1.csv"
     And I ask if there are errors
     Then there should be 1 error

data/features/fixtures/inconsistent-line-endings-unquoted.csv CHANGED

Binary file

data/features/fixtures/inconsistent-line-endings.csv CHANGED

Binary file

data/features/fixtures/invalid_many_rows.csv ADDED

Binary file

data/features/fixtures/valid_many_rows.csv ADDED

Binary file

data/features/information.feature CHANGED

@@ -10,13 +10,13 @@ Feature: Return information
     And it is stored at the url "http://example.com/example1.csv?query=true"
   Scenario: Return encoding
-    Then the "encoding" should be "utf-8"
+    Then the "encoding" should be "UTF-8"
   Scenario: Return content type
-    Then the "content_type" should be "text/csv"
+    Then the "content_type" should be "text/csv; charset=utf-8"
   Scenario: Return extension
     Then the "extension" should be ".csv"
   Scenario: Return meta
     Then the metadata content type should be "text/csv; charset=utf-8"

data/features/step_definitions/csv_options_steps.rb CHANGED

@@ -13,6 +13,11 @@ Given(/^I set the line endings to linefeed$/) do
   @csv_options["lineTerminator"] = "\n"
 end
+Given(/^I set the line endings to carriage return$/) do
+  @csv_options ||= default_csv_options
+  @csv_options["lineTerminator"] = "\r"
+end
 Given(/^I set header to "(.*?)"$/) do |boolean|
   @csv_options ||= default_csv_options
   @csv_options["header"] = boolean == "true"

data/features/validation_errors.feature CHANGED

@@ -27,7 +27,7 @@ Feature: Get validation errors
     And that error should have the row "2"
     And that error should have the content ""Foo","Bar","Baz"
-   Scenario: Successfully report a CSV with incorrect whitespace
+  Scenario: Successfully report a CSV with incorrect whitespace
     Given I have a CSV with the following content:
     """
 "col1","col2","col3"

data/features/validation_info.feature CHANGED

@@ -3,22 +3,22 @@ Feature: Get validation information messages
   Scenario: LF line endings in file give an info message
     Given I have a CSV file called "lf-line-endings.csv"
     And it is stored at the url "http://example.com/example1.csv"
-    And I set header to "true"
+    And I set header to "true"
     And I ask if there are info messages
-    Then there should be 2 info messages
+    Then there should be 1 info messages
     And one of the messages should have the type "nonrfc_line_breaks"
   Scenario: CR line endings in file give an info message
     Given I have a CSV file called "cr-line-endings.csv"
     And it is stored at the url "http://example.com/example1.csv"
-    And I set header to "true"
+    And I set header to "true"
     And I ask if there are info messages
-    Then there should be 2 info messages
+    Then there should be 1 info messages
     And one of the messages should have the type "nonrfc_line_breaks"
   Scenario: CRLF line endings in file produces no info messages
     Given I have a CSV file called "crlf-line-endings.csv"
     And it is stored at the url "http://example.com/example1.csv"
-    And I set header to "true"
+    And I set header to "true"
     And I ask if there are info messages
-    Then there should be 1 info message
+    Then there should be 0 info messages

data/lib/csvlint.rb CHANGED

@@ -3,6 +3,7 @@ require 'date'
 require 'open-uri'
 require 'set'
 require 'tempfile'
+require 'typhoeus'
 require 'active_support/core_ext/date/conversions'
 require 'active_support/core_ext/time/conversions'
@@ -13,7 +14,6 @@ require 'uri_template'
 require 'csvlint/error_message'
 require 'csvlint/error_collector'
 require 'csvlint/validate'
-require 'csvlint/wrapped_io'
 require 'csvlint/field'
 require 'csvlint/csvw/metadata_error'

data/lib/csvlint/csvw/number_format.rb CHANGED

@@ -134,7 +134,7 @@ module Csvlint
                 fractional_regexp += "(#{Regexp.escape(@grouping_separator)}[0-9]{1,#{max_fraction_digits % @fractional_grouping_size}})?" if max_fraction_digits % @fractional_grouping_size > 0
               else
                 fractional_regexp += "([0-9]{#{@fractional_grouping_size}}#{Regexp.escape(@grouping_separator)}){0,#{(max_fraction_digits / @fractional_grouping_size) - 1}}" if max_fraction_digits > @fractional_grouping_size
-                fractional_regexp += "[0-9]{#{@fractional_grouping_size}}"
+                fractional_regexp += "[0-9]{1,#{@fractional_grouping_size}}"
               end
               fractional_regexp = "#{Regexp.escape(@decimal_separator)}#{fractional_regexp}"
               fractional_regexp = "(#{fractional_regexp})?" if min_fraction_digits == 0

data/lib/csvlint/field.rb CHANGED

@@ -17,7 +17,7 @@ module Csvlint
     def validate_column(value, row=nil, column=nil, all_errors=[])
       reset
       unless all_errors.any?{|error| ((error.type == :invalid_regex) && (error.column == column))}
-        validate_regex(value, row, column)
+        validate_regex(value, row, column, all_errors)
       end
       validate_length(value, row, column)
       validate_values(value, row, column)
@@ -42,7 +42,7 @@ module Csvlint
         end
       end
-      def validate_regex(value, row, column)
+      def validate_regex(value, row, column, all_errors)
         pattern = constraints["pattern"]
         if pattern
           begin
@@ -50,12 +50,18 @@ module Csvlint
             build_errors(:pattern, :schema, row, column, value,
             { "pattern" => constraints["pattern"] } ) if !value.nil? && !value.match( constraints["pattern"] )
           rescue RegexpError
-            build_errors(:invalid_regex, :schema, nil, column, ("#{name}: Constraints: Pattern: #{pattern}"),
-              { "pattern" => constraints["pattern"] })
+            build_regex_error(value, row, column, pattern, all_errors)
           end
         end
       end
+      def build_regex_error(value, row, column, pattern, all_errors)
+        return if @regex_error_exists
+        build_errors(:invalid_regex, :schema, nil, column, ("#{name}: Constraints: Pattern: #{pattern}"),
+          { "pattern" => constraints["pattern"] })
+        @regex_error_exists = true
+      end
       def validate_values(value, row, column)
         # If a pattern exists, raise an invalid regex error if it is not in
         # valid regex form, else, if the value of the relevant field in the csv

data/lib/csvlint/validate.rb CHANGED

@@ -4,181 +4,308 @@ module Csvlint
     include Csvlint::ErrorCollector
-    attr_reader :encoding, :content_type, :extension, :headers, :link_headers, :line_breaks, :dialect, :csv_header, :schema, :data
+    attr_reader :encoding, :content_type, :extension, :headers, :link_headers, :dialect, :csv_header, :schema, :data, :current_line
     ERROR_MATCHERS = {
-      "Missing or stray quote" => :stray_quote,
-      "Illegal quoting" => :whitespace,
-      "Unclosed quoted field" => :unclosed_quote,
-      "Unquoted fields do not allow \\r or \\n" => :line_breaks,
+        "Missing or stray quote" => :stray_quote,
+        "Illegal quoting" => :whitespace,
+        "Unclosed quoted field" => :unclosed_quote,
+        "Unquoted fields do not allow \\r or \\n" => :line_breaks,
     }
-    def initialize(source, dialect = nil, schema = nil, options = {})
+    def initialize(source, dialect = {}, schema = nil, options = {})
       reset
       @source = source
       @formats = []
       @schema = schema
-      @supplied_dialect = dialect != nil
+      @dialect = dialect
+      @csv_header = true
+      @headers = {}
+      @lambda = options[:lambda] || lambda { |a| nil }
+      @leading = ""
       @limit_lines = options[:limit_lines]
       @extension = parse_extension(source) unless @source.nil?
+      @expected_columns = 0
+      @col_counts = []
+      @line_breaks = []
       @errors += @schema.errors unless @schema.nil?
       @warnings += @schema.warnings unless @schema.nil?
-      validate(dialect)
+      @data = [] # it may be advisable to flush this on init?
+      validate
     end
-    def validate(dialect = nil)
-      single_col = false
-      io = nil
+    def validate
+      if @extension =~ /.xls(x)?/
+        build_warnings(:excel, :context)
+        return
+      end
+      locate_schema unless @schema.instance_of?(Csvlint::Schema)
+      set_dialect
+      if @source.class == String
+        validate_url
+      else
+        validate_metadata
+        validate_stream
+      end
+      finish
+    end
+    def validate_stream
+      @current_line = 1
+      @source.each_line do |line|
+        break if line_limit_reached?
+        parse_line(line)
+      end
+      validate_line(@leading, @current_line) unless @leading == ""
+    end
+    def validate_url
+      @current_line = 1
       begin
-        if @extension =~ /.xls(x)?/
-          build_warnings(:excel, :context)
-          return
+        request = Typhoeus::Request.new(@source, followlocation: true)
+        request.on_headers do |response|
+          @headers = response.headers || {}
+          @content_type = response.headers["content-type"] rescue nil
+          @response_code = response.code
+          return build_errors(:not_found) if response.code == 404
+          validate_metadata
         end
-        io = @source.respond_to?(:gets) ? @source : open(@source, :allow_redirections=>:all)
-        validate_metadata(io)
-        locate_schema unless @schema.instance_of?(Csvlint::Schema)
-        set_dialect(dialect)
-        parse_csv(io)
-        sum = @col_counts.inject(:+)
-        unless sum.nil?
-          build_warnings(:title_row, :structure) if @col_counts.first < (sum / @col_counts.size.to_f)
+        request.on_body do |chunk|
+          io = StringIO.new(@leading + chunk)
+          io.each_line do |line|
+            break if line_limit_reached?
+            parse_line(line)
+          end
         end
-        build_warnings(:check_options, :structure) if @expected_columns == 1
-        check_consistency
-        check_foreign_keys
-      rescue OpenURI::HTTPError, Errno::ENOENT
-        build_errors(:not_found, nil, nil, nil, @source)
-      ensure
-        io.close if io && io.respond_to?(:close)
+        request.run
+        # Validate the last line too
+        validate_line(@leading, @current_line) unless @leading == ""
+      rescue ArgumentError => ae
+        build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
+        @reported_invalid_encoding = true
       end
     end
-    def validate_metadata(io)
-      @csv_header = true
-      @encoding = io.charset rescue nil
-      @content_type = io.content_type rescue nil
-      @headers = io.meta rescue nil
-      @link_headers = @headers["link"].split(",") rescue nil
-      assumed_header = undeclared_header = !@supplied_dialect
-      if @headers
+    def parse_line(line)
+      line = @leading + line
+      # Check if the last character is a line break - in which case it's a full line
+      if line[-1, 1].include?("\n")
+        # If the number of quotes is odd, the linebreak is inside some quotes
+        if line.count(@dialect["quoteChar"]).odd?
+          @leading = line
+        else
+          validate_line(line, @current_line)
+          @leading = ""
+          @current_line = @current_line+1
+        end
+      else
+        # If it's not a full line, then prepare to add it to the beginning of the next chunk
+        @leading = line
+      end
+    end
+    def validate_line(input = nil, index = nil)
+      @input = input
+      single_col = false
+      line = index.present? ? index : 0
+      @encoding = input.encoding.to_s
+      report_line_breaks(line)
+      parse_contents(input, line)
+      @lambda.call(self)
+    rescue ArgumentError => ae
+      build_errors(:invalid_encoding, :structure, @current_line, nil, index) unless @reported_invalid_encoding
+      @reported_invalid_encoding = true
+    end
+    # analyses the provided csv and builds errors, warnings and info messages
+    def parse_contents(stream, line = nil)
+      # parse_contents will parse one line and apply headers, formats methods and error handle as appropriate
+      current_line = line.present? ? line : 1
+      all_errors = []
+      @csv_options[:encoding] = @encoding
+      begin
+        row = CSV.parse_line(stream, @csv_options)
+          # this is a one line substitute for CSV.new followed by row = CSV.shift. a CSV Row class is required
+          # CSV.parse will return an array of arrays which breaks subsequent each_with_index invocations
+          # TODO investigate if above would be a drag on memory
+      rescue CSV::MalformedCSVError => e
+        build_exception_messages(e, stream, current_line)
+      end
+      @data << row
+      if row
+        if current_line <= 1 && @csv_header
+          # this conditional should be refactored somewhere
+          row = row.reject { |col| col.nil? || col.empty? }
+          validate_header(row)
+          @col_counts << row.size
+        else
+          build_formats(row)
+          @col_counts << row.reject { |col| col.nil? || col.empty? }.size
+          @expected_columns = row.size unless @expected_columns != 0
+          build_errors(:blank_rows, :structure, current_line, nil, stream.to_s) if row.reject { |c| c.nil? || c.empty? }.size == 0
+          # Builds errors and warnings related to the provided schema file
+          if @schema
+            @schema.validate_row(row, current_line, all_errors, @source)
+            @errors += @schema.errors
+            all_errors += @schema.errors
+            @warnings += @schema.warnings
+          else
+            build_errors(:ragged_rows, :structure, current_line, nil, stream.to_s) if !row.empty? && row.size != @expected_columns
+          end
+        end
+      end
+    end
+    def finish
+      sum = @col_counts.inject(:+)
+      unless sum.nil?
+        build_warnings(:title_row, :structure) if @col_counts.first < (sum / @col_counts.size.to_f)
+      end
+      # return expected_columns to calling class
+      build_warnings(:check_options, :structure) if @expected_columns == 1
+      check_consistency
+      check_foreign_keys
+      check_mixed_linebreaks
+      validate_encoding
+    end
+    def validate_metadata
+      assumed_header = !@supplied_dialect
+      unless @headers.empty?
         if @headers["content-type"] =~ /text\/csv/
-          @csv_header = true
-          undeclared_header = false
-          assumed_header = true
+          @csv_header = @csv_header && true
+          assumed_header = @assumed_header.present?
         end
         if @headers["content-type"] =~ /header=(present|absent)/
           @csv_header = true if $1 == "present"
           @csv_header = false if $1 == "absent"
-          undeclared_header = false
           assumed_header = false
         end
-        if @headers["content-type"] !~ /charset=/
-          build_warnings(:no_encoding, :context)
-        else
-          build_warnings(:encoding, :context) if @encoding != "utf-8"
-        end
         build_warnings(:no_content_type, :context) if @content_type == nil
         build_errors(:wrong_content_type, :context) unless (@content_type && @content_type =~ /text\/csv/)
+      end
+      @header_processed = true
+      build_info_messages(:assumed_header, :structure) if assumed_header
-        if undeclared_header
-          build_errors(:undeclared_header, :structure)
-          assumed_header = false
+      @link_headers = @headers["link"].split(",") rescue nil
+      @link_headers.each do |link_header|
+        match = LINK_HEADER_REGEXP.match(link_header)
+        uri = match["uri"].gsub(/(^\<|\>$)/, "") rescue nil
+        rel = match["rel-relationship"].gsub(/(^\"|\"$)/, "") rescue nil
+        param = match["param"]
+        param_value = match["param-value"].gsub(/(^\"|\"$)/, "") rescue nil
+        if rel == "describedby" && param == "type" && ["application/csvm+json", "application/ld+json", "application/json"].include?(param_value)
+          begin
+            url = URI.join(@source_url, uri)
+            schema = Schema.load_from_json(url)
+            if schema.instance_of? Csvlint::Csvw::TableGroup
+              if schema.tables[@source_url]
+                link_schema = schema
+              else
+                warn_if_unsuccessful = true
+                build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
+              end
+            end
+          rescue OpenURI::HTTPError
+          end
         end
+      end if @link_headers
+    end
+    def header?
+      @csv_header && @dialect["header"]
+    end
+    def report_line_breaks(line_no=nil)
+      return if @input !~ /[\r|\n]/ # Return straight away if there's no newline character - i.e. we're on the last line
+      line_break = CSV.new(@input).row_sep
+      @line_breaks << line_break
+      unless line_breaks_reported?
+        if line_break != "\r\n"
+          build_info_messages(:nonrfc_line_breaks, :structure, line_no)
+          @line_breaks_reported = true
+        end
       end
-      build_info_messages(:assumed_header, :structure) if assumed_header
     end
-    def set_dialect(dialect)
+    def line_breaks_reported?
+      @line_breaks_reported === true
+    end
+    def set_dialect
+      @assumed_header = @dialect["header"].nil?
+      @supplied_dialect = @dialect != {}
       begin
         schema_dialect = @schema.tables[@source_url].dialect || {}
       rescue
         schema_dialect = {}
       end
       @dialect = {
-        "header" => true,
-        "delimiter" => ",",
-        "skipInitialSpace" => true,
-        "lineTerminator" => :auto,
-        "quoteChar" => '"',
-        "trim" => :true
-      }.merge(schema_dialect).merge(dialect || {})
+          "header" => true,
+          "delimiter" => ",",
+          "skipInitialSpace" => true,
+          "lineTerminator" => :auto,
+          "quoteChar" => '"',
+          "trim" => :true
+      }.merge(schema_dialect).merge(@dialect || {})
       @csv_header = @csv_header && @dialect["header"]
       @csv_options = dialect_to_csv_options(@dialect)
     end
-    # analyses the provided csv and builds errors, warnings and info messages
-    def parse_csv(io)
-      @expected_columns = 0
-      current_line = 0
-      reported_invalid_encoding = false
-      all_errors = []
-      @col_counts = []
+    def validate_encoding
+      if @headers["content-type"]
+        if @headers["content-type"] !~ /charset=/
+          build_warnings(:no_encoding, :context)
+        elsif @headers["content-type"] !~ /charset=utf-8/i
+          build_warnings(:encoding, :context)
+        end
+      end
+      build_warnings(:encoding, :context) if @encoding != "UTF-8"
+    end
-      @csv_options[:encoding] = @encoding
+    def check_mixed_linebreaks
+      build_linebreak_error if @line_breaks.uniq.count > 1
+    end
-      begin
-        wrapper = WrappedIO.new( io )
-        csv = CSV.new( wrapper, @csv_options )
-        @data = []
-        @line_breaks = csv.row_sep
-        if @line_breaks != "\r\n"
-          build_info_messages(:nonrfc_line_breaks, :structure)
-        end
-        row = nil
-        loop do
-         current_line += 1
-         if @limit_lines && current_line > @limit_lines
-           break
-         end
-         begin
-           wrapper.reset_line
-           row = csv.shift
-           @data << row
-           if row
-             if current_line == 1 && header?
-               row = row.reject{|col| col.nil? || col.empty?}
-               validate_header(row)
-               @col_counts << row.size
-             else
-               build_formats(row)
-               @col_counts << row.reject{|col| col.nil? || col.empty?}.size
-               @expected_columns = row.size unless @expected_columns != 0
-               build_errors(:blank_rows, :structure, current_line, nil, wrapper.line) if row.reject{ |c| c.nil? || c.empty? }.size == 0
-               # Builds errors and warnings related to the provided schema file
-               if @schema
-                 @schema.validate_row(row, current_line, all_errors, @source)
-                 @errors += @schema.errors
-                 all_errors += @schema.errors
-                 @warnings += @schema.warnings
-               else
-                 build_errors(:ragged_rows, :structure, current_line, nil, wrapper.line) if !row.empty? && row.size != @expected_columns
-               end
-             end
-           else
-             break
-           end
-         rescue CSV::MalformedCSVError => e
-           type = fetch_error(e)
-           if type == :stray_quote && !wrapper.line.match(csv.row_sep)
-             build_errors(:line_breaks, :structure)
-           else
-             build_errors(type, :structure, current_line, nil, wrapper.line)
-           end
-         end
+    def line_breaks
+      if @line_breaks.uniq.count > 1
+        :mixed
+      else
+        @line_breaks.uniq.first
       end
-      rescue ArgumentError => ae
-        build_errors(:invalid_encoding, :structure, current_line, nil, wrapper.line) unless reported_invalid_encoding
-        reported_invalid_encoding = true
+    end
+    def row_count
+      data.count
+    end
+    def build_exception_messages(csvException, errChars, lineNo)
+      #TODO 1 - this is a change in logic, rather than straight refactor of previous error building, however original logic is bonkers
+      #TODO 2 - using .kind_of? is a very ugly fix here and is meant to work around instances where the :auto symbol is preserved in @csv_options
+      type = fetch_error(csvException)
+      if !@csv_options[:row_sep].kind_of?(Symbol) && [:unclosed_quote,:stray_quote].include?(type) && !@input.match(@csv_options[:row_sep])
+        build_linebreak_error
+      else
+        build_errors(type, :structure, lineNo, nil, errChars)
       end
     end
+    def build_linebreak_error
+      build_errors(:line_breaks, :structure) unless @errors.any? { |e| e.type == :line_breaks }
+    end
     def validate_header(header)
       names = Set.new
       header.map{|h| h.strip! } if @dialect["trim"] == :true
@@ -198,10 +325,6 @@ module Csvlint
       return valid?
     end
-    def header?
-      @csv_header
-    end
     def fetch_error(error)
       e = error.message.match(/^(.+?)(?: [io]n)? \(?line \d+\)?\.?$/i)
       message = e[1] rescue nil
@@ -209,15 +332,15 @@ module Csvlint
     end
     def dialect_to_csv_options(dialect)
-        skipinitialspace = dialect["skipInitialSpace"] || true
-        delimiter = dialect["delimiter"]
-        delimiter = delimiter + " " if !skipinitialspace
-        return {
-            :col_sep => delimiter,
-            :row_sep => dialect["lineTerminator"],
-            :quote_char => dialect["quoteChar"],
-            :skip_blanks => false
-        }
+      skipinitialspace = dialect["skipInitialSpace"] || true
+      delimiter = dialect["delimiter"]
+      delimiter = delimiter + " " if !skipinitialspace
+      return {
+          :col_sep => delimiter,
+          :row_sep => dialect["lineTerminator"],
+          :quote_char => dialect["quoteChar"],
+          :skip_blanks => false
+      }
     end
     def build_formats(row)
@@ -225,33 +348,34 @@ module Csvlint
         next if col.nil? || col.empty?
         @formats[i] ||= Hash.new(0)
-        format = if col.strip[FORMATS[:numeric]]
-          :numeric
-        elsif uri?(col)
-          :uri
-        elsif col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d')
-          :date_db
-        elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
-          :date_short
-        elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
-          :date_rfc822
-        elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
-          :date_long
-        elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
-          :dateTime_time
-        elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
-          :dateTime_hms
-        elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
-          :dateTime_db
-        elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
-          :dateTime_iso8601
-        elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
-          :dateTime_short
-        elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
-          :dateTime_long
-        else
-          :string
-        end
+        format =
+            if col.strip[FORMATS[:numeric]]
+              :numeric
+            elsif uri?(col)
+              :uri
+            elsif col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d')
+              :date_db
+            elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
+              :date_short
+            elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
+              :date_rfc822
+            elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
+              :date_long
+            elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
+              :dateTime_time
+            elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
+              :dateTime_hms
+            elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
+              :dateTime_db
+            elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
+              :dateTime_iso8601
+            elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
+              :dateTime_short
+            elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
+              :dateTime_long
+            else
+              :string
+            end
         @formats[i][format] += 1
       end
@@ -277,15 +401,16 @@ module Csvlint
     end
     def locate_schema
       @source_url = nil
       warn_if_unsuccessful = false
       case @source
-      when StringIO
-        return
-      when File
-        @source_url = "file:#{File.expand_path(@source)}"
-      else
-        @source_url = @source
+        when StringIO
+          return
+        when File
+          @source_url = "file:#{File.expand_path(@source)}"
+        else
+          @source_url = @source
       end
       unless @schema.nil?
         if @schema.tables[@source_url]
@@ -295,28 +420,6 @@ module Csvlint
         end
       end
       link_schema = nil
-      @link_headers.each do |link_header|
-        match = LINK_HEADER_REGEXP.match(link_header)
-        uri = match["uri"].gsub(/(^\<|\>$)/, "") rescue nil
-        rel = match["rel-relationship"].gsub(/(^\"|\"$)/, "") rescue nil
-        param = match["param"]
-        param_value = match["param-value"].gsub(/(^\"|\"$)/, "") rescue nil
-        if rel == "describedby" && param == "type" && ["application/csvm+json", "application/ld+json", "application/json"].include?(param_value)
-          begin
-            url = URI.join(@source_url, uri)
-            schema = Schema.load_from_json(url)
-            if schema.instance_of? Csvlint::Csvw::TableGroup
-              if schema.tables[@source_url]
-                link_schema = schema
-              else
-                warn_if_unsuccessful = true
-                build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
-              end
-            end
-          rescue OpenURI::HTTPError
-          end
-        end
-      end if @link_headers
       @schema = link_schema if link_schema
       paths = []
@@ -324,8 +427,8 @@ module Csvlint
         begin
           well_known_uri = URI.join(@source_url, "/.well-known/csvm")
           well_known = open(well_known_uri).read
-          # TODO
-        rescue OpenURI::HTTPError
+            # TODO
+        rescue OpenURI::HTTPError, URI::BadURIError
         end
       end
       paths = ["{+url}-metadata.json", "csv-metadata.json"] if paths.empty?
@@ -345,8 +448,7 @@ module Csvlint
             end
           end
         rescue Errno::ENOENT
-        rescue OpenURI::HTTPError
-        rescue ArgumentError
+        rescue OpenURI::HTTPError, URI::BadURIError, ArgumentError
         rescue => e
           STDERR.puts e.class
           STDERR.puts e.message
@@ -361,23 +463,24 @@ module Csvlint
     private
     # Determines the file extension for +source+.
     #
     # File                     - File.extname of the file's path
     # IO / StringIO / Tempfile - always ""
     # anything else            - parsed with URI.parse and the extension of the URI's
     #                            path is returned; "" when URI::InvalidURIError is raised
     def parse_extension(source)
       case source
-      when File
-        return File.extname( source.path )
-      when IO
-        return ""
-      when StringIO
-        return ""
+        when File
+          return File.extname( source.path )
+        when IO
+          return ""
+        when StringIO
+          return ""
         when Tempfile
           # this is triggered when the revalidate dialect use case happens
-        return ""
-      else
-        begin
-          parsed = URI.parse(source)
-          File.extname(parsed.path)
-        rescue URI::InvalidURIError
           return ""
-        end
+        else
+          begin
+            parsed = URI.parse(source)
+            File.extname(parsed.path)
+          rescue URI::InvalidURIError
+            return ""
+          end
       end
     end
@@ -396,20 +499,24 @@ module Csvlint
       false
     end
+    def line_limit_reached?
+      @limit_lines.present? && @current_line > @limit_lines
+    end
     FORMATS = {
-      :string => nil,
-      :numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
-      :uri => /\Ahttps?:/,
-      :date_db => /\A\d{4,}-\d\d-\d\d\z/,                                               # "12345-01-01"
-      :date_long => /\A(?:#{Date::MONTHNAMES.join('|')}) [ \d]\d, \d{4,}\z/,            # "January  1, 12345"
-      :date_rfc822 => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d{4,}\z/,      # " 1 Jan 12345"
-      :date_short => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')})\z/,              # "1 Jan"
-      :dateTime_db => /\A\d{4,}-\d\d-\d\d \d\d:\d\d:\d\d\z/,                            # "12345-01-01 00:00:00"
-      :dateTime_hms => /\A\d\d:\d\d:\d\d\z/,                                            # "00:00:00"
-      :dateTime_iso8601 => /\A\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\dZ\z/,                      # "12345-01-01T00:00:00Z"
-      :dateTime_long => /\A(?:#{Date::MONTHNAMES.join('|')}) \d\d, \d{4,} \d\d:\d\d\z/, # "January 01, 12345 00:00"
-      :dateTime_short => /\A\d\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d\d:\d\d\z/,   # "01 Jan 00:00"
-      :dateTime_time => /\A\d\d:\d\d\z/,                                                # "00:00"
+        :string => nil,
+        :numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
+        :uri => /\Ahttps?:/,
+        :date_db => /\A\d{4,}-\d\d-\d\d\z/,                                               # "12345-01-01"
+        :date_long => /\A(?:#{Date::MONTHNAMES.join('|')}) [ \d]\d, \d{4,}\z/,            # "January  1, 12345"
+        :date_rfc822 => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d{4,}\z/,      # " 1 Jan 12345"
+        :date_short => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')})\z/,              # "1 Jan"
+        :dateTime_db => /\A\d{4,}-\d\d-\d\d \d\d:\d\d:\d\d\z/,                            # "12345-01-01 00:00:00"
+        :dateTime_hms => /\A\d\d:\d\d:\d\d\z/,                                            # "00:00:00"
+        :dateTime_iso8601 => /\A\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\dZ\z/,                      # "12345-01-01T00:00:00Z"
+        :dateTime_long => /\A(?:#{Date::MONTHNAMES.join('|')}) \d\d, \d{4,} \d\d:\d\d\z/, # "January 01, 12345 00:00"
+        :dateTime_short => /\A\d\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d\d:\d\d\z/,   # "01 Jan 00:00"
+        :dateTime_time => /\A\d\d:\d\d\z/,                                                # "00:00"
     }.freeze
     # Named capture group "uri" that lazily matches any run of characters.
     # NOTE(review): presumably a building block for a larger pattern - confirm against its users.
     URI_REGEXP = /(?<uri>.*?)/