reckon 0.4.4 → 0.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +5 -5
  2. data/.gitignore +3 -0
  3. data/.ruby-version +1 -1
  4. data/.travis.yml +10 -2
  5. data/CHANGELOG.md +235 -0
  6. data/Gemfile +0 -1
  7. data/Gemfile.lock +73 -15
  8. data/README.md +12 -5
  9. data/lib/reckon.rb +13 -12
  10. data/lib/reckon/app.rb +94 -116
  11. data/lib/reckon/cosine_similarity.rb +122 -0
  12. data/lib/reckon/csv_parser.rb +116 -129
  13. data/lib/reckon/date_column.rb +60 -0
  14. data/lib/reckon/ledger_parser.rb +204 -30
  15. data/lib/reckon/logger.rb +4 -0
  16. data/lib/reckon/money.rb +6 -62
  17. data/lib/reckon/version.rb +3 -0
  18. data/reckon.gemspec +8 -5
  19. data/spec/data_fixtures/51-sample.csv +8 -0
  20. data/spec/data_fixtures/51-tokens.yml +9 -0
  21. data/spec/data_fixtures/73-sample.csv +2 -0
  22. data/spec/data_fixtures/73-tokens.yml +8 -0
  23. data/spec/data_fixtures/73-transactions.ledger +7 -0
  24. data/spec/data_fixtures/85-date-example.csv +2 -0
  25. data/spec/data_fixtures/austrian_example.csv +13 -0
  26. data/spec/data_fixtures/bom_utf8_file.csv +1 -0
  27. data/spec/data_fixtures/broker_canada_example.csv +12 -0
  28. data/spec/data_fixtures/chase.csv +9 -0
  29. data/spec/data_fixtures/danish_kroner_nordea_example.csv +6 -0
  30. data/spec/data_fixtures/english_date_example.csv +3 -0
  31. data/spec/data_fixtures/french_example.csv +9 -0
  32. data/spec/data_fixtures/german_date_example.csv +3 -0
  33. data/spec/data_fixtures/harder_date_example.csv +5 -0
  34. data/spec/data_fixtures/ing.csv +3 -0
  35. data/spec/data_fixtures/intuit_mint_example.csv +7 -0
  36. data/spec/data_fixtures/invalid_header_example.csv +6 -0
  37. data/spec/data_fixtures/inversed_credit_card.csv +16 -0
  38. data/spec/data_fixtures/nationwide.csv +4 -0
  39. data/spec/data_fixtures/simple.csv +2 -0
  40. data/spec/data_fixtures/some_other.csv +9 -0
  41. data/spec/data_fixtures/spanish_date_example.csv +3 -0
  42. data/spec/data_fixtures/suntrust.csv +7 -0
  43. data/spec/data_fixtures/test_money_column.csv +3 -0
  44. data/spec/data_fixtures/two_money_columns.csv +5 -0
  45. data/spec/data_fixtures/yyyymmdd_date_example.csv +1 -0
  46. data/spec/reckon/app_spec.rb +96 -34
  47. data/spec/reckon/csv_parser_spec.rb +185 -307
  48. data/spec/reckon/date_column_spec.rb +12 -13
  49. data/spec/reckon/ledger_parser_spec.rb +99 -9
  50. data/spec/reckon/money_spec.rb +42 -29
  51. data/spec/spec_helper.rb +22 -0
  52. metadata +85 -21
  53. data/CHANGES.md +0 -9
data/lib/reckon/app.rb

@@ -1,21 +1,20 @@
-#coding: utf-8
+# coding: utf-8
 require 'pp'
 require 'yaml'
 
 module Reckon
   class App
-    VERSION = "Reckon 0.4.4"
-    attr_accessor :options, :accounts, :tokens, :seen, :csv_parser, :regexps
+    attr_accessor :options, :seen, :csv_parser, :regexps, :matcher
 
     def initialize(options = {})
+      LOGGER.level = Logger::INFO if options[:verbose]
      self.options = options
-      self.tokens = {}
      self.regexps = {}
-      self.accounts = {}
      self.seen = {}
      self.options[:currency] ||= '$'
      options[:string] = File.read(options[:file]) unless options[:string]
      @csv_parser = CSVParser.new( options )
+      @matcher = CosineSimilarity.new(options)
      learn!
    end
 
@@ -24,21 +23,44 @@ module Reckon
       puts str
     end
 
+    def learn!
+      learn_from_account_tokens(options[:account_tokens_file])
+
+      ledger_file = options[:existing_ledger_file]
+      return unless ledger_file
+      fail "#{ledger_file} doesn't exist!" unless File.exists?(ledger_file)
+      learn_from(File.read(ledger_file))
+    end
+
+    def learn_from_account_tokens(filename)
+      return unless filename
+
+      fail "#{filename} doesn't exist!" unless File.exists?(filename)
+
+      extract_account_tokens(YAML.load_file(filename)).each do |account, tokens|
+        tokens.each do |t|
+          if t.start_with?('/')
+            add_regexp(account, t)
+          else
+            @matcher.add_document(account, t)
+          end
+        end
+      end
+    end
+
     def learn_from(ledger)
       LedgerParser.new(ledger).entries.each do |entry|
         entry[:accounts].each do |account|
-          learn_about_account( account[:name],
-                               [entry[:desc], account[:amount]].join(" ") ) unless account[:name] == options[:bank_account]
-          seen[entry[:date]] ||= {}
-          seen[entry[:date]][@csv_parser.pretty_money(account[:amount])] = true
+          str = [entry[:desc], account[:amount]].join(" ")
+          @matcher.add_document(account[:name], str) unless account[:name] == options[:bank_account]
+          pretty_date = entry[:date].iso8601
+          seen[pretty_date] ||= {}
+          seen[pretty_date][@csv_parser.pretty_money(account[:amount])] = true
         end
       end
     end
 
-    def already_seen?(row)
-      seen[row[:pretty_date]] && seen[row[:pretty_date]][row[:pretty_money]]
-    end
-
+    # Add tokens from account_tokens_file to accounts
     def extract_account_tokens(subtree, account = nil)
       if subtree.nil?
         puts "Warning: empty #{account} tree"
@@ -46,50 +68,26 @@ module Reckon
       elsif subtree.is_a?(Array)
         { account => subtree }
       else
-        at = subtree.map { |k, v| extract_account_tokens(v, [account, k].compact.join(':')) }
-        at.inject({}) { |k, v| k = k.merge(v)}
-      end
-    end
-
-    def learn!
-      if options[:account_tokens_file]
-        fail "#{options[:account_tokens_file]} doesn't exist!" unless File.exists?(options[:account_tokens_file])
-        extract_account_tokens(YAML.load_file(options[:account_tokens_file])).each do |account, tokens|
-          tokens.each { |t| learn_about_account(account, t, true) }
+        at = subtree.map do |k, v|
+          merged_acct = [account, k].compact.join(':')
+          extract_account_tokens(v, merged_acct)
         end
+        at.inject({}) { |memo, e| memo.merge!(e)}
       end
-      return unless options[:existing_ledger_file]
-      fail "#{options[:existing_ledger_file]} doesn't exist!" unless File.exists?(options[:existing_ledger_file])
-      ledger_data = File.read(options[:existing_ledger_file])
-      learn_from(ledger_data)
     end
 
-    def learn_about_account(account, data, parse_regexps = false)
-      accounts[account] ||= 0
-      if parse_regexps && data.start_with?('/')
-        # https://github.com/tenderlove/psych/blob/master/lib/psych/visitors/to_ruby.rb
-        match = data.match(/^\/(.*)\/([ix]*)$/m)
-        fail "failed to parse regexp #{data}" unless match
-        options = 0
-        (match[2] || '').split('').each do |option|
-          case option
-          when 'x' then options |= Regexp::EXTENDED
-          when 'i' then options |= Regexp::IGNORECASE
-          end
-        end
-        regexps[Regexp.new(match[1], options)] = account
-      else
-        tokenize(data).each do |token|
-          tokens[token] ||= {}
-          tokens[token][account] ||= 0
-          tokens[token][account] += 1
-          accounts[account] += 1
+    def add_regexp(account, regex_str)
+      # https://github.com/tenderlove/psych/blob/master/lib/psych/visitors/to_ruby.rb
+      match = regex_str.match(/^\/(.*)\/([ix]*)$/m)
+      fail "failed to parse regexp #{regex_str}" unless match
+      options = 0
+      (match[2] || '').split('').each do |option|
+        case option
+        when 'x' then options |= Regexp::EXTENDED
+        when 'i' then options |= Regexp::IGNORECASE
        end
      end
-    end
-
-    def tokenize(str)
-      str.downcase.split(/[\s\-]/)
+      regexps[Regexp.new(match[1], options)] = account
    end
 
    def walk_backwards
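
Note: the account-tokens flow above (learn_from_account_tokens, extract_account_tokens, add_regexp) reads a nested YAML mapping whose leaf arrays hold either plain token strings or /regexp/-style entries with optional i/x flags. A minimal illustrative tokens file (account names and tokens are invented for this sketch, not taken from the package) could look like:

    Expenses:
      Groceries:
        - WHOLEFDS
        - /trader\s+joes?/i
    Income:
      Salary:
        - ACME PAYROLL

extract_account_tokens flattens the nesting into keys such as Expenses:Groceries; plain tokens are fed to the CosineSimilarity matcher via add_document, while entries beginning with a slash are compiled by add_regexp and matched later through most_specific_regexp_match.
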
@@ -107,8 +105,7 @@ module Reckon
           seen_anything_new = true
         end
 
-        possible_answers = most_specific_regexp_match(row)
-        possible_answers = weighted_account_match( row ).map! { |a| a[:account] } if possible_answers.empty?
+        possible_answers = suggest(row)
 
         ledger = if row[:money] > 0
           if options[:unattended]
@@ -156,15 +153,21 @@ module Reckon
       end
     end
 
-    def finish
-      options[:output_file].close unless options[:output_file] == STDOUT
-      interactive_output "Exiting."
-      exit
-    end
-
-    def output(ledger_line)
-      options[:output_file].puts ledger_line
-      options[:output_file].flush
+    def each_row_backwards
+      rows = []
+      (0...@csv_parser.columns.first.length).to_a.each do |index|
+        if @csv_parser.date_for(index).nil?
+          LOGGER.warn("Skipping row: '#{@csv_parser.row(index)}' that doesn't have a valid date")
+          next
+        end
+        rows << { :date => @csv_parser.date_for(index),
+                  :pretty_date => @csv_parser.pretty_date_for(index),
+                  :pretty_money => @csv_parser.pretty_money_for(index),
+                  :pretty_money_negated => @csv_parser.pretty_money_for(index, :negate),
+                  :money => @csv_parser.money_for(index),
+                  :description => @csv_parser.description_for(index) }
+      end
+      rows.sort_by { |n| n[:date] }.each {|row| yield row }
     end
 
     def most_specific_regexp_match( row )
@@ -176,41 +179,9 @@ module Reckon
       matches.sort_by! { |account, matched_text| matched_text.length }.map(&:first)
     end
 
-    # Weigh accounts by how well they match the row
-    def weighted_account_match( row )
-      query_tokens = tokenize(row[:description])
-
-      search_vector = []
-      account_vectors = {}
-
-      query_tokens.each do |token|
-        idf = Math.log((accounts.keys.length + 1) / ((tokens[token] || {}).keys.length.to_f + 1))
-        tf = 1.0 / query_tokens.length.to_f
-        search_vector << tf*idf
-
-        accounts.each do |account, total_terms|
-          tf = (tokens[token] && tokens[token][account]) ? tokens[token][account] / total_terms.to_f : 0
-          account_vectors[account] ||= []
-          account_vectors[account] << tf*idf
-        end
-      end
-
-      # Should I normalize the vectors? Probably unnecessary due to tf-idf and short documents.
-
-      account_vectors = account_vectors.to_a.map do |account, account_vector|
-        { :cosine => (0...account_vector.length).to_a.inject(0) { |m, i| m + search_vector[i] * account_vector[i] },
-          :account => account }
-      end
-      account_vectors.sort! {|a, b| b[:cosine] <=> a[:cosine] }
-
-      # Return empty set if no accounts matched so that we can fallback to the defaults in the unattended mode
-      if options[:unattended]
-        if account_vectors.first && account_vectors.first[:account]
-          account_vectors = [] if account_vectors.first[:cosine] == 0
-        end
-      end
-
-      return account_vectors
+    def suggest(row)
+      most_specific_regexp_match(row) +
+        @matcher.find_similar(row[:description]).map { |n| n[:account] }
     end
 
     def ledger_format(row, line1, line2)
@@ -220,6 +191,21 @@ module Reckon
       out
     end
 
+    def output(ledger_line)
+      options[:output_file].puts ledger_line
+      options[:output_file].flush
+    end
+
+    def already_seen?(row)
+      seen[row[:pretty_date]] && seen[row[:pretty_date]][row[:pretty_money]]
+    end
+
+    def finish
+      options[:output_file].close unless options[:output_file] == STDOUT
+      interactive_output "Exiting."
+      exit
+    end
+
     def output_table
       output = Terminal::Table.new do |t|
         t.headings = 'Date', 'Amount', 'Description'
@@ -230,21 +216,6 @@ module Reckon
       interactive_output output
     end
 
-    def each_row_backwards
-      rows = []
-      (0...@csv_parser.columns.first.length).to_a.each do |index|
-        rows << { :date => @csv_parser.date_for(index),
-                  :pretty_date => @csv_parser.pretty_date_for(index),
-                  :pretty_money => @csv_parser.pretty_money_for(index),
-                  :pretty_money_negated => @csv_parser.pretty_money_for(index, :negate),
-                  :money => @csv_parser.money_for(index),
-                  :description => @csv_parser.description_for(index) }
-      end
-      rows.sort { |a, b| a[:date] <=> b[:date] }.each do |row|
-        yield row
-      end
-    end
-
     def self.parse_opts(args = ARGV)
       options = { :output_file => STDOUT }
       parser = OptionParser.new do |opts|
@@ -255,7 +226,7 @@ module Reckon
           options[:file] = file
         end
 
-        opts.on("-a", "--account name", "The Ledger Account this file is for") do |a|
+        opts.on("-a", "--account NAME", "The Ledger Account this file is for") do |a|
          options[:bank_account] = a
        end
 
@@ -283,6 +254,14 @@ module Reckon
           options[:ignore_columns] = ignore.split(",").map { |i| i.to_i }
         end
 
+        opts.on("", "--money-column 2", Integer, "Specify the money column instead of letting Reckon guess - the first column is column 1") do |column_number|
+          options[:money_column] = column_number
+        end
+
+        opts.on("", "--date-column 3", Integer, "Specify the date column instead of letting Reckon guess - the first column is column 1") do |column_number|
+          options[:date_column] = column_number
+        end
+
         opts.on("", "--contains-header [N]", "The first row of the CSV is a header and should be skipped. Optionally add the number of rows to skip.") do |contains_header|
          options[:contains_header] = 1
          options[:contains_header] = contains_header.to_i if contains_header
@@ -316,11 +295,11 @@ module Reckon
           options[:account_tokens_file] = a
         end
 
-        opts.on("", "--default-into-account name", "Default into account") do |a|
+        opts.on("", "--default-into-account NAME", "Default into account") do |a|
          options[:default_into_account] = a
        end
 
-        opts.on("", "--default-outof-account name", "Default 'out of' account") do |a|
+        opts.on("", "--default-outof-account NAME", "Default 'out of' account") do |a|
          options[:default_outof_account] = a
        end
 
@@ -351,7 +330,6 @@ module Reckon
       end
 
       unless options[:bank_account]
-
         fail "Please specify --account for the unattended mode" if options[:unattended]
 
         options[:bank_account] = ask("What is the account name of this bank account in Ledger? ") do |q|
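
Note: combining the renamed option placeholders (--account NAME) with the new --money-column and --date-column flags, a hypothetical unattended run could look like the following (file name and column numbers are invented; -f is reckon's existing flag for the input CSV and is outside these hunks):

    reckon -f bank.csv --contains-header \
      --account Assets:Checking --unattended \
      --money-column 2 --date-column 1

As the check at the end of parse_opts shows, --unattended refuses to run without an explicit --account.
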
data/lib/reckon/cosine_similarity.rb

@@ -0,0 +1,122 @@
+require 'matrix'
+require 'set'
+
+# Implementation of consine similarity using TF-IDF for vectorization.
+# Used to suggest which account a transaction should be assigned to
+module Reckon
+  class CosineSimilarity
+    def initialize(options)
+      @options = options
+      @tokens = {}
+      @accounts = Hash.new(0)
+    end
+
+    def add_document(account, doc)
+      tokenize(doc).each do |n|
+        (token, count) = n
+
+        @tokens[token] ||= {}
+        @tokens[token][account] ||= 0
+        @tokens[token][account] += count
+        @accounts[account] += count
+      end
+    end
+
+    # find most similar documents to query
+    def find_similar(query)
+      (query_scores, corpus_scores) = td_idf_scores_for(query)
+
+      query_vector = Vector.elements(query_scores, false)
+
+      # For each doc, calculate the similarity to the query
+      suggestions = corpus_scores.map do |account, scores|
+        acct_vector = Vector.elements(scores, false)
+
+        acct_query_dp = acct_vector.inner_product(query_vector)
+        # similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
+        # exactly opposite
+        # see https://en.wikipedia.org/wiki/Cosine_similarity
+        # cos(theta) = (A . B) / (||A|| ||B||)
+        # where A . B is the "dot product" and ||A|| is the magnitude of A
+        # ruby has the 'matrix' library we can use to do these calculations.
+        {
+          similarity: acct_query_dp / (acct_vector.magnitude * query_vector.magnitude),
+          account: account,
+        }
+      end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
+
+      LOGGER.info "most similar accounts: #{suggestions}"
+
+      return suggestions
+    end
+
+    private
+
+    def td_idf_scores_for(query)
+      query_tokens = tokenize(query)
+      corpus = Set.new
+      corpus_scores = {}
+      query_scores = []
+      num_docs = @accounts.length
+
+      query_tokens.each do |n|
+        (token, _count) = n
+        next unless @tokens[token]
+        corpus = corpus.union(Set.new(@tokens[token].keys))
+      end
+
+      query_tokens.each do |n|
+        (token, count) = n
+
+        # if no other docs have token, ignore it
+        next unless @tokens[token]
+
+        ## First, calculate scores for our query as we're building scores for the corpus
+        query_scores << calc_tf_idf(
+          count,
+          query_tokens.length,
+          @tokens[token].length,
+          num_docs
+        )
+
+        ## Next, calculate for the corpus, where our "account" is a document
+        corpus.each do |account|
+          corpus_scores[account] ||= []
+
+          corpus_scores[account] << calc_tf_idf(
+            (@tokens[token][account] || 0),
+            @accounts[account].to_f,
+            @tokens[token].length.to_f,
+            num_docs
+          )
+        end
+      end
+      [query_scores, corpus_scores]
+    end
+
+    def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
+
+      # tf(t,d) = count of t in d / number of words in d
+      tf = token_count / num_words_in_doc.to_f
+
+      # smooth idf weight
+      # see https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency_2
+      # df(t) = num of documents with term t in them
+      # idf(t) = log(N/(1 + df )) + 1
+      idf = Math.log(num_docs.to_f / (1 + df)) + 1
+
+      tf * idf
+    end
+
+    def tokenize(str)
+      mk_tokens(str).inject(Hash.new(0)) do |memo, n|
+        memo[n] += 1
+        memo
+      end.to_a
+    end
+
+    def mk_tokens(str)
+      str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
+    end
+  end
+end
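
Note: a minimal sketch of how this matcher behaves on its own (account names and transaction descriptions are invented; it assumes the reckon library, and therefore the LOGGER constant, is loaded):

    require 'reckon'

    matcher = Reckon::CosineSimilarity.new({})
    matcher.add_document('Expenses:Groceries', 'WHOLEFDS HOUSTON TX 24.57')
    matcher.add_document('Expenses:Fuel', 'SHELL OIL 57444 41.02')

    matcher.find_similar('WHOLEFDS PARKWAY 13.11')
    # => [{ :similarity => ..., :account => 'Expenses:Groceries' }]

find_similar only returns accounts with positive cosine similarity, sorted most similar first, which is what App#suggest appends after any regexp matches.
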
data/lib/reckon/csv_parser.rb

@@ -1,5 +1,4 @@
 #coding: utf-8
-require 'pp'
 
 module Reckon
   class CSVParser
@@ -8,43 +7,74 @@ module Reckon
     def initialize(options = {})
       self.options = options
       self.options[:currency] ||= '$'
-      parse
+      @csv_data = parse(options[:string] || File.read(options[:file]), options[:file])
       filter_csv
       detect_columns
     end
 
-    def filter_csv
-      if options[:ignore_columns]
-        new_columns = []
-        columns.each_with_index do |column, index|
-          new_columns << column unless options[:ignore_columns].include?(index + 1)
+    def columns
+      @columns ||=
+        begin
+          last_row_length = nil
+          csv_data.inject([]) do |memo, row|
+            unless row.all? { |i| i.nil? || i.length == 0 }
+              row.each_with_index do |entry, index|
+                memo[index] ||= []
+                memo[index] << (entry || '').strip
+              end
+              last_row_length = row.length
+            end
+            memo
+          end
        end
-        @columns = new_columns
-      end
    end
 
-    def money_for(index)
-      @money_column[index]
+    def date_for(index)
+      @date_column.for(index)
    end
 
-    def pretty_money_for(index, negate = false)
-      money_for( index ).pretty( negate )
+    def pretty_date_for(index)
+      @date_column.pretty_for( index )
+    end
+
+    def money_for(index)
+      @money_column[index]
    end
 
    def pretty_money(amount, negate = false)
      Money.new( amount, @options ).pretty( negate )
    end
 
-    def date_for(index)
-      @date_column.for( index )
-    end
+    def pretty_money_for(index, negate = false)
+      money = money_for(index)
+      return 0 if money.nil?
 
-    def pretty_date_for(index)
-      @date_column.pretty_for( index )
+      money.pretty(negate)
    end
 
    def description_for(index)
-      description_column_indices.map { |i| columns[i][index] }.reject { |a| a.empty? }.join("; ").squeeze(" ").gsub(/(;\s+){2,}/, '').strip
+      description_column_indices.map { |i| columns[i][index].to_s.strip }
+        .reject(&:empty?)
+        .join("; ")
+        .squeeze(" ")
+        .gsub(/(;\s+){2,}/, '')
+        .strip
+    end
+
+    def row(index)
+      csv_data[index].join(", ")
+    end
+
+    private
+
+    def filter_csv
+      if options[:ignore_columns]
+        new_columns = []
+        columns.each_with_index do |column, index|
+          new_columns << column unless options[:ignore_columns].include?(index + 1)
+        end
+        @columns = new_columns
+      end
    end
 
    def evaluate_columns(cols)
@@ -88,48 +118,24 @@ module Reckon
         results << { :index => index, :money_score => money_score, :date_score => date_score }
       end
 
-      return [results, found_likely_money_column]
-    end
+      results.sort_by! { |n| -n[:money_score] }
 
-    def merge_columns(a, b)
-      output_columns = []
-      columns.each_with_index do |column, index|
-        if index == a
-          new_column = MoneyColumn.new( column )
-                         .merge!( MoneyColumn.new( columns[b] ) )
-                         .map { |m| m.amount.to_s }
-          output_columns << new_column
-        elsif index == b
-          # skip
-        else
-          output_columns << column
-        end
+      # check if it looks like a 2-column file with a balance field
+      if results.length >= 3 && results[1][:money_score] + results[2][:money_score] >= results[0][:money_score]
+        results[1][:is_money_column] = true
+        results[2][:is_money_column] = true
+      else
+        results[0][:is_money_column] = true
      end
-      output_columns
-    end
 
-    def evaluate_two_money_columns( columns, id1, id2, unmerged_results )
-      merged_columns = merge_columns( id1, id2 )
-      results, found_likely_money_column = evaluate_columns( merged_columns )
-      if !found_likely_money_column
-        new_res = results.find { |el| el[:index] == id1 }
-        old_res1 = unmerged_results.find { |el| el[:index] == id1 }
-        old_res2 = unmerged_results.find { |el| el[:index] == id2 }
-        if new_res[:money_score] > old_res1[:money_score] &&
-           new_res[:money_score] > old_res2[:money_score]
-          found_likely_money_column = true
-        end
-      end
-      [results, found_likely_money_column]
+      return results.sort_by { |n| n[:index] }
    end
 
-    def found_double_money_column( id1, id2 )
-      self.money_column_indices = [ id1, id2 ]
-      unless settings[:testing]
-        puts "It looks like this CSV has two seperate columns for money, one of which shows positive"
-        puts "changes and one of which shows negative changes. If this is true, great. Otherwise,"
-        puts "please report this issue to us so we can take a look!\n"
-      end
+    def found_double_money_column(id1, id2)
+      self.money_column_indices = [id1, id2]
+      puts "It looks like this CSV has two seperate columns for money, one of which shows positive"
+      puts "changes and one of which shows negative changes. If this is true, great. Otherwise,"
+      puts "please report this issue to us so we can take a look!\n"
    end
 
    # Some csv files negative/positive amounts are indicated in separate account
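
Note: the is_money_column marking above targets exports that carry separate debit and credit columns next to a running balance. In a file shaped like the following invented two-row sample, the balance column tends to get the single highest money score, but the debit and credit columns together match or beat it, so those two are flagged and later merged into one MoneyColumn:

    19/01/2014,COFFEE SHOP,3.00,,997.00
    20/01/2014,REFUND,,10.00,1007.00
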
@@ -159,100 +165,81 @@ module Reckon
     end
 
     def detect_columns
-      results, found_likely_money_column = evaluate_columns(columns)
-      self.money_column_indices = [ results.sort { |a, b| b[:money_score] <=> a[:money_score] }.first[:index] ]
-
-      if !found_likely_money_column
-        found_likely_double_money_columns = false
-        0.upto(columns.length - 2) do |i|
-          if MoneyColumn.new( columns[i] ).merge!( MoneyColumn.new( columns[i+1] ) )
-            _, found_likely_double_money_columns = evaluate_columns(merge_columns(i, i+1))
-            if found_likely_double_money_columns
-              found_double_money_column( i, i + 1 )
-              break
-            end
-          end
-        end
-
-        if !found_likely_double_money_columns
-          0.upto(columns.length - 2) do |i|
-            if MoneyColumn.new( columns[i] ).merge!( MoneyColumn.new( columns[i+1] ) )
-              # Try a more specific test
-              _, found_likely_double_money_columns = evaluate_two_money_columns( columns, i, i+1, results )
-              if found_likely_double_money_columns
-                found_double_money_column( i, i + 1 )
-                break
-              end
-            end
-          end
-        end
+      results = evaluate_columns(columns)
 
-        if !found_likely_double_money_columns && !settings[:testing]
-          puts "I didn't find a high-likelyhood money column, but I'm taking my best guess with column #{money_column_indices.first + 1}."
+      if options[:money_column]
+        self.money_column_indices = [ options[:money_column] - 1 ]
+      else
+        self.money_column_indices = results.select { |n| n[:is_money_column] }.map { |n| n[:index] }
+        if self.money_column_indices.length == 1
+          puts "Using column #{money_column_indices.first + 1} as the money column. Use --money-colum to specify a different one."
+        elsif self.money_column_indices.length == 2
+          found_double_money_column(*self.money_column_indices)
+        else
+          puts "Unable to determine a money column, use --money-column to specify the column reckon should use."
        end
      end
 
-      results.reject! {|i| money_column_indices.include?(i[:index]) }
-      self.date_column_index = results.sort { |a, b| b[:date_score] <=> a[:date_score] }.first[:index]
-      results.reject! {|i| i[:index] == date_column_index }
-      @date_column = DateColumn.new( columns[ self.date_column_index ], @options )
+      results.reject! { |i| money_column_indices.include?(i[:index]) }
+      if options[:date_column]
+        @date_column_index = options[:date_column] - 1
+      else
+        # sort by highest score followed by lowest index
+        @date_column_index = results.max_by { |n| [n[:date_score], -n[:index]] }[:index]
+      end
+      results.reject! { |i| i[:index] == date_column_index }
+      @date_column = DateColumn.new(columns[date_column_index], @options)
 
-      if ( money_column_indices.length == 1 )
-        @money_column = MoneyColumn.new( columns[money_column_indices[0]],
-                                         @options )
+      @money_column = MoneyColumn.new(columns[money_column_indices[0]], @options)
+      if money_column_indices.length == 1
        detect_sign_column if @money_column.positive?
      else
-        @money_column = MoneyColumn.new( columns[money_column_indices[0]],
-                                         @options )
-        @money_column.merge!(
-          MoneyColumn.new( columns[money_column_indices[1]], @options ) )
+        @money_column.merge! MoneyColumn.new(columns[money_column_indices[1]], @options)
      end
 
      self.description_column_indices = results.map { |i| i[:index] }
    end
 
-    def columns
-      @columns ||= begin
-        last_row_length = nil
-        csv_data.inject([]) do |memo, row|
-          # fail "Input CSV must have consistent row lengths." if last_row_length && row.length != last_row_length
-          unless row.all? { |i| i.nil? || i.length == 0 }
-            row.each_with_index do |entry, index|
-              memo[index] ||= []
-              memo[index] << (entry || '').strip
-            end
-            last_row_length = row.length
-          end
-          memo
-        end
+    def parse(data, filename=nil)
+      # Use force_encoding to convert the string to utf-8 with as few invalid characters
+      # as possible.
+      data.force_encoding(try_encoding(data, filename))
+      data = data.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
+      data.sub!("\xEF\xBB\xBF", '') # strip byte order marker, if it exists
+
+      rows = []
+      data.each_line.with_index do |line, i|
+        next if i < (options[:contains_header] || 0)
+        rows << CSV.parse_line(line, col_sep: options[:csv_separator] || ',')
      end
+
+      rows
    end
 
-    def parse
-      data = options[:string] || File.read(options[:file])
+    def try_encoding(data, filename = nil)
+      encoding = try_encoding_from_file(filename)
 
-      if RUBY_VERSION =~ /^1\.9/ || RUBY_VERSION =~ /^2/
-        data = data.force_encoding(options[:encoding] || 'BINARY').encode('UTF-8', :invalid => :replace, :undef => :replace, :replace => '?')
-        csv_engine = CSV
-      else
-        csv_engine = FasterCSV
-      end
+      cd = CharDet.detect(data)
+      encoding ||= cd['encoding']
 
-      @csv_data = csv_engine.parse data.strip, :col_sep => options[:csv_separator] || ','
-      if options[:contains_header]
-        options[:contains_header].times { csv_data.shift }
-      end
-      csv_data
-    end
+      encoding ||= 'BINARY'
 
-    @settings = { :testing => false }
+      LOGGER.info("suggested file encoding: #{encoding}")
 
-    def self.settings
-      @settings
+      options[:encoding] || encoding
    end
 
-    def settings
-      self.class.settings
+    def try_encoding_from_file(filename = nil)
+      return unless filename
+
+      m = nil
+      os = Gem::Platform.local.os
+      if os == 'linux'
+        m = `file -i #{filename}`.match(/charset=(\S+)/)
+      elsif os == 'darwin'
+        m = `file -I #{filename}`.match(/charset=(\S+)/)
+      end
+      m && m[1]
    end
  end
 end
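
Note: the new encoding handling shells out to the file utility (file -i on Linux, file -I on Darwin) and keeps only the charset= portion of its output, falling back to CharDet, then 'BINARY', with an explicit options[:encoding] always taking precedence. The output the charset= regexp matches looks roughly like this (hypothetical path; the MIME type varies by platform and file):

    $ file -i bank.csv
    bank.csv: text/plain; charset=utf-8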