RubyGems - reckon - Versions diffs - 0.5.0 → 0.6.0 - Mend

reckon 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

checksums.yaml +4 -4
data/.gitignore +3 -0
data/.ruby-version +1 -1
data/CHANGELOG.md +55 -1
data/Gemfile.lock +1 -5
data/README.md +1 -1
data/lib/reckon.rb +7 -9
data/lib/reckon/app.rb +140 -69
data/lib/reckon/cosine_similarity.rb +92 -89
data/lib/reckon/csv_parser.rb +70 -113
data/lib/reckon/date_column.rb +60 -0
data/lib/reckon/ledger_parser.rb +11 -1
data/lib/reckon/logger.rb +4 -0
data/lib/reckon/money.rb +4 -59
data/lib/reckon/version.rb +3 -0
data/reckon.gemspec +3 -3
data/spec/data_fixtures/51-sample.csv +8 -0
data/spec/data_fixtures/51-tokens.yml +9 -0
data/spec/data_fixtures/85-date-example.csv +2 -0
data/spec/data_fixtures/test_money_column.csv +3 -0
data/spec/reckon/app_spec.rb +32 -2
data/spec/reckon/csv_parser_spec.rb +129 -129
data/spec/reckon/date_column_spec.rb +12 -13
data/spec/reckon/ledger_parser_spec.rb +42 -5
data/spec/reckon/money_spec.rb +42 -29
data/spec/spec_helper.rb +19 -0
metadata +12 -19

data/lib/reckon/cosine_similarity.rb CHANGED

@@ -1,119 +1,122 @@
 require 'matrix'
+require 'set'
 # Implementation of consine similarity using TF-IDF for vectorization.
 # Used to suggest which account a transaction should be assigned to
-class CosineSimilarity
-  def initialize(options)
-    @options = options
-    @tokens = {}
-    @accounts = Hash.new(0)
-  end
-  def add_document(account, doc)
-    tokenize(doc).each do |n|
-      (token, count) = n
-      @tokens[token] ||= {}
-      @tokens[token][account] ||= 0
-      @tokens[token][account] += count
-      @accounts[account] += count
+module Reckon
+  class CosineSimilarity
+    def initialize(options)
+      @options = options
+      @tokens = {}
+      @accounts = Hash.new(0)
     end
-  end
-  # find most similar documents to query
-  def find_similar(query)
-    (query_scores, corpus_scores) = td_idf_scores_for(query)
-    query_vector = Vector.elements(query_scores, false)
+    def add_document(account, doc)
+      tokenize(doc).each do |n|
+        (token, count) = n
-    # For each doc, calculate the similarity to the query
-    suggestions = corpus_scores.map do |account, scores|
-      acct_vector = Vector.elements(scores, false)
+        @tokens[token] ||= {}
+        @tokens[token][account] ||= 0
+        @tokens[token][account] += count
+        @accounts[account] += count
+      end
+    end
-      acct_query_dp = acct_vector.inner_product(query_vector)
-      # similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
-      # exactly opposite
-      # see https://en.wikipedia.org/wiki/Cosine_similarity
-      # cos(theta) = (A . B) / (||A|| ||B||)
-      # where A . B is the "dot product" and ||A|| is the magnitude of A
-      # ruby has the 'matrix' library we can use to do these calculations.
-      {
-        similarity: acct_query_dp / (acct_vector.magnitude * query_vector.magnitude),
-        account: account,
-      }
-    end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
+    # find most similar documents to query
+    def find_similar(query)
+      (query_scores, corpus_scores) = td_idf_scores_for(query)
-    LOGGER.info "most similar accounts: #{suggestions}"
+      query_vector = Vector.elements(query_scores, false)
-    return suggestions
-  end
+      # For each doc, calculate the similarity to the query
+      suggestions = corpus_scores.map do |account, scores|
+        acct_vector = Vector.elements(scores, false)
-  private
+        acct_query_dp = acct_vector.inner_product(query_vector)
+        # similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
+        # exactly opposite
+        # see https://en.wikipedia.org/wiki/Cosine_similarity
+        # cos(theta) = (A . B) / (||A|| ||B||)
+        # where A . B is the "dot product" and ||A|| is the magnitude of A
+        # ruby has the 'matrix' library we can use to do these calculations.
+        {
+          similarity: acct_query_dp / (acct_vector.magnitude * query_vector.magnitude),
+          account: account,
+        }
+      end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
-  def td_idf_scores_for(query)
-    query_tokens = tokenize(query)
-    corpus = Set.new
-    corpus_scores = {}
-    query_scores = []
-    num_docs = @accounts.length
+      LOGGER.info "most similar accounts: #{suggestions}"
-    query_tokens.each do |n|
-      (token, _count) = n
-      next unless @tokens[token]
-      corpus = corpus.union(Set.new(@tokens[token].keys))
+      return suggestions
     end
-    query_tokens.each do |n|
-      (token, count) = n
+    private
-      # if no other docs have token, ignore it
-      next unless @tokens[token]
+    def td_idf_scores_for(query)
+      query_tokens = tokenize(query)
+      corpus = Set.new
+      corpus_scores = {}
+      query_scores = []
+      num_docs = @accounts.length
+      query_tokens.each do |n|
+        (token, _count) = n
+        next unless @tokens[token]
+        corpus = corpus.union(Set.new(@tokens[token].keys))
+      end
-      ## First, calculate scores for our query as we're building scores for the corpus
-      query_scores << calc_tf_idf(
-        count,
-        query_tokens.length,
-        @tokens[token].length,
-        num_docs
-      )
+      query_tokens.each do |n|
+        (token, count) = n
-      ## Next, calculate for the corpus, where our "account" is a document
-      corpus.each do |account|
-        corpus_scores[account] ||= []
+        # if no other docs have token, ignore it
+        next unless @tokens[token]
-        corpus_scores[account] << calc_tf_idf(
-          (@tokens[token][account] || 0),
-          @accounts[account].to_f,
-          @tokens[token].length.to_f,
+        ## First, calculate scores for our query as we're building scores for the corpus
+        query_scores << calc_tf_idf(
+          count,
+          query_tokens.length,
+          @tokens[token].length,
           num_docs
         )
+        ## Next, calculate for the corpus, where our "account" is a document
+        corpus.each do |account|
+          corpus_scores[account] ||= []
+          corpus_scores[account] << calc_tf_idf(
+            (@tokens[token][account] || 0),
+            @accounts[account].to_f,
+            @tokens[token].length.to_f,
+            num_docs
+          )
+        end
       end
+      [query_scores, corpus_scores]
     end
-    [query_scores, corpus_scores]
-  end
-  def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
+    def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
-    # tf(t,d) = count of t in d / number of words in d
-    tf = token_count / num_words_in_doc.to_f
+      # tf(t,d) = count of t in d / number of words in d
+      tf = token_count / num_words_in_doc.to_f
-    # smooth idf weight
-    # see https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency_2
-    # df(t) = num of documents with term t in them
-    # idf(t) = log(N/(1 + df )) + 1
-    idf = Math.log(num_docs.to_f / (1 + df)) + 1
+      # smooth idf weight
+      # see https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency_2
+      # df(t) = num of documents with term t in them
+      # idf(t) = log(N/(1 + df )) + 1
+      idf = Math.log(num_docs.to_f / (1 + df)) + 1
-    tf * idf
-  end
+      tf * idf
+    end
-  def tokenize(str)
-    mk_tokens(str).inject(Hash.new(0)) do |memo, n|
-      memo[n] += 1
-      memo
-    end.to_a
-  end
-end
+    def tokenize(str)
+      mk_tokens(str).inject(Hash.new(0)) do |memo, n|
+        memo[n] += 1
+        memo
+      end.to_a
+    end
-def mk_tokens(str)
-  str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
+    def mk_tokens(str)
+      str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
+    end
+  end
 end

data/lib/reckon/csv_parser.rb CHANGED

@@ -12,38 +12,69 @@ module Reckon
       detect_columns
     end
-    def filter_csv
-      if options[:ignore_columns]
-        new_columns = []
-        columns.each_with_index do |column, index|
-          new_columns << column unless options[:ignore_columns].include?(index + 1)
+    def columns
+      @columns ||=
+        begin
+          last_row_length = nil
+          csv_data.inject([]) do |memo, row|
+            unless row.all? { |i| i.nil? || i.length == 0 }
+              row.each_with_index do |entry, index|
+                memo[index] ||= []
+                memo[index] << (entry || '').strip
+              end
+              last_row_length = row.length
+            end
+            memo
+          end
         end
-        @columns = new_columns
-      end
     end
-    def money_for(index)
-      @money_column[index]
+    def date_for(index)
+      @date_column.for(index)
     end
-    def pretty_money_for(index, negate = false)
-      money_for( index ).pretty( negate )
+    def pretty_date_for(index)
+      @date_column.pretty_for( index )
+    end
+    def money_for(index)
+      @money_column[index]
     end
     def pretty_money(amount, negate = false)
       Money.new( amount, @options ).pretty( negate )
     end
-    def date_for(index)
-      @date_column.for( index )
-    end
+    def pretty_money_for(index, negate = false)
+      money = money_for(index)
+      return 0 if money.nil?
-    def pretty_date_for(index)
-      @date_column.pretty_for( index )
+      money.pretty(negate)
     end
     def description_for(index)
-      description_column_indices.map { |i| columns[i][index] }.reject(&:empty?).join("; ").squeeze(" ").gsub(/(;\s+){2,}/, '').strip
+      description_column_indices.map { |i| columns[i][index].to_s.strip }
+        .reject(&:empty?)
+        .join("; ")
+        .squeeze(" ")
+        .gsub(/(;\s+){2,}/, '')
+        .strip
+    end
+    def row(index)
+      csv_data[index].join(", ")
+    end
+    private
+    def filter_csv
+      if options[:ignore_columns]
+        new_columns = []
+        columns.each_with_index do |column, index|
+          new_columns << column unless options[:ignore_columns].include?(index + 1)
+        end
+        @columns = new_columns
+      end
     end
     def evaluate_columns(cols)
@@ -87,48 +118,24 @@ module Reckon
         results << { :index => index, :money_score => money_score, :date_score => date_score }
       end
-      return [results, found_likely_money_column]
-    end
+      results.sort_by! { |n| -n[:money_score] }
-    def merge_columns(a, b)
-      output_columns = []
-      columns.each_with_index do |column, index|
-        if index == a
-          new_column = MoneyColumn.new( column )
-            .merge!( MoneyColumn.new( columns[b] ) )
-            .map { |m| m.amount.to_s }
-          output_columns << new_column
-        elsif index == b
-          # skip
-        else
-          output_columns << column
-        end
+      # check if it looks like a 2-column file with a balance field
+      if results.length >= 3 && results[1][:money_score] + results[2][:money_score] >= results[0][:money_score]
+        results[1][:is_money_column] = true
+        results[2][:is_money_column] = true
+      else
+        results[0][:is_money_column] = true
       end
-      output_columns
-    end
-    def evaluate_two_money_columns( columns, id1, id2, unmerged_results )
-      merged_columns = merge_columns( id1, id2 )
-      results, found_likely_money_column = evaluate_columns( merged_columns )
-      if !found_likely_money_column
-        new_res = results.find { |el| el[:index] == id1 }
-        old_res1 = unmerged_results.find { |el| el[:index] == id1 }
-        old_res2 = unmerged_results.find { |el| el[:index] == id2 }
-        if new_res[:money_score] > old_res1[:money_score] &&
-          new_res[:money_score] > old_res2[:money_score]
-          found_likely_money_column = true
-        end
-      end
-      [results, found_likely_money_column]
+      return results.sort_by { |n| n[:index] }
     end
-    def found_double_money_column( id1, id2 )
-      self.money_column_indices = [ id1, id2 ]
-      unless settings[:testing]
-        puts "It looks like this CSV has two seperate columns for money, one of which shows positive"
-        puts "changes and one of which shows negative changes.  If this is true, great.  Otherwise,"
-        puts "please report this issue to us so we can take a look!\n"
-      end
+    def found_double_money_column(id1, id2)
+      self.money_column_indices = [id1, id2]
+      puts "It looks like this CSV has two seperate columns for money, one of which shows positive"
+      puts "changes and one of which shows negative changes.  If this is true, great.  Otherwise,"
+      puts "please report this issue to us so we can take a look!\n"
     end
     # Some csv files negative/positive amounts are indicated in separate account
@@ -158,41 +165,18 @@ module Reckon
     end
     def detect_columns
-      results, found_likely_money_column = evaluate_columns(columns)
+      results = evaluate_columns(columns)
       if options[:money_column]
-        found_likely_money_column = true
         self.money_column_indices = [ options[:money_column] - 1 ]
       else
-        self.money_column_indices = [ results.max_by { |n| n[:money_score] }[:index] ]
-      end
-      if !found_likely_money_column
-        found_likely_double_money_columns = false
-        0.upto(columns.length - 2) do |i|
-          if MoneyColumn.new( columns[i] ).merge!( MoneyColumn.new( columns[i+1] ) )
-            _, found_likely_double_money_columns = evaluate_columns(merge_columns(i, i+1))
-            if found_likely_double_money_columns
-              found_double_money_column( i, i + 1 )
-              break
-            end
-          end
-        end
-        if !found_likely_double_money_columns
-          0.upto(columns.length - 2) do |i|
-            if MoneyColumn.new( columns[i] ).merge!( MoneyColumn.new( columns[i+1] ) )
-              # Try a more specific test
-              _, found_likely_double_money_columns = evaluate_two_money_columns( columns, i, i+1, results )
-              if found_likely_double_money_columns
-                found_double_money_column( i, i + 1 )
-                break
-              end
-            end
-          end
-        end
-        if !found_likely_double_money_columns && !settings[:testing]
-          puts "I didn't find a high-likelyhood money column, but I'm taking my best guess with column #{money_column_indices.first + 1}."
+        self.money_column_indices = results.select { |n| n[:is_money_column] }.map { |n| n[:index] }
+        if self.money_column_indices.length == 1
+          puts "Using column #{money_column_indices.first + 1} as the money column.  Use --money-colum to specify a different one."
+        elsif self.money_column_indices.length == 2
+          found_double_money_column(*self.money_column_indices)
+        else
+          puts "Unable to determine a money column, use --money-column to specify the column reckon should use."
         end
       end
@@ -216,23 +200,6 @@ module Reckon
       self.description_column_indices = results.map { |i| i[:index] }
     end
-    def columns
-      @columns ||= begin
-        last_row_length = nil
-        csv_data.inject([]) do |memo, row|
-          # fail "Input CSV must have consistent row lengths." if last_row_length && row.length != last_row_length
-          unless row.all? { |i| i.nil? || i.length == 0 }
-            row.each_with_index do |entry, index|
-              memo[index] ||= []
-              memo[index] << (entry || '').strip
-            end
-            last_row_length = row.length
-          end
-          memo
-        end
-      end
-    end
     def parse(data, filename=nil)
       # Use force_encoding to convert the string to utf-8 with as few invalid characters
       # as possible.
@@ -274,15 +241,5 @@ module Reckon
       end
       m && m[1]
     end
-    @settings = { :testing => false }
-    def self.settings
-      @settings
-    end
-    def settings
-      self.class.settings
-    end
   end
 end

data/lib/reckon/date_column.rb ADDED

@@ -0,0 +1,60 @@
+module Reckon
+  class DateColumn < Array
+    attr_accessor :endian_precedence
+    def initialize( arr = [], options = {} )
+      arr.each do |value|
+        if options[:date_format]
+          begin
+            value = Date.strptime(value, options[:date_format])
+          rescue
+            puts "I'm having trouble parsing #{value} with the desired format: #{options[:date_format]}"
+            exit 1
+          end
+        else
+          value = [$1, $2, $3].join("/") if value =~ /^(\d{4})(\d{2})(\d{2})\d+\[\d+\:GMT\]$/ # chase format
+          value = [$3, $2, $1].join("/") if value =~ /^(\d{2})\.(\d{2})\.(\d{4})$/            # german format
+          value = [$3, $2, $1].join("/") if value =~ /^(\d{2})\-(\d{2})\-(\d{4})$/            # nordea format
+          value = [$1, $2, $3].join("/") if value =~ /^(\d{4})\-(\d{2})\-(\d{2})$/            # yyyy-mm-dd format
+          value = [$1, $2, $3].join("/") if value =~ /^(\d{4})(\d{2})(\d{2})/                 # yyyymmdd format
+          unless @endian_precedence # Try to detect endian_precedence
+            reg_match = value.match( /^(\d\d)\/(\d\d)\/\d\d\d?\d?/ )
+            # If first one is not \d\d/\d\d/\d\d\d?\d set it to default
+            if !reg_match
+              @endian_precedence = [:middle, :little]
+            elsif reg_match[1].to_i > 12
+              @endian_precedence = [:little]
+            elsif reg_match[2].to_i > 12
+              @endian_precedence = [:middle]
+            end
+          end
+        end
+        self.push( value )
+      end
+      # if endian_precedence still nil, raise error
+      unless @endian_precedence || options[:date_format]
+        raise( "Unable to determine date format. Please specify using --date-format" )
+      end
+    end
+    def for( index )
+      value = self.at( index )
+      guess = Chronic.parse(value, :context => :past,
+                            :endian_precedence => @endian_precedence )
+      if guess.to_i < 953236800 && value =~ /\//
+        guess = Chronic.parse((value.split("/")[0...-1] + [(2000 + value.split("/").last.to_i).to_s]).join("/"), :context => :past,
+                              :endian_precedence => @endian_precedence)
+      end
+      guess && guess.to_date
+    end
+    def pretty_for(index)
+      date = self.for(index)
+      return "" if date.nil?
+      date.iso8601
+    end
+  end
+end