RubyGems - reckon - Versions diffs - 0.5.2 → 0.6.2 - Mend

reckon 0.5.2 → 0.6.2

Files changed (112) hide show

checksums.yaml +4 -4
data/.github/workflows/ruby.yml +50 -0
data/.gitignore +2 -0
data/.ruby-version +1 -1
data/CHANGELOG.md +66 -2
data/Gemfile.lock +1 -5
data/README.md +76 -16
data/Rakefile +17 -1
data/bin/reckon +6 -1
data/lib/reckon.rb +2 -5
data/lib/reckon/app.rb +156 -73
data/lib/reckon/cosine_similarity.rb +91 -89
data/lib/reckon/csv_parser.rb +8 -8
data/lib/reckon/date_column.rb +10 -0
data/lib/reckon/ledger_parser.rb +11 -1
data/lib/reckon/logger.rb +4 -0
data/lib/reckon/money.rb +48 -48
data/lib/reckon/version.rb +1 -1
data/reckon.gemspec +1 -2
data/spec/integration/another_bank_example/input.csv +9 -0
data/spec/integration/another_bank_example/output.ledger +36 -0
data/spec/integration/another_bank_example/test_args +1 -0
data/spec/integration/austrian_example/input.csv +13 -0
data/spec/integration/austrian_example/output.ledger +52 -0
data/spec/integration/austrian_example/test_args +2 -0
data/spec/integration/bom_utf8_file/input.csv +3 -0
data/spec/integration/bom_utf8_file/output.ledger +4 -0
data/spec/integration/bom_utf8_file/test_args +3 -0
data/spec/integration/broker_canada_example/input.csv +12 -0
data/spec/integration/broker_canada_example/output.ledger +48 -0
data/spec/integration/broker_canada_example/test_args +1 -0
data/spec/integration/chase/account_tokens_and_regex/output.ledger +36 -0
data/spec/integration/chase/account_tokens_and_regex/test_args +2 -0
data/spec/integration/chase/account_tokens_and_regex/tokens.yml +16 -0
data/spec/integration/chase/default_account_names/output.ledger +36 -0
data/spec/integration/chase/default_account_names/test_args +3 -0
data/spec/integration/chase/input.csv +9 -0
data/spec/integration/chase/learn_from_existing/learn.ledger +7 -0
data/spec/integration/chase/learn_from_existing/output.ledger +36 -0
data/spec/integration/chase/learn_from_existing/test_args +1 -0
data/spec/integration/chase/simple/output.ledger +36 -0
data/spec/integration/chase/simple/test_args +1 -0
data/spec/integration/danish_kroner_nordea_example/input.csv +6 -0
data/spec/integration/danish_kroner_nordea_example/output.ledger +24 -0
data/spec/integration/danish_kroner_nordea_example/test_args +1 -0
data/spec/integration/english_date_example/input.csv +3 -0
data/spec/integration/english_date_example/output.ledger +12 -0
data/spec/integration/english_date_example/test_args +1 -0
data/spec/integration/extratofake/input.csv +24 -0
data/spec/integration/extratofake/output.ledger +92 -0
data/spec/integration/extratofake/test_args +1 -0
data/spec/integration/french_example/input.csv +9 -0
data/spec/integration/french_example/output.ledger +36 -0
data/spec/integration/french_example/test_args +2 -0
data/spec/integration/german_date_example/input.csv +3 -0
data/spec/integration/german_date_example/output.ledger +12 -0
data/spec/integration/german_date_example/test_args +1 -0
data/spec/integration/harder_date_example/input.csv +5 -0
data/spec/integration/harder_date_example/output.ledger +20 -0
data/spec/integration/harder_date_example/test_args +1 -0
data/spec/integration/ing/input.csv +3 -0
data/spec/integration/ing/output.ledger +12 -0
data/spec/integration/ing/test_args +1 -0
data/spec/integration/intuit_mint_example/input.csv +7 -0
data/spec/integration/intuit_mint_example/output.ledger +28 -0
data/spec/integration/intuit_mint_example/test_args +1 -0
data/spec/integration/invalid_header_example/input.csv +6 -0
data/spec/integration/invalid_header_example/output.ledger +8 -0
data/spec/integration/invalid_header_example/test_args +1 -0
data/spec/integration/inversed_credit_card/input.csv +16 -0
data/spec/integration/inversed_credit_card/output.ledger +64 -0
data/spec/integration/inversed_credit_card/test_args +1 -0
data/spec/integration/nationwide/input.csv +4 -0
data/spec/integration/nationwide/output.ledger +16 -0
data/spec/integration/nationwide/test_args +1 -0
data/spec/integration/regression/issue_51_account_tokens/input.csv +8 -0
data/spec/integration/regression/issue_51_account_tokens/output.ledger +32 -0
data/spec/integration/regression/issue_51_account_tokens/test_args +4 -0
data/spec/integration/regression/issue_51_account_tokens/tokens.yml +9 -0
data/spec/integration/regression/issue_64_date_column/input.csv +3 -0
data/spec/integration/regression/issue_64_date_column/output.ledger +8 -0
data/spec/integration/regression/issue_64_date_column/test_args +1 -0
data/spec/integration/regression/issue_73_account_token_matching/input.csv +2 -0
data/spec/integration/regression/issue_73_account_token_matching/output.ledger +4 -0
data/spec/integration/regression/issue_73_account_token_matching/test_args +6 -0
data/spec/integration/regression/issue_73_account_token_matching/tokens.yml +8 -0
data/spec/integration/regression/issue_85_date_example/input.csv +2 -0
data/spec/integration/regression/issue_85_date_example/output.ledger +8 -0
data/spec/integration/regression/issue_85_date_example/test_args +1 -0
data/spec/integration/spanish_date_example/input.csv +3 -0
data/spec/integration/spanish_date_example/output.ledger +12 -0
data/spec/integration/spanish_date_example/test_args +1 -0
data/spec/integration/suntrust/input.csv +7 -0
data/spec/integration/suntrust/output.ledger +28 -0
data/spec/integration/suntrust/test_args +1 -0
data/spec/integration/test.sh +82 -0
data/spec/integration/test_money_column/input.csv +3 -0
data/spec/integration/test_money_column/output.ledger +8 -0
data/spec/integration/test_money_column/test_args +1 -0
data/spec/integration/two_money_columns/input.csv +5 -0
data/spec/integration/two_money_columns/output.ledger +20 -0
data/spec/integration/two_money_columns/test_args +1 -0
data/spec/integration/yyyymmdd_date_example/input.csv +1 -0
data/spec/integration/yyyymmdd_date_example/output.ledger +4 -0
data/spec/integration/yyyymmdd_date_example/test_args +1 -0
data/spec/reckon/app_spec.rb +18 -2
data/spec/reckon/csv_parser_spec.rb +5 -0
data/spec/reckon/ledger_parser_spec.rb +42 -5
data/spec/reckon/money_column_spec.rb +24 -24
data/spec/reckon/money_spec.rb +13 -32
metadata +94 -21
data/.travis.yml +0 -13

data/lib/reckon/cosine_similarity.rb CHANGED

@@ -3,118 +3,120 @@ require 'set'
 # Implementation of consine similarity using TF-IDF for vectorization.
 # Used to suggest which account a transaction should be assigned to
-class CosineSimilarity
-  def initialize(options)
-    @options = options
-    @tokens = {}
-    @accounts = Hash.new(0)
-  end
-  def add_document(account, doc)
-    tokenize(doc).each do |n|
-      (token, count) = n
-      @tokens[token] ||= {}
-      @tokens[token][account] ||= 0
-      @tokens[token][account] += count
-      @accounts[account] += count
+module Reckon
+  class CosineSimilarity
+    def initialize(options)
+      @options = options
+      @tokens = {}
+      @accounts = Hash.new(0)
     end
-  end
-  # find most similar documents to query
-  def find_similar(query)
-    (query_scores, corpus_scores) = td_idf_scores_for(query)
-    query_vector = Vector.elements(query_scores, false)
+    def add_document(account, doc)
+      tokenize(doc).each do |n|
+        (token, count) = n
-    # For each doc, calculate the similarity to the query
-    suggestions = corpus_scores.map do |account, scores|
-      acct_vector = Vector.elements(scores, false)
+        @tokens[token] ||= {}
+        @tokens[token][account] ||= 0
+        @tokens[token][account] += count
+        @accounts[account] += count
+      end
+    end
-      acct_query_dp = acct_vector.inner_product(query_vector)
-      # similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
-      # exactly opposite
-      # see https://en.wikipedia.org/wiki/Cosine_similarity
-      # cos(theta) = (A . B) / (||A|| ||B||)
-      # where A . B is the "dot product" and ||A|| is the magnitude of A
-      # ruby has the 'matrix' library we can use to do these calculations.
-      {
-        similarity: acct_query_dp / (acct_vector.magnitude * query_vector.magnitude),
-        account: account,
-      }
-    end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
+    # find most similar documents to query
+    def find_similar(query)
+      (query_scores, corpus_scores) = td_idf_scores_for(query)
-    LOGGER.info "most similar accounts: #{suggestions}"
+      query_vector = Vector.elements(query_scores, false)
-    return suggestions
-  end
+      # For each doc, calculate the similarity to the query
+      suggestions = corpus_scores.map do |account, scores|
+        acct_vector = Vector.elements(scores, false)
-  private
+        acct_query_dp = acct_vector.inner_product(query_vector)
+        # similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
+        # exactly opposite
+        # see https://en.wikipedia.org/wiki/Cosine_similarity
+        # cos(theta) = (A . B) / (||A|| ||B||)
+        # where A . B is the "dot product" and ||A|| is the magnitude of A
+        # ruby has the 'matrix' library we can use to do these calculations.
+        {
+          similarity: acct_query_dp / (acct_vector.magnitude * query_vector.magnitude),
+          account: account,
+        }
+      end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
-  def td_idf_scores_for(query)
-    query_tokens = tokenize(query)
-    corpus = Set.new
-    corpus_scores = {}
-    query_scores = []
-    num_docs = @accounts.length
+      LOGGER.info "most similar accounts: #{suggestions}"
-    query_tokens.each do |n|
-      (token, _count) = n
-      next unless @tokens[token]
-      corpus = corpus.union(Set.new(@tokens[token].keys))
+      return suggestions
     end
-    query_tokens.each do |n|
-      (token, count) = n
+    private
-      # if no other docs have token, ignore it
-      next unless @tokens[token]
+    def td_idf_scores_for(query)
+      query_tokens = tokenize(query)
+      corpus = Set.new
+      corpus_scores = {}
+      query_scores = []
+      num_docs = @accounts.length
+      query_tokens.each do |n|
+        (token, _count) = n
+        next unless @tokens[token]
+        corpus = corpus.union(Set.new(@tokens[token].keys))
+      end
-      ## First, calculate scores for our query as we're building scores for the corpus
-      query_scores << calc_tf_idf(
-        count,
-        query_tokens.length,
-        @tokens[token].length,
-        num_docs
-      )
+      query_tokens.each do |n|
+        (token, count) = n
-      ## Next, calculate for the corpus, where our "account" is a document
-      corpus.each do |account|
-        corpus_scores[account] ||= []
+        # if no other docs have token, ignore it
+        next unless @tokens[token]
-        corpus_scores[account] << calc_tf_idf(
-          (@tokens[token][account] || 0),
-          @accounts[account].to_f,
-          @tokens[token].length.to_f,
+        ## First, calculate scores for our query as we're building scores for the corpus
+        query_scores << calc_tf_idf(
+          count,
+          query_tokens.length,
+          @tokens[token].length,
           num_docs
         )
+        ## Next, calculate for the corpus, where our "account" is a document
+        corpus.each do |account|
+          corpus_scores[account] ||= []
+          corpus_scores[account] << calc_tf_idf(
+            (@tokens[token][account] || 0),
+            @accounts[account].to_f,
+            @tokens[token].length.to_f,
+            num_docs
+          )
+        end
       end
+      [query_scores, corpus_scores]
     end
-    [query_scores, corpus_scores]
-  end
-  def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
+    def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
-    # tf(t,d) = count of t in d / number of words in d
-    tf = token_count / num_words_in_doc.to_f
+      # tf(t,d) = count of t in d / number of words in d
+      tf = token_count / num_words_in_doc.to_f
-    # smooth idf weight
-    # see https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency_2
-    # df(t) = num of documents with term t in them
-    # idf(t) = log(N/(1 + df )) + 1
-    idf = Math.log(num_docs.to_f / (1 + df)) + 1
+      # smooth idf weight
+      # see https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency_2
+      # df(t) = num of documents with term t in them
+      # idf(t) = log(N/(1 + df )) + 1
+      idf = Math.log(num_docs.to_f / (1 + df)) + 1
-    tf * idf
-  end
+      tf * idf
+    end
-  def tokenize(str)
-    mk_tokens(str).inject(Hash.new(0)) do |memo, n|
-      memo[n] += 1
-      memo
-    end.to_a
-  end
-end
+    def tokenize(str)
+      mk_tokens(str).inject(Hash.new(0)) do |memo, n|
+        memo[n] += 1
+        memo
+      end.to_a
+    end
-def mk_tokens(str)
-  str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
+    def mk_tokens(str)
+      str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
+    end
+  end
 end

data/lib/reckon/csv_parser.rb CHANGED

@@ -53,7 +53,12 @@ module Reckon
     end
     def description_for(index)
-      description_column_indices.map { |i| columns[i][index] }.reject(&:empty?).join("; ").squeeze(" ").gsub(/(;\s+){2,}/, '').strip
+      description_column_indices.map { |i| columns[i][index].to_s.strip }
+        .reject(&:empty?)
+        .join("; ")
+        .squeeze(" ")
+        .gsub(/(;\s+){2,}/, '')
+        .strip
     end
     def row(index)
@@ -84,12 +89,7 @@ module Reckon
           money_score += Money::likelihood( entry )
           possible_neg_money_count += 1 if entry =~ /^\$?[\-\(]\$?\d+/
           possible_pos_money_count += 1 if entry =~ /^\+?\$?\+?\d+/
-          date_score += 10 if entry =~ /\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/i
-          date_score += 5 if entry =~ /^[\-\/\.\d:\[\]]+$/
-          date_score += entry.gsub(/[^\-\/\.\d:\[\]]/, '').length if entry.gsub(/[^\-\/\.\d:\[\]]/, '').length > 3
-          date_score -= entry.gsub(/[\-\/\.\d:\[\]]/, '').length
-          date_score += 30 if entry =~ /^\d+[:\/\.-]\d+[:\/\.-]\d+([ :]\d+[:\/\.]\d+)?$/
-          date_score += 10 if entry =~ /^\d+\[\d+:GMT\]$/i
+          date_score += DateColumn.likelihood(entry)
           # Try to determine if this is a balance column
           entry_as_num = entry.gsub(/[^\-\d\.]/, '').to_f
@@ -163,7 +163,7 @@ module Reckon
       results = evaluate_columns(columns)
       if options[:money_column]
-        self.money_column_indices = [ options[:money_column] - 1 ]
+        self.money_column_indices = [options[:money_column] - 1]
       else
         self.money_column_indices = results.select { |n| n[:is_money_column] }.map { |n| n[:index] }
         if self.money_column_indices.length == 1

data/lib/reckon/date_column.rb CHANGED

@@ -56,5 +56,15 @@ module Reckon
       date.iso8601
     end
+    def self.likelihood(entry)
+      date_score = 0
+      date_score += 10 if entry =~ /\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/i
+      date_score += 5 if entry =~ /^[\-\/\.\d:\[\]]+$/
+      date_score += entry.gsub(/[^\-\/\.\d:\[\]]/, '').length if entry.gsub(/[^\-\/\.\d:\[\]]/, '').length > 3
+      date_score -= entry.gsub(/[\-\/\.\d:\[\]]/, '').length
+      date_score += 30 if entry =~ /^\d+[:\/\.-]\d+[:\/\.-]\d+([ :]\d+[:\/\.]\d+)?$/
+      date_score += 10 if entry =~ /^\d+\[\d+:GMT\]$/i
+      return date_score
+    end
   end
 end

data/lib/reckon/ledger_parser.rb CHANGED

@@ -121,8 +121,14 @@ module Reckon
     def parse(ledger)
       @entries = []
       new_entry = {}
+      in_comment = false
+      comment_chars = ';#%*|'
       ledger.strip.split("\n").each do |entry|
-        next if entry =~ /^\s*$/ || entry =~ /^\s*;/
+        # strip comment lines
+        in_comment = true if entry == 'comment'
+        in_comment = false if entry == 'end comment'
+        next if in_comment
+        next if entry =~ /^\s*[#{comment_chars}]/
         # (date, type, code, description), type and code are optional
         if (m = entry.match(%r{^(\d+[\d/-]+)\s+([*!])?\s*(\([^)]+\))?\s*(.*)$}))
@@ -134,7 +140,11 @@ module Reckon
             desc: m[4].strip,
             accounts: []
           }
+        elsif entry =~ /^\s*$/ && new_entry[:date]
+          add_entry(new_entry)
+          new_entry = {}
         elsif new_entry[:date] && entry =~ /^\s+/
+          LOGGER.info("Adding new account #{entry}")
           new_entry[:accounts] << parse_account_line(entry)
         else
           LOGGER.info("Unknown entry type: #{entry}")

data/lib/reckon/logger.rb ADDED

@@ -0,0 +1,4 @@
+module Reckon
+  LOGGER = Logger.new(STDERR)
+  LOGGER.level = Logger::WARN
+end

data/lib/reckon/money.rb CHANGED

@@ -5,12 +5,13 @@ module Reckon
   class Money
     include Comparable
     attr_accessor :amount, :currency, :suffixed
-    def initialize( amount, options = {} )
-      if options[:inverse]
-        @amount = -1*amount.to_f
-      else
-        @amount = amount.to_f
-      end
+    def initialize(amount, options = {})
+      @options = options
+      @amount_raw = amount
+      @raw = options[:raw]
+      @amount = parse(amount, options)
+      @amount = -@amount if options[:inverse]
       @currency = options[:currency] || "$"
       @suffixed = options[:suffixed]
     end
@@ -19,11 +20,19 @@ module Reckon
       return @amount
     end
+    def to_s
+      return @options[:raw] ? "#{@amount_raw} | #{@amount}" : @amount
+    end
+    # unary minus
+    # ex
+    # m = Money.new
+    # -m
     def -@
-      Money.new( -@amount, :currency => @currency, :suffixed => @suffixed )
+      Money.new(-@amount, :currency => @currency, :suffixed => @suffixed)
     end
-    def <=>( mon )
+    def <=>(mon)
       other_amount = mon.to_f
       if @amount < other_amount
         -1
@@ -34,7 +43,13 @@ module Reckon
       end
     end
-    def pretty( negate = false )
+    def pretty(negate = false)
+      if @raw
+        return @amount_raw unless negate
+        return @amount_raw[0] == '-' ? @amount_raw[1..-1] : "-#{@amount_raw}"
+      end
       if @suffixed
         (@amount >= 0 ? " " : "") + sprintf("%0.2f #{@currency}", @amount * (negate ? -1 : 1))
       else
@@ -42,34 +57,20 @@ module Reckon
       end
     end
-    def Money::from_s( value, options = {} )
+    def parse(value, options = {})
+      value = value.to_s
       # Empty string is treated as money with value 0
-      return Money.new( 0.00, options ) if value.empty?
-      # Remove 1000 separaters and replace , with . if comma_separates_cents
-      # 1.000,00 -> 1000.00
-      value = value.gsub(/\./, '').gsub(/,/, '.') if options[:comma_separates_cents]
-      value = value.gsub(/,/, '')
-      money_format_regex = /^(.*?)(\d+\.\d\d)/ # Money has two decimal precision
-      any_number_regex = /^(.*?)([\d\.]+)/
-      # Prefer matching the money_format, match any number otherwise
-      m = value.match( money_format_regex ) ||
-        value.match( any_number_regex )
-      if m
-        amount = m[2].to_f
-        # Check whether the money had a - or (, which indicates negative amounts
-        if (m[1].match( /^[\(-]/ ) || m[1].match( /-$/  ))
-          amount *= -1
-        end
-        return Money.new( amount, options )
-      else
-        return nil
-      end
+      return value.to_f if value.to_s.empty?
+      invert = value.match(/^\(.*\)$/)
+      value = value.gsub(/[^0-9,.-]/, '')
+      value = value.tr('.', '').tr(',', '.') if options[:comma_separates_cents]
+      value = value.tr(',', '')
+      value = value.to_f
+      return invert ? -value : value
     end
-    def Money::likelihood( entry )
+    def Money::likelihood(entry)
       money_score = 0
       # digits separated by , or . with no more than 2 trailing digits
       money_score += 40 if entry.match(/\d+[,.]\d{2}[^\d]*$/)
@@ -83,31 +84,30 @@ module Reckon
   end
   class MoneyColumn < Array
-    def initialize( arr = [], options = {} )
-      arr.each { |str| self.push( Money.from_s( str, options ) ) }
+    def initialize(arr = [], options = {})
+      arr.each { |str| push(Money.new(str, options)) }
     end
     def positive?
-      self.each do |money|
-        return false if money < 0 if money
+      each do |money|
+        return false if money && money < 0
       end
       true
     end
-    def merge!( other_column )
+    def merge!(other_column)
       invert = false
-      invert = true if self.positive? && other_column.positive?
-      self.each_with_index do |mon, i|
+      invert = true if positive? && other_column.positive?
+      each_with_index do |mon, i|
         other = other_column[i]
-        return nil if (!mon || !other)
-        if mon != 0.00 && other == 0.0
-          if invert
-            self[i]= -mon
-          end
-        elsif mon == 0.00 && other != 0.00
+        return nil if !mon || !other
+        if mon != 0.0 && other == 0.0
+          self[i] = -mon if invert
+        elsif mon == 0.0 && other != 0.0
           self[i] = other
         else
-          return nil
+          self[i] = Money.new(0)
         end
       end
       self

data/lib/reckon/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Reckon
-  VERSION = "0.5.2"
+  VERSION="0.6.2"
 end

data/reckon.gemspec CHANGED

@@ -13,7 +13,7 @@ Gem::Specification.new do |s|
   s.files         = `git ls-files`.split("\n")
   s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
-  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
   s.require_paths = ["lib"]
   s.add_development_dependency "rspec", ">= 1.2.9"
@@ -21,6 +21,5 @@ Gem::Specification.new do |s|
   s.add_development_dependency "rantly", "= 1.2.0"
   s.add_runtime_dependency "chronic", ">= 0.3.0"
   s.add_runtime_dependency "highline", ">= 1.5.2"
-  s.add_runtime_dependency "terminal-table", ">= 1.4.2"
   s.add_runtime_dependency "rchardet", ">= 1.8.0"
 end