RubyGems - reckon - Versions diffs - 0.4.4 → 0.5.0 - Mend

reckon 0.4.4 → 0.5.0

Files changed (43) hide show

checksums.yaml +5 -5
data/.ruby-version +1 -1
data/.travis.yml +10 -2
data/CHANGELOG.md +197 -0
data/Gemfile +0 -1
data/Gemfile.lock +33 -15
data/README.md +2 -5
data/lib/reckon.rb +10 -8
data/lib/reckon/app.rb +92 -116
data/lib/reckon/cosine_similarity.rb +119 -0
data/lib/reckon/csv_parser.rb +57 -27
data/lib/reckon/ledger_parser.rb +194 -30
data/lib/reckon/money.rb +3 -4
data/reckon.gemspec +6 -5
data/spec/data_fixtures/73-sample.csv +2 -0
data/spec/data_fixtures/73-tokens.yml +8 -0
data/spec/data_fixtures/73-transactions.ledger +7 -0
data/spec/data_fixtures/austrian_example.csv +13 -0
data/spec/data_fixtures/bom_utf8_file.csv +1 -0
data/spec/data_fixtures/broker_canada_example.csv +12 -0
data/spec/data_fixtures/chase.csv +9 -0
data/spec/data_fixtures/danish_kroner_nordea_example.csv +6 -0
data/spec/data_fixtures/english_date_example.csv +3 -0
data/spec/data_fixtures/french_example.csv +9 -0
data/spec/data_fixtures/german_date_example.csv +3 -0
data/spec/data_fixtures/harder_date_example.csv +5 -0
data/spec/data_fixtures/ing.csv +3 -0
data/spec/data_fixtures/intuit_mint_example.csv +7 -0
data/spec/data_fixtures/invalid_header_example.csv +6 -0
data/spec/data_fixtures/inversed_credit_card.csv +16 -0
data/spec/data_fixtures/nationwide.csv +4 -0
data/spec/data_fixtures/simple.csv +2 -0
data/spec/data_fixtures/some_other.csv +9 -0
data/spec/data_fixtures/spanish_date_example.csv +3 -0
data/spec/data_fixtures/suntrust.csv +7 -0
data/spec/data_fixtures/two_money_columns.csv +5 -0
data/spec/data_fixtures/yyyymmdd_date_example.csv +1 -0
data/spec/reckon/app_spec.rb +66 -34
data/spec/reckon/csv_parser_spec.rb +79 -201
data/spec/reckon/ledger_parser_spec.rb +62 -9
data/spec/spec_helper.rb +3 -0
metadata +62 -19
data/CHANGES.md +0 -9

data/lib/reckon/cosine_similarity.rb ADDED

@@ -0,0 +1,119 @@
+require 'matrix'
+# Implementation of cosine similarity using TF-IDF for vectorization.
+# Used to suggest which account a transaction should be assigned to
+class CosineSimilarity
+  def initialize(options)
+    @options = options
+    @tokens = {}
+    @accounts = Hash.new(0)
+  end
+  def add_document(account, doc)
+    tokenize(doc).each do |n|
+      (token, count) = n
+      @tokens[token] ||= {}
+      @tokens[token][account] ||= 0
+      @tokens[token][account] += count
+      @accounts[account] += count
+    end
+  end
+  # find most similar documents to query
+  def find_similar(query)
+    (query_scores, corpus_scores) = td_idf_scores_for(query)
+    query_vector = Vector.elements(query_scores, false)
+    # For each doc, calculate the similarity to the query
+    suggestions = corpus_scores.map do |account, scores|
+      acct_vector = Vector.elements(scores, false)
+      acct_query_dp = acct_vector.inner_product(query_vector)
+      # similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
+      # exactly opposite
+      # see https://en.wikipedia.org/wiki/Cosine_similarity
+      # cos(theta) = (A . B) / (||A|| ||B||)
+      # where A . B is the "dot product" and ||A|| is the magnitude of A
+      # ruby has the 'matrix' library we can use to do these calculations.
+      {
+        similarity: acct_query_dp / (acct_vector.magnitude * query_vector.magnitude),
+        account: account,
+      }
+    end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
+    LOGGER.info "most similar accounts: #{suggestions}"
+    return suggestions
+  end
+  private
+  def td_idf_scores_for(query)
+    query_tokens = tokenize(query)
+    corpus = Set.new
+    corpus_scores = {}
+    query_scores = []
+    num_docs = @accounts.length
+    query_tokens.each do |n|
+      (token, _count) = n
+      next unless @tokens[token]
+      corpus = corpus.union(Set.new(@tokens[token].keys))
+    end
+    query_tokens.each do |n|
+      (token, count) = n
+      # if no other docs have token, ignore it
+      next unless @tokens[token]
+      ## First, calculate scores for our query as we're building scores for the corpus
+      query_scores << calc_tf_idf(
+        count,
+        query_tokens.length,
+        @tokens[token].length,
+        num_docs
+      )
+      ## Next, calculate for the corpus, where our "account" is a document
+      corpus.each do |account|
+        corpus_scores[account] ||= []
+        corpus_scores[account] << calc_tf_idf(
+          (@tokens[token][account] || 0),
+          @accounts[account].to_f,
+          @tokens[token].length.to_f,
+          num_docs
+        )
+      end
+    end
+    [query_scores, corpus_scores]
+  end
+  def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
+    # tf(t,d) = count of t in d / number of words in d
+    tf = token_count / num_words_in_doc.to_f
+    # smooth idf weight
+    # see https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency_2
+    # df(t) = num of documents with term t in them
+    # idf(t) = log(N/(1 + df )) + 1
+    idf = Math.log(num_docs.to_f / (1 + df)) + 1
+    tf * idf
+  end
+  def tokenize(str)
+    mk_tokens(str).inject(Hash.new(0)) do |memo, n|
+      memo[n] += 1
+      memo
+    end.to_a
+  end
+end
+def mk_tokens(str)
+  str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
+end

data/lib/reckon/csv_parser.rb CHANGED

@@ -1,5 +1,4 @@
 #coding: utf-8
-require 'pp'
 module Reckon
   class CSVParser
@@ -8,7 +7,7 @@ module Reckon
     def initialize(options = {})
       self.options = options
       self.options[:currency] ||= '$'
-      parse
+      @csv_data = parse(options[:string] || File.read(options[:file]), options[:file])
       filter_csv
       detect_columns
     end
@@ -44,7 +43,7 @@ module Reckon
     end
     def description_for(index)
-      description_column_indices.map { |i| columns[i][index] }.reject { |a| a.empty? }.join("; ").squeeze(" ").gsub(/(;\s+){2,}/, '').strip
+      description_column_indices.map { |i| columns[i][index] }.reject(&:empty?).join("; ").squeeze(" ").gsub(/(;\s+){2,}/, '').strip
     end
     def evaluate_columns(cols)
@@ -160,7 +159,12 @@ module Reckon
     def detect_columns
       results, found_likely_money_column = evaluate_columns(columns)
-      self.money_column_indices = [ results.sort { |a, b| b[:money_score] <=> a[:money_score] }.first[:index] ]
+      if options[:money_column]
+        found_likely_money_column = true
+        self.money_column_indices = [ options[:money_column] - 1 ]
+      else
+        self.money_column_indices = [ results.max_by { |n| n[:money_score] }[:index] ]
+      end
       if !found_likely_money_column
         found_likely_double_money_columns = false
@@ -192,20 +196,21 @@ module Reckon
         end
       end
-      results.reject! {|i| money_column_indices.include?(i[:index]) }
-      self.date_column_index = results.sort { |a, b| b[:date_score] <=> a[:date_score] }.first[:index]
-      results.reject! {|i| i[:index] == date_column_index }
-      @date_column = DateColumn.new( columns[ self.date_column_index ], @options )
+      results.reject! { |i| money_column_indices.include?(i[:index]) }
+      if options[:date_column]
+        @date_column_index = options[:date_column] - 1
+      else
+        # sort by highest score followed by lowest index
+        @date_column_index = results.max_by { |n| [n[:date_score], -n[:index]] }[:index]
+      end
+      results.reject! { |i| i[:index] == date_column_index }
+      @date_column = DateColumn.new(columns[date_column_index], @options)
-      if ( money_column_indices.length == 1 )
-        @money_column = MoneyColumn.new( columns[money_column_indices[0]],
-                                        @options )
+      @money_column = MoneyColumn.new(columns[money_column_indices[0]], @options)
+      if money_column_indices.length == 1
         detect_sign_column if @money_column.positive?
       else
-        @money_column = MoneyColumn.new( columns[money_column_indices[0]],
-                                        @options )
-        @money_column.merge!(
-          MoneyColumn.new( columns[money_column_indices[1]], @options ) )
+        @money_column.merge! MoneyColumn.new(columns[money_column_indices[1]], @options)
       end
       self.description_column_indices = results.map { |i| i[:index] }
@@ -228,21 +233,46 @@ module Reckon
       end
     end
-    def parse
-      data = options[:string] || File.read(options[:file])
-      if RUBY_VERSION =~ /^1\.9/ || RUBY_VERSION =~ /^2/
-        data = data.force_encoding(options[:encoding] || 'BINARY').encode('UTF-8', :invalid => :replace, :undef => :replace, :replace => '?')
-        csv_engine = CSV
-      else
-        csv_engine = FasterCSV
+    def parse(data, filename=nil)
+      # Use force_encoding to convert the string to utf-8 with as few invalid characters
+      # as possible.
+      data.force_encoding(try_encoding(data, filename))
+      data = data.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
+      data.sub!("\xEF\xBB\xBF", '') # strip byte order marker, if it exists
+      rows = []
+      data.each_line.with_index do |line, i|
+        next if i < (options[:contains_header] || 0)
+        rows << CSV.parse_line(line, col_sep: options[:csv_separator] || ',')
       end
-      @csv_data = csv_engine.parse data.strip, :col_sep => options[:csv_separator] || ','
-      if options[:contains_header]
-        options[:contains_header].times { csv_data.shift }
+      rows
+    end
+    def try_encoding(data, filename = nil)
+      encoding = try_encoding_from_file(filename)
+      cd = CharDet.detect(data)
+      encoding ||= cd['encoding']
+      encoding ||= 'BINARY'
+      LOGGER.info("suggested file encoding: #{encoding}")
+      options[:encoding] || encoding
+    end
+    def try_encoding_from_file(filename = nil)
+      return unless filename
+      m = nil
+      os = Gem::Platform.local.os
+      if os == 'linux'
+        m = `file -i #{filename}`.match(/charset=(\S+)/)
+      elsif os == 'darwin'
+        m = `file -I #{filename}`.match(/charset=(\S+)/)
       end
-      csv_data
+      m && m[1]
     end
     @settings = { :testing => false }

data/lib/reckon/ledger_parser.rb CHANGED

@@ -1,4 +1,109 @@
 #!/usr/bin/env ruby
+# frozen_string_literal: true
+# From: https://www.ledger-cli.org/3.0/doc/ledger3.html#Transactions-and-Comments
+#
+# The ledger file format is quite simple, but also very flexible. It supports many
+# options, though typically the user can ignore most of them. They are summarized below.
+#
+# The initial character of each line determines what the line means, and how it should
+# be interpreted. Allowable initial characters are:
+#
+# NUMBER
+#     A line beginning with a number denotes an entry. It may be followed by any
+#     number of lines, each beginning with whitespace, to denote the entry's account
+#     transactions. The format of the first line is:
+#
+#     DATE[=EDATE] [*|!] [(CODE)] DESC
+#
+#     If '*' appears after the date (with optional effective date), it indicates the
+#     entry is "cleared", which can mean whatever the user wants it to mean. If '!'
+#     appears after the date, it indicates the entry is "pending"; i.e., tentatively
+#     cleared from the user's point of view, but not yet actually cleared. If a 'CODE'
+#     appears in parentheses, it may be used to indicate a check number, or the type of
+#     the transaction. Following these is the payee, or a description of the
+#     transaction.
+#
+#     The format of each following transaction is:
+#
+#       ACCOUNT  AMOUNT  [; NOTE]
+#
+#     The 'ACCOUNT' may be surrounded by parentheses if it is a virtual transaction, or
+#     square brackets if it is a virtual transaction that must balance. The 'AMOUNT'
+#     can be followed by a per-unit transaction cost, by specifying '@ AMOUNT', or a
+#     complete transaction cost with '@@ AMOUNT'. Lastly, the 'NOTE' may specify an
+#     actual and/or effective date for the transaction by using the syntax
+#     '[ACTUAL_DATE]' or '[=EFFECTIVE_DATE]' or '[ACTUAL_DATE=EFFECTIVE_DATE]'.
+# =
+#     An automated entry. A value expression must appear after the equal sign.
+#
+#     After this initial line there should be a set of one or more transactions, just as
+#     if it were a normal entry. If the amounts of the transactions have no commodity,
+#     they will be applied as modifiers to whichever real transaction is matched by the
+#     value expression.
+# ~
+#     A period entry. A period expression must appear after the tilde.
+#
+#     After this initial line there should be a set of one or more transactions, just as
+#     if it were a normal entry.
+# !
+#     A line beginning with an exclamation mark denotes a command directive. It must be
+#     immediately followed by the command word. The supported commands are:
+#
+#     '!include'
+#         Include the stated ledger file.
+#
+#     '!account'
+#         The account name given is taken to be the parent of all transactions that
+#         follow, until '!end' is seen.
+#
+#     '!end'
+#         Ends an account block.
+#
+# ;
+#     A line beginning with a semicolon indicates a comment, and is ignored.
+# Y
+#     If a line begins with a capital Y, it denotes the year used for all subsequent
+#     entries that give a date without a year. The year should appear immediately after
+#     the Y, for example: 'Y2004'. This is useful at the beginning of a file, to specify
+#     the year for that file. If all entries specify a year, however, this command has
+#     no effect.
+#
+# P
+#     Specifies a historical price for a commodity. These are usually found in a pricing
+#     history file (see the -Q option). The syntax is:
+#
+#     P DATE SYMBOL PRICE
+#
+# N SYMBOL
+#     Indicates that pricing information is to be ignored for a given symbol, nor will
+#     quotes ever be downloaded for that symbol. Useful with a home currency, such as
+#     the dollar ($). It is recommended that these pricing options be set in the price
+#     database file, which defaults to ~/.pricedb. The syntax for this command is:
+#
+#     N SYMBOL
+#
+# D AMOUNT
+#     Specifies the default commodity to use, by specifying an amount in the expected
+#     format. The entry command will use this commodity as the default when none other
+#     can be determined. This command may be used multiple times, to set the default
+#     flags for different commodities; whichever is seen last is used as the default
+#     commodity. For example, to set US dollars as the default commodity, while also
+#     setting the thousands flag and decimal flag for that commodity, use:
+#
+#     D $1,000.00
+#
+# C AMOUNT1 = AMOUNT2
+#     Specifies a commodity conversion, where the first amount is given to be equivalent
+#     to the second amount. The first amount should use the decimal precision desired
+#     during reporting:
+#
+#     C 1.00 Kb = 1024 bytes
+#
+# i, o, b, h
+#     These four relate to timeclock support, which permits ledger to read timelog
+#     files. See the timeclock's documentation for more info on the syntax of its
+#     timelog files.
 require 'rubygems'
@@ -8,54 +113,113 @@ module Reckon
     attr_accessor :entries
     def initialize(ledger, options = {})
-      @entries = []
+      @options = options
+      @date_format = options[:date_format] || '%Y-%m-%d'
       parse(ledger)
     end
     def parse(ledger)
       @entries = []
-      date = desc = nil
-      accounts = []
+      new_entry = {}
       ledger.strip.split("\n").each do |entry|
-        next if entry =~ /^\s*$/ || entry =~ /^[^ \t\d]/
-        if entry =~ /^([\d\/-]+)(\=[\d\/-]+)?(\s+[\*!]?\s*.*?)$/
-          @entries << { :date => date.strip, :desc => desc.strip, :accounts => balance(accounts) } if date
-          date = $1
-          desc = $3
-          accounts = []
-        elsif date && entry =~ /^\s+([a-z\s:_\-]+)(\s*$|(\s+[\$\.,\-\d\+]+)($|\s+($|[^\$\.,\-\d\+])))/i
-          accounts << { :name => $1.strip, :amount => clean_money($3) }
+        next if entry =~ /^\s*$/ || entry =~ /^\s*;/
+        # (date, type, code, description), type and code are optional
+        if (m = entry.match(%r{^(\d+[\d/-]+)\s+([*!])?\s*(\([^)]+\))?\s*(.*)$}))
+          add_entry(new_entry)
+          new_entry = {
+            date: try_parse_date(m[1]),
+            type: m[2] || "",
+            code: m[3] && m[3].tr('()', '') || "",
+            desc: m[4].strip,
+            accounts: []
+          }
+        elsif new_entry[:date] && entry =~ /^\s+/
+          new_entry[:accounts] << parse_account_line(entry)
         else
-          @entries << { :date => date.strip, :desc => desc.strip, :accounts => balance(accounts) } if date
-          date = desc = nil
-          accounts = []
+          LOGGER.info("Unknown entry type: #{entry}")
+          add_entry(new_entry)
+          new_entry = {}
         end
       end
-      @entries << { :date => date.strip, :desc => desc.strip, :accounts => balance(accounts) } if date
+      add_entry(new_entry)
     end
-    def balance(accounts)
-      if accounts.any? { |i| i[:amount].nil? }
-        sum = accounts.inject(0) {|m, account| m + (account[:amount] || 0) }
-        count = 0
-        accounts.each do |account|
-          if account[:amount].nil?
-            count += 1
-            account[:amount] = 0 - sum
-          end
-        end
-        if count > 1
-          puts "Warning: unparsable entry due to more than one missing money value."
-          p accounts
-          puts
+    # roughly matches ledger csv format
+    def to_csv
+      return @entries.flat_map do |n|
+        n[:accounts].map do |a|
+          row = [
+            n[:date].strftime(@date_format),
+            n[:code],
+            n[:desc],
+            a[:name],
+            "", # currency (not implemented)
+            a[:amount],
+            n[:type],
+            "", # account comment (not implemented)
+          ]
+          CSV.generate_line(row).strip
         end
       end
+    end
+    private
+    def add_entry(entry)
+      return unless entry[:date] && entry[:accounts].length > 1
+      entry[:accounts] = balance(entry[:accounts])
+      @entries << entry
+    end
+    def try_parse_date(date_str)
+      date = Date.parse(date_str)
+      return nil if date.year > 9999 || date.year < 1000
+      date
+    rescue ArgumentError
+      nil
+    end
+    def parse_account_line(entry)
+      (account_name, rest) = entry.strip.split(/\s{2,}|\t+/, 2)
+      return {
+        name: account_name,
+        amount: clean_money("")
+      } if rest.nil? || rest.empty?
+      (value, _comment) = rest.split(/;/)
+      return {
+        name: account_name,
+        amount: clean_money(value || "")
+      }
+    end
+    def balance(accounts)
+      return accounts unless accounts.any? { |i| i[:amount].nil? }
+      sum = accounts.reduce(0) { |m, n| m + (n[:amount] || 0) }
+      count = 0
+      accounts.each do |account|
+        next unless account[:amount].nil?
+        count += 1
+        account[:amount] = -sum
+      end
+      if count > 1
+        puts "Warning: unparsable entry due to more than one missing money value."
+        p accounts
+        puts
+      end
       accounts
     end
     def clean_money(money)
-      return nil if money.nil? || money.length == 0
+      return nil if money.nil? || money.empty?
       money.gsub(/[^0-9.-]/, '').to_f
     end
   end