RubyGems - reckon - Versions diffs - 0.6.2 → 0.8.0 - Mend

reckon 0.6.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

checksums.yaml +4 -4
data/.github/workflows/ruby.yml +4 -4
data/.gitignore +1 -0
data/CHANGELOG.md +54 -3
data/Gemfile.lock +1 -1
data/README.md +23 -19
data/Rakefile +2 -2
data/bin/build-new-version.sh +26 -0
data/bin/reckon +4 -1
data/lib/reckon.rb +1 -0
data/lib/reckon/app.rb +13 -150
data/lib/reckon/cosine_similarity.rb +67 -62
data/lib/reckon/date_column.rb +3 -2
data/lib/reckon/ledger_parser.rb +1 -1
data/lib/reckon/money.rb +12 -5
data/lib/reckon/options.rb +157 -0
data/lib/reckon/version.rb +1 -1
data/spec/cosine_training_and_test.rb +52 -0
data/spec/integration/another_bank_example/output.ledger +3 -3
data/spec/integration/ask_for_account/cli_input.exp +33 -0
data/spec/integration/ask_for_account/expected_output +11 -0
data/spec/integration/ask_for_account/input.csv +9 -0
data/spec/integration/ask_for_account/test_args +1 -0
data/spec/integration/broker_canada_example/output.ledger +2 -2
data/spec/integration/chase/account_tokens_and_regex/output.ledger +3 -3
data/spec/integration/chase/default_account_names/output.ledger +3 -3
data/spec/integration/chase/learn_from_existing/output.ledger +3 -3
data/spec/integration/chase/simple/output.ledger +3 -3
data/spec/integration/danish_kroner_nordea_example/output.ledger +1 -1
data/spec/integration/extratofake/output.ledger +1 -1
data/spec/integration/harder_date_example/output.ledger +2 -2
data/spec/integration/invalid_header_example/test_args +1 -1
data/spec/integration/ledger_date_format/compare_cmds +1 -0
data/spec/integration/ledger_date_format/input.csv +3 -0
data/spec/integration/ledger_date_format/output.ledger +12 -0
data/spec/integration/ledger_date_format/test_args +1 -0
data/spec/integration/test.sh +78 -27
data/spec/reckon/app_spec.rb +21 -19
data/spec/reckon/csv_parser_spec.rb +3 -3
data/spec/reckon/date_column_spec.rb +12 -0
data/spec/reckon/money_spec.rb +3 -3
data/spec/reckon/options_spec.rb +17 -0
data/spec/spec_helper.rb +6 -1
metadata +15 -2

data/lib/reckon/cosine_similarity.rb CHANGED Viewed

@@ -1,47 +1,52 @@
 require 'matrix'
 require 'set'
-# Implementation of consine similarity using TF-IDF for vectorization.
-# Used to suggest which account a transaction should be assigned to
+# Implementation of cosine similarity using TF-IDF for vectorization.
+#
+# In information retrieval, tf–idf, short for term frequency–inverse document frequency,
+# is a numerical statistic that is intended to reflect how important a word is to a
+# document in a collection or corpus.
+#
+# Cosine similarity is a measurement of how similar 2 documents are to each other.
+#
+# These weights and measures are used to suggest which account a transaction should be
+# assigned to.
 module Reckon
   class CosineSimilarity
+    DocumentInfo = Struct.new(:tokens, :accounts)
     def initialize(options)
+      @docs = DocumentInfo.new({}, {})
       @options = options
-      @tokens = {}
-      @accounts = Hash.new(0)
     end
     def add_document(account, doc)
-      tokenize(doc).each do |n|
+      tokens = tokenize(doc)
+      LOGGER.info "doc tokens: #{tokens}"
+      tokens.each do |n|
         (token, count) = n
-        @tokens[token] ||= {}
-        @tokens[token][account] ||= 0
-        @tokens[token][account] += count
-        @accounts[account] += count
+        @docs.tokens[token] ||= Hash.new(0)
+        @docs.tokens[token][account] += count
+        @docs.accounts[account] ||= Hash.new(0)
+        @docs.accounts[account][token] += count
       end
     end
     # find most similar documents to query
     def find_similar(query)
-      (query_scores, corpus_scores) = td_idf_scores_for(query)
+      LOGGER.info "find_similar #{query}"
-      query_vector = Vector.elements(query_scores, false)
+      accounts = docs_to_check(query).map do |a|
+        [a, tfidf(@docs.accounts[a])]
+      end
-      # For each doc, calculate the similarity to the query
-      suggestions = corpus_scores.map do |account, scores|
-        acct_vector = Vector.elements(scores, false)
+      q = tfidf(tokenize(query))
-        acct_query_dp = acct_vector.inner_product(query_vector)
-        # similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
-        # exactly opposite
-        # see https://en.wikipedia.org/wiki/Cosine_similarity
-        # cos(theta) = (A . B) / (||A|| ||B||)
-        # where A . B is the "dot product" and ||A|| is the magnitude of A
-        # ruby has the 'matrix' library we can use to do these calculations.
+      suggestions = accounts.map do |a, d|
         {
-          similarity: acct_query_dp / (acct_vector.magnitude * query_vector.magnitude),
-          account: account,
+          similarity: calc_similarity(q, d),
+          account: a
         }
       end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
@@ -52,50 +57,51 @@ module Reckon
     private
-    def td_idf_scores_for(query)
-      query_tokens = tokenize(query)
-      corpus = Set.new
-      corpus_scores = {}
-      query_scores = []
-      num_docs = @accounts.length
-      query_tokens.each do |n|
-        (token, _count) = n
-        next unless @tokens[token]
-        corpus = corpus.union(Set.new(@tokens[token].keys))
+    def docs_to_check(query)
+      return tokenize(query).reduce(Set.new) do |corpus, t|
+        corpus.union(Set.new(@docs.tokens[t[0]]&.keys))
       end
+    end
-      query_tokens.each do |n|
-        (token, count) = n
-        # if no other docs have token, ignore it
-        next unless @tokens[token]
+    def tfidf(tokens)
+      scores = {}
-        ## First, calculate scores for our query as we're building scores for the corpus
-        query_scores << calc_tf_idf(
-          count,
-          query_tokens.length,
-          @tokens[token].length,
-          num_docs
+      tokens.each do |t, n|
+        scores[t] = calc_tf_idf(
+          n,
+          tokens.length,
+          @docs.tokens[t]&.length&.to_f || 0,
+          @docs.accounts.length
         )
-        ## Next, calculate for the corpus, where our "account" is a document
-        corpus.each do |account|
-          corpus_scores[account] ||= []
-          corpus_scores[account] << calc_tf_idf(
-            (@tokens[token][account] || 0),
-            @accounts[account].to_f,
-            @tokens[token].length.to_f,
-            num_docs
-          )
-        end
       end
-      [query_scores, corpus_scores]
+      return scores
     end
-    def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
+    # Cosine similarity is used to compare how similar 2 documents are. Returns a float
+    # between 1 and -1, where 1 is exactly the same and -1 is exactly opposite.
+    #
+    # see https://en.wikipedia.org/wiki/Cosine_similarity
+    # cos(theta) = (A . B) / (||A|| ||B||)
+    # where A . B is the "dot product" and ||A|| is the magnitude of A
+    #
+    # A and B are vectors with one component per unique term in q and d.
+    #
+    # For example, when q = "big red balloon" and d = "small green balloon" then the
+    # terms are (big,red,balloon,small,green) and a = (1,1,1,0,0) and b =
+    # (0,0,1,1,1).
+    #
+    # query and doc are hashes of token => tf/idf score
+    def calc_similarity(query, doc)
+      tokens = Set.new(query.keys + doc.keys)
+      a = Vector.elements(tokens.map { |n| query[n] || 0 }, false)
+      b = Vector.elements(tokens.map { |n| doc[n] || 0 }, false)
+      return a.inner_product(b) / (a.magnitude * b.magnitude)
+    end
+    def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
       # tf(t,d) = count of t in d / number of words in d
       tf = token_count / num_words_in_doc.to_f
@@ -109,14 +115,13 @@ module Reckon
     end
     def tokenize(str)
-      mk_tokens(str).inject(Hash.new(0)) do |memo, n|
+      mk_tokens(str).each_with_object(Hash.new(0)) do |n, memo|
         memo[n] += 1
-        memo
       end.to_a
     end
     def mk_tokens(str)
-      str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
+      str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/).reject(&:empty?)
     end
   end
 end

data/lib/reckon/date_column.rb CHANGED Viewed

@@ -2,12 +2,13 @@ module Reckon
   class DateColumn < Array
     attr_accessor :endian_precedence
     def initialize( arr = [], options = {} )
+      @options = options
       arr.each do |value|
         if options[:date_format]
           begin
             value = Date.strptime(value, options[:date_format])
           rescue
-            puts "I'm having trouble parsing #{value} with the desired format: #{options[:date_format]}"
+            puts "I'm having trouble parsing '#{value}' with the desired format: #{options[:date_format]}"
             exit 1
           end
         else
@@ -53,7 +54,7 @@ module Reckon
       date = self.for(index)
       return "" if date.nil?
-      date.iso8601
+      date.strftime(@options[:ledger_date_format] || '%Y-%m-%d')
     end
     def self.likelihood(entry)

data/lib/reckon/ledger_parser.rb CHANGED Viewed

@@ -114,7 +114,7 @@ module Reckon
     def initialize(ledger, options = {})
       @options = options
-      @date_format = options[:date_format] || '%Y-%m-%d'
+      @date_format = options[:ledger_date_format] || options[:date_format] || '%Y-%m-%d'
       parse(ledger)
     end

data/lib/reckon/money.rb CHANGED Viewed

@@ -50,11 +50,18 @@ module Reckon
         return @amount_raw[0] == '-' ? @amount_raw[1..-1] : "-#{@amount_raw}"
       end
-      if @suffixed
-        (@amount >= 0 ? " " : "") + sprintf("%0.2f #{@currency}", @amount * (negate ? -1 : 1))
-      else
-        (@amount >= 0 ? " " : "") + sprintf("%0.2f", @amount * (negate ? -1 : 1)).gsub(/^((\-)|)(?=\d)/, "\\1#{@currency}")
-      end
+      amt = pretty_amount(@amount * (negate ? -1 : 1))
+      amt = if @suffixed
+              "#{amt} #{@currency}"
+            else
+              amt.gsub(/^((-)|)(?=\d)/, "\\1#{@currency}")
+            end
+      return (@amount >= 0 ? " " : "") + amt
+    end
+    def pretty_amount(amount)
+      sprintf("%0.2f", amount).reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse
     end
     def parse(value, options = {})

data/lib/reckon/options.rb ADDED Viewed

@@ -0,0 +1,157 @@
+module Reckon
+  # Turns reckon's command-line arguments into an options hash and prompts
+  # (through HighLine) for required values that were not supplied.
+  class Options
+    @@cli = HighLine.new
+    # Build the options hash from the command line.
+    #
+    # @param args [Array<String>] arguments to parse; handed to
+    #   OptionParser#parse!, which removes the switches it recognizes
+    # @param stdin [IO] stream the CSV is read from when --file is '-'
+    # @return [Hash] parsed options keyed by symbol
+    # @raise [RuntimeError] when '-' is given as the file without --unattended,
+    #   or when no account is given in unattended mode
+    def self.parse(args = ARGV, stdin = $stdin)
+      options = { output_file: $stdout }
+      # Keep a handle on the parser so its usage text can be printed further down.
+      parser = OptionParser.new do |opts|
+        opts.banner = "Usage: Reckon.rb [options]"
+        opts.separator ""
+        opts.on("-f", "--file FILE", "The CSV file to parse") do |file|
+          options[:file] = file
+        end
+        opts.on("-a", "--account NAME", "The Ledger Account this file is for") do |a|
+          options[:bank_account] = a
+        end
+        opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
+          options[:verbose] = v
+        end
+        opts.on("-i", "--inverse", "Use the negative of each amount") do |v|
+          options[:inverse] = v
+        end
+        opts.on("-p", "--print-table", "Print out the parsed CSV in table form") do |p|
+          options[:print_table] = p
+        end
+        opts.on("-o", "--output-file FILE", "The ledger file to append to") do |o|
+          options[:output_file] = File.open(o, 'a')
+        end
+        opts.on("-l", "--learn-from FILE", "An existing ledger file to learn accounts from") do |l|
+          options[:existing_ledger_file] = l
+        end
+        opts.on("", "--ignore-columns 1,2,5", "Columns to ignore, starts from 1") do |ignore|
+          options[:ignore_columns] = ignore.split(",").map(&:to_i)
+        end
+        opts.on("", "--money-column 2", Integer, "Column number of the money column, starts from 1") do |col|
+          options[:money_column] = col
+        end
+        opts.on("", "--raw-money", "Don't format money column (for stocks)") do |n|
+          options[:raw] = n
+        end
+        opts.on("", "--date-column 3", Integer, "Column number of the date column, starts from 1") do |col|
+          options[:date_column] = col
+        end
+        opts.on("", "--contains-header [N]", Integer, "Skip N header rows - default 1") do |hdr|
+          # N is optional: hdr is nil when the switch is given without a value,
+          # in which case the documented default of one header row applies.
+          options[:contains_header] = hdr || 1
+        end
+        opts.on("", "--csv-separator ','", "CSV separator (default ',')") do |sep|
+          options[:csv_separator] = sep
+        end
+        opts.on("", "--comma-separates-cents", "Use comma to separate cents ($100,50 vs. $100.50)") do |c|
+          options[:comma_separates_cents] = c
+        end
+        opts.on("", "--encoding 'UTF-8'", "Specify an encoding for the CSV file") do |e|
+          options[:encoding] = e
+        end
+        opts.on("-c", "--currency '$'", "Currency symbol to use - default $ (ex £, EUR)") do |e|
+          options[:currency] = e
+        end
+        opts.on("", "--date-format FORMAT", "CSV file date format (see `date` for format)") do |d|
+          options[:date_format] = d
+        end
+        opts.on("", "--ledger-date-format FORMAT", "Ledger date format (see `date` for format)") do |d|
+          options[:ledger_date_format] = d
+        end
+        opts.on("-u", "--unattended", "Don't ask questions and guess all the accounts automatically. Use with --learn-from or --account-tokens options.") do |n|
+          options[:unattended] = n
+        end
+        opts.on("-t", "--account-tokens FILE", "YAML file with manually-assigned tokens for each account (see README)") do |a|
+          options[:account_tokens_file] = a
+        end
+        opts.on("", "--table-output-file FILE") do |n|
+          options[:table_output_file] = n
+        end
+        # Defaults are set before the switch is registered so they hold when
+        # the switch is absent from the command line.
+        options[:default_into_account] = 'Expenses:Unknown'
+        opts.on("", "--default-into-account NAME", "Default into account") do |a|
+          options[:default_into_account] = a
+        end
+        options[:default_outof_account] = 'Income:Unknown'
+        opts.on("", "--default-outof-account NAME", "Default 'out of' account") do |a|
+          options[:default_outof_account] = a
+        end
+        opts.on("", "--fail-on-unknown-account", "Fail on unmatched transactions.") do |n|
+          options[:fail_on_unknown_account] = n
+        end
+        opts.on("", "--suffixed", "Append currency symbol as a suffix.") do |e|
+          options[:suffixed] = e
+        end
+        opts.on_tail("-h", "--help", "Show this message") do
+          puts opts
+          exit
+        end
+        opts.on_tail("--version", "Show version") do
+          puts VERSION
+          exit
+        end
+        opts.parse!(args)
+      end
+      # '-' means the CSV comes from stdin, which therefore cannot also be used
+      # to answer interactive questions.
+      if options[:file] == '-'
+        unless options[:unattended]
+          raise "--unattended is required to use STDIN as CSV source."
+        end
+        options[:string] = stdin.read
+      end
+      unless options[:file]
+        options[:file] = @@cli.ask("What CSV file should I parse? ")
+        # An empty answer means there is still nothing to parse: show usage and stop.
+        if options[:file].empty?
+          puts "\nYou must provide a CSV file to parse.\n"
+          puts parser
+          exit
+        end
+      end
+      unless options[:bank_account]
+        raise "Must specify --account in unattended mode" if options[:unattended]
+        options[:bank_account] = @@cli.ask("What is this account named in Ledger?\n") do |q|
+          q.readline = true
+          q.validate = /^.{2,}$/
+          q.default = "Assets:Bank:Checking"
+        end
+      end
+      return options
+    end
+  end
+end

data/lib/reckon/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Reckon
-  VERSION="0.6.2"
+  VERSION="0.8.0"
 end

data/spec/cosine_training_and_test.rb ADDED Viewed

@@ -0,0 +1,52 @@
+#!/usr/bin/env ruby
+# Rough accuracy check for Reckon::CosineSimilarity.
+#
+# Usage: cosine_training_and_test.rb LEDGER_FILE ACCOUNT [SEED]
+#
+# Entries of LEDGER_FILE that touch ACCOUNT are split at random into a training
+# set (each entry lands there with probability 0.9) and a test set. The matcher
+# is trained on the former and then asked to suggest accounts for the latter.
+require 'pp'
+require 'reckon'
+ledger_file = ARGV[0]
+account = ARGV[1]
+# Pass SEED to reproduce an earlier split; otherwise a fresh seed is generated
+# and printed at the end.
+seed = ARGV[2] ? ARGV[2].to_i : Random.new_seed
+ledger = Reckon::LedgerParser.new(File.read(ledger_file))
+matcher = Reckon::CosineSimilarity.new({})
+# train and test hold indexes into `entries`, not the entries themselves.
+train = []
+test = []
+# True when any posting of the entry is against the given account name.
+def has_account(account, entry)
+  entry[:accounts].map { |a| a[:name] }.include?(account)
+end
+entries = ledger.entries.select { |e| has_account(account, e) }
+r = Random.new(seed)
+entries.length.times do |i|
+  r.rand < 0.9 ? train << i : test << i
+end
+# Each posting becomes one training document: description plus that posting's amount.
+train.each do |i|
+  entry = entries[i]
+  entry[:accounts].each do |a|
+    matcher.add_document(
+      a[:name],
+      [entry[:desc], a[:amount]].join(" ")
+    )
+  end
+end
+# One slot per test entry: nil means the first suggestion matched one of the
+# entry's accounts, otherwise the slot keeps [entry, matches] for inspection.
+result = [nil] * test.length
+# Store by position within `test` (slot), not by the entry index i; indexing
+# by i would grow the array with extra nils and inflate the "true" count.
+test.each_with_index do |i, slot|
+  entry = entries[i]
+  matches = matcher.find_similar(
+    entry[:desc] + " " + entry[:accounts][0][:amount].to_s
+  )
+  if !matches[0] || !has_account(matches[0][:account], entry)
+    result[slot] = [entry, matches]
+  end
+end
+# pp result.compact
+puts "using #{seed} as random seed"
+puts "true: #{result.count(nil)} false: #{result.count { |v| !v.nil? }}"