reckon 0.6.2 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +4 -4
  3. data/.gitignore +1 -0
  4. data/CHANGELOG.md +54 -3
  5. data/Gemfile.lock +1 -1
  6. data/README.md +23 -19
  7. data/Rakefile +2 -2
  8. data/bin/build-new-version.sh +26 -0
  9. data/bin/reckon +4 -1
  10. data/lib/reckon.rb +1 -0
  11. data/lib/reckon/app.rb +13 -150
  12. data/lib/reckon/cosine_similarity.rb +67 -62
  13. data/lib/reckon/date_column.rb +3 -2
  14. data/lib/reckon/ledger_parser.rb +1 -1
  15. data/lib/reckon/money.rb +12 -5
  16. data/lib/reckon/options.rb +157 -0
  17. data/lib/reckon/version.rb +1 -1
  18. data/spec/cosine_training_and_test.rb +52 -0
  19. data/spec/integration/another_bank_example/output.ledger +3 -3
  20. data/spec/integration/ask_for_account/cli_input.exp +33 -0
  21. data/spec/integration/ask_for_account/expected_output +11 -0
  22. data/spec/integration/ask_for_account/input.csv +9 -0
  23. data/spec/integration/ask_for_account/test_args +1 -0
  24. data/spec/integration/broker_canada_example/output.ledger +2 -2
  25. data/spec/integration/chase/account_tokens_and_regex/output.ledger +3 -3
  26. data/spec/integration/chase/default_account_names/output.ledger +3 -3
  27. data/spec/integration/chase/learn_from_existing/output.ledger +3 -3
  28. data/spec/integration/chase/simple/output.ledger +3 -3
  29. data/spec/integration/danish_kroner_nordea_example/output.ledger +1 -1
  30. data/spec/integration/extratofake/output.ledger +1 -1
  31. data/spec/integration/harder_date_example/output.ledger +2 -2
  32. data/spec/integration/invalid_header_example/test_args +1 -1
  33. data/spec/integration/ledger_date_format/compare_cmds +1 -0
  34. data/spec/integration/ledger_date_format/input.csv +3 -0
  35. data/spec/integration/ledger_date_format/output.ledger +12 -0
  36. data/spec/integration/ledger_date_format/test_args +1 -0
  37. data/spec/integration/test.sh +78 -27
  38. data/spec/reckon/app_spec.rb +21 -19
  39. data/spec/reckon/csv_parser_spec.rb +3 -3
  40. data/spec/reckon/date_column_spec.rb +12 -0
  41. data/spec/reckon/money_spec.rb +3 -3
  42. data/spec/reckon/options_spec.rb +17 -0
  43. data/spec/spec_helper.rb +6 -1
  44. metadata +15 -2
@@ -1,47 +1,52 @@
1
1
  require 'matrix'
2
2
  require 'set'
3
3
 
4
- # Implementation of consine similarity using TF-IDF for vectorization.
5
- # Used to suggest which account a transaction should be assigned to
4
+ # Implementation of cosine similarity using TF-IDF for vectorization.
5
+ #
6
+ # In information retrieval, tf–idf, short for term frequency–inverse document frequency,
7
+ # is a numerical statistic that is intended to reflect how important a word is to a
8
+ # document in a collection or corpus.
9
+ #
10
+ # Cosine similarity is a measurement to determine how similar 2 documents are to each other.
11
+ #
12
+ # These weights and measures are used to suggest which account a transaction should be
13
+ # assigned to.
6
14
  module Reckon
7
15
  class CosineSimilarity
16
+ DocumentInfo = Struct.new(:tokens, :accounts)
17
+
8
18
  def initialize(options)
19
+ @docs = DocumentInfo.new({}, {})
9
20
  @options = options
10
- @tokens = {}
11
- @accounts = Hash.new(0)
12
21
  end
13
22
 
14
23
  def add_document(account, doc)
15
- tokenize(doc).each do |n|
24
+ tokens = tokenize(doc)
25
+ LOGGER.info "doc tokens: #{tokens}"
26
+ tokens.each do |n|
16
27
  (token, count) = n
17
28
 
18
- @tokens[token] ||= {}
19
- @tokens[token][account] ||= 0
20
- @tokens[token][account] += count
21
- @accounts[account] += count
29
+ @docs.tokens[token] ||= Hash.new(0)
30
+ @docs.tokens[token][account] += count
31
+ @docs.accounts[account] ||= Hash.new(0)
32
+ @docs.accounts[account][token] += count
22
33
  end
23
34
  end
24
35
 
25
36
  # find most similar documents to query
26
37
  def find_similar(query)
27
- (query_scores, corpus_scores) = td_idf_scores_for(query)
38
+ LOGGER.info "find_similar #{query}"
28
39
 
29
- query_vector = Vector.elements(query_scores, false)
40
+ accounts = docs_to_check(query).map do |a|
41
+ [a, tfidf(@docs.accounts[a])]
42
+ end
30
43
 
31
- # For each doc, calculate the similarity to the query
32
- suggestions = corpus_scores.map do |account, scores|
33
- acct_vector = Vector.elements(scores, false)
44
+ q = tfidf(tokenize(query))
34
45
 
35
- acct_query_dp = acct_vector.inner_product(query_vector)
36
- # similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
37
- # exactly opposite
38
- # see https://en.wikipedia.org/wiki/Cosine_similarity
39
- # cos(theta) = (A . B) / (||A|| ||B||)
40
- # where A . B is the "dot product" and ||A|| is the magnitude of A
41
- # ruby has the 'matrix' library we can use to do these calculations.
46
+ suggestions = accounts.map do |a, d|
42
47
  {
43
- similarity: acct_query_dp / (acct_vector.magnitude * query_vector.magnitude),
44
- account: account,
48
+ similarity: calc_similarity(q, d),
49
+ account: a
45
50
  }
46
51
  end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
47
52
 
@@ -52,50 +57,51 @@ module Reckon
52
57
 
53
58
  private
54
59
 
55
- def td_idf_scores_for(query)
56
- query_tokens = tokenize(query)
57
- corpus = Set.new
58
- corpus_scores = {}
59
- query_scores = []
60
- num_docs = @accounts.length
61
-
62
- query_tokens.each do |n|
63
- (token, _count) = n
64
- next unless @tokens[token]
65
- corpus = corpus.union(Set.new(@tokens[token].keys))
60
+ def docs_to_check(query)
61
+ return tokenize(query).reduce(Set.new) do |corpus, t|
62
+ corpus.union(Set.new(@docs.tokens[t[0]]&.keys))
66
63
  end
64
+ end
67
65
 
68
- query_tokens.each do |n|
69
- (token, count) = n
70
-
71
- # if no other docs have token, ignore it
72
- next unless @tokens[token]
66
+ def tfidf(tokens)
67
+ scores = {}
73
68
 
74
- ## First, calculate scores for our query as we're building scores for the corpus
75
- query_scores << calc_tf_idf(
76
- count,
77
- query_tokens.length,
78
- @tokens[token].length,
79
- num_docs
69
+ tokens.each do |t, n|
70
+ scores[t] = calc_tf_idf(
71
+ n,
72
+ tokens.length,
73
+ @docs.tokens[t]&.length&.to_f || 0,
74
+ @docs.accounts.length
80
75
  )
81
-
82
- ## Next, calculate for the corpus, where our "account" is a document
83
- corpus.each do |account|
84
- corpus_scores[account] ||= []
85
-
86
- corpus_scores[account] << calc_tf_idf(
87
- (@tokens[token][account] || 0),
88
- @accounts[account].to_f,
89
- @tokens[token].length.to_f,
90
- num_docs
91
- )
92
- end
93
76
  end
94
- [query_scores, corpus_scores]
77
+
78
+ return scores
95
79
  end
96
80
 
97
- def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
81
+ # Cosine similarity is used to compare how similar 2 documents are. Returns a float
82
+ # between 1 and -1, where 1 is exactly the same and -1 is exactly opposite.
83
+ #
84
+ # see https://en.wikipedia.org/wiki/Cosine_similarity
85
+ # cos(theta) = (A . B) / (||A|| ||B||)
86
+ # where A . B is the "dot product" and ||A|| is the magnitude of A
87
+ #
88
+ # The variables A and B are the set of unique terms in q and d.
89
+ #
90
+ # For example, when q = "big red balloon" and d ="small green balloon" then the
91
+ # variables are (big,red,balloon,small,green) and a = (1,1,1,0,0) and b =
92
+ # (0,0,1,1,1).
93
+ #
94
+ # query and doc are hashes of token => tf/idf score
95
+ def calc_similarity(query, doc)
96
+ tokens = Set.new(query.keys + doc.keys)
97
+
98
+ a = Vector.elements(tokens.map { |n| query[n] || 0 }, false)
99
+ b = Vector.elements(tokens.map { |n| doc[n] || 0 }, false)
100
+
101
+ return a.inner_product(b) / (a.magnitude * b.magnitude)
102
+ end
98
103
 
104
+ def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
99
105
  # tf(t,d) = count of t in d / number of words in d
100
106
  tf = token_count / num_words_in_doc.to_f
101
107
 
@@ -109,14 +115,13 @@ module Reckon
109
115
  end
110
116
 
111
117
  def tokenize(str)
112
- mk_tokens(str).inject(Hash.new(0)) do |memo, n|
118
+ mk_tokens(str).each_with_object(Hash.new(0)) do |n, memo|
113
119
  memo[n] += 1
114
- memo
115
120
  end.to_a
116
121
  end
117
122
 
118
123
  def mk_tokens(str)
119
- str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
124
+ str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/).reject(&:empty?)
120
125
  end
121
126
  end
122
127
  end
@@ -2,12 +2,13 @@ module Reckon
2
2
  class DateColumn < Array
3
3
  attr_accessor :endian_precedence
4
4
  def initialize( arr = [], options = {} )
5
+ @options = options
5
6
  arr.each do |value|
6
7
  if options[:date_format]
7
8
  begin
8
9
  value = Date.strptime(value, options[:date_format])
9
10
  rescue
10
- puts "I'm having trouble parsing #{value} with the desired format: #{options[:date_format]}"
11
+ puts "I'm having trouble parsing '#{value}' with the desired format: #{options[:date_format]}"
11
12
  exit 1
12
13
  end
13
14
  else
@@ -53,7 +54,7 @@ module Reckon
53
54
  date = self.for(index)
54
55
  return "" if date.nil?
55
56
 
56
- date.iso8601
57
+ date.strftime(@options[:ledger_date_format] || '%Y-%m-%d')
57
58
  end
58
59
 
59
60
  def self.likelihood(entry)
@@ -114,7 +114,7 @@ module Reckon
114
114
 
115
115
  def initialize(ledger, options = {})
116
116
  @options = options
117
- @date_format = options[:date_format] || '%Y-%m-%d'
117
+ @date_format = options[:ledger_date_format] || options[:date_format] || '%Y-%m-%d'
118
118
  parse(ledger)
119
119
  end
120
120
 
data/lib/reckon/money.rb CHANGED
@@ -50,11 +50,18 @@ module Reckon
50
50
  return @amount_raw[0] == '-' ? @amount_raw[1..-1] : "-#{@amount_raw}"
51
51
  end
52
52
 
53
- if @suffixed
54
- (@amount >= 0 ? " " : "") + sprintf("%0.2f #{@currency}", @amount * (negate ? -1 : 1))
55
- else
56
- (@amount >= 0 ? " " : "") + sprintf("%0.2f", @amount * (negate ? -1 : 1)).gsub(/^((\-)|)(?=\d)/, "\\1#{@currency}")
57
- end
53
+ amt = pretty_amount(@amount * (negate ? -1 : 1))
54
+ amt = if @suffixed
55
+ "#{amt} #{@currency}"
56
+ else
57
+ amt.gsub(/^((-)|)(?=\d)/, "\\1#{@currency}")
58
+ end
59
+
60
+ return (@amount >= 0 ? " " : "") + amt
61
+ end
62
+
63
+ def pretty_amount(amount)
64
+ sprintf("%0.2f", amount).reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse
58
65
  end
59
66
 
60
67
  def parse(value, options = {})
@@ -0,0 +1,157 @@
1
module Reckon
  # Command-line option handling for reckon.
  class Options
    # Console helper used for the interactive prompts in .parse.
    @@cli = HighLine.new

    # Parses command-line arguments into an options hash, prompting on the
    # console for the CSV file and the ledger account when they were not given.
    #
    # @param args [Array<String>] arguments to parse; recognised switches are
    #   removed from this array (OptionParser#parse! is destructive)
    # @param stdin [IO] stream read in full into options[:string] when
    #   --file is '-'
    # @return [Hash] parsed options keyed by symbol; always contains
    #   :output_file, :default_into_account and :default_outof_account
    # @raise [RuntimeError] when --file is '-' without --unattended, or when
    #   --unattended is given without --account
    def self.parse(args = ARGV, stdin = $stdin)
      options = { output_file: $stdout }
      # Keep a handle on the parser so the usage text can be printed later.
      parser = OptionParser.new do |opts|
        opts.banner = "Usage: Reckon.rb [options]"
        opts.separator ""

        opts.on("-f", "--file FILE", "The CSV file to parse") do |file|
          options[:file] = file
        end

        opts.on("-a", "--account NAME", "The Ledger Account this file is for") do |a|
          options[:bank_account] = a
        end

        opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
          options[:verbose] = v
        end

        opts.on("-i", "--inverse", "Use the negative of each amount") do |v|
          options[:inverse] = v
        end

        opts.on("-p", "--print-table", "Print out the parsed CSV in table form") do |p|
          options[:print_table] = p
        end

        opts.on("-o", "--output-file FILE", "The ledger file to append to") do |o|
          options[:output_file] = File.open(o, 'a')
        end

        opts.on("-l", "--learn-from FILE", "An existing ledger file to learn accounts from") do |l|
          options[:existing_ledger_file] = l
        end

        opts.on("", "--ignore-columns 1,2,5", "Columns to ignore, starts from 1") do |ignore|
          options[:ignore_columns] = ignore.split(",").map(&:to_i)
        end

        opts.on("", "--money-column 2", Integer, "Column number of the money column, starts from 1") do |col|
          options[:money_column] = col
        end

        opts.on("", "--raw-money", "Don't format money column (for stocks)") do |n|
          options[:raw] = n
        end

        opts.on("", "--date-column 3", Integer, "Column number of the date column, starts from 1") do |col|
          options[:date_column] = col
        end

        opts.on("", "--contains-header [N]", Integer, "Skip N header rows - default 1") do |hdr|
          # N is optional: the block receives nil when the switch is given
          # without a value, in which case the documented default of 1 applies.
          options[:contains_header] = hdr || 1
        end

        opts.on("", "--csv-separator ','", "CSV separator (default ',')") do |sep|
          options[:csv_separator] = sep
        end

        opts.on("", "--comma-separates-cents", "Use comma to separate cents ($100,50 vs. $100.50)") do |c|
          options[:comma_separates_cents] = c
        end

        opts.on("", "--encoding 'UTF-8'", "Specify an encoding for the CSV file") do |e|
          options[:encoding] = e
        end

        opts.on("-c", "--currency '$'", "Currency symbol to use - default $ (ex £, EUR)") do |e|
          options[:currency] = e
        end

        opts.on("", "--date-format FORMAT", "CSV file date format (see `date` for format)") do |d|
          options[:date_format] = d
        end

        opts.on("", "--ledger-date-format FORMAT", "Ledger date format (see `date` for format)") do |d|
          options[:ledger_date_format] = d
        end

        opts.on("-u", "--unattended", "Don't ask questions and guess all the accounts automatically. Use with --learn-from or --account-tokens options.") do |n|
          options[:unattended] = n
        end

        opts.on("-t", "--account-tokens FILE", "YAML file with manually-assigned tokens for each account (see README)") do |a|
          options[:account_tokens_file] = a
        end

        opts.on("", "--table-output-file FILE") do |n|
          options[:table_output_file] = n
        end

        options[:default_into_account] = 'Expenses:Unknown'
        opts.on("", "--default-into-account NAME", "Default into account") do |a|
          options[:default_into_account] = a
        end

        options[:default_outof_account] = 'Income:Unknown'
        opts.on("", "--default-outof-account NAME", "Default 'out of' account") do |a|
          options[:default_outof_account] = a
        end

        opts.on("", "--fail-on-unknown-account", "Fail on unmatched transactions.") do |n|
          options[:fail_on_unknown_account] = n
        end

        opts.on("", "--suffixed", "Append currency symbol as a suffix.") do |e|
          options[:suffixed] = e
        end

        opts.on_tail("-h", "--help", "Show this message") do
          puts opts
          exit
        end

        opts.on_tail("--version", "Show version") do
          puts VERSION
          exit
        end

        opts.parse!(args)
      end

      # '-' means "read the CSV from stdin"; stdin then cannot also be used
      # for interactive answers, hence the --unattended requirement.
      if options[:file] == '-'
        unless options[:unattended]
          raise "--unattended is required to use STDIN as CSV source."
        end

        options[:string] = stdin.read
      end

      unless options[:file]
        options[:file] = @@cli.ask("What CSV file should I parse? ")
        # An empty answer leaves nothing to parse: show usage and stop.
        if options[:file].empty?
          puts "\nYou must provide a CSV file to parse.\n"
          puts parser
          exit
        end
      end

      unless options[:bank_account]
        raise "Must specify --account in unattended mode" if options[:unattended]

        options[:bank_account] = @@cli.ask("What is this account named in Ledger?\n") do |q|
          q.readline = true
          q.validate = /^.{2,}$/
          q.default = "Assets:Bank:Checking"
        end
      end

      return options
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Reckon
2
- VERSION="0.6.2"
2
+ VERSION="0.8.0"
3
3
  end
@@ -0,0 +1,52 @@
1
#!/usr/bin/env ruby

# Rough accuracy check for Reckon::CosineSimilarity.
#
# Splits the ledger entries that touch ACCOUNT into a training set (about 90%)
# and a test set (the rest), trains the matcher on the training entries, and
# reports for how many test entries the top suggestion is one of the entry's
# own accounts.
#
# usage: cosine_training_and_test.rb LEDGER_FILE ACCOUNT [SEED]

require 'pp'

require 'reckon'

ledger_file = ARGV[0]
account = ARGV[1]
# A fixed seed makes the train/test split reproducible between runs.
seed = ARGV[2] ? ARGV[2].to_i : Random.new_seed

abort "usage: cosine_training_and_test.rb LEDGER_FILE ACCOUNT [SEED]" unless ledger_file && account

ledger = Reckon::LedgerParser.new(File.read(ledger_file))
matcher = Reckon::CosineSimilarity.new({})

train = []
test = []

# True when one of the entry's postings is on the given account.
def has_account(account, entry)
  entry[:accounts].map { |a| a[:name] }.include?(account)
end

entries = ledger.entries.select { |e| has_account(account, e) }

# train and test hold indexes into entries, not the entries themselves.
r = Random.new(seed)
entries.length.times do |i|
  r.rand < 0.9 ? train << i : test << i
end

train.each do |i|
  entry = entries[i]
  entry[:accounts].each do |a|
    matcher.add_document(
      a[:name],
      [entry[:desc], a[:amount]].join(" ")
    )
  end
end

# Collect only the misses. Counting them directly avoids indexing a
# test-sized array by entry index, which padded the array with nils and
# inflated the "true" count.
misses = test.map do |i|
  entry = entries[i]
  matches = matcher.find_similar(
    entry[:desc] + " " + entry[:accounts][0][:amount].to_s
  )

  [entry, matches] if !matches[0] || !has_account(matches[0][:account], entry)
end.compact

# pp misses
puts "using #{seed} as random seed"
puts "true: #{test.length - misses.length} false: #{misses.length}"