reckon 0.6.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +4 -4
  3. data/.gitignore +1 -0
  4. data/CHANGELOG.md +54 -3
  5. data/Gemfile.lock +1 -1
  6. data/README.md +23 -19
  7. data/Rakefile +2 -2
  8. data/bin/build-new-version.sh +26 -0
  9. data/bin/reckon +4 -1
  10. data/lib/reckon.rb +1 -0
  11. data/lib/reckon/app.rb +13 -150
  12. data/lib/reckon/cosine_similarity.rb +67 -62
  13. data/lib/reckon/date_column.rb +3 -2
  14. data/lib/reckon/ledger_parser.rb +1 -1
  15. data/lib/reckon/money.rb +12 -5
  16. data/lib/reckon/options.rb +157 -0
  17. data/lib/reckon/version.rb +1 -1
  18. data/spec/cosine_training_and_test.rb +52 -0
  19. data/spec/integration/another_bank_example/output.ledger +3 -3
  20. data/spec/integration/ask_for_account/cli_input.exp +33 -0
  21. data/spec/integration/ask_for_account/expected_output +11 -0
  22. data/spec/integration/ask_for_account/input.csv +9 -0
  23. data/spec/integration/ask_for_account/test_args +1 -0
  24. data/spec/integration/broker_canada_example/output.ledger +2 -2
  25. data/spec/integration/chase/account_tokens_and_regex/output.ledger +3 -3
  26. data/spec/integration/chase/default_account_names/output.ledger +3 -3
  27. data/spec/integration/chase/learn_from_existing/output.ledger +3 -3
  28. data/spec/integration/chase/simple/output.ledger +3 -3
  29. data/spec/integration/danish_kroner_nordea_example/output.ledger +1 -1
  30. data/spec/integration/extratofake/output.ledger +1 -1
  31. data/spec/integration/harder_date_example/output.ledger +2 -2
  32. data/spec/integration/invalid_header_example/test_args +1 -1
  33. data/spec/integration/ledger_date_format/compare_cmds +1 -0
  34. data/spec/integration/ledger_date_format/input.csv +3 -0
  35. data/spec/integration/ledger_date_format/output.ledger +12 -0
  36. data/spec/integration/ledger_date_format/test_args +1 -0
  37. data/spec/integration/test.sh +78 -27
  38. data/spec/reckon/app_spec.rb +21 -19
  39. data/spec/reckon/csv_parser_spec.rb +3 -3
  40. data/spec/reckon/date_column_spec.rb +12 -0
  41. data/spec/reckon/money_spec.rb +3 -3
  42. data/spec/reckon/options_spec.rb +17 -0
  43. data/spec/spec_helper.rb +6 -1
  44. metadata +15 -2
@@ -1,47 +1,52 @@
1
1
  require 'matrix'
2
2
  require 'set'
3
3
 
4
- # Implementation of consine similarity using TF-IDF for vectorization.
5
- # Used to suggest which account a transaction should be assigned to
4
+ # Implementation of cosine similarity using TF-IDF for vectorization.
5
+ #
6
+ # In information retrieval, tf–idf, short for term frequency–inverse document frequency,
7
+ # is a numerical statistic that is intended to reflect how important a word is to a
8
+ # document in a collection or corpus.
9
+ #
10
+ # Cosine Similarity is a measurement to determine how similar 2 documents are to each other.
11
+ #
12
+ # These weights and measures are used to suggest which account a transaction should be
13
+ # assigned to.
6
14
  module Reckon
7
15
  class CosineSimilarity
16
+ DocumentInfo = Struct.new(:tokens, :accounts)
17
+
8
18
  def initialize(options)
19
+ @docs = DocumentInfo.new({}, {})
9
20
  @options = options
10
- @tokens = {}
11
- @accounts = Hash.new(0)
12
21
  end
13
22
 
14
23
  def add_document(account, doc)
15
- tokenize(doc).each do |n|
24
+ tokens = tokenize(doc)
25
+ LOGGER.info "doc tokens: #{tokens}"
26
+ tokens.each do |n|
16
27
  (token, count) = n
17
28
 
18
- @tokens[token] ||= {}
19
- @tokens[token][account] ||= 0
20
- @tokens[token][account] += count
21
- @accounts[account] += count
29
+ @docs.tokens[token] ||= Hash.new(0)
30
+ @docs.tokens[token][account] += count
31
+ @docs.accounts[account] ||= Hash.new(0)
32
+ @docs.accounts[account][token] += count
22
33
  end
23
34
  end
24
35
 
25
36
  # find most similar documents to query
26
37
  def find_similar(query)
27
- (query_scores, corpus_scores) = td_idf_scores_for(query)
38
+ LOGGER.info "find_similar #{query}"
28
39
 
29
- query_vector = Vector.elements(query_scores, false)
40
+ accounts = docs_to_check(query).map do |a|
41
+ [a, tfidf(@docs.accounts[a])]
42
+ end
30
43
 
31
- # For each doc, calculate the similarity to the query
32
- suggestions = corpus_scores.map do |account, scores|
33
- acct_vector = Vector.elements(scores, false)
44
+ q = tfidf(tokenize(query))
34
45
 
35
- acct_query_dp = acct_vector.inner_product(query_vector)
36
- # similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
37
- # exactly opposite
38
- # see https://en.wikipedia.org/wiki/Cosine_similarity
39
- # cos(theta) = (A . B) / (||A|| ||B||)
40
- # where A . B is the "dot product" and ||A|| is the magnitude of A
41
- # ruby has the 'matrix' library we can use to do these calculations.
46
+ suggestions = accounts.map do |a, d|
42
47
  {
43
- similarity: acct_query_dp / (acct_vector.magnitude * query_vector.magnitude),
44
- account: account,
48
+ similarity: calc_similarity(q, d),
49
+ account: a
45
50
  }
46
51
  end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
47
52
 
@@ -52,50 +57,51 @@ module Reckon
52
57
 
53
58
  private
54
59
 
55
- def td_idf_scores_for(query)
56
- query_tokens = tokenize(query)
57
- corpus = Set.new
58
- corpus_scores = {}
59
- query_scores = []
60
- num_docs = @accounts.length
61
-
62
- query_tokens.each do |n|
63
- (token, _count) = n
64
- next unless @tokens[token]
65
- corpus = corpus.union(Set.new(@tokens[token].keys))
60
+ def docs_to_check(query)
61
+ return tokenize(query).reduce(Set.new) do |corpus, t|
62
+ corpus.union(Set.new(@docs.tokens[t[0]]&.keys))
66
63
  end
64
+ end
67
65
 
68
- query_tokens.each do |n|
69
- (token, count) = n
70
-
71
- # if no other docs have token, ignore it
72
- next unless @tokens[token]
66
+ def tfidf(tokens)
67
+ scores = {}
73
68
 
74
- ## First, calculate scores for our query as we're building scores for the corpus
75
- query_scores << calc_tf_idf(
76
- count,
77
- query_tokens.length,
78
- @tokens[token].length,
79
- num_docs
69
+ tokens.each do |t, n|
70
+ scores[t] = calc_tf_idf(
71
+ n,
72
+ tokens.length,
73
+ @docs.tokens[t]&.length&.to_f || 0,
74
+ @docs.accounts.length
80
75
  )
81
-
82
- ## Next, calculate for the corpus, where our "account" is a document
83
- corpus.each do |account|
84
- corpus_scores[account] ||= []
85
-
86
- corpus_scores[account] << calc_tf_idf(
87
- (@tokens[token][account] || 0),
88
- @accounts[account].to_f,
89
- @tokens[token].length.to_f,
90
- num_docs
91
- )
92
- end
93
76
  end
94
- [query_scores, corpus_scores]
77
+
78
+ return scores
95
79
  end
96
80
 
97
- def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
81
+ # Cosine similarity is used to compare how similar 2 documents are. Returns a float
82
+ # between 1 and -1, where 1 is exactly the same and -1 is exactly opposite.
83
+ #
84
+ # see https://en.wikipedia.org/wiki/Cosine_similarity
85
+ # cos(theta) = (A . B) / (||A|| ||B||)
86
+ # where A . B is the "dot product" and ||A|| is the magnitude of A
87
+ #
88
+ # The variables A and B are the set of unique terms in q and d.
89
+ #
90
+ # For example, when q = "big red balloon" and d ="small green balloon" then the
91
+ # variables are (big,red,balloon,small,green) and a = (1,1,1,0,0) and b =
92
+ # (0,0,1,1,1).
93
+ #
94
+ # query and doc are hashes of token => tf/idf score
95
+ def calc_similarity(query, doc)
96
+ tokens = Set.new(query.keys + doc.keys)
97
+
98
+ a = Vector.elements(tokens.map { |n| query[n] || 0 }, false)
99
+ b = Vector.elements(tokens.map { |n| doc[n] || 0 }, false)
100
+
101
+ return a.inner_product(b) / (a.magnitude * b.magnitude)
102
+ end
98
103
 
104
+ def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
99
105
  # tf(t,d) = count of t in d / number of words in d
100
106
  tf = token_count / num_words_in_doc.to_f
101
107
 
@@ -109,14 +115,13 @@ module Reckon
109
115
  end
110
116
 
111
117
  def tokenize(str)
112
- mk_tokens(str).inject(Hash.new(0)) do |memo, n|
118
+ mk_tokens(str).each_with_object(Hash.new(0)) do |n, memo|
113
119
  memo[n] += 1
114
- memo
115
120
  end.to_a
116
121
  end
117
122
 
118
123
  def mk_tokens(str)
119
- str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
124
+ str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/).reject(&:empty?)
120
125
  end
121
126
  end
122
127
  end
@@ -2,12 +2,13 @@ module Reckon
2
2
  class DateColumn < Array
3
3
  attr_accessor :endian_precedence
4
4
  def initialize( arr = [], options = {} )
5
+ @options = options
5
6
  arr.each do |value|
6
7
  if options[:date_format]
7
8
  begin
8
9
  value = Date.strptime(value, options[:date_format])
9
10
  rescue
10
- puts "I'm having trouble parsing #{value} with the desired format: #{options[:date_format]}"
11
+ puts "I'm having trouble parsing '#{value}' with the desired format: #{options[:date_format]}"
11
12
  exit 1
12
13
  end
13
14
  else
@@ -53,7 +54,7 @@ module Reckon
53
54
  date = self.for(index)
54
55
  return "" if date.nil?
55
56
 
56
- date.iso8601
57
+ date.strftime(@options[:ledger_date_format] || '%Y-%m-%d')
57
58
  end
58
59
 
59
60
  def self.likelihood(entry)
@@ -114,7 +114,7 @@ module Reckon
114
114
 
115
115
  def initialize(ledger, options = {})
116
116
  @options = options
117
- @date_format = options[:date_format] || '%Y-%m-%d'
117
+ @date_format = options[:ledger_date_format] || options[:date_format] || '%Y-%m-%d'
118
118
  parse(ledger)
119
119
  end
120
120
 
data/lib/reckon/money.rb CHANGED
@@ -50,11 +50,18 @@ module Reckon
50
50
  return @amount_raw[0] == '-' ? @amount_raw[1..-1] : "-#{@amount_raw}"
51
51
  end
52
52
 
53
- if @suffixed
54
- (@amount >= 0 ? " " : "") + sprintf("%0.2f #{@currency}", @amount * (negate ? -1 : 1))
55
- else
56
- (@amount >= 0 ? " " : "") + sprintf("%0.2f", @amount * (negate ? -1 : 1)).gsub(/^((\-)|)(?=\d)/, "\\1#{@currency}")
57
- end
53
+ amt = pretty_amount(@amount * (negate ? -1 : 1))
54
+ amt = if @suffixed
55
+ "#{amt} #{@currency}"
56
+ else
57
+ amt.gsub(/^((-)|)(?=\d)/, "\\1#{@currency}")
58
+ end
59
+
60
+ return (@amount >= 0 ? " " : "") + amt
61
+ end
62
+
63
+ def pretty_amount(amount)
64
+ sprintf("%0.2f", amount).reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse
58
65
  end
59
66
 
60
67
  def parse(value, options = {})
@@ -0,0 +1,157 @@
module Reckon
  # Parses reckon's command-line arguments into an options Hash and
  # interactively asks for the required values that were not supplied.
  class Options
    # HighLine instance used for the interactive prompts.
    @@cli = HighLine.new

    # Builds the options Hash from the command line.
    #
    # @param args [Array<String>] arguments to parse; recognised switches are
    #   consumed from this array (OptionParser#parse!)
    # @param stdin [IO] stream the CSV content is read from when --file is '-'
    # @return [Hash] the parsed options; :output_file, :default_into_account
    #   and :default_outof_account are always present
    # @raise [RuntimeError] when '-' is given as the file without --unattended,
    #   or when --account is missing in unattended mode
    #
    # --help, --version and an empty answer to the CSV file prompt terminate
    # the process via `exit`.
    def self.parse(args = ARGV, stdin = $stdin)
      options = { output_file: $stdout }

      # Keep a handle on the parser so the usage text can be printed later.
      parser = OptionParser.new do |opts|
        opts.banner = "Usage: Reckon.rb [options]"
        opts.separator ""

        opts.on("-f", "--file FILE", "The CSV file to parse") do |file|
          options[:file] = file
        end

        opts.on("-a", "--account NAME", "The Ledger Account this file is for") do |a|
          options[:bank_account] = a
        end

        opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
          options[:verbose] = v
        end

        opts.on("-i", "--inverse", "Use the negative of each amount") do |v|
          options[:inverse] = v
        end

        opts.on("-p", "--print-table", "Print out the parsed CSV in table form") do |p|
          options[:print_table] = p
        end

        # The handle is intentionally left open: it is returned to the caller
        # as the output stream.
        opts.on("-o", "--output-file FILE", "The ledger file to append to") do |o|
          options[:output_file] = File.open(o, 'a')
        end

        opts.on("-l", "--learn-from FILE", "An existing ledger file to learn accounts from") do |l|
          options[:existing_ledger_file] = l
        end

        opts.on("", "--ignore-columns 1,2,5", "Columns to ignore, starts from 1") do |ignore|
          options[:ignore_columns] = ignore.split(",").map(&:to_i)
        end

        opts.on("", "--money-column 2", Integer, "Column number of the money column, starts from 1") do |col|
          options[:money_column] = col
        end

        opts.on("", "--raw-money", "Don't format money column (for stocks)") do |n|
          options[:raw] = n
        end

        opts.on("", "--date-column 3", Integer, "Column number of the date column, starts from 1") do |col|
          options[:date_column] = col
        end

        # N is optional: the block receives nil when it is omitted, in which
        # case the documented default of one header row applies.
        opts.on("", "--contains-header [N]", Integer, "Skip N header rows - default 1") do |hdr|
          options[:contains_header] = hdr.nil? ? 1 : hdr.to_i
        end

        opts.on("", "--csv-separator ','", "CSV separator (default ',')") do |sep|
          options[:csv_separator] = sep
        end

        opts.on("", "--comma-separates-cents", "Use comma to separate cents ($100,50 vs. $100.50)") do |c|
          options[:comma_separates_cents] = c
        end

        opts.on("", "--encoding 'UTF-8'", "Specify an encoding for the CSV file") do |e|
          options[:encoding] = e
        end

        opts.on("-c", "--currency '$'", "Currency symbol to use - default $ (ex £, EUR)") do |e|
          options[:currency] = e
        end

        opts.on("", "--date-format FORMAT", "CSV file date format (see `date` for format)") do |d|
          options[:date_format] = d
        end

        opts.on("", "--ledger-date-format FORMAT", "Ledger date format (see `date` for format)") do |d|
          options[:ledger_date_format] = d
        end

        opts.on("-u", "--unattended", "Don't ask questions and guess all the accounts automatically. Use with --learn-from or --account-tokens options.") do |n|
          options[:unattended] = n
        end

        opts.on("-t", "--account-tokens FILE", "YAML file with manually-assigned tokens for each account (see README)") do |a|
          options[:account_tokens_file] = a
        end

        opts.on("", "--table-output-file FILE") do |n|
          options[:table_output_file] = n
        end

        options[:default_into_account] = 'Expenses:Unknown'
        opts.on("", "--default-into-account NAME", "Default into account") do |a|
          options[:default_into_account] = a
        end

        options[:default_outof_account] = 'Income:Unknown'
        opts.on("", "--default-outof-account NAME", "Default 'out of' account") do |a|
          options[:default_outof_account] = a
        end

        opts.on("", "--fail-on-unknown-account", "Fail on unmatched transactions.") do |n|
          options[:fail_on_unknown_account] = n
        end

        opts.on("", "--suffixed", "Append currency symbol as a suffix.") do |e|
          options[:suffixed] = e
        end

        opts.on_tail("-h", "--help", "Show this message") do
          puts opts
          exit
        end

        opts.on_tail("--version", "Show version") do
          puts VERSION
          exit
        end
      end
      parser.parse!(args)

      # '-' means "read the CSV from stdin"; stdin is then unavailable for
      # interactive questions, hence the --unattended requirement.
      if options[:file] == '-'
        unless options[:unattended]
          raise "--unattended is required to use STDIN as CSV source."
        end

        options[:string] = stdin.read
      end

      unless options[:file]
        options[:file] = @@cli.ask("What CSV file should I parse? ")
        # Only an empty answer is an error; a non-empty answer is the file.
        if options[:file].empty?
          puts "\nYou must provide a CSV file to parse.\n"
          puts parser
          exit
        end
      end

      unless options[:bank_account]
        raise "Must specify --account in unattended mode" if options[:unattended]

        options[:bank_account] = @@cli.ask("What is this account named in Ledger?\n") do |q|
          q.readline = true
          q.validate = /^.{2,}$/
          q.default = "Assets:Bank:Checking"
        end
      end

      options
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Reckon
2
- VERSION="0.6.2"
2
+ VERSION="0.8.0"
3
3
  end
@@ -0,0 +1,52 @@
#!/usr/bin/env ruby

# Rough accuracy check for Reckon::CosineSimilarity.
#
# Takes the entries of a ledger file that touch ACCOUNT, randomly splits them
# into a training set (about 90%) and a test set (about 10%), trains the
# matcher on the former and reports how often its top suggestion for a test
# entry is one of that entry's real accounts.
#
# Usage: cosine_training_and_test.rb LEDGER_FILE ACCOUNT [SEED]

require 'pp'

require 'reckon'

# Fail with a usage message instead of a TypeError from File.read(nil).
abort "usage: #{$PROGRAM_NAME} LEDGER_FILE ACCOUNT [SEED]" if ARGV.length < 2

ledger_file = ARGV[0]
account = ARGV[1]
# An explicit seed makes the train/test split reproducible.
seed = ARGV[2] ? ARGV[2].to_i : Random.new_seed

ledger = Reckon::LedgerParser.new(File.read(ledger_file))
matcher = Reckon::CosineSimilarity.new({})

# True when one of the entry's postings uses the given account name.
def has_account(account, entry)
  entry[:accounts].map { |a| a[:name] }.include?(account)
end

entries = ledger.entries.select { |e| has_account(account, e) }

# partition calls the block once per entry, in order, so a given seed always
# produces the same split.
r = Random.new(seed)
train, test = entries.partition { r.rand < 0.9 }

# Every posting of a training entry becomes a document for its account.
train.each do |entry|
  entry[:accounts].each do |a|
    matcher.add_document(
      a[:name],
      [entry[:desc], a[:amount]].join(" ")
    )
  end
end

# Collect the test entries whose best suggestion is missing or wrong. The
# list is built directly so the counts below cannot drift from the size of
# the test set.
failures = test.each_with_object([]) do |entry, memo|
  matches = matcher.find_similar(
    entry[:desc] + " " + entry[:accounts][0][:amount].to_s
  )

  if !matches[0] || !has_account(matches[0][:account], entry)
    memo << [entry, matches]
  end
end

# pp failures
puts "using #{seed} as random seed"
puts "true: #{test.length - failures.length} false: #{failures.length}"