reckon 0.6.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +54 -3
- data/Gemfile.lock +1 -1
- data/README.md +23 -19
- data/Rakefile +2 -2
- data/bin/build-new-version.sh +26 -0
- data/bin/reckon +4 -1
- data/lib/reckon.rb +1 -0
- data/lib/reckon/app.rb +13 -150
- data/lib/reckon/cosine_similarity.rb +67 -62
- data/lib/reckon/date_column.rb +3 -2
- data/lib/reckon/ledger_parser.rb +1 -1
- data/lib/reckon/money.rb +12 -5
- data/lib/reckon/options.rb +157 -0
- data/lib/reckon/version.rb +1 -1
- data/spec/cosine_training_and_test.rb +52 -0
- data/spec/integration/another_bank_example/output.ledger +3 -3
- data/spec/integration/ask_for_account/cli_input.exp +33 -0
- data/spec/integration/ask_for_account/expected_output +11 -0
- data/spec/integration/ask_for_account/input.csv +9 -0
- data/spec/integration/ask_for_account/test_args +1 -0
- data/spec/integration/broker_canada_example/output.ledger +2 -2
- data/spec/integration/chase/account_tokens_and_regex/output.ledger +3 -3
- data/spec/integration/chase/default_account_names/output.ledger +3 -3
- data/spec/integration/chase/learn_from_existing/output.ledger +3 -3
- data/spec/integration/chase/simple/output.ledger +3 -3
- data/spec/integration/danish_kroner_nordea_example/output.ledger +1 -1
- data/spec/integration/extratofake/output.ledger +1 -1
- data/spec/integration/harder_date_example/output.ledger +2 -2
- data/spec/integration/invalid_header_example/test_args +1 -1
- data/spec/integration/ledger_date_format/compare_cmds +1 -0
- data/spec/integration/ledger_date_format/input.csv +3 -0
- data/spec/integration/ledger_date_format/output.ledger +12 -0
- data/spec/integration/ledger_date_format/test_args +1 -0
- data/spec/integration/test.sh +78 -27
- data/spec/reckon/app_spec.rb +21 -19
- data/spec/reckon/csv_parser_spec.rb +3 -3
- data/spec/reckon/date_column_spec.rb +12 -0
- data/spec/reckon/money_spec.rb +3 -3
- data/spec/reckon/options_spec.rb +17 -0
- data/spec/spec_helper.rb +6 -1
- metadata +15 -2
data/lib/reckon/cosine_similarity.rb
CHANGED
@@ -1,47 +1,52 @@
|
|
1
1
|
require 'matrix'
|
2
2
|
require 'set'
|
3
3
|
|
4
|
-
# Implementation of
|
5
|
-
#
|
4
|
+
# Implementation of cosine similarity using TF-IDF for vectorization.
|
5
|
+
#
|
6
|
+
# In information retrieval, tf–idf, short for term frequency–inverse document frequency,
|
7
|
+
# is a numerical statistic that is intended to reflect how important a word is to a
|
8
|
+
# document in a collection or corpus.
|
9
|
+
#
|
10
|
+
# Cosine similarity is a measurement of how similar 2 documents are to each other.
|
11
|
+
#
|
12
|
+
# These weights and measures are used to suggest which account a transaction should be
|
13
|
+
# assigned to.
|
6
14
|
module Reckon
|
7
15
|
class CosineSimilarity
|
16
|
+
DocumentInfo = Struct.new(:tokens, :accounts)
|
17
|
+
|
8
18
|
def initialize(options)
|
19
|
+
@docs = DocumentInfo.new({}, {})
|
9
20
|
@options = options
|
10
|
-
@tokens = {}
|
11
|
-
@accounts = Hash.new(0)
|
12
21
|
end
|
13
22
|
|
14
23
|
def add_document(account, doc)
|
15
|
-
tokenize(doc)
|
24
|
+
tokens = tokenize(doc)
|
25
|
+
LOGGER.info "doc tokens: #{tokens}"
|
26
|
+
tokens.each do |n|
|
16
27
|
(token, count) = n
|
17
28
|
|
18
|
-
@tokens[token] ||=
|
19
|
-
@tokens[token][account]
|
20
|
-
@
|
21
|
-
@accounts[account] += count
|
29
|
+
@docs.tokens[token] ||= Hash.new(0)
|
30
|
+
@docs.tokens[token][account] += count
|
31
|
+
@docs.accounts[account] ||= Hash.new(0)
|
32
|
+
@docs.accounts[account][token] += count
|
22
33
|
end
|
23
34
|
end
|
24
35
|
|
25
36
|
# find most similar documents to query
|
26
37
|
def find_similar(query)
|
27
|
-
|
38
|
+
LOGGER.info "find_similar #{query}"
|
28
39
|
|
29
|
-
|
40
|
+
accounts = docs_to_check(query).map do |a|
|
41
|
+
[a, tfidf(@docs.accounts[a])]
|
42
|
+
end
|
30
43
|
|
31
|
-
|
32
|
-
suggestions = corpus_scores.map do |account, scores|
|
33
|
-
acct_vector = Vector.elements(scores, false)
|
44
|
+
q = tfidf(tokenize(query))
|
34
45
|
|
35
|
-
|
36
|
-
# similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
|
37
|
-
# exactly opposite
|
38
|
-
# see https://en.wikipedia.org/wiki/Cosine_similarity
|
39
|
-
# cos(theta) = (A . B) / (||A|| ||B||)
|
40
|
-
# where A . B is the "dot product" and ||A|| is the magnitude of A
|
41
|
-
# ruby has the 'matrix' library we can use to do these calculations.
|
46
|
+
suggestions = accounts.map do |a, d|
|
42
47
|
{
|
43
|
-
similarity:
|
44
|
-
account:
|
48
|
+
similarity: calc_similarity(q, d),
|
49
|
+
account: a
|
45
50
|
}
|
46
51
|
end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
|
47
52
|
|
@@ -52,50 +57,51 @@ module Reckon
|
|
52
57
|
|
53
58
|
private
|
54
59
|
|
55
|
-
def
|
56
|
-
|
57
|
-
|
58
|
-
corpus_scores = {}
|
59
|
-
query_scores = []
|
60
|
-
num_docs = @accounts.length
|
61
|
-
|
62
|
-
query_tokens.each do |n|
|
63
|
-
(token, _count) = n
|
64
|
-
next unless @tokens[token]
|
65
|
-
corpus = corpus.union(Set.new(@tokens[token].keys))
|
60
|
+
def docs_to_check(query)
|
61
|
+
return tokenize(query).reduce(Set.new) do |corpus, t|
|
62
|
+
corpus.union(Set.new(@docs.tokens[t[0]]&.keys))
|
66
63
|
end
|
64
|
+
end
|
67
65
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
# if no other docs have token, ignore it
|
72
|
-
next unless @tokens[token]
|
66
|
+
def tfidf(tokens)
|
67
|
+
scores = {}
|
73
68
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
@tokens[
|
79
|
-
|
69
|
+
tokens.each do |t, n|
|
70
|
+
scores[t] = calc_tf_idf(
|
71
|
+
n,
|
72
|
+
tokens.length,
|
73
|
+
@docs.tokens[t]&.length&.to_f || 0,
|
74
|
+
@docs.accounts.length
|
80
75
|
)
|
81
|
-
|
82
|
-
## Next, calculate for the corpus, where our "account" is a document
|
83
|
-
corpus.each do |account|
|
84
|
-
corpus_scores[account] ||= []
|
85
|
-
|
86
|
-
corpus_scores[account] << calc_tf_idf(
|
87
|
-
(@tokens[token][account] || 0),
|
88
|
-
@accounts[account].to_f,
|
89
|
-
@tokens[token].length.to_f,
|
90
|
-
num_docs
|
91
|
-
)
|
92
|
-
end
|
93
76
|
end
|
94
|
-
|
77
|
+
|
78
|
+
return scores
|
95
79
|
end
|
96
80
|
|
97
|
-
|
81
|
+
# Cosine similarity is used to compare how similar 2 documents are. Returns a float
|
82
|
+
# between 1 and -1, where 1 is exactly the same and -1 is exactly opposite.
|
83
|
+
#
|
84
|
+
# see https://en.wikipedia.org/wiki/Cosine_similarity
|
85
|
+
# cos(theta) = (A . B) / (||A|| ||B||)
|
86
|
+
# where A . B is the "dot product" and ||A|| is the magnitude of A
|
87
|
+
#
|
88
|
+
# The variables A and B are the set of unique terms in q and d.
|
89
|
+
#
|
90
|
+
# For example, when q = "big red balloon" and d = "small green balloon" then the
|
91
|
+
# variables are (big,red,balloon,small,green) and a = (1,1,1,0,0) and b =
|
92
|
+
# (0,0,1,1,1).
|
93
|
+
#
|
94
|
+
# query and doc are hashes of token => tf/idf score
|
95
|
+
def calc_similarity(query, doc)
|
96
|
+
tokens = Set.new(query.keys + doc.keys)
|
97
|
+
|
98
|
+
a = Vector.elements(tokens.map { |n| query[n] || 0 }, false)
|
99
|
+
b = Vector.elements(tokens.map { |n| doc[n] || 0 }, false)
|
100
|
+
|
101
|
+
return a.inner_product(b) / (a.magnitude * b.magnitude)
|
102
|
+
end
|
98
103
|
|
104
|
+
def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
|
99
105
|
# tf(t,d) = count of t in d / number of words in d
|
100
106
|
tf = token_count / num_words_in_doc.to_f
|
101
107
|
|
@@ -109,14 +115,13 @@ module Reckon
|
|
109
115
|
end
|
110
116
|
|
111
117
|
def tokenize(str)
|
112
|
-
mk_tokens(str).
|
118
|
+
mk_tokens(str).each_with_object(Hash.new(0)) do |n, memo|
|
113
119
|
memo[n] += 1
|
114
|
-
memo
|
115
120
|
end.to_a
|
116
121
|
end
|
117
122
|
|
118
123
|
def mk_tokens(str)
|
119
|
-
str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
|
124
|
+
str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/).reject(&:empty?)
|
120
125
|
end
|
121
126
|
end
|
122
127
|
end
|
data/lib/reckon/date_column.rb
CHANGED
@@ -2,12 +2,13 @@ module Reckon
|
|
2
2
|
class DateColumn < Array
|
3
3
|
attr_accessor :endian_precedence
|
4
4
|
def initialize( arr = [], options = {} )
|
5
|
+
@options = options
|
5
6
|
arr.each do |value|
|
6
7
|
if options[:date_format]
|
7
8
|
begin
|
8
9
|
value = Date.strptime(value, options[:date_format])
|
9
10
|
rescue
|
10
|
-
puts "I'm having trouble parsing #{value} with the desired format: #{options[:date_format]}"
|
11
|
+
puts "I'm having trouble parsing '#{value}' with the desired format: #{options[:date_format]}"
|
11
12
|
exit 1
|
12
13
|
end
|
13
14
|
else
|
@@ -53,7 +54,7 @@ module Reckon
|
|
53
54
|
date = self.for(index)
|
54
55
|
return "" if date.nil?
|
55
56
|
|
56
|
-
date.
|
57
|
+
date.strftime(@options[:ledger_date_format] || '%Y-%m-%d')
|
57
58
|
end
|
58
59
|
|
59
60
|
def self.likelihood(entry)
|
data/lib/reckon/ledger_parser.rb
CHANGED
@@ -114,7 +114,7 @@ module Reckon
|
|
114
114
|
|
115
115
|
def initialize(ledger, options = {})
|
116
116
|
@options = options
|
117
|
-
@date_format = options[:date_format] || '%Y-%m-%d'
|
117
|
+
@date_format = options[:ledger_date_format] || options[:date_format] || '%Y-%m-%d'
|
118
118
|
parse(ledger)
|
119
119
|
end
|
120
120
|
|
data/lib/reckon/money.rb
CHANGED
@@ -50,11 +50,18 @@ module Reckon
|
|
50
50
|
return @amount_raw[0] == '-' ? @amount_raw[1..-1] : "-#{@amount_raw}"
|
51
51
|
end
|
52
52
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
53
|
+
amt = pretty_amount(@amount * (negate ? -1 : 1))
|
54
|
+
amt = if @suffixed
|
55
|
+
"#{amt} #{@currency}"
|
56
|
+
else
|
57
|
+
amt.gsub(/^((-)|)(?=\d)/, "\\1#{@currency}")
|
58
|
+
end
|
59
|
+
|
60
|
+
return (@amount >= 0 ? " " : "") + amt
|
61
|
+
end
|
62
|
+
|
63
|
+
def pretty_amount(amount)
|
64
|
+
sprintf("%0.2f", amount).reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse
|
58
65
|
end
|
59
66
|
|
60
67
|
def parse(value, options = {})
|
data/lib/reckon/options.rb
ADDED
@@ -0,0 +1,157 @@
|
|
1
|
+
module Reckon
|
2
|
+
class Options
|
3
|
+
@@cli = HighLine.new
|
4
|
+
|
5
|
+
def self.parse(args = ARGV, stdin = $stdin)
|
6
|
+
options = { output_file: $stdout }
|
7
|
+
OptionParser.new do |opts|
|
8
|
+
opts.banner = "Usage: Reckon.rb [options]"
|
9
|
+
opts.separator ""
|
10
|
+
|
11
|
+
opts.on("-f", "--file FILE", "The CSV file to parse") do |file|
|
12
|
+
options[:file] = file
|
13
|
+
end
|
14
|
+
|
15
|
+
opts.on("-a", "--account NAME", "The Ledger Account this file is for") do |a|
|
16
|
+
options[:bank_account] = a
|
17
|
+
end
|
18
|
+
|
19
|
+
opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
|
20
|
+
options[:verbose] = v
|
21
|
+
end
|
22
|
+
|
23
|
+
opts.on("-i", "--inverse", "Use the negative of each amount") do |v|
|
24
|
+
options[:inverse] = v
|
25
|
+
end
|
26
|
+
|
27
|
+
opts.on("-p", "--print-table", "Print out the parsed CSV in table form") do |p|
|
28
|
+
options[:print_table] = p
|
29
|
+
end
|
30
|
+
|
31
|
+
opts.on("-o", "--output-file FILE", "The ledger file to append to") do |o|
|
32
|
+
options[:output_file] = File.open(o, 'a')
|
33
|
+
end
|
34
|
+
|
35
|
+
opts.on("-l", "--learn-from FILE", "An existing ledger file to learn accounts from") do |l|
|
36
|
+
options[:existing_ledger_file] = l
|
37
|
+
end
|
38
|
+
|
39
|
+
opts.on("", "--ignore-columns 1,2,5", "Columns to ignore, starts from 1") do |ignore|
|
40
|
+
options[:ignore_columns] = ignore.split(",").map(&:to_i)
|
41
|
+
end
|
42
|
+
|
43
|
+
opts.on("", "--money-column 2", Integer, "Column number of the money column, starts from 1") do |col|
|
44
|
+
options[:money_column] = col
|
45
|
+
end
|
46
|
+
|
47
|
+
opts.on("", "--raw-money", "Don't format money column (for stocks)") do |n|
|
48
|
+
options[:raw] = n
|
49
|
+
end
|
50
|
+
|
51
|
+
opts.on("", "--date-column 3", Integer, "Column number of the date column, starts from 1") do |col|
|
52
|
+
options[:date_column] = col
|
53
|
+
end
|
54
|
+
|
55
|
+
opts.on("", "--contains-header [N]", Integer, "Skip N header rows - default 1") do |hdr|
|
56
|
+
options[:contains_header] = 1
|
57
|
+
options[:contains_header] = hdr.to_i
|
58
|
+
end
|
59
|
+
|
60
|
+
opts.on("", "--csv-separator ','", "CSV separator (default ',')") do |sep|
|
61
|
+
options[:csv_separator] = sep
|
62
|
+
end
|
63
|
+
|
64
|
+
opts.on("", "--comma-separates-cents", "Use comma to separate cents ($100,50 vs. $100.50)") do |c|
|
65
|
+
options[:comma_separates_cents] = c
|
66
|
+
end
|
67
|
+
|
68
|
+
opts.on("", "--encoding 'UTF-8'", "Specify an encoding for the CSV file") do |e|
|
69
|
+
options[:encoding] = e
|
70
|
+
end
|
71
|
+
|
72
|
+
opts.on("-c", "--currency '$'", "Currency symbol to use - default $ (ex £, EUR)") do |e|
|
73
|
+
options[:currency] = e
|
74
|
+
end
|
75
|
+
|
76
|
+
opts.on("", "--date-format FORMAT", "CSV file date format (see `date` for format)") do |d|
|
77
|
+
options[:date_format] = d
|
78
|
+
end
|
79
|
+
|
80
|
+
opts.on("", "--ledger-date-format FORMAT", "Ledger date format (see `date` for format)") do |d|
|
81
|
+
options[:ledger_date_format] = d
|
82
|
+
end
|
83
|
+
|
84
|
+
opts.on("-u", "--unattended", "Don't ask questions and guess all the accounts automatically. Use with --learn-from or --account-tokens options.") do |n|
|
85
|
+
options[:unattended] = n
|
86
|
+
end
|
87
|
+
|
88
|
+
opts.on("-t", "--account-tokens FILE", "YAML file with manually-assigned tokens for each account (see README)") do |a|
|
89
|
+
options[:account_tokens_file] = a
|
90
|
+
end
|
91
|
+
|
92
|
+
opts.on("", "--table-output-file FILE") do |n|
|
93
|
+
options[:table_output_file] = n
|
94
|
+
end
|
95
|
+
|
96
|
+
options[:default_into_account] = 'Expenses:Unknown'
|
97
|
+
opts.on("", "--default-into-account NAME", "Default into account") do |a|
|
98
|
+
options[:default_into_account] = a
|
99
|
+
end
|
100
|
+
|
101
|
+
options[:default_outof_account] = 'Income:Unknown'
|
102
|
+
opts.on("", "--default-outof-account NAME", "Default 'out of' account") do |a|
|
103
|
+
options[:default_outof_account] = a
|
104
|
+
end
|
105
|
+
|
106
|
+
opts.on("", "--fail-on-unknown-account", "Fail on unmatched transactions.") do |n|
|
107
|
+
options[:fail_on_unknown_account] = n
|
108
|
+
end
|
109
|
+
|
110
|
+
opts.on("", "--suffixed", "Append currency symbol as a suffix.") do |e|
|
111
|
+
options[:suffixed] = e
|
112
|
+
end
|
113
|
+
|
114
|
+
opts.on_tail("-h", "--help", "Show this message") do
|
115
|
+
puts opts
|
116
|
+
exit
|
117
|
+
end
|
118
|
+
|
119
|
+
opts.on_tail("--version", "Show version") do
|
120
|
+
puts VERSION
|
121
|
+
exit
|
122
|
+
end
|
123
|
+
|
124
|
+
opts.parse!(args)
|
125
|
+
end
|
126
|
+
|
127
|
+
if options[:file] == '-'
|
128
|
+
unless options[:unattended]
|
129
|
+
raise "--unattended is required to use STDIN as CSV source."
|
130
|
+
end
|
131
|
+
|
132
|
+
options[:string] = stdin.read
|
133
|
+
end
|
134
|
+
|
135
|
+
unless options[:file]
|
136
|
+
options[:file] = @@cli.ask("What CSV file should I parse? ")
|
137
|
+
unless options[:file].empty?
|
138
|
+
puts "\nYou must provide a CSV file to parse.\n"
|
139
|
+
puts parser
|
140
|
+
exit
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
unless options[:bank_account]
|
145
|
+
raise "Must specify --account in unattended mode" if options[:unattended]
|
146
|
+
|
147
|
+
options[:bank_account] = @@cli.ask("What is this account named in Ledger?\n") do |q|
|
148
|
+
q.readline = true
|
149
|
+
q.validate = /^.{2,}$/
|
150
|
+
q.default = "Assets:Bank:Checking"
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
return options
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|
data/lib/reckon/version.rb
CHANGED
data/spec/cosine_training_and_test.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'pp'
|
4
|
+
|
5
|
+
require 'reckon'
|
6
|
+
|
7
|
+
ledger_file = ARGV[0]
|
8
|
+
account = ARGV[1]
|
9
|
+
seed = ARGV[2] ? ARGV[2].to_i : Random.new_seed
|
10
|
+
|
11
|
+
ledger = Reckon::LedgerParser.new(File.read(ledger_file))
|
12
|
+
matcher = Reckon::CosineSimilarity.new({})
|
13
|
+
|
14
|
+
train = []
|
15
|
+
test = []
|
16
|
+
|
17
|
+
def has_account(account, entry)
|
18
|
+
entry[:accounts].map { |a| a[:name] }.include?(account)
|
19
|
+
end
|
20
|
+
|
21
|
+
entries = ledger.entries.select { |e| has_account(account, e) }
|
22
|
+
|
23
|
+
r = Random.new(seed)
|
24
|
+
entries.length.times do |i|
|
25
|
+
r.rand < 0.9 ? train << i : test << i
|
26
|
+
end
|
27
|
+
|
28
|
+
train.each do |i|
|
29
|
+
entry = entries[i]
|
30
|
+
entry[:accounts].each do |a|
|
31
|
+
matcher.add_document(
|
32
|
+
a[:name],
|
33
|
+
[entry[:desc], a[:amount]].join(" ")
|
34
|
+
)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
result = [nil] * test.length
|
39
|
+
test.each do |i|
|
40
|
+
entry = entries[i]
|
41
|
+
matches = matcher.find_similar(
|
42
|
+
entry[:desc] + " " + entry[:accounts][0][:amount].to_s
|
43
|
+
)
|
44
|
+
|
45
|
+
if !matches[0] || !has_account(matches[0][:account], entry)
|
46
|
+
result[i] = [entry, matches]
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
# pp result.compact
|
51
|
+
puts "using #{seed} as random seed"
|
52
|
+
puts "true: #{result.count(nil)} false: #{result.count { |v| !v.nil? }}"
|