reckon 0.6.2 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +54 -3
- data/Gemfile.lock +1 -1
- data/README.md +23 -19
- data/Rakefile +2 -2
- data/bin/build-new-version.sh +26 -0
- data/bin/reckon +4 -1
- data/lib/reckon.rb +1 -0
- data/lib/reckon/app.rb +13 -150
- data/lib/reckon/cosine_similarity.rb +67 -62
- data/lib/reckon/date_column.rb +3 -2
- data/lib/reckon/ledger_parser.rb +1 -1
- data/lib/reckon/money.rb +12 -5
- data/lib/reckon/options.rb +157 -0
- data/lib/reckon/version.rb +1 -1
- data/spec/cosine_training_and_test.rb +52 -0
- data/spec/integration/another_bank_example/output.ledger +3 -3
- data/spec/integration/ask_for_account/cli_input.exp +33 -0
- data/spec/integration/ask_for_account/expected_output +11 -0
- data/spec/integration/ask_for_account/input.csv +9 -0
- data/spec/integration/ask_for_account/test_args +1 -0
- data/spec/integration/broker_canada_example/output.ledger +2 -2
- data/spec/integration/chase/account_tokens_and_regex/output.ledger +3 -3
- data/spec/integration/chase/default_account_names/output.ledger +3 -3
- data/spec/integration/chase/learn_from_existing/output.ledger +3 -3
- data/spec/integration/chase/simple/output.ledger +3 -3
- data/spec/integration/danish_kroner_nordea_example/output.ledger +1 -1
- data/spec/integration/extratofake/output.ledger +1 -1
- data/spec/integration/harder_date_example/output.ledger +2 -2
- data/spec/integration/invalid_header_example/test_args +1 -1
- data/spec/integration/ledger_date_format/compare_cmds +1 -0
- data/spec/integration/ledger_date_format/input.csv +3 -0
- data/spec/integration/ledger_date_format/output.ledger +12 -0
- data/spec/integration/ledger_date_format/test_args +1 -0
- data/spec/integration/test.sh +78 -27
- data/spec/reckon/app_spec.rb +21 -19
- data/spec/reckon/csv_parser_spec.rb +3 -3
- data/spec/reckon/date_column_spec.rb +12 -0
- data/spec/reckon/money_spec.rb +3 -3
- data/spec/reckon/options_spec.rb +17 -0
- data/spec/spec_helper.rb +6 -1
- metadata +15 -2
@@ -1,47 +1,52 @@
|
|
1
1
|
require 'matrix'
|
2
2
|
require 'set'
|
3
3
|
|
4
|
-
# Implementation of
|
5
|
-
#
|
4
|
+
# Implementation of cosine similarity using TF-IDF for vectorization.
|
5
|
+
#
|
6
|
+
# In information retrieval, tf–idf, short for term frequency–inverse document frequency,
|
7
|
+
# is a numerical statistic that is intended to reflect how important a word is to a
|
8
|
+
# document in a collection or corpus
|
9
|
+
#
|
10
|
+
# Cosine similarity is a measurement of how similar 2 documents are to each other.
|
11
|
+
#
|
12
|
+
# These weights and measures are used to suggest which account a transaction should be
|
13
|
+
# assigned to.
|
6
14
|
module Reckon
|
7
15
|
class CosineSimilarity
|
16
|
+
DocumentInfo = Struct.new(:tokens, :accounts)
|
17
|
+
|
8
18
|
# Creates a matcher with an empty corpus.
#
# options - kept in @options; this method does not read it.
def initialize(options)
  @options = options
  # tokens maps token => { account => count };
  # accounts maps account => { token => count } (both filled by add_document).
  @docs = DocumentInfo.new({}, {})
end
|
13
22
|
|
14
23
|
# Folds the tokens of one document into the corpus for the given account.
#
# account - key under which the token counts are accumulated.
# doc     - text passed to tokenize, which yields [token, count] pairs.
#
# Returns the [token, count] pairs produced by tokenize.
def add_document(account, doc)
  token_counts = tokenize(doc)
  LOGGER.info "doc tokens: #{token_counts}"

  token_counts.each do |token, count|
    # Index the count both ways: by token first, then by account first.
    per_account = (@docs.tokens[token] ||= Hash.new(0))
    per_account[account] += count

    per_token = (@docs.accounts[account] ||= Hash.new(0))
    per_token[token] += count
  end
end
|
24
35
|
|
25
36
|
# find most similar documents to query
|
26
37
|
def find_similar(query)
|
27
|
-
|
38
|
+
LOGGER.info "find_similar #{query}"
|
28
39
|
|
29
|
-
|
40
|
+
accounts = docs_to_check(query).map do |a|
|
41
|
+
[a, tfidf(@docs.accounts[a])]
|
42
|
+
end
|
30
43
|
|
31
|
-
|
32
|
-
suggestions = corpus_scores.map do |account, scores|
|
33
|
-
acct_vector = Vector.elements(scores, false)
|
44
|
+
q = tfidf(tokenize(query))
|
34
45
|
|
35
|
-
|
36
|
-
# similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
|
37
|
-
# exactly opposite
|
38
|
-
# see https://en.wikipedia.org/wiki/Cosine_similarity
|
39
|
-
# cos(theta) = (A . B) / (||A|| ||B||)
|
40
|
-
# where A . B is the "dot product" and ||A|| is the magnitude of A
|
41
|
-
# ruby has the 'matrix' library we can use to do these calculations.
|
46
|
+
suggestions = accounts.map do |a, d|
|
42
47
|
{
|
43
|
-
similarity:
|
44
|
-
account:
|
48
|
+
similarity: calc_similarity(q, d),
|
49
|
+
account: a
|
45
50
|
}
|
46
51
|
end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
|
47
52
|
|
@@ -52,50 +57,51 @@ module Reckon
|
|
52
57
|
|
53
58
|
private
|
54
59
|
|
55
|
-
def
|
56
|
-
|
57
|
-
|
58
|
-
corpus_scores = {}
|
59
|
-
query_scores = []
|
60
|
-
num_docs = @accounts.length
|
61
|
-
|
62
|
-
query_tokens.each do |n|
|
63
|
-
(token, _count) = n
|
64
|
-
next unless @tokens[token]
|
65
|
-
corpus = corpus.union(Set.new(@tokens[token].keys))
|
60
|
+
# Collects every account that shares at least one token with the query.
#
# query - text passed to tokenize.
#
# Returns a Set of account keys; empty when no query token is known.
def docs_to_check(query)
  candidates = Set.new

  tokenize(query).each do |token, _count|
    known = @docs.tokens[token]
    # Tokens never seen in the corpus contribute no accounts.
    candidates.merge(known.keys) if known
  end

  candidates
end
|
67
65
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
# if no other docs have token, ignore it
|
72
|
-
next unless @tokens[token]
|
66
|
+
# Scores every token of one document via calc_tf_idf.
#
# tokens - enumerable of token/count pairs (an array of [token, count] or a
#          token => count hash); its length is passed as the document size.
#
# Returns a hash of token => score.
def tfidf(tokens)
  doc_size = tokens.length
  corpus_size = @docs.accounts.length

  tokens.each_with_object({}) do |(token, count), scores|
    # Number of accounts containing the token; 0 when the token is unknown.
    doc_freq = @docs.tokens[token]&.length&.to_f || 0
    scores[token] = calc_tf_idf(count, doc_size, doc_freq, corpus_size)
  end
end
|
96
80
|
|
97
|
-
|
81
|
+
# Cosine similarity is used to compare how similar 2 documents are.
#
# see https://en.wikipedia.org/wiki/Cosine_similarity
# cos(theta) = (A . B) / (||A|| ||B||)
# where A . B is the "dot product" and ||A|| is the magnitude of A
#
# The variables A and B are built over the set of unique terms in q and d.
#
# For example, when q = "big red balloon" and d = "small green balloon" then the
# variables are (big,red,balloon,small,green) and a = (1,1,1,0,0) and b =
# (0,0,1,1,1).
#
# query - hash of token => tf/idf score
# doc   - hash of token => tf/idf score
#
# Returns a Float: the cosine of the angle between the two score vectors, or
# 0.0 when either vector has zero magnitude (empty or all-zero scores), since
# the cosine is undefined there and dividing would yield NaN.
def calc_similarity(query, doc)
  tokens = query.keys | doc.keys

  # A token missing from one side contributes a zero component on that side.
  a = Vector.elements(tokens.map { |n| query[n] || 0 }, false)
  b = Vector.elements(tokens.map { |n| doc[n] || 0 }, false)

  norm = a.magnitude * b.magnitude
  return 0.0 if norm.zero?

  a.inner_product(b) / norm
end
|
98
103
|
|
104
|
+
def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
|
99
105
|
# tf(t,d) = count of t in d / number of words in d
|
100
106
|
tf = token_count / num_words_in_doc.to_f
|
101
107
|
|
@@ -109,14 +115,13 @@ module Reckon
|
|
109
115
|
end
|
110
116
|
|
111
117
|
# Counts how often each token from mk_tokens occurs in str.
#
# Returns an array of [token, count] pairs, ordered by first occurrence.
def tokenize(str)
  counts = Hash.new(0)
  mk_tokens(str).each { |token| counts[token] += 1 }
  counts.to_a
end
|
117
122
|
|
118
123
|
# Splits str into lowercase tokens made of a-z, 0-9 and '.'.
#
# Apostrophes are dropped (so "joe's" becomes "joes"); every other character
# outside the token alphabet acts as a separator. Empty pieces, such as the one
# produced by a leading separator, are discarded.
def mk_tokens(str)
  normalized = str.downcase.tr(';', ' ').tr("'", '')
  normalized.split(/[^a-z0-9.]+/).reject { |piece| piece.empty? }
end
|
121
126
|
end
|
122
127
|
end
|
data/lib/reckon/date_column.rb
CHANGED
@@ -2,12 +2,13 @@ module Reckon
|
|
2
2
|
class DateColumn < Array
|
3
3
|
attr_accessor :endian_precedence
|
4
4
|
def initialize( arr = [], options = {} )
|
5
|
+
@options = options
|
5
6
|
arr.each do |value|
|
6
7
|
if options[:date_format]
|
7
8
|
begin
|
8
9
|
value = Date.strptime(value, options[:date_format])
|
9
10
|
rescue
|
10
|
-
puts "I'm having trouble parsing #{value} with the desired format: #{options[:date_format]}"
|
11
|
+
puts "I'm having trouble parsing '#{value}' with the desired format: #{options[:date_format]}"
|
11
12
|
exit 1
|
12
13
|
end
|
13
14
|
else
|
@@ -53,7 +54,7 @@ module Reckon
|
|
53
54
|
date = self.for(index)
|
54
55
|
return "" if date.nil?
|
55
56
|
|
56
|
-
date.
|
57
|
+
date.strftime(@options[:ledger_date_format] || '%Y-%m-%d')
|
57
58
|
end
|
58
59
|
|
59
60
|
def self.likelihood(entry)
|
data/lib/reckon/ledger_parser.rb
CHANGED
@@ -114,7 +114,7 @@ module Reckon
|
|
114
114
|
|
115
115
|
# Stores the options, picks the date format and parses the ledger.
#
# ledger  - passed straight to parse.
# options - hash; :ledger_date_format wins over :date_format, and
#           '%Y-%m-%d' is used when neither is set.
def initialize(ledger, options = {})
  @options = options

  csv_format = options[:date_format] || '%Y-%m-%d'
  @date_format = options[:ledger_date_format] || csv_format

  # @date_format is assigned before parsing starts.
  parse(ledger)
end
|
120
120
|
|
data/lib/reckon/money.rb
CHANGED
@@ -50,11 +50,18 @@ module Reckon
|
|
50
50
|
return @amount_raw[0] == '-' ? @amount_raw[1..-1] : "-#{@amount_raw}"
|
51
51
|
end
|
52
52
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
53
|
+
amt = pretty_amount(@amount * (negate ? -1 : 1))
|
54
|
+
amt = if @suffixed
|
55
|
+
"#{amt} #{@currency}"
|
56
|
+
else
|
57
|
+
amt.gsub(/^((-)|)(?=\d)/, "\\1#{@currency}")
|
58
|
+
end
|
59
|
+
|
60
|
+
return (@amount >= 0 ? " " : "") + amt
|
61
|
+
end
|
62
|
+
|
63
|
+
# Formats amount with two decimals and a comma between each group of three
# integer digits, e.g. 1234.5 => "1,234.50".
def pretty_amount(amount)
  # Work on the reversed string so the digit groups are counted from the right.
  backwards = format("%0.2f", amount).reverse
  grouped = backwards.gsub(/(\d{3})(?=\d)/, '\\1,')
  grouped.reverse
end
|
59
66
|
|
60
67
|
def parse(value, options = {})
|
@@ -0,0 +1,157 @@
|
|
1
|
+
module Reckon
  # Turns reckon's command line arguments into an options hash, prompting on
  # the terminal for the CSV file and bank account when they were not given.
  class Options
    # Lazily built HighLine prompt, so loading this class and parsing fully
    # specified arguments never needs a terminal prompt object.
    def self.cli
      @cli ||= HighLine.new
    end
    private_class_method :cli

    # Parses args and returns the resulting options hash.
    #
    # args  - array of command line arguments; recognised switches are removed
    #         from it (OptionParser#parse!).
    # stdin - IO read in full when the file is given as '-'.
    #
    # Returns a Hash keyed by symbols (:file, :bank_account, :output_file, ...).
    # Raises RuntimeError when '-' is used without --unattended, or when
    # --unattended is used without --account.
    # Exits the process for --help, --version, or an empty answer to the CSV
    # file prompt.
    def self.parse(args = ARGV, stdin = $stdin)
      options = {
        output_file: $stdout,
        default_into_account: 'Expenses:Unknown',
        default_outof_account: 'Income:Unknown'
      }

      parser = OptionParser.new do |opts|
        opts.banner = "Usage: Reckon.rb [options]"
        opts.separator ""

        opts.on("-f", "--file FILE", "The CSV file to parse") do |file|
          options[:file] = file
        end

        opts.on("-a", "--account NAME", "The Ledger Account this file is for") do |a|
          options[:bank_account] = a
        end

        opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
          options[:verbose] = v
        end

        opts.on("-i", "--inverse", "Use the negative of each amount") do |v|
          options[:inverse] = v
        end

        opts.on("-p", "--print-table", "Print out the parsed CSV in table form") do |p|
          options[:print_table] = p
        end

        opts.on("-o", "--output-file FILE", "The ledger file to append to") do |o|
          # NOTE(review): this handle is never closed here; presumably the
          # caller writes to it for the lifetime of the process.
          options[:output_file] = File.open(o, 'a')
        end

        opts.on("-l", "--learn-from FILE", "An existing ledger file to learn accounts from") do |l|
          options[:existing_ledger_file] = l
        end

        opts.on("", "--ignore-columns 1,2,5", "Columns to ignore, starts from 1") do |ignore|
          options[:ignore_columns] = ignore.split(",").map(&:to_i)
        end

        opts.on("", "--money-column 2", Integer, "Column number of the money column, starts from 1") do |col|
          options[:money_column] = col
        end

        opts.on("", "--raw-money", "Don't format money column (for stocks)") do |n|
          options[:raw] = n
        end

        opts.on("", "--date-column 3", Integer, "Column number of the date column, starts from 1") do |col|
          options[:date_column] = col
        end

        opts.on("", "--contains-header [N]", Integer, "Skip N header rows - default 1") do |hdr|
          # hdr is nil when the switch is given without N; fall back to the
          # documented default instead of nil.to_i (which is 0).
          options[:contains_header] = hdr || 1
        end

        opts.on("", "--csv-separator ','", "CSV separator (default ',')") do |sep|
          options[:csv_separator] = sep
        end

        opts.on("", "--comma-separates-cents", "Use comma to separate cents ($100,50 vs. $100.50)") do |c|
          options[:comma_separates_cents] = c
        end

        opts.on("", "--encoding 'UTF-8'", "Specify an encoding for the CSV file") do |e|
          options[:encoding] = e
        end

        opts.on("-c", "--currency '$'", "Currency symbol to use - default $ (ex £, EUR)") do |e|
          options[:currency] = e
        end

        opts.on("", "--date-format FORMAT", "CSV file date format (see `date` for format)") do |d|
          options[:date_format] = d
        end

        opts.on("", "--ledger-date-format FORMAT", "Ledger date format (see `date` for format)") do |d|
          options[:ledger_date_format] = d
        end

        opts.on("-u", "--unattended", "Don't ask questions and guess all the accounts automatically. Use with --learn-from or --account-tokens options.") do |n|
          options[:unattended] = n
        end

        opts.on("-t", "--account-tokens FILE", "YAML file with manually-assigned tokens for each account (see README)") do |a|
          options[:account_tokens_file] = a
        end

        opts.on("", "--table-output-file FILE") do |n|
          options[:table_output_file] = n
        end

        opts.on("", "--default-into-account NAME", "Default into account") do |a|
          options[:default_into_account] = a
        end

        opts.on("", "--default-outof-account NAME", "Default 'out of' account") do |a|
          options[:default_outof_account] = a
        end

        opts.on("", "--fail-on-unknown-account", "Fail on unmatched transactions.") do |n|
          options[:fail_on_unknown_account] = n
        end

        opts.on("", "--suffixed", "Append currency symbol as a suffix.") do |e|
          options[:suffixed] = e
        end

        opts.on_tail("-h", "--help", "Show this message") do
          puts opts
          exit
        end

        opts.on_tail("--version", "Show version") do
          puts VERSION
          exit
        end
      end

      parser.parse!(args)

      # '-' means the CSV comes from stdin, which leaves no terminal input for
      # interactive questions.
      if options[:file] == '-'
        unless options[:unattended]
          raise "--unattended is required to use STDIN as CSV source."
        end

        options[:string] = stdin.read
      end

      unless options[:file]
        options[:file] = cli.ask("What CSV file should I parse? ")
        # Only an empty answer is an error; a real file name falls through.
        if options[:file].empty?
          puts "\nYou must provide a CSV file to parse.\n"
          puts parser
          exit
        end
      end

      unless options[:bank_account]
        raise "Must specify --account in unattended mode" if options[:unattended]

        options[:bank_account] = cli.ask("What is this account named in Ledger?\n") do |q|
          q.readline = true
          q.validate = /^.{2,}$/
          q.default = "Assets:Bank:Checking"
        end
      end

      options
    end
  end
end
|
data/lib/reckon/version.rb
CHANGED
@@ -0,0 +1,52 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Rough accuracy check for Reckon::CosineSimilarity.
#
# Usage: cosine_training_and_test.rb LEDGER_FILE ACCOUNT [SEED]
#
# Takes the ledger entries that touch ACCOUNT, trains the matcher on a random
# ~90% of them, asks it to classify the rest, and prints how many of those
# test entries got a top suggestion that is one of the entry's own accounts.

require 'pp'

require 'reckon'

ledger_file = ARGV[0]
account = ARGV[1]
abort "usage: #{$PROGRAM_NAME} LEDGER_FILE ACCOUNT [SEED]" unless ledger_file && account

# Pass the printed seed back in as SEED to reproduce a split.
seed = ARGV[2] ? ARGV[2].to_i : Random.new_seed

ledger = Reckon::LedgerParser.new(File.read(ledger_file))
matcher = Reckon::CosineSimilarity.new({})

train = []
test = []

# True when one of the entry's postings is on the given account.
def has_account(account, entry)
  entry[:accounts].map { |a| a[:name] }.include?(account)
end

entries = ledger.entries.select { |e| has_account(account, e) }

# Split entry indexes into training and test sets.
r = Random.new(seed)
entries.length.times do |i|
  r.rand < 0.9 ? train << i : test << i
end

train.each do |i|
  entry = entries[i]
  entry[:accounts].each do |a|
    matcher.add_document(
      a[:name],
      [entry[:desc], a[:amount]].join(" ")
    )
  end
end

# Collect only the misclassified test entries. Indexing a result array by the
# entry index (instead of the position within `test`) would pad it with nils
# and inflate the "true" count.
misses = []
test.each do |i|
  entry = entries[i]
  matches = matcher.find_similar(
    entry[:desc] + " " + entry[:accounts][0][:amount].to_s
  )

  if !matches[0] || !has_account(matches[0][:account], entry)
    misses << [entry, matches]
  end
end

# pp misses
puts "using #{seed} as random seed"
puts "true: #{test.length - misses.length} false: #{misses.length}"
|