reckon 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.ruby-version +1 -1
- data/CHANGELOG.md +55 -1
- data/Gemfile.lock +1 -5
- data/README.md +1 -1
- data/lib/reckon.rb +7 -9
- data/lib/reckon/app.rb +140 -69
- data/lib/reckon/cosine_similarity.rb +92 -89
- data/lib/reckon/csv_parser.rb +70 -113
- data/lib/reckon/date_column.rb +60 -0
- data/lib/reckon/ledger_parser.rb +11 -1
- data/lib/reckon/logger.rb +4 -0
- data/lib/reckon/money.rb +4 -59
- data/lib/reckon/version.rb +3 -0
- data/reckon.gemspec +3 -3
- data/spec/data_fixtures/51-sample.csv +8 -0
- data/spec/data_fixtures/51-tokens.yml +9 -0
- data/spec/data_fixtures/85-date-example.csv +2 -0
- data/spec/data_fixtures/test_money_column.csv +3 -0
- data/spec/reckon/app_spec.rb +32 -2
- data/spec/reckon/csv_parser_spec.rb +129 -129
- data/spec/reckon/date_column_spec.rb +12 -13
- data/spec/reckon/ledger_parser_spec.rb +42 -5
- data/spec/reckon/money_spec.rb +42 -29
- data/spec/spec_helper.rb +19 -0
- metadata +12 -19
@@ -1,119 +1,122 @@
|
|
1
1
|
require 'matrix'
|
2
|
+
require 'set'
|
2
3
|
|
3
4
|
# Implementation of consine similarity using TF-IDF for vectorization.
|
4
5
|
# Used to suggest which account a transaction should be assigned to
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
def add_document(account, doc)
|
13
|
-
tokenize(doc).each do |n|
|
14
|
-
(token, count) = n
|
15
|
-
|
16
|
-
@tokens[token] ||= {}
|
17
|
-
@tokens[token][account] ||= 0
|
18
|
-
@tokens[token][account] += count
|
19
|
-
@accounts[account] += count
|
6
|
+
module Reckon
|
7
|
+
class CosineSimilarity
|
8
|
+
def initialize(options)
|
9
|
+
@options = options
|
10
|
+
@tokens = {}
|
11
|
+
@accounts = Hash.new(0)
|
20
12
|
end
|
21
|
-
end
|
22
|
-
|
23
|
-
# find most similar documents to query
|
24
|
-
def find_similar(query)
|
25
|
-
(query_scores, corpus_scores) = td_idf_scores_for(query)
|
26
13
|
|
27
|
-
|
14
|
+
def add_document(account, doc)
|
15
|
+
tokenize(doc).each do |n|
|
16
|
+
(token, count) = n
|
28
17
|
|
29
|
-
|
30
|
-
|
31
|
-
|
18
|
+
@tokens[token] ||= {}
|
19
|
+
@tokens[token][account] ||= 0
|
20
|
+
@tokens[token][account] += count
|
21
|
+
@accounts[account] += count
|
22
|
+
end
|
23
|
+
end
|
32
24
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
# see https://en.wikipedia.org/wiki/Cosine_similarity
|
37
|
-
# cos(theta) = (A . B) / (||A|| ||B||)
|
38
|
-
# where A . B is the "dot product" and ||A|| is the magnitude of A
|
39
|
-
# ruby has the 'matrix' library we can use to do these calculations.
|
40
|
-
{
|
41
|
-
similarity: acct_query_dp / (acct_vector.magnitude * query_vector.magnitude),
|
42
|
-
account: account,
|
43
|
-
}
|
44
|
-
end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
|
25
|
+
# find most similar documents to query
|
26
|
+
def find_similar(query)
|
27
|
+
(query_scores, corpus_scores) = td_idf_scores_for(query)
|
45
28
|
|
46
|
-
|
29
|
+
query_vector = Vector.elements(query_scores, false)
|
47
30
|
|
48
|
-
|
49
|
-
|
31
|
+
# For each doc, calculate the similarity to the query
|
32
|
+
suggestions = corpus_scores.map do |account, scores|
|
33
|
+
acct_vector = Vector.elements(scores, false)
|
50
34
|
|
51
|
-
|
35
|
+
acct_query_dp = acct_vector.inner_product(query_vector)
|
36
|
+
# similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
|
37
|
+
# exactly opposite
|
38
|
+
# see https://en.wikipedia.org/wiki/Cosine_similarity
|
39
|
+
# cos(theta) = (A . B) / (||A|| ||B||)
|
40
|
+
# where A . B is the "dot product" and ||A|| is the magnitude of A
|
41
|
+
# ruby has the 'matrix' library we can use to do these calculations.
|
42
|
+
{
|
43
|
+
similarity: acct_query_dp / (acct_vector.magnitude * query_vector.magnitude),
|
44
|
+
account: account,
|
45
|
+
}
|
46
|
+
end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
|
52
47
|
|
53
|
-
|
54
|
-
query_tokens = tokenize(query)
|
55
|
-
corpus = Set.new
|
56
|
-
corpus_scores = {}
|
57
|
-
query_scores = []
|
58
|
-
num_docs = @accounts.length
|
48
|
+
LOGGER.info "most similar accounts: #{suggestions}"
|
59
49
|
|
60
|
-
|
61
|
-
(token, _count) = n
|
62
|
-
next unless @tokens[token]
|
63
|
-
corpus = corpus.union(Set.new(@tokens[token].keys))
|
50
|
+
return suggestions
|
64
51
|
end
|
65
52
|
|
66
|
-
|
67
|
-
(token, count) = n
|
53
|
+
private
|
68
54
|
|
69
|
-
|
70
|
-
|
55
|
+
def td_idf_scores_for(query)
|
56
|
+
query_tokens = tokenize(query)
|
57
|
+
corpus = Set.new
|
58
|
+
corpus_scores = {}
|
59
|
+
query_scores = []
|
60
|
+
num_docs = @accounts.length
|
61
|
+
|
62
|
+
query_tokens.each do |n|
|
63
|
+
(token, _count) = n
|
64
|
+
next unless @tokens[token]
|
65
|
+
corpus = corpus.union(Set.new(@tokens[token].keys))
|
66
|
+
end
|
71
67
|
|
72
|
-
|
73
|
-
|
74
|
-
count,
|
75
|
-
query_tokens.length,
|
76
|
-
@tokens[token].length,
|
77
|
-
num_docs
|
78
|
-
)
|
68
|
+
query_tokens.each do |n|
|
69
|
+
(token, count) = n
|
79
70
|
|
80
|
-
|
81
|
-
|
82
|
-
corpus_scores[account] ||= []
|
71
|
+
# if no other docs have token, ignore it
|
72
|
+
next unless @tokens[token]
|
83
73
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
74
|
+
## First, calculate scores for our query as we're building scores for the corpus
|
75
|
+
query_scores << calc_tf_idf(
|
76
|
+
count,
|
77
|
+
query_tokens.length,
|
78
|
+
@tokens[token].length,
|
88
79
|
num_docs
|
89
80
|
)
|
81
|
+
|
82
|
+
## Next, calculate for the corpus, where our "account" is a document
|
83
|
+
corpus.each do |account|
|
84
|
+
corpus_scores[account] ||= []
|
85
|
+
|
86
|
+
corpus_scores[account] << calc_tf_idf(
|
87
|
+
(@tokens[token][account] || 0),
|
88
|
+
@accounts[account].to_f,
|
89
|
+
@tokens[token].length.to_f,
|
90
|
+
num_docs
|
91
|
+
)
|
92
|
+
end
|
90
93
|
end
|
94
|
+
[query_scores, corpus_scores]
|
91
95
|
end
|
92
|
-
[query_scores, corpus_scores]
|
93
|
-
end
|
94
96
|
|
95
|
-
|
97
|
+
def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
|
96
98
|
|
97
|
-
|
98
|
-
|
99
|
+
# tf(t,d) = count of t in d / number of words in d
|
100
|
+
tf = token_count / num_words_in_doc.to_f
|
99
101
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
102
|
+
# smooth idf weight
|
103
|
+
# see https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency_2
|
104
|
+
# df(t) = num of documents with term t in them
|
105
|
+
# idf(t) = log(N/(1 + df )) + 1
|
106
|
+
idf = Math.log(num_docs.to_f / (1 + df)) + 1
|
105
107
|
|
106
|
-
|
107
|
-
|
108
|
+
tf * idf
|
109
|
+
end
|
108
110
|
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
end
|
111
|
+
def tokenize(str)
|
112
|
+
mk_tokens(str).inject(Hash.new(0)) do |memo, n|
|
113
|
+
memo[n] += 1
|
114
|
+
memo
|
115
|
+
end.to_a
|
116
|
+
end
|
116
117
|
|
117
|
-
def mk_tokens(str)
|
118
|
-
|
118
|
+
def mk_tokens(str)
|
119
|
+
str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
|
120
|
+
end
|
121
|
+
end
|
119
122
|
end
|
data/lib/reckon/csv_parser.rb
CHANGED
@@ -12,38 +12,69 @@ module Reckon
|
|
12
12
|
detect_columns
|
13
13
|
end
|
14
14
|
|
15
|
-
def
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
15
|
+
def columns
|
16
|
+
@columns ||=
|
17
|
+
begin
|
18
|
+
last_row_length = nil
|
19
|
+
csv_data.inject([]) do |memo, row|
|
20
|
+
unless row.all? { |i| i.nil? || i.length == 0 }
|
21
|
+
row.each_with_index do |entry, index|
|
22
|
+
memo[index] ||= []
|
23
|
+
memo[index] << (entry || '').strip
|
24
|
+
end
|
25
|
+
last_row_length = row.length
|
26
|
+
end
|
27
|
+
memo
|
28
|
+
end
|
20
29
|
end
|
21
|
-
@columns = new_columns
|
22
|
-
end
|
23
30
|
end
|
24
31
|
|
25
|
-
def
|
26
|
-
@
|
32
|
+
def date_for(index)
|
33
|
+
@date_column.for(index)
|
27
34
|
end
|
28
35
|
|
29
|
-
def
|
30
|
-
|
36
|
+
def pretty_date_for(index)
|
37
|
+
@date_column.pretty_for( index )
|
38
|
+
end
|
39
|
+
|
40
|
+
def money_for(index)
|
41
|
+
@money_column[index]
|
31
42
|
end
|
32
43
|
|
33
44
|
def pretty_money(amount, negate = false)
|
34
45
|
Money.new( amount, @options ).pretty( negate )
|
35
46
|
end
|
36
47
|
|
37
|
-
def
|
38
|
-
|
39
|
-
|
48
|
+
def pretty_money_for(index, negate = false)
|
49
|
+
money = money_for(index)
|
50
|
+
return 0 if money.nil?
|
40
51
|
|
41
|
-
|
42
|
-
@date_column.pretty_for( index )
|
52
|
+
money.pretty(negate)
|
43
53
|
end
|
44
54
|
|
45
55
|
def description_for(index)
|
46
|
-
description_column_indices.map { |i| columns[i][index]
|
56
|
+
description_column_indices.map { |i| columns[i][index].to_s.strip }
|
57
|
+
.reject(&:empty?)
|
58
|
+
.join("; ")
|
59
|
+
.squeeze(" ")
|
60
|
+
.gsub(/(;\s+){2,}/, '')
|
61
|
+
.strip
|
62
|
+
end
|
63
|
+
|
64
|
+
def row(index)
|
65
|
+
csv_data[index].join(", ")
|
66
|
+
end
|
67
|
+
|
68
|
+
private
|
69
|
+
|
70
|
+
def filter_csv
|
71
|
+
if options[:ignore_columns]
|
72
|
+
new_columns = []
|
73
|
+
columns.each_with_index do |column, index|
|
74
|
+
new_columns << column unless options[:ignore_columns].include?(index + 1)
|
75
|
+
end
|
76
|
+
@columns = new_columns
|
77
|
+
end
|
47
78
|
end
|
48
79
|
|
49
80
|
def evaluate_columns(cols)
|
@@ -87,48 +118,24 @@ module Reckon
|
|
87
118
|
results << { :index => index, :money_score => money_score, :date_score => date_score }
|
88
119
|
end
|
89
120
|
|
90
|
-
|
91
|
-
end
|
121
|
+
results.sort_by! { |n| -n[:money_score] }
|
92
122
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
.map { |m| m.amount.to_s }
|
100
|
-
output_columns << new_column
|
101
|
-
elsif index == b
|
102
|
-
# skip
|
103
|
-
else
|
104
|
-
output_columns << column
|
105
|
-
end
|
123
|
+
# check if it looks like a 2-column file with a balance field
|
124
|
+
if results.length >= 3 && results[1][:money_score] + results[2][:money_score] >= results[0][:money_score]
|
125
|
+
results[1][:is_money_column] = true
|
126
|
+
results[2][:is_money_column] = true
|
127
|
+
else
|
128
|
+
results[0][:is_money_column] = true
|
106
129
|
end
|
107
|
-
output_columns
|
108
|
-
end
|
109
130
|
|
110
|
-
|
111
|
-
merged_columns = merge_columns( id1, id2 )
|
112
|
-
results, found_likely_money_column = evaluate_columns( merged_columns )
|
113
|
-
if !found_likely_money_column
|
114
|
-
new_res = results.find { |el| el[:index] == id1 }
|
115
|
-
old_res1 = unmerged_results.find { |el| el[:index] == id1 }
|
116
|
-
old_res2 = unmerged_results.find { |el| el[:index] == id2 }
|
117
|
-
if new_res[:money_score] > old_res1[:money_score] &&
|
118
|
-
new_res[:money_score] > old_res2[:money_score]
|
119
|
-
found_likely_money_column = true
|
120
|
-
end
|
121
|
-
end
|
122
|
-
[results, found_likely_money_column]
|
131
|
+
return results.sort_by { |n| n[:index] }
|
123
132
|
end
|
124
133
|
|
125
|
-
def found_double_money_column(
|
126
|
-
self.money_column_indices = [
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
puts "please report this issue to us so we can take a look!\n"
|
131
|
-
end
|
134
|
+
def found_double_money_column(id1, id2)
|
135
|
+
self.money_column_indices = [id1, id2]
|
136
|
+
puts "It looks like this CSV has two seperate columns for money, one of which shows positive"
|
137
|
+
puts "changes and one of which shows negative changes. If this is true, great. Otherwise,"
|
138
|
+
puts "please report this issue to us so we can take a look!\n"
|
132
139
|
end
|
133
140
|
|
134
141
|
# Some csv files negative/positive amounts are indicated in separate account
|
@@ -158,41 +165,18 @@ module Reckon
|
|
158
165
|
end
|
159
166
|
|
160
167
|
def detect_columns
|
161
|
-
results
|
168
|
+
results = evaluate_columns(columns)
|
169
|
+
|
162
170
|
if options[:money_column]
|
163
|
-
found_likely_money_column = true
|
164
171
|
self.money_column_indices = [ options[:money_column] - 1 ]
|
165
172
|
else
|
166
|
-
self.money_column_indices =
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
_, found_likely_double_money_columns = evaluate_columns(merge_columns(i, i+1))
|
174
|
-
if found_likely_double_money_columns
|
175
|
-
found_double_money_column( i, i + 1 )
|
176
|
-
break
|
177
|
-
end
|
178
|
-
end
|
179
|
-
end
|
180
|
-
|
181
|
-
if !found_likely_double_money_columns
|
182
|
-
0.upto(columns.length - 2) do |i|
|
183
|
-
if MoneyColumn.new( columns[i] ).merge!( MoneyColumn.new( columns[i+1] ) )
|
184
|
-
# Try a more specific test
|
185
|
-
_, found_likely_double_money_columns = evaluate_two_money_columns( columns, i, i+1, results )
|
186
|
-
if found_likely_double_money_columns
|
187
|
-
found_double_money_column( i, i + 1 )
|
188
|
-
break
|
189
|
-
end
|
190
|
-
end
|
191
|
-
end
|
192
|
-
end
|
193
|
-
|
194
|
-
if !found_likely_double_money_columns && !settings[:testing]
|
195
|
-
puts "I didn't find a high-likelyhood money column, but I'm taking my best guess with column #{money_column_indices.first + 1}."
|
173
|
+
self.money_column_indices = results.select { |n| n[:is_money_column] }.map { |n| n[:index] }
|
174
|
+
if self.money_column_indices.length == 1
|
175
|
+
puts "Using column #{money_column_indices.first + 1} as the money column. Use --money-colum to specify a different one."
|
176
|
+
elsif self.money_column_indices.length == 2
|
177
|
+
found_double_money_column(*self.money_column_indices)
|
178
|
+
else
|
179
|
+
puts "Unable to determine a money column, use --money-column to specify the column reckon should use."
|
196
180
|
end
|
197
181
|
end
|
198
182
|
|
@@ -216,23 +200,6 @@ module Reckon
|
|
216
200
|
self.description_column_indices = results.map { |i| i[:index] }
|
217
201
|
end
|
218
202
|
|
219
|
-
def columns
|
220
|
-
@columns ||= begin
|
221
|
-
last_row_length = nil
|
222
|
-
csv_data.inject([]) do |memo, row|
|
223
|
-
# fail "Input CSV must have consistent row lengths." if last_row_length && row.length != last_row_length
|
224
|
-
unless row.all? { |i| i.nil? || i.length == 0 }
|
225
|
-
row.each_with_index do |entry, index|
|
226
|
-
memo[index] ||= []
|
227
|
-
memo[index] << (entry || '').strip
|
228
|
-
end
|
229
|
-
last_row_length = row.length
|
230
|
-
end
|
231
|
-
memo
|
232
|
-
end
|
233
|
-
end
|
234
|
-
end
|
235
|
-
|
236
203
|
def parse(data, filename=nil)
|
237
204
|
# Use force_encoding to convert the string to utf-8 with as few invalid characters
|
238
205
|
# as possible.
|
@@ -274,15 +241,5 @@ module Reckon
|
|
274
241
|
end
|
275
242
|
m && m[1]
|
276
243
|
end
|
277
|
-
|
278
|
-
@settings = { :testing => false }
|
279
|
-
|
280
|
-
def self.settings
|
281
|
-
@settings
|
282
|
-
end
|
283
|
-
|
284
|
-
def settings
|
285
|
-
self.class.settings
|
286
|
-
end
|
287
244
|
end
|
288
245
|
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module Reckon
|
2
|
+
class DateColumn < Array
|
3
|
+
attr_accessor :endian_precedence
|
4
|
+
def initialize( arr = [], options = {} )
|
5
|
+
arr.each do |value|
|
6
|
+
if options[:date_format]
|
7
|
+
begin
|
8
|
+
value = Date.strptime(value, options[:date_format])
|
9
|
+
rescue
|
10
|
+
puts "I'm having trouble parsing #{value} with the desired format: #{options[:date_format]}"
|
11
|
+
exit 1
|
12
|
+
end
|
13
|
+
else
|
14
|
+
value = [$1, $2, $3].join("/") if value =~ /^(\d{4})(\d{2})(\d{2})\d+\[\d+\:GMT\]$/ # chase format
|
15
|
+
value = [$3, $2, $1].join("/") if value =~ /^(\d{2})\.(\d{2})\.(\d{4})$/ # german format
|
16
|
+
value = [$3, $2, $1].join("/") if value =~ /^(\d{2})\-(\d{2})\-(\d{4})$/ # nordea format
|
17
|
+
value = [$1, $2, $3].join("/") if value =~ /^(\d{4})\-(\d{2})\-(\d{2})$/ # yyyy-mm-dd format
|
18
|
+
value = [$1, $2, $3].join("/") if value =~ /^(\d{4})(\d{2})(\d{2})/ # yyyymmdd format
|
19
|
+
|
20
|
+
|
21
|
+
unless @endian_precedence # Try to detect endian_precedence
|
22
|
+
reg_match = value.match( /^(\d\d)\/(\d\d)\/\d\d\d?\d?/ )
|
23
|
+
# If first one is not \d\d/\d\d/\d\d\d?\d set it to default
|
24
|
+
if !reg_match
|
25
|
+
@endian_precedence = [:middle, :little]
|
26
|
+
elsif reg_match[1].to_i > 12
|
27
|
+
@endian_precedence = [:little]
|
28
|
+
elsif reg_match[2].to_i > 12
|
29
|
+
@endian_precedence = [:middle]
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
self.push( value )
|
34
|
+
end
|
35
|
+
# if endian_precedence still nil, raise error
|
36
|
+
unless @endian_precedence || options[:date_format]
|
37
|
+
raise( "Unable to determine date format. Please specify using --date-format" )
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
def for( index )
|
42
|
+
value = self.at( index )
|
43
|
+
guess = Chronic.parse(value, :context => :past,
|
44
|
+
:endian_precedence => @endian_precedence )
|
45
|
+
if guess.to_i < 953236800 && value =~ /\//
|
46
|
+
guess = Chronic.parse((value.split("/")[0...-1] + [(2000 + value.split("/").last.to_i).to_s]).join("/"), :context => :past,
|
47
|
+
:endian_precedence => @endian_precedence)
|
48
|
+
end
|
49
|
+
guess && guess.to_date
|
50
|
+
end
|
51
|
+
|
52
|
+
def pretty_for(index)
|
53
|
+
date = self.for(index)
|
54
|
+
return "" if date.nil?
|
55
|
+
|
56
|
+
date.iso8601
|
57
|
+
end
|
58
|
+
|
59
|
+
end
|
60
|
+
end
|