reckon 0.4.4 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +3 -0
- data/.ruby-version +1 -1
- data/.travis.yml +10 -2
- data/CHANGELOG.md +235 -0
- data/Gemfile +0 -1
- data/Gemfile.lock +73 -15
- data/README.md +12 -5
- data/lib/reckon.rb +13 -12
- data/lib/reckon/app.rb +94 -116
- data/lib/reckon/cosine_similarity.rb +122 -0
- data/lib/reckon/csv_parser.rb +116 -129
- data/lib/reckon/date_column.rb +60 -0
- data/lib/reckon/ledger_parser.rb +204 -30
- data/lib/reckon/logger.rb +4 -0
- data/lib/reckon/money.rb +6 -62
- data/lib/reckon/version.rb +3 -0
- data/reckon.gemspec +8 -5
- data/spec/data_fixtures/51-sample.csv +8 -0
- data/spec/data_fixtures/51-tokens.yml +9 -0
- data/spec/data_fixtures/73-sample.csv +2 -0
- data/spec/data_fixtures/73-tokens.yml +8 -0
- data/spec/data_fixtures/73-transactions.ledger +7 -0
- data/spec/data_fixtures/85-date-example.csv +2 -0
- data/spec/data_fixtures/austrian_example.csv +13 -0
- data/spec/data_fixtures/bom_utf8_file.csv +1 -0
- data/spec/data_fixtures/broker_canada_example.csv +12 -0
- data/spec/data_fixtures/chase.csv +9 -0
- data/spec/data_fixtures/danish_kroner_nordea_example.csv +6 -0
- data/spec/data_fixtures/english_date_example.csv +3 -0
- data/spec/data_fixtures/french_example.csv +9 -0
- data/spec/data_fixtures/german_date_example.csv +3 -0
- data/spec/data_fixtures/harder_date_example.csv +5 -0
- data/spec/data_fixtures/ing.csv +3 -0
- data/spec/data_fixtures/intuit_mint_example.csv +7 -0
- data/spec/data_fixtures/invalid_header_example.csv +6 -0
- data/spec/data_fixtures/inversed_credit_card.csv +16 -0
- data/spec/data_fixtures/nationwide.csv +4 -0
- data/spec/data_fixtures/simple.csv +2 -0
- data/spec/data_fixtures/some_other.csv +9 -0
- data/spec/data_fixtures/spanish_date_example.csv +3 -0
- data/spec/data_fixtures/suntrust.csv +7 -0
- data/spec/data_fixtures/test_money_column.csv +3 -0
- data/spec/data_fixtures/two_money_columns.csv +5 -0
- data/spec/data_fixtures/yyyymmdd_date_example.csv +1 -0
- data/spec/reckon/app_spec.rb +96 -34
- data/spec/reckon/csv_parser_spec.rb +185 -307
- data/spec/reckon/date_column_spec.rb +12 -13
- data/spec/reckon/ledger_parser_spec.rb +99 -9
- data/spec/reckon/money_spec.rb +42 -29
- data/spec/spec_helper.rb +22 -0
- metadata +85 -21
- data/CHANGES.md +0 -9
data/lib/reckon/app.rb
CHANGED
@@ -1,21 +1,20 @@
|
|
1
|
-
#coding: utf-8
|
1
|
+
# coding: utf-8
|
2
2
|
require 'pp'
|
3
3
|
require 'yaml'
|
4
4
|
|
5
5
|
module Reckon
|
6
6
|
class App
|
7
|
-
|
8
|
-
attr_accessor :options, :accounts, :tokens, :seen, :csv_parser, :regexps
|
7
|
+
attr_accessor :options, :seen, :csv_parser, :regexps, :matcher
|
9
8
|
|
10
9
|
def initialize(options = {})
|
10
|
+
LOGGER.level = Logger::INFO if options[:verbose]
|
11
11
|
self.options = options
|
12
|
-
self.tokens = {}
|
13
12
|
self.regexps = {}
|
14
|
-
self.accounts = {}
|
15
13
|
self.seen = {}
|
16
14
|
self.options[:currency] ||= '$'
|
17
15
|
options[:string] = File.read(options[:file]) unless options[:string]
|
18
16
|
@csv_parser = CSVParser.new( options )
|
17
|
+
@matcher = CosineSimilarity.new(options)
|
19
18
|
learn!
|
20
19
|
end
|
21
20
|
|
@@ -24,21 +23,44 @@ module Reckon
|
|
24
23
|
puts str
|
25
24
|
end
|
26
25
|
|
26
|
+
def learn!
|
27
|
+
learn_from_account_tokens(options[:account_tokens_file])
|
28
|
+
|
29
|
+
ledger_file = options[:existing_ledger_file]
|
30
|
+
return unless ledger_file
|
31
|
+
fail "#{ledger_file} doesn't exist!" unless File.exists?(ledger_file)
|
32
|
+
learn_from(File.read(ledger_file))
|
33
|
+
end
|
34
|
+
|
35
|
+
def learn_from_account_tokens(filename)
|
36
|
+
return unless filename
|
37
|
+
|
38
|
+
fail "#{filename} doesn't exist!" unless File.exists?(filename)
|
39
|
+
|
40
|
+
extract_account_tokens(YAML.load_file(filename)).each do |account, tokens|
|
41
|
+
tokens.each do |t|
|
42
|
+
if t.start_with?('/')
|
43
|
+
add_regexp(account, t)
|
44
|
+
else
|
45
|
+
@matcher.add_document(account, t)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
27
51
|
def learn_from(ledger)
|
28
52
|
LedgerParser.new(ledger).entries.each do |entry|
|
29
53
|
entry[:accounts].each do |account|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
seen[
|
54
|
+
str = [entry[:desc], account[:amount]].join(" ")
|
55
|
+
@matcher.add_document(account[:name], str) unless account[:name] == options[:bank_account]
|
56
|
+
pretty_date = entry[:date].iso8601
|
57
|
+
seen[pretty_date] ||= {}
|
58
|
+
seen[pretty_date][@csv_parser.pretty_money(account[:amount])] = true
|
34
59
|
end
|
35
60
|
end
|
36
61
|
end
|
37
62
|
|
38
|
-
|
39
|
-
seen[row[:pretty_date]] && seen[row[:pretty_date]][row[:pretty_money]]
|
40
|
-
end
|
41
|
-
|
63
|
+
# Add tokens from account_tokens_file to accounts
|
42
64
|
def extract_account_tokens(subtree, account = nil)
|
43
65
|
if subtree.nil?
|
44
66
|
puts "Warning: empty #{account} tree"
|
@@ -46,50 +68,26 @@ module Reckon
|
|
46
68
|
elsif subtree.is_a?(Array)
|
47
69
|
{ account => subtree }
|
48
70
|
else
|
49
|
-
at = subtree.map
|
50
|
-
|
51
|
-
|
52
|
-
end
|
53
|
-
|
54
|
-
def learn!
|
55
|
-
if options[:account_tokens_file]
|
56
|
-
fail "#{options[:account_tokens_file]} doesn't exist!" unless File.exists?(options[:account_tokens_file])
|
57
|
-
extract_account_tokens(YAML.load_file(options[:account_tokens_file])).each do |account, tokens|
|
58
|
-
tokens.each { |t| learn_about_account(account, t, true) }
|
71
|
+
at = subtree.map do |k, v|
|
72
|
+
merged_acct = [account, k].compact.join(':')
|
73
|
+
extract_account_tokens(v, merged_acct)
|
59
74
|
end
|
75
|
+
at.inject({}) { |memo, e| memo.merge!(e)}
|
60
76
|
end
|
61
|
-
return unless options[:existing_ledger_file]
|
62
|
-
fail "#{options[:existing_ledger_file]} doesn't exist!" unless File.exists?(options[:existing_ledger_file])
|
63
|
-
ledger_data = File.read(options[:existing_ledger_file])
|
64
|
-
learn_from(ledger_data)
|
65
77
|
end
|
66
78
|
|
67
|
-
def
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
when 'x' then options |= Regexp::EXTENDED
|
77
|
-
when 'i' then options |= Regexp::IGNORECASE
|
78
|
-
end
|
79
|
-
end
|
80
|
-
regexps[Regexp.new(match[1], options)] = account
|
81
|
-
else
|
82
|
-
tokenize(data).each do |token|
|
83
|
-
tokens[token] ||= {}
|
84
|
-
tokens[token][account] ||= 0
|
85
|
-
tokens[token][account] += 1
|
86
|
-
accounts[account] += 1
|
79
|
+
def add_regexp(account, regex_str)
|
80
|
+
# https://github.com/tenderlove/psych/blob/master/lib/psych/visitors/to_ruby.rb
|
81
|
+
match = regex_str.match(/^\/(.*)\/([ix]*)$/m)
|
82
|
+
fail "failed to parse regexp #{regex_str}" unless match
|
83
|
+
options = 0
|
84
|
+
(match[2] || '').split('').each do |option|
|
85
|
+
case option
|
86
|
+
when 'x' then options |= Regexp::EXTENDED
|
87
|
+
when 'i' then options |= Regexp::IGNORECASE
|
87
88
|
end
|
88
89
|
end
|
89
|
-
|
90
|
-
|
91
|
-
def tokenize(str)
|
92
|
-
str.downcase.split(/[\s\-]/)
|
90
|
+
regexps[Regexp.new(match[1], options)] = account
|
93
91
|
end
|
94
92
|
|
95
93
|
def walk_backwards
|
@@ -107,8 +105,7 @@ module Reckon
|
|
107
105
|
seen_anything_new = true
|
108
106
|
end
|
109
107
|
|
110
|
-
possible_answers =
|
111
|
-
possible_answers = weighted_account_match( row ).map! { |a| a[:account] } if possible_answers.empty?
|
108
|
+
possible_answers = suggest(row)
|
112
109
|
|
113
110
|
ledger = if row[:money] > 0
|
114
111
|
if options[:unattended]
|
@@ -156,15 +153,21 @@ module Reckon
|
|
156
153
|
end
|
157
154
|
end
|
158
155
|
|
159
|
-
def
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
156
|
+
def each_row_backwards
|
157
|
+
rows = []
|
158
|
+
(0...@csv_parser.columns.first.length).to_a.each do |index|
|
159
|
+
if @csv_parser.date_for(index).nil?
|
160
|
+
LOGGER.warn("Skipping row: '#{@csv_parser.row(index)}' that doesn't have a valid date")
|
161
|
+
next
|
162
|
+
end
|
163
|
+
rows << { :date => @csv_parser.date_for(index),
|
164
|
+
:pretty_date => @csv_parser.pretty_date_for(index),
|
165
|
+
:pretty_money => @csv_parser.pretty_money_for(index),
|
166
|
+
:pretty_money_negated => @csv_parser.pretty_money_for(index, :negate),
|
167
|
+
:money => @csv_parser.money_for(index),
|
168
|
+
:description => @csv_parser.description_for(index) }
|
169
|
+
end
|
170
|
+
rows.sort_by { |n| n[:date] }.each {|row| yield row }
|
168
171
|
end
|
169
172
|
|
170
173
|
def most_specific_regexp_match( row )
|
@@ -176,41 +179,9 @@ module Reckon
|
|
176
179
|
matches.sort_by! { |account, matched_text| matched_text.length }.map(&:first)
|
177
180
|
end
|
178
181
|
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
search_vector = []
|
184
|
-
account_vectors = {}
|
185
|
-
|
186
|
-
query_tokens.each do |token|
|
187
|
-
idf = Math.log((accounts.keys.length + 1) / ((tokens[token] || {}).keys.length.to_f + 1))
|
188
|
-
tf = 1.0 / query_tokens.length.to_f
|
189
|
-
search_vector << tf*idf
|
190
|
-
|
191
|
-
accounts.each do |account, total_terms|
|
192
|
-
tf = (tokens[token] && tokens[token][account]) ? tokens[token][account] / total_terms.to_f : 0
|
193
|
-
account_vectors[account] ||= []
|
194
|
-
account_vectors[account] << tf*idf
|
195
|
-
end
|
196
|
-
end
|
197
|
-
|
198
|
-
# Should I normalize the vectors? Probably unnecessary due to tf-idf and short documents.
|
199
|
-
|
200
|
-
account_vectors = account_vectors.to_a.map do |account, account_vector|
|
201
|
-
{ :cosine => (0...account_vector.length).to_a.inject(0) { |m, i| m + search_vector[i] * account_vector[i] },
|
202
|
-
:account => account }
|
203
|
-
end
|
204
|
-
account_vectors.sort! {|a, b| b[:cosine] <=> a[:cosine] }
|
205
|
-
|
206
|
-
# Return empty set if no accounts matched so that we can fallback to the defaults in the unattended mode
|
207
|
-
if options[:unattended]
|
208
|
-
if account_vectors.first && account_vectors.first[:account]
|
209
|
-
account_vectors = [] if account_vectors.first[:cosine] == 0
|
210
|
-
end
|
211
|
-
end
|
212
|
-
|
213
|
-
return account_vectors
|
182
|
+
def suggest(row)
|
183
|
+
most_specific_regexp_match(row) +
|
184
|
+
@matcher.find_similar(row[:description]).map { |n| n[:account] }
|
214
185
|
end
|
215
186
|
|
216
187
|
def ledger_format(row, line1, line2)
|
@@ -220,6 +191,21 @@ module Reckon
|
|
220
191
|
out
|
221
192
|
end
|
222
193
|
|
194
|
+
def output(ledger_line)
|
195
|
+
options[:output_file].puts ledger_line
|
196
|
+
options[:output_file].flush
|
197
|
+
end
|
198
|
+
|
199
|
+
def already_seen?(row)
|
200
|
+
seen[row[:pretty_date]] && seen[row[:pretty_date]][row[:pretty_money]]
|
201
|
+
end
|
202
|
+
|
203
|
+
def finish
|
204
|
+
options[:output_file].close unless options[:output_file] == STDOUT
|
205
|
+
interactive_output "Exiting."
|
206
|
+
exit
|
207
|
+
end
|
208
|
+
|
223
209
|
def output_table
|
224
210
|
output = Terminal::Table.new do |t|
|
225
211
|
t.headings = 'Date', 'Amount', 'Description'
|
@@ -230,21 +216,6 @@ module Reckon
|
|
230
216
|
interactive_output output
|
231
217
|
end
|
232
218
|
|
233
|
-
def each_row_backwards
|
234
|
-
rows = []
|
235
|
-
(0...@csv_parser.columns.first.length).to_a.each do |index|
|
236
|
-
rows << { :date => @csv_parser.date_for(index),
|
237
|
-
:pretty_date => @csv_parser.pretty_date_for(index),
|
238
|
-
:pretty_money => @csv_parser.pretty_money_for(index),
|
239
|
-
:pretty_money_negated => @csv_parser.pretty_money_for(index, :negate),
|
240
|
-
:money => @csv_parser.money_for(index),
|
241
|
-
:description => @csv_parser.description_for(index) }
|
242
|
-
end
|
243
|
-
rows.sort { |a, b| a[:date] <=> b[:date] }.each do |row|
|
244
|
-
yield row
|
245
|
-
end
|
246
|
-
end
|
247
|
-
|
248
219
|
def self.parse_opts(args = ARGV)
|
249
220
|
options = { :output_file => STDOUT }
|
250
221
|
parser = OptionParser.new do |opts|
|
@@ -255,7 +226,7 @@ module Reckon
|
|
255
226
|
options[:file] = file
|
256
227
|
end
|
257
228
|
|
258
|
-
opts.on("-a", "--account
|
229
|
+
opts.on("-a", "--account NAME", "The Ledger Account this file is for") do |a|
|
259
230
|
options[:bank_account] = a
|
260
231
|
end
|
261
232
|
|
@@ -283,6 +254,14 @@ module Reckon
|
|
283
254
|
options[:ignore_columns] = ignore.split(",").map { |i| i.to_i }
|
284
255
|
end
|
285
256
|
|
257
|
+
opts.on("", "--money-column 2", Integer, "Specify the money column instead of letting Reckon guess - the first column is column 1") do |column_number|
|
258
|
+
options[:money_column] = column_number
|
259
|
+
end
|
260
|
+
|
261
|
+
opts.on("", "--date-column 3", Integer, "Specify the date column instead of letting Reckon guess - the first column is column 1") do |column_number|
|
262
|
+
options[:date_column] = column_number
|
263
|
+
end
|
264
|
+
|
286
265
|
opts.on("", "--contains-header [N]", "The first row of the CSV is a header and should be skipped. Optionally add the number of rows to skip.") do |contains_header|
|
287
266
|
options[:contains_header] = 1
|
288
267
|
options[:contains_header] = contains_header.to_i if contains_header
|
@@ -316,11 +295,11 @@ module Reckon
|
|
316
295
|
options[:account_tokens_file] = a
|
317
296
|
end
|
318
297
|
|
319
|
-
opts.on("", "--default-into-account
|
298
|
+
opts.on("", "--default-into-account NAME", "Default into account") do |a|
|
320
299
|
options[:default_into_account] = a
|
321
300
|
end
|
322
301
|
|
323
|
-
opts.on("", "--default-outof-account
|
302
|
+
opts.on("", "--default-outof-account NAME", "Default 'out of' account") do |a|
|
324
303
|
options[:default_outof_account] = a
|
325
304
|
end
|
326
305
|
|
@@ -351,7 +330,6 @@ module Reckon
|
|
351
330
|
end
|
352
331
|
|
353
332
|
unless options[:bank_account]
|
354
|
-
|
355
333
|
fail "Please specify --account for the unattended mode" if options[:unattended]
|
356
334
|
|
357
335
|
options[:bank_account] = ask("What is the account name of this bank account in Ledger? ") do |q|
|
@@ -0,0 +1,122 @@
|
|
1
|
+
require 'matrix'
|
2
|
+
require 'set'
|
3
|
+
|
4
|
+
# Implementation of cosine similarity using TF-IDF for vectorization.
|
5
|
+
# Used to suggest which account a transaction should be assigned to
|
6
|
+
module Reckon
|
7
|
+
class CosineSimilarity
|
8
|
+
def initialize(options)
|
9
|
+
@options = options
|
10
|
+
@tokens = {}
|
11
|
+
@accounts = Hash.new(0)
|
12
|
+
end
|
13
|
+
|
14
|
+
def add_document(account, doc)
|
15
|
+
tokenize(doc).each do |n|
|
16
|
+
(token, count) = n
|
17
|
+
|
18
|
+
@tokens[token] ||= {}
|
19
|
+
@tokens[token][account] ||= 0
|
20
|
+
@tokens[token][account] += count
|
21
|
+
@accounts[account] += count
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
# find most similar documents to query
|
26
|
+
def find_similar(query)
|
27
|
+
(query_scores, corpus_scores) = td_idf_scores_for(query)
|
28
|
+
|
29
|
+
query_vector = Vector.elements(query_scores, false)
|
30
|
+
|
31
|
+
# For each doc, calculate the similarity to the query
|
32
|
+
suggestions = corpus_scores.map do |account, scores|
|
33
|
+
acct_vector = Vector.elements(scores, false)
|
34
|
+
|
35
|
+
acct_query_dp = acct_vector.inner_product(query_vector)
|
36
|
+
# similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
|
37
|
+
# exactly opposite
|
38
|
+
# see https://en.wikipedia.org/wiki/Cosine_similarity
|
39
|
+
# cos(theta) = (A . B) / (||A|| ||B||)
|
40
|
+
# where A . B is the "dot product" and ||A|| is the magnitude of A
|
41
|
+
# ruby has the 'matrix' library we can use to do these calculations.
|
42
|
+
{
|
43
|
+
similarity: acct_query_dp / (acct_vector.magnitude * query_vector.magnitude),
|
44
|
+
account: account,
|
45
|
+
}
|
46
|
+
end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
|
47
|
+
|
48
|
+
LOGGER.info "most similar accounts: #{suggestions}"
|
49
|
+
|
50
|
+
return suggestions
|
51
|
+
end
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
def td_idf_scores_for(query)
|
56
|
+
query_tokens = tokenize(query)
|
57
|
+
corpus = Set.new
|
58
|
+
corpus_scores = {}
|
59
|
+
query_scores = []
|
60
|
+
num_docs = @accounts.length
|
61
|
+
|
62
|
+
query_tokens.each do |n|
|
63
|
+
(token, _count) = n
|
64
|
+
next unless @tokens[token]
|
65
|
+
corpus = corpus.union(Set.new(@tokens[token].keys))
|
66
|
+
end
|
67
|
+
|
68
|
+
query_tokens.each do |n|
|
69
|
+
(token, count) = n
|
70
|
+
|
71
|
+
# if no other docs have token, ignore it
|
72
|
+
next unless @tokens[token]
|
73
|
+
|
74
|
+
## First, calculate scores for our query as we're building scores for the corpus
|
75
|
+
query_scores << calc_tf_idf(
|
76
|
+
count,
|
77
|
+
query_tokens.length,
|
78
|
+
@tokens[token].length,
|
79
|
+
num_docs
|
80
|
+
)
|
81
|
+
|
82
|
+
## Next, calculate for the corpus, where our "account" is a document
|
83
|
+
corpus.each do |account|
|
84
|
+
corpus_scores[account] ||= []
|
85
|
+
|
86
|
+
corpus_scores[account] << calc_tf_idf(
|
87
|
+
(@tokens[token][account] || 0),
|
88
|
+
@accounts[account].to_f,
|
89
|
+
@tokens[token].length.to_f,
|
90
|
+
num_docs
|
91
|
+
)
|
92
|
+
end
|
93
|
+
end
|
94
|
+
[query_scores, corpus_scores]
|
95
|
+
end
|
96
|
+
|
97
|
+
def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
|
98
|
+
|
99
|
+
# tf(t,d) = count of t in d / number of words in d
|
100
|
+
tf = token_count / num_words_in_doc.to_f
|
101
|
+
|
102
|
+
# smooth idf weight
|
103
|
+
# see https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency_2
|
104
|
+
# df(t) = num of documents with term t in them
|
105
|
+
# idf(t) = log(N/(1 + df )) + 1
|
106
|
+
idf = Math.log(num_docs.to_f / (1 + df)) + 1
|
107
|
+
|
108
|
+
tf * idf
|
109
|
+
end
|
110
|
+
|
111
|
+
def tokenize(str)
|
112
|
+
mk_tokens(str).inject(Hash.new(0)) do |memo, n|
|
113
|
+
memo[n] += 1
|
114
|
+
memo
|
115
|
+
end.to_a
|
116
|
+
end
|
117
|
+
|
118
|
+
def mk_tokens(str)
|
119
|
+
str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
data/lib/reckon/csv_parser.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
#coding: utf-8
|
2
|
-
require 'pp'
|
3
2
|
|
4
3
|
module Reckon
|
5
4
|
class CSVParser
|
@@ -8,43 +7,74 @@ module Reckon
|
|
8
7
|
def initialize(options = {})
|
9
8
|
self.options = options
|
10
9
|
self.options[:currency] ||= '$'
|
11
|
-
parse
|
10
|
+
@csv_data = parse(options[:string] || File.read(options[:file]), options[:file])
|
12
11
|
filter_csv
|
13
12
|
detect_columns
|
14
13
|
end
|
15
14
|
|
16
|
-
def
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
15
|
+
def columns
|
16
|
+
@columns ||=
|
17
|
+
begin
|
18
|
+
last_row_length = nil
|
19
|
+
csv_data.inject([]) do |memo, row|
|
20
|
+
unless row.all? { |i| i.nil? || i.length == 0 }
|
21
|
+
row.each_with_index do |entry, index|
|
22
|
+
memo[index] ||= []
|
23
|
+
memo[index] << (entry || '').strip
|
24
|
+
end
|
25
|
+
last_row_length = row.length
|
26
|
+
end
|
27
|
+
memo
|
28
|
+
end
|
21
29
|
end
|
22
|
-
@columns = new_columns
|
23
|
-
end
|
24
30
|
end
|
25
31
|
|
26
|
-
def
|
27
|
-
@
|
32
|
+
def date_for(index)
|
33
|
+
@date_column.for(index)
|
28
34
|
end
|
29
35
|
|
30
|
-
def
|
31
|
-
|
36
|
+
def pretty_date_for(index)
|
37
|
+
@date_column.pretty_for( index )
|
38
|
+
end
|
39
|
+
|
40
|
+
def money_for(index)
|
41
|
+
@money_column[index]
|
32
42
|
end
|
33
43
|
|
34
44
|
def pretty_money(amount, negate = false)
|
35
45
|
Money.new( amount, @options ).pretty( negate )
|
36
46
|
end
|
37
47
|
|
38
|
-
def
|
39
|
-
|
40
|
-
|
48
|
+
def pretty_money_for(index, negate = false)
|
49
|
+
money = money_for(index)
|
50
|
+
return 0 if money.nil?
|
41
51
|
|
42
|
-
|
43
|
-
@date_column.pretty_for( index )
|
52
|
+
money.pretty(negate)
|
44
53
|
end
|
45
54
|
|
46
55
|
def description_for(index)
|
47
|
-
description_column_indices.map { |i| columns[i][index]
|
56
|
+
description_column_indices.map { |i| columns[i][index].to_s.strip }
|
57
|
+
.reject(&:empty?)
|
58
|
+
.join("; ")
|
59
|
+
.squeeze(" ")
|
60
|
+
.gsub(/(;\s+){2,}/, '')
|
61
|
+
.strip
|
62
|
+
end
|
63
|
+
|
64
|
+
def row(index)
|
65
|
+
csv_data[index].join(", ")
|
66
|
+
end
|
67
|
+
|
68
|
+
private
|
69
|
+
|
70
|
+
def filter_csv
|
71
|
+
if options[:ignore_columns]
|
72
|
+
new_columns = []
|
73
|
+
columns.each_with_index do |column, index|
|
74
|
+
new_columns << column unless options[:ignore_columns].include?(index + 1)
|
75
|
+
end
|
76
|
+
@columns = new_columns
|
77
|
+
end
|
48
78
|
end
|
49
79
|
|
50
80
|
def evaluate_columns(cols)
|
@@ -88,48 +118,24 @@ module Reckon
|
|
88
118
|
results << { :index => index, :money_score => money_score, :date_score => date_score }
|
89
119
|
end
|
90
120
|
|
91
|
-
|
92
|
-
end
|
121
|
+
results.sort_by! { |n| -n[:money_score] }
|
93
122
|
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
.map { |m| m.amount.to_s }
|
101
|
-
output_columns << new_column
|
102
|
-
elsif index == b
|
103
|
-
# skip
|
104
|
-
else
|
105
|
-
output_columns << column
|
106
|
-
end
|
123
|
+
# check if it looks like a 2-column file with a balance field
|
124
|
+
if results.length >= 3 && results[1][:money_score] + results[2][:money_score] >= results[0][:money_score]
|
125
|
+
results[1][:is_money_column] = true
|
126
|
+
results[2][:is_money_column] = true
|
127
|
+
else
|
128
|
+
results[0][:is_money_column] = true
|
107
129
|
end
|
108
|
-
output_columns
|
109
|
-
end
|
110
130
|
|
111
|
-
|
112
|
-
merged_columns = merge_columns( id1, id2 )
|
113
|
-
results, found_likely_money_column = evaluate_columns( merged_columns )
|
114
|
-
if !found_likely_money_column
|
115
|
-
new_res = results.find { |el| el[:index] == id1 }
|
116
|
-
old_res1 = unmerged_results.find { |el| el[:index] == id1 }
|
117
|
-
old_res2 = unmerged_results.find { |el| el[:index] == id2 }
|
118
|
-
if new_res[:money_score] > old_res1[:money_score] &&
|
119
|
-
new_res[:money_score] > old_res2[:money_score]
|
120
|
-
found_likely_money_column = true
|
121
|
-
end
|
122
|
-
end
|
123
|
-
[results, found_likely_money_column]
|
131
|
+
return results.sort_by { |n| n[:index] }
|
124
132
|
end
|
125
133
|
|
126
|
-
def found_double_money_column(
|
127
|
-
self.money_column_indices = [
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
puts "please report this issue to us so we can take a look!\n"
|
132
|
-
end
|
134
|
+
def found_double_money_column(id1, id2)
|
135
|
+
self.money_column_indices = [id1, id2]
|
136
|
+
puts "It looks like this CSV has two seperate columns for money, one of which shows positive"
|
137
|
+
puts "changes and one of which shows negative changes. If this is true, great. Otherwise,"
|
138
|
+
puts "please report this issue to us so we can take a look!\n"
|
133
139
|
end
|
134
140
|
|
135
141
|
# Some csv files negative/positive amounts are indicated in separate account
|
@@ -159,100 +165,81 @@ module Reckon
|
|
159
165
|
end
|
160
166
|
|
161
167
|
def detect_columns
|
162
|
-
results
|
163
|
-
self.money_column_indices = [ results.sort { |a, b| b[:money_score] <=> a[:money_score] }.first[:index] ]
|
164
|
-
|
165
|
-
if !found_likely_money_column
|
166
|
-
found_likely_double_money_columns = false
|
167
|
-
0.upto(columns.length - 2) do |i|
|
168
|
-
if MoneyColumn.new( columns[i] ).merge!( MoneyColumn.new( columns[i+1] ) )
|
169
|
-
_, found_likely_double_money_columns = evaluate_columns(merge_columns(i, i+1))
|
170
|
-
if found_likely_double_money_columns
|
171
|
-
found_double_money_column( i, i + 1 )
|
172
|
-
break
|
173
|
-
end
|
174
|
-
end
|
175
|
-
end
|
176
|
-
|
177
|
-
if !found_likely_double_money_columns
|
178
|
-
0.upto(columns.length - 2) do |i|
|
179
|
-
if MoneyColumn.new( columns[i] ).merge!( MoneyColumn.new( columns[i+1] ) )
|
180
|
-
# Try a more specific test
|
181
|
-
_, found_likely_double_money_columns = evaluate_two_money_columns( columns, i, i+1, results )
|
182
|
-
if found_likely_double_money_columns
|
183
|
-
found_double_money_column( i, i + 1 )
|
184
|
-
break
|
185
|
-
end
|
186
|
-
end
|
187
|
-
end
|
188
|
-
end
|
168
|
+
results = evaluate_columns(columns)
|
189
169
|
|
190
|
-
|
191
|
-
|
170
|
+
if options[:money_column]
|
171
|
+
self.money_column_indices = [ options[:money_column] - 1 ]
|
172
|
+
else
|
173
|
+
self.money_column_indices = results.select { |n| n[:is_money_column] }.map { |n| n[:index] }
|
174
|
+
if self.money_column_indices.length == 1
|
175
|
+
puts "Using column #{money_column_indices.first + 1} as the money column. Use --money-colum to specify a different one."
|
176
|
+
elsif self.money_column_indices.length == 2
|
177
|
+
found_double_money_column(*self.money_column_indices)
|
178
|
+
else
|
179
|
+
puts "Unable to determine a money column, use --money-column to specify the column reckon should use."
|
192
180
|
end
|
193
181
|
end
|
194
182
|
|
195
|
-
results.reject! {|i| money_column_indices.include?(i[:index]) }
|
196
|
-
|
197
|
-
|
198
|
-
|
183
|
+
results.reject! { |i| money_column_indices.include?(i[:index]) }
|
184
|
+
if options[:date_column]
|
185
|
+
@date_column_index = options[:date_column] - 1
|
186
|
+
else
|
187
|
+
# sort by highest score followed by lowest index
|
188
|
+
@date_column_index = results.max_by { |n| [n[:date_score], -n[:index]] }[:index]
|
189
|
+
end
|
190
|
+
results.reject! { |i| i[:index] == date_column_index }
|
191
|
+
@date_column = DateColumn.new(columns[date_column_index], @options)
|
199
192
|
|
200
|
-
|
201
|
-
|
202
|
-
@options )
|
193
|
+
@money_column = MoneyColumn.new(columns[money_column_indices[0]], @options)
|
194
|
+
if money_column_indices.length == 1
|
203
195
|
detect_sign_column if @money_column.positive?
|
204
196
|
else
|
205
|
-
@money_column
|
206
|
-
@options )
|
207
|
-
@money_column.merge!(
|
208
|
-
MoneyColumn.new( columns[money_column_indices[1]], @options ) )
|
197
|
+
@money_column.merge! MoneyColumn.new(columns[money_column_indices[1]], @options)
|
209
198
|
end
|
210
199
|
|
211
200
|
self.description_column_indices = results.map { |i| i[:index] }
|
212
201
|
end
|
213
202
|
|
214
|
-
def
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
end
|
226
|
-
memo
|
227
|
-
end
|
203
|
+
def parse(data, filename=nil)
|
204
|
+
# Use force_encoding to convert the string to utf-8 with as few invalid characters
|
205
|
+
# as possible.
|
206
|
+
data.force_encoding(try_encoding(data, filename))
|
207
|
+
data = data.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
|
208
|
+
data.sub!("\xEF\xBB\xBF", '') # strip byte order marker, if it exists
|
209
|
+
|
210
|
+
rows = []
|
211
|
+
data.each_line.with_index do |line, i|
|
212
|
+
next if i < (options[:contains_header] || 0)
|
213
|
+
rows << CSV.parse_line(line, col_sep: options[:csv_separator] || ',')
|
228
214
|
end
|
215
|
+
|
216
|
+
rows
|
229
217
|
end
|
230
218
|
|
231
|
-
def
|
232
|
-
|
219
|
+
def try_encoding(data, filename = nil)
|
220
|
+
encoding = try_encoding_from_file(filename)
|
233
221
|
|
234
|
-
|
235
|
-
|
236
|
-
csv_engine = CSV
|
237
|
-
else
|
238
|
-
csv_engine = FasterCSV
|
239
|
-
end
|
222
|
+
cd = CharDet.detect(data)
|
223
|
+
encoding ||= cd['encoding']
|
240
224
|
|
241
|
-
|
242
|
-
if options[:contains_header]
|
243
|
-
options[:contains_header].times { csv_data.shift }
|
244
|
-
end
|
245
|
-
csv_data
|
246
|
-
end
|
225
|
+
encoding ||= 'BINARY'
|
247
226
|
|
248
|
-
|
227
|
+
LOGGER.info("suggested file encoding: #{encoding}")
|
249
228
|
|
250
|
-
|
251
|
-
@settings
|
229
|
+
options[:encoding] || encoding
|
252
230
|
end
|
253
231
|
|
254
|
-
def
|
255
|
-
|
232
|
+
def try_encoding_from_file(filename = nil)
|
233
|
+
return unless filename
|
234
|
+
|
235
|
+
m = nil
|
236
|
+
os = Gem::Platform.local.os
|
237
|
+
if os == 'linux'
|
238
|
+
m = `file -i #{filename}`.match(/charset=(\S+)/)
|
239
|
+
elsif os == 'darwin'
|
240
|
+
m = `file -I #{filename}`.match(/charset=(\S+)/)
|
241
|
+
end
|
242
|
+
m && m[1]
|
256
243
|
end
|
257
244
|
end
|
258
245
|
end
|