reckon 0.4.4 → 0.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +5 -5
  2. data/.gitignore +3 -0
  3. data/.ruby-version +1 -1
  4. data/.travis.yml +10 -2
  5. data/CHANGELOG.md +235 -0
  6. data/Gemfile +0 -1
  7. data/Gemfile.lock +73 -15
  8. data/README.md +12 -5
  9. data/lib/reckon.rb +13 -12
  10. data/lib/reckon/app.rb +94 -116
  11. data/lib/reckon/cosine_similarity.rb +122 -0
  12. data/lib/reckon/csv_parser.rb +116 -129
  13. data/lib/reckon/date_column.rb +60 -0
  14. data/lib/reckon/ledger_parser.rb +204 -30
  15. data/lib/reckon/logger.rb +4 -0
  16. data/lib/reckon/money.rb +6 -62
  17. data/lib/reckon/version.rb +3 -0
  18. data/reckon.gemspec +8 -5
  19. data/spec/data_fixtures/51-sample.csv +8 -0
  20. data/spec/data_fixtures/51-tokens.yml +9 -0
  21. data/spec/data_fixtures/73-sample.csv +2 -0
  22. data/spec/data_fixtures/73-tokens.yml +8 -0
  23. data/spec/data_fixtures/73-transactions.ledger +7 -0
  24. data/spec/data_fixtures/85-date-example.csv +2 -0
  25. data/spec/data_fixtures/austrian_example.csv +13 -0
  26. data/spec/data_fixtures/bom_utf8_file.csv +1 -0
  27. data/spec/data_fixtures/broker_canada_example.csv +12 -0
  28. data/spec/data_fixtures/chase.csv +9 -0
  29. data/spec/data_fixtures/danish_kroner_nordea_example.csv +6 -0
  30. data/spec/data_fixtures/english_date_example.csv +3 -0
  31. data/spec/data_fixtures/french_example.csv +9 -0
  32. data/spec/data_fixtures/german_date_example.csv +3 -0
  33. data/spec/data_fixtures/harder_date_example.csv +5 -0
  34. data/spec/data_fixtures/ing.csv +3 -0
  35. data/spec/data_fixtures/intuit_mint_example.csv +7 -0
  36. data/spec/data_fixtures/invalid_header_example.csv +6 -0
  37. data/spec/data_fixtures/inversed_credit_card.csv +16 -0
  38. data/spec/data_fixtures/nationwide.csv +4 -0
  39. data/spec/data_fixtures/simple.csv +2 -0
  40. data/spec/data_fixtures/some_other.csv +9 -0
  41. data/spec/data_fixtures/spanish_date_example.csv +3 -0
  42. data/spec/data_fixtures/suntrust.csv +7 -0
  43. data/spec/data_fixtures/test_money_column.csv +3 -0
  44. data/spec/data_fixtures/two_money_columns.csv +5 -0
  45. data/spec/data_fixtures/yyyymmdd_date_example.csv +1 -0
  46. data/spec/reckon/app_spec.rb +96 -34
  47. data/spec/reckon/csv_parser_spec.rb +185 -307
  48. data/spec/reckon/date_column_spec.rb +12 -13
  49. data/spec/reckon/ledger_parser_spec.rb +99 -9
  50. data/spec/reckon/money_spec.rb +42 -29
  51. data/spec/spec_helper.rb +22 -0
  52. metadata +85 -21
  53. data/CHANGES.md +0 -9
data/lib/reckon/app.rb

@@ -1,21 +1,20 @@
-#coding: utf-8
+# coding: utf-8
 require 'pp'
 require 'yaml'
 
 module Reckon
   class App
-    VERSION = "Reckon 0.4.4"
-    attr_accessor :options, :accounts, :tokens, :seen, :csv_parser, :regexps
+    attr_accessor :options, :seen, :csv_parser, :regexps, :matcher
 
     def initialize(options = {})
+      LOGGER.level = Logger::INFO if options[:verbose]
      self.options = options
-      self.tokens = {}
      self.regexps = {}
-      self.accounts = {}
      self.seen = {}
      self.options[:currency] ||= '$'
      options[:string] = File.read(options[:file]) unless options[:string]
      @csv_parser = CSVParser.new( options )
+      @matcher = CosineSimilarity.new(options)
      learn!
    end
 
@@ -24,21 +23,44 @@ module Reckon
       puts str
     end
 
+    def learn!
+      learn_from_account_tokens(options[:account_tokens_file])
+
+      ledger_file = options[:existing_ledger_file]
+      return unless ledger_file
+      fail "#{ledger_file} doesn't exist!" unless File.exists?(ledger_file)
+      learn_from(File.read(ledger_file))
+    end
+
+    def learn_from_account_tokens(filename)
+      return unless filename
+
+      fail "#{filename} doesn't exist!" unless File.exists?(filename)
+
+      extract_account_tokens(YAML.load_file(filename)).each do |account, tokens|
+        tokens.each do |t|
+          if t.start_with?('/')
+            add_regexp(account, t)
+          else
+            @matcher.add_document(account, t)
+          end
+        end
+      end
+    end
+
     def learn_from(ledger)
       LedgerParser.new(ledger).entries.each do |entry|
         entry[:accounts].each do |account|
-          learn_about_account( account[:name],
-                               [entry[:desc], account[:amount]].join(" ") ) unless account[:name] == options[:bank_account]
-          seen[entry[:date]] ||= {}
-          seen[entry[:date]][@csv_parser.pretty_money(account[:amount])] = true
+          str = [entry[:desc], account[:amount]].join(" ")
+          @matcher.add_document(account[:name], str) unless account[:name] == options[:bank_account]
+          pretty_date = entry[:date].iso8601
+          seen[pretty_date] ||= {}
+          seen[pretty_date][@csv_parser.pretty_money(account[:amount])] = true
         end
       end
     end
 
-    def already_seen?(row)
-      seen[row[:pretty_date]] && seen[row[:pretty_date]][row[:pretty_money]]
-    end
-
+    # Add tokens from account_tokens_file to accounts
     def extract_account_tokens(subtree, account = nil)
       if subtree.nil?
         puts "Warning: empty #{account} tree"
@@ -46,50 +68,26 @@ module Reckon
       elsif subtree.is_a?(Array)
         { account => subtree }
       else
-        at = subtree.map { |k, v| extract_account_tokens(v, [account, k].compact.join(':')) }
-        at.inject({}) { |k, v| k = k.merge(v)}
-      end
-    end
-
-    def learn!
-      if options[:account_tokens_file]
-        fail "#{options[:account_tokens_file]} doesn't exist!" unless File.exists?(options[:account_tokens_file])
-        extract_account_tokens(YAML.load_file(options[:account_tokens_file])).each do |account, tokens|
-          tokens.each { |t| learn_about_account(account, t, true) }
+        at = subtree.map do |k, v|
+          merged_acct = [account, k].compact.join(':')
+          extract_account_tokens(v, merged_acct)
         end
+        at.inject({}) { |memo, e| memo.merge!(e)}
       end
-      return unless options[:existing_ledger_file]
-      fail "#{options[:existing_ledger_file]} doesn't exist!" unless File.exists?(options[:existing_ledger_file])
-      ledger_data = File.read(options[:existing_ledger_file])
-      learn_from(ledger_data)
     end
 
-    def learn_about_account(account, data, parse_regexps = false)
-      accounts[account] ||= 0
-      if parse_regexps && data.start_with?('/')
-        # https://github.com/tenderlove/psych/blob/master/lib/psych/visitors/to_ruby.rb
-        match = data.match(/^\/(.*)\/([ix]*)$/m)
-        fail "failed to parse regexp #{data}" unless match
-        options = 0
-        (match[2] || '').split('').each do |option|
-          case option
-          when 'x' then options |= Regexp::EXTENDED
-          when 'i' then options |= Regexp::IGNORECASE
-          end
-        end
-        regexps[Regexp.new(match[1], options)] = account
-      else
-        tokenize(data).each do |token|
-          tokens[token] ||= {}
-          tokens[token][account] ||= 0
-          tokens[token][account] += 1
-          accounts[account] += 1
+    def add_regexp(account, regex_str)
+      # https://github.com/tenderlove/psych/blob/master/lib/psych/visitors/to_ruby.rb
+      match = regex_str.match(/^\/(.*)\/([ix]*)$/m)
+      fail "failed to parse regexp #{regex_str}" unless match
+      options = 0
+      (match[2] || '').split('').each do |option|
+        case option
+        when 'x' then options |= Regexp::EXTENDED
+        when 'i' then options |= Regexp::IGNORECASE
        end
      end
-    end
-
-    def tokenize(str)
-      str.downcase.split(/[\s\-]/)
+      regexps[Regexp.new(match[1], options)] = account
    end
 
    def walk_backwards
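
Note: the account-tokens flow above (learn_from_account_tokens, extract_account_tokens, add_regexp) reads a nested YAML mapping whose leaf arrays hold either plain token strings or /regexp/-style entries with optional i/x flags. A minimal illustrative tokens file (account names and tokens are invented for this sketch, not taken from the package) could look like:

    Expenses:
      Groceries:
        - WHOLEFDS
        - /trader\s+joes?/i
    Income:
      Salary:
        - ACME PAYROLL

extract_account_tokens flattens the nesting into keys such as Expenses:Groceries; plain tokens are fed to the CosineSimilarity matcher via add_document, while entries beginning with a slash are compiled by add_regexp and matched later through most_specific_regexp_match.
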
@@ -107,8 +105,7 @@ module Reckon
           seen_anything_new = true
         end
 
-        possible_answers = most_specific_regexp_match(row)
-        possible_answers = weighted_account_match( row ).map! { |a| a[:account] } if possible_answers.empty?
+        possible_answers = suggest(row)
 
         ledger = if row[:money] > 0
           if options[:unattended]
@@ -156,15 +153,21 @@ module Reckon
       end
     end
 
-    def finish
-      options[:output_file].close unless options[:output_file] == STDOUT
-      interactive_output "Exiting."
-      exit
-    end
-
-    def output(ledger_line)
-      options[:output_file].puts ledger_line
-      options[:output_file].flush
+    def each_row_backwards
+      rows = []
+      (0...@csv_parser.columns.first.length).to_a.each do |index|
+        if @csv_parser.date_for(index).nil?
+          LOGGER.warn("Skipping row: '#{@csv_parser.row(index)}' that doesn't have a valid date")
+          next
+        end
+        rows << { :date => @csv_parser.date_for(index),
+                  :pretty_date => @csv_parser.pretty_date_for(index),
+                  :pretty_money => @csv_parser.pretty_money_for(index),
+                  :pretty_money_negated => @csv_parser.pretty_money_for(index, :negate),
+                  :money => @csv_parser.money_for(index),
+                  :description => @csv_parser.description_for(index) }
+      end
+      rows.sort_by { |n| n[:date] }.each {|row| yield row }
     end
 
     def most_specific_regexp_match( row )
@@ -176,41 +179,9 @@ module Reckon
       matches.sort_by! { |account, matched_text| matched_text.length }.map(&:first)
     end
 
-    # Weigh accounts by how well they match the row
-    def weighted_account_match( row )
-      query_tokens = tokenize(row[:description])
-
-      search_vector = []
-      account_vectors = {}
-
-      query_tokens.each do |token|
-        idf = Math.log((accounts.keys.length + 1) / ((tokens[token] || {}).keys.length.to_f + 1))
-        tf = 1.0 / query_tokens.length.to_f
-        search_vector << tf*idf
-
-        accounts.each do |account, total_terms|
-          tf = (tokens[token] && tokens[token][account]) ? tokens[token][account] / total_terms.to_f : 0
-          account_vectors[account] ||= []
-          account_vectors[account] << tf*idf
-        end
-      end
-
-      # Should I normalize the vectors? Probably unnecessary due to tf-idf and short documents.
-
-      account_vectors = account_vectors.to_a.map do |account, account_vector|
-        { :cosine => (0...account_vector.length).to_a.inject(0) { |m, i| m + search_vector[i] * account_vector[i] },
-          :account => account }
-      end
-      account_vectors.sort! {|a, b| b[:cosine] <=> a[:cosine] }
-
-      # Return empty set if no accounts matched so that we can fallback to the defaults in the unattended mode
-      if options[:unattended]
-        if account_vectors.first && account_vectors.first[:account]
-          account_vectors = [] if account_vectors.first[:cosine] == 0
-        end
-      end
-
-      return account_vectors
+    def suggest(row)
+      most_specific_regexp_match(row) +
+        @matcher.find_similar(row[:description]).map { |n| n[:account] }
     end
 
     def ledger_format(row, line1, line2)
@@ -220,6 +191,21 @@ module Reckon
       out
     end
 
+    def output(ledger_line)
+      options[:output_file].puts ledger_line
+      options[:output_file].flush
+    end
+
+    def already_seen?(row)
+      seen[row[:pretty_date]] && seen[row[:pretty_date]][row[:pretty_money]]
+    end
+
+    def finish
+      options[:output_file].close unless options[:output_file] == STDOUT
+      interactive_output "Exiting."
+      exit
+    end
+
     def output_table
       output = Terminal::Table.new do |t|
         t.headings = 'Date', 'Amount', 'Description'
@@ -230,21 +216,6 @@ module Reckon
       interactive_output output
     end
 
-    def each_row_backwards
-      rows = []
-      (0...@csv_parser.columns.first.length).to_a.each do |index|
-        rows << { :date => @csv_parser.date_for(index),
-                  :pretty_date => @csv_parser.pretty_date_for(index),
-                  :pretty_money => @csv_parser.pretty_money_for(index),
-                  :pretty_money_negated => @csv_parser.pretty_money_for(index, :negate),
-                  :money => @csv_parser.money_for(index),
-                  :description => @csv_parser.description_for(index) }
-      end
-      rows.sort { |a, b| a[:date] <=> b[:date] }.each do |row|
-        yield row
-      end
-    end
-
     def self.parse_opts(args = ARGV)
       options = { :output_file => STDOUT }
       parser = OptionParser.new do |opts|
@@ -255,7 +226,7 @@ module Reckon
           options[:file] = file
         end
 
-        opts.on("-a", "--account name", "The Ledger Account this file is for") do |a|
+        opts.on("-a", "--account NAME", "The Ledger Account this file is for") do |a|
          options[:bank_account] = a
        end
 
@@ -283,6 +254,14 @@ module Reckon
           options[:ignore_columns] = ignore.split(",").map { |i| i.to_i }
         end
 
+        opts.on("", "--money-column 2", Integer, "Specify the money column instead of letting Reckon guess - the first column is column 1") do |column_number|
+          options[:money_column] = column_number
+        end
+
+        opts.on("", "--date-column 3", Integer, "Specify the date column instead of letting Reckon guess - the first column is column 1") do |column_number|
+          options[:date_column] = column_number
+        end
+
         opts.on("", "--contains-header [N]", "The first row of the CSV is a header and should be skipped. Optionally add the number of rows to skip.") do |contains_header|
          options[:contains_header] = 1
          options[:contains_header] = contains_header.to_i if contains_header
@@ -316,11 +295,11 @@ module Reckon
           options[:account_tokens_file] = a
         end
 
-        opts.on("", "--default-into-account name", "Default into account") do |a|
+        opts.on("", "--default-into-account NAME", "Default into account") do |a|
          options[:default_into_account] = a
        end
 
-        opts.on("", "--default-outof-account name", "Default 'out of' account") do |a|
+        opts.on("", "--default-outof-account NAME", "Default 'out of' account") do |a|
          options[:default_outof_account] = a
        end
 
@@ -351,7 +330,6 @@ module Reckon
       end
 
       unless options[:bank_account]
-
         fail "Please specify --account for the unattended mode" if options[:unattended]
 
         options[:bank_account] = ask("What is the account name of this bank account in Ledger? ") do |q|
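
Note: combining the renamed option placeholders (--account NAME) with the new --money-column and --date-column flags, a hypothetical unattended run could look like the following (file name and column numbers are invented; -f is reckon's existing flag for the input CSV and is outside these hunks):

    reckon -f bank.csv --contains-header \
      --account Assets:Checking --unattended \
      --money-column 2 --date-column 1

As the check at the end of parse_opts shows, --unattended refuses to run without an explicit --account.
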
data/lib/reckon/cosine_similarity.rb

@@ -0,0 +1,122 @@
+require 'matrix'
+require 'set'
+
+# Implementation of consine similarity using TF-IDF for vectorization.
+# Used to suggest which account a transaction should be assigned to
+module Reckon
+  class CosineSimilarity
+    def initialize(options)
+      @options = options
+      @tokens = {}
+      @accounts = Hash.new(0)
+    end
+
+    def add_document(account, doc)
+      tokenize(doc).each do |n|
+        (token, count) = n
+
+        @tokens[token] ||= {}
+        @tokens[token][account] ||= 0
+        @tokens[token][account] += count
+        @accounts[account] += count
+      end
+    end
+
+    # find most similar documents to query
+    def find_similar(query)
+      (query_scores, corpus_scores) = td_idf_scores_for(query)
+
+      query_vector = Vector.elements(query_scores, false)
+
+      # For each doc, calculate the similarity to the query
+      suggestions = corpus_scores.map do |account, scores|
+        acct_vector = Vector.elements(scores, false)
+
+        acct_query_dp = acct_vector.inner_product(query_vector)
+        # similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
+        # exactly opposite
+        # see https://en.wikipedia.org/wiki/Cosine_similarity
+        # cos(theta) = (A . B) / (||A|| ||B||)
+        # where A . B is the "dot product" and ||A|| is the magnitude of A
+        # ruby has the 'matrix' library we can use to do these calculations.
+        {
+          similarity: acct_query_dp / (acct_vector.magnitude * query_vector.magnitude),
+          account: account,
+        }
+      end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
+
+      LOGGER.info "most similar accounts: #{suggestions}"
+
+      return suggestions
+    end
+
+    private
+
+    def td_idf_scores_for(query)
+      query_tokens = tokenize(query)
+      corpus = Set.new
+      corpus_scores = {}
+      query_scores = []
+      num_docs = @accounts.length
+
+      query_tokens.each do |n|
+        (token, _count) = n
+        next unless @tokens[token]
+        corpus = corpus.union(Set.new(@tokens[token].keys))
+      end
+
+      query_tokens.each do |n|
+        (token, count) = n
+
+        # if no other docs have token, ignore it
+        next unless @tokens[token]
+
+        ## First, calculate scores for our query as we're building scores for the corpus
+        query_scores << calc_tf_idf(
+          count,
+          query_tokens.length,
+          @tokens[token].length,
+          num_docs
+        )
+
+        ## Next, calculate for the corpus, where our "account" is a document
+        corpus.each do |account|
+          corpus_scores[account] ||= []
+
+          corpus_scores[account] << calc_tf_idf(
+            (@tokens[token][account] || 0),
+            @accounts[account].to_f,
+            @tokens[token].length.to_f,
+            num_docs
+          )
+        end
+      end
+      [query_scores, corpus_scores]
+    end
+
+    def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
+
+      # tf(t,d) = count of t in d / number of words in d
+      tf = token_count / num_words_in_doc.to_f
+
+      # smooth idf weight
+      # see https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency_2
+      # df(t) = num of documents with term t in them
+      # idf(t) = log(N/(1 + df )) + 1
+      idf = Math.log(num_docs.to_f / (1 + df)) + 1
+
+      tf * idf
+    end
+
+    def tokenize(str)
+      mk_tokens(str).inject(Hash.new(0)) do |memo, n|
+        memo[n] += 1
+        memo
+      end.to_a
+    end
+
+    def mk_tokens(str)
+      str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
+    end
+  end
+end
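
Note: a minimal sketch of how this matcher behaves on its own (account names and transaction descriptions are invented; it assumes the reckon library, and therefore the LOGGER constant, is loaded):

    require 'reckon'

    matcher = Reckon::CosineSimilarity.new({})
    matcher.add_document('Expenses:Groceries', 'WHOLEFDS HOUSTON TX 24.57')
    matcher.add_document('Expenses:Fuel', 'SHELL OIL 57444 41.02')

    matcher.find_similar('WHOLEFDS PARKWAY 13.11')
    # => [{ :similarity => ..., :account => 'Expenses:Groceries' }]

find_similar only returns accounts with positive cosine similarity, sorted most similar first, which is what App#suggest appends after any regexp matches.
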
data/lib/reckon/csv_parser.rb

@@ -1,5 +1,4 @@
 #coding: utf-8
-require 'pp'
 
 module Reckon
   class CSVParser
@@ -8,43 +7,74 @@ module Reckon
     def initialize(options = {})
       self.options = options
       self.options[:currency] ||= '$'
-      parse
+      @csv_data = parse(options[:string] || File.read(options[:file]), options[:file])
       filter_csv
       detect_columns
     end
 
-    def filter_csv
-      if options[:ignore_columns]
-        new_columns = []
-        columns.each_with_index do |column, index|
-          new_columns << column unless options[:ignore_columns].include?(index + 1)
+    def columns
+      @columns ||=
+        begin
+          last_row_length = nil
+          csv_data.inject([]) do |memo, row|
+            unless row.all? { |i| i.nil? || i.length == 0 }
+              row.each_with_index do |entry, index|
+                memo[index] ||= []
+                memo[index] << (entry || '').strip
+              end
+              last_row_length = row.length
+            end
+            memo
+          end
        end
-        @columns = new_columns
-      end
    end
 
-    def money_for(index)
-      @money_column[index]
+    def date_for(index)
+      @date_column.for(index)
    end
 
-    def pretty_money_for(index, negate = false)
-      money_for( index ).pretty( negate )
+    def pretty_date_for(index)
+      @date_column.pretty_for( index )
+    end
+
+    def money_for(index)
+      @money_column[index]
    end
 
    def pretty_money(amount, negate = false)
      Money.new( amount, @options ).pretty( negate )
    end
 
-    def date_for(index)
-      @date_column.for( index )
-    end
+    def pretty_money_for(index, negate = false)
+      money = money_for(index)
+      return 0 if money.nil?
 
-    def pretty_date_for(index)
-      @date_column.pretty_for( index )
+      money.pretty(negate)
    end
 
    def description_for(index)
-      description_column_indices.map { |i| columns[i][index] }.reject { |a| a.empty? }.join("; ").squeeze(" ").gsub(/(;\s+){2,}/, '').strip
+      description_column_indices.map { |i| columns[i][index].to_s.strip }
+        .reject(&:empty?)
+        .join("; ")
+        .squeeze(" ")
+        .gsub(/(;\s+){2,}/, '')
+        .strip
+    end
+
+    def row(index)
+      csv_data[index].join(", ")
+    end
+
+    private
+
+    def filter_csv
+      if options[:ignore_columns]
+        new_columns = []
+        columns.each_with_index do |column, index|
+          new_columns << column unless options[:ignore_columns].include?(index + 1)
+        end
+        @columns = new_columns
+      end
    end
 
    def evaluate_columns(cols)
@@ -88,48 +118,24 @@ module Reckon
         results << { :index => index, :money_score => money_score, :date_score => date_score }
       end
 
-      return [results, found_likely_money_column]
-    end
+      results.sort_by! { |n| -n[:money_score] }
 
-    def merge_columns(a, b)
-      output_columns = []
-      columns.each_with_index do |column, index|
-        if index == a
-          new_column = MoneyColumn.new( column )
-                         .merge!( MoneyColumn.new( columns[b] ) )
-                         .map { |m| m.amount.to_s }
-          output_columns << new_column
-        elsif index == b
-          # skip
-        else
-          output_columns << column
-        end
+      # check if it looks like a 2-column file with a balance field
+      if results.length >= 3 && results[1][:money_score] + results[2][:money_score] >= results[0][:money_score]
+        results[1][:is_money_column] = true
+        results[2][:is_money_column] = true
+      else
+        results[0][:is_money_column] = true
      end
-      output_columns
-    end
 
-    def evaluate_two_money_columns( columns, id1, id2, unmerged_results )
-      merged_columns = merge_columns( id1, id2 )
-      results, found_likely_money_column = evaluate_columns( merged_columns )
-      if !found_likely_money_column
-        new_res = results.find { |el| el[:index] == id1 }
-        old_res1 = unmerged_results.find { |el| el[:index] == id1 }
-        old_res2 = unmerged_results.find { |el| el[:index] == id2 }
-        if new_res[:money_score] > old_res1[:money_score] &&
-           new_res[:money_score] > old_res2[:money_score]
-          found_likely_money_column = true
-        end
-      end
-      [results, found_likely_money_column]
+      return results.sort_by { |n| n[:index] }
    end
 
-    def found_double_money_column( id1, id2 )
-      self.money_column_indices = [ id1, id2 ]
-      unless settings[:testing]
-        puts "It looks like this CSV has two seperate columns for money, one of which shows positive"
-        puts "changes and one of which shows negative changes. If this is true, great. Otherwise,"
-        puts "please report this issue to us so we can take a look!\n"
-      end
+    def found_double_money_column(id1, id2)
+      self.money_column_indices = [id1, id2]
+      puts "It looks like this CSV has two seperate columns for money, one of which shows positive"
+      puts "changes and one of which shows negative changes. If this is true, great. Otherwise,"
+      puts "please report this issue to us so we can take a look!\n"
    end
 
    # Some csv files negative/positive amounts are indicated in separate account
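
Note: the is_money_column marking above targets exports that carry separate debit and credit columns next to a running balance. In a file shaped like the following invented two-row sample, the balance column tends to get the single highest money score, but the debit and credit columns together match or beat it, so those two are flagged and later merged into one MoneyColumn:

    19/01/2014,COFFEE SHOP,3.00,,997.00
    20/01/2014,REFUND,,10.00,1007.00
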
@@ -159,100 +165,81 @@ module Reckon
     end
 
     def detect_columns
-      results, found_likely_money_column = evaluate_columns(columns)
-      self.money_column_indices = [ results.sort { |a, b| b[:money_score] <=> a[:money_score] }.first[:index] ]
-
-      if !found_likely_money_column
-        found_likely_double_money_columns = false
-        0.upto(columns.length - 2) do |i|
-          if MoneyColumn.new( columns[i] ).merge!( MoneyColumn.new( columns[i+1] ) )
-            _, found_likely_double_money_columns = evaluate_columns(merge_columns(i, i+1))
-            if found_likely_double_money_columns
-              found_double_money_column( i, i + 1 )
-              break
-            end
-          end
-        end
-
-        if !found_likely_double_money_columns
-          0.upto(columns.length - 2) do |i|
-            if MoneyColumn.new( columns[i] ).merge!( MoneyColumn.new( columns[i+1] ) )
-              # Try a more specific test
-              _, found_likely_double_money_columns = evaluate_two_money_columns( columns, i, i+1, results )
-              if found_likely_double_money_columns
-                found_double_money_column( i, i + 1 )
-                break
-              end
-            end
-          end
-        end
+      results = evaluate_columns(columns)
 
-        if !found_likely_double_money_columns && !settings[:testing]
-          puts "I didn't find a high-likelyhood money column, but I'm taking my best guess with column #{money_column_indices.first + 1}."
+      if options[:money_column]
+        self.money_column_indices = [ options[:money_column] - 1 ]
+      else
+        self.money_column_indices = results.select { |n| n[:is_money_column] }.map { |n| n[:index] }
+        if self.money_column_indices.length == 1
+          puts "Using column #{money_column_indices.first + 1} as the money column. Use --money-colum to specify a different one."
+        elsif self.money_column_indices.length == 2
+          found_double_money_column(*self.money_column_indices)
+        else
+          puts "Unable to determine a money column, use --money-column to specify the column reckon should use."
        end
      end
 
-      results.reject! {|i| money_column_indices.include?(i[:index]) }
-      self.date_column_index = results.sort { |a, b| b[:date_score] <=> a[:date_score] }.first[:index]
-      results.reject! {|i| i[:index] == date_column_index }
-      @date_column = DateColumn.new( columns[ self.date_column_index ], @options )
+      results.reject! { |i| money_column_indices.include?(i[:index]) }
+      if options[:date_column]
+        @date_column_index = options[:date_column] - 1
+      else
+        # sort by highest score followed by lowest index
+        @date_column_index = results.max_by { |n| [n[:date_score], -n[:index]] }[:index]
+      end
+      results.reject! { |i| i[:index] == date_column_index }
+      @date_column = DateColumn.new(columns[date_column_index], @options)
 
-      if ( money_column_indices.length == 1 )
-        @money_column = MoneyColumn.new( columns[money_column_indices[0]],
-                                         @options )
+      @money_column = MoneyColumn.new(columns[money_column_indices[0]], @options)
+      if money_column_indices.length == 1
        detect_sign_column if @money_column.positive?
      else
-        @money_column = MoneyColumn.new( columns[money_column_indices[0]],
-                                         @options )
-        @money_column.merge!(
-          MoneyColumn.new( columns[money_column_indices[1]], @options ) )
+        @money_column.merge! MoneyColumn.new(columns[money_column_indices[1]], @options)
      end
 
      self.description_column_indices = results.map { |i| i[:index] }
    end
 
-    def columns
-      @columns ||= begin
-        last_row_length = nil
-        csv_data.inject([]) do |memo, row|
-          # fail "Input CSV must have consistent row lengths." if last_row_length && row.length != last_row_length
-          unless row.all? { |i| i.nil? || i.length == 0 }
-            row.each_with_index do |entry, index|
-              memo[index] ||= []
-              memo[index] << (entry || '').strip
-            end
-            last_row_length = row.length
-          end
-          memo
-        end
+    def parse(data, filename=nil)
+      # Use force_encoding to convert the string to utf-8 with as few invalid characters
+      # as possible.
+      data.force_encoding(try_encoding(data, filename))
+      data = data.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
+      data.sub!("\xEF\xBB\xBF", '') # strip byte order marker, if it exists
+
+      rows = []
+      data.each_line.with_index do |line, i|
+        next if i < (options[:contains_header] || 0)
+        rows << CSV.parse_line(line, col_sep: options[:csv_separator] || ',')
      end
+
+      rows
    end
 
-    def parse
-      data = options[:string] || File.read(options[:file])
+    def try_encoding(data, filename = nil)
+      encoding = try_encoding_from_file(filename)
 
-      if RUBY_VERSION =~ /^1\.9/ || RUBY_VERSION =~ /^2/
-        data = data.force_encoding(options[:encoding] || 'BINARY').encode('UTF-8', :invalid => :replace, :undef => :replace, :replace => '?')
-        csv_engine = CSV
-      else
-        csv_engine = FasterCSV
-      end
+      cd = CharDet.detect(data)
+      encoding ||= cd['encoding']
 
-      @csv_data = csv_engine.parse data.strip, :col_sep => options[:csv_separator] || ','
-      if options[:contains_header]
-        options[:contains_header].times { csv_data.shift }
-      end
-      csv_data
-    end
+      encoding ||= 'BINARY'
 
-    @settings = { :testing => false }
+      LOGGER.info("suggested file encoding: #{encoding}")
 
-    def self.settings
-      @settings
+      options[:encoding] || encoding
    end
 
-    def settings
-      self.class.settings
+    def try_encoding_from_file(filename = nil)
+      return unless filename
+
+      m = nil
+      os = Gem::Platform.local.os
+      if os == 'linux'
+        m = `file -i #{filename}`.match(/charset=(\S+)/)
+      elsif os == 'darwin'
+        m = `file -I #{filename}`.match(/charset=(\S+)/)
+      end
+      m && m[1]
    end
  end
 end
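
Note: the new encoding handling shells out to the file utility (file -i on Linux, file -I on Darwin) and keeps only the charset= portion of its output, falling back to CharDet, then 'BINARY', with an explicit options[:encoding] always taking precedence. The output the charset= regexp matches looks roughly like this (hypothetical path; the MIME type varies by platform and file):

    $ file -i bank.csv
    bank.csv: text/plain; charset=utf-8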