reckon 0.4.4 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +5 -5
  2. data/.ruby-version +1 -1
  3. data/.travis.yml +10 -2
  4. data/CHANGELOG.md +197 -0
  5. data/Gemfile +0 -1
  6. data/Gemfile.lock +33 -15
  7. data/README.md +2 -5
  8. data/lib/reckon.rb +10 -8
  9. data/lib/reckon/app.rb +92 -116
  10. data/lib/reckon/cosine_similarity.rb +119 -0
  11. data/lib/reckon/csv_parser.rb +57 -27
  12. data/lib/reckon/ledger_parser.rb +194 -30
  13. data/lib/reckon/money.rb +3 -4
  14. data/reckon.gemspec +6 -5
  15. data/spec/data_fixtures/73-sample.csv +2 -0
  16. data/spec/data_fixtures/73-tokens.yml +8 -0
  17. data/spec/data_fixtures/73-transactions.ledger +7 -0
  18. data/spec/data_fixtures/austrian_example.csv +13 -0
  19. data/spec/data_fixtures/bom_utf8_file.csv +1 -0
  20. data/spec/data_fixtures/broker_canada_example.csv +12 -0
  21. data/spec/data_fixtures/chase.csv +9 -0
  22. data/spec/data_fixtures/danish_kroner_nordea_example.csv +6 -0
  23. data/spec/data_fixtures/english_date_example.csv +3 -0
  24. data/spec/data_fixtures/french_example.csv +9 -0
  25. data/spec/data_fixtures/german_date_example.csv +3 -0
  26. data/spec/data_fixtures/harder_date_example.csv +5 -0
  27. data/spec/data_fixtures/ing.csv +3 -0
  28. data/spec/data_fixtures/intuit_mint_example.csv +7 -0
  29. data/spec/data_fixtures/invalid_header_example.csv +6 -0
  30. data/spec/data_fixtures/inversed_credit_card.csv +16 -0
  31. data/spec/data_fixtures/nationwide.csv +4 -0
  32. data/spec/data_fixtures/simple.csv +2 -0
  33. data/spec/data_fixtures/some_other.csv +9 -0
  34. data/spec/data_fixtures/spanish_date_example.csv +3 -0
  35. data/spec/data_fixtures/suntrust.csv +7 -0
  36. data/spec/data_fixtures/two_money_columns.csv +5 -0
  37. data/spec/data_fixtures/yyyymmdd_date_example.csv +1 -0
  38. data/spec/reckon/app_spec.rb +66 -34
  39. data/spec/reckon/csv_parser_spec.rb +79 -201
  40. data/spec/reckon/ledger_parser_spec.rb +62 -9
  41. data/spec/spec_helper.rb +3 -0
  42. metadata +62 -19
  43. data/CHANGES.md +0 -9
@@ -0,0 +1,119 @@
1
+ require 'matrix'
2
+
3
+ # Implementation of cosine similarity using TF-IDF for vectorization.
4
+ # Used to suggest which account a transaction should be assigned to
5
+ class CosineSimilarity
6
+ def initialize(options)
7
+ @options = options
8
+ @tokens = {}
9
+ @accounts = Hash.new(0)
10
+ end
11
+
12
+ def add_document(account, doc)
13
+ tokenize(doc).each do |n|
14
+ (token, count) = n
15
+
16
+ @tokens[token] ||= {}
17
+ @tokens[token][account] ||= 0
18
+ @tokens[token][account] += count
19
+ @accounts[account] += count
20
+ end
21
+ end
22
+
23
+ # find most similar documents to query
24
+ def find_similar(query)
25
+ (query_scores, corpus_scores) = td_idf_scores_for(query)
26
+
27
+ query_vector = Vector.elements(query_scores, false)
28
+
29
+ # For each doc, calculate the similarity to the query
30
+ suggestions = corpus_scores.map do |account, scores|
31
+ acct_vector = Vector.elements(scores, false)
32
+
33
+ acct_query_dp = acct_vector.inner_product(query_vector)
34
+ # similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
35
+ # exactly opposite
36
+ # see https://en.wikipedia.org/wiki/Cosine_similarity
37
+ # cos(theta) = (A . B) / (||A|| ||B||)
38
+ # where A . B is the "dot product" and ||A|| is the magnitude of A
39
+ # ruby has the 'matrix' library we can use to do these calculations.
40
+ {
41
+ similarity: acct_query_dp / (acct_vector.magnitude * query_vector.magnitude),
42
+ account: account,
43
+ }
44
+ end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
45
+
46
+ LOGGER.info "most similar accounts: #{suggestions}"
47
+
48
+ return suggestions
49
+ end
50
+
51
+ private
52
+
53
+ def td_idf_scores_for(query)
54
+ query_tokens = tokenize(query)
55
+ corpus = Set.new
56
+ corpus_scores = {}
57
+ query_scores = []
58
+ num_docs = @accounts.length
59
+
60
+ query_tokens.each do |n|
61
+ (token, _count) = n
62
+ next unless @tokens[token]
63
+ corpus = corpus.union(Set.new(@tokens[token].keys))
64
+ end
65
+
66
+ query_tokens.each do |n|
67
+ (token, count) = n
68
+
69
+ # if no other docs have token, ignore it
70
+ next unless @tokens[token]
71
+
72
+ ## First, calculate scores for our query as we're building scores for the corpus
73
+ query_scores << calc_tf_idf(
74
+ count,
75
+ query_tokens.length,
76
+ @tokens[token].length,
77
+ num_docs
78
+ )
79
+
80
+ ## Next, calculate for the corpus, where our "account" is a document
81
+ corpus.each do |account|
82
+ corpus_scores[account] ||= []
83
+
84
+ corpus_scores[account] << calc_tf_idf(
85
+ (@tokens[token][account] || 0),
86
+ @accounts[account].to_f,
87
+ @tokens[token].length.to_f,
88
+ num_docs
89
+ )
90
+ end
91
+ end
92
+ [query_scores, corpus_scores]
93
+ end
94
+
95
+ def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
96
+
97
+ # tf(t,d) = count of t in d / number of words in d
98
+ tf = token_count / num_words_in_doc.to_f
99
+
100
+ # smooth idf weight
101
+ # see https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency_2
102
+ # df(t) = num of documents with term t in them
103
+ # idf(t) = log(N/(1 + df )) + 1
104
+ idf = Math.log(num_docs.to_f / (1 + df)) + 1
105
+
106
+ tf * idf
107
+ end
108
+
109
+ def tokenize(str)
110
+ mk_tokens(str).inject(Hash.new(0)) do |memo, n|
111
+ memo[n] += 1
112
+ memo
113
+ end.to_a
114
+ end
115
+ end
116
+
117
+ def mk_tokens(str)
118
+ str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
119
+ end
@@ -1,5 +1,4 @@
1
1
  #coding: utf-8
2
- require 'pp'
3
2
 
4
3
  module Reckon
5
4
  class CSVParser
@@ -8,7 +7,7 @@ module Reckon
8
7
  def initialize(options = {})
9
8
  self.options = options
10
9
  self.options[:currency] ||= '$'
11
- parse
10
+ @csv_data = parse(options[:string] || File.read(options[:file]), options[:file])
12
11
  filter_csv
13
12
  detect_columns
14
13
  end
@@ -44,7 +43,7 @@ module Reckon
44
43
  end
45
44
 
46
45
  def description_for(index)
47
- description_column_indices.map { |i| columns[i][index] }.reject { |a| a.empty? }.join("; ").squeeze(" ").gsub(/(;\s+){2,}/, '').strip
46
+ description_column_indices.map { |i| columns[i][index] }.reject(&:empty?).join("; ").squeeze(" ").gsub(/(;\s+){2,}/, '').strip
48
47
  end
49
48
 
50
49
  def evaluate_columns(cols)
@@ -160,7 +159,12 @@ module Reckon
160
159
 
161
160
  def detect_columns
162
161
  results, found_likely_money_column = evaluate_columns(columns)
163
- self.money_column_indices = [ results.sort { |a, b| b[:money_score] <=> a[:money_score] }.first[:index] ]
162
+ if options[:money_column]
163
+ found_likely_money_column = true
164
+ self.money_column_indices = [ options[:money_column] - 1 ]
165
+ else
166
+ self.money_column_indices = [ results.max_by { |n| n[:money_score] }[:index] ]
167
+ end
164
168
 
165
169
  if !found_likely_money_column
166
170
  found_likely_double_money_columns = false
@@ -192,20 +196,21 @@ module Reckon
192
196
  end
193
197
  end
194
198
 
195
- results.reject! {|i| money_column_indices.include?(i[:index]) }
196
- self.date_column_index = results.sort { |a, b| b[:date_score] <=> a[:date_score] }.first[:index]
197
- results.reject! {|i| i[:index] == date_column_index }
198
- @date_column = DateColumn.new( columns[ self.date_column_index ], @options )
199
+ results.reject! { |i| money_column_indices.include?(i[:index]) }
200
+ if options[:date_column]
201
+ @date_column_index = options[:date_column] - 1
202
+ else
203
+ # sort by highest score followed by lowest index
204
+ @date_column_index = results.max_by { |n| [n[:date_score], -n[:index]] }[:index]
205
+ end
206
+ results.reject! { |i| i[:index] == date_column_index }
207
+ @date_column = DateColumn.new(columns[date_column_index], @options)
199
208
 
200
- if ( money_column_indices.length == 1 )
201
- @money_column = MoneyColumn.new( columns[money_column_indices[0]],
202
- @options )
209
+ @money_column = MoneyColumn.new(columns[money_column_indices[0]], @options)
210
+ if money_column_indices.length == 1
203
211
  detect_sign_column if @money_column.positive?
204
212
  else
205
- @money_column = MoneyColumn.new( columns[money_column_indices[0]],
206
- @options )
207
- @money_column.merge!(
208
- MoneyColumn.new( columns[money_column_indices[1]], @options ) )
213
+ @money_column.merge! MoneyColumn.new(columns[money_column_indices[1]], @options)
209
214
  end
210
215
 
211
216
  self.description_column_indices = results.map { |i| i[:index] }
@@ -228,21 +233,46 @@ module Reckon
228
233
  end
229
234
  end
230
235
 
231
- def parse
232
- data = options[:string] || File.read(options[:file])
233
-
234
- if RUBY_VERSION =~ /^1\.9/ || RUBY_VERSION =~ /^2/
235
- data = data.force_encoding(options[:encoding] || 'BINARY').encode('UTF-8', :invalid => :replace, :undef => :replace, :replace => '?')
236
- csv_engine = CSV
237
- else
238
- csv_engine = FasterCSV
236
+ def parse(data, filename=nil)
237
+ # Use force_encoding to convert the string to utf-8 with as few invalid characters
238
+ # as possible.
239
+ data.force_encoding(try_encoding(data, filename))
240
+ data = data.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
241
+ data.sub!("\xEF\xBB\xBF", '') # strip byte order marker, if it exists
242
+
243
+ rows = []
244
+ data.each_line.with_index do |line, i|
245
+ next if i < (options[:contains_header] || 0)
246
+ rows << CSV.parse_line(line, col_sep: options[:csv_separator] || ',')
239
247
  end
240
248
 
241
- @csv_data = csv_engine.parse data.strip, :col_sep => options[:csv_separator] || ','
242
- if options[:contains_header]
243
- options[:contains_header].times { csv_data.shift }
249
+ rows
250
+ end
251
+
252
+ def try_encoding(data, filename = nil)
253
+ encoding = try_encoding_from_file(filename)
254
+
255
+ cd = CharDet.detect(data)
256
+ encoding ||= cd['encoding']
257
+
258
+ encoding ||= 'BINARY'
259
+
260
+ LOGGER.info("suggested file encoding: #{encoding}")
261
+
262
+ options[:encoding] || encoding
263
+ end
264
+
265
+ def try_encoding_from_file(filename = nil)
266
+ return unless filename
267
+
268
+ m = nil
269
+ os = Gem::Platform.local.os
270
+ if os == 'linux'
271
+ m = `file -i #{filename}`.match(/charset=(\S+)/)
272
+ elsif os == 'darwin'
273
+ m = `file -I #{filename}`.match(/charset=(\S+)/)
244
274
  end
245
- csv_data
275
+ m && m[1]
246
276
  end
247
277
 
248
278
  @settings = { :testing => false }
@@ -1,4 +1,109 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # From: https://www.ledger-cli.org/3.0/doc/ledger3.html#Transactions-and-Comments
5
+ #
6
+ # The ledger file format is quite simple, but also very flexible. It supports many
7
+ # options, though typically the user can ignore most of them. They are summarized below.
8
+ #
9
+ # The initial character of each line determines what the line means, and how it should
10
+ # be interpreted. Allowable initial characters are:
11
+ #
12
+ # NUMBER
13
+ # A line beginning with a number denotes an entry. It may be followed by any
14
+ # number of lines, each beginning with whitespace, to denote the entry's account
15
+ # transactions. The format of the first line is:
16
+ #
17
+ # DATE[=EDATE] [*|!] [(CODE)] DESC
18
+ #
19
+ # If '*' appears after the date (with optional effective date), it indicates the
20
+ # entry is "cleared", which can mean whatever the user wants it to mean. If '!'
21
+ # appears after the date, it indicates the entry is "pending"; i.e., tentatively
22
+ # cleared from the user's point of view, but not yet actually cleared. If a 'CODE'
23
+ # appears in parentheses, it may be used to indicate a check number, or the type of
24
+ # the transaction. Following these is the payee, or a description of the
25
+ # transaction.
26
+ #
27
+ # The format of each following transaction is:
28
+ #
29
+ # ACCOUNT AMOUNT [; NOTE]
30
+ #
31
+ # The 'ACCOUNT' may be surrounded by parentheses if it is a virtual transaction, or
32
+ # square brackets if it is a virtual transaction that must balance. The 'AMOUNT'
33
+ # can be followed by a per-unit transaction cost, by specifying '@ AMOUNT', or a
34
+ # complete transaction cost with '@@ AMOUNT'. Lastly, the 'NOTE' may specify an
35
+ # actual and/or effective date for the transaction by using the syntax
36
+ # '[ACTUAL_DATE]' or '[=EFFECTIVE_DATE]' or '[ACTUAL_DATE=EFFECTIVE_DATE]'.
37
+ # =
38
+ # An automated entry. A value expression must appear after the equal sign.
39
+ #
40
+ # After this initial line there should be a set of one or more transactions, just as
41
+ # if it were normal entry. If the amounts of the transactions have no commodity,
42
+ # they will be applied as modifiers to whichever real transaction is matched by the
43
+ # value expression.
44
+ # ~
45
+ # A period entry. A period expression must appear after the tilde.
46
+ #
47
+ # After this initial line there should be a set of one or more transactions, just as
48
+ # if it were a normal entry.
49
+ # !
50
+ # A line beginning with an exclamation mark denotes a command directive. It must be
51
+ # immediately followed by the command word. The supported commands are:
52
+ #
53
+ # '!include'
54
+ # Include the stated ledger file.
55
+ #
56
+ # '!account'
57
+ # The account name given is taken to be the parent of all transactions that
58
+ # follow, until '!end' is seen.
59
+ #
60
+ # '!end'
61
+ # Ends an account block.
62
+ #
63
+ # ;
64
+ # A line beginning with a semicolon indicates a comment, and is ignored.
65
+ # Y
66
+ # If a line begins with a capital Y, it denotes the year used for all subsequent
67
+ # entries that give a date without a year. The year should appear immediately after
68
+ # the Y, for example: 'Y2004'. This is useful at the beginning of a file, to specify
69
+ # the year for that file. If all entries specify a year, however, this command has
70
+ # no effect.
71
+ #
72
+ # P
73
+ # Specifies a historical price for a commodity. These are usually found in a pricing
74
+ # history file (see the -Q option). The syntax is:
75
+ #
76
+ # P DATE SYMBOL PRICE
77
+ #
78
+ # N SYMBOL
79
+ # Indicates that pricing information is to be ignored for a given symbol, nor will
80
+ # quotes ever be downloaded for that symbol. Useful with a home currency, such as
81
+ # the dollar ($). It is recommended that these pricing options be set in the price
82
+ # database file, which defaults to ~/.pricedb. The syntax for this command is:
83
+ #
84
+ # N SYMBOL
85
+ #
86
+ # D AMOUNT
87
+ # Specifies the default commodity to use, by specifying an amount in the expected
88
+ # format. The entry command will use this commodity as the default when none other
89
+ # can be determined. This command may be used multiple times, to set the default
90
+ # flags for different commodities; whichever is seen last is used as the default
91
+ # commodity. For example, to set US dollars as the default commodity, while also
92
+ # setting the thousands flag and decimal flag for that commodity, use:
93
+ #
94
+ # D $1,000.00
95
+ #
96
+ # C AMOUNT1 = AMOUNT2
97
+ # Specifies a commodity conversion, where the first amount is given to be equivalent
98
+ # to the second amount. The first amount should use the decimal precision desired
99
+ # during reporting:
100
+ #
101
+ # C 1.00 Kb = 1024 bytes
102
+ #
103
+ # i, o, b, h
104
+ # These four relate to timeclock support, which permits ledger to read timelog
105
+ # files. See the timeclock's documentation for more info on the syntax of its
106
+ # timelog files.
2
107
 
3
108
  require 'rubygems'
4
109
 
@@ -8,54 +113,113 @@ module Reckon
8
113
  attr_accessor :entries
9
114
 
10
115
  def initialize(ledger, options = {})
11
- @entries = []
116
+ @options = options
117
+ @date_format = options[:date_format] || '%Y-%m-%d'
12
118
  parse(ledger)
13
119
  end
14
120
 
15
121
  def parse(ledger)
16
122
  @entries = []
17
- date = desc = nil
18
- accounts = []
123
+ new_entry = {}
19
124
  ledger.strip.split("\n").each do |entry|
20
- next if entry =~ /^\s*$/ || entry =~ /^[^ \t\d]/
21
- if entry =~ /^([\d\/-]+)(\=[\d\/-]+)?(\s+[\*!]?\s*.*?)$/
22
- @entries << { :date => date.strip, :desc => desc.strip, :accounts => balance(accounts) } if date
23
- date = $1
24
- desc = $3
25
- accounts = []
26
- elsif date && entry =~ /^\s+([a-z\s:_\-]+)(\s*$|(\s+[\$\.,\-\d\+]+)($|\s+($|[^\$\.,\-\d\+])))/i
27
- accounts << { :name => $1.strip, :amount => clean_money($3) }
125
+ next if entry =~ /^\s*$/ || entry =~ /^\s*;/
126
+
127
+ # (date, type, code, description), type and code are optional
128
+ if (m = entry.match(%r{^(\d+[\d/-]+)\s+([*!])?\s*(\([^)]+\))?\s*(.*)$}))
129
+ add_entry(new_entry)
130
+ new_entry = {
131
+ date: try_parse_date(m[1]),
132
+ type: m[2] || "",
133
+ code: m[3] && m[3].tr('()', '') || "",
134
+ desc: m[4].strip,
135
+ accounts: []
136
+ }
137
+ elsif new_entry[:date] && entry =~ /^\s+/
138
+ new_entry[:accounts] << parse_account_line(entry)
28
139
  else
29
- @entries << { :date => date.strip, :desc => desc.strip, :accounts => balance(accounts) } if date
30
- date = desc = nil
31
- accounts = []
140
+ LOGGER.info("Unknown entry type: #{entry}")
141
+ add_entry(new_entry)
142
+ new_entry = {}
32
143
  end
33
144
  end
34
- @entries << { :date => date.strip, :desc => desc.strip, :accounts => balance(accounts) } if date
145
+ add_entry(new_entry)
35
146
  end
36
147
 
37
- def balance(accounts)
38
- if accounts.any? { |i| i[:amount].nil? }
39
- sum = accounts.inject(0) {|m, account| m + (account[:amount] || 0) }
40
- count = 0
41
- accounts.each do |account|
42
- if account[:amount].nil?
43
- count += 1
44
- account[:amount] = 0 - sum
45
- end
46
- end
47
- if count > 1
48
- puts "Warning: unparsable entry due to more than one missing money value."
49
- p accounts
50
- puts
148
+ # roughly matches ledger csv format
149
+ def to_csv
150
+ return @entries.flat_map do |n|
151
+ n[:accounts].map do |a|
152
+ row = [
153
+ n[:date].strftime(@date_format),
154
+ n[:code],
155
+ n[:desc],
156
+ a[:name],
157
+ "", # currency (not implemented)
158
+ a[:amount],
159
+ n[:type],
160
+ "", # account comment (not implemented)
161
+ ]
162
+ CSV.generate_line(row).strip
51
163
  end
52
164
  end
165
+ end
166
+
167
+ private
168
+
169
+ def add_entry(entry)
170
+ return unless entry[:date] && entry[:accounts].length > 1
171
+
172
+ entry[:accounts] = balance(entry[:accounts])
173
+ @entries << entry
174
+ end
175
+
176
+ def try_parse_date(date_str)
177
+ date = Date.parse(date_str)
178
+ return nil if date.year > 9999 || date.year < 1000
179
+
180
+ date
181
+ rescue ArgumentError
182
+ nil
183
+ end
184
+
185
+ def parse_account_line(entry)
186
+ (account_name, rest) = entry.strip.split(/\s{2,}|\t+/, 2)
187
+
188
+ return {
189
+ name: account_name,
190
+ amount: clean_money("")
191
+ } if rest.nil? || rest.empty?
192
+
193
+ (value, _comment) = rest.split(/;/)
194
+ return {
195
+ name: account_name,
196
+ amount: clean_money(value || "")
197
+ }
198
+ end
199
+
200
+ def balance(accounts)
201
+ return accounts unless accounts.any? { |i| i[:amount].nil? }
202
+
203
+ sum = accounts.reduce(0) { |m, n| m + (n[:amount] || 0) }
204
+ count = 0
205
+ accounts.each do |account|
206
+ next unless account[:amount].nil?
207
+
208
+ count += 1
209
+ account[:amount] = -sum
210
+ end
211
+ if count > 1
212
+ puts "Warning: unparsable entry due to more than one missing money value."
213
+ p accounts
214
+ puts
215
+ end
53
216
 
54
217
  accounts
55
218
  end
56
219
 
57
220
  def clean_money(money)
58
- return nil if money.nil? || money.length == 0
221
+ return nil if money.nil? || money.empty?
222
+
59
223
  money.gsub(/[^0-9.-]/, '').to_f
60
224
  end
61
225
  end