reckon 0.4.4 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +5 -5
  2. data/.ruby-version +1 -1
  3. data/.travis.yml +10 -2
  4. data/CHANGELOG.md +197 -0
  5. data/Gemfile +0 -1
  6. data/Gemfile.lock +33 -15
  7. data/README.md +2 -5
  8. data/lib/reckon.rb +10 -8
  9. data/lib/reckon/app.rb +92 -116
  10. data/lib/reckon/cosine_similarity.rb +119 -0
  11. data/lib/reckon/csv_parser.rb +57 -27
  12. data/lib/reckon/ledger_parser.rb +194 -30
  13. data/lib/reckon/money.rb +3 -4
  14. data/reckon.gemspec +6 -5
  15. data/spec/data_fixtures/73-sample.csv +2 -0
  16. data/spec/data_fixtures/73-tokens.yml +8 -0
  17. data/spec/data_fixtures/73-transactions.ledger +7 -0
  18. data/spec/data_fixtures/austrian_example.csv +13 -0
  19. data/spec/data_fixtures/bom_utf8_file.csv +1 -0
  20. data/spec/data_fixtures/broker_canada_example.csv +12 -0
  21. data/spec/data_fixtures/chase.csv +9 -0
  22. data/spec/data_fixtures/danish_kroner_nordea_example.csv +6 -0
  23. data/spec/data_fixtures/english_date_example.csv +3 -0
  24. data/spec/data_fixtures/french_example.csv +9 -0
  25. data/spec/data_fixtures/german_date_example.csv +3 -0
  26. data/spec/data_fixtures/harder_date_example.csv +5 -0
  27. data/spec/data_fixtures/ing.csv +3 -0
  28. data/spec/data_fixtures/intuit_mint_example.csv +7 -0
  29. data/spec/data_fixtures/invalid_header_example.csv +6 -0
  30. data/spec/data_fixtures/inversed_credit_card.csv +16 -0
  31. data/spec/data_fixtures/nationwide.csv +4 -0
  32. data/spec/data_fixtures/simple.csv +2 -0
  33. data/spec/data_fixtures/some_other.csv +9 -0
  34. data/spec/data_fixtures/spanish_date_example.csv +3 -0
  35. data/spec/data_fixtures/suntrust.csv +7 -0
  36. data/spec/data_fixtures/two_money_columns.csv +5 -0
  37. data/spec/data_fixtures/yyyymmdd_date_example.csv +1 -0
  38. data/spec/reckon/app_spec.rb +66 -34
  39. data/spec/reckon/csv_parser_spec.rb +79 -201
  40. data/spec/reckon/ledger_parser_spec.rb +62 -9
  41. data/spec/spec_helper.rb +3 -0
  42. metadata +62 -19
  43. data/CHANGES.md +0 -9
@@ -0,0 +1,119 @@
1
+ require 'matrix'
2
+
3
+ # Implementation of cosine similarity using TF-IDF for vectorization.
4
+ # Used to suggest which account a transaction should be assigned to
5
+ class CosineSimilarity
6
+ def initialize(options)
7
+ @options = options
8
+ @tokens = {}
9
+ @accounts = Hash.new(0)
10
+ end
11
+
12
+ def add_document(account, doc)
13
+ tokenize(doc).each do |n|
14
+ (token, count) = n
15
+
16
+ @tokens[token] ||= {}
17
+ @tokens[token][account] ||= 0
18
+ @tokens[token][account] += count
19
+ @accounts[account] += count
20
+ end
21
+ end
22
+
23
+ # find most similar documents to query
24
+ def find_similar(query)
25
+ (query_scores, corpus_scores) = td_idf_scores_for(query)
26
+
27
+ query_vector = Vector.elements(query_scores, false)
28
+
29
+ # For each doc, calculate the similarity to the query
30
+ suggestions = corpus_scores.map do |account, scores|
31
+ acct_vector = Vector.elements(scores, false)
32
+
33
+ acct_query_dp = acct_vector.inner_product(query_vector)
34
+ # similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
35
+ # exactly opposite
36
+ # see https://en.wikipedia.org/wiki/Cosine_similarity
37
+ # cos(theta) = (A . B) / (||A|| ||B||)
38
+ # where A . B is the "dot product" and ||A|| is the magnitude of A
39
+ # ruby has the 'matrix' library we can use to do these calculations.
40
+ {
41
+ similarity: acct_query_dp / (acct_vector.magnitude * query_vector.magnitude),
42
+ account: account,
43
+ }
44
+ end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
45
+
46
+ LOGGER.info "most similar accounts: #{suggestions}"
47
+
48
+ return suggestions
49
+ end
50
+
51
+ private
52
+
53
+ def td_idf_scores_for(query)
54
+ query_tokens = tokenize(query)
55
+ corpus = Set.new
56
+ corpus_scores = {}
57
+ query_scores = []
58
+ num_docs = @accounts.length
59
+
60
+ query_tokens.each do |n|
61
+ (token, _count) = n
62
+ next unless @tokens[token]
63
+ corpus = corpus.union(Set.new(@tokens[token].keys))
64
+ end
65
+
66
+ query_tokens.each do |n|
67
+ (token, count) = n
68
+
69
+ # if no other docs have token, ignore it
70
+ next unless @tokens[token]
71
+
72
+ ## First, calculate scores for our query as we're building scores for the corpus
73
+ query_scores << calc_tf_idf(
74
+ count,
75
+ query_tokens.length,
76
+ @tokens[token].length,
77
+ num_docs
78
+ )
79
+
80
+ ## Next, calculate for the corpus, where our "account" is a document
81
+ corpus.each do |account|
82
+ corpus_scores[account] ||= []
83
+
84
+ corpus_scores[account] << calc_tf_idf(
85
+ (@tokens[token][account] || 0),
86
+ @accounts[account].to_f,
87
+ @tokens[token].length.to_f,
88
+ num_docs
89
+ )
90
+ end
91
+ end
92
+ [query_scores, corpus_scores]
93
+ end
94
+
95
+ def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
96
+
97
+ # tf(t,d) = count of t in d / number of words in d
98
+ tf = token_count / num_words_in_doc.to_f
99
+
100
+ # smooth idf weight
101
+ # see https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency_2
102
+ # df(t) = num of documents with term t in them
103
+ # idf(t) = log(N/(1 + df )) + 1
104
+ idf = Math.log(num_docs.to_f / (1 + df)) + 1
105
+
106
+ tf * idf
107
+ end
108
+
109
+ def tokenize(str)
110
+ mk_tokens(str).inject(Hash.new(0)) do |memo, n|
111
+ memo[n] += 1
112
+ memo
113
+ end.to_a
114
+ end
115
+ end
116
+
117
+ def mk_tokens(str)
118
+ str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
119
+ end
@@ -1,5 +1,4 @@
1
1
  #coding: utf-8
2
- require 'pp'
3
2
 
4
3
  module Reckon
5
4
  class CSVParser
@@ -8,7 +7,7 @@ module Reckon
8
7
  def initialize(options = {})
9
8
  self.options = options
10
9
  self.options[:currency] ||= '$'
11
- parse
10
+ @csv_data = parse(options[:string] || File.read(options[:file]), options[:file])
12
11
  filter_csv
13
12
  detect_columns
14
13
  end
@@ -44,7 +43,7 @@ module Reckon
44
43
  end
45
44
 
46
45
  def description_for(index)
47
- description_column_indices.map { |i| columns[i][index] }.reject { |a| a.empty? }.join("; ").squeeze(" ").gsub(/(;\s+){2,}/, '').strip
46
+ description_column_indices.map { |i| columns[i][index] }.reject(&:empty?).join("; ").squeeze(" ").gsub(/(;\s+){2,}/, '').strip
48
47
  end
49
48
 
50
49
  def evaluate_columns(cols)
@@ -160,7 +159,12 @@ module Reckon
160
159
 
161
160
  def detect_columns
162
161
  results, found_likely_money_column = evaluate_columns(columns)
163
- self.money_column_indices = [ results.sort { |a, b| b[:money_score] <=> a[:money_score] }.first[:index] ]
162
+ if options[:money_column]
163
+ found_likely_money_column = true
164
+ self.money_column_indices = [ options[:money_column] - 1 ]
165
+ else
166
+ self.money_column_indices = [ results.max_by { |n| n[:money_score] }[:index] ]
167
+ end
164
168
 
165
169
  if !found_likely_money_column
166
170
  found_likely_double_money_columns = false
@@ -192,20 +196,21 @@ module Reckon
192
196
  end
193
197
  end
194
198
 
195
- results.reject! {|i| money_column_indices.include?(i[:index]) }
196
- self.date_column_index = results.sort { |a, b| b[:date_score] <=> a[:date_score] }.first[:index]
197
- results.reject! {|i| i[:index] == date_column_index }
198
- @date_column = DateColumn.new( columns[ self.date_column_index ], @options )
199
+ results.reject! { |i| money_column_indices.include?(i[:index]) }
200
+ if options[:date_column]
201
+ @date_column_index = options[:date_column] - 1
202
+ else
203
+ # sort by highest score followed by lowest index
204
+ @date_column_index = results.max_by { |n| [n[:date_score], -n[:index]] }[:index]
205
+ end
206
+ results.reject! { |i| i[:index] == date_column_index }
207
+ @date_column = DateColumn.new(columns[date_column_index], @options)
199
208
 
200
- if ( money_column_indices.length == 1 )
201
- @money_column = MoneyColumn.new( columns[money_column_indices[0]],
202
- @options )
209
+ @money_column = MoneyColumn.new(columns[money_column_indices[0]], @options)
210
+ if money_column_indices.length == 1
203
211
  detect_sign_column if @money_column.positive?
204
212
  else
205
- @money_column = MoneyColumn.new( columns[money_column_indices[0]],
206
- @options )
207
- @money_column.merge!(
208
- MoneyColumn.new( columns[money_column_indices[1]], @options ) )
213
+ @money_column.merge! MoneyColumn.new(columns[money_column_indices[1]], @options)
209
214
  end
210
215
 
211
216
  self.description_column_indices = results.map { |i| i[:index] }
@@ -228,21 +233,46 @@ module Reckon
228
233
  end
229
234
  end
230
235
 
231
- def parse
232
- data = options[:string] || File.read(options[:file])
233
-
234
- if RUBY_VERSION =~ /^1\.9/ || RUBY_VERSION =~ /^2/
235
- data = data.force_encoding(options[:encoding] || 'BINARY').encode('UTF-8', :invalid => :replace, :undef => :replace, :replace => '?')
236
- csv_engine = CSV
237
- else
238
- csv_engine = FasterCSV
236
+ def parse(data, filename=nil)
237
+ # Use force_encoding to convert the string to utf-8 with as few invalid characters
238
+ # as possible.
239
+ data.force_encoding(try_encoding(data, filename))
240
+ data = data.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
241
+ data.sub!("\xEF\xBB\xBF", '') # strip byte order marker, if it exists
242
+
243
+ rows = []
244
+ data.each_line.with_index do |line, i|
245
+ next if i < (options[:contains_header] || 0)
246
+ rows << CSV.parse_line(line, col_sep: options[:csv_separator] || ',')
239
247
  end
240
248
 
241
- @csv_data = csv_engine.parse data.strip, :col_sep => options[:csv_separator] || ','
242
- if options[:contains_header]
243
- options[:contains_header].times { csv_data.shift }
249
+ rows
250
+ end
251
+
252
+ def try_encoding(data, filename = nil)
253
+ encoding = try_encoding_from_file(filename)
254
+
255
+ cd = CharDet.detect(data)
256
+ encoding ||= cd['encoding']
257
+
258
+ encoding ||= 'BINARY'
259
+
260
+ LOGGER.info("suggested file encoding: #{encoding}")
261
+
262
+ options[:encoding] || encoding
263
+ end
264
+
265
+ def try_encoding_from_file(filename = nil)
266
+ return unless filename
267
+
268
+ m = nil
269
+ os = Gem::Platform.local.os
270
+ if os == 'linux'
271
+ m = `file -i #{filename}`.match(/charset=(\S+)/)
272
+ elsif os == 'darwin'
273
+ m = `file -I #{filename}`.match(/charset=(\S+)/)
244
274
  end
245
- csv_data
275
+ m && m[1]
246
276
  end
247
277
 
248
278
  @settings = { :testing => false }
@@ -1,4 +1,109 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # From: https://www.ledger-cli.org/3.0/doc/ledger3.html#Transactions-and-Comments
5
+ #
6
+ # The ledger file format is quite simple, but also very flexible. It supports many
7
+ # options, though typically the user can ignore most of them. They are summarized below.
8
+ #
9
+ # The initial character of each line determines what the line means, and how it should
10
+ # be interpreted. Allowable initial characters are:
11
+ #
12
+ # NUMBER
13
+ # A line beginning with a number denotes an entry. It may be followed by any
14
+ # number of lines, each beginning with whitespace, to denote the entry's account
15
+ # transactions. The format of the first line is:
16
+ #
17
+ # DATE[=EDATE] [*|!] [(CODE)] DESC
18
+ #
19
+ # If '*' appears after the date (with optional effective date), it indicates the
20
+ # entry is "cleared", which can mean whatever the user wants it to mean. If '!'
21
+ # appears after the date, it indicates the entry is "pending"; i.e., tentatively
22
+ # cleared from the user's point of view, but not yet actually cleared. If a 'CODE'
23
+ # appears in parentheses, it may be used to indicate a check number, or the type of
24
+ # the transaction. Following these is the payee, or a description of the
25
+ # transaction.
26
+ #
27
+ # The format of each following transaction is:
28
+ #
29
+ # ACCOUNT AMOUNT [; NOTE]
30
+ #
31
+ # The 'ACCOUNT' may be surrounded by parentheses if it is a virtual transaction, or
32
+ # square brackets if it is a virtual transaction that must balance. The 'AMOUNT'
33
+ # can be followed by a per-unit transaction cost, by specifying '@ AMOUNT', or a
34
+ # complete transaction cost with '@@ AMOUNT'. Lastly, the 'NOTE' may specify an
35
+ # actual and/or effective date for the transaction by using the syntax
36
+ # '[ACTUAL_DATE]' or '[=EFFECTIVE_DATE]' or '[ACTUAL_DATE=EFFECTIVE_DATE]'.
37
+ # =
38
+ # An automated entry. A value expression must appear after the equal sign.
39
+ #
40
+ # After this initial line there should be a set of one or more transactions, just as
41
+ # if it were a normal entry. If the amounts of the transactions have no commodity,
42
+ # they will be applied as modifiers to whichever real transaction is matched by the
43
+ # value expression.
44
+ # ~
45
+ # A period entry. A period expression must appear after the tilde.
46
+ #
47
+ # After this initial line there should be a set of one or more transactions, just as
48
+ # if it were a normal entry.
49
+ # !
50
+ # A line beginning with an exclamation mark denotes a command directive. It must be
51
+ # immediately followed by the command word. The supported commands are:
52
+ #
53
+ # '!include'
54
+ # Include the stated ledger file.
55
+ #
56
+ # '!account'
57
+ # The account name given is taken to be the parent of all transactions that
58
+ # follow, until '!end' is seen.
59
+ #
60
+ # '!end'
61
+ # Ends an account block.
62
+ #
63
+ # ;
64
+ # A line beginning with a semicolon indicates a comment, and is ignored.
65
+ # Y
66
+ # If a line begins with a capital Y, it denotes the year used for all subsequent
67
+ # entries that give a date without a year. The year should appear immediately after
68
+ # the Y, for example: 'Y2004'. This is useful at the beginning of a file, to specify
69
+ # the year for that file. If all entries specify a year, however, this command has
70
+ # no effect.
71
+ #
72
+ # P
73
+ # Specifies a historical price for a commodity. These are usually found in a pricing
74
+ # history file (see the -Q option). The syntax is:
75
+ #
76
+ # P DATE SYMBOL PRICE
77
+ #
78
+ # N SYMBOL
79
+ # Indicates that pricing information is to be ignored for a given symbol, nor will
80
+ # quotes ever be downloaded for that symbol. Useful with a home currency, such as
81
+ # the dollar ($). It is recommended that these pricing options be set in the price
82
+ # database file, which defaults to ~/.pricedb. The syntax for this command is:
83
+ #
84
+ # N SYMBOL
85
+ #
86
+ # D AMOUNT
87
+ # Specifies the default commodity to use, by specifying an amount in the expected
88
+ # format. The entry command will use this commodity as the default when none other
89
+ # can be determined. This command may be used multiple times, to set the default
90
+ # flags for different commodities; whichever is seen last is used as the default
91
+ # commodity. For example, to set US dollars as the default commodity, while also
92
+ # setting the thousands flag and decimal flag for that commodity, use:
93
+ #
94
+ # D $1,000.00
95
+ #
96
+ # C AMOUNT1 = AMOUNT2
97
+ # Specifies a commodity conversion, where the first amount is given to be equivalent
98
+ # to the second amount. The first amount should use the decimal precision desired
99
+ # during reporting:
100
+ #
101
+ # C 1.00 Kb = 1024 bytes
102
+ #
103
+ # i, o, b, h
104
+ # These four relate to timeclock support, which permits ledger to read timelog
105
+ # files. See the timeclock's documentation for more info on the syntax of its
106
+ # timelog files.
2
107
 
3
108
  require 'rubygems'
4
109
 
@@ -8,54 +113,113 @@ module Reckon
8
113
  attr_accessor :entries
9
114
 
10
115
  def initialize(ledger, options = {})
11
- @entries = []
116
+ @options = options
117
+ @date_format = options[:date_format] || '%Y-%m-%d'
12
118
  parse(ledger)
13
119
  end
14
120
 
15
121
  def parse(ledger)
16
122
  @entries = []
17
- date = desc = nil
18
- accounts = []
123
+ new_entry = {}
19
124
  ledger.strip.split("\n").each do |entry|
20
- next if entry =~ /^\s*$/ || entry =~ /^[^ \t\d]/
21
- if entry =~ /^([\d\/-]+)(\=[\d\/-]+)?(\s+[\*!]?\s*.*?)$/
22
- @entries << { :date => date.strip, :desc => desc.strip, :accounts => balance(accounts) } if date
23
- date = $1
24
- desc = $3
25
- accounts = []
26
- elsif date && entry =~ /^\s+([a-z\s:_\-]+)(\s*$|(\s+[\$\.,\-\d\+]+)($|\s+($|[^\$\.,\-\d\+])))/i
27
- accounts << { :name => $1.strip, :amount => clean_money($3) }
125
+ next if entry =~ /^\s*$/ || entry =~ /^\s*;/
126
+
127
+ # (date, type, code, description), type and code are optional
128
+ if (m = entry.match(%r{^(\d+[\d/-]+)\s+([*!])?\s*(\([^)]+\))?\s*(.*)$}))
129
+ add_entry(new_entry)
130
+ new_entry = {
131
+ date: try_parse_date(m[1]),
132
+ type: m[2] || "",
133
+ code: m[3] && m[3].tr('()', '') || "",
134
+ desc: m[4].strip,
135
+ accounts: []
136
+ }
137
+ elsif new_entry[:date] && entry =~ /^\s+/
138
+ new_entry[:accounts] << parse_account_line(entry)
28
139
  else
29
- @entries << { :date => date.strip, :desc => desc.strip, :accounts => balance(accounts) } if date
30
- date = desc = nil
31
- accounts = []
140
+ LOGGER.info("Unknown entry type: #{entry}")
141
+ add_entry(new_entry)
142
+ new_entry = {}
32
143
  end
33
144
  end
34
- @entries << { :date => date.strip, :desc => desc.strip, :accounts => balance(accounts) } if date
145
+ add_entry(new_entry)
35
146
  end
36
147
 
37
- def balance(accounts)
38
- if accounts.any? { |i| i[:amount].nil? }
39
- sum = accounts.inject(0) {|m, account| m + (account[:amount] || 0) }
40
- count = 0
41
- accounts.each do |account|
42
- if account[:amount].nil?
43
- count += 1
44
- account[:amount] = 0 - sum
45
- end
46
- end
47
- if count > 1
48
- puts "Warning: unparsable entry due to more than one missing money value."
49
- p accounts
50
- puts
148
+ # roughly matches ledger csv format
149
+ def to_csv
150
+ return @entries.flat_map do |n|
151
+ n[:accounts].map do |a|
152
+ row = [
153
+ n[:date].strftime(@date_format),
154
+ n[:code],
155
+ n[:desc],
156
+ a[:name],
157
+ "", # currency (not implemented)
158
+ a[:amount],
159
+ n[:type],
160
+ "", # account comment (not implemented)
161
+ ]
162
+ CSV.generate_line(row).strip
51
163
  end
52
164
  end
165
+ end
166
+
167
+ private
168
+
169
+ def add_entry(entry)
170
+ return unless entry[:date] && entry[:accounts].length > 1
171
+
172
+ entry[:accounts] = balance(entry[:accounts])
173
+ @entries << entry
174
+ end
175
+
176
+ def try_parse_date(date_str)
177
+ date = Date.parse(date_str)
178
+ return nil if date.year > 9999 || date.year < 1000
179
+
180
+ date
181
+ rescue ArgumentError
182
+ nil
183
+ end
184
+
185
+ def parse_account_line(entry)
186
+ (account_name, rest) = entry.strip.split(/\s{2,}|\t+/, 2)
187
+
188
+ return {
189
+ name: account_name,
190
+ amount: clean_money("")
191
+ } if rest.nil? || rest.empty?
192
+
193
+ (value, _comment) = rest.split(/;/)
194
+ return {
195
+ name: account_name,
196
+ amount: clean_money(value || "")
197
+ }
198
+ end
199
+
200
+ def balance(accounts)
201
+ return accounts unless accounts.any? { |i| i[:amount].nil? }
202
+
203
+ sum = accounts.reduce(0) { |m, n| m + (n[:amount] || 0) }
204
+ count = 0
205
+ accounts.each do |account|
206
+ next unless account[:amount].nil?
207
+
208
+ count += 1
209
+ account[:amount] = -sum
210
+ end
211
+ if count > 1
212
+ puts "Warning: unparsable entry due to more than one missing money value."
213
+ p accounts
214
+ puts
215
+ end
53
216
 
54
217
  accounts
55
218
  end
56
219
 
57
220
  def clean_money(money)
58
- return nil if money.nil? || money.length == 0
221
+ return nil if money.nil? || money.empty?
222
+
59
223
  money.gsub(/[^0-9.-]/, '').to_f
60
224
  end
61
225
  end