reckon 0.6.0 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +50 -0
- data/.gitignore +3 -0
- data/.ruby-version +1 -1
- data/CHANGELOG.md +75 -7
- data/Gemfile.lock +1 -1
- data/README.md +85 -24
- data/Rakefile +17 -1
- data/bin/build-new-version.sh +26 -0
- data/bin/reckon +9 -1
- data/lib/reckon.rb +1 -0
- data/lib/reckon/app.rb +18 -141
- data/lib/reckon/cosine_similarity.rb +67 -62
- data/lib/reckon/csv_parser.rb +2 -7
- data/lib/reckon/date_column.rb +10 -0
- data/lib/reckon/money.rb +59 -52
- data/lib/reckon/options.rb +153 -0
- data/lib/reckon/version.rb +1 -1
- data/spec/cosine_training_and_test.rb +52 -0
- data/spec/integration/another_bank_example/input.csv +9 -0
- data/spec/integration/another_bank_example/output.ledger +36 -0
- data/spec/integration/another_bank_example/test_args +1 -0
- data/spec/integration/ask_for_account/cli_input.exp +33 -0
- data/spec/integration/ask_for_account/expected_output +11 -0
- data/spec/integration/ask_for_account/input.csv +9 -0
- data/spec/integration/ask_for_account/test_args +1 -0
- data/spec/integration/austrian_example/input.csv +13 -0
- data/spec/integration/austrian_example/output.ledger +52 -0
- data/spec/integration/austrian_example/test_args +2 -0
- data/spec/integration/bom_utf8_file/input.csv +3 -0
- data/spec/integration/bom_utf8_file/output.ledger +4 -0
- data/spec/integration/bom_utf8_file/test_args +3 -0
- data/spec/integration/broker_canada_example/input.csv +12 -0
- data/spec/integration/broker_canada_example/output.ledger +48 -0
- data/spec/integration/broker_canada_example/test_args +1 -0
- data/spec/integration/chase/account_tokens_and_regex/output.ledger +36 -0
- data/spec/integration/chase/account_tokens_and_regex/test_args +2 -0
- data/spec/integration/chase/account_tokens_and_regex/tokens.yml +16 -0
- data/spec/integration/chase/default_account_names/output.ledger +36 -0
- data/spec/integration/chase/default_account_names/test_args +3 -0
- data/spec/integration/chase/input.csv +9 -0
- data/spec/integration/chase/learn_from_existing/learn.ledger +7 -0
- data/spec/integration/chase/learn_from_existing/output.ledger +36 -0
- data/spec/integration/chase/learn_from_existing/test_args +1 -0
- data/spec/integration/chase/simple/output.ledger +36 -0
- data/spec/integration/chase/simple/test_args +1 -0
- data/spec/integration/danish_kroner_nordea_example/input.csv +6 -0
- data/spec/integration/danish_kroner_nordea_example/output.ledger +24 -0
- data/spec/integration/danish_kroner_nordea_example/test_args +1 -0
- data/spec/integration/english_date_example/input.csv +3 -0
- data/spec/integration/english_date_example/output.ledger +12 -0
- data/spec/integration/english_date_example/test_args +1 -0
- data/spec/integration/extratofake/input.csv +24 -0
- data/spec/integration/extratofake/output.ledger +92 -0
- data/spec/integration/extratofake/test_args +1 -0
- data/spec/integration/french_example/input.csv +9 -0
- data/spec/integration/french_example/output.ledger +36 -0
- data/spec/integration/french_example/test_args +2 -0
- data/spec/integration/german_date_example/input.csv +3 -0
- data/spec/integration/german_date_example/output.ledger +12 -0
- data/spec/integration/german_date_example/test_args +1 -0
- data/spec/integration/harder_date_example/input.csv +5 -0
- data/spec/integration/harder_date_example/output.ledger +20 -0
- data/spec/integration/harder_date_example/test_args +1 -0
- data/spec/integration/ing/input.csv +3 -0
- data/spec/integration/ing/output.ledger +12 -0
- data/spec/integration/ing/test_args +1 -0
- data/spec/integration/intuit_mint_example/input.csv +7 -0
- data/spec/integration/intuit_mint_example/output.ledger +28 -0
- data/spec/integration/intuit_mint_example/test_args +1 -0
- data/spec/integration/invalid_header_example/input.csv +6 -0
- data/spec/integration/invalid_header_example/output.ledger +8 -0
- data/spec/integration/invalid_header_example/test_args +1 -0
- data/spec/integration/inversed_credit_card/input.csv +16 -0
- data/spec/integration/inversed_credit_card/output.ledger +64 -0
- data/spec/integration/inversed_credit_card/test_args +1 -0
- data/spec/integration/nationwide/input.csv +4 -0
- data/spec/integration/nationwide/output.ledger +16 -0
- data/spec/integration/nationwide/test_args +1 -0
- data/spec/integration/regression/issue_51_account_tokens/input.csv +8 -0
- data/spec/integration/regression/issue_51_account_tokens/output.ledger +32 -0
- data/spec/integration/regression/issue_51_account_tokens/test_args +4 -0
- data/spec/integration/regression/issue_51_account_tokens/tokens.yml +9 -0
- data/spec/integration/regression/issue_64_date_column/input.csv +3 -0
- data/spec/integration/regression/issue_64_date_column/output.ledger +8 -0
- data/spec/integration/regression/issue_64_date_column/test_args +1 -0
- data/spec/integration/regression/issue_73_account_token_matching/input.csv +2 -0
- data/spec/integration/regression/issue_73_account_token_matching/output.ledger +4 -0
- data/spec/integration/regression/issue_73_account_token_matching/test_args +6 -0
- data/spec/integration/regression/issue_73_account_token_matching/tokens.yml +8 -0
- data/spec/integration/regression/issue_85_date_example/input.csv +2 -0
- data/spec/integration/regression/issue_85_date_example/output.ledger +8 -0
- data/spec/integration/regression/issue_85_date_example/test_args +1 -0
- data/spec/integration/spanish_date_example/input.csv +3 -0
- data/spec/integration/spanish_date_example/output.ledger +12 -0
- data/spec/integration/spanish_date_example/test_args +1 -0
- data/spec/integration/suntrust/input.csv +7 -0
- data/spec/integration/suntrust/output.ledger +28 -0
- data/spec/integration/suntrust/test_args +1 -0
- data/spec/integration/test.sh +123 -0
- data/spec/integration/test_money_column/input.csv +3 -0
- data/spec/integration/test_money_column/output.ledger +8 -0
- data/spec/integration/test_money_column/test_args +1 -0
- data/spec/integration/two_money_columns/input.csv +5 -0
- data/spec/integration/two_money_columns/output.ledger +20 -0
- data/spec/integration/two_money_columns/test_args +1 -0
- data/spec/integration/yyyymmdd_date_example/input.csv +1 -0
- data/spec/integration/yyyymmdd_date_example/output.ledger +4 -0
- data/spec/integration/yyyymmdd_date_example/test_args +1 -0
- data/spec/reckon/app_spec.rb +24 -6
- data/spec/reckon/csv_parser_spec.rb +3 -3
- data/spec/reckon/money_column_spec.rb +24 -24
- data/spec/reckon/money_spec.rb +15 -34
- data/spec/reckon/options_spec.rb +17 -0
- data/spec/spec_helper.rb +6 -1
- metadata +102 -7
- data/.travis.yml +0 -13
data/lib/reckon.rb
CHANGED
data/lib/reckon/app.rb
CHANGED
|
@@ -8,9 +8,10 @@ module Reckon
|
|
|
8
8
|
attr_accessor :options, :seen, :csv_parser, :regexps, :matcher
|
|
9
9
|
@@cli = HighLine.new
|
|
10
10
|
|
|
11
|
-
def initialize(
|
|
11
|
+
def initialize(opts = {})
|
|
12
|
+
self.options = opts
|
|
12
13
|
LOGGER.level = Logger::INFO if options[:verbose]
|
|
13
|
-
|
|
14
|
+
|
|
14
15
|
self.regexps = {}
|
|
15
16
|
self.seen = Set.new
|
|
16
17
|
self.options[:currency] ||= '$'
|
|
@@ -19,10 +20,10 @@ module Reckon
|
|
|
19
20
|
learn!
|
|
20
21
|
end
|
|
21
22
|
|
|
22
|
-
def interactive_output(str)
|
|
23
|
+
def interactive_output(str, fh = $stdout)
|
|
23
24
|
return if options[:unattended]
|
|
24
25
|
|
|
25
|
-
puts str
|
|
26
|
+
fh.puts str
|
|
26
27
|
end
|
|
27
28
|
|
|
28
29
|
def learn!
|
|
@@ -157,10 +158,10 @@ module Reckon
|
|
|
157
158
|
:money => @csv_parser.money_for(index),
|
|
158
159
|
:description => @csv_parser.description_for(index) }
|
|
159
160
|
end
|
|
160
|
-
rows.sort_by { |n| n[:date] }.each { |row| yield row }
|
|
161
|
+
rows.sort_by { |n| [n[:date], -n[:money], n[:description]] }.each { |row| yield row }
|
|
161
162
|
end
|
|
162
163
|
|
|
163
|
-
def print_transaction(rows)
|
|
164
|
+
def print_transaction(rows, fh = $stdout)
|
|
164
165
|
str = "\n"
|
|
165
166
|
header = %w[Date Amount Description Note]
|
|
166
167
|
maxes = header.map(&:length)
|
|
@@ -184,7 +185,7 @@ module Reckon
|
|
|
184
185
|
str += "\n"
|
|
185
186
|
end
|
|
186
187
|
|
|
187
|
-
interactive_output str
|
|
188
|
+
interactive_output str, fh
|
|
188
189
|
end
|
|
189
190
|
|
|
190
191
|
def ask_account_question(msg, row)
|
|
@@ -192,11 +193,13 @@ module Reckon
|
|
|
192
193
|
LOGGER.info "possible_answers===> #{possible_answers.inspect}"
|
|
193
194
|
|
|
194
195
|
if options[:unattended]
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
196
|
+
if options[:fail_on_unknown_account] && possible_answers.empty?
|
|
197
|
+
raise %(Couldn't find any matches for '#{row[:description]}'
|
|
198
|
+
Try adding an account token with --account-tokens)
|
|
199
|
+
end
|
|
200
|
+
|
|
201
|
+
default = options[:default_outof_account]
|
|
202
|
+
default = options[:default_into_account] if row[:pretty_money][0] == '-'
|
|
200
203
|
return possible_answers[0] || default
|
|
201
204
|
end
|
|
202
205
|
|
|
@@ -252,7 +255,7 @@ module Reckon
|
|
|
252
255
|
end
|
|
253
256
|
|
|
254
257
|
def ledger_format(row, line1, line2)
|
|
255
|
-
out = "#{row[:pretty_date]}\t#{row[:description]}\t;
|
|
258
|
+
out = "#{row[:pretty_date]}\t#{row[:description]}#{row[:note] ? "\t; " + row[:note]: ""}\n"
|
|
256
259
|
out += "\t#{line1.first}\t\t\t#{line1.last}\n"
|
|
257
260
|
out += "\t#{line2.first}\t\t\t#{line2.last}\n\n"
|
|
258
261
|
out
|
|
@@ -277,138 +280,12 @@ module Reckon
|
|
|
277
280
|
exit
|
|
278
281
|
end
|
|
279
282
|
|
|
280
|
-
def output_table
|
|
283
|
+
def output_table(fh = $stdout)
|
|
281
284
|
rows = []
|
|
282
285
|
each_row_backwards do |row|
|
|
283
286
|
rows << row
|
|
284
287
|
end
|
|
285
|
-
print_transaction(rows)
|
|
286
|
-
end
|
|
287
|
-
|
|
288
|
-
def self.parse_opts(args = ARGV)
|
|
289
|
-
options = { :output_file => STDOUT }
|
|
290
|
-
parser = OptionParser.new do |opts|
|
|
291
|
-
opts.banner = "Usage: Reckon.rb [options]"
|
|
292
|
-
opts.separator ""
|
|
293
|
-
|
|
294
|
-
opts.on("-f", "--file FILE", "The CSV file to parse") do |file|
|
|
295
|
-
options[:file] = file
|
|
296
|
-
end
|
|
297
|
-
|
|
298
|
-
opts.on("-a", "--account NAME", "The Ledger Account this file is for") do |a|
|
|
299
|
-
options[:bank_account] = a
|
|
300
|
-
end
|
|
301
|
-
|
|
302
|
-
opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
|
|
303
|
-
options[:verbose] = v
|
|
304
|
-
end
|
|
305
|
-
|
|
306
|
-
opts.on("-i", "--inverse", "Use the negative of each amount") do |v|
|
|
307
|
-
options[:inverse] = v
|
|
308
|
-
end
|
|
309
|
-
|
|
310
|
-
opts.on("-p", "--print-table", "Print out the parsed CSV in table form") do |p|
|
|
311
|
-
options[:print_table] = p
|
|
312
|
-
end
|
|
313
|
-
|
|
314
|
-
opts.on("-o", "--output-file FILE", "The ledger file to append to") do |o|
|
|
315
|
-
options[:output_file] = File.open(o, 'a')
|
|
316
|
-
end
|
|
317
|
-
|
|
318
|
-
opts.on("-l", "--learn-from FILE", "An existing ledger file to learn accounts from") do |l|
|
|
319
|
-
options[:existing_ledger_file] = l
|
|
320
|
-
end
|
|
321
|
-
|
|
322
|
-
opts.on("", "--ignore-columns 1,2,5", "Columns to ignore in the CSV file - the first column is column 1") do |ignore|
|
|
323
|
-
options[:ignore_columns] = ignore.split(",").map { |i| i.to_i }
|
|
324
|
-
end
|
|
325
|
-
|
|
326
|
-
opts.on("", "--money-column 2", Integer, "Specify the money column instead of letting Reckon guess - the first column is column 1") do |column_number|
|
|
327
|
-
options[:money_column] = column_number
|
|
328
|
-
end
|
|
329
|
-
|
|
330
|
-
opts.on("", "--date-column 3", Integer, "Specify the date column instead of letting Reckon guess - the first column is column 1") do |column_number|
|
|
331
|
-
options[:date_column] = column_number
|
|
332
|
-
end
|
|
333
|
-
|
|
334
|
-
opts.on("", "--contains-header [N]", "The first row of the CSV is a header and should be skipped. Optionally add the number of rows to skip.") do |contains_header|
|
|
335
|
-
options[:contains_header] = 1
|
|
336
|
-
options[:contains_header] = contains_header.to_i if contains_header
|
|
337
|
-
end
|
|
338
|
-
|
|
339
|
-
opts.on("", "--csv-separator ','", "Separator for parsing the CSV - default is comma.") do |csv_separator|
|
|
340
|
-
options[:csv_separator] = csv_separator
|
|
341
|
-
end
|
|
342
|
-
|
|
343
|
-
opts.on("", "--comma-separates-cents", "Use comma instead of period to deliminate dollars from cents when parsing ($100,50 instead of $100.50)") do |c|
|
|
344
|
-
options[:comma_separates_cents] = c
|
|
345
|
-
end
|
|
346
|
-
|
|
347
|
-
opts.on("", "--encoding 'UTF-8'", "Specify an encoding for the CSV file; not usually needed") do |e|
|
|
348
|
-
options[:encoding] = e
|
|
349
|
-
end
|
|
350
|
-
|
|
351
|
-
opts.on("-c", "--currency '$'", "Currency symbol to use, defaults to $ (£, EUR)") do |e|
|
|
352
|
-
options[:currency] = e
|
|
353
|
-
end
|
|
354
|
-
|
|
355
|
-
opts.on("", "--date-format '%d/%m/%Y'", "Force the date format (see Ruby DateTime strftime)") do |d|
|
|
356
|
-
options[:date_format] = d
|
|
357
|
-
end
|
|
358
|
-
|
|
359
|
-
opts.on("-u", "--unattended", "Don't ask questions and guess all the accounts automatically. Used with --learn-from or --account-tokens options.") do |n|
|
|
360
|
-
options[:unattended] = n
|
|
361
|
-
end
|
|
362
|
-
|
|
363
|
-
opts.on("-t", "--account-tokens FILE", "YAML file with manually-assigned tokens for each account (see README)") do |a|
|
|
364
|
-
options[:account_tokens_file] = a
|
|
365
|
-
end
|
|
366
|
-
|
|
367
|
-
opts.on("", "--default-into-account NAME", "Default into account") do |a|
|
|
368
|
-
options[:default_into_account] = a
|
|
369
|
-
end
|
|
370
|
-
|
|
371
|
-
opts.on("", "--default-outof-account NAME", "Default 'out of' account") do |a|
|
|
372
|
-
options[:default_outof_account] = a
|
|
373
|
-
end
|
|
374
|
-
|
|
375
|
-
opts.on("", "--suffixed", "If --currency should be used as a suffix. Defaults to false.") do |e|
|
|
376
|
-
options[:suffixed] = e
|
|
377
|
-
end
|
|
378
|
-
|
|
379
|
-
opts.on_tail("-h", "--help", "Show this message") do
|
|
380
|
-
puts opts
|
|
381
|
-
exit
|
|
382
|
-
end
|
|
383
|
-
|
|
384
|
-
opts.on_tail("--version", "Show version") do
|
|
385
|
-
puts VERSION
|
|
386
|
-
exit
|
|
387
|
-
end
|
|
388
|
-
|
|
389
|
-
opts.parse!(args)
|
|
390
|
-
end
|
|
391
|
-
|
|
392
|
-
unless options[:file]
|
|
393
|
-
options[:file] = @@cli.ask("What CSV file should I parse? ")
|
|
394
|
-
unless options[:file].length > 0
|
|
395
|
-
puts "\nYou must provide a CSV file to parse.\n"
|
|
396
|
-
puts parser
|
|
397
|
-
exit
|
|
398
|
-
end
|
|
399
|
-
end
|
|
400
|
-
|
|
401
|
-
unless options[:bank_account]
|
|
402
|
-
fail "Please specify --account for the unattended mode" if options[:unattended]
|
|
403
|
-
|
|
404
|
-
options[:bank_account] = @@cli.ask("What is the account name of this bank account in Ledger? ") do |q|
|
|
405
|
-
q.readline = true
|
|
406
|
-
q.validate = /^.{2,}$/
|
|
407
|
-
q.default = "Assets:Bank:Checking"
|
|
408
|
-
end
|
|
409
|
-
end
|
|
410
|
-
|
|
411
|
-
options
|
|
288
|
+
print_transaction(rows, fh)
|
|
412
289
|
end
|
|
413
290
|
end
|
|
414
291
|
end
|
|
@@ -1,47 +1,52 @@
|
|
|
1
1
|
require 'matrix'
|
|
2
2
|
require 'set'
|
|
3
3
|
|
|
4
|
-
# Implementation of
|
|
5
|
-
#
|
|
4
|
+
# Implementation of cosine similarity using TF-IDF for vectorization.
|
|
5
|
+
#
|
|
6
|
+
# In information retrieval, tf–idf, short for term frequency–inverse document frequency,
|
|
7
|
+
# is a numerical statistic that is intended to reflect how important a word is to a
|
|
8
|
+
# document in a collection or corpus
|
|
9
|
+
#
|
|
10
|
+
# Cosine similarity is a measurement used to determine how similar 2 documents are to each other.
|
|
11
|
+
#
|
|
12
|
+
# These weights and measures are used to suggest which account a transaction should be
|
|
13
|
+
# assigned to.
|
|
6
14
|
module Reckon
|
|
7
15
|
class CosineSimilarity
|
|
16
|
+
DocumentInfo = Struct.new(:tokens, :accounts)
|
|
17
|
+
|
|
8
18
|
def initialize(options)
|
|
19
|
+
@docs = DocumentInfo.new({}, {})
|
|
9
20
|
@options = options
|
|
10
|
-
@tokens = {}
|
|
11
|
-
@accounts = Hash.new(0)
|
|
12
21
|
end
|
|
13
22
|
|
|
14
23
|
def add_document(account, doc)
|
|
15
|
-
tokenize(doc)
|
|
24
|
+
tokens = tokenize(doc)
|
|
25
|
+
LOGGER.info "doc tokens: #{tokens}"
|
|
26
|
+
tokens.each do |n|
|
|
16
27
|
(token, count) = n
|
|
17
28
|
|
|
18
|
-
@tokens[token] ||=
|
|
19
|
-
@tokens[token][account]
|
|
20
|
-
@
|
|
21
|
-
@accounts[account] += count
|
|
29
|
+
@docs.tokens[token] ||= Hash.new(0)
|
|
30
|
+
@docs.tokens[token][account] += count
|
|
31
|
+
@docs.accounts[account] ||= Hash.new(0)
|
|
32
|
+
@docs.accounts[account][token] += count
|
|
22
33
|
end
|
|
23
34
|
end
|
|
24
35
|
|
|
25
36
|
# find most similar documents to query
|
|
26
37
|
def find_similar(query)
|
|
27
|
-
|
|
38
|
+
LOGGER.info "find_similar #{query}"
|
|
28
39
|
|
|
29
|
-
|
|
40
|
+
accounts = docs_to_check(query).map do |a|
|
|
41
|
+
[a, tfidf(@docs.accounts[a])]
|
|
42
|
+
end
|
|
30
43
|
|
|
31
|
-
|
|
32
|
-
suggestions = corpus_scores.map do |account, scores|
|
|
33
|
-
acct_vector = Vector.elements(scores, false)
|
|
44
|
+
q = tfidf(tokenize(query))
|
|
34
45
|
|
|
35
|
-
|
|
36
|
-
# similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
|
|
37
|
-
# exactly opposite
|
|
38
|
-
# see https://en.wikipedia.org/wiki/Cosine_similarity
|
|
39
|
-
# cos(theta) = (A . B) / (||A|| ||B||)
|
|
40
|
-
# where A . B is the "dot product" and ||A|| is the magnitude of A
|
|
41
|
-
# ruby has the 'matrix' library we can use to do these calculations.
|
|
46
|
+
suggestions = accounts.map do |a, d|
|
|
42
47
|
{
|
|
43
|
-
similarity:
|
|
44
|
-
account:
|
|
48
|
+
similarity: calc_similarity(q, d),
|
|
49
|
+
account: a
|
|
45
50
|
}
|
|
46
51
|
end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
|
|
47
52
|
|
|
@@ -52,50 +57,51 @@ module Reckon
|
|
|
52
57
|
|
|
53
58
|
private
|
|
54
59
|
|
|
55
|
-
def
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
corpus_scores = {}
|
|
59
|
-
query_scores = []
|
|
60
|
-
num_docs = @accounts.length
|
|
61
|
-
|
|
62
|
-
query_tokens.each do |n|
|
|
63
|
-
(token, _count) = n
|
|
64
|
-
next unless @tokens[token]
|
|
65
|
-
corpus = corpus.union(Set.new(@tokens[token].keys))
|
|
60
|
+
def docs_to_check(query)
|
|
61
|
+
return tokenize(query).reduce(Set.new) do |corpus, t|
|
|
62
|
+
corpus.union(Set.new(@docs.tokens[t[0]]&.keys))
|
|
66
63
|
end
|
|
64
|
+
end
|
|
67
65
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
# if no other docs have token, ignore it
|
|
72
|
-
next unless @tokens[token]
|
|
66
|
+
def tfidf(tokens)
|
|
67
|
+
scores = {}
|
|
73
68
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
@tokens[
|
|
79
|
-
|
|
69
|
+
tokens.each do |t, n|
|
|
70
|
+
scores[t] = calc_tf_idf(
|
|
71
|
+
n,
|
|
72
|
+
tokens.length,
|
|
73
|
+
@docs.tokens[t]&.length&.to_f || 0,
|
|
74
|
+
@docs.accounts.length
|
|
80
75
|
)
|
|
81
|
-
|
|
82
|
-
## Next, calculate for the corpus, where our "account" is a document
|
|
83
|
-
corpus.each do |account|
|
|
84
|
-
corpus_scores[account] ||= []
|
|
85
|
-
|
|
86
|
-
corpus_scores[account] << calc_tf_idf(
|
|
87
|
-
(@tokens[token][account] || 0),
|
|
88
|
-
@accounts[account].to_f,
|
|
89
|
-
@tokens[token].length.to_f,
|
|
90
|
-
num_docs
|
|
91
|
-
)
|
|
92
|
-
end
|
|
93
76
|
end
|
|
94
|
-
|
|
77
|
+
|
|
78
|
+
return scores
|
|
95
79
|
end
|
|
96
80
|
|
|
97
|
-
|
|
81
|
+
# Cosine similarity is used to compare how similar 2 documents are. Returns a float
|
|
82
|
+
# between 1 and -1, where 1 is exactly the same and -1 is exactly opposite.
|
|
83
|
+
#
|
|
84
|
+
# see https://en.wikipedia.org/wiki/Cosine_similarity
|
|
85
|
+
# cos(theta) = (A . B) / (||A|| ||B||)
|
|
86
|
+
# where A . B is the "dot product" and ||A|| is the magnitude of A
|
|
87
|
+
#
|
|
88
|
+
# The variables A and B are the set of unique terms in q and d.
|
|
89
|
+
#
|
|
90
|
+
# For example, when q = "big red balloon" and d = "small green balloon" then the
|
|
91
|
+
# variables are (big,red,balloon,small,green) and a = (1,1,1,0,0) and b =
|
|
92
|
+
# (0,0,1,1,1).
|
|
93
|
+
#
|
|
94
|
+
# query and doc are hashes of token => tf/idf score
|
|
95
|
+
def calc_similarity(query, doc)
|
|
96
|
+
tokens = Set.new(query.keys + doc.keys)
|
|
97
|
+
|
|
98
|
+
a = Vector.elements(tokens.map { |n| query[n] || 0 }, false)
|
|
99
|
+
b = Vector.elements(tokens.map { |n| doc[n] || 0 }, false)
|
|
100
|
+
|
|
101
|
+
return a.inner_product(b) / (a.magnitude * b.magnitude)
|
|
102
|
+
end
|
|
98
103
|
|
|
104
|
+
def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
|
|
99
105
|
# tf(t,d) = count of t in d / number of words in d
|
|
100
106
|
tf = token_count / num_words_in_doc.to_f
|
|
101
107
|
|
|
@@ -109,14 +115,13 @@ module Reckon
|
|
|
109
115
|
end
|
|
110
116
|
|
|
111
117
|
def tokenize(str)
|
|
112
|
-
mk_tokens(str).
|
|
118
|
+
mk_tokens(str).each_with_object(Hash.new(0)) do |n, memo|
|
|
113
119
|
memo[n] += 1
|
|
114
|
-
memo
|
|
115
120
|
end.to_a
|
|
116
121
|
end
|
|
117
122
|
|
|
118
123
|
def mk_tokens(str)
|
|
119
|
-
str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
|
|
124
|
+
str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/).reject(&:empty?)
|
|
120
125
|
end
|
|
121
126
|
end
|
|
122
127
|
end
|
data/lib/reckon/csv_parser.rb
CHANGED
|
@@ -89,12 +89,7 @@ module Reckon
|
|
|
89
89
|
money_score += Money::likelihood( entry )
|
|
90
90
|
possible_neg_money_count += 1 if entry =~ /^\$?[\-\(]\$?\d+/
|
|
91
91
|
possible_pos_money_count += 1 if entry =~ /^\+?\$?\+?\d+/
|
|
92
|
-
date_score +=
|
|
93
|
-
date_score += 5 if entry =~ /^[\-\/\.\d:\[\]]+$/
|
|
94
|
-
date_score += entry.gsub(/[^\-\/\.\d:\[\]]/, '').length if entry.gsub(/[^\-\/\.\d:\[\]]/, '').length > 3
|
|
95
|
-
date_score -= entry.gsub(/[\-\/\.\d:\[\]]/, '').length
|
|
96
|
-
date_score += 30 if entry =~ /^\d+[:\/\.-]\d+[:\/\.-]\d+([ :]\d+[:\/\.]\d+)?$/
|
|
97
|
-
date_score += 10 if entry =~ /^\d+\[\d+:GMT\]$/i
|
|
92
|
+
date_score += DateColumn.likelihood(entry)
|
|
98
93
|
|
|
99
94
|
# Try to determine if this is a balance column
|
|
100
95
|
entry_as_num = entry.gsub(/[^\-\d\.]/, '').to_f
|
|
@@ -168,7 +163,7 @@ module Reckon
|
|
|
168
163
|
results = evaluate_columns(columns)
|
|
169
164
|
|
|
170
165
|
if options[:money_column]
|
|
171
|
-
self.money_column_indices = [
|
|
166
|
+
self.money_column_indices = [options[:money_column] - 1]
|
|
172
167
|
else
|
|
173
168
|
self.money_column_indices = results.select { |n| n[:is_money_column] }.map { |n| n[:index] }
|
|
174
169
|
if self.money_column_indices.length == 1
|
data/lib/reckon/date_column.rb
CHANGED
|
@@ -56,5 +56,15 @@ module Reckon
|
|
|
56
56
|
date.iso8601
|
|
57
57
|
end
|
|
58
58
|
|
|
59
|
+
def self.likelihood(entry)
|
|
60
|
+
date_score = 0
|
|
61
|
+
date_score += 10 if entry =~ /\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/i
|
|
62
|
+
date_score += 5 if entry =~ /^[\-\/\.\d:\[\]]+$/
|
|
63
|
+
date_score += entry.gsub(/[^\-\/\.\d:\[\]]/, '').length if entry.gsub(/[^\-\/\.\d:\[\]]/, '').length > 3
|
|
64
|
+
date_score -= entry.gsub(/[\-\/\.\d:\[\]]/, '').length
|
|
65
|
+
date_score += 30 if entry =~ /^\d+[:\/\.-]\d+[:\/\.-]\d+([ :]\d+[:\/\.]\d+)?$/
|
|
66
|
+
date_score += 10 if entry =~ /^\d+\[\d+:GMT\]$/i
|
|
67
|
+
return date_score
|
|
68
|
+
end
|
|
59
69
|
end
|
|
60
70
|
end
|
data/lib/reckon/money.rb
CHANGED
|
@@ -5,12 +5,13 @@ module Reckon
|
|
|
5
5
|
class Money
|
|
6
6
|
include Comparable
|
|
7
7
|
attr_accessor :amount, :currency, :suffixed
|
|
8
|
-
def initialize(
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
8
|
+
def initialize(amount, options = {})
|
|
9
|
+
@options = options
|
|
10
|
+
@amount_raw = amount
|
|
11
|
+
@raw = options[:raw]
|
|
12
|
+
|
|
13
|
+
@amount = parse(amount, options)
|
|
14
|
+
@amount = -@amount if options[:inverse]
|
|
14
15
|
@currency = options[:currency] || "$"
|
|
15
16
|
@suffixed = options[:suffixed]
|
|
16
17
|
end
|
|
@@ -19,11 +20,19 @@ module Reckon
|
|
|
19
20
|
return @amount
|
|
20
21
|
end
|
|
21
22
|
|
|
23
|
+
def to_s
|
|
24
|
+
return @options[:raw] ? "#{@amount_raw} | #{@amount}" : @amount
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
# unary minus
|
|
28
|
+
# ex
|
|
29
|
+
# m = Money.new
|
|
30
|
+
# -m
|
|
22
31
|
def -@
|
|
23
|
-
Money.new(
|
|
32
|
+
Money.new(-@amount, :currency => @currency, :suffixed => @suffixed)
|
|
24
33
|
end
|
|
25
34
|
|
|
26
|
-
def <=>(
|
|
35
|
+
def <=>(mon)
|
|
27
36
|
other_amount = mon.to_f
|
|
28
37
|
if @amount < other_amount
|
|
29
38
|
-1
|
|
@@ -34,42 +43,41 @@ module Reckon
|
|
|
34
43
|
end
|
|
35
44
|
end
|
|
36
45
|
|
|
37
|
-
def pretty(
|
|
38
|
-
if @
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
46
|
+
def pretty(negate = false)
|
|
47
|
+
if @raw
|
|
48
|
+
return @amount_raw unless negate
|
|
49
|
+
|
|
50
|
+
return @amount_raw[0] == '-' ? @amount_raw[1..-1] : "-#{@amount_raw}"
|
|
42
51
|
end
|
|
52
|
+
|
|
53
|
+
amt = pretty_amount(@amount * (negate ? -1 : 1))
|
|
54
|
+
amt = if @suffixed
|
|
55
|
+
"#{amt} #{@currency}"
|
|
56
|
+
else
|
|
57
|
+
amt.gsub(/^((-)|)(?=\d)/, "\\1#{@currency}")
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
return (@amount >= 0 ? " " : "") + amt
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
def pretty_amount(amount)
|
|
64
|
+
sprintf("%0.2f", amount).reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse
|
|
43
65
|
end
|
|
44
66
|
|
|
45
|
-
def
|
|
67
|
+
def parse(value, options = {})
|
|
68
|
+
value = value.to_s
|
|
46
69
|
# Empty string is treated as money with value 0
|
|
47
|
-
return
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
value = value.
|
|
52
|
-
value = value.
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
any_number_regex = /^(.*?)([\d\.]+)/
|
|
56
|
-
|
|
57
|
-
# Prefer matching the money_format, match any number otherwise
|
|
58
|
-
m = value.match( money_format_regex ) ||
|
|
59
|
-
value.match( any_number_regex )
|
|
60
|
-
if m
|
|
61
|
-
amount = m[2].to_f
|
|
62
|
-
# Check whether the money had a - or (, which indicates negative amounts
|
|
63
|
-
if (m[1].match( /^[\(-]/ ) || m[1].match( /-$/ ))
|
|
64
|
-
amount *= -1
|
|
65
|
-
end
|
|
66
|
-
return Money.new( amount, options )
|
|
67
|
-
else
|
|
68
|
-
return nil
|
|
69
|
-
end
|
|
70
|
+
return value.to_f if value.to_s.empty?
|
|
71
|
+
|
|
72
|
+
invert = value.match(/^\(.*\)$/)
|
|
73
|
+
value = value.gsub(/[^0-9,.-]/, '')
|
|
74
|
+
value = value.tr('.', '').tr(',', '.') if options[:comma_separates_cents]
|
|
75
|
+
value = value.tr(',', '')
|
|
76
|
+
value = value.to_f
|
|
77
|
+
return invert ? -value : value
|
|
70
78
|
end
|
|
71
79
|
|
|
72
|
-
def Money::likelihood(
|
|
80
|
+
def Money::likelihood(entry)
|
|
73
81
|
money_score = 0
|
|
74
82
|
# digits separated by , or . with no more than 2 trailing digits
|
|
75
83
|
money_score += 40 if entry.match(/\d+[,.]\d{2}[^\d]*$/)
|
|
@@ -83,31 +91,30 @@ module Reckon
|
|
|
83
91
|
end
|
|
84
92
|
|
|
85
93
|
class MoneyColumn < Array
|
|
86
|
-
def initialize(
|
|
87
|
-
arr.each { |str|
|
|
94
|
+
def initialize(arr = [], options = {})
|
|
95
|
+
arr.each { |str| push(Money.new(str, options)) }
|
|
88
96
|
end
|
|
89
97
|
|
|
90
98
|
def positive?
|
|
91
|
-
|
|
92
|
-
return false if money < 0
|
|
99
|
+
each do |money|
|
|
100
|
+
return false if money && money < 0
|
|
93
101
|
end
|
|
94
102
|
true
|
|
95
103
|
end
|
|
96
104
|
|
|
97
|
-
def merge!(
|
|
105
|
+
def merge!(other_column)
|
|
98
106
|
invert = false
|
|
99
|
-
invert = true if
|
|
100
|
-
|
|
107
|
+
invert = true if positive? && other_column.positive?
|
|
108
|
+
each_with_index do |mon, i|
|
|
101
109
|
other = other_column[i]
|
|
102
|
-
return nil if
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
elsif mon == 0.00 && other != 0.00
|
|
110
|
+
return nil if !mon || !other
|
|
111
|
+
|
|
112
|
+
if mon != 0.0 && other == 0.0
|
|
113
|
+
self[i] = -mon if invert
|
|
114
|
+
elsif mon == 0.0 && other != 0.0
|
|
108
115
|
self[i] = other
|
|
109
116
|
else
|
|
110
|
-
|
|
117
|
+
self[i] = Money.new(0)
|
|
111
118
|
end
|
|
112
119
|
end
|
|
113
120
|
self
|