reckon 0.6.0 → 0.7.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +50 -0
- data/.gitignore +3 -0
- data/.ruby-version +1 -1
- data/CHANGELOG.md +75 -7
- data/Gemfile.lock +1 -1
- data/README.md +85 -24
- data/Rakefile +17 -1
- data/bin/build-new-version.sh +26 -0
- data/bin/reckon +9 -1
- data/lib/reckon.rb +1 -0
- data/lib/reckon/app.rb +18 -141
- data/lib/reckon/cosine_similarity.rb +67 -62
- data/lib/reckon/csv_parser.rb +2 -7
- data/lib/reckon/date_column.rb +10 -0
- data/lib/reckon/money.rb +59 -52
- data/lib/reckon/options.rb +153 -0
- data/lib/reckon/version.rb +1 -1
- data/spec/cosine_training_and_test.rb +52 -0
- data/spec/integration/another_bank_example/input.csv +9 -0
- data/spec/integration/another_bank_example/output.ledger +36 -0
- data/spec/integration/another_bank_example/test_args +1 -0
- data/spec/integration/ask_for_account/cli_input.exp +33 -0
- data/spec/integration/ask_for_account/expected_output +11 -0
- data/spec/integration/ask_for_account/input.csv +9 -0
- data/spec/integration/ask_for_account/test_args +1 -0
- data/spec/integration/austrian_example/input.csv +13 -0
- data/spec/integration/austrian_example/output.ledger +52 -0
- data/spec/integration/austrian_example/test_args +2 -0
- data/spec/integration/bom_utf8_file/input.csv +3 -0
- data/spec/integration/bom_utf8_file/output.ledger +4 -0
- data/spec/integration/bom_utf8_file/test_args +3 -0
- data/spec/integration/broker_canada_example/input.csv +12 -0
- data/spec/integration/broker_canada_example/output.ledger +48 -0
- data/spec/integration/broker_canada_example/test_args +1 -0
- data/spec/integration/chase/account_tokens_and_regex/output.ledger +36 -0
- data/spec/integration/chase/account_tokens_and_regex/test_args +2 -0
- data/spec/integration/chase/account_tokens_and_regex/tokens.yml +16 -0
- data/spec/integration/chase/default_account_names/output.ledger +36 -0
- data/spec/integration/chase/default_account_names/test_args +3 -0
- data/spec/integration/chase/input.csv +9 -0
- data/spec/integration/chase/learn_from_existing/learn.ledger +7 -0
- data/spec/integration/chase/learn_from_existing/output.ledger +36 -0
- data/spec/integration/chase/learn_from_existing/test_args +1 -0
- data/spec/integration/chase/simple/output.ledger +36 -0
- data/spec/integration/chase/simple/test_args +1 -0
- data/spec/integration/danish_kroner_nordea_example/input.csv +6 -0
- data/spec/integration/danish_kroner_nordea_example/output.ledger +24 -0
- data/spec/integration/danish_kroner_nordea_example/test_args +1 -0
- data/spec/integration/english_date_example/input.csv +3 -0
- data/spec/integration/english_date_example/output.ledger +12 -0
- data/spec/integration/english_date_example/test_args +1 -0
- data/spec/integration/extratofake/input.csv +24 -0
- data/spec/integration/extratofake/output.ledger +92 -0
- data/spec/integration/extratofake/test_args +1 -0
- data/spec/integration/french_example/input.csv +9 -0
- data/spec/integration/french_example/output.ledger +36 -0
- data/spec/integration/french_example/test_args +2 -0
- data/spec/integration/german_date_example/input.csv +3 -0
- data/spec/integration/german_date_example/output.ledger +12 -0
- data/spec/integration/german_date_example/test_args +1 -0
- data/spec/integration/harder_date_example/input.csv +5 -0
- data/spec/integration/harder_date_example/output.ledger +20 -0
- data/spec/integration/harder_date_example/test_args +1 -0
- data/spec/integration/ing/input.csv +3 -0
- data/spec/integration/ing/output.ledger +12 -0
- data/spec/integration/ing/test_args +1 -0
- data/spec/integration/intuit_mint_example/input.csv +7 -0
- data/spec/integration/intuit_mint_example/output.ledger +28 -0
- data/spec/integration/intuit_mint_example/test_args +1 -0
- data/spec/integration/invalid_header_example/input.csv +6 -0
- data/spec/integration/invalid_header_example/output.ledger +8 -0
- data/spec/integration/invalid_header_example/test_args +1 -0
- data/spec/integration/inversed_credit_card/input.csv +16 -0
- data/spec/integration/inversed_credit_card/output.ledger +64 -0
- data/spec/integration/inversed_credit_card/test_args +1 -0
- data/spec/integration/nationwide/input.csv +4 -0
- data/spec/integration/nationwide/output.ledger +16 -0
- data/spec/integration/nationwide/test_args +1 -0
- data/spec/integration/regression/issue_51_account_tokens/input.csv +8 -0
- data/spec/integration/regression/issue_51_account_tokens/output.ledger +32 -0
- data/spec/integration/regression/issue_51_account_tokens/test_args +4 -0
- data/spec/integration/regression/issue_51_account_tokens/tokens.yml +9 -0
- data/spec/integration/regression/issue_64_date_column/input.csv +3 -0
- data/spec/integration/regression/issue_64_date_column/output.ledger +8 -0
- data/spec/integration/regression/issue_64_date_column/test_args +1 -0
- data/spec/integration/regression/issue_73_account_token_matching/input.csv +2 -0
- data/spec/integration/regression/issue_73_account_token_matching/output.ledger +4 -0
- data/spec/integration/regression/issue_73_account_token_matching/test_args +6 -0
- data/spec/integration/regression/issue_73_account_token_matching/tokens.yml +8 -0
- data/spec/integration/regression/issue_85_date_example/input.csv +2 -0
- data/spec/integration/regression/issue_85_date_example/output.ledger +8 -0
- data/spec/integration/regression/issue_85_date_example/test_args +1 -0
- data/spec/integration/spanish_date_example/input.csv +3 -0
- data/spec/integration/spanish_date_example/output.ledger +12 -0
- data/spec/integration/spanish_date_example/test_args +1 -0
- data/spec/integration/suntrust/input.csv +7 -0
- data/spec/integration/suntrust/output.ledger +28 -0
- data/spec/integration/suntrust/test_args +1 -0
- data/spec/integration/test.sh +123 -0
- data/spec/integration/test_money_column/input.csv +3 -0
- data/spec/integration/test_money_column/output.ledger +8 -0
- data/spec/integration/test_money_column/test_args +1 -0
- data/spec/integration/two_money_columns/input.csv +5 -0
- data/spec/integration/two_money_columns/output.ledger +20 -0
- data/spec/integration/two_money_columns/test_args +1 -0
- data/spec/integration/yyyymmdd_date_example/input.csv +1 -0
- data/spec/integration/yyyymmdd_date_example/output.ledger +4 -0
- data/spec/integration/yyyymmdd_date_example/test_args +1 -0
- data/spec/reckon/app_spec.rb +24 -6
- data/spec/reckon/csv_parser_spec.rb +3 -3
- data/spec/reckon/money_column_spec.rb +24 -24
- data/spec/reckon/money_spec.rb +15 -34
- data/spec/reckon/options_spec.rb +17 -0
- data/spec/spec_helper.rb +6 -1
- metadata +102 -7
- data/.travis.yml +0 -13
data/lib/reckon.rb
CHANGED
data/lib/reckon/app.rb
CHANGED
@@ -8,9 +8,10 @@ module Reckon
|
|
8
8
|
attr_accessor :options, :seen, :csv_parser, :regexps, :matcher
|
9
9
|
@@cli = HighLine.new
|
10
10
|
|
11
|
-
def initialize(
|
11
|
+
def initialize(opts = {})
|
12
|
+
self.options = opts
|
12
13
|
LOGGER.level = Logger::INFO if options[:verbose]
|
13
|
-
|
14
|
+
|
14
15
|
self.regexps = {}
|
15
16
|
self.seen = Set.new
|
16
17
|
self.options[:currency] ||= '$'
|
@@ -19,10 +20,10 @@ module Reckon
|
|
19
20
|
learn!
|
20
21
|
end
|
21
22
|
|
22
|
-
def interactive_output(str)
|
23
|
+
def interactive_output(str, fh = $stdout)
|
23
24
|
return if options[:unattended]
|
24
25
|
|
25
|
-
puts str
|
26
|
+
fh.puts str
|
26
27
|
end
|
27
28
|
|
28
29
|
def learn!
|
@@ -157,10 +158,10 @@ module Reckon
|
|
157
158
|
:money => @csv_parser.money_for(index),
|
158
159
|
:description => @csv_parser.description_for(index) }
|
159
160
|
end
|
160
|
-
rows.sort_by { |n| n[:date] }.each { |row| yield row }
|
161
|
+
rows.sort_by { |n| [n[:date], -n[:money], n[:description]] }.each { |row| yield row }
|
161
162
|
end
|
162
163
|
|
163
|
-
def print_transaction(rows)
|
164
|
+
def print_transaction(rows, fh = $stdout)
|
164
165
|
str = "\n"
|
165
166
|
header = %w[Date Amount Description Note]
|
166
167
|
maxes = header.map(&:length)
|
@@ -184,7 +185,7 @@ module Reckon
|
|
184
185
|
str += "\n"
|
185
186
|
end
|
186
187
|
|
187
|
-
interactive_output str
|
188
|
+
interactive_output str, fh
|
188
189
|
end
|
189
190
|
|
190
191
|
def ask_account_question(msg, row)
|
@@ -192,11 +193,13 @@ module Reckon
|
|
192
193
|
LOGGER.info "possible_answers===> #{possible_answers.inspect}"
|
193
194
|
|
194
195
|
if options[:unattended]
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
196
|
+
if options[:fail_on_unknown_account] && possible_answers.empty?
|
197
|
+
raise %(Couldn't find any matches for '#{row[:description]}'
|
198
|
+
Try adding an account token with --account-tokens)
|
199
|
+
end
|
200
|
+
|
201
|
+
default = options[:default_outof_account]
|
202
|
+
default = options[:default_into_account] if row[:pretty_money][0] == '-'
|
200
203
|
return possible_answers[0] || default
|
201
204
|
end
|
202
205
|
|
@@ -252,7 +255,7 @@ module Reckon
|
|
252
255
|
end
|
253
256
|
|
254
257
|
def ledger_format(row, line1, line2)
|
255
|
-
out = "#{row[:pretty_date]}\t#{row[:description]}\t;
|
258
|
+
out = "#{row[:pretty_date]}\t#{row[:description]}#{row[:note] ? "\t; " + row[:note]: ""}\n"
|
256
259
|
out += "\t#{line1.first}\t\t\t#{line1.last}\n"
|
257
260
|
out += "\t#{line2.first}\t\t\t#{line2.last}\n\n"
|
258
261
|
out
|
@@ -277,138 +280,12 @@ module Reckon
|
|
277
280
|
exit
|
278
281
|
end
|
279
282
|
|
280
|
-
def output_table
|
283
|
+
def output_table(fh = $stdout)
|
281
284
|
rows = []
|
282
285
|
each_row_backwards do |row|
|
283
286
|
rows << row
|
284
287
|
end
|
285
|
-
print_transaction(rows)
|
286
|
-
end
|
287
|
-
|
288
|
-
def self.parse_opts(args = ARGV)
|
289
|
-
options = { :output_file => STDOUT }
|
290
|
-
parser = OptionParser.new do |opts|
|
291
|
-
opts.banner = "Usage: Reckon.rb [options]"
|
292
|
-
opts.separator ""
|
293
|
-
|
294
|
-
opts.on("-f", "--file FILE", "The CSV file to parse") do |file|
|
295
|
-
options[:file] = file
|
296
|
-
end
|
297
|
-
|
298
|
-
opts.on("-a", "--account NAME", "The Ledger Account this file is for") do |a|
|
299
|
-
options[:bank_account] = a
|
300
|
-
end
|
301
|
-
|
302
|
-
opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
|
303
|
-
options[:verbose] = v
|
304
|
-
end
|
305
|
-
|
306
|
-
opts.on("-i", "--inverse", "Use the negative of each amount") do |v|
|
307
|
-
options[:inverse] = v
|
308
|
-
end
|
309
|
-
|
310
|
-
opts.on("-p", "--print-table", "Print out the parsed CSV in table form") do |p|
|
311
|
-
options[:print_table] = p
|
312
|
-
end
|
313
|
-
|
314
|
-
opts.on("-o", "--output-file FILE", "The ledger file to append to") do |o|
|
315
|
-
options[:output_file] = File.open(o, 'a')
|
316
|
-
end
|
317
|
-
|
318
|
-
opts.on("-l", "--learn-from FILE", "An existing ledger file to learn accounts from") do |l|
|
319
|
-
options[:existing_ledger_file] = l
|
320
|
-
end
|
321
|
-
|
322
|
-
opts.on("", "--ignore-columns 1,2,5", "Columns to ignore in the CSV file - the first column is column 1") do |ignore|
|
323
|
-
options[:ignore_columns] = ignore.split(",").map { |i| i.to_i }
|
324
|
-
end
|
325
|
-
|
326
|
-
opts.on("", "--money-column 2", Integer, "Specify the money column instead of letting Reckon guess - the first column is column 1") do |column_number|
|
327
|
-
options[:money_column] = column_number
|
328
|
-
end
|
329
|
-
|
330
|
-
opts.on("", "--date-column 3", Integer, "Specify the date column instead of letting Reckon guess - the first column is column 1") do |column_number|
|
331
|
-
options[:date_column] = column_number
|
332
|
-
end
|
333
|
-
|
334
|
-
opts.on("", "--contains-header [N]", "The first row of the CSV is a header and should be skipped. Optionally add the number of rows to skip.") do |contains_header|
|
335
|
-
options[:contains_header] = 1
|
336
|
-
options[:contains_header] = contains_header.to_i if contains_header
|
337
|
-
end
|
338
|
-
|
339
|
-
opts.on("", "--csv-separator ','", "Separator for parsing the CSV - default is comma.") do |csv_separator|
|
340
|
-
options[:csv_separator] = csv_separator
|
341
|
-
end
|
342
|
-
|
343
|
-
opts.on("", "--comma-separates-cents", "Use comma instead of period to deliminate dollars from cents when parsing ($100,50 instead of $100.50)") do |c|
|
344
|
-
options[:comma_separates_cents] = c
|
345
|
-
end
|
346
|
-
|
347
|
-
opts.on("", "--encoding 'UTF-8'", "Specify an encoding for the CSV file; not usually needed") do |e|
|
348
|
-
options[:encoding] = e
|
349
|
-
end
|
350
|
-
|
351
|
-
opts.on("-c", "--currency '$'", "Currency symbol to use, defaults to $ (£, EUR)") do |e|
|
352
|
-
options[:currency] = e
|
353
|
-
end
|
354
|
-
|
355
|
-
opts.on("", "--date-format '%d/%m/%Y'", "Force the date format (see Ruby DateTime strftime)") do |d|
|
356
|
-
options[:date_format] = d
|
357
|
-
end
|
358
|
-
|
359
|
-
opts.on("-u", "--unattended", "Don't ask questions and guess all the accounts automatically. Used with --learn-from or --account-tokens options.") do |n|
|
360
|
-
options[:unattended] = n
|
361
|
-
end
|
362
|
-
|
363
|
-
opts.on("-t", "--account-tokens FILE", "YAML file with manually-assigned tokens for each account (see README)") do |a|
|
364
|
-
options[:account_tokens_file] = a
|
365
|
-
end
|
366
|
-
|
367
|
-
opts.on("", "--default-into-account NAME", "Default into account") do |a|
|
368
|
-
options[:default_into_account] = a
|
369
|
-
end
|
370
|
-
|
371
|
-
opts.on("", "--default-outof-account NAME", "Default 'out of' account") do |a|
|
372
|
-
options[:default_outof_account] = a
|
373
|
-
end
|
374
|
-
|
375
|
-
opts.on("", "--suffixed", "If --currency should be used as a suffix. Defaults to false.") do |e|
|
376
|
-
options[:suffixed] = e
|
377
|
-
end
|
378
|
-
|
379
|
-
opts.on_tail("-h", "--help", "Show this message") do
|
380
|
-
puts opts
|
381
|
-
exit
|
382
|
-
end
|
383
|
-
|
384
|
-
opts.on_tail("--version", "Show version") do
|
385
|
-
puts VERSION
|
386
|
-
exit
|
387
|
-
end
|
388
|
-
|
389
|
-
opts.parse!(args)
|
390
|
-
end
|
391
|
-
|
392
|
-
unless options[:file]
|
393
|
-
options[:file] = @@cli.ask("What CSV file should I parse? ")
|
394
|
-
unless options[:file].length > 0
|
395
|
-
puts "\nYou must provide a CSV file to parse.\n"
|
396
|
-
puts parser
|
397
|
-
exit
|
398
|
-
end
|
399
|
-
end
|
400
|
-
|
401
|
-
unless options[:bank_account]
|
402
|
-
fail "Please specify --account for the unattended mode" if options[:unattended]
|
403
|
-
|
404
|
-
options[:bank_account] = @@cli.ask("What is the account name of this bank account in Ledger? ") do |q|
|
405
|
-
q.readline = true
|
406
|
-
q.validate = /^.{2,}$/
|
407
|
-
q.default = "Assets:Bank:Checking"
|
408
|
-
end
|
409
|
-
end
|
410
|
-
|
411
|
-
options
|
288
|
+
print_transaction(rows, fh)
|
412
289
|
end
|
413
290
|
end
|
414
291
|
end
|
@@ -1,47 +1,52 @@
|
|
1
1
|
require 'matrix'
|
2
2
|
require 'set'
|
3
3
|
|
4
|
-
# Implementation of
|
5
|
-
#
|
4
|
+
# Implementation of cosine similarity using TF-IDF for vectorization.
|
5
|
+
#
|
6
|
+
# In information retrieval, tf–idf, short for term frequency–inverse document frequency,
|
7
|
+
# is a numerical statistic that is intended to reflect how important a word is to a
|
8
|
+
# document in a collection or corpus
|
9
|
+
#
|
10
|
+
# Cosine Similarity a measurement to determine how similar 2 documents are to each other.
|
11
|
+
#
|
12
|
+
# These weights and measures are used to suggest which account a transaction should be
|
13
|
+
# assigned to.
|
6
14
|
module Reckon
|
7
15
|
class CosineSimilarity
|
16
|
+
DocumentInfo = Struct.new(:tokens, :accounts)
|
17
|
+
|
8
18
|
def initialize(options)
|
19
|
+
@docs = DocumentInfo.new({}, {})
|
9
20
|
@options = options
|
10
|
-
@tokens = {}
|
11
|
-
@accounts = Hash.new(0)
|
12
21
|
end
|
13
22
|
|
14
23
|
def add_document(account, doc)
|
15
|
-
tokenize(doc)
|
24
|
+
tokens = tokenize(doc)
|
25
|
+
LOGGER.info "doc tokens: #{tokens}"
|
26
|
+
tokens.each do |n|
|
16
27
|
(token, count) = n
|
17
28
|
|
18
|
-
@tokens[token] ||=
|
19
|
-
@tokens[token][account]
|
20
|
-
@
|
21
|
-
@accounts[account] += count
|
29
|
+
@docs.tokens[token] ||= Hash.new(0)
|
30
|
+
@docs.tokens[token][account] += count
|
31
|
+
@docs.accounts[account] ||= Hash.new(0)
|
32
|
+
@docs.accounts[account][token] += count
|
22
33
|
end
|
23
34
|
end
|
24
35
|
|
25
36
|
# find most similar documents to query
|
26
37
|
def find_similar(query)
|
27
|
-
|
38
|
+
LOGGER.info "find_similar #{query}"
|
28
39
|
|
29
|
-
|
40
|
+
accounts = docs_to_check(query).map do |a|
|
41
|
+
[a, tfidf(@docs.accounts[a])]
|
42
|
+
end
|
30
43
|
|
31
|
-
|
32
|
-
suggestions = corpus_scores.map do |account, scores|
|
33
|
-
acct_vector = Vector.elements(scores, false)
|
44
|
+
q = tfidf(tokenize(query))
|
34
45
|
|
35
|
-
|
36
|
-
# similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
|
37
|
-
# exactly opposite
|
38
|
-
# see https://en.wikipedia.org/wiki/Cosine_similarity
|
39
|
-
# cos(theta) = (A . B) / (||A|| ||B||)
|
40
|
-
# where A . B is the "dot product" and ||A|| is the magnitude of A
|
41
|
-
# ruby has the 'matrix' library we can use to do these calculations.
|
46
|
+
suggestions = accounts.map do |a, d|
|
42
47
|
{
|
43
|
-
similarity:
|
44
|
-
account:
|
48
|
+
similarity: calc_similarity(q, d),
|
49
|
+
account: a
|
45
50
|
}
|
46
51
|
end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
|
47
52
|
|
@@ -52,50 +57,51 @@ module Reckon
|
|
52
57
|
|
53
58
|
private
|
54
59
|
|
55
|
-
def
|
56
|
-
|
57
|
-
|
58
|
-
corpus_scores = {}
|
59
|
-
query_scores = []
|
60
|
-
num_docs = @accounts.length
|
61
|
-
|
62
|
-
query_tokens.each do |n|
|
63
|
-
(token, _count) = n
|
64
|
-
next unless @tokens[token]
|
65
|
-
corpus = corpus.union(Set.new(@tokens[token].keys))
|
60
|
+
def docs_to_check(query)
|
61
|
+
return tokenize(query).reduce(Set.new) do |corpus, t|
|
62
|
+
corpus.union(Set.new(@docs.tokens[t[0]]&.keys))
|
66
63
|
end
|
64
|
+
end
|
67
65
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
# if no other docs have token, ignore it
|
72
|
-
next unless @tokens[token]
|
66
|
+
def tfidf(tokens)
|
67
|
+
scores = {}
|
73
68
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
@tokens[
|
79
|
-
|
69
|
+
tokens.each do |t, n|
|
70
|
+
scores[t] = calc_tf_idf(
|
71
|
+
n,
|
72
|
+
tokens.length,
|
73
|
+
@docs.tokens[t]&.length&.to_f || 0,
|
74
|
+
@docs.accounts.length
|
80
75
|
)
|
81
|
-
|
82
|
-
## Next, calculate for the corpus, where our "account" is a document
|
83
|
-
corpus.each do |account|
|
84
|
-
corpus_scores[account] ||= []
|
85
|
-
|
86
|
-
corpus_scores[account] << calc_tf_idf(
|
87
|
-
(@tokens[token][account] || 0),
|
88
|
-
@accounts[account].to_f,
|
89
|
-
@tokens[token].length.to_f,
|
90
|
-
num_docs
|
91
|
-
)
|
92
|
-
end
|
93
76
|
end
|
94
|
-
|
77
|
+
|
78
|
+
return scores
|
95
79
|
end
|
96
80
|
|
97
|
-
|
81
|
+
# Cosine similarity is used to compare how similar 2 documents are. Returns a float
|
82
|
+
# between 1 and -1, where 1 is exactly the same and -1 is exactly opposite.
|
83
|
+
#
|
84
|
+
# see https://en.wikipedia.org/wiki/Cosine_similarity
|
85
|
+
# cos(theta) = (A . B) / (||A|| ||B||)
|
86
|
+
# where A . B is the "dot product" and ||A|| is the magnitude of A
|
87
|
+
#
|
88
|
+
# The variables A and B are the set of unique terms in q and d.
|
89
|
+
#
|
90
|
+
# For example, when q = "big red balloon" and d ="small green balloon" then the
|
91
|
+
# variables are (big,red,balloon,small,green) and a = (1,1,1,0,0) and b =
|
92
|
+
# (0,0,1,1,1).
|
93
|
+
#
|
94
|
+
# query and doc are hashes of token => tf/idf score
|
95
|
+
def calc_similarity(query, doc)
|
96
|
+
tokens = Set.new(query.keys + doc.keys)
|
97
|
+
|
98
|
+
a = Vector.elements(tokens.map { |n| query[n] || 0 }, false)
|
99
|
+
b = Vector.elements(tokens.map { |n| doc[n] || 0 }, false)
|
100
|
+
|
101
|
+
return a.inner_product(b) / (a.magnitude * b.magnitude)
|
102
|
+
end
|
98
103
|
|
104
|
+
def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
|
99
105
|
# tf(t,d) = count of t in d / number of words in d
|
100
106
|
tf = token_count / num_words_in_doc.to_f
|
101
107
|
|
@@ -109,14 +115,13 @@ module Reckon
|
|
109
115
|
end
|
110
116
|
|
111
117
|
def tokenize(str)
|
112
|
-
mk_tokens(str).
|
118
|
+
mk_tokens(str).each_with_object(Hash.new(0)) do |n, memo|
|
113
119
|
memo[n] += 1
|
114
|
-
memo
|
115
120
|
end.to_a
|
116
121
|
end
|
117
122
|
|
118
123
|
def mk_tokens(str)
|
119
|
-
str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
|
124
|
+
str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/).reject(&:empty?)
|
120
125
|
end
|
121
126
|
end
|
122
127
|
end
|
data/lib/reckon/csv_parser.rb
CHANGED
@@ -89,12 +89,7 @@ module Reckon
|
|
89
89
|
money_score += Money::likelihood( entry )
|
90
90
|
possible_neg_money_count += 1 if entry =~ /^\$?[\-\(]\$?\d+/
|
91
91
|
possible_pos_money_count += 1 if entry =~ /^\+?\$?\+?\d+/
|
92
|
-
date_score +=
|
93
|
-
date_score += 5 if entry =~ /^[\-\/\.\d:\[\]]+$/
|
94
|
-
date_score += entry.gsub(/[^\-\/\.\d:\[\]]/, '').length if entry.gsub(/[^\-\/\.\d:\[\]]/, '').length > 3
|
95
|
-
date_score -= entry.gsub(/[\-\/\.\d:\[\]]/, '').length
|
96
|
-
date_score += 30 if entry =~ /^\d+[:\/\.-]\d+[:\/\.-]\d+([ :]\d+[:\/\.]\d+)?$/
|
97
|
-
date_score += 10 if entry =~ /^\d+\[\d+:GMT\]$/i
|
92
|
+
date_score += DateColumn.likelihood(entry)
|
98
93
|
|
99
94
|
# Try to determine if this is a balance column
|
100
95
|
entry_as_num = entry.gsub(/[^\-\d\.]/, '').to_f
|
@@ -168,7 +163,7 @@ module Reckon
|
|
168
163
|
results = evaluate_columns(columns)
|
169
164
|
|
170
165
|
if options[:money_column]
|
171
|
-
self.money_column_indices = [
|
166
|
+
self.money_column_indices = [options[:money_column] - 1]
|
172
167
|
else
|
173
168
|
self.money_column_indices = results.select { |n| n[:is_money_column] }.map { |n| n[:index] }
|
174
169
|
if self.money_column_indices.length == 1
|
data/lib/reckon/date_column.rb
CHANGED
@@ -56,5 +56,15 @@ module Reckon
|
|
56
56
|
date.iso8601
|
57
57
|
end
|
58
58
|
|
59
|
+
def self.likelihood(entry)
|
60
|
+
date_score = 0
|
61
|
+
date_score += 10 if entry =~ /\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/i
|
62
|
+
date_score += 5 if entry =~ /^[\-\/\.\d:\[\]]+$/
|
63
|
+
date_score += entry.gsub(/[^\-\/\.\d:\[\]]/, '').length if entry.gsub(/[^\-\/\.\d:\[\]]/, '').length > 3
|
64
|
+
date_score -= entry.gsub(/[\-\/\.\d:\[\]]/, '').length
|
65
|
+
date_score += 30 if entry =~ /^\d+[:\/\.-]\d+[:\/\.-]\d+([ :]\d+[:\/\.]\d+)?$/
|
66
|
+
date_score += 10 if entry =~ /^\d+\[\d+:GMT\]$/i
|
67
|
+
return date_score
|
68
|
+
end
|
59
69
|
end
|
60
70
|
end
|
data/lib/reckon/money.rb
CHANGED
@@ -5,12 +5,13 @@ module Reckon
|
|
5
5
|
class Money
|
6
6
|
include Comparable
|
7
7
|
attr_accessor :amount, :currency, :suffixed
|
8
|
-
def initialize(
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
8
|
+
def initialize(amount, options = {})
|
9
|
+
@options = options
|
10
|
+
@amount_raw = amount
|
11
|
+
@raw = options[:raw]
|
12
|
+
|
13
|
+
@amount = parse(amount, options)
|
14
|
+
@amount = -@amount if options[:inverse]
|
14
15
|
@currency = options[:currency] || "$"
|
15
16
|
@suffixed = options[:suffixed]
|
16
17
|
end
|
@@ -19,11 +20,19 @@ module Reckon
|
|
19
20
|
return @amount
|
20
21
|
end
|
21
22
|
|
23
|
+
def to_s
|
24
|
+
return @options[:raw] ? "#{@amount_raw} | #{@amount}" : @amount
|
25
|
+
end
|
26
|
+
|
27
|
+
# unary minus
|
28
|
+
# ex
|
29
|
+
# m = Money.new
|
30
|
+
# -m
|
22
31
|
def -@
|
23
|
-
Money.new(
|
32
|
+
Money.new(-@amount, :currency => @currency, :suffixed => @suffixed)
|
24
33
|
end
|
25
34
|
|
26
|
-
def <=>(
|
35
|
+
def <=>(mon)
|
27
36
|
other_amount = mon.to_f
|
28
37
|
if @amount < other_amount
|
29
38
|
-1
|
@@ -34,42 +43,41 @@ module Reckon
|
|
34
43
|
end
|
35
44
|
end
|
36
45
|
|
37
|
-
def pretty(
|
38
|
-
if @
|
39
|
-
|
40
|
-
|
41
|
-
|
46
|
+
def pretty(negate = false)
|
47
|
+
if @raw
|
48
|
+
return @amount_raw unless negate
|
49
|
+
|
50
|
+
return @amount_raw[0] == '-' ? @amount_raw[1..-1] : "-#{@amount_raw}"
|
42
51
|
end
|
52
|
+
|
53
|
+
amt = pretty_amount(@amount * (negate ? -1 : 1))
|
54
|
+
amt = if @suffixed
|
55
|
+
"#{amt} #{@currency}"
|
56
|
+
else
|
57
|
+
amt.gsub(/^((-)|)(?=\d)/, "\\1#{@currency}")
|
58
|
+
end
|
59
|
+
|
60
|
+
return (@amount >= 0 ? " " : "") + amt
|
61
|
+
end
|
62
|
+
|
63
|
+
def pretty_amount(amount)
|
64
|
+
sprintf("%0.2f", amount).reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse
|
43
65
|
end
|
44
66
|
|
45
|
-
def
|
67
|
+
def parse(value, options = {})
|
68
|
+
value = value.to_s
|
46
69
|
# Empty string is treated as money with value 0
|
47
|
-
return
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
value = value.
|
52
|
-
value = value.
|
53
|
-
|
54
|
-
|
55
|
-
any_number_regex = /^(.*?)([\d\.]+)/
|
56
|
-
|
57
|
-
# Prefer matching the money_format, match any number otherwise
|
58
|
-
m = value.match( money_format_regex ) ||
|
59
|
-
value.match( any_number_regex )
|
60
|
-
if m
|
61
|
-
amount = m[2].to_f
|
62
|
-
# Check whether the money had a - or (, which indicates negative amounts
|
63
|
-
if (m[1].match( /^[\(-]/ ) || m[1].match( /-$/ ))
|
64
|
-
amount *= -1
|
65
|
-
end
|
66
|
-
return Money.new( amount, options )
|
67
|
-
else
|
68
|
-
return nil
|
69
|
-
end
|
70
|
+
return value.to_f if value.to_s.empty?
|
71
|
+
|
72
|
+
invert = value.match(/^\(.*\)$/)
|
73
|
+
value = value.gsub(/[^0-9,.-]/, '')
|
74
|
+
value = value.tr('.', '').tr(',', '.') if options[:comma_separates_cents]
|
75
|
+
value = value.tr(',', '')
|
76
|
+
value = value.to_f
|
77
|
+
return invert ? -value : value
|
70
78
|
end
|
71
79
|
|
72
|
-
def Money::likelihood(
|
80
|
+
def Money::likelihood(entry)
|
73
81
|
money_score = 0
|
74
82
|
# digits separated by , or . with no more than 2 trailing digits
|
75
83
|
money_score += 40 if entry.match(/\d+[,.]\d{2}[^\d]*$/)
|
@@ -83,31 +91,30 @@ module Reckon
|
|
83
91
|
end
|
84
92
|
|
85
93
|
class MoneyColumn < Array
|
86
|
-
def initialize(
|
87
|
-
arr.each { |str|
|
94
|
+
def initialize(arr = [], options = {})
|
95
|
+
arr.each { |str| push(Money.new(str, options)) }
|
88
96
|
end
|
89
97
|
|
90
98
|
def positive?
|
91
|
-
|
92
|
-
return false if money < 0
|
99
|
+
each do |money|
|
100
|
+
return false if money && money < 0
|
93
101
|
end
|
94
102
|
true
|
95
103
|
end
|
96
104
|
|
97
|
-
def merge!(
|
105
|
+
def merge!(other_column)
|
98
106
|
invert = false
|
99
|
-
invert = true if
|
100
|
-
|
107
|
+
invert = true if positive? && other_column.positive?
|
108
|
+
each_with_index do |mon, i|
|
101
109
|
other = other_column[i]
|
102
|
-
return nil if
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
elsif mon == 0.00 && other != 0.00
|
110
|
+
return nil if !mon || !other
|
111
|
+
|
112
|
+
if mon != 0.0 && other == 0.0
|
113
|
+
self[i] = -mon if invert
|
114
|
+
elsif mon == 0.0 && other != 0.0
|
108
115
|
self[i] = other
|
109
116
|
else
|
110
|
-
|
117
|
+
self[i] = Money.new(0)
|
111
118
|
end
|
112
119
|
end
|
113
120
|
self
|