reckon 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,119 +1,122 @@
1
1
  require 'matrix'
2
+ require 'set'
2
3
 
3
4
  # Implementation of cosine similarity using TF-IDF for vectorization.
4
5
  # Used to suggest which account a transaction should be assigned to
5
- class CosineSimilarity
6
- def initialize(options)
7
- @options = options
8
- @tokens = {}
9
- @accounts = Hash.new(0)
10
- end
11
-
12
- def add_document(account, doc)
13
- tokenize(doc).each do |n|
14
- (token, count) = n
15
-
16
- @tokens[token] ||= {}
17
- @tokens[token][account] ||= 0
18
- @tokens[token][account] += count
19
- @accounts[account] += count
6
+ module Reckon
7
+ class CosineSimilarity
8
+ def initialize(options)
9
+ @options = options
10
+ @tokens = {}
11
+ @accounts = Hash.new(0)
20
12
  end
21
- end
22
-
23
- # find most similar documents to query
24
- def find_similar(query)
25
- (query_scores, corpus_scores) = td_idf_scores_for(query)
26
13
 
27
- query_vector = Vector.elements(query_scores, false)
14
+ def add_document(account, doc)
15
+ tokenize(doc).each do |n|
16
+ (token, count) = n
28
17
 
29
- # For each doc, calculate the similarity to the query
30
- suggestions = corpus_scores.map do |account, scores|
31
- acct_vector = Vector.elements(scores, false)
18
+ @tokens[token] ||= {}
19
+ @tokens[token][account] ||= 0
20
+ @tokens[token][account] += count
21
+ @accounts[account] += count
22
+ end
23
+ end
32
24
 
33
- acct_query_dp = acct_vector.inner_product(query_vector)
34
- # similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
35
- # exactly opposite
36
- # see https://en.wikipedia.org/wiki/Cosine_similarity
37
- # cos(theta) = (A . B) / (||A|| ||B||)
38
- # where A . B is the "dot product" and ||A|| is the magnitude of A
39
- # ruby has the 'matrix' library we can use to do these calculations.
40
- {
41
- similarity: acct_query_dp / (acct_vector.magnitude * query_vector.magnitude),
42
- account: account,
43
- }
44
- end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
25
+ # find most similar documents to query
26
+ def find_similar(query)
27
+ (query_scores, corpus_scores) = td_idf_scores_for(query)
45
28
 
46
- LOGGER.info "most similar accounts: #{suggestions}"
29
+ query_vector = Vector.elements(query_scores, false)
47
30
 
48
- return suggestions
49
- end
31
+ # For each doc, calculate the similarity to the query
32
+ suggestions = corpus_scores.map do |account, scores|
33
+ acct_vector = Vector.elements(scores, false)
50
34
 
51
- private
35
+ acct_query_dp = acct_vector.inner_product(query_vector)
36
+ # similarity is a float between 1 and -1, where 1 is exactly the same and -1 is
37
+ # exactly opposite
38
+ # see https://en.wikipedia.org/wiki/Cosine_similarity
39
+ # cos(theta) = (A . B) / (||A|| ||B||)
40
+ # where A . B is the "dot product" and ||A|| is the magnitude of A
41
+ # ruby has the 'matrix' library we can use to do these calculations.
42
+ {
43
+ similarity: acct_query_dp / (acct_vector.magnitude * query_vector.magnitude),
44
+ account: account,
45
+ }
46
+ end.select { |n| n[:similarity] > 0 }.sort_by { |n| -n[:similarity] }
52
47
 
53
- def td_idf_scores_for(query)
54
- query_tokens = tokenize(query)
55
- corpus = Set.new
56
- corpus_scores = {}
57
- query_scores = []
58
- num_docs = @accounts.length
48
+ LOGGER.info "most similar accounts: #{suggestions}"
59
49
 
60
- query_tokens.each do |n|
61
- (token, _count) = n
62
- next unless @tokens[token]
63
- corpus = corpus.union(Set.new(@tokens[token].keys))
50
+ return suggestions
64
51
  end
65
52
 
66
- query_tokens.each do |n|
67
- (token, count) = n
53
+ private
68
54
 
69
- # if no other docs have token, ignore it
70
- next unless @tokens[token]
55
+ def td_idf_scores_for(query)
56
+ query_tokens = tokenize(query)
57
+ corpus = Set.new
58
+ corpus_scores = {}
59
+ query_scores = []
60
+ num_docs = @accounts.length
61
+
62
+ query_tokens.each do |n|
63
+ (token, _count) = n
64
+ next unless @tokens[token]
65
+ corpus = corpus.union(Set.new(@tokens[token].keys))
66
+ end
71
67
 
72
- ## First, calculate scores for our query as we're building scores for the corpus
73
- query_scores << calc_tf_idf(
74
- count,
75
- query_tokens.length,
76
- @tokens[token].length,
77
- num_docs
78
- )
68
+ query_tokens.each do |n|
69
+ (token, count) = n
79
70
 
80
- ## Next, calculate for the corpus, where our "account" is a document
81
- corpus.each do |account|
82
- corpus_scores[account] ||= []
71
+ # if no other docs have token, ignore it
72
+ next unless @tokens[token]
83
73
 
84
- corpus_scores[account] << calc_tf_idf(
85
- (@tokens[token][account] || 0),
86
- @accounts[account].to_f,
87
- @tokens[token].length.to_f,
74
+ ## First, calculate scores for our query as we're building scores for the corpus
75
+ query_scores << calc_tf_idf(
76
+ count,
77
+ query_tokens.length,
78
+ @tokens[token].length,
88
79
  num_docs
89
80
  )
81
+
82
+ ## Next, calculate for the corpus, where our "account" is a document
83
+ corpus.each do |account|
84
+ corpus_scores[account] ||= []
85
+
86
+ corpus_scores[account] << calc_tf_idf(
87
+ (@tokens[token][account] || 0),
88
+ @accounts[account].to_f,
89
+ @tokens[token].length.to_f,
90
+ num_docs
91
+ )
92
+ end
90
93
  end
94
+ [query_scores, corpus_scores]
91
95
  end
92
- [query_scores, corpus_scores]
93
- end
94
96
 
95
- def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
97
+ def calc_tf_idf(token_count, num_words_in_doc, df, num_docs)
96
98
 
97
- # tf(t,d) = count of t in d / number of words in d
98
- tf = token_count / num_words_in_doc.to_f
99
+ # tf(t,d) = count of t in d / number of words in d
100
+ tf = token_count / num_words_in_doc.to_f
99
101
 
100
- # smooth idf weight
101
- # see https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency_2
102
- # df(t) = num of documents with term t in them
103
- # idf(t) = log(N/(1 + df )) + 1
104
- idf = Math.log(num_docs.to_f / (1 + df)) + 1
102
+ # smooth idf weight
103
+ # see https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency_2
104
+ # df(t) = num of documents with term t in them
105
+ # idf(t) = log(N/(1 + df )) + 1
106
+ idf = Math.log(num_docs.to_f / (1 + df)) + 1
105
107
 
106
- tf * idf
107
- end
108
+ tf * idf
109
+ end
108
110
 
109
- def tokenize(str)
110
- mk_tokens(str).inject(Hash.new(0)) do |memo, n|
111
- memo[n] += 1
112
- memo
113
- end.to_a
114
- end
115
- end
111
+ def tokenize(str)
112
+ mk_tokens(str).inject(Hash.new(0)) do |memo, n|
113
+ memo[n] += 1
114
+ memo
115
+ end.to_a
116
+ end
116
117
 
117
- def mk_tokens(str)
118
- str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
118
+ def mk_tokens(str)
119
+ str.downcase.tr(';', ' ').tr("'", '').split(/[^a-z0-9.]+/)
120
+ end
121
+ end
119
122
  end
@@ -12,38 +12,69 @@ module Reckon
12
12
  detect_columns
13
13
  end
14
14
 
15
- def filter_csv
16
- if options[:ignore_columns]
17
- new_columns = []
18
- columns.each_with_index do |column, index|
19
- new_columns << column unless options[:ignore_columns].include?(index + 1)
15
+ def columns
16
+ @columns ||=
17
+ begin
18
+ last_row_length = nil
19
+ csv_data.inject([]) do |memo, row|
20
+ unless row.all? { |i| i.nil? || i.length == 0 }
21
+ row.each_with_index do |entry, index|
22
+ memo[index] ||= []
23
+ memo[index] << (entry || '').strip
24
+ end
25
+ last_row_length = row.length
26
+ end
27
+ memo
28
+ end
20
29
  end
21
- @columns = new_columns
22
- end
23
30
  end
24
31
 
25
- def money_for(index)
26
- @money_column[index]
32
+ def date_for(index)
33
+ @date_column.for(index)
27
34
  end
28
35
 
29
- def pretty_money_for(index, negate = false)
30
- money_for( index ).pretty( negate )
36
+ def pretty_date_for(index)
37
+ @date_column.pretty_for( index )
38
+ end
39
+
40
+ def money_for(index)
41
+ @money_column[index]
31
42
  end
32
43
 
33
44
  def pretty_money(amount, negate = false)
34
45
  Money.new( amount, @options ).pretty( negate )
35
46
  end
36
47
 
37
- def date_for(index)
38
- @date_column.for( index )
39
- end
48
+ def pretty_money_for(index, negate = false)
49
+ money = money_for(index)
50
+ return 0 if money.nil?
40
51
 
41
- def pretty_date_for(index)
42
- @date_column.pretty_for( index )
52
+ money.pretty(negate)
43
53
  end
44
54
 
45
55
  def description_for(index)
46
- description_column_indices.map { |i| columns[i][index] }.reject(&:empty?).join("; ").squeeze(" ").gsub(/(;\s+){2,}/, '').strip
56
+ description_column_indices.map { |i| columns[i][index].to_s.strip }
57
+ .reject(&:empty?)
58
+ .join("; ")
59
+ .squeeze(" ")
60
+ .gsub(/(;\s+){2,}/, '')
61
+ .strip
62
+ end
63
+
64
+ def row(index)
65
+ csv_data[index].join(", ")
66
+ end
67
+
68
+ private
69
+
70
+ def filter_csv
71
+ if options[:ignore_columns]
72
+ new_columns = []
73
+ columns.each_with_index do |column, index|
74
+ new_columns << column unless options[:ignore_columns].include?(index + 1)
75
+ end
76
+ @columns = new_columns
77
+ end
47
78
  end
48
79
 
49
80
  def evaluate_columns(cols)
@@ -87,48 +118,24 @@ module Reckon
87
118
  results << { :index => index, :money_score => money_score, :date_score => date_score }
88
119
  end
89
120
 
90
- return [results, found_likely_money_column]
91
- end
121
+ results.sort_by! { |n| -n[:money_score] }
92
122
 
93
- def merge_columns(a, b)
94
- output_columns = []
95
- columns.each_with_index do |column, index|
96
- if index == a
97
- new_column = MoneyColumn.new( column )
98
- .merge!( MoneyColumn.new( columns[b] ) )
99
- .map { |m| m.amount.to_s }
100
- output_columns << new_column
101
- elsif index == b
102
- # skip
103
- else
104
- output_columns << column
105
- end
123
+ # check if it looks like a 2-column file with a balance field
124
+ if results.length >= 3 && results[1][:money_score] + results[2][:money_score] >= results[0][:money_score]
125
+ results[1][:is_money_column] = true
126
+ results[2][:is_money_column] = true
127
+ else
128
+ results[0][:is_money_column] = true
106
129
  end
107
- output_columns
108
- end
109
130
 
110
- def evaluate_two_money_columns( columns, id1, id2, unmerged_results )
111
- merged_columns = merge_columns( id1, id2 )
112
- results, found_likely_money_column = evaluate_columns( merged_columns )
113
- if !found_likely_money_column
114
- new_res = results.find { |el| el[:index] == id1 }
115
- old_res1 = unmerged_results.find { |el| el[:index] == id1 }
116
- old_res2 = unmerged_results.find { |el| el[:index] == id2 }
117
- if new_res[:money_score] > old_res1[:money_score] &&
118
- new_res[:money_score] > old_res2[:money_score]
119
- found_likely_money_column = true
120
- end
121
- end
122
- [results, found_likely_money_column]
131
+ return results.sort_by { |n| n[:index] }
123
132
  end
124
133
 
125
- def found_double_money_column( id1, id2 )
126
- self.money_column_indices = [ id1, id2 ]
127
- unless settings[:testing]
128
- puts "It looks like this CSV has two seperate columns for money, one of which shows positive"
129
- puts "changes and one of which shows negative changes. If this is true, great. Otherwise,"
130
- puts "please report this issue to us so we can take a look!\n"
131
- end
134
+ def found_double_money_column(id1, id2)
135
+ self.money_column_indices = [id1, id2]
136
+ puts "It looks like this CSV has two seperate columns for money, one of which shows positive"
137
+ puts "changes and one of which shows negative changes. If this is true, great. Otherwise,"
138
+ puts "please report this issue to us so we can take a look!\n"
132
139
  end
133
140
 
134
141
  # Some csv files negative/positive amounts are indicated in separate account
@@ -158,41 +165,18 @@ module Reckon
158
165
  end
159
166
 
160
167
  def detect_columns
161
- results, found_likely_money_column = evaluate_columns(columns)
168
+ results = evaluate_columns(columns)
169
+
162
170
  if options[:money_column]
163
- found_likely_money_column = true
164
171
  self.money_column_indices = [ options[:money_column] - 1 ]
165
172
  else
166
- self.money_column_indices = [ results.max_by { |n| n[:money_score] }[:index] ]
167
- end
168
-
169
- if !found_likely_money_column
170
- found_likely_double_money_columns = false
171
- 0.upto(columns.length - 2) do |i|
172
- if MoneyColumn.new( columns[i] ).merge!( MoneyColumn.new( columns[i+1] ) )
173
- _, found_likely_double_money_columns = evaluate_columns(merge_columns(i, i+1))
174
- if found_likely_double_money_columns
175
- found_double_money_column( i, i + 1 )
176
- break
177
- end
178
- end
179
- end
180
-
181
- if !found_likely_double_money_columns
182
- 0.upto(columns.length - 2) do |i|
183
- if MoneyColumn.new( columns[i] ).merge!( MoneyColumn.new( columns[i+1] ) )
184
- # Try a more specific test
185
- _, found_likely_double_money_columns = evaluate_two_money_columns( columns, i, i+1, results )
186
- if found_likely_double_money_columns
187
- found_double_money_column( i, i + 1 )
188
- break
189
- end
190
- end
191
- end
192
- end
193
-
194
- if !found_likely_double_money_columns && !settings[:testing]
195
- puts "I didn't find a high-likelyhood money column, but I'm taking my best guess with column #{money_column_indices.first + 1}."
173
+ self.money_column_indices = results.select { |n| n[:is_money_column] }.map { |n| n[:index] }
174
+ if self.money_column_indices.length == 1
175
+ puts "Using column #{money_column_indices.first + 1} as the money column. Use --money-colum to specify a different one."
176
+ elsif self.money_column_indices.length == 2
177
+ found_double_money_column(*self.money_column_indices)
178
+ else
179
+ puts "Unable to determine a money column, use --money-column to specify the column reckon should use."
196
180
  end
197
181
  end
198
182
 
@@ -216,23 +200,6 @@ module Reckon
216
200
  self.description_column_indices = results.map { |i| i[:index] }
217
201
  end
218
202
 
219
- def columns
220
- @columns ||= begin
221
- last_row_length = nil
222
- csv_data.inject([]) do |memo, row|
223
- # fail "Input CSV must have consistent row lengths." if last_row_length && row.length != last_row_length
224
- unless row.all? { |i| i.nil? || i.length == 0 }
225
- row.each_with_index do |entry, index|
226
- memo[index] ||= []
227
- memo[index] << (entry || '').strip
228
- end
229
- last_row_length = row.length
230
- end
231
- memo
232
- end
233
- end
234
- end
235
-
236
203
  def parse(data, filename=nil)
237
204
  # Use force_encoding to convert the string to utf-8 with as few invalid characters
238
205
  # as possible.
@@ -274,15 +241,5 @@ module Reckon
274
241
  end
275
242
  m && m[1]
276
243
  end
277
-
278
- @settings = { :testing => false }
279
-
280
- def self.settings
281
- @settings
282
- end
283
-
284
- def settings
285
- self.class.settings
286
- end
287
244
  end
288
245
  end
@@ -0,0 +1,60 @@
1
+ module Reckon
2
+ class DateColumn < Array
3
+ attr_accessor :endian_precedence
4
+ def initialize( arr = [], options = {} )
5
+ arr.each do |value|
6
+ if options[:date_format]
7
+ begin
8
+ value = Date.strptime(value, options[:date_format])
9
+ rescue
10
+ puts "I'm having trouble parsing #{value} with the desired format: #{options[:date_format]}"
11
+ exit 1
12
+ end
13
+ else
14
+ value = [$1, $2, $3].join("/") if value =~ /^(\d{4})(\d{2})(\d{2})\d+\[\d+\:GMT\]$/ # chase format
15
+ value = [$3, $2, $1].join("/") if value =~ /^(\d{2})\.(\d{2})\.(\d{4})$/ # german format
16
+ value = [$3, $2, $1].join("/") if value =~ /^(\d{2})\-(\d{2})\-(\d{4})$/ # nordea format
17
+ value = [$1, $2, $3].join("/") if value =~ /^(\d{4})\-(\d{2})\-(\d{2})$/ # yyyy-mm-dd format
18
+ value = [$1, $2, $3].join("/") if value =~ /^(\d{4})(\d{2})(\d{2})/ # yyyymmdd format
19
+
20
+
21
+ unless @endian_precedence # Try to detect endian_precedence
22
+ reg_match = value.match( /^(\d\d)\/(\d\d)\/\d\d\d?\d?/ )
23
+ # If the value does not match \d\d/\d\d/\d\d\d?\d?, fall back to the default precedence [:middle, :little]
24
+ if !reg_match
25
+ @endian_precedence = [:middle, :little]
26
+ elsif reg_match[1].to_i > 12
27
+ @endian_precedence = [:little]
28
+ elsif reg_match[2].to_i > 12
29
+ @endian_precedence = [:middle]
30
+ end
31
+ end
32
+ end
33
+ self.push( value )
34
+ end
35
+ # if endian_precedence still nil, raise error
36
+ unless @endian_precedence || options[:date_format]
37
+ raise( "Unable to determine date format. Please specify using --date-format" )
38
+ end
39
+ end
40
+
41
+ def for( index )
42
+ value = self.at( index )
43
+ guess = Chronic.parse(value, :context => :past,
44
+ :endian_precedence => @endian_precedence )
45
+ if guess.to_i < 953236800 && value =~ /\//
46
+ guess = Chronic.parse((value.split("/")[0...-1] + [(2000 + value.split("/").last.to_i).to_s]).join("/"), :context => :past,
47
+ :endian_precedence => @endian_precedence)
48
+ end
49
+ guess && guess.to_date
50
+ end
51
+
52
+ def pretty_for(index)
53
+ date = self.for(index)
54
+ return "" if date.nil?
55
+
56
+ date.iso8601
57
+ end
58
+
59
+ end
60
+ end