reckon 0.9.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 26126e1e4ead4fcd93a94093f8e4a4fd557a9c46ccc6983198d4bdd516e639ee
4
- data.tar.gz: f196a86ca58ebaeee4f27030bb21af9fd3c3001d20d06b74f2063ea42d96cfcc
3
+ metadata.gz: 03c20b48d4333969c8304a5bb9a3c01fc6053050ab9146329ce14ae6a9886b38
4
+ data.tar.gz: 27a2ce4e8db5c7818cc4cefb19f180a7c727190f0a990403f565fad503e749a9
5
5
  SHA512:
6
- metadata.gz: 02ad471caf5a5b6b69d98cde9cfa5e0579bbe680f50bb7fa8c1f5b3a7908018a48075531674a27d23139913be9156f889cac8010e06de87439c0665b064d7171
7
- data.tar.gz: a8abf375fab7ba91d31a0d05ee372a5fc788feca0fe6a61859cb4ed6c72a3387238a4df2d2dac2ce300fe4f047142ea18bbe03a364e008e2b9588143f0223852
6
+ metadata.gz: 2f569b3d5cf4038714065a6d184d6c07f57d10598e5efc610eeb9919e8b18c65aff5e5329ab89a9ed30f72cabce9d11f5645af4d0df3bda6d05ad9afd988f7e7
7
+ data.tar.gz: 1783a63ba138c2b87a0756d6b9bcfbce068daf977e582a4c920a37ff50358328f8514f308dbbf932ef5cc4111e9e52dadfaed5876b9d30f4759d4a1eb31299fa
@@ -9,7 +9,9 @@ name: Build Status
9
9
 
10
10
  on:
11
11
  push:
12
+ branches: [ master ]
12
13
  pull_request:
14
+ branches: [ master ]
13
15
 
14
16
  jobs:
15
17
  test:
data/.rubocop.yml ADDED
@@ -0,0 +1,20 @@
1
+ Layout/LineLength:
2
+ Max: 88
3
+
4
+ Style/StringLiterals:
5
+ Enabled: false
6
+
7
+ Style/RedundantReturn:
8
+ Enabled: false
9
+
10
+ Metrics/ClassLength:
11
+ Enabled: False
12
+
13
+ Metrics/MethodLength:
14
+ Enabled: False
15
+
16
+ Metrics/AbcSize:
17
+ Enabled: False
18
+
19
+ Style/NumericPredicate:
20
+ Enabled: False
data/CHANGELOG.md CHANGED
@@ -1,5 +1,15 @@
1
1
  # Changelog
2
2
 
3
+ ## [v0.9.1](https://github.com/cantino/reckon/tree/v0.9.1) (2023-03-19)
4
+
5
+ [Full Changelog](https://github.com/cantino/reckon/compare/v0.9.0...v0.9.1)
6
+
7
+ **Closed issues:**
8
+
9
+ - More than one column support [\#120](https://github.com/cantino/reckon/issues/120)
10
+ - Beancount support [\#119](https://github.com/cantino/reckon/issues/119)
11
+ - Problem with importing CSV [\#60](https://github.com/cantino/reckon/issues/60)
12
+
3
13
  ## [v0.9.0](https://github.com/cantino/reckon/tree/v0.9.0) (2023-02-23)
4
14
 
5
15
  [Full Changelog](https://github.com/cantino/reckon/compare/v0.9.0-beta...v0.9.0)
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- reckon (0.9.0)
4
+ reckon (0.9.1)
5
5
  chronic (>= 0.3.0)
6
6
  highline (>= 1.5.2)
7
7
  matrix (>= 0.4.2)
data/Rakefile CHANGED
@@ -13,10 +13,10 @@ task :test_all do
13
13
  puts "Running unit tests"
14
14
  Rake::Task["spec"].invoke
15
15
  puts "Running integration tests"
16
- Rake::Task["integration_tests"].invoke
16
+ Rake::Task["test_integration"].invoke
17
17
  end
18
18
 
19
- task :integration_tests do
19
+ task :test_integration do
20
20
  cmd = 'prove -v ./spec/integration/test.sh'
21
21
  raise 'Integration tests failed' unless system(cmd)
22
22
  end
@@ -1,6 +1,6 @@
1
1
  #!/bin/bash
2
2
 
3
- set -e
3
+ set -xe
4
4
 
5
5
  VERSION=$1
6
6
 
@@ -8,7 +8,7 @@ echo "Install github_changelog_generator"
8
8
  gem install --user github_changelog_generator
9
9
 
10
10
  echo "Update 'lib/reckon/version.rb'"
11
- echo -e "module Reckon\n VERSION=\"$VERSION\"\nend" > lib/reckon/version.rb
11
+ echo -e "module Reckon\n VERSION = \"$VERSION\"\nend" > lib/reckon/version.rb
12
12
  echo "Run `bundle install` to build updated Gemfile.lock"
13
13
  bundle install
14
14
  echo "Run changelog generator (requires $TOKEN to be your github token)"
@@ -24,3 +24,4 @@ echo "Push changes and tags"
24
24
  echo "git push && git push --tags"
25
25
  echo "Push new gem"
26
26
  echo "gem push reckon-$VERSION.gem"
27
+ gh release create v$VERSION reckon-$VERSION.gem --draft --generate-notes
data/bin/reckon CHANGED
@@ -4,7 +4,7 @@ require 'rubygems'
4
4
  require 'reckon'
5
5
 
6
6
  begin
7
- options = Reckon::Options.parse
7
+ options = Reckon::Options.parse_command_line_options
8
8
  rescue RuntimeError => e
9
9
  puts("ERROR: #{e}")
10
10
  exit(1)
data/lib/reckon/app.rb CHANGED
@@ -1,12 +1,12 @@
1
- # coding: utf-8
1
+ # frozen_string_literal: true
2
2
 
3
- require 'pp'
4
3
  require 'yaml'
4
+ require 'stringio'
5
5
 
6
6
  module Reckon
7
+ # The main app
7
8
  class App
8
9
  attr_accessor :options, :seen, :csv_parser, :regexps, :matcher
9
- @@cli = HighLine.new
10
10
 
11
11
  def initialize(opts = {})
12
12
  self.options = opts
@@ -14,9 +14,10 @@ module Reckon
14
14
 
15
15
  self.regexps = {}
16
16
  self.seen = Set.new
17
- self.options[:currency] ||= '$'
18
- @csv_parser = CSVParser.new( options )
17
+ @cli = HighLine.new
18
+ @csv_parser = CSVParser.new(options)
19
19
  @matcher = CosineSimilarity.new(options)
20
+ @parser = options[:format] =~ /beancount/i ? BeancountParser.new : LedgerParser.new
20
21
  learn!
21
22
  end
22
23
 
@@ -26,9 +27,13 @@ module Reckon
26
27
  fh.puts str
27
28
  end
28
29
 
30
+ # Learn from previous transactions. Used to recommend accounts for a transaction.
29
31
  def learn!
30
32
  learn_from_account_tokens(options[:account_tokens_file])
31
33
  learn_from_ledger_file(options[:existing_ledger_file])
34
+ # TODO: make this work
35
+ # this doesn't work because output_file is an IO object
36
+ # learn_from_ledger_file(options[:output_file]) if File.exist?(options[:output_file])
32
37
  end
33
38
 
34
39
  def learn_from_account_tokens(filename)
@@ -52,12 +57,13 @@ module Reckon
52
57
 
53
58
  raise "#{ledger_file} doesn't exist!" unless File.exist?(ledger_file)
54
59
 
55
- learn_from_ledger(File.read(ledger_file))
60
+ learn_from_ledger(File.new(ledger_file))
56
61
  end
57
62
 
63
+ # Takes an IO-like object
58
64
  def learn_from_ledger(ledger)
59
65
  LOGGER.info "learning from #{ledger}"
60
- LedgerParser.new(ledger).entries.each do |entry|
66
+ @parser.parse(ledger).each do |entry|
61
67
  entry[:accounts].each do |account|
62
68
  str = [entry[:desc], account[:amount]].join(" ")
63
69
  if account[:name] != options[:bank_account]
@@ -84,7 +90,7 @@ module Reckon
84
90
  merged_acct = [account, k].compact.join(':')
85
91
  extract_account_tokens(v, merged_acct)
86
92
  end
87
- at.inject({}) { |memo, e| memo.merge!(e)}
93
+ at.inject({}) { |memo, e| memo.merge!(e) }
88
94
  end
89
95
  end
90
96
 
@@ -92,6 +98,7 @@ module Reckon
92
98
  # https://github.com/tenderlove/psych/blob/master/lib/psych/visitors/to_ruby.rb
93
99
  match = regex_str.match(/^\/(.*)\/([ix]*)$/m)
94
100
  fail "failed to parse regexp #{regex_str}" unless match
101
+
95
102
  options = 0
96
103
  (match[2] || '').split('').each do |option|
97
104
  case option
@@ -120,13 +127,16 @@ module Reckon
120
127
 
121
128
  if row[:money] > 0
122
129
  # out_of_account
123
- answer = ask_account_question("Which account provided this income? (#{cmd_options})", row)
130
+ answer = ask_account_question(
131
+ "Which account provided this income? (#{cmd_options})", row
132
+ )
124
133
  line1 = [options[:bank_account], row[:pretty_money]]
125
134
  line2 = [answer, ""]
126
135
  else
127
136
  # into_account
128
- answer = ask_account_question("To which account did this money go? (#{cmd_options})", row)
129
- # line1 = [answer, row[:pretty_money_negated]]
137
+ answer = ask_account_question(
138
+ "To which account did this money go? (#{cmd_options})", row
139
+ )
130
140
  line1 = [answer, ""]
131
141
  line2 = [options[:bank_account], row[:pretty_money]]
132
142
  end
@@ -137,9 +147,9 @@ module Reckon
137
147
  next
138
148
  end
139
149
 
140
- ledger = ledger_format(row, line1, line2)
150
+ ledger = @parser.format_row(row, line1, line2)
141
151
  LOGGER.info "ledger line: #{ledger}"
142
- learn_from_ledger(ledger) unless options[:account_tokens_file]
152
+ learn_from_ledger(StringIO.new(ledger)) unless options[:account_tokens_file]
143
153
  output(ledger)
144
154
  end
145
155
  end
@@ -203,7 +213,7 @@ module Reckon
203
213
  return possible_answers[0] || default
204
214
  end
205
215
 
206
- answer = @@cli.ask(msg) do |q|
216
+ answer = @cli.ask(msg) do |q|
207
217
  q.completion = possible_answers
208
218
  q.readline = true
209
219
  q.default = possible_answers.first
@@ -221,7 +231,7 @@ module Reckon
221
231
  end
222
232
 
223
233
  def add_description(row)
224
- desc_answer = @@cli.ask("Enter a new description for this transaction (empty line aborts)\n") do |q|
234
+ desc_answer = @cli.ask("Enter a new description for this transaction (empty line aborts)\n") do |q|
225
235
  q.overwrite = true
226
236
  q.readline = true
227
237
  q.default = row[:description]
@@ -231,7 +241,7 @@ module Reckon
231
241
  end
232
242
 
233
243
  def add_note(row)
234
- desc_answer = @@cli.ask("Enter a new note for this transaction (empty line aborts)\n") do |q|
244
+ desc_answer = @cli.ask("Enter a new note for this transaction (empty line aborts)\n") do |q|
235
245
  q.overwrite = true
236
246
  q.readline = true
237
247
  q.default = row[:note]
@@ -246,7 +256,7 @@ module Reckon
246
256
  [account, match[0]]
247
257
  end
248
258
  }.compact
249
- matches.sort_by! { |_account, matched_text| matched_text.length }.map(&:first)
259
+ matches.sort_by { |_account, matched_text| matched_text.length }.map(&:first)
250
260
  end
251
261
 
252
262
  def suggest(row)
@@ -254,13 +264,6 @@ module Reckon
254
264
  @matcher.find_similar(row[:description]).map { |n| n[:account] }
255
265
  end
256
266
 
257
- def ledger_format(row, line1, line2)
258
- out = "#{row[:pretty_date]}\t#{row[:description]}#{row[:note] ? "\t; " + row[:note]: ""}\n"
259
- out += "\t#{line1.first}\t\t\t#{line1.last}\n"
260
- out += "\t#{line2.first}\t\t\t#{line2.last}\n\n"
261
- out
262
- end
263
-
264
267
  def output(ledger_line)
265
268
  options[:output_file].puts ledger_line
266
269
  options[:output_file].flush
@@ -0,0 +1,150 @@
1
+ require 'rubygems'
2
+ require 'date'
3
+
4
+ module Reckon
5
+ class BeancountParser
6
+
7
+ attr_accessor :entries
8
+
9
+ def initialize(options = {})
10
+ @options = options
11
+ @date_format = options[:ledger_date_format] || options[:date_format] || '%Y-%m-%d'
12
+ end
13
+
14
+ # 2015-01-01 * "Opening Balance for checking account"
15
+ # Assets:US:BofA:Checking 3490.52 USD
16
+ # Equity:Opening-Balances -3490.52 USD
17
+
18
+ # input is an object that response to #each_line,
19
+ # (i.e. a StringIO or an IO object)
20
+ def parse(input)
21
+ entries = []
22
+ comment_chars = ';#%*|'
23
+ new_entry = {}
24
+
25
+ input.each_line do |entry|
26
+
27
+ next if entry =~ /^\s*[#{comment_chars}]/
28
+
29
+ m = entry.match(%r{
30
+ ^
31
+ (\d+[\d/-]+) # date
32
+ \s+
33
+ ([*!])? # type
34
+ \s*
35
+ ("[^"]*")? # description (optional)
36
+ \s*
37
+ ("[^"]*")? # notes (optional)
38
+ # tags (not implemented)
39
+ }x)
40
+
41
+ # (date, type, code, description), type and code are optional
42
+ if (m)
43
+ add_entry(entries, new_entry)
44
+ new_entry = {
45
+ date: try_parse_date(m[1]),
46
+ type: m[2] || "",
47
+ desc: trim_quote(m[3]),
48
+ notes: trim_quote(m[4]),
49
+ accounts: []
50
+ }
51
+ elsif entry =~ /^\s*$/ && new_entry[:date]
52
+ add_entry(entries, new_entry)
53
+ new_entry = {}
54
+ elsif new_entry[:date] && entry =~ /^\s+/
55
+ LOGGER.info("Adding new account #{entry}")
56
+ new_entry[:accounts] << parse_account_line(entry)
57
+ else
58
+ LOGGER.info("Unknown entry type: #{entry}")
59
+ add_entry(entries, new_entry)
60
+ new_entry = {}
61
+ end
62
+
63
+ end
64
+ entries
65
+ end
66
+
67
+ def format_row(row, line1, line2)
68
+ out = %Q{#{row[:pretty_date]} * "#{row[:description]}" "#{row[:note]}\n}
69
+ out += "\t#{line1.first}\t\t\t#{line1.last}\n"
70
+ out += "\t#{line2.first}\t\t\t#{line2.last}\n\n"
71
+ out
72
+ end
73
+
74
+ private
75
+
76
+ # remove leading and trailing quote character (")
77
+ def trim_quote(str)
78
+ return str if !str
79
+ str.gsub(/^"([^"]*)"$/, '\1')
80
+ end
81
+
82
+ def add_entry(entries, entry)
83
+ return unless entry[:date] && entry[:accounts].length > 1
84
+
85
+ entry[:accounts] = balance(entry[:accounts])
86
+ entries << entry
87
+ end
88
+
89
+ def try_parse_date(date_str)
90
+ date = Date.parse(date_str)
91
+ return nil if date.year > 9999 || date.year < 1000
92
+
93
+ date
94
+ rescue ArgumentError
95
+ nil
96
+ end
97
+
98
+ def parse_account_line(entry)
99
+ # TODO handle buying stocks
100
+ # Assets:US:ETrade:VHT 19 VHT {132.32 USD, 2017-08-27}
101
+ (account_name, rest) = entry.strip.split(/\s{2,}|\t+/, 2)
102
+
103
+ if rest.nil? || rest.empty?
104
+ return {
105
+ name: account_name,
106
+ amount: clean_money("")
107
+ }
108
+ end
109
+
110
+ value = if rest =~ /{/
111
+ (qty, dollar_value, date) = rest.split(/[{,]/)
112
+ (qty.to_f * dollar_value.to_f).to_s
113
+ else
114
+ rest
115
+ end
116
+
117
+ return {
118
+ name: account_name,
119
+ amount: clean_money(value || "")
120
+ }
121
+ end
122
+
123
+ def balance(accounts)
124
+ return accounts unless accounts.any? { |i| i[:amount].nil? }
125
+
126
+ sum = accounts.reduce(0) { |m, n| m + (n[:amount] || 0) }
127
+ count = 0
128
+ accounts.each do |account|
129
+ next unless account[:amount].nil?
130
+
131
+ count += 1
132
+ account[:amount] = -sum
133
+ end
134
+ if count > 1
135
+ puts "Warning: unparsable entry due to more than one missing money value."
136
+ p accounts
137
+ puts
138
+ end
139
+
140
+ accounts
141
+ end
142
+
143
+ def clean_money(money)
144
+ return nil if money.nil? || money.empty?
145
+
146
+ money.gsub(/[^0-9.-]/, '').to_f
147
+ end
148
+ end
149
+ end
150
+
@@ -17,7 +17,6 @@ module Reckon
17
17
 
18
18
  def initialize(options)
19
19
  @docs = DocumentInfo.new({}, {})
20
- @options = options
21
20
  end
22
21
 
23
22
  def add_document(account, doc)
@@ -1,32 +1,28 @@
1
- #coding: utf-8
1
+ # frozen_string_literal: true
2
+
3
+ require 'stringio'
2
4
 
3
5
  module Reckon
6
+ # Parses CSV files
4
7
  class CSVParser
5
- attr_accessor :options, :csv_data, :money_column_indices, :date_column_index, :description_column_indices, :money_column, :date_column
8
+ attr_accessor :options, :csv_data, :money_column_indices, :date_column_index,
9
+ :description_column_indices, :money_column, :date_column
6
10
 
7
11
  def initialize(options = {})
8
12
  self.options = options
13
+
14
+ self.options[:csv_separator] = "\t" if options[:csv_separator] == '\t'
9
15
  self.options[:currency] ||= '$'
16
+
17
+ # we convert to a string so we can do character encoding cleanup
10
18
  @csv_data = parse(options[:string] || File.read(options[:file]), options[:file])
11
19
  filter_csv
12
20
  detect_columns
13
21
  end
14
22
 
23
+ # transpose csv_data (array of rows) to an array of columns
15
24
  def columns
16
- @columns ||=
17
- begin
18
- last_row_length = nil
19
- csv_data.inject([]) do |memo, row|
20
- unless row.all? { |i| i.nil? || i.length == 0 }
21
- row.each_with_index do |entry, index|
22
- memo[index] ||= []
23
- memo[index] << (entry || '').strip
24
- end
25
- last_row_length = row.length
26
- end
27
- memo
28
- end
29
- end
25
+ @columns ||= @csv_data[0].zip(*@csv_data[1..])
30
26
  end
31
27
 
32
28
  def date_for(index)
@@ -34,7 +30,7 @@ module Reckon
34
30
  end
35
31
 
36
32
  def pretty_date_for(index)
37
- @date_column.pretty_for( index )
33
+ @date_column.pretty_for(index)
38
34
  end
39
35
 
40
36
  def money_for(index)
@@ -42,7 +38,7 @@ module Reckon
42
38
  end
43
39
 
44
40
  def pretty_money(amount, negate = false)
45
- Money.new( amount, @options ).pretty( negate )
41
+ Money.new(amount, @options).pretty(negate)
46
42
  end
47
43
 
48
44
  def pretty_money_for(index, negate = false)
@@ -54,11 +50,11 @@ module Reckon
54
50
 
55
51
  def description_for(index)
56
52
  description_column_indices.map { |i| columns[i][index].to_s.strip }
57
- .reject(&:empty?)
58
- .join("; ")
59
- .squeeze(" ")
60
- .gsub(/(;\s+){2,}/, '')
61
- .strip
53
+ .reject(&:empty?)
54
+ .join("; ")
55
+ .squeeze(" ")
56
+ .gsub(/(;\s+){2,}/, '')
57
+ .strip
62
58
  end
63
59
 
64
60
  def row(index)
@@ -84,9 +80,10 @@ module Reckon
84
80
  money_score = date_score = possible_neg_money_count = possible_pos_money_count = 0
85
81
  last = nil
86
82
  column.reverse.each_with_index do |entry, row_from_bottom|
83
+ entry ||= "" # entries can be nil
87
84
  row = csv_data[csv_data.length - 1 - row_from_bottom]
88
85
  entry = entry.strip
89
- money_score += Money::likelihood( entry )
86
+ money_score += Money::likelihood(entry)
90
87
  possible_neg_money_count += 1 if entry =~ /^\$?[\-\(]\$?\d+/
91
88
  possible_pos_money_count += 1 if entry =~ /^\+?\$?\+?\d+/
92
89
  date_score += DateColumn.likelihood(entry)
@@ -97,8 +94,8 @@ module Reckon
97
94
  row.each do |row_entry|
98
95
  row_entry = row_entry.to_s.gsub(/[^\-\d\.]/, '').to_f
99
96
  if row_entry != 0 && last + row_entry == entry_as_num
100
- money_score -= 10
101
- break
97
+ money_score -= 10
98
+ break
102
99
  end
103
100
  end
104
101
  end
@@ -110,7 +107,8 @@ module Reckon
110
107
  found_likely_money_column = true
111
108
  end
112
109
 
113
- results << { :index => index, :money_score => money_score, :date_score => date_score }
110
+ results << { :index => index, :money_score => money_score,
111
+ :date_score => date_score }
114
112
  end
115
113
 
116
114
  results.sort_by! { |n| -n[:money_score] }
@@ -129,14 +127,15 @@ module Reckon
129
127
  # Some csv files negative/positive amounts are indicated in separate account
130
128
  def detect_sign_column
131
129
  return if columns[0].length <= 2 # This test needs requires more than two rows otherwise will lead to false positives
130
+
132
131
  signs = []
133
132
  if @money_column_indices[0] > 0
134
- column = columns[ @money_column_indices[0] - 1 ]
133
+ column = columns[@money_column_indices[0] - 1]
135
134
  signs = column.uniq
136
135
  end
137
136
  if (signs.length != 2 &&
138
137
  (@money_column_indices[0] + 1 < columns.length))
139
- column = columns[ @money_column_indices[0] + 1 ]
138
+ column = columns[@money_column_indices[0] + 1]
140
139
  signs = column.uniq
141
140
  end
142
141
  if signs.length == 2
@@ -166,15 +165,19 @@ module Reckon
166
165
  self.money_column_indices = [options[:money_column] - 1]
167
166
  elsif options[:money_columns].length == 2
168
167
  in_col, out_col = options[:money_columns]
169
- self.money_column_indices = [in_col -1, out_col -1]
168
+ self.money_column_indices = [in_col - 1, out_col - 1]
170
169
  else
171
170
  puts "Unable to determine money columns, use --money-columns to specify the 1 or 2 column(s) reckon should use."
172
171
  end
173
172
 
174
173
  # If no money_column(s) argument is supplied, try to automatically infer money_column(s)
175
174
  else
176
- self.money_column_indices = results.select { |n| n[:is_money_column] }.map { |n| n[:index] }
175
+ self.money_column_indices = results.select { |n|
176
+ n[:is_money_column]
177
+ }.map { |n| n[:index] }
177
178
  if self.money_column_indices.length == 1
179
+ # TODO: print the unfiltered column number, not the filtered
180
+ # ie if money column is 7, but we ignore columns 4 and 5, this prints "Using column 5 as the money column"
178
181
  puts "Using column #{money_column_indices.first + 1} as the money column. Use --money-colum to specify a different one."
179
182
  elsif self.money_column_indices.length == 2
180
183
  puts "Using columns #{money_column_indices[0] + 1} and #{money_column_indices[1] + 1} as money column. Use --money-columns to specify different ones."
@@ -204,20 +207,53 @@ module Reckon
204
207
  self.description_column_indices = results.map { |i| i[:index] }
205
208
  end
206
209
 
207
- def parse(data, filename=nil)
210
+ def parse(data, filename = nil)
208
211
  # Use force_encoding to convert the string to utf-8 with as few invalid characters
209
212
  # as possible.
210
213
  data.force_encoding(try_encoding(data, filename))
211
214
  data = data.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
212
215
  data.sub!("\xEF\xBB\xBF", '') # strip byte order marker, if it exists
213
216
 
214
- rows = []
215
- data.each_line.with_index do |line, i|
216
- next if i < (options[:contains_header] || 0)
217
- rows << CSV.parse_line(line, col_sep: options[:csv_separator] || ',')
217
+ separator = options[:csv_separator] || guess_column_separator(data)
218
+ header_lines_to_skip = options[:contains_header] || 0
219
+ # -1 is skip 0 footer rows
220
+ footer_lines_to_skip = (options[:contains_footer] || 0) + 1
221
+
222
+ # convert to a stringio object to handle multi-line fields
223
+ parser_opts = {
224
+ col_sep: separator,
225
+ skip_blanks: true
226
+ }
227
+ begin
228
+ rows = CSV.parse(StringIO.new(data), **parser_opts)
229
+ rows[header_lines_to_skip..-footer_lines_to_skip]
230
+ rescue CSV::MalformedCSVError
231
+ # try removing N header lines before parsing
232
+ index = 0
233
+ count = 0
234
+ while count < header_lines_to_skip
235
+ index = data.index("\n", index) + 1 # skip over newline character
236
+ count += 1
237
+ end
238
+ rows = CSV.parse(StringIO.new(data[index..-1]), **parser_opts)
239
+ rows[0..-footer_lines_to_skip]
240
+ end
241
+ end
242
+
243
+ def guess_column_separator(data)
244
+ delimiters = [',', "\t", ';', ':', '|']
245
+
246
+ counts = [0] * delimiters.length
247
+
248
+ data.each_line do |line|
249
+ delimiters.each_with_index do |delim, i|
250
+ counts[i] += line.count(delim)
251
+ end
218
252
  end
219
253
 
220
- rows
254
+ LOGGER.info("guessing #{delimiters[counts.index(counts.max)]} as csv separator")
255
+
256
+ delimiters[counts.index(counts.max)]
221
257
  end
222
258
 
223
259
  def try_encoding(data, filename = nil)