bank_statement_parser 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8b4ac87d92f4024ad763ee53e76547c6677c01de
4
+ data.tar.gz: 0549a669283aa82da48fa7c54fc1d6e2cffcd8a0
5
+ SHA512:
6
+ metadata.gz: 60756a4ea07f81fb4bb0e8ed98335f68cd68bbde0152d95865c0457d2853d79af88d7f15359c243b68c821b891e62ec61e210d33c05f37cf0e65de784639686b
7
+ data.tar.gz: 2283b2bafe0d69d8125ced615be1833d0ac12c0ebeadfdf9884155fc3b525dfb7caa4d0f9fc12271bc17b12d90d0a4d3647d2a78606e6e08c303b0ff7213eed5
@@ -0,0 +1,56 @@
1
module BankStatementParser

  # Base class for statement parsers.
  #
  # Subclasses must implement the following instance method:
  #
  # * bool handle_line(String line) -- called once per line of the statement
  #   text; return a truthy value to continue, or a falsy value to stop.
  #
  # Subclasses may also override +reset+ (calling +super+) to clear any
  # parser state of their own; this class already provides the shared part.
  class Base

    # Not used by this class itself; kept so that code which relied on it
    # being loaded as a side effect keeps working.
    require 'fileutils.rb'

    attr_accessor :sort_code, :account_number, :statement_date, :records

    # Constructor
    def initialize
      reset
    end

    # Parse the specified text file.
    #
    # path - location of the statement text; a String or anything whose
    #        +to_s+ yields the path (e.g. a Pathname). Must end in ".txt".
    #
    # Lines are handed to +handle_line+ in order until it returns a falsy
    # value or the input is exhausted. Both LF and CRLF line endings are
    # accepted, so the handler never sees a trailing "\r".
    #
    # Returns nil.
    # Raises RuntimeError if the path does not end in ".txt", or if the sort
    # code, account number or statement date was not set during parsing.
    def parse path
      path_string = path.to_s
      raise "Expected a text file path" unless path_string =~ /\.txt\z/

      reset

      # Grab the full text file content
      full_text = File.read(path_string)

      # Process each line in turn; trailing empty lines are dropped by split
      full_text.split(/\r?\n/).each do |line|
        break unless handle_line(line)
      end

      # Sanity checking
      raise "Failed to find sort code" if @sort_code.nil?
      raise "Failed to find account number" if @account_number.nil?
      raise "Failed to find statement date" if @statement_date.nil?
    end

    protected

    # Convenience method to access the module-wide logger
    def logger
      BankStatementParser.logger
    end

    # Reset the parser: forget everything learned from a previous statement
    def reset
      @sort_code = nil
      @account_number = nil
      @statement_date = nil
      @records = []
    end

  end
end
@@ -0,0 +1,354 @@
1
module BankStatementParser

  # Parser for HSBC bank statements
  #
  # The parser is fed the statement text one line at a time (see
  # +handle_line+). It first looks for the sort code / account number and the
  # statement date; once all three are known it treats subsequent lines as
  # potential column-heading or record lines.
  class HSBC < Base

    # Date, DateTime and Date::MONTHNAMES come from the standard library
    require 'date'

    # Payment type codes that can start a record in the details column
    TYPES = ["ATM", "BP", "CHQ", "CIR", "CR", "DD", "DIV", "DR",
             "MAE", "PIM", "SO", "TFR", "VIS", ")))"].freeze

    # Full English month names, January..December
    MONTHS = Date::MONTHNAMES[1..12].freeze

    # Column headings, as regexp source fragments.
    #
    # N.B. Unicode pound symbol deleted from brackets in balance column heading
    COLUMN_HEADINGS_1ST = ["Date",
                           "Type\\s+Description",
                           "Paid out",
                           "Paid in",
                           "Balance \\(\\)"].freeze

    COLUMN_HEADINGS_2ND = ["Date",
                           "Payment type and details",
                           "Paid out",
                           "Paid in",
                           "Balance"].freeze

    # Enumerate statement formats
    module StatementFormat
      FORMAT_UNKNOWN = 0
      FORMAT_1ST = 1 # "old" style, browser-printed PDF
      FORMAT_2ND = 2 # "new", pre-formatted PDF
    end

    # Line-matching patterns, built once rather than once per input line
    MONTH_ALTERNATION = MONTHS.join('|').freeze
    SHORT_MONTH_ALTERNATION = MONTHS.map { |m| m[0, 3] }.join('|').freeze

    STOP_LINE_1ST = /\AStatements produced from \d{1,2} (?:#{MONTH_ALTERNATION}) \d{4} are available in PDF format\.\s*\z/
    STOP_LINE_2ND = /\A\s+AER\s+EAR\s*\z/

    ACCOUNT_LINE = /(?:\A[A-Z][\w\s]+|,)\s+(?<sort_code>\d{2}-\d{2}-\d{2})\s+(?<account_number>\d{8})(?:\s*|\s+\d+)\z/

    STATEMENT_DATE_1ST = /\A\s*(?<statement_date>\d{2} (?:#{SHORT_MONTH_ALTERNATION}) \d{4})\s*\z/
    STATEMENT_DATE_2ND = /\A(?<date_range_start>\d+\s+(?:#{MONTH_ALTERNATION})(?:\s+\d{4})?)\s+to\s+(?<date_range_end>\d+\s+(?:#{MONTH_ALTERNATION})\s+\d{4})\b/

    DATE_WITH_SPURIOUS_TAIL = /(?<date_proper>.+)\s+(?<spurious_tail>[A-Z]+)\z/

    RECORD_START = /\A(?<payment_type>#{TYPES.map { |t| Regexp.quote(t) }.join('|')})\s+(?<payment_details>.*)\z/

    # Handle the specified line
    #
    # Returns true if parsing should continue; false to terminate the parser
    # Raises RuntimeError if the line contains TAB characters, or if records
    # are reached before the statement format has been detected.
    def handle_line line

      # Skip blank lines
      return true if line =~ /\A\s*\z/

      # Sanity checking
      raise "line contains TAB characters" if line =~ /\t/

      # Stop line
      if line =~ STOP_LINE_2ND
        logger.debug { "Found stop line (2nd form)" }
        return false
      end
      if line =~ STOP_LINE_1ST
        logger.debug { "Found stop line (1st form)" }
        return false
      end

      # Look for sort code and account number lines, if we haven't found
      # one yet
      if @sort_code.nil? && @account_number.nil?
        if line =~ ACCOUNT_LINE
          logger.debug { "Found sort code and account number" }
          @sort_code = Regexp.last_match(:sort_code)
          @account_number = Regexp.last_match(:account_number)
        end
      end

      # Look for statement date lines, if we haven't found one yet
      if @statement_date.nil?
        if line =~ STATEMENT_DATE_1ST
          logger.debug { "Found statement date (1st form)" }
          @statement_format = StatementFormat::FORMAT_1ST

          # Parse statement date
          date_string = Regexp.last_match(:statement_date)
          @statement_date = DateTime.parse(date_string)
        elsif line =~ STATEMENT_DATE_2ND
          logger.debug { "Found statement date (2nd form)" }
          @statement_format = StatementFormat::FORMAT_2ND

          date_range_start = Regexp.last_match(:date_range_start)
          date_range_end = Regexp.last_match(:date_range_end)
          logger.debug { "Found statement date range #{date_range_start}-#{date_range_end}" }

          # Parse range end date
          @statement_date = DateTime.parse(date_range_end)
        end
      end

      if !@sort_code.nil? && !@account_number.nil? && !@statement_date.nil?

        # Look for statement records proper
        headings =
          case @statement_format
          when StatementFormat::FORMAT_UNKNOWN
            raise "Failed to detect statement format before start of records"
          when StatementFormat::FORMAT_1ST
            COLUMN_HEADINGS_1ST
          when StatementFormat::FORMAT_2ND
            COLUMN_HEADINGS_2ND
          end
        logger.debug { "Parsing potential record line (format #{@statement_format})" }
        parse_record_line_format(line, headings)

      end

      return true
    end

    private

    # Reset the parser
    def reset
      super

      @statement_format = StatementFormat::FORMAT_UNKNOWN

      # Somewhere to cache the most-recent statement record date
      @cached_statement_date = nil

      # Somewhere to cache the most-recent statement record type
      @cached_payment_type = nil

      # Somewhere to cache details for the ongoing statement record
      @cached_details = []

      # Somewhere to cache column alignments (start offset of each column)
      @cols = []

      # Flag to temporarily pause the parser
      @parser_paused = false
    end

    # Fix the year of the specified record date
    #
    # Record dates that were parsed without a year come back with the current
    # year; this moves them into the year implied by the statement date.
    #
    # Returns the record date, with the year fixed
    # Raises RuntimeError if no statement date is known, or if a January
    # statement contains a record from a month other than January/December.
    def fix_record_date_year record_date
      # Sanity checking
      if Date.today.year != record_date.year
        logger.info { "No need to fix year for statement record date" }
        return record_date
      end

      # The date we have parsed will have the year set to the current year.
      #
      # We need to figure out the correct year, from the statement date.
      raise "No statement date" unless @statement_date

      # Shift by whole years using the standard library (Date#>> moves by
      # months and keeps the time of day), rather than ActiveSupport's
      # DateTime#change, which this gem does not depend on.
      year_delta = @statement_date.year - record_date.year
      record_date = record_date >> (12 * year_delta)
      logger.debug { "record date #{record_date}" }
      if @statement_date.month != record_date.month
        logger.debug { "record month differs from statement month" }
        if 1 == @statement_date.month
          # Assume that the statement crosses a year boundary: the record
          # must be from the end of the previous year
          raise "Expected a record from December" unless
            12 == record_date.month
          record_date = record_date.prev_year
        end
      end

      record_date
    end

    # Build (and remember) the regexp matching a column-heading line for the
    # specified headings. Each heading is captured, together with its
    # trailing whitespace, in a group named col0..colN.
    def column_heading_regexp headings
      @heading_regexps ||= {}
      @heading_regexps[headings] ||= begin
        last_index = headings.length - 1
        groups = headings.each_with_index.map do |heading, index|
          # Only the first column may be preceded by whitespace; only the
          # last column may be followed by fewer than two whitespace chars
          lead = (0 == index) ? '\s*' : ''
          quantifier = (0 != index && last_index == index) ? '*' : '{2,}'
          '(?<col' + index.to_s + '>' + lead + heading + '\s' + quantifier + ')'
        end
        Regexp.new('\A' + groups.join + '\z')
      end
    end

    # If the specified line is a headings line, use it to update our column
    # alignments
    #
    # Returns true if column alignments were updated; false otherwise
    # Raises RuntimeError unless exactly five headings are supplied.
    def update_columns line, headings
      # Look for lines that allow us to match column alignments
      raise "Expected a five-column layout" unless 5 == headings.size

      return false unless line =~ column_heading_regexp(headings)

      if @cols.empty?
        logger.debug { "Setting column alignments from line #{line}" }
      else
        logger.debug { "Updating column alignments from line #{line}" }
      end

      heading_match = Regexp.last_match
      (0...headings.size).each do |i|
        @cols[i] = heading_match.offset(("col" + i.to_s).to_sym)[0]
      end

      true
    end

    # Split the specified line into an array of column fragments
    #
    # The line is cut from right to left at the cached column offsets. Each
    # fragment is stripped; an empty fragment is represented by nil.
    def get_column_fragments line
      col_fragments = []
      cols_right_to_left = @cols.reverse

      cols_right_to_left.each_with_index do |boundary, index|
        # We need to be flexible here, because the columns can (and do)
        # fail to line up with the heading alignments
        #
        # Check whether the supposed column boundary has whitespace on
        # at least one side:
        #
        # * If so, then this is a correct column boundary
        # * If not, then (somewhat arbitrarily, based on cases that have
        #   been seen) opt to move the column left until we hit whitespace
        if (boundary > 0) && (boundary < line.length)
          char_before_boundary = line[boundary - 1]
          char_after_boundary = line[boundary]
          unless char_before_boundary =~ /\A\s\z/ ||
                 char_after_boundary =~ /\A\s\z/
            logger.warn { "Column boundary failure: #{char_before_boundary}|#{char_after_boundary}" }

            # Shift down until we hit whitespace, going no further left than
            # the start of the next column to the left
            boundary_limit =
              ((index + 1) < cols_right_to_left.size) ? cols_right_to_left[index + 1] : -1
            logger.debug { "Boundary adjust limit #{boundary_limit}" }
            new_boundary = boundary
            while new_boundary > boundary_limit
              if line[new_boundary] =~ /\A\s\z/
                logger.debug { "Adjusting column boundary from #{boundary} to #{new_boundary}" }
                boundary = new_boundary
                break
              end
              new_boundary -= 1
            end

            # NOTE(review): an unsuccessful search ends at boundary_limit,
            # which is only 0 when the column to the left starts at offset
            # 0; other failures fall through silently -- confirm intent.
            raise "Failed to adjust column boundary" if 0 == new_boundary

          end
        end

        fragment = line[boundary...(line.length)]
        unless fragment.nil?
          fragment = fragment.strip
          fragment = nil if fragment.empty?
        end
        col_fragments.unshift(fragment)
        line = line[0...boundary]
      end

      return col_fragments
    end

    # Parse the specified line, looking for records
    #
    # A heading line (re)sets the column alignments. Any other line is split
    # into columns; detail lines are accumulated until a line carrying a
    # paid-out or paid-in amount completes the record, which is then appended
    # to the records list.
    #
    # Raises RuntimeError if a record date cannot be parsed.
    def parse_record_line_format line, headings

      if update_columns(line, headings)
        if @parser_paused
          logger.debug { "Resuming parser: set/updated columns" }
          @parser_paused = false
        end
        return
      end

      return if @cols.empty?

      return if @parser_paused

      col_fragments = get_column_fragments(line)

      # N.B. Detect and fix up failed column splitting
      date_string = col_fragments[0]
      unless date_string.nil?
        if date_string =~ DATE_WITH_SPURIOUS_TAIL
          date_proper = Regexp.last_match(:date_proper)
          spurious_tail = Regexp.last_match(:spurious_tail)
          logger.warn { "Must fix date string #{date_string}|#{date_proper}|#{spurious_tail}" }
          col_fragments[0] = date_proper
          # The details column may be empty (nil) on such a line
          col_fragments[1] = [spurious_tail, col_fragments[1]].compact.join(" ")
        end
      end

      date_string = col_fragments[0]
      unless date_string.nil?
        begin
          @cached_statement_date = DateTime.parse(date_string)
          @cached_statement_date =
            fix_record_date_year(@cached_statement_date)
        rescue ArgumentError => e
          raise "Failed to parse date/time #{date_string}: #{e}"
        end
      end

      payment_type_and_details = col_fragments[1]

      if payment_type_and_details =~ /\ABALANCE CARRIED FORWARD\z/i
        logger.debug { "Pausing parser" }
        @parser_paused = true
        return
      elsif payment_type_and_details =~ /\ABALANCE BROUGHT FORWARD\z/i
        if @parser_paused
          logger.debug { "Resuming parser" }
          @parser_paused = false
        else
          logger.debug { "Skipping parser resume line" }
        end
        return
      end
      if @parser_paused
        logger.debug { "Skipping line: parser paused" }
        return
      end

      payment_details = nil
      if payment_type_and_details =~ RECORD_START
        logger.debug { "Found the start of a record (group)" }
        @cached_payment_type = Regexp.last_match(:payment_type)
        payment_details = Regexp.last_match(:payment_details)
      else
        payment_details = payment_type_and_details
      end
      @cached_details << payment_details

      # Amounts use "," as a thousands separator
      paid_out = col_fragments[2]
      paid_in = col_fragments[3]
      paid_out = paid_out.delete(",") unless paid_out.nil?
      paid_in = paid_in.delete(",") unless paid_in.nil?

      if !paid_out.nil? || !paid_in.nil?
        logger.debug { "Found the end of a record (group)" }
        full_details = @cached_details.join("\n")

        record_credit = !paid_in.nil?
        record_amount = record_credit ? paid_in.to_f : paid_out.to_f

        # Create statement record
        record = StatementRecord.new(date: @cached_statement_date,
                                     type: @cached_payment_type,
                                     credit: record_credit,
                                     amount: record_amount,
                                     detail: full_details)
        logger.debug { "Created statement record: #{record}" }
        @records << record

        @cached_payment_type = nil
        @cached_details = []
      end

    end

  end
end
@@ -0,0 +1,31 @@
1
module BankStatementParser

  # A bank statement record
  #
  # Attributes:
  # * date   - date of the record
  # * type   - payment type code
  # * credit - true for money paid in, false for money paid out
  # * amount - amount of the record
  # * detail - free-text details
  class StatementRecord
    attr_accessor :date, :type, :credit, :amount, :detail

    # Constructor; every attribute is optional and defaults to nil
    def initialize date: nil, type: nil, credit: nil, amount: nil, detail: nil
      @date = date
      @type = type
      @credit = credit
      @amount = amount
      @detail = detail
    end

    # Stringify as "date:type:credit:amount:detail"
    def to_s
      "%s:%s:%s:%f:%s" % [date, type, credit.to_s, amount, detail]
    end

    # Equality test
    #
    # Two records are equal when they are the same object or when all five
    # attributes are equal. An object that does not expose all five
    # attributes is never equal to a record.
    #
    # Returns true or false.
    def ==(other)
      return true if super

      attributes = [:date, :type, :credit, :amount, :detail]
      return false unless attributes.all? { |name| other.respond_to?(name) }

      # Every attribute is compared, including detail
      attributes.all? { |name| public_send(name) == other.public_send(name) }
    end
  end

end
@@ -0,0 +1,11 @@
1
# Logger lives in the standard library but is not loaded by default
require 'logger'

module BankStatementParser

  # Module-wide logger shared by all parsers; writes to STDERR until
  # replaced via BankStatementParser.logger=
  @logger = Logger.new(STDERR)

  class << self
    # Get or replace the module-wide logger
    attr_accessor :logger
  end

end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bank_statement_parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Simon Dawson
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-03-24 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A gem for parsing bank statements
14
+ email: spdawson@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/bank_statement_parser.rb
20
+ - lib/bank_statement_parser/base.rb
21
+ - lib/bank_statement_parser/hsbc.rb
22
+ - lib/bank_statement_parser/statement_record.rb
23
+ homepage: http://rubygems.org/gems/bank_statement_parser
24
+ licenses:
25
+ - GPLv3
26
+ metadata: {}
27
+ post_install_message:
28
+ rdoc_options: []
29
+ require_paths:
30
+ - lib
31
+ required_ruby_version: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ required_rubygems_version: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ requirements: []
42
+ rubyforge_project:
43
+ rubygems_version: 2.2.2
44
+ signing_key:
45
+ specification_version: 4
46
+ summary: Bank statement parser
47
+ test_files: []