bank_statement_parser 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/bank_statement_parser/base.rb +56 -0
- data/lib/bank_statement_parser/hsbc.rb +354 -0
- data/lib/bank_statement_parser/statement_record.rb +31 -0
- data/lib/bank_statement_parser.rb +11 -0
- metadata +47 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8b4ac87d92f4024ad763ee53e76547c6677c01de
|
4
|
+
data.tar.gz: 0549a669283aa82da48fa7c54fc1d6e2cffcd8a0
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 60756a4ea07f81fb4bb0e8ed98335f68cd68bbde0152d95865c0457d2853d79af88d7f15359c243b68c821b891e62ec61e210d33c05f37cf0e65de784639686b
|
7
|
+
data.tar.gz: 2283b2bafe0d69d8125ced615be1833d0ac12c0ebeadfdf9884155fc3b525dfb7caa4d0f9fc12271bc17b12d90d0a4d3647d2a78606e6e08c303b0ff7213eed5
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module BankStatementParser
|
2
|
+
|
3
|
+
# Base class for statement parsers
#
# Subclasses must implement the following instance method
#
# * bool handle_line(String line)
#
# handle_line is called once per line of the statement text; it returns
# true to keep parsing, or false to stop early. It is expected to populate
# the sort code, account number, statement date and records.
#
# Subclasses may also override the protected reset method (calling super)
# to clear any additional parser state of their own.
class Base

  require 'fileutils.rb'

  attr_accessor :sort_code, :account_number, :statement_date, :records

  # Constructor
  def initialize
    reset
  end

  # Parse the specified text file
  #
  # path may be a String or any object whose to_s yields the file path
  # (e.g. a Pathname); it must end in ".txt".
  #
  # Raises a RuntimeError if the path does not name a text file, or if
  # the sort code, account number or statement date has not been found
  # once all lines have been handled.
  def parse(path)
    # Go through to_s so that Pathname (and similar) arguments are
    # accepted, rather than only String
    raise "Expected a text file path" unless path.to_s.end_with?('.txt')

    # Discard anything left over from a previous parse
    reset

    # Grab the full text file content
    full_text = File.read(path)

    # Process each line in turn, until the subclass asks us to stop
    full_text.split("\n").each do |line|
      break unless handle_line(line)
    end

    # Sanity checking
    raise "Failed to find sort code" if @sort_code.nil?
    raise "Failed to find account number" if @account_number.nil?
    raise "Failed to find statement date" if @statement_date.nil?
  end

  protected

  # Convenience method to access the logger
  def logger
    BankStatementParser.logger
  end

  # Reset the parser
  def reset
    @sort_code = nil
    @account_number = nil
    @statement_date = nil
    @records = []
  end

end
|
56
|
+
end
|
@@ -0,0 +1,354 @@
|
|
1
|
+
module BankStatementParser
|
2
|
+
|
3
|
+
# Parser for HSBC bank statements
|
4
|
+
class HSBC < Base
|
5
|
+
|
6
|
+
# Handle the specified line
#
# The line is inspected, in order, for: a stop marker, the sort code and
# account number, the statement date, and finally (once all of those are
# known) statement records.
#
# Returns true if parsing should continue; false to terminate the parser
def handle_line(line)
  # Blank lines carry nothing of interest
  return true if line =~ /\A\s*\z/

  # Sanity checking: TAB characters are rejected outright
  raise "line contains TAB characters" if line =~ /\t/

  # Either form of stop line terminates the parser
  if line =~ /\A\s+AER\s+EAR\s*\z/
    logger.debug { "Found stop line (2nd form)" }
    return false
  elsif line =~ /\AStatements produced from \d{1,2} (?:#{MONTHS.join('|')}) \d{4} are available in PDF format\.\s*\z/
    logger.debug { "Found stop line (1st form)" }
    return false
  end

  # Sort code and account number are only searched for while both are
  # still unknown
  if @sort_code.nil? && @account_number.nil?
    account = /(?:\A[A-Z][\w\s]+|,)\s+(?<sort_code>\d{2}-\d{2}-\d{2})\s+(?<account_number>\d{8})(?:\s*|\s+\d+)\z/.match(line)
    unless account.nil?
      logger.debug { "Found sort code and account number" }
      @sort_code = account[:sort_code]
      @account_number = account[:account_number]
    end
  end

  # The statement date is only searched for while it is still unknown;
  # the shape of the date line also tells us the statement format
  if @statement_date.nil?
    if (found = /\A\s*(?<statement_date>\d{2} (?:#{MONTHS.map{|m| m[0,3]}.join('|')}) \d{4})\s*\z/.match(line))
      logger.debug { "Found statement date (1st form)" }
      @statement_format = StatementFormat::FORMAT_1ST

      # A single date on a line of its own is the statement date
      @statement_date = DateTime.parse(found[:statement_date])
    elsif (found = /\A(?<date_range_start>\d+\s+(?:#{MONTHS.join('|')})(?:\s+\d{4})?)\s+to\s+(?<date_range_end>\d+\s+(?:#{MONTHS.join('|')})\s+\d{4})\b/.match(line))
      logger.debug { "Found statement date (2nd form)" }
      @statement_format = StatementFormat::FORMAT_2ND

      range_start = found[:date_range_start]
      range_end = found[:date_range_end]
      logger.debug { "Found statement date range #{range_start}-#{range_end}" }

      # The end of the date range is taken as the statement date
      @statement_date = DateTime.parse(range_end)
    end
  end

  # Records are only looked for once the account and date are all known
  header_complete =
    [@sort_code, @account_number, @statement_date].none?(&:nil?)
  if header_complete
    # Pick the column headings that go with the detected format
    headings =
      case @statement_format
      when StatementFormat::FORMAT_UNKNOWN
        raise "Failed to detect statement format before start of records"
      when StatementFormat::FORMAT_1ST
        COLUMN_HEADINGS_1ST
      when StatementFormat::FORMAT_2ND
        COLUMN_HEADINGS_2ND
      end

    logger.debug { "Parsing potential record line (format #{@statement_format})" }
    parse_record_line_format(line, headings)
  end

  true
end
|
78
|
+
|
79
|
+
private
|
80
|
+
|
81
|
+
# Payment type codes that can open a statement record; matched literally
# (each entry is regexp-quoted by the record parser)
TYPES = ["ATM", "BP", "CHQ", "CIR", "CR", "DD", "DIV", "DR",
         "MAE", "PIM", "SO", "TFR", "VIS", ")))"].freeze

# Full English month names, January to December
MONTHS = Date::MONTHNAMES[1..12].freeze

# Column headings for each statement format. Each entry is a regexp
# fragment (not a literal string): the headings are spliced into the
# regexp used to recognise the column heading line.
#
# N.B. Unicode pound symbol deleted from brackets in balance column heading
COLUMN_HEADINGS_1ST = ["Date",
                       "Type\\s+Description",
                       "Paid out",
                       "Paid in",
                       "Balance \\(\\)"].freeze

COLUMN_HEADINGS_2ND = ["Date",
                       "Payment type and details",
                       "Paid out",
                       "Paid in",
                       "Balance"].freeze

# Enumerate statement formats
module StatementFormat
  FORMAT_UNKNOWN = 0
  FORMAT_1ST = 1 # "old" style, browser-printed PDF
  FORMAT_2ND = 2 # "new", pre-formatted PDF
end
|
105
|
+
|
106
|
+
# Reset the parser
#
# Clears the state held by the base class, then every piece of
# HSBC-specific parsing state.
def reset
  super

  # No statement format has been detected yet
  @statement_format = StatementFormat::FORMAT_UNKNOWN

  # No column alignments are known, and the parser is not paused
  @cols = []
  @parser_paused = false

  # Most-recent statement record date and payment type
  @cached_statement_date = nil
  @cached_payment_type = nil

  # Detail lines accumulated for the ongoing statement record
  @cached_details = []
end
|
127
|
+
|
128
|
+
# Fix the year of the specified record date
#
# Record dates are parsed from text that may carry no year, in which case
# the parsed date has the year set to the current year. Such dates are
# moved into the statement year; a December record on a January statement
# is moved into the year before the statement year.
#
# Returns the record date, with the year fixed
#
# Raises a RuntimeError if no statement date is known, or if a January
# statement contains a record from a month other than January or December.
def fix_record_date_year(record_date)
  # Sanity checking: a year other than the current one must have been
  # given explicitly, so leave the date alone
  if Date.today.year != record_date.year
    logger.info { "No need to fix year for statement record date" }
    return record_date
  end

  # The date we have parsed will have the year set to the current year.
  #
  # We need to figure out the correct year, from the statement date.
  raise "No statement date" unless @statement_date
  statement_year = @statement_date.year
  record_date =
    if record_date.respond_to?(:change)
      # ActiveSupport is loaded: keep using its DateTime#change
      record_date.change(year: statement_year)
    else
      # Plain Ruby: rebuild the date with only the year replaced. Like
      # DateTime#change, this raises ArgumentError for an impossible date
      # (29 February moved into a non-leap year).
      DateTime.civil(statement_year,
                     record_date.month,
                     record_date.day,
                     record_date.hour,
                     record_date.minute,
                     record_date.second + record_date.sec_fraction,
                     record_date.offset,
                     record_date.start)
    end
  logger.debug { "record date #{record_date}" }

  if @statement_date.month != record_date.month
    logger.debug { "record month differs from statement month" }
    if 1 == @statement_date.month
      # Assume that the statement crosses a year boundary: the record
      # must be from the end of the previous year
      raise "Expected a record from December" unless
        12 == record_date.month
      record_date = record_date.prev_year
    end
  end

  record_date
end
|
157
|
+
|
158
|
+
# If the specified line is a headings line, use it to update our column
# alignments
#
# headings is the list of five column headings for the current statement
# format; each heading is spliced into a regexp as-is, so it may itself
# contain regexp syntax.
#
# Returns true if column alignments were updated; false otherwise
def update_columns(line, headings)
  raise "Expected a five-column layout" unless 5 == headings.size

  # One named group (col0..col4) per heading. The first heading may be
  # preceded by whitespace; every heading but the last must be followed
  # by at least two whitespace characters.
  final_index = headings.length - 1
  groups = headings.each_with_index.map do |heading, idx|
    if idx.zero?
      lead_in = '\s*'
      gap = '{2,}'
    elsif idx == final_index
      lead_in = ''
      gap = '*'
    else
      lead_in = ''
      gap = '{2,}'
    end
    '(?<col' + idx.to_s + '>' + lead_in + heading + '\s' + gap + ')'
  end
  heading_line = Regexp.new('\A' + groups.join + '\z')

  found = heading_line.match(line)
  return false unless found

  first_alignment = @cols.empty?
  logger.debug do
    if first_alignment
      "Setting column alignments from line #{line}"
    else
      "Updating column alignments from line #{line}"
    end
  end

  # Each column starts where its named group starts in the heading line
  headings.each_index do |idx|
    @cols[idx] = found.begin(:"col#{idx}")
  end

  true
end
|
199
|
+
|
200
|
+
# Split the specified line into an array of column fragments
#
# The line is cut at the cached column alignments, working from the
# right-most column to the left-most. Each fragment is stripped of
# surrounding whitespace; an empty (or missing) fragment is returned as
# nil. The result has one entry per column, in left-to-right order.
def get_column_fragments(line)
  fragments = []
  boundaries = @cols.reverse
  remaining = line

  boundaries.each_with_index do |boundary, position|
    # We need to be flexible here, because the columns can (and do)
    # fail to line up with the heading alignments
    #
    # A boundary is accepted as-is when there is whitespace on at least
    # one side of it. Otherwise (somewhat arbitrarily, based on cases
    # that have been seen) the boundary is moved left until it sits on
    # whitespace.
    if boundary > 0 && boundary < remaining.length
      before = remaining[boundary - 1]
      after = remaining[boundary]
      if before !~ /\A\s\z/ && after !~ /\A\s\z/
        logger.warn { "Column boundary failure: #{before}|#{after}" }

        # Never move further left than the start of the next column to
        # the left (or -1 when there is no such column)
        limit = boundaries.fetch(position + 1, -1)
        logger.debug { "Boundary adjust limit #{limit}" }

        candidate = boundary
        while candidate > limit
          if remaining[candidate] =~ /\A\s\z/
            logger.debug { "Adjusting column boundary from #{boundary} to #{candidate}" }
            boundary = candidate
            break
          end
          candidate -= 1
        end

        # NOTE(review): this only fires when the search ran all the way
        # down to offset 0; a search that stops at a non-zero limit
        # without finding whitespace keeps the unadjusted boundary.
        # Confirm whether that is intended.
        raise "Failed to adjust column boundary" if candidate.zero?
      end
    end

    # Everything from the boundary to the end of what is left belongs to
    # this column (nil when the line is too short to reach the column)
    piece = remaining[boundary..-1]
    piece = piece.strip unless piece.nil?
    piece = nil if !piece.nil? && piece.empty?
    fragments.unshift(piece)

    # Carry on with the part of the line to the left of this column
    remaining = remaining[0...boundary]
  end

  fragments
end
|
254
|
+
|
255
|
+
# Parse the specified line, looking for records
#
# headings is the list of column headings for the current statement
# format; it is only used to recognise column heading lines.
#
# A statement record may span several lines: details are accumulated
# line by line, and the record is completed (and appended to the records)
# by the first line that carries a paid-out or paid-in amount.
#
# Raises a RuntimeError if the date column cannot be parsed as a date.
def parse_record_line_format(line, headings)

  # A headings line (re)sets the column alignments, and wakes the parser
  # up if it was paused; it never contains a record itself
  if update_columns(line, headings)
    if @parser_paused
      logger.debug { "Resuming parser: set/updated columns" }
      @parser_paused = false
    end
    return
  end

  # Nothing can be split into columns until a headings line has been seen
  return if @cols.empty?

  # While paused, every line up to the next headings line is ignored
  return if @parser_paused

  col_fragments = get_column_fragments(line)

  # N.B. Detect and fix up failed column splitting: an upper-case word
  # trailing the date really belongs at the start of the details column
  date_string = col_fragments[0]
  if !date_string.nil? &&
     date_string =~ /(?<date_proper>.+)\s+(?<spurious_tail>[A-Z]+)\z/
    date_proper = Regexp.last_match(:date_proper)
    spurious_tail = Regexp.last_match(:spurious_tail)
    logger.warn { "Must fix date string #{date_string}|#{date_proper}|#{spurious_tail}" }
    col_fragments[0] = date_proper
    # The details column may be empty (nil), so do not concatenate blindly
    col_fragments[1] = [spurious_tail, col_fragments[1]].compact.join(" ")
  end

  # A line without a date belongs to the most-recently seen date
  date_string = col_fragments[0]
  unless date_string.nil?
    begin
      @cached_statement_date = DateTime.parse(date_string)
      @cached_statement_date =
        fix_record_date_year(@cached_statement_date)
    rescue ArgumentError => e
      raise "Failed to parse date/time #{date_string}: #{e}"
    end
  end

  payment_type_and_details = col_fragments[1]

  if payment_type_and_details =~ /\ABALANCE CARRIED FORWARD\z/i
    # End of the records on this page: ignore everything until the next
    # headings line
    logger.debug { "Pausing parser" }
    @parser_paused = true
    return
  elsif payment_type_and_details =~ /\ABALANCE BROUGHT FORWARD\z/i
    # The parser cannot be paused here (paused lines return early above),
    # so this line is simply skipped
    logger.debug { "Skipping parser resume line" }
    return
  end

  # A recognised payment type marks the start of a record; any other line
  # just contributes more details to the ongoing record
  payment_details = nil
  if payment_type_and_details =~ /\A(?<payment_type>#{TYPES.map{ |t| Regexp.quote(t) }.join('|')})\s+(?<payment_details>.*)\z/
    logger.debug { "Found the start of a record (group)" }
    @cached_payment_type = Regexp.last_match(:payment_type)
    payment_details = Regexp.last_match(:payment_details)
  else
    payment_details = payment_type_and_details
  end
  @cached_details << payment_details

  # Amounts may contain thousands separators
  paid_out = col_fragments[2]
  paid_in = col_fragments[3]
  paid_out.delete!(",") unless paid_out.nil?
  paid_in.delete!(",") unless paid_in.nil?

  # An amount in either column completes the record
  if !paid_out.nil? || !paid_in.nil?
    logger.debug { "Found the end of a record (group)" }
    full_details = @cached_details.join("\n")

    # A paid-in amount takes precedence when both columns are filled
    record_credit = !paid_in.nil?
    record_amount = record_credit ? paid_in.to_f : paid_out.to_f

    # Create statement record
    record = StatementRecord.new(date: @cached_statement_date,
                                 type: @cached_payment_type,
                                 credit: record_credit,
                                 amount: record_amount,
                                 detail: full_details)
    logger.debug { "Created statement record: #{record}" }
    @records << record

    # Start afresh for the next record
    @cached_payment_type = nil
    @cached_details = []
  end

end
|
352
|
+
|
353
|
+
end
|
354
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module BankStatementParser
|
2
|
+
|
3
|
+
# A bank statement record
#
# A plain value holder with the following attributes:
#
# * date   - the date of the record
# * type   - the payment type code, if any
# * credit - true for money paid in; false for money paid out
# * amount - the amount of the record
# * detail - the details text of the record
class StatementRecord
  attr_accessor :date, :type, :credit, :amount, :detail

  # Constructor
  #
  # Every attribute is optional, and defaults to nil
  def initialize(date: nil, type: nil, credit: nil, amount: nil, detail: nil)
    @date = date
    @type = type
    @credit = credit
    @amount = amount
    @detail = detail
  end

  # Stringify
  #
  # Returns the attributes joined with colons; the amount is formatted as
  # a float, so it must be numeric
  def to_s
    "%s:%s:%s:%f:%s" % [date, type, credit.to_s, amount, detail]
  end

  # Equality test
  #
  # Two records are equal when they are the same object, or when all of
  # their attributes are equal. Anything that is not a statement record
  # is never equal to one.
  def ==(other)
    return true if super
    return false unless other.is_a?(StatementRecord)

    date == other.date &&
      type == other.type &&
      credit == other.credit &&
      amount == other.amount &&
      detail == other.detail
  end
end
|
30
|
+
|
31
|
+
end
|
metadata
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bank_statement_parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Simon Dawson
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-03-24 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A gem for parsing bank statements
|
14
|
+
email: spdawson@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/bank_statement_parser.rb
|
20
|
+
- lib/bank_statement_parser/base.rb
|
21
|
+
- lib/bank_statement_parser/hsbc.rb
|
22
|
+
- lib/bank_statement_parser/statement_record.rb
|
23
|
+
homepage: http://rubygems.org/gems/bank_statement_parser
|
24
|
+
licenses:
|
25
|
+
- GPLv3
|
26
|
+
metadata: {}
|
27
|
+
post_install_message:
|
28
|
+
rdoc_options: []
|
29
|
+
require_paths:
|
30
|
+
- lib
|
31
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
32
|
+
requirements:
|
33
|
+
- - ">="
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: '0'
|
36
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
requirements: []
|
42
|
+
rubyforge_project:
|
43
|
+
rubygems_version: 2.2.2
|
44
|
+
signing_key:
|
45
|
+
specification_version: 4
|
46
|
+
summary: Bank statement parser
|
47
|
+
test_files: []
|