hsbc_pdf_statement_parser 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/hsbc_pdf_statement_parser/parser.rb +137 -133
- data/lib/hsbc_pdf_statement_parser/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1aeed5005a826d427a328576db61241bab262e44941ae1c0af5d34f7028f1281
|
4
|
+
data.tar.gz: 1af68498d1b4e4732e42a7422a32930170f4870f85a458b0a166b95e0ef7b338
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 64d6adeb4671eff35241d94d8f848fe54cbf6c324aa366e75d658d44b5113b468badde10b262ab092ab8b43e87402dad81cc10f9ba4b281c6fefc325c4533199
|
7
|
+
data.tar.gz: bebec0676834fd0da389f2094017a930c795f103a7477e67062449d31c3598cd9571f5f6dcbb57ab06549e4b761515915e9f211d936bf1e7660452c2497a32db
|
@@ -1,161 +1,165 @@
|
|
1
|
-
|
1
|
+
module HsbcPdfStatementParser
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
end
|
13
|
-
|
14
|
-
# Returns an array of the transactions in the document as hashes.
|
15
|
-
#
|
16
|
-
# === Hash keys
|
17
|
-
#
|
18
|
-
# [:date] the date of the transaction _(Date)_
|
19
|
-
# [:type] the type of the transaction, eg ‘VISA’, ‘DD’, ‘ATM’, etc _(String)_
|
20
|
-
# [:details] the details of the transaction. This can span multiple lines _(String)_
|
21
|
-
# [:out] the amount of the transaction, if a debit _(Float, nil)_
|
22
|
-
# [:in] the amount of the transaction, if a credit _(Float, nil)_
|
23
|
-
# [:change] the amount of the transaction: negative if a debit, positive if a credit _(Float)_
|
24
|
-
def transactions
|
25
|
-
|
26
|
-
@_transactions ||= begin
|
3
|
+
class Parser
|
4
|
+
|
5
|
+
# Creates a new parser from a PDF file.
|
6
|
+
#
|
7
|
+
# === Parameters
|
8
|
+
#
|
9
|
+
# [filename] the filename to parse
|
10
|
+
def initialize( filename )
|
27
11
|
|
28
|
-
|
29
|
-
current_date = nil
|
30
|
-
transactions = []
|
12
|
+
@reader = PDF::Reader.new( filename )
|
31
13
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
14
|
+
end
|
15
|
+
|
16
|
+
# Returns an array of the transactions in the document as hashes.
|
17
|
+
#
|
18
|
+
# === Hash keys
|
19
|
+
#
|
20
|
+
# [:date] the date of the transaction _(Date)_
|
21
|
+
# [:type] the type of the transaction, eg ‘VISA’, ‘DD’, ‘ATM’, etc _(String)_
|
22
|
+
# [:details] the details of the transaction. This can span multiple lines _(String)_
|
23
|
+
# [:out] the amount of the transaction, if a debit _(Float, nil)_
|
24
|
+
# [:in] the amount of the transaction, if a credit _(Float, nil)_
|
25
|
+
# [:change] the amount of the transaction: negative if a debit, positive if a credit _(Float)_
|
26
|
+
def transactions
|
27
|
+
|
28
|
+
@_transactions ||= begin
|
37
29
|
|
38
|
-
|
39
|
-
|
30
|
+
current_transaction = nil
|
31
|
+
current_date = nil
|
32
|
+
transactions = []
|
40
33
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
34
|
+
document_text
|
35
|
+
.scan( /BALANCE\s?BROUGHT\s?FORWARD(?:.*?)\n(.*?)BALANCE\s?CARRIED\s?FORWARD/im )
|
36
|
+
.map{ |text| parse_page( text[0] )}
|
37
|
+
.flatten
|
38
|
+
.each do |line|
|
39
|
+
|
40
|
+
# store the current date
|
41
|
+
current_date = line[:date] unless line[:date].nil?
|
42
|
+
|
43
|
+
# if we have a type, start a new transaction
|
44
|
+
unless line[:type].nil?
|
45
|
+
transactions << current_transaction unless current_transaction.nil?
|
46
|
+
current_transaction = line.merge( date: current_date )
|
47
|
+
next
|
48
|
+
end
|
49
|
+
|
50
|
+
# merge things in
|
51
|
+
current_transaction.merge!( line.select{ |k,v| v }, { details: "#{current_transaction[:details]}\n#{line[:details]}" })
|
52
|
+
|
46
53
|
end
|
47
54
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
55
|
+
# dump the final transaction + return
|
56
|
+
transactions << current_transaction unless current_transaction.nil?
|
57
|
+
transactions
|
58
|
+
end
|
52
59
|
|
53
|
-
# dump the final transaction + return
|
54
|
-
transactions << current_transaction unless current_transaction.nil?
|
55
|
-
transactions
|
56
60
|
end
|
57
61
|
|
58
|
-
|
59
|
-
|
60
|
-
# Returns the opening balance of the statement read from the table on the first page.
|
61
|
-
def opening_balance
|
62
|
-
|
63
|
-
@_opening_balance ||= scan_figure( 'Opening Balance' )
|
64
|
-
|
65
|
-
end
|
66
|
-
|
67
|
-
# Returns the closing balance of the statement read from the table on the first page.
|
68
|
-
def closing_balance
|
69
|
-
|
70
|
-
@_closing_balance ||= scan_figure( 'Closing Balance' )
|
71
|
-
|
72
|
-
end
|
73
|
-
|
74
|
-
# Returns the total value of payments in during the statement read from the table on the first page (ie: not calculated)
|
75
|
-
def payments_in
|
76
|
-
|
77
|
-
@_payments_in ||= scan_figure( 'Payments In' )
|
78
|
-
|
79
|
-
end
|
80
|
-
|
81
|
-
# Returns the total value of payments out during the statement read from the table on the first page (ie: not calculated)
|
82
|
-
def payments_out
|
83
|
-
|
84
|
-
@_payments_out ||= scan_figure( 'Payments Out' )
|
85
|
-
|
86
|
-
end
|
87
|
-
|
88
|
-
private
|
89
|
-
|
90
|
-
def document_text
|
91
|
-
|
92
|
-
@text ||= begin
|
62
|
+
# Returns the opening balance of the statement read from the table on the first page.
|
63
|
+
def opening_balance
|
93
64
|
|
94
|
-
@
|
65
|
+
@_opening_balance ||= scan_figure( 'Opening Balance' )
|
95
66
|
|
96
67
|
end
|
97
68
|
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
match = Regexp.new( "#{search_string}(?:.*?)([0-9\.\,]{4,})", Regexp::IGNORECASE ).match( @_first_page )
|
105
|
-
return nil if match.nil?
|
106
|
-
|
107
|
-
match[1].gsub( ',', '' ).to_f
|
108
|
-
|
109
|
-
end
|
110
|
-
|
111
|
-
def parse_page( page_str )
|
69
|
+
# Returns the closing balance of the statement read from the table on the first page.
|
70
|
+
def closing_balance
|
71
|
+
|
72
|
+
@_closing_balance ||= scan_figure( 'Closing Balance' )
|
73
|
+
|
74
|
+
end
|
112
75
|
|
113
|
-
#
|
114
|
-
|
115
|
-
|
76
|
+
# Returns the total value of payments in during the statement read from the table on the first page (ie: not calculated)
|
77
|
+
def payments_in
|
78
|
+
|
79
|
+
@_payments_in ||= scan_figure( 'Payments In' )
|
80
|
+
|
81
|
+
end
|
116
82
|
|
117
|
-
|
83
|
+
# Returns the total value of payments out during the statement read from the table on the first page (ie: not calculated)
|
84
|
+
def payments_out
|
118
85
|
|
119
|
-
|
120
|
-
|
121
|
-
|
86
|
+
@_payments_out ||= scan_figure( 'Payments Out' )
|
87
|
+
|
88
|
+
end
|
122
89
|
|
123
|
-
|
124
|
-
return nil if line_str.strip.empty?
|
90
|
+
private
|
125
91
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
92
|
+
def document_text
|
93
|
+
|
94
|
+
@text ||= begin
|
95
|
+
|
96
|
+
@reader.pages.map( &:text ).join
|
97
|
+
|
98
|
+
end
|
99
|
+
|
100
|
+
end
|
135
101
|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
102
|
+
def scan_figure( search_string )
|
103
|
+
|
104
|
+
@_first_page ||= @reader.pages.first.text
|
105
|
+
|
106
|
+
match = Regexp.new( "#{search_string}(?:.*?)([0-9\.\,]{4,})", Regexp::IGNORECASE ).match( @_first_page )
|
107
|
+
return nil if match.nil?
|
108
|
+
|
109
|
+
match[1].gsub( ',', '' ).to_f
|
110
|
+
|
111
|
+
end
|
141
112
|
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
113
|
+
def parse_page( page_str )
|
114
|
+
|
115
|
+
# grab lines + get the longest
|
116
|
+
lines = page_str.lines
|
117
|
+
max_length = lines.map( &:length ).max
|
118
|
+
|
119
|
+
lines.map{ |line| parse_line( line.rstrip.ljust( max_length ))}.compact
|
120
|
+
|
149
121
|
end
|
150
122
|
|
151
|
-
|
152
|
-
row
|
123
|
+
def parse_line( line_str )
|
153
124
|
|
154
|
-
|
155
|
-
|
156
|
-
|
125
|
+
# if we’re blank…
|
126
|
+
return nil if line_str.strip.empty?
|
127
|
+
|
128
|
+
# start cutting things up
|
129
|
+
row = {
|
130
|
+
date: empty_to_nil( line_str[0..12] ),
|
131
|
+
type: empty_to_nil( line_str[12..20] ),
|
132
|
+
details: empty_to_nil( line_str[20..70] ),
|
133
|
+
out: empty_to_nil( line_str[70..90] ),
|
134
|
+
in: empty_to_nil( line_str[90..110] ),
|
135
|
+
balance: empty_to_nil( line_str[110..130] )
|
136
|
+
}
|
137
|
+
|
138
|
+
# munge things further
|
139
|
+
row[:date] = Date.strptime( row[:date], '%d %b %y' ) unless row[:date].nil?
|
140
|
+
row[:out] = row[:out].gsub( ',', '' ).to_f unless row[:out].nil?
|
141
|
+
row[:in] = row[:in].gsub( ',', '' ).to_f unless row[:in].nil?
|
142
|
+
row[:balance] = row[:balance].gsub( ',', '' ).to_f unless row[:balance].nil?
|
143
|
+
|
144
|
+
# set a change amount
|
145
|
+
row[:change] = if !row[:out].nil?
|
146
|
+
0 - row[:out]
|
147
|
+
elsif !row[:in].nil?
|
148
|
+
row[:in]
|
149
|
+
else
|
150
|
+
nil
|
151
|
+
end
|
152
|
+
|
153
|
+
# return
|
154
|
+
row
|
155
|
+
|
156
|
+
end
|
157
157
|
|
158
|
-
( str
|
158
|
+
def empty_to_nil( str )
|
159
|
+
|
160
|
+
( str.strip!.empty? ) ? nil : str
|
161
|
+
|
162
|
+
end
|
159
163
|
|
160
164
|
end
|
161
165
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hsbc_pdf_statement_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jon Pearse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-11-
|
11
|
+
date: 2019-11-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: pdf-reader
|