hsbc_pdf_statement_parser 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/hsbc_pdf_statement_parser/parser.rb +137 -133
- data/lib/hsbc_pdf_statement_parser/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1aeed5005a826d427a328576db61241bab262e44941ae1c0af5d34f7028f1281
|
4
|
+
data.tar.gz: 1af68498d1b4e4732e42a7422a32930170f4870f85a458b0a166b95e0ef7b338
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 64d6adeb4671eff35241d94d8f848fe54cbf6c324aa366e75d658d44b5113b468badde10b262ab092ab8b43e87402dad81cc10f9ba4b281c6fefc325c4533199
|
7
|
+
data.tar.gz: bebec0676834fd0da389f2094017a930c795f103a7477e67062449d31c3598cd9571f5f6dcbb57ab06549e4b761515915e9f211d936bf1e7660452c2497a32db
|
@@ -1,161 +1,165 @@
|
|
1
|
-
|
1
|
+
module HsbcPdfStatementParser
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
end
|
13
|
-
|
14
|
-
# Returns an array of the transactions in the document as hashes.
|
15
|
-
#
|
16
|
-
# === Hash keys
|
17
|
-
#
|
18
|
-
# [:date] the date of the transaction _(Date)_
|
19
|
-
# [:type] the type of the transaction, eg ‘VISA’, ‘DD’, ‘ATM’, etc _(String)_
|
20
|
-
# [:details] the details of the transaction. This can span multiple lines _(String)_
|
21
|
-
# [:out] the amount of the transaction, if a debit _(Float, nil)_
|
22
|
-
# [:in] the amount of the transaction, if a credit _(Float, nil)_
|
23
|
-
# [:change] the amount of the transaction: negative if a debit, positive if a credit _(Float)_
|
24
|
-
def transactions
|
25
|
-
|
26
|
-
@_transactions ||= begin
|
3
|
+
class Parser
|
4
|
+
|
5
|
+
# Creates a new parser from a PDF file.
|
6
|
+
#
|
7
|
+
# === Parameters
|
8
|
+
#
|
9
|
+
# [filename] the filename to parse
|
10
|
+
def initialize( filename )
|
27
11
|
|
28
|
-
|
29
|
-
current_date = nil
|
30
|
-
transactions = []
|
12
|
+
@reader = PDF::Reader.new( filename )
|
31
13
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
14
|
+
end
|
15
|
+
|
16
|
+
# Returns an array of the transactions in the document as hashes.
|
17
|
+
#
|
18
|
+
# === Hash keys
|
19
|
+
#
|
20
|
+
# [:date] the date of the transaction _(Date)_
|
21
|
+
# [:type] the type of the transaction, eg ‘VISA’, ‘DD’, ‘ATM’, etc _(String)_
|
22
|
+
# [:details] the details of the transaction. This can span multiple lines _(String)_
|
23
|
+
# [:out] the amount of the transaction, if a debit _(Float, nil)_
|
24
|
+
# [:in] the amount of the transaction, if a credit _(Float, nil)_
|
25
|
+
# [:change] the amount of the transaction: negative if a debit, positive if a credit _(Float)_
|
26
|
+
def transactions
|
27
|
+
|
28
|
+
@_transactions ||= begin
|
37
29
|
|
38
|
-
|
39
|
-
|
30
|
+
current_transaction = nil
|
31
|
+
current_date = nil
|
32
|
+
transactions = []
|
40
33
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
34
|
+
document_text
|
35
|
+
.scan( /BALANCE\s?BROUGHT\s?FORWARD(?:.*?)\n(.*?)BALANCE\s?CARRIED\s?FORWARD/im )
|
36
|
+
.map{ |text| parse_page( text[0] )}
|
37
|
+
.flatten
|
38
|
+
.each do |line|
|
39
|
+
|
40
|
+
# store the current date
|
41
|
+
current_date = line[:date] unless line[:date].nil?
|
42
|
+
|
43
|
+
# if we have a type, start a new transaction
|
44
|
+
unless line[:type].nil?
|
45
|
+
transactions << current_transaction unless current_transaction.nil?
|
46
|
+
current_transaction = line.merge( date: current_date )
|
47
|
+
next
|
48
|
+
end
|
49
|
+
|
50
|
+
# merge things in
|
51
|
+
current_transaction.merge!( line.select{ |k,v| v }, { details: "#{current_transaction[:details]}\n#{line[:details]}" })
|
52
|
+
|
46
53
|
end
|
47
54
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
55
|
+
# dump the final transaction + return
|
56
|
+
transactions << current_transaction unless current_transaction.nil?
|
57
|
+
transactions
|
58
|
+
end
|
52
59
|
|
53
|
-
# dump the final transaction + return
|
54
|
-
transactions << current_transaction unless current_transaction.nil?
|
55
|
-
transactions
|
56
60
|
end
|
57
61
|
|
58
|
-
|
59
|
-
|
60
|
-
# Returns the opening balance of the statement read from the table on the first page.
|
61
|
-
def opening_balance
|
62
|
-
|
63
|
-
@_opening_balance ||= scan_figure( 'Opening Balance' )
|
64
|
-
|
65
|
-
end
|
66
|
-
|
67
|
-
# Returns the closing balance of the statement read from the table on the first page.
|
68
|
-
def closing_balance
|
69
|
-
|
70
|
-
@_closing_balance ||= scan_figure( 'Closing Balance' )
|
71
|
-
|
72
|
-
end
|
73
|
-
|
74
|
-
# Returns the total value of payments in during the statement read from the table on the first page (ie: not calculated)
|
75
|
-
def payments_in
|
76
|
-
|
77
|
-
@_payments_in ||= scan_figure( 'Payments In' )
|
78
|
-
|
79
|
-
end
|
80
|
-
|
81
|
-
# Returns the total value of payments out during the statement read from the table on the first page (ie: not calculated)
|
82
|
-
def payments_out
|
83
|
-
|
84
|
-
@_payments_out ||= scan_figure( 'Payments Out' )
|
85
|
-
|
86
|
-
end
|
87
|
-
|
88
|
-
private
|
89
|
-
|
90
|
-
def document_text
|
91
|
-
|
92
|
-
@text ||= begin
|
62
|
+
# Returns the opening balance of the statement read from the table on the first page.
|
63
|
+
def opening_balance
|
93
64
|
|
94
|
-
@
|
65
|
+
@_opening_balance ||= scan_figure( 'Opening Balance' )
|
95
66
|
|
96
67
|
end
|
97
68
|
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
match = Regexp.new( "#{search_string}(?:.*?)([0-9\.\,]{4,})", Regexp::IGNORECASE ).match( @_first_page )
|
105
|
-
return nil if match.nil?
|
106
|
-
|
107
|
-
match[1].gsub( ',', '' ).to_f
|
108
|
-
|
109
|
-
end
|
110
|
-
|
111
|
-
def parse_page( page_str )
|
69
|
+
# Returns the closing balance of the statement read from the table on the first page.
|
70
|
+
def closing_balance
|
71
|
+
|
72
|
+
@_closing_balance ||= scan_figure( 'Closing Balance' )
|
73
|
+
|
74
|
+
end
|
112
75
|
|
113
|
-
#
|
114
|
-
|
115
|
-
|
76
|
+
# Returns the total value of payments in during the statement read from the table on the first page (ie: not calculated)
|
77
|
+
def payments_in
|
78
|
+
|
79
|
+
@_payments_in ||= scan_figure( 'Payments In' )
|
80
|
+
|
81
|
+
end
|
116
82
|
|
117
|
-
|
83
|
+
# Returns the total value of payments out during the statement read from the table on the first page (ie: not calculated)
|
84
|
+
def payments_out
|
118
85
|
|
119
|
-
|
120
|
-
|
121
|
-
|
86
|
+
@_payments_out ||= scan_figure( 'Payments Out' )
|
87
|
+
|
88
|
+
end
|
122
89
|
|
123
|
-
|
124
|
-
return nil if line_str.strip.empty?
|
90
|
+
private
|
125
91
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
92
|
+
def document_text
|
93
|
+
|
94
|
+
@text ||= begin
|
95
|
+
|
96
|
+
@reader.pages.map( &:text ).join
|
97
|
+
|
98
|
+
end
|
99
|
+
|
100
|
+
end
|
135
101
|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
102
|
+
def scan_figure( search_string )
|
103
|
+
|
104
|
+
@_first_page ||= @reader.pages.first.text
|
105
|
+
|
106
|
+
match = Regexp.new( "#{search_string}(?:.*?)([0-9\.\,]{4,})", Regexp::IGNORECASE ).match( @_first_page )
|
107
|
+
return nil if match.nil?
|
108
|
+
|
109
|
+
match[1].gsub( ',', '' ).to_f
|
110
|
+
|
111
|
+
end
|
141
112
|
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
113
|
+
def parse_page( page_str )
|
114
|
+
|
115
|
+
# grab lines + get the longest
|
116
|
+
lines = page_str.lines
|
117
|
+
max_length = lines.map( &:length ).max
|
118
|
+
|
119
|
+
lines.map{ |line| parse_line( line.rstrip.ljust( max_length ))}.compact
|
120
|
+
|
149
121
|
end
|
150
122
|
|
151
|
-
|
152
|
-
row
|
123
|
+
def parse_line( line_str )
|
153
124
|
|
154
|
-
|
155
|
-
|
156
|
-
|
125
|
+
# if we’re blank…
|
126
|
+
return nil if line_str.strip.empty?
|
127
|
+
|
128
|
+
# start cutting things up
|
129
|
+
row = {
|
130
|
+
date: empty_to_nil( line_str[0..12] ),
|
131
|
+
type: empty_to_nil( line_str[12..20] ),
|
132
|
+
details: empty_to_nil( line_str[20..70] ),
|
133
|
+
out: empty_to_nil( line_str[70..90] ),
|
134
|
+
in: empty_to_nil( line_str[90..110] ),
|
135
|
+
balance: empty_to_nil( line_str[110..130] )
|
136
|
+
}
|
137
|
+
|
138
|
+
# munge things further
|
139
|
+
row[:date] = Date.strptime( row[:date], '%d %b %y' ) unless row[:date].nil?
|
140
|
+
row[:out] = row[:out].gsub( ',', '' ).to_f unless row[:out].nil?
|
141
|
+
row[:in] = row[:in].gsub( ',', '' ).to_f unless row[:in].nil?
|
142
|
+
row[:balance] = row[:balance].gsub( ',', '' ).to_f unless row[:balance].nil?
|
143
|
+
|
144
|
+
# set a change amount
|
145
|
+
row[:change] = if !row[:out].nil?
|
146
|
+
0 - row[:out]
|
147
|
+
elsif !row[:in].nil?
|
148
|
+
row[:in]
|
149
|
+
else
|
150
|
+
nil
|
151
|
+
end
|
152
|
+
|
153
|
+
# return
|
154
|
+
row
|
155
|
+
|
156
|
+
end
|
157
157
|
|
158
|
-
( str
|
158
|
+
def empty_to_nil( str )
|
159
|
+
|
160
|
+
( str.strip!.empty? ) ? nil : str
|
161
|
+
|
162
|
+
end
|
159
163
|
|
160
164
|
end
|
161
165
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hsbc_pdf_statement_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jon Pearse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-11-
|
11
|
+
date: 2019-11-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: pdf-reader
|