hsbc_pdf_statement_parser 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: d9737c59472032f49b8bc7539d08856feb6e5703a5a43966b8a80340566e8de6
4
+ data.tar.gz: 989e26dddd81066f6b2d931067acc47e6b23d60adf35daef7a30bdf27830317e
5
+ SHA512:
6
+ metadata.gz: ec9375eeee5b241bfc50fa13a1f39b99587e25fde0c5bde829bcd97d166124727837f788686218b5e46ad3aaaa54dc1707733f6902bd9e400447f93a35eaadf6
7
+ data.tar.gz: 3db670821e2d4b6868ace32aefe4688645e4766d4e68a28c3c5d64e320fa7d68764799101c050ab1b9b1c2ca2bab907db6c9e6cc950ae3789edc09fbfb0cff65
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in hsbc_pdf_statement_parser.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2019 Jon Pearse
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.textile ADDED
@@ -0,0 +1,62 @@
1
+ h1. HSBC PDF Statement Parser
2
+
3
+ This is a _very_ quick and dirty gem that swallows downloaded PDF files from HSBC (UK) and parses them into an array of hashes containing each transaction.
4
+
5
+ It exists solely because HSBC doesn’t seem to offer any way of exporting old statements as anything other than PDFs, which makes it a pain in the backside to import anything into any kind of finance packages.
6
+ You probably shouldn’t use it (see warnings below)
7
+
8
+ h2. Installation
9
+
10
+ Using bundler on the command line:
11
+
12
+ <pre>
13
+ $ bundle add hsbc_pdf_statement_parser
14
+ $ bundle
15
+ </pre>
16
+
17
+ h2. Usage
18
+
19
+ <pre>
20
+ require 'hsbc_pdf_statement_parser'
21
+
22
+ parser = HsbcPdfStatementParser::Parser.new( 'path/to/statement.pdf' )
23
+ parser.transactions.each do |tx|
24
+
25
+ printf( "%s: %-40s %7.02f\n", tx[:date], tx[:details].lines.first.strip, tx[:change] )
26
+
27
+ end
28
+ </pre>
29
+
30
+ h3. Methods
31
+
32
+ - @transactions@ := returns transactions as an array of hashes (see hash keys section, below) _(Array of Hashes)_
33
+ - @opening_balance@ := returns the opening balance of the statement _(Float)_
34
+ - @closing_balance@ := returns the closing balance of the statement _(Float)_
35
+ - @payments_in@ := returns the total of all payments in _(Float)_
36
+ - @payments_out@ := returns the total of all payments out _(Float)_
37
+
38
+ Note that @payments_in@ and @payments_out@ are read from the header section at the top of the statement, and are not calculated from the transactions in the statement.
39
+
40
+ h3. Transaction hash keys
41
+
42
+ - @:date@ := a @Date@ object describing the date of the transaction _(Date)_
43
+ - @:type@ := the type of the transaction, eg @DD@ for a direct debit, @VIS@ for visa, @(((@ for contactless _(String)_
44
+ - @:details@ := the text description of the transaction. This may span multiple lines _(String)_
45
+ - @:in@ := the amount entering your account, or nil if an outbound transaction _(Float, nil)_
46
+ - @:out@ := the amount leaving your account, or nil if an inbound transaction _(Float, nil)_
47
+ - @:change@ := a calculated field showing the change to your bank balance: negative for debits, positive for credits _(Float)_
48
+ - @:balance@ := the balance of your account after the transaction, if present in the PDF _(Float, nil)_
49
+
50
+ *Note:* the @:balance@ key is pulled straight from the PDF and will only be present for the last transaction on a particular day. I’m not doing anything even remotely clever here :)
51
+
52
+ h2. ⚠️ Warnings
53
+
54
+ This gem has been thrown together for my own needs, and is offered to the world just in case someone else might want to play around with it.
55
+ It seems to work pretty well with statements from my Advance account here in the UK, and may also work with other flavours of accounts from elsewhere in the world, but comes with absolutely zero guarantees or assurances.
56
+
57
+ That is to say: it seems to work OK for mucking around, but I’d recommend not using it for anything mission-critical, or in a situation that might lead you or others into making any kind of financial decisions.
58
+ Any dumb financial decisions made are entirely on you =)
59
+
60
+ Also, this gem contains a patch for "pdf-reader":https://github.com/yob/pdf-reader to help it better cope with the weird way in which HSBC seems to generate PDFs. This is unapologetically a massive hack, and really could do with someone far smarter than me to come up with a better solution.
61
+ For more information, see "pdf-reader issue #169":https://github.com/yob/pdf-reader/issues/169 which goes into a little more detail about what’s going on (my files seem to terminate the image data with @0xE0@, per my patch)
62
+
@@ -0,0 +1,20 @@
1
+ lib = File.expand_path("lib", __dir__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+ require "hsbc_pdf_statement_parser/version"
4
+
5
# Gem specification for hsbc_pdf_statement_parser.
Gem::Specification.new do |spec|
  spec.name     = 'hsbc_pdf_statement_parser'
  spec.summary  = 'Quick and dirty RubyGem to parse HSBC’s statement PDFs'
  spec.license  = 'MIT'

  spec.authors  = ['Jon Pearse']
  spec.email    = 'hello@jonpearse.net'
  spec.homepage = 'https://github.com/jonpearse/hsbc-pdf-statement-parser'

  spec.version  = HsbcPdfStatementParser::VERSION

  # Ask git for NUL-delimited paths and split on NUL. The previous `split($\)`
  # relied on the output record separator, which is nil by default and so
  # split on any whitespace, breaking tracked paths that contain spaces.
  spec.files    = `git ls-files -z`.split("\x0")

  spec.add_dependency 'pdf-reader', '~> 2.4'
end
@@ -0,0 +1,7 @@
1
+ require 'date'
2
+ require 'pdf-reader'
3
+ require 'hsbc_pdf_statement_parser/pdf_reader_patch'
4
+ require 'hsbc_pdf_statement_parser/parser'
5
+
6
# Top-level namespace for the gem; the classes that live in it are defined in
# the files required above.
module HsbcPdfStatementParser; end
@@ -0,0 +1,162 @@
1
# The namespace is (re)opened here rather than referenced as
# `HsbcPdfStatementParser::Parser`, so that this file loads even when nothing
# has defined the module yet.
module HsbcPdfStatementParser
  # Reads a statement PDF through PDF::Reader and exposes its transactions and
  # the summary figures found on the first page.
  class Parser

    # Creates a new parser from a PDF file.
    #
    # === Parameters
    #
    # [filename] the filename to parse
    def initialize(filename)
      @reader = PDF::Reader.new(filename)
    end

    # Returns an array of the transactions in the document as hashes. The
    # result is memoised after the first call.
    #
    # === Hash keys
    #
    # [:date] the date of the transaction _(Date)_
    # [:type] the type of the transaction, eg ‘VISA’, ‘DD’, ‘ATM’, etc _(String)_
    # [:details] the details of the transaction. This can span multiple lines _(String)_
    # [:out] the amount of the transaction, if a debit _(Float, nil)_
    # [:in] the amount of the transaction, if a credit _(Float, nil)_
    # [:change] the amount of the transaction: negative if a debit, positive if a credit _(Float, nil)_
    # [:balance] the balance column of the row, when present _(Float, nil)_
    def transactions
      @_transactions ||= begin
        current = nil
        current_date = nil
        collected = []

        # every chunk of text between a "brought forward" line and the next
        # "carried forward" marker is treated as one table of rows
        document_text
          .scan(/BALANCE\s?BROUGHT\s?FORWARD(?:.*?)\n(.*?)BALANCE\s?CARRIED\s?FORWARD/im)
          .flat_map { |captures| parse_page(captures[0]) }
          .each do |line|
            # rows without a date inherit the most recent one seen
            current_date = line[:date] unless line[:date].nil?

            # a row with a type starts a new transaction
            unless line[:type].nil?
              collected << current unless current.nil?
              current = line.merge(date: current_date)
              next
            end

            # a continuation row with nothing to continue cannot be attached to
            # anything; skip it instead of failing on nil
            next if current.nil?

            # continuation row: append its details (if any) on a new line and
            # copy across whatever other columns it carries
            details = [current[:details], line[:details]].compact.join("\n")
            current.merge!(line.select { |_key, value| value })
            current[:details] = details
          end

        # dump the final transaction + return
        collected << current unless current.nil?
        collected
      end
    end

    # Returns the opening balance of the statement read from the first page,
    # or nil if no matching figure is found.
    def opening_balance
      @_opening_balance ||= scan_figure('Opening Balance')
    end

    # Returns the closing balance of the statement read from the first page,
    # or nil if no matching figure is found.
    def closing_balance
      @_closing_balance ||= scan_figure('Closing Balance')
    end

    # Returns the total value of payments in, read from the first page (ie: not
    # calculated from the transactions), or nil if no matching figure is found.
    def payments_in
      @_payments_in ||= scan_figure('Payments In')
    end

    # Returns the total value of payments out, read from the first page (ie:
    # not calculated from the transactions), or nil if no matching figure is found.
    def payments_out
      @_payments_out ||= scan_figure('Payments Out')
    end

    private

    # Text of every page of the document joined into one string (memoised).
    def document_text
      @text ||= @reader.pages.map(&:text).join
    end

    # Finds +search_string+ (case-insensitively) in the first page's text and
    # returns the first run of four or more digits/dots/commas that follows it
    # on the same line as a Float. Returns nil when there is no match.
    def scan_figure(search_string)
      @_first_page ||= @reader.pages.first.text

      pattern = Regexp.new("#{search_string}(?:.*?)([0-9.,]{4,})", Regexp::IGNORECASE)
      match = pattern.match(@_first_page)
      return nil if match.nil?

      match[1].delete(',').to_f
    end

    # Splits a chunk of table text into lines, pads each to the longest line's
    # length, and returns the parsed rows with blank lines dropped.
    def parse_page(page_str)
      lines = page_str.lines
      max_length = lines.map(&:length).max

      lines.map { |line| parse_line(line.rstrip.ljust(max_length)) }.compact
    end

    # Cuts one line of text into a row hash using fixed character offsets.
    # Returns nil for a blank line.
    #
    # NOTE(review): the offsets are inclusive ranges that share their boundary
    # character (12, 20, 70, ...); presumably the text layout always leaves
    # those positions blank — confirm against real statements.
    def parse_line(line_str)
      # if we’re blank…
      return nil if line_str.strip.empty?

      # start cutting things up
      row = {
        date: empty_to_nil(line_str[0..12]),
        type: empty_to_nil(line_str[12..20]),
        details: empty_to_nil(line_str[20..70]),
        out: empty_to_nil(line_str[70..90]),
        in: empty_to_nil(line_str[90..110]),
        balance: empty_to_nil(line_str[110..130])
      }

      # munge things further
      row[:date] = Date.strptime(row[:date], '%d %b %y') unless row[:date].nil?
      %i[out in balance].each do |key|
        row[key] = row[key].delete(',').to_f unless row[key].nil?
      end

      # set a change amount: debits are negative, credits positive, nil when
      # the row carries neither
      row[:change] = if !row[:out].nil?
                       0 - row[:out]
                     elsif !row[:in].nil?
                       row[:in]
                     end

      row
    end

    # Returns +str+ stripped of surrounding whitespace, or nil when it is nil
    # or contains only whitespace.
    #
    # A non-mutating strip is used on purpose: <tt>strip!</tt> returns nil when
    # there is nothing to strip, and a column slice is itself nil when the line
    # is shorter than the column's offset.
    def empty_to_nil(str)
      stripped = str.to_s.strip
      stripped.empty? ? nil : stripped
    end

  end
end
@@ -0,0 +1,44 @@
1
+ # This is a horrendous patch to work around issue #169 in the pdf-reader repo (https://github.com/yob/pdf-reader/issues/169).
2
+ #
3
+ # Short version is that whatever HSBC is using to generate PDFs doesn’t seem to null-/whitespace-terminate inline image
4
+ # data. Thus, when PdfReader tries to find the ‘EI’ token when parsing inline media, it can’t and simply runs off the end
5
+ # of the document, causing a TypeError to be thrown.
6
+ #
7
+ # The PDF files I’m getting all seem to end some of the images with xE0, so I’ve simply monkey-patched this into the
8
+ # library for use with my files.
9
+ # This may not be the case for anyone else, in which case maybe add whatever your problem character is to the regex and
10
+ # open a PR should you feel the need.
11
+ #
12
+ # Or, y’know, look at the PdfReader source and see if you can work out something better, because this is horrendous :/
13
+
14
# Mixin that, when included, redefines +prepare_inline_token+ on the including
# class so that a 0xE0 byte is accepted as a terminator before the 'EI' token,
# in addition to whitespace and NUL.
module PdfReaderPatch
  # Include hook: defines the replacement method directly on +base+ so that it
  # overrides the method +base+ already has.
  def self.included(base)
    base.class_eval do
      # Reads single characters from @io until a terminator character followed
      # by 'E', 'I' is seen, pushes everything read before the terminator onto
      # @tokens as a string token, then seeks back so 'EI' can be read again.
      #
      # NOTE(review): at end of input @io.read returns nil, which ends up being
      # appended to the string and raises TypeError — unchanged from the
      # original behaviour.
      def prepare_inline_token
        str = "".dup

        # sliding window over the last three characters read
        buffer = []

        until buffer[0] =~ /\s|\0|\xE0/n && buffer[1, 2] == ['E', 'I']
          chr = @io.read(1)
          buffer << chr

          str << buffer.shift if buffer.length > 3
        end

        # a NUL terminator is kept as part of the data; this needs a
        # double-quoted literal, as '\0' is a backslash followed by a zero
        str << "\0" if buffer.first == "\0"

        @tokens << string_token(str)

        # Decide how far to rewind from the terminator that actually ended the
        # data, not from any 0xE0 byte that merely passed through the window:
        # step back over 'EI' only for 0xE0, and over the terminator as well
        # for whitespace/NUL.
        to_rewind = buffer.first == "\xE0".b ? -2 : -3

        @io.seek(to_rewind, IO::SEEK_CUR) unless chr.nil?
      end
    end
  end
end
43
+
44
# Mix the patch into pdf-reader's buffer class; the include hook then
# redefines prepare_inline_token on it.
PDF::Reader::Buffer.include(PdfReaderPatch)
@@ -0,0 +1,3 @@
1
module HsbcPdfStatementParser
  # Version string of the gem, read by the gemspec.
  VERSION = '1.0.0'
end
metadata ADDED
@@ -0,0 +1,64 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: hsbc_pdf_statement_parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Jon Pearse
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-11-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: pdf-reader
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.4'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.4'
27
+ description:
28
+ email: hello@jonpearse.net
29
+ executables: []
30
+ extensions: []
31
+ extra_rdoc_files: []
32
+ files:
33
+ - Gemfile
34
+ - LICENSE.txt
35
+ - README.textile
36
+ - hsbc_pdf_statement_parser.gemspec
37
+ - lib/hsbc_pdf_statement_parser.rb
38
+ - lib/hsbc_pdf_statement_parser/parser.rb
39
+ - lib/hsbc_pdf_statement_parser/pdf_reader_patch.rb
40
+ - lib/hsbc_pdf_statement_parser/version.rb
41
+ homepage: https://github.com/jonpearse/hsbc-pdf-statement-parser
42
+ licenses:
43
+ - MIT
44
+ metadata: {}
45
+ post_install_message:
46
+ rdoc_options: []
47
+ require_paths:
48
+ - lib
49
+ required_ruby_version: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ required_rubygems_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: '0'
59
+ requirements: []
60
+ rubygems_version: 3.0.6
61
+ signing_key:
62
+ specification_version: 4
63
+ summary: Quick and dirty RubyGem to parse HSBC’s statement PDFs
64
+ test_files: []