ing_kontoauszug_parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,269 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'text_parser'
4
+ require_relative 'pdf_extractor'
5
+
6
+ module IngKontoauszugParser
7
+ # The main interface for parsing ING Bank account statements.
8
+ #
9
+ # StatementParser converts PDF or text statement exports into structured
10
+ # Ruby hashes containing account information and individual transactions.
11
+ # It handles the entire pipeline: PDF text extraction, IBAN validation,
12
+ # transaction parsing, and SEPA reference extraction.
13
+ #
14
+ # == Quick Start
15
+ #
16
+ # For most use cases, create a parser and call {#parse} with a PDF path:
17
+ #
18
+ # parser = IngKontoauszugParser::StatementParser.new
19
+ # result = parser.parse(file_path: 'statement.pdf')
20
+ #
21
+ # == Return Structure
22
+ #
23
+ # All parsing methods return a hash with the following structure:
24
+ #
25
+ # {
26
+ # header: {
27
+ # iban: "DE89 3704 0044 0532 0130 00"
28
+ # },
29
+ # statements: [
30
+ # {
31
+ # booking_date: "01.08.2025",
32
+ # value_date: "01.08.2025",
33
+ # transfer_type: "Lastschrift",
34
+ # recipient: "Allianz Direct Vers.",
35
+ # amount_eur: "-31,49",
36
+ # amount_eur_numeric: "-31.49",
37
+ # amount_direction: "debit",
38
+ # narrative: "Versicherungsbeitrag August",
39
+ # mandate_id: "MA123456", # optional
40
+ # reference: "RF123456789", # optional
41
+ # google_pay: true # optional
42
+ # },
43
+ # # ... more statements
44
+ # ],
45
+ # warnings: [...] # optional, only present if issues occurred
46
+ # }
47
+ #
48
+ # == PDF Backend Selection
49
+ #
50
+ # The parser supports two PDF extraction methods:
51
+ # - +:poppler+ - Uses the +pdftotext+ command (5-10x faster, recommended)
52
+ # - +:pdf_reader+ - Pure Ruby extraction (no system dependencies)
53
+ #
54
+ # By default, poppler is used if available, falling back to pdf-reader.
55
+ #
56
+ # == Language Configuration
57
+ #
58
+ # The parser is configured for German ING statements by default. For other
59
+ # languages, override the marker and label parameters:
60
+ #
61
+ # parser = StatementParser.new(
62
+ # end_markers: ['New Balance'],
63
+ # mandate_label: 'Mandate:',
64
+ # reference_label: 'Reference:'
65
+ # )
66
+ #
67
+ # @example Parse a PDF statement
68
+ # parser = IngKontoauszugParser::StatementParser.new
69
+ # result = parser.parse(file_path: '/path/to/statement.pdf')
70
+ # puts "Account: #{result[:header][:iban]}"
71
+ # result[:statements].each do |s|
72
+ # puts "#{s[:booking_date]}: #{s[:recipient]} #{s[:amount_eur]}"
73
+ # end
74
+ #
75
+ # @example Parse with specific PDF backend
76
+ # parser = IngKontoauszugParser::StatementParser.new(pdf_backend: :poppler)
77
+ # result = parser.parse(file_path: 'statement.pdf')
78
+ #
79
+ # @example Parse text export (for testing or text files)
80
+ # parser = IngKontoauszugParser::StatementParser.new
81
+ # result = parser.parse_text(File.read('statement.txt'))
82
+ #
83
+ # @example Skip IBAN validation
84
+ # parser = IngKontoauszugParser::StatementParser.new(validate_iban: false)
85
+ class StatementParser
86
+ # Creates a new parser with optional language and backend configuration.
87
+ #
88
+ # All parameters have sensible defaults for German ING statements. Override
89
+ # as needed for other languages or specific requirements.
90
+ #
91
+ # @param end_markers [Array<String>, nil] phrases that signal the end of the
92
+ # transaction list. When any of these appear, parsing stops.
93
+ # Default: ["Neuer Betrag", "Neuer Saldo"] (German)
94
+ # @param mandate_label [String, nil] the label preceding SEPA mandate IDs in
95
+ # the narrative text. Default: "Mandat:" (German)
96
+ # @param reference_label [String, nil] the label preceding SEPA references in
97
+ # the narrative text. Default: "Referenz:" (German)
98
+ # @param validate_iban [Boolean] whether to verify the IBAN checksum using
99
+ # ISO 13616 mod-97. Set to false for faster parsing when validation is
100
+ # handled elsewhere. Default: true
101
+ # @param pdf_backend [Symbol, nil] force a specific PDF extraction backend:
102
+ # - +:poppler+ - Use pdftotext (requires poppler-utils installed)
103
+ # - +:pdf_reader+ - Use pure Ruby pdf-reader gem
104
+ # - +nil+ - Auto-select fastest available (default)
105
def initialize(end_markers: nil, mandate_label: nil, reference_label: nil,
               validate_iban: true, pdf_backend: nil)
  @pdf_backend = pdf_backend

  # Language overrides that were not supplied (nil/false) are left out of the
  # option hash entirely instead of being stored as explicit nils.
  overrides = {
    end_markers: end_markers,
    mandate_label: mandate_label,
    reference_label: reference_label
  }.select { |_name, value| value }

  # validate_iban is always recorded, whatever its value.
  @parser_options = { validate_iban: validate_iban }.merge(overrides)
end
115
+
116
+ # Parses an ING PDF statement file into structured data.
117
+ #
118
+ # This is the primary method for processing PDF statements. It extracts
119
+ # text from the PDF, locates the IBAN, and parses all transactions.
120
+ #
121
+ # @param file_path [String] path to the PDF statement file (absolute or relative)
122
+ # @param reader [Class, nil] custom PDF reader class for testing. When provided,
123
+ # bypasses normal extraction and uses the given class directly. The class
124
+ # must respond to +new(path)+ and return an object with +pages+ that yield
125
+ # objects responding to +text+.
126
+ # @return [Hash] parsed result containing +:header+ and +:statements+ keys.
127
+ # See class documentation for the complete structure.
128
+ # @raise [ArgumentError] if file_path is nil or empty
129
+ # @raise [IngKontoauszugParser::Error] if the file doesn't exist or PDF extraction fails
130
+ # @raise [IngKontoauszugParser::HeaderNotFound] if no IBAN is found in the statement
131
+ # @raise [IngKontoauszugParser::InvalidIBAN] if IBAN validation fails (when enabled)
132
+ # @raise [IngKontoauszugParser::BookingParseError] if no transactions can be parsed
133
def parse(file_path:, reader: nil)
  validate_file_path!(file_path)

  extracted_lines =
    case reader
    when nil, false
      # No reader injected: let PdfExtractor pick (or honour) the backend.
      PdfExtractor.extract_lines(file_path, backend: @pdf_backend)
    else
      # A reader class was injected (testing hook): use it directly.
      extract_with_reader(file_path, reader)
    end

  text_parser.parse_lines(extracted_lines)
end
146
+
147
+ # Parses statement text directly without PDF extraction.
148
+ #
149
+ # Use this method when you have the statement as plain text (e.g., from
150
+ # a text export, copy-paste, or for testing). The text should contain
151
+ # the complete statement including the IBAN line and all transactions.
152
+ #
153
+ # @param text [String] complete statement text with IBAN and transactions
154
+ # @return [Hash] parsed result with the same structure as {#parse}
155
+ # @raise [IngKontoauszugParser::HeaderNotFound] if no IBAN is found
156
+ # @raise [IngKontoauszugParser::InvalidIBAN] if IBAN validation fails
157
+ # @raise [IngKontoauszugParser::BookingParseError] if no transactions found
158
+ #
159
+ # @example Parse text export
160
+ # text = File.read('statement_export.txt')
161
+ # result = parser.parse_text(text)
162
def parse_text(text)
  # Pure delegation: the memoized TextParser does all the work.
  parser = text_parser
  parser.parse_text(text)
end
165
+
166
+ # Parses pre-split statement lines.
167
+ #
168
+ # Use this when you already have the statement as an array of lines
169
+ # (e.g., from custom preprocessing or line-by-line file reading).
170
+ #
171
+ # @param lines [Array<String>] statement lines including IBAN and transactions
172
+ # @return [Hash] parsed result with the same structure as {#parse}
173
+ # @raise [IngKontoauszugParser::HeaderNotFound] if no IBAN is found
174
+ # @raise [IngKontoauszugParser::InvalidIBAN] if IBAN validation fails
175
+ # @raise [IngKontoauszugParser::BookingParseError] if no transactions found
176
def parse_lines(lines)
  # Pure delegation: the memoized TextParser does all the work.
  parser = text_parser
  parser.parse_lines(lines)
end
179
+
180
+ # Parses transaction lines without requiring an IBAN header.
181
+ #
182
+ # Use this when you have only the transaction portion of a statement
183
+ # (without the account header), or when the IBAN is already known from
184
+ # another source.
185
+ #
186
+ # Unlike {#parse}, {#parse_text}, and {#parse_lines}, this method returns
187
+ # only the statements array, not a hash with header information.
188
+ #
189
+ # @param lines [Array<String>] lines containing only the transactions
190
+ # @return [Array<Hash>] parsed transaction hashes (not wrapped in a result hash)
191
+ #
192
+ # @example Parse transactions only
193
+ # transactions = parser.parse_statement_lines(booking_lines)
194
+ # transactions.first[:recipient] #=> "Amazon EU"
195
def parse_statement_lines(lines)
  # Pure delegation: whatever TextParser#parse_statement_lines returns is
  # handed back to the caller unchanged.
  parser = text_parser
  parser.parse_statement_lines(lines)
end
198
+
199
+ # Checks if the fast poppler PDF backend is available.
200
+ #
201
+ # Use this to conditionally inform users about installation recommendations
202
+ # or to choose between processing strategies.
203
+ #
204
+ # @return [Boolean] true if +pdftotext+ is found in the system PATH
205
+ #
206
+ # @example Show recommendation to user
207
+ # unless StatementParser.poppler_available?
208
+ # puts "Install poppler-utils for 5-10x faster PDF processing"
209
+ # end
210
def self.poppler_available?
  # Detection itself lives in PdfExtractor; this only re-exposes its answer.
  extractor = PdfExtractor
  extractor.poppler_available?
end
213
+
214
+ private
215
+
216
+ # Returns a memoized TextParser instance configured with parser options.
217
+ # @return [TextParser]
218
+ # @api private
219
def text_parser
  # Reuse the parser built on a previous call, if any.
  return @text_parser if @text_parser

  @text_parser = TextParser.new(**@parser_options)
end
222
+
223
+ # Validates that a file path is provided and the file exists.
224
+ #
225
+ # @param file_path [String, nil] the path to validate
226
+ # @raise [ArgumentError] if path is nil or empty
227
+ # @raise [IngKontoauszugParser::Error] if file doesn't exist
228
+ # @api private
229
def validate_file_path!(file_path)
  # Check the string form so that nil and Pathname arguments are handled by
  # the same "blank path" test (Pathname#empty? inspects the file system
  # instead of the path text).
  path = file_path.to_s
  raise ArgumentError, 'file_path is required' if path.empty?

  raise IngKontoauszugParser::Error, "File not found: #{file_path}" unless File.exist?(path)

  # File.exist? is also true for directories; reject them here so the caller
  # gets a clear error instead of a failure later during extraction.
  return unless File.directory?(path)

  raise IngKontoauszugParser::Error, "Not a file: #{file_path}"
end
237
+
238
+ # Extracts text using a custom PDF reader class (for testing).
239
+ #
240
+ # This method provides backward compatibility with tests that inject
241
+ # custom PDF reader implementations. Production code should use
242
+ # {PdfExtractor} directly.
243
+ #
244
+ # @param file_path [String] path to the PDF file
245
+ # @param reader_class [Class] PDF reader class to use
246
+ # @return [Array<String>] extracted text lines
247
+ # @raise [IngKontoauszugParser::Error] if PDF extraction fails
248
+ # @api private
249
def extract_with_reader(file_path, reader_class)
  pdf = reader_class.new(file_path)
  lines = []

  pdf.pages.each do |page|
    # Pages that cannot provide text are skipped rather than treated as errors.
    next unless page.respond_to?(:text)

    # to_s tolerates a nil page text; trailing whitespace/newlines are dropped.
    lines.concat(page.text.to_s.each_line.map(&:rstrip))
  end

  lines
rescue StandardError => e
  # Errors are matched by class name, so no PDF::Reader constant is needed here.
  # Anonymous exception classes have a nil name; to_s keeps this check from
  # raising NoMethodError and masking the original error.
  raise IngKontoauszugParser::Error, e.message if e.class.name.to_s.start_with?('PDF::Reader::')

  raise
end
268
+ end
269
+ end