ing_kontoauszug_parser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +19 -0
- data/LICENSE +21 -0
- data/README.md +159 -0
- data/bin/console +8 -0
- data/bin/pdf_to_json +64 -0
- data/bin/setup +5 -0
- data/lib/ing_kontoauszug_parser/header.rb +146 -0
- data/lib/ing_kontoauszug_parser/pdf_extractor.rb +233 -0
- data/lib/ing_kontoauszug_parser/statement_parser.rb +269 -0
- data/lib/ing_kontoauszug_parser/text_parser.rb +905 -0
- data/lib/ing_kontoauszug_parser/version.rb +14 -0
- data/lib/ing_kontoauszug_parser.rb +105 -0
- metadata +74 -0
|
@@ -0,0 +1,905 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'bigdecimal'
|
|
4
|
+
require 'strscan'
|
|
5
|
+
require_relative 'header'
|
|
6
|
+
|
|
7
|
+
module IngKontoauszugParser
|
|
8
|
+
# Parses ING bank statement text into structured transaction data.
|
|
9
|
+
#
|
|
10
|
+
# This class implements a single-pass streaming parser optimized for processing
|
|
11
|
+
# ING Germany statement exports. It handles both PDF-extracted text and direct
|
|
12
|
+
# text exports, extracting IBAN, transaction details, SEPA references, and
|
|
13
|
+
# narrative information.
|
|
14
|
+
#
|
|
15
|
+
# == Architecture
|
|
16
|
+
#
|
|
17
|
+
# The parser uses a state machine approach with lazy evaluation:
|
|
18
|
+
# - IBAN extraction happens opportunistically while scanning for the first booking
|
|
19
|
+
# - Booking lines are detected via fast character checks before regex matching
|
|
20
|
+
# - Line caching avoids redundant string operations within a single line
|
|
21
|
+
# - Memory-efficient string building minimizes allocations
|
|
22
|
+
#
|
|
23
|
+
# == Performance Optimizations
|
|
24
|
+
#
|
|
25
|
+
# Several techniques reduce parsing time for large statements:
|
|
26
|
+
# - First-character rejection filters out non-matching lines before regex
|
|
27
|
+
# - Pre-compiled regex patterns avoid repeated compilation
|
|
28
|
+
# - Memoized strip operations via @line_cache
|
|
29
|
+
# - StringScanner for structured booking line parsing
|
|
30
|
+
#
|
|
31
|
+
# == Supported Formats
|
|
32
|
+
#
|
|
33
|
+
# The parser recognizes German ING statement format with configurable labels
|
|
34
|
+
# for international variants:
|
|
35
|
+
# - Booking lines: "DD.MM.YYYY TransferType Recipient Amount"
|
|
36
|
+
# - Value date lines: "DD.MM.YYYY" or "DD.MM.YYYY Narrative text"
|
|
37
|
+
# - SEPA fields: "Mandat: XXX" and "Referenz: XXX" (configurable)
|
|
38
|
+
# - End markers: "Neuer Betrag" or "Neuer Saldo" (configurable)
|
|
39
|
+
#
|
|
40
|
+
# @api private
|
|
41
|
+
# @see StatementParser The public API that delegates to this class
|
|
42
|
+
class TextParser
|
|
43
|
+
# Phrases that signal the end of the transaction list in German statements.
|
|
44
|
+
# When any of these appear on a line, parsing stops.
|
|
45
|
+
# @return [Array<String>] frozen array of German end markers
|
|
46
|
+
DEFAULT_END_MARKERS = [
  'Neuer Betrag',
  'Neuer Saldo'
].freeze

# Label that precedes a SEPA mandate identifier in German statements.
# @return [String]
DEFAULT_MANDATE_LABEL = 'Mandat:'

# Label that precedes a SEPA payment reference in German statements.
# @return [String]
DEFAULT_REFERENCE_LABEL = 'Referenz:'
|
|
55
|
+
|
|
56
|
+
# Creates a new parser with language-specific configuration.
|
|
57
|
+
#
|
|
58
|
+
# Override the defaults to parse statements in other languages or with
|
|
59
|
+
# custom field labels.
|
|
60
|
+
#
|
|
61
|
+
# @param end_markers [Array<String>] phrases that signal the end of the
|
|
62
|
+
# transaction list. Parsing stops when any marker is found.
|
|
63
|
+
# @param mandate_label [String] the label preceding SEPA mandate IDs
|
|
64
|
+
# (e.g., "Mandate:" for English statements)
|
|
65
|
+
# @param reference_label [String] the label preceding SEPA references
|
|
66
|
+
# (e.g., "Reference:" for English statements)
|
|
67
|
+
# @param validate_iban [Boolean] whether to verify the IBAN checksum
|
|
68
|
+
# using ISO 13616 mod-97. Set to false for faster parsing when
|
|
69
|
+
# IBAN validation is handled elsewhere.
|
|
70
|
+
#
|
|
71
|
+
# @example German defaults (no arguments needed)
|
|
72
|
+
# parser = TextParser.new
|
|
73
|
+
#
|
|
74
|
+
# @example English statement configuration
|
|
75
|
+
# parser = TextParser.new(
|
|
76
|
+
# end_markers: ['New Balance'],
|
|
77
|
+
# mandate_label: 'Mandate:',
|
|
78
|
+
# reference_label: 'Reference:'
|
|
79
|
+
# )
|
|
80
|
+
def initialize(end_markers: DEFAULT_END_MARKERS,
               mandate_label: DEFAULT_MANDATE_LABEL,
               reference_label: DEFAULT_REFERENCE_LABEL,
               validate_iban: true)
  # The configuration is stored exactly as passed in; nothing is copied here.
  @end_markers, @mandate_label, @reference_label, @validate_iban =
    end_markers, mandate_label, reference_label, validate_iban

  # Derive the label-dependent regexes from the values stored above.
  compile_dynamic_patterns
end
|
|
90
|
+
|
|
91
|
+
# Parses a complete statement text and extracts all transactions.
|
|
92
|
+
#
|
|
93
|
+
# This is the primary entry point for parsing raw statement text. It performs
|
|
94
|
+
# a single-pass scan that simultaneously extracts the IBAN and parses all
|
|
95
|
+
# booking lines, minimizing memory allocations.
|
|
96
|
+
#
|
|
97
|
+
# @param text [String] the complete statement text (from PDF extraction or
|
|
98
|
+
# text export)
|
|
99
|
+
# @return [Hash] parsed result with the following structure:
|
|
100
|
+
# - +:header+ [Hash] containing +:iban+ (String)
|
|
101
|
+
# - +:statements+ [Array<Hash>] list of parsed transactions
|
|
102
|
+
# - +:warnings+ [Array<String>] optional, present only if issues occurred
|
|
103
|
+
# @raise [HeaderNotFound] if no valid IBAN line is found in the text
|
|
104
|
+
# @raise [BookingParseError] if no booking lines can be identified
|
|
105
|
+
#
|
|
106
|
+
# @example Parse statement text
|
|
107
|
+
# result = parser.parse_text(pdf_text)
|
|
108
|
+
# result[:header][:iban] #=> "DE89 3704 0044 0532 0130 00"
|
|
109
|
+
# result[:statements].first[:amount_eur] #=> "-31,49"
|
|
110
|
+
def parse_text(text)
  source = text.to_s # accept nil / non-String input by coercing it first
  @warnings = []     # warnings are collected per call

  # The streaming pass returns an [iban, statements] pair, which is splatted into the result builder.
  build_result(*parse_streaming_with_iban(source))
end
|
|
119
|
+
|
|
120
|
+
# Parses pre-split lines when the text is already line-separated.
|
|
121
|
+
#
|
|
122
|
+
# Use this when you have an array of lines (e.g., from splitting on newlines
|
|
123
|
+
# or reading line-by-line from a file). This avoids re-splitting the text
|
|
124
|
+
# internally.
|
|
125
|
+
#
|
|
126
|
+
# @param lines [Array<String>] individual lines from the statement
|
|
127
|
+
# @return [Hash] parsed result with +:header+ and +:statements+ keys
|
|
128
|
+
# @raise [HeaderNotFound] if no valid IBAN line is found
|
|
129
|
+
# @raise [BookingParseError] if no booking lines can be identified
|
|
130
|
+
#
|
|
131
|
+
# @see #parse_text For parsing unsplit text
|
|
132
|
+
def parse_lines(lines)
  rows = Array(lines) # nil becomes [], a single line becomes a one-element list
  @warnings = []      # warnings are collected per call

  # The helper returns an [iban, statements] pair, which is splatted into the result builder.
  build_result(*parse_lines_with_iban(rows))
end
|
|
141
|
+
|
|
142
|
+
# Parses transaction lines without requiring an IBAN header.
|
|
143
|
+
#
|
|
144
|
+
# Use this when parsing statement fragments or when the IBAN is already
|
|
145
|
+
# known from another source. Skips header extraction entirely and returns
|
|
146
|
+
# only the statements array.
|
|
147
|
+
#
|
|
148
|
+
# @param lines [Array<String>] lines containing only the transaction portion
|
|
149
|
+
# @return [Array<Hash>] parsed transactions (not wrapped in a result hash)
|
|
150
|
+
#
|
|
151
|
+
# @example Parse transactions without header
|
|
152
|
+
# statements = parser.parse_statement_lines(transaction_lines)
|
|
153
|
+
# statements.first[:recipient] #=> "Amazon EU"
|
|
154
|
+
def parse_statement_lines(lines)
  # Start from a clean slate: no warnings and no memoized strip from an earlier call.
  @warnings = []
  @line_cache = {}

  rows = Array(lines)
  # No header lookup here; parsing starts at the first element.
  parse_lines_internal(rows)
end
|
|
159
|
+
|
|
160
|
+
private
|
|
161
|
+
|
|
162
|
+
# String constants to avoid repeated allocations during parsing.
|
|
163
|
+
# @api private
|
|
164
|
+
NEWLINE = "\n"
EMPTY_STRING = ''
SPACE = ' '
# Substring that must be present before a line is examined for an IBAN.
IBAN_MARKER = 'IBAN'
private_constant(*%i[NEWLINE EMPTY_STRING SPACE IBAN_MARKER])
|
|
169
|
+
|
|
170
|
+
# Assembles the final result hash from parsed components.
|
|
171
|
+
#
|
|
172
|
+
# @param iban [String] the extracted IBAN
|
|
173
|
+
# @param statements [Array<Hash>] the parsed transaction list
|
|
174
|
+
# @return [Hash] the complete result with header, statements, and optional warnings
|
|
175
|
+
# @api private
|
|
176
|
+
def build_result(iban, statements)
  payload = { header: { iban: iban }, statements: statements }
  # The :warnings key only exists when something was actually recorded.
  return payload if @warnings.empty?

  payload.merge!(warnings: @warnings)
end
|
|
181
|
+
|
|
182
|
+
# Performs streaming parse with simultaneous IBAN and transaction extraction.
|
|
183
|
+
#
|
|
184
|
+
# This is the core parsing loop that processes text line-by-line. It lazily
|
|
185
|
+
# extracts the IBAN (stopping search once found) while building up the
|
|
186
|
+
# transaction list. Uses early termination when end markers are found.
|
|
187
|
+
#
|
|
188
|
+
# @param text [String] the complete statement text
|
|
189
|
+
# @return [Array(String, Array<Hash>)] tuple of [iban, statements]
|
|
190
|
+
# @raise [HeaderNotFound] if IBAN cannot be found
|
|
191
|
+
# @api private
|
|
192
|
+
def parse_streaming_with_iban(text) # rubocop:disable Metrics/MethodLength
  statements = []
  current = nil
  found_first_booking = false
  iban = nil
  @line_cache = {}

  text.each_line do |raw_line|
    line = raw_line.chomp.rstrip
    @line_cache = {} # the memoized strip belongs to the previous line

    # Keep looking for the IBAN on every line until one has been accepted.
    iban = extract_iban_from_line(line) if iban.nil? && line.include?(IBAN_MARKER)

    # Everything before the first booking line is skipped.
    unless found_first_booking
      next unless booking_line_candidate?(line)

      found_first_booking = true
    end

    next if page_break_line_fast?(line)

    cleaned = remove_known_prefix_artifact_fast(line)

    # page_break_line_fast? memoized the strip of `line`; drop it if the text changed.
    @line_cache = {} if cleaned != line

    current, finished = process_line(current, cleaned, statements)

    # Stop reading once the end marker was seen instead of walking the rest of the text.
    break if finished
  end

  # NOTE(review): unlike parse_lines_with_iban, no BookingParseError is raised here when
  # no booking line was found — confirm whether that difference is intended.
  unless iban
    raise IngKontoauszugParser::HeaderNotFound,
          'IBAN line not found in statement text'
  end

  # A transaction still in progress when the input ends has not been stored yet.
  statements << current if current
  [iban, finalize_statements(statements)]
end
|
|
237
|
+
|
|
238
|
+
# Parses a line array with lazy IBAN extraction.
|
|
239
|
+
#
|
|
240
|
+
# Similar to {#parse_streaming_with_iban} but optimized for pre-split lines.
|
|
241
|
+
# Performs a single pass to find both the IBAN and the first booking line
|
|
242
|
+
# index, then delegates to {#parse_lines_internal} for transaction parsing.
|
|
243
|
+
#
|
|
244
|
+
# @param lines [Array<String>] the statement lines
|
|
245
|
+
# @return [Array(String, Array<Hash>)] tuple of [iban, statements]
|
|
246
|
+
# @raise [HeaderNotFound] if IBAN cannot be found
|
|
247
|
+
# @raise [BookingParseError] if no booking lines exist
|
|
248
|
+
# @api private
|
|
249
|
+
def parse_lines_with_iban(lines)
  iban = nil
  first_idx = nil
  @line_cache = {}

  lines.each_with_index do |line, idx|
    line_str = line.to_s

    # Only lines containing the marker are examined, and only until an IBAN is accepted.
    iban = extract_iban_from_line(line_str) if iban.nil? && line_str.include?(IBAN_MARKER)

    # Remember the position of the first line that looks like a booking.
    first_idx = idx if first_idx.nil? && booking_line_candidate?(line_str)

    # Stop as soon as both are known, regardless of which one was found first.
    break if iban && first_idx
  end

  validate_iban_and_booking_found!(iban, first_idx)
  [iban, parse_lines_internal(lines, first_idx)]
end
|
|
271
|
+
|
|
272
|
+
# Validates that both IBAN and first booking line were found.
|
|
273
|
+
# @param iban [String, nil] the extracted IBAN
|
|
274
|
+
# @param first_idx [Integer, nil] index of first booking line
|
|
275
|
+
# @raise [HeaderNotFound] if IBAN is nil
|
|
276
|
+
# @raise [BookingParseError] if first_idx is nil
|
|
277
|
+
# @api private
|
|
278
|
+
def validate_iban_and_booking_found!(iban, first_idx)
  # A missing IBAN is reported first, so it wins when both values are absent.
  raise IngKontoauszugParser::HeaderNotFound, 'IBAN line not found in statement text' unless iban

  unless first_idx
    raise IngKontoauszugParser::BookingParseError,
          'Could not locate the first booking line in the statement text'
  end

  nil
end
|
|
289
|
+
|
|
290
|
+
# Attempts to extract an IBAN from a single line.
|
|
291
|
+
#
|
|
292
|
+
# Matches the ING statement format "IBAN DE89 3704 0044 ..." and optionally
|
|
293
|
+
# validates the checksum. Returns nil if the line doesn't contain an IBAN
|
|
294
|
+
# or if validation fails.
|
|
295
|
+
#
|
|
296
|
+
# @param line [String] a single line that may contain an IBAN
|
|
297
|
+
# @return [String, nil] the extracted IBAN or nil if not found/invalid
|
|
298
|
+
# @api private
|
|
299
|
+
def extract_iban_from_line(line)
  captured = line[IBAN_LINE_REGEX, 1]
  return nil unless captured

  candidate = captured.strip
  # The checksum check is optional; when it rejects the candidate the rescue below yields nil.
  Header.validate_iban!(candidate) if @validate_iban
  candidate
rescue IngKontoauszugParser::InvalidIBAN
  # Returning nil leaves the caller's IBAN unset, so it keeps scanning later lines.
  nil
end
|
|
309
|
+
|
|
310
|
+
# Quickly determines if a line might be a booking (transaction) line.
|
|
311
|
+
#
|
|
312
|
+
# Uses a two-stage check: first a fast character test (must start with a digit),
|
|
313
|
+
# then a regex match. This avoids expensive regex operations on lines that
|
|
314
|
+
# clearly aren't bookings.
|
|
315
|
+
#
|
|
316
|
+
# @param line [String] the line to check
|
|
317
|
+
# @return [Boolean] true if the line looks like a booking line
|
|
318
|
+
# @api private
|
|
319
|
+
def booking_line_candidate?(line)
  return false if line.empty?

  # Cheap pre-check: the first non-whitespace character has to be a digit.
  lead = line.lstrip[0]
  return false if lead.nil? || !lead.between?('0', '9')

  # Only lines that survive the pre-check pay for the regex.
  line.match?(FIRST_BOOKING_LINE_REGEX)
end
|
|
329
|
+
|
|
330
|
+
# Finds the first booking line and parses from that point.
|
|
331
|
+
#
|
|
332
|
+
# Legacy entry point that scans for the first booking and delegates
|
|
333
|
+
# to {#parse_lines_internal}. Used when IBAN extraction is handled separately.
|
|
334
|
+
#
|
|
335
|
+
# @param lines [Array<String>] statement lines
|
|
336
|
+
# @return [Array<Hash>] parsed transactions
|
|
337
|
+
# @raise [BookingParseError] if no booking lines can be found
|
|
338
|
+
# @api private
|
|
339
|
+
def parse_from_lines(lines)
  # Position of the first element that looks like a booking line (elements are coerced with to_s).
  first_idx = lines.find_index { |entry| booking_line_candidate?(entry.to_s) }

  if first_idx.nil?
    raise IngKontoauszugParser::BookingParseError,
          'Could not locate the first booking line in the statement text'
  end

  parse_lines_internal(lines, first_idx)
end
|
|
355
|
+
|
|
356
|
+
# Core line-by-line transaction parsing loop.
|
|
357
|
+
#
|
|
358
|
+
# Iterates through lines starting at start_idx, building transaction objects
|
|
359
|
+
# as it encounters booking lines and accumulating narrative text for each.
|
|
360
|
+
# Stops when an end marker is found.
|
|
361
|
+
#
|
|
362
|
+
# @param lines [Array<String>] the statement lines
|
|
363
|
+
# @param start_idx [Integer] index to start parsing from (default: 0)
|
|
364
|
+
# @return [Array<Hash>] finalized list of parsed transactions
|
|
365
|
+
# @api private
|
|
366
|
+
def parse_lines_internal(lines, start_idx = 0)
  statements = []
  current = nil
  @line_cache = {}

  (start_idx...lines.length).each do |position|
    line = lines[position].to_s.rstrip
    @line_cache = {} # the memoized strip belongs to the previous line

    next if page_break_line_fast?(line)

    cleaned = remove_known_prefix_artifact_fast(line)
    # The page-break check memoized the strip of `line`; drop it if the text changed.
    @line_cache = {} unless cleaned == line

    current, done = process_line(current, cleaned, statements)
    break if done
  end

  # A transaction still in progress when the input ends has not been stored yet.
  statements << current if current
  finalize_statements(statements)
end
|
|
397
|
+
|
|
398
|
+
# Processes a single line within the parsing state machine.
|
|
399
|
+
#
|
|
400
|
+
# Determines the line type and updates state accordingly:
|
|
401
|
+
# - End marker: finishes current statement, signals completion
|
|
402
|
+
# - Booking line: starts a new transaction
|
|
403
|
+
# - Other: adds to current transaction's narrative or value date
|
|
404
|
+
#
|
|
405
|
+
# @param current [Hash, nil] the in-progress transaction, or nil if none
|
|
406
|
+
# @param line [String] the line to process
|
|
407
|
+
# @param statements [Array<Hash>] accumulator for completed transactions
|
|
408
|
+
# @return [Array(Hash, Boolean)] tuple of [updated_current, finished_flag]
|
|
409
|
+
# @api private
|
|
410
|
+
def process_line(current, line, statements)
  if end_of_statements_fast?(get_stripped(line))
    # End marker: store whatever is in progress and tell the caller to stop.
    statements << current if current
    [nil, true]
  elsif (booking = parse_booking_line_fast(line))
    # A new booking closes the previous transaction and opens the next one.
    statements << current if current
    [build_statement_from_booking(booking), false]
  elsif current
    # Anything else belongs to the transaction in progress.
    [apply_value_or_narrative(current, line), false]
  else
    # No transaction is open yet, so the line is ignored.
    [current, false]
  end
end
|
|
428
|
+
|
|
429
|
+
# Returns the stripped version of a line, caching the result.
|
|
430
|
+
#
|
|
431
|
+
# Avoids calling String#strip multiple times for the same line during
|
|
432
|
+
# a single processing cycle. Cache is cleared for each new line.
|
|
433
|
+
#
|
|
434
|
+
# @param line [String] the line to strip
|
|
435
|
+
# @return [String] the stripped line (cached)
|
|
436
|
+
# @api private
|
|
437
|
+
def get_stripped(line)
  cache = (@line_cache ||= {})

  # The memo is only valid for the very String object it was computed from. Without this
  # check, a call made without a prior cache reset would return the previous line's text.
  unless cache.key?(:stripped) && cache[:source].equal?(line)
    cache[:source] = line
    cache[:stripped] = line.strip
  end

  cache[:stripped]
end
|
|
441
|
+
|
|
442
|
+
# Determines if a line is a page break or header that should be skipped.
|
|
443
|
+
#
|
|
444
|
+
# ING statements repeat page headers/footers on each page. This method
|
|
445
|
+
# identifies common patterns like "Seite X von Y", column headers, and
|
|
446
|
+
# date lines that aren't part of the transaction data.
|
|
447
|
+
#
|
|
448
|
+
# @param line [String] the line to check
|
|
449
|
+
# @return [Boolean] true if the line should be skipped
|
|
450
|
+
# @api private
|
|
451
|
+
def page_break_line_fast?(line)
  text = get_stripped(line)

  # Blank lines are skipped as well.
  return true if text.empty?

  # Only lines starting with one of the known first characters reach the pattern check.
  return false unless PAGE_BREAK_FIRST_CHARS.include?(text[0])

  PAGE_BREAK_PATTERNS.any? { |pattern| text.match?(pattern) }
end
|
|
461
|
+
|
|
462
|
+
# Checks if a line contains an end-of-statements marker.
|
|
463
|
+
#
|
|
464
|
+
# Uses first-character rejection to quickly filter out non-matching lines
|
|
465
|
+
# before running the full regex check against configured end markers.
|
|
466
|
+
#
|
|
467
|
+
# @param stripped [String] a pre-stripped line
|
|
468
|
+
# @return [Boolean] true if line contains an end marker
|
|
469
|
+
# @api private
|
|
470
|
+
def end_of_statements_fast?(stripped)
  return false if stripped.empty?

  # The first-character prefilter runs before the regex. Because of it, a marker is only
  # recognised when the stripped line starts with the same character as one of the
  # configured markers, even though the regex itself is not anchored.
  @end_markers_first_chars.include?(stripped[0]) && stripped.match?(@end_of_statements_regex)
end
|
|
479
|
+
|
|
480
|
+
# Legacy wrapper for backward compatibility.
|
|
481
|
+
# @param line [String] line to check (will be stripped)
|
|
482
|
+
# @return [Boolean] true if end marker found
|
|
483
|
+
# @api private
|
|
484
|
+
# @deprecated Use {#end_of_statements_fast?} with pre-stripped input
|
|
485
|
+
def end_of_statements?(line)
  # Legacy name: strips the line and forwards to the _fast implementation.
  trimmed = line.strip
  end_of_statements_fast?(trimmed)
end
|
|
488
|
+
|
|
489
|
+
# Removes OCR artifacts that appear at line beginnings.
|
|
490
|
+
#
|
|
491
|
+
# PDF extraction sometimes produces spurious "T" prefixes before dates
|
|
492
|
+
# or SEPA labels due to OCR errors or text positioning issues. This method
|
|
493
|
+
# detects and removes these artifacts while preserving legitimate content.
|
|
494
|
+
#
|
|
495
|
+
# @param str [String] the line that may contain artifacts
|
|
496
|
+
# @return [String] the cleaned line (same object if no artifacts found)
|
|
497
|
+
# @api private
|
|
498
|
+
def remove_known_prefix_artifact_fast(str)
  # Empty and whitespace-only lines are returned untouched.
  return str if str.empty? || str.strip.empty?

  prefixed = ARTIFACT_PREFIX_REGEX.match(str)
  return str unless prefixed

  indent, remainder = prefixed.captures
  # The leading "T" is only dropped when what follows is a date or one of the
  # configured SEPA labels; any other line starting with "T" is left alone.
  artifact = remainder.match?(ARTIFACT_DATE_REGEX) || remainder.match?(@artifact_label_regex)
  artifact ? "#{indent}#{remainder}" : str
end
|
|
514
|
+
|
|
515
|
+
# Legacy wrapper for artifact removal.
|
|
516
|
+
# @param str [String] line to clean
|
|
517
|
+
# @return [String] cleaned line
|
|
518
|
+
# @api private
|
|
519
|
+
# @deprecated Use {#remove_known_prefix_artifact_fast}
|
|
520
|
+
def remove_known_prefix_artifact(str)
  # Legacy name: forwards unchanged to the _fast implementation.
  cleaned = remove_known_prefix_artifact_fast(str)
  cleaned
end
|
|
523
|
+
|
|
524
|
+
# Filters and cleans an array of lines for parsing.
|
|
525
|
+
#
|
|
526
|
+
# Removes page breaks and OCR artifacts from each line. Used as a
|
|
527
|
+
# preprocessing step when lines need cleaning before parsing.
|
|
528
|
+
#
|
|
529
|
+
# @param lines [Array<String>] raw lines from PDF extraction
|
|
530
|
+
# @return [Array<String>] cleaned lines with page breaks removed
|
|
531
|
+
# @api private
|
|
532
|
+
# @deprecated Prefer streaming parsing which handles cleanup inline
|
|
533
|
+
def sanitize_lines(lines)
  lines.each_with_object([]) do |line, acc|
    @line_cache = {} # the memoized strip belongs to the previous line

    # Coerce before the page-break check, as the other entry points do. Previously a nil
    # or non-String element reached String#strip and raised NoMethodError.
    text = line.to_s
    next if page_break_line_fast?(text)

    acc << remove_known_prefix_artifact_fast(text.rstrip)
  end
end
|
|
543
|
+
|
|
544
|
+
# Compiles regex patterns from configurable labels at initialization.
|
|
545
|
+
#
|
|
546
|
+
# Creates optimized patterns for end markers, mandate/reference extraction,
|
|
547
|
+
# and artifact detection based on the configured labels. Also pre-computes
|
|
548
|
+
# first-character sets for fast rejection filtering.
|
|
549
|
+
#
|
|
550
|
+
# @return [void]
|
|
551
|
+
# @api private
|
|
552
|
+
def compile_dynamic_patterns
  # Accept a single marker as well as a list, and drop empty markers: an empty
  # alternative would make the regex match at every word boundary.
  markers = Array(@end_markers).map(&:to_s).reject(&:empty?)

  @end_of_statements_regex =
    if markers.empty?
      /(?!)/ # never matches, so parsing is never cut short without markers
    else
      alternation = markers.map { |marker| Regexp.escape(marker) }.join('|')
      /\b(?:#{alternation})\b/
    end

  # First characters of the markers, used as a cheap prefilter before the regex.
  @end_markers_first_chars = markers.map { |marker| marker[0] }.uniq.freeze

  mandate_escaped = Regexp.escape(@mandate_label)
  reference_escaped = Regexp.escape(@reference_label)

  # Each capture runs over whitespace-separated tokens and stops before the other label,
  # so both labels can appear on the same line.
  @mandate_regex = /\b#{mandate_escaped}\s*([^\s]+(?:\s(?!#{reference_escaped})[^\s]+)*)/
  @reference_regex = /\b#{reference_escaped}\s*([^\s]+(?:\s(?!#{mandate_escaped})[^\s]+)*)/

  # Matches text that begins with either label (optionally after whitespace).
  @artifact_label_regex = /\A\s*(#{reference_escaped}|#{mandate_escaped})/
end
|
|
569
|
+
|
|
570
|
+
# Matches IBAN line format: "IBAN DE89 3704 0044 0532 0130 00"
|
|
571
|
+
# Captures the IBAN portion (country code + check digits + BBAN).
|
|
572
|
+
IBAN_LINE_REGEX = /IBAN\s+([A-Z0-9 ]{10,})/

# Leading characters of the page-break keywords listed for this filter
# (Seite, Valuta, Girokonto, Kontoauszug, Datum, Buchung, Betrag).
PAGE_BREAK_FIRST_CHARS = %w[S V G K D B].freeze

# Date written as DD.MM.YYYY, e.g. "01.08.2025".
DATE_PATTERN = /\d{2}\.\d{2}\.\d{4}/

# Amount with optional sign, dot-separated thousands and a comma before two decimals,
# e.g. "1.234,56", "-31,49", "+100,00".
AMOUNT_PATTERN_FULL = /[-+]?\d{1,3}(?:\.\d{3})*,\d{2}/

# Whole line: a date followed by further text. Captures (1) the date, (2) the text.
VALUE_DATE_WITH_TEXT_REGEX = /\A\s*(\d{2}\.\d{2}\.\d{4})\s+(.*)\z/

# Whole line: nothing but a date.
VALUE_DATE_ONLY_REGEX = /\A\d{2}\.\d{2}\.\d{4}\z/

# Same expression as AMOUNT_PATTERN_FULL; kept under its own name for stray-amount detection.
AMOUNT_PATTERN = /[-+]?\d{1,3}(?:\.\d{3})*,\d{2}/

# Date, whitespace, then at least one non-whitespace character ("DD.MM.YYYY TransferType ...").
FIRST_BOOKING_LINE_REGEX = /^\s*\d{2}\.\d{2}\.\d{4}\s+\S/

# "ARN" followed by at least eight digits (Acquirer Reference Number).
ARN_REGEX = /\bARN\d{8,}\b/

# The words "google pay" in any letter case.
GOOGLE_PAY_REGEX = /\bgoogle\s+pay\b/i

# Line starting with a "T" after optional whitespace. Captures (1) the whitespace, (2) the rest.
ARTIFACT_PREFIX_REGEX = /\A(\s*)T(.*)\z/

# Text that starts with a date; used to decide whether a leading "T" was an artifact.
ARTIFACT_DATE_REGEX = /\A\s*\d{2}\.\d{2}\.\d{4}\b/

private_constant :IBAN_LINE_REGEX, :PAGE_BREAK_FIRST_CHARS,
                 :DATE_PATTERN, :AMOUNT_PATTERN_FULL,
                 :VALUE_DATE_WITH_TEXT_REGEX, :VALUE_DATE_ONLY_REGEX, :AMOUNT_PATTERN,
                 :FIRST_BOOKING_LINE_REGEX, :ARN_REGEX, :GOOGLE_PAY_REGEX,
                 :ARTIFACT_PREFIX_REGEX, :ARTIFACT_DATE_REGEX
|
|
621
|
+
|
|
622
|
+
# Parses a booking (transaction) line into its components.
|
|
623
|
+
#
|
|
624
|
+
# Uses StringScanner for efficient left-to-right parsing of the format:
|
|
625
|
+
# "DD.MM.YYYY TransferType Recipient Amount"
|
|
626
|
+
#
|
|
627
|
+
# The amount is located by scanning from the right since recipient names
|
|
628
|
+
# can contain spaces.
|
|
629
|
+
#
|
|
630
|
+
# @param line [String] a potential booking line
|
|
631
|
+
# @return [Hash, nil] parsed components or nil if not a valid booking line
|
|
632
|
+
# - +:booking_date+ [String] the transaction date
|
|
633
|
+
# - +:transfer_type+ [String] transaction type (e.g., "Lastschrift")
|
|
634
|
+
# - +:recipient+ [String] counterparty name
|
|
635
|
+
# - +:amount_raw+ [String] amount in German format
|
|
636
|
+
# @api private
|
|
637
|
+
def parse_booking_line_fast(line)
  # Cheap pre-check: a booking line starts with a digit once surrounding whitespace is removed.
  head = get_stripped(line)
  return nil if head.empty? || !head[0].between?('0', '9')

  cursor = StringScanner.new(line)
  cursor.skip(/\s*/)

  # The booking date must be followed by whitespace.
  booking_date = cursor.scan(/\d{2}\.\d{2}\.\d{4}/)
  return nil unless booking_date
  return nil unless cursor.skip(/\s+/)

  # The transfer type is a single whitespace-free token, again followed by whitespace.
  kind = cursor.scan(/\S+/)
  return nil unless kind
  return nil unless cursor.skip(/\s+/)

  # The remainder must end in an amount preceded by whitespace; everything before it is
  # the recipient, which may itself contain spaces.
  tail = cursor.rest.rstrip
  found = /(.*?)\s+([-+]?\d{1,3}(?:\.\d{3})*,\d{2})\s*\z/.match(tail)
  return nil unless found

  {
    booking_date: booking_date,
    transfer_type: kind,
    recipient: found[1].strip,
    amount_raw: found[2]
  }
end
|
|
677
|
+
|
|
678
|
+
# Legacy wrapper for booking line parsing.
|
|
679
|
+
# @param line [String] line to parse
|
|
680
|
+
# @return [Hash, nil] parsed booking or nil
|
|
681
|
+
# @api private
|
|
682
|
+
# @deprecated Use {#parse_booking_line_fast}
|
|
683
|
+
def parse_booking_line(line)
  # Legacy name: forwards unchanged to the _fast implementation.
  parsed = parse_booking_line_fast(line)
  parsed
end
|
|
686
|
+
|
|
687
|
+
# Creates an initial statement hash from parsed booking data.
|
|
688
|
+
#
|
|
689
|
+
# Converts the raw booking components into the statement structure,
|
|
690
|
+
# including amount normalization and direction detection.
|
|
691
|
+
#
|
|
692
|
+
# @param booking [Hash] output from {#parse_booking_line_fast}
|
|
693
|
+
# @return [Hash] statement hash ready for narrative accumulation
|
|
694
|
+
# @api private
|
|
695
|
+
def build_statement_from_booking(booking)
  booked_on, kind, counterparty, raw_amount =
    booking.values_at(:booking_date, :transfer_type, :recipient, :amount_raw)

  # The raw German-formatted amount is kept verbatim; the numeric form and the
  # direction are both derived from the same parsed value.
  numeric = parse_amount(raw_amount)

  {
    booking_date: booked_on,
    transfer_type: kind,
    recipient: counterparty,
    amount_eur: raw_amount,
    amount_eur_numeric: numeric.to_s('F'),
    amount_direction: amount_direction(numeric),
    # Filled in later from the lines that follow the booking line.
    value_date: nil,
    narrative_lines: []
  }
end
|
|
708
|
+
|
|
709
|
+
# Converts German amount format to BigDecimal.
|
|
710
|
+
#
|
|
711
|
+
# German format uses dots as thousand separators and commas for decimals.
|
|
712
|
+
# "-1.234,56" becomes BigDecimal("-1234.56")
|
|
713
|
+
#
|
|
714
|
+
# @param raw [String] amount in German format (e.g., "-1.234,56")
|
|
715
|
+
# @return [BigDecimal] the numeric amount
|
|
716
|
+
# @api private
|
|
717
|
+
def parse_amount(raw)
  # Thousand separators go first ("-1.234,56" -> "-1234,56"), then the decimal
  # comma becomes the point BigDecimal expects ("-1234.56").
  without_grouping = raw.delete('.')
  decimal_notation = without_grouping.tr(',', '.')
  BigDecimal(decimal_notation)
end
|
|
720
|
+
|
|
721
|
+
# Determines the transaction direction from the amount sign.
|
|
722
|
+
#
|
|
723
|
+
# @param amount_numeric [BigDecimal] the parsed amount
|
|
724
|
+
# @return [String] "debit" for negative, "credit" for positive, "neutral" for zero
|
|
725
|
+
# @api private
|
|
726
|
+
def amount_direction(amount_numeric)
  # Sign decides the direction; anything neither negative nor positive
  # (i.e. zero) is reported as neutral.
  if amount_numeric.negative?
    'debit'
  elsif amount_numeric.positive?
    'credit'
  else
    'neutral'
  end
end
|
|
732
|
+
|
|
733
|
+
# Adds value date or narrative text to an in-progress statement.
|
|
734
|
+
#
|
|
735
|
+
# After a booking line, subsequent lines are either:
|
|
736
|
+
# - A value date (alone or with narrative)
|
|
737
|
+
# - Pure narrative text
|
|
738
|
+
# - Stray amount lines (logged as warnings)
|
|
739
|
+
#
|
|
740
|
+
# @param current [Hash] the in-progress statement
|
|
741
|
+
# @param line [String] the line to process
|
|
742
|
+
# @return [Hash] the updated statement
|
|
743
|
+
# @api private
|
|
744
|
+
def apply_value_or_narrative(current, line)
  text = get_stripped(line)

  # Once the value date is known, every further line is plain narrative.
  unless current[:value_date].nil?
    current[:narrative_lines] << text
    return current
  end

  dated = line.match(VALUE_DATE_WITH_TEXT_REGEX)
  if dated
    # Value date and text share the line: store the date, keep the remainder
    # as the first narrative line when there is any.
    current[:value_date] = dated[1]
    remainder = dated[2].strip
    current[:narrative_lines] << remainder unless remainder.empty?
  elsif text.match?(VALUE_DATE_ONLY_REGEX)
    current[:value_date] = text
  elsif text.match?(AMOUNT_PATTERN)
    # A bare amount is not narrative; it is reported instead of stored.
    @warnings << 'Encountered amount line without active statement'
  else
    current[:narrative_lines] << text
  end

  current
end
|
|
765
|
+
|
|
766
|
+
# Post-processes all parsed statements for final output.
|
|
767
|
+
#
|
|
768
|
+
# Converts accumulated narrative lines to a single string, extracts
|
|
769
|
+
# mandate/reference identifiers, detects Google Pay transactions, and
|
|
770
|
+
# filters out incomplete statements (missing value date).
|
|
771
|
+
#
|
|
772
|
+
# @param statements [Array<Hash>] raw statement hashes with narrative_lines
|
|
773
|
+
# @return [Array<Hash>] finalized statements ready for output
|
|
774
|
+
# @api private
|
|
775
|
+
def finalize_statements(statements)
  return [] if statements.empty?

  statements.filter_map do |statement|
    unless statement[:value_date]
      # A booking without a value date is incomplete: report it and drop it.
      @warnings << "Could not locate value date for statement: #{statement[:recipient]}"
      next
    end

    # Read the accumulated lines instead of deleting them from the caller's
    # hash. The input stays intact, so running this method again on the same
    # statements yields the same result instead of failing on a missing key.
    narrative = build_narrative(statement[:narrative_lines])

    # Work on a copy; the intermediate :narrative_lines key is not part of the
    # finalized output, :narrative replaces it.
    finalized = statement.merge(narrative: narrative)
    finalized.delete(:narrative_lines)

    # Optional keys are only added when a value was actually found.
    mandate_id, reference = extract_mandate_and_reference(narrative)
    finalized[:mandate_id] = mandate_id if mandate_id
    finalized[:reference] = reference if reference
    finalized[:google_pay] = true if narrative.match?(GOOGLE_PAY_REGEX)

    finalized
  end
end
|
|
796
|
+
|
|
797
|
+
# Joins narrative lines into a single space-separated string.
|
|
798
|
+
#
|
|
799
|
+
# Uses pre-allocated string capacity and avoids intermediate array
|
|
800
|
+
# creation for better memory efficiency with long narratives.
|
|
801
|
+
#
|
|
802
|
+
# @param lines [Array<String>] individual narrative lines
|
|
803
|
+
# @return [String] joined narrative text
|
|
804
|
+
# @api private
|
|
805
|
+
def build_narrative(lines)
  return EMPTY_STRING if lines.empty?

  # String capacity is counted in bytes, so the buffer is sized with bytesize:
  # multi-byte characters such as umlauts would otherwise force a regrow.
  # One extra byte per line leaves room for the separators.
  capacity = lines.sum(&:bytesize) + lines.length

  # A bare String.new is ASCII-8BIT. Asking for UTF-8 keeps the result
  # text-encoded even when every line is empty and nothing gets appended.
  result = String.new(capacity: capacity, encoding: Encoding::UTF_8)

  lines.each do |line|
    next if line.empty?

    # Only non-empty lines are appended, so a non-empty buffer means a
    # previous line exists and a separator is due.
    result << SPACE unless result.empty?
    result << line
  end

  result
end
|
|
824
|
+
|
|
825
|
+
# Extracts SEPA mandate ID and reference from narrative text.
|
|
826
|
+
#
|
|
827
|
+
# Searches for configured labels (e.g., "Mandat:", "Referenz:") in the
|
|
828
|
+
# narrative. Falls back to ARN pattern for card transactions.
|
|
829
|
+
#
|
|
830
|
+
# @param narrative [String] the joined narrative text
|
|
831
|
+
# @return [Array(String, String)] tuple of [mandate_id, reference], either may be nil
|
|
832
|
+
# @api private
|
|
833
|
+
def extract_mandate_and_reference(narrative)
  # Both labels are handled the same way: find the label, take its first
  # capture group and strip the whitespace out of it.
  mandate, reference = [@mandate_regex, @reference_regex].map do |pattern|
    found = narrative.match(pattern)
    sanitize_identifier(found[1]) if found
  end

  # Without a labelled reference, fall back to the ARN lookup.
  [mandate, reference || extract_arn_reference(narrative)]
end
|
|
842
|
+
|
|
843
|
+
# Matches whitespace for identifier normalization.
|
|
844
|
+
# Note: in Ruby, \s covers only ASCII whitespace (space, tab, newline,
# carriage return, form feed, vertical tab); a non-breaking space is not matched.
WHITESPACE_REGEX = /\s+/
|
|
845
|
+
private_constant :WHITESPACE_REGEX
|
|
846
|
+
|
|
847
|
+
# Removes internal whitespace from identifiers.
|
|
848
|
+
#
|
|
849
|
+
# SEPA mandate IDs and references may contain spaces from line wrapping
|
|
850
|
+
# that should be removed for consistent output.
|
|
851
|
+
#
|
|
852
|
+
# @param value [String] identifier that may contain whitespace
|
|
853
|
+
# @return [String] identifier with whitespace removed
|
|
854
|
+
# @api private
|
|
855
|
+
def sanitize_identifier(value)
  # Every whitespace run is dropped, so an identifier split by wrapping
  # ("AB 12  34") comes back as one token ("AB1234"). Returns a new string.
  value.gsub(WHITESPACE_REGEX, EMPTY_STRING)
end
|
|
858
|
+
|
|
859
|
+
# Extracts ARN (Acquirer Reference Number) from narrative.
|
|
860
|
+
#
|
|
861
|
+
# ARN is used for card transactions and serves as a fallback reference
|
|
862
|
+
# when no SEPA reference is present.
|
|
863
|
+
#
|
|
864
|
+
# @param narrative [String] the narrative text to search
|
|
865
|
+
# @return [String, nil] the ARN if found, nil otherwise
|
|
866
|
+
# @api private
|
|
867
|
+
def extract_arn_reference(narrative)
  # Yields the text of the first ARN_REGEX match, or nil when nothing matches.
  narrative.slice(ARN_REGEX)
end
|
|
870
|
+
|
|
871
|
+
# Patterns matching ING statement page headers and footers.
|
|
872
|
+
#
|
|
873
|
+
# These lines appear on each page of multi-page statements and should be
|
|
874
|
+
# skipped during parsing. Ordered by likelihood for faster early exit.
|
|
875
|
+
#
|
|
876
|
+
# Patterns match:
|
|
877
|
+
# - "Seite X von Y" (page numbers)
|
|
878
|
+
# - "Valuta" (column header)
|
|
879
|
+
# - "Girokonto Nummer ..." (account header)
|
|
880
|
+
# - "Kontoauszug ..." (statement header)
|
|
881
|
+
# - "Datum DD.MM.YYYY" (date header)
|
|
882
|
+
# - "Buchung / Verwendungszweck" (column headers)
|
|
883
|
+
# - "Betrag (EUR)" (amount column header)
|
|
884
|
+
PAGE_BREAK_PATTERNS = [
  # Line starting with the word "Seite" (page numbering).
  /\ASeite\b/,
  # Line consisting of exactly "Valuta".
  /\AValuta\z/,
  # Line starting with "Girokonto Nummer".
  /\AGirokonto Nummer\b/,
  # Line starting with the word "Kontoauszug".
  /\AKontoauszug\b/,
  # Line starting with "Datum" and ending in a DD.MM.YYYY date.
  /\ADatum\b.*\d{2}\.\d{2}\.\d{4}\z/,
  # Line starting with "Buchung Buchung / Verwendungszweck".
  %r{\ABuchung\s+Buchung\s*/\s*Verwendungszweck\b},
  # Line consisting of exactly "Betrag (EUR)".
  /\ABetrag \(EUR\)\z/
].freeze
|
|
893
|
+
private_constant :PAGE_BREAK_PATTERNS
|
|
894
|
+
|
|
895
|
+
# Legacy wrapper for page break detection.
|
|
896
|
+
# @param line [String] line to check
|
|
897
|
+
# @return [Boolean] true if line is a page break
|
|
898
|
+
# @api private
|
|
899
|
+
# @deprecated Use {#page_break_line_fast?}
|
|
900
|
+
def page_break_line?(line)
  # The per-line cache is replaced with an empty hash before delegating, so
  # the fast check starts without entries from an earlier line.
  @line_cache = {}
  page_break_line_fast?(line)
end
|
|
904
|
+
end
|
|
905
|
+
end
|