ing_kontoauszug_parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,905 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bigdecimal'
4
+ require 'strscan'
5
+ require_relative 'header'
6
+
7
+ module IngKontoauszugParser
8
+ # Parses ING bank statement text into structured transaction data.
9
+ #
10
+ # This class implements a single-pass streaming parser optimized for processing
11
+ # ING Germany statement exports. It handles both PDF-extracted text and direct
12
+ # text exports, extracting IBAN, transaction details, SEPA references, and
13
+ # narrative information.
14
+ #
15
+ # == Architecture
16
+ #
17
+ # The parser uses a state machine approach with lazy evaluation:
18
+ # - IBAN extraction happens opportunistically while scanning for the first booking
19
+ # - Booking lines are detected via fast character checks before regex matching
20
+ # - Line caching avoids redundant string operations within a single line
21
+ # - Memory-efficient string building minimizes allocations
22
+ #
23
+ # == Performance Optimizations
24
+ #
25
+ # Several techniques reduce parsing time for large statements:
26
+ # - First-character rejection filters out non-matching lines before regex
27
+ # - Pre-compiled regex patterns avoid repeated compilation
28
+ # - Memoized strip operations via @line_cache
29
+ # - StringScanner for structured booking line parsing
30
+ #
31
+ # == Supported Formats
32
+ #
33
+ # The parser recognizes German ING statement format with configurable labels
34
+ # for international variants:
35
+ # - Booking lines: "DD.MM.YYYY TransferType Recipient Amount"
36
+ # - Value date lines: "DD.MM.YYYY" or "DD.MM.YYYY Narrative text"
37
+ # - SEPA fields: "Mandat: XXX" and "Referenz: XXX" (configurable)
38
+ # - End markers: "Neuer Betrag" or "Neuer Saldo" (configurable)
39
+ #
40
+ # @api private
41
+ # @see StatementParser The public API that delegates to this class
42
+ class TextParser
43
+ # Phrases that signal the end of the transaction list in German statements.
44
+ # When any of these appear on a line, parsing stops.
45
+ # @return [Array<String>] frozen array of German end markers
46
# Phrases that terminate the transaction list in German statements.
# Frozen so the shared default cannot be mutated through a parser instance.
DEFAULT_END_MARKERS = [
  'Neuer Betrag',
  'Neuer Saldo'
].freeze

# Label that introduces a SEPA mandate ID in German statements.
DEFAULT_MANDATE_LABEL = 'Mandat:'

# Label that introduces a SEPA payment reference in German statements.
DEFAULT_REFERENCE_LABEL = 'Referenz:'
55
+
56
+ # Creates a new parser with language-specific configuration.
57
+ #
58
+ # Override the defaults to parse statements in other languages or with
59
+ # custom field labels.
60
+ #
61
+ # @param end_markers [Array<String>] phrases that signal the end of the
62
+ # transaction list. Parsing stops when any marker is found.
63
+ # @param mandate_label [String] the label preceding SEPA mandate IDs
64
+ # (e.g., "Mandate:" for English statements)
65
+ # @param reference_label [String] the label preceding SEPA references
66
+ # (e.g., "Reference:" for English statements)
67
+ # @param validate_iban [Boolean] whether to verify the IBAN checksum
68
+ # using ISO 13616 mod-97. Set to false for faster parsing when
69
+ # IBAN validation is handled elsewhere.
70
+ #
71
+ # @example German defaults (no arguments needed)
72
+ # parser = TextParser.new
73
+ #
74
+ # @example English statement configuration
75
+ # parser = TextParser.new(
76
+ # end_markers: ['New Balance'],
77
+ # mandate_label: 'Mandate:',
78
+ # reference_label: 'Reference:'
79
+ # )
80
def initialize(end_markers: DEFAULT_END_MARKERS,
               mandate_label: DEFAULT_MANDATE_LABEL,
               reference_label: DEFAULT_REFERENCE_LABEL,
               validate_iban: true)
  # Store the configuration exactly as given; nothing is copied or frozen here.
  @end_markers, @mandate_label, @reference_label = end_markers, mandate_label, reference_label
  @validate_iban = validate_iban

  # The derived regexes read the three label/marker ivars assigned above,
  # so this call has to come last.
  compile_dynamic_patterns
end
90
+
91
+ # Parses a complete statement text and extracts all transactions.
92
+ #
93
+ # This is the primary entry point for parsing raw statement text. It performs
94
+ # a single-pass scan that simultaneously extracts the IBAN and parses all
95
+ # booking lines, minimizing memory allocations.
96
+ #
97
+ # @param text [String] the complete statement text (from PDF extraction or
98
+ # text export)
99
+ # @return [Hash] parsed result with the following structure:
100
+ # - +:header+ [Hash] containing +:iban+ (String)
101
+ # - +:statements+ [Array<Hash>] list of parsed transactions
102
+ # - +:warnings+ [Array<String>] optional, present only if issues occurred
103
+ # @raise [HeaderNotFound] if no valid IBAN line is found in the text
104
+ # @raise [BookingParseError] if no booking lines can be identified
105
+ #
106
+ # @example Parse statement text
107
+ # result = parser.parse_text(pdf_text)
108
+ # result[:header][:iban] #=> "DE89 3704 0044 0532 0130 00"
109
+ # result[:statements].first[:amount_eur] #=> "-31,49"
110
def parse_text(text)
  source = text.to_s
  # Start every run with a fresh warning list so earlier calls do not leak in.
  @warnings = []

  # The streaming pass returns the [iban, statements] pair that build_result expects.
  build_result(*parse_streaming_with_iban(source))
end
119
+
120
+ # Parses pre-split lines when the text is already line-separated.
121
+ #
122
+ # Use this when you have an array of lines (e.g., from splitting on newlines
123
+ # or reading line-by-line from a file). This avoids re-splitting the text
124
+ # internally.
125
+ #
126
+ # @param lines [Array<String>] individual lines from the statement
127
+ # @return [Hash] parsed result with +:header+ and +:statements+ keys
128
+ # @raise [HeaderNotFound] if no valid IBAN line is found
129
+ # @raise [BookingParseError] if no booking lines can be identified
130
+ #
131
+ # @see #parse_text For parsing unsplit text
132
def parse_lines(lines)
  rows = Array(lines)
  # Start every run with a fresh warning list so earlier calls do not leak in.
  @warnings = []

  # The line-based pass returns the [iban, statements] pair that build_result expects.
  build_result(*parse_lines_with_iban(rows))
end
141
+
142
+ # Parses transaction lines without requiring an IBAN header.
143
+ #
144
+ # Use this when parsing statement fragments or when the IBAN is already
145
+ # known from another source. Skips header extraction entirely and returns
146
+ # only the statements array.
147
+ #
148
+ # @param lines [Array<String>] lines containing only the transaction portion
149
+ # @return [Array<Hash>] parsed transactions (not wrapped in a result hash)
150
+ #
151
+ # @example Parse transactions without header
152
+ # statements = parser.parse_statement_lines(transaction_lines)
153
+ # statements.first[:recipient] #=> "Amazon EU"
154
def parse_statement_lines(lines)
  # Reset per-run state: collected warnings and the per-line strip cache.
  @warnings = []
  @line_cache = {}

  rows = Array(lines)
  # No header lookup here; parsing starts at the first element.
  parse_lines_internal(rows)
end
159
+
160
+ private
161
+
162
+ # String constants to avoid repeated allocations during parsing.
163
+ # @api private
164
# Shared string constants used by the parsing helpers.
# NOTE(review): NEWLINE is not referenced in the visible part of this file.
NEWLINE      = "\n"
EMPTY_STRING = ''
SPACE        = ' '
IBAN_MARKER  = 'IBAN'
private_constant(*%i[NEWLINE EMPTY_STRING SPACE IBAN_MARKER])
169
+
170
+ # Assembles the final result hash from parsed components.
171
+ #
172
+ # @param iban [String] the extracted IBAN
173
+ # @param statements [Array<Hash>] the parsed transaction list
174
+ # @return [Hash] the complete result with header, statements, and optional warnings
175
+ # @api private
176
def build_result(iban, statements)
  payload = { header: { iban: iban }, statements: statements }
  # The :warnings key is only present when at least one warning was recorded.
  return payload if @warnings.empty?

  payload[:warnings] = @warnings
  payload
end
181
+
182
+ # Performs streaming parse with simultaneous IBAN and transaction extraction.
183
+ #
184
+ # This is the core parsing loop that processes text line-by-line. It lazily
185
+ # extracts the IBAN (stopping search once found) while building up the
186
+ # transaction list. Uses early termination when end markers are found.
187
+ #
188
+ # @param text [String] the complete statement text
189
+ # @return [Array(String, Array<Hash>)] tuple of [iban, statements]
190
+ # @raise [HeaderNotFound] if IBAN cannot be found
191
+ # @api private
192
def parse_streaming_with_iban(text) # rubocop:disable Metrics/MethodLength
  # Single pass over the text: picks up the IBAN while looking for the first
  # booking line, then feeds every following line into the state machine.
  #
  # Returns [iban, statements].
  # Raises HeaderNotFound when no IBAN line was seen, and BookingParseError
  # when no booking line was seen (same order and messages as the
  # line-array entry point, so parse_text and parse_lines agree).
  statements = []
  current = nil
  found_first_booking = false
  iban = nil
  @line_cache = {}

  text.each_line do |raw_line|
    # rstrip also removes the line terminator, so no separate chomp is needed.
    line = raw_line.rstrip
    @line_cache = {} # the strip cache is only valid for one line

    # Lazy IBAN extraction: stop looking once one has been accepted.
    iban = extract_iban_from_line(line) if iban.nil? && line.include?(IBAN_MARKER)

    # Everything before the first booking line is header material.
    unless found_first_booking
      next unless booking_line_candidate?(line)

      found_first_booking = true
    end

    next if page_break_line_fast?(line)

    cleaned = remove_known_prefix_artifact_fast(line)
    # The cache holds the strip of the uncleaned line; drop it if the text changed.
    @line_cache = {} if cleaned != line

    current, finished = process_line(current, cleaned, statements)
    # Nothing after the end marker is inspected, so leave the loop instead
    # of walking the remaining lines.
    break if finished
  end

  unless iban
    raise IngKontoauszugParser::HeaderNotFound,
          'IBAN line not found in statement text'
  end

  unless found_first_booking
    raise IngKontoauszugParser::BookingParseError,
          'Could not locate the first booking line in the statement text'
  end

  statements << current if current
  [iban, finalize_statements(statements)]
end
237
+
238
+ # Parses a line array with lazy IBAN extraction.
239
+ #
240
+ # Similar to {#parse_streaming_with_iban} but optimized for pre-split lines.
241
+ # Performs a single pass to find both the IBAN and the first booking line
242
+ # index, then delegates to {#parse_lines_internal} for transaction parsing.
243
+ #
244
+ # @param lines [Array<String>] the statement lines
245
+ # @return [Array(String, Array<Hash>)] tuple of [iban, statements]
246
+ # @raise [HeaderNotFound] if IBAN cannot be found
247
+ # @raise [BookingParseError] if no booking lines exist
248
+ # @api private
249
def parse_lines_with_iban(lines)
  # Scans the line array once for the IBAN and the index of the first
  # booking line, then hands over to parse_lines_internal.
  #
  # Returns [iban, statements]; validate_iban_and_booking_found! raises
  # when either piece is missing.
  iban = nil
  first_idx = nil
  @line_cache = {}

  lines.each_with_index do |line, idx|
    line_str = line.to_s

    # Lazy IBAN extraction: only attempted until one has been accepted.
    iban = extract_iban_from_line(line_str) if iban.nil? && line_str.include?(IBAN_MARKER)

    first_idx = idx if first_idx.nil? && booking_line_candidate?(line_str)

    # Stop as soon as both are known, whichever of the two was found first.
    break if iban && first_idx
  end

  validate_iban_and_booking_found!(iban, first_idx)
  [iban, parse_lines_internal(lines, first_idx)]
end
271
+
272
+ # Validates that both IBAN and first booking line were found.
273
+ # @param iban [String, nil] the extracted IBAN
274
+ # @param first_idx [Integer, nil] index of first booking line
275
+ # @raise [HeaderNotFound] if IBAN is nil
276
+ # @raise [BookingParseError] if first_idx is nil
277
+ # @api private
278
def validate_iban_and_booking_found!(iban, first_idx)
  # The header check comes first: a missing IBAN is reported even when the
  # booking line is missing as well.
  if !iban
    raise IngKontoauszugParser::HeaderNotFound,
          'IBAN line not found in statement text'
  elsif !first_idx
    raise IngKontoauszugParser::BookingParseError,
          'Could not locate the first booking line in the statement text'
  end
end
289
+
290
+ # Attempts to extract an IBAN from a single line.
291
+ #
292
+ # Matches the ING statement format "IBAN DE89 3704 0044 ..." and optionally
293
+ # validates the checksum. Returns nil if the line doesn't contain an IBAN
294
+ # or if validation fails.
295
+ #
296
+ # @param line [String] a single line that may contain an IBAN
297
+ # @return [String, nil] the extracted IBAN or nil if not found/invalid
298
+ # @api private
299
def extract_iban_from_line(line)
  # Group 1 of IBAN_LINE_REGEX is the text following the "IBAN" keyword.
  captured = line[IBAN_LINE_REGEX, 1]
  return nil unless captured

  candidate = captured.strip
  Header.validate_iban!(candidate) if @validate_iban
  candidate
rescue IngKontoauszugParser::InvalidIBAN
  # A failed validation is treated as "no IBAN on this line"; the caller
  # keeps scanning later lines.
  nil
end
309
+
310
+ # Quickly determines if a line might be a booking (transaction) line.
311
+ #
312
+ # Uses a two-stage check: first a fast character test (must start with a digit),
313
+ # then a regex match. This avoids expensive regex operations on lines that
314
+ # clearly aren't bookings.
315
+ #
316
+ # @param line [String] the line to check
317
+ # @return [Boolean] true if the line looks like a booking line
318
+ # @api private
319
def booking_line_candidate?(line)
  return false if line.empty?

  # Cheap pre-filter: the first non-blank character has to be an ASCII digit.
  lead = line.lstrip[0]
  return false if lead.nil? || !lead.between?('0', '9')

  # Only lines that pass the pre-filter pay for the regex.
  FIRST_BOOKING_LINE_REGEX.match?(line)
end
329
+
330
+ # Finds the first booking line and parses from that point.
331
+ #
332
+ # Legacy entry point that scans for the first booking and delegates
333
+ # to {#parse_lines_internal}. Used when IBAN extraction is handled separately.
334
+ #
335
+ # @param lines [Array<String>] statement lines
336
+ # @return [Array<Hash>] parsed transactions
337
+ # @raise [BookingParseError] if no booking lines can be found
338
+ # @api private
339
def parse_from_lines(lines)
  # Index of the first element that looks like a booking line, or nil.
  first_idx = lines.find_index { |candidate| booking_line_candidate?(candidate.to_s) }

  if first_idx.nil?
    raise IngKontoauszugParser::BookingParseError,
          'Could not locate the first booking line in the statement text'
  end

  parse_lines_internal(lines, first_idx)
end
355
+
356
+ # Core line-by-line transaction parsing loop.
357
+ #
358
+ # Iterates through lines starting at start_idx, building transaction objects
359
+ # as it encounters booking lines and accumulating narrative text for each.
360
+ # Stops when an end marker is found.
361
+ #
362
+ # @param lines [Array<String>] the statement lines
363
+ # @param start_idx [Integer] index to start parsing from (default: 0)
364
+ # @return [Array<Hash>] finalized list of parsed transactions
365
+ # @api private
366
def parse_lines_internal(lines, start_idx = 0)
  statements = []
  current = nil
  @line_cache = {}

  start_idx.upto(lines.length - 1) do |position|
    line = lines[position].to_s.rstrip
    @line_cache = {} # the strip cache is only valid for one line

    # Repeated page headers/footers and blank lines carry no transaction data.
    next if page_break_line_fast?(line)

    cleaned = remove_known_prefix_artifact_fast(line)
    # The cache holds the strip of the uncleaned line; drop it if the text changed.
    @line_cache = {} unless cleaned == line

    current, finished = process_line(current, cleaned, statements)
    break if finished
  end

  # A transaction still in progress at the end of input is kept.
  statements << current if current
  finalize_statements(statements)
end
397
+
398
+ # Processes a single line within the parsing state machine.
399
+ #
400
+ # Determines the line type and updates state accordingly:
401
+ # - End marker: finishes current statement, signals completion
402
+ # - Booking line: starts a new transaction
403
+ # - Other: adds to current transaction's narrative or value date
404
+ #
405
+ # @param current [Hash, nil] the in-progress transaction, or nil if none
406
+ # @param line [String] the line to process
407
+ # @param statements [Array<Hash>] accumulator for completed transactions
408
+ # @return [Array(Hash, Boolean)] tuple of [updated_current, finished_flag]
409
+ # @api private
410
def process_line(current, line, statements)
  text = get_stripped(line)

  if end_of_statements_fast?(text)
    # End marker: flush the open transaction and tell the caller to stop.
    statements << current if current
    [nil, true]
  elsif (booking = parse_booking_line_fast(line))
    # New booking line: flush the open transaction and start the next one.
    statements << current if current
    [build_statement_from_booking(booking), false]
  elsif current
    # Continuation line: value date or narrative of the open transaction.
    [apply_value_or_narrative(current, line), false]
  else
    # No open transaction to attach this line to; ignore it.
    [current, false]
  end
end
428
+
429
+ # Returns the stripped version of a line, caching the result.
430
+ #
431
+ # Avoids calling String#strip multiple times for the same line during
432
+ # a single processing cycle. Cache is cleared for each new line.
433
+ #
434
+ # @param line [String] the line to strip
435
+ # @return [String] the stripped line (cached)
436
+ # @api private
437
def get_stripped(line)
  cache = (@line_cache ||= {})
  # The cache has a single slot and is not keyed by line: callers must
  # reset @line_cache whenever they move on to different text.
  memo = cache[:stripped]
  return memo if memo

  cache[:stripped] = line.strip
end
441
+
442
+ # Determines if a line is a page break or header that should be skipped.
443
+ #
444
+ # ING statements repeat page headers/footers on each page. This method
445
+ # identifies common patterns like "Seite X von Y", column headers, and
446
+ # date lines that aren't part of the transaction data.
447
+ #
448
+ # @param line [String] the line to check
449
+ # @return [Boolean] true if the line should be skipped
450
+ # @api private
451
def page_break_line_fast?(line)
  text = get_stripped(line)

  # Blank lines are skipped as well. For everything else the first-character
  # lookup avoids running the pattern list on most lines.
  text.empty? ||
    (PAGE_BREAK_FIRST_CHARS.include?(text[0]) &&
      PAGE_BREAK_PATTERNS.any? { |pattern| pattern.match?(text) })
end
461
+
462
+ # Checks if a line contains an end-of-statements marker.
463
+ #
464
+ # Uses first-character rejection to quickly filter out non-matching lines
465
+ # before running the full regex check against configured end markers.
466
+ #
467
+ # @param stripped [String] a pre-stripped line
468
+ # @return [Boolean] true if line contains an end marker
469
+ # @api private
470
def end_of_statements_fast?(stripped)
  lead = stripped[0]
  return false if lead.nil? # empty input

  # NOTE(review): because of this first-character filter only lines that
  # START with a marker's first character can match, although the regex
  # itself would find a marker anywhere in the line - confirm this is intended.
  @end_markers_first_chars.include?(lead) && stripped.match?(@end_of_statements_regex)
end
479
+
480
+ # Legacy wrapper for backward compatibility.
481
+ # @param line [String] line to check (will be stripped)
482
+ # @return [Boolean] true if end marker found
483
+ # @api private
484
+ # @deprecated Use {#end_of_statements_fast?} with pre-stripped input
485
def end_of_statements?(line)
  # Deprecated convenience wrapper: strips the line itself instead of
  # requiring pre-stripped input.
  trimmed = line.strip
  end_of_statements_fast?(trimmed)
end
488
+
489
+ # Removes OCR artifacts that appear at line beginnings.
490
+ #
491
+ # PDF extraction sometimes produces spurious "T" prefixes before dates
492
+ # or SEPA labels due to OCR errors or text positioning issues. This method
493
+ # detects and removes these artifacts while preserving legitimate content.
494
+ #
495
+ # @param str [String] the line that may contain artifacts
496
+ # @return [String] the cleaned line (same object if no artifacts found)
497
+ # @api private
498
def remove_known_prefix_artifact_fast(str)
  # Empty and whitespace-only input is returned untouched.
  return str if str.strip.empty?

  hit = str.match(ARTIFACT_PREFIX_REGEX)
  return str unless hit

  indent, remainder = hit.captures
  # A leading "T" is only dropped when what follows is a date or one of the
  # configured SEPA labels; any other line starting with "T" is left alone.
  artifact = remainder.match?(ARTIFACT_DATE_REGEX) || remainder.match?(@artifact_label_regex)
  artifact ? "#{indent}#{remainder}" : str
end
514
+
515
+ # Legacy wrapper for artifact removal.
516
+ # @param str [String] line to clean
517
+ # @return [String] cleaned line
518
+ # @api private
519
+ # @deprecated Use {#remove_known_prefix_artifact_fast}
520
def remove_known_prefix_artifact(str)
  # Deprecated name kept for older callers; forwards unchanged.
  cleaned = remove_known_prefix_artifact_fast(str)
  cleaned
end
523
+
524
+ # Filters and cleans an array of lines for parsing.
525
+ #
526
+ # Removes page breaks and OCR artifacts from each line. Used as a
527
+ # preprocessing step when lines need cleaning before parsing.
528
+ #
529
+ # @param lines [Array<String>] raw lines from PDF extraction
530
+ # @return [Array<String>] cleaned lines with page breaks removed
531
+ # @api private
532
+ # @deprecated Prefer streaming parsing which handles cleanup inline
533
def sanitize_lines(lines)
  # Returns a new array with page-break/blank lines removed and known
  # prefix artifacts stripped from the remaining lines.
  lines.each_with_object([]) do |line, acc|
    # Convert before any check: page_break_line_fast? strips its argument,
    # so a nil or non-String element would otherwise raise NoMethodError.
    # rstrip already returns a fresh String, so no dup is needed.
    text = line.to_s.rstrip

    @line_cache = {} # the strip cache is only valid for one line
    next if page_break_line_fast?(text)

    acc << remove_known_prefix_artifact_fast(text)
  end
end
543
+
544
+ # Compiles regex patterns from configurable labels at initialization.
545
+ #
546
+ # Creates optimized patterns for end markers, mandate/reference extraction,
547
+ # and artifact detection based on the configured labels. Also pre-computes
548
+ # first-character sets for fast rejection filtering.
549
+ #
550
+ # @return [void]
551
+ # @api private
552
def compile_dynamic_patterns
  # End markers: one alternation of the escaped phrases, plus the set of
  # their first characters used as a cheap pre-filter.
  alternatives = @end_markers.map { |marker| Regexp.escape(marker) }.join('|')
  @end_of_statements_regex = Regexp.new("\\b(?:#{alternatives})\\b")
  @end_markers_first_chars = @end_markers.map { |marker| marker[0] }.uniq.freeze

  mandate = Regexp.escape(@mandate_label)
  reference = Regexp.escape(@reference_label)

  # Each pattern captures the whitespace-separated words after its own label
  # and stops in front of the other label (negative lookahead).
  @mandate_regex = /\b#{mandate}\s*([^\s]+(?:\s(?!#{reference})[^\s]+)*)/
  @reference_regex = /\b#{reference}\s*([^\s]+(?:\s(?!#{mandate})[^\s]+)*)/

  # Matches text that begins (after optional whitespace) with either label.
  @artifact_label_regex = /\A\s*(#{reference}|#{mandate})/
end
569
+
570
+ # Matches IBAN line format: "IBAN DE89 3704 0044 0532 0130 00"
571
+ # Captures the IBAN portion (country code + check digits + BBAN).
572
# IBAN header line such as "IBAN DE89 3704 0044 0532 0130 00".
# Group 1 is the run of uppercase letters, digits and spaces after "IBAN".
IBAN_LINE_REGEX = /IBAN\s+([A-Z0-9 ]{10,})/
private_constant :IBAN_LINE_REGEX

# First characters of the page-break patterns (Seite, Valuta, Girokonto,
# Kontoauszug, Datum, Buchung, Betrag); used as a cheap pre-filter.
PAGE_BREAK_FIRST_CHARS = %w[S V G K D B].freeze
private_constant :PAGE_BREAK_FIRST_CHARS

# German date, DD.MM.YYYY (e.g. "01.08.2025"). Reused by the patterns below.
DATE_PATTERN = /\d{2}\.\d{2}\.\d{4}/

# German amount: optional sign, dot-grouped thousands, comma decimals
# (e.g. "1.234,56", "-31,49", "+100,00").
AMOUNT_PATTERN_FULL = /[-+]?\d{1,3}(?:\.\d{3})*,\d{2}/

# Value date followed by text on the same line: (1) date, (2) remaining text.
VALUE_DATE_WITH_TEXT_REGEX = /\A\s*(#{DATE_PATTERN})\s+(.*)\z/

# A line consisting of a value date only.
VALUE_DATE_ONLY_REGEX = /\A#{DATE_PATTERN}\z/

# Amount pattern used when looking for stray amount lines; same expression
# as AMOUNT_PATTERN_FULL.
AMOUNT_PATTERN = AMOUNT_PATTERN_FULL

# Booking line start: date, whitespace, then a non-blank character.
FIRST_BOOKING_LINE_REGEX = /^\s*#{DATE_PATTERN}\s+\S/

# Acquirer Reference Number: "ARN" followed by at least eight digits.
ARN_REGEX = /\bARN\d{8,}\b/

# Case-insensitive "google pay" marker in narrative text.
GOOGLE_PAY_REGEX = /\bgoogle\s+pay\b/i

# Spurious leading "T": (1) leading whitespace, (2) everything after the "T".
ARTIFACT_PREFIX_REGEX = /\A(\s*)T(.*)\z/

# Confirms that the text after a removed "T" starts with a date.
ARTIFACT_DATE_REGEX = /\A\s*#{DATE_PATTERN}\b/

private_constant :DATE_PATTERN, :AMOUNT_PATTERN_FULL, :AMOUNT_PATTERN,
                 :VALUE_DATE_WITH_TEXT_REGEX, :VALUE_DATE_ONLY_REGEX,
                 :FIRST_BOOKING_LINE_REGEX,
                 :ARN_REGEX, :GOOGLE_PAY_REGEX,
                 :ARTIFACT_PREFIX_REGEX, :ARTIFACT_DATE_REGEX
621
+
622
+ # Parses a booking (transaction) line into its components.
623
+ #
624
+ # Uses StringScanner for efficient left-to-right parsing of the format:
625
+ # "DD.MM.YYYY TransferType Recipient Amount"
626
+ #
627
+ # The amount is located by scanning from the right since recipient names
628
+ # can contain spaces.
629
+ #
630
+ # @param line [String] a potential booking line
631
+ # @return [Hash, nil] parsed components or nil if not a valid booking line
632
+ # - +:booking_date+ [String] the transaction date
633
+ # - +:transfer_type+ [String] transaction type (e.g., "Lastschrift")
634
+ # - +:recipient+ [String] counterparty name
635
+ # - +:amount_raw+ [String] amount in German format
636
+ # @api private
637
def parse_booking_line_fast(line)
  # Cheap pre-filter: the first non-blank character has to be a digit.
  lead = get_stripped(line)[0]
  return nil unless lead&.between?('0', '9')

  scanner = StringScanner.new(line)
  scanner.skip(/\s*/)

  # "DD.MM.YYYY", then mandatory whitespace.
  booking_date = scanner.scan(/\d{2}\.\d{2}\.\d{4}/)
  return nil unless booking_date && scanner.skip(/\s+/)

  # Transfer type is a single whitespace-free token, again followed by whitespace.
  transfer_type = scanner.scan(/\S+/)
  return nil unless transfer_type && scanner.skip(/\s+/)

  # The recipient may contain spaces, so the amount is anchored to the end
  # of the line and everything in front of it becomes the recipient.
  tail = scanner.rest.rstrip
  trailing = tail.match(/(.*?)\s+([-+]?\d{1,3}(?:\.\d{3})*,\d{2})\s*\z/)
  return nil unless trailing

  {
    booking_date: booking_date,
    transfer_type: transfer_type,
    recipient: trailing[1].strip,
    amount_raw: trailing[2]
  }
end
677
+
678
+ # Legacy wrapper for booking line parsing.
679
+ # @param line [String] line to parse
680
+ # @return [Hash, nil] parsed booking or nil
681
+ # @api private
682
+ # @deprecated Use {#parse_booking_line_fast}
683
def parse_booking_line(line)
  # Deprecated name kept for older callers; forwards unchanged.
  booking = parse_booking_line_fast(line)
  booking
end
686
+
687
+ # Creates an initial statement hash from parsed booking data.
688
+ #
689
+ # Converts the raw booking components into the statement structure,
690
+ # including amount normalization and direction detection.
691
+ #
692
+ # @param booking [Hash] output from {#parse_booking_line_fast}
693
+ # @return [Hash] statement hash ready for narrative accumulation
694
+ # @api private
695
def build_statement_from_booking(booking)
  raw_amount = booking[:amount_raw]
  numeric = parse_amount(raw_amount)

  statement = {}
  # Copied verbatim from the parsed booking line.
  %i[booking_date transfer_type recipient].each { |key| statement[key] = booking[key] }

  # The amount is kept in its original German notation and, in addition,
  # as a plain decimal string and a direction label.
  statement[:amount_eur] = raw_amount
  statement[:amount_eur_numeric] = numeric.to_s('F')
  statement[:amount_direction] = amount_direction(numeric)

  # Filled in by the lines that follow the booking line.
  statement[:value_date] = nil
  statement[:narrative_lines] = []
  statement
end
708
+
709
+ # Converts German amount format to BigDecimal.
710
+ #
711
+ # German format uses dots as thousand separators and commas for decimals.
712
+ # "-1.234,56" becomes BigDecimal("-1234.56")
713
+ #
714
+ # @param raw [String] amount in German format (e.g., "-1.234,56")
715
+ # @return [BigDecimal] the numeric amount
716
+ # @api private
717
def parse_amount(raw)
  # Remove the thousands dots first, then turn the decimal comma into a dot.
  without_grouping = raw.delete('.')
  BigDecimal(without_grouping.tr(',', '.'))
end
720
+
721
+ # Determines the transaction direction from the amount sign.
722
+ #
723
+ # @param amount_numeric [BigDecimal] the parsed amount
724
+ # @return [String] "debit" for negative, "credit" for positive, "neutral" for zero
725
+ # @api private
726
def amount_direction(amount_numeric)
  # Sign of the amount decides the label; zero is neither debit nor credit.
  if amount_numeric.negative?
    'debit'
  elsif amount_numeric.positive?
    'credit'
  else
    'neutral'
  end
end
732
+
733
+ # Adds value date or narrative text to an in-progress statement.
734
+ #
735
+ # After a booking line, subsequent lines are either:
736
+ # - A value date (alone or with narrative)
737
+ # - Pure narrative text
738
+ # - Stray amount lines (logged as warnings)
739
+ #
740
+ # @param current [Hash] the in-progress statement
741
+ # @param line [String] the line to process
742
+ # @return [Hash] the updated statement
743
+ # @api private
744
def apply_value_or_narrative(current, line)
  # Attaches a continuation line to the open transaction and returns it.
  # Until a value date has been seen the line may be: a value date with
  # trailing text, a bare value date, a stray amount (warning only), or
  # narrative. Afterwards every line is narrative.
  stripped = get_stripped(line)

  unless current[:value_date].nil?
    current[:narrative_lines] << stripped
    return current
  end

  if (m = line.match(VALUE_DATE_WITH_TEXT_REGEX))
    current[:value_date] = m[1]
    initial = m[2].strip
    current[:narrative_lines] << initial unless initial.empty?
  elsif stripped.match?(VALUE_DATE_ONLY_REGEX)
    current[:value_date] = stripped
  elsif stripped.match?(/\A[-+]?\d{1,3}(?:\.\d{3})*,\d{2}\z/)
    # Only a line that consists of nothing but an amount counts as stray.
    # The check is anchored on purpose: an unanchored amount match would
    # also swallow narrative text that merely mentions an amount.
    @warnings << 'Encountered amount line without active statement'
  else
    current[:narrative_lines] << stripped
  end

  current
end
765
+
766
+ # Post-processes all parsed statements for final output.
767
+ #
768
+ # Converts accumulated narrative lines to a single string, extracts
769
+ # mandate/reference identifiers, detects Google Pay transactions, and
770
+ # filters out incomplete statements (missing value date).
771
+ #
772
+ # @param statements [Array<Hash>] raw statement hashes with narrative_lines
773
+ # @return [Array<Hash>] finalized statements ready for output
774
+ # @api private
775
def finalize_statements(statements)
  return [] if statements.empty?

  statements.each_with_object([]) do |entry, finalized|
    if entry[:value_date]
      # :narrative_lines is removed from the working hash and replaced by
      # the joined :narrative string in the output hash.
      narrative = build_narrative(entry.delete(:narrative_lines))
      record = entry.merge(narrative: narrative)

      # Optional keys are only added when a value was actually found.
      mandate_id, reference = extract_mandate_and_reference(narrative)
      record[:mandate_id] = mandate_id if mandate_id
      record[:reference] = reference if reference
      record[:google_pay] = true if narrative.match?(GOOGLE_PAY_REGEX)

      finalized << record
    else
      # Transactions without a value date are dropped, with a warning.
      @warnings << "Could not locate value date for statement: #{entry[:recipient]}"
    end
  end
end
796
+
797
+ # Joins narrative lines into a single space-separated string.
798
+ #
799
+ # Uses pre-allocated string capacity and avoids intermediate array
800
+ # creation for better memory efficiency with long narratives.
801
+ #
802
+ # @param lines [Array<String>] individual narrative lines
803
+ # @return [String] joined narrative text
804
+ # @api private
805
def build_narrative(lines)
  # Joins the non-empty narrative lines with single spaces.
  #
  # Array#join is used instead of appending to String.new(capacity: ...):
  # a String.new buffer starts out as ASCII-8BIT, so an all-ASCII narrative
  # used to come back binary-encoded while one containing umlauts came back
  # as UTF-8. With join the result carries the encoding of the input lines.
  #
  # Returns an empty string when there is nothing to join.
  parts = lines.reject(&:empty?)
  return '' if parts.empty?

  parts.join(' ')
end
824
+
825
+ # Extracts SEPA mandate ID and reference from narrative text.
826
+ #
827
+ # Searches for configured labels (e.g., "Mandat:", "Referenz:") in the
828
+ # narrative. Falls back to ARN pattern for card transactions.
829
+ #
830
+ # @param narrative [String] the joined narrative text
831
+ # @return [Array(String, String)] tuple of [mandate_id, reference], either may be nil
832
+ # @api private
833
def extract_mandate_and_reference(narrative)
  # Group 1 of each label regex is the raw identifier text, or nil if the
  # label does not occur.
  raw_mandate = narrative[@mandate_regex, 1]
  raw_reference = narrative[@reference_regex, 1]

  mandate = raw_mandate && sanitize_identifier(raw_mandate)
  # Without a labelled reference, fall back to an ARN found in the narrative.
  reference = raw_reference ? sanitize_identifier(raw_reference) : extract_arn_reference(narrative)

  [mandate, reference]
end
842
+
843
+ # Matches whitespace for identifier normalization.
844
# One or more whitespace characters; used to normalise identifiers.
WHITESPACE_REGEX = %r{\s+}
private_constant :WHITESPACE_REGEX

# Returns a copy of +value+ with every run of whitespace removed.
#
# @param value [String] identifier that may contain whitespace
# @return [String] identifier without whitespace
# @api private
def sanitize_identifier(value)
  compact = value.gsub(WHITESPACE_REGEX, EMPTY_STRING)
  compact
end
858
+
859
+ # Extracts ARN (Acquirer Reference Number) from narrative.
860
+ #
861
+ # ARN is used for card transactions and serves as a fallback reference
862
+ # when no SEPA reference is present.
863
+ #
864
+ # @param narrative [String] the narrative text to search
865
+ # @return [String, nil] the ARN if found, nil otherwise
866
+ # @api private
867
def extract_arn_reference(narrative)
  # First ARN-shaped token in the narrative, or nil when there is none.
  narrative.slice(ARN_REGEX)
end
870
+
871
+ # Patterns matching ING statement page headers and footers.
872
+ #
873
+ # These lines appear on each page of multi-page statements and should be
874
+ # skipped during parsing. Ordered by likelihood for faster early exit.
875
+ #
876
+ # Patterns match:
877
+ # - "Seite X von Y" (page numbers)
878
+ # - "Valuta" (column header)
879
+ # - "Girokonto Nummer ..." (account header)
880
+ # - "Kontoauszug ..." (statement header)
881
+ # - "Datum DD.MM.YYYY" (date header)
882
+ # - "Buchung / Verwendungszweck" (column headers)
883
+ # - "Betrag (EUR)" (amount column header)
884
# Patterns for repeated page header/footer lines, applied to stripped text.
# Every pattern is anchored at the start of the line, which is what makes
# the first-character pre-filter in page_break_line_fast? possible.
PAGE_BREAK_PATTERNS = [
  /\ASeite\b/,                                      # "Seite ..." page counter
  /\AValuta\z/,                                     # "Valuta" on its own
  /\AGirokonto Nummer\b/,                           # account header
  /\AKontoauszug\b/,                                # statement header
  /\ADatum\b.*\d{2}\.\d{2}\.\d{4}\z/,               # "Datum ... DD.MM.YYYY"
  %r{\ABuchung\s+Buchung\s*/\s*Verwendungszweck\b}, # column header row
  /\ABetrag \(EUR\)\z/                              # amount column header
].freeze
private_constant :PAGE_BREAK_PATTERNS

# Deprecated wrapper around page_break_line_fast?.
#
# @param line [String] line to check
# @return [Boolean] true if the line is blank or a page header/footer
# @api private
# @deprecated Use {#page_break_line_fast?}
def page_break_line?(line)
  # Clear the one-slot strip cache first so the answer is computed for
  # +line+ and not for whatever line was processed before.
  @line_cache = {}
  page_break_line_fast?(line)
end
904
+ end
905
+ end