ing_kontoauszug_parser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +19 -0
- data/LICENSE +21 -0
- data/README.md +159 -0
- data/bin/console +8 -0
- data/bin/pdf_to_json +64 -0
- data/bin/setup +5 -0
- data/lib/ing_kontoauszug_parser/header.rb +146 -0
- data/lib/ing_kontoauszug_parser/pdf_extractor.rb +233 -0
- data/lib/ing_kontoauszug_parser/statement_parser.rb +269 -0
- data/lib/ing_kontoauszug_parser/text_parser.rb +905 -0
- data/lib/ing_kontoauszug_parser/version.rb +14 -0
- data/lib/ing_kontoauszug_parser.rb +105 -0
- metadata +74 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: b08a3098282dadebc68193749586e1ca664548d062dd022f4db358f21b0776a5
|
|
4
|
+
data.tar.gz: 9c107cf1bc1080515e0440401c0b6c7eceb5abe6db6d521f5424103c20579cf2
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 5f36aa98eaf89d07fe86b6f8f5dc147092b9b495894ef4397c224191b9a1b508e97fbf9a97e3e1daae417eaadc0e73ed1068f00f12f9c0e22925548da68af9e1
|
|
7
|
+
data.tar.gz: 3b059deaba3f60f49d66c5b5db6da84fee6b568ed3cb7149cc109186497dad2d9ef773442ef0ef58188ead16f5342a6773f1debd72f11fd2555e1e85707d296b
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
## [Unreleased]
|
|
6
|
+
|
|
7
|
+
## [0.1.0] - 2026-01-21
|
|
8
|
+
|
|
9
|
+
### Added
|
|
10
|
+
- Initial release of `IngKontoauszugParser` gem.
|
|
11
|
+
- `StatementParser` class to parse ING Bank (Germany) statement PDFs into structured JSON.
|
|
12
|
+
- PDF text extraction via poppler (fast) or pdf-reader (portable).
|
|
13
|
+
- IBAN extraction and validation using ISO 13616 checksum.
|
|
14
|
+
- Transaction parsing with dates, amounts, recipients, and narratives.
|
|
15
|
+
- SEPA metadata extraction (`mandate_id`, `reference`) with ARN fallback.
|
|
16
|
+
- Google Pay detection in transaction narratives.
|
|
17
|
+
- CLI tool `bin/pdf_to_json` for direct PDF-to-JSON conversion.
|
|
18
|
+
- Interactive console via `bin/console`.
|
|
19
|
+
- Comprehensive test suite with Minitest.
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Go
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
# ING Kontoauszug Parser
|
|
2
|
+
|
|
3
|
+
A Ruby gem for parsing ING Bank (Germany) statement PDFs and text exports into structured JSON.
|
|
4
|
+
|
|
5
|
+
## Why This Gem?
|
|
6
|
+
|
|
7
|
+
ING Germany provides account statements as PDFs, but extracting transaction data programmatically is challenging:
|
|
8
|
+
|
|
9
|
+
- **PDF text extraction is messy** – Column alignment, page breaks, and OCR artifacts make raw text unreliable
|
|
10
|
+
- **No official export API** – ING doesn't provide a machine-readable format like CSV or OFX
|
|
11
|
+
- **Manual categorization is tedious** – Hundreds of transactions need consistent tagging for budgeting
|
|
12
|
+
|
|
13
|
+
This gem solves these problems by:
|
|
14
|
+
|
|
15
|
+
1. **Extracting clean text** from statement PDFs using poppler (fast) or pdf-reader (portable)
|
|
16
|
+
2. **Parsing transactions** into structured data with dates, amounts, recipients, and narratives
|
|
17
|
+
3. **Extracting SEPA metadata** like mandate IDs and references for reconciliation
|
|
18
|
+
4. **Validating IBANs** using the ISO 13616 checksum algorithm
|
|
19
|
+
5. **Detecting payment methods** like Google Pay for automatic categorization
|
|
20
|
+
|
|
21
|
+
Use it to build budgeting tools, import transactions into accounting software, or automate expense tracking.
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
```ruby
|
|
26
|
+
# Gemfile
|
|
27
|
+
gem 'ing_kontoauszug_parser'
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
bundle install
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Quick Start
|
|
35
|
+
|
|
36
|
+
```ruby
|
|
37
|
+
require 'ing_kontoauszug_parser'
|
|
38
|
+
|
|
39
|
+
parser = IngKontoauszugParser::StatementParser.new
|
|
40
|
+
result = parser.parse(file_path: 'statement.pdf')
|
|
41
|
+
|
|
42
|
+
result[:header][:iban] # => "DE89 3704 0044 0532 0130 00"
|
|
43
|
+
result[:statements].first[:amount_eur_numeric] # => "-31.49"
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## API
|
|
47
|
+
|
|
48
|
+
```ruby
|
|
49
|
+
parser = IngKontoauszugParser::StatementParser.new
|
|
50
|
+
|
|
51
|
+
# Parse PDF file
|
|
52
|
+
result = parser.parse(file_path: 'statement.pdf')
|
|
53
|
+
|
|
54
|
+
# Parse text export
|
|
55
|
+
result = parser.parse_text(File.read('export.txt'))
|
|
56
|
+
|
|
57
|
+
# Parse pre-split lines
|
|
58
|
+
result = parser.parse_lines(lines)
|
|
59
|
+
|
|
60
|
+
# Parse booking lines only (no header)
|
|
61
|
+
statements = parser.parse_statement_lines(lines)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Output Format
|
|
65
|
+
|
|
66
|
+
```json
|
|
67
|
+
{
|
|
68
|
+
"header": {
|
|
69
|
+
"iban": "DE89 3704 0044 0532 0130 00"
|
|
70
|
+
},
|
|
71
|
+
"statements": [
|
|
72
|
+
{
|
|
73
|
+
"booking_date": "01.08.2025",
|
|
74
|
+
"transfer_type": "Lastschrift",
|
|
75
|
+
"recipient": "Allianz Direct Vers.",
|
|
76
|
+
"amount_eur": "-31,49",
|
|
77
|
+
"amount_eur_numeric": "-31.49",
|
|
78
|
+
"amount_direction": "debit",
|
|
79
|
+
"value_date": "01.08.2025",
|
|
80
|
+
"narrative": "Versicherungsbeitrag August",
|
|
81
|
+
"mandate_id": "MA123456",
|
|
82
|
+
"reference": "RF123456789",
|
|
83
|
+
"google_pay": true
|
|
84
|
+
}
|
|
85
|
+
]
|
|
86
|
+
}
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Statement Fields
|
|
90
|
+
|
|
91
|
+
| Field | Type | Description |
|
|
92
|
+
|-------|------|-------------|
|
|
93
|
+
| `booking_date` | string | Booking date (dd.mm.yyyy) |
|
|
94
|
+
| `transfer_type` | string | Transaction type (e.g., Lastschrift, Gutschrift) |
|
|
95
|
+
| `recipient` | string | Counterparty name |
|
|
96
|
+
| `amount_eur` | string | Original amount with German formatting |
|
|
97
|
+
| `amount_eur_numeric` | string | Normalized decimal (BigDecimal as string) |
|
|
98
|
+
| `amount_direction` | string | `debit`, `credit`, or `neutral` |
|
|
99
|
+
| `value_date` | string | Value date (dd.mm.yyyy) |
|
|
100
|
+
| `narrative` | string | Transaction details |
|
|
101
|
+
| `mandate_id` | string? | SEPA mandate ID (if present) |
|
|
102
|
+
| `reference` | string? | SEPA reference or ARN (if present) |
|
|
103
|
+
| `google_pay` | boolean? | `true` if Google Pay detected |
|
|
104
|
+
|
|
105
|
+
## CLI Tools
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
# Parse PDF statement to JSON
|
|
109
|
+
bin/pdf_to_json statement.pdf
|
|
110
|
+
|
|
111
|
+
# Write to file instead of stdout
|
|
112
|
+
bin/pdf_to_json -o output.json statement.pdf
|
|
113
|
+
|
|
114
|
+
# Interactive console
|
|
115
|
+
bin/console
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Error Handling
|
|
119
|
+
|
|
120
|
+
PDF reader errors are wrapped as `IngKontoauszugParser::Error`:
|
|
121
|
+
|
|
122
|
+
```ruby
|
|
123
|
+
begin
|
|
124
|
+
parser.parse(file_path: 'encrypted.pdf')
|
|
125
|
+
rescue IngKontoauszugParser::Error => e
|
|
126
|
+
puts e.message
|
|
127
|
+
end
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Non-fatal issues (e.g., statements missing value dates) are collected in an optional `warnings` array:
|
|
131
|
+
|
|
132
|
+
```ruby
|
|
133
|
+
result = parser.parse_text(text)
|
|
134
|
+
result[:warnings]&.each { |w| puts "Warning: #{w}" }
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Development
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
bin/setup # Install dependencies
|
|
141
|
+
rake test # Run tests
|
|
142
|
+
rake rubocop # Lint
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## Requirements
|
|
146
|
+
|
|
147
|
+
- Ruby 3.0+
|
|
148
|
+
- pdf-reader gem
|
|
149
|
+
- Input files must be UTF-8 encoded
|
|
150
|
+
|
|
151
|
+
## Documentation
|
|
152
|
+
|
|
153
|
+
- [Feature Summary](docs/FEATURE_SUMMARY.md)
|
|
154
|
+
- [Technical Summary](docs/TECHNICAL_SUMMARY.md)
|
|
155
|
+
- [Changelog](CHANGELOG.md)
|
|
156
|
+
|
|
157
|
+
## Contributing
|
|
158
|
+
|
|
159
|
+
Bug reports and pull requests are welcome on GitHub.
|
data/bin/console
ADDED
data/bin/pdf_to_json
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require 'json'
|
|
5
|
+
require 'optparse'
|
|
6
|
+
require 'pdf/reader'
|
|
7
|
+
|
|
8
|
+
$LOAD_PATH.unshift(File.expand_path('../lib', __dir__)) unless $LOAD_PATH.include?(File.expand_path('../lib', __dir__))
|
|
9
|
+
require 'ing_kontoauszug_parser'
|
|
10
|
+
|
|
11
|
+
# Command-line options collected by the OptionParser below.
options = {}
parser = OptionParser.new do |opts|
  opts.banner = 'Usage: pdf_to_json [options] FILE'
  opts.separator ''
  opts.separator 'Parse an ING statement PDF directly into JSON.'
  opts.separator ''

  opts.on('-o', '--output FILE', 'Write JSON to FILE instead of STDOUT') do |file|
    options[:output] = file
  end

  opts.on('-h', '--help', 'Show this help message') do
    puts opts
    exit
  end

  opts.on('-v', '--version', 'Show version') do
    puts "ing_kontoauszug_parser #{IngKontoauszugParser::VERSION}"
    exit
  end
end

# An unknown flag or a missing option argument raises OptionParser::ParseError;
# report it together with the usage text instead of dumping a backtrace.
begin
  parser.parse!
rescue OptionParser::ParseError => e
  warn e.message
  warn parser
  exit 1
end

# parse! removes the recognised options from ARGV, leaving only positional arguments.
if ARGV.empty?
  warn parser
  exit 1
end

pdf_path = ARGV.first
unless File.exist?(pdf_path)
  warn "File not found: #{pdf_path}"
  exit 1
end

# Page text is extracted inside the same rescue scope as opening the file, so
# PDF errors raised while reading individual pages are reported the same way
# as errors raised while opening the document.
begin
  reader = PDF::Reader.new(pdf_path)
  full_text = reader.pages.map { |page| page.text.to_s }.join("\n")
rescue PDF::Reader::MalformedPDFError, PDF::Reader::EncryptedPDFError => e
  warn "Could not open PDF: #{e.message}"
  exit 1
end

# NOTE(review): assumes the parser's own failures (missing header, invalid
# IBAN, ...) descend from IngKontoauszugParser::Error -- confirm against the
# library's error hierarchy. Anything else still propagates unchanged.
begin
  statement_parser = IngKontoauszugParser::StatementParser.new
  result = statement_parser.parse_text(full_text)
rescue IngKontoauszugParser::Error => e
  warn "Could not parse statement: #{e.message}"
  exit 1
end

json_output = JSON.pretty_generate(result)

# Write to the requested file, or fall back to standard output.
if options[:output]
  File.write(options[:output], json_output)
else
  puts json_output
end
|
data/bin/setup
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module IngKontoauszugParser
  # Extracts and validates IBANs from bank statement text.
  #
  # This module handles the extraction of International Bank Account Numbers
  # from ING statement headers and validates them using the ISO 13616 mod-97
  # checksum algorithm.
  #
  # == IBAN Format
  #
  # IBANs consist of:
  # - 2-letter country code (e.g., "DE" for Germany)
  # - 2 check digits
  # - Up to 30 alphanumeric characters (Basic Bank Account Number - BBAN)
  #
  # Example: "DE89 3704 0044 0532 0130 00"
  #
  # == Validation Algorithm
  #
  # The ISO 13616 mod-97 algorithm:
  # 1. Move the first 4 characters to the end
  # 2. Convert letters to numbers (A=10, B=11, ..., Z=35)
  # 3. Calculate modulo 97 of the resulting number
  # 4. Valid if remainder equals 1
  #
  # @example Extract and validate IBAN from statement text
  #   iban = Header.extract_iban(statement_text)
  #   #=> "DE89 3704 0044 0532 0130 00"
  #
  # @example Validate an IBAN directly
  #   Header.valid_iban?("DE89 3704 0044 0532 0130 00") #=> true
  #   Header.valid_iban?("DE00 0000 0000 0000 0000 00") #=> false
  #
  # @api private
  # @see https://en.wikipedia.org/wiki/International_Bank_Account_Number IBAN specification
  module Header
    # Pattern to match IBAN lines in ING statement format.
    # Captures the IBAN portion after the "IBAN" label.
    IBAN_REGEX = /IBAN\s+([A-Z0-9 ]{10,})/
    private_constant :IBAN_REGEX

    # A whitespace-stripped, upcased IBAN may only consist of these characters.
    # Checked before the checksum because String#to_i stops at the first
    # non-digit and would otherwise compute the remainder over a prefix only.
    IBAN_CHARSET = /\A[A-Z0-9]+\z/
    private_constant :IBAN_CHARSET

    module_function

    # Finds and extracts the IBAN from statement text.
    #
    # Collects every line containing "IBAN" and returns the value from the
    # first of those lines that matches the IBAN pattern, so a label-only line
    # (e.g. a column heading) ahead of the real IBAN line does not abort the
    # extraction. By default, the extracted IBAN is validated using the mod-97
    # checksum algorithm.
    #
    # @param text [String] statement text (typically the header section or full statement)
    # @param validate [Boolean] whether to verify the IBAN checksum. Set to false
    #   for faster extraction when validation is handled elsewhere.
    # @return [String] the extracted IBAN with original spacing preserved
    # @raise [IngKontoauszugParser::HeaderNotFound] if no line mentions "IBAN",
    #   or if none of the lines that do carries an IBAN value
    # @raise [IngKontoauszugParser::InvalidIBAN] if validation is enabled and the
    #   checksum verification fails
    #
    # @example Extract with validation (default)
    #   Header.extract_iban("IBAN DE89 3704 0044 0532 0130 00\n...")
    #   #=> "DE89 3704 0044 0532 0130 00"
    #
    # @example Extract without validation
    #   Header.extract_iban(text, validate: false)
    def extract_iban(text, validate: true)
      candidates = text.to_s.each_line.select { |line| line.include?('IBAN') }
      if candidates.empty?
        raise IngKontoauszugParser::HeaderNotFound,
              'IBAN line not found in statement text'
      end

      # Lazy so that matching stops at the first line that yields a value.
      match = candidates.lazy.filter_map { |line| line.match(IBAN_REGEX) }.first
      unless match
        raise IngKontoauszugParser::HeaderNotFound,
              'IBAN value not found in statement text'
      end

      iban = match[1].strip
      validate_iban!(iban) if validate
      iban
    end

    # Validates an IBAN using the ISO 13616 mod-97 checksum algorithm.
    #
    # The algorithm rearranges the IBAN (moving country code and check digits
    # to the end), converts letters to numbers, and verifies that the resulting
    # number modulo 97 equals 1.
    #
    # @param iban [String] the IBAN to validate. Spaces are allowed and will be
    #   stripped before validation. Case-insensitive.
    # @return [true] if the IBAN is valid, or if it has fewer than 5 characters
    #   after stripping whitespace (too short to validate; accepted as-is)
    # @raise [IngKontoauszugParser::InvalidIBAN] if the IBAN contains characters
    #   other than letters and digits, or if the checksum fails (the message then
    #   contains the actual remainder for debugging)
    #
    # @example Validate a correct IBAN
    #   Header.validate_iban!("DE89 3704 0044 0532 0130 00") #=> true
    #
    # @example Invalid IBAN raises exception
    #   Header.validate_iban!("DE00 0000 0000 0000 0000 00")
    #   #=> raises InvalidIBAN: "IBAN checksum validation failed..."
    def validate_iban!(iban)
      normalized = iban.gsub(/\s/, '').upcase
      return true if normalized.length < 5 # Too short to validate meaningfully

      # Reject stray characters up front: "DE001!" would otherwise reduce to
      # "1!131400".to_i == 1 and pass the mod-97 check by accident.
      unless normalized.match?(IBAN_CHARSET)
        raise IngKontoauszugParser::InvalidIBAN,
              "IBAN contains invalid characters: #{iban}"
      end

      # Move first 4 characters (country code + check digits) to the end
      rearranged = normalized[4..] + normalized[0, 4]

      # Convert letters to numbers (A=10, B=11, ..., Z=35); digits stay as they are
      numeric_string = rearranged.gsub(/[A-Z]/) { |letter| (letter.ord - 'A'.ord + 10).to_s }

      # Perform mod-97 check (Ruby integers are arbitrary precision)
      remainder = numeric_string.to_i % 97
      return true if remainder == 1

      raise IngKontoauszugParser::InvalidIBAN,
            "IBAN checksum validation failed for #{iban} (remainder: #{remainder}, expected: 1)"
    end

    # Checks if an IBAN is valid without raising an exception.
    #
    # This is a convenience wrapper around {#validate_iban!} that returns
    # a boolean instead of raising an exception on invalid IBANs.
    #
    # @param iban [String] the IBAN to validate
    # @return [Boolean] true if the IBAN passes validation, false otherwise
    #
    # @example Check validity
    #   if Header.valid_iban?(user_input)
    #     process_payment(user_input)
    #   else
    #     show_error("Invalid IBAN")
    #   end
    def valid_iban?(iban)
      validate_iban!(iban)
      true
    rescue InvalidIBAN
      false
    end
  end
end
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'English'
|
|
4
|
+
|
|
5
|
+
module IngKontoauszugParser
  # Extracts text from PDF statement files for subsequent parsing.
  #
  # This class provides a unified interface to PDF text extraction with automatic
  # backend selection. It supports two extraction methods:
  #
  # == Poppler Backend (Recommended)
  #
  # Uses the +pdftotext+ command-line utility from poppler-utils.
  # Install via your package manager:
  #
  #   # Debian/Ubuntu
  #   apt-get install poppler-utils
  #
  #   # macOS
  #   brew install poppler
  #
  # == PDF::Reader Backend (Fallback)
  #
  # Uses the pdf-reader gem for pure Ruby extraction. Automatically selected
  # when poppler is not available. Supports threaded page processing for
  # multi-page statements.
  #
  # == Backend Selection
  #
  # By default the backend is selected automatically:
  # 1. Poppler if +pdftotext+ is found in PATH
  # 2. PDF::Reader with threaded processing otherwise
  #
  # You can force a specific backend via the +:backend+ parameter.
  #
  # @example Automatic backend selection
  #   lines = PdfExtractor.extract_lines('statement.pdf')
  #
  # @example Force specific backend
  #   lines = PdfExtractor.extract_lines('statement.pdf', backend: :pdf_reader)
  #
  # @api private
  # @see StatementParser#parse The public API that uses this class
  class PdfExtractor
    # open3 runs pdftotext without a shell and keeps stdout/stderr apart.
    # shellwords stays loaded for any code that relies on this file requiring it.
    require 'open3'
    require 'shellwords'

    # Minimum page count before threaded processing is used with PDF::Reader.
    # Below this threshold pages are processed sequentially.
    # @api private
    PARALLEL_THRESHOLD = 4
    private_constant :PARALLEL_THRESHOLD

    # Checks whether the poppler +pdftotext+ utility is installed.
    #
    # The result is cached after the first call.
    #
    # @return [Boolean] true if +pdftotext+ is found in PATH
    def self.poppler_available?
      return @poppler_available if defined?(@poppler_available)

      @poppler_available = system('which pdftotext > /dev/null 2>&1')
    end

    # Extracts text lines from a PDF using the best available method.
    #
    # This is the primary entry point for PDF extraction. It returns an array
    # of text lines ready for parsing.
    #
    # @param file_path [String] absolute or relative path to the PDF file
    # @param backend [Symbol, nil] force a specific backend:
    #   - +:poppler+ - use pdftotext (fails if not installed)
    #   - +:pdf_reader+ - use pdf-reader gem
    #   - +nil+ - auto-select (poppler when available) (default)
    # @param parallel [Boolean] enable threaded page processing for pdf-reader.
    #   Has no effect when using poppler backend. Default: true
    # @return [Array<String>] text lines with trailing whitespace stripped
    # @raise [IngKontoauszugParser::Error] if PDF extraction fails
    # @raise [ArgumentError] if an unknown backend is specified
    #
    # @example Extract with automatic backend selection
    #   lines = PdfExtractor.extract_lines('/path/to/statement.pdf')
    #
    # @example Force pdf-reader without parallelization
    #   lines = PdfExtractor.extract_lines(path, backend: :pdf_reader, parallel: false)
    def self.extract_lines(file_path, backend: nil, parallel: true)
      backend ||= poppler_available? ? :poppler : :pdf_reader

      case backend
      when :poppler
        extract_with_poppler(file_path)
      when :pdf_reader
        parallel ? extract_with_pdf_reader_parallel(file_path) : extract_with_pdf_reader(file_path)
      else
        raise ArgumentError, "Unknown PDF backend: #{backend}"
      end
    end

    # Extracts text using the poppler +pdftotext+ command-line utility.
    #
    # Passes +-layout+ (preserve text positioning, needed for the columnar
    # statement format) and +-nopgbrk+ (no page break characters). The command
    # is spawned with an argument vector, so the path needs no shell escaping.
    # stderr is captured separately: warnings pdftotext prints for a PDF it
    # nevertheless converts must not end up among the extracted text lines.
    #
    # @param file_path [String] path to the PDF file
    # @return [Array<String>] extracted text lines
    # @raise [IngKontoauszugParser::Error] if pdftotext cannot be started or
    #   returns a non-zero exit code
    def self.extract_with_poppler(file_path)
      stdout, stderr, status = Open3.capture3('pdftotext', '-layout', '-nopgbrk', file_path.to_s, '-')

      unless status.success?
        details = stderr.strip.empty? ? stdout : stderr
        raise IngKontoauszugParser::Error, "pdftotext failed: #{details}"
      end

      stdout.each_line.map(&:rstrip)
    rescue SystemCallError => e
      # Without a shell in between, a missing binary surfaces as Errno::ENOENT.
      raise IngKontoauszugParser::Error, "pdftotext failed: #{e.message}"
    end

    # Extracts text using the pdf-reader gem with optional threading.
    #
    # Documents with at least PARALLEL_THRESHOLD pages are processed with one
    # thread per page; smaller documents are processed sequentially.
    #
    # @param file_path [String] path to the PDF file
    # @return [Array<String>] extracted text lines in page order
    # @raise [IngKontoauszugParser::Error] if pdf-reader encounters a PDF error
    def self.extract_with_pdf_reader_parallel(file_path)
      wrap_pdf_reader_errors do
        pages = text_pages(file_path)
        if pages.length < PARALLEL_THRESHOLD
          extract_pages_sequential(pages)
        else
          extract_pages_parallel(pages)
        end
      end
    end

    # Extracts text using the pdf-reader gem, processing pages one at a time.
    #
    # @param file_path [String] path to the PDF file
    # @return [Array<String>] extracted text lines in page order
    # @raise [IngKontoauszugParser::Error] if pdf-reader encounters a PDF error
    def self.extract_with_pdf_reader(file_path)
      wrap_pdf_reader_errors { extract_pages_sequential(text_pages(file_path)) }
    end

    # Determines if an error originated from the pdf-reader gem.
    #
    # Used to wrap pdf-reader exceptions in {IngKontoauszugParser::Error} for
    # consistent error handling in the public API.
    #
    # @param error [StandardError] the caught exception
    # @return [Boolean] true if error class is from PDF::Reader namespace
    # @api private
    def self.pdf_reader_error?(error)
      # Anonymous exception classes have a nil name; to_s keeps this predicate
      # from raising NoMethodError and hiding the error being classified.
      error.class.name.to_s.start_with?('PDF::Reader::')
    end

    # Extracts text from pages one at a time.
    #
    # @param pages [Array<#text>] pages to process
    # @return [Array<String>] concatenated text lines from all pages
    # @api private
    def self.extract_pages_sequential(pages)
      pages.flat_map { |page| page_lines(page) }
    end

    # Extracts text from pages concurrently using threads.
    #
    # Spawns one thread per page. Thread#value is collected in the order the
    # threads were created, so the result keeps page order.
    #
    # @param pages [Array<#text>] pages to process
    # @return [Array<String>] concatenated text lines in page order
    # @api private
    def self.extract_pages_parallel(pages)
      workers = pages.map do |page|
        Thread.new do
          # Thread#value re-raises in the caller; skip the extra stderr report.
          Thread.current.report_on_exception = false
          page_lines(page)
        end
      end

      workers.flat_map(&:value)
    end

    # Opens the PDF and returns the pages that can produce text.
    #
    # @param file_path [String] path to the PDF file
    # @return [Array<#text>] pages responding to +text+
    # @api private
    def self.text_pages(file_path)
      require 'pdf-reader' # loaded lazily so the poppler backend works without it

      PDF::Reader.new(file_path).pages.select { |page| page.respond_to?(:text) }
    end
    private_class_method :text_pages

    # Splits one page's text into lines with trailing whitespace removed.
    #
    # @param page [#text] a page object
    # @return [Array<String>] the page's lines
    # @api private
    def self.page_lines(page)
      page.text.to_s.each_line.map(&:rstrip)
    end
    private_class_method :page_lines

    # Runs the block, converting pdf-reader exceptions into
    # {IngKontoauszugParser::Error}; every other exception is re-raised as is.
    #
    # @yield the extraction work
    # @return [Object] the block's result
    # @api private
    def self.wrap_pdf_reader_errors
      yield
    rescue StandardError => e
      raise IngKontoauszugParser::Error, e.message if pdf_reader_error?(e)

      raise
    end
    private_class_method :wrap_pdf_reader_errors
  end
end
|