bankstatementparser 0.0.4__tar.gz → 0.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/PKG-INFO +132 -19
- {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/README.md +122 -16
- {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/__init__.py +3 -2
- {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/bank_statement_parsers.py +1 -1
- {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/base_parser.py +1 -1
- {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/camt_parser.py +2 -1
- {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/cli.py +365 -6
- bankstatementparser-0.0.7/bankstatementparser/enrichment/__init__.py +61 -0
- bankstatementparser-0.0.7/bankstatementparser/enrichment/categorizer.py +485 -0
- {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/exceptions.py +1 -1
- bankstatementparser-0.0.7/bankstatementparser/hybrid/__init__.py +64 -0
- bankstatementparser-0.0.7/bankstatementparser/hybrid/llm_extractor.py +356 -0
- bankstatementparser-0.0.7/bankstatementparser/hybrid/ollama_direct.py +219 -0
- bankstatementparser-0.0.7/bankstatementparser/hybrid/orchestrator.py +497 -0
- bankstatementparser-0.0.7/bankstatementparser/hybrid/pdf_text.py +113 -0
- bankstatementparser-0.0.7/bankstatementparser/hybrid/prompts.py +69 -0
- bankstatementparser-0.0.7/bankstatementparser/hybrid/verification.py +133 -0
- bankstatementparser-0.0.7/bankstatementparser/hybrid/vision.py +544 -0
- {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/input_validator.py +7 -1
- {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/pain001_parser.py +1 -1
- {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/parallel.py +1 -1
- {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/record_types.py +1 -1
- {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/transaction_deduplicator.py +37 -1
- {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/transaction_models.py +126 -5
- {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/pyproject.toml +25 -5
- {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/LICENSE +0 -0
- {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/additional_parsers.py +0 -0
- {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/zip_security.py +0 -0
|
@@ -1,68 +1,122 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bankstatementparser
|
|
3
|
-
Version: 0.0.4
|
|
3
|
+
Version: 0.0.7
|
|
4
4
|
Summary: BankStatementParser is your essential tool for easy bank statement management. Designed with finance and treasury experts in mind, it offers a simple way to handle CAMT (ISO 20022) formats and more. Get quick, accurate insights from your financial data and spend less time on processing. It's the smart, hassle-free way to stay on top of your transactions.
|
|
5
5
|
License: Apache Software License
|
|
6
6
|
License-File: LICENSE
|
|
7
7
|
Author: Sebastien Rousseau
|
|
8
8
|
Author-email: sebastian.rousseau@gmail.com
|
|
9
|
-
Requires-Python: >=3.
|
|
9
|
+
Requires-Python: >=3.10,<4.0
|
|
10
10
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
11
|
Classifier: Programming Language :: Python :: 3
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
13
12
|
Classifier: Programming Language :: Python :: 3.10
|
|
14
13
|
Classifier: Programming Language :: Python :: 3.11
|
|
15
14
|
Classifier: Programming Language :: Python :: 3.12
|
|
16
15
|
Classifier: Programming Language :: Python :: 3.13
|
|
17
16
|
Classifier: Programming Language :: Python :: 3.14
|
|
17
|
+
Provides-Extra: enrichment
|
|
18
|
+
Provides-Extra: hybrid
|
|
19
|
+
Provides-Extra: hybrid-plus
|
|
20
|
+
Provides-Extra: hybrid-vision
|
|
18
21
|
Provides-Extra: polars
|
|
19
22
|
Requires-Dist: defusedxml (>=0.7.1,<0.8.0)
|
|
23
|
+
Requires-Dist: litellm (>=1.83.0) ; extra == "hybrid" or extra == "hybrid-plus" or extra == "hybrid-vision" or extra == "enrichment"
|
|
20
24
|
Requires-Dist: lxml (>=4.9.3)
|
|
21
25
|
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
|
22
26
|
Requires-Dist: pandas (>=2.3.3,<3.0.0)
|
|
27
|
+
Requires-Dist: pdfplumber (>=0.11.0) ; extra == "hybrid-plus"
|
|
23
28
|
Requires-Dist: polars (>=1.32.0,<2.0.0) ; extra == "polars"
|
|
24
29
|
Requires-Dist: pydantic (>=2.11.0,<3.0.0)
|
|
30
|
+
Requires-Dist: pypdf (>=4.0.0) ; extra == "hybrid" or extra == "hybrid-plus" or extra == "hybrid-vision"
|
|
31
|
+
Requires-Dist: pypdfium2 (>=4.30.0) ; extra == "hybrid-vision"
|
|
25
32
|
Project-URL: Homepage, https://bankstatementparser.com
|
|
26
33
|
Project-URL: Repository, https://github.com/sebastienrousseau/bankstatementparser
|
|
27
34
|
Description-Content-Type: text/markdown
|
|
28
35
|
|
|
29
36
|
# Bank Statement Parser
|
|
30
37
|
|
|
31
|
-
Parse bank statements across six formats
|
|
38
|
+
Parse bank statements across **six structured formats** (CAMT, PAIN.001, CSV, OFX, QFX, MT940) **and PDFs** — both digital and scanned — into a single unified `Transaction` model. Structured files take the deterministic path; PDFs fall through to a configurable LLM (Ollama by default, or any LiteLLM-supported provider) and finally to a multimodal vision model for scanned/photocopied statements.
|
|
32
39
|
|
|
33
|
-
Built for finance teams, treasury analysts, and fintech developers who need reliable, auditable extraction
|
|
40
|
+
Built for finance teams, treasury analysts, and fintech developers who need reliable, auditable extraction across the full spectrum of bank statement formats — without sending data to external services unless they explicitly opt in.
|
|
34
41
|
|
|
35
|
-
[](https://pypi.org/project/bankstatementparser/)
|
|
42
|
+
[](https://pypi.org/project/bankstatementparser/)
|
|
36
43
|
[](https://pypi.org/project/bankstatementparser/)
|
|
37
44
|
[](https://codecov.io/github/sebastienrousseau/bankstatementparser?branch=main)
|
|
38
45
|
[](LICENSE)
|
|
39
46
|
|
|
47
|
+
## How it works
|
|
48
|
+
|
|
49
|
+
`smart_ingest()` routes any input file through the cheapest viable extraction path. Deterministic parsers always run first ($0 cost). Text and vision LLMs are fallbacks for unstandardized PDFs — both are opt-in via separate install extras and can be swapped between any LiteLLM-supported provider (Ollama, Anthropic, OpenAI, Gemini, …).
|
|
50
|
+
|
|
51
|
+
```mermaid
|
|
52
|
+
flowchart TD
|
|
53
|
+
A["smart_ingest(path)"] --> B{detect_statement_format}
|
|
54
|
+
B -- CAMT/PAIN/OFX/MT940/CSV --> C[Path A: deterministic parser<br/>$0, fastest]
|
|
55
|
+
C --> Z[IngestResult<br/>source_method='deterministic']
|
|
56
|
+
|
|
57
|
+
B -- pdf or unknown --> D[pypdf extract_text]
|
|
58
|
+
D --> E{text len >= 50?}
|
|
59
|
+
|
|
60
|
+
E -- yes --> F[Path B: text-LLM<br/>default ollama/llama3]
|
|
61
|
+
F --> Y[IngestResult<br/>source_method='llm']
|
|
62
|
+
|
|
63
|
+
E -- no --> G[Path C: vision-LLM<br/>opt-in via BSP_HYBRID_VISION_MODEL]
|
|
64
|
+
G --> X[IngestResult<br/>source_method='vision']
|
|
65
|
+
|
|
66
|
+
Z --> V[verify_balance<br/>Golden Rule]
|
|
67
|
+
Y --> V
|
|
68
|
+
X --> V
|
|
69
|
+
V --> R[VERIFIED / DISCREPANCY / FAILED]
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Every extracted row carries an immutable `transaction_hash`, an audit-trail `source_method` tag, and (for LLM rows) a `confidence` score — see [Hybrid extraction](#hybrid-extraction-pdfs-included-v005) below for the full surface.
|
|
73
|
+
|
|
40
74
|
## Key Features
|
|
41
75
|
|
|
42
76
|
| Feature | Description |
|
|
43
77
|
|---|---|
|
|
44
|
-
| **6 formats** | CAMT.053, PAIN.001, CSV, OFX, QFX, MT940 |
|
|
78
|
+
| **6 structured formats** | CAMT.053, PAIN.001, CSV, OFX, QFX, MT940 |
|
|
79
|
+
| **Hybrid PDF pipeline** *(v0.0.5)* | `smart_ingest()` routes digital PDFs through a text-LLM and scanned PDFs through a multimodal vision model. Deterministic parsers are always tried first ($0 cost). |
|
|
80
|
+
| **Local-first LLM** *(v0.0.5)* | Ollama is the default backend; switch to Anthropic, OpenAI, or any LiteLLM provider via `BSP_HYBRID_MODEL`. Vision is opt-in via `BSP_HYBRID_VISION_MODEL` — no surprise downloads. |
|
|
81
|
+
| **Golden Rule verification** *(v0.0.5)* | Every result carries `opening + credits − debits == closing` status: `VERIFIED`, `DISCREPANCY`, or `FAILED`. |
|
|
82
|
+
| **Idempotent dedup** *(v0.0.5)* | Every `Transaction` carries a stable `transaction_hash` (MD5 of date + normalized description + amount). `Deduplicator.dedupe_by_hash()` makes incremental ingestion safe to re-run. |
|
|
83
|
+
| **Categorization** *(v0.0.6)* | `bankstatementparser.enrichment.Categorizer` tags transactions with a pluggable category schema (Plaid 13-category default) and an optional `is_business_expense` flag. Wrapper model — never mutates the original `Transaction`. |
|
|
84
|
+
| **Interactive review** *(v0.0.6)* | `--type review` CLI walks through discrepancies with accept/edit/skip/delete/quit. `IngestResult.to_json()` / `.from_json()` for stable round-trip with embedded audit trail. |
|
|
85
|
+
| **Bounding boxes** *(v0.0.6)* | `Transaction.source_bbox` carries per-row normalized coordinates from the vision path for downstream review UIs. |
|
|
86
|
+
| **Direct Ollama bridge** *(v0.0.7)* | Auto-bypasses the upstream LiteLLM ↔ Ollama hang on long vision prompts. `ollama/minicpm-v` recommended over `ollama/llava` for document OCR. |
|
|
87
|
+
| **Strip mode** *(v0.0.7)* | `VisionExtractor(strip_rows=True)` splits dense pages into overlapping bands for small local models — fixes sign-flip errors and improves accuracy on 15+ row statements. |
|
|
45
88
|
| **Auto-detection** | `detect_statement_format()` identifies the format; `create_parser()` returns the right parser |
|
|
46
|
-
| **Deduplication** | `Deduplicator` detects exact duplicates and suspected matches across sources with explainable confidence scores |
|
|
47
89
|
| **PII redaction** | Names, IBANs, and addresses are masked by default — pass `--show-pii` to display them |
|
|
48
90
|
| **Streaming** | `parse_streaming()` at 27,000+ tx/s (CAMT) and 52,000+ tx/s (PAIN.001) with bounded memory |
|
|
49
91
|
| **Parallel** | `parse_files_parallel()` for multi-file batch processing across CPU cores |
|
|
50
92
|
| **Secure ZIP** | `iter_secure_xml_entries()` rejects zip bombs, encrypted entries, and suspicious compression ratios |
|
|
51
93
|
| **In-memory parsing** | `from_string()` and `from_bytes()` parse XML without touching disk |
|
|
52
94
|
| **Export** | CSV, JSON, Excel (`.xlsx`), and optional Polars DataFrames |
|
|
53
|
-
| **100% coverage** |
|
|
95
|
+
| **100% coverage** | 672 tests, 100% branch coverage, property-based fuzzing with Hypothesis |
|
|
54
96
|
|
|
55
97
|
## Requirements
|
|
56
98
|
|
|
57
|
-
- Python **3.
|
|
99
|
+
- Python **3.10** through **3.14** (Python 3.9 was dropped in v0.0.6 — pin to v0.0.5 if you cannot upgrade your interpreter)
|
|
58
100
|
- Poetry (for local development)
|
|
59
101
|
|
|
60
102
|
## Install
|
|
61
103
|
|
|
62
104
|
```bash
|
|
105
|
+
# Core install — deterministic parsers only (CAMT, PAIN.001, CSV, OFX, QFX, MT940)
|
|
63
106
|
pip install bankstatementparser
|
|
107
|
+
|
|
108
|
+
# Add the text-LLM path for digital PDFs (litellm + pypdf)
|
|
109
|
+
pip install 'bankstatementparser[hybrid]'
|
|
110
|
+
|
|
111
|
+
# Add higher-fidelity table extraction (adds pdfplumber)
|
|
112
|
+
pip install 'bankstatementparser[hybrid-plus]'
|
|
113
|
+
|
|
114
|
+
# Add the multimodal vision path for scanned/photocopied PDFs (adds pypdfium2)
|
|
115
|
+
pip install 'bankstatementparser[hybrid-vision]'
|
|
64
116
|
```
|
|
65
117
|
|
|
118
|
+
The core install has zero AI dependencies. Every `[hybrid*]` extra is opt-in and pure-Python — no `poppler`, no system libraries, no GPU required.
|
|
119
|
+
|
|
66
120
|
### Local Development
|
|
67
121
|
|
|
68
122
|
Clone and install on **macOS, Linux, or WSL**:
|
|
@@ -74,6 +128,7 @@ python3 -m venv .venv
|
|
|
74
128
|
source .venv/bin/activate
|
|
75
129
|
pip install poetry
|
|
76
130
|
poetry install --with dev
|
|
131
|
+
make install-hooks # pre-commit hook runs `make verify` before every commit
|
|
77
132
|
```
|
|
78
133
|
|
|
79
134
|
## Quick Start
|
|
@@ -123,6 +178,37 @@ records = parser.parse()
|
|
|
123
178
|
|
|
124
179
|
Works with `.xml`, `.csv`, `.ofx`, `.qfx`, and `.mt940` files.
|
|
125
180
|
|
|
181
|
+
### Hybrid extraction (PDFs included) *(v0.0.5)*
|
|
182
|
+
|
|
183
|
+
`smart_ingest()` is the single entry point that routes any file through the cheapest viable extraction path:
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
from bankstatementparser.hybrid import smart_ingest
|
|
187
|
+
|
|
188
|
+
# Path A — deterministic parser (free, fastest, $0)
|
|
189
|
+
result = smart_ingest("statement.xml")
|
|
190
|
+
print(result.source_method) # "deterministic"
|
|
191
|
+
|
|
192
|
+
# Path B — text-LLM for digital PDFs (set BSP_HYBRID_MODEL=ollama/llama3)
|
|
193
|
+
result = smart_ingest("statement.pdf")
|
|
194
|
+
print(result.source_method) # "llm"
|
|
195
|
+
print(result.verification.status) # VERIFIED | DISCREPANCY | FAILED
|
|
196
|
+
|
|
197
|
+
# Path C — multimodal vision for scanned PDFs (set BSP_HYBRID_VISION_MODEL)
|
|
198
|
+
# auto-routed when pypdf cannot extract enough text
|
|
199
|
+
result = smart_ingest("scan.pdf")
|
|
200
|
+
print(result.source_method) # "vision"
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
Every row carries:
|
|
204
|
+
|
|
205
|
+
- `source_method` — `"deterministic"`, `"llm"`, or `"vision"` for full audit provenance
|
|
206
|
+
- `transaction_hash` — MD5 fingerprint of `date | normalized_description | amount`, ready for idempotent re-ingestion
|
|
207
|
+
- `confidence` — float between 0 and 1 for LLM rows, `None` for deterministic
|
|
208
|
+
- `raw_source_text` — best-effort source-text slice for the v0.0.6 review-mode UI
|
|
209
|
+
|
|
210
|
+
A complete walkthrough with synthetic UK-bank PDFs, mock vs. live mode, and a Mermaid flow diagram lives in [`examples/hybrid/README.md`](examples/hybrid/README.md).
|
|
211
|
+
|
|
126
212
|
### Parse from memory (no disk I/O)
|
|
127
213
|
|
|
128
214
|
```python
|
|
@@ -210,18 +296,28 @@ Uses `ProcessPoolExecutor` to bypass the GIL. Each file is parsed in its own wor
|
|
|
210
296
|
|
|
211
297
|
## Command Line
|
|
212
298
|
|
|
299
|
+
After installation a `bankstatementparser` console script is available on `PATH`:
|
|
300
|
+
|
|
213
301
|
```bash
|
|
214
302
|
# Parse and display
|
|
215
|
-
|
|
303
|
+
bankstatementparser --type camt --input statement.xml
|
|
216
304
|
|
|
217
305
|
# Export to CSV
|
|
218
|
-
|
|
306
|
+
bankstatementparser --type camt --input statement.xml --output transactions.csv
|
|
219
307
|
|
|
220
308
|
# Stream with PII visible
|
|
221
|
-
|
|
309
|
+
bankstatementparser --type camt --input statement.xml --streaming --show-pii
|
|
310
|
+
|
|
311
|
+
# v0.0.5 — hybrid pipeline (auto-routes deterministic / text-LLM / vision)
|
|
312
|
+
bankstatementparser --type ingest --input statement.pdf
|
|
313
|
+
bankstatementparser --type ingest --input statement.pdf --output ledger.csv
|
|
314
|
+
|
|
315
|
+
# v0.0.6 — interactive review of saved IngestResult JSON
|
|
316
|
+
bankstatementparser --type review --input result.json
|
|
317
|
+
bankstatementparser --type review --input result.json --output reviewed.json
|
|
222
318
|
```
|
|
223
319
|
|
|
224
|
-
Supports `--type camt` and `--type pain001`.
|
|
320
|
+
Supports `--type camt`, `--type pain001`, `--type ingest` (v0.0.5), and `--type review` (v0.0.6). The `python -m bankstatementparser.cli ...` invocation form continues to work for parity with older releases.
|
|
225
321
|
|
|
226
322
|
## Deduplication
|
|
227
323
|
|
|
@@ -270,7 +366,9 @@ Install with `pip install bankstatementparser[polars]`.
|
|
|
270
366
|
|
|
271
367
|
## Examples
|
|
272
368
|
|
|
273
|
-
See [`examples/`](examples/README.md) for 14 runnable scripts:
|
|
369
|
+
See [`examples/`](examples/README.md) for 22 runnable scripts (14 deterministic + 8 hybrid):
|
|
370
|
+
|
|
371
|
+
### Deterministic parsers
|
|
274
372
|
|
|
275
373
|
| Example | What it demonstrates |
|
|
276
374
|
|---|---|
|
|
@@ -289,6 +387,20 @@ See [`examples/`](examples/README.md) for 14 runnable scripts:
|
|
|
289
387
|
| `compatibility_wrappers.py` | Legacy API wrappers |
|
|
290
388
|
| `cli_examples.sh` | CLI commands for CAMT and PAIN.001 |
|
|
291
389
|
|
|
390
|
+
### Hybrid pipeline *(v0.0.5)*
|
|
391
|
+
|
|
392
|
+
| Example | What it demonstrates |
|
|
393
|
+
|---|---|
|
|
394
|
+
| `hybrid/generate_sample_pdfs.py` | Produce reproducible synthetic UK-bank PDFs (digital + scanned) |
|
|
395
|
+
| `hybrid/01_smart_ingest_deterministic.py` | Path A — `smart_ingest()` against a CAMT.053 fixture, $0 cost |
|
|
396
|
+
| `hybrid/02_smart_ingest_text_llm.py` | Path B — text-LLM extraction from a digital PDF (mock or live Ollama) |
|
|
397
|
+
| `hybrid/03_smart_ingest_vision.py` | Path C — multimodal vision extraction with `LOW_TEXT_DENSITY` auto-routing |
|
|
398
|
+
| `hybrid/04_golden_rule.py` | All three `verify_balance()` outcomes |
|
|
399
|
+
| `hybrid/05_dedupe_recurring.py` | `normalize_description()` + `dedupe_by_hash()` for idempotent batching |
|
|
400
|
+
| `hybrid/06_cli_walkthrough.sh` | Four flavours of the new `--type ingest` CLI subcommand |
|
|
401
|
+
|
|
402
|
+
See [`examples/hybrid/README.md`](examples/hybrid/README.md) for the full walkthrough including a Mermaid flow diagram, the cross-platform verification matrix, and the Ollama smoke-test results.
|
|
403
|
+
|
|
292
404
|
## XML Tag Mapping
|
|
293
405
|
|
|
294
406
|
See [`docs/MAPPING.md`](docs/MAPPING.md) for a complete reference of ISO 20022 XML tags to DataFrame columns across all six formats. Use this when integrating with ERP systems or building reconciliation pipelines.
|
|
@@ -296,11 +408,12 @@ See [`docs/MAPPING.md`](docs/MAPPING.md) for a complete reference of ISO 20022 X
|
|
|
296
408
|
## Project Layout
|
|
297
409
|
|
|
298
410
|
```text
|
|
299
|
-
bankstatementparser/ Source code (
|
|
300
|
-
|
|
301
|
-
|
|
411
|
+
bankstatementparser/ Source code (24 modules: deterministic core + hybrid + enrichment subpackages, 100% branch coverage)
|
|
412
|
+
bankstatementparser/hybrid/ PDF pipeline: orchestrator, llm_extractor, vision, pdf_text, prompts, verification, ollama_direct
|
|
413
|
+
docs/compliance/ ISO 13485 validation, risk register, traceability matrix
|
|
414
|
+
examples/ 14 deterministic + 8 hybrid runnable example scripts
|
|
302
415
|
scripts/ SBOM generation, checksums, signature verification
|
|
303
|
-
tests/
|
|
416
|
+
tests/ 672 tests (unit, integration, property-based, security, hybrid mocks)
|
|
304
417
|
```
|
|
305
418
|
|
|
306
419
|
## Security
|
|
@@ -1,40 +1,87 @@
|
|
|
1
1
|
# Bank Statement Parser
|
|
2
2
|
|
|
3
|
-
Parse bank statements across six formats
|
|
3
|
+
Parse bank statements across **six structured formats** (CAMT, PAIN.001, CSV, OFX, QFX, MT940) **and PDFs** — both digital and scanned — into a single unified `Transaction` model. Structured files take the deterministic path; PDFs fall through to a configurable LLM (Ollama by default, or any LiteLLM-supported provider) and finally to a multimodal vision model for scanned/photocopied statements.
|
|
4
4
|
|
|
5
|
-
Built for finance teams, treasury analysts, and fintech developers who need reliable, auditable extraction
|
|
5
|
+
Built for finance teams, treasury analysts, and fintech developers who need reliable, auditable extraction across the full spectrum of bank statement formats — without sending data to external services unless they explicitly opt in.
|
|
6
6
|
|
|
7
|
-
[](https://pypi.org/project/bankstatementparser/)
|
|
7
|
+
[](https://pypi.org/project/bankstatementparser/)
|
|
8
8
|
[](https://pypi.org/project/bankstatementparser/)
|
|
9
9
|
[](https://codecov.io/github/sebastienrousseau/bankstatementparser?branch=main)
|
|
10
10
|
[](LICENSE)
|
|
11
11
|
|
|
12
|
+
## How it works
|
|
13
|
+
|
|
14
|
+
`smart_ingest()` routes any input file through the cheapest viable extraction path. Deterministic parsers always run first ($0 cost). Text and vision LLMs are fallbacks for unstandardized PDFs — both are opt-in via separate install extras and can be swapped between any LiteLLM-supported provider (Ollama, Anthropic, OpenAI, Gemini, …).
|
|
15
|
+
|
|
16
|
+
```mermaid
|
|
17
|
+
flowchart TD
|
|
18
|
+
A["smart_ingest(path)"] --> B{detect_statement_format}
|
|
19
|
+
B -- CAMT/PAIN/OFX/MT940/CSV --> C[Path A: deterministic parser<br/>$0, fastest]
|
|
20
|
+
C --> Z[IngestResult<br/>source_method='deterministic']
|
|
21
|
+
|
|
22
|
+
B -- pdf or unknown --> D[pypdf extract_text]
|
|
23
|
+
D --> E{text len >= 50?}
|
|
24
|
+
|
|
25
|
+
E -- yes --> F[Path B: text-LLM<br/>default ollama/llama3]
|
|
26
|
+
F --> Y[IngestResult<br/>source_method='llm']
|
|
27
|
+
|
|
28
|
+
E -- no --> G[Path C: vision-LLM<br/>opt-in via BSP_HYBRID_VISION_MODEL]
|
|
29
|
+
G --> X[IngestResult<br/>source_method='vision']
|
|
30
|
+
|
|
31
|
+
Z --> V[verify_balance<br/>Golden Rule]
|
|
32
|
+
Y --> V
|
|
33
|
+
X --> V
|
|
34
|
+
V --> R[VERIFIED / DISCREPANCY / FAILED]
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Every extracted row carries an immutable `transaction_hash`, an audit-trail `source_method` tag, and (for LLM rows) a `confidence` score — see [Hybrid extraction](#hybrid-extraction-pdfs-included-v005) below for the full surface.
|
|
38
|
+
|
|
12
39
|
## Key Features
|
|
13
40
|
|
|
14
41
|
| Feature | Description |
|
|
15
42
|
|---|---|
|
|
16
|
-
| **6 formats** | CAMT.053, PAIN.001, CSV, OFX, QFX, MT940 |
|
|
43
|
+
| **6 structured formats** | CAMT.053, PAIN.001, CSV, OFX, QFX, MT940 |
|
|
44
|
+
| **Hybrid PDF pipeline** *(v0.0.5)* | `smart_ingest()` routes digital PDFs through a text-LLM and scanned PDFs through a multimodal vision model. Deterministic parsers are always tried first ($0 cost). |
|
|
45
|
+
| **Local-first LLM** *(v0.0.5)* | Ollama is the default backend; switch to Anthropic, OpenAI, or any LiteLLM provider via `BSP_HYBRID_MODEL`. Vision is opt-in via `BSP_HYBRID_VISION_MODEL` — no surprise downloads. |
|
|
46
|
+
| **Golden Rule verification** *(v0.0.5)* | Every result carries `opening + credits − debits == closing` status: `VERIFIED`, `DISCREPANCY`, or `FAILED`. |
|
|
47
|
+
| **Idempotent dedup** *(v0.0.5)* | Every `Transaction` carries a stable `transaction_hash` (MD5 of date + normalized description + amount). `Deduplicator.dedupe_by_hash()` makes incremental ingestion safe to re-run. |
|
|
48
|
+
| **Categorization** *(v0.0.6)* | `bankstatementparser.enrichment.Categorizer` tags transactions with a pluggable category schema (Plaid 13-category default) and an optional `is_business_expense` flag. Wrapper model — never mutates the original `Transaction`. |
|
|
49
|
+
| **Interactive review** *(v0.0.6)* | `--type review` CLI walks through discrepancies with accept/edit/skip/delete/quit. `IngestResult.to_json()` / `.from_json()` for stable round-trip with embedded audit trail. |
|
|
50
|
+
| **Bounding boxes** *(v0.0.6)* | `Transaction.source_bbox` carries per-row normalized coordinates from the vision path for downstream review UIs. |
|
|
51
|
+
| **Direct Ollama bridge** *(v0.0.7)* | Auto-bypasses the upstream LiteLLM ↔ Ollama hang on long vision prompts. `ollama/minicpm-v` recommended over `ollama/llava` for document OCR. |
|
|
52
|
+
| **Strip mode** *(v0.0.7)* | `VisionExtractor(strip_rows=True)` splits dense pages into overlapping bands for small local models — fixes sign-flip errors and improves accuracy on 15+ row statements. |
|
|
17
53
|
| **Auto-detection** | `detect_statement_format()` identifies the format; `create_parser()` returns the right parser |
|
|
18
|
-
| **Deduplication** | `Deduplicator` detects exact duplicates and suspected matches across sources with explainable confidence scores |
|
|
19
54
|
| **PII redaction** | Names, IBANs, and addresses are masked by default — pass `--show-pii` to display them |
|
|
20
55
|
| **Streaming** | `parse_streaming()` at 27,000+ tx/s (CAMT) and 52,000+ tx/s (PAIN.001) with bounded memory |
|
|
21
56
|
| **Parallel** | `parse_files_parallel()` for multi-file batch processing across CPU cores |
|
|
22
57
|
| **Secure ZIP** | `iter_secure_xml_entries()` rejects zip bombs, encrypted entries, and suspicious compression ratios |
|
|
23
58
|
| **In-memory parsing** | `from_string()` and `from_bytes()` parse XML without touching disk |
|
|
24
59
|
| **Export** | CSV, JSON, Excel (`.xlsx`), and optional Polars DataFrames |
|
|
25
|
-
| **100% coverage** |
|
|
60
|
+
| **100% coverage** | 672 tests, 100% branch coverage, property-based fuzzing with Hypothesis |
|
|
26
61
|
|
|
27
62
|
## Requirements
|
|
28
63
|
|
|
29
|
-
- Python **3.
|
|
64
|
+
- Python **3.10** through **3.14** (Python 3.9 was dropped in v0.0.6 — pin to v0.0.5 if you cannot upgrade your interpreter)
|
|
30
65
|
- Poetry (for local development)
|
|
31
66
|
|
|
32
67
|
## Install
|
|
33
68
|
|
|
34
69
|
```bash
|
|
70
|
+
# Core install — deterministic parsers only (CAMT, PAIN.001, CSV, OFX, QFX, MT940)
|
|
35
71
|
pip install bankstatementparser
|
|
72
|
+
|
|
73
|
+
# Add the text-LLM path for digital PDFs (litellm + pypdf)
|
|
74
|
+
pip install 'bankstatementparser[hybrid]'
|
|
75
|
+
|
|
76
|
+
# Add higher-fidelity table extraction (adds pdfplumber)
|
|
77
|
+
pip install 'bankstatementparser[hybrid-plus]'
|
|
78
|
+
|
|
79
|
+
# Add the multimodal vision path for scanned/photocopied PDFs (adds pypdfium2)
|
|
80
|
+
pip install 'bankstatementparser[hybrid-vision]'
|
|
36
81
|
```
|
|
37
82
|
|
|
83
|
+
The core install has zero AI dependencies. Every `[hybrid*]` extra is opt-in and pure-Python — no `poppler`, no system libraries, no GPU required.
|
|
84
|
+
|
|
38
85
|
### Local Development
|
|
39
86
|
|
|
40
87
|
Clone and install on **macOS, Linux, or WSL**:
|
|
@@ -46,6 +93,7 @@ python3 -m venv .venv
|
|
|
46
93
|
source .venv/bin/activate
|
|
47
94
|
pip install poetry
|
|
48
95
|
poetry install --with dev
|
|
96
|
+
make install-hooks # pre-commit hook runs `make verify` before every commit
|
|
49
97
|
```
|
|
50
98
|
|
|
51
99
|
## Quick Start
|
|
@@ -95,6 +143,37 @@ records = parser.parse()
|
|
|
95
143
|
|
|
96
144
|
Works with `.xml`, `.csv`, `.ofx`, `.qfx`, and `.mt940` files.
|
|
97
145
|
|
|
146
|
+
### Hybrid extraction (PDFs included) *(v0.0.5)*
|
|
147
|
+
|
|
148
|
+
`smart_ingest()` is the single entry point that routes any file through the cheapest viable extraction path:
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
from bankstatementparser.hybrid import smart_ingest
|
|
152
|
+
|
|
153
|
+
# Path A — deterministic parser (free, fastest, $0)
|
|
154
|
+
result = smart_ingest("statement.xml")
|
|
155
|
+
print(result.source_method) # "deterministic"
|
|
156
|
+
|
|
157
|
+
# Path B — text-LLM for digital PDFs (set BSP_HYBRID_MODEL=ollama/llama3)
|
|
158
|
+
result = smart_ingest("statement.pdf")
|
|
159
|
+
print(result.source_method) # "llm"
|
|
160
|
+
print(result.verification.status) # VERIFIED | DISCREPANCY | FAILED
|
|
161
|
+
|
|
162
|
+
# Path C — multimodal vision for scanned PDFs (set BSP_HYBRID_VISION_MODEL)
|
|
163
|
+
# auto-routed when pypdf cannot extract enough text
|
|
164
|
+
result = smart_ingest("scan.pdf")
|
|
165
|
+
print(result.source_method) # "vision"
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
Every row carries:
|
|
169
|
+
|
|
170
|
+
- `source_method` — `"deterministic"`, `"llm"`, or `"vision"` for full audit provenance
|
|
171
|
+
- `transaction_hash` — MD5 fingerprint of `date | normalized_description | amount`, ready for idempotent re-ingestion
|
|
172
|
+
- `confidence` — float between 0 and 1 for LLM rows, `None` for deterministic
|
|
173
|
+
- `raw_source_text` — best-effort source-text slice for the v0.0.6 review-mode UI
|
|
174
|
+
|
|
175
|
+
A complete walkthrough with synthetic UK-bank PDFs, mock vs. live mode, and a Mermaid flow diagram lives in [`examples/hybrid/README.md`](examples/hybrid/README.md).
|
|
176
|
+
|
|
98
177
|
### Parse from memory (no disk I/O)
|
|
99
178
|
|
|
100
179
|
```python
|
|
@@ -182,18 +261,28 @@ Uses `ProcessPoolExecutor` to bypass the GIL. Each file is parsed in its own wor
|
|
|
182
261
|
|
|
183
262
|
## Command Line
|
|
184
263
|
|
|
264
|
+
After installation a `bankstatementparser` console script is available on `PATH`:
|
|
265
|
+
|
|
185
266
|
```bash
|
|
186
267
|
# Parse and display
|
|
187
|
-
|
|
268
|
+
bankstatementparser --type camt --input statement.xml
|
|
188
269
|
|
|
189
270
|
# Export to CSV
|
|
190
|
-
|
|
271
|
+
bankstatementparser --type camt --input statement.xml --output transactions.csv
|
|
191
272
|
|
|
192
273
|
# Stream with PII visible
|
|
193
|
-
|
|
274
|
+
bankstatementparser --type camt --input statement.xml --streaming --show-pii
|
|
275
|
+
|
|
276
|
+
# v0.0.5 — hybrid pipeline (auto-routes deterministic / text-LLM / vision)
|
|
277
|
+
bankstatementparser --type ingest --input statement.pdf
|
|
278
|
+
bankstatementparser --type ingest --input statement.pdf --output ledger.csv
|
|
279
|
+
|
|
280
|
+
# v0.0.6 — interactive review of saved IngestResult JSON
|
|
281
|
+
bankstatementparser --type review --input result.json
|
|
282
|
+
bankstatementparser --type review --input result.json --output reviewed.json
|
|
194
283
|
```
|
|
195
284
|
|
|
196
|
-
Supports `--type camt` and `--type pain001`.
|
|
285
|
+
Supports `--type camt`, `--type pain001`, `--type ingest` (v0.0.5), and `--type review` (v0.0.6). The `python -m bankstatementparser.cli ...` invocation form continues to work for parity with older releases.
|
|
197
286
|
|
|
198
287
|
## Deduplication
|
|
199
288
|
|
|
@@ -242,7 +331,9 @@ Install with `pip install bankstatementparser[polars]`.
|
|
|
242
331
|
|
|
243
332
|
## Examples
|
|
244
333
|
|
|
245
|
-
See [`examples/`](examples/README.md) for 14 runnable scripts:
|
|
334
|
+
See [`examples/`](examples/README.md) for 22 runnable scripts (14 deterministic + 8 hybrid):
|
|
335
|
+
|
|
336
|
+
### Deterministic parsers
|
|
246
337
|
|
|
247
338
|
| Example | What it demonstrates |
|
|
248
339
|
|---|---|
|
|
@@ -261,6 +352,20 @@ See [`examples/`](examples/README.md) for 14 runnable scripts:
|
|
|
261
352
|
| `compatibility_wrappers.py` | Legacy API wrappers |
|
|
262
353
|
| `cli_examples.sh` | CLI commands for CAMT and PAIN.001 |
|
|
263
354
|
|
|
355
|
+
### Hybrid pipeline *(v0.0.5)*
|
|
356
|
+
|
|
357
|
+
| Example | What it demonstrates |
|
|
358
|
+
|---|---|
|
|
359
|
+
| `hybrid/generate_sample_pdfs.py` | Produce reproducible synthetic UK-bank PDFs (digital + scanned) |
|
|
360
|
+
| `hybrid/01_smart_ingest_deterministic.py` | Path A — `smart_ingest()` against a CAMT.053 fixture, $0 cost |
|
|
361
|
+
| `hybrid/02_smart_ingest_text_llm.py` | Path B — text-LLM extraction from a digital PDF (mock or live Ollama) |
|
|
362
|
+
| `hybrid/03_smart_ingest_vision.py` | Path C — multimodal vision extraction with `LOW_TEXT_DENSITY` auto-routing |
|
|
363
|
+
| `hybrid/04_golden_rule.py` | All three `verify_balance()` outcomes |
|
|
364
|
+
| `hybrid/05_dedupe_recurring.py` | `normalize_description()` + `dedupe_by_hash()` for idempotent batching |
|
|
365
|
+
| `hybrid/06_cli_walkthrough.sh` | Four flavours of the new `--type ingest` CLI subcommand |
|
|
366
|
+
|
|
367
|
+
See [`examples/hybrid/README.md`](examples/hybrid/README.md) for the full walkthrough including a Mermaid flow diagram, the cross-platform verification matrix, and the Ollama smoke-test results.
|
|
368
|
+
|
|
264
369
|
## XML Tag Mapping
|
|
265
370
|
|
|
266
371
|
See [`docs/MAPPING.md`](docs/MAPPING.md) for a complete reference of ISO 20022 XML tags to DataFrame columns across all six formats. Use this when integrating with ERP systems or building reconciliation pipelines.
|
|
@@ -268,11 +373,12 @@ See [`docs/MAPPING.md`](docs/MAPPING.md) for a complete reference of ISO 20022 X
|
|
|
268
373
|
## Project Layout
|
|
269
374
|
|
|
270
375
|
```text
|
|
271
|
-
bankstatementparser/ Source code (
|
|
272
|
-
|
|
273
|
-
|
|
376
|
+
bankstatementparser/ Source code (24 modules: deterministic core + hybrid + enrichment subpackages, 100% branch coverage)
|
|
377
|
+
bankstatementparser/hybrid/ PDF pipeline: orchestrator, llm_extractor, vision, pdf_text, prompts, verification, ollama_direct
|
|
378
|
+
docs/compliance/ ISO 13485 validation, risk register, traceability matrix
|
|
379
|
+
examples/ 14 deterministic + 8 hybrid runnable example scripts
|
|
274
380
|
scripts/ SBOM generation, checksums, signature verification
|
|
275
|
-
tests/
|
|
381
|
+
tests/ 672 tests (unit, integration, property-based, security, hybrid mocks)
|
|
276
382
|
```
|
|
277
383
|
|
|
278
384
|
## Security
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (C) 2023
|
|
1
|
+
# Copyright (C) 2023-2026 Bank Statement Parser. All rights reserved.
|
|
2
2
|
#
|
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
4
|
# you may not use this file except in compliance with the License.
|
|
@@ -46,7 +46,7 @@ from .transaction_deduplicator import (
|
|
|
46
46
|
ExactDuplicateGroup,
|
|
47
47
|
MatchGroup,
|
|
48
48
|
)
|
|
49
|
-
from .transaction_models import Transaction
|
|
49
|
+
from .transaction_models import BoundingBox, Transaction
|
|
50
50
|
from .zip_security import (
|
|
51
51
|
ZipSecurityError,
|
|
52
52
|
ZipXMLSource,
|
|
@@ -67,6 +67,7 @@ __all__ = [
|
|
|
67
67
|
"Pain001Parser",
|
|
68
68
|
"Pain001ParseError",
|
|
69
69
|
"ParserError",
|
|
70
|
+
"BoundingBox",
|
|
70
71
|
"Transaction",
|
|
71
72
|
"Deduplicator",
|
|
72
73
|
"DeduplicationResult",
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (C) 2023
|
|
1
|
+
# Copyright (C) 2023-2026 Bank Statement Parser. All rights reserved.
|
|
2
2
|
#
|
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
4
|
# you may not use this file except in compliance with the License.
|
|
@@ -546,6 +546,7 @@ class CamtParser(BankStatementParser):
|
|
|
546
546
|
booking_dates,
|
|
547
547
|
debtor_addresses,
|
|
548
548
|
creditor_addresses,
|
|
549
|
+
strict=False,
|
|
549
550
|
)
|
|
550
551
|
):
|
|
551
552
|
# Apply debit sign adjustment
|