bankstatementparser 0.0.4__tar.gz → 0.0.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/PKG-INFO +132 -19
  2. {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/README.md +122 -16
  3. {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/__init__.py +3 -2
  4. {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/bank_statement_parsers.py +1 -1
  5. {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/base_parser.py +1 -1
  6. {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/camt_parser.py +2 -1
  7. {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/cli.py +365 -6
  8. bankstatementparser-0.0.7/bankstatementparser/enrichment/__init__.py +61 -0
  9. bankstatementparser-0.0.7/bankstatementparser/enrichment/categorizer.py +485 -0
  10. {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/exceptions.py +1 -1
  11. bankstatementparser-0.0.7/bankstatementparser/hybrid/__init__.py +64 -0
  12. bankstatementparser-0.0.7/bankstatementparser/hybrid/llm_extractor.py +356 -0
  13. bankstatementparser-0.0.7/bankstatementparser/hybrid/ollama_direct.py +219 -0
  14. bankstatementparser-0.0.7/bankstatementparser/hybrid/orchestrator.py +497 -0
  15. bankstatementparser-0.0.7/bankstatementparser/hybrid/pdf_text.py +113 -0
  16. bankstatementparser-0.0.7/bankstatementparser/hybrid/prompts.py +69 -0
  17. bankstatementparser-0.0.7/bankstatementparser/hybrid/verification.py +133 -0
  18. bankstatementparser-0.0.7/bankstatementparser/hybrid/vision.py +544 -0
  19. {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/input_validator.py +7 -1
  20. {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/pain001_parser.py +1 -1
  21. {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/parallel.py +1 -1
  22. {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/record_types.py +1 -1
  23. {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/transaction_deduplicator.py +37 -1
  24. {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/transaction_models.py +126 -5
  25. {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/pyproject.toml +25 -5
  26. {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/LICENSE +0 -0
  27. {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/additional_parsers.py +0 -0
  28. {bankstatementparser-0.0.4 → bankstatementparser-0.0.7}/bankstatementparser/zip_security.py +0 -0
@@ -1,68 +1,122 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bankstatementparser
3
- Version: 0.0.4
3
+ Version: 0.0.7
4
4
  Summary: BankStatementParser is your essential tool for easy bank statement management. Designed with finance and treasury experts in mind, it offers a simple way to handle CAMT (ISO 20022) formats and more. Get quick, accurate insights from your financial data and spend less time on processing. It's the smart, hassle-free way to stay on top of your transactions.
5
5
  License: Apache Software License
6
6
  License-File: LICENSE
7
7
  Author: Sebastien Rousseau
8
8
  Author-email: sebastian.rousseau@gmail.com
9
- Requires-Python: >=3.9
9
+ Requires-Python: >=3.10,<4.0
10
10
  Classifier: License :: Other/Proprietary License
11
11
  Classifier: Programming Language :: Python :: 3
12
- Classifier: Programming Language :: Python :: 3.9
13
12
  Classifier: Programming Language :: Python :: 3.10
14
13
  Classifier: Programming Language :: Python :: 3.11
15
14
  Classifier: Programming Language :: Python :: 3.12
16
15
  Classifier: Programming Language :: Python :: 3.13
17
16
  Classifier: Programming Language :: Python :: 3.14
17
+ Provides-Extra: enrichment
18
+ Provides-Extra: hybrid
19
+ Provides-Extra: hybrid-plus
20
+ Provides-Extra: hybrid-vision
18
21
  Provides-Extra: polars
19
22
  Requires-Dist: defusedxml (>=0.7.1,<0.8.0)
23
+ Requires-Dist: litellm (>=1.83.0) ; extra == "hybrid" or extra == "hybrid-plus" or extra == "hybrid-vision" or extra == "enrichment"
20
24
  Requires-Dist: lxml (>=4.9.3)
21
25
  Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
22
26
  Requires-Dist: pandas (>=2.3.3,<3.0.0)
27
+ Requires-Dist: pdfplumber (>=0.11.0) ; extra == "hybrid-plus"
23
28
  Requires-Dist: polars (>=1.32.0,<2.0.0) ; extra == "polars"
24
29
  Requires-Dist: pydantic (>=2.11.0,<3.0.0)
30
+ Requires-Dist: pypdf (>=4.0.0) ; extra == "hybrid" or extra == "hybrid-plus" or extra == "hybrid-vision"
31
+ Requires-Dist: pypdfium2 (>=4.30.0) ; extra == "hybrid-vision"
25
32
  Project-URL: Homepage, https://bankstatementparser.com
26
33
  Project-URL: Repository, https://github.com/sebastienrousseau/bankstatementparser
27
34
  Description-Content-Type: text/markdown
28
35
 
29
36
  # Bank Statement Parser
30
37
 
31
- Parse bank statements across six formats CAMT, PAIN.001, CSV, OFX/QFX, and MT940 — into structured DataFrames. Process ZIP archives safely. Redact PII by default. Stream files of any size.
38
+ Parse bank statements across **six structured formats** (CAMT, PAIN.001, CSV, OFX/QFX, MT940) **and PDFs** — both digital and scanned — into a single unified `Transaction` model. ISO 20022 files take the deterministic path; PDFs fall through to a configurable LLM (Ollama by default, any LiteLLM-supported provider) and finally to a multimodal vision model for scanned/photocopied statements.
32
39
 
33
- Built for finance teams, treasury analysts, and fintech developers who need reliable, auditable extraction from ISO 20022 and legacy banking formats without sending data to external services.
40
+ Built for finance teams, treasury analysts, and fintech developers who need reliable, auditable extraction across the full spectrum of bank statement formats without sending data to external services unless they explicitly opt in.
34
41
 
35
- [![PyPI](https://img.shields.io/pypi/pyversions/bankstatementparser.svg?style=for-the-badge)](https://pypi.org/project/bankstatementparser/)
42
+ [![PyPI](https://img.shields.io/pypi/pyversions/bankstatementparser.svg?style=for-the-badge&v=0.0.7)](https://pypi.org/project/bankstatementparser/)
36
43
  [![PyPI Downloads](https://img.shields.io/pypi/dm/bankstatementparser.svg?style=for-the-badge)](https://pypi.org/project/bankstatementparser/)
37
44
  [![Codecov](https://img.shields.io/codecov/c/github/sebastienrousseau/bankstatementparser?style=for-the-badge)](https://codecov.io/github/sebastienrousseau/bankstatementparser?branch=main)
38
45
  [![License](https://img.shields.io/github/license/sebastienrousseau/bankstatementparser?style=for-the-badge)](LICENSE)
39
46
 
47
+ ## How it works
48
+
49
+ `smart_ingest()` routes any input file through the cheapest viable extraction path. Deterministic parsers always run first ($0 cost). Text and vision LLMs are fallbacks for unstandardized PDFs — both are opt-in via separate install extras and can be swapped between any LiteLLM-supported provider (Ollama, Anthropic, OpenAI, Gemini, …).
50
+
51
+ ```mermaid
52
+ flowchart TD
53
+ A[smart_ingest&lpar;path&rpar;] --> B{detect_statement_format}
54
+ B -- CAMT/PAIN/OFX/MT940/CSV --> C[Path A: deterministic parser<br/>$0, fastest]
55
+ C --> Z[IngestResult<br/>source_method='deterministic']
56
+
57
+ B -- pdf or unknown --> D[pypdf extract_text]
58
+ D --> E{text len &gt;= 50?}
59
+
60
+ E -- yes --> F[Path B: text-LLM<br/>default ollama/llama3]
61
+ F --> Y[IngestResult<br/>source_method='llm']
62
+
63
+ E -- no --> G[Path C: vision-LLM<br/>opt-in via BSP_HYBRID_VISION_MODEL]
64
+ G --> X[IngestResult<br/>source_method='vision']
65
+
66
+ Z --> V[verify_balance<br/>Golden Rule]
67
+ Y --> V
68
+ X --> V
69
+ V --> R[VERIFIED / DISCREPANCY / FAILED]
70
+ ```
71
+
72
+ Every extracted row carries an immutable `transaction_hash`, an audit-trail `source_method` tag, and (for LLM rows) a `confidence` score — see [Hybrid extraction](#hybrid-extraction-pdfs-included-v005) below for the full surface.
73
+
40
74
  ## Key Features
41
75
 
42
76
  | Feature | Description |
43
77
  |---|---|
44
- | **6 formats** | CAMT.053, PAIN.001, CSV, OFX, QFX, MT940 |
78
+ | **6 structured formats** | CAMT.053, PAIN.001, CSV, OFX, QFX, MT940 |
79
+ | **Hybrid PDF pipeline** *(v0.0.5)* | `smart_ingest()` routes digital PDFs through a text-LLM and scanned PDFs through a multimodal vision model. Deterministic parsers are always tried first ($0 cost). |
80
+ | **Local-first LLM** *(v0.0.5)* | Ollama is the default backend; switch to Anthropic, OpenAI, or any LiteLLM provider via `BSP_HYBRID_MODEL`. Vision is opt-in via `BSP_HYBRID_VISION_MODEL` — no surprise downloads. |
81
+ | **Golden Rule verification** *(v0.0.5)* | Every result carries `opening + credits − debits == closing` status: `VERIFIED`, `DISCREPANCY`, or `FAILED`. |
82
+ | **Idempotent dedup** *(v0.0.5)* | Every `Transaction` carries a stable `transaction_hash` (MD5 of date + normalized description + amount). `Deduplicator.dedupe_by_hash()` makes incremental ingestion safe to re-run. |
83
+ | **Categorization** *(v0.0.6)* | `bankstatementparser.enrichment.Categorizer` tags transactions with a pluggable category schema (Plaid 13-category default) and an optional `is_business_expense` flag. Wrapper model — never mutates the original `Transaction`. |
84
+ | **Interactive review** *(v0.0.6)* | `--type review` CLI walks through discrepancies with accept/edit/skip/delete/quit. `IngestResult.to_json()` / `.from_json()` for stable round-trip with embedded audit trail. |
85
+ | **Bounding boxes** *(v0.0.6)* | `Transaction.source_bbox` carries per-row normalized coordinates from the vision path for downstream review UIs. |
86
+ | **Direct Ollama bridge** *(v0.0.7)* | Auto-bypasses the upstream LiteLLM ↔ Ollama hang on long vision prompts. `ollama/minicpm-v` recommended over `ollama/llava` for document OCR. |
87
+ | **Strip mode** *(v0.0.7)* | `VisionExtractor(strip_rows=True)` splits dense pages into overlapping bands for small local models — fixes sign-flip errors and improves accuracy on 15+ row statements. |
45
88
  | **Auto-detection** | `detect_statement_format()` identifies the format; `create_parser()` returns the right parser |
46
- | **Deduplication** | `Deduplicator` detects exact duplicates and suspected matches across sources with explainable confidence scores |
47
89
  | **PII redaction** | Names, IBANs, and addresses masked by default — opt in with `--show-pii` |
48
90
  | **Streaming** | `parse_streaming()` at 27,000+ tx/s (CAMT) and 52,000+ tx/s (PAIN.001) with bounded memory |
49
91
  | **Parallel** | `parse_files_parallel()` for multi-file batch processing across CPU cores |
50
92
  | **Secure ZIP** | `iter_secure_xml_entries()` rejects zip bombs, encrypted entries, and suspicious compression ratios |
51
93
  | **In-memory parsing** | `from_string()` and `from_bytes()` parse XML without touching disk |
52
94
  | **Export** | CSV, JSON, Excel (`.xlsx`), and optional Polars DataFrames |
53
- | **100% coverage** | 467 tests, 100% branch coverage, property-based fuzzing with Hypothesis |
95
+ | **100% coverage** | 672 tests, 100% branch coverage, property-based fuzzing with Hypothesis |
54
96
 
55
97
  ## Requirements
56
98
 
57
- - Python **3.9** through **3.14**
99
+ - Python **3.10** through **3.14** (Python 3.9 was dropped in v0.0.6 — pin to v0.0.5 if you cannot upgrade your interpreter)
58
100
  - Poetry (for local development)
59
101
 
60
102
  ## Install
61
103
 
62
104
  ```bash
105
+ # Core install — deterministic parsers only (CAMT, PAIN.001, CSV, OFX, QFX, MT940)
63
106
  pip install bankstatementparser
107
+
108
+ # Add the text-LLM path for digital PDFs (litellm + pypdf)
109
+ pip install 'bankstatementparser[hybrid]'
110
+
111
+ # Add higher-fidelity table extraction (adds pdfplumber)
112
+ pip install 'bankstatementparser[hybrid-plus]'
113
+
114
+ # Add the multimodal vision path for scanned/photocopied PDFs (adds pypdfium2)
115
+ pip install 'bankstatementparser[hybrid-vision]'
64
116
  ```
65
117
 
118
+ The core install has zero AI dependencies. Every `[hybrid*]` extra is opt-in and pure-Python — no `poppler`, no system libraries, no GPU required.
119
+
66
120
  ### Local Development
67
121
 
68
122
  Clone and install on **macOS, Linux, or WSL**:
@@ -74,6 +128,7 @@ python3 -m venv .venv
74
128
  source .venv/bin/activate
75
129
  pip install poetry
76
130
  poetry install --with dev
131
+ make install-hooks # pre-commit hook runs `make verify` before every commit
77
132
  ```
78
133
 
79
134
  ## Quick Start
@@ -123,6 +178,37 @@ records = parser.parse()
123
178
 
124
179
  Works with `.xml`, `.csv`, `.ofx`, `.qfx`, and `.mt940` files.
125
180
 
181
+ ### Hybrid extraction (PDFs included) *(v0.0.5)*
182
+
183
+ `smart_ingest()` is the single entry point that routes any file through the cheapest viable extraction path:
184
+
185
+ ```python
186
+ from bankstatementparser.hybrid import smart_ingest
187
+
188
+ # Path A — deterministic parser (free, fastest, $0)
189
+ result = smart_ingest("statement.xml")
190
+ print(result.source_method) # "deterministic"
191
+
192
+ # Path B — text-LLM for digital PDFs (set BSP_HYBRID_MODEL=ollama/llama3)
193
+ result = smart_ingest("statement.pdf")
194
+ print(result.source_method) # "llm"
195
+ print(result.verification.status) # VERIFIED | DISCREPANCY | FAILED
196
+
197
+ # Path C — multimodal vision for scanned PDFs (set BSP_HYBRID_VISION_MODEL)
198
+ # auto-routed when pypdf cannot extract enough text
199
+ result = smart_ingest("scan.pdf")
200
+ print(result.source_method) # "vision"
201
+ ```
202
+
203
+ Every row carries:
204
+
205
+ - `source_method` — `"deterministic"`, `"llm"`, or `"vision"` for full audit provenance
206
+ - `transaction_hash` — MD5 fingerprint of `date | normalized_description | amount`, ready for idempotent re-ingestion
207
+ - `confidence` — float between 0 and 1 for LLM rows, `None` for deterministic
208
+ - `raw_source_text` — best-effort source-text slice for the v0.0.6 review-mode UI
209
+
210
+ A complete walkthrough with synthetic UK-bank PDFs, mock vs. live mode, and a Mermaid flow diagram lives in [`examples/hybrid/README.md`](examples/hybrid/README.md).
211
+
126
212
  ### Parse from memory (no disk I/O)
127
213
 
128
214
  ```python
@@ -210,18 +296,28 @@ Uses `ProcessPoolExecutor` to bypass the GIL. Each file is parsed in its own wor
210
296
 
211
297
  ## Command Line
212
298
 
299
+ After installation a `bankstatementparser` console script is available on `PATH`:
300
+
213
301
  ```bash
214
302
  # Parse and display
215
- python -m bankstatementparser.cli --type camt --input statement.xml
303
+ bankstatementparser --type camt --input statement.xml
216
304
 
217
305
  # Export to CSV
218
- python -m bankstatementparser.cli --type camt --input statement.xml --output transactions.csv
306
+ bankstatementparser --type camt --input statement.xml --output transactions.csv
219
307
 
220
308
  # Stream with PII visible
221
- python -m bankstatementparser.cli --type camt --input statement.xml --streaming --show-pii
309
+ bankstatementparser --type camt --input statement.xml --streaming --show-pii
310
+
311
+ # v0.0.5 — hybrid pipeline (auto-routes deterministic / text-LLM / vision)
312
+ bankstatementparser --type ingest --input statement.pdf
313
+ bankstatementparser --type ingest --input statement.pdf --output ledger.csv
314
+
315
+ # v0.0.6 — interactive review of saved IngestResult JSON
316
+ bankstatementparser --type review --input result.json
317
+ bankstatementparser --type review --input result.json --output reviewed.json
222
318
  ```
223
319
 
224
- Supports `--type camt` and `--type pain001`.
320
+ Supports `--type camt`, `--type pain001`, `--type ingest` (v0.0.5), and `--type review` (v0.0.6). The `python -m bankstatementparser.cli ...` invocation form continues to work for parity with older releases.
225
321
 
226
322
  ## Deduplication
227
323
 
@@ -270,7 +366,9 @@ Install with `pip install bankstatementparser[polars]`.
270
366
 
271
367
  ## Examples
272
368
 
273
- See [`examples/`](examples/README.md) for 14 runnable scripts:
369
+ See [`examples/`](examples/README.md) for 22 runnable scripts (14 deterministic + 8 hybrid):
370
+
371
+ ### Deterministic parsers
274
372
 
275
373
  | Example | What it demonstrates |
276
374
  |---|---|
@@ -289,6 +387,20 @@ See [`examples/`](examples/README.md) for 14 runnable scripts:
289
387
  | `compatibility_wrappers.py` | Legacy API wrappers |
290
388
  | `cli_examples.sh` | CLI commands for CAMT and PAIN.001 |
291
389
 
390
+ ### Hybrid pipeline *(v0.0.5)*
391
+
392
+ | Example | What it demonstrates |
393
+ |---|---|
394
+ | `hybrid/generate_sample_pdfs.py` | Produce reproducible synthetic UK-bank PDFs (digital + scanned) |
395
+ | `hybrid/01_smart_ingest_deterministic.py` | Path A — `smart_ingest()` against a CAMT.053 fixture, $0 cost |
396
+ | `hybrid/02_smart_ingest_text_llm.py` | Path B — text-LLM extraction from a digital PDF (mock or live Ollama) |
397
+ | `hybrid/03_smart_ingest_vision.py` | Path C — multimodal vision extraction with `LOW_TEXT_DENSITY` auto-routing |
398
+ | `hybrid/04_golden_rule.py` | All three `verify_balance()` outcomes |
399
+ | `hybrid/05_dedupe_recurring.py` | `normalize_description()` + `dedupe_by_hash()` for idempotent batching |
400
+ | `hybrid/06_cli_walkthrough.sh` | Four flavours of the new `--type ingest` CLI subcommand |
401
+
402
+ See [`examples/hybrid/README.md`](examples/hybrid/README.md) for the full walkthrough including a Mermaid flow diagram, the cross-platform verification matrix, and the Ollama smoke-test results.
403
+
292
404
  ## XML Tag Mapping
293
405
 
294
406
  See [`docs/MAPPING.md`](docs/MAPPING.md) for a complete reference of ISO 20022 XML tags to DataFrame columns across all six formats. Use this when integrating with ERP systems or building reconciliation pipelines.
@@ -296,11 +408,12 @@ See [`docs/MAPPING.md`](docs/MAPPING.md) for a complete reference of ISO 20022 X
296
408
  ## Project Layout
297
409
 
298
410
  ```text
299
- bankstatementparser/ Source code (13 modules, 100% branch coverage)
300
- docs/compliance/ ISO 13485 validation, risk register, traceability
301
- examples/ 14 runnable example scripts
411
+ bankstatementparser/ Source code (24 modules: deterministic core + hybrid + enrichment subpackages, 100% branch coverage)
412
+ bankstatementparser/hybrid/ PDF pipeline: orchestrator, llm_extractor, vision, pdf_text, prompts, verification, ollama_direct
413
+ docs/compliance/ ISO 13485 validation, risk register, traceability matrix
414
+ examples/ 14 deterministic + 8 hybrid runnable example scripts
302
415
  scripts/ SBOM generation, checksums, signature verification
303
- tests/ 467 tests (unit, integration, property-based, security)
416
+ tests/ 672 tests (unit, integration, property-based, security, hybrid mocks)
304
417
  ```
305
418
 
306
419
  ## Security
@@ -1,40 +1,87 @@
1
1
  # Bank Statement Parser
2
2
 
3
- Parse bank statements across six formats CAMT, PAIN.001, CSV, OFX/QFX, and MT940 — into structured DataFrames. Process ZIP archives safely. Redact PII by default. Stream files of any size.
3
+ Parse bank statements across **six structured formats** (CAMT, PAIN.001, CSV, OFX/QFX, MT940) **and PDFs** — both digital and scanned — into a single unified `Transaction` model. ISO 20022 files take the deterministic path; PDFs fall through to a configurable LLM (Ollama by default, any LiteLLM-supported provider) and finally to a multimodal vision model for scanned/photocopied statements.
4
4
 
5
- Built for finance teams, treasury analysts, and fintech developers who need reliable, auditable extraction from ISO 20022 and legacy banking formats without sending data to external services.
5
+ Built for finance teams, treasury analysts, and fintech developers who need reliable, auditable extraction across the full spectrum of bank statement formats without sending data to external services unless they explicitly opt in.
6
6
 
7
- [![PyPI](https://img.shields.io/pypi/pyversions/bankstatementparser.svg?style=for-the-badge)](https://pypi.org/project/bankstatementparser/)
7
+ [![PyPI](https://img.shields.io/pypi/pyversions/bankstatementparser.svg?style=for-the-badge&v=0.0.7)](https://pypi.org/project/bankstatementparser/)
8
8
  [![PyPI Downloads](https://img.shields.io/pypi/dm/bankstatementparser.svg?style=for-the-badge)](https://pypi.org/project/bankstatementparser/)
9
9
  [![Codecov](https://img.shields.io/codecov/c/github/sebastienrousseau/bankstatementparser?style=for-the-badge)](https://codecov.io/github/sebastienrousseau/bankstatementparser?branch=main)
10
10
  [![License](https://img.shields.io/github/license/sebastienrousseau/bankstatementparser?style=for-the-badge)](LICENSE)
11
11
 
12
+ ## How it works
13
+
14
+ `smart_ingest()` routes any input file through the cheapest viable extraction path. Deterministic parsers always run first ($0 cost). Text and vision LLMs are fallbacks for unstandardized PDFs — both are opt-in via separate install extras and can be swapped between any LiteLLM-supported provider (Ollama, Anthropic, OpenAI, Gemini, …).
15
+
16
+ ```mermaid
17
+ flowchart TD
18
+ A[smart_ingest&lpar;path&rpar;] --> B{detect_statement_format}
19
+ B -- CAMT/PAIN/OFX/MT940/CSV --> C[Path A: deterministic parser<br/>$0, fastest]
20
+ C --> Z[IngestResult<br/>source_method='deterministic']
21
+
22
+ B -- pdf or unknown --> D[pypdf extract_text]
23
+ D --> E{text len &gt;= 50?}
24
+
25
+ E -- yes --> F[Path B: text-LLM<br/>default ollama/llama3]
26
+ F --> Y[IngestResult<br/>source_method='llm']
27
+
28
+ E -- no --> G[Path C: vision-LLM<br/>opt-in via BSP_HYBRID_VISION_MODEL]
29
+ G --> X[IngestResult<br/>source_method='vision']
30
+
31
+ Z --> V[verify_balance<br/>Golden Rule]
32
+ Y --> V
33
+ X --> V
34
+ V --> R[VERIFIED / DISCREPANCY / FAILED]
35
+ ```
36
+
37
+ Every extracted row carries an immutable `transaction_hash`, an audit-trail `source_method` tag, and (for LLM rows) a `confidence` score — see [Hybrid extraction](#hybrid-extraction-pdfs-included-v005) below for the full surface.
38
+
12
39
  ## Key Features
13
40
 
14
41
  | Feature | Description |
15
42
  |---|---|
16
- | **6 formats** | CAMT.053, PAIN.001, CSV, OFX, QFX, MT940 |
43
+ | **6 structured formats** | CAMT.053, PAIN.001, CSV, OFX, QFX, MT940 |
44
+ | **Hybrid PDF pipeline** *(v0.0.5)* | `smart_ingest()` routes digital PDFs through a text-LLM and scanned PDFs through a multimodal vision model. Deterministic parsers are always tried first ($0 cost). |
45
+ | **Local-first LLM** *(v0.0.5)* | Ollama is the default backend; switch to Anthropic, OpenAI, or any LiteLLM provider via `BSP_HYBRID_MODEL`. Vision is opt-in via `BSP_HYBRID_VISION_MODEL` — no surprise downloads. |
46
+ | **Golden Rule verification** *(v0.0.5)* | Every result carries `opening + credits − debits == closing` status: `VERIFIED`, `DISCREPANCY`, or `FAILED`. |
47
+ | **Idempotent dedup** *(v0.0.5)* | Every `Transaction` carries a stable `transaction_hash` (MD5 of date + normalized description + amount). `Deduplicator.dedupe_by_hash()` makes incremental ingestion safe to re-run. |
48
+ | **Categorization** *(v0.0.6)* | `bankstatementparser.enrichment.Categorizer` tags transactions with a pluggable category schema (Plaid 13-category default) and an optional `is_business_expense` flag. Wrapper model — never mutates the original `Transaction`. |
49
+ | **Interactive review** *(v0.0.6)* | `--type review` CLI walks through discrepancies with accept/edit/skip/delete/quit. `IngestResult.to_json()` / `.from_json()` for stable round-trip with embedded audit trail. |
50
+ | **Bounding boxes** *(v0.0.6)* | `Transaction.source_bbox` carries per-row normalized coordinates from the vision path for downstream review UIs. |
51
+ | **Direct Ollama bridge** *(v0.0.7)* | Auto-bypasses the upstream LiteLLM ↔ Ollama hang on long vision prompts. `ollama/minicpm-v` recommended over `ollama/llava` for document OCR. |
52
+ | **Strip mode** *(v0.0.7)* | `VisionExtractor(strip_rows=True)` splits dense pages into overlapping bands for small local models — fixes sign-flip errors and improves accuracy on 15+ row statements. |
17
53
  | **Auto-detection** | `detect_statement_format()` identifies the format; `create_parser()` returns the right parser |
18
- | **Deduplication** | `Deduplicator` detects exact duplicates and suspected matches across sources with explainable confidence scores |
19
54
  | **PII redaction** | Names, IBANs, and addresses masked by default — opt in with `--show-pii` |
20
55
  | **Streaming** | `parse_streaming()` at 27,000+ tx/s (CAMT) and 52,000+ tx/s (PAIN.001) with bounded memory |
21
56
  | **Parallel** | `parse_files_parallel()` for multi-file batch processing across CPU cores |
22
57
  | **Secure ZIP** | `iter_secure_xml_entries()` rejects zip bombs, encrypted entries, and suspicious compression ratios |
23
58
  | **In-memory parsing** | `from_string()` and `from_bytes()` parse XML without touching disk |
24
59
  | **Export** | CSV, JSON, Excel (`.xlsx`), and optional Polars DataFrames |
25
- | **100% coverage** | 467 tests, 100% branch coverage, property-based fuzzing with Hypothesis |
60
+ | **100% coverage** | 672 tests, 100% branch coverage, property-based fuzzing with Hypothesis |
26
61
 
27
62
  ## Requirements
28
63
 
29
- - Python **3.9** through **3.14**
64
+ - Python **3.10** through **3.14** (Python 3.9 was dropped in v0.0.6 — pin to v0.0.5 if you cannot upgrade your interpreter)
30
65
  - Poetry (for local development)
31
66
 
32
67
  ## Install
33
68
 
34
69
  ```bash
70
+ # Core install — deterministic parsers only (CAMT, PAIN.001, CSV, OFX, QFX, MT940)
35
71
  pip install bankstatementparser
72
+
73
+ # Add the text-LLM path for digital PDFs (litellm + pypdf)
74
+ pip install 'bankstatementparser[hybrid]'
75
+
76
+ # Add higher-fidelity table extraction (adds pdfplumber)
77
+ pip install 'bankstatementparser[hybrid-plus]'
78
+
79
+ # Add the multimodal vision path for scanned/photocopied PDFs (adds pypdfium2)
80
+ pip install 'bankstatementparser[hybrid-vision]'
36
81
  ```
37
82
 
83
+ The core install has zero AI dependencies. Every `[hybrid*]` extra is opt-in and pure-Python — no `poppler`, no system libraries, no GPU required.
84
+
38
85
  ### Local Development
39
86
 
40
87
  Clone and install on **macOS, Linux, or WSL**:
@@ -46,6 +93,7 @@ python3 -m venv .venv
46
93
  source .venv/bin/activate
47
94
  pip install poetry
48
95
  poetry install --with dev
96
+ make install-hooks # pre-commit hook runs `make verify` before every commit
49
97
  ```
50
98
 
51
99
  ## Quick Start
@@ -95,6 +143,37 @@ records = parser.parse()
95
143
 
96
144
  Works with `.xml`, `.csv`, `.ofx`, `.qfx`, and `.mt940` files.
97
145
 
146
+ ### Hybrid extraction (PDFs included) *(v0.0.5)*
147
+
148
+ `smart_ingest()` is the single entry point that routes any file through the cheapest viable extraction path:
149
+
150
+ ```python
151
+ from bankstatementparser.hybrid import smart_ingest
152
+
153
+ # Path A — deterministic parser (free, fastest, $0)
154
+ result = smart_ingest("statement.xml")
155
+ print(result.source_method) # "deterministic"
156
+
157
+ # Path B — text-LLM for digital PDFs (set BSP_HYBRID_MODEL=ollama/llama3)
158
+ result = smart_ingest("statement.pdf")
159
+ print(result.source_method) # "llm"
160
+ print(result.verification.status) # VERIFIED | DISCREPANCY | FAILED
161
+
162
+ # Path C — multimodal vision for scanned PDFs (set BSP_HYBRID_VISION_MODEL)
163
+ # auto-routed when pypdf cannot extract enough text
164
+ result = smart_ingest("scan.pdf")
165
+ print(result.source_method) # "vision"
166
+ ```
167
+
168
+ Every row carries:
169
+
170
+ - `source_method` — `"deterministic"`, `"llm"`, or `"vision"` for full audit provenance
171
+ - `transaction_hash` — MD5 fingerprint of `date | normalized_description | amount`, ready for idempotent re-ingestion
172
+ - `confidence` — float between 0 and 1 for LLM rows, `None` for deterministic
173
+ - `raw_source_text` — best-effort source-text slice for the v0.0.6 review-mode UI
174
+
175
+ A complete walkthrough with synthetic UK-bank PDFs, mock vs. live mode, and a Mermaid flow diagram lives in [`examples/hybrid/README.md`](examples/hybrid/README.md).
176
+
98
177
  ### Parse from memory (no disk I/O)
99
178
 
100
179
  ```python
@@ -182,18 +261,28 @@ Uses `ProcessPoolExecutor` to bypass the GIL. Each file is parsed in its own wor
182
261
 
183
262
  ## Command Line
184
263
 
264
+ After installation a `bankstatementparser` console script is available on `PATH`:
265
+
185
266
  ```bash
186
267
  # Parse and display
187
- python -m bankstatementparser.cli --type camt --input statement.xml
268
+ bankstatementparser --type camt --input statement.xml
188
269
 
189
270
  # Export to CSV
190
- python -m bankstatementparser.cli --type camt --input statement.xml --output transactions.csv
271
+ bankstatementparser --type camt --input statement.xml --output transactions.csv
191
272
 
192
273
  # Stream with PII visible
193
- python -m bankstatementparser.cli --type camt --input statement.xml --streaming --show-pii
274
+ bankstatementparser --type camt --input statement.xml --streaming --show-pii
275
+
276
+ # v0.0.5 — hybrid pipeline (auto-routes deterministic / text-LLM / vision)
277
+ bankstatementparser --type ingest --input statement.pdf
278
+ bankstatementparser --type ingest --input statement.pdf --output ledger.csv
279
+
280
+ # v0.0.6 — interactive review of saved IngestResult JSON
281
+ bankstatementparser --type review --input result.json
282
+ bankstatementparser --type review --input result.json --output reviewed.json
194
283
  ```
195
284
 
196
- Supports `--type camt` and `--type pain001`.
285
+ Supports `--type camt`, `--type pain001`, `--type ingest` (v0.0.5), and `--type review` (v0.0.6). The `python -m bankstatementparser.cli ...` invocation form continues to work for parity with older releases.
197
286
 
198
287
  ## Deduplication
199
288
 
@@ -242,7 +331,9 @@ Install with `pip install bankstatementparser[polars]`.
242
331
 
243
332
  ## Examples
244
333
 
245
- See [`examples/`](examples/README.md) for 14 runnable scripts:
334
+ See [`examples/`](examples/README.md) for 22 runnable scripts (14 deterministic + 8 hybrid):
335
+
336
+ ### Deterministic parsers
246
337
 
247
338
  | Example | What it demonstrates |
248
339
  |---|---|
@@ -261,6 +352,20 @@ See [`examples/`](examples/README.md) for 14 runnable scripts:
261
352
  | `compatibility_wrappers.py` | Legacy API wrappers |
262
353
  | `cli_examples.sh` | CLI commands for CAMT and PAIN.001 |
263
354
 
355
+ ### Hybrid pipeline *(v0.0.5)*
356
+
357
+ | Example | What it demonstrates |
358
+ |---|---|
359
+ | `hybrid/generate_sample_pdfs.py` | Produce reproducible synthetic UK-bank PDFs (digital + scanned) |
360
+ | `hybrid/01_smart_ingest_deterministic.py` | Path A — `smart_ingest()` against a CAMT.053 fixture, $0 cost |
361
+ | `hybrid/02_smart_ingest_text_llm.py` | Path B — text-LLM extraction from a digital PDF (mock or live Ollama) |
362
+ | `hybrid/03_smart_ingest_vision.py` | Path C — multimodal vision extraction with `LOW_TEXT_DENSITY` auto-routing |
363
+ | `hybrid/04_golden_rule.py` | All three `verify_balance()` outcomes |
364
+ | `hybrid/05_dedupe_recurring.py` | `normalize_description()` + `dedupe_by_hash()` for idempotent batching |
365
+ | `hybrid/06_cli_walkthrough.sh` | Four flavours of the new `--type ingest` CLI subcommand |
366
+
367
+ See [`examples/hybrid/README.md`](examples/hybrid/README.md) for the full walkthrough including a Mermaid flow diagram, the cross-platform verification matrix, and the Ollama smoke-test results.
368
+
264
369
  ## XML Tag Mapping
265
370
 
266
371
  See [`docs/MAPPING.md`](docs/MAPPING.md) for a complete reference of ISO 20022 XML tags to DataFrame columns across all six formats. Use this when integrating with ERP systems or building reconciliation pipelines.
@@ -268,11 +373,12 @@ See [`docs/MAPPING.md`](docs/MAPPING.md) for a complete reference of ISO 20022 X
268
373
  ## Project Layout
269
374
 
270
375
  ```text
271
- bankstatementparser/ Source code (13 modules, 100% branch coverage)
272
- docs/compliance/ ISO 13485 validation, risk register, traceability
273
- examples/ 14 runnable example scripts
376
+ bankstatementparser/ Source code (24 modules: deterministic core + hybrid + enrichment subpackages, 100% branch coverage)
377
+ bankstatementparser/hybrid/ PDF pipeline: orchestrator, llm_extractor, vision, pdf_text, prompts, verification, ollama_direct
378
+ docs/compliance/ ISO 13485 validation, risk register, traceability matrix
379
+ examples/ 14 deterministic + 8 hybrid runnable example scripts
274
380
  scripts/ SBOM generation, checksums, signature verification
275
- tests/ 467 tests (unit, integration, property-based, security)
381
+ tests/ 672 tests (unit, integration, property-based, security, hybrid mocks)
276
382
  ```
277
383
 
278
384
  ## Security
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2023 Sebastien Rousseau.
1
+ # Copyright (C) 2023-2026 Bank Statement Parser. All rights reserved.
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -46,7 +46,7 @@ from .transaction_deduplicator import (
46
46
  ExactDuplicateGroup,
47
47
  MatchGroup,
48
48
  )
49
- from .transaction_models import Transaction
49
+ from .transaction_models import BoundingBox, Transaction
50
50
  from .zip_security import (
51
51
  ZipSecurityError,
52
52
  ZipXMLSource,
@@ -67,6 +67,7 @@ __all__ = [
67
67
  "Pain001Parser",
68
68
  "Pain001ParseError",
69
69
  "ParserError",
70
+ "BoundingBox",
70
71
  "Transaction",
71
72
  "Deduplicator",
72
73
  "DeduplicationResult",
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2023 Sebastien Rousseau.
1
+ # Copyright (C) 2023-2026 Bank Statement Parser. All rights reserved.
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2023 Sebastien Rousseau.
1
+ # Copyright (C) 2023-2026 Bank Statement Parser. All rights reserved.
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2023 Sebastien Rousseau.
1
+ # Copyright (C) 2023-2026 Bank Statement Parser. All rights reserved.
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -546,6 +546,7 @@ class CamtParser(BankStatementParser):
546
546
  booking_dates,
547
547
  debtor_addresses,
548
548
  creditor_addresses,
549
+ strict=False,
549
550
  )
550
551
  ):
551
552
  # Apply debit sign adjustment