bankstatementparser 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,196 @@
1
+ # Copyright (C) 2023 Sebastien Rousseau.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
12
+ # implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Deterministic transaction models used across parser outputs."""
17
+
18
+ from __future__ import annotations
19
+
20
+ import re
21
+ from collections.abc import Mapping
22
+ from datetime import date, datetime
23
+ from decimal import Decimal
24
+ from typing import Optional
25
+
26
+ from pydantic import BaseModel, ConfigDict, Field
27
+
28
+
29
def _coerce_decimal(value: object) -> Decimal:
    """Convert a raw amount value into a ``Decimal``.

    The value is stringified and stripped before conversion, so numbers and
    padded strings are parsed from their textual form.

    Raises:
        ValueError: If ``value`` is ``None`` or blank.
        decimal.InvalidOperation: If the text is not a valid decimal number
            (raised by ``Decimal`` itself).
    """
    # ``str(None)`` is the literal "None", which would slip past the blank
    # check below and fail inside ``Decimal`` with an unrelated error.
    if value is None:
        raise ValueError("amount is required")
    text = str(value).strip()
    if not text:
        raise ValueError("amount is required")
    return Decimal(text)
34
+
35
+
36
def _parse_date(value: object) -> date | None:
    """Turn a loosely typed date value into a ``date``.

    ``None`` and blank input give ``None``. A plain ``date`` is returned
    unchanged. Anything else is stringified, trimmed and cut to its first
    ten characters, then matched against the known layouts in order. If no
    layout fits, the untouched string form is handed to
    ``datetime.fromisoformat`` as a last resort.

    Raises:
        ValueError: If no supported layout matches.
    """
    if value in (None, ""):
        return None
    # ``datetime`` subclasses ``date``; only genuine dates pass through,
    # datetimes are reduced to their date part via the string path below.
    if isinstance(value, date) and not isinstance(value, datetime):
        return value

    # Slicing is a no-op for strings shorter than ten characters.
    candidate = str(value).strip()[:10]
    if not candidate:
        return None

    known_layouts = ("%Y-%m-%d", "%Y%m%d", "%d/%m/%Y", "%Y/%m/%d")
    for layout in known_layouts:
        try:
            parsed = datetime.strptime(candidate, layout)
        except ValueError:
            continue
        return parsed.date()

    try:
        fallback = datetime.fromisoformat(str(value))
    except ValueError as exc:
        raise ValueError(f"unsupported date format: {value}") from exc
    return fallback.date()
58
+
59
+
60
def normalize_description(value: str | None) -> str:
    """Return a canonical, comparison-friendly form of a description.

    The text is lower-cased, every character other than ``a-z``, ``0-9``
    and space is removed, and whitespace runs are collapsed to single
    spaces with no leading or trailing space. ``None`` yields ``""``.
    """
    if value is None:
        return ""
    collapsed = re.sub(r"\s+", " ", value).strip().lower()
    cleaned = re.sub(r"[^a-z0-9 ]+", "", collapsed)
    # Dropping punctuation can leave doubled or edge spaces behind
    # (e.g. "acme - corp" -> "acme  corp"), so collapse once more to keep
    # equivalent descriptions identical.
    return re.sub(r" {2,}", " ", cleaned).strip()
65
+
66
+
67
def _first_value(
    record: Mapping[str, object], *keys: str
) -> object | None:
    """Return the first usable value stored under any of ``keys``.

    Keys are tried in the order given. A key is skipped when it is absent
    from ``record`` or when its value is ``None`` or an empty string.
    ``None`` is returned when no key yields a usable value.
    """
    for name in keys:
        if name not in record:
            continue
        candidate = record[name]
        if candidate in (None, ""):
            continue
        return candidate
    return None
74
+
75
+
76
class Transaction(BaseModel):
    """Normalized transaction model for deterministic downstream logic.

    Instances are immutable (``frozen=True``). ``amount`` is the only
    required field; every other field defaults to ``None``, except
    ``normalized_description`` which defaults to ``""``.
    """

    model_config = ConfigDict(frozen=True)

    account_id: Optional[str] = None
    currency: Optional[str] = None
    amount: Decimal
    booking_date: Optional[date] = None
    value_date: Optional[date] = None
    description: Optional[str] = None
    normalized_description: str = Field(default="")
    reference: Optional[str] = None
    transaction_id: Optional[str] = None
    counterparty: Optional[str] = None
    source: Optional[str] = None
    source_index: Optional[int] = None

    @classmethod
    def from_record(
        cls,
        record: Mapping[str, object],
        *,
        source: str | None = None,
        source_index: int | None = None,
    ) -> Transaction:
        """Create a normalized transaction from parser output.

        Each field is filled from the first key in its alias list that
        holds a value other than ``None`` or ``""``. Text fields are
        stringified, the currency is upper-cased, and both dates fall back
        to a generic ``"date"`` key.

        Args:
            record: Mapping produced by a parser for one transaction.
            source: Optional label stored unchanged on the result.
            source_index: Optional position stored unchanged on the result.

        Raises:
            ValueError: If no amount is present or a date cannot be parsed.
        """

        def _text(value: object | None) -> str | None:
            # Keep "missing" as None instead of the literal string "None".
            return None if value is None else str(value)

        description = _text(
            _first_value(
                record,
                "description",
                "Description",
                "RmtInf",
                "Reference",
                "reference",
                "Memo",
                "memo",
                "Name",
                "CdtrNm",
            )
        )
        reference = _text(
            _first_value(
                record,
                "Reference",
                "reference",
                "RmtInf",
                "transaction_id",
                "transactionId",
                "EndToEndId",
                "FITID",
            )
        )
        counterparty = _text(
            _first_value(
                record,
                "Creditor",
                "Debtor",
                "CdtrNm",
                "Name",
                "payee",
                "counterparty",
            )
        )
        # The amount is resolved before the dates so that a missing amount
        # is reported ahead of any date-format problem.
        amount = _coerce_decimal(
            _first_value(record, "Amount", "amount", "InstdAmt")
        )
        account_id = _text(
            _first_value(
                record,
                "AccountId",
                "account_id",
                "DbtrIBAN",
                "CreditorAccount",
            )
        )
        currency = _text(_first_value(record, "Currency", "currency"))
        # Looked up once; both "TransactionId" and "transactionId" are
        # accepted because the reference lookup above already recognises
        # the camelCase spelling.
        transaction_id = _text(
            _first_value(
                record,
                "transaction_id",
                "TransactionId",
                "transactionId",
                "FITID",
                "EndToEndId",
            )
        )

        return cls(
            account_id=account_id,
            currency=currency.upper() if currency is not None else None,
            amount=amount,
            booking_date=_parse_date(
                _first_value(record, "BookgDt", "booking_date", "date")
            ),
            value_date=_parse_date(
                _first_value(record, "ValDt", "value_date", "date")
            ),
            description=description,
            normalized_description=normalize_description(description),
            reference=reference,
            transaction_id=transaction_id,
            counterparty=counterparty,
            source=source,
            source_index=source_index,
        )

    def amount_key(self) -> str:
        """Return a stable amount key for hashing and comparisons.

        The amount is normalized (trailing zeros dropped) and rendered in
        fixed-point notation, so ``1.50`` and ``1.5`` share the key
        ``"1.5"``.
        """
        normalized = self.amount.normalize()
        # Decimal keeps the sign of zero, so "-0.00" would otherwise yield
        # "-0" and differ from the key of an equal "0.00" amount.
        if normalized.is_zero():
            normalized = normalized.copy_abs()
        return format(normalized, "f")
@@ -0,0 +1,141 @@
1
+ """
2
+ Secure helpers for reading XML bank statement files from ZIP archives.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from collections.abc import Generator
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+ from zipfile import BadZipFile, ZipFile, ZipInfo
11
+
12
+ from .input_validator import InputValidator, ValidationError
13
+
14
+
15
+ @dataclass(frozen=True)
16
+ class ZipXMLSource:
17
+ """Validated XML payload extracted from a ZIP archive."""
18
+
19
+ source_name: str
20
+ xml_bytes: bytes
21
+
22
+
23
+ class ZipSecurityError(ValidationError):
24
+ """Raised when a ZIP archive or ZIP member violates security policy."""
25
+
26
+
27
+ def iter_secure_xml_entries(
28
+ zip_path: str | Path,
29
+ *,
30
+ max_entry_size: int = 10 * 1024 * 1024,
31
+ max_total_uncompressed_size: int = 50 * 1024 * 1024,
32
+ max_compression_ratio: float = 100.0,
33
+ ) -> Generator[ZipXMLSource, None, None]:
34
+ """
35
+ Yield validated XML members from a ZIP archive.
36
+
37
+ This helper is intentionally strict because ZIP archives may come from
38
+ untrusted banks, middleware, or user uploads.
39
+ """
40
+ if max_entry_size <= 0:
41
+ raise ZipSecurityError(
42
+ "max_entry_size must be greater than zero"
43
+ )
44
+ if max_total_uncompressed_size <= 0:
45
+ raise ZipSecurityError(
46
+ "max_total_uncompressed_size must be greater than zero"
47
+ )
48
+ if max_compression_ratio <= 0:
49
+ raise ZipSecurityError(
50
+ "max_compression_ratio must be greater than zero"
51
+ )
52
+
53
+ archive_path = Path(zip_path)
54
+ total_uncompressed_size = 0
55
+ validator = InputValidator(max_file_size=max_entry_size)
56
+
57
+ try:
58
+ with ZipFile(archive_path) as zf:
59
+ members = zf.infolist()
60
+ if not members:
61
+ raise ZipSecurityError(
62
+ "ZIP archive does not contain any entries"
63
+ )
64
+
65
+ for member in members:
66
+ if member.is_dir():
67
+ continue
68
+ if not member.filename.lower().endswith(".xml"):
69
+ continue
70
+
71
+ _validate_zip_member(
72
+ member,
73
+ max_entry_size=max_entry_size,
74
+ max_compression_ratio=max_compression_ratio,
75
+ )
76
+
77
+ total_uncompressed_size += member.file_size
78
+ if (
79
+ total_uncompressed_size
80
+ > max_total_uncompressed_size
81
+ ):
82
+ raise ZipSecurityError(
83
+ "ZIP archive exceeds the total allowed uncompressed XML size"
84
+ )
85
+
86
+ xml_bytes = zf.read(member)
87
+ try:
88
+ (
89
+ xml_bytes,
90
+ safe_name,
91
+ ) = validator.validate_xml_content(
92
+ xml_bytes, source_name=member.filename
93
+ )
94
+ except ValidationError as exc:
95
+ raise ZipSecurityError(str(exc)) from exc
96
+ yield ZipXMLSource(
97
+ source_name=safe_name,
98
+ xml_bytes=xml_bytes,
99
+ )
100
+ except BadZipFile as exc:
101
+ raise ZipSecurityError(
102
+ f"Invalid ZIP archive: {archive_path}"
103
+ ) from exc
104
+
105
+
106
+ def _validate_zip_member(
107
+ member: ZipInfo,
108
+ *,
109
+ max_entry_size: int,
110
+ max_compression_ratio: float,
111
+ ) -> None:
112
+ """Validate a ZIP member before it is read into memory."""
113
+ validator = InputValidator()
114
+ safe_name = validator.sanitize_source_name(member.filename)
115
+
116
+ if member.flag_bits & 0x1:
117
+ raise ZipSecurityError(
118
+ f"Encrypted ZIP entries are not supported: {safe_name}"
119
+ )
120
+
121
+ if member.file_size <= 0:
122
+ raise ZipSecurityError(
123
+ f"ZIP entry is empty or invalid: {safe_name}"
124
+ )
125
+
126
+ if member.file_size > max_entry_size:
127
+ raise ZipSecurityError(
128
+ f"ZIP entry exceeds the allowed uncompressed size limit: {safe_name}"
129
+ )
130
+
131
+ compressed_size = member.compress_size
132
+ if compressed_size <= 0:
133
+ raise ZipSecurityError(
134
+ f"ZIP entry has an invalid compressed size: {safe_name}"
135
+ )
136
+
137
+ compression_ratio = member.file_size / compressed_size
138
+ if compression_ratio > max_compression_ratio:
139
+ raise ZipSecurityError(
140
+ f"ZIP entry compression ratio exceeds the allowed limit: {safe_name}"
141
+ )
@@ -0,0 +1,363 @@
1
+ Metadata-Version: 2.4
2
+ Name: bankstatementparser
3
+ Version: 0.0.4
4
+ Summary: BankStatementParser is your essential tool for easy bank statement management. Designed with finance and treasury experts in mind, it offers a simple way to handle CAMT (ISO 20022) formats and more. Get quick, accurate insights from your financial data and spend less time on processing. It's the smart, hassle-free way to stay on top of your transactions.
5
+ License: Apache Software License
6
+ License-File: LICENSE
7
+ Author: Sebastien Rousseau
8
+ Author-email: sebastian.rousseau@gmail.com
9
+ Requires-Python: >=3.9
10
+ Classifier: License :: OSI Approved :: Apache Software License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Programming Language :: Python :: 3.14
18
+ Provides-Extra: polars
19
+ Requires-Dist: defusedxml (>=0.7.1,<0.8.0)
20
+ Requires-Dist: lxml (>=4.9.3)
21
+ Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
22
+ Requires-Dist: pandas (>=2.3.3,<3.0.0)
23
+ Requires-Dist: polars (>=1.32.0,<2.0.0) ; extra == "polars"
24
+ Requires-Dist: pydantic (>=2.11.0,<3.0.0)
25
+ Project-URL: Homepage, https://bankstatementparser.com
26
+ Project-URL: Repository, https://github.com/sebastienrousseau/bankstatementparser
27
+ Description-Content-Type: text/markdown
28
+
29
+ # Bank Statement Parser
30
+
31
+ Parse bank statements across six formats — CAMT, PAIN.001, CSV, OFX, QFX, and MT940 — into structured DataFrames. Process ZIP archives safely. Redact PII by default. Stream files of any size.
32
+
33
+ Built for finance teams, treasury analysts, and fintech developers who need reliable, auditable extraction from ISO 20022 and legacy banking formats without sending data to external services.
34
+
35
+ [![PyPI](https://img.shields.io/pypi/pyversions/bankstatementparser.svg?style=for-the-badge)](https://pypi.org/project/bankstatementparser/)
36
+ [![PyPI Downloads](https://img.shields.io/pypi/dm/bankstatementparser.svg?style=for-the-badge)](https://pypi.org/project/bankstatementparser/)
37
+ [![Codecov](https://img.shields.io/codecov/c/github/sebastienrousseau/bankstatementparser?style=for-the-badge)](https://codecov.io/github/sebastienrousseau/bankstatementparser?branch=main)
38
+ [![License](https://img.shields.io/github/license/sebastienrousseau/bankstatementparser?style=for-the-badge)](LICENSE)
39
+
40
+ ## Key Features
41
+
42
+ | Feature | Description |
43
+ |---|---|
44
+ | **6 formats** | CAMT.053, PAIN.001, CSV, OFX, QFX, MT940 |
45
+ | **Auto-detection** | `detect_statement_format()` identifies the format; `create_parser()` returns the right parser |
46
+ | **Deduplication** | `Deduplicator` detects exact duplicates and suspected matches across sources with explainable confidence scores |
47
+ | **PII redaction** | Names, IBANs, and addresses are masked by default — pass `--show-pii` to display them |
48
+ | **Streaming** | `parse_streaming()` at 27,000+ tx/s (CAMT) and 52,000+ tx/s (PAIN.001) with bounded memory |
49
+ | **Parallel** | `parse_files_parallel()` for multi-file batch processing across CPU cores |
50
+ | **Secure ZIP** | `iter_secure_xml_entries()` rejects zip bombs, encrypted entries, and suspicious compression ratios |
51
+ | **In-memory parsing** | `from_string()` and `from_bytes()` parse XML without touching disk |
52
+ | **Export** | CSV, JSON, Excel (`.xlsx`), and optional Polars DataFrames |
53
+ | **100% coverage** | 467 tests, 100% branch coverage, property-based fuzzing with Hypothesis |
54
+
55
+ ## Requirements
56
+
57
+ - Python **3.9** through **3.14**
58
+ - Poetry (for local development)
59
+
60
+ ## Install
61
+
62
+ ```bash
63
+ pip install bankstatementparser
64
+ ```
65
+
66
+ ### Local Development
67
+
68
+ Clone and install on **macOS, Linux, or WSL**:
69
+
70
+ ```bash
71
+ git clone https://github.com/sebastienrousseau/bankstatementparser.git
72
+ cd bankstatementparser
73
+ python3 -m venv .venv
74
+ source .venv/bin/activate
75
+ pip install poetry
76
+ poetry install --with dev
77
+ ```
78
+
79
+ ## Quick Start
80
+
81
+ ### Parse a CAMT statement
82
+
83
+ ```python
84
+ from bankstatementparser import CamtParser
85
+
86
+ parser = CamtParser("statement.xml")
87
+ transactions = parser.parse()
88
+ print(transactions)
89
+ ```
90
+
91
+ ```text
92
+ Amount Currency DrCr Debtor Creditor ValDt AccountId
93
+ 105678.5 SEK CRDT MUELLER 2010-10-18 50000000054910
94
+ -200000.0 SEK DBIT 2010-10-18 50000000054910
95
+ 30000.0 SEK CRDT 2010-10-18 50000000054910
96
+ ```
97
+
98
+ ### Parse a PAIN.001 payment file
99
+
100
+ ```python
101
+ from bankstatementparser import Pain001Parser
102
+
103
+ parser = Pain001Parser("payment.xml")
104
+ payments = parser.parse()
105
+ print(payments)
106
+ ```
107
+
108
+ ```text
109
+ PmtInfId PmtMtd InstdAmt Currency CdtrNm EndToEndId
110
+ PMT-001 TRF 1500.00 EUR ACME Corp E2E-001
111
+ PMT-001 TRF 2300.50 EUR Global Ltd E2E-002
112
+ ```
113
+
114
+ ### Auto-detect the format
115
+
116
+ ```python
117
+ from bankstatementparser import create_parser, detect_statement_format
118
+
119
+ fmt = detect_statement_format("transactions.ofx")
120
+ parser = create_parser("transactions.ofx", fmt)
121
+ records = parser.parse()
122
+ ```
123
+
124
+ Works with `.xml`, `.csv`, `.ofx`, `.qfx`, and `.mt940` files.
125
+
126
+ ### Parse from memory (no disk I/O)
127
+
128
+ ```python
129
+ from bankstatementparser import CamtParser
130
+
131
+ xml_bytes = download_from_sftp() # your own function
132
+ parser = CamtParser.from_bytes(xml_bytes, source_name="daily.xml")
133
+ transactions = parser.parse()
134
+ ```
135
+
136
+ Pass only decompressed XML to `from_string()` or `from_bytes()`. For ZIP archives, use `iter_secure_xml_entries()`.
137
+
138
+ ### Parse XML files inside a ZIP archive
139
+
140
+ ```python
141
+ from bankstatementparser import CamtParser, iter_secure_xml_entries
142
+
143
+ for entry in iter_secure_xml_entries("statements.zip"):
144
+ parser = CamtParser.from_bytes(entry.xml_bytes, source_name=entry.source_name)
145
+ transactions = parser.parse()
146
+ print(entry.source_name, len(transactions), "transactions")
147
+ ```
148
+
149
+ The iterator enforces size limits, blocks encrypted entries, and rejects suspicious compression ratios before any XML parsing occurs.
150
+
151
+ ## PII Redaction
152
+
153
+ PII (names, IBANs, addresses) is **redacted by default** in console output and streaming mode.
154
+
155
+ ```python
156
+ # Redacted by default
157
+ for tx in parser.parse_streaming(redact_pii=True):
158
+ print(tx) # Names and addresses show as ***REDACTED***
159
+
160
+ # Opt in to see full data
161
+ for tx in parser.parse_streaming(redact_pii=False):
162
+ print(tx)
163
+ ```
164
+
165
+ File exports (CSV, JSON, Excel) always contain the full unredacted data.
166
+
167
+ ## Streaming
168
+
169
+ Process large files incrementally. Memory stays bounded regardless of file size — tested at 50,000 transactions with sub-2x memory scaling.
170
+
171
+ ```python
172
+ from bankstatementparser import CamtParser
173
+
174
+ parser = CamtParser("large_statement.xml")
175
+ for transaction in parser.parse_streaming():
176
+ process(transaction) # each transaction is a dict
177
+ ```
178
+
179
+ Works with both `CamtParser` and `Pain001Parser`. PAIN.001 files over 50 MB use chunk-based namespace stripping via a temporary file — the full document is never loaded into memory.
180
+
181
+ ## Performance
182
+
183
+ | Metric | CAMT | PAIN.001 |
184
+ |---|---|---|
185
+ | **Throughput** | 27,000+ tx/s | 52,000+ tx/s |
186
+ | **Per-transaction latency** | 37 us | 19 us |
187
+ | **Time to first result** | < 1 ms | < 2 ms |
188
+ | **Memory scaling** | Constant (1K–50K) | Constant (1K–50K) |
189
+
190
+ Performance is flat from 1,000 to 50,000 transactions. CI enforces minimum TPS and latency thresholds.
191
+
192
+ ## Parallel Parsing
193
+
194
+ Process multiple files simultaneously across CPU cores:
195
+
196
+ ```python
197
+ from bankstatementparser import parse_files_parallel
198
+
199
+ results = parse_files_parallel([
200
+ "statements/jan.xml",
201
+ "statements/feb.xml",
202
+ "statements/mar.xml",
203
+ ])
204
+
205
+ for r in results:
206
+ print(r.path, r.status, len(r.transactions), "rows")
207
+ ```
208
+
209
+ Uses `ProcessPoolExecutor` to bypass the GIL. Each file is parsed in its own worker process. Auto-detects format per file, or force with `format_name="camt"`.
210
+
211
+ ## Command Line
212
+
213
+ ```bash
214
+ # Parse and display
215
+ python -m bankstatementparser.cli --type camt --input statement.xml
216
+
217
+ # Export to CSV
218
+ python -m bankstatementparser.cli --type camt --input statement.xml --output transactions.csv
219
+
220
+ # Stream with PII visible
221
+ python -m bankstatementparser.cli --type camt --input statement.xml --streaming --show-pii
222
+ ```
223
+
224
+ Supports `--type camt` and `--type pain001`.
225
+
226
+ ## Deduplication
227
+
228
+ Detect duplicate transactions across multiple sources:
229
+
230
+ ```python
231
+ from bankstatementparser import CamtParser, Deduplicator
232
+
233
+ parser = CamtParser("statement.xml")
234
+ dedup = Deduplicator()
235
+ result = dedup.deduplicate(dedup.from_dataframe(parser.parse()))
236
+
237
+ print(f"Unique: {len(result.unique_transactions)}")
238
+ print(f"Exact duplicates: {len(result.exact_duplicates)}")
239
+ print(f"Suspected matches: {len(result.suspected_matches)}")
240
+ ```
241
+
242
+ The `Deduplicator` uses deterministic hashing for exact matches and configurable similarity thresholds for suspected matches. Each match group includes a confidence score and reason for auditability.
243
+
244
+ ## Export
245
+
246
+ ```python
247
+ parser = CamtParser("statement.xml")
248
+ parser.parse()
249
+
250
+ # CSV
251
+ parser.export_csv("output.csv")
252
+
253
+ # JSON (includes summary + transactions)
254
+ parser.export_json("output.json")
255
+
256
+ # Excel
257
+ parser.camt_to_excel("output.xlsx")
258
+ ```
259
+
260
+ ### Polars (optional)
261
+
262
+ Convert any parser output to a Polars DataFrame:
263
+
264
+ ```python
265
+ polars_df = parser.to_polars()
266
+ lazy_df = parser.to_polars_lazy()
267
+ ```
268
+
269
+ Install with `pip install bankstatementparser[polars]`.
270
+
271
+ ## Examples
272
+
273
+ See [`examples/`](examples/README.md) for 14 runnable scripts:
274
+
275
+ | Example | What it demonstrates |
276
+ |---|---|
277
+ | `parse_camt_basic.py` | Load a CAMT.053 file and print transactions |
278
+ | `parse_camt_from_string.py` | Parse CAMT from an in-memory XML string |
279
+ | `inspect_camt.py` | Extract balances, stats, and summaries |
280
+ | `export_camt.py` | Export to CSV and JSON |
281
+ | `export_camt_excel.py` | Export to Excel workbook |
282
+ | `stream_camt.py` | Stream transactions incrementally |
283
+ | `parse_camt_zip.py` | Secure ZIP archive processing |
284
+ | `parse_detected_formats.py` | Auto-detect CSV, OFX, MT940, and XML formats |
285
+ | `parse_pain001_basic.py` | Parse a PAIN.001 payment file |
286
+ | `export_pain001.py` | Export PAIN.001 to CSV and JSON |
287
+ | `stream_pain001.py` | Stream payments incrementally |
288
+ | `validate_input.py` | Validate file paths with InputValidator |
289
+ | `compatibility_wrappers.py` | Legacy API wrappers |
290
+ | `cli_examples.sh` | CLI commands for CAMT and PAIN.001 |
291
+
292
+ ## XML Tag Mapping
293
+
294
+ See [`docs/MAPPING.md`](docs/MAPPING.md) for a complete reference of ISO 20022 XML tags to DataFrame columns across all six formats. Use this when integrating with ERP systems or building reconciliation pipelines.
295
+
296
+ ## Project Layout
297
+
298
+ ```text
299
+ bankstatementparser/ Source code (13 modules, 100% branch coverage)
300
+ docs/compliance/ ISO 13485 validation, risk register, traceability
301
+ examples/ 14 runnable example scripts
302
+ scripts/ SBOM generation, checksums, signature verification
303
+ tests/ 467 tests (unit, integration, property-based, security)
304
+ ```
305
+
306
+ ## Security
307
+
308
+ Bank statement files contain sensitive financial and personal data. This library is designed with security as a primary constraint:
309
+
310
+ - **XXE protection** — `resolve_entities=False`, `no_network=True`, `load_dtd=False`
311
+ - **ZIP bomb protection** — compression ratio limits, entry size caps, encrypted entry rejection
312
+ - **Path traversal prevention** — dangerous pattern blocklist, symlink resolution
313
+ - **PII redaction** — default masking of names, IBANs, and addresses
314
+ - **Signed commits** — enforced in CI via GitHub API verification
315
+ - **Supply chain** — SHA-256 hash-locked dependencies, CycloneDX SBOM, build provenance attestation
316
+
317
+ For vulnerability reports, see [SECURITY.md](.github/SECURITY.md).
318
+
319
+ For the full compliance suite, see [`docs/compliance/`](docs/compliance/).
320
+
321
+ ## Verify the Repository
322
+
323
+ Run the full validation suite locally:
324
+
325
+ ```bash
326
+ ruff check bankstatementparser tests examples scripts
327
+ python -m mypy bankstatementparser
328
+ python -m pytest
329
+ bandit -r bankstatementparser examples scripts -q
330
+ ```
331
+
332
+ ## Contributing
333
+
334
+ Signed commits required. See [CONTRIBUTING.md](CONTRIBUTING.md).
335
+
336
+ ## License
337
+
338
+ Apache License 2.0. See [LICENSE](LICENSE).
339
+
340
+ ## FAQ
341
+
342
+ **What formats are supported?**
343
+ CAMT.053, PAIN.001, CSV, OFX, QFX, and MT940.
344
+
345
+ **Does any data leave my infrastructure?**
346
+ No. Zero network calls. XML parsers enforce `no_network=True`. No cloud, no telemetry.
347
+
348
+ **Is PII redacted automatically?**
349
+ Yes. Names, IBANs, and addresses are masked by default in console output and streaming. File exports retain full data.
350
+
351
+ **Is the extraction deterministic?**
352
+ Yes. Same input produces byte-identical output. Critical for financial auditing.
353
+
354
+ **Can it handle large files?**
355
+ Yes. `parse_streaming()` is tested at 50,000 transactions (~25 MB) with bounded memory. Files over 50 MB use chunk-based streaming.
356
+
357
+ See [FAQ.md](FAQ.md) for the complete FAQ covering data privacy, technical specs, and treasury workflows.
358
+
359
+ ---
360
+
361
+ THE ARCHITECT ᛫ Sebastien Rousseau ᛫ https://sebastienrousseau.com
362
+ THE ENGINE ᛞ EUXIS ᛫ Enterprise Unified Execution Intelligence System ᛫ https://euxis.co
363
+