PyPI - bankstatementparser - Versions diffs - 0.0.4__py3-none-any.whl - Mend

bankstatementparser 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

bankstatementparser/__init__.py +82 -0
bankstatementparser/additional_parsers.py +376 -0
bankstatementparser/bank_statement_parsers.py +370 -0
bankstatementparser/base_parser.py +205 -0
bankstatementparser/camt_parser.py +971 -0
bankstatementparser/cli.py +575 -0
bankstatementparser/exceptions.py +36 -0
bankstatementparser/input_validator.py +628 -0
bankstatementparser/pain001_parser.py +742 -0
bankstatementparser/parallel.py +127 -0
bankstatementparser/record_types.py +94 -0
bankstatementparser/transaction_deduplicator.py +402 -0
bankstatementparser/transaction_models.py +196 -0
bankstatementparser/zip_security.py +141 -0
bankstatementparser-0.0.4.dist-info/METADATA +363 -0
bankstatementparser-0.0.4.dist-info/RECORD +18 -0
bankstatementparser-0.0.4.dist-info/WHEEL +4 -0
bankstatementparser-0.0.4.dist-info/licenses/LICENSE +203 -0

bankstatementparser/transaction_models.py ADDED Viewed

@@ -0,0 +1,196 @@
+# Copyright (C) 2023 Sebastien Rousseau.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Deterministic transaction models used across parser outputs."""
+from __future__ import annotations
+import re
+from collections.abc import Mapping
+from datetime import date, datetime
+from decimal import Decimal
+from typing import Optional
+from pydantic import BaseModel, ConfigDict, Field
+def _coerce_decimal(value: object) -> Decimal:
+    """Convert a raw amount value into a ``Decimal``.
+    The value is stringified and stripped before conversion, so numeric
+    inputs are converted through their ``str()`` form.
+    Args:
+        value: Raw amount taken from a parser record.
+    Returns:
+        The amount as a ``Decimal``.
+    Raises:
+        ValueError: If ``value`` is ``None`` or blank ("amount is required").
+        decimal.InvalidOperation: If the text is not a valid decimal number.
+    """
+    # str(None) is the truthy text "None", so a missing amount has to be
+    # rejected before stringification or the emptiness check never fires.
+    if value is None:
+        raise ValueError("amount is required")
+    text = str(value).strip()
+    if not text:
+        raise ValueError("amount is required")
+    return Decimal(text)
+def _parse_date(value: object) -> date | None:
+    """Parse a raw date value into a ``date``.
+    ``None``, ``""`` and blank text yield ``None``. A plain ``date`` is
+    returned unchanged. Anything else (including ``datetime``) is
+    stringified, and the leading date portion is matched against
+    ``YYYY-MM-DD``, ``YYYYMMDD``, ``DD/MM/YYYY`` and ``YYYY/MM/DD`` in that
+    order; trailing time or timezone text is ignored. As a last resort the
+    whole text is handed to ``datetime.fromisoformat``.
+    Args:
+        value: Raw date taken from a parser record.
+    Returns:
+        The parsed ``date``, or ``None`` when no date was supplied.
+    Raises:
+        ValueError: If the text matches none of the supported formats.
+    """
+    if value in (None, ""):
+        return None
+    if isinstance(value, date) and not isinstance(value, datetime):
+        return value
+    text = str(value).strip()
+    if not text:
+        return None
+    # Each format is matched against a prefix of its own width: separated
+    # dates occupy 10 characters, the compact form only 8. Cutting every
+    # candidate to 10 characters would leave time digits attached to
+    # compact timestamps such as "20240115120000" and reject them.
+    for fmt, width in (
+        ("%Y-%m-%d", 10),
+        ("%Y%m%d", 8),
+        ("%d/%m/%Y", 10),
+        ("%Y/%m/%d", 10),
+    ):
+        try:
+            return datetime.strptime(text[:width], fmt).date()
+        except ValueError:
+            continue
+    try:
+        return datetime.fromisoformat(text).date()
+    except ValueError as exc:
+        raise ValueError(f"unsupported date format: {value}") from exc
+def normalize_description(value: str | None) -> str:
+    """Return a lowercase, punctuation-free form of a description.
+    Whitespace runs are collapsed to single spaces and the result is
+    stripped and lowercased; every character outside ``a-z``, ``0-9`` and
+    space is then removed. Removal happens after collapsing, so a symbol
+    surrounded by spaces leaves two adjacent spaces behind. ``None`` maps
+    to the empty string.
+    """
+    if value is None:
+        return ""
+    single_spaced = re.sub(r"\s+", " ", value)
+    lowered = single_spaced.strip().lower()
+    cleaned = re.sub(r"[^a-z0-9 ]+", "", lowered)
+    return cleaned
+def _first_value(
+    record: Mapping[str, object], *keys: str
+) -> object | None:
+    """Return the first usable value found under ``keys``.
+    Keys are probed in the order given. A key is skipped when it is absent
+    from ``record`` or when its value is ``None`` or ``""``. Other falsy
+    values such as ``0`` are returned. ``None`` is returned when no key
+    qualifies.
+    """
+    for name in keys:
+        if name not in record:
+            continue
+        candidate = record[name]
+        if candidate in (None, ""):
+            continue
+        return candidate
+    return None
+class Transaction(BaseModel):
+    """Normalized transaction model for deterministic downstream logic.
+    The model is frozen, so instances cannot be mutated after creation.
+    ``amount`` is the only required field; every other field defaults to
+    ``None`` (``""`` for ``normalized_description``).
+    """
+    model_config = ConfigDict(frozen=True)
+    account_id: Optional[str] = None
+    currency: Optional[str] = None
+    amount: Decimal
+    booking_date: Optional[date] = None
+    value_date: Optional[date] = None
+    description: Optional[str] = None
+    normalized_description: str = Field(default="")
+    reference: Optional[str] = None
+    transaction_id: Optional[str] = None
+    counterparty: Optional[str] = None
+    source: Optional[str] = None
+    source_index: Optional[int] = None
+    @classmethod
+    def from_record(
+        cls,
+        record: Mapping[str, object],
+        *,
+        source: str | None = None,
+        source_index: int | None = None,
+    ) -> Transaction:
+        """Create a normalized transaction from parser output.
+        Each field is read from the first alias key in ``record`` that
+        holds a value other than ``None`` or ``""``. Text fields are
+        stringified, the currency is upper-cased, and both dates fall back
+        to the generic ``date`` key.
+        Args:
+            record: One parser output row keyed by column name.
+            source: Optional label stored unchanged on the transaction.
+            source_index: Optional position stored unchanged on the
+                transaction.
+        Returns:
+            A new frozen ``Transaction``.
+        Raises:
+            Errors from ``_coerce_decimal`` (missing or invalid amount) and
+            ``_parse_date`` (unsupported date text) propagate to the caller.
+        """
+        def as_text(value: object | None) -> str | None:
+            # Keep a missing value as None; stringify everything else.
+            return None if value is None else str(value)
+        description = _first_value(
+            record,
+            "description",
+            "Description",
+            "RmtInf",
+            "Reference",
+            "reference",
+            "Memo",
+            "memo",
+            "Name",
+            "CdtrNm",
+        )
+        reference = _first_value(
+            record,
+            "Reference",
+            "reference",
+            "RmtInf",
+            "transaction_id",
+            "transactionId",
+            "EndToEndId",
+            "FITID",
+        )
+        counterparty = _first_value(
+            record,
+            "Creditor",
+            "Debtor",
+            "CdtrNm",
+            "Name",
+            "payee",
+            "counterparty",
+        )
+        amount = _coerce_decimal(
+            _first_value(record, "Amount", "amount", "InstdAmt")
+        )
+        account_id = _first_value(
+            record,
+            "AccountId",
+            "account_id",
+            "DbtrIBAN",
+            "CreditorAccount",
+        )
+        currency = _first_value(record, "Currency", "currency")
+        # Resolve the identifier once instead of repeating the same alias
+        # lookup for the None test and for the value.
+        # NOTE(review): the reference aliases above use "transactionId"
+        # while this list uses "TransactionId" - confirm which spelling the
+        # parsers actually emit.
+        transaction_id = _first_value(
+            record,
+            "transaction_id",
+            "TransactionId",
+            "FITID",
+            "EndToEndId",
+        )
+        description_text = as_text(description)
+        return cls(
+            account_id=as_text(account_id),
+            currency=str(currency).upper()
+            if currency is not None
+            else None,
+            amount=amount,
+            booking_date=_parse_date(
+                _first_value(record, "BookgDt", "booking_date", "date")
+            ),
+            value_date=_parse_date(
+                _first_value(record, "ValDt", "value_date", "date")
+            ),
+            description=description_text,
+            normalized_description=normalize_description(
+                description_text
+            ),
+            reference=as_text(reference),
+            transaction_id=as_text(transaction_id),
+            counterparty=as_text(counterparty),
+            source=source,
+            source_index=source_index,
+        )
+    def amount_key(self) -> str:
+        """Return a stable amount key for hashing and comparisons.
+        The amount is normalized (trailing zeros dropped) and rendered in
+        fixed-point notation, so ``1.50`` and ``1.5`` share one key.
+        """
+        # A negative zero normalizes to "-0"; fold every zero into "0" so
+        # amounts that compare equal always produce the same key.
+        if self.amount.is_zero():
+            return "0"
+        return format(self.amount.normalize(), "f")

bankstatementparser/zip_security.py ADDED Viewed

@@ -0,0 +1,141 @@
+"""
+Secure helpers for reading XML bank statement files from ZIP archives.
+"""
+from __future__ import annotations
+from collections.abc import Generator
+from dataclasses import dataclass
+from pathlib import Path
+from zipfile import BadZipFile, ZipFile, ZipInfo
+from .input_validator import InputValidator, ValidationError
+@dataclass(frozen=True)
+class ZipXMLSource:
+    """Validated XML payload extracted from a ZIP archive.
+    Instances are frozen. ``iter_secure_xml_entries`` builds one per accepted
+    ``.xml`` member of the archive.
+    """
+    # Entry name as returned by the validator for the ZIP member's filename.
+    source_name: str
+    # XML content of the entry as returned by the validator.
+    xml_bytes: bytes
+class ZipSecurityError(ValidationError):
+    """Raised when a ZIP archive or ZIP member violates security policy.
+    Subclasses ``ValidationError``, so handlers written for validation
+    failures also catch ZIP policy violations.
+    """
+def iter_secure_xml_entries(
+    zip_path: str | Path,
+    *,
+    max_entry_size: int = 10 * 1024 * 1024,
+    max_total_uncompressed_size: int = 50 * 1024 * 1024,
+    max_compression_ratio: float = 100.0,
+) -> Generator[ZipXMLSource, None, None]:
+    """
+    Yield validated XML members from a ZIP archive.
+    Directories and members whose name does not end in ``.xml``
+    (case-insensitive) are skipped. Every remaining member is checked by
+    ``_validate_zip_member``, counted against the archive-wide size budget,
+    read, and passed through ``InputValidator.validate_xml_content`` before
+    it is yielded. This is a generator, so nothing (including the limit
+    checks) runs until iteration starts.
+    Args:
+        zip_path: Location of the ZIP archive.
+        max_entry_size: Largest declared uncompressed size of one member.
+        max_total_uncompressed_size: Largest summed declared size of all
+            accepted XML members.
+        max_compression_ratio: Largest allowed uncompressed/compressed
+            size ratio of one member.
+    Raises:
+        ZipSecurityError: If a limit is not positive, the archive is
+            invalid or has no entries, or a member breaks a limit or fails
+            content validation.
+    """
+    limit_checks = (
+        (max_entry_size, "max_entry_size must be greater than zero"),
+        (
+            max_total_uncompressed_size,
+            "max_total_uncompressed_size must be greater than zero",
+        ),
+        (
+            max_compression_ratio,
+            "max_compression_ratio must be greater than zero",
+        ),
+    )
+    for limit, complaint in limit_checks:
+        if limit <= 0:
+            raise ZipSecurityError(complaint)
+    archive_path = Path(zip_path)
+    running_total = 0
+    validator = InputValidator(max_file_size=max_entry_size)
+    try:
+        with ZipFile(archive_path) as archive:
+            entries = archive.infolist()
+            if not entries:
+                raise ZipSecurityError(
+                    "ZIP archive does not contain any entries"
+                )
+            for entry in entries:
+                is_xml = entry.filename.lower().endswith(".xml")
+                if entry.is_dir() or not is_xml:
+                    continue
+                # Header-level checks run before any bytes are inflated.
+                _validate_zip_member(
+                    entry,
+                    max_entry_size=max_entry_size,
+                    max_compression_ratio=max_compression_ratio,
+                )
+                running_total += entry.file_size
+                if running_total > max_total_uncompressed_size:
+                    raise ZipSecurityError(
+                        "ZIP archive exceeds the total allowed uncompressed XML size"
+                    )
+                raw_bytes = archive.read(entry)
+                try:
+                    checked = validator.validate_xml_content(
+                        raw_bytes, source_name=entry.filename
+                    )
+                except ValidationError as exc:
+                    # Surface content failures under the ZIP-specific type.
+                    raise ZipSecurityError(str(exc)) from exc
+                payload, safe_name = checked
+                yield ZipXMLSource(
+                    source_name=safe_name,
+                    xml_bytes=payload,
+                )
+    except BadZipFile as exc:
+        raise ZipSecurityError(
+            f"Invalid ZIP archive: {archive_path}"
+        ) from exc
+def _validate_zip_member(
+    member: ZipInfo,
+    *,
+    max_entry_size: int,
+    max_compression_ratio: float,
+) -> None:
+    """Check a ZIP member's header fields before it is read into memory.
+    Checks run in this order and the first failure wins: encryption flag,
+    non-positive uncompressed size, uncompressed size above
+    ``max_entry_size``, non-positive compressed size, and
+    uncompressed/compressed ratio above ``max_compression_ratio``.
+    Raises:
+        ZipSecurityError: If any check fails; the message carries the
+            sanitized member name.
+    """
+    display_name = InputValidator().sanitize_source_name(member.filename)
+    # Bit 0 of the general-purpose flags marks an encrypted entry.
+    if member.flag_bits & 0x1:
+        raise ZipSecurityError(
+            f"Encrypted ZIP entries are not supported: {display_name}"
+        )
+    declared_size = member.file_size
+    if declared_size <= 0:
+        raise ZipSecurityError(
+            f"ZIP entry is empty or invalid: {display_name}"
+        )
+    if declared_size > max_entry_size:
+        raise ZipSecurityError(
+            f"ZIP entry exceeds the allowed uncompressed size limit: {display_name}"
+        )
+    packed_size = member.compress_size
+    # Also guards the division below against a zero denominator.
+    if packed_size <= 0:
+        raise ZipSecurityError(
+            f"ZIP entry has an invalid compressed size: {display_name}"
+        )
+    if declared_size / packed_size > max_compression_ratio:
+        raise ZipSecurityError(
+            f"ZIP entry compression ratio exceeds the allowed limit: {display_name}"
+        )

bankstatementparser-0.0.4.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,363 @@
+Metadata-Version: 2.4
+Name: bankstatementparser
+Version: 0.0.4
+Summary: BankStatementParser is your essential tool for easy bank statement management. Designed with finance and treasury experts in mind, it offers a simple way to handle CAMT (ISO 20022) formats and more. Get quick, accurate insights from your financial data and spend less time on processing. It's the smart, hassle-free way to stay on top of your transactions.
+License: Apache Software License
+License-File: LICENSE
+Author: Sebastien Rousseau
+Author-email: sebastian.rousseau@gmail.com
+Requires-Python: >=3.9
+Classifier: License :: Other/Proprietary License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Provides-Extra: polars
+Requires-Dist: defusedxml (>=0.7.1,<0.8.0)
+Requires-Dist: lxml (>=4.9.3)
+Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
+Requires-Dist: pandas (>=2.3.3,<3.0.0)
+Requires-Dist: polars (>=1.32.0,<2.0.0) ; extra == "polars"
+Requires-Dist: pydantic (>=2.11.0,<3.0.0)
+Project-URL: Homepage, https://bankstatementparser.com
+Project-URL: Repository, https://github.com/sebastienrousseau/bankstatementparser
+Description-Content-Type: text/markdown
+# Bank Statement Parser
+Parse bank statements across six formats — CAMT, PAIN.001, CSV, OFX, QFX, and MT940 — into structured DataFrames. Process ZIP archives safely. Redact PII by default. Stream files of any size.
+Built for finance teams, treasury analysts, and fintech developers who need reliable, auditable extraction from ISO 20022 and legacy banking formats without sending data to external services.
+[![PyPI](https://img.shields.io/pypi/pyversions/bankstatementparser.svg?style=for-the-badge)](https://pypi.org/project/bankstatementparser/)
+[![PyPI Downloads](https://img.shields.io/pypi/dm/bankstatementparser.svg?style=for-the-badge)](https://pypi.org/project/bankstatementparser/)
+[![Codecov](https://img.shields.io/codecov/c/github/sebastienrousseau/bankstatementparser?style=for-the-badge)](https://codecov.io/github/sebastienrousseau/bankstatementparser?branch=main)
+[![License](https://img.shields.io/github/license/sebastienrousseau/bankstatementparser?style=for-the-badge)](LICENSE)
+## Key Features
+| Feature | Description |
+|---|---|
+| **6 formats** | CAMT.053, PAIN.001, CSV, OFX, QFX, MT940 |
+| **Auto-detection** | `detect_statement_format()` identifies the format; `create_parser()` returns the right parser |
+| **Deduplication** | `Deduplicator` detects exact duplicates and suspected matches across sources with explainable confidence scores |
+| **PII redaction** | Names, IBANs, and addresses masked by default — opt in with `--show-pii` |
+| **Streaming** | `parse_streaming()` at 27,000+ tx/s (CAMT) and 52,000+ tx/s (PAIN.001) with bounded memory |
+| **Parallel** | `parse_files_parallel()` for multi-file batch processing across CPU cores |
+| **Secure ZIP** | `iter_secure_xml_entries()` rejects zip bombs, encrypted entries, and suspicious compression ratios |
+| **In-memory parsing** | `from_string()` and `from_bytes()` parse XML without touching disk |
+| **Export** | CSV, JSON, Excel (`.xlsx`), and optional Polars DataFrames |
+| **100% coverage** | 467 tests, 100% branch coverage, property-based fuzzing with Hypothesis |
+## Requirements
+- Python **3.9** through **3.14**
+- Poetry (for local development)
+## Install
+```bash
+pip install bankstatementparser
+```
+### Local Development
+Clone and install on **macOS, Linux, or WSL**:
+```bash
+git clone https://github.com/sebastienrousseau/bankstatementparser.git
+cd bankstatementparser
+python3 -m venv .venv
+source .venv/bin/activate
+pip install poetry
+poetry install --with dev
+```
+## Quick Start
+### Parse a CAMT statement
+```python
+from bankstatementparser import CamtParser
+parser = CamtParser("statement.xml")
+transactions = parser.parse()
+print(transactions)
+```
+```text
+   Amount Currency DrCr  Debtor Creditor      ValDt      AccountId
+ 105678.5      SEK CRDT MUELLER          2010-10-18 50000000054910
+-200000.0      SEK DBIT                  2010-10-18 50000000054910
+  30000.0      SEK CRDT                  2010-10-18 50000000054910
+```
+### Parse a PAIN.001 payment file
+```python
+from bankstatementparser import Pain001Parser
+parser = Pain001Parser("payment.xml")
+payments = parser.parse()
+print(payments)
+```
+```text
+  PmtInfId PmtMtd  InstdAmt Currency  CdtrNm         EndToEndId
+  PMT-001  TRF     1500.00  EUR       ACME Corp      E2E-001
+  PMT-001  TRF     2300.50  EUR       Global Ltd     E2E-002
+```
+### Auto-detect the format
+```python
+from bankstatementparser import create_parser, detect_statement_format
+fmt = detect_statement_format("transactions.ofx")
+parser = create_parser("transactions.ofx", fmt)
+records = parser.parse()
+```
+Works with `.xml`, `.csv`, `.ofx`, `.qfx`, and `.mt940` files.
+### Parse from memory (no disk I/O)
+```python
+from bankstatementparser import CamtParser
+xml_bytes = download_from_sftp()  # your own function
+parser = CamtParser.from_bytes(xml_bytes, source_name="daily.xml")
+transactions = parser.parse()
+```
+Pass only decompressed XML to `from_string()` or `from_bytes()`. For ZIP archives, use `iter_secure_xml_entries()`.
+### Parse XML files inside a ZIP archive
+```python
+from bankstatementparser import CamtParser, iter_secure_xml_entries
+for entry in iter_secure_xml_entries("statements.zip"):
+    parser = CamtParser.from_bytes(entry.xml_bytes, source_name=entry.source_name)
+    transactions = parser.parse()
+    print(entry.source_name, len(transactions), "transactions")
+```
+The iterator enforces size limits, blocks encrypted entries, and rejects suspicious compression ratios before any XML parsing occurs.
+## PII Redaction
+PII (names, IBANs, addresses) is **redacted by default** in console output and streaming mode.
+```python
+# Redacted by default
+for tx in parser.parse_streaming(redact_pii=True):
+    print(tx)  # Names and addresses show as ***REDACTED***
+# Opt in to see full data
+for tx in parser.parse_streaming(redact_pii=False):
+    print(tx)
+```
+File exports (CSV, JSON, Excel) always contain the full unredacted data.
+## Streaming
+Process large files incrementally. Memory stays bounded regardless of file size — tested at 50,000 transactions with sub-2x memory scaling.
+```python
+from bankstatementparser import CamtParser
+parser = CamtParser("large_statement.xml")
+for transaction in parser.parse_streaming():
+    process(transaction)  # each transaction is a dict
+```
+Works with both `CamtParser` and `Pain001Parser`. PAIN.001 files over 50 MB use chunk-based namespace stripping via a temporary file — the full document is never loaded into memory.
+## Performance
+| Metric | CAMT | PAIN.001 |
+|---|---|---|
+| **Throughput** | 27,000+ tx/s | 52,000+ tx/s |
+| **Per-transaction latency** | 37 µs | 19 µs |
+| **Time to first result** | < 1 ms | < 2 ms |
+| **Memory scaling** | Constant (1K–50K) | Constant (1K–50K) |
+Performance is flat from 1,000 to 50,000 transactions. CI enforces minimum TPS and latency thresholds.
+## Parallel Parsing
+Process multiple files simultaneously across CPU cores:
+```python
+from bankstatementparser import parse_files_parallel
+results = parse_files_parallel([
+    "statements/jan.xml",
+    "statements/feb.xml",
+    "statements/mar.xml",
+])
+for r in results:
+    print(r.path, r.status, len(r.transactions), "rows")
+```
+Uses `ProcessPoolExecutor` to bypass the GIL. Each file is parsed in its own worker process. Auto-detects format per file, or force with `format_name="camt"`.
+## Command Line
+```bash
+# Parse and display
+python -m bankstatementparser.cli --type camt --input statement.xml
+# Export to CSV
+python -m bankstatementparser.cli --type camt --input statement.xml --output transactions.csv
+# Stream with PII visible
+python -m bankstatementparser.cli --type camt --input statement.xml --streaming --show-pii
+```
+Supports `--type camt` and `--type pain001`.
+## Deduplication
+Detect duplicate transactions across multiple sources:
+```python
+from bankstatementparser import CamtParser, Deduplicator
+parser = CamtParser("statement.xml")
+dedup = Deduplicator()
+result = dedup.deduplicate(dedup.from_dataframe(parser.parse()))
+print(f"Unique: {len(result.unique_transactions)}")
+print(f"Exact duplicates: {len(result.exact_duplicates)}")
+print(f"Suspected matches: {len(result.suspected_matches)}")
+```
+The `Deduplicator` uses deterministic hashing for exact matches and configurable similarity thresholds for suspected matches. Each match group includes a confidence score and reason for auditability.
+## Export
+```python
+parser = CamtParser("statement.xml")
+parser.parse()
+# CSV
+parser.export_csv("output.csv")
+# JSON (includes summary + transactions)
+parser.export_json("output.json")
+# Excel
+parser.camt_to_excel("output.xlsx")
+```
+### Polars (optional)
+Convert any parser output to a Polars DataFrame:
+```python
+polars_df = parser.to_polars()
+lazy_df = parser.to_polars_lazy()
+```
+Install with `pip install bankstatementparser[polars]`.
+## Examples
+See [`examples/`](examples/README.md) for 14 runnable scripts:
+| Example | What it demonstrates |
+|---|---|
+| `parse_camt_basic.py` | Load a CAMT.053 file and print transactions |
+| `parse_camt_from_string.py` | Parse CAMT from an in-memory XML string |
+| `inspect_camt.py` | Extract balances, stats, and summaries |
+| `export_camt.py` | Export to CSV and JSON |
+| `export_camt_excel.py` | Export to Excel workbook |
+| `stream_camt.py` | Stream transactions incrementally |
+| `parse_camt_zip.py` | Secure ZIP archive processing |
+| `parse_detected_formats.py` | Auto-detect CSV, OFX, MT940, and XML formats |
+| `parse_pain001_basic.py` | Parse a PAIN.001 payment file |
+| `export_pain001.py` | Export PAIN.001 to CSV and JSON |
+| `stream_pain001.py` | Stream payments incrementally |
+| `validate_input.py` | Validate file paths with InputValidator |
+| `compatibility_wrappers.py` | Legacy API wrappers |
+| `cli_examples.sh` | CLI commands for CAMT and PAIN.001 |
+## XML Tag Mapping
+See [`docs/MAPPING.md`](docs/MAPPING.md) for a complete reference of ISO 20022 XML tags to DataFrame columns across all six formats. Use this when integrating with ERP systems or building reconciliation pipelines.
+## Project Layout
+```text
+bankstatementparser/   Source code (13 modules, 100% branch coverage)
+docs/compliance/       ISO 13485 validation, risk register, traceability
+examples/              14 runnable example scripts
+scripts/               SBOM generation, checksums, signature verification
+tests/                 467 tests (unit, integration, property-based, security)
+```
+## Security
+Bank statement files contain sensitive financial and personal data. This library is designed with security as a primary constraint:
+- **XXE protection** — `resolve_entities=False`, `no_network=True`, `load_dtd=False`
+- **ZIP bomb protection** — compression ratio limits, entry size caps, encrypted entry rejection
+- **Path traversal prevention** — dangerous pattern blocklist, symlink resolution
+- **PII redaction** — default masking of names, IBANs, and addresses
+- **Signed commits** — enforced in CI via GitHub API verification
+- **Supply chain** — SHA-256 hash-locked dependencies, CycloneDX SBOM, build provenance attestation
+For vulnerability reports, see [SECURITY.md](.github/SECURITY.md).
+For the full compliance suite, see [`docs/compliance/`](docs/compliance/).
+## Verify the Repository
+Run the full validation suite locally:
+```bash
+ruff check bankstatementparser tests examples scripts
+python -m mypy bankstatementparser
+python -m pytest
+bandit -r bankstatementparser examples scripts -q
+```
+## Contributing
+Signed commits required. See [CONTRIBUTING.md](CONTRIBUTING.md).
+## License
+Apache License 2.0. See [LICENSE](LICENSE).
+## FAQ
+**What formats are supported?**
+CAMT.053, PAIN.001, CSV, OFX, QFX, and MT940.
+**Does any data leave my infrastructure?**
+No. Zero network calls. XML parsers enforce `no_network=True`. No cloud, no telemetry.
+**Is PII redacted automatically?**
+Yes. Names, IBANs, and addresses are masked by default in console output and streaming. File exports retain full data.
+**Is the extraction deterministic?**
+Yes. Same input produces byte-identical output. Critical for financial auditing.
+**Can it handle large files?**
+Yes. `parse_streaming()` is tested at 50,000 transactions (~25 MB) with bounded memory. Files over 50 MB use chunk-based streaming.
+See [FAQ.md](FAQ.md) for the complete FAQ covering data privacy, technical specs, and treasury workflows.
+---
+THE ARCHITECT ᛫ Sebastien Rousseau ᛫ https://sebastienrousseau.com
+THE ENGINE ᛞ EUXIS ᛫ Enterprise Unified Execution Intelligence System ᛫ https://euxis.co