PyPI - bankstatementparser - Versions diffs - 0.0.4__py3-none-any.whl - Mend

bankstatementparser 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

bankstatementparser/__init__.py +82 -0
bankstatementparser/additional_parsers.py +376 -0
bankstatementparser/bank_statement_parsers.py +370 -0
bankstatementparser/base_parser.py +205 -0
bankstatementparser/camt_parser.py +971 -0
bankstatementparser/cli.py +575 -0
bankstatementparser/exceptions.py +36 -0
bankstatementparser/input_validator.py +628 -0
bankstatementparser/pain001_parser.py +742 -0
bankstatementparser/parallel.py +127 -0
bankstatementparser/record_types.py +94 -0
bankstatementparser/transaction_deduplicator.py +402 -0
bankstatementparser/transaction_models.py +196 -0
bankstatementparser/zip_security.py +141 -0
bankstatementparser-0.0.4.dist-info/METADATA +363 -0
bankstatementparser-0.0.4.dist-info/RECORD +18 -0
bankstatementparser-0.0.4.dist-info/WHEEL +4 -0
bankstatementparser-0.0.4.dist-info/licenses/LICENSE +203 -0

bankstatementparser/bank_statement_parsers.py ADDED Viewed

@@ -0,0 +1,370 @@
+# Copyright (C) 2023 Sebastien Rousseau.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+bank_statement_parsers.py
+This module provides consolidated access to bank statement parsing functionality.
+The actual parser implementations are in standalone modules with compatibility wrappers.
+"""
+import os
+from pathlib import Path
+from typing import Any, Union
+import pandas as pd
+from lxml.etree import _Element
+# Import parsers from standalone modules
+from .camt_parser import CamtParser
+from .input_validator import ValidationError
+from .pain001_parser import Pain001Parser as StandalonePain001Parser
+class FileParserError(Exception):
+    """Error raised when a file cannot be parsed as a bank statement."""
+class Pain001Parser:
+    """
+    Compatibility wrapper for SEPA Pain.001 credit transfer files.
+    File I/O and XML parsing are delegated to the standalone Pain.001 parser;
+    this class walks the parsed tree and exposes the result through the
+    original attribute-based API.
+    Attributes:
+        batches (list): ``PmtInf`` elements found in the file.
+        payments (list): Parsed payment dictionaries from all batches.
+        batches_count (int): The number of payment batches in the file.
+        total_payments_count (int): The total number of payments across all batches.
+    """
+    # XPath unions matching either an IBAN or a generic ("Othr") account id.
+    _DEBTOR_ACCOUNT_XPATH = ".//DbtrAcct/Id/IBAN|.//DbtrAcct/Id/Othr/Id"
+    _CREDITOR_ACCOUNT_XPATH = ".//CdtrAcct/Id/IBAN|.//CdtrAcct/Id/Othr/Id"
+    def __init__(
+        self, file_name: Union[str, Path], redact_pii: bool = False
+    ) -> None:
+        """
+        Initializes the parser and parses payments from the given file.
+        Parameters:
+            file_name (Union[str, Path]): The path to the SEPA Pain.001 XML file.
+            redact_pii (bool): Whether to replace the payment address with a
+                redaction marker.
+        Raises:
+            Whatever the standalone parser raises while loading the file, and
+            the errors documented on ``_parse_payment``.
+        """
+        self._redact_pii = redact_pii
+        # Delegate to the standalone parser for file I/O and XML parsing.
+        self._standalone_parser = StandalonePain001Parser(
+            str(file_name)
+        )
+        tree = self._standalone_parser.tree
+        # Extract payment batches from the already-parsed XML tree.
+        self.batches: list[_Element] = tree.xpath(".//PmtInf")
+        self.batches_count: int = len(self.batches)
+        # Flatten the payments of every batch into a single list.
+        self.payments: list[dict[str, Any]] = []
+        for batch in self.batches:
+            self.payments.extend(self._parse_batch(batch))
+        self.total_payments_count: int = len(self.payments)
+    @staticmethod
+    def _first_text(node: _Element, path: str) -> str:
+        """
+        Returns the text of the first element matched by an XPath query.
+        Parameters:
+            node (_Element): The element the query is evaluated against.
+            path (str): XPath expression selecting candidate elements.
+        Returns:
+            str: The first match's text, or "" when nothing matches or the
+            matched element has no text.
+        """
+        matches = node.xpath(path)
+        if not matches:
+            return ""
+        # An element without content has ``text`` set to None; normalise it
+        # so callers always receive a string.
+        return matches[0].text or ""
+    def _parse_batch_header(self, batch: _Element) -> dict[str, str]:
+        """
+        Parses header data for a payment batch.
+        Parameters:
+            batch (_Element): The XML element representing a payment batch.
+        Returns:
+            Dict[str, str]: ``debtor_name``, ``debtor_account`` and
+            ``execution_date``; a value is "" when the element is missing
+            or empty.
+        """
+        return {
+            "debtor_name": self._first_text(batch, ".//Dbtr/Nm"),
+            "debtor_account": self._first_text(
+                batch, self._DEBTOR_ACCOUNT_XPATH
+            ),
+            "execution_date": self._first_text(batch, ".//ReqdExctnDt"),
+        }
+    def _parse_batch(self, batch: _Element) -> list[dict[str, Any]]:
+        """
+        Parses all payments in a payment batch.
+        Parameters:
+            batch (_Element): The XML element representing a payment batch.
+        Returns:
+            List[Dict[str, Any]]: One dictionary per ``CdtTrfTxInf`` element,
+            each extended with the batch header fields.
+        """
+        # The header is shared by every payment in the batch: parse it once.
+        header: dict[str, str] = self._parse_batch_header(batch)
+        payments: list[dict[str, Any]] = []
+        for payment in batch.xpath(".//CdtTrfTxInf"):
+            payment_dict: dict[str, Any] = self._parse_payment(
+                payment, self._redact_pii
+            )
+            payment_dict.update(header)
+            payments.append(payment_dict)
+        return payments
+    def _parse_payment(
+        self, payment: _Element, redact_pii: bool = False
+    ) -> dict[str, Any]:
+        """
+        Parses a single payment within a payment batch.
+        Parameters:
+            payment (_Element): The XML element representing a single payment.
+            redact_pii (bool): Whether to replace a non-empty address with
+                "***REDACTED***".
+        Returns:
+            Dict[str, Any]: ``Name``, ``Amount`` (float), ``Currency``,
+            ``Reference``, ``CreditorAccount``, ``Country`` and ``Address``.
+            Optional text fields are "" when missing or empty.
+        Raises:
+            IndexError: If the payment has no ``InstdAmt`` element or that
+                element has no ``Ccy`` attribute.
+            TypeError: If ``InstdAmt`` is present but empty.
+            ValueError: If the ``InstdAmt`` text is not a valid number.
+        """
+        # Amount and currency are mandatory here: fail loudly when absent.
+        amount: str = payment.xpath(".//InstdAmt")[0].text
+        currency: str = payment.xpath(".//InstdAmt/@Ccy")[0]
+        name: str = self._first_text(payment, ".//Cdtr/Nm")
+        account: str = self._first_text(
+            payment, self._CREDITOR_ACCOUNT_XPATH
+        )
+        # NOTE(review): ".//Ctry" and ".//AdrLine" match any descendant of the
+        # payment, not only the creditor's postal address - confirm intended.
+        country: str = self._first_text(payment, ".//Ctry")
+        # Skip empty elements (text is None) so that join() cannot fail.
+        references: list[str] = [
+            ref.text
+            for ref in payment.xpath(".//RmtInf/Ustrd")
+            if ref.text is not None
+        ]
+        reference: str = " ".join(references)
+        address_lines: list[str] = [
+            line.text
+            for line in payment.xpath(".//AdrLine")
+            if line.text is not None
+        ]
+        address: str = " ".join(address_lines)
+        # Only a non-empty address is replaced; "" stays "".
+        if redact_pii and address:
+            address = "***REDACTED***"
+        return {
+            "Name": name,
+            "Amount": float(amount),
+            "Currency": currency,
+            "Reference": reference,
+            "CreditorAccount": account,
+            "Country": country,
+            "Address": address,
+        }
+    def __repr__(self) -> str:
+        """
+        Returns a string representation of the Pain001Parser instance.
+        Returns:
+            str: The batch and payment counts of the instance.
+        """
+        return (
+            f"Pain001Parser(batches={self.batches_count}, "
+            f"payments={self.total_payments_count})"
+        )
+class Camt053Parser:
+    """
+    Backwards-compatible facade for CAMT.053 bank account statement files.
+    Parsing is performed by ``CamtParser``; the constructor converts its
+    DataFrame results into the list-of-dictionaries shape of the original API.
+    Attributes:
+        statements (list): A list of dictionaries, each representing a statement.
+        transactions (list): A list of dictionaries, each representing a transaction.
+    """
+    # Balance type codes and their descriptions.
+    DEFINITIONS = {
+        "OPBD": "Opening booked balance",
+        "CLBD": "Closing booked balance",
+        "CLAV": "Closing available balance",
+    }
+    def __init__(
+        self, file_name: Union[str, Path], redact_pii: bool = False
+    ) -> None:
+        """
+        Parses statements and transactions from the given file.
+        Parameters:
+            file_name (Union[str, Path]): The path to the CAMT.053 XML file.
+            redact_pii (bool): Forwarded to every ``CamtParser`` query.
+        Raises:
+            FileNotFoundError: If the underlying parser reports that the file
+                cannot be found.
+            FileParserError: If validation fails or any other error occurs
+                while parsing or converting the data.
+        """
+        try:
+            camt = CamtParser(str(file_name))
+            self._parser = camt
+            # Query the enhanced parser: balances, transactions, statistics.
+            balances = camt.get_account_balances(redact_pii=redact_pii)
+            movements = camt.get_transactions(redact_pii=redact_pii)
+            stats = camt.get_statement_stats(redact_pii=redact_pii)
+            # Convert the DataFrames to the original list-of-dicts format.
+            if stats.empty:
+                self.statements = []
+            else:
+                self.statements = stats.to_dict("records")
+            if movements.empty:
+                self.transactions = []
+            else:
+                self.transactions = movements.to_dict("records")
+            if not balances.empty:
+                # Map account id -> {balance code -> amount/description}.
+                per_account: dict[str, dict[str, dict[str, str]]] = {}
+                for key, rows in balances.groupby("AccountId"):
+                    entries: dict[str, dict[str, str]] = {}
+                    for record in rows.to_dict("records"):
+                        code = str(record["Code"])
+                        # A later row with the same code replaces the earlier one.
+                        entries[code] = {
+                            "Amount": str(record["Amount"]),
+                            "Description": str(record["Description"]),
+                        }
+                    per_account[key] = entries
+                # Merge the balance entries into the matching statements.
+                for statement in self.statements:
+                    extra = per_account.get(statement.get("AccountId"))
+                    if extra is not None:
+                        statement.update(extra)
+        except ValidationError as e:
+            raise FileParserError("Not a valid CAMT.053 file") from e
+        except FileNotFoundError as e:
+            raise FileNotFoundError(
+                f"File {file_name} not found!"
+            ) from e
+        except Exception as e:
+            raise FileParserError("Not a valid CAMT.053 file") from e
+    def __repr__(self) -> str:
+        """
+        Returns a string representation of the Camt053Parser instance.
+        Returns:
+            str: The statement and transaction counts of the instance.
+        """
+        statement_count = len(self.statements)
+        transaction_count = len(self.transactions)
+        return (
+            f"Camt053Parser("
+            f"statements={statement_count}, "
+            f"transactions={transaction_count})"
+        )
+def process_camt053_folder(
+    folder: Union[str, Path], redact_pii: bool = False
+) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """
+    Processes all CAMT.053 files in a specified folder.
+    Every regular file directly inside ``folder`` is parsed with
+    ``Camt053Parser``; other directory entries are skipped. A file that fails
+    is recorded in ``files_df`` and contributes no rows to the other frames.
+    Parameters:
+        folder (Union[str, Path]): The path to the folder containing CAMT.053 files.
+        redact_pii (bool): Forwarded to ``Camt053Parser``.
+    Returns:
+        Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing three pandas DataFrames:
+            - files_df: One row per file with ``FileName`` and ``Status``.
+            - statements_df: Parsed statement data of all successful files.
+            - transactions_df: Parsed transaction data of all successful files.
+    Raises:
+        OSError: If ``folder`` cannot be listed.
+    """
+    file_statuses: list[dict[str, str]] = []
+    statement_frames: list[pd.DataFrame] = []
+    transaction_frames: list[pd.DataFrame] = []
+    # Sorted so that row order does not depend on the arbitrary order in
+    # which os.listdir returns directory entries.
+    for file_name in sorted(os.listdir(folder)):
+        file_path: str = os.path.join(folder, file_name)
+        if not os.path.isfile(file_path):
+            continue
+        try:
+            parser: Camt053Parser = Camt053Parser(
+                file_path, redact_pii=redact_pii
+            )
+            statement_rows: list[dict[str, Any]] = list(
+                parser.statements
+            )
+            transaction_rows: list[dict[str, Any]] = list(
+                parser.transactions
+            )
+            statement_frame = pd.DataFrame(statement_rows)
+            transaction_frame = pd.DataFrame(transaction_rows)
+        except Exception as e:
+            # Best effort: record the failure and carry on with the next file.
+            file_statuses.append(
+                {"FileName": file_name, "Status": f"Failed: {e}"}
+            )
+            continue
+        # Frames are collected only after the whole file succeeded, so a file
+        # reported as failed never contributes partial data.
+        if statement_rows:
+            statement_frames.append(statement_frame)
+        if transaction_rows:
+            transaction_frames.append(transaction_frame)
+        file_statuses.append(
+            {"FileName": file_name, "Status": "Success"}
+        )
+    files_df: pd.DataFrame = pd.DataFrame(file_statuses)
+    # Concatenate once at the end instead of re-copying a growing frame on
+    # every iteration; per-file row indices are kept, as before.
+    statements_df: pd.DataFrame = (
+        pd.concat(statement_frames)
+        if statement_frames
+        else pd.DataFrame()
+    )
+    transactions_df: pd.DataFrame = (
+        pd.concat(transaction_frames)
+        if transaction_frames
+        else pd.DataFrame()
+    )
+    return files_df, statements_df, transactions_df

bankstatementparser/base_parser.py ADDED Viewed

@@ -0,0 +1,205 @@
+# Copyright (C) 2023 Sebastien Rousseau.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+base_parser.py
+Abstract base class for bank statement parsers providing a standardized
+interface for parsing different bank statement formats.
+"""
+import importlib
+import json
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import TYPE_CHECKING, Union
+import pandas as pd
+from .exceptions import ExportError
+from .record_types import SummaryRecord
+if TYPE_CHECKING:
+    import polars as pl
+class BankStatementParser(ABC):
+    """
+    Abstract base class for bank statement parsers.
+    Subclasses implement ``parse`` and ``get_summary``; the export and
+    conversion helpers defined here are built on those two methods.
+    Attributes:
+        file_name (str): Path to the bank statement file being parsed.
+    """
+    def __init__(self, file_name: Union[str, Path]) -> None:
+        """
+        Initialize the parser with a file path.
+        Args:
+            file_name (Union[str, Path]): Path to the bank statement file;
+                stored as a string.
+        """
+        self.file_name = str(file_name)
+    @abstractmethod
+    def parse(self) -> pd.DataFrame:
+        """
+        Parse the bank statement file and return structured data.
+        Returns:
+            pd.DataFrame: Parsed transaction data with standardized columns.
+        Raises:
+            FileNotFoundError: If the file cannot be found.
+            ValidationError: If the file format is invalid.
+            Exception: For other parsing errors.
+        """
+    @abstractmethod
+    def get_summary(self) -> SummaryRecord:
+        """
+        Get a summary of the parsed bank statement data.
+        Returns:
+            SummaryRecord: Summary information, expected to include:
+                - account_id: Account identifier
+                - statement_date: Statement date/period
+                - transaction_count: Number of transactions
+                - total_amount: Sum of all transactions
+                - opening_balance: Opening balance (if available)
+                - closing_balance: Closing balance (if available)
+                - currency: Statement currency
+        """
+    def _export_atomically(
+        self, output_path: Union[str, Path], label: str, write
+    ) -> None:
+        """
+        Write to ``<output_path>.tmp`` and then move it over ``output_path``.
+        Args:
+            output_path (Union[str, Path]): Final destination of the export.
+            label (str): Format name used in the error message.
+            write: Callable receiving the temporary ``Path``; it must produce
+                the complete file content there.
+        Raises:
+            ExportError: If ``write`` or the final replace fails; the original
+                exception is chained as the cause.
+        """
+        temp_path = Path(f"{output_path}.tmp")
+        try:
+            write(temp_path)
+            # Replace only after a complete write, so a failed export never
+            # leaves a truncated file at output_path.
+            temp_path.replace(output_path)
+        except Exception as exc:
+            # Clean up temp file if it exists
+            if temp_path.exists():
+                temp_path.unlink()
+            raise ExportError(
+                f"Failed to export {label}: {exc}"
+            ) from exc
+    def export_csv(self, output_path: Union[str, Path]) -> None:
+        """
+        Export parsed data to a CSV file (without the index column).
+        Args:
+            output_path (Union[str, Path]): Path where CSV file should be saved.
+        Raises:
+            ExportError: If parsing or writing fails.
+        """
+        def write_csv(temp_path: Path) -> None:
+            self.parse().to_csv(temp_path, index=False)
+        self._export_atomically(output_path, "CSV", write_csv)
+    def export_json(self, output_path: Union[str, Path]) -> None:
+        """
+        Export the summary and the parsed transactions to a JSON file.
+        Args:
+            output_path (Union[str, Path]): Path where JSON file should be saved.
+        Raises:
+            ExportError: If parsing, summarising or writing fails.
+        """
+        def write_json(temp_path: Path) -> None:
+            df = self.parse()
+            # Create structured JSON with summary and transactions
+            data = {
+                "summary": self.get_summary(),
+                "transactions": df.to_dict("records"),
+            }
+            with open(temp_path, "w", encoding="utf-8") as f:
+                # default=str stringifies any value json cannot encode itself.
+                json.dump(data, f, indent=2, default=str)
+        self._export_atomically(output_path, "JSON", write_json)
+    def to_polars(self) -> "pl.DataFrame":
+        """
+        Convert parsed transaction data to a Polars DataFrame.
+        Returns:
+            pl.DataFrame: The result of ``parse()`` converted from pandas.
+        Raises:
+            ImportError: If the optional ``polars`` dependency is not installed.
+        """
+        # Imported lazily so that polars stays an optional dependency.
+        try:
+            polars = importlib.import_module("polars")
+        except ImportError as exc:
+            raise ImportError(
+                "Run 'pip install bankstatementparser[polars]' to use this feature."
+            ) from exc
+        return polars.from_pandas(self.parse())
+    def to_polars_lazy(self) -> "pl.LazyFrame":
+        """
+        Convert parsed transaction data to a Polars LazyFrame.
+        Returns:
+            pl.LazyFrame: Lazy view of the frame returned by ``to_polars``.
+        Raises:
+            ImportError: If the optional ``polars`` dependency is not installed.
+        """
+        return self.to_polars().lazy()
+    def __repr__(self) -> str:
+        """
+        Return a string representation of the parser.
+        Returns:
+            str: String representation including parser type and file name.
+        """
+        return f"{self.__class__.__name__}(file='{self.file_name}')"
+    def __str__(self) -> str:
+        """
+        Return a human-readable string representation.
+        Returns:
+            str: Account id and transaction count from the summary, or the
+            ``repr``-style text when the summary cannot be produced.
+        """
+        try:
+            summary = self.get_summary()
+            return (
+                f"{self.__class__.__name__}: "
+                f"Account {summary.get('account_id', 'Unknown')}, "
+                f"{summary.get('transaction_count', 0)} transactions"
+            )
+        except Exception:
+            # str() must not raise: fall back to the file-based description.
+            return f"{self.__class__.__name__}(file='{self.file_name}')"