bankstatementparser 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,370 @@
1
+ # Copyright (C) 2023 Sebastien Rousseau.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
12
+ # implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ bank_statement_parsers.py
18
+
19
+ This module provides consolidated access to bank statement parsing functionality.
20
+ The actual parser implementations are in standalone modules with compatibility wrappers.
21
+ """
22
+
23
+ import os
24
+ from pathlib import Path
25
+ from typing import Any, Union
26
+
27
+ import pandas as pd
28
+ from lxml.etree import _Element
29
+
30
+ # Import parsers from standalone modules
31
+ from .camt_parser import CamtParser
32
+ from .input_validator import ValidationError
33
+ from .pain001_parser import Pain001Parser as StandalonePain001Parser
34
+
35
+
36
class FileParserError(Exception):
    """Raised when a file cannot be parsed as the expected statement format."""
40
+
41
+
42
class Pain001Parser:
    """
    Compatibility wrapper for SEPA Pain.001 credit transfer files.

    This maintains the original API while delegating file loading and XML
    parsing to the enhanced standalone implementation. The wrapper itself
    only walks the already-parsed tree.

    Attributes:
        batches (list): ``PmtInf`` elements found in the parsed tree.
        payments (list): List of parsed payment dictionaries.
        batches_count (int): The number of payment batches in the file.
        total_payments_count (int): The total number of payments across all batches.
    """

    # An account is identified either by its IBAN or by a generic "other" id.
    _DEBTOR_ACCOUNT_XPATH = ".//DbtrAcct/Id/IBAN|.//DbtrAcct/Id/Othr/Id"
    _CREDITOR_ACCOUNT_XPATH = ".//CdtrAcct/Id/IBAN|.//CdtrAcct/Id/Othr/Id"

    def __init__(
        self, file_name: Union[str, Path], redact_pii: bool = False
    ) -> None:
        """
        Initializes the parser and parses payments from the given file.

        Parameters:
            file_name (Union[str, Path]): The path to the SEPA Pain.001 XML file.
            redact_pii (bool): Whether to redact PII data (address fields).

        Raises:
            IndexError: If a payment lacks an ``InstdAmt`` element, its
                ``Ccy`` attribute or a ``Cdtr/Nm`` element.
            Exception: Whatever the standalone parser raises while loading
                the file propagates unchanged (presumably
                ``FileNotFoundError`` for a missing file -- confirm against
                the standalone implementation).
        """
        self._redact_pii = redact_pii

        # Delegate file I/O and XML parsing to the standalone parser.
        self._standalone_parser = StandalonePain001Parser(
            str(file_name)
        )
        tree = self._standalone_parser.tree

        # NOTE(review): the un-prefixed XPath expressions used throughout
        # only match if the standalone parser exposes a namespace-free
        # tree -- confirm.
        self.batches: list[_Element] = tree.xpath(".//PmtInf")
        self.batches_count: int = len(self.batches)

        # Flatten the payments of every batch into a single list.
        self.payments: list[dict[str, Any]] = [
            payment
            for batch in self.batches
            for payment in self._parse_batch(batch)
        ]
        self.total_payments_count: int = len(self.payments)

    @staticmethod
    def _first_text(node: _Element, path: str) -> str:
        """
        Returns the text of the first XPath match, or "" when nothing usable exists.

        Parameters:
            node (_Element): The element the XPath expression is evaluated on.
            path (str): The XPath expression to evaluate.

        Returns:
            str: The first match's text; "" if there is no match or the
                matched element has no text.
        """
        matches = node.xpath(path)
        if not matches:
            return ""
        # lxml reports the text of an empty element as None.
        return matches[0].text or ""

    @staticmethod
    def _join_texts(elements: list[_Element]) -> str:
        """
        Joins the text of the given elements with single spaces.

        Elements without text (``.text`` is None) are skipped so that an
        empty tag cannot make ``str.join`` raise ``TypeError``.

        Parameters:
            elements (list[_Element]): The elements whose text is joined.

        Returns:
            str: The space-separated texts, "" if there are none.
        """
        return " ".join(
            element.text
            for element in elements
            if element.text is not None
        )

    def _parse_batch_header(self, batch: _Element) -> dict[str, str]:
        """
        Parses header data for a payment batch.

        Every field falls back to "" when the corresponding element is
        missing or empty.

        Parameters:
            batch (_Element): The XML element representing a payment batch.

        Returns:
            Dict[str, str]: A dictionary with the keys ``debtor_name``,
                ``debtor_account`` and ``execution_date``.
        """
        return {
            "debtor_name": self._first_text(batch, ".//Dbtr/Nm"),
            "debtor_account": self._first_text(
                batch, self._DEBTOR_ACCOUNT_XPATH
            ),
            "execution_date": self._first_text(batch, ".//ReqdExctnDt"),
        }

    def _parse_batch(self, batch: _Element) -> list[dict[str, Any]]:
        """
        Parses all payments in a payment batch.

        Parameters:
            batch (_Element): The XML element representing a payment batch.

        Returns:
            List[Dict[str, Any]]: One dictionary per ``CdtTrfTxInf``
                element, each merged with the batch header fields.
        """
        header: dict[str, str] = self._parse_batch_header(batch)

        payments: list[dict[str, Any]] = []
        for payment in batch.xpath(".//CdtTrfTxInf"):
            payment_dict: dict[str, Any] = self._parse_payment(
                payment, self._redact_pii
            )
            # The header is applied last, so its keys win on a collision.
            payment_dict.update(header)
            payments.append(payment_dict)

        return payments

    def _parse_payment(
        self, payment: _Element, redact_pii: bool = False
    ) -> dict[str, Any]:
        """
        Parses a single payment within a payment batch.

        Parameters:
            payment (_Element): The XML element representing a single payment.
            redact_pii (bool): Whether to redact PII data (address fields).

        Returns:
            Dict[str, Any]: A dictionary with the keys ``Name``, ``Amount``,
                ``Currency``, ``Reference``, ``CreditorAccount``, ``Country``
                and ``Address``.

        Raises:
            IndexError: If ``InstdAmt``, its ``Ccy`` attribute or ``Cdtr/Nm``
                is missing.
            ValueError: If the amount text is not a valid float.
            TypeError: If the ``InstdAmt`` element has no text.
        """
        # Mandatory fields: indexing the match list fails loudly if absent.
        amount: str = payment.xpath(".//InstdAmt")[0].text
        currency: str = payment.xpath(".//InstdAmt/@Ccy")[0]
        name: str = payment.xpath(".//Cdtr/Nm")[0].text

        # Optional fields: "" when absent or empty.
        account: str = self._first_text(
            payment, self._CREDITOR_ACCOUNT_XPATH
        )
        # Takes the first Ctry found anywhere beneath the payment element.
        country: str = self._first_text(payment, ".//Ctry")
        reference: str = self._join_texts(
            payment.xpath(".//RmtInf/Ustrd")
        )
        address: str = self._join_texts(payment.xpath(".//AdrLine"))

        # Only a non-empty address is replaced by the redaction marker.
        if redact_pii and address:
            address = "***REDACTED***"

        return {
            "Name": name,
            "Amount": float(amount),
            "Currency": currency,
            "Reference": reference,
            "CreditorAccount": account,
            "Country": country,
            "Address": address,
        }

    def __repr__(self) -> str:
        """
        Returns a string representation of the Pain001Parser instance.

        Returns:
            str: The batch and payment counts of the instance.
        """
        return (
            f"Pain001Parser(batches={self.batches_count}, "
            f"payments={self.total_payments_count})"
        )
207
+
208
+
209
class Camt053Parser:
    """
    Compatibility wrapper for CAMT.053 bank account statement files.

    The enhanced standalone ``CamtParser`` does the parsing; this class
    converts its DataFrame output into the list-of-dictionaries attributes
    of the original API.

    Attributes:
        statements (list): One dictionary per statement row, extended with
            balance entries keyed by balance code when balances exist.
        transactions (list): One dictionary per transaction row.
    """

    # Human-readable names of the supported balance type codes.
    DEFINITIONS = {
        "OPBD": "Opening booked balance",
        "CLBD": "Closing booked balance",
        "CLAV": "Closing available balance",
    }

    def __init__(
        self, file_name: Union[str, Path], redact_pii: bool = False
    ) -> None:
        """
        Parses the given file and populates ``statements`` and ``transactions``.

        Parameters:
            file_name (Union[str, Path]): The path to the CAMT.053 XML file.
            redact_pii (bool): Forwarded to the standalone parser's getters.

        Raises:
            FileNotFoundError: If the standalone parser reports a missing file.
            FileParserError: For a ``ValidationError`` or any other failure
                while parsing or converting the data.
        """
        try:
            self._parser = CamtParser(str(file_name))

            # Fetch the three views in a fixed order: balances,
            # transactions, statement statistics.
            balance_frame = self._parser.get_account_balances(
                redact_pii=redact_pii
            )
            transaction_frame = self._parser.get_transactions(
                redact_pii=redact_pii
            )
            statement_frame = self._parser.get_statement_stats(
                redact_pii=redact_pii
            )

            # Empty frames map to plain empty lists.
            if statement_frame.empty:
                self.statements = []
            else:
                self.statements = statement_frame.to_dict("records")

            if transaction_frame.empty:
                self.transactions = []
            else:
                self.transactions = transaction_frame.to_dict("records")

            if not balance_frame.empty:
                # account id -> balance code -> {"Amount", "Description"}
                per_account: dict[str, dict[str, dict[str, str]]] = {}
                for account_id, rows in balance_frame.groupby("AccountId"):
                    entries: dict[str, dict[str, str]] = {}
                    for row in rows.to_dict("records"):
                        entries[str(row["Code"])] = {
                            "Amount": str(row["Amount"]),
                            "Description": str(row["Description"]),
                        }
                    per_account[account_id] = entries

                # Merge each account's balances into its statement record.
                for statement in self.statements:
                    key = statement.get("AccountId")
                    if key not in per_account:
                        continue
                    statement.update(per_account[key])

        except ValidationError as e:
            raise FileParserError("Not a valid CAMT.053 file") from e
        except FileNotFoundError as e:
            raise FileNotFoundError(
                f"File {file_name} not found!"
            ) from e
        except Exception as e:
            raise FileParserError("Not a valid CAMT.053 file") from e

    def __repr__(self) -> str:
        """
        Returns a string representation of the Camt053Parser instance.

        Returns:
            str: The statement and transaction counts of the instance.
        """
        statement_count = len(self.statements)
        transaction_count = len(self.transactions)
        return (
            f"Camt053Parser(statements={statement_count}, "
            f"transactions={transaction_count})"
        )
312
+
313
+
314
def process_camt053_folder(
    folder: Union[str, Path], redact_pii: bool = False
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Processes all CAMT.053 files in a specified folder.

    Only regular files directly inside ``folder`` are considered, and they
    are visited in sorted name order so the output is deterministic. A file
    that fails to parse is recorded in ``files_df`` and contributes no rows
    to the other two frames.

    Parameters:
        folder (Union[str, Path]): The path to the folder containing CAMT.053 files.
        redact_pii (bool): Whether to redact PII data (address fields).

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing three pandas DataFrames:
            - files_df: One row per visited file with ``FileName`` and
              ``Status`` ("Success" or "Failed: <error>").
            - statements_df: The parsed statement data of all successful
              files; row indices are kept per file and not reset.
            - transactions_df: The parsed transaction data of all
              successful files; row indices are kept per file and not reset.

    Raises:
        OSError: If ``folder`` cannot be listed.
    """
    file_statuses: list[dict[str, str]] = []
    statement_frames: list[pd.DataFrame] = []
    transaction_frames: list[pd.DataFrame] = []

    for file_name in sorted(os.listdir(folder)):
        file_path: str = os.path.join(folder, file_name)
        if not os.path.isfile(file_path):
            continue

        try:
            parser: Camt053Parser = Camt053Parser(
                file_path, redact_pii=redact_pii
            )
            # Build both frames before committing either, so a file that
            # fails halfway never contributes partial data.
            statement_frame = pd.DataFrame(list(parser.statements))
            transaction_frame = pd.DataFrame(list(parser.transactions))
        except Exception as e:
            # Best effort: one bad file must not abort the whole folder.
            file_statuses.append(
                {"FileName": file_name, "Status": f"Failed: {e}"}
            )
            continue

        # Empty frames are skipped; concatenating them adds no rows and
        # triggers a pandas deprecation warning.
        if not statement_frame.empty:
            statement_frames.append(statement_frame)
        if not transaction_frame.empty:
            transaction_frames.append(transaction_frame)
        file_statuses.append(
            {"FileName": file_name, "Status": "Success"}
        )

    # Concatenate once instead of re-copying the accumulated frame per file.
    statements_df: pd.DataFrame = (
        pd.concat(statement_frames) if statement_frames else pd.DataFrame()
    )
    transactions_df: pd.DataFrame = (
        pd.concat(transaction_frames)
        if transaction_frames
        else pd.DataFrame()
    )
    files_df: pd.DataFrame = pd.DataFrame(file_statuses)
    return files_df, statements_df, transactions_df
@@ -0,0 +1,205 @@
1
+ # Copyright (C) 2023 Sebastien Rousseau.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
12
+ # implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ base_parser.py
18
+
19
+ Abstract base class for bank statement parsers providing a standardized
20
+ interface for parsing different bank statement formats.
21
+ """
22
+
23
+ import importlib
24
+ import json
25
+ from abc import ABC, abstractmethod
26
+ from pathlib import Path
27
+ from typing import TYPE_CHECKING, Union
28
+
29
+ import pandas as pd
30
+
31
+ from .exceptions import ExportError
32
+ from .record_types import SummaryRecord
33
+
34
+ if TYPE_CHECKING:
35
+ import polars as pl
36
+
37
+
38
class BankStatementParser(ABC):
    """
    Common interface shared by the bank statement parsers.

    Concrete parsers implement :meth:`parse` and :meth:`get_summary`; the
    export and conversion helpers defined here are built on those two
    methods.

    Attributes:
        file_name (str): Path to the bank statement file, stored as a string.
    """

    def __init__(self, file_name: Union[str, Path]) -> None:
        """
        Store the path of the statement file.

        Args:
            file_name (Union[str, Path]): Path to the bank statement file;
                converted to ``str`` before being stored.
        """
        self.file_name = str(file_name)

    @abstractmethod
    def parse(self) -> pd.DataFrame:
        """
        Parse the bank statement file and return structured data.

        Implementations return the parsed transaction data as a pandas
        DataFrame in a standardized format.

        Returns:
            pd.DataFrame: Parsed transaction data with standardized columns.

        Raises:
            FileNotFoundError: If the file cannot be found.
            ValidationError: If the file format is invalid.
            Exception: For other parsing errors.
        """

    @abstractmethod
    def get_summary(self) -> SummaryRecord:
        """
        Get a summary of the parsed bank statement data.

        Implementations return key statistics and metadata about the
        statement.

        Returns:
            SummaryRecord: Summary information, expected to include:
                - account_id: Account identifier
                - statement_date: Statement date/period
                - transaction_count: Number of transactions
                - total_amount: Sum of all transactions
                - opening_balance: Opening balance (if available)
                - closing_balance: Closing balance (if available)
                - currency: Statement currency
        """

    def export_csv(self, output_path: Union[str, Path]) -> None:
        """
        Export parsed data to a CSV file.

        The data is written to ``<output_path>.tmp`` first and then renamed
        over ``output_path``, so a failed export leaves no partial file at
        the destination.

        Args:
            output_path (Union[str, Path]): Path where CSV file should be saved.

        Raises:
            ExportError: If parsing, writing or renaming fails.
        """
        staging_file = Path(f"{output_path}.tmp")
        try:
            frame = self.parse()
            frame.to_csv(staging_file, index=False)
            # Move the finished file into place in one step.
            staging_file.replace(output_path)
        except Exception as exc:
            # Do not leave a half-written staging file behind.
            if staging_file.exists():
                staging_file.unlink()
            raise ExportError(f"Failed to export CSV: {exc}") from exc

    def export_json(self, output_path: Union[str, Path]) -> None:
        """
        Export parsed data to a JSON file.

        The document has two top-level keys: ``summary`` (from
        :meth:`get_summary`) and ``transactions`` (one record per row of
        :meth:`parse`). Values the JSON encoder cannot handle are written
        via ``str``. The file is staged at ``<output_path>.tmp`` and then
        renamed over ``output_path``.

        Args:
            output_path (Union[str, Path]): Path where JSON file should be saved.

        Raises:
            ExportError: If parsing, serializing, writing or renaming fails.
        """
        staging_file = Path(f"{output_path}.tmp")
        try:
            frame = self.parse()
            payload = {
                "summary": self.get_summary(),
                "transactions": frame.to_dict("records"),
            }

            with staging_file.open("w", encoding="utf-8") as handle:
                json.dump(payload, handle, indent=2, default=str)

            # Move the finished file into place in one step.
            staging_file.replace(output_path)
        except Exception as exc:
            # Do not leave a half-written staging file behind.
            if staging_file.exists():
                staging_file.unlink()
            raise ExportError(f"Failed to export JSON: {exc}") from exc

    def to_polars(self) -> "pl.DataFrame":
        """
        Convert parsed transaction data to a Polars DataFrame.

        Returns:
            polars.DataFrame: The result of :meth:`parse`, converted from pandas.

        Raises:
            ImportError: If the optional ``polars`` dependency is not installed.
        """
        # Imported on demand so that polars stays an optional dependency.
        try:
            polars_module = importlib.import_module("polars")
        except ImportError as exc:
            raise ImportError(
                "Run 'pip install bankstatementparser[polars]' to use this feature."
            ) from exc

        frame = self.parse()
        return polars_module.from_pandas(frame)

    def to_polars_lazy(self) -> "pl.LazyFrame":
        """
        Convert parsed transaction data to a Polars LazyFrame.

        Returns:
            polars.LazyFrame: The lazy view of :meth:`to_polars`.

        Raises:
            ImportError: If the optional ``polars`` dependency is not installed.
        """
        eager_frame = self.to_polars()
        return eager_frame.lazy()

    def __repr__(self) -> str:
        """
        Return a string representation of the parser.

        Returns:
            str: The concrete class name and the file name.
        """
        class_name = self.__class__.__name__
        return f"{class_name}(file='{self.file_name}')"

    def __str__(self) -> str:
        """
        Return a human-readable string representation.

        Returns:
            str: The class name with account id and transaction count from
                the summary, or the class name and file name if the summary
                cannot be produced.
        """
        try:
            summary = self.get_summary()
            account = summary.get("account_id", "Unknown")
            count = summary.get("transaction_count", 0)
            return (
                f"{self.__class__.__name__}: "
                f"Account {account}, "
                f"{count} transactions"
            )
        except Exception:
            # Printing a parser must never raise; fall back to the basics.
            return f"{self.__class__.__name__}(file='{self.file_name}')"