bankstatementparser 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,971 @@
1
+ # Copyright (C) 2023 Sebastien Rousseau.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
12
+ # implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ camt_parser.py
18
+
19
+ Provides a class CamtParser for parsing CAMT format bank statement files.
20
+ """
21
+
22
+ import logging
23
+ import re
24
+ from collections.abc import Generator
25
+ from io import BytesIO
26
+ from pathlib import Path
27
+ from typing import Optional, Union
28
+
29
+ import pandas as pd
30
+ from lxml import etree
31
+
32
+ from .base_parser import BankStatementParser
33
+ from .input_validator import InputValidator, ValidationError
34
+ from .record_types import (
35
+ BalanceRecord,
36
+ StatementStatsRecord,
37
+ SummaryRecord,
38
+ TransactionRecord,
39
+ )
40
+
41
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Regex fragment for the URN of any CAMT schema version,
# e.g. ``urn:iso:std:iso:20022:tech:xsd:camt.053.001.02``.
_CAMT_NAMESPACE_URN = (
    r"urn:iso:std:iso:20022:tech:xsd:camt\.\d{3}\.\d{3}\.\d{2}"
)

# Matches a default-namespace declaration for a CAMT schema so it can be
# removed before parsing, letting the XPath queries use bare tag names.
# XML permits either quote character around an attribute value and optional
# whitespace around "=", so both forms are accepted; the opening and closing
# quote must be the same character. The group is non-capturing so that
# match/findall results keep the same shape as a group-less pattern.
CAMT_NAMESPACE_PATTERN = re.compile(
    rf"""\s+xmlns\s*=\s*(?:"{_CAMT_NAMESPACE_URN}"|'{_CAMT_NAMESPACE_URN}')"""
)
47
+
48
+
49
class CamtParser(BankStatementParser):
    """
    Parser for CAMT format bank statement files.

    Attributes:
        tree (etree.Element): Element produced by parsing the XML input.
        definitions (dict): Maps balance codes to their descriptions.
    """

    # Filesystem path of the source, or None for in-memory sources.
    _file_path: Optional[str]
    # Name used to identify the source in log and error messages.
    _source_name: str
    # True when the parser was built from in-memory content.
    _source_is_memory: bool
    # Normalized XML payload kept for later re-parsing.
    _xml_bytes: bytes

    def __init__(self, file_name: Union[str, Path]) -> None:
        """
        Build a parser for the CAMT statement stored at ``file_name``.

        The base class is initialised first, then the file is read and
        parsed, and finally the balance code definitions are installed.

        Parameters:
            file_name (str | Path): Path to the CAMT format statement file.

        Raises:
            FileNotFoundError: If file does not exist.
            ValidationError: If file validation fails.
            etree.XMLSyntaxError: If there is an issue parsing the XML.
        """
        super().__init__(file_name)
        self._initialize_from_file(file_name)
        self._set_definitions()
79
+
80
@classmethod
def from_string(
    cls,
    xml_content: str,
    *,
    source_name: str = "<memory>",
    max_bytes: Optional[int] = None,
) -> "CamtParser":
    """
    Create a parser from an in-memory XML string.

    Parameters:
        xml_content (str): The XML document as text.
        source_name (str): Label for the source, forwarded to the
            in-memory constructor.
        max_bytes (Optional[int]): Size limit forwarded to the in-memory
            constructor.

    Returns:
        CamtParser: The parser built by ``_from_memory``.
    """
    options = {"source_name": source_name, "max_bytes": max_bytes}
    return cls._from_memory(xml_content, **options)
92
+
93
@classmethod
def from_bytes(
    cls,
    xml_content: bytes,
    *,
    source_name: str = "<memory>",
    max_bytes: Optional[int] = None,
) -> "CamtParser":
    """
    Create a parser from in-memory XML bytes.

    Parameters:
        xml_content (bytes): The XML document as raw bytes.
        source_name (str): Label for the source, forwarded to the
            in-memory constructor.
        max_bytes (Optional[int]): Size limit forwarded to the in-memory
            constructor.

    Returns:
        CamtParser: The parser built by ``_from_memory``.
    """
    options = {"source_name": source_name, "max_bytes": max_bytes}
    return cls._from_memory(xml_content, **options)
105
+
106
@classmethod
def _from_memory(
    cls,
    xml_content: Union[str, bytes],
    *,
    source_name: str,
    max_bytes: Optional[int],
) -> "CamtParser":
    """
    Internal constructor for memory-backed XML sources.

    Parameters:
        xml_content (str | bytes): The XML document.
        source_name (str): Label for the source; the validator returns the
            name that is actually stored on the parser.
        max_bytes (Optional[int]): Passed to ``InputValidator`` as
            ``max_file_size``.

    Returns:
        CamtParser: A parser whose tree and definitions are populated.
    """
    content_validator = InputValidator(max_file_size=max_bytes)
    validated = content_validator.validate_xml_content(
        xml_content, source_name=source_name
    )
    payload, display_name = validated

    # __new__ is used so that __init__, which reads from a file, is skipped;
    # the base class initialiser is invoked explicitly instead.
    instance = cls.__new__(cls)
    BankStatementParser.__init__(instance, display_name)

    instance._original_file_name = display_name
    instance._file_path = None
    instance._source_name = display_name
    instance._source_is_memory = True

    # _normalize_xml_bytes reads _source_name, so it must be set above.
    instance._xml_bytes = instance._normalize_xml_bytes(payload)
    instance.tree = instance._parse_xml_bytes(
        instance._xml_bytes, instance._source_name
    )
    instance._set_definitions()
    return instance
132
+
133
def _initialize_from_file(
    self, file_name: Union[str, Path]
) -> None:
    """
    Populate parser state from a file on disk.

    String paths go through ``InputValidator.validate_input_file_path``;
    any other value (e.g. a ``Path``) is used as given.

    Parameters:
        file_name (str | Path): Location of the CAMT file.

    Raises:
        ValidationError: Re-raised from path validation or file reading.
        FileNotFoundError: Re-raised from path validation or file reading.
    """
    validator = InputValidator()
    validated_path: Union[str, Path]

    if not isinstance(file_name, str):
        validated_path = file_name
    else:
        try:
            validated_path = validator.validate_input_file_path(file_name)
        except (ValidationError, FileNotFoundError) as e:
            logger.error(
                "File validation failed for %s: %s", file_name, e
            )
            raise
        else:
            logger.info("Input file validated: %s", validated_path)

    resolved = str(validated_path)
    self._original_file_name = file_name
    self._file_path = resolved
    self._source_name = resolved
    self._source_is_memory = False

    # Read, normalize, then parse; the normalized bytes are kept on the
    # instance because parse_streaming re-reads them.
    file_bytes = self._read_xml_file_bytes(self._file_path)
    self._xml_bytes = self._normalize_xml_bytes(file_bytes)
    self.tree = self._parse_xml_bytes(
        self._xml_bytes, self._source_name
    )
162
+
163
+ def _set_definitions(self) -> None:
164
+ """Set static balance code definitions."""
165
+ self.definitions = {
166
+ "OPBD": "Opening Booked balance",
167
+ "CLBD": "Closing Booked balance",
168
+ "CLAV": "Closing Available balance",
169
+ "PRCD": "Previously Closed Booked balance",
170
+ "FWAV": "Forward Available balance",
171
+ }
172
+
173
+ def _read_xml_file_bytes(self, file_name: str) -> bytes:
174
+ """Read XML file bytes without exposing payload contents in errors."""
175
+ try:
176
+ with open(file_name, "rb") as f:
177
+ return f.read()
178
+ except FileNotFoundError as exc:
179
+ logger.error("File %s not found!", file_name)
180
+ raise FileNotFoundError(
181
+ f"CAMT file not found: {file_name}"
182
+ ) from exc
183
+ except PermissionError as exc:
184
+ logger.error(
185
+ "Permission denied reading file: %s", file_name
186
+ )
187
+ raise ValidationError(
188
+ f"Permission denied reading file: {file_name}"
189
+ ) from exc
190
+ except OSError as e:
191
+ logger.error(
192
+ "An error occurred while reading the file: %s", str(e)
193
+ )
194
+ raise ValidationError(
195
+ f"Error reading file {file_name}: {str(e)}"
196
+ ) from e
197
+
198
def _normalize_xml_bytes(self, raw_bytes: bytes) -> bytes:
    """
    Validate the payload and strip the CAMT default namespace from it.

    The bytes are validated with a default ``InputValidator``, decoded as
    UTF-8, stripped of every match of ``CAMT_NAMESPACE_PATTERN`` and encoded
    back to UTF-8.

    Parameters:
        raw_bytes (bytes): XML payload as read from the source.

    Returns:
        bytes: UTF-8 encoded XML without the matched namespace declaration.
    """
    checked_bytes, _ = InputValidator().validate_xml_content(
        raw_bytes, source_name=self._source_name
    )
    text = checked_bytes.decode("utf-8")
    return CAMT_NAMESPACE_PATTERN.sub("", text).encode("utf-8")
207
+
208
def _parse_xml_bytes(
    self, data_bytes: bytes, source_name: str
) -> etree._Element:
    """
    Parse normalized XML bytes with hardened lxml settings.

    A strict parse is attempted first. If it fails with a message that
    points at entity or DTD handling, the document is parsed again with
    ``recover=True`` (entities still unresolved, DTD still not loaded).

    Parameters:
        data_bytes (bytes): Normalized XML payload.
        source_name (str): Label for the source, used in log messages.

    Returns:
        etree._Element: The element returned by ``etree.fromstring``.

    Raises:
        etree.XMLSyntaxError: If the document cannot be parsed, including
            when the recovering parser salvages nothing.
    """
    # Settings shared by the strict and the recovering parser: no entity
    # resolution, no DTD loading, no network access, no huge-tree mode.
    hardened_options = {
        "encoding": "utf-8",
        "resolve_entities": False,
        "load_dtd": False,
        "no_network": True,
        "huge_tree": False,
    }
    # Substrings of the (lower-cased) strict error that allow the recovery
    # attempt. "entity" also covers "undefined entity"/"undeclared entity".
    entity_markers = ("entity", "doctype", "dtd", "internal error")

    try:
        strict_parser = etree.XMLParser(recover=False, **hardened_options)
        try:
            return etree.fromstring(data_bytes, strict_parser)
        except etree.XMLSyntaxError as strict_err:
            error_msg = str(strict_err).lower()
            if not any(marker in error_msg for marker in entity_markers):
                raise
            recovery_parser = etree.XMLParser(
                recover=True, **hardened_options
            )
            recovered = etree.fromstring(data_bytes, recovery_parser)
            if recovered is None:
                # A recovering parser returns None when nothing could be
                # salvaged. Surface the strict error instead of handing
                # callers a None tree that fails later on ``.xpath``.
                raise
            return recovered
    except etree.XMLSyntaxError as e:
        logger.error(
            "XML syntax error in %s: %s", source_name, str(e)
        )
        raise
    except Exception as e:
        logger.error(
            "An error occurred while parsing XML from %s: %s",
            source_name,
            str(e),
        )
        raise
259
+
260
def get_account_balances(
    self, redact_pii: bool = False
) -> pd.DataFrame:
    """
    Collect the balances of every statement into one DataFrame.

    Parameters:
        redact_pii (bool): Accepted but not used by this method.

    Returns:
        pd.DataFrame: One row per balance, built from the balance records
            with an added AccountId column. Empty when no balances exist.

    Raises:
        ValueError: If a statement has child elements, yields no balances,
            and contains none of Acct, Ntry or Bal.
    """
    # Elements whose presence marks a statement as recognisable CAMT.
    marker_queries = ("./Acct", "./Ntry", ".//Bal")
    collected = []

    for stmt in self.tree.xpath(".//Stmt"):
        stmt_balances = self._get_balances_for_statement(stmt)

        # A non-empty statement without balances is only acceptable if it
        # at least carries an account, entries or balance elements.
        if not stmt_balances and len(stmt) > 0:
            recognised = [bool(stmt.xpath(query)) for query in marker_queries]
            if not any(recognised):
                raise ValueError(
                    "Malformed CAMT statement structure: "
                    "statement contains unrecognized elements"
                )

        # Tag every balance with the account it belongs to.
        owner_id = self._get_account_id(stmt)
        for record in stmt_balances:
            record["AccountId"] = owner_id

        collected.extend(stmt_balances)

    return pd.DataFrame.from_records(collected)
309
+
310
def _get_balances_for_statement(
    self, statement: etree._Element
) -> list[BalanceRecord]:
    """
    Extract the balance records of one statement.

    Balance elements lacking an amount, a currency attribute, a
    credit/debit indicator or a date are skipped with a warning.

    Parameters:
        statement (etree.Element): XML Element representing the statement.

    Returns:
        list: One dictionary per usable Bal element with the keys Amount,
            Currency, Code, Description, DrCr and Date.
    """
    records: list[BalanceRecord] = []

    for bal in statement.xpath(".//Bal"):
        amounts = bal.xpath(".//Amt")
        currencies = bal.xpath(".//Amt/@Ccy")
        indicators = bal.xpath(".//CdtDbtInd")
        dates = bal.xpath("./Dt/Dt|./Dt/DtTm")

        if not (amounts and currencies and indicators and dates):
            logger.warning(
                "Skipping malformed balance element: missing required fields"
            )
            continue

        # The balance type is given either as a code (Cd) or as a
        # proprietary value (Prtry); Cd wins when both are present.
        coded = bal.xpath(".//Cd")
        proprietary = bal.xpath(".//Prtry")
        if coded:
            code = coded[0].text
        elif proprietary:
            code = f"Proprietary: {proprietary[0].text}"
        else:
            logger.warning(
                "Balance element missing both Cd and Prtry type elements, using N/A"
            )
            code = "N/A"

        indicator = indicators[0].text
        value = float(amounts[0].text)
        # Debit balances are reported as negative amounts.
        signed_value = -value if indicator == "DBIT" else value

        records.append(
            {
                "Amount": signed_value,
                "Currency": currencies[0],
                "Code": code,
                "Description": self.definitions.get(code, "Unknown code"),
                "DrCr": indicator,
                "Date": dates[0].text,
            }
        )

    return records
382
+
383
def get_transactions(
    self, redact_pii: bool = False
) -> pd.DataFrame:
    """
    Collect the transactions of every statement into one DataFrame.

    Parameters:
        redact_pii (bool): Forwarded to the per-statement extraction, which
            redacts address fields when it is True.

    Returns:
        pd.DataFrame: One row per transaction, built from the transaction
            records with an added AccountId column.
    """
    collected = []

    for stmt in self.tree.xpath(".//Stmt"):
        stmt_transactions = self._get_transactions_for_statement(
            stmt, redact_pii
        )

        # Tag every transaction with the account it belongs to.
        owner_id = self._get_account_id(stmt)
        for record in stmt_transactions:
            record["AccountId"] = owner_id

        collected.extend(stmt_transactions)

    return pd.DataFrame.from_records(collected)
417
+
418
def _get_transactions_for_statement(
    self, statement: etree._Element, redact_pii: bool = False
) -> list[TransactionRecord]:
    """
    Extract the transaction records of one statement.

    Entries lacking an amount, a currency attribute or a credit/debit
    indicator are skipped with a warning.

    Parameters:
        statement (etree.Element): XML Element representing the statement.
        redact_pii (bool): Whether to redact PII data (address fields).

    Returns:
        list: One dictionary per usable Ntry element with the keys Amount,
            Currency, DrCr, Debtor, Creditor, Reference, ValDt and BookgDt,
            plus DebtorAddress / CreditorAddress when an address is present.
    """

    def first_text(node, *queries):
        # Text of the first match of the first query that matches anything;
        # an empty string when no query matches.
        for query in queries:
            found = node.xpath(query)
            if found:
                return found[0].text
        return ""

    records: list[TransactionRecord] = []

    for entry in statement.xpath("./Ntry"):
        amount_nodes = entry.xpath("./Amt")
        currency_values = entry.xpath("./Amt/@Ccy")
        indicator_nodes = entry.xpath("./CdtDbtInd")

        if not (amount_nodes and currency_values and indicator_nodes):
            logger.warning(
                "Skipping malformed transaction entry: missing required fields"
            )
            continue

        value = float(amount_nodes[0].text)
        indicator = indicator_nodes[0].text

        record: TransactionRecord = {
            # Debit entries are reported as negative amounts.
            "Amount": -value if indicator == "DBIT" else value,
            "Currency": currency_values[0],
            "DrCr": indicator,
            "Debtor": first_text(entry, ".//Dbtr/Nm"),
            "Creditor": first_text(entry, ".//Cdtr/Nm"),
            # All non-empty unstructured remittance lines, concatenated.
            "Reference": "".join(
                [ref.text for ref in entry.xpath(".//Ustrd") if ref.text]
            ),
            "ValDt": first_text(entry, "./ValDt/Dt", "./ValDt/DtTm"),
            "BookgDt": first_text(entry, "./BookgDt/Dt", "./BookgDt/DtTm"),
        }

        # Addresses: prefer the free-form address line, fall back to the
        # street name.
        addresses = (
            (
                "DebtorAddress",
                first_text(
                    entry,
                    ".//Dbtr/PstlAdr/AdrLine",
                    ".//Dbtr/PstlAdr/StrtNm",
                ),
            ),
            (
                "CreditorAddress",
                first_text(
                    entry,
                    ".//Cdtr/PstlAdr/AdrLine",
                    ".//Cdtr/PstlAdr/StrtNm",
                ),
            ),
        )
        # Address keys are only added when an address exists; with
        # redaction on, the value is replaced by a fixed marker.
        for key, address in addresses:
            if address:
                record[key] = "***REDACTED***" if redact_pii else address

        records.append(record)

    return records
583
+
584
def _get_element_text(
    self, parent: etree._Element, xpath: str
) -> str:
    """
    Return the text of the first element matched by ``xpath``.

    Parameters:
        parent (etree.Element): Element the expression is evaluated on.
        xpath (str): XPath expression selecting the wanted element.

    Returns:
        str: Text of the first match, or an empty string when nothing
            matches.
    """
    matches = parent.xpath(xpath)
    if not matches:
        return ""
    return matches[0].text
600
+
601
def _get_account_id(self, statement: etree._Element) -> str:
    """
    Return the account identifier of a bank statement.

    The identifier is looked up as ``Acct/Id/IBAN`` or ``Acct/Id/Othr/Id``;
    the first match is used.

    Parameters:
        statement (etree.Element): XML Element representing the bank
            statement.

    Returns:
        str: Account ID, or an empty string when neither element exists.
    """
    candidates = statement.xpath("./Acct/Id/IBAN|./Acct/Id/Othr/Id")
    if not candidates:
        return ""
    return candidates[0].text
614
+
615
def get_statement_stats(
    self, redact_pii: bool = False
) -> pd.DataFrame:
    """
    Build a DataFrame with one statistics row per bank statement.

    Parameters:
        redact_pii (bool): Forwarded to the per-statement helper.

    Returns:
        pd.DataFrame: Dataframe with columns:
            StatementId, AccountId, StatementCreated, NumTransactions,
            NetAmount.
    """
    per_statement = [
        self._get_statement_stats(stmt, redact_pii)
        for stmt in self.tree.xpath(".//Stmt")
    ]
    return pd.DataFrame.from_records(per_statement)
638
+
639
def _get_statement_stats(
    self, statement: etree._Element, redact_pii: bool = False
) -> StatementStatsRecord:
    """
    Compute the statistics of a single bank statement.

    NumTransactions counts every Ntry child. NetAmount sums only the
    entries that carry both an amount and a credit/debit indicator, with
    debits counted as negative.

    Parameters:
        statement (etree.Element): XML Element representing the bank
            statement.
        redact_pii (bool): Accepted but not used by this method.

    Returns:
        dict: Statement statistics with the keys StatementId, AccountId,
            StatementCreated, NumTransactions and NetAmount.
    """

    def child_text(query):
        # Text of the first match, or "" when the element is absent.
        found = statement.xpath(query)
        return found[0].text if found else ""

    account_id = self._get_account_id(statement)
    statement_id = child_text("./Id")
    created = child_text("./CreDtTm")

    entries = statement.xpath("./Ntry")

    # The net amount is derived straight from the entry elements instead of
    # building full transaction records.
    net_amount = 0.0
    for entry in entries:
        amount_nodes = entry.xpath("./Amt")
        indicator_nodes = entry.xpath("./CdtDbtInd")
        if not (amount_nodes and indicator_nodes):
            continue
        value = float(amount_nodes[0].text)
        if indicator_nodes[0].text == "DBIT":
            value = -value
        net_amount += value

    return {
        "StatementId": statement_id,
        "AccountId": account_id,
        "StatementCreated": created,
        "NumTransactions": len(entries),
        "NetAmount": net_amount,
    }
689
+
690
+ def __repr__(self) -> str:
691
+ """
692
+ Returns a string representation of the parsed data.
693
+
694
+ Returns:
695
+ str: String representation.
696
+ """
697
+ return str(self.get_statement_stats())
698
+
699
def parse(self, redact_pii: bool = False) -> pd.DataFrame:
    """
    Parse the CAMT data and return its transactions.

    This is a thin wrapper around ``get_transactions``.

    Parameters:
        redact_pii (bool): Whether to redact PII data (address fields).

    Returns:
        pd.DataFrame: Whatever ``get_transactions`` returns.
    """
    transactions = self.get_transactions(redact_pii=redact_pii)
    return transactions
710
+
711
def parse_streaming(
    self, redact_pii: bool = False
) -> Generator[TransactionRecord, None, None]:
    """
    Iterate over the transactions using incremental XML parsing.

    The normalized XML bytes held by the parser are re-read with
    ``etree.iterparse`` and each Ntry element is converted and released as
    soon as it has been read. Entries that fail to convert are logged and
    skipped.

    Parameters:
        redact_pii (bool): Whether to redact PII data (address fields).

    Yields:
        TransactionRecord: One transaction dictionary per entry, carrying
            the account id of the statement the entry belongs to.
    """
    source_stream = BytesIO(self._xml_bytes)

    # Account id of the statement currently being read.
    current_account_id = ""

    for event, elem in etree.iterparse(
        source_stream,
        events=("start", "end"),
        resolve_entities=False,
        load_dtd=False,
        no_network=True,
        huge_tree=False,
    ):
        if event == "start":
            if elem.tag == "Stmt":
                # A new statement must not inherit the previous account.
                current_account_id = ""
            continue

        # From here on only "end" events are handled.
        if elem.tag == "Acct":
            # The id has to be captured when the statement's Acct element
            # closes: waiting for the end of Stmt is too late, because the
            # entries have been yielded by then and the cleanup below has
            # already removed Acct from the tree.
            parent = elem.getparent()
            if (
                parent is not None
                and parent.tag == "Stmt"
                and not current_account_id
            ):
                id_elems = elem.xpath("./Id/IBAN|./Id/Othr/Id")
                current_account_id = (
                    id_elems[0].text if id_elems else ""
                ) or ""

        elif elem.tag == "Stmt":
            current_account_id = ""

        elif elem.tag == "Ntry":
            transaction_data = None
            try:
                transaction_data = self._parse_streaming_transaction(
                    elem, current_account_id, redact_pii
                )
            except Exception as e:
                logger.warning("Error parsing transaction: %s", e)
            finally:
                # Release the processed entry and every sibling before it
                # so memory use does not grow with the number of entries.
                elem.clear()
                while elem.getprevious() is not None:
                    del elem.getparent()[0]

            # Yield outside the try block so that an exception thrown into
            # the generator by the consumer is not mistaken for a parse
            # error and swallowed.
            if transaction_data is not None:
                yield transaction_data
768
+
769
def _parse_streaming_transaction(
    self,
    entry_elem: etree._Element,
    account_id: str,
    redact_pii: bool = False,
) -> TransactionRecord:
    """
    Convert a single Ntry element into a transaction record.

    Parameters:
        entry_elem (etree.Element): XML element representing a transaction
            entry.
        account_id (str): Account ID stored on the record.
        redact_pii (bool): Whether to redact PII data (address fields).

    Returns:
        TransactionRecord: Dictionary with the keys Amount, Currency, DrCr,
            Debtor, Creditor, Reference, ValDt, BookgDt and AccountId, plus
            DebtorAddress / CreditorAddress when an address is present.
    """

    def first_found(node, *paths):
        # First element found by any of the paths, tried in order.
        for path in paths:
            hit = node.find(path)
            if hit is not None:
                return hit
        return None

    # find() with simple child paths is used here rather than xpath().
    amt_elem = entry_elem.find("Amt")
    if amt_elem is not None:
        amount = float(amt_elem.text)
        currency = amt_elem.get("Ccy", "")
    else:
        amount = 0.0
        currency = ""

    cdt_dbt_elem = entry_elem.find("CdtDbtInd")
    cdt_dbt = cdt_dbt_elem.text if cdt_dbt_elem is not None else ""

    # Debit entries are reported as negative amounts.
    if cdt_dbt == "DBIT":
        amount = -amount

    # Party information is read from the first transaction detail block.
    debtor = ""
    creditor = ""
    debtor_addr = ""
    creditor_addr = ""

    tx_dtls = entry_elem.find("NtryDtls/TxDtls")
    if tx_dtls is not None:
        dbtr = tx_dtls.find("RltdPties/Dbtr/Nm")
        if dbtr is not None:
            debtor = dbtr.text or ""
        cdtr = tx_dtls.find("RltdPties/Cdtr/Nm")
        if cdtr is not None:
            creditor = cdtr.text or ""

        # Prefer the free-form address line, fall back to the street name.
        da = first_found(
            tx_dtls,
            "RltdPties/Dbtr/PstlAdr/AdrLine",
            "RltdPties/Dbtr/PstlAdr/StrtNm",
        )
        if da is not None:
            debtor_addr = da.text or ""

        ca = first_found(
            tx_dtls,
            "RltdPties/Cdtr/PstlAdr/AdrLine",
            "RltdPties/Cdtr/PstlAdr/StrtNm",
        )
        if ca is not None:
            creditor_addr = ca.text or ""

    # Concatenate every non-empty Ustrd line found anywhere in the entry,
    # exactly as the non-streaming extraction does. Taking only the first
    # line made the two code paths disagree on multi-line references. This
    # also covers dialects that place Ustrd outside TxDtls.
    reference = "".join(
        ustrd.text for ustrd in entry_elem.iter("Ustrd") if ustrd.text
    )

    # Dates: a plain date is preferred over a date-time.
    val_date_elem = first_found(entry_elem, "ValDt/Dt", "ValDt/DtTm")
    val_date = val_date_elem.text if val_date_elem is not None else ""

    booking_date_elem = first_found(
        entry_elem, "BookgDt/Dt", "BookgDt/DtTm"
    )
    booking_date = (
        booking_date_elem.text if booking_date_elem is not None else ""
    )

    # Apply PII redaction if requested; only existing addresses are masked.
    if redact_pii:
        if debtor_addr:
            debtor_addr = "***REDACTED***"
        if creditor_addr:
            creditor_addr = "***REDACTED***"

    result: TransactionRecord = {
        "Amount": amount,
        "Currency": currency,
        "DrCr": cdt_dbt,
        "Debtor": debtor,
        "Creditor": creditor,
        "Reference": reference,
        "ValDt": val_date,
        "BookgDt": booking_date,
        "AccountId": account_id,
    }

    # Only add address fields if they exist.
    if debtor_addr:
        result["DebtorAddress"] = debtor_addr
    if creditor_addr:
        result["CreditorAddress"] = creditor_addr

    return result
897
+
898
def get_summary(self) -> SummaryRecord:
    """
    Summarise the parsed CAMT statement data.

    Account details come from the first statement only. The currency is
    taken from the first transaction, and the opening and closing balances
    from the first OPBD and CLBD balance rows.

    Returns:
        SummaryRecord: Dictionary that may contain account_id,
            statement_date, transaction_count, total_amount, currency,
            opening_balance and closing_balance. It is empty when there
            are neither statements nor balances.
    """
    stats_df = self.get_statement_stats()
    balances_df = self.get_account_balances()

    summary: SummaryRecord = {}

    if not stats_df.empty:
        # Most files hold a single statement, so only the first is used.
        head = stats_df.iloc[0]
        summary = {
            "account_id": head.get("AccountId", "Unknown"),
            "statement_date": head.get("StatementCreated", "Unknown"),
            "transaction_count": head.get("NumTransactions", 0),
            "total_amount": head.get("NetAmount", 0.0),
            # Placeholder, replaced below when a transaction is available.
            "currency": "Unknown",
        }

        transactions = self.get_transactions()
        if not transactions.empty:
            first_transaction = transactions.iloc[0]
            summary["currency"] = first_transaction.get(
                "Currency", "Unknown"
            )

    if not balances_df.empty:
        # Summary key -> balance code whose first row supplies the amount.
        wanted = (
            ("opening_balance", "OPBD"),
            ("closing_balance", "CLBD"),
        )
        selections = [
            (key, balances_df[balances_df["Code"] == code])
            for key, code in wanted
        ]
        for key, rows in selections:
            if not rows.empty:
                summary[key] = rows.iloc[0]["Amount"]

    return summary
948
+
949
def camt_to_excel(self, filename: str) -> None:
    """
    Write balances, transactions and statistics to an Excel workbook.

    Parameters:
        filename (str): Path to the output Excel file.
    """
    # Sheet name paired with the DataFrame written to it, in sheet order.
    sheets = (
        ("Balances", self.get_account_balances()),
        ("Transactions", self.get_transactions()),
        ("Stats", self.get_statement_stats()),
    )

    # The workbook is written with the openpyxl engine.
    # pylint: disable=E0110
    with pd.ExcelWriter(filename, engine="openpyxl") as writer:
        for sheet_name, frame in sheets:
            frame.to_excel(writer, sheet_name=sheet_name, index=False)