bankstatementparser 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,575 @@
1
+ # Copyright (C) 2023 Sebastien Rousseau.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
12
+ # implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ This module provides a command line interface for parsing bank statement
18
+ files in various formats. Currently, it supports CAMT (ISO 20022) format, with
19
+ potential to extend support to other formats.
20
+ """
21
+
22
+ import argparse
23
+ import logging
24
+ import os
25
+ import sys
26
+ from pathlib import Path
27
+ from typing import Optional
28
+
29
+ import pandas as pd
30
+
31
+ from bankstatementparser import CamtParser, Pain001Parser
32
+ from bankstatementparser.input_validator import (
33
+ InputValidator,
34
+ ValidationError,
35
+ )
36
+
37
# Module-level logger named after this module. It is only obtained here;
# level, format and handlers are applied to the root logger by
# setup_logging().
logger = logging.getLogger(__name__)
39
+
40
+
41
def setup_logging(level: int = logging.INFO) -> None:
    """
    Configure root logging for the CLI application.

    Records are written to standard error so that parsed data printed to
    standard output stays separate from log output.

    Args:
        level (int): Logging level passed to ``logging.basicConfig``
            (default: ``logging.INFO``).
    """
    stderr_handler = logging.StreamHandler(sys.stderr)
    record_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(
        level=level,
        format=record_format,
        handlers=[stderr_handler],
    )
53
+
54
+
55
class BankStatementCLI:
    """
    A command line interface for parsing bank statement files.

    The CLI validates the input/output paths, instantiates the parser
    selected with ``--type`` (``camt`` or ``pain001``) and either prints
    the parsed data or writes it to a CSV file. Console output is redacted
    unless ``--show-pii`` is given.
    """

    # Upper bound on the number of records shown on the console in
    # streaming mode, so large files do not flood the terminal.
    _MAX_CONSOLE_RECORDS = 100

    def __init__(self) -> None:
        """Initialize the CLI by setting up the argument parser."""
        self.parser = self.setup_arg_parser()
        self.validator = InputValidator()

    def _sanitize_file_path(self, file_path: str) -> str:
        """
        Convert a user supplied path to an absolute path.

        Paths that lie outside both the current working directory and the
        system temp directory are logged but still returned; the actual
        accept/reject decision is left to the validator used by ``run``.

        Args:
            file_path (str): Input file path to sanitize.

        Returns:
            str: Absolute version of ``file_path``.

        Raises:
            ValueError: If ``file_path`` is None.
        """
        import tempfile

        if file_path is None:
            raise ValueError("File path cannot be None")

        abs_path = os.path.abspath(file_path)
        cwd = os.path.abspath(os.getcwd())
        system_temp = os.path.abspath(tempfile.gettempdir())

        try:
            inside_cwd = os.path.commonpath([abs_path, cwd]) == cwd
        except ValueError:
            # Different drives on Windows or other path issues
            logger.warning(f"Path validation warning for: {file_path}")
            return abs_path

        if not inside_cwd:
            # Compare whole path components. A plain string prefix test
            # would wrongly treat e.g. "/tmp_other/x" as inside "/tmp".
            try:
                inside_temp = (
                    os.path.commonpath([abs_path, system_temp])
                    == system_temp
                )
            except ValueError:
                # Temp directory lives on another drive: not inside it.
                inside_temp = False

            if not inside_temp:
                # For production, you might want to be more restrictive
                logger.info(
                    f"Path outside working directory: {file_path}"
                )

        return abs_path

    def _redact_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Redact sensitive PII columns in a DataFrame.

        A column is treated as sensitive when its label contains one of
        the PII keywords (case-insensitive). Every value of such a column
        is replaced by the literal ``***REDACTED***``.

        Args:
            df (pd.DataFrame): Original DataFrame; it is not modified.

        Returns:
            pd.DataFrame: Copy of ``df`` with PII columns redacted.
        """
        pii_keywords = ("address", "iban", "account", "name", "bic")
        redacted_df = df.copy()

        for column in redacted_df.columns:
            # Labels are not guaranteed to be strings (e.g. positional
            # integer labels), so normalise before matching.
            label = str(column).lower()
            if any(keyword in label for keyword in pii_keywords):
                redacted_df[column] = "***REDACTED***"

        return redacted_df

    def setup_arg_parser(self) -> argparse.ArgumentParser:
        """
        Set up the command line argument parser.

        Returns:
            argparse.ArgumentParser: The configured argument parser.
        """
        parser = argparse.ArgumentParser(
            description="Parse bank statement files."
        )
        parser.add_argument(
            "--type",
            type=str,
            required=True,
            choices=["camt", "pain001"],
            help='Type of the bank statement file: "camt" or "pain001".',
        )
        parser.add_argument(
            "--input",
            type=str,
            required=True,
            help="Path to the bank statement file.",
        )
        parser.add_argument(
            "--output",
            type=str,
            required=False,
            help="Path to save parsed data; if not provided, data is printed.",
        )
        parser.add_argument(
            "--max-size",
            type=int,
            required=False,
            default=100,
            help="Maximum file size in MB (default: 100MB).",
        )
        parser.add_argument(
            "--verbose",
            "-v",
            action="store_true",
            help="Enable verbose debug logging.",
        )
        parser.add_argument(
            "--show-pii",
            action="store_true",
            help="Display unredacted PII data in console output (default: False).",
        )
        parser.add_argument(
            "--streaming",
            action="store_true",
            help="Use streaming XML parsing to keep memory usage under 50MB for large files (default: False).",
        )
        return parser

    def _resolve_output_path(self, output_path: Path) -> str:
        """Return ``output_path`` with its file name passed through the validator's safe-filename filter."""
        safe_name = self.validator.get_safe_filename(output_path.name)
        return str(output_path.parent / safe_name)

    def _print_frame(self, data_df: pd.DataFrame, show_pii: bool) -> None:
        """Print ``data_df`` to the console, redacted unless ``show_pii`` is set."""
        if show_pii:
            print("WARNING: Displaying unredacted PII data")
            print(data_df)
        else:
            print(self._redact_dataframe(data_df))

    def _stream_records_to_csv(self, records, destination: str) -> int:
        """
        Write an iterable of records to ``destination`` as CSV, one at a time.

        Rows are written to ``<destination>.tmp`` first and moved into
        place with ``os.replace`` once every record has been written, so
        a failed run never leaves a half-written file at ``destination``.

        Args:
            records: Iterable of per-record mappings (one CSV row each).
            destination (str): Final path of the CSV file.

        Returns:
            int: Number of records written.
        """
        temp_output = f"{destination}.tmp"
        record_count = 0
        header_columns = None
        committed = False

        try:
            # newline="" lets pandas control line endings; without it the
            # text layer translates them again on Windows.
            with open(
                temp_output, "w", encoding="utf-8", newline=""
            ) as handle:
                for record in records:
                    record_count += 1
                    row_df = pd.DataFrame([record])

                    if header_columns is None:
                        # The first record defines the header row.
                        header_columns = list(row_df.columns)
                        row_df.to_csv(handle, index=False)
                        continue

                    if list(row_df.columns) != header_columns:
                        # Keep later rows aligned with the header even if
                        # a record lists its fields in a different order.
                        row_df = row_df.reindex(columns=header_columns)
                    row_df.to_csv(handle, index=False, header=False)

            # Atomically move temp file to final location
            os.replace(temp_output, destination)
            committed = True
        finally:
            if not committed:
                # Best-effort cleanup of the partial temp file; the
                # original error (if any) keeps propagating.
                try:
                    os.remove(temp_output)
                except OSError:
                    pass

        return record_count

    def _parse_streaming(
        self,
        parser,
        output_path: Optional[Path],
        show_pii: bool,
        noun: str,
    ) -> None:
        """
        Run ``parser.parse_streaming`` and save or print its records.

        Args:
            parser: Parser instance exposing ``parse_streaming(redact_pii=...)``.
            output_path (Path, optional): Where to save the CSV; when
                falsy a limited preview is printed instead.
            show_pii (bool): Whether to display unredacted PII data.
            noun (str): Plural record name used in user messages
                ("transactions" or "payments").
        """
        if output_path:
            destination = self._resolve_output_path(output_path)
            record_count = self._stream_records_to_csv(
                parser.parse_streaming(redact_pii=not show_pii),
                destination,
            )
            print(
                f"Parsed {record_count} {noun} in streaming mode, saved to {destination}"
            )
            return

        limit = self._MAX_CONSOLE_RECORDS
        preview = []
        for record in parser.parse_streaming(redact_pii=not show_pii):
            preview.append(record)
            # Stop pulling from the stream once the preview is full.
            if len(preview) >= limit:
                break

        self._print_frame(pd.DataFrame(preview), show_pii)

        if len(preview) >= limit:
            print(
                f"\n... (showing first {limit} {noun} in streaming mode)"
            )

    def _emit_frame(
        self,
        data_df: pd.DataFrame,
        output_path: Optional[Path],
        show_pii: bool,
    ) -> None:
        """Save a fully parsed DataFrame to ``output_path`` or print it."""
        if output_path:
            destination = self._resolve_output_path(output_path)
            # NOTE(review): the file export is written as parsed and is
            # not affected by ``show_pii``, unlike the streaming export
            # which asks the parser to redact. Confirm this is intended.
            data_df.to_csv(destination, index=False)
            print(f"Parsed data saved to {destination}")
        else:
            self._print_frame(data_df, show_pii)

    def parse_camt(
        self,
        file_path: Path,
        output_path: Optional[Path] = None,
        show_pii: bool = False,
        streaming: bool = False,
    ) -> None:
        """
        Parse a CAMT format bank statement file and print or save the results.

        Any failure is logged, reported on the console and ends the
        process via ``sys.exit(1)``.

        Args:
            file_path (Path): Validated path to the CAMT file.
            output_path (Path, optional): Validated path to save the parsed data.
                If None, data is printed to console.
            show_pii (bool): Whether to display unredacted PII data.
            streaming (bool): Whether to use streaming parsing for large files.
        """
        try:
            parser = CamtParser(str(file_path))

            if streaming:
                self._parse_streaming(
                    parser, output_path, show_pii, "transactions"
                )
            else:
                data = parser.get_statement_stats()
                # A single statement comes back as a dict; wrap it so it
                # becomes one DataFrame row.
                if isinstance(data, dict):
                    data = [data]
                self._emit_frame(
                    pd.DataFrame(data), output_path, show_pii
                )

        except FileNotFoundError as e:
            logger.error(f"File not found: {e}")
            print(f"Error: Input file not found - {str(e)}")
            sys.exit(1)
        except ValidationError as e:
            logger.error(f"Validation error: {e}")
            print(f"Error: Invalid input - {str(e)}")
            sys.exit(1)
        except Exception as e:
            logger.error(f"Unexpected error during CAMT parsing: {e}")
            print(f"Error: Failed to parse CAMT file - {str(e)}")
            sys.exit(1)

    def parse_pain(
        self,
        file_path: Path,
        output_path: Optional[Path] = None,
        show_pii: bool = False,
        streaming: bool = False,
    ) -> None:
        """
        Parse a PAIN.001 format bank statement file and print or save the
        results.

        Any failure is logged, reported on the console and ends the
        process via ``sys.exit(1)``.

        Args:
            file_path (Path): Validated path to the PAIN.001 file.
            output_path (Path, optional): Validated path to save the parsed data.
                If None, data is printed to console.
            show_pii (bool): Whether to display unredacted PII data.
            streaming (bool): Whether to use streaming parsing for large files.
        """
        try:
            parser = Pain001Parser(str(file_path))

            if streaming:
                self._parse_streaming(
                    parser, output_path, show_pii, "payments"
                )
            else:
                self._emit_frame(
                    pd.DataFrame(parser.parse()), output_path, show_pii
                )

        except FileNotFoundError as e:
            logger.error(f"File not found: {e}")
            print(f"Error: Input file not found - {str(e)}")
            sys.exit(1)
        except ValidationError as e:
            logger.error(f"Validation error: {e}")
            print(f"Error: Invalid input - {str(e)}")
            sys.exit(1)
        except Exception as e:
            logger.error(
                f"Unexpected error during PAIN.001 parsing: {e}"
            )
            print(f"Error: Failed to parse PAIN.001 file - {str(e)}")
            sys.exit(1)

    def run(self) -> None:
        """
        Parse command line arguments and perform the requested action.

        Validates input/output paths, configures logging, and delegates
        to the appropriate parser (CAMT or PAIN.001) based on the --type
        argument. Supports both traditional and streaming parsing modes.

        Raises:
            SystemExit: On invalid arguments, validation failure, or parse
                errors (status 1), and after ``--help`` (status 0).
        """
        if len(sys.argv) == 1:
            self.parser.print_help(sys.stderr)
            sys.exit(1)

        try:
            args = self.parser.parse_args()
        except SystemExit as exc:
            if exc.code == 0:
                # argparse exits with status 0 after printing --help;
                # that is a successful run, not an argument error.
                raise
            # argparse failed, which means required arguments are missing
            print("Error: Missing required arguments")
            sys.exit(1)
            return  # pragma: no cover

        # Check if required arguments are present (safety check)
        if (
            not hasattr(args, "input")
            or args.input is None
            or not hasattr(args, "type")
            or args.type is None
        ):
            print("Error: Missing required arguments")
            sys.exit(1)
            return  # Defensive programming: ensure we don't continue if sys.exit is mocked

        # Set up logging based on verbosity
        setup_logging(logging.DEBUG if args.verbose else logging.INFO)

        # Update validator max file size setting (MB -> bytes)
        self.validator.max_file_size = args.max_size * 1024 * 1024

        # Validate input file
        try:
            sanitized_input = self._sanitize_file_path(args.input)
            validated_input_path = (
                self.validator.validate_input_file_path(sanitized_input)
            )
            logger.info(f"Input file validated: {validated_input_path}")
        except (ValidationError, FileNotFoundError) as e:
            logger.error(f"Input validation failed: {e}")
            print(f"Error: {str(e)}")
            sys.exit(1)
            return  # Defensive programming: ensure we don't continue if sys.exit is mocked

        # Validate output file if provided
        validated_output_path = None
        if args.output:
            try:
                sanitized_output = self._sanitize_file_path(args.output)
                validated_output_path = (
                    self.validator.validate_output_file_path(
                        sanitized_output
                    )
                )
                logger.info(
                    f"Output file validated: {validated_output_path}"
                )
            except ValidationError as e:
                logger.error(f"Output validation failed: {e}")
                print(f"Error: {str(e)}")
                sys.exit(1)
                return  # Same guard as for the input path: never parse after a failed validation

        # Parse based on type
        try:
            if args.type == "camt":
                handler = self.parse_camt
            elif args.type == "pain001":
                handler = self.parse_pain
            else:
                print("Error: The specified type is not supported.")
                sys.exit(1)
                return

            # The streaming flag is only passed when set, so the
            # non-streaming call keeps its three-argument shape.
            call_args = [
                validated_input_path,
                validated_output_path,
                args.show_pii,
            ]
            if args.streaming:
                call_args.append(args.streaming)
            handler(*call_args)
        except Exception as e:
            logger.error(f"Parsing failed: {e}")
            print(f"Error: Parsing failed - {str(e)}")
            sys.exit(1)
571
+
572
+
573
# Script entry point: build the CLI and hand control to it.
if __name__ == "__main__":  # pragma: no cover
    BankStatementCLI().run()
@@ -0,0 +1,36 @@
1
+ # Copyright (C) 2023 Sebastien Rousseau.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
12
+ # implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Repository-wide exception hierarchy."""
17
+
18
+ from __future__ import annotations
19
+
20
+ from defusedxml.ElementTree import ParseError
21
+
22
+
23
class BankStatementParserError(Exception):
    """Base exception for parser-specific failures.

    The other exception classes in this module derive from it, so
    catching this type covers all of them.
    """
25
+
26
+
27
class ExportError(OSError, BankStatementParserError):
    """Raised when an export operation cannot complete safely.

    Inherits from both ``OSError`` and ``BankStatementParserError``, so
    handlers written for either base type will catch it.
    """
29
+
30
+
31
class ParserError(BankStatementParserError):
    """Raised when parser logic cannot produce a valid result.

    Subclass of ``BankStatementParserError``.
    """
33
+
34
+
35
class Pain001ParseError(ParseError, ParserError):  # type: ignore[misc]
    """Raised when PAIN.001 parsing fails after XML loading succeeds.

    Inherits from ``ParseError`` (imported from
    ``defusedxml.ElementTree``) as well as ``ParserError``, so handlers
    written for either base type will catch it.
    """