bankstatementparser 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bankstatementparser/__init__.py +82 -0
- bankstatementparser/additional_parsers.py +376 -0
- bankstatementparser/bank_statement_parsers.py +370 -0
- bankstatementparser/base_parser.py +205 -0
- bankstatementparser/camt_parser.py +971 -0
- bankstatementparser/cli.py +575 -0
- bankstatementparser/exceptions.py +36 -0
- bankstatementparser/input_validator.py +628 -0
- bankstatementparser/pain001_parser.py +742 -0
- bankstatementparser/parallel.py +127 -0
- bankstatementparser/record_types.py +94 -0
- bankstatementparser/transaction_deduplicator.py +402 -0
- bankstatementparser/transaction_models.py +196 -0
- bankstatementparser/zip_security.py +141 -0
- bankstatementparser-0.0.4.dist-info/METADATA +363 -0
- bankstatementparser-0.0.4.dist-info/RECORD +18 -0
- bankstatementparser-0.0.4.dist-info/WHEEL +4 -0
- bankstatementparser-0.0.4.dist-info/licenses/LICENSE +203 -0
|
@@ -0,0 +1,575 @@
|
|
|
1
|
+
# Copyright (C) 2023 Sebastien Rousseau.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
12
|
+
# implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
"""
|
|
17
|
+
This module provides a command line interface for parsing bank statement
|
|
18
|
+
files in various formats. It currently supports the CAMT (ISO 20022) and
PAIN.001 formats, with potential to extend support to other formats.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import argparse
|
|
23
|
+
import logging
|
|
24
|
+
import os
|
|
25
|
+
import sys
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
from typing import Optional
|
|
28
|
+
|
|
29
|
+
import pandas as pd
|
|
30
|
+
|
|
31
|
+
from bankstatementparser import CamtParser, Pain001Parser
|
|
32
|
+
from bankstatementparser.input_validator import (
|
|
33
|
+
InputValidator,
|
|
34
|
+
ValidationError,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
# Set up logging
|
|
38
|
+
logger = logging.getLogger(__name__)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def setup_logging(level: int = logging.INFO) -> None:
    """
    Install the logging configuration used by the CLI.

    Log records are sent to standard error and formatted with timestamp,
    logger name, severity and message.

    Args:
        level (int): Minimum severity to emit (default: INFO)
    """
    stderr_handler = logging.StreamHandler(sys.stderr)
    record_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(
        level=level,
        format=record_format,
        handlers=[stderr_handler],
    )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class BankStatementCLI:
|
|
56
|
+
"""A command line interface for parsing bank statement files."""
|
|
57
|
+
|
|
58
|
+
def __init__(self) -> None:
    """Create the CLI together with its argument parser and input validator."""
    arg_parser = self.setup_arg_parser()
    input_validator = InputValidator()
    self.parser = arg_parser
    self.validator = input_validator
|
|
62
|
+
|
|
63
|
+
def _sanitize_file_path(self, file_path: str) -> str:
|
|
64
|
+
"""
|
|
65
|
+
Sanitize and validate file path for security.
|
|
66
|
+
|
|
67
|
+
Args:
|
|
68
|
+
file_path (str): Input file path to sanitize.
|
|
69
|
+
|
|
70
|
+
Returns:
|
|
71
|
+
str: Sanitized absolute path.
|
|
72
|
+
|
|
73
|
+
Raises:
|
|
74
|
+
ValidationError: If path is invalid or potentially dangerous.
|
|
75
|
+
"""
|
|
76
|
+
# Check for None or empty path
|
|
77
|
+
if file_path is None:
|
|
78
|
+
raise ValueError("File path cannot be None")
|
|
79
|
+
|
|
80
|
+
# Convert to absolute path to prevent directory traversal
|
|
81
|
+
abs_path = os.path.abspath(file_path)
|
|
82
|
+
|
|
83
|
+
# Get the common path with current working directory to prevent escaping
|
|
84
|
+
cwd = os.path.abspath(os.getcwd())
|
|
85
|
+
try:
|
|
86
|
+
common_path = os.path.commonpath([abs_path, cwd])
|
|
87
|
+
# Allow paths under current working directory or use system temp directory
|
|
88
|
+
import tempfile
|
|
89
|
+
|
|
90
|
+
system_temp = os.path.abspath(tempfile.gettempdir())
|
|
91
|
+
if not (
|
|
92
|
+
common_path == cwd or abs_path.startswith(system_temp)
|
|
93
|
+
):
|
|
94
|
+
# For production, you might want to be more restrictive
|
|
95
|
+
logger.info(
|
|
96
|
+
f"Path outside working directory: {file_path}"
|
|
97
|
+
)
|
|
98
|
+
except ValueError:
|
|
99
|
+
# Different drives on Windows or other path issues
|
|
100
|
+
logger.warning(f"Path validation warning for: {file_path}")
|
|
101
|
+
|
|
102
|
+
return abs_path
|
|
103
|
+
|
|
104
|
+
def _redact_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
105
|
+
"""
|
|
106
|
+
Redact sensitive PII columns in a DataFrame.
|
|
107
|
+
|
|
108
|
+
Args:
|
|
109
|
+
df (pd.DataFrame): Original DataFrame containing potentially sensitive data.
|
|
110
|
+
|
|
111
|
+
Returns:
|
|
112
|
+
pd.DataFrame: DataFrame with PII columns redacted.
|
|
113
|
+
"""
|
|
114
|
+
# Create a copy to avoid modifying original data
|
|
115
|
+
redacted_df = df.copy()
|
|
116
|
+
|
|
117
|
+
# Define PII keywords to identify sensitive columns
|
|
118
|
+
pii_keywords = ["address", "iban", "account", "name", "bic"]
|
|
119
|
+
|
|
120
|
+
# Check each column for PII keywords (case-insensitive)
|
|
121
|
+
for column in redacted_df.columns:
|
|
122
|
+
column_lower = column.lower()
|
|
123
|
+
for keyword in pii_keywords:
|
|
124
|
+
if keyword in column_lower:
|
|
125
|
+
redacted_df[column] = "***REDACTED***"
|
|
126
|
+
break
|
|
127
|
+
|
|
128
|
+
return redacted_df
|
|
129
|
+
|
|
130
|
+
def setup_arg_parser(self) -> argparse.ArgumentParser:
    """
    Build the argument parser that defines the CLI options.

    Returns:
        argparse.ArgumentParser: Parser accepting --type, --input,
        --output, --max-size, --verbose/-v, --show-pii and --streaming.
    """
    # Each entry is (option strings, keyword arguments for add_argument);
    # the order here is the order shown in the generated help text.
    option_specs = (
        (
            ("--type",),
            {
                "type": str,
                "required": True,
                "choices": ["camt", "pain001"],
                "help": 'Type of the bank statement file: "camt" or "pain001".',
            },
        ),
        (
            ("--input",),
            {
                "type": str,
                "required": True,
                "help": "Path to the bank statement file.",
            },
        ),
        (
            ("--output",),
            {
                "type": str,
                "required": False,
                "help": "Path to save parsed data; if not provided, data is printed.",
            },
        ),
        (
            ("--max-size",),
            {
                "type": int,
                "required": False,
                "default": 100,
                "help": "Maximum file size in MB (default: 100MB).",
            },
        ),
        (
            ("--verbose", "-v"),
            {
                "action": "store_true",
                "help": "Enable verbose debug logging.",
            },
        ),
        (
            ("--show-pii",),
            {
                "action": "store_true",
                "help": "Display unredacted PII data in console output (default: False).",
            },
        ),
        (
            ("--streaming",),
            {
                "action": "store_true",
                "help": "Use streaming XML parsing to keep memory usage under 50MB for large files (default: False).",
            },
        ),
    )

    arg_parser = argparse.ArgumentParser(
        description="Parse bank statement files."
    )
    for option_strings, options in option_specs:
        arg_parser.add_argument(*option_strings, **options)
    return arg_parser
|
|
183
|
+
|
|
184
|
+
def parse_camt(
    self,
    file_path: Path,
    output_path: Optional[Path] = None,
    show_pii: bool = False,
    streaming: bool = False,
) -> None:
    """
    Parse a CAMT format bank statement file and print or save the results.

    In streaming mode the transactions yielded by
    ``CamtParser.parse_streaming`` are either written row by row to a
    temporary ``<output>.tmp`` CSV file that replaces the final file once
    all rows are written, or (without an output path) collected up to a
    limit of 100 rows and printed. In non-streaming mode the result of
    ``CamtParser.get_statement_stats`` is written as CSV or printed.

    Console output is passed through ``_redact_dataframe`` unless
    ``show_pii`` is True.

    Args:
        file_path (Path): Validated path to the CAMT file.
        output_path (Path, optional): Validated path to save the parsed data.
            If None, data is printed to console.
        show_pii (bool): Whether to display unredacted PII data.
        streaming (bool): Whether to use streaming parsing for large files.

    Raises:
        SystemExit: With status 1 when the input file is not found,
            validation fails, or any other error occurs.
    """
    try:
        parser = CamtParser(str(file_path))

        if streaming:
            # Use streaming parsing to process transactions incrementally
            transactions = []
            transaction_count = 0

            if output_path:
                # For output file, use atomic write operation with temp file
                safe_name = self.validator.get_safe_filename(
                    output_path.name
                )
                safe_output_path = str(output_path.parent / safe_name)
                temp_output = f"{safe_output_path}.tmp"

                try:
                    # newline="" is what pandas requires for a text handle
                    # passed to to_csv; without it rows can be separated
                    # by blank lines on platforms that translate newlines.
                    with open(
                        temp_output, "w", encoding="utf-8", newline=""
                    ) as f:
                        header_written = False

                        for transaction_data in parser.parse_streaming(
                            redact_pii=not show_pii
                        ):
                            transaction_count += 1

                            # Convert to DataFrame for consistent formatting
                            tx_df = pd.DataFrame([transaction_data])

                            # Only the first row carries the CSV header
                            tx_df.to_csv(
                                f,
                                index=False,
                                mode="a" if header_written else "w",
                                header=not header_written,
                            )
                            header_written = True

                    # Atomically move temp file to final location
                    os.replace(temp_output, safe_output_path)
                except BaseException:
                    # Do not leave a partial ".tmp" file behind. Removal is
                    # best effort: the original error is the one to report.
                    try:
                        os.remove(temp_output)
                    except OSError:
                        pass
                    raise

                print(
                    f"Parsed {transaction_count} transactions in streaming mode, saved to {safe_output_path}"
                )

            else:
                # For console output, collect a reasonable number of transactions
                max_console_transactions = 100

                for transaction_data in parser.parse_streaming(
                    redact_pii=not show_pii
                ):
                    transactions.append(transaction_data)
                    transaction_count += 1

                    # Limit console output to prevent overwhelming display
                    if transaction_count >= max_console_transactions:
                        break

                data_df = pd.DataFrame(transactions)

                if show_pii:
                    print("WARNING: Displaying unredacted PII data")
                    print(data_df)
                else:
                    redacted_df = self._redact_dataframe(data_df)
                    print(redacted_df)

                if transaction_count >= max_console_transactions:
                    print(
                        f"\n... (showing first {max_console_transactions} transactions in streaming mode)"
                    )

        else:
            # Use traditional parsing
            data = parser.get_statement_stats()

            # A single dict is wrapped so it becomes one DataFrame row
            if isinstance(data, dict):
                data = [data]

            data_df = pd.DataFrame(data)

            if output_path:
                # Use safe filename for output
                safe_name = self.validator.get_safe_filename(
                    output_path.name
                )
                safe_output_path = str(output_path.parent / safe_name)
                data_df.to_csv(safe_output_path, index=False)
                print(f"Parsed data saved to {safe_output_path}")
            else:
                if show_pii:
                    print("WARNING: Displaying unredacted PII data")
                    print(data_df)
                else:
                    redacted_df = self._redact_dataframe(data_df)
                    print(redacted_df)

    except FileNotFoundError as e:
        logger.error(f"File not found: {e}")
        print(f"Error: Input file not found - {str(e)}")
        sys.exit(1)
    except ValidationError as e:
        logger.error(f"Validation error: {e}")
        print(f"Error: Invalid input - {str(e)}")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Unexpected error during CAMT parsing: {e}")
        print(f"Error: Failed to parse CAMT file - {str(e)}")
        sys.exit(1)
|
|
320
|
+
|
|
321
|
+
def parse_pain(
    self,
    file_path: Path,
    output_path: Optional[Path] = None,
    show_pii: bool = False,
    streaming: bool = False,
) -> None:
    """
    Parse a PAIN.001 format bank statement file and print or save the
    results.

    In streaming mode the payments yielded by
    ``Pain001Parser.parse_streaming`` are either written row by row to a
    temporary ``<output>.tmp`` CSV file that replaces the final file once
    all rows are written, or (without an output path) collected up to a
    limit of 100 rows and printed. In non-streaming mode the result of
    ``Pain001Parser.parse`` is written as CSV or printed.

    Console output is passed through ``_redact_dataframe`` unless
    ``show_pii`` is True.

    Args:
        file_path (Path): Validated path to the PAIN.001 file.
        output_path (Path, optional): Validated path to save the parsed data.
            If None, data is printed to console.
        show_pii (bool): Whether to display unredacted PII data.
        streaming (bool): Whether to use streaming parsing for large files.

    Raises:
        SystemExit: With status 1 when the input file is not found,
            validation fails, or any other error occurs.
    """
    try:
        # Instantiate the PAIN.001 parser
        parser = Pain001Parser(str(file_path))

        if streaming:
            # Use streaming parsing to process payments incrementally
            payments = []
            payment_count = 0

            if output_path:
                # For output file, use atomic write operation with temp file
                safe_name = self.validator.get_safe_filename(
                    output_path.name
                )
                safe_output_path = str(output_path.parent / safe_name)
                temp_output = f"{safe_output_path}.tmp"

                try:
                    # newline="" is what pandas requires for a text handle
                    # passed to to_csv; without it rows can be separated
                    # by blank lines on platforms that translate newlines.
                    with open(
                        temp_output, "w", encoding="utf-8", newline=""
                    ) as f:
                        header_written = False

                        for payment_data in parser.parse_streaming(
                            redact_pii=not show_pii
                        ):
                            payment_count += 1

                            # Convert to DataFrame for consistent formatting
                            payment_df = pd.DataFrame([payment_data])

                            # Only the first row carries the CSV header
                            payment_df.to_csv(
                                f,
                                index=False,
                                mode="a" if header_written else "w",
                                header=not header_written,
                            )
                            header_written = True

                    # Atomically move temp file to final location
                    os.replace(temp_output, safe_output_path)
                except BaseException:
                    # Do not leave a partial ".tmp" file behind. Removal is
                    # best effort: the original error is the one to report.
                    try:
                        os.remove(temp_output)
                    except OSError:
                        pass
                    raise

                print(
                    f"Parsed {payment_count} payments in streaming mode, saved to {safe_output_path}"
                )

            else:
                # For console output, collect a reasonable number of payments
                max_console_payments = 100

                for payment_data in parser.parse_streaming(
                    redact_pii=not show_pii
                ):
                    payments.append(payment_data)
                    payment_count += 1

                    # Limit console output to prevent overwhelming display
                    if payment_count >= max_console_payments:
                        break

                data_df = pd.DataFrame(payments)

                if show_pii:
                    print("WARNING: Displaying unredacted PII data")
                    print(data_df)
                else:
                    redacted_df = self._redact_dataframe(data_df)
                    print(redacted_df)

                if payment_count >= max_console_payments:
                    print(
                        f"\n... (showing first {max_console_payments} payments in streaming mode)"
                    )

        else:
            # Use traditional parsing
            parsed_data = parser.parse()
            data_df = pd.DataFrame(parsed_data)

            if output_path:
                # Use safe filename for output
                safe_name = self.validator.get_safe_filename(
                    output_path.name
                )
                safe_output_path = str(output_path.parent / safe_name)
                data_df.to_csv(safe_output_path, index=False)
                print(f"Parsed data saved to {safe_output_path}")
            else:
                if show_pii:
                    print("WARNING: Displaying unredacted PII data")
                    print(data_df)
                else:
                    redacted_df = self._redact_dataframe(data_df)
                    print(redacted_df)

    except FileNotFoundError as e:
        logger.error(f"File not found: {e}")
        print(f"Error: Input file not found - {str(e)}")
        sys.exit(1)
    except ValidationError as e:
        logger.error(f"Validation error: {e}")
        print(f"Error: Invalid input - {str(e)}")
        sys.exit(1)
    except Exception as e:
        logger.error(
            f"Unexpected error during PAIN.001 parsing: {e}"
        )
        print(f"Error: Failed to parse PAIN.001 file - {str(e)}")
        sys.exit(1)
|
|
456
|
+
|
|
457
|
+
def run(self) -> None:
    """
    Parse command line arguments and perform the requested action.

    Validates input/output paths, configures logging, and delegates
    to the appropriate parser (CAMT or PAIN.001) based on the --type
    argument. Supports both traditional and streaming parsing modes.

    Raises:
        SystemExit: With status 1 on invalid arguments, validation failure,
            or parse errors; with status 0 when argparse exits
            successfully (for example after printing --help).
    """
    if len(sys.argv) == 1:
        self.parser.print_help(sys.stderr)
        sys.exit(1)

    try:
        args = self.parser.parse_args()
    except SystemExit as exc:
        # argparse also exits, with status 0, after printing --help. That
        # is a successful run rather than an argument error, so let it
        # propagate instead of reporting missing arguments.
        if exc.code == 0:
            raise
        # argparse failed, which means required arguments are missing
        print("Error: Missing required arguments")
        sys.exit(1)
        return  # pragma: no cover

    # Check if required arguments are present (safety check)
    if (
        not hasattr(args, "input")
        or args.input is None
        or not hasattr(args, "type")
        or args.type is None
    ):
        print("Error: Missing required arguments")
        sys.exit(1)
        return  # Defensive programming: ensure we don't continue if sys.exit is mocked

    # Set up logging based on verbosity
    log_level = logging.DEBUG if args.verbose else logging.INFO
    setup_logging(log_level)

    # Update validator max file size setting
    max_size_bytes = (
        args.max_size * 1024 * 1024
    )  # Convert MB to bytes
    self.validator.max_file_size = max_size_bytes

    # Validate input file
    try:
        # First sanitize the path for security
        sanitized_input = self._sanitize_file_path(args.input)
        validated_input_path = (
            self.validator.validate_input_file_path(sanitized_input)
        )
        logger.info(f"Input file validated: {validated_input_path}")
    except (ValidationError, FileNotFoundError) as e:
        logger.error(f"Input validation failed: {e}")
        print(f"Error: {str(e)}")
        sys.exit(1)
        return  # Defensive programming: ensure we don't continue if sys.exit is mocked

    # Validate output file if provided
    validated_output_path = None
    if args.output:
        try:
            # First sanitize the path for security
            sanitized_output = self._sanitize_file_path(args.output)
            validated_output_path = (
                self.validator.validate_output_file_path(
                    sanitized_output
                )
            )
            logger.info(
                f"Output file validated: {validated_output_path}"
            )
        except ValidationError as e:
            logger.error(f"Output validation failed: {e}")
            print(f"Error: {str(e)}")
            sys.exit(1)
            # Same guard as the other failure branches: never fall through
            # to parsing with an unvalidated output path.
            return  # Defensive programming: ensure we don't continue if sys.exit is mocked

    # Parse based on type. The streaming flag is only passed when it is
    # set; the parse methods default it to False.
    try:
        if args.type == "camt":
            if args.streaming:
                self.parse_camt(
                    validated_input_path,
                    validated_output_path,
                    args.show_pii,
                    args.streaming,
                )
            else:
                self.parse_camt(
                    validated_input_path,
                    validated_output_path,
                    args.show_pii,
                )
        elif args.type == "pain001":
            if args.streaming:
                self.parse_pain(
                    validated_input_path,
                    validated_output_path,
                    args.show_pii,
                    args.streaming,
                )
            else:
                self.parse_pain(
                    validated_input_path,
                    validated_output_path,
                    args.show_pii,
                )
        else:
            print("Error: The specified type is not supported.")
            sys.exit(1)
    except Exception as e:
        logger.error(f"Parsing failed: {e}")
        print(f"Error: Parsing failed - {str(e)}")
        sys.exit(1)
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
if __name__ == "__main__":  # pragma: no cover
    # Script entry point: build the CLI and run it on the process arguments.
    BankStatementCLI().run()
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Copyright (C) 2023 Sebastien Rousseau.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
12
|
+
# implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
"""Repository-wide exception hierarchy."""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from defusedxml.ElementTree import ParseError
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class BankStatementParserError(Exception):
    """Base exception for parser-specific failures.

    ``ExportError`` and ``ParserError`` in this module derive from it, so
    catching this class also catches them and their subclasses.
    """
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ExportError(OSError, BankStatementParserError):
    """Raised when an export operation cannot complete safely.

    Inherits from both ``OSError`` and ``BankStatementParserError``, so it
    is caught by handlers written for either of them.
    """
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ParserError(BankStatementParserError):
    """Raised when parser logic cannot produce a valid result.

    Subclass of ``BankStatementParserError``.
    """
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class Pain001ParseError(ParseError, ParserError):  # type: ignore[misc]
    """Raised when PAIN.001 parsing fails after XML loading succeeds.

    Derives from both the ``ParseError`` imported from
    ``defusedxml.ElementTree`` and this module's ``ParserError``, so it is
    caught by handlers written for either of them.
    """
|