bankstatementparser 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,628 @@
1
+ # Copyright (C) 2023 Sebastien Rousseau.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
12
+ # implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ input_validator.py
18
+
19
+ Provides comprehensive input validation for file paths, sizes, and formats
20
+ used throughout the bank statement parser.
21
+ """
22
+
23
+ import logging
24
+ import mimetypes
25
+ import os
26
+ import re
27
+ from pathlib import Path
28
+ from typing import Optional, Union
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
class ValidationError(Exception):
    """Signal that a path, file, or payload failed input validation."""
37
+
38
+
39
class InputValidator:
    """Validate file paths, file contents, and in-memory XML payloads.

    The validator rejects paths containing traversal/expansion patterns or
    BiDi control characters, paths located under well-known system
    directories, files with unexpected extensions or sizes, and content
    that is recognisably binary or not UTF-8.
    """

    # Default configuration
    MAX_FILE_SIZE_BYTES = 100 * 1024 * 1024  # 100MB default
    MIN_FILE_SIZE_BYTES = 1  # 1 byte minimum

    # Allowed file extensions for input files
    ALLOWED_INPUT_EXTENSIONS = {
        ".xml",
        ".XML",
        ".csv",
        ".CSV",
        ".ofx",
        ".OFX",
        ".qfx",
        ".QFX",
        ".mt940",
        ".MT940",
        ".sta",
        ".STA",
    }

    # Allowed file extensions for output files
    ALLOWED_OUTPUT_EXTENSIONS = {
        ".csv",
        ".CSV",
        ".xlsx",
        ".XLSX",
        ".xls",
        ".XLS",
    }

    # Dangerous path patterns to block
    DANGEROUS_PATTERNS = [
        r"\.\.",  # Directory traversal (catches ../.. and ..\.. patterns)
        r"/\./",  # Hidden directory traversal
        r"~/",  # Home directory shortcuts (can be allowed if needed)
        r"\$\{",  # Variable expansion
        r"%[A-Z_]+%",  # Windows environment variables
    ]

    # System directories to block (platform-specific)
    BLOCKED_DIRECTORIES = {
        # Unix/Linux/macOS
        "/etc",
        "/bin",
        "/sbin",
        "/usr/bin",
        "/usr/sbin",
        "/sys",
        "/proc",
        "/dev",
        "/boot",
        "/root",
        # Windows
        "C:\\Windows",
        "C:\\Program Files",
        "C:\\Program Files (x86)",
        "C:\\System32",
        "C:\\Windows\\System32",
        # macOS specific
        "/System",
        "/Library/System",
        "/private/var/db",
    }

    # Number of leading bytes inspected when sniffing content.
    _HEADER_PROBE_BYTES = 1024

    # Magic numbers of formats that can never be a text statement.
    _BINARY_SIGNATURES = (
        b"\x89PNG",  # PNG
        b"GIF8",  # GIF
        b"\xff\xd8\xff",  # JPEG
        b"PK",  # ZIP/XLSX/DOCX
        b"\x7fELF",  # ELF executable
        b"MZ",  # Windows executable
        b"\x00\x00\x01\x00",  # ICO
        b"%PDF",  # PDF
    )

    # Lower-case substrings whose presence in the header suggests XML.
    # Shared by the file-backed and in-memory checks so both agree.
    _XML_INDICATORS = (
        "<?xml",
        "<document",
        "xmlns",
        "camt.",
        "pain.001",
        "iso:std:iso:20022",
    )

    # Unicode BiDi controls that can visually disguise a path or label.
    _BIDI_CONTROL_CHARS = frozenset(
        {
            "\u202e",  # Right-to-left override
            "\u202d",  # Left-to-right override
            "\u200f",  # Right-to-left mark
            "\u200e",  # Left-to-right mark
            "\u2066",  # Left-to-right isolate
            "\u2067",  # Right-to-left isolate
            "\u2068",  # First strong isolate
            "\u2069",  # Pop directional isolate
            "\u202a",  # Left-to-right embedding
            "\u202b",  # Right-to-left embedding
            "\u202c",  # Pop directional formatting
        }
    )

    def __init__(self, max_file_size: Optional[int] = None):
        """
        Initialize the validator with optional custom configuration.

        Args:
            max_file_size: Maximum allowed file size in bytes. ``None`` (or
                any other falsy value such as ``0``) selects
                ``MAX_FILE_SIZE_BYTES``.
        """
        self.max_file_size = max_file_size or self.MAX_FILE_SIZE_BYTES

    def validate_input_file_path(self, file_path: str) -> Path:
        """
        Validate and sanitize an input file path.

        Args:
            file_path: Raw file path string to validate.

        Returns:
            Path: Validated and resolved path object.

        Raises:
            ValidationError: If validation fails.
            FileNotFoundError: If file doesn't exist.
        """
        if not isinstance(file_path, str):
            raise ValidationError("File path must be a non-empty string")

        if not file_path:
            raise ValidationError("File path cannot be empty")

        file_path = file_path.strip()
        if not file_path:
            raise ValidationError(
                "File path cannot be empty or whitespace only"
            )

        # Security screening runs before any filesystem access so that the
        # outcome does not depend on whether the file exists.
        self._check_dangerous_patterns(file_path)

        try:
            path = Path(file_path).resolve()
        except (OSError, ValueError) as e:
            raise ValidationError(f"Invalid file path format: {e}") from e

        # Symlink guard: a link supplied by the caller must not point
        # outside the directory that contains the link itself.
        raw_path = Path(file_path)
        if raw_path.is_symlink():
            link_target = raw_path.resolve()
            link_parent = raw_path.parent.resolve()
            try:
                link_target.relative_to(link_parent)
            except ValueError as exc:
                raise ValidationError(
                    f"Symlink target is outside the parent directory: {file_path}"
                ) from exc

        # Screen the resolved location as well as the raw string.
        self._check_dangerous_patterns(str(path))

        if not os.path.exists(str(path)):
            raise FileNotFoundError(f"Input file not found: {path}")

        if not os.path.isfile(str(path)):
            raise ValidationError(f"Path exists but is not a file: {path}")

        if not os.access(path, os.R_OK):
            raise ValidationError(f"File is not readable: {path}")

        self._validate_input_extension(path)
        self._validate_file_size(path)
        self._validate_input_format(path)

        return path

    def validate_output_file_path(self, file_path: str) -> Path:
        """
        Validate and sanitize an output file path.

        The extension is validated before the parent directory is created,
        so a rejected path leaves no directories behind.

        Args:
            file_path: Raw output file path string to validate.

        Returns:
            Path: Validated path object.

        Raises:
            ValidationError: If validation fails.
        """
        if not isinstance(file_path, str):
            raise ValidationError(
                "Output file path must be a non-empty string"
            )

        file_path = file_path.strip()
        if not file_path:
            raise ValidationError(
                "Output file path cannot be empty or whitespace only"
            )

        self._check_dangerous_patterns(file_path)

        try:
            path = Path(file_path).resolve()
        except (OSError, ValueError) as e:
            raise ValidationError(
                f"Invalid output file path format: {e}"
            ) from e

        # Pure checks first; filesystem side effects only afterwards.
        self._validate_output_extension(path)

        parent_dir = path.parent
        if not parent_dir.exists():
            try:
                parent_dir.mkdir(parents=True, exist_ok=True)
            except OSError as e:
                raise ValidationError(
                    f"Cannot create output directory: {e}"
                ) from e

        if not os.access(parent_dir, os.W_OK):
            raise ValidationError(
                f"Output directory is not writable: {parent_dir}"
            )

        if path.exists():
            logger.warning(
                "Output file already exists and will be overwritten: %s",
                path,
            )

        return path

    def sanitize_source_name(
        self, source_name: Optional[str], default: str = "<memory>"
    ) -> str:
        """
        Sanitize a caller-supplied source name used only for diagnostics.

        Surrounding whitespace is stripped, ASCII control characters and
        BiDi control characters are replaced with ``?``, and the result is
        truncated to 255 characters.

        Args:
            source_name: Optional caller-provided source identifier.
            default: Fallback label when source_name is ``None`` or empty
                after sanitizing.

        Returns:
            str: A bounded, log-safe source label.

        Raises:
            ValidationError: If source_name is not a string.
        """
        if source_name is None:
            return default

        if not isinstance(source_name, str):
            raise ValidationError("Source name must be a string")

        sanitized = "".join(
            "?" if ord(char) < 32 or char in self._BIDI_CONTROL_CHARS else char
            for char in source_name.strip()
        )[:255]
        return sanitized or default

    def validate_xml_content(
        self,
        xml_content: Union[str, bytes],
        *,
        source_name: Optional[str] = None,
    ) -> tuple[bytes, str]:
        """
        Validate in-memory XML content.

        Args:
            xml_content: XML payload as text or bytes.
            source_name: Optional source label used for diagnostics only.

        Returns:
            tuple[bytes, str]: UTF-8 XML bytes and sanitized source name.

        Raises:
            ValidationError: If content is unsafe, empty, oversized, or not XML.
        """
        safe_source_name = self.sanitize_source_name(source_name)

        if not isinstance(xml_content, (str, bytes)):
            raise ValidationError(
                "XML content must be provided as a string or bytes"
            )

        if not xml_content.strip():
            raise ValidationError("XML content cannot be empty")

        if isinstance(xml_content, str):
            try:
                xml_bytes = xml_content.encode("utf-8")
            except UnicodeEncodeError as exc:
                # Lone surrogates cannot be represented in UTF-8; report
                # them through the documented exception type.
                raise ValidationError(
                    f"XML content encoding is not valid UTF-8: {safe_source_name}"
                ) from exc
        else:
            xml_bytes = xml_content

        self._validate_bytes_size(xml_bytes)
        self._validate_xml_bytes_format(xml_bytes, safe_source_name)

        return xml_bytes, safe_source_name

    @staticmethod
    def _is_under_directory(candidate: str, directory: str) -> bool:
        """
        Return True when *candidate* is *directory* or lies beneath it.

        A plain prefix test would treat ``/etcetera`` as being inside
        ``/etc``; the character following the prefix must therefore be a
        path separator (either flavour) or the end of the string.
        """
        if not candidate.startswith(directory):
            return False
        if directory.endswith(("/", "\\")):
            # Prefix already ends on a component boundary.
            return True
        remainder = candidate[len(directory):]
        return not remainder or remainder[0] in "/\\"

    def _check_dangerous_patterns(self, file_path: str) -> None:
        """
        Reject paths with unsafe characters, patterns, or locations.

        Args:
            file_path: Path string to screen.

        Raises:
            ValidationError: If the path contains a null byte or BiDi
                control character, matches one of ``DANGEROUS_PATTERNS``,
                or is located in one of ``BLOCKED_DIRECTORIES``.
        """
        if "\u0000" in file_path or not self._BIDI_CONTROL_CHARS.isdisjoint(
            file_path
        ):
            raise ValidationError(
                "Potentially dangerous path pattern detected"
            )

        for pattern in self.DANGEROUS_PATTERNS:
            if re.search(pattern, file_path, re.IGNORECASE):
                raise ValidationError(
                    "Potentially dangerous path pattern detected"
                )

        # Compare case-insensitively. The raw string is checked in addition
        # to the absolute form so that Windows-style paths are still caught
        # when running on a Unix system.
        candidates = (
            os.path.abspath(file_path).lower(),
            file_path.lower(),
        )
        for blocked_dir in self.BLOCKED_DIRECTORIES:
            blocked_dir_lower = blocked_dir.lower()
            if any(
                self._is_under_directory(candidate, blocked_dir_lower)
                for candidate in candidates
            ):
                raise ValidationError(
                    "Access to system directory blocked: file not found or not accessible"
                )

    def _validate_input_extension(self, path: Path) -> None:
        """Raise ValidationError unless the suffix is an allowed input type."""
        if path.suffix.lower() not in {
            ext.lower() for ext in self.ALLOWED_INPUT_EXTENSIONS
        }:
            allowed = ", ".join(sorted(self.ALLOWED_INPUT_EXTENSIONS))
            raise ValidationError(
                f"Invalid input file extension '{path.suffix}'. "
                f"Allowed extensions: {allowed}"
            )

    def _validate_output_extension(self, path: Path) -> None:
        """Raise ValidationError unless the suffix is an allowed output type."""
        if path.suffix.lower() not in {
            ext.lower() for ext in self.ALLOWED_OUTPUT_EXTENSIONS
        }:
            allowed = ", ".join(sorted(self.ALLOWED_OUTPUT_EXTENSIONS))
            raise ValidationError(
                f"Invalid output file extension '{path.suffix}'. "
                f"Allowed extensions: {allowed}"
            )

    def _validate_file_size(self, path: Path) -> None:
        """
        Validate file size constraints.

        Raises:
            ValidationError: If the size cannot be read, is below
                ``MIN_FILE_SIZE_BYTES``, or exceeds ``self.max_file_size``.
        """
        try:
            file_size = path.stat().st_size
        except OSError as e:
            raise ValidationError(f"Cannot determine file size: {e}") from e

        if file_size < self.MIN_FILE_SIZE_BYTES:
            raise ValidationError(
                f"File is too small ({file_size} bytes). Minimum: {self.MIN_FILE_SIZE_BYTES} bytes"
            )

        if file_size > self.max_file_size:
            size_mb = file_size / (1024 * 1024)
            max_mb = self.max_file_size / (1024 * 1024)
            raise ValidationError(
                f"File is too large ({size_mb:.1f}MB). Maximum allowed: {max_mb:.1f}MB"
            )

    def _validate_bytes_size(self, data: bytes) -> None:
        """
        Validate in-memory payload size constraints.

        Raises:
            ValidationError: If the payload is smaller than
                ``MIN_FILE_SIZE_BYTES`` or larger than ``self.max_file_size``.
        """
        payload_size = len(data)

        if payload_size < self.MIN_FILE_SIZE_BYTES:
            raise ValidationError(
                f"XML content is too small ({payload_size} bytes). Minimum: {self.MIN_FILE_SIZE_BYTES} bytes"
            )

        if payload_size > self.max_file_size:
            size_mb = payload_size / (1024 * 1024)
            max_mb = self.max_file_size / (1024 * 1024)
            raise ValidationError(
                f"XML content is too large ({size_mb:.1f}MB). Maximum allowed: {max_mb:.1f}MB"
            )

    @staticmethod
    def _decode_utf8_prefix(prefix: bytes, *, complete: bool) -> str:
        """
        Strictly decode the leading bytes of a UTF-8 payload.

        Args:
            prefix: Leading bytes of the payload.
            complete: True when *prefix* is the entire payload. When False,
                a multi-byte character cut off at the end of *prefix* is
                tolerated, because the rest of it lies in the unread data.

        Returns:
            str: The decoded text (without any cut-off trailing character).

        Raises:
            UnicodeDecodeError: If the bytes are not valid UTF-8.
        """
        import codecs

        decoder = codecs.getincrementaldecoder("utf-8")(errors="strict")
        return decoder.decode(prefix, final=complete)

    def _looks_like_xml(self, header_text: str) -> bool:
        """Return True if the header text contains a known XML indicator."""
        lowered = header_text.lower()
        return any(indicator in lowered for indicator in self._XML_INDICATORS)

    @staticmethod
    def _has_binary_control_bytes(header: bytes) -> bool:
        """Return True if the first 100 bytes hold non-whitespace controls."""
        return any(
            byte < 32 and byte not in (9, 10, 13) for byte in header[:100]
        )

    def _validate_input_format(self, path: Path) -> None:
        """
        Validate input file format by checking file content.

        Only the first ``_HEADER_PROBE_BYTES`` bytes are inspected. Files
        whose suffix is not ``.xml`` are only checked for binary signatures
        and UTF-8 validity.

        Args:
            path: File path to validate.

        Raises:
            ValidationError: If format validation fails.
        """
        mime_type, _ = mimetypes.guess_type(str(path))
        if mime_type and not any(
            marker in mime_type for marker in ("xml", "text")
        ):
            logger.warning(
                "Unexpected MIME type '%s' for file: %s", mime_type, path
            )

        try:
            with open(path, "rb") as f:
                header = f.read(self._HEADER_PROBE_BYTES)
                # Knowing whether more data follows tells the decoder if a
                # partial character at the end of the header is an error.
                at_eof = not f.read(1)
        except OSError as e:
            raise ValidationError(
                f"Cannot read file for format validation: {e}"
            ) from e

        if header.startswith(self._BINARY_SIGNATURES):
            raise ValidationError(
                f"File appears to contain binary data, expected XML: {path}"
            )

        try:
            header_str = self._decode_utf8_prefix(header, complete=at_eof)
        except UnicodeDecodeError as exc:
            raise ValidationError(
                f"File encoding is not valid UTF-8: {path}"
            ) from exc

        if path.suffix.lower() != ".xml":
            return

        if self._looks_like_xml(header_str):
            return

        if self._has_binary_control_bytes(header):
            raise ValidationError(
                f"File appears to contain binary data, expected XML: {path}"
            )

        logger.warning("File may not be a valid XML document: %s", path)

    def _validate_xml_bytes_format(
        self, xml_bytes: bytes, source_name: str
    ) -> None:
        """
        Validate XML bytes using the same checks applied to file-backed input.

        Args:
            xml_bytes: Raw XML bytes.
            source_name: Sanitized source label for diagnostics.

        Raises:
            ValidationError: If content is not plausible UTF-8 XML.
        """
        header = xml_bytes[: self._HEADER_PROBE_BYTES]

        if header.startswith(self._BINARY_SIGNATURES):
            raise ValidationError(
                f"XML content appears to contain binary data, expected XML: {source_name}"
            )

        try:
            header_str = self._decode_utf8_prefix(
                header,
                complete=len(xml_bytes) <= self._HEADER_PROBE_BYTES,
            )
        except UnicodeDecodeError as exc:
            raise ValidationError(
                f"XML content encoding is not valid UTF-8: {source_name}"
            ) from exc

        if self._looks_like_xml(header_str):
            return

        if self._has_binary_control_bytes(header):
            raise ValidationError(
                f"XML content appears to contain binary data, expected XML: {source_name}"
            )

        logger.warning(
            "XML content may not be a valid XML document: %s",
            source_name,
        )

    def get_safe_filename(self, filename: str) -> str:
        """
        Generate a safe filename by removing/replacing dangerous characters.

        Reserved punctuation and ASCII control characters become ``_``,
        leading/trailing dots and spaces are removed, an empty result
        becomes ``unnamed_file``, and the name is limited to 255 characters
        (keeping the extension when it fits).

        Args:
            filename: Original filename.

        Returns:
            str: Safe filename.
        """
        safe_chars = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", filename)
        safe_chars = safe_chars.strip(". ")

        if not safe_chars:
            safe_chars = "unnamed_file"

        if len(safe_chars) > 255:
            name, ext = os.path.splitext(safe_chars)
            if len(ext) > 255:
                # The extension alone is over the limit; preserving it is
                # impossible, so fall back to plain truncation.
                safe_chars = safe_chars[:255]
            else:
                safe_chars = name[: 255 - len(ext)] + ext

        return safe_chars