bankstatementparser 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bankstatementparser/__init__.py +82 -0
- bankstatementparser/additional_parsers.py +376 -0
- bankstatementparser/bank_statement_parsers.py +370 -0
- bankstatementparser/base_parser.py +205 -0
- bankstatementparser/camt_parser.py +971 -0
- bankstatementparser/cli.py +575 -0
- bankstatementparser/exceptions.py +36 -0
- bankstatementparser/input_validator.py +628 -0
- bankstatementparser/pain001_parser.py +742 -0
- bankstatementparser/parallel.py +127 -0
- bankstatementparser/record_types.py +94 -0
- bankstatementparser/transaction_deduplicator.py +402 -0
- bankstatementparser/transaction_models.py +196 -0
- bankstatementparser/zip_security.py +141 -0
- bankstatementparser-0.0.4.dist-info/METADATA +363 -0
- bankstatementparser-0.0.4.dist-info/RECORD +18 -0
- bankstatementparser-0.0.4.dist-info/WHEEL +4 -0
- bankstatementparser-0.0.4.dist-info/licenses/LICENSE +203 -0
|
@@ -0,0 +1,628 @@
|
|
|
1
|
+
# Copyright (C) 2023 Sebastien Rousseau.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
12
|
+
# implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
"""
|
|
17
|
+
input_validator.py
|
|
18
|
+
|
|
19
|
+
Provides comprehensive input validation for file paths, sizes, and formats
|
|
20
|
+
used throughout the bank statement parser.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import logging
|
|
24
|
+
import mimetypes
|
|
25
|
+
import os
|
|
26
|
+
import re
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import Optional, Union
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ValidationError(Exception):
    """Signal that an input failed one of the validator's checks."""
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class InputValidator:
    """Comprehensive input validator for file operations.

    Validates input/output file paths (dangerous patterns, blocked system
    directories, symlinks, extensions, size) and performs a lightweight
    sniff of file or in-memory content before it is handed to a parser.
    """

    # Default configuration
    MAX_FILE_SIZE_BYTES = 100 * 1024 * 1024  # 100MB default
    MIN_FILE_SIZE_BYTES = 1  # 1 byte minimum

    # Allowed file extensions for input files
    ALLOWED_INPUT_EXTENSIONS = {
        ".xml",
        ".XML",
        ".csv",
        ".CSV",
        ".ofx",
        ".OFX",
        ".qfx",
        ".QFX",
        ".mt940",
        ".MT940",
        ".sta",
        ".STA",
    }

    # Allowed file extensions for output files
    ALLOWED_OUTPUT_EXTENSIONS = {
        ".csv",
        ".CSV",
        ".xlsx",
        ".XLSX",
        ".xls",
        ".XLS",
    }

    # Dangerous path patterns to block
    DANGEROUS_PATTERNS = [
        r"\.\.",  # Directory traversal (catches ../.. and ..\.. patterns)
        r"/\./",  # Hidden directory traversal
        r"~/",  # Home directory shortcuts (can be allowed if needed)
        r"\$\{",  # Variable expansion
        r"%[A-Z_]+%",  # Windows environment variables
    ]

    # System directories to block (platform-specific)
    BLOCKED_DIRECTORIES = {
        # Unix/Linux/macOS
        "/etc",
        "/bin",
        "/sbin",
        "/usr/bin",
        "/usr/sbin",
        "/sys",
        "/proc",
        "/dev",
        "/boot",
        "/root",
        # Windows
        "C:\\Windows",
        "C:\\Program Files",
        "C:\\Program Files (x86)",
        "C:\\System32",
        "C:\\Windows\\System32",
        # macOS specific
        "/System",
        "/Library/System",
        "/private/var/db",
    }

    # Unicode bidirectional control characters that can disguise text in
    # paths and log output.
    _BIDI_CONTROL_CHARS = frozenset(
        {
            "\u202e",  # Right-to-left override
            "\u202d",  # Left-to-right override
            "\u200f",  # Right-to-left mark
            "\u200e",  # Left-to-right mark
            "\u2066",  # Left-to-right isolate
            "\u2067",  # Right-to-left isolate
            "\u2068",  # First strong isolate
            "\u2069",  # Pop directional isolate
            "\u202a",  # Left-to-right embedding
            "\u202b",  # Right-to-left embedding
            "\u202c",  # Pop directional formatting
        }
    )

    # Magic bytes of well-known binary formats that are never valid input.
    _BINARY_SIGNATURES = (
        b"\x89PNG",  # PNG
        b"GIF8",  # GIF
        b"\xff\xd8\xff",  # JPEG
        b"PK",  # ZIP/XLSX/DOCX
        b"\x7fELF",  # ELF executable
        b"MZ",  # Windows executable
        b"\x00\x00\x01\x00",  # ICO
        b"%PDF",  # PDF
    )

    # Lower-case substrings whose presence in the header suggests XML.
    # Shared by the file-backed and in-memory checks so they cannot drift.
    _XML_INDICATORS = (
        "<?xml",
        "<document",
        "xmlns",
        "camt.",
        "pain.001",
        "iso:std:iso:20022",
    )

    # Number of leading bytes inspected when sniffing content.
    _HEADER_PROBE_BYTES = 1024

    def __init__(self, max_file_size: Optional[int] = None):
        """
        Initialize the validator with optional custom configuration.

        Args:
            max_file_size: Maximum allowed file size in bytes. A falsy
                value (None or 0) selects MAX_FILE_SIZE_BYTES.
        """
        self.max_file_size = max_file_size or self.MAX_FILE_SIZE_BYTES

    def validate_input_file_path(self, file_path: str) -> Path:
        """
        Validate and sanitize an input file path.

        Args:
            file_path: Raw file path string to validate.

        Returns:
            Path: Validated and resolved path object.

        Raises:
            ValidationError: If validation fails.
            FileNotFoundError: If file doesn't exist.
        """
        if not isinstance(file_path, str):
            raise ValidationError("File path must be a non-empty string")

        if not file_path:
            raise ValidationError("File path cannot be empty")

        # Remove leading/trailing whitespace
        file_path = file_path.strip()

        if not file_path:
            raise ValidationError(
                "File path cannot be empty or whitespace only"
            )

        # Check for dangerous patterns FIRST - before checking file existence.
        # This ensures security validation happens regardless of existence.
        self._check_dangerous_patterns(file_path)

        # Convert to Path object and resolve
        try:
            path = Path(file_path).resolve()
        except (OSError, ValueError) as e:
            raise ValidationError(f"Invalid file path format: {e}") from e

        # Check for symlink attacks: reject if the original path is a symlink
        # pointing outside its parent directory
        raw_path = Path(file_path)
        if raw_path.is_symlink():
            link_target = raw_path.resolve()
            link_parent = raw_path.parent.resolve()
            try:
                link_target.relative_to(link_parent)
            except ValueError as exc:
                raise ValidationError(
                    f"Symlink target is outside the parent directory: {file_path}"
                ) from exc

        # Additional security check on resolved path
        self._check_dangerous_patterns(str(path))

        # Check if file exists (use os.path for robustness)
        if not os.path.exists(str(path)):
            raise FileNotFoundError(f"Input file not found: {path}")

        # Check if it's actually a file
        if not os.path.isfile(str(path)):
            raise ValidationError(f"Path exists but is not a file: {path}")

        # Check if we can read the file
        if not os.access(path, os.R_OK):
            raise ValidationError(f"File is not readable: {path}")

        self._validate_input_extension(path)
        self._validate_file_size(path)
        self._validate_input_format(path)

        return path

    def validate_output_file_path(self, file_path: str) -> Path:
        """
        Validate and sanitize an output file path.

        Note that a missing parent directory is created as part of
        validation.

        Args:
            file_path: Raw output file path string to validate.

        Returns:
            Path: Validated path object.

        Raises:
            ValidationError: If validation fails.
        """
        if not isinstance(file_path, str):
            raise ValidationError(
                "Output file path must be a non-empty string"
            )

        # Remove leading/trailing whitespace
        file_path = file_path.strip()

        if not file_path:
            raise ValidationError(
                "Output file path cannot be empty or whitespace only"
            )

        # Check for dangerous patterns
        self._check_dangerous_patterns(file_path)

        # Convert to Path object and resolve
        try:
            path = Path(file_path).resolve()
        except (OSError, ValueError) as e:
            raise ValidationError(
                f"Invalid output file path format: {e}"
            ) from e

        # Check parent directory exists and is writable
        parent_dir = path.parent
        if not parent_dir.exists():
            try:
                parent_dir.mkdir(parents=True, exist_ok=True)
            except OSError as e:
                raise ValidationError(
                    f"Cannot create output directory: {e}"
                ) from e

        if not os.access(parent_dir, os.W_OK):
            raise ValidationError(
                f"Output directory is not writable: {parent_dir}"
            )

        # Validate output file extension
        self._validate_output_extension(path)

        # Check if file already exists and warn
        if path.exists():
            logger.warning(
                "Output file already exists and will be overwritten: %s",
                path,
            )

        return path

    def sanitize_source_name(
        self, source_name: Optional[str], default: str = "<memory>"
    ) -> str:
        """
        Sanitize a caller-supplied source name used only for diagnostics.

        Control characters (code points below 32) and bidirectional
        formatting characters are replaced with "?", and the result is
        capped at 255 characters.

        Args:
            source_name: Optional caller-provided source identifier.
            default: Fallback label when source_name is empty.

        Returns:
            str: A bounded, log-safe source label.

        Raises:
            ValidationError: If source_name is not a string.
        """
        if source_name is None:
            return default

        if not isinstance(source_name, str):
            raise ValidationError("Source name must be a string")

        sanitized = "".join(
            "?" if ord(char) < 32 or char in self._BIDI_CONTROL_CHARS else char
            for char in source_name.strip()
        )[:255]
        return sanitized or default

    def validate_xml_content(
        self,
        xml_content: Union[str, bytes],
        *,
        source_name: Optional[str] = None,
    ) -> tuple[bytes, str]:
        """
        Validate in-memory XML content.

        Args:
            xml_content: XML payload as text or bytes.
            source_name: Optional source label used for diagnostics only.

        Returns:
            tuple[bytes, str]: UTF-8 XML bytes and sanitized source name.

        Raises:
            ValidationError: If content is unsafe, empty, oversized, or not XML.
        """
        safe_source_name = self.sanitize_source_name(source_name)

        if not isinstance(xml_content, (str, bytes)):
            raise ValidationError(
                "XML content must be provided as a string or bytes"
            )

        if not xml_content.strip():
            raise ValidationError("XML content cannot be empty")

        if isinstance(xml_content, str):
            xml_bytes = xml_content.encode("utf-8")
        else:
            xml_bytes = xml_content

        self._validate_bytes_size(xml_bytes)
        self._validate_xml_bytes_format(xml_bytes, safe_source_name)

        return xml_bytes, safe_source_name

    @staticmethod
    def _is_inside_directory(path: str, directory: str) -> bool:
        """Return True if path equals directory or lies beneath it.

        A plain prefix test would treat "/development" as being inside
        "/dev"; the match must end on a path-component boundary.
        """
        if not path.startswith(directory):
            return False
        if directory.endswith(("/", "\\")):
            return True
        remainder = path[len(directory):]
        return not remainder or remainder[0] in "/\\"

    def _check_dangerous_patterns(self, file_path: str) -> None:
        """Check for dangerous patterns in file path.

        Raises:
            ValidationError: If the path contains a null byte, a BiDi
                control character, a pattern from DANGEROUS_PATTERNS, or
                points into one of BLOCKED_DIRECTORIES.
        """
        # Null bytes and BiDi overrides are rejected before any OS call.
        if "\u0000" in file_path or any(
            char in self._BIDI_CONTROL_CHARS for char in file_path
        ):
            raise ValidationError(
                "Potentially dangerous path pattern detected"
            )

        for pattern in self.DANGEROUS_PATTERNS:
            if re.search(pattern, file_path, re.IGNORECASE):
                raise ValidationError(
                    "Potentially dangerous path pattern detected"
                )

        # Check for blocked directories (case-insensitive comparison).
        # The original string is checked too, to catch Windows paths on
        # Unix systems.
        candidates = (
            os.path.abspath(file_path).lower(),
            file_path.lower(),
        )
        for blocked_dir in self.BLOCKED_DIRECTORIES:
            blocked_dir_lower = blocked_dir.lower()
            if any(
                self._is_inside_directory(candidate, blocked_dir_lower)
                for candidate in candidates
            ):
                raise ValidationError(
                    "Access to system directory blocked: file not found or not accessible"
                )

    def _validate_input_extension(self, path: Path) -> None:
        """Validate input file extension (case-insensitive)."""
        permitted = {ext.lower() for ext in self.ALLOWED_INPUT_EXTENSIONS}
        if path.suffix.lower() not in permitted:
            allowed = ", ".join(sorted(self.ALLOWED_INPUT_EXTENSIONS))
            raise ValidationError(
                f"Invalid input file extension '{path.suffix}'. "
                f"Allowed extensions: {allowed}"
            )

    def _validate_output_extension(self, path: Path) -> None:
        """Validate output file extension (case-insensitive)."""
        permitted = {ext.lower() for ext in self.ALLOWED_OUTPUT_EXTENSIONS}
        if path.suffix.lower() not in permitted:
            allowed = ", ".join(sorted(self.ALLOWED_OUTPUT_EXTENSIONS))
            raise ValidationError(
                f"Invalid output file extension '{path.suffix}'. "
                f"Allowed extensions: {allowed}"
            )

    def _validate_file_size(self, path: Path) -> None:
        """Validate file size constraints.

        Raises:
            ValidationError: If the size cannot be read, is below
                MIN_FILE_SIZE_BYTES, or exceeds max_file_size.
        """
        try:
            file_size = path.stat().st_size
        except OSError as e:
            raise ValidationError(
                f"Cannot determine file size: {e}"
            ) from e

        if file_size < self.MIN_FILE_SIZE_BYTES:
            raise ValidationError(
                f"File is too small ({file_size} bytes). Minimum: {self.MIN_FILE_SIZE_BYTES} bytes"
            )

        if file_size > self.max_file_size:
            size_mb = file_size / (1024 * 1024)
            max_mb = self.max_file_size / (1024 * 1024)
            raise ValidationError(
                f"File is too large ({size_mb:.1f}MB). Maximum allowed: {max_mb:.1f}MB"
            )

    def _validate_bytes_size(self, data: bytes) -> None:
        """Validate in-memory payload size constraints.

        Raises:
            ValidationError: If the payload is below MIN_FILE_SIZE_BYTES
                or exceeds max_file_size.
        """
        payload_size = len(data)

        if payload_size < self.MIN_FILE_SIZE_BYTES:
            raise ValidationError(
                f"XML content is too small ({payload_size} bytes). Minimum: {self.MIN_FILE_SIZE_BYTES} bytes"
            )

        if payload_size > self.max_file_size:
            size_mb = payload_size / (1024 * 1024)
            max_mb = self.max_file_size / (1024 * 1024)
            raise ValidationError(
                f"XML content is too large ({size_mb:.1f}MB). Maximum allowed: {max_mb:.1f}MB"
            )

    @staticmethod
    def _decode_header(header: bytes, truncated: bool) -> str:
        """Decode a leading slice of a payload as UTF-8.

        Args:
            header: Leading bytes of the payload.
            truncated: True when more bytes follow header in the payload.

        Returns:
            str: The decoded text. When truncated is True, an incomplete
            multi-byte sequence at the very end is left out.

        Raises:
            UnicodeDecodeError: If header contains invalid UTF-8.
        """
        # Local import keeps this block self-contained.
        import codecs

        # A fixed-size slice may end in the middle of a multi-byte
        # character. final=False tolerates only that trailing incomplete
        # sequence; genuinely invalid bytes still raise.
        decoder = codecs.getincrementaldecoder("utf-8")()
        return decoder.decode(header, final=not truncated)

    @staticmethod
    def _has_control_bytes(header: bytes) -> bool:
        """Return True if the first 100 bytes hold non-whitespace control bytes."""
        return any(
            byte < 32 and byte not in (9, 10, 13) for byte in header[:100]
        )

    def _validate_input_format(self, path: Path) -> None:
        """
        Validate input file format by checking file content.

        Inspects the first 1KB: rejects known binary signatures and
        invalid UTF-8 for every input type, and for ".xml" files also
        looks for XML indicators.

        Args:
            path: File path to validate.

        Raises:
            ValidationError: If format validation fails.
        """
        probe_size = self._HEADER_PROBE_BYTES
        try:
            # Check MIME type (advisory only)
            mime_type, _ = mimetypes.guess_type(str(path))
            if mime_type and not any(
                kind in mime_type for kind in ("xml", "text")
            ):
                logger.warning(
                    "Unexpected MIME type '%s' for file: %s",
                    mime_type,
                    path,
                )

            # Read one byte past the probe window so we know whether the
            # header is a truncated view of a longer file.
            with open(path, "rb") as handle:
                probe = handle.read(probe_size + 1)
        except OSError as e:
            raise ValidationError(
                f"Cannot read file for format validation: {e}"
            ) from e

        header = probe[:probe_size]
        truncated = len(probe) > probe_size

        # Check for known binary file signatures (magic bytes)
        if header.startswith(self._BINARY_SIGNATURES):
            raise ValidationError(
                f"File appears to contain binary data, expected XML: {path}"
            )

        # Validate UTF-8 encoding
        try:
            header_text = self._decode_header(header, truncated)
        except UnicodeDecodeError as exc:
            raise ValidationError(
                f"File encoding is not valid UTF-8: {path}"
            ) from exc

        # Only XML files get the XML-specific sniffing below.
        if path.suffix.lower() != ".xml":
            return

        header_lower = header_text.lower()
        if any(
            indicator in header_lower for indicator in self._XML_INDICATORS
        ):
            return

        # No XML marker: control characters mean binary, otherwise warn.
        if self._has_control_bytes(header):
            raise ValidationError(
                f"File appears to contain binary data, expected XML: {path}"
            )

        logger.warning("File may not be a valid XML document: %s", path)

    def _validate_xml_bytes_format(
        self, xml_bytes: bytes, source_name: str
    ) -> None:
        """
        Validate XML bytes using the same checks applied to file-backed input.

        Args:
            xml_bytes: Raw XML bytes.
            source_name: Sanitized source label for diagnostics.

        Raises:
            ValidationError: If content is not plausible UTF-8 XML.
        """
        probe_size = self._HEADER_PROBE_BYTES
        header = xml_bytes[:probe_size]
        truncated = len(xml_bytes) > probe_size

        if header.startswith(self._BINARY_SIGNATURES):
            raise ValidationError(
                f"XML content appears to contain binary data, expected XML: {source_name}"
            )

        try:
            header_text = self._decode_header(header, truncated)
        except UnicodeDecodeError as exc:
            raise ValidationError(
                f"XML content encoding is not valid UTF-8: {source_name}"
            ) from exc

        header_lower = header_text.lower()
        if any(
            indicator in header_lower for indicator in self._XML_INDICATORS
        ):
            return

        if self._has_control_bytes(header):
            raise ValidationError(
                f"XML content appears to contain binary data, expected XML: {source_name}"
            )

        logger.warning(
            "XML content may not be a valid XML document: %s",
            source_name,
        )

    def get_safe_filename(self, filename: str) -> str:
        """
        Generate a safe filename by removing/replacing dangerous characters.

        Args:
            filename: Original filename.

        Returns:
            str: Safe filename, at most 255 characters long.
        """
        max_length = 255

        # Replace characters that are path separators, reserved on
        # Windows, or control characters.
        safe_chars = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", filename)

        # Remove leading/trailing dots and spaces
        safe_chars = safe_chars.strip(". ")

        # Ensure filename is not empty
        if not safe_chars:
            safe_chars = "unnamed_file"

        # Truncate if too long (keeping extension)
        if len(safe_chars) > max_length:
            name, ext = os.path.splitext(safe_chars)
            if len(ext) >= max_length:
                # The extension alone fills the budget; a negative slice
                # bound would otherwise yield an over-long result.
                safe_chars = safe_chars[:max_length]
            else:
                safe_chars = name[: max_length - len(ext)] + ext

        return safe_chars
|