splurge-dsv 2025.2.0__py3-none-any.whl → 2025.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- splurge_dsv/__init__.py +16 -5
- splurge_dsv/cli.py +137 -26
- splurge_dsv/dsv.py +101 -7
- splurge_dsv/dsv_helper.py +417 -43
- splurge_dsv/exceptions.py +22 -1
- splurge_dsv/string_tokenizer.py +7 -1
- {splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/METADATA +78 -5
- splurge_dsv-2025.3.1.dist-info/RECORD +13 -0
- splurge_dsv/path_validator.py +0 -298
- splurge_dsv/safe_text_file_reader.py +0 -177
- splurge_dsv/safe_text_file_writer.py +0 -136
- splurge_dsv/text_file_helper.py +0 -240
- splurge_dsv-2025.2.0.dist-info/RECORD +0 -17
- {splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/WHEEL +0 -0
- {splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/entry_points.txt +0 -0
- {splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/licenses/LICENSE +0 -0
- {splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/top_level.txt +0 -0
splurge_dsv/string_tokenizer.py
CHANGED
@@ -58,8 +58,14 @@ class StringTokenizer:
|
|
58
58
|
if delimiter is None or delimiter == "":
|
59
59
|
raise SplurgeDsvParameterError("delimiter cannot be empty or None")
|
60
60
|
|
61
|
+
# If stripping is enabled and the input is only whitespace (or
|
62
|
+
# empty), treat it as a single empty token rather than returning an
|
63
|
+
# empty list. Returning [] causes downstream code that expects the
|
64
|
+
# same number of columns as the header to raise IndexError. The
|
65
|
+
# external safe reader yields empty strings for blank lines, so we
|
66
|
+
# preserve that semantic here.
|
61
67
|
if strip and not content.strip():
|
62
|
-
return []
|
68
|
+
return [""]
|
63
69
|
|
64
70
|
result: list[str] = content.split(delimiter)
|
65
71
|
if strip:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: splurge-dsv
|
3
|
-
Version: 2025.2.0
|
3
|
+
Version: 2025.3.1
|
4
4
|
Summary: A utility library for working with DSV (Delimited String Values) files
|
5
5
|
Author: Jim Schilling
|
6
6
|
License-Expression: MIT
|
@@ -21,10 +21,11 @@ Classifier: Topic :: Text Processing :: Filters
|
|
21
21
|
Requires-Python: >=3.10
|
22
22
|
Description-Content-Type: text/markdown
|
23
23
|
License-File: LICENSE
|
24
|
+
Requires-Dist: splurge-safe-io>=2025.0.5
|
25
|
+
Requires-Dist: PyYAML>=6.0
|
24
26
|
Provides-Extra: dev
|
25
27
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
26
28
|
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
27
|
-
Requires-Dist: pytest-xdist>=3.0.0; extra == "dev"
|
28
29
|
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
29
30
|
Requires-Dist: ruff>=0.0.241; extra == "dev"
|
30
31
|
Requires-Dist: pytest-mock>=3.0.0; extra == "dev"
|
@@ -38,7 +39,7 @@ Dynamic: license-file
|
|
38
39
|
[License: MIT](https://opensource.org/licenses/MIT)
|
39
40
|
|
40
41
|
[CI quick test](https://github.com/jim-schilling/splurge-dsv/actions/workflows/ci-quick-test.yml)
|
41
|
-
[](https://github.com/jim-schilling/splurge-dsv)
|
42
43
|
[Ruff](https://github.com/astral-sh/ruff)
|
43
44
|
[mypy](https://mypy-lang.org/)
|
44
45
|
|
@@ -46,12 +47,49 @@ A robust Python library for parsing and processing delimited-separated value (DS
|
|
46
47
|
|
47
48
|
## Features
|
48
49
|
|
49
|
-
- **Multi-format DSV Support**: Parse CSV, TSV, pipe-delimited, and custom delimiter files
|
50
|
+
- **Multi-format DSV Support**: Parse CSV, TSV, pipe-delimited, and custom-delimiter-separated value files/objects
|
51
|
+
- **Configurable Parsing**: Flexible options for delimiters, quote characters, escape characters, header/footer row(s) handling
|
50
52
|
- **Memory-Efficient Streaming**: Process large files without loading entire content into memory
|
51
53
|
- **Security & Validation**: Comprehensive path validation and file permission checks
|
52
54
|
- **Unicode Support**: Full Unicode character and encoding support
|
53
55
|
- **Type Safety**: Full type annotations with mypy validation
|
54
|
-
- **
|
56
|
+
- **Deterministic Newline Handling**: Consistent handling of CRLF, CR, and LF newlines across platforms
|
57
|
+
- **CLI Tool**: Command-line interface for quick parsing and inspection of DSV files
|
58
|
+
- **Robust Error Handling**: Clear and specific exceptions for various error scenarios
|
59
|
+
- **Modern API**: Object-oriented API with `Dsv` and `DsvConfig` classes for easy configuration and reuse
|
60
|
+
- **Comprehensive Documentation**: In-depth API reference and usage examples
|
61
|
+
- **Exhaustive Testing**: 283 tests with 93% code coverage including property-based testing, edge case testing, and cross-platform compatibility validation
|
62
|
+
|
63
|
+
**⚠️ CHANGES in v2025.3.1**
|
64
|
+
> - **skip_empty_lines** option added to `DsvConfig`, `DsvHelper`, and CLI.
|
65
|
+
> - This option allows users to skip logical empty lines when parsing DSV files.
|
66
|
+
|
67
|
+
**⚠️ CHANGES in v2025.3.0**
|
68
|
+
> - **Commit-Only Release**: v2025.3.0 is a commit-only release and will not be published to PyPI.
|
69
|
+
> - The legacy `parse_stream()` helpers were removed in release 2025.3.0.
|
70
|
+
> - Use `parse_file_stream()` on `Dsv`/`DsvHelper` for stream-based parsing of files. This standardizes the API naming and clarifies that streaming helpers accept file paths rather than arbitrary iterables.
|
71
|
+
> - TextFileHelper, SafeTextFileReader, SafeTextFileWriter, and PathValidator, as well as all their associated tests have been removed in this release.
|
72
|
+
> - Their functionality has been migrated in favor of the `splurge-safe-io` package, which provides robust and secure file I/O operations.
|
73
|
+
> - This change reduces code duplication and improves maintainability by leveraging the functionality of `splurge-safe-io`.
|
74
|
+
> - Users should refer to the `splurge-safe-io` documentation for details on its usage and features.
|
75
|
+
> - **See API-REFERENCE.md for migration guidance and complete reference documentation, with usage examples.**
|
76
|
+
|
77
|
+
**⚠️ CHANGES in v2025.2.2**
|
78
|
+
> - **Deprecated Warning**: The following modules and their associated classes and functions are deprecated and will be removed in a future release (2025.3.0). Users are encouraged to transition to the `splurge-safe-io` package for these functionalities:
|
79
|
+
> - `splurge_dsv.safe_text_file_reader`
|
80
|
+
> - `splurge_dsv.safe_text_file_writer`
|
81
|
+
> - `splurge_dsv.path_validator`
|
82
|
+
> - `splurge_dsv.text_file_helper`
|
83
|
+
> - **New Exception**: Added `SplurgeDsvFileExistsError` to handle file existence errors.
|
84
|
+
> - **Fixed Exception Mapping**: Many errors were incorrectly mapped to SplurgeDsvEncodingError; this has been corrected to use appropriate exception types.
|
85
|
+
> - Some exceptions were not mapped to any SplurgeDsv* exception; these have also been corrected.
|
86
|
+
> - **3rd-Party Dependency Additions**: Added `splurge-safe-io (v2025.0.4)`.
|
87
|
+
> - `splurge-safe-io` is a new dependency that provides robust and secure file I/O operations, including safe text file reading and writing with deterministic newline handling and path validation.
|
88
|
+
> - This change reduces code duplication and improves maintainability by leveraging the functionality of `splurge-safe-io`.
|
89
|
+
> - Users should refer to the `splurge-safe-io` documentation for details on its usage and features.
|
90
|
+
> - **Code Refactoring**: Refactored `SafeTextFileReader`, `SafeTextFileWriter`, and `PathValidator` to utilize `splurge-safe-io` implementations internally, ensuring consistent behavior and reducing maintenance overhead.
|
91
|
+
> - **This release maintains backward compatibility** for existing users, but users are encouraged to transition to `splurge-safe-io` for future-proofing their codebases.
|
92
|
+
> - **_This release is a commit-only release and will not be published to PyPI._**
|
55
93
|
|
56
94
|
**⚠️ BREAKING CHANGES in v2025.2.0**
|
57
95
|
>
|
@@ -78,6 +116,41 @@ python -m splurge_dsv data.csv --delimiter ,
|
|
78
116
|
python -m splurge_dsv large_file.csv --delimiter , --stream --chunk-size 1000
|
79
117
|
```
|
80
118
|
|
119
|
+
### YAML configuration file
|
120
|
+
|
121
|
+
You can place CLI-equivalent options in a YAML file and pass it to the CLI
|
122
|
+
using `--config` (or `-c`). CLI arguments override values found in the
|
123
|
+
YAML file. Example `config.yaml`:
|
124
|
+
|
125
|
+
```yaml
|
126
|
+
delimiter: ","
|
127
|
+
strip: true
|
128
|
+
bookend: '"'
|
129
|
+
encoding: utf-8
|
130
|
+
skip_header_rows: 1
|
131
|
+
skip_footer_rows: 0
|
132
|
+
skip_empty_lines: false
|
133
|
+
detect_columns: true
|
134
|
+
chunk_size: 500
|
135
|
+
max_detect_chunks: 5
|
136
|
+
raise_on_missing_columns: false
|
137
|
+
raise_on_extra_columns: false
|
138
|
+
```
|
139
|
+
|
140
|
+
Usage with CLI:
|
141
|
+
|
142
|
+
```bash
|
143
|
+
python -m splurge_dsv data.csv --config config.yaml --delimiter "|"
|
144
|
+
# The CLI delimiter '|' overrides the YAML delimiter
|
145
|
+
```
|
146
|
+
|
147
|
+
Example using the shipped example config in the repository:
|
148
|
+
|
149
|
+
```bash
|
150
|
+
# Use the example file provided at examples/config.yaml
|
151
|
+
python -m splurge_dsv data.csv --config examples/config.yaml
|
152
|
+
```
|
153
|
+
|
81
154
|
### API Usage
|
82
155
|
|
83
156
|
```python
|
@@ -0,0 +1,13 @@
|
|
1
|
+
splurge_dsv/__init__.py,sha256=Kbxp7HgplCz8nSISs3JlS9yv2tcm4ksNKOlfZdFF4Xg,3838
|
2
|
+
splurge_dsv/__main__.py,sha256=6dpfX_96hEpOqxv5X4bK73xX86YTgK0Adad1uTWSABM,426
|
3
|
+
splurge_dsv/cli.py,sha256=Ohpr7Pa62hAL3_ga9Pffw-0oWyBva4Wg49uk6wxf3hQ,11859
|
4
|
+
splurge_dsv/dsv.py,sha256=gU6PmY09dJG-UiTui1dnBl6lzMzXRHyTflcUaLCNODw,13849
|
5
|
+
splurge_dsv/dsv_helper.py,sha256=qcj93SfKldNpS27liSLxaBW3wARnUErgY-Gdz8__qRo,29393
|
6
|
+
splurge_dsv/exceptions.py,sha256=X7dq0eNSp_QPyXP12YJHuoLembR8OY48AQC0BPmAPPw,6026
|
7
|
+
splurge_dsv/string_tokenizer.py,sha256=_yShUGbW8cK8X7b9f2INj5C2B1Zzz4j3evI_rx-X58U,4913
|
8
|
+
splurge_dsv-2025.3.1.dist-info/licenses/LICENSE,sha256=fPgtg-tIFHinQvJH0arRfv50AuxikD5eHw6rrPy2A5w,1091
|
9
|
+
splurge_dsv-2025.3.1.dist-info/METADATA,sha256=cQJW9gez32t7daLRZxRK903xPUIj6OMO-gX9vapP5yA,13022
|
10
|
+
splurge_dsv-2025.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
11
|
+
splurge_dsv-2025.3.1.dist-info/entry_points.txt,sha256=QmGyc3qHYtY61uanRxNOXw-waSJ01qypSCI8Kb3zgsU,56
|
12
|
+
splurge_dsv-2025.3.1.dist-info/top_level.txt,sha256=D6Si3FTfpRYqH7kzM7tSQAyaKbbraO6UPLpcqcY4XXM,12
|
13
|
+
splurge_dsv-2025.3.1.dist-info/RECORD,,
|
splurge_dsv/path_validator.py
DELETED
@@ -1,298 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
File path validation utilities for secure file operations.
|
3
|
-
|
4
|
-
This module provides utilities for validating file paths to prevent
|
5
|
-
path traversal attacks and ensure secure file operations.
|
6
|
-
|
7
|
-
Copyright (c) 2025 Jim Schilling
|
8
|
-
|
9
|
-
Please preserve this header and all related material when sharing!
|
10
|
-
|
11
|
-
This module is licensed under the MIT License.
|
12
|
-
"""
|
13
|
-
|
14
|
-
# Standard library imports
|
15
|
-
import os
|
16
|
-
import re
|
17
|
-
from pathlib import Path
|
18
|
-
|
19
|
-
# Local imports
|
20
|
-
from splurge_dsv.exceptions import (
|
21
|
-
SplurgeDsvFileNotFoundError,
|
22
|
-
SplurgeDsvFilePermissionError,
|
23
|
-
SplurgeDsvPathValidationError,
|
24
|
-
)
|
25
|
-
|
26
|
-
# Module-level constants for path validation
|
27
|
-
_MAX_PATH_LENGTH = 4096 # Maximum path length for most filesystems
|
28
|
-
_DEFAULT_FILENAME = "unnamed_file" # Default filename when sanitization results in empty string
|
29
|
-
|
30
|
-
|
31
|
-
class PathValidator:
|
32
|
-
"""
|
33
|
-
Utility class for validating file paths securely.
|
34
|
-
|
35
|
-
This class provides methods to validate file paths and prevent
|
36
|
-
path traversal attacks and other security vulnerabilities.
|
37
|
-
"""
|
38
|
-
|
39
|
-
# Private constants for path validation
|
40
|
-
_PATH_TRAVERSAL_PATTERNS = [
|
41
|
-
r"\.\.", # Directory traversal
|
42
|
-
r"//+", # Multiple forward slashes (including //)
|
43
|
-
r"\\{2,}", # Two or more consecutive backslashes (not normal Windows paths)
|
44
|
-
r"~", # Home directory expansion
|
45
|
-
]
|
46
|
-
|
47
|
-
_DANGEROUS_CHARS = [
|
48
|
-
"<",
|
49
|
-
">",
|
50
|
-
'"',
|
51
|
-
"|",
|
52
|
-
"?",
|
53
|
-
"*", # Windows reserved characters (excluding ':' for drive letters)
|
54
|
-
"\x00",
|
55
|
-
"\x01",
|
56
|
-
"\x02",
|
57
|
-
"\x03",
|
58
|
-
"\x04",
|
59
|
-
"\x05",
|
60
|
-
"\x06",
|
61
|
-
"\x07", # Control characters
|
62
|
-
"\x08",
|
63
|
-
"\x09",
|
64
|
-
"\x0a",
|
65
|
-
"\x0b",
|
66
|
-
"\x0c",
|
67
|
-
"\x0d",
|
68
|
-
"\x0e",
|
69
|
-
"\x0f",
|
70
|
-
"\x10",
|
71
|
-
"\x11",
|
72
|
-
"\x12",
|
73
|
-
"\x13",
|
74
|
-
"\x14",
|
75
|
-
"\x15",
|
76
|
-
"\x16",
|
77
|
-
"\x17",
|
78
|
-
"\x18",
|
79
|
-
"\x19",
|
80
|
-
"\x1a",
|
81
|
-
"\x1b",
|
82
|
-
"\x1c",
|
83
|
-
"\x1d",
|
84
|
-
"\x1e",
|
85
|
-
"\x1f",
|
86
|
-
]
|
87
|
-
|
88
|
-
MAX_PATH_LENGTH = _MAX_PATH_LENGTH
|
89
|
-
|
90
|
-
@classmethod
|
91
|
-
def validate_path(
|
92
|
-
cls,
|
93
|
-
file_path: str | Path,
|
94
|
-
*,
|
95
|
-
must_exist: bool = False,
|
96
|
-
must_be_file: bool = False,
|
97
|
-
must_be_readable: bool = False,
|
98
|
-
allow_relative: bool = True,
|
99
|
-
base_directory: str | Path | None = None,
|
100
|
-
) -> Path:
|
101
|
-
"""Validate a filesystem path for security and correctness.
|
102
|
-
|
103
|
-
This is the central path validation routine used across the package.
|
104
|
-
|
105
|
-
Args:
|
106
|
-
file_path: Path or string to validate.
|
107
|
-
must_exist: If True, require the path to exist.
|
108
|
-
must_be_file: If True, require the path to be a regular file.
|
109
|
-
must_be_readable: If True, check read permission via os.access().
|
110
|
-
allow_relative: If False, disallow relative paths.
|
111
|
-
base_directory: Optional directory to resolve relative paths
|
112
|
-
against and to restrict the resolved path to.
|
113
|
-
|
114
|
-
Returns:
|
115
|
-
pathlib.Path: Resolved and normalized path.
|
116
|
-
|
117
|
-
Raises:
|
118
|
-
SplurgeDsvPathValidationError: If any validation rule fails.
|
119
|
-
SplurgeDsvFileNotFoundError: If must_exist is True and file is missing.
|
120
|
-
SplurgeDsvFilePermissionError: If must_be_readable is True and the
|
121
|
-
file is not readable.
|
122
|
-
"""
|
123
|
-
# Convert to Path object
|
124
|
-
path = Path(file_path) if isinstance(file_path, str) else file_path
|
125
|
-
|
126
|
-
# Get the original string for validation (before Path normalization)
|
127
|
-
path_str = str(file_path) if isinstance(file_path, str) else str(path)
|
128
|
-
|
129
|
-
# Check for dangerous characters
|
130
|
-
cls._check_dangerous_characters(path_str)
|
131
|
-
|
132
|
-
# Check for path traversal patterns
|
133
|
-
cls._check_path_traversal(path_str)
|
134
|
-
|
135
|
-
# Check path length
|
136
|
-
cls._check_path_length(path_str)
|
137
|
-
|
138
|
-
# Handle relative paths
|
139
|
-
if not path.is_absolute() and not allow_relative:
|
140
|
-
raise SplurgeDsvPathValidationError(
|
141
|
-
f"Relative paths are not allowed: {path}", details="Set allow_relative=True to allow relative paths"
|
142
|
-
)
|
143
|
-
|
144
|
-
# Resolve path (handles symlinks and normalizes)
|
145
|
-
try:
|
146
|
-
if base_directory:
|
147
|
-
base_path = Path(base_directory).resolve()
|
148
|
-
if not path.is_absolute():
|
149
|
-
resolved_path = (base_path / path).resolve()
|
150
|
-
else:
|
151
|
-
resolved_path = path.resolve()
|
152
|
-
|
153
|
-
# Ensure resolved path is within base directory
|
154
|
-
try:
|
155
|
-
resolved_path.relative_to(base_path)
|
156
|
-
except ValueError:
|
157
|
-
raise SplurgeDsvPathValidationError(
|
158
|
-
f"Path {path} resolves outside base directory {base_directory}",
|
159
|
-
details="Path traversal detected",
|
160
|
-
) from None
|
161
|
-
else:
|
162
|
-
resolved_path = path.resolve()
|
163
|
-
except (OSError, RuntimeError) as e:
|
164
|
-
raise SplurgeDsvPathValidationError(
|
165
|
-
f"Failed to resolve path {path}: {e}", details="Check if path contains invalid characters or symlinks"
|
166
|
-
) from e
|
167
|
-
|
168
|
-
# Check if file exists
|
169
|
-
if must_exist and not resolved_path.exists():
|
170
|
-
raise SplurgeDsvFileNotFoundError(
|
171
|
-
f"File does not exist: {resolved_path}", details="Set must_exist=False to allow non-existent files"
|
172
|
-
)
|
173
|
-
|
174
|
-
# Check if it's a file (not directory)
|
175
|
-
if must_be_file and resolved_path.exists() and not resolved_path.is_file():
|
176
|
-
raise SplurgeDsvPathValidationError(
|
177
|
-
f"Path is not a file: {resolved_path}", details="Path exists but is not a regular file"
|
178
|
-
)
|
179
|
-
|
180
|
-
# Check if file is readable
|
181
|
-
if must_be_readable:
|
182
|
-
if not resolved_path.exists():
|
183
|
-
raise SplurgeDsvFileNotFoundError(
|
184
|
-
f"Cannot check readability of non-existent file: {resolved_path}",
|
185
|
-
details="File must exist to check readability",
|
186
|
-
)
|
187
|
-
|
188
|
-
if not os.access(resolved_path, os.R_OK):
|
189
|
-
raise SplurgeDsvFilePermissionError(
|
190
|
-
f"File is not readable: {resolved_path}", details="Check file permissions"
|
191
|
-
)
|
192
|
-
|
193
|
-
return resolved_path
|
194
|
-
|
195
|
-
@classmethod
|
196
|
-
def _is_valid_windows_drive_pattern(cls, path_str: str) -> bool:
|
197
|
-
"""Return True if ``path_str`` looks like a valid Windows drive pattern.
|
198
|
-
|
199
|
-
Accepts both ``C:`` and ``C:\\...`` or ``C:/...`` forms.
|
200
|
-
"""
|
201
|
-
# Must be C: at the end of the string, or C:\ (or C:/) followed by path
|
202
|
-
return bool(re.match(r"^[A-Za-z]:$", path_str)) or bool(re.match(r"^[A-Za-z]:[\\/]", path_str))
|
203
|
-
|
204
|
-
@classmethod
|
205
|
-
def _check_dangerous_characters(cls, path_str: str) -> None:
|
206
|
-
"""Raise if ``path_str`` contains characters disallowed by policy.
|
207
|
-
|
208
|
-
This guards against NULs, control characters, and reserved filesystem
|
209
|
-
characters which may be used in injection or traversal attacks.
|
210
|
-
"""
|
211
|
-
# Check for dangerous characters, but allow colons in Windows drive letters
|
212
|
-
for char in cls._DANGEROUS_CHARS:
|
213
|
-
if char in path_str:
|
214
|
-
raise SplurgeDsvPathValidationError(
|
215
|
-
f"Path contains dangerous character: {repr(char)}",
|
216
|
-
details=f"Character at position {path_str.find(char)}",
|
217
|
-
)
|
218
|
-
|
219
|
-
# Special handling for colons - only allow them in Windows drive letters (e.g., C:)
|
220
|
-
if ":" in path_str:
|
221
|
-
if not cls._is_valid_windows_drive_pattern(path_str):
|
222
|
-
raise SplurgeDsvPathValidationError(
|
223
|
-
"Path contains colon in invalid position",
|
224
|
-
details="Colons are only allowed in Windows drive letters (e.g., C: or C:\\)",
|
225
|
-
)
|
226
|
-
|
227
|
-
@classmethod
|
228
|
-
def _check_path_traversal(cls, path_str: str) -> None:
|
229
|
-
"""Raise if ``path_str`` contains obvious traversal patterns.
|
230
|
-
|
231
|
-
This is a best-effort check that catches sequences such as ``..``
|
232
|
-
and unusual repeated separators that are likely malicious.
|
233
|
-
"""
|
234
|
-
for pattern in cls._PATH_TRAVERSAL_PATTERNS:
|
235
|
-
if re.search(pattern, path_str):
|
236
|
-
raise SplurgeDsvPathValidationError(
|
237
|
-
f"Path contains traversal pattern: {pattern}", details="Path traversal attacks are not allowed"
|
238
|
-
)
|
239
|
-
|
240
|
-
@classmethod
|
241
|
-
def _check_path_length(cls, path_str: str) -> None:
|
242
|
-
"""Raise if the path exceeds the configured maximum length.
|
243
|
-
|
244
|
-
Long paths can indicate malformed input or attempt to overflow
|
245
|
-
downstream APIs; this check enforces a sane upper bound.
|
246
|
-
"""
|
247
|
-
if len(path_str) > cls.MAX_PATH_LENGTH:
|
248
|
-
raise SplurgeDsvPathValidationError(
|
249
|
-
f"Path is too long: {len(path_str)} characters",
|
250
|
-
details=f"Maximum allowed length is {cls.MAX_PATH_LENGTH} characters",
|
251
|
-
)
|
252
|
-
|
253
|
-
@classmethod
|
254
|
-
def sanitize_filename(cls, filename: str) -> str:
|
255
|
-
"""
|
256
|
-
Sanitize a filename by removing dangerous characters.
|
257
|
-
|
258
|
-
Args:
|
259
|
-
filename: Original filename
|
260
|
-
|
261
|
-
Returns:
|
262
|
-
Sanitized filename
|
263
|
-
"""
|
264
|
-
# Remove or replace dangerous characters
|
265
|
-
sanitized = filename
|
266
|
-
|
267
|
-
# Replace Windows reserved characters
|
268
|
-
for char in ["<", ">", ":", '"', "|", "?", "*"]:
|
269
|
-
sanitized = sanitized.replace(char, "_")
|
270
|
-
|
271
|
-
# Remove control characters
|
272
|
-
sanitized = "".join(char for char in sanitized if ord(char) >= 32)
|
273
|
-
|
274
|
-
# Remove leading/trailing spaces and dots
|
275
|
-
sanitized = sanitized.strip(" .")
|
276
|
-
|
277
|
-
# Ensure filename is not empty
|
278
|
-
if not sanitized:
|
279
|
-
sanitized = _DEFAULT_FILENAME
|
280
|
-
|
281
|
-
return sanitized
|
282
|
-
|
283
|
-
@classmethod
|
284
|
-
def is_safe_path(cls, file_path: str | Path) -> bool:
|
285
|
-
"""
|
286
|
-
Check if a path is safe without raising exceptions.
|
287
|
-
|
288
|
-
Args:
|
289
|
-
file_path: Path to check
|
290
|
-
|
291
|
-
Returns:
|
292
|
-
True if path is safe, False otherwise
|
293
|
-
"""
|
294
|
-
try:
|
295
|
-
cls.validate_path(file_path)
|
296
|
-
return True
|
297
|
-
except (SplurgeDsvPathValidationError, SplurgeDsvFileNotFoundError, SplurgeDsvFilePermissionError):
|
298
|
-
return False
|
@@ -1,177 +0,0 @@
|
|
1
|
-
"""Safe text file reader utilities.
|
2
|
-
|
3
|
-
This module implements :class:`SafeTextFileReader`, a small helper that reads
|
4
|
-
text files in binary mode and performs deterministic newline normalization.
|
5
|
-
It intentionally decodes bytes explicitly to avoid platform newline
|
6
|
-
translation side-effects and centralizes encoding error handling into a
|
7
|
-
package-specific exception type.
|
8
|
-
|
9
|
-
Public API summary:
|
10
|
-
- SafeTextFileReader: Read, preview, and stream text files with normalized
|
11
|
-
newlines and optional header/footer skipping.
|
12
|
-
- open_text: Context manager returning an in-memory text stream for
|
13
|
-
callers that expect a file-like object.
|
14
|
-
|
15
|
-
Example:
|
16
|
-
reader = SafeTextFileReader("data.csv", encoding="utf-8")
|
17
|
-
lines = reader.read()
|
18
|
-
|
19
|
-
License: MIT
|
20
|
-
|
21
|
-
Copyright (c) 2025 Jim Schilling
|
22
|
-
"""
|
23
|
-
|
24
|
-
from __future__ import annotations
|
25
|
-
|
26
|
-
from collections.abc import Iterator
|
27
|
-
from contextlib import contextmanager
|
28
|
-
from io import StringIO
|
29
|
-
from pathlib import Path
|
30
|
-
|
31
|
-
from splurge_dsv.exceptions import SplurgeDsvFileEncodingError
|
32
|
-
|
33
|
-
|
34
|
-
class SafeTextFileReader:
|
35
|
-
"""Read text files with deterministic newline normalization.
|
36
|
-
|
37
|
-
The class reads raw bytes from disk and decodes using the provided
|
38
|
-
encoding. Newline sequences are normalized to ``\n`` (LF). Public
|
39
|
-
methods provide convenience wrappers for full reads, previews and
|
40
|
-
chunked streaming.
|
41
|
-
|
42
|
-
Args:
|
43
|
-
file_path (Path | str): Path to the file to read.
|
44
|
-
encoding (str): Encoding to use when decoding bytes (default: utf-8).
|
45
|
-
|
46
|
-
Example:
|
47
|
-
reader = SafeTextFileReader("/tmp/data.csv", encoding="utf-8")
|
48
|
-
rows = reader.read(skip_header_rows=1)
|
49
|
-
"""
|
50
|
-
|
51
|
-
def __init__(self, file_path: Path | str, *, encoding: str = "utf-8") -> None:
|
52
|
-
self.path = Path(file_path)
|
53
|
-
self.encoding = encoding
|
54
|
-
|
55
|
-
def _read_text(self) -> str:
|
56
|
-
"""Read the file bytes and return decoded text with no newline normalization applied.
|
57
|
-
|
58
|
-
Returns:
|
59
|
-
Decoded text (str).
|
60
|
-
|
61
|
-
Raises:
|
62
|
-
SplurgeDsvFileEncodingError: If decoding fails or the file cannot
|
63
|
-
be read.
|
64
|
-
"""
|
65
|
-
try:
|
66
|
-
# Read raw bytes and decode explicitly to avoid the platform's
|
67
|
-
# text-mode newline translations which can alter mixed line endings.
|
68
|
-
with self.path.open("rb") as fh:
|
69
|
-
raw = fh.read()
|
70
|
-
return raw.decode(self.encoding)
|
71
|
-
except Exception as e:
|
72
|
-
raise SplurgeDsvFileEncodingError(f"Encoding error reading file: {self.path}", details=str(e)) from e
|
73
|
-
|
74
|
-
def read(self, *, strip: bool = True, skip_header_rows: int = 0, skip_footer_rows: int = 0) -> list[str]:
|
75
|
-
"""Read the entire file and return a list of normalized lines.
|
76
|
-
|
77
|
-
Newlines are normalized to ``\n`` and optional header/footer rows
|
78
|
-
can be skipped. If ``strip`` is True, whitespace surrounding each
|
79
|
-
line is removed.
|
80
|
-
|
81
|
-
Args:
|
82
|
-
strip (bool): Strip whitespace from each line (default: True).
|
83
|
-
skip_header_rows (int): Number of rows to skip at the start.
|
84
|
-
skip_footer_rows (int): Number of rows to skip at the end.
|
85
|
-
|
86
|
-
Returns:
|
87
|
-
List of lines as strings.
|
88
|
-
"""
|
89
|
-
text = self._read_text()
|
90
|
-
# Normalize newlines to LF
|
91
|
-
normalized = text.replace("\r\n", "\n").replace("\r", "\n")
|
92
|
-
lines = normalized.splitlines()
|
93
|
-
|
94
|
-
if skip_header_rows:
|
95
|
-
lines = lines[skip_header_rows:]
|
96
|
-
if skip_footer_rows:
|
97
|
-
if skip_footer_rows >= len(lines):
|
98
|
-
return []
|
99
|
-
lines = lines[:-skip_footer_rows]
|
100
|
-
|
101
|
-
if strip:
|
102
|
-
return [ln.strip() for ln in lines]
|
103
|
-
return list(lines)
|
104
|
-
|
105
|
-
def preview(self, max_lines: int = 100, *, strip: bool = True, skip_header_rows: int = 0) -> list[str]:
|
106
|
-
"""Return the first ``max_lines`` lines of the file after normalization.
|
107
|
-
|
108
|
-
Args:
|
109
|
-
max_lines (int): Maximum number of lines to return.
|
110
|
-
strip (bool): Strip whitespace from each returned line.
|
111
|
-
skip_header_rows (int): Number of header rows to skip before previewing.
|
112
|
-
|
113
|
-
Returns:
|
114
|
-
A list of preview lines.
|
115
|
-
"""
|
116
|
-
text = self._read_text()
|
117
|
-
normalized = text.replace("\r\n", "\n").replace("\r", "\n")
|
118
|
-
lines = normalized.splitlines()
|
119
|
-
if skip_header_rows:
|
120
|
-
lines = lines[skip_header_rows:]
|
121
|
-
if max_lines < 1:
|
122
|
-
return []
|
123
|
-
result = lines[:max_lines]
|
124
|
-
return [ln.strip() for ln in result] if strip else list(result)
|
125
|
-
|
126
|
-
def read_as_stream(
|
127
|
-
self, *, strip: bool = True, skip_header_rows: int = 0, skip_footer_rows: int = 0, chunk_size: int = 500
|
128
|
-
) -> Iterator[list[str]]:
|
129
|
-
"""Yield chunks of lines from the file.
|
130
|
-
|
131
|
-
This convenience method currently reads the decoded file into memory
|
132
|
-
and yields chunks of ``chunk_size`` lines. For very large files this
|
133
|
-
could be optimized to stream from disk without full materialization.
|
134
|
-
|
135
|
-
Args:
|
136
|
-
strip (bool): Whether to strip whitespace from each line.
|
137
|
-
skip_header_rows (int): Number of header rows to skip.
|
138
|
-
skip_footer_rows (int): Number of footer rows to skip.
|
139
|
-
chunk_size (int): Number of lines per yielded chunk.
|
140
|
-
|
141
|
-
Yields:
|
142
|
-
Lists of lines (each list length <= chunk_size).
|
143
|
-
"""
|
144
|
-
lines = self.read(strip=strip, skip_header_rows=skip_header_rows, skip_footer_rows=skip_footer_rows)
|
145
|
-
chunk: list[str] = []
|
146
|
-
for ln in lines:
|
147
|
-
chunk.append(ln)
|
148
|
-
if len(chunk) >= chunk_size:
|
149
|
-
yield chunk
|
150
|
-
chunk = []
|
151
|
-
if chunk:
|
152
|
-
yield chunk
|
153
|
-
|
154
|
-
|
155
|
-
@contextmanager
|
156
|
-
def open_text(file_path: Path | str, *, encoding: str = "utf-8"):
|
157
|
-
"""Context manager returning a text stream (io.StringIO) with normalized newlines.
|
158
|
-
|
159
|
-
Useful when an API expects a file-like object. The returned StringIO
|
160
|
-
contains the normalized text (LF newlines) and is closed automatically
|
161
|
-
when the context exits.
|
162
|
-
|
163
|
-
Args:
|
164
|
-
file_path: Path to the file to open.
|
165
|
-
encoding: Encoding to decode the file with.
|
166
|
-
|
167
|
-
Yields:
|
168
|
-
io.StringIO: In-memory text buffer with normalized newlines.
|
169
|
-
"""
|
170
|
-
reader = SafeTextFileReader(file_path, encoding=encoding)
|
171
|
-
text_lines = reader.read(strip=False)
|
172
|
-
text = "\n".join(text_lines)
|
173
|
-
sio = StringIO(text)
|
174
|
-
try:
|
175
|
-
yield sio
|
176
|
-
finally:
|
177
|
-
sio.close()
|