splurge-dsv 2025.1.5__py3-none-any.whl → 2025.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- splurge_dsv/__init__.py +70 -58
- splurge_dsv/__main__.py +10 -6
- splurge_dsv/cli.py +47 -43
- splurge_dsv/dsv.py +280 -0
- splurge_dsv/dsv_helper.py +114 -54
- splurge_dsv/exceptions.py +92 -75
- splurge_dsv/path_validator.py +49 -36
- splurge_dsv/safe_text_file_reader.py +177 -0
- splurge_dsv/safe_text_file_writer.py +136 -0
- splurge_dsv/string_tokenizer.py +34 -26
- splurge_dsv/text_file_helper.py +96 -177
- splurge_dsv-2025.2.1.dist-info/METADATA +217 -0
- splurge_dsv-2025.2.1.dist-info/RECORD +17 -0
- splurge_dsv-2025.2.1.dist-info/entry_points.txt +2 -0
- splurge_dsv/resource_manager.py +0 -371
- splurge_dsv-2025.1.5.dist-info/METADATA +0 -268
- splurge_dsv-2025.1.5.dist-info/RECORD +0 -14
- {splurge_dsv-2025.1.5.dist-info → splurge_dsv-2025.2.1.dist-info}/WHEEL +0 -0
- {splurge_dsv-2025.1.5.dist-info → splurge_dsv-2025.2.1.dist-info}/licenses/LICENSE +0 -0
- {splurge_dsv-2025.1.5.dist-info → splurge_dsv-2025.2.1.dist-info}/top_level.txt +0 -0
splurge_dsv/path_validator.py
CHANGED
@@ -17,7 +17,11 @@ import re
|
|
17
17
|
from pathlib import Path
|
18
18
|
|
19
19
|
# Local imports
|
20
|
-
from splurge_dsv.exceptions import
|
20
|
+
from splurge_dsv.exceptions import (
|
21
|
+
SplurgeDsvFileNotFoundError,
|
22
|
+
SplurgeDsvFilePermissionError,
|
23
|
+
SplurgeDsvPathValidationError,
|
24
|
+
)
|
21
25
|
|
22
26
|
# Module-level constants for path validation
|
23
27
|
_MAX_PATH_LENGTH = 4096 # Maximum path length for most filesystems
|
@@ -94,24 +98,27 @@ class PathValidator:
|
|
94
98
|
allow_relative: bool = True,
|
95
99
|
base_directory: str | Path | None = None,
|
96
100
|
) -> Path:
|
97
|
-
"""
|
98
|
-
|
101
|
+
"""Validate a filesystem path for security and correctness.
|
102
|
+
|
103
|
+
This is the central path validation routine used across the package.
|
99
104
|
|
100
105
|
Args:
|
101
|
-
file_path: Path to validate
|
102
|
-
must_exist:
|
103
|
-
must_be_file:
|
104
|
-
must_be_readable:
|
105
|
-
allow_relative:
|
106
|
-
base_directory:
|
106
|
+
file_path: Path or string to validate.
|
107
|
+
must_exist: If True, require the path to exist.
|
108
|
+
must_be_file: If True, require the path to be a regular file.
|
109
|
+
must_be_readable: If True, check read permission via os.access().
|
110
|
+
allow_relative: If False, disallow relative paths.
|
111
|
+
base_directory: Optional directory to resolve relative paths
|
112
|
+
against and to restrict the resolved path to.
|
107
113
|
|
108
114
|
Returns:
|
109
|
-
|
115
|
+
pathlib.Path: Resolved and normalized path.
|
110
116
|
|
111
117
|
Raises:
|
112
|
-
|
113
|
-
|
114
|
-
|
118
|
+
SplurgeDsvPathValidationError: If any validation rule fails.
|
119
|
+
SplurgeDsvFileNotFoundError: If must_exist is True and file is missing.
|
120
|
+
SplurgeDsvFilePermissionError: If must_be_readable is True and the
|
121
|
+
file is not readable.
|
115
122
|
"""
|
116
123
|
# Convert to Path object
|
117
124
|
path = Path(file_path) if isinstance(file_path, str) else file_path
|
@@ -130,7 +137,7 @@ class PathValidator:
|
|
130
137
|
|
131
138
|
# Handle relative paths
|
132
139
|
if not path.is_absolute() and not allow_relative:
|
133
|
-
raise
|
140
|
+
raise SplurgeDsvPathValidationError(
|
134
141
|
f"Relative paths are not allowed: {path}", details="Set allow_relative=True to allow relative paths"
|
135
142
|
)
|
136
143
|
|
@@ -147,39 +154,39 @@ class PathValidator:
|
|
147
154
|
try:
|
148
155
|
resolved_path.relative_to(base_path)
|
149
156
|
except ValueError:
|
150
|
-
raise
|
157
|
+
raise SplurgeDsvPathValidationError(
|
151
158
|
f"Path {path} resolves outside base directory {base_directory}",
|
152
159
|
details="Path traversal detected",
|
153
160
|
) from None
|
154
161
|
else:
|
155
162
|
resolved_path = path.resolve()
|
156
163
|
except (OSError, RuntimeError) as e:
|
157
|
-
raise
|
164
|
+
raise SplurgeDsvPathValidationError(
|
158
165
|
f"Failed to resolve path {path}: {e}", details="Check if path contains invalid characters or symlinks"
|
159
166
|
) from e
|
160
167
|
|
161
168
|
# Check if file exists
|
162
169
|
if must_exist and not resolved_path.exists():
|
163
|
-
raise
|
170
|
+
raise SplurgeDsvFileNotFoundError(
|
164
171
|
f"File does not exist: {resolved_path}", details="Set must_exist=False to allow non-existent files"
|
165
172
|
)
|
166
173
|
|
167
174
|
# Check if it's a file (not directory)
|
168
175
|
if must_be_file and resolved_path.exists() and not resolved_path.is_file():
|
169
|
-
raise
|
176
|
+
raise SplurgeDsvPathValidationError(
|
170
177
|
f"Path is not a file: {resolved_path}", details="Path exists but is not a regular file"
|
171
178
|
)
|
172
179
|
|
173
180
|
# Check if file is readable
|
174
181
|
if must_be_readable:
|
175
182
|
if not resolved_path.exists():
|
176
|
-
raise
|
183
|
+
raise SplurgeDsvFileNotFoundError(
|
177
184
|
f"Cannot check readability of non-existent file: {resolved_path}",
|
178
185
|
details="File must exist to check readability",
|
179
186
|
)
|
180
187
|
|
181
188
|
if not os.access(resolved_path, os.R_OK):
|
182
|
-
raise
|
189
|
+
raise SplurgeDsvFilePermissionError(
|
183
190
|
f"File is not readable: {resolved_path}", details="Check file permissions"
|
184
191
|
)
|
185
192
|
|
@@ -187,26 +194,24 @@ class PathValidator:
|
|
187
194
|
|
188
195
|
@classmethod
|
189
196
|
def _is_valid_windows_drive_pattern(cls, path_str: str) -> bool:
|
190
|
-
"""
|
191
|
-
Check if a path string contains a valid Windows drive letter pattern.
|
197
|
+
"""Return True if ``path_str`` looks like a valid Windows drive pattern.
|
192
198
|
|
193
|
-
|
194
|
-
path_str: Path string to validate
|
195
|
-
|
196
|
-
Returns:
|
197
|
-
True if the path contains a valid Windows drive letter pattern,
|
198
|
-
False otherwise
|
199
|
+
Accepts both ``C:`` and ``C:\\...`` or ``C:/...`` forms.
|
199
200
|
"""
|
200
201
|
# Must be C: at the end of the string, or C:\ (or C:/) followed by path
|
201
202
|
return bool(re.match(r"^[A-Za-z]:$", path_str)) or bool(re.match(r"^[A-Za-z]:[\\/]", path_str))
|
202
203
|
|
203
204
|
@classmethod
|
204
205
|
def _check_dangerous_characters(cls, path_str: str) -> None:
|
205
|
-
"""
|
206
|
+
"""Raise if ``path_str`` contains characters disallowed by policy.
|
207
|
+
|
208
|
+
This guards against NULs, control characters, and reserved filesystem
|
209
|
+
characters which may be used in injection or traversal attacks.
|
210
|
+
"""
|
206
211
|
# Check for dangerous characters, but allow colons in Windows drive letters
|
207
212
|
for char in cls._DANGEROUS_CHARS:
|
208
213
|
if char in path_str:
|
209
|
-
raise
|
214
|
+
raise SplurgeDsvPathValidationError(
|
210
215
|
f"Path contains dangerous character: {repr(char)}",
|
211
216
|
details=f"Character at position {path_str.find(char)}",
|
212
217
|
)
|
@@ -214,25 +219,33 @@ class PathValidator:
|
|
214
219
|
# Special handling for colons - only allow them in Windows drive letters (e.g., C:)
|
215
220
|
if ":" in path_str:
|
216
221
|
if not cls._is_valid_windows_drive_pattern(path_str):
|
217
|
-
raise
|
222
|
+
raise SplurgeDsvPathValidationError(
|
218
223
|
"Path contains colon in invalid position",
|
219
224
|
details="Colons are only allowed in Windows drive letters (e.g., C: or C:\\)",
|
220
225
|
)
|
221
226
|
|
222
227
|
@classmethod
|
223
228
|
def _check_path_traversal(cls, path_str: str) -> None:
|
224
|
-
"""
|
229
|
+
"""Raise if ``path_str`` contains obvious traversal patterns.
|
230
|
+
|
231
|
+
This is a best-effort check that catches sequences such as ``..``
|
232
|
+
and unusual repeated separators that are likely malicious.
|
233
|
+
"""
|
225
234
|
for pattern in cls._PATH_TRAVERSAL_PATTERNS:
|
226
235
|
if re.search(pattern, path_str):
|
227
|
-
raise
|
236
|
+
raise SplurgeDsvPathValidationError(
|
228
237
|
f"Path contains traversal pattern: {pattern}", details="Path traversal attacks are not allowed"
|
229
238
|
)
|
230
239
|
|
231
240
|
@classmethod
|
232
241
|
def _check_path_length(cls, path_str: str) -> None:
|
233
|
-
"""
|
242
|
+
"""Raise if the path exceeds the configured maximum length.
|
243
|
+
|
244
|
+
Long paths can indicate malformed input or attempt to overflow
|
245
|
+
downstream APIs; this check enforces a sane upper bound.
|
246
|
+
"""
|
234
247
|
if len(path_str) > cls.MAX_PATH_LENGTH:
|
235
|
-
raise
|
248
|
+
raise SplurgeDsvPathValidationError(
|
236
249
|
f"Path is too long: {len(path_str)} characters",
|
237
250
|
details=f"Maximum allowed length is {cls.MAX_PATH_LENGTH} characters",
|
238
251
|
)
|
@@ -281,5 +294,5 @@ class PathValidator:
|
|
281
294
|
try:
|
282
295
|
cls.validate_path(file_path)
|
283
296
|
return True
|
284
|
-
except (
|
297
|
+
except (SplurgeDsvPathValidationError, SplurgeDsvFileNotFoundError, SplurgeDsvFilePermissionError):
|
285
298
|
return False
|
@@ -0,0 +1,177 @@
|
|
1
|
+
"""Safe text file reader utilities.
|
2
|
+
|
3
|
+
This module implements :class:`SafeTextFileReader`, a small helper that reads
|
4
|
+
text files in binary mode and performs deterministic newline normalization.
|
5
|
+
It intentionally decodes bytes explicitly to avoid platform newline
|
6
|
+
translation side-effects and centralizes encoding error handling into a
|
7
|
+
package-specific exception type.
|
8
|
+
|
9
|
+
Public API summary:
|
10
|
+
- SafeTextFileReader: Read, preview, and stream text files with normalized
|
11
|
+
newlines and optional header/footer skipping.
|
12
|
+
- open_text: Context manager returning an in-memory text stream for
|
13
|
+
callers that expect a file-like object.
|
14
|
+
|
15
|
+
Example:
|
16
|
+
reader = SafeTextFileReader("data.csv", encoding="utf-8")
|
17
|
+
lines = reader.read()
|
18
|
+
|
19
|
+
License: MIT
|
20
|
+
|
21
|
+
Copyright (c) 2025 Jim Schilling
|
22
|
+
"""
|
23
|
+
|
24
|
+
from __future__ import annotations
|
25
|
+
|
26
|
+
from collections.abc import Iterator
|
27
|
+
from contextlib import contextmanager
|
28
|
+
from io import StringIO
|
29
|
+
from pathlib import Path
|
30
|
+
|
31
|
+
from splurge_dsv.exceptions import SplurgeDsvFileEncodingError
|
32
|
+
|
33
|
+
|
34
|
+
class SafeTextFileReader:
    """Read text files with deterministic newline normalization.

    The file is read as raw bytes and decoded explicitly with the configured
    encoding, so the platform's text-mode newline translation never runs.
    CRLF and lone CR sequences are rewritten to ``\\n`` (LF) before the text
    is split into lines. Public methods cover full reads, previews and
    chunked iteration.

    Args:
        file_path (Path | str): Path to the file to read.
        encoding (str): Encoding to use when decoding bytes (default: utf-8).

    Example:
        reader = SafeTextFileReader("/tmp/data.csv", encoding="utf-8")
        rows = reader.read(skip_header_rows=1)
    """

    def __init__(self, file_path: Path | str, *, encoding: str = "utf-8") -> None:
        self.path = Path(file_path)
        self.encoding = encoding

    @staticmethod
    def _non_negative(count: int) -> int:
        """Return ``count`` clamped to zero; falsy values count as zero."""
        return max(count or 0, 0)

    def _read_text(self) -> str:
        """Read the file bytes and return decoded text with no newline normalization applied.

        Returns:
            Decoded text (str).

        Raises:
            SplurgeDsvFileEncodingError: If decoding fails or the file cannot
                be read.
        """
        try:
            # Binary read + explicit decode keeps mixed line endings intact;
            # text mode would translate them before we ever see them.
            with self.path.open("rb") as fh:
                raw = fh.read()
            return raw.decode(self.encoding)
        except Exception as e:
            # NOTE(review): OS-level failures (missing file, permissions) are
            # reported through the same "Encoding error" exception as decode
            # failures; this is the documented contract, so it is kept.
            raise SplurgeDsvFileEncodingError(f"Encoding error reading file: {self.path}", details=str(e)) from e

    def _read_lines(self, skip_header_rows: int = 0) -> list[str]:
        """Return the file's lines after LF normalization, minus leading rows.

        Args:
            skip_header_rows (int): Number of rows to drop from the start.
                Negative values are treated as zero.

        Returns:
            A new list of lines without line terminators.
        """
        text = self._read_text()
        normalized = text.replace("\r\n", "\n").replace("\r", "\n")
        lines = normalized.splitlines()
        # A negative count would make the slice keep only the *last* rows,
        # so it is clamped to "skip nothing".
        return lines[self._non_negative(skip_header_rows):]

    def read(self, *, strip: bool = True, skip_header_rows: int = 0, skip_footer_rows: int = 0) -> list[str]:
        """Read the entire file and return a list of normalized lines.

        Newlines are normalized to LF and optional header/footer rows can be
        skipped. If ``strip`` is True, whitespace surrounding each line is
        removed.

        Args:
            strip (bool): Strip whitespace from each line (default: True).
            skip_header_rows (int): Number of rows to skip at the start.
                Negative values are treated as zero.
            skip_footer_rows (int): Number of rows to skip at the end.
                Negative values are treated as zero.

        Returns:
            List of lines as strings. Empty when the footer skip consumes
            every remaining row.

        Raises:
            SplurgeDsvFileEncodingError: If the file cannot be read or decoded.
        """
        lines = self._read_lines(skip_header_rows)

        footer = self._non_negative(skip_footer_rows)
        if footer:
            if footer >= len(lines):
                return []
            lines = lines[:-footer]

        if strip:
            return [ln.strip() for ln in lines]
        return lines

    def preview(self, max_lines: int = 100, *, strip: bool = True, skip_header_rows: int = 0) -> list[str]:
        """Return the first ``max_lines`` lines of the file after normalization.

        The whole file is still read and decoded; only the returned list is
        limited.

        Args:
            max_lines (int): Maximum number of lines to return. Values below
                1 yield an empty list.
            strip (bool): Strip whitespace from each returned line.
            skip_header_rows (int): Number of header rows to skip before
                previewing. Negative values are treated as zero.

        Returns:
            A list of preview lines.

        Raises:
            SplurgeDsvFileEncodingError: If the file cannot be read or decoded.
        """
        lines = self._read_lines(skip_header_rows)
        if max_lines < 1:
            return []
        head = lines[:max_lines]
        return [ln.strip() for ln in head] if strip else head

    def read_as_stream(
        self, *, strip: bool = True, skip_header_rows: int = 0, skip_footer_rows: int = 0, chunk_size: int = 500
    ) -> Iterator[list[str]]:
        """Yield chunks of lines from the file.

        This convenience method reads the decoded file into memory via
        :meth:`read` and then yields slices of ``chunk_size`` lines; it does
        not stream from disk.

        Args:
            strip (bool): Whether to strip whitespace from each line.
            skip_header_rows (int): Number of header rows to skip.
            skip_footer_rows (int): Number of footer rows to skip.
            chunk_size (int): Number of lines per yielded chunk. Values below
                1 produce one line per chunk.

        Yields:
            Lists of lines (each list length <= chunk_size).
        """
        lines = self.read(strip=strip, skip_header_rows=skip_header_rows, skip_footer_rows=skip_footer_rows)
        # A step of at least 1 keeps range() valid and reproduces the
        # one-line-per-chunk result for non-positive sizes.
        step = max(chunk_size, 1)
        for start in range(0, len(lines), step):
            yield lines[start : start + step]
|
153
|
+
|
154
|
+
|
155
|
+
@contextmanager
def open_text(file_path: Path | str, *, encoding: str = "utf-8"):
    """Yield an in-memory text stream holding the file's LF-normalized content.

    Handy when an API wants a file-like object rather than a list of lines.
    The file is read through :class:`SafeTextFileReader` without stripping,
    the lines are re-joined with ``\\n`` (so no newline follows the last
    line), and the resulting buffer is closed when the context exits.

    Args:
        file_path: Path to the file to open.
        encoding: Encoding to decode the file with.

    Yields:
        io.StringIO: In-memory text buffer with normalized newlines.
    """
    normalized_lines = SafeTextFileReader(file_path, encoding=encoding).read(strip=False)
    # StringIO is its own context manager: leaving the block closes it,
    # whether the caller's body finished normally or raised.
    with StringIO("\n".join(normalized_lines)) as stream:
        yield stream
|
@@ -0,0 +1,136 @@
|
|
1
|
+
"""Deterministic text-only writer utilities.
|
2
|
+
|
3
|
+
This module implements :class:`SafeTextFileWriter` and a convenience
|
4
|
+
``open_text_writer`` context manager. Writes always use the configured
|
5
|
+
encoding and normalize newline characters to a canonical form (LF) to
|
6
|
+
ensure consistent files across platforms.
|
7
|
+
|
8
|
+
Example:
|
9
|
+
with open_text_writer("out.txt") as buf:
|
10
|
+
buf.write("line1\nline2\n")
|
11
|
+
|
12
|
+
Copyright (c) 2025 Jim Schilling
|
13
|
+
Please preserve this header and all related material when sharing!
|
14
|
+
|
15
|
+
License: MIT
|
16
|
+
"""
|
17
|
+
|
18
|
+
from __future__ import annotations
|
19
|
+
|
20
|
+
import io
|
21
|
+
from collections.abc import Iterable, Iterator
|
22
|
+
from contextlib import contextmanager
|
23
|
+
from pathlib import Path
|
24
|
+
from typing import cast
|
25
|
+
|
26
|
+
from .exceptions import SplurgeDsvFileEncodingError
|
27
|
+
|
28
|
+
|
29
|
+
class SafeTextFileWriter:
    """Helper for deterministic text writes with newline normalization.

    Args:
        file_path: Destination file path.
        encoding: Text encoding to use (default: 'utf-8').
        newline: Canonical newline sequence to write (default: '\\n').
            ``None`` selects the default; an empty string leaves the
            LF-normalized text as is.

    The class exposes a minimal file-like API and will raise
    :class:`SplurgeDsvFileEncodingError` when the underlying file cannot be
    opened with the requested encoding.
    """

    def __init__(self, file_path: Path | str, *, encoding: str = "utf-8", newline: str | None = "\n") -> None:
        self._path = Path(file_path)
        self._encoding = encoding
        # newline is the canonical newline we will write; default to LF
        self._newline = "\n" if newline is None else newline
        self._file: io.TextIOBase | None = None

    def open(self, mode: str = "w") -> io.TextIOBase:
        """Open the underlying file for text writing.

        Args:
            mode: File open mode (default: 'w').

        Returns:
            The opened text file object.

        Raises:
            SplurgeDsvFileEncodingError: If the file cannot be opened with the
                requested encoding or underlying OS error occurs.
        """
        try:
            # newline="" disables the platform's newline translation so that
            # write() alone decides which terminator reaches the disk.
            fp = open(self._path, mode, encoding=self._encoding, newline="")
            # cast to TextIOBase for precise typing
            self._file = cast(io.TextIOBase, fp)
            return self._file
        except (LookupError, OSError) as exc:
            raise SplurgeDsvFileEncodingError(str(exc)) from exc

    def write(self, text: str) -> int:
        """Normalize newlines and write ``text`` to the opened file.

        CRLF and lone CR are first collapsed to LF, then every LF is replaced
        by the configured canonical newline. Each call is normalized on its
        own, so a CRLF pair split across two calls is seen as two newlines.

        Args:
            text: Text to write (newlines will be normalized).

        Returns:
            Number of characters written, as reported by the underlying file.

        Raises:
            ValueError: If the file has not been opened.
        """
        if self._file is None:
            raise ValueError("file not opened")
        normalized = text.replace("\r\n", "\n").replace("\r", "\n")
        # Honour the configured terminator. An empty string is skipped so it
        # cannot silently delete every line break.
        if self._newline not in ("", "\n"):
            normalized = normalized.replace("\n", self._newline)
        return self._file.write(normalized)

    def writelines(self, lines: Iterable[str]) -> None:
        """Write each string in ``lines`` through :meth:`write`.

        No terminators are added; each item is newline-normalized as given.

        Raises:
            ValueError: If the file has not been opened.
        """
        if self._file is None:
            raise ValueError("file not opened")
        for line in lines:
            self.write(line)

    def flush(self) -> None:
        """Flush the underlying file; a no-op when nothing is open."""
        if self._file is None:
            return
        self._file.flush()

    def close(self) -> None:
        """Close the underlying file; safe to call repeatedly."""
        if self._file is None:
            return
        try:
            self._file.close()
        finally:
            # Drop the handle even if close() raised so later calls are no-ops.
            self._file = None
|
103
|
+
|
104
|
+
|
105
|
+
@contextmanager
def open_text_writer(file_path: Path | str, *, encoding: str = "utf-8", mode: str = "w") -> Iterator[io.StringIO]:
    """Collect text in memory and write it to ``file_path`` on clean exit.

    The caller receives an ``io.StringIO`` to write into. When the ``with``
    body finishes without raising, the accumulated text is written through
    :class:`SafeTextFileWriter`, which normalizes its newlines. When the body
    raises, the exception propagates and the destination is never opened.

    Args:
        file_path: Destination path to write to on successful exit.
        encoding: Encoding to use when writing.
        mode: File open mode passed to writer (default: 'w').

    Yields:
        io.StringIO: Buffer to write textual content into.
    """
    target = Path(file_path)
    pending = io.StringIO()

    # An exception from the caller's body is re-raised at this yield, so
    # everything below runs only after a successful body.
    yield pending

    payload = pending.getvalue()
    sink = SafeTextFileWriter(target, encoding=encoding)
    try:
        sink.open(mode=mode)
        sink.write(payload)
        sink.flush()
    finally:
        # close() is a no-op if open() failed, so this is always safe.
        sink.close()
|
splurge_dsv/string_tokenizer.py
CHANGED
@@ -12,7 +12,7 @@ This module is licensed under the MIT License.
|
|
12
12
|
"""
|
13
13
|
|
14
14
|
# Local imports
|
15
|
-
from splurge_dsv.exceptions import
|
15
|
+
from splurge_dsv.exceptions import SplurgeDsvParameterError
|
16
16
|
|
17
17
|
|
18
18
|
class StringTokenizer:
|
@@ -29,21 +29,24 @@ class StringTokenizer:
|
|
29
29
|
|
30
30
|
@staticmethod
|
31
31
|
def parse(content: str | None, *, delimiter: str, strip: bool = DEFAULT_STRIP) -> list[str]:
|
32
|
-
"""
|
33
|
-
|
32
|
+
"""Tokenize a single string using ``delimiter``.
|
33
|
+
|
34
|
+
The function preserves empty tokens (e.g. ``"a,,c"`` with
|
35
|
+
delimiter ``","`` yields ``['a', '', 'c']``). If ``content`` is
|
36
|
+
None an empty list is returned.
|
34
37
|
|
35
38
|
Args:
|
36
|
-
content
|
37
|
-
delimiter
|
38
|
-
strip
|
39
|
+
content: The input string to tokenize, or ``None``.
|
40
|
+
delimiter: The delimiter string to split on.
|
41
|
+
strip: If True, strip leading/trailing whitespace from each token.
|
39
42
|
|
40
43
|
Returns:
|
41
|
-
list
|
44
|
+
A list of tokens. Empty tokens are preserved.
|
42
45
|
|
43
46
|
Raises:
|
44
|
-
|
47
|
+
SplurgeDsvParameterError: If ``delimiter`` is empty or ``None``.
|
45
48
|
|
46
|
-
|
49
|
+
Examples:
|
47
50
|
>>> StringTokenizer.parse("a,b,c", delimiter=",")
|
48
51
|
['a', 'b', 'c']
|
49
52
|
>>> StringTokenizer.parse("a,,c", delimiter=",")
|
@@ -53,7 +56,7 @@ class StringTokenizer:
|
|
53
56
|
return []
|
54
57
|
|
55
58
|
if delimiter is None or delimiter == "":
|
56
|
-
raise
|
59
|
+
raise SplurgeDsvParameterError("delimiter cannot be empty or None")
|
57
60
|
|
58
61
|
if strip and not content.strip():
|
59
62
|
return []
|
@@ -65,51 +68,56 @@ class StringTokenizer:
|
|
65
68
|
|
66
69
|
@classmethod
|
67
70
|
def parses(cls, content: list[str], *, delimiter: str, strip: bool = DEFAULT_STRIP) -> list[list[str]]:
|
68
|
-
"""
|
69
|
-
Process multiple strings into lists of tokens.
|
71
|
+
"""Tokenize multiple strings.
|
70
72
|
|
71
73
|
Args:
|
72
|
-
content
|
73
|
-
delimiter
|
74
|
-
strip
|
74
|
+
content: A list of strings to tokenize.
|
75
|
+
delimiter: The delimiter to use for splitting.
|
76
|
+
strip: If True, strip whitespace from tokens.
|
75
77
|
|
76
78
|
Returns:
|
77
|
-
list
|
79
|
+
A list where each element is the token list for the corresponding
|
80
|
+
input string.
|
78
81
|
|
79
82
|
Raises:
|
80
|
-
|
83
|
+
SplurgeDsvParameterError: If ``delimiter`` is empty or ``None``.
|
81
84
|
|
82
85
|
Example:
|
83
86
|
>>> StringTokenizer.parses(["a,b", "c,d"], delimiter=",")
|
84
87
|
[['a', 'b'], ['c', 'd']]
|
85
88
|
"""
|
86
89
|
if delimiter is None or delimiter == "":
|
87
|
-
raise
|
90
|
+
raise SplurgeDsvParameterError("delimiter cannot be empty or None")
|
88
91
|
|
89
92
|
return [cls.parse(text, delimiter=delimiter, strip=strip) for text in content]
|
90
93
|
|
91
94
|
@staticmethod
|
92
95
|
def remove_bookends(content: str, *, bookend: str, strip: bool = DEFAULT_STRIP) -> str:
|
93
|
-
"""
|
94
|
-
|
96
|
+
"""Remove matching bookend characters from both endpoints of ``content``.
|
97
|
+
|
98
|
+
The function optionally strips surrounding whitespace before checking
|
99
|
+
for matching bookend characters. If both ends match the provided
|
100
|
+
``bookend`` and the remaining content is long enough, the bookends are
|
101
|
+
removed; otherwise the possibly-stripped input is returned unchanged.
|
95
102
|
|
96
103
|
Args:
|
97
|
-
content
|
98
|
-
bookend
|
99
|
-
strip
|
104
|
+
content: The input string to process.
|
105
|
+
bookend: The bookend string to remove from both ends (e.g. '"').
|
106
|
+
strip: If True, strip whitespace prior to bookend removal.
|
100
107
|
|
101
108
|
Returns:
|
102
|
-
|
109
|
+
The input string with matching bookend characters removed when
|
110
|
+
applicable.
|
103
111
|
|
104
112
|
Raises:
|
105
|
-
|
113
|
+
SplurgeDsvParameterError: If ``bookend`` is empty or ``None``.
|
106
114
|
|
107
115
|
Example:
|
108
116
|
>>> StringTokenizer.remove_bookends("'hello'", bookend="'")
|
109
117
|
'hello'
|
110
118
|
"""
|
111
119
|
if bookend is None or bookend == "":
|
112
|
-
raise
|
120
|
+
raise SplurgeDsvParameterError("bookend cannot be empty or None")
|
113
121
|
|
114
122
|
value: str = content.strip() if strip else content
|
115
123
|
if value.startswith(bookend) and value.endswith(bookend) and len(value) > 2 * len(bookend) - 1:
|