splurge-dsv 2025.2.0__py3-none-any.whl → 2025.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- splurge_dsv/__init__.py +16 -5
- splurge_dsv/cli.py +137 -26
- splurge_dsv/dsv.py +101 -7
- splurge_dsv/dsv_helper.py +417 -43
- splurge_dsv/exceptions.py +22 -1
- splurge_dsv/string_tokenizer.py +7 -1
- {splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/METADATA +78 -5
- splurge_dsv-2025.3.1.dist-info/RECORD +13 -0
- splurge_dsv/path_validator.py +0 -298
- splurge_dsv/safe_text_file_reader.py +0 -177
- splurge_dsv/safe_text_file_writer.py +0 -136
- splurge_dsv/text_file_helper.py +0 -240
- splurge_dsv-2025.2.0.dist-info/RECORD +0 -17
- {splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/WHEEL +0 -0
- {splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/entry_points.txt +0 -0
- {splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/licenses/LICENSE +0 -0
- {splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/top_level.txt +0 -0
splurge_dsv/dsv_helper.py
CHANGED
@@ -11,11 +11,23 @@ This module is licensed under the MIT License.
|
|
11
11
|
# Standard library imports
|
12
12
|
from collections.abc import Iterator
|
13
13
|
from os import PathLike
|
14
|
+
from pathlib import Path
|
15
|
+
|
16
|
+
import splurge_safe_io.constants as safe_io_constants
|
17
|
+
import splurge_safe_io.path_validator as safe_io_path_validator
|
18
|
+
import splurge_safe_io.safe_text_file_reader as safe_io_text_file_reader
|
14
19
|
|
15
20
|
# Local imports
|
16
|
-
from splurge_dsv.exceptions import
|
21
|
+
from splurge_dsv.exceptions import (
|
22
|
+
SplurgeDsvColumnMismatchError,
|
23
|
+
SplurgeDsvError,
|
24
|
+
SplurgeDsvFileDecodingError,
|
25
|
+
SplurgeDsvFileNotFoundError,
|
26
|
+
SplurgeDsvFilePermissionError,
|
27
|
+
SplurgeDsvParameterError,
|
28
|
+
SplurgeDsvPathValidationError,
|
29
|
+
)
|
17
30
|
from splurge_dsv.string_tokenizer import StringTokenizer
|
18
|
-
from splurge_dsv.text_file_helper import TextFileHelper
|
19
31
|
|
20
32
|
|
21
33
|
class DsvHelper:
|
@@ -26,22 +38,31 @@ class DsvHelper:
|
|
26
38
|
Supports configurable delimiters, text bookends, and whitespace handling options.
|
27
39
|
"""
|
28
40
|
|
29
|
-
DEFAULT_CHUNK_SIZE =
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
41
|
+
DEFAULT_CHUNK_SIZE = safe_io_constants.DEFAULT_CHUNK_SIZE
|
42
|
+
# When detecting normalize_columns across a stream, how many chunks to scan
|
43
|
+
# before giving up. Scanning more chunks increases work but helps if the
|
44
|
+
# first logical row starts later than the first chunk (e.g., many blank lines
|
45
|
+
# or very small chunks). Keep small by default to avoid buffering too much.
|
46
|
+
MAX_DETECT_CHUNKS = 10
|
47
|
+
DEFAULT_ENCODING = "utf-8"
|
48
|
+
DEFAULT_SKIP_HEADER_ROWS = 0
|
49
|
+
DEFAULT_SKIP_FOOTER_ROWS = 0
|
50
|
+
DEFAULT_MIN_CHUNK_SIZE = safe_io_constants.MIN_CHUNK_SIZE
|
34
51
|
DEFAULT_STRIP = True
|
35
52
|
DEFAULT_BOOKEND_STRIP = True
|
36
53
|
|
37
|
-
@
|
54
|
+
@classmethod
|
38
55
|
def parse(
|
56
|
+
cls,
|
39
57
|
content: str,
|
40
58
|
*,
|
41
59
|
delimiter: str,
|
42
60
|
strip: bool = DEFAULT_STRIP,
|
43
61
|
bookend: str | None = None,
|
44
62
|
bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
|
63
|
+
normalize_columns: int = 0,
|
64
|
+
raise_on_missing_columns: bool = False,
|
65
|
+
raise_on_extra_columns: bool = False,
|
45
66
|
) -> list[str]:
|
46
67
|
"""Parse a single DSV line into tokens.
|
47
68
|
|
@@ -56,12 +77,18 @@ class DsvHelper:
|
|
56
77
|
strip: If True, strip leading/trailing whitespace from each token.
|
57
78
|
bookend: Optional bookend character to remove from token ends.
|
58
79
|
bookend_strip: If True, strip whitespace after removing bookends.
|
80
|
+
normalize_columns: If > 0, ensure the returned list has exactly this many columns,
|
81
|
+
padding with empty strings or truncating as needed.
|
82
|
+
raise_on_missing_columns: If True, raise an error if the line has fewer columns than ``normalize_columns``.
|
83
|
+
raise_on_extra_columns: If True, raise an error if the line has more columns than ``normalize_columns``.
|
59
84
|
|
60
85
|
Returns:
|
61
86
|
A list of parsed token strings.
|
62
87
|
|
63
88
|
Raises:
|
64
89
|
SplurgeDsvParameterError: If ``delimiter`` is empty or None.
|
90
|
+
SplurgeDsvParameterError: If ``normalize_columns`` is negative.
|
91
|
+
SplurgeDsvColumnMismatchError: If column validation fails.
|
65
92
|
|
66
93
|
Examples:
|
67
94
|
>>> DsvHelper.parse("a,b,c", delimiter=",")
|
@@ -77,8 +104,79 @@ class DsvHelper:
|
|
77
104
|
if bookend:
|
78
105
|
tokens = [StringTokenizer.remove_bookends(token, bookend=bookend, strip=bookend_strip) for token in tokens]
|
79
106
|
|
107
|
+
# If requested, validate columns (raises) and/or normalize the row length
|
108
|
+
if normalize_columns and normalize_columns > 0:
|
109
|
+
# Validation is only performed if the caller asked for raises
|
110
|
+
if raise_on_missing_columns or raise_on_extra_columns:
|
111
|
+
cls._validate_columns(
|
112
|
+
len(tokens),
|
113
|
+
expected_columns=normalize_columns,
|
114
|
+
raise_on_missing_columns=raise_on_missing_columns,
|
115
|
+
raise_on_extra_columns=raise_on_extra_columns,
|
116
|
+
)
|
117
|
+
|
118
|
+
tokens = cls._normalize_columns(tokens, expected_columns=normalize_columns)
|
119
|
+
|
80
120
|
return tokens
|
81
121
|
|
122
|
+
@classmethod
|
123
|
+
def _normalize_columns(cls, row: list[str], *, expected_columns: int) -> list[str]:
|
124
|
+
"""Normalize a token list to the expected number of columns.
|
125
|
+
|
126
|
+
If the row has fewer columns than expected, append empty strings to reach
|
127
|
+
the expected length. If the row has more columns than expected, truncate
|
128
|
+
the excess columns.
|
129
|
+
|
130
|
+
Args:
|
131
|
+
row: The list of tokens to normalize.
|
132
|
+
expected_columns: Desired number of columns.
|
133
|
+
|
134
|
+
Returns:
|
135
|
+
A new list of tokens with length == expected_columns.
|
136
|
+
|
137
|
+
Raises:
|
138
|
+
SplurgeDsvParameterError: If ``expected_columns`` is negative.
|
139
|
+
"""
|
140
|
+
if expected_columns < 0:
|
141
|
+
raise SplurgeDsvParameterError("expected_columns must be non-negative")
|
142
|
+
|
143
|
+
current = len(row)
|
144
|
+
if current == expected_columns:
|
145
|
+
return row
|
146
|
+
if current < expected_columns:
|
147
|
+
# append empty strings
|
148
|
+
return row + [""] * (expected_columns - current)
|
149
|
+
# current > expected -> truncate
|
150
|
+
return row[:expected_columns]
|
151
|
+
|
152
|
+
@classmethod
|
153
|
+
def _validate_columns(
|
154
|
+
cls, actual_columns: int, *, expected_columns: int, raise_on_missing_columns: bool, raise_on_extra_columns: bool
|
155
|
+
) -> None:
|
156
|
+
"""Validate column count against expected_columns.
|
157
|
+
|
158
|
+
Raises a SplurgeDsvError (or a more specific subclass) when the
|
159
|
+
validation fails according to the provided flags.
|
160
|
+
|
161
|
+
Args:
|
162
|
+
actual_columns: The actual number of columns in the row.
|
163
|
+
expected_columns: The expected number of columns.
|
164
|
+
raise_on_missing_columns: If True, raise an error if actual_columns < expected_columns.
|
165
|
+
raise_on_extra_columns: If True, raise an error if actual_columns > expected_columns.
|
166
|
+
|
167
|
+
Raises:
|
168
|
+
SplurgeDsvColumnMismatchError: If column validation fails.
|
169
|
+
SplurgeDsvParameterError: If ``expected_columns`` is negative.
|
170
|
+
"""
|
171
|
+
if expected_columns < 0:
|
172
|
+
raise SplurgeDsvParameterError("expected_columns must be non-negative")
|
173
|
+
|
174
|
+
if raise_on_missing_columns and actual_columns < expected_columns:
|
175
|
+
raise SplurgeDsvColumnMismatchError(f"Row is missing columns: ({actual_columns} < {expected_columns})")
|
176
|
+
|
177
|
+
if raise_on_extra_columns and actual_columns > expected_columns:
|
178
|
+
raise SplurgeDsvColumnMismatchError(f"Row has extra columns: ({actual_columns} > {expected_columns})")
|
179
|
+
|
82
180
|
@classmethod
|
83
181
|
def parses(
|
84
182
|
cls,
|
@@ -88,6 +186,10 @@ class DsvHelper:
|
|
88
186
|
strip: bool = DEFAULT_STRIP,
|
89
187
|
bookend: str | None = None,
|
90
188
|
bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
|
189
|
+
normalize_columns: int = 0,
|
190
|
+
raise_on_missing_columns: bool = False,
|
191
|
+
raise_on_extra_columns: bool = False,
|
192
|
+
detect_columns: bool = False,
|
91
193
|
) -> list[list[str]]:
|
92
194
|
"""Parse multiple DSV lines.
|
93
195
|
|
@@ -100,13 +202,19 @@ class DsvHelper:
|
|
100
202
|
strip: If True, strip whitespace from tokens.
|
101
203
|
bookend: Optional bookend character to remove from tokens.
|
102
204
|
bookend_strip: If True, strip whitespace after removing bookends.
|
205
|
+
normalize_columns: If > 0, ensure each returned list has exactly this many columns,
|
206
|
+
padding with empty strings or truncating as needed.
|
207
|
+
raise_on_missing_columns: If True, raise an error if a line has fewer columns than ``normalize_columns``.
|
208
|
+
raise_on_extra_columns: If True, raise an error if a line has more columns than ``normalize_columns``.
|
209
|
+
detect_columns: If True and ``normalize_columns`` is not set or <= 0, detect the number of columns from the content.
|
103
210
|
|
104
211
|
Returns:
|
105
212
|
A list of token lists, one per input line.
|
106
213
|
|
107
214
|
Raises:
|
108
|
-
SplurgeDsvParameterError: If ``content`` is not a list of strings or
|
109
|
-
if ``delimiter`` is empty or None.
|
215
|
+
SplurgeDsvParameterError: If ``content`` is not a list of strings, or
|
216
|
+
if ``delimiter`` is empty or None, or if ``normalize_columns`` is negative.
|
217
|
+
SplurgeDsvColumnMismatchError: If column validation fails.
|
110
218
|
|
111
219
|
Example:
|
112
220
|
>>> DsvHelper.parses(["a,b,c", "d,e,f"], delimiter=",")
|
@@ -118,15 +226,82 @@ class DsvHelper:
|
|
118
226
|
if not all(isinstance(item, str) for item in content):
|
119
227
|
raise SplurgeDsvParameterError("content must be a list of strings")
|
120
228
|
|
229
|
+
# If requested, detect expected columns from the first logical row
|
230
|
+
if detect_columns and (not normalize_columns or normalize_columns <= 0):
|
231
|
+
if not content:
|
232
|
+
return []
|
233
|
+
# Find the first non-blank logical row in the provided content
|
234
|
+
first_non_blank = None
|
235
|
+
for ln in content:
|
236
|
+
if isinstance(ln, str) and ln.strip() != "":
|
237
|
+
first_non_blank = ln
|
238
|
+
break
|
239
|
+
if first_non_blank is None:
|
240
|
+
return []
|
241
|
+
|
242
|
+
detected = cls.parse(
|
243
|
+
first_non_blank,
|
244
|
+
delimiter=delimiter,
|
245
|
+
strip=strip,
|
246
|
+
bookend=bookend,
|
247
|
+
bookend_strip=bookend_strip,
|
248
|
+
normalize_columns=0,
|
249
|
+
raise_on_missing_columns=False,
|
250
|
+
raise_on_extra_columns=False,
|
251
|
+
)
|
252
|
+
normalize_columns = len(detected)
|
253
|
+
|
121
254
|
return [
|
122
|
-
cls.parse(
|
255
|
+
cls.parse(
|
256
|
+
item,
|
257
|
+
delimiter=delimiter,
|
258
|
+
strip=strip,
|
259
|
+
bookend=bookend,
|
260
|
+
bookend_strip=bookend_strip,
|
261
|
+
normalize_columns=normalize_columns,
|
262
|
+
raise_on_missing_columns=raise_on_missing_columns,
|
263
|
+
raise_on_extra_columns=raise_on_extra_columns,
|
264
|
+
)
|
123
265
|
for item in content
|
124
266
|
]
|
125
267
|
|
268
|
+
@staticmethod
|
269
|
+
def _validate_file_path(
    file_path: Path | str,
    *,
    must_exist: bool = True,
    must_be_file: bool = True,
    must_be_readable: bool = True,
) -> Path:
    """Validate a file path and translate validator errors.

    The path is converted to a ``Path`` and handed to
    ``PathValidator.validate_path`` from ``splurge_safe_io``; any exception
    raised along the way is re-raised as the matching ``SplurgeDsv*``
    exception with the original attached as its cause.

    Args:
        file_path: The file path to validate.
        must_exist: Forwarded unchanged to ``PathValidator.validate_path``.
        must_be_file: Forwarded unchanged to ``PathValidator.validate_path``.
        must_be_readable: Forwarded unchanged to ``PathValidator.validate_path``.

    Returns:
        The path returned by the validator.

    Raises:
        SplurgeDsvPathValidationError: If the file path is invalid.
        SplurgeDsvFileNotFoundError: If the file does not exist.
        SplurgeDsvFilePermissionError: If the file cannot be accessed due to permission restrictions.
        SplurgeDsvError: For other unexpected errors.
    """
    validator = safe_io_path_validator
    try:
        # Path() stays inside the try so a failed conversion is also
        # reported as a SplurgeDsvError.
        validated = validator.PathValidator.validate_path(
            Path(file_path),
            must_exist=must_exist,
            must_be_file=must_be_file,
            must_be_readable=must_be_readable,
        )
    except validator.SplurgeSafeIoPathValidationError as ex:
        raise SplurgeDsvPathValidationError(f"Invalid file path: {file_path}") from ex
    except validator.SplurgeSafeIoFileNotFoundError as ex:
        raise SplurgeDsvFileNotFoundError(f"File not found: {file_path}") from ex
    except validator.SplurgeSafeIoFilePermissionError as ex:
        raise SplurgeDsvFilePermissionError(f"File permission error: {file_path}") from ex
    except Exception as ex:
        raise SplurgeDsvError(f"Unexpected error validating file path: {file_path}") from ex
    else:
        return validated
|
300
|
+
|
126
301
|
@classmethod
|
127
302
|
def parse_file(
|
128
303
|
cls,
|
129
|
-
file_path: PathLike[str] | str,
|
304
|
+
file_path: PathLike[str] | Path | str,
|
130
305
|
*,
|
131
306
|
delimiter: str,
|
132
307
|
strip: bool = DEFAULT_STRIP,
|
@@ -135,11 +310,16 @@ class DsvHelper:
|
|
135
310
|
encoding: str = DEFAULT_ENCODING,
|
136
311
|
skip_header_rows: int = DEFAULT_SKIP_HEADER_ROWS,
|
137
312
|
skip_footer_rows: int = DEFAULT_SKIP_FOOTER_ROWS,
|
313
|
+
skip_empty_lines: bool = False,
|
314
|
+
normalize_columns: int = 0,
|
315
|
+
raise_on_missing_columns: bool = False,
|
316
|
+
raise_on_extra_columns: bool = False,
|
317
|
+
detect_columns: bool = False,
|
138
318
|
) -> list[list[str]]:
|
139
319
|
"""Read and parse an entire DSV file.
|
140
320
|
|
141
321
|
This convenience reads all lines from ``file_path`` using
|
142
|
-
:class:`
|
322
|
+
:class:`splurge_safe_io.safe_text_file_reader.SafeTextFileReader` and then parses each
|
143
323
|
line into tokens. Header and footer rows may be skipped via the
|
144
324
|
``skip_header_rows`` and ``skip_footer_rows`` parameters.
|
145
325
|
|
@@ -152,23 +332,64 @@ class DsvHelper:
|
|
152
332
|
encoding: Text encoding to use when reading the file.
|
153
333
|
skip_header_rows: Number of leading lines to ignore.
|
154
334
|
skip_footer_rows: Number of trailing lines to ignore.
|
335
|
+
normalize_columns: Number of columns to normalize.
|
336
|
+
raise_on_missing_columns: Raise an error if a line has fewer columns than ``normalize_columns``.
|
337
|
+
raise_on_extra_columns: Raise an error if a line has more columns than ``normalize_columns``.
|
155
338
|
|
156
339
|
Returns:
|
157
340
|
A list of token lists (one list per non-skipped line).
|
158
341
|
|
159
342
|
Raises:
|
160
|
-
SplurgeDsvParameterError: If ``delimiter`` is empty or None.
|
343
|
+
SplurgeDsvParameterError: If ``delimiter`` is empty or None, or if ``normalize_columns`` is negative.
|
161
344
|
SplurgeDsvFileNotFoundError: If the file at ``file_path`` does not exist.
|
162
|
-
SplurgeDsvFilePermissionError: If the file cannot be accessed due to
|
163
|
-
|
164
|
-
|
165
|
-
|
345
|
+
SplurgeDsvFilePermissionError: If the file cannot be accessed due to permission restrictions.
|
346
|
+
SplurgeDsvFileDecodingError: If the file cannot be decoded using the provided ``encoding``.
|
347
|
+
SplurgeDsvPathValidationError: If the file path is invalid.
|
348
|
+
SplurgeDsvColumnMismatchError: If column validation fails.
|
349
|
+
SplurgeDsvError: For other unexpected errors.
|
166
350
|
"""
|
167
|
-
|
168
|
-
file_path, encoding=encoding, skip_header_rows=skip_header_rows, skip_footer_rows=skip_footer_rows
|
169
|
-
)
|
351
|
+
effective_file_path = cls._validate_file_path(Path(file_path))
|
170
352
|
|
171
|
-
|
353
|
+
skip_header_rows = max(skip_header_rows, cls.DEFAULT_SKIP_HEADER_ROWS)
|
354
|
+
skip_footer_rows = max(skip_footer_rows, cls.DEFAULT_SKIP_FOOTER_ROWS)
|
355
|
+
|
356
|
+
try:
|
357
|
+
reader = safe_io_text_file_reader.SafeTextFileReader(
|
358
|
+
effective_file_path,
|
359
|
+
encoding=encoding,
|
360
|
+
skip_header_lines=skip_header_rows,
|
361
|
+
skip_footer_lines=skip_footer_rows,
|
362
|
+
strip=strip,
|
363
|
+
skip_empty_lines=skip_empty_lines,
|
364
|
+
)
|
365
|
+
lines: list[str] = reader.read()
|
366
|
+
|
367
|
+
except safe_io_text_file_reader.SplurgeSafeIoFileDecodingError as ex:
|
368
|
+
raise SplurgeDsvFileDecodingError(f"File decoding error: {effective_file_path}") from ex
|
369
|
+
except safe_io_text_file_reader.SplurgeSafeIoFilePermissionError as ex:
|
370
|
+
raise SplurgeDsvFilePermissionError(f"File permission error: {effective_file_path}") from ex
|
371
|
+
except safe_io_text_file_reader.SplurgeSafeIoOsError as ex:
|
372
|
+
raise SplurgeDsvFilePermissionError(f"File access error: {effective_file_path}") from ex
|
373
|
+
except Exception as ex:
|
374
|
+
# If the exception is already a SplurgeDsvError (or subclass),
|
375
|
+
# re-raise it unchanged so callers can handle specific errors
|
376
|
+
# (for example, SplurgeDsvColumnMismatchError from validation).
|
377
|
+
if isinstance(ex, SplurgeDsvError):
|
378
|
+
raise
|
379
|
+
|
380
|
+
raise SplurgeDsvError(f"Unexpected error reading file: {effective_file_path}") from ex
|
381
|
+
|
382
|
+
return cls.parses(
|
383
|
+
lines,
|
384
|
+
delimiter=delimiter,
|
385
|
+
strip=strip,
|
386
|
+
bookend=bookend,
|
387
|
+
bookend_strip=bookend_strip,
|
388
|
+
normalize_columns=normalize_columns,
|
389
|
+
raise_on_missing_columns=raise_on_missing_columns,
|
390
|
+
raise_on_extra_columns=raise_on_extra_columns,
|
391
|
+
detect_columns=detect_columns,
|
392
|
+
)
|
172
393
|
|
173
394
|
@classmethod
|
174
395
|
def _process_stream_chunk(
|
@@ -179,10 +400,13 @@ class DsvHelper:
|
|
179
400
|
strip: bool = DEFAULT_STRIP,
|
180
401
|
bookend: str | None = None,
|
181
402
|
bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
|
403
|
+
normalize_columns: int = 0,
|
404
|
+
raise_on_missing_columns: bool = False,
|
405
|
+
raise_on_extra_columns: bool = False,
|
182
406
|
) -> list[list[str]]:
|
183
407
|
"""Parse a chunk of lines into tokenized rows.
|
184
408
|
|
185
|
-
Designed to be used by :meth:`
|
409
|
+
Designed to be used by :meth:`parse_file_stream` as a helper for converting a
|
186
410
|
batch of raw lines into parsed rows.
|
187
411
|
|
188
412
|
Args:
|
@@ -191,15 +415,34 @@ class DsvHelper:
|
|
191
415
|
strip: If True, strip whitespace from tokens.
|
192
416
|
bookend: Optional bookend character to remove from tokens.
|
193
417
|
bookend_strip: If True, strip whitespace after removing bookends.
|
418
|
+
normalize_columns: If > 0, ensure each returned list has exactly this many columns,
|
419
|
+
padding with empty strings or truncating as needed.
|
420
|
+
raise_on_missing_columns: If True, raise an error if a line has fewer columns than ``normalize_columns``.
|
421
|
+
raise_on_extra_columns: If True, raise an error if a line has more columns than ``normalize_columns``.
|
422
|
+
|
423
|
+
Raises:
|
424
|
+
SplurgeDsvParameterError: If ``delimiter`` is empty or None,
|
425
|
+
or if ``normalize_columns`` is negative,
|
426
|
+
or if ``chunk`` is not a list of strings, or if any element in ``chunk`` is not a string.
|
427
|
+
SplurgeDsvColumnMismatchError: If column validation fails.
|
194
428
|
|
195
429
|
Returns:
|
196
430
|
A list where each element is the token list for a corresponding
|
197
431
|
input line from ``chunk``.
|
198
432
|
"""
|
199
|
-
return cls.parses(
|
433
|
+
return cls.parses(
|
434
|
+
chunk,
|
435
|
+
delimiter=delimiter,
|
436
|
+
strip=strip,
|
437
|
+
bookend=bookend,
|
438
|
+
bookend_strip=bookend_strip,
|
439
|
+
normalize_columns=normalize_columns,
|
440
|
+
raise_on_missing_columns=raise_on_missing_columns,
|
441
|
+
raise_on_extra_columns=raise_on_extra_columns,
|
442
|
+
)
|
200
443
|
|
201
444
|
@classmethod
|
202
|
-
def
|
445
|
+
def parse_file_stream(
|
203
446
|
cls,
|
204
447
|
file_path: PathLike[str] | str,
|
205
448
|
*,
|
@@ -210,10 +453,19 @@ class DsvHelper:
|
|
210
453
|
encoding: str = DEFAULT_ENCODING,
|
211
454
|
skip_header_rows: int = DEFAULT_SKIP_HEADER_ROWS,
|
212
455
|
skip_footer_rows: int = DEFAULT_SKIP_FOOTER_ROWS,
|
456
|
+
skip_empty_lines: bool = False,
|
457
|
+
normalize_columns: int = 0,
|
458
|
+
raise_on_missing_columns: bool = False,
|
459
|
+
raise_on_extra_columns: bool = False,
|
460
|
+
detect_columns: bool = False,
|
213
461
|
chunk_size: int = DEFAULT_CHUNK_SIZE,
|
462
|
+
# How many chunks to scan when attempting to detect normalize_columns
|
463
|
+
# from the beginning of a stream. Only used when
|
464
|
+
# `detect_columns is True` and `normalize_columns` is falsy.
|
465
|
+
max_detect_chunks: int = MAX_DETECT_CHUNKS,
|
214
466
|
) -> Iterator[list[list[str]]]:
|
215
467
|
"""
|
216
|
-
Stream-parse a DSV file
|
468
|
+
Stream-parse a DSV file into chunks of lines.
|
217
469
|
|
218
470
|
Args:
|
219
471
|
file_path (PathLike[str] | str): The path to the file to parse.
|
@@ -224,34 +476,156 @@ class DsvHelper:
|
|
224
476
|
encoding (str): The file encoding.
|
225
477
|
skip_header_rows (int): Number of header rows to skip.
|
226
478
|
skip_footer_rows (int): Number of footer rows to skip.
|
479
|
+
normalize_columns (int): If > 0, ensure each returned list has exactly this many columns,
|
480
|
+
padding with empty strings or truncating as needed.
|
481
|
+
raise_on_missing_columns (bool): If True, raise an error if a line has fewer columns than ``normalize_columns``.
|
482
|
+
raise_on_extra_columns (bool): If True, raise an error if a line has more columns than ``normalize_columns``.
|
483
|
+
detect_columns (bool): If True and ``normalize_columns`` is not set or <= 0,
|
484
|
+
detect the expected number of columns from the first non-blank logical row.
|
227
485
|
chunk_size (int): Number of lines per chunk (default: 100).
|
486
|
+
max_detect_chunks (int): When detecting columns, how many chunks to scan
|
487
|
+
from the start of the stream before giving up (default: 10).
|
228
488
|
|
229
489
|
Yields:
|
230
490
|
list[list[str]]: Parsed rows for each chunk.
|
231
491
|
|
232
492
|
Raises:
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
493
|
+
SplurgeDsvParameterError: If delimiter is empty or None, or if ``normalize_columns`` is negative,
|
494
|
+
or if ``chunk`` is not a list of strings, or if any element in ``chunk`` is not a string.
|
495
|
+
SplurgeDsvFileNotFoundError: If the file does not exist.
|
496
|
+
SplurgeDsvFilePermissionError: If the file cannot be accessed.
|
497
|
+
SplurgeDsvFileDecodingError: If the file cannot be decoded with the specified encoding.
|
498
|
+
SplurgeDsvPathValidationError: If the file path is invalid.
|
499
|
+
SplurgeDsvError: For other unexpected errors.
|
500
|
+
SplurgeDsvColumnMismatchError: If column validation fails.
|
237
501
|
"""
|
238
|
-
|
239
|
-
|
502
|
+
|
503
|
+
effective_file_path = cls._validate_file_path(Path(file_path))
|
240
504
|
|
241
505
|
chunk_size = max(chunk_size, cls.DEFAULT_MIN_CHUNK_SIZE)
|
242
506
|
skip_header_rows = max(skip_header_rows, cls.DEFAULT_SKIP_HEADER_ROWS)
|
243
507
|
skip_footer_rows = max(skip_footer_rows, cls.DEFAULT_SKIP_FOOTER_ROWS)
|
244
|
-
|
245
|
-
#
|
246
|
-
|
247
|
-
cls.
|
248
|
-
|
249
|
-
)
|
250
|
-
|
251
|
-
|
508
|
+
# Allow callers to pass None to use the module default. Ensure we have
|
509
|
+
# a positive integer to drive the detection loop.
|
510
|
+
if max_detect_chunks is None:
|
511
|
+
max_detect_chunks = cls.MAX_DETECT_CHUNKS
|
512
|
+
else:
|
513
|
+
max_detect_chunks = max(int(max_detect_chunks), 1)
|
514
|
+
|
515
|
+
try:
|
516
|
+
reader = safe_io_text_file_reader.SafeTextFileReader(
|
517
|
+
effective_file_path,
|
252
518
|
encoding=encoding,
|
253
|
-
|
254
|
-
|
519
|
+
skip_header_lines=skip_header_rows,
|
520
|
+
skip_footer_lines=skip_footer_rows,
|
521
|
+
strip=strip,
|
522
|
+
skip_empty_lines=skip_empty_lines,
|
255
523
|
chunk_size=chunk_size,
|
256
524
|
)
|
257
|
-
|
525
|
+
stream_iter = reader.read_as_stream()
|
526
|
+
|
527
|
+
if detect_columns and (not normalize_columns or normalize_columns <= 0):
|
528
|
+
# Buffer up to `max_detect_chunks` from the stream while
|
529
|
+
# searching for the first non-blank logical row. This allows us
|
530
|
+
# to detect the expected column count even if the first logical
|
531
|
+
# row doesn't appear in the very first chunk (for example,
|
532
|
+
# when the file begins with many blank lines or very small
|
533
|
+
# chunks).
|
534
|
+
buffered_chunks: list[list[str]] = []
|
535
|
+
max_scan = max_detect_chunks if max_detect_chunks is not None else cls.MAX_DETECT_CHUNKS
|
536
|
+
chunks_scanned = 0
|
537
|
+
|
538
|
+
while chunks_scanned < max_scan:
|
539
|
+
try:
|
540
|
+
chunk = next(stream_iter)
|
541
|
+
except StopIteration:
|
542
|
+
break
|
543
|
+
buffered_chunks.append(chunk)
|
544
|
+
|
545
|
+
# Inspect this chunk for the first non-blank logical row
|
546
|
+
first_line = None
|
547
|
+
for ln in chunk:
|
548
|
+
if isinstance(ln, str) and ln.strip() != "":
|
549
|
+
first_line = ln
|
550
|
+
break
|
551
|
+
|
552
|
+
if first_line is not None:
|
553
|
+
detected = cls.parse(
|
554
|
+
first_line,
|
555
|
+
delimiter=delimiter,
|
556
|
+
strip=strip,
|
557
|
+
bookend=bookend,
|
558
|
+
bookend_strip=bookend_strip,
|
559
|
+
normalize_columns=0,
|
560
|
+
raise_on_missing_columns=False,
|
561
|
+
raise_on_extra_columns=False,
|
562
|
+
)
|
563
|
+
normalize_columns = len(detected)
|
564
|
+
# remember which buffered chunk contained the first
|
565
|
+
# logical row so we can start applying normalization
|
566
|
+
# beginning with that chunk only
|
567
|
+
detected_index = len(buffered_chunks) - 1
|
568
|
+
break
|
569
|
+
|
570
|
+
chunks_scanned += 1
|
571
|
+
|
572
|
+
# Replay any buffered chunks (in order) so callers receive the
|
573
|
+
# full content starting at the beginning of the file. If we
|
574
|
+
# detected the first logical row in one of the buffered chunks
|
575
|
+
# then only apply normalization beginning with that chunk;
|
576
|
+
# earlier buffered chunks must be emitted without
|
577
|
+
# normalization so we don't convert blank-only lines into
|
578
|
+
# padded empty-token rows.
|
579
|
+
if "detected_index" in locals():
|
580
|
+
for idx, b in enumerate(buffered_chunks):
|
581
|
+
use_norm = normalize_columns if idx == detected_index else 0
|
582
|
+
yield cls._process_stream_chunk(
|
583
|
+
b,
|
584
|
+
delimiter=delimiter,
|
585
|
+
strip=strip,
|
586
|
+
bookend=bookend,
|
587
|
+
bookend_strip=bookend_strip,
|
588
|
+
normalize_columns=use_norm,
|
589
|
+
raise_on_missing_columns=raise_on_missing_columns,
|
590
|
+
raise_on_extra_columns=raise_on_extra_columns,
|
591
|
+
)
|
592
|
+
else:
|
593
|
+
for b in buffered_chunks:
|
594
|
+
yield cls._process_stream_chunk(
|
595
|
+
b,
|
596
|
+
delimiter=delimiter,
|
597
|
+
strip=strip,
|
598
|
+
bookend=bookend,
|
599
|
+
bookend_strip=bookend_strip,
|
600
|
+
normalize_columns=0,
|
601
|
+
raise_on_missing_columns=raise_on_missing_columns,
|
602
|
+
raise_on_extra_columns=raise_on_extra_columns,
|
603
|
+
)
|
604
|
+
|
605
|
+
# Continue streaming the rest of the file
|
606
|
+
for chunk in stream_iter:
|
607
|
+
yield cls._process_stream_chunk(
|
608
|
+
chunk,
|
609
|
+
delimiter=delimiter,
|
610
|
+
strip=strip,
|
611
|
+
bookend=bookend,
|
612
|
+
bookend_strip=bookend_strip,
|
613
|
+
normalize_columns=normalize_columns,
|
614
|
+
raise_on_missing_columns=raise_on_missing_columns,
|
615
|
+
raise_on_extra_columns=raise_on_extra_columns,
|
616
|
+
)
|
617
|
+
except safe_io_text_file_reader.SplurgeSafeIoFileDecodingError as ex:
|
618
|
+
raise SplurgeDsvFileDecodingError(f"File decoding error: {effective_file_path}") from ex
|
619
|
+
except safe_io_text_file_reader.SplurgeSafeIoFilePermissionError as ex:
|
620
|
+
raise SplurgeDsvFilePermissionError(f"File permission error: {effective_file_path}") from ex
|
621
|
+
except safe_io_text_file_reader.SplurgeSafeIoOsError as ex:
|
622
|
+
raise SplurgeDsvFilePermissionError(f"File access error: {effective_file_path}") from ex
|
623
|
+
except Exception as ex:
|
624
|
+
# Preserve and re-raise known SplurgeDsvError subclasses so
|
625
|
+
# callers can handle specific errors (e.g. column mismatch) as
|
626
|
+
# intended. Only wrap unknown exceptions in a generic
|
627
|
+
# SplurgeDsvError.
|
628
|
+
if isinstance(ex, SplurgeDsvError):
|
629
|
+
raise
|
630
|
+
|
631
|
+
raise SplurgeDsvError(f"Unexpected error reading file: {effective_file_path}") from ex
|
splurge_dsv/exceptions.py
CHANGED
@@ -65,6 +65,15 @@ class SplurgeDsvFileNotFoundError(SplurgeDsvFileOperationError):
|
|
65
65
|
"""
|
66
66
|
|
67
67
|
|
68
|
+
class SplurgeDsvFileExistsError(SplurgeDsvFileOperationError):
    """Signal an attempt to create a file that is already present.

    Mirrors the semantics of the built-in ``FileExistsError`` while staying
    inside the package's own exception hierarchy, so callers can tell file
    errors apart from other kinds of error.
    """
|
75
|
+
|
76
|
+
|
68
77
|
class SplurgeDsvFilePermissionError(SplurgeDsvFileOperationError):
|
69
78
|
"""Raised for permission or access-related file errors.
|
70
79
|
|
@@ -73,7 +82,7 @@ class SplurgeDsvFilePermissionError(SplurgeDsvFileOperationError):
|
|
73
82
|
"""
|
74
83
|
|
75
84
|
|
76
|
-
class
|
85
|
+
class SplurgeDsvFileDecodingError(SplurgeDsvFileOperationError):
|
77
86
|
"""Raised when decoding or encoding a text file fails.
|
78
87
|
|
79
88
|
The exception typically wraps the underlying decoding error and
|
@@ -81,6 +90,14 @@ class SplurgeDsvFileEncodingError(SplurgeDsvFileOperationError):
|
|
81
90
|
"""
|
82
91
|
|
83
92
|
|
93
|
+
class SplurgeDsvFileEncodingError(SplurgeDsvFileOperationError):
    """Signal a failure while encoding a text file.

    Typically wraps the underlying encoding error, adding a descriptive
    message and optional details for diagnostics.
    """
|
99
|
+
|
100
|
+
|
84
101
|
class SplurgeDsvPathValidationError(SplurgeDsvFileOperationError):
|
85
102
|
"""Raised when a provided filesystem path fails validation checks.
|
86
103
|
|
@@ -101,6 +118,10 @@ class SplurgeDsvParsingError(SplurgeDsvDataProcessingError):
|
|
101
118
|
"""Raised when parsing fails due to malformed or unexpected content."""
|
102
119
|
|
103
120
|
|
121
|
+
class SplurgeDsvColumnMismatchError(SplurgeDsvDataProcessingError):
    """Signal that a row's column count differs from the expected count."""
|
123
|
+
|
124
|
+
|
104
125
|
class SplurgeDsvTypeConversionError(SplurgeDsvDataProcessingError):
|
105
126
|
"""Raised when a value cannot be converted to the requested type."""
|
106
127
|
|