splurge-dsv 2025.2.1__py3-none-any.whl → 2025.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- splurge_dsv/__init__.py +16 -5
- splurge_dsv/cli.py +137 -26
- splurge_dsv/dsv.py +100 -30
- splurge_dsv/dsv_helper.py +415 -90
- splurge_dsv/exceptions.py +22 -1
- splurge_dsv/string_tokenizer.py +7 -1
- {splurge_dsv-2025.2.1.dist-info → splurge_dsv-2025.3.1.dist-info}/METADATA +78 -5
- splurge_dsv-2025.3.1.dist-info/RECORD +13 -0
- splurge_dsv/path_validator.py +0 -298
- splurge_dsv/safe_text_file_reader.py +0 -177
- splurge_dsv/safe_text_file_writer.py +0 -136
- splurge_dsv/text_file_helper.py +0 -240
- splurge_dsv-2025.2.1.dist-info/RECORD +0 -17
- {splurge_dsv-2025.2.1.dist-info → splurge_dsv-2025.3.1.dist-info}/WHEEL +0 -0
- {splurge_dsv-2025.2.1.dist-info → splurge_dsv-2025.3.1.dist-info}/entry_points.txt +0 -0
- {splurge_dsv-2025.2.1.dist-info → splurge_dsv-2025.3.1.dist-info}/licenses/LICENSE +0 -0
- {splurge_dsv-2025.2.1.dist-info → splurge_dsv-2025.3.1.dist-info}/top_level.txt +0 -0
splurge_dsv/dsv_helper.py
CHANGED
@@ -9,14 +9,25 @@ This module is licensed under the MIT License.
|
|
9
9
|
"""
|
10
10
|
|
11
11
|
# Standard library imports
|
12
|
-
import warnings
|
13
12
|
from collections.abc import Iterator
|
14
13
|
from os import PathLike
|
14
|
+
from pathlib import Path
|
15
|
+
|
16
|
+
import splurge_safe_io.constants as safe_io_constants
|
17
|
+
import splurge_safe_io.path_validator as safe_io_path_validator
|
18
|
+
import splurge_safe_io.safe_text_file_reader as safe_io_text_file_reader
|
15
19
|
|
16
20
|
# Local imports
|
17
|
-
from splurge_dsv.exceptions import
|
21
|
+
from splurge_dsv.exceptions import (
|
22
|
+
SplurgeDsvColumnMismatchError,
|
23
|
+
SplurgeDsvError,
|
24
|
+
SplurgeDsvFileDecodingError,
|
25
|
+
SplurgeDsvFileNotFoundError,
|
26
|
+
SplurgeDsvFilePermissionError,
|
27
|
+
SplurgeDsvParameterError,
|
28
|
+
SplurgeDsvPathValidationError,
|
29
|
+
)
|
18
30
|
from splurge_dsv.string_tokenizer import StringTokenizer
|
19
|
-
from splurge_dsv.text_file_helper import TextFileHelper
|
20
31
|
|
21
32
|
|
22
33
|
class DsvHelper:
|
@@ -27,22 +38,31 @@ class DsvHelper:
|
|
27
38
|
Supports configurable delimiters, text bookends, and whitespace handling options.
|
28
39
|
"""
|
29
40
|
|
30
|
-
DEFAULT_CHUNK_SIZE =
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
41
|
+
DEFAULT_CHUNK_SIZE = safe_io_constants.DEFAULT_CHUNK_SIZE
|
42
|
+
# When detecting normalize_columns across a stream, how many chunks to scan
|
43
|
+
# before giving up. Scanning more chunks increases work but helps if the
|
44
|
+
# first logical row starts later than the first chunk (e.g., many blank lines
|
45
|
+
# or very small chunks). Keep small by default to avoid buffering too much.
|
46
|
+
MAX_DETECT_CHUNKS = 10
|
47
|
+
DEFAULT_ENCODING = "utf-8"
|
48
|
+
DEFAULT_SKIP_HEADER_ROWS = 0
|
49
|
+
DEFAULT_SKIP_FOOTER_ROWS = 0
|
50
|
+
DEFAULT_MIN_CHUNK_SIZE = safe_io_constants.MIN_CHUNK_SIZE
|
35
51
|
DEFAULT_STRIP = True
|
36
52
|
DEFAULT_BOOKEND_STRIP = True
|
37
53
|
|
38
|
-
@
|
54
|
+
@classmethod
|
39
55
|
def parse(
|
56
|
+
cls,
|
40
57
|
content: str,
|
41
58
|
*,
|
42
59
|
delimiter: str,
|
43
60
|
strip: bool = DEFAULT_STRIP,
|
44
61
|
bookend: str | None = None,
|
45
62
|
bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
|
63
|
+
normalize_columns: int = 0,
|
64
|
+
raise_on_missing_columns: bool = False,
|
65
|
+
raise_on_extra_columns: bool = False,
|
46
66
|
) -> list[str]:
|
47
67
|
"""Parse a single DSV line into tokens.
|
48
68
|
|
@@ -57,12 +77,18 @@ class DsvHelper:
|
|
57
77
|
strip: If True, strip leading/trailing whitespace from each token.
|
58
78
|
bookend: Optional bookend character to remove from token ends.
|
59
79
|
bookend_strip: If True, strip whitespace after removing bookends.
|
80
|
+
normalize_columns: If > 0, ensure the returned list has exactly this many columns,
|
81
|
+
padding with empty strings or truncating as needed.
|
82
|
+
raise_on_missing_columns: If True, raise an error if the line has fewer columns than ``normalize_columns``.
|
83
|
+
raise_on_extra_columns: If True, raise an error if the line has more columns than ``normalize_columns``.
|
60
84
|
|
61
85
|
Returns:
|
62
86
|
A list of parsed token strings.
|
63
87
|
|
64
88
|
Raises:
|
65
89
|
SplurgeDsvParameterError: If ``delimiter`` is empty or None.
|
90
|
+
SplurgeDsvParameterError: If ``normalize_columns`` is negative.
|
91
|
+
SplurgeDsvColumnMismatchError: If column validation fails.
|
66
92
|
|
67
93
|
Examples:
|
68
94
|
>>> DsvHelper.parse("a,b,c", delimiter=",")
|
@@ -78,8 +104,79 @@ class DsvHelper:
|
|
78
104
|
if bookend:
|
79
105
|
tokens = [StringTokenizer.remove_bookends(token, bookend=bookend, strip=bookend_strip) for token in tokens]
|
80
106
|
|
107
|
+
# If requested, validate columns (raises) and/or normalize the row length
|
108
|
+
if normalize_columns and normalize_columns > 0:
|
109
|
+
# Validation is only performed if the caller asked for raises
|
110
|
+
if raise_on_missing_columns or raise_on_extra_columns:
|
111
|
+
cls._validate_columns(
|
112
|
+
len(tokens),
|
113
|
+
expected_columns=normalize_columns,
|
114
|
+
raise_on_missing_columns=raise_on_missing_columns,
|
115
|
+
raise_on_extra_columns=raise_on_extra_columns,
|
116
|
+
)
|
117
|
+
|
118
|
+
tokens = cls._normalize_columns(tokens, expected_columns=normalize_columns)
|
119
|
+
|
81
120
|
return tokens
|
82
121
|
|
122
|
+
@classmethod
|
123
|
+
def _normalize_columns(cls, row: list[str], *, expected_columns: int) -> list[str]:
|
124
|
+
"""Normalize a token list to the expected number of columns.
|
125
|
+
|
126
|
+
If the row has fewer columns than expected, append empty strings to reach
|
127
|
+
the expected length. If the row has more columns than expected, truncate
|
128
|
+
the excess columns.
|
129
|
+
|
130
|
+
Args:
|
131
|
+
row: The list of tokens to normalize.
|
132
|
+
expected_columns: Desired number of columns.
|
133
|
+
|
134
|
+
Returns:
|
135
|
+
A new list of tokens with length == expected_columns.
|
136
|
+
|
137
|
+
Raises:
|
138
|
+
SplurgeDsvParameterError: If ``expected_columns`` is negative.
|
139
|
+
"""
|
140
|
+
if expected_columns < 0:
|
141
|
+
raise SplurgeDsvParameterError("expected_columns must be non-negative")
|
142
|
+
|
143
|
+
current = len(row)
|
144
|
+
if current == expected_columns:
|
145
|
+
return row
|
146
|
+
if current < expected_columns:
|
147
|
+
# append empty strings
|
148
|
+
return row + [""] * (expected_columns - current)
|
149
|
+
# current > expected -> truncate
|
150
|
+
return row[:expected_columns]
|
151
|
+
|
152
|
+
@classmethod
|
153
|
+
def _validate_columns(
|
154
|
+
cls, actual_columns: int, *, expected_columns: int, raise_on_missing_columns: bool, raise_on_extra_columns: bool
|
155
|
+
) -> None:
|
156
|
+
"""Validate column count against expected_columns.
|
157
|
+
|
158
|
+
Raises a SplurgeDsvError (or a more specific subclass) when the
|
159
|
+
validation fails according to the provided flags.
|
160
|
+
|
161
|
+
Args:
|
162
|
+
actual_columns: The actual number of columns in the row.
|
163
|
+
expected_columns: The expected number of columns.
|
164
|
+
raise_on_missing_columns: If True, raise an error if actual_columns < expected_columns.
|
165
|
+
raise_on_extra_columns: If True, raise an error if actual_columns > expected_columns.
|
166
|
+
|
167
|
+
Raises:
|
168
|
+
SplurgeDsvColumnMismatchError: If column validation fails.
|
169
|
+
SplurgeDsvParameterError: If ``expected_columns`` is negative.
|
170
|
+
"""
|
171
|
+
if expected_columns < 0:
|
172
|
+
raise SplurgeDsvParameterError("expected_columns must be non-negative")
|
173
|
+
|
174
|
+
if raise_on_missing_columns and actual_columns < expected_columns:
|
175
|
+
raise SplurgeDsvColumnMismatchError(f"Row is missing columns: ({actual_columns} < {expected_columns})")
|
176
|
+
|
177
|
+
if raise_on_extra_columns and actual_columns > expected_columns:
|
178
|
+
raise SplurgeDsvColumnMismatchError(f"Row has extra columns: ({actual_columns} > {expected_columns})")
|
179
|
+
|
83
180
|
@classmethod
|
84
181
|
def parses(
|
85
182
|
cls,
|
@@ -89,6 +186,10 @@ class DsvHelper:
|
|
89
186
|
strip: bool = DEFAULT_STRIP,
|
90
187
|
bookend: str | None = None,
|
91
188
|
bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
|
189
|
+
normalize_columns: int = 0,
|
190
|
+
raise_on_missing_columns: bool = False,
|
191
|
+
raise_on_extra_columns: bool = False,
|
192
|
+
detect_columns: bool = False,
|
92
193
|
) -> list[list[str]]:
|
93
194
|
"""Parse multiple DSV lines.
|
94
195
|
|
@@ -101,13 +202,19 @@ class DsvHelper:
|
|
101
202
|
strip: If True, strip whitespace from tokens.
|
102
203
|
bookend: Optional bookend character to remove from tokens.
|
103
204
|
bookend_strip: If True, strip whitespace after removing bookends.
|
205
|
+
normalize_columns: If > 0, ensure each returned list has exactly this many columns,
|
206
|
+
padding with empty strings or truncating as needed.
|
207
|
+
raise_on_missing_columns: If True, raise an error if a line has fewer columns than ``normalize_columns``.
|
208
|
+
raise_on_extra_columns: If True, raise an error if a line has more columns than ``normalize_columns``.
|
209
|
+
detect_columns: If True and ``normalize_columns`` is not set or <= 0, detect the number of columns from the content.
|
104
210
|
|
105
211
|
Returns:
|
106
212
|
A list of token lists, one per input line.
|
107
213
|
|
108
214
|
Raises:
|
109
|
-
SplurgeDsvParameterError: If ``content`` is not a list of strings or
|
110
|
-
if ``delimiter`` is empty or None.
|
215
|
+
SplurgeDsvParameterError: If ``content`` is not a list of strings, or
|
216
|
+
if ``delimiter`` is empty or None, or if ``normalize_columns`` is negative.
|
217
|
+
SplurgeDsvColumnMismatchError: If column validation fails.
|
111
218
|
|
112
219
|
Example:
|
113
220
|
>>> DsvHelper.parses(["a,b,c", "d,e,f"], delimiter=",")
|
@@ -119,15 +226,82 @@ class DsvHelper:
|
|
119
226
|
if not all(isinstance(item, str) for item in content):
|
120
227
|
raise SplurgeDsvParameterError("content must be a list of strings")
|
121
228
|
|
229
|
+
# If requested, detect expected columns from the first logical row
|
230
|
+
if detect_columns and (not normalize_columns or normalize_columns <= 0):
|
231
|
+
if not content:
|
232
|
+
return []
|
233
|
+
# Find the first non-blank logical row in the provided content
|
234
|
+
first_non_blank = None
|
235
|
+
for ln in content:
|
236
|
+
if isinstance(ln, str) and ln.strip() != "":
|
237
|
+
first_non_blank = ln
|
238
|
+
break
|
239
|
+
if first_non_blank is None:
|
240
|
+
return []
|
241
|
+
|
242
|
+
detected = cls.parse(
|
243
|
+
first_non_blank,
|
244
|
+
delimiter=delimiter,
|
245
|
+
strip=strip,
|
246
|
+
bookend=bookend,
|
247
|
+
bookend_strip=bookend_strip,
|
248
|
+
normalize_columns=0,
|
249
|
+
raise_on_missing_columns=False,
|
250
|
+
raise_on_extra_columns=False,
|
251
|
+
)
|
252
|
+
normalize_columns = len(detected)
|
253
|
+
|
122
254
|
return [
|
123
|
-
cls.parse(
|
255
|
+
cls.parse(
|
256
|
+
item,
|
257
|
+
delimiter=delimiter,
|
258
|
+
strip=strip,
|
259
|
+
bookend=bookend,
|
260
|
+
bookend_strip=bookend_strip,
|
261
|
+
normalize_columns=normalize_columns,
|
262
|
+
raise_on_missing_columns=raise_on_missing_columns,
|
263
|
+
raise_on_extra_columns=raise_on_extra_columns,
|
264
|
+
)
|
124
265
|
for item in content
|
125
266
|
]
|
126
267
|
|
268
|
+
@staticmethod
def _validate_file_path(
    file_path: Path | str, *, must_exist: bool = True, must_be_file: bool = True, must_be_readable: bool = True
) -> Path:
    """Validate ``file_path`` and translate validator errors into package errors.

    The path is wrapped in ``Path`` and handed to
    ``safe_io_path_validator.PathValidator.validate_path``. Exceptions
    raised by that call are re-raised as the matching ``SplurgeDsv*``
    exception, chained to the original via ``from``.

    Args:
        file_path: The file path to validate.
        must_exist: Forwarded unchanged to ``validate_path``.
        must_be_file: Forwarded unchanged to ``validate_path``.
        must_be_readable: Forwarded unchanged to ``validate_path``.

    Returns:
        The value returned by ``PathValidator.validate_path``.

    Raises:
        SplurgeDsvPathValidationError: If the file path is invalid.
        SplurgeDsvFileNotFoundError: If the file does not exist.
        SplurgeDsvFilePermissionError: If the file cannot be accessed due to permission restrictions.
        SplurgeDsvError: For other unexpected errors.
    """
    try:
        # ``Path(...)`` stays inside the try so that a bad ``file_path``
        # value is also reported as a SplurgeDsvError.
        validated = safe_io_path_validator.PathValidator.validate_path(
            Path(file_path),
            must_exist=must_exist,
            must_be_file=must_be_file,
            must_be_readable=must_be_readable,
        )
    except safe_io_path_validator.SplurgeSafeIoPathValidationError as ex:
        raise SplurgeDsvPathValidationError(f"Invalid file path: {file_path}") from ex
    except safe_io_path_validator.SplurgeSafeIoFileNotFoundError as ex:
        raise SplurgeDsvFileNotFoundError(f"File not found: {file_path}") from ex
    except safe_io_path_validator.SplurgeSafeIoFilePermissionError as ex:
        raise SplurgeDsvFilePermissionError(f"File permission error: {file_path}") from ex
    except Exception as ex:
        raise SplurgeDsvError(f"Unexpected error validating file path: {file_path}") from ex
    else:
        return validated
|
300
|
+
|
127
301
|
@classmethod
|
128
302
|
def parse_file(
|
129
303
|
cls,
|
130
|
-
file_path: PathLike[str] | str,
|
304
|
+
file_path: PathLike[str] | Path | str,
|
131
305
|
*,
|
132
306
|
delimiter: str,
|
133
307
|
strip: bool = DEFAULT_STRIP,
|
@@ -136,11 +310,16 @@ class DsvHelper:
|
|
136
310
|
encoding: str = DEFAULT_ENCODING,
|
137
311
|
skip_header_rows: int = DEFAULT_SKIP_HEADER_ROWS,
|
138
312
|
skip_footer_rows: int = DEFAULT_SKIP_FOOTER_ROWS,
|
313
|
+
skip_empty_lines: bool = False,
|
314
|
+
normalize_columns: int = 0,
|
315
|
+
raise_on_missing_columns: bool = False,
|
316
|
+
raise_on_extra_columns: bool = False,
|
317
|
+
detect_columns: bool = False,
|
139
318
|
) -> list[list[str]]:
|
140
319
|
"""Read and parse an entire DSV file.
|
141
320
|
|
142
321
|
This convenience reads all lines from ``file_path`` using
|
143
|
-
:class:`
|
322
|
+
:class:`splurge_safe_io.safe_text_file_reader.SafeTextFileReader` and then parses each
|
144
323
|
line into tokens. Header and footer rows may be skipped via the
|
145
324
|
``skip_header_rows`` and ``skip_footer_rows`` parameters.
|
146
325
|
|
@@ -153,23 +332,64 @@ class DsvHelper:
|
|
153
332
|
encoding: Text encoding to use when reading the file.
|
154
333
|
skip_header_rows: Number of leading lines to ignore.
|
155
334
|
skip_footer_rows: Number of trailing lines to ignore.
|
335
|
+
normalize_columns: If > 0, ensure each returned list has exactly this many columns, padding with empty strings or truncating as needed.
|
336
|
+
raise_on_missing_columns: Raise an error if a line has fewer columns than ``normalize_columns``.
|
337
|
+
raise_on_extra_columns: Raise an error if a line has more columns than ``normalize_columns``.
|
156
338
|
|
157
339
|
Returns:
|
158
340
|
A list of token lists (one list per non-skipped line).
|
159
341
|
|
160
342
|
Raises:
|
161
|
-
SplurgeDsvParameterError: If ``delimiter`` is empty or None.
|
343
|
+
SplurgeDsvParameterError: If ``delimiter`` is empty or None, or if ``normalize_columns`` is negative.
|
162
344
|
SplurgeDsvFileNotFoundError: If the file at ``file_path`` does not exist.
|
163
|
-
SplurgeDsvFilePermissionError: If the file cannot be accessed due to
|
164
|
-
|
165
|
-
|
166
|
-
|
345
|
+
SplurgeDsvFilePermissionError: If the file cannot be accessed due to permission restrictions.
|
346
|
+
SplurgeDsvFileDecodingError: If the file cannot be decoded using the provided ``encoding``.
|
347
|
+
SplurgeDsvPathValidationError: If the file path is invalid.
|
348
|
+
SplurgeDsvColumnMismatchError: If column validation fails.
|
349
|
+
SplurgeDsvError: For other unexpected errors.
|
167
350
|
"""
|
168
|
-
|
169
|
-
file_path, encoding=encoding, skip_header_rows=skip_header_rows, skip_footer_rows=skip_footer_rows
|
170
|
-
)
|
351
|
+
effective_file_path = cls._validate_file_path(Path(file_path))
|
171
352
|
|
172
|
-
|
353
|
+
skip_header_rows = max(skip_header_rows, cls.DEFAULT_SKIP_HEADER_ROWS)
|
354
|
+
skip_footer_rows = max(skip_footer_rows, cls.DEFAULT_SKIP_FOOTER_ROWS)
|
355
|
+
|
356
|
+
try:
|
357
|
+
reader = safe_io_text_file_reader.SafeTextFileReader(
|
358
|
+
effective_file_path,
|
359
|
+
encoding=encoding,
|
360
|
+
skip_header_lines=skip_header_rows,
|
361
|
+
skip_footer_lines=skip_footer_rows,
|
362
|
+
strip=strip,
|
363
|
+
skip_empty_lines=skip_empty_lines,
|
364
|
+
)
|
365
|
+
lines: list[str] = reader.read()
|
366
|
+
|
367
|
+
except safe_io_text_file_reader.SplurgeSafeIoFileDecodingError as ex:
|
368
|
+
raise SplurgeDsvFileDecodingError(f"File decoding error: {effective_file_path}") from ex
|
369
|
+
except safe_io_text_file_reader.SplurgeSafeIoFilePermissionError as ex:
|
370
|
+
raise SplurgeDsvFilePermissionError(f"File permission error: {effective_file_path}") from ex
|
371
|
+
except safe_io_text_file_reader.SplurgeSafeIoOsError as ex:
|
372
|
+
raise SplurgeDsvFilePermissionError(f"File access error: {effective_file_path}") from ex
|
373
|
+
except Exception as ex:
|
374
|
+
# If the exception is already a SplurgeDsvError (or subclass),
|
375
|
+
# re-raise it unchanged so callers can handle specific errors
|
376
|
+
# (for example, SplurgeDsvColumnMismatchError from validation).
|
377
|
+
if isinstance(ex, SplurgeDsvError):
|
378
|
+
raise
|
379
|
+
|
380
|
+
raise SplurgeDsvError(f"Unexpected error reading file: {effective_file_path}") from ex
|
381
|
+
|
382
|
+
return cls.parses(
|
383
|
+
lines,
|
384
|
+
delimiter=delimiter,
|
385
|
+
strip=strip,
|
386
|
+
bookend=bookend,
|
387
|
+
bookend_strip=bookend_strip,
|
388
|
+
normalize_columns=normalize_columns,
|
389
|
+
raise_on_missing_columns=raise_on_missing_columns,
|
390
|
+
raise_on_extra_columns=raise_on_extra_columns,
|
391
|
+
detect_columns=detect_columns,
|
392
|
+
)
|
173
393
|
|
174
394
|
@classmethod
|
175
395
|
def _process_stream_chunk(
|
@@ -180,10 +400,13 @@ class DsvHelper:
|
|
180
400
|
strip: bool = DEFAULT_STRIP,
|
181
401
|
bookend: str | None = None,
|
182
402
|
bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
|
403
|
+
normalize_columns: int = 0,
|
404
|
+
raise_on_missing_columns: bool = False,
|
405
|
+
raise_on_extra_columns: bool = False,
|
183
406
|
) -> list[list[str]]:
|
184
407
|
"""Parse a chunk of lines into tokenized rows.
|
185
408
|
|
186
|
-
Designed to be used by :meth:`
|
409
|
+
Designed to be used by :meth:`parse_file_stream` as a helper for converting a
|
187
410
|
batch of raw lines into parsed rows.
|
188
411
|
|
189
412
|
Args:
|
@@ -192,12 +415,31 @@ class DsvHelper:
|
|
192
415
|
strip: If True, strip whitespace from tokens.
|
193
416
|
bookend: Optional bookend character to remove from tokens.
|
194
417
|
bookend_strip: If True, strip whitespace after removing bookends.
|
418
|
+
normalize_columns: If > 0, ensure each returned list has exactly this many columns,
|
419
|
+
padding with empty strings or truncating as needed.
|
420
|
+
raise_on_missing_columns: If True, raise an error if a line has fewer columns than ``normalize_columns``.
|
421
|
+
raise_on_extra_columns: If True, raise an error if a line has more columns than ``normalize_columns``.
|
422
|
+
|
423
|
+
Raises:
|
424
|
+
SplurgeDsvParameterError: If ``delimiter`` is empty or None,
|
425
|
+
or if ``normalize_columns`` is negative,
|
426
|
+
or if ``chunk`` is not a list of strings, or if any element in ``chunk`` is not a string.
|
427
|
+
SplurgeDsvColumnMismatchError: If column validation fails.
|
195
428
|
|
196
429
|
Returns:
|
197
430
|
A list where each element is the token list for a corresponding
|
198
431
|
input line from ``chunk``.
|
199
432
|
"""
|
200
|
-
return cls.parses(
|
433
|
+
return cls.parses(
|
434
|
+
chunk,
|
435
|
+
delimiter=delimiter,
|
436
|
+
strip=strip,
|
437
|
+
bookend=bookend,
|
438
|
+
bookend_strip=bookend_strip,
|
439
|
+
normalize_columns=normalize_columns,
|
440
|
+
raise_on_missing_columns=raise_on_missing_columns,
|
441
|
+
raise_on_extra_columns=raise_on_extra_columns,
|
442
|
+
)
|
201
443
|
|
202
444
|
@classmethod
|
203
445
|
def parse_file_stream(
|
@@ -211,7 +453,16 @@ class DsvHelper:
|
|
211
453
|
encoding: str = DEFAULT_ENCODING,
|
212
454
|
skip_header_rows: int = DEFAULT_SKIP_HEADER_ROWS,
|
213
455
|
skip_footer_rows: int = DEFAULT_SKIP_FOOTER_ROWS,
|
456
|
+
skip_empty_lines: bool = False,
|
457
|
+
normalize_columns: int = 0,
|
458
|
+
raise_on_missing_columns: bool = False,
|
459
|
+
raise_on_extra_columns: bool = False,
|
460
|
+
detect_columns: bool = False,
|
214
461
|
chunk_size: int = DEFAULT_CHUNK_SIZE,
|
462
|
+
# How many chunks to scan when attempting to detect normalize_columns
|
463
|
+
# from the beginning of a stream. Only used when
|
464
|
+
# `detect_columns is True` and `normalize_columns` is falsy.
|
465
|
+
max_detect_chunks: int = MAX_DETECT_CHUNKS,
|
215
466
|
) -> Iterator[list[list[str]]]:
|
216
467
|
"""
|
217
468
|
Stream-parse a DSV file into chunks of lines.
|
@@ -225,82 +476,156 @@ class DsvHelper:
|
|
225
476
|
encoding (str): The file encoding.
|
226
477
|
skip_header_rows (int): Number of header rows to skip.
|
227
478
|
skip_footer_rows (int): Number of footer rows to skip.
|
479
|
+
normalize_columns (int): If > 0, ensure each returned list has exactly this many columns,
|
480
|
+
padding with empty strings or truncating as needed.
|
481
|
+
raise_on_missing_columns (bool): If True, raise an error if a line has fewer columns than ``normalize_columns``.
|
482
|
+
raise_on_extra_columns (bool): If True, raise an error if a line has more columns than ``normalize_columns``.
|
483
|
+
detect_columns (bool): If True and ``normalize_columns`` is not set or <= 0,
|
484
|
+
detect the expected number of columns from the first non-blank logical row.
|
228
485
|
chunk_size (int): Number of lines per chunk (default: ``DEFAULT_CHUNK_SIZE``).
|
486
|
+
max_detect_chunks (int): When detecting columns, how many chunks to scan
|
487
|
+
from the start of the stream before giving up (default: 10).
|
229
488
|
|
230
489
|
Yields:
|
231
490
|
list[list[str]]: Parsed rows for each chunk.
|
232
491
|
|
233
492
|
Raises:
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
493
|
+
SplurgeDsvParameterError: If delimiter is empty or None, or if ``normalize_columns`` is negative,
|
494
|
+
or if ``chunk`` is not a list of strings, or if any element in ``chunk`` is not a string.
|
495
|
+
SplurgeDsvFileNotFoundError: If the file does not exist.
|
496
|
+
SplurgeDsvFilePermissionError: If the file cannot be accessed.
|
497
|
+
SplurgeDsvFileDecodingError: If the file cannot be decoded with the specified encoding.
|
498
|
+
SplurgeDsvPathValidationError: If the file path is invalid.
|
499
|
+
SplurgeDsvError: For other unexpected errors.
|
500
|
+
SplurgeDsvColumnMismatchError: If column validation fails.
|
238
501
|
"""
|
239
|
-
|
240
|
-
|
502
|
+
|
503
|
+
effective_file_path = cls._validate_file_path(Path(file_path))
|
241
504
|
|
242
505
|
chunk_size = max(chunk_size, cls.DEFAULT_MIN_CHUNK_SIZE)
|
243
506
|
skip_header_rows = max(skip_header_rows, cls.DEFAULT_SKIP_HEADER_ROWS)
|
244
507
|
skip_footer_rows = max(skip_footer_rows, cls.DEFAULT_SKIP_FOOTER_ROWS)
|
245
|
-
|
246
|
-
#
|
247
|
-
|
248
|
-
cls.
|
249
|
-
|
250
|
-
)
|
251
|
-
|
252
|
-
|
508
|
+
# Allow callers to pass None to use the module default. Ensure we have
|
509
|
+
# a positive integer to drive the detection loop.
|
510
|
+
if max_detect_chunks is None:
|
511
|
+
max_detect_chunks = cls.MAX_DETECT_CHUNKS
|
512
|
+
else:
|
513
|
+
max_detect_chunks = max(int(max_detect_chunks), 1)
|
514
|
+
|
515
|
+
try:
|
516
|
+
reader = safe_io_text_file_reader.SafeTextFileReader(
|
517
|
+
effective_file_path,
|
253
518
|
encoding=encoding,
|
254
|
-
|
255
|
-
|
519
|
+
skip_header_lines=skip_header_rows,
|
520
|
+
skip_footer_lines=skip_footer_rows,
|
521
|
+
strip=strip,
|
522
|
+
skip_empty_lines=skip_empty_lines,
|
256
523
|
chunk_size=chunk_size,
|
257
524
|
)
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
525
|
+
stream_iter = reader.read_as_stream()
|
526
|
+
|
527
|
+
if detect_columns and (not normalize_columns or normalize_columns <= 0):
|
528
|
+
# Buffer up to `max_detect_chunks` from the stream while
|
529
|
+
# searching for the first non-blank logical row. This allows us
|
530
|
+
# to detect the expected column count even if the first logical
|
531
|
+
# row doesn't appear in the very first chunk (for example,
|
532
|
+
# when the file begins with many blank lines or very small
|
533
|
+
# chunks).
|
534
|
+
buffered_chunks: list[list[str]] = []
|
535
|
+
max_scan = max_detect_chunks if max_detect_chunks is not None else cls.MAX_DETECT_CHUNKS
|
536
|
+
chunks_scanned = 0
|
537
|
+
|
538
|
+
while chunks_scanned < max_scan:
|
539
|
+
try:
|
540
|
+
chunk = next(stream_iter)
|
541
|
+
except StopIteration:
|
542
|
+
break
|
543
|
+
buffered_chunks.append(chunk)
|
544
|
+
|
545
|
+
# Inspect this chunk for the first non-blank logical row
|
546
|
+
first_line = None
|
547
|
+
for ln in chunk:
|
548
|
+
if isinstance(ln, str) and ln.strip() != "":
|
549
|
+
first_line = ln
|
550
|
+
break
|
551
|
+
|
552
|
+
if first_line is not None:
|
553
|
+
detected = cls.parse(
|
554
|
+
first_line,
|
555
|
+
delimiter=delimiter,
|
556
|
+
strip=strip,
|
557
|
+
bookend=bookend,
|
558
|
+
bookend_strip=bookend_strip,
|
559
|
+
normalize_columns=0,
|
560
|
+
raise_on_missing_columns=False,
|
561
|
+
raise_on_extra_columns=False,
|
562
|
+
)
|
563
|
+
normalize_columns = len(detected)
|
564
|
+
# remember which buffered chunk contained the first
|
565
|
+
# logical row so we can start applying normalization
|
566
|
+
# beginning with that chunk only
|
567
|
+
detected_index = len(buffered_chunks) - 1
|
568
|
+
break
|
569
|
+
|
570
|
+
chunks_scanned += 1
|
571
|
+
|
572
|
+
# Replay any buffered chunks (in order) so callers receive the
|
573
|
+
# full content starting at the beginning of the file. If we
|
574
|
+
# detected the first logical row in one of the buffered chunks
|
575
|
+
# then only apply normalization beginning with that chunk;
|
576
|
+
# earlier buffered chunks must be emitted without
|
577
|
+
# normalization so we don't convert blank-only lines into
|
578
|
+
# padded empty-token rows.
|
579
|
+
if "detected_index" in locals():
|
580
|
+
for idx, b in enumerate(buffered_chunks):
|
581
|
+
use_norm = normalize_columns if idx == detected_index else 0
|
582
|
+
yield cls._process_stream_chunk(
|
583
|
+
b,
|
584
|
+
delimiter=delimiter,
|
585
|
+
strip=strip,
|
586
|
+
bookend=bookend,
|
587
|
+
bookend_strip=bookend_strip,
|
588
|
+
normalize_columns=use_norm,
|
589
|
+
raise_on_missing_columns=raise_on_missing_columns,
|
590
|
+
raise_on_extra_columns=raise_on_extra_columns,
|
591
|
+
)
|
592
|
+
else:
|
593
|
+
for b in buffered_chunks:
|
594
|
+
yield cls._process_stream_chunk(
|
595
|
+
b,
|
596
|
+
delimiter=delimiter,
|
597
|
+
strip=strip,
|
598
|
+
bookend=bookend,
|
599
|
+
bookend_strip=bookend_strip,
|
600
|
+
normalize_columns=0,
|
601
|
+
raise_on_missing_columns=raise_on_missing_columns,
|
602
|
+
raise_on_extra_columns=raise_on_extra_columns,
|
603
|
+
)
|
604
|
+
|
605
|
+
# Continue streaming the rest of the file
|
606
|
+
for chunk in stream_iter:
|
607
|
+
yield cls._process_stream_chunk(
|
608
|
+
chunk,
|
609
|
+
delimiter=delimiter,
|
610
|
+
strip=strip,
|
611
|
+
bookend=bookend,
|
612
|
+
bookend_strip=bookend_strip,
|
613
|
+
normalize_columns=normalize_columns,
|
614
|
+
raise_on_missing_columns=raise_on_missing_columns,
|
615
|
+
raise_on_extra_columns=raise_on_extra_columns,
|
616
|
+
)
|
617
|
+
except safe_io_text_file_reader.SplurgeSafeIoFileDecodingError as ex:
|
618
|
+
raise SplurgeDsvFileDecodingError(f"File decoding error: {effective_file_path}") from ex
|
619
|
+
except safe_io_text_file_reader.SplurgeSafeIoFilePermissionError as ex:
|
620
|
+
raise SplurgeDsvFilePermissionError(f"File permission error: {effective_file_path}") from ex
|
621
|
+
except safe_io_text_file_reader.SplurgeSafeIoOsError as ex:
|
622
|
+
raise SplurgeDsvFilePermissionError(f"File access error: {effective_file_path}") from ex
|
623
|
+
except Exception as ex:
|
624
|
+
# Preserve and re-raise known SplurgeDsvError subclasses so
|
625
|
+
# callers can handle specific errors (e.g. column mismatch) as
|
626
|
+
# intended. Only wrap unknown exceptions in a generic
|
627
|
+
# SplurgeDsvError.
|
628
|
+
if isinstance(ex, SplurgeDsvError):
|
629
|
+
raise
|
630
|
+
|
631
|
+
raise SplurgeDsvError(f"Unexpected error reading file: {effective_file_path}") from ex
|