splurge-dsv 2025.2.1__py3-none-any.whl → 2025.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
splurge_dsv/dsv_helper.py CHANGED
@@ -9,14 +9,25 @@ This module is licensed under the MIT License.
9
9
  """
10
10
 
11
11
  # Standard library imports
12
- import warnings
13
12
  from collections.abc import Iterator
14
13
  from os import PathLike
14
+ from pathlib import Path
15
+
16
+ import splurge_safe_io.constants as safe_io_constants
17
+ import splurge_safe_io.path_validator as safe_io_path_validator
18
+ import splurge_safe_io.safe_text_file_reader as safe_io_text_file_reader
15
19
 
16
20
  # Local imports
17
- from splurge_dsv.exceptions import SplurgeDsvParameterError
21
+ from splurge_dsv.exceptions import (
22
+ SplurgeDsvColumnMismatchError,
23
+ SplurgeDsvError,
24
+ SplurgeDsvFileDecodingError,
25
+ SplurgeDsvFileNotFoundError,
26
+ SplurgeDsvFilePermissionError,
27
+ SplurgeDsvParameterError,
28
+ SplurgeDsvPathValidationError,
29
+ )
18
30
  from splurge_dsv.string_tokenizer import StringTokenizer
19
- from splurge_dsv.text_file_helper import TextFileHelper
20
31
 
21
32
 
22
33
  class DsvHelper:
@@ -27,22 +38,31 @@ class DsvHelper:
27
38
  Supports configurable delimiters, text bookends, and whitespace handling options.
28
39
  """
29
40
 
30
- DEFAULT_CHUNK_SIZE = 500 # Default chunk size for streaming operations
31
- DEFAULT_ENCODING = "utf-8" # Default text encoding for file operations
32
- DEFAULT_SKIP_HEADER_ROWS = 0 # Default number of header rows to skip
33
- DEFAULT_SKIP_FOOTER_ROWS = 0 # Default number of footer rows to skip
34
- DEFAULT_MIN_CHUNK_SIZE = 100
41
+ DEFAULT_CHUNK_SIZE = safe_io_constants.DEFAULT_CHUNK_SIZE
42
+ # When detecting normalize_columns across a stream, how many chunks to scan
43
+ # before giving up. Scanning more chunks increases work but helps if the
44
+ # first logical row starts later than the first chunk (e.g., many blank lines
45
+ # or very small chunks). Keep small by default to avoid buffering too much.
46
+ MAX_DETECT_CHUNKS = 10
47
+ DEFAULT_ENCODING = "utf-8"
48
+ DEFAULT_SKIP_HEADER_ROWS = 0
49
+ DEFAULT_SKIP_FOOTER_ROWS = 0
50
+ DEFAULT_MIN_CHUNK_SIZE = safe_io_constants.MIN_CHUNK_SIZE
35
51
  DEFAULT_STRIP = True
36
52
  DEFAULT_BOOKEND_STRIP = True
37
53
 
38
- @staticmethod
54
+ @classmethod
39
55
  def parse(
56
+ cls,
40
57
  content: str,
41
58
  *,
42
59
  delimiter: str,
43
60
  strip: bool = DEFAULT_STRIP,
44
61
  bookend: str | None = None,
45
62
  bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
63
+ normalize_columns: int = 0,
64
+ raise_on_missing_columns: bool = False,
65
+ raise_on_extra_columns: bool = False,
46
66
  ) -> list[str]:
47
67
  """Parse a single DSV line into tokens.
48
68
 
@@ -57,12 +77,18 @@ class DsvHelper:
57
77
  strip: If True, strip leading/trailing whitespace from each token.
58
78
  bookend: Optional bookend character to remove from token ends.
59
79
  bookend_strip: If True, strip whitespace after removing bookends.
80
+ normalize_columns: If > 0, ensure the returned list has exactly this many columns,
81
+ padding with empty strings or truncating as needed.
82
+ raise_on_missing_columns: If True, raise an error if the line has fewer columns than ``normalize_columns``.
83
+ raise_on_extra_columns: If True, raise an error if the line has more columns than ``normalize_columns``.
60
84
 
61
85
  Returns:
62
86
  A list of parsed token strings.
63
87
 
64
88
  Raises:
65
89
  SplurgeDsvParameterError: If ``delimiter`` is empty or None.
90
+ SplurgeDsvParameterError: If ``normalize_columns`` is negative.
91
+ SplurgeDsvColumnMismatchError: If column validation fails.
66
92
 
67
93
  Examples:
68
94
  >>> DsvHelper.parse("a,b,c", delimiter=",")
@@ -78,8 +104,79 @@ class DsvHelper:
78
104
  if bookend:
79
105
  tokens = [StringTokenizer.remove_bookends(token, bookend=bookend, strip=bookend_strip) for token in tokens]
80
106
 
107
+ # If requested, validate columns (raises) and/or normalize the row length
108
+ if normalize_columns and normalize_columns > 0:
109
+ # Validation is only performed if the caller asked for raises
110
+ if raise_on_missing_columns or raise_on_extra_columns:
111
+ cls._validate_columns(
112
+ len(tokens),
113
+ expected_columns=normalize_columns,
114
+ raise_on_missing_columns=raise_on_missing_columns,
115
+ raise_on_extra_columns=raise_on_extra_columns,
116
+ )
117
+
118
+ tokens = cls._normalize_columns(tokens, expected_columns=normalize_columns)
119
+
81
120
  return tokens
82
121
 
122
+ @classmethod
123
+ def _normalize_columns(cls, row: list[str], *, expected_columns: int) -> list[str]:
124
+ """Normalize a token list to the expected number of columns.
125
+
126
+ If the row has fewer columns than expected, append empty strings to reach
127
+ the expected length. If the row has more columns than expected, truncate
128
+ the excess columns.
129
+
130
+ Args:
131
+ row: The list of tokens to normalize.
132
+ expected_columns: Desired number of columns.
133
+
134
+ Returns:
135
+ A new list of tokens with length == expected_columns.
136
+
137
+ Raises:
138
+ SplurgeDsvParameterError: If ``expected_columns`` is negative.
139
+ """
140
+ if expected_columns < 0:
141
+ raise SplurgeDsvParameterError("expected_columns must be non-negative")
142
+
143
+ current = len(row)
144
+ if current == expected_columns:
145
+ return row
146
+ if current < expected_columns:
147
+ # append empty strings
148
+ return row + [""] * (expected_columns - current)
149
+ # current > expected -> truncate
150
+ return row[:expected_columns]
151
+
152
+ @classmethod
153
+ def _validate_columns(
154
+ cls, actual_columns: int, *, expected_columns: int, raise_on_missing_columns: bool, raise_on_extra_columns: bool
155
+ ) -> None:
156
+ """Validate column count against expected_columns.
157
+
158
+ Raises a SplurgeDsvError (or a more specific subclass) when the
159
+ validation fails according to the provided flags.
160
+
161
+ Args:
162
+ actual_columns: The actual number of columns in the row.
163
+ expected_columns: The expected number of columns.
164
+ raise_on_missing_columns: If True, raise an error if actual_columns < expected_columns.
165
+ raise_on_extra_columns: If True, raise an error if actual_columns > expected_columns.
166
+
167
+ Raises:
168
+ SplurgeDsvColumnMismatchError: If column validation fails.
169
+ SplurgeDsvParameterError: If ``expected_columns`` is negative.
170
+ """
171
+ if expected_columns < 0:
172
+ raise SplurgeDsvParameterError("expected_columns must be non-negative")
173
+
174
+ if raise_on_missing_columns and actual_columns < expected_columns:
175
+ raise SplurgeDsvColumnMismatchError(f"Row is missing columns: ({actual_columns} < {expected_columns})")
176
+
177
+ if raise_on_extra_columns and actual_columns > expected_columns:
178
+ raise SplurgeDsvColumnMismatchError(f"Row has extra columns: ({actual_columns} > {expected_columns})")
179
+
83
180
  @classmethod
84
181
  def parses(
85
182
  cls,
@@ -89,6 +186,10 @@ class DsvHelper:
89
186
  strip: bool = DEFAULT_STRIP,
90
187
  bookend: str | None = None,
91
188
  bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
189
+ normalize_columns: int = 0,
190
+ raise_on_missing_columns: bool = False,
191
+ raise_on_extra_columns: bool = False,
192
+ detect_columns: bool = False,
92
193
  ) -> list[list[str]]:
93
194
  """Parse multiple DSV lines.
94
195
 
@@ -101,13 +202,19 @@ class DsvHelper:
101
202
  strip: If True, strip whitespace from tokens.
102
203
  bookend: Optional bookend character to remove from tokens.
103
204
  bookend_strip: If True, strip whitespace after removing bookends.
205
+ normalize_columns: If > 0, ensure each returned list has exactly this many columns,
206
+ padding with empty strings or truncating as needed.
207
+ raise_on_missing_columns: If True, raise an error if a line has fewer columns than ``normalize_columns``.
208
+ raise_on_extra_columns: If True, raise an error if a line has more columns than ``normalize_columns``.
209
+ detect_columns: If True and ``normalize_columns`` is not set or <= 0, detect the number of columns from the content.
104
210
 
105
211
  Returns:
106
212
  A list of token lists, one per input line.
107
213
 
108
214
  Raises:
109
- SplurgeDsvParameterError: If ``content`` is not a list of strings or
110
- if ``delimiter`` is empty or None.
215
+ SplurgeDsvParameterError: If ``content`` is not a list of strings, or
216
+ if ``delimiter`` is empty or None, or if ``normalize_columns`` is negative.
217
+ SplurgeDsvColumnMismatchError: If column validation fails.
111
218
 
112
219
  Example:
113
220
  >>> DsvHelper.parses(["a,b,c", "d,e,f"], delimiter=",")
@@ -119,15 +226,82 @@ class DsvHelper:
119
226
  if not all(isinstance(item, str) for item in content):
120
227
  raise SplurgeDsvParameterError("content must be a list of strings")
121
228
 
229
+ # If requested, detect expected columns from the first logical row
230
+ if detect_columns and (not normalize_columns or normalize_columns <= 0):
231
+ if not content:
232
+ return []
233
+ # Find the first non-blank logical row in the provided content
234
+ first_non_blank = None
235
+ for ln in content:
236
+ if isinstance(ln, str) and ln.strip() != "":
237
+ first_non_blank = ln
238
+ break
239
+ if first_non_blank is None:
240
+ return []
241
+
242
+ detected = cls.parse(
243
+ first_non_blank,
244
+ delimiter=delimiter,
245
+ strip=strip,
246
+ bookend=bookend,
247
+ bookend_strip=bookend_strip,
248
+ normalize_columns=0,
249
+ raise_on_missing_columns=False,
250
+ raise_on_extra_columns=False,
251
+ )
252
+ normalize_columns = len(detected)
253
+
122
254
  return [
123
- cls.parse(item, delimiter=delimiter, strip=strip, bookend=bookend, bookend_strip=bookend_strip)
255
+ cls.parse(
256
+ item,
257
+ delimiter=delimiter,
258
+ strip=strip,
259
+ bookend=bookend,
260
+ bookend_strip=bookend_strip,
261
+ normalize_columns=normalize_columns,
262
+ raise_on_missing_columns=raise_on_missing_columns,
263
+ raise_on_extra_columns=raise_on_extra_columns,
264
+ )
124
265
  for item in content
125
266
  ]
126
267
 
268
+ @staticmethod
269
+ def _validate_file_path(
270
+ file_path: Path | str, *, must_exist: bool = True, must_be_file: bool = True, must_be_readable: bool = True
271
+ ) -> Path:
272
+ """Validate the provided file path.
273
+
274
+ Args:
275
+ file_path: The file path to validate.
276
+
277
+ Returns:
278
+ A validated Path object.
279
+
280
+ Raises:
281
+ SplurgeDsvPathValidationError: If the file path is invalid.
282
+ SplurgeDsvFileNotFoundError: If the file does not exist.
283
+ SplurgeDsvFilePermissionError: If the file cannot be accessed due to permission restrictions
284
+ SplurgeDsvError: For other unexpected errors.
285
+ """
286
+ try:
287
+ effective_path = safe_io_path_validator.PathValidator.validate_path(
288
+ Path(file_path), must_exist=must_exist, must_be_file=must_be_file, must_be_readable=must_be_readable
289
+ )
290
+ except safe_io_path_validator.SplurgeSafeIoPathValidationError as ex:
291
+ raise SplurgeDsvPathValidationError(f"Invalid file path: {file_path}") from ex
292
+ except safe_io_path_validator.SplurgeSafeIoFileNotFoundError as ex:
293
+ raise SplurgeDsvFileNotFoundError(f"File not found: {file_path}") from ex
294
+ except safe_io_path_validator.SplurgeSafeIoFilePermissionError as ex:
295
+ raise SplurgeDsvFilePermissionError(f"File permission error: {file_path}") from ex
296
+ except Exception as ex:
297
+ raise SplurgeDsvError(f"Unexpected error validating file path: {file_path}") from ex
298
+
299
+ return effective_path
300
+
127
301
  @classmethod
128
302
  def parse_file(
129
303
  cls,
130
- file_path: PathLike[str] | str,
304
+ file_path: PathLike[str] | Path | str,
131
305
  *,
132
306
  delimiter: str,
133
307
  strip: bool = DEFAULT_STRIP,
@@ -136,11 +310,16 @@ class DsvHelper:
136
310
  encoding: str = DEFAULT_ENCODING,
137
311
  skip_header_rows: int = DEFAULT_SKIP_HEADER_ROWS,
138
312
  skip_footer_rows: int = DEFAULT_SKIP_FOOTER_ROWS,
313
+ skip_empty_lines: bool = False,
314
+ normalize_columns: int = 0,
315
+ raise_on_missing_columns: bool = False,
316
+ raise_on_extra_columns: bool = False,
317
+ detect_columns: bool = False,
139
318
  ) -> list[list[str]]:
140
319
  """Read and parse an entire DSV file.
141
320
 
142
321
  This convenience reads all lines from ``file_path`` using
143
- :class:`splurge_dsv.text_file_helper.TextFileHelper` and then parses each
322
+ :class:`splurge_safe_io.safe_text_file_reader.SafeTextFileReader` and then parses each
144
323
  line into tokens. Header and footer rows may be skipped via the
145
324
  ``skip_header_rows`` and ``skip_footer_rows`` parameters.
146
325
 
@@ -153,23 +332,64 @@ class DsvHelper:
153
332
  encoding: Text encoding to use when reading the file.
154
333
  skip_header_rows: Number of leading lines to ignore.
155
334
  skip_footer_rows: Number of trailing lines to ignore.
335
+ normalize_columns: Number of columns to normalize.
336
+ raise_on_missing_columns: Raise an error if a line has fewer columns than ``normalize_columns``.
337
+ raise_on_extra_columns: Raise an error if a line has more columns than ``normalize_columns``.
156
338
 
157
339
  Returns:
158
340
  A list of token lists (one list per non-skipped line).
159
341
 
160
342
  Raises:
161
- SplurgeDsvParameterError: If ``delimiter`` is empty or None.
343
+ SplurgeDsvParameterError: If ``delimiter`` is empty or None, or if ``normalize_columns`` is negative.
162
344
  SplurgeDsvFileNotFoundError: If the file at ``file_path`` does not exist.
163
- SplurgeDsvFilePermissionError: If the file cannot be accessed due to
164
- permission restrictions.
165
- SplurgeDsvFileEncodingError: If the file cannot be decoded using
166
- the provided ``encoding``.
345
+ SplurgeDsvFilePermissionError: If the file cannot be accessed due to permission restrictions.
346
+ SplurgeDsvFileDecodingError: If the file cannot be decoded using the provided ``encoding``.
347
+ SplurgeDsvPathValidationError: If the file path is invalid.
348
+ SplurgeDsvColumnMismatchError: If column validation fails.
349
+ SplurgeDsvError: For other unexpected errors.
167
350
  """
168
- lines: list[str] = TextFileHelper.read(
169
- file_path, encoding=encoding, skip_header_rows=skip_header_rows, skip_footer_rows=skip_footer_rows
170
- )
351
+ effective_file_path = cls._validate_file_path(Path(file_path))
171
352
 
172
- return cls.parses(lines, delimiter=delimiter, strip=strip, bookend=bookend, bookend_strip=bookend_strip)
353
+ skip_header_rows = max(skip_header_rows, cls.DEFAULT_SKIP_HEADER_ROWS)
354
+ skip_footer_rows = max(skip_footer_rows, cls.DEFAULT_SKIP_FOOTER_ROWS)
355
+
356
+ try:
357
+ reader = safe_io_text_file_reader.SafeTextFileReader(
358
+ effective_file_path,
359
+ encoding=encoding,
360
+ skip_header_lines=skip_header_rows,
361
+ skip_footer_lines=skip_footer_rows,
362
+ strip=strip,
363
+ skip_empty_lines=skip_empty_lines,
364
+ )
365
+ lines: list[str] = reader.read()
366
+
367
+ except safe_io_text_file_reader.SplurgeSafeIoFileDecodingError as ex:
368
+ raise SplurgeDsvFileDecodingError(f"File decoding error: {effective_file_path}") from ex
369
+ except safe_io_text_file_reader.SplurgeSafeIoFilePermissionError as ex:
370
+ raise SplurgeDsvFilePermissionError(f"File permission error: {effective_file_path}") from ex
371
+ except safe_io_text_file_reader.SplurgeSafeIoOsError as ex:
372
+ raise SplurgeDsvFilePermissionError(f"File access error: {effective_file_path}") from ex
373
+ except Exception as ex:
374
+ # If the exception is already a SplurgeDsvError (or subclass),
375
+ # re-raise it unchanged so callers can handle specific errors
376
+ # (for example, SplurgeDsvColumnMismatchError from validation).
377
+ if isinstance(ex, SplurgeDsvError):
378
+ raise
379
+
380
+ raise SplurgeDsvError(f"Unexpected error reading file: {effective_file_path}") from ex
381
+
382
+ return cls.parses(
383
+ lines,
384
+ delimiter=delimiter,
385
+ strip=strip,
386
+ bookend=bookend,
387
+ bookend_strip=bookend_strip,
388
+ normalize_columns=normalize_columns,
389
+ raise_on_missing_columns=raise_on_missing_columns,
390
+ raise_on_extra_columns=raise_on_extra_columns,
391
+ detect_columns=detect_columns,
392
+ )
173
393
 
174
394
  @classmethod
175
395
  def _process_stream_chunk(
@@ -180,10 +400,13 @@ class DsvHelper:
180
400
  strip: bool = DEFAULT_STRIP,
181
401
  bookend: str | None = None,
182
402
  bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
403
+ normalize_columns: int = 0,
404
+ raise_on_missing_columns: bool = False,
405
+ raise_on_extra_columns: bool = False,
183
406
  ) -> list[list[str]]:
184
407
  """Parse a chunk of lines into tokenized rows.
185
408
 
186
- Designed to be used by :meth:`parse_stream` as a helper for converting a
409
+ Designed to be used by :meth:`parse_file_stream` as a helper for converting a
187
410
  batch of raw lines into parsed rows.
188
411
 
189
412
  Args:
@@ -192,12 +415,31 @@ class DsvHelper:
192
415
  strip: If True, strip whitespace from tokens.
193
416
  bookend: Optional bookend character to remove from tokens.
194
417
  bookend_strip: If True, strip whitespace after removing bookends.
418
+ normalize_columns: If > 0, ensure each returned list has exactly this many columns,
419
+ padding with empty strings or truncating as needed.
420
+ raise_on_missing_columns: If True, raise an error if a line has fewer columns than ``normalize_columns``.
421
+ raise_on_extra_columns: If True, raise an error if a line has more columns than ``normalize_columns``.
422
+
423
+ Raises:
424
+ SplurgeDsvParameterError: If ``delimiter`` is empty or None,
425
+ or if ``normalize_columns`` is negative,
426
+ or if ``chunk`` is not a list of strings, or if any element in ``chunk`` is not a string.
427
+ SplurgeDsvColumnMismatchError: If column validation fails.
195
428
 
196
429
  Returns:
197
430
  A list where each element is the token list for a corresponding
198
431
  input line from ``chunk``.
199
432
  """
200
- return cls.parses(chunk, delimiter=delimiter, strip=strip, bookend=bookend, bookend_strip=bookend_strip)
433
+ return cls.parses(
434
+ chunk,
435
+ delimiter=delimiter,
436
+ strip=strip,
437
+ bookend=bookend,
438
+ bookend_strip=bookend_strip,
439
+ normalize_columns=normalize_columns,
440
+ raise_on_missing_columns=raise_on_missing_columns,
441
+ raise_on_extra_columns=raise_on_extra_columns,
442
+ )
201
443
 
202
444
  @classmethod
203
445
  def parse_file_stream(
@@ -211,7 +453,16 @@ class DsvHelper:
211
453
  encoding: str = DEFAULT_ENCODING,
212
454
  skip_header_rows: int = DEFAULT_SKIP_HEADER_ROWS,
213
455
  skip_footer_rows: int = DEFAULT_SKIP_FOOTER_ROWS,
456
+ skip_empty_lines: bool = False,
457
+ normalize_columns: int = 0,
458
+ raise_on_missing_columns: bool = False,
459
+ raise_on_extra_columns: bool = False,
460
+ detect_columns: bool = False,
214
461
  chunk_size: int = DEFAULT_CHUNK_SIZE,
462
+ # How many chunks to scan when attempting to detect normalize_columns
463
+ # from the beginning of a stream. Only used when
464
+ # `detect_columns is True` and `normalize_columns` is falsy.
465
+ max_detect_chunks: int = MAX_DETECT_CHUNKS,
215
466
  ) -> Iterator[list[list[str]]]:
216
467
  """
217
468
  Stream-parse a DSV file into chunks of lines.
@@ -225,82 +476,156 @@ class DsvHelper:
225
476
  encoding (str): The file encoding.
226
477
  skip_header_rows (int): Number of header rows to skip.
227
478
  skip_footer_rows (int): Number of footer rows to skip.
479
+ normalize_columns (int): If > 0, ensure each returned list has exactly this many columns,
480
+ padding with empty strings or truncating as needed.
481
+ raise_on_missing_columns (bool): If True, raise an error if a line has fewer columns than ``normalize_columns``.
482
+ raise_on_extra_columns (bool): If True, raise an error if a line has more columns than ``normalize_columns``.
483
+ detect_columns (bool): If True and ``normalize_columns`` is not set or <= 0,
484
+ detect the expected number of columns from the first non-blank logical row.
228
485
  chunk_size (int): Number of lines per chunk (default: 100).
486
+ max_detect_chunks (int): When detecting columns, how many chunks to scan
487
+ from the start of the stream before giving up (default: 10).
229
488
 
230
489
  Yields:
231
490
  list[list[str]]: Parsed rows for each chunk.
232
491
 
233
492
  Raises:
234
- SplurgeParameterError: If delimiter is empty or None.
235
- SplurgeFileNotFoundError: If the file does not exist.
236
- SplurgeFilePermissionError: If the file cannot be accessed.
237
- SplurgeFileEncodingError: If the file cannot be decoded with the specified encoding.
493
+ SplurgeDsvParameterError: If delimiter is empty or None, or if ``normalize_columns`` is negative,
494
+ or if ``chunk`` is not a list of strings, or if any element in ``chunk`` is not a string.
495
+ SplurgeDsvFileNotFoundError: If the file does not exist.
496
+ SplurgeDsvFilePermissionError: If the file cannot be accessed.
497
+ SplurgeDsvFileDecodingError: If the file cannot be decoded with the specified encoding.
498
+ SplurgeDsvPathValidationError: If the file path is invalid.
499
+ SplurgeDsvError: For other unexpected errors.
500
+ SplurgeDsvColumnMismatchError: If column validation fails.
238
501
  """
239
- if delimiter is None or delimiter == "":
240
- raise SplurgeDsvParameterError("delimiter cannot be empty or None")
502
+
503
+ effective_file_path = cls._validate_file_path(Path(file_path))
241
504
 
242
505
  chunk_size = max(chunk_size, cls.DEFAULT_MIN_CHUNK_SIZE)
243
506
  skip_header_rows = max(skip_header_rows, cls.DEFAULT_SKIP_HEADER_ROWS)
244
507
  skip_footer_rows = max(skip_footer_rows, cls.DEFAULT_SKIP_FOOTER_ROWS)
245
-
246
- # Use TextFileHelper.read_as_stream for consistent error handling
247
- yield from (
248
- cls._process_stream_chunk(
249
- chunk, delimiter=delimiter, strip=strip, bookend=bookend, bookend_strip=bookend_strip
250
- )
251
- for chunk in TextFileHelper.read_as_stream(
252
- file_path,
508
+ # Allow callers to pass None to use the module default. Ensure we have
509
+ # a positive integer to drive the detection loop.
510
+ if max_detect_chunks is None:
511
+ max_detect_chunks = cls.MAX_DETECT_CHUNKS
512
+ else:
513
+ max_detect_chunks = max(int(max_detect_chunks), 1)
514
+
515
+ try:
516
+ reader = safe_io_text_file_reader.SafeTextFileReader(
517
+ effective_file_path,
253
518
  encoding=encoding,
254
- skip_header_rows=skip_header_rows,
255
- skip_footer_rows=skip_footer_rows,
519
+ skip_header_lines=skip_header_rows,
520
+ skip_footer_lines=skip_footer_rows,
521
+ strip=strip,
522
+ skip_empty_lines=skip_empty_lines,
256
523
  chunk_size=chunk_size,
257
524
  )
258
- )
259
-
260
- @classmethod
261
- def parse_stream(
262
- cls,
263
- file_path: PathLike[str] | str,
264
- *,
265
- delimiter: str,
266
- strip: bool = DEFAULT_STRIP,
267
- bookend: str | None = None,
268
- bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
269
- encoding: str = DEFAULT_ENCODING,
270
- skip_header_rows: int = DEFAULT_SKIP_HEADER_ROWS,
271
- skip_footer_rows: int = DEFAULT_SKIP_FOOTER_ROWS,
272
- chunk_size: int = DEFAULT_CHUNK_SIZE,
273
- ) -> Iterator[list[list[str]]]:
274
- """
275
- Stream-parse a DSV file, yielding chunks of parsed rows.
276
-
277
- The method yields lists of parsed rows (each row itself is a list of
278
- strings). Chunk sizing is controlled by the bound configuration's
279
- ``chunk_size`` value.
280
-
281
- Args:
282
- file_path: Path to the file to parse.
283
-
284
- Yields:
285
- Lists of parsed rows, each list containing up to ``chunk_size`` rows.
286
-
287
- Deprecated: Use `parse_file_stream` instead. This method will be removed in a future release.
288
- """
289
- # Emit a DeprecationWarning to signal removal in a future release
290
- warnings.warn(
291
- "DsvHelper.parse_stream() is deprecated and will be removed in a future release; use DsvHelper.parse_file_stream() instead.",
292
- DeprecationWarning,
293
- stacklevel=2,
294
- )
295
-
296
- return cls.parse_file_stream(
297
- file_path,
298
- delimiter=delimiter,
299
- strip=strip,
300
- bookend=bookend,
301
- bookend_strip=bookend_strip,
302
- encoding=encoding,
303
- skip_header_rows=skip_header_rows,
304
- skip_footer_rows=skip_footer_rows,
305
- chunk_size=chunk_size,
306
- )
525
+ stream_iter = reader.read_as_stream()
526
+
527
+ if detect_columns and (not normalize_columns or normalize_columns <= 0):
528
+ # Buffer up to `max_detect_chunks` from the stream while
529
+ # searching for the first non-blank logical row. This allows us
530
+ # to detect the expected column count even if the first logical
531
+ # row doesn't appear in the very first chunk (for example,
532
+ # when the file begins with many blank lines or very small
533
+ # chunks).
534
+ buffered_chunks: list[list[str]] = []
535
+ max_scan = max_detect_chunks if max_detect_chunks is not None else cls.MAX_DETECT_CHUNKS
536
+ chunks_scanned = 0
537
+
538
+ while chunks_scanned < max_scan:
539
+ try:
540
+ chunk = next(stream_iter)
541
+ except StopIteration:
542
+ break
543
+ buffered_chunks.append(chunk)
544
+
545
+ # Inspect this chunk for the first non-blank logical row
546
+ first_line = None
547
+ for ln in chunk:
548
+ if isinstance(ln, str) and ln.strip() != "":
549
+ first_line = ln
550
+ break
551
+
552
+ if first_line is not None:
553
+ detected = cls.parse(
554
+ first_line,
555
+ delimiter=delimiter,
556
+ strip=strip,
557
+ bookend=bookend,
558
+ bookend_strip=bookend_strip,
559
+ normalize_columns=0,
560
+ raise_on_missing_columns=False,
561
+ raise_on_extra_columns=False,
562
+ )
563
+ normalize_columns = len(detected)
564
+ # remember which buffered chunk contained the first
565
+ # logical row so we can start applying normalization
566
+ # beginning with that chunk only
567
+ detected_index = len(buffered_chunks) - 1
568
+ break
569
+
570
+ chunks_scanned += 1
571
+
572
+ # Replay any buffered chunks (in order) so callers receive the
573
+ # full content starting at the beginning of the file. If we
574
+ # detected the first logical row in one of the buffered chunks
575
+ # then only apply normalization beginning with that chunk;
576
+ # earlier buffered chunks must be emitted without
577
+ # normalization so we don't convert blank-only lines into
578
+ # padded empty-token rows.
579
+ if "detected_index" in locals():
580
+ for idx, b in enumerate(buffered_chunks):
581
+ use_norm = normalize_columns if idx == detected_index else 0
582
+ yield cls._process_stream_chunk(
583
+ b,
584
+ delimiter=delimiter,
585
+ strip=strip,
586
+ bookend=bookend,
587
+ bookend_strip=bookend_strip,
588
+ normalize_columns=use_norm,
589
+ raise_on_missing_columns=raise_on_missing_columns,
590
+ raise_on_extra_columns=raise_on_extra_columns,
591
+ )
592
+ else:
593
+ for b in buffered_chunks:
594
+ yield cls._process_stream_chunk(
595
+ b,
596
+ delimiter=delimiter,
597
+ strip=strip,
598
+ bookend=bookend,
599
+ bookend_strip=bookend_strip,
600
+ normalize_columns=0,
601
+ raise_on_missing_columns=raise_on_missing_columns,
602
+ raise_on_extra_columns=raise_on_extra_columns,
603
+ )
604
+
605
+ # Continue streaming the rest of the file
606
+ for chunk in stream_iter:
607
+ yield cls._process_stream_chunk(
608
+ chunk,
609
+ delimiter=delimiter,
610
+ strip=strip,
611
+ bookend=bookend,
612
+ bookend_strip=bookend_strip,
613
+ normalize_columns=normalize_columns,
614
+ raise_on_missing_columns=raise_on_missing_columns,
615
+ raise_on_extra_columns=raise_on_extra_columns,
616
+ )
617
+ except safe_io_text_file_reader.SplurgeSafeIoFileDecodingError as ex:
618
+ raise SplurgeDsvFileDecodingError(f"File decoding error: {effective_file_path}") from ex
619
+ except safe_io_text_file_reader.SplurgeSafeIoFilePermissionError as ex:
620
+ raise SplurgeDsvFilePermissionError(f"File permission error: {effective_file_path}") from ex
621
+ except safe_io_text_file_reader.SplurgeSafeIoOsError as ex:
622
+ raise SplurgeDsvFilePermissionError(f"File access error: {effective_file_path}") from ex
623
+ except Exception as ex:
624
+ # Preserve and re-raise known SplurgeDsvError subclasses so
625
+ # callers can handle specific errors (e.g. column mismatch) as
626
+ # intended. Only wrap unknown exceptions in a generic
627
+ # SplurgeDsvError.
628
+ if isinstance(ex, SplurgeDsvError):
629
+ raise
630
+
631
+ raise SplurgeDsvError(f"Unexpected error reading file: {effective_file_path}") from ex