splurge-dsv 2025.2.0__py3-none-any.whl → 2025.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
splurge_dsv/dsv_helper.py CHANGED
@@ -11,11 +11,23 @@ This module is licensed under the MIT License.
11
11
  # Standard library imports
12
12
  from collections.abc import Iterator
13
13
  from os import PathLike
14
+ from pathlib import Path
15
+
16
+ import splurge_safe_io.constants as safe_io_constants
17
+ import splurge_safe_io.path_validator as safe_io_path_validator
18
+ import splurge_safe_io.safe_text_file_reader as safe_io_text_file_reader
14
19
 
15
20
  # Local imports
16
- from splurge_dsv.exceptions import SplurgeDsvParameterError
21
+ from splurge_dsv.exceptions import (
22
+ SplurgeDsvColumnMismatchError,
23
+ SplurgeDsvError,
24
+ SplurgeDsvFileDecodingError,
25
+ SplurgeDsvFileNotFoundError,
26
+ SplurgeDsvFilePermissionError,
27
+ SplurgeDsvParameterError,
28
+ SplurgeDsvPathValidationError,
29
+ )
17
30
  from splurge_dsv.string_tokenizer import StringTokenizer
18
- from splurge_dsv.text_file_helper import TextFileHelper
19
31
 
20
32
 
21
33
  class DsvHelper:
@@ -26,22 +38,31 @@ class DsvHelper:
26
38
  Supports configurable delimiters, text bookends, and whitespace handling options.
27
39
  """
28
40
 
29
- DEFAULT_CHUNK_SIZE = 500 # Default chunk size for streaming operations
30
- DEFAULT_ENCODING = "utf-8" # Default text encoding for file operations
31
- DEFAULT_SKIP_HEADER_ROWS = 0 # Default number of header rows to skip
32
- DEFAULT_SKIP_FOOTER_ROWS = 0 # Default number of footer rows to skip
33
- DEFAULT_MIN_CHUNK_SIZE = 100
41
+ DEFAULT_CHUNK_SIZE = safe_io_constants.DEFAULT_CHUNK_SIZE
42
+ # When detecting normalize_columns across a stream, how many chunks to scan
43
+ # before giving up. Scanning more chunks increases work but helps if the
44
+ # first logical row starts later than the first chunk (e.g., many blank lines
45
+ # or very small chunks). Keep small by default to avoid buffering too much.
46
+ MAX_DETECT_CHUNKS = 10
47
+ DEFAULT_ENCODING = "utf-8"
48
+ DEFAULT_SKIP_HEADER_ROWS = 0
49
+ DEFAULT_SKIP_FOOTER_ROWS = 0
50
+ DEFAULT_MIN_CHUNK_SIZE = safe_io_constants.MIN_CHUNK_SIZE
34
51
  DEFAULT_STRIP = True
35
52
  DEFAULT_BOOKEND_STRIP = True
36
53
 
37
- @staticmethod
54
+ @classmethod
38
55
  def parse(
56
+ cls,
39
57
  content: str,
40
58
  *,
41
59
  delimiter: str,
42
60
  strip: bool = DEFAULT_STRIP,
43
61
  bookend: str | None = None,
44
62
  bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
63
+ normalize_columns: int = 0,
64
+ raise_on_missing_columns: bool = False,
65
+ raise_on_extra_columns: bool = False,
45
66
  ) -> list[str]:
46
67
  """Parse a single DSV line into tokens.
47
68
 
@@ -56,12 +77,18 @@ class DsvHelper:
56
77
  strip: If True, strip leading/trailing whitespace from each token.
57
78
  bookend: Optional bookend character to remove from token ends.
58
79
  bookend_strip: If True, strip whitespace after removing bookends.
80
+ normalize_columns: If > 0, ensure the returned list has exactly this many columns,
81
+ padding with empty strings or truncating as needed.
82
+ raise_on_missing_columns: If True, raise an error if the line has fewer columns than ``normalize_columns``.
83
+ raise_on_extra_columns: If True, raise an error if the line has more columns than ``normalize_columns``.
59
84
 
60
85
  Returns:
61
86
  A list of parsed token strings.
62
87
 
63
88
  Raises:
64
89
  SplurgeDsvParameterError: If ``delimiter`` is empty or None.
90
+ SplurgeDsvParameterError: If ``normalize_columns`` is negative.
91
+ SplurgeDsvColumnMismatchError: If column validation fails.
65
92
 
66
93
  Examples:
67
94
  >>> DsvHelper.parse("a,b,c", delimiter=",")
@@ -77,8 +104,79 @@ class DsvHelper:
77
104
  if bookend:
78
105
  tokens = [StringTokenizer.remove_bookends(token, bookend=bookend, strip=bookend_strip) for token in tokens]
79
106
 
107
+ # If requested, validate columns (raises) and/or normalize the row length
108
+ if normalize_columns and normalize_columns > 0:
109
+ # Validation is only performed if the caller asked for raises
110
+ if raise_on_missing_columns or raise_on_extra_columns:
111
+ cls._validate_columns(
112
+ len(tokens),
113
+ expected_columns=normalize_columns,
114
+ raise_on_missing_columns=raise_on_missing_columns,
115
+ raise_on_extra_columns=raise_on_extra_columns,
116
+ )
117
+
118
+ tokens = cls._normalize_columns(tokens, expected_columns=normalize_columns)
119
+
80
120
  return tokens
81
121
 
122
+ @classmethod
123
+ def _normalize_columns(cls, row: list[str], *, expected_columns: int) -> list[str]:
124
+ """Normalize a token list to the expected number of columns.
125
+
126
+ If the row has fewer columns than expected, append empty strings to reach
127
+ the expected length. If the row has more columns than expected, truncate
128
+ the excess columns.
129
+
130
+ Args:
131
+ row: The list of tokens to normalize.
132
+ expected_columns: Desired number of columns.
133
+
134
+ Returns:
135
+ A new list of tokens with length == expected_columns.
136
+
137
+ Raises:
138
+ SplurgeDsvParameterError: If ``expected_columns`` is negative.
139
+ """
140
+ if expected_columns < 0:
141
+ raise SplurgeDsvParameterError("expected_columns must be non-negative")
142
+
143
+ current = len(row)
144
+ if current == expected_columns:
145
+ return row
146
+ if current < expected_columns:
147
+ # append empty strings
148
+ return row + [""] * (expected_columns - current)
149
+ # current > expected -> truncate
150
+ return row[:expected_columns]
151
+
152
+ @classmethod
153
+ def _validate_columns(
154
+ cls, actual_columns: int, *, expected_columns: int, raise_on_missing_columns: bool, raise_on_extra_columns: bool
155
+ ) -> None:
156
+ """Validate column count against expected_columns.
157
+
158
+ Raises a SplurgeDsvError (or a more specific subclass) when the
159
+ validation fails according to the provided flags.
160
+
161
+ Args:
162
+ actual_columns: The actual number of columns in the row.
163
+ expected_columns: The expected number of columns.
164
+ raise_on_missing_columns: If True, raise an error if actual_columns < expected_columns.
165
+ raise_on_extra_columns: If True, raise an error if actual_columns > expected_columns.
166
+
167
+ Raises:
168
+ SplurgeDsvColumnMismatchError: If column validation fails.
169
+ SplurgeDsvParameterError: If ``expected_columns`` is negative.
170
+ """
171
+ if expected_columns < 0:
172
+ raise SplurgeDsvParameterError("expected_columns must be non-negative")
173
+
174
+ if raise_on_missing_columns and actual_columns < expected_columns:
175
+ raise SplurgeDsvColumnMismatchError(f"Row is missing columns: ({actual_columns} < {expected_columns})")
176
+
177
+ if raise_on_extra_columns and actual_columns > expected_columns:
178
+ raise SplurgeDsvColumnMismatchError(f"Row has extra columns: ({actual_columns} > {expected_columns})")
179
+
82
180
  @classmethod
83
181
  def parses(
84
182
  cls,
@@ -88,6 +186,10 @@ class DsvHelper:
88
186
  strip: bool = DEFAULT_STRIP,
89
187
  bookend: str | None = None,
90
188
  bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
189
+ normalize_columns: int = 0,
190
+ raise_on_missing_columns: bool = False,
191
+ raise_on_extra_columns: bool = False,
192
+ detect_columns: bool = False,
91
193
  ) -> list[list[str]]:
92
194
  """Parse multiple DSV lines.
93
195
 
@@ -100,13 +202,19 @@ class DsvHelper:
100
202
  strip: If True, strip whitespace from tokens.
101
203
  bookend: Optional bookend character to remove from tokens.
102
204
  bookend_strip: If True, strip whitespace after removing bookends.
205
+ normalize_columns: If > 0, ensure each returned list has exactly this many columns,
206
+ padding with empty strings or truncating as needed.
207
+ raise_on_missing_columns: If True, raise an error if a line has fewer columns than ``normalize_columns``.
208
+ raise_on_extra_columns: If True, raise an error if a line has more columns than ``normalize_columns``.
209
+ detect_columns: If True and ``normalize_columns`` is not set or <= 0, detect the number of columns from the content.
103
210
 
104
211
  Returns:
105
212
  A list of token lists, one per input line.
106
213
 
107
214
  Raises:
108
- SplurgeDsvParameterError: If ``content`` is not a list of strings or
109
- if ``delimiter`` is empty or None.
215
+ SplurgeDsvParameterError: If ``content`` is not a list of strings, or
216
+ if ``delimiter`` is empty or None, or if ``normalize_columns`` is negative.
217
+ SplurgeDsvColumnMismatchError: If column validation fails.
110
218
 
111
219
  Example:
112
220
  >>> DsvHelper.parses(["a,b,c", "d,e,f"], delimiter=",")
@@ -118,15 +226,82 @@ class DsvHelper:
118
226
  if not all(isinstance(item, str) for item in content):
119
227
  raise SplurgeDsvParameterError("content must be a list of strings")
120
228
 
229
+ # If requested, detect expected columns from the first logical row
230
+ if detect_columns and (not normalize_columns or normalize_columns <= 0):
231
+ if not content:
232
+ return []
233
+ # Find the first non-blank logical row in the provided content
234
+ first_non_blank = None
235
+ for ln in content:
236
+ if isinstance(ln, str) and ln.strip() != "":
237
+ first_non_blank = ln
238
+ break
239
+ if first_non_blank is None:
240
+ return []
241
+
242
+ detected = cls.parse(
243
+ first_non_blank,
244
+ delimiter=delimiter,
245
+ strip=strip,
246
+ bookend=bookend,
247
+ bookend_strip=bookend_strip,
248
+ normalize_columns=0,
249
+ raise_on_missing_columns=False,
250
+ raise_on_extra_columns=False,
251
+ )
252
+ normalize_columns = len(detected)
253
+
121
254
  return [
122
- cls.parse(item, delimiter=delimiter, strip=strip, bookend=bookend, bookend_strip=bookend_strip)
255
+ cls.parse(
256
+ item,
257
+ delimiter=delimiter,
258
+ strip=strip,
259
+ bookend=bookend,
260
+ bookend_strip=bookend_strip,
261
+ normalize_columns=normalize_columns,
262
+ raise_on_missing_columns=raise_on_missing_columns,
263
+ raise_on_extra_columns=raise_on_extra_columns,
264
+ )
123
265
  for item in content
124
266
  ]
125
267
 
268
+ @staticmethod
269
+ def _validate_file_path(
270
+ file_path: Path | str, *, must_exist: bool = True, must_be_file: bool = True, must_be_readable: bool = True
271
+ ) -> Path:
272
+ """Validate the provided file path.
273
+
274
+ Args:
275
+ file_path: The file path to validate.
276
+
277
+ Returns:
278
+ A validated Path object.
279
+
280
+ Raises:
281
+ SplurgeDsvPathValidationError: If the file path is invalid.
282
+ SplurgeDsvFileNotFoundError: If the file does not exist.
283
+ SplurgeDsvFilePermissionError: If the file cannot be accessed due to permission restrictions.
284
+ SplurgeDsvError: For other unexpected errors.
285
+ """
286
+ try:
287
+ effective_path = safe_io_path_validator.PathValidator.validate_path(
288
+ Path(file_path), must_exist=must_exist, must_be_file=must_be_file, must_be_readable=must_be_readable
289
+ )
290
+ except safe_io_path_validator.SplurgeSafeIoPathValidationError as ex:
291
+ raise SplurgeDsvPathValidationError(f"Invalid file path: {file_path}") from ex
292
+ except safe_io_path_validator.SplurgeSafeIoFileNotFoundError as ex:
293
+ raise SplurgeDsvFileNotFoundError(f"File not found: {file_path}") from ex
294
+ except safe_io_path_validator.SplurgeSafeIoFilePermissionError as ex:
295
+ raise SplurgeDsvFilePermissionError(f"File permission error: {file_path}") from ex
296
+ except Exception as ex:
297
+ raise SplurgeDsvError(f"Unexpected error validating file path: {file_path}") from ex
298
+
299
+ return effective_path
300
+
126
301
  @classmethod
127
302
  def parse_file(
128
303
  cls,
129
- file_path: PathLike[str] | str,
304
+ file_path: PathLike[str] | Path | str,
130
305
  *,
131
306
  delimiter: str,
132
307
  strip: bool = DEFAULT_STRIP,
@@ -135,11 +310,16 @@ class DsvHelper:
135
310
  encoding: str = DEFAULT_ENCODING,
136
311
  skip_header_rows: int = DEFAULT_SKIP_HEADER_ROWS,
137
312
  skip_footer_rows: int = DEFAULT_SKIP_FOOTER_ROWS,
313
+ skip_empty_lines: bool = False,
314
+ normalize_columns: int = 0,
315
+ raise_on_missing_columns: bool = False,
316
+ raise_on_extra_columns: bool = False,
317
+ detect_columns: bool = False,
138
318
  ) -> list[list[str]]:
139
319
  """Read and parse an entire DSV file.
140
320
 
141
321
  This convenience reads all lines from ``file_path`` using
142
- :class:`splurge_dsv.text_file_helper.TextFileHelper` and then parses each
322
+ :class:`splurge_safe_io.safe_text_file_reader.SafeTextFileReader` and then parses each
143
323
  line into tokens. Header and footer rows may be skipped via the
144
324
  ``skip_header_rows`` and ``skip_footer_rows`` parameters.
145
325
 
@@ -152,23 +332,64 @@ class DsvHelper:
152
332
  encoding: Text encoding to use when reading the file.
153
333
  skip_header_rows: Number of leading lines to ignore.
154
334
  skip_footer_rows: Number of trailing lines to ignore.
335
+ normalize_columns: Number of columns to normalize.
336
+ raise_on_missing_columns: Raise an error if a line has fewer columns than ``normalize_columns``.
337
+ raise_on_extra_columns: Raise an error if a line has more columns than ``normalize_columns``.
155
338
 
156
339
  Returns:
157
340
  A list of token lists (one list per non-skipped line).
158
341
 
159
342
  Raises:
160
- SplurgeDsvParameterError: If ``delimiter`` is empty or None.
343
+ SplurgeDsvParameterError: If ``delimiter`` is empty or None, or if ``normalize_columns`` is negative.
161
344
  SplurgeDsvFileNotFoundError: If the file at ``file_path`` does not exist.
162
- SplurgeDsvFilePermissionError: If the file cannot be accessed due to
163
- permission restrictions.
164
- SplurgeDsvFileEncodingError: If the file cannot be decoded using
165
- the provided ``encoding``.
345
+ SplurgeDsvFilePermissionError: If the file cannot be accessed due to permission restrictions.
346
+ SplurgeDsvFileDecodingError: If the file cannot be decoded using the provided ``encoding``.
347
+ SplurgeDsvPathValidationError: If the file path is invalid.
348
+ SplurgeDsvColumnMismatchError: If column validation fails.
349
+ SplurgeDsvError: For other unexpected errors.
166
350
  """
167
- lines: list[str] = TextFileHelper.read(
168
- file_path, encoding=encoding, skip_header_rows=skip_header_rows, skip_footer_rows=skip_footer_rows
169
- )
351
+ effective_file_path = cls._validate_file_path(Path(file_path))
170
352
 
171
- return cls.parses(lines, delimiter=delimiter, strip=strip, bookend=bookend, bookend_strip=bookend_strip)
353
+ skip_header_rows = max(skip_header_rows, cls.DEFAULT_SKIP_HEADER_ROWS)
354
+ skip_footer_rows = max(skip_footer_rows, cls.DEFAULT_SKIP_FOOTER_ROWS)
355
+
356
+ try:
357
+ reader = safe_io_text_file_reader.SafeTextFileReader(
358
+ effective_file_path,
359
+ encoding=encoding,
360
+ skip_header_lines=skip_header_rows,
361
+ skip_footer_lines=skip_footer_rows,
362
+ strip=strip,
363
+ skip_empty_lines=skip_empty_lines,
364
+ )
365
+ lines: list[str] = reader.read()
366
+
367
+ except safe_io_text_file_reader.SplurgeSafeIoFileDecodingError as ex:
368
+ raise SplurgeDsvFileDecodingError(f"File decoding error: {effective_file_path}") from ex
369
+ except safe_io_text_file_reader.SplurgeSafeIoFilePermissionError as ex:
370
+ raise SplurgeDsvFilePermissionError(f"File permission error: {effective_file_path}") from ex
371
+ except safe_io_text_file_reader.SplurgeSafeIoOsError as ex:
372
+ raise SplurgeDsvFilePermissionError(f"File access error: {effective_file_path}") from ex
373
+ except Exception as ex:
374
+ # If the exception is already a SplurgeDsvError (or subclass),
375
+ # re-raise it unchanged so callers can handle specific errors
376
+ # (for example, SplurgeDsvColumnMismatchError from validation).
377
+ if isinstance(ex, SplurgeDsvError):
378
+ raise
379
+
380
+ raise SplurgeDsvError(f"Unexpected error reading file: {effective_file_path}") from ex
381
+
382
+ return cls.parses(
383
+ lines,
384
+ delimiter=delimiter,
385
+ strip=strip,
386
+ bookend=bookend,
387
+ bookend_strip=bookend_strip,
388
+ normalize_columns=normalize_columns,
389
+ raise_on_missing_columns=raise_on_missing_columns,
390
+ raise_on_extra_columns=raise_on_extra_columns,
391
+ detect_columns=detect_columns,
392
+ )
172
393
 
173
394
  @classmethod
174
395
  def _process_stream_chunk(
@@ -179,10 +400,13 @@ class DsvHelper:
179
400
  strip: bool = DEFAULT_STRIP,
180
401
  bookend: str | None = None,
181
402
  bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
403
+ normalize_columns: int = 0,
404
+ raise_on_missing_columns: bool = False,
405
+ raise_on_extra_columns: bool = False,
182
406
  ) -> list[list[str]]:
183
407
  """Parse a chunk of lines into tokenized rows.
184
408
 
185
- Designed to be used by :meth:`parse_stream` as a helper for converting a
409
+ Designed to be used by :meth:`parse_file_stream` as a helper for converting a
186
410
  batch of raw lines into parsed rows.
187
411
 
188
412
  Args:
@@ -191,15 +415,34 @@ class DsvHelper:
191
415
  strip: If True, strip whitespace from tokens.
192
416
  bookend: Optional bookend character to remove from tokens.
193
417
  bookend_strip: If True, strip whitespace after removing bookends.
418
+ normalize_columns: If > 0, ensure each returned list has exactly this many columns,
419
+ padding with empty strings or truncating as needed.
420
+ raise_on_missing_columns: If True, raise an error if a line has fewer columns than ``normalize_columns``.
421
+ raise_on_extra_columns: If True, raise an error if a line has more columns than ``normalize_columns``.
422
+
423
+ Raises:
424
+ SplurgeDsvParameterError: If ``delimiter`` is empty or None,
425
+ or if ``normalize_columns`` is negative,
426
+ or if ``chunk`` is not a list of strings, or if any element in ``chunk`` is not a string.
427
+ SplurgeDsvColumnMismatchError: If column validation fails.
194
428
 
195
429
  Returns:
196
430
  A list where each element is the token list for a corresponding
197
431
  input line from ``chunk``.
198
432
  """
199
- return cls.parses(chunk, delimiter=delimiter, strip=strip, bookend=bookend, bookend_strip=bookend_strip)
433
+ return cls.parses(
434
+ chunk,
435
+ delimiter=delimiter,
436
+ strip=strip,
437
+ bookend=bookend,
438
+ bookend_strip=bookend_strip,
439
+ normalize_columns=normalize_columns,
440
+ raise_on_missing_columns=raise_on_missing_columns,
441
+ raise_on_extra_columns=raise_on_extra_columns,
442
+ )
200
443
 
201
444
  @classmethod
202
- def parse_stream(
445
+ def parse_file_stream(
203
446
  cls,
204
447
  file_path: PathLike[str] | str,
205
448
  *,
@@ -210,10 +453,19 @@ class DsvHelper:
210
453
  encoding: str = DEFAULT_ENCODING,
211
454
  skip_header_rows: int = DEFAULT_SKIP_HEADER_ROWS,
212
455
  skip_footer_rows: int = DEFAULT_SKIP_FOOTER_ROWS,
456
+ skip_empty_lines: bool = False,
457
+ normalize_columns: int = 0,
458
+ raise_on_missing_columns: bool = False,
459
+ raise_on_extra_columns: bool = False,
460
+ detect_columns: bool = False,
213
461
  chunk_size: int = DEFAULT_CHUNK_SIZE,
462
+ # How many chunks to scan when attempting to detect normalize_columns
463
+ # from the beginning of a stream. Only used when
464
+ # `detect_columns is True` and `normalize_columns` is falsy.
465
+ max_detect_chunks: int = MAX_DETECT_CHUNKS,
214
466
  ) -> Iterator[list[list[str]]]:
215
467
  """
216
- Stream-parse a DSV file in chunks of lines.
468
+ Stream-parse a DSV file into chunks of lines.
217
469
 
218
470
  Args:
219
471
  file_path (PathLike[str] | str): The path to the file to parse.
@@ -224,34 +476,156 @@ class DsvHelper:
224
476
  encoding (str): The file encoding.
225
477
  skip_header_rows (int): Number of header rows to skip.
226
478
  skip_footer_rows (int): Number of footer rows to skip.
479
+ normalize_columns (int): If > 0, ensure each returned list has exactly this many columns,
480
+ padding with empty strings or truncating as needed.
481
+ raise_on_missing_columns (bool): If True, raise an error if a line has fewer columns than ``normalize_columns``.
482
+ raise_on_extra_columns (bool): If True, raise an error if a line has more columns than ``normalize_columns``.
483
+ detect_columns (bool): If True and ``normalize_columns`` is not set or <= 0,
484
+ detect the expected number of columns from the first non-blank logical row.
227
485
  chunk_size (int): Number of lines per chunk (default: 100).
486
+ max_detect_chunks (int): When detecting columns, how many chunks to scan
487
+ from the start of the stream before giving up (default: 10).
228
488
 
229
489
  Yields:
230
490
  list[list[str]]: Parsed rows for each chunk.
231
491
 
232
492
  Raises:
233
- SplurgeParameterError: If delimiter is empty or None.
234
- SplurgeFileNotFoundError: If the file does not exist.
235
- SplurgeFilePermissionError: If the file cannot be accessed.
236
- SplurgeFileEncodingError: If the file cannot be decoded with the specified encoding.
493
+ SplurgeDsvParameterError: If delimiter is empty or None, or if ``normalize_columns`` is negative,
494
+ or if ``chunk`` is not a list of strings, or if any element in ``chunk`` is not a string.
495
+ SplurgeDsvFileNotFoundError: If the file does not exist.
496
+ SplurgeDsvFilePermissionError: If the file cannot be accessed.
497
+ SplurgeDsvFileDecodingError: If the file cannot be decoded with the specified encoding.
498
+ SplurgeDsvPathValidationError: If the file path is invalid.
499
+ SplurgeDsvError: For other unexpected errors.
500
+ SplurgeDsvColumnMismatchError: If column validation fails.
237
501
  """
238
- if delimiter is None or delimiter == "":
239
- raise SplurgeDsvParameterError("delimiter cannot be empty or None")
502
+
503
+ effective_file_path = cls._validate_file_path(Path(file_path))
240
504
 
241
505
  chunk_size = max(chunk_size, cls.DEFAULT_MIN_CHUNK_SIZE)
242
506
  skip_header_rows = max(skip_header_rows, cls.DEFAULT_SKIP_HEADER_ROWS)
243
507
  skip_footer_rows = max(skip_footer_rows, cls.DEFAULT_SKIP_FOOTER_ROWS)
244
-
245
- # Use TextFileHelper.read_as_stream for consistent error handling
246
- yield from (
247
- cls._process_stream_chunk(
248
- chunk, delimiter=delimiter, strip=strip, bookend=bookend, bookend_strip=bookend_strip
249
- )
250
- for chunk in TextFileHelper.read_as_stream(
251
- file_path,
508
+ # Allow callers to pass None to use the module default. Ensure we have
509
+ # a positive integer to drive the detection loop.
510
+ if max_detect_chunks is None:
511
+ max_detect_chunks = cls.MAX_DETECT_CHUNKS
512
+ else:
513
+ max_detect_chunks = max(int(max_detect_chunks), 1)
514
+
515
+ try:
516
+ reader = safe_io_text_file_reader.SafeTextFileReader(
517
+ effective_file_path,
252
518
  encoding=encoding,
253
- skip_header_rows=skip_header_rows,
254
- skip_footer_rows=skip_footer_rows,
519
+ skip_header_lines=skip_header_rows,
520
+ skip_footer_lines=skip_footer_rows,
521
+ strip=strip,
522
+ skip_empty_lines=skip_empty_lines,
255
523
  chunk_size=chunk_size,
256
524
  )
257
- )
525
+ stream_iter = reader.read_as_stream()
526
+
527
+ if detect_columns and (not normalize_columns or normalize_columns <= 0):
528
+ # Buffer up to `max_detect_chunks` from the stream while
529
+ # searching for the first non-blank logical row. This allows us
530
+ # to detect the expected column count even if the first logical
531
+ # row doesn't appear in the very first chunk (for example,
532
+ # when the file begins with many blank lines or very small
533
+ # chunks).
534
+ buffered_chunks: list[list[str]] = []
535
+ max_scan = max_detect_chunks if max_detect_chunks is not None else cls.MAX_DETECT_CHUNKS
536
+ chunks_scanned = 0
537
+
538
+ while chunks_scanned < max_scan:
539
+ try:
540
+ chunk = next(stream_iter)
541
+ except StopIteration:
542
+ break
543
+ buffered_chunks.append(chunk)
544
+
545
+ # Inspect this chunk for the first non-blank logical row
546
+ first_line = None
547
+ for ln in chunk:
548
+ if isinstance(ln, str) and ln.strip() != "":
549
+ first_line = ln
550
+ break
551
+
552
+ if first_line is not None:
553
+ detected = cls.parse(
554
+ first_line,
555
+ delimiter=delimiter,
556
+ strip=strip,
557
+ bookend=bookend,
558
+ bookend_strip=bookend_strip,
559
+ normalize_columns=0,
560
+ raise_on_missing_columns=False,
561
+ raise_on_extra_columns=False,
562
+ )
563
+ normalize_columns = len(detected)
564
+ # remember which buffered chunk contained the first
565
+ # logical row so we can start applying normalization
566
+ # beginning with that chunk only
567
+ detected_index = len(buffered_chunks) - 1
568
+ break
569
+
570
+ chunks_scanned += 1
571
+
572
+ # Replay any buffered chunks (in order) so callers receive the
573
+ # full content starting at the beginning of the file. If we
574
+ # detected the first logical row in one of the buffered chunks
575
+ # then only apply normalization beginning with that chunk;
576
+ # earlier buffered chunks must be emitted without
577
+ # normalization so we don't convert blank-only lines into
578
+ # padded empty-token rows.
579
+ if "detected_index" in locals():
580
+ for idx, b in enumerate(buffered_chunks):
581
+ use_norm = normalize_columns if idx == detected_index else 0
582
+ yield cls._process_stream_chunk(
583
+ b,
584
+ delimiter=delimiter,
585
+ strip=strip,
586
+ bookend=bookend,
587
+ bookend_strip=bookend_strip,
588
+ normalize_columns=use_norm,
589
+ raise_on_missing_columns=raise_on_missing_columns,
590
+ raise_on_extra_columns=raise_on_extra_columns,
591
+ )
592
+ else:
593
+ for b in buffered_chunks:
594
+ yield cls._process_stream_chunk(
595
+ b,
596
+ delimiter=delimiter,
597
+ strip=strip,
598
+ bookend=bookend,
599
+ bookend_strip=bookend_strip,
600
+ normalize_columns=0,
601
+ raise_on_missing_columns=raise_on_missing_columns,
602
+ raise_on_extra_columns=raise_on_extra_columns,
603
+ )
604
+
605
+ # Continue streaming the rest of the file
606
+ for chunk in stream_iter:
607
+ yield cls._process_stream_chunk(
608
+ chunk,
609
+ delimiter=delimiter,
610
+ strip=strip,
611
+ bookend=bookend,
612
+ bookend_strip=bookend_strip,
613
+ normalize_columns=normalize_columns,
614
+ raise_on_missing_columns=raise_on_missing_columns,
615
+ raise_on_extra_columns=raise_on_extra_columns,
616
+ )
617
+ except safe_io_text_file_reader.SplurgeSafeIoFileDecodingError as ex:
618
+ raise SplurgeDsvFileDecodingError(f"File decoding error: {effective_file_path}") from ex
619
+ except safe_io_text_file_reader.SplurgeSafeIoFilePermissionError as ex:
620
+ raise SplurgeDsvFilePermissionError(f"File permission error: {effective_file_path}") from ex
621
+ except safe_io_text_file_reader.SplurgeSafeIoOsError as ex:
622
+ raise SplurgeDsvFilePermissionError(f"File access error: {effective_file_path}") from ex
623
+ except Exception as ex:
624
+ # Preserve and re-raise known SplurgeDsvError subclasses so
625
+ # callers can handle specific errors (e.g. column mismatch) as
626
+ # intended. Only wrap unknown exceptions in a generic
627
+ # SplurgeDsvError.
628
+ if isinstance(ex, SplurgeDsvError):
629
+ raise
630
+
631
+ raise SplurgeDsvError(f"Unexpected error reading file: {effective_file_path}") from ex
splurge_dsv/exceptions.py CHANGED
@@ -65,6 +65,15 @@ class SplurgeDsvFileNotFoundError(SplurgeDsvFileOperationError):
65
65
  """
66
66
 
67
67
 
68
+ class SplurgeDsvFileExistsError(SplurgeDsvFileOperationError):
69
+ """Raised when attempting to create a file that already exists.
70
+
71
+ This typically maps to ``FileExistsError`` semantics but uses the
72
+ package-specific exception hierarchy so callers can distinguish
73
+ file errors from other error types.
74
+ """
75
+
76
+
68
77
  class SplurgeDsvFilePermissionError(SplurgeDsvFileOperationError):
69
78
  """Raised for permission or access-related file errors.
70
79
 
@@ -73,7 +82,7 @@ class SplurgeDsvFilePermissionError(SplurgeDsvFileOperationError):
73
82
  """
74
83
 
75
84
 
76
- class SplurgeDsvFileEncodingError(SplurgeDsvFileOperationError):
85
+ class SplurgeDsvFileDecodingError(SplurgeDsvFileOperationError):
77
86
  """Raised when decoding or encoding a text file fails.
78
87
 
79
88
  The exception typically wraps the underlying decoding error and
@@ -81,6 +90,14 @@ class SplurgeDsvFileEncodingError(SplurgeDsvFileOperationError):
81
90
  """
82
91
 
83
92
 
93
+ class SplurgeDsvFileEncodingError(SplurgeDsvFileOperationError):
94
+ """Raised when encoding a text file fails.
95
+
96
+ The exception typically wraps the underlying encoding error and
97
+ provides a descriptive message and optional details for diagnostics.
98
+ """
99
+
100
+
84
101
  class SplurgeDsvPathValidationError(SplurgeDsvFileOperationError):
85
102
  """Raised when a provided filesystem path fails validation checks.
86
103
 
@@ -101,6 +118,10 @@ class SplurgeDsvParsingError(SplurgeDsvDataProcessingError):
101
118
  """Raised when parsing fails due to malformed or unexpected content."""
102
119
 
103
120
 
121
+ class SplurgeDsvColumnMismatchError(SplurgeDsvDataProcessingError):
122
+ """Raised when a row has a different number of columns than expected."""
123
+
124
+
104
125
  class SplurgeDsvTypeConversionError(SplurgeDsvDataProcessingError):
105
126
  """Raised when a value cannot be converted to the requested type."""
106
127