PyPI - splurge-dsv - Versions diffs - 2025.2.0__py3-none-any.whl → 2025.3.1__py3-none-any.whl - Mend

splurge-dsv 2025.2.0py3-none-any.whl → 2025.3.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

splurge_dsv/__init__.py +16 -5
splurge_dsv/cli.py +137 -26
splurge_dsv/dsv.py +101 -7
splurge_dsv/dsv_helper.py +417 -43
splurge_dsv/exceptions.py +22 -1
splurge_dsv/string_tokenizer.py +7 -1
{splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/METADATA +78 -5
splurge_dsv-2025.3.1.dist-info/RECORD +13 -0
splurge_dsv/path_validator.py +0 -298
splurge_dsv/safe_text_file_reader.py +0 -177
splurge_dsv/safe_text_file_writer.py +0 -136
splurge_dsv/text_file_helper.py +0 -240
splurge_dsv-2025.2.0.dist-info/RECORD +0 -17
{splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/WHEEL +0 -0
{splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/entry_points.txt +0 -0
{splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/licenses/LICENSE +0 -0
{splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/top_level.txt +0 -0

splurge_dsv/dsv_helper.py CHANGED Viewed

@@ -11,11 +11,23 @@ This module is licensed under the MIT License.
 # Standard library imports
 from collections.abc import Iterator
 from os import PathLike
+from pathlib import Path
+import splurge_safe_io.constants as safe_io_constants
+import splurge_safe_io.path_validator as safe_io_path_validator
+import splurge_safe_io.safe_text_file_reader as safe_io_text_file_reader
 # Local imports
-from splurge_dsv.exceptions import SplurgeDsvParameterError
+from splurge_dsv.exceptions import (
+    SplurgeDsvColumnMismatchError,
+    SplurgeDsvError,
+    SplurgeDsvFileDecodingError,
+    SplurgeDsvFileNotFoundError,
+    SplurgeDsvFilePermissionError,
+    SplurgeDsvParameterError,
+    SplurgeDsvPathValidationError,
+)
 from splurge_dsv.string_tokenizer import StringTokenizer
-from splurge_dsv.text_file_helper import TextFileHelper
 class DsvHelper:
@@ -26,22 +38,31 @@ class DsvHelper:
     Supports configurable delimiters, text bookends, and whitespace handling options.
     """
-    DEFAULT_CHUNK_SIZE = 500  # Default chunk size for streaming operations
-    DEFAULT_ENCODING = "utf-8"  # Default text encoding for file operations
-    DEFAULT_SKIP_HEADER_ROWS = 0  # Default number of header rows to skip
-    DEFAULT_SKIP_FOOTER_ROWS = 0  # Default number of footer rows to skip
-    DEFAULT_MIN_CHUNK_SIZE = 100
+    DEFAULT_CHUNK_SIZE = safe_io_constants.DEFAULT_CHUNK_SIZE
+    # When detecting normalize_columns across a stream, how many chunks to scan
+    # before giving up. Scanning more chunks increases work but helps if the
+    # first logical row starts later than the first chunk (e.g., many blank lines
+    # or very small chunks). Keep small by default to avoid buffering too much.
+    MAX_DETECT_CHUNKS = 10
+    DEFAULT_ENCODING = "utf-8"
+    DEFAULT_SKIP_HEADER_ROWS = 0
+    DEFAULT_SKIP_FOOTER_ROWS = 0
+    DEFAULT_MIN_CHUNK_SIZE = safe_io_constants.MIN_CHUNK_SIZE
     DEFAULT_STRIP = True
     DEFAULT_BOOKEND_STRIP = True
-    @staticmethod
+    @classmethod
     def parse(
+        cls,
         content: str,
         *,
         delimiter: str,
         strip: bool = DEFAULT_STRIP,
         bookend: str | None = None,
         bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
+        normalize_columns: int = 0,
+        raise_on_missing_columns: bool = False,
+        raise_on_extra_columns: bool = False,
     ) -> list[str]:
         """Parse a single DSV line into tokens.
@@ -56,12 +77,18 @@ class DsvHelper:
             strip: If True, strip leading/trailing whitespace from each token.
             bookend: Optional bookend character to remove from token ends.
             bookend_strip: If True, strip whitespace after removing bookends.
+            normalize_columns: If > 0, ensure the returned list has exactly this many columns,
+                padding with empty strings or truncating as needed.
+            raise_on_missing_columns: If True, raise an error if the line has fewer columns than ``normalize_columns``.
+            raise_on_extra_columns: If True, raise an error if the line has more columns than
         Returns:
             A list of parsed token strings.
         Raises:
             SplurgeDsvParameterError: If ``delimiter`` is empty or None.
+            SplurgeDsvParameterError: If ``normalize_columns`` is negative.
+            SplurgeDsvColumnMismatchError: If column validation fails.
         Examples:
             >>> DsvHelper.parse("a,b,c", delimiter=",")
@@ -77,8 +104,79 @@ class DsvHelper:
         if bookend:
             tokens = [StringTokenizer.remove_bookends(token, bookend=bookend, strip=bookend_strip) for token in tokens]
+        # If requested, validate columns (raises) and/or normalize the row length
+        if normalize_columns and normalize_columns > 0:
+            # Validation is only performed if the caller asked for raises
+            if raise_on_missing_columns or raise_on_extra_columns:
+                cls._validate_columns(
+                    len(tokens),
+                    expected_columns=normalize_columns,
+                    raise_on_missing_columns=raise_on_missing_columns,
+                    raise_on_extra_columns=raise_on_extra_columns,
+                )
+            tokens = cls._normalize_columns(tokens, expected_columns=normalize_columns)
         return tokens
+    @classmethod
+    def _normalize_columns(cls, row: list[str], *, expected_columns: int) -> list[str]:
+        """Normalize a token list to the expected number of columns.
+        If the row has fewer columns than expected, append empty strings to reach
+        the expected length. If the row has more columns than expected, truncate
+        the excess columns.
+        Args:
+            row: The list of tokens to normalize.
+            expected_columns: Desired number of columns.
+        Returns:
+            A new list of tokens with length == expected_columns.
+        Raises:
+            SplurgeDsvParameterError: If ``expected_columns`` is negative.
+        """
+        if expected_columns < 0:
+            raise SplurgeDsvParameterError("expected_columns must be non-negative")
+        current = len(row)
+        if current == expected_columns:
+            return row
+        if current < expected_columns:
+            # append empty strings
+            return row + [""] * (expected_columns - current)
+        # current > expected -> truncate
+        return row[:expected_columns]
+    @classmethod
+    def _validate_columns(
+        cls, actual_columns: int, *, expected_columns: int, raise_on_missing_columns: bool, raise_on_extra_columns: bool
+    ) -> None:
+        """Validate column count against expected_columns.
+        Raises a SplurgeDsvError (or a more specific subclass) when the
+        validation fails according to the provided flags.
+        Args:
+            actual_columns: The actual number of columns in the row.
+            expected_columns: The expected number of columns.
+            raise_on_missing_columns: If True, raise an error if actual_columns < expected_columns.
+            raise_on_extra_columns: If True, raise an error if actual_columns > expected_columns.
+        Raises:
+            SplurgeDsvColumnMismatchError: If column validation fails.
+            SplurgeDsvParameterError: If ``expected_columns`` is negative.
+        """
+        if expected_columns < 0:
+            raise SplurgeDsvParameterError("expected_columns must be non-negative")
+        if raise_on_missing_columns and actual_columns < expected_columns:
+            raise SplurgeDsvColumnMismatchError(f"Row is missing columns: ({actual_columns} < {expected_columns})")
+        if raise_on_extra_columns and actual_columns > expected_columns:
+            raise SplurgeDsvColumnMismatchError(f"Row has extra columns: ({actual_columns} > {expected_columns})")
     @classmethod
     def parses(
         cls,
@@ -88,6 +186,10 @@ class DsvHelper:
         strip: bool = DEFAULT_STRIP,
         bookend: str | None = None,
         bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
+        normalize_columns: int = 0,
+        raise_on_missing_columns: bool = False,
+        raise_on_extra_columns: bool = False,
+        detect_columns: bool = False,
     ) -> list[list[str]]:
         """Parse multiple DSV lines.
@@ -100,13 +202,19 @@ class DsvHelper:
             strip: If True, strip whitespace from tokens.
             bookend: Optional bookend character to remove from tokens.
             bookend_strip: If True, strip whitespace after removing bookends.
+            normalize_columns: If > 0, ensure each returned list has exactly this many columns,
+                padding with empty strings or truncating as needed.
+            raise_on_missing_columns: If True, raise an error if a line has fewer columns than ``normalize_columns``.
+            raise_on_extra_columns: If True, raise an error if a line has more columns than ``normalize_columns``.
+            detect_columns: If True and ``normalize_columns`` is not set or <= 0, detect the number of columns from the content.
         Returns:
             A list of token lists, one per input line.
         Raises:
-            SplurgeDsvParameterError: If ``content`` is not a list of strings or
-                if ``delimiter`` is empty or None.
+            SplurgeDsvParameterError: If ``content`` is not a list of strings, or
+                if ``delimiter`` is empty or None, or if ``normalize_columns`` is negative.
+            SplurgeDsvColumnMismatchError: If column validation fails.
         Example:
             >>> DsvHelper.parses(["a,b,c", "d,e,f"], delimiter=",")
@@ -118,15 +226,82 @@ class DsvHelper:
         if not all(isinstance(item, str) for item in content):
             raise SplurgeDsvParameterError("content must be a list of strings")
+        # If requested, detect expected columns from the first logical row
+        if detect_columns and (not normalize_columns or normalize_columns <= 0):
+            if not content:
+                return []
+            # Find the first non-blank logical row in the provided content
+            first_non_blank = None
+            for ln in content:
+                if isinstance(ln, str) and ln.strip() != "":
+                    first_non_blank = ln
+                    break
+            if first_non_blank is None:
+                return []
+            detected = cls.parse(
+                first_non_blank,
+                delimiter=delimiter,
+                strip=strip,
+                bookend=bookend,
+                bookend_strip=bookend_strip,
+                normalize_columns=0,
+                raise_on_missing_columns=False,
+                raise_on_extra_columns=False,
+            )
+            normalize_columns = len(detected)
         return [
-            cls.parse(item, delimiter=delimiter, strip=strip, bookend=bookend, bookend_strip=bookend_strip)
+            cls.parse(
+                item,
+                delimiter=delimiter,
+                strip=strip,
+                bookend=bookend,
+                bookend_strip=bookend_strip,
+                normalize_columns=normalize_columns,
+                raise_on_missing_columns=raise_on_missing_columns,
+                raise_on_extra_columns=raise_on_extra_columns,
+            )
             for item in content
         ]
+    @staticmethod
+    def _validate_file_path(
+        file_path: Path | str, *, must_exist: bool = True, must_be_file: bool = True, must_be_readable: bool = True
+    ) -> Path:
+        """Validate the provided file path.
+        Args:
+            file_path: The file path to validate.
+        Returns:
+            A validated Path object.
+        Raises:
+            SplurgeDsvPathValidationError: If the file path is invalid.
+            SplurgeDsvFileNotFoundError: If the file does not exist.
+            SplurgeDsvFilePermissionError: If the file cannot be accessed due to permission restrictions
+            SplurgeDsvError: For other unexpected errors.
+        """
+        try:
+            effective_path = safe_io_path_validator.PathValidator.validate_path(
+                Path(file_path), must_exist=must_exist, must_be_file=must_be_file, must_be_readable=must_be_readable
+            )
+        except safe_io_path_validator.SplurgeSafeIoPathValidationError as ex:
+            raise SplurgeDsvPathValidationError(f"Invalid file path: {file_path}") from ex
+        except safe_io_path_validator.SplurgeSafeIoFileNotFoundError as ex:
+            raise SplurgeDsvFileNotFoundError(f"File not found: {file_path}") from ex
+        except safe_io_path_validator.SplurgeSafeIoFilePermissionError as ex:
+            raise SplurgeDsvFilePermissionError(f"File permission error: {file_path}") from ex
+        except Exception as ex:
+            raise SplurgeDsvError(f"Unexpected error validating file path: {file_path}") from ex
+        return effective_path
     @classmethod
     def parse_file(
         cls,
-        file_path: PathLike[str] | str,
+        file_path: PathLike[str] | Path | str,
         *,
         delimiter: str,
         strip: bool = DEFAULT_STRIP,
@@ -135,11 +310,16 @@ class DsvHelper:
         encoding: str = DEFAULT_ENCODING,
         skip_header_rows: int = DEFAULT_SKIP_HEADER_ROWS,
         skip_footer_rows: int = DEFAULT_SKIP_FOOTER_ROWS,
+        skip_empty_lines: bool = False,
+        normalize_columns: int = 0,
+        raise_on_missing_columns: bool = False,
+        raise_on_extra_columns: bool = False,
+        detect_columns: bool = False,
     ) -> list[list[str]]:
         """Read and parse an entire DSV file.
         This convenience reads all lines from ``file_path`` using
-        :class:`splurge_dsv.text_file_helper.TextFileHelper` and then parses each
+        :class:`splurge_safe_io.safe_text_file_reader.SafeTextFileReader` and then parses each
         line into tokens. Header and footer rows may be skipped via the
         ``skip_header_rows`` and ``skip_footer_rows`` parameters.
@@ -152,23 +332,64 @@ class DsvHelper:
             encoding: Text encoding to use when reading the file.
             skip_header_rows: Number of leading lines to ignore.
             skip_footer_rows: Number of trailing lines to ignore.
+            normalize_columns: Number of columns to normalize.
+            raise_on_missing_columns: Raise an error if a line has fewer columns than ``normalize_columns``.
+            raise_on_extra_columns: Raise an error if a line has more columns than ``normalize_columns``.
         Returns:
             A list of token lists (one list per non-skipped line).
         Raises:
-            SplurgeDsvParameterError: If ``delimiter`` is empty or None.
+            SplurgeDsvParameterError: If ``delimiter`` is empty or None, or if ``normalize_columns`` is negative.
             SplurgeDsvFileNotFoundError: If the file at ``file_path`` does not exist.
-            SplurgeDsvFilePermissionError: If the file cannot be accessed due to
-                permission restrictions.
-            SplurgeDsvFileEncodingError: If the file cannot be decoded using
-                the provided ``encoding``.
+            SplurgeDsvFilePermissionError: If the file cannot be accessed due to permission restrictions.
+            SplurgeDsvFileDecodingError: If the file cannot be decoded using the provided ``encoding``.
+            SplurgeDsvPathValidationError: If the file path is invalid.
+            SplurgeDsvColumnMismatchError: If column validation fails.
+            SplurgeDsvError: For other unexpected errors.
         """
-        lines: list[str] = TextFileHelper.read(
-            file_path, encoding=encoding, skip_header_rows=skip_header_rows, skip_footer_rows=skip_footer_rows
-        )
+        effective_file_path = cls._validate_file_path(Path(file_path))
-        return cls.parses(lines, delimiter=delimiter, strip=strip, bookend=bookend, bookend_strip=bookend_strip)
+        skip_header_rows = max(skip_header_rows, cls.DEFAULT_SKIP_HEADER_ROWS)
+        skip_footer_rows = max(skip_footer_rows, cls.DEFAULT_SKIP_FOOTER_ROWS)
+        try:
+            reader = safe_io_text_file_reader.SafeTextFileReader(
+                effective_file_path,
+                encoding=encoding,
+                skip_header_lines=skip_header_rows,
+                skip_footer_lines=skip_footer_rows,
+                strip=strip,
+                skip_empty_lines=skip_empty_lines,
+            )
+            lines: list[str] = reader.read()
+        except safe_io_text_file_reader.SplurgeSafeIoFileDecodingError as ex:
+            raise SplurgeDsvFileDecodingError(f"File decoding error: {effective_file_path}") from ex
+        except safe_io_text_file_reader.SplurgeSafeIoFilePermissionError as ex:
+            raise SplurgeDsvFilePermissionError(f"File permission error: {effective_file_path}") from ex
+        except safe_io_text_file_reader.SplurgeSafeIoOsError as ex:
+            raise SplurgeDsvFilePermissionError(f"File access error: {effective_file_path}") from ex
+        except Exception as ex:
+            # If the exception is already a SplurgeDsvError (or subclass),
+            # re-raise it unchanged so callers can handle specific errors
+            # (for example, SplurgeDsvColumnMismatchError from validation).
+            if isinstance(ex, SplurgeDsvError):
+                raise
+            raise SplurgeDsvError(f"Unexpected error reading file: {effective_file_path}") from ex
+        return cls.parses(
+            lines,
+            delimiter=delimiter,
+            strip=strip,
+            bookend=bookend,
+            bookend_strip=bookend_strip,
+            normalize_columns=normalize_columns,
+            raise_on_missing_columns=raise_on_missing_columns,
+            raise_on_extra_columns=raise_on_extra_columns,
+            detect_columns=detect_columns,
+        )
     @classmethod
     def _process_stream_chunk(
@@ -179,10 +400,13 @@ class DsvHelper:
         strip: bool = DEFAULT_STRIP,
         bookend: str | None = None,
         bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
+        normalize_columns: int = 0,
+        raise_on_missing_columns: bool = False,
+        raise_on_extra_columns: bool = False,
     ) -> list[list[str]]:
         """Parse a chunk of lines into tokenized rows.
-        Designed to be used by :meth:`parse_stream` as a helper for converting a
+        Designed to be used by :meth:`parse_file_stream` as a helper for converting a
         batch of raw lines into parsed rows.
         Args:
@@ -191,15 +415,34 @@ class DsvHelper:
             strip: If True, strip whitespace from tokens.
             bookend: Optional bookend character to remove from tokens.
             bookend_strip: If True, strip whitespace after removing bookends.
+            normalize_columns: If > 0, ensure each returned list has exactly this many columns,
+                padding with empty strings or truncating as needed.
+            raise_on_missing_columns: If True, raise an error if a line has fewer columns than ``normalize_columns``.
+            raise_on_extra_columns: If True, raise an error if a line has more columns than ``normalize_columns``.
+        Raises:
+            SplurgeDsvParameterError: If ``delimiter`` is empty or None,
+                or if ``normalize_columns`` is negative,
+                or if ``chunk`` is not a list of strings, or if any element in ``chunk`` is not a string.
+            SplurgeDsvColumnMismatchError: If column validation fails.
         Returns:
             A list where each element is the token list for a corresponding
             input line from ``chunk``.
         """
-        return cls.parses(chunk, delimiter=delimiter, strip=strip, bookend=bookend, bookend_strip=bookend_strip)
+        return cls.parses(
+            chunk,
+            delimiter=delimiter,
+            strip=strip,
+            bookend=bookend,
+            bookend_strip=bookend_strip,
+            normalize_columns=normalize_columns,
+            raise_on_missing_columns=raise_on_missing_columns,
+            raise_on_extra_columns=raise_on_extra_columns,
+        )
     @classmethod
-    def parse_stream(
+    def parse_file_stream(
         cls,
         file_path: PathLike[str] | str,
         *,
@@ -210,10 +453,19 @@ class DsvHelper:
         encoding: str = DEFAULT_ENCODING,
         skip_header_rows: int = DEFAULT_SKIP_HEADER_ROWS,
         skip_footer_rows: int = DEFAULT_SKIP_FOOTER_ROWS,
+        skip_empty_lines: bool = False,
+        normalize_columns: int = 0,
+        raise_on_missing_columns: bool = False,
+        raise_on_extra_columns: bool = False,
+        detect_columns: bool = False,
         chunk_size: int = DEFAULT_CHUNK_SIZE,
+        # How many chunks to scan when attempting to detect normalize_columns
+        # from the beginning of a stream. Only used when
+        # `detect_columns is True` and `normalize_columns` is falsy.
+        max_detect_chunks: int = MAX_DETECT_CHUNKS,
     ) -> Iterator[list[list[str]]]:
         """
-        Stream-parse a DSV file in chunks of lines.
+        Stream-parse a DSV file into chunks of lines.
         Args:
             file_path (PathLike[str] | str): The path to the file to parse.
@@ -224,34 +476,156 @@ class DsvHelper:
             encoding (str): The file encoding.
             skip_header_rows (int): Number of header rows to skip.
             skip_footer_rows (int): Number of footer rows to skip.
+            normalize_columns (int): If > 0, ensure each returned list has exactly this many columns,
+                padding with empty strings or truncating as needed.
+            raise_on_missing_columns (bool): If True, raise an error if a line has fewer columns than ``normalize_columns``.
+            raise_on_extra_columns (bool): If True, raise an error if a line has more columns than ``normalize_columns``.
+            detect_columns (bool): If True and ``normalize_columns`` is not set or <= 0,
+                detect the expected number of columns from the first non-blank logical row.
             chunk_size (int): Number of lines per chunk (default: 100).
+            max_detect_chunks (int): When detecting columns, how many chunks to scan
+                from the start of the stream before giving up (default: 10).
         Yields:
             list[list[str]]: Parsed rows for each chunk.
         Raises:
-            SplurgeParameterError: If delimiter is empty or None.
-            SplurgeFileNotFoundError: If the file does not exist.
-            SplurgeFilePermissionError: If the file cannot be accessed.
-            SplurgeFileEncodingError: If the file cannot be decoded with the specified encoding.
+            SplurgeDsvParameterError: If delimiter is empty or None, or if ``normalize_columns`` is negative,
+                or if ``chunk`` is not a list of strings, or if any element in ``chunk`` is not a string.
+            SplurgeDsvFileNotFoundError: If the file does not exist.
+            SplurgeDsvFilePermissionError: If the file cannot be accessed.
+            SplurgeDsvFileDecodingError: If the file cannot be decoded with the specified encoding.
+            SplurgeDsvPathValidationError: If the file path is invalid.
+            SplurgeDsvError: For other unexpected errors.
+            SplurgeDsvColumnMismatchError: If column validation fails.
         """
-        if delimiter is None or delimiter == "":
-            raise SplurgeDsvParameterError("delimiter cannot be empty or None")
+        effective_file_path = cls._validate_file_path(Path(file_path))
         chunk_size = max(chunk_size, cls.DEFAULT_MIN_CHUNK_SIZE)
         skip_header_rows = max(skip_header_rows, cls.DEFAULT_SKIP_HEADER_ROWS)
         skip_footer_rows = max(skip_footer_rows, cls.DEFAULT_SKIP_FOOTER_ROWS)
-        # Use TextFileHelper.read_as_stream for consistent error handling
-        yield from (
-            cls._process_stream_chunk(
-                chunk, delimiter=delimiter, strip=strip, bookend=bookend, bookend_strip=bookend_strip
-            )
-            for chunk in TextFileHelper.read_as_stream(
-                file_path,
+        # Allow callers to pass None to use the module default. Ensure we have
+        # a positive integer to drive the detection loop.
+        if max_detect_chunks is None:
+            max_detect_chunks = cls.MAX_DETECT_CHUNKS
+        else:
+            max_detect_chunks = max(int(max_detect_chunks), 1)
+        try:
+            reader = safe_io_text_file_reader.SafeTextFileReader(
+                effective_file_path,
                 encoding=encoding,
-                skip_header_rows=skip_header_rows,
-                skip_footer_rows=skip_footer_rows,
+                skip_header_lines=skip_header_rows,
+                skip_footer_lines=skip_footer_rows,
+                strip=strip,
+                skip_empty_lines=skip_empty_lines,
                 chunk_size=chunk_size,
             )
-        )
+            stream_iter = reader.read_as_stream()
+            if detect_columns and (not normalize_columns or normalize_columns <= 0):
+                # Buffer up to `max_detect_chunks` from the stream while
+                # searching for the first non-blank logical row. This allows us
+                # to detect the expected column count even if the first logical
+                # row doesn't appear in the very first chunk (for example,
+                # when the file begins with many blank lines or very small
+                # chunks).
+                buffered_chunks: list[list[str]] = []
+                max_scan = max_detect_chunks if max_detect_chunks is not None else cls.MAX_DETECT_CHUNKS
+                chunks_scanned = 0
+                while chunks_scanned < max_scan:
+                    try:
+                        chunk = next(stream_iter)
+                    except StopIteration:
+                        break
+                    buffered_chunks.append(chunk)
+                    # Inspect this chunk for the first non-blank logical row
+                    first_line = None
+                    for ln in chunk:
+                        if isinstance(ln, str) and ln.strip() != "":
+                            first_line = ln
+                            break
+                    if first_line is not None:
+                        detected = cls.parse(
+                            first_line,
+                            delimiter=delimiter,
+                            strip=strip,
+                            bookend=bookend,
+                            bookend_strip=bookend_strip,
+                            normalize_columns=0,
+                            raise_on_missing_columns=False,
+                            raise_on_extra_columns=False,
+                        )
+                        normalize_columns = len(detected)
+                        # remember which buffered chunk contained the first
+                        # logical row so we can start applying normalization
+                        # beginning with that chunk only
+                        detected_index = len(buffered_chunks) - 1
+                        break
+                    chunks_scanned += 1
+                # Replay any buffered chunks (in order) so callers receive the
+                # full content starting at the beginning of the file. If we
+                # detected the first logical row in one of the buffered chunks
+                # then only apply normalization beginning with that chunk;
+                # earlier buffered chunks must be emitted without
+                # normalization so we don't convert blank-only lines into
+                # padded empty-token rows.
+                if "detected_index" in locals():
+                    for idx, b in enumerate(buffered_chunks):
+                        use_norm = normalize_columns if idx == detected_index else 0
+                        yield cls._process_stream_chunk(
+                            b,
+                            delimiter=delimiter,
+                            strip=strip,
+                            bookend=bookend,
+                            bookend_strip=bookend_strip,
+                            normalize_columns=use_norm,
+                            raise_on_missing_columns=raise_on_missing_columns,
+                            raise_on_extra_columns=raise_on_extra_columns,
+                        )
+                else:
+                    for b in buffered_chunks:
+                        yield cls._process_stream_chunk(
+                            b,
+                            delimiter=delimiter,
+                            strip=strip,
+                            bookend=bookend,
+                            bookend_strip=bookend_strip,
+                            normalize_columns=0,
+                            raise_on_missing_columns=raise_on_missing_columns,
+                            raise_on_extra_columns=raise_on_extra_columns,
+                        )
+            # Continue streaming the rest of the file
+            for chunk in stream_iter:
+                yield cls._process_stream_chunk(
+                    chunk,
+                    delimiter=delimiter,
+                    strip=strip,
+                    bookend=bookend,
+                    bookend_strip=bookend_strip,
+                    normalize_columns=normalize_columns,
+                    raise_on_missing_columns=raise_on_missing_columns,
+                    raise_on_extra_columns=raise_on_extra_columns,
+                )
+        except safe_io_text_file_reader.SplurgeSafeIoFileDecodingError as ex:
+            raise SplurgeDsvFileDecodingError(f"File decoding error: {effective_file_path}") from ex
+        except safe_io_text_file_reader.SplurgeSafeIoFilePermissionError as ex:
+            raise SplurgeDsvFilePermissionError(f"File permission error: {effective_file_path}") from ex
+        except safe_io_text_file_reader.SplurgeSafeIoOsError as ex:
+            raise SplurgeDsvFilePermissionError(f"File access error: {effective_file_path}") from ex
+        except Exception as ex:
+            # Preserve and re-raise known SplurgeDsvError subclasses so
+            # callers can handle specific errors (e.g. column mismatch) as
+            # intended. Only wrap unknown exceptions in a generic
+            # SplurgeDsvError.
+            if isinstance(ex, SplurgeDsvError):
+                raise
+            raise SplurgeDsvError(f"Unexpected error reading file: {effective_file_path}") from ex

splurge_dsv/exceptions.py CHANGED Viewed

@@ -65,6 +65,15 @@ class SplurgeDsvFileNotFoundError(SplurgeDsvFileOperationError):
     """
+class SplurgeDsvFileExistsError(SplurgeDsvFileOperationError):
+    """Raised when attempting to create a file that already exists.
+    This typically maps to ``FileExistsError`` semantics but uses the
+    package-specific exception hierarchy so callers can distinguish
+    file errors from other error types.
+    """
 class SplurgeDsvFilePermissionError(SplurgeDsvFileOperationError):
     """Raised for permission or access-related file errors.
@@ -73,7 +82,7 @@ class SplurgeDsvFilePermissionError(SplurgeDsvFileOperationError):
     """
-class SplurgeDsvFileEncodingError(SplurgeDsvFileOperationError):
+class SplurgeDsvFileDecodingError(SplurgeDsvFileOperationError):
     """Raised when decoding or encoding a text file fails.
     The exception typically wraps the underlying decoding error and
@@ -81,6 +90,14 @@ class SplurgeDsvFileEncodingError(SplurgeDsvFileOperationError):
     """
+class SplurgeDsvFileEncodingError(SplurgeDsvFileOperationError):
+    """Raised when encoding a text file fails.
+    The exception typically wraps the underlying encoding error and
+    provides a descriptive message and optional details for diagnostics.
+    """
 class SplurgeDsvPathValidationError(SplurgeDsvFileOperationError):
     """Raised when a provided filesystem path fails validation checks.
@@ -101,6 +118,10 @@ class SplurgeDsvParsingError(SplurgeDsvDataProcessingError):
     """Raised when parsing fails due to malformed or unexpected content."""
+class SplurgeDsvColumnMismatchError(SplurgeDsvDataProcessingError):
+    """Raised when a row has a different number of columns than expected."""
 class SplurgeDsvTypeConversionError(SplurgeDsvDataProcessingError):
     """Raised when a value cannot be converted to the requested type."""

splurge-dsv 2025.2.0__py3-none-any.whl → 2025.3.1__py3-none-any.whl

splurge-dsv 2025.2.0py3-none-any.whl → 2025.3.1py3-none-any.whl