splurge-dsv 2025.2.1-py3-none-any.whl → 2025.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- splurge_dsv/__init__.py +16 -5
- splurge_dsv/cli.py +137 -26
- splurge_dsv/dsv.py +100 -30
- splurge_dsv/dsv_helper.py +415 -90
- splurge_dsv/exceptions.py +22 -1
- splurge_dsv/string_tokenizer.py +7 -1
- {splurge_dsv-2025.2.1.dist-info → splurge_dsv-2025.3.1.dist-info}/METADATA +78 -5
- splurge_dsv-2025.3.1.dist-info/RECORD +13 -0
- splurge_dsv/path_validator.py +0 -298
- splurge_dsv/safe_text_file_reader.py +0 -177
- splurge_dsv/safe_text_file_writer.py +0 -136
- splurge_dsv/text_file_helper.py +0 -240
- splurge_dsv-2025.2.1.dist-info/RECORD +0 -17
- {splurge_dsv-2025.2.1.dist-info → splurge_dsv-2025.3.1.dist-info}/WHEEL +0 -0
- {splurge_dsv-2025.2.1.dist-info → splurge_dsv-2025.3.1.dist-info}/entry_points.txt +0 -0
- {splurge_dsv-2025.2.1.dist-info → splurge_dsv-2025.3.1.dist-info}/licenses/LICENSE +0 -0
- {splurge_dsv-2025.2.1.dist-info → splurge_dsv-2025.3.1.dist-info}/top_level.txt +0 -0
splurge_dsv/__init__.py
CHANGED
@@ -13,9 +13,20 @@ Copyright (c) 2025 Jim Schilling
 # test cases may remove the process working directory which causes calls to
 # os.getcwd() to raise FileNotFoundError later during test execution. Guard
 # against that here by switching to this package directory when cwd is missing.
+# Ensure the required external implementation is available on import so the
+# rest of the package can rely on its APIs. Fail fast with a helpful message
+# instructing the user to install the package if it's missing.
+import importlib as _importlib
 import os
 from pathlib import Path as _Path

+try:  # pragma: no cover - import-time guard
+    _importlib.import_module("splurge_safe_io")
+except Exception as e:
+    raise ImportError(
+        "Missing required dependency 'splurge-safe-io'. Please install it: `pip install splurge-safe-io`"
+    ) from e
+
 try:
     try:
         # os.getcwd() can raise FileNotFoundError in CI/runner environments
@@ -35,11 +46,13 @@ except Exception:
 from splurge_dsv.dsv import Dsv, DsvConfig
 from splurge_dsv.dsv_helper import DsvHelper
 from splurge_dsv.exceptions import (
+    SplurgeDsvColumnMismatchError,
     SplurgeDsvConfigurationError,
     SplurgeDsvDataProcessingError,
     # canonical SplurgeDsv* exception names
     SplurgeDsvError,
     SplurgeDsvFileEncodingError,
+    SplurgeDsvFileExistsError,
     SplurgeDsvFileNotFoundError,
     SplurgeDsvFileOperationError,
     SplurgeDsvFilePermissionError,
@@ -56,11 +69,9 @@ from splurge_dsv.exceptions import (
     SplurgeDsvTypeConversionError,
     SplurgeDsvValidationError,
 )
-from splurge_dsv.path_validator import PathValidator
 from splurge_dsv.string_tokenizer import StringTokenizer
-from splurge_dsv.text_file_helper import TextFileHelper

-__version__ = "2025.2.1"
+__version__ = "2025.3.1"
 __author__ = "Jim Schilling"
 __license__ = "MIT"

@@ -79,6 +90,7 @@ __all__ = [
     "SplurgeDsvPathValidationError",
     "SplurgeDsvDataProcessingError",
     "SplurgeDsvParsingError",
+    "SplurgeDsvColumnMismatchError",
     "SplurgeDsvTypeConversionError",
     "SplurgeDsvStreamingError",
     "SplurgeDsvConfigurationError",
@@ -89,8 +101,7 @@ __all__ = [
     "SplurgeDsvParameterError",
     "SplurgeDsvRangeError",
     "SplurgeDsvFormatError",
+    "SplurgeDsvFileExistsError",
     # Utility classes
     "StringTokenizer",
-    "TextFileHelper",
-    "PathValidator",
 ]
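The practical effect of the new import-time guard: `import splurge_dsv` now fails immediately when the `splurge-safe-io` backend is missing, instead of failing later at the first file operation. A minimal caller-side sketch (the wrapper below is illustrative, not part of the package):

    # Sketch: observing the 2025.3.1 fail-fast import behavior.
    try:
        import splurge_dsv  # raises ImportError if splurge-safe-io is absent
    except ImportError as exc:
        # Message: "Missing required dependency 'splurge-safe-io'. Please install it: ..."
        raise SystemExit(str(exc))

Note also that `TextFileHelper` and `PathValidator` leave the public API (their modules are deleted in this release), so callers importing them from `splurge_dsv` will now get an ImportError; the new `splurge-safe-io` dependency presumably supplies the replacement file I/O and path validation.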
splurge_dsv/cli.py
CHANGED
@@ -23,6 +23,7 @@ from pathlib import Path
 # Local imports
 from splurge_dsv import __version__
 from splurge_dsv.dsv import Dsv, DsvConfig
+from splurge_dsv.dsv_helper import DsvHelper
 from splurge_dsv.exceptions import SplurgeDsvError


@@ -39,14 +40,31 @@ def parse_arguments() -> argparse.Namespace:
         epilog="""
 Examples:
     python -m splurge_dsv data.csv --delimiter ,
-    python -m splurge_dsv data.tsv --delimiter "	"
+    python -m splurge_dsv data.tsv --delimiter "\t"
     python -m splurge_dsv data.txt --delimiter "|" --bookend '"'
-    """,
+    # Auto-detect the expected column count and normalize rows
+    python -m splurge_dsv data.csv --delimiter , --detect-columns --max-detect-chunks 5
+    # Stream a large file while attempting to detect the column count from the first non-blank logical row
+    python -m splurge_dsv large.csv --delimiter , --stream --detect-columns --max-detect-chunks 10
+    """,
     )

     parser.add_argument("file_path", type=str, help="Path to the DSV file to parse")

-    parser.add_argument(
+    parser.add_argument(
+        "--config",
+        "-c",
+        dest="config",
+        type=str,
+        help="Path to a YAML config file that mirrors CLI options (values overridden by CLI args)",
+    )
+
+    parser.add_argument(
+        "--delimiter",
+        "-d",
+        type=str,
+        help="Delimiter character to use for parsing (may also be provided via --config)",
+    )

     parser.add_argument("--bookend", "-b", type=str, help="Bookend character for text fields (e.g., '\"')")

@@ -64,7 +82,53 @@ Examples:
         "--stream", "-s", action="store_true", help="Stream the file in chunks instead of loading entirely into memory"
     )

-    parser.add_argument(
+    parser.add_argument(
+        "--detect-columns",
+        action="store_true",
+        help=(
+            "Auto-detect the expected column count from the first non-blank logical row "
+            "and normalize subsequent rows to that count. For streamed parsing, the "
+            "detector may scan up to --max-detect-chunks chunks from the start of the file."
+        ),
+    )
+
+    parser.add_argument(
+        "--raise-on-missing-columns",
+        action="store_true",
+        help="Raise an error if a row has fewer columns than the detected/expected count",
+    )
+
+    parser.add_argument(
+        "--raise-on-extra-columns",
+        action="store_true",
+        help="Raise an error if a row has more columns than the detected/expected count",
+    )
+
+    parser.add_argument(
+        "--chunk-size",
+        type=int,
+        default=DsvHelper.DEFAULT_CHUNK_SIZE,
+        help=(
+            f"Chunk size for streaming (minimum: {DsvHelper.DEFAULT_MIN_CHUNK_SIZE}, "
+            f"default: {DsvHelper.DEFAULT_CHUNK_SIZE})"
+        ),
+    )
+
+    parser.add_argument(
+        "--max-detect-chunks",
+        type=int,
+        default=DsvHelper.MAX_DETECT_CHUNKS,
+        help=(
+            "When detecting columns while streaming (use --detect-columns), "
+            f"scan up to N chunks from the start of the stream before giving up (default: {DsvHelper.MAX_DETECT_CHUNKS})."
+        ),
+    )
+
+    parser.add_argument(
+        "--skip-empty-lines",
+        action="store_true",
+        help="Have the underlying reader skip raw empty logical lines (line.strip() == '') before parsing",
+    )

     parser.add_argument(
         "--output-format",
@@ -141,17 +205,56 @@ def run_cli() -> int:
         print(f"Error: '{args.file_path}' is not a file.", file=sys.stderr)
         return 1

+    # Build base config either from YAML file (if provided) or from CLI args
+    base_params = {}
+    if args.config:
+        try:
+            import yaml  # type: ignore
+
+            cfg_path = Path(args.config)
+            if not cfg_path.exists():
+                print(f"Error: Config file '{args.config}' not found.", file=sys.stderr)
+                return 1
+
+            with cfg_path.open("r", encoding="utf-8") as fh:
+                file_cfg = yaml.safe_load(fh) or {}
+
+            if not isinstance(file_cfg, dict):
+                print(f"Error: Config file '{args.config}' must contain a mapping/dictionary.", file=sys.stderr)
+                return 1
+
+            base_params.update(file_cfg)
+        except Exception as e:
+            print(f"Error reading config file '{args.config}': {e}", file=sys.stderr)
+            return 1
+
+    # CLI args override YAML values when provided. Build the parameter map
+    cli_params = {
+        "delimiter": args.delimiter,
+        "strip": not args.no_strip,
+        "bookend": args.bookend,
+        "bookend_strip": not args.no_bookend_strip,
+        "encoding": args.encoding,
+        "skip_header_rows": args.skip_header,
+        "skip_footer_rows": args.skip_footer,
+        "chunk_size": args.chunk_size,
+        "detect_columns": args.detect_columns,
+        "raise_on_missing_columns": args.raise_on_missing_columns,
+        "raise_on_extra_columns": args.raise_on_extra_columns,
+        "max_detect_chunks": args.max_detect_chunks,
+        "skip_empty_lines": args.skip_empty_lines,
+    }
+
+    # Merge: start from file (if any), then overlay CLI-provided values
+    merged = {**base_params, **{k: v for k, v in cli_params.items() if v is not None}}
+
     # Create configuration and Dsv instance for parsing
-    config = DsvConfig(
-        delimiter=args.delimiter,
-        strip=not args.no_strip,
-        bookend=args.bookend,
-        bookend_strip=not args.no_bookend_strip,
-        encoding=args.encoding,
-        skip_header_rows=args.skip_header,
-        skip_footer_rows=args.skip_footer,
-        chunk_size=args.chunk_size,
-    )
+    try:
+        config = DsvConfig.from_params(**merged)
+    except Exception as e:
+        print(f"Error building configuration: {e}", file=sys.stderr)
+        return 1
+    dsv = Dsv(config)
     dsv = Dsv(config)

     # Parse the file
@@ -161,18 +264,26 @@ def run_cli() -> int:
         chunk_count = 0
         total_rows = 0

-        for chunk in dsv.parse_file_stream(file_path):
-            chunk_count += 1
-            total_rows += len(chunk)
-            if args.output_format == "json":
-                print(json.dumps(chunk, ensure_ascii=False))
-            elif args.output_format == "ndjson":
-                for row in chunk:
-                    print(json.dumps(row, ensure_ascii=False))
-            else:
-                print(f"Chunk {chunk_count}: {len(chunk)} rows")
-                print_results(chunk, args.delimiter)
-                print()
+        try:
+            for chunk in dsv.parse_file_stream(file_path):
+                chunk_count += 1
+                total_rows += len(chunk)
+
+                if args.output_format == "json":
+                    print(json.dumps(chunk, ensure_ascii=False))
+                elif args.output_format == "ndjson":
+                    for row in chunk:
+                        print(json.dumps(row, ensure_ascii=False))
+                else:
+                    print(f"Chunk {chunk_count}: {len(chunk)} rows")
+                    print_results(chunk, args.delimiter)
+                    print()
+        except Exception as e:
+            print(f"Error during streaming: {e}", file=sys.stderr)
+            import traceback
+
+            traceback.print_exc(file=sys.stderr)
+            return 1

     if args.output_format not in ["json", "ndjson"]:
         print(f"Total: {total_rows} rows in {chunk_count} chunks")
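The new `--config`/`-c` option reads a YAML mapping whose keys mirror the CLI options (and the `DsvConfig` field names), with explicit CLI values taking precedence. A sketch; the file name and values below are illustrative:

    # dsv.yaml (illustrative) -- keys mirror DsvConfig fields
    delimiter: ","
    bookend: '"'
    skip_header_rows: 1
    detect_columns: true

    # CLI values override the file, so this run parses with "|":
    python -m splurge_dsv data.csv --config dsv.yaml --delimiter "|"

One caveat visible in the merge expression above (`{**base_params, **{k: v for k, v in cli_params.items() if v is not None}}`): only None-valued CLI entries defer to the file. Flags declared with `action="store_true"` and options with non-None defaults (such as `--chunk-size`) never produce None, so their CLI defaults appear to override YAML values like `detect_columns: true` even when the flag is not given on the command line.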
splurge_dsv/dsv.py
CHANGED
@@ -9,7 +9,7 @@ files, and streaming large inputs.

 Public API:
     - DsvConfig: Configuration dataclass for parsing behavior.
-    - Dsv: Parser instance that performs parse/parse_file/parse_stream.
+    - Dsv: Parser instance that performs parse/parse_file/parse_file_stream.

 License: MIT

@@ -17,10 +17,10 @@ Copyright (c) 2025 Jim Schilling
 """

 # Standard library imports
-import warnings
 from collections.abc import Iterator
 from dataclasses import dataclass, fields
 from os import PathLike
+from pathlib import Path

 # Local imports
 from splurge_dsv.dsv_helper import DsvHelper
@@ -43,6 +43,10 @@ class DsvConfig:
         skip_header_rows: Number of header rows to skip when reading files.
         skip_footer_rows: Number of footer rows to skip when reading files.
         chunk_size: Size of chunks for streaming operations.
+        detect_columns: Whether to auto-detect column count from data.
+        raise_on_missing_columns: If True, raise an error if rows have fewer columns than detected.
+        raise_on_extra_columns: If True, raise an error if rows have more columns than detected.
+        max_detect_chunks: Maximum number of chunks to scan for column detection.

     Raises:
         SplurgeDsvParameterError: If delimiter is empty, chunk_size is too
@@ -56,7 +60,16 @@ class DsvConfig:
     encoding: str = "utf-8"
     skip_header_rows: int = 0
     skip_footer_rows: int = 0
-    chunk_size: int = DsvHelper.DEFAULT_CHUNK_SIZE
+    # When True, instruct the underlying SafeTextFileReader to remove raw
+    # empty logical lines (where line.strip() == "") before returning
+    # content. Defaults to False to preserve historical behavior.
+    skip_empty_lines: bool = False
+    chunk_size: int = DsvHelper.DEFAULT_MIN_CHUNK_SIZE
+    # Column normalization and detection flags
+    detect_columns: bool = False
+    raise_on_missing_columns: bool = False
+    raise_on_extra_columns: bool = False
+    max_detect_chunks: int = DsvHelper.MAX_DETECT_CHUNKS

     def __post_init__(self) -> None:
         """Validate configuration after initialization.
@@ -137,6 +150,53 @@ class DsvConfig:
         filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_fields}
         return cls(**filtered_kwargs)

+    @classmethod
+    def from_file(cls, file_path: PathLike[str] | Path | str) -> "DsvConfig":
+        """
+        Load a YAML configuration file and return a DsvConfig instance.
+
+        The YAML should contain a mapping whose keys correspond to
+        DsvConfig field names (for example: delimiter, strip, bookend,
+        encoding, skip_header_rows, etc.). Unknown keys are ignored.
+
+        Args:
+            file_path: Path to the YAML configuration file.
+
+        Returns:
+            DsvConfig: Configuration object built from the YAML file.
+
+        Raises:
+            SplurgeDsvParameterError: If the file cannot be read, parsed,
+                or does not contain a mapping at the top level.
+        """
+        try:
+            import yaml  # type: ignore
+        except Exception as e:  # pragma: no cover - dependency issues surfaced elsewhere
+            raise SplurgeDsvParameterError(f"PyYAML is required to load config files: {e}") from e
+
+        p = Path(file_path)
+        if not p.exists():
+            raise SplurgeDsvParameterError(f"Config file '{file_path}' not found")
+
+        try:
+            with p.open("r", encoding="utf-8") as fh:
+                data = yaml.safe_load(fh) or {}
+        except Exception as e:
+            raise SplurgeDsvParameterError(f"Failed to read or parse config file '{file_path}': {e}") from e
+
+        if not isinstance(data, dict):
+            raise SplurgeDsvParameterError("Config file must contain a top-level mapping/dictionary of options")
+
+        # Filter and construct via existing from_params helper
+        valid_fields = {f.name for f in fields(cls)}
+        filtered = {k: v for k, v in data.items() if k in valid_fields}
+
+        # Ensure required values are present in the config (delimiter is required)
+        if "delimiter" not in filtered:
+            raise SplurgeDsvParameterError("Config file must include the required 'delimiter' option")
+
+        return cls.from_params(**filtered)
+

 class Dsv:
     """Parser class that binds a :class:`DsvConfig` to parsing operations.
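A short usage sketch for the `DsvConfig.from_file` classmethod added above (file names are illustrative; the YAML must be a mapping that includes the required `delimiter` key):

    from splurge_dsv import Dsv, DsvConfig

    # Unknown keys in the mapping are ignored; a missing 'delimiter' key
    # raises SplurgeDsvParameterError, per the docstring above.
    config = DsvConfig.from_file("dsv.yaml")
    rows = Dsv(config).parse_file("data.csv")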
@@ -173,6 +233,7 @@ class Dsv:

         Raises:
             SplurgeDsvParameterError: If the configured delimiter is invalid.
+            SplurgeDsvColumnMismatchError: If column validation fails.
         """
         return DsvHelper.parse(
             content,
@@ -180,6 +241,9 @@ class Dsv:
             strip=self.config.strip,
             bookend=self.config.bookend,
             bookend_strip=self.config.bookend_strip,
+            normalize_columns=0,
+            raise_on_missing_columns=self.config.raise_on_missing_columns,
+            raise_on_extra_columns=self.config.raise_on_extra_columns,
         )

     def parses(self, content: list[str]) -> list[list[str]]:
@@ -192,6 +256,10 @@ class Dsv:
         Returns:
             List of lists of parsed strings

+        Raises:
+            SplurgeDsvParameterError: If the configured delimiter is invalid.
+            SplurgeDsvColumnMismatchError: If column validation fails.
+
         Example:
             >>> parser = Dsv(DsvConfig(delimiter=","))
             >>> parser.parses(["a,b", "c,d"])
@@ -203,9 +271,13 @@ class Dsv:
             strip=self.config.strip,
             bookend=self.config.bookend,
             bookend_strip=self.config.bookend_strip,
+            normalize_columns=0,
+            raise_on_missing_columns=self.config.raise_on_missing_columns,
+            raise_on_extra_columns=self.config.raise_on_extra_columns,
+            detect_columns=self.config.detect_columns,
         )

-    def parse_file(self, file_path: PathLike[str] | str) -> list[list[str]]:
+    def parse_file(self, file_path: PathLike[str] | Path | str) -> list[list[str]]:
         """Parse a DSV file and return all rows as lists of strings.

         Args:
@@ -215,10 +287,13 @@ class Dsv:
             A list of rows, where each row is a list of string tokens.

         Raises:
+            SplurgeDsvPathValidationError: If the file path is invalid.
             SplurgeDsvFileNotFoundError: If the file cannot be found.
             SplurgeDsvFilePermissionError: If the file cannot be read.
-
-
+            SplurgeDsvFileDecodingError: If the file cannot be decoded with the configured encoding.
+            SplurgeDsvColumnMismatchError: If column validation fails.
+            SplurgeDsvParameterError: If the configured delimiter is invalid.
+            SplurgeDsvError: For other unexpected errors.
         """
         return DsvHelper.parse_file(
             file_path,
@@ -228,10 +303,14 @@ class Dsv:
             bookend_strip=self.config.bookend_strip,
             encoding=self.config.encoding,
             skip_header_rows=self.config.skip_header_rows,
+            skip_empty_lines=self.config.skip_empty_lines,
             skip_footer_rows=self.config.skip_footer_rows,
+            detect_columns=self.config.detect_columns,
+            raise_on_missing_columns=self.config.raise_on_missing_columns,
+            raise_on_extra_columns=self.config.raise_on_extra_columns,
         )

-    def parse_file_stream(self, file_path: PathLike[str] | str) -> Iterator[list[list[str]]]:
+    def parse_file_stream(self, file_path: PathLike[str] | Path | str) -> Iterator[list[list[str]]]:
         """Stream-parse a DSV file, yielding chunks of parsed rows.

         The method yields lists of parsed rows (each row itself is a list of
@@ -243,6 +322,15 @@ class Dsv:

         Yields:
             Lists of parsed rows, each list containing up to ``chunk_size`` rows.
+
+        Raises:
+            SplurgeDsvPathValidationError: If the file path is invalid.
+            SplurgeDsvFileNotFoundError: If the file cannot be found.
+            SplurgeDsvFilePermissionError: If the file cannot be read.
+            SplurgeDsvFileDecodingError: If the file cannot be decoded with the configured encoding.
+            SplurgeDsvColumnMismatchError: If column validation fails.
+            SplurgeDsvParameterError: If the configured delimiter is invalid.
+            SplurgeDsvError: For other unexpected errors.
         """
         return DsvHelper.parse_file_stream(
             file_path,
@@ -252,29 +340,11 @@ class Dsv:
             bookend_strip=self.config.bookend_strip,
             encoding=self.config.encoding,
             skip_header_rows=self.config.skip_header_rows,
+            skip_empty_lines=self.config.skip_empty_lines,
             skip_footer_rows=self.config.skip_footer_rows,
+            detect_columns=self.config.detect_columns,
+            raise_on_missing_columns=self.config.raise_on_missing_columns,
+            raise_on_extra_columns=self.config.raise_on_extra_columns,
             chunk_size=self.config.chunk_size,
+            max_detect_chunks=self.config.max_detect_chunks,
         )
-
-    def parse_stream(self, file_path: PathLike[str] | str) -> Iterator[list[list[str]]]:
-        """Stream-parse a DSV file, yielding chunks of parsed rows.
-
-        The method yields lists of parsed rows (each row itself is a list of
-        strings). Chunk sizing is controlled by the bound configuration's
-        ``chunk_size`` value.
-
-        Args:
-            file_path: Path to the file to parse.
-
-        Yields:
-            Lists of parsed rows, each list containing up to ``chunk_size`` rows.
-
-        Deprecated: Use `parse_file_stream` instead. This method will be removed in a future release.
-        """
-        # Emit a DeprecationWarning to signal removal in a future release
-        warnings.warn(
-            "Dsv.parse_stream() is deprecated and will be removed in a future release; use Dsv.parse_file_stream() instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        return Dsv.parse_file_stream(self, file_path)