PyPI - pytest-pyspark-utils - Versions diffs - 1.0.2__py3-none-any.whl - Mend

pytest-pyspark-utils 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

pytest_pyspark_utils/__init__.py +4 -0
pytest_pyspark_utils/delta_caching.py +230 -0
pytest_pyspark_utils/plugin.py +320 -0
pytest_pyspark_utils-1.0.2.dist-info/METADATA +702 -0
pytest_pyspark_utils-1.0.2.dist-info/RECORD +11 -0
pytest_pyspark_utils-1.0.2.dist-info/WHEEL +5 -0
pytest_pyspark_utils-1.0.2.dist-info/entry_points.txt +2 -0
pytest_pyspark_utils-1.0.2.dist-info/licenses/LICENSE +675 -0
pytest_pyspark_utils-1.0.2.dist-info/top_level.txt +2 -0
sample_pyspark_app/__init__.py +0 -0
sample_pyspark_app/stats.py +14 -0

pytest_pyspark_utils/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from pytest_pyspark_utils.plugin import TableConfig, DeltaTablesResult
+from pytest_pyspark_utils.delta_caching import DeltaCaching
+# Names re-exported as the public API of the package.
+__all__ = [
+    "TableConfig",
+    "DeltaTablesResult",
+    "DeltaCaching",
+]

pytest_pyspark_utils/delta_caching.py ADDED Viewed

@@ -0,0 +1,230 @@
+"""
+Delta Lake caching layer for pytest-pyspark-utils.
+Converts CSV/JSONL source files to Delta format and caches them on disk.
+Cache validity is determined by an MD5 hash of the source file content
+and schema. A cache hit skips re-conversion; a miss cleans the old cache
+and re-generates it.
+"""
+import hashlib
+import logging
+import shutil
+from pathlib import Path
+from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.types import StructType
+logger = logging.getLogger(__name__)
+class DeltaCaching:
+    """Manage the on-disk Delta Lake cache for a single CSV or JSONL source file.
+    On first use (or when the source file or schema changes) the data is
+    converted to Delta format and stored under ``cache_base_dir / dataset_name``,
+    where ``dataset_name`` is the stem of the source file name.  Subsequent
+    calls with the same source file and schema reuse the cached Delta data.
+    Args:
+        source_path: Absolute or relative path to the CSV or JSONL source file.
+        cache_base_dir: Directory under which per-dataset Delta caches are stored.
+        spark: Active SparkSession.
+        schema: Optional Spark schema.  If omitted, schema is inferred from the file.
+        partition_by: Column names used for Hive partitioning, or for the
+            ``CLUSTER BY`` clause when *liquid_clustering* is enabled.
+        liquid_clustering: Write using Delta liquid clustering instead of Hive
+            partitioning.  Requires *schema* to be provided.
+        debug: Emit extra debug log messages.
+    """
+    # Name of the file inside the cache directory that stores the source hash.
+    _HASH_FILE_NAME = "_source_data_hash"
+    def __init__(
+        self,
+        source_path: str,
+        cache_base_dir: Path,
+        spark: SparkSession,
+        schema: StructType | None = None,
+        partition_by: list[str] | None = None,
+        liquid_clustering: bool = False,
+        debug: bool = False,
+    ) -> None:
+        self.source_path = Path(source_path)
+        self.spark = spark
+        self.schema = schema
+        self.partition_by = partition_by
+        self.liquid_clustering = liquid_clustering
+        self.debug = debug
+        # The dataset name doubles as cache sub-directory and (for liquid clustering) table name.
+        self.dataset = self.source_path.stem
+        self.cached_path = cache_base_dir / self.dataset
+        if self.debug:
+            logger.debug("cache_base_dir=%s", cache_base_dir)
+            logger.debug("dataset=%s", self.dataset)
+            logger.debug("cached_path=%s", self.cached_path)
+    @property
+    def hash_source(self) -> str:
+        """MD5 hex digest of the source file content combined with the schema JSON.
+        The file is decoded as UTF-8 so that the digest does not depend on the
+        locale of the machine computing it.
+        Returns ``"-2"`` when the source file does not exist so that a missing
+        file never falsely matches a populated cache (which uses ``"-1"`` as its
+        sentinel for a missing hash file).
+        """
+        if not self.source_path.exists():
+            return "-2"
+        # Without an explicit encoding read_text() falls back to the platform
+        # locale, which makes the same file hash differently across machines.
+        # Text mode is kept on purpose: it normalises line endings before hashing.
+        content = self.source_path.read_text(encoding="utf-8").encode("UTF-16")
+        schema_content = self.schema.json().encode("UTF-16") if self.schema else b""
+        return hashlib.md5(content + schema_content).hexdigest()
+    @property
+    def hash_cache(self) -> str:
+        """MD5 hex digest stored alongside the cached Delta table.
+        Returns ``"-1"`` when no hash file exists (cache is absent or corrupted).
+        """
+        hash_file = self.cached_path / self._HASH_FILE_NAME
+        if not hash_file.exists():
+            return "-1"
+        return hash_file.read_text()
+    def probe_cache(self) -> bool:
+        """Return ``True`` if the cached Delta table is up-to-date with the source."""
+        return self.hash_source == self.hash_cache
+    def cache(self) -> DataFrame:
+        """Ensure the Delta cache is valid and return a DataFrame over the source.
+        If the source hash matches the stored hash the existing cache is reused.
+        Otherwise the cache directory is cleaned, the source is converted to Delta,
+        the hash file is written and Hadoop CRC files are removed.
+        Returns:
+            DataFrame read from the source file (not from the Delta cache).
+        """
+        if self.probe_cache():
+            if self.debug:
+                logger.debug("%s: skipping, cached data is up to date", self.dataset)
+            return self.read()
+        if self.debug:
+            logger.debug("%s: refreshing the cache", self.dataset)
+        self.clean_cache()
+        df = self.write_delta()
+        # The hash is written only after a successful Delta write, so a failed
+        # conversion leaves the cache marked as invalid.
+        self.write_cache_hash()
+        self.remove_crc_files()
+        return df
+    def remove_crc_files(self) -> None:
+        """Delete all Hadoop CRC sidecar files from the cache directory."""
+        for crc_file in self.cached_path.glob("**/*.crc"):
+            crc_file.unlink()
+    def write_cache_hash(self) -> None:
+        """Write the current source hash into the cache directory."""
+        (self.cached_path / self._HASH_FILE_NAME).write_text(self.hash_source)
+    def read(self) -> DataFrame:
+        """Read the source file into a Spark DataFrame.
+        Dispatches to :meth:`read_csv` or :meth:`read_jsonl` based on the file
+        extension.
+        Raises:
+            ValueError: For file extensions other than ``.csv`` or ``.jsonl``.
+        """
+        suffix = self.source_path.suffix
+        if suffix == ".csv":
+            return self.read_csv()
+        if suffix == ".jsonl":
+            return self.read_jsonl()
+        raise ValueError(f"Unsupported file format: {suffix}")
+    def read_jsonl(self) -> DataFrame:
+        """Read a JSONL file into a Spark DataFrame.
+        Uses the provided schema when available; otherwise infers the schema.
+        """
+        jsonl_path = self.source_path.as_posix()
+        if self.schema:
+            return self.spark.read.schema(self.schema).json(jsonl_path)
+        return self.spark.read.option("inferSchema", "true").json(jsonl_path)
+    def read_csv(self) -> DataFrame:
+        """Read a CSV file into a Spark DataFrame.
+        Expects a header row.  Uses the provided schema when available;
+        otherwise infers the schema.  The literal string ``null`` is read as a
+        null value (``nullValue`` option).
+        """
+        csv_path = self.source_path.as_posix()
+        reader = self.spark.read.options(header=True).option("nullValue", "null")
+        if self.schema:
+            return reader.schema(self.schema).csv(csv_path)
+        return reader.option("inferSchema", True).csv(csv_path)
+    def write_delta(self) -> DataFrame:
+        """Convert the source file to Delta and write it to the cache directory.
+        Three write modes are supported (in priority order):
+        1. **Liquid clustering** — when ``liquid_clustering=True``, creates the table
+           via DDL and saves with ``saveAsTable``.  Requires *schema*.  Any table
+           already registered under the dataset name is dropped first.
+        2. **Partitioned** — when ``partition_by`` is set, writes a Hive-partitioned
+           Delta table.
+        3. **Plain** — unpartitioned Delta table saved directly to ``cached_path``.
+        Returns:
+            The source DataFrame (same object returned by :meth:`read`).
+        Raises:
+            ValueError: When ``liquid_clustering=True`` but no schema is provided.
+        """
+        if self.liquid_clustering and self.schema is None:
+            raise ValueError("liquid_clustering=True requires an explicit schema to be provided")
+        reader = self.read()
+        delta_location = self.cached_path.as_posix()
+        df_writer = reader.repartition(1).write.format("delta").mode("overwrite")
+        if self.liquid_clustering:
+            ddl = self.construct_table_ddl()
+            # clean_cache() only removes files; a table of the same name that is
+            # still registered in the catalog would make the plain CREATE TABLE
+            # below fail, so drop the stale registration first.
+            self.spark.sql(f"DROP TABLE IF EXISTS {self.dataset}")
+            self.spark.sql(ddl)
+            df_writer.saveAsTable(self.dataset)
+        elif self.partition_by:
+            df_writer.partitionBy(self.partition_by).save(delta_location)
+        else:
+            df_writer.save(delta_location)
+        return reader
+    def clean_cache(self) -> None:
+        """Remove the cached Delta directory if it exists."""
+        if self.cached_path.exists():
+            shutil.rmtree(self.cached_path)
+    def construct_table_ddl(self) -> str:
+        """Build a ``CREATE TABLE … USING DELTA`` DDL statement for liquid clustering.
+        Uses ``self.schema`` to enumerate columns and ``self.partition_by`` for the
+        ``CLUSTER BY`` clause (omitted when ``partition_by`` is empty).
+        Returns:
+            A DDL string ready to pass to ``spark.sql()``.
+        Raises:
+            ValueError: When no schema was provided.
+        """
+        if self.schema is None:
+            # Same contract as write_delta(); avoids an opaque AttributeError below.
+            raise ValueError("liquid_clustering=True requires an explicit schema to be provided")
+        delta_location = self.cached_path.as_posix()
+        columns = [f"{field.name} {field.dataType.simpleString()}" for field in self.schema.fields]
+        columns_str = ",\n ".join(columns)
+        if self.partition_by:
+            cluster_str = f"""CLUSTER BY ({", ".join(self.partition_by)})"""
+        else:
+            cluster_str = ""
+        return f"""
+            CREATE TABLE {self.dataset}({columns_str})
+            USING DELTA LOCATION '{delta_location}'
+            {cluster_str}
+        """

pytest_pyspark_utils/plugin.py ADDED Viewed

@@ -0,0 +1,320 @@
+"""pytest plugin providing PySpark fixtures with Delta Lake table caching.
+Fixtures:
+    spark: Session-scoped SparkSession with optional Delta Lake support.
+    delta_tables: Function-scoped ``DeltaTablesResult`` with per-test isolation.
+    set_utc_timezone: Sets TZ=UTC for the test session.
+    drop_hive_objects: Drops all Hive tables (utility, not auto-used).
+Configuration (pytest.ini / pyproject.toml / CLI):
+    delta_jar: Maven coordinates for Delta Lake JAR.
+    spark_app_name: Spark application name (default: pytest-pyspark).
+    delta_cache_dir: Cache directory name (default: _delta_cache).
+Usage:
+    Define a module-scoped ``delta_tables_config`` fixture returning
+    ``dict[str, TableConfig]``, then use ``delta_tables`` in your tests.
+"""
+import logging
+import os
+import random
+import shutil
+import string
+import sys
+import time
+from dataclasses import dataclass, field
+from datetime import datetime
+from pathlib import Path
+import pytest
+from pytest_pyspark_utils.delta_caching import DeltaCaching
+from pyspark.sql import DataFrame
+from pyspark.sql.types import StructType
+logger = logging.getLogger(__name__)
+@dataclass
+class TableConfig:
+    """Configuration for a single test table loaded from CSV or JSONL.
+    Args:
+        source: Subdirectory under the test folder where the source file lives
+            (``"input"`` or ``"expected"``). Defaults to ``"input"``.
+        schema: Explicit Spark schema. If ``None``, schema is inferred from the file.
+        table_name: SQL table name to register. Defaults to the config key
+            (the source filename stem).
+        partition_by: Column names to partition the Delta table by.  When
+            ``liquid_clustering`` is enabled, ``DeltaCaching`` uses these columns
+            for the ``CLUSTER BY`` clause instead of Hive partitioning.
+        liquid_clustering: Enable Delta Lake liquid clustering (requires ``schema``).
+            Takes priority over Hive partitioning when ``partition_by`` is also set.
+    """
+    source: str = "input"
+    schema: StructType | None = None
+    table_name: str | None = None
+    partition_by: list[str] | None = None
+    liquid_clustering: bool = False
+@dataclass
+class DeltaTablesResult:
+    """Result returned by the :func:`delta_tables` fixture.
+    Attributes:
+        tables: Mapping from config key (filename stem) to the corresponding
+            DataFrame.  The DataFrame is the one returned by
+            ``DeltaCaching.cache()``, i.e. it reads the source file rather than
+            the isolated Delta copy.
+        path: POSIX path of the directory holding the isolated Delta table
+            copies for this test.
+    """
+    tables: dict[str, DataFrame]
+    path: str
+@dataclass
+class _CachedTables:
+    """Internal module-level cache returned by ``_prepare_tables_for_test``.
+    Attributes:
+        entries: Mapping from config key (filename stem) to a
+            ``(table_name, DataFrame)`` pair.
+        path: POSIX path of the module-level directory holding the Delta copies.
+    """
+    # A default_factory is required: dataclasses reject a shared mutable default.
+    entries: dict[str, tuple[str, DataFrame]] = field(default_factory=dict)
+    path: str = ""
+def determine_file_path(base_path: str, filename: str) -> str:
+    """Find the unique CSV or JSONL file whose stem is *filename* in *base_path*.
+    Only ``<filename>.csv`` and ``<filename>.jsonl`` are considered; files such
+    as ``<filename>.v2.csv`` belong to a different stem and are ignored.
+    Args:
+        base_path: Directory to search in.
+        filename: Stem name (no extension) to match.
+    Returns:
+        ``base_path`` joined with the matched file name (absolute only when
+        *base_path* is absolute).
+    Raises:
+        FileNotFoundError: If no matching file exists.
+        FileExistsError: If more than one matching file exists.
+    """
+    # Probe the two supported names directly.  Globbing "<filename>.*" also
+    # matched multi-dot names (e.g. "orders.v2.csv" for "orders") and treated
+    # glob metacharacters in *filename* as wildcards.
+    candidates = (Path(base_path) / f"{filename}{suffix}" for suffix in (".jsonl", ".csv"))
+    file_matches = [candidate for candidate in candidates if candidate.exists()]
+    if not file_matches:
+        raise FileNotFoundError(f"No file found for {filename} in {base_path}")
+    if len(file_matches) > 1:
+        raise FileExistsError(
+            f"Multiple files found for {filename} in {base_path}: {[file.name for file in file_matches]}. "
+            f"Please ensure there is only one file for {filename} in the directory."
+        )
+    return f"{base_path}/{file_matches[0].name}"
+def pytest_addoption(parser):
+    """Declare the plugin's ``--delta-jar`` CLI flag and its three INI settings."""
+    cli_group = parser.getgroup("pyspark-delta-caching")
+    cli_group.addoption(
+        "--delta-jar",
+        action="store",
+        dest="delta_jar",
+        default=None,
+        help="Delta Lake Maven coordinates for spark.jars.packages, e.g. io.delta:delta-spark_2.13:4.0.1",
+    )
+    # (name, help text, default) for every INI option, registered in this order.
+    ini_options = (
+        ("delta_jar", "Delta Lake Maven coordinates for spark.jars.packages", None),
+        ("spark_app_name", "Spark application name used in tests", "pytest-pyspark"),
+        (
+            "delta_cache_dir",
+            "Directory for persistent delta table cache (relative to rootdir)",
+            "_delta_cache",
+        ),
+    )
+    for ini_name, ini_help, ini_default in ini_options:
+        parser.addini(ini_name, ini_help, default=ini_default)
+# --- Internal fixtures ---
+@pytest.fixture(scope="session")
+def _pyspark_tmp_dir(tmp_path_factory):
+    """Yield a session-wide temp directory made with ``mktemp("delta")``; delete it on teardown."""
+    session_dir = tmp_path_factory.mktemp("delta")
+    yield session_dir
+    # Best-effort cleanup: errors raised while deleting are ignored.
+    shutil.rmtree(session_dir, ignore_errors=True)
+@pytest.fixture(scope="module")
+def _pyspark_module_delta_path(_pyspark_tmp_dir, request):
+    """Return the POSIX path of this module's directory inside the session temp dir.
+    The directory is named after the stem of the requesting node's name.
+    """
+    module_stem = Path(request.node.name).stem
+    module_dir = _pyspark_tmp_dir / module_stem
+    return module_dir.as_posix()
+@pytest.fixture(scope="module")
+def _prepare_tables_for_test(spark, _pyspark_module_delta_path, request):
+    """Return a factory that caches, copies and registers a module's Delta tables.
+    The returned callable takes the ``dict[str, TableConfig]`` of a test module.
+    For every entry it refreshes the persistent Delta cache next to the test
+    file, copies the cached table into the module's temp directory and
+    registers that copy as a SQL table.
+    """
+    def _prepare_tables_for_test(files: dict) -> _CachedTables:
+        start = datetime.now()
+        test_dir = request.path.parent
+        # The persistent cache lives beside the test module, in the configured directory name.
+        cache_base_dir = test_dir / request.config.getini("delta_cache_dir")
+        temp_delta = Path(_pyspark_module_delta_path)
+        entries: dict[str, tuple[str, DataFrame]] = {}
+        for filename, config in files.items():
+            table_name = config.table_name or filename
+            location = (test_dir / config.source).as_posix()
+            file_path = determine_file_path(base_path=location, filename=filename)
+            delta_caching = DeltaCaching(
+                source_path=file_path,
+                cache_base_dir=cache_base_dir,
+                spark=spark,
+                schema=config.schema,
+                partition_by=config.partition_by,
+                liquid_clustering=config.liquid_clustering,
+            )
+            _df = delta_caching.cache()
+            delta_target_path = temp_delta / table_name
+            # The module directory is derived from the module file stem only, so a
+            # same-named module in another folder may already have written this
+            # table.  copytree() would raise FileExistsError, and merging two Delta
+            # logs would corrupt the table, so start from an empty target.
+            shutil.rmtree(delta_target_path, ignore_errors=True)
+            shutil.copytree(delta_caching.cached_path, delta_target_path)
+            spark.sql(f"DROP TABLE IF EXISTS {table_name}")
+            spark.sql(f"CREATE TABLE {table_name} USING DELTA LOCATION '{delta_target_path.as_posix()}'")
+            entries[filename] = (table_name, _df)
+            print(f"successfully created delta table for {filename}")
+        duration = round((datetime.now() - start).total_seconds(), 1)
+        print(f"done with creating tables ({duration}s).")
+        return _CachedTables(entries=entries, path=_pyspark_module_delta_path)
+    return _prepare_tables_for_test
+@pytest.fixture(scope="module")
+def _delta_tables_cached(_prepare_tables_for_test, delta_tables_config) -> _CachedTables:
+    """Build the module's tables once from the user-supplied ``delta_tables_config``."""
+    build_tables = _prepare_tables_for_test
+    return build_tables(delta_tables_config)
+# --- Public fixtures ---
+@pytest.fixture(scope="session")
+def set_utc_timezone():
+    """Set the process timezone to UTC for the duration of the test session.
+    ``TZ`` is set to ``UTC`` before the first test and put back to its previous
+    value (or removed, if it was unset) when the session ends.
+    """
+    previous_tz = os.environ.get("TZ")
+    os.environ["TZ"] = "UTC"
+    # time.tzset() exists only on Unix; on other platforms setting the
+    # environment variable is all that can be done, so skip the call there.
+    apply_tz = getattr(time, "tzset", None)
+    if apply_tz is not None:
+        apply_tz()
+    try:
+        yield
+    finally:
+        if previous_tz is None:
+            os.environ.pop("TZ", None)
+        else:
+            os.environ["TZ"] = previous_tz
+        if apply_tz is not None:
+            apply_tz()
+@pytest.fixture(scope="session")
+def spark(set_utc_timezone, request):
+    """Provide one SparkSession, tuned for local testing, to the whole test session.
+    Delta Lake support is switched on only when ``delta_jar`` is supplied through
+    the ``--delta-jar`` CLI flag or the ``delta_jar`` INI option.  A database
+    with a random ``pytest_xxxx`` name is created and made current before the
+    session is handed to the tests.
+    Yields:
+        SparkSession ready for use in tests; it is stopped on teardown.
+    """
+    from pyspark.sql import SparkSession
+    pytest_config = request.config
+    # The CLI flag wins over the INI option; an empty value means "no Delta".
+    delta_jar = pytest_config.getoption("--delta-jar") or pytest_config.getini("delta_jar") or None
+    app_name = pytest_config.getini("spark_app_name")
+    alphabet = string.ascii_lowercase + string.digits
+    suffix = "".join(random.choice(alphabet) for _ in range(4))
+    database_name = "pytest_" + suffix
+    # Point both PySpark interpreter variables at the interpreter running pytest.
+    for env_var in ("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"):
+        os.environ[env_var] = sys.executable
+    settings = {
+        "spark.sql.shuffle.partitions": "1",
+        "spark.databricks.delta.snapshotPartitions": "2",
+        "spark.ui.showConsoleProgress": "false",
+        "spark.ui.enabled": "false",
+        "spark.ui.dagGraph.retainedRootRDDs": "1",
+        "spark.ui.retainedJobs": "1",
+        "spark.ui.retainedStages": "1",
+        "spark.ui.retainedTasks": "1",
+        "spark.sql.ui.retainedExecutions": "1",
+        "spark.worker.ui.retainedExecutors": "1",
+        "spark.worker.ui.retainedDrivers": "1",
+        "spark.driver.memory": "4g",
+        "spark.sql.autoBroadcastJoinThreshold": "-1",
+        "spark.driver.extraJavaOptions": "-Duser.timezone=UTC -XX:+UseCompressedOops",
+        "spark.executor.extraJavaOptions": "-Duser.timezone=UTC",
+        "spark.sql.session.timeZone": "UTC",
+    }
+    if delta_jar:
+        settings.update(
+            {
+                "spark.jars.packages": delta_jar,
+                "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
+                "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
+            }
+        )
+    builder = SparkSession.builder.master("local[*]").appName(app_name)
+    # dicts keep insertion order, so options are applied in the order listed above.
+    for option_name, option_value in settings.items():
+        builder = builder.config(option_name, option_value)
+    spark_session = builder.getOrCreate()
+    spark_session.sparkContext.setLogLevel("ERROR")
+    for statement in (f"create database if not exists {database_name}", f"use {database_name}"):
+        spark_session.sql(statement)
+    try:
+        yield spark_session
+    finally:
+        spark_session.stop()
+@pytest.fixture(scope="function")
+def delta_tables(spark, _delta_tables_cached: _CachedTables, _pyspark_tmp_dir, tmp_path) -> DeltaTablesResult:
+    """Give each test its own copies of the module's Delta tables.
+    The module-level table directory is copied to a per-test directory, every
+    table listed by ``SHOW TABLES`` is dropped, and the configured tables are
+    registered again on top of the fresh copies, so changes made by one test
+    are not seen by the next.
+    Args:
+        spark: The session-scoped SparkSession.
+        _delta_tables_cached: Module-level cached table entries.
+        _pyspark_tmp_dir: Session-scoped temp directory.
+        tmp_path: pytest-provided per-test temp directory; only its name is
+            used, as a unique sub-directory name.
+    Returns:
+        A :class:`DeltaTablesResult` with ``tables`` (filename → DataFrame) and
+        ``path`` (directory holding the isolated Delta copies).
+    """
+    cached_entries = _delta_tables_cached.entries
+    isolated_root = Path(str(_pyspark_tmp_dir)) / "isolated_tables" / tmp_path.name
+    shutil.copytree(Path(_delta_tables_cached.path), isolated_root, dirs_exist_ok=True)
+    # Clear whatever the previous test (or module setup) left registered.
+    for row in spark.sql("SHOW TABLES").collect():
+        qualified_name = f"{row.namespace}.{row.tableName}" if row.namespace else row.tableName
+        spark.sql(f"DROP TABLE IF EXISTS {qualified_name}")
+    for table_name, _ in cached_entries.values():
+        table_location = (isolated_root / table_name).as_posix()
+        spark.sql(f"CREATE TABLE {table_name} USING DELTA LOCATION '{table_location}'")
+    frames_by_key: dict[str, DataFrame] = {key: frame for key, (_, frame) in cached_entries.items()}
+    return DeltaTablesResult(tables=frames_by_key, path=isolated_root.as_posix())
+@pytest.fixture(scope="function")
+def drop_hive_objects(spark):
+    """Drop every table that ``SHOW TABLES`` lists for the current database.
+    Meant to be requested explicitly by tests that create tables of their own
+    instead of going through the ``delta_tables`` fixture.
+    """
+    for row in spark.sql("SHOW TABLES").collect():
+        # Rows without a namespace are addressed by bare table name.
+        if row.namespace:
+            qualified_name = f"{row.namespace}.{row.tableName}"
+        else:
+            qualified_name = row.tableName
+        spark.sql(f"DROP TABLE IF EXISTS {qualified_name}")