snowpark-checkpoints-validators 0.2.0rc1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. snowflake/snowpark_checkpoints/__init__.py +44 -0
  2. snowflake/snowpark_checkpoints/__version__.py +16 -0
  3. snowflake/snowpark_checkpoints/checkpoint.py +580 -0
  4. snowflake/snowpark_checkpoints/errors.py +60 -0
  5. snowflake/snowpark_checkpoints/io_utils/__init__.py +26 -0
  6. snowflake/snowpark_checkpoints/io_utils/io_default_strategy.py +57 -0
  7. snowflake/snowpark_checkpoints/io_utils/io_env_strategy.py +133 -0
  8. snowflake/snowpark_checkpoints/io_utils/io_file_manager.py +76 -0
  9. snowflake/snowpark_checkpoints/job_context.py +128 -0
  10. snowflake/snowpark_checkpoints/singleton.py +23 -0
  11. snowflake/snowpark_checkpoints/snowpark_sampler.py +124 -0
  12. snowflake/snowpark_checkpoints/spark_migration.py +255 -0
  13. snowflake/snowpark_checkpoints/utils/__init__.py +14 -0
  14. snowflake/snowpark_checkpoints/utils/constants.py +134 -0
  15. snowflake/snowpark_checkpoints/utils/extra_config.py +132 -0
  16. snowflake/snowpark_checkpoints/utils/logging_utils.py +67 -0
  17. snowflake/snowpark_checkpoints/utils/pandera_check_manager.py +399 -0
  18. snowflake/snowpark_checkpoints/utils/supported_types.py +65 -0
  19. snowflake/snowpark_checkpoints/utils/telemetry.py +939 -0
  20. snowflake/snowpark_checkpoints/utils/utils_checks.py +398 -0
  21. snowflake/snowpark_checkpoints/validation_result_metadata.py +159 -0
  22. snowflake/snowpark_checkpoints/validation_results.py +49 -0
  23. snowpark_checkpoints_validators-0.3.0.dist-info/METADATA +325 -0
  24. snowpark_checkpoints_validators-0.3.0.dist-info/RECORD +26 -0
  25. snowpark_checkpoints_validators-0.2.0rc1.dist-info/METADATA +0 -514
  26. snowpark_checkpoints_validators-0.2.0rc1.dist-info/RECORD +0 -4
  27. {snowpark_checkpoints_validators-0.2.0rc1.dist-info → snowpark_checkpoints_validators-0.3.0.dist-info}/WHEEL +0 -0
  28. {snowpark_checkpoints_validators-0.2.0rc1.dist-info → snowpark_checkpoints_validators-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,398 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import inspect
+ import json
+ import logging
+ import os
+ import re
+
+ from datetime import datetime
+ from typing import Any, Optional
+
+ import numpy as np
+
+ from pandera import DataFrameSchema
+
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
+ from snowflake.snowpark_checkpoints.errors import SchemaValidationError
+ from snowflake.snowpark_checkpoints.io_utils.io_file_manager import get_io_file_manager
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from snowflake.snowpark_checkpoints.snowpark_sampler import (
+     SamplingAdapter,
+     SamplingStrategy,
+ )
+ from snowflake.snowpark_checkpoints.utils.constants import (
+     CHECKPOINT_JSON_OUTPUT_FILE_FORMAT_NAME,
+     CHECKPOINT_TABLE_NAME_FORMAT,
+     COLUMNS_KEY,
+     DATAFRAME_CUSTOM_DATA_KEY,
+     DATAFRAME_EXECUTION_MODE,
+     DATAFRAME_PANDERA_SCHEMA_KEY,
+     DEFAULT_KEY,
+     EXCEPT_HASH_AGG_QUERY,
+     FAIL_STATUS,
+     PASS_STATUS,
+     SNOWPARK_CHECKPOINTS_OUTPUT_DIRECTORY_NAME,
+ )
+ from snowflake.snowpark_checkpoints.utils.extra_config import (
+     get_checkpoint_file,
+ )
+ from snowflake.snowpark_checkpoints.utils.pandera_check_manager import (
+     PanderaCheckManager,
+ )
+ from snowflake.snowpark_checkpoints.utils.telemetry import STATUS_KEY, report_telemetry
+ from snowflake.snowpark_checkpoints.validation_result_metadata import (
+     ValidationResultsMetadata,
+ )
+ from snowflake.snowpark_checkpoints.validation_results import ValidationResult
+
+
+ LOGGER = logging.getLogger(__name__)
+
+
+ def _replace_special_characters(checkpoint_name: str) -> str:
+     """Replace special characters in the checkpoint name with underscores.
+
+     Args:
+         checkpoint_name (str): The checkpoint name to process.
+
+     Returns:
+         str: The checkpoint name with special characters replaced by underscores.
+
+     """
+     regex = r"^[a-zA-Z_\s-][a-zA-Z0-9$_\s-]*$"
+     if not bool(re.match(regex, checkpoint_name)):
+         raise ValueError(
+             f"Invalid checkpoint name: {checkpoint_name}",
+             "Checkpoint name must contain only alphanumeric characters, hyphens, underscores and dollar signs.",
+         )
+     return re.sub(r"[\s-]", "_", checkpoint_name)
+
+
+ def _process_sampling(
+     df: SnowparkDataFrame,
+     pandera_schema: DataFrameSchema,
+     job_context: Optional[SnowparkJobContext] = None,
+     sample_frac: Optional[float] = 1.0,
+     sample_number: Optional[int] = None,
+     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+ ):
+     """Process a Snowpark DataFrame by sampling it according to the specified parameters.
+
+     Adjusts the column casing of the provided Pandera schema to uppercase.
+
+     Args:
+         df (SnowparkDataFrame): The Snowpark DataFrame to be sampled.
+         pandera_schema (DataFrameSchema): The Pandera schema to validate the DataFrame.
+         job_context (SnowparkJobContext, optional): The job context for the sampling operation.
+             Defaults to None.
+         sample_frac (Optional[float], optional): The fraction of rows to sample.
+             Defaults to 1.0.
+         sample_number (Optional[int], optional): The number of rows to sample.
+             Defaults to None.
+         sampling_strategy (Optional[SamplingStrategy], optional): The strategy to use for sampling.
+             Defaults to SamplingStrategy.RANDOM_SAMPLE.
+
+     Returns:
+         Tuple[DataFrameSchema, pd.DataFrame]: A tuple containing the adjusted Pandera schema with uppercase column names
+             and the sampled pandas DataFrame.
+
+     """
+     sampler = SamplingAdapter(
+         job_context, sample_frac, sample_number, sampling_strategy
+     )
+     sampler.process_args([df])
+
+     # fix up the column casing
+     pandera_schema_upper = pandera_schema
+     new_columns: dict[Any, Any] = {}
+
+     for col in pandera_schema.columns:
+         new_columns[col.upper()] = pandera_schema.columns[col]
+
+     pandera_schema_upper = pandera_schema_upper.remove_columns(pandera_schema.columns)
+     pandera_schema_upper = pandera_schema_upper.add_columns(new_columns)
+
+     sample_df = sampler.get_sampled_pandas_args()[0]
+     sample_df.index = np.ones(sample_df.count().iloc[0])
+
+     return pandera_schema_upper, sample_df
+
+
+ def _generate_schema(
+     checkpoint_name: str, output_path: Optional[str] = None
+ ) -> DataFrameSchema:
+     """Generate a DataFrameSchema based on the checkpoint name provided.
+
+     This function reads a JSON file corresponding to the checkpoint name,
+     extracts schema information, and constructs a DataFrameSchema object.
+     It also adds custom checks for numeric and boolean types if specified
+     in the JSON file.
+
+     Args:
+         checkpoint_name (str): The name of the checkpoint used to locate
+             the JSON file containing schema information.
+         output_path (Optional[str]): The path to the output directory.
+
+     Returns:
+         DataFrameSchema: A schema object representing the structure and
+             constraints of the DataFrame.
+
+     """
+     LOGGER.info(
+         "Generating Pandera DataFrameSchema for checkpoint: '%s'", checkpoint_name
+     )
+     current_directory_path = (
+         output_path if output_path else get_io_file_manager().getcwd()
+     )
+
+     output_directory_path = os.path.join(
+         current_directory_path, SNOWPARK_CHECKPOINTS_OUTPUT_DIRECTORY_NAME
+     )
+
+     if not get_io_file_manager().folder_exists(output_directory_path):
+         raise ValueError(
+             """Output directory snowpark-checkpoints-output does not exist.
+             Please run the Snowpark checkpoint collector first."""
+         )
+
+     checkpoint_schema_file_path = os.path.join(
+         output_directory_path,
+         CHECKPOINT_JSON_OUTPUT_FILE_FORMAT_NAME.format(checkpoint_name),
+     )
+
+     if not get_io_file_manager().file_exists(checkpoint_schema_file_path):
+         raise ValueError(
+             f"Checkpoint {checkpoint_name} JSON file not found. Please run the Snowpark checkpoint collector first."
+         )
+
+     LOGGER.info("Reading schema from file: '%s'", checkpoint_schema_file_path)
+     schema_file = get_io_file_manager().read(checkpoint_schema_file_path)
+     checkpoint_schema_config = json.loads(schema_file)
+
+     if DATAFRAME_PANDERA_SCHEMA_KEY not in checkpoint_schema_config:
+         raise ValueError(
+             f"Pandera schema not found in the JSON file for checkpoint: {checkpoint_name}"
+         )
+
+     schema_dict = checkpoint_schema_config.get(DATAFRAME_PANDERA_SCHEMA_KEY)
+     schema_dict_str = json.dumps(schema_dict)
+     schema = DataFrameSchema.from_json(schema_dict_str)
+
+     if DATAFRAME_CUSTOM_DATA_KEY not in checkpoint_schema_config:
+         LOGGER.info(
+             "No custom data found in the JSON file for checkpoint: '%s'",
+             checkpoint_name,
+         )
+         return schema
+
+     custom_data = checkpoint_schema_config.get(DATAFRAME_CUSTOM_DATA_KEY)
+
+     if COLUMNS_KEY not in custom_data:
+         raise ValueError(
+             f"Columns not found in the JSON file for checkpoint: {checkpoint_name}"
+         )
+
+     pandera_check_manager = PanderaCheckManager(
+         checkpoint_name=checkpoint_name, schema=schema
+     )
+     schema = pandera_check_manager.proccess_checks(custom_data)
+
+     return schema
+
+
+ def _check_compare_data(
+     df: SnowparkDataFrame,
+     job_context: Optional[SnowparkJobContext],
+     checkpoint_name: str,
+     output_path: Optional[str] = None,
+ ):
+     """Compare the data in the provided Snowpark DataFrame with the data in a checkpoint table.
+
+     This function writes the provided DataFrame to a table and compares it with an existing checkpoint table
+     using a hash aggregation query. If there is a data mismatch, it marks the job context as failed and raises a
+     SchemaValidationError. If the data matches, it marks the job context as passed.
+
+     Args:
+         df (SnowparkDataFrame): The Snowpark DataFrame to compare.
+         job_context (Optional[SnowparkJobContext]): The job context containing the Snowpark session and job state.
+         checkpoint_name (str): The name of the checkpoint table to compare against.
+         output_path (Optional[str]): The path to the output directory.
+
+     Raises:
+         SchemaValidationError: If there is a data mismatch between the DataFrame and the checkpoint table.
+
+     """
+     _, err = _compare_data(df, job_context, checkpoint_name, output_path)
+     if err is not None:
+         raise err
+
+
+ @report_telemetry(
+     params_list=["df"], return_indexes=[(STATUS_KEY, 0)], multiple_return=True
+ )
+ def _compare_data(
+     df: SnowparkDataFrame,
+     job_context: Optional[SnowparkJobContext],
+     checkpoint_name: str,
+     output_path: Optional[str] = None,
+ ) -> tuple[bool, Optional[SchemaValidationError]]:
+     """Compare the data in the provided Snowpark DataFrame with the data in a checkpoint table.
+
+     This function writes the provided DataFrame to a table and compares it with an existing checkpoint table
+     using a hash aggregation query. If there is a data mismatch, it marks the job context as failed and returns a
+     SchemaValidationError. If the data matches, it marks the job context as passed.
+
+     Args:
+         df (SnowparkDataFrame): The Snowpark DataFrame to compare.
+         job_context (Optional[SnowparkJobContext]): The job context containing the Snowpark session and job state.
+         checkpoint_name (str): The name of the checkpoint table to compare against.
+         output_path (Optional[str]): The path to the output directory.
+
+     Returns:
+         Tuple[bool, Optional[SchemaValidationError]]: A tuple containing a boolean indicating if the data matches
+             and an optional SchemaValidationError if there is a data mismatch.
+
+     """
+     new_table_name = CHECKPOINT_TABLE_NAME_FORMAT.format(checkpoint_name)
+     LOGGER.info(
+         "Writing Snowpark DataFrame to table: '%s' for checkpoint: '%s'",
+         new_table_name,
+         checkpoint_name,
+     )
+     df.write.save_as_table(table_name=new_table_name, mode="overwrite")
+
+     LOGGER.info(
+         "Comparing DataFrame to checkpoint table: '%s' for checkpoint: '%s'",
+         new_table_name,
+         checkpoint_name,
+     )
+     expect_df = job_context.snowpark_session.sql(
+         EXCEPT_HASH_AGG_QUERY, [checkpoint_name, new_table_name]
+     )
+
+     if expect_df.count() != 0:
+         error_message = f"Data mismatch for checkpoint {checkpoint_name}"
+         job_context._mark_fail(
+             error_message,
+             checkpoint_name,
+             df,
+             DATAFRAME_EXECUTION_MODE,
+         )
+         _update_validation_result(
+             checkpoint_name,
+             FAIL_STATUS,
+             output_path,
+         )
+         return False, SchemaValidationError(
+             error_message,
+             job_context,
+             checkpoint_name,
+             df,
+         )
+     else:
+         _update_validation_result(checkpoint_name, PASS_STATUS, output_path)
+         job_context._mark_pass(checkpoint_name, DATAFRAME_EXECUTION_MODE)
+         return True, None
+
+
+ def _find_frame_in(stack: list[inspect.FrameInfo]) -> tuple:
+     """Find a specific frame in the provided stack trace.
+
+     This function searches through the provided stack trace to find a frame that matches
+     certain criteria. It looks for frames where the function name is "wrapper" or where
+     the code context matches specific regular expressions.
+
+     Args:
+         stack (list[inspect.FrameInfo]): A list of frame information objects representing
+             the current stack trace.
+
+     Returns:
+         tuple: A tuple containing the relative path of the file and the line number of the
+             matched frame. If no frame is matched, it returns a default key and -1.
+
+     """
+     regex = (
+         r"(?<!_check_dataframe_schema_file)"
+         r"(?<!_check_dataframe_schema)"
+         r"(validate_dataframe_checkpoint|check_dataframe_schema)"
+     )
+
+     first_frames = stack[:7]
+     first_frames.reverse()
+
+     for i, frame in enumerate(first_frames):
+         if frame.function == "wrapper" and i - 1 >= 0:
+             next_frame = first_frames[i - 1]
+             return _get_relative_path(next_frame.filename), next_frame.lineno
+
+         if len(frame.code_context) >= 0 and re.search(regex, frame.code_context[0]):
+             return _get_relative_path(frame.filename), frame.lineno
+     return DEFAULT_KEY, -1
+
+
+ def _get_relative_path(file_path: str) -> str:
+     """Get the relative path of a file.
+
+     Args:
+         file_path (str): The path to the file.
+
+     Returns:
+         str: The relative path of the file.
+
+     """
+     current_directory = get_io_file_manager().getcwd()
+     return os.path.relpath(file_path, current_directory)
+
+
+ def _update_validation_result(
+     checkpoint_name: str, validation_status: str, output_path: Optional[str] = None
+ ) -> None:
+     """Update the validation result file with the status of a given checkpoint.
+
+     Args:
+         checkpoint_name (str): The name of the checkpoint to update.
+         validation_status (str): The validation status to record for the checkpoint.
+         output_path (Optional[str]): The path to the output directory.
+
+     Returns:
+         None
+
+     """
+     _file = get_checkpoint_file(checkpoint_name)
+
+     stack = inspect.stack()
+
+     _file_from_stack, _line_of_code = _find_frame_in(stack)
+
+     pipeline_result_metadata = ValidationResultsMetadata(output_path)
+
+     pipeline_result_metadata.clean()
+
+     pipeline_result_metadata.add_validation_result(
+         ValidationResult(
+             timestamp=datetime.now().isoformat(),
+             file=_file if _file else _file_from_stack,
+             line_of_code=_line_of_code,
+             checkpoint_name=checkpoint_name,
+             result=validation_status,
+         )
+     )
+
+     pipeline_result_metadata.save()
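
A note on the normalization rule in this hunk: `_replace_special_characters` accepts names built from letters, digits, `$`, `_`, spaces, and hyphens (the first character may not be a digit or `$`), then rewrites spaces and hyphens to underscores so the checkpoint name is safe to reuse in file and table names. A minimal standalone sketch of that rule — the regexes are copied from the diff above, while the function name here is illustrative only:

```python
import re

# Regex copied from _replace_special_characters above: letters, digits, "$",
# "_", whitespace, and hyphens are allowed, but the name must not start with
# a digit or "$".
VALID_NAME = r"^[a-zA-Z_\s-][a-zA-Z0-9$_\s-]*$"


def normalize_checkpoint_name(checkpoint_name: str) -> str:
    """Illustrative re-implementation of the normalization in this diff."""
    if not re.match(VALID_NAME, checkpoint_name):
        raise ValueError(f"Invalid checkpoint name: {checkpoint_name}")
    # Spaces and hyphens become underscores, as in the original.
    return re.sub(r"[\s-]", "_", checkpoint_name)


print(normalize_checkpoint_name("my checkpoint-1"))  # -> my_checkpoint_1
```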
@@ -0,0 +1,159 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import logging
+ import os
+
+ from typing import Optional
+
+ from snowflake.snowpark_checkpoints.io_utils.io_file_manager import get_io_file_manager
+ from snowflake.snowpark_checkpoints.singleton import Singleton
+ from snowflake.snowpark_checkpoints.utils.constants import (
+     SNOWPARK_CHECKPOINTS_OUTPUT_DIRECTORY_NAME,
+     VALIDATION_RESULTS_JSON_FILE_NAME,
+ )
+ from snowflake.snowpark_checkpoints.validation_results import (
+     ValidationResult,
+     ValidationResults,
+ )
+
+
+ LOGGER = logging.getLogger(__name__)
+
+
+ class ValidationResultsMetadata(metaclass=Singleton):
+
+     """ValidationResultsMetadata is a class that manages the loading, storing, and updating of validation results.
+
+     Attributes:
+         validation_results (list): A list to store validation results.
+         validation_results_file (str): The path to the validation results file.
+
+     Methods:
+         __init__(path: Optional[str] = None):
+             Initializes the ValidationResultsMetadata instance and loads validation results from a JSON file
+             if a path is provided.
+         _load(path: Optional[str] = None):
+             Loads validation results from a JSON file. If no path is provided, the current working directory is used.
+         clean():
+             Empties the in-memory validation results list when no results file exists on disk.
+         add_validation_result(validation_result: ValidationResult):
+             Adds a validation result to the pipeline result list.
+         save():
+             Saves the validation results to a JSON file in the current working directory.
+
+     """
+
+     def __init__(self, path: Optional[str] = None):
+         self._load(path)
+
+     def _load(self, path: Optional[str] = None):
+         """Load validation results from a JSON file.
+
+         Args:
+             path (Optional[str]): The directory path where the validation results file is located.
+                 If not provided, the current working directory is used.
+
+         Raises:
+             Exception: If there is an error reading the validation results file.
+
+         """
+         self.validation_results_directory = (
+             path if path else get_io_file_manager().getcwd()
+         )
+         self.validation_results_directory = os.path.join(
+             self.validation_results_directory,
+             SNOWPARK_CHECKPOINTS_OUTPUT_DIRECTORY_NAME,
+         )
+
+         LOGGER.debug(
+             "Setting validation results directory to: '%s'",
+             self.validation_results_directory,
+         )
+
+         self.validation_results_file = os.path.join(
+             self.validation_results_directory,
+             VALIDATION_RESULTS_JSON_FILE_NAME,
+         )
+
+         LOGGER.debug(
+             "Setting validation results file to: '%s'", self.validation_results_file
+         )
+
+         self.validation_results = ValidationResults(results=[])
+
+         if get_io_file_manager().file_exists(self.validation_results_file):
+             LOGGER.info(
+                 "Loading validation results from: '%s'", self.validation_results_file
+             )
+             try:
+                 validation_result_json = get_io_file_manager().read(
+                     self.validation_results_file
+                 )
+                 self.validation_results = ValidationResults.model_validate_json(
+                     validation_result_json
+                 )
+             except Exception as e:
+                 raise Exception(
+                     f"Error reading validation results file: {self.validation_results_file} \n {e}"
+                 ) from None
+         else:
+             LOGGER.info(
+                 "Validation results file not found: '%s'",
+                 self.validation_results_file,
+             )
+
+     def clean(self):
+         """Clean the validation results list.
+
+         This method empties the in-memory validation results list, but only when
+         no validation results file exists on disk yet.
+
+         """
+         if not get_io_file_manager().file_exists(self.validation_results_file):
+             LOGGER.info("Cleaning validation results...")
+             self.validation_results.results = []
+
+     def add_validation_result(self, validation_result: ValidationResult):
+         """Add a validation result to the pipeline result list.
+
+         Args:
+             validation_result (ValidationResult): The validation result to be added.
+
+         """
+         self.validation_results.results.append(validation_result)
+
+     def save(self):
+         """Save the validation results to a file.
+
+         This method checks if the directory specified by validation results directory
+         exists, and if not, it creates the directory. Then, it writes the validation results
+         to a file specified by validation results file in JSON format.
+
+         Raises:
+             OSError: If the directory cannot be created or the file cannot be written.
+
+         """
+         if not get_io_file_manager().folder_exists(self.validation_results_directory):
+             LOGGER.debug(
+                 "Validation results directory '%s' does not exist. Creating it...",
+                 self.validation_results_directory,
+             )
+             get_io_file_manager().mkdir(self.validation_results_directory)
+
+         get_io_file_manager().write(
+             self.validation_results_file, self.validation_results.model_dump_json()
+         )
+         LOGGER.info(
+             "Validation results successfully saved to: '%s'",
+             self.validation_results_file,
+         )
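
Worth noting: `_update_validation_result` (previous hunk) constructs `ValidationResultsMetadata(output_path)` on every call, and it is the `Singleton` metaclass imported above — from snowflake/snowpark_checkpoints/singleton.py (+23 lines), whose body is not shown in this diff — that makes every such construction return one shared instance, so results accumulate across checkpoints in a run. A conventional sketch of such a metaclass, offered as an assumption rather than the actual file contents:

```python
# Assumed sketch: the real singleton.py is not part of this diff view; this is
# the standard metaclass-based singleton pattern its usage here implies.
class Singleton(type):
    _instances: dict = {}

    def __call__(cls, *args, **kwargs):
        # The first construction creates the instance; every later call
        # returns it unchanged, regardless of the arguments passed.
        if cls not in cls._instances:
            cls._instances[cls] = super().__call__(*args, **kwargs)
        return cls._instances[cls]
```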
@@ -0,0 +1,49 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ from pydantic import BaseModel
+
+
+ class ValidationResult(BaseModel):
+
+     """ValidationResult represents the result of a validation checkpoint.
+
+     Attributes:
+         result (str): The result of the validation.
+         timestamp (str): The ISO-format timestamp of when the validation was performed.
+         file (str): The file where the validation checkpoint is located.
+         line_of_code (int): The line number in the file where the validation checkpoint is located.
+         checkpoint_name (str): The name of the validation checkpoint.
+
+     """
+
+     result: str
+     timestamp: str
+     file: str
+     line_of_code: int
+     checkpoint_name: str
+
+
+ class ValidationResults(BaseModel):
+
+     """ValidationResults is a model that holds a list of validation results.
+
+     Attributes:
+         results (list[ValidationResult]): A list of validation results.
+
+     """
+
+     results: list[ValidationResult]
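
These two pydantic models are the payload that `ValidationResultsMetadata.save()` and `_load()` round-trip through pydantic v2's `model_dump_json()` and `model_validate_json()`, as seen in the previous hunk. A quick round-trip using the models exactly as defined above; the sample field values are invented for illustration:

```python
from datetime import datetime

from snowflake.snowpark_checkpoints.validation_results import (
    ValidationResult,
    ValidationResults,
)

results = ValidationResults(
    results=[
        ValidationResult(
            result="PASS",                         # cf. PASS_STATUS / FAIL_STATUS in constants.py
            timestamp=datetime.now().isoformat(),  # stored as an ISO-8601 string
            file="demo_pipeline.py",               # hypothetical source file
            line_of_code=42,
            checkpoint_name="my_checkpoint_1",
        )
    ]
)

payload = results.model_dump_json()                        # what save() writes to disk
restored = ValidationResults.model_validate_json(payload)  # what _load() reads back
assert restored == results
```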