snowpark-checkpoints-validators 0.3.2__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/PKG-INFO +1 -1
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/pyproject.toml +1 -1
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/__version__.py +1 -1
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/checkpoint.py +24 -4
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/snowpark_sampler.py +104 -3
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/utils/constants.py +14 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/utils/utils_checks.py +87 -9
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/test_df_check_custom_check_telemetry.json +1 -1
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/test_df_check_from_file_telemetry.json +1 -1
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/test_df_check_skip_check_telemetry.json +1 -1
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/test_df_check_telemetry.json +1 -1
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/test_input_telemetry.json +1 -1
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/test_output_telemetry.json +1 -1
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/test_pandera.py +7 -7
- snowpark_checkpoints_validators-0.4.0/test/unit/test_pandera_validations.py +130 -0
- snowpark_checkpoints_validators-0.4.0/test/unit/test_snowpark_sampler.py +117 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/unit/test_utils_checks.py +18 -11
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/unit/test_validation_result_metadata.py +3 -3
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/.gitignore +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/CHANGELOG.md +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/LICENSE +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/README.md +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/__init__.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/errors.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/io_utils/__init__.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/io_utils/io_default_strategy.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/io_utils/io_env_strategy.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/io_utils/io_file_manager.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/job_context.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/singleton.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/spark_migration.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/utils/__init__.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/utils/extra_config.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/utils/logging_utils.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/utils/pandera_check_manager.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/utils/supported_types.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/utils/telemetry.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/validation_result_metadata.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/validation_results.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/.coveragerc +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/e2eexample.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_compare_utils.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/df_mode_dataframe_mismatch_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/df_mode_dataframe_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/spark_checkpoint_df_fail_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/spark_checkpoint_df_pass_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/spark_checkpoint_limit_sample_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/spark_checkpoint_random_sample_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/spark_checkpoint_scalar_fail_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/spark_checkpoint_scalar_passing_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/test_df_check_fail_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/test_input_fail_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/test_output_fail_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/test_parquet.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/test_spark_checkpoint.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/unit/io_utils/test_default_strategy.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/unit/test_checkpoints.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/unit/test_extra_config.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/unit/test_job_context.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/unit/test_logger.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/unit/test_logging_utils.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/unit/test_pandera_check_manager.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/unit/test_spark_migration.py +0 -0
- {snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/unit/test_telemetry.py +0 -0
{snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: snowpark-checkpoints-validators
-Version: 0.3.2
+Version: 0.4.0
 Summary: Migration tools for Snowpark
 Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
 Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
{snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/pyproject.toml
RENAMED
@@ -127,7 +127,7 @@ exclude_lines = [

 [tool.hatch.envs.linter.scripts]
 check = [
-    'ruff check --fix .',
+    "echo 'Running linting checks...' && ruff check --config=../ruff.toml --statistics --verbose . || (echo '❌ LINTING FAILED: Please fix the above linting issues before proceeding. Use \"ruff check --config=../ruff.toml --fix .\" to auto-fix some issues, or fix them manually.' && exit 1)",
 ]

 [tool.hatch.envs.test.scripts]
{snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/checkpoint.py
RENAMED
@@ -332,7 +332,7 @@ def _check_dataframe_schema(
     pandera_schema_upper, sample_df = _process_sampling(
         df, pandera_schema, job_context, sample_frac, sample_number, sampling_strategy
     )
-    is_valid, validation_result =
+    is_valid, validation_result = validate(pandera_schema_upper, sample_df)
     if is_valid:
         LOGGER.info(
             "DataFrame schema validation passed for checkpoint '%s'",
@@ -438,7 +438,7 @@ def check_output_schema(
             sampler.process_args([snowpark_results])
             pandas_sample_args = sampler.get_sampled_pandas_args()

-            is_valid, validation_result =
+            is_valid, validation_result = validate(
                 pandera_schema, pandas_sample_args[0]
             )
             if is_valid:
@@ -565,7 +565,7 @@ def check_input_schema(
                     )
                     continue

-                is_valid, validation_result =
+                is_valid, validation_result = validate(
                     pandera_schema,
                     arg,
                 )
@@ -606,11 +606,31 @@ def check_input_schema(
     return check_input_with_decorator


-def
+def validate(
     schema: Union[type[DataFrameModel], DataFrameSchema],
     df: PandasDataFrame,
     lazy: bool = True,
 ) -> tuple[bool, PandasDataFrame]:
+    """Validate a Pandas DataFrame against a given Pandera schema.
+
+    Args:
+        schema (Union[type[DataFrameModel], DataFrameSchema]):
+            The schema to validate against. Can be a Pandera `DataFrameSchema` or
+            a `DataFrameModel` class.
+        df (PandasDataFrame):
+            The Pandas DataFrame to be validated.
+        lazy (bool, optional):
+            If `True`, collect all validation errors before raising an exception.
+            If `False`, raise an exception as soon as the first error is encountered.
+            Defaults to `True`.
+
+    Returns:
+        tuple[bool, PandasDataFrame]:
+            A tuple containing:
+            - A boolean indicating whether validation passed.
+            - The validated DataFrame if successful, or the failure cases DataFrame if not.
+
+    """
     if not isinstance(schema, DataFrameSchema):
         schema = schema.to_schema()
     is_valid = True
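The helper behind the schema checks is now a public `validate` function, so the same Pandera validation used by the decorators can be invoked directly. A minimal sketch of a direct call (the schema and data below are illustrative, not taken from the package):

```python
import pandas as pd
from pandera import Check, Column, DataFrameSchema

from snowflake.snowpark_checkpoints.checkpoint import validate

# Hypothetical schema: a single positive-integer column.
schema = DataFrameSchema({"A": Column(int, Check.greater_than(0))})
df = pd.DataFrame({"A": [1, 2, 3]})

# Returns (True, validated_df) on success; with lazy=True (the default) a
# failing frame yields (False, failure_cases_df) instead of raising early.
is_valid, result = validate(schema, df)
assert is_valid
```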
{snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/snowpark_sampler.py
RENAMED
@@ -20,7 +20,21 @@ from typing import Optional
 import pandas

 from snowflake.snowpark import DataFrame as SnowparkDataFrame
+from snowflake.snowpark.types import (
+    BinaryType,
+    BooleanType,
+    DateType,
+    FloatType,
+    StringType,
+    TimestampType,
+)
 from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from snowflake.snowpark_checkpoints.utils.constants import (
+    INTEGER_TYPE_COLLECTION,
+    PANDAS_FLOAT_TYPE,
+    PANDAS_LONG_TYPE,
+    PANDAS_STRING_TYPE,
+)


 LOGGER = logging.getLogger(__name__)
@@ -73,17 +87,17 @@ class SamplingAdapter:
                         "Applying random sampling with fraction %s",
                         self.sample_frac,
                     )
-                    df_sample = arg.sample(frac=self.sample_frac)
+                    df_sample = to_pandas(arg.sample(frac=self.sample_frac))
                 else:
                     LOGGER.info(
                         "Applying random sampling with size %s", self.sample_number
                     )
-                    df_sample = arg.sample(n=self.sample_number)
+                    df_sample = to_pandas(arg.sample(n=self.sample_number))
             else:
                 LOGGER.info(
                     "Applying limit sampling with size %s", self.sample_number
                 )
-                df_sample = arg.limit(self.sample_number)
+                df_sample = to_pandas(arg.limit(self.sample_number))

             LOGGER.info(
                 "Successfully sampled the DataFrame. Resulting DataFrame shape: %s",
@@ -122,3 +136,90 @@ class SamplingAdapter:
             else:
                 pyspark_sample_args.append(arg)
         return pyspark_sample_args
+
+
+def to_pandas(sampled_df: SnowparkDataFrame) -> pandas.DataFrame:
+    """Convert a Snowpark DataFrame to a Pandas DataFrame, handling missing values and type conversions."""
+    LOGGER.debug("Converting Snowpark DataFrame to Pandas DataFrame")
+    pandas_df = sampled_df.toPandas()
+    for field in sampled_df.schema.fields:
+        is_snowpark_integer = field.datatype.typeName() in INTEGER_TYPE_COLLECTION
+        is_snowpark_string = isinstance(field.datatype, StringType)
+        is_snowpark_binary = isinstance(field.datatype, BinaryType)
+        is_snowpark_timestamp = isinstance(field.datatype, TimestampType)
+        is_snowpark_float = isinstance(field.datatype, FloatType)
+        is_snowpark_boolean = isinstance(field.datatype, BooleanType)
+        is_snowpark_date = isinstance(field.datatype, DateType)
+        if is_snowpark_integer:
+            LOGGER.debug(
+                "Converting Spark integer column '%s' to Pandas nullable '%s' type",
+                field.name,
+                PANDAS_LONG_TYPE,
+            )
+            pandas_df[field.name] = (
+                pandas_df[field.name].astype(PANDAS_LONG_TYPE).fillna(0)
+            )
+        elif is_snowpark_string or is_snowpark_binary:
+            LOGGER.debug(
+                "Converting Spark string column '%s' to Pandas nullable '%s' type",
+                field.name,
+                PANDAS_STRING_TYPE,
+            )
+            pandas_df[field.name] = (
+                pandas_df[field.name].astype(PANDAS_STRING_TYPE).fillna("")
+            )
+        elif is_snowpark_timestamp:
+            LOGGER.debug(
+                "Converting Spark timestamp column '%s' to UTC naive Pandas datetime",
+                field.name,
+            )
+            pandas_df[field.name] = convert_all_to_utc_naive(
+                pandas_df[field.name]
+            ).fillna(pandas.NaT)
+        elif is_snowpark_float:
+            LOGGER.debug(
+                "Converting Spark float column '%s' to Pandas nullable float",
+                field.name,
+            )
+            pandas_df[field.name] = (
+                pandas_df[field.name].astype(PANDAS_FLOAT_TYPE).fillna(0.0)
+            )
+        elif is_snowpark_boolean:
+            LOGGER.debug(
+                "Converting Spark boolean column '%s' to Pandas nullable boolean",
+                field.name,
+            )
+            pandas_df[field.name] = (
+                pandas_df[field.name].astype("boolean").fillna(False)
+            )
+        elif is_snowpark_date:
+            LOGGER.debug(
+                "Converting Spark date column '%s' to Pandas nullable datetime",
+                field.name,
+            )
+            pandas_df[field.name] = pandas_df[field.name].fillna(pandas.NaT)
+
+    return pandas_df
+
+
+def convert_all_to_utc_naive(series: pandas.Series) -> pandas.Series:
+    """Convert all timezone-aware or naive timestamps in a series to UTC naive.
+
+    Naive timestamps are assumed to be in UTC and localized accordingly.
+    Timezone-aware timestamps are converted to UTC and then made naive.
+
+    Args:
+        series (pandas.Series): A Pandas Series of `pd.Timestamp` objects,
+            either naive or timezone-aware.
+
+    Returns:
+        pandas.Series: A Series of UTC-normalized naive timestamps (`tzinfo=None`).
+
+    """
+
+    def convert(ts):
+        if ts.tz is None:
+            ts = ts.tz_localize("UTC")
+        return ts.tz_convert("UTC").tz_localize(None)
+
+    return series.apply(convert)
{snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/utils/constants.py
RENAMED
@@ -133,3 +133,17 @@ VALIDATION_RESULTS_JSON_FILE_NAME: Final[str] = "checkpoint_validation_results.j
 SNOWFLAKE_CHECKPOINT_CONTRACT_FILE_PATH_ENV_VAR: Final[
     str
 ] = "SNOWFLAKE_CHECKPOINT_CONTRACT_FILE_PATH"
+
+BYTE_COLUMN_TYPE = "byte"
+INTEGER_COLUMN_TYPE = "integer"
+LONG_COLUMN_TYPE = "long"
+SHORT_COLUMN_TYPE = "short"
+PANDAS_FLOAT_TYPE = "float64"
+PANDAS_LONG_TYPE = "Int64"
+PANDAS_STRING_TYPE = "string"
+INTEGER_TYPE_COLLECTION = [
+    BYTE_COLUMN_TYPE,
+    INTEGER_COLUMN_TYPE,
+    LONG_COLUMN_TYPE,
+    SHORT_COLUMN_TYPE,
+]
{snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/src/snowflake/snowpark_checkpoints/utils/utils_checks.py
RENAMED
@@ -27,6 +27,9 @@ import numpy as np
 from pandera import DataFrameSchema

 from snowflake.snowpark import DataFrame as SnowparkDataFrame
+from snowflake.snowpark import Session
+from snowflake.snowpark.functions import col, expr
+from snowflake.snowpark.types import TimestampType
 from snowflake.snowpark_checkpoints.errors import SchemaValidationError
 from snowflake.snowpark_checkpoints.io_utils.io_file_manager import get_io_file_manager
 from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
@@ -42,7 +45,6 @@ from snowflake.snowpark_checkpoints.utils.constants import (
     DATAFRAME_EXECUTION_MODE,
     DATAFRAME_PANDERA_SCHEMA_KEY,
     DEFAULT_KEY,
-    EXCEPT_HASH_AGG_QUERY,
     FAIL_STATUS,
     PASS_STATUS,
     SNOWPARK_CHECKPOINTS_OUTPUT_DIRECTORY_NAME,
@@ -120,14 +122,14 @@ def _process_sampling(
     pandera_schema_upper = pandera_schema
     new_columns: dict[Any, Any] = {}

-    for
-    new_columns[
+    for column in pandera_schema.columns:
+        new_columns[column.upper()] = pandera_schema.columns[column]

     pandera_schema_upper = pandera_schema_upper.remove_columns(pandera_schema.columns)
     pandera_schema_upper = pandera_schema_upper.add_columns(new_columns)

     sample_df = sampler.get_sampled_pandas_args()[0]
-    sample_df.index = np.ones(sample_df.count().iloc[0])
+    sample_df.index = np.ones(sample_df.count().iloc[0], dtype=int)

     return pandera_schema_upper, sample_df
@@ -191,6 +193,7 @@ Please run the Snowpark checkpoint collector first."""
     schema_dict = checkpoint_schema_config.get(DATAFRAME_PANDERA_SCHEMA_KEY)
     schema_dict_str = json.dumps(schema_dict)
     schema = DataFrameSchema.from_json(schema_dict_str)
+    schema.coerce = False  # Disable coercion to ensure strict validation

     if DATAFRAME_CUSTOM_DATA_KEY not in checkpoint_schema_config:
         LOGGER.info(
@@ -270,6 +273,7 @@ def _compare_data(
         SchemaValidationError: If there is a data mismatch between the DataFrame and the checkpoint table.

     """
+    df = convert_timestamps_to_utc_date(df)
     new_table_name = CHECKPOINT_TABLE_NAME_FORMAT.format(checkpoint_name)
     LOGGER.info(
         "Writing Snowpark DataFrame to table: '%s' for checkpoint: '%s'",
@@ -283,12 +287,12 @@ def _compare_data(
         new_table_name,
         checkpoint_name,
     )
-    expect_df = job_context.snowpark_session.sql(
-        EXCEPT_HASH_AGG_QUERY, [checkpoint_name, new_table_name]
-    )

-
-
+    session = job_context.snowpark_session
+    result = get_comparison_differences(session, checkpoint_name, new_table_name)
+    has_failed = result.get("spark_only_rows") or result.get("snowpark_only_rows")
+    if has_failed or result.get("error"):
+        error_message = f"Data mismatch for checkpoint {checkpoint_name}: {result}"
         job_context._mark_fail(
             error_message,
             checkpoint_name,
@@ -312,6 +316,80 @@ def _compare_data(
     return True, None


+def get_comparison_differences(
+    session: Session, spark_table: str, snowpark_table: str
+) -> dict:
+    """Compare two tables and return the differences."""
+    try:
+        spark_raw_schema = session.table(spark_table).schema.names
+        snowpark_raw_schema = session.table(snowpark_table).schema.names
+
+        spark_normalized = {
+            col_name.strip('"').upper(): col_name for col_name in spark_raw_schema
+        }
+        snowpark_normalized = {
+            col_name.strip('"').upper(): col_name for col_name in snowpark_raw_schema
+        }
+
+        common_cols = sorted(
+            list(
+                set(spark_normalized.keys()).intersection(
+                    set(snowpark_normalized.keys())
+                )
+            )
+        )
+
+        if not common_cols:
+            return {
+                "error": f"No common columns found between {spark_table} and {snowpark_table}",
+            }
+
+        cols_for_spark_selection = [
+            spark_normalized[norm_col_name] for norm_col_name in common_cols
+        ]
+        cols_for_snowpark_selection = [
+            snowpark_normalized[norm_col_name] for norm_col_name in common_cols
+        ]
+
+        spark_ordered = session.table(spark_table).select(
+            *[col(c) for c in cols_for_spark_selection]
+        )
+        snowpark_ordered = session.table(snowpark_table).select(
+            *[col(c) for c in cols_for_snowpark_selection]
+        )
+
+        spark_leftovers = spark_ordered.except_(snowpark_ordered).collect()
+        snowpark_leftovers = snowpark_ordered.except_(spark_ordered).collect()
+
+        spark_only_rows = [row.asDict() for row in spark_leftovers]
+        snowpark_only_rows = [row.asDict() for row in snowpark_leftovers]
+
+        return {
+            "spark_only_rows": spark_only_rows,
+            "snowpark_only_rows": snowpark_only_rows,
+        }
+
+    except Exception as e:
+        return {"error": f"An error occurred: {str(e)}"}
+
+
+def convert_timestamps_to_utc_date(df):
+    """Convert and normalize all Snowpark timestamp columns to UTC.
+
+    This function ensures timestamps are consistent across environments for reliable comparison.
+    """
+    new_cols = []
+    for field in df.schema.fields:
+        if isinstance(field.datatype, TimestampType):
+            utc_midnight_ts = expr(
+                f"convert_timezone('UTC', cast(to_date({field.name}) as timestamp_tz))"
+            ).alias(field.name)
+            new_cols.append(utc_midnight_ts)
+        else:
+            new_cols.append(col(field.name))
+    return df.select(new_cols)
+
+
 def _find_frame_in(stack: list[inspect.FrameInfo]) -> tuple:
     """Find a specific frame in the provided stack trace.

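The hash-aggregate SQL comparison is replaced by a symmetric set difference over the common columns (`except_` in both directions), so a mismatch now reports exactly which rows exist on only one side. A sketch of how `_compare_data` interprets the returned dictionary (the row values below are illustrative):

```python
# Shape of the dict returned by get_comparison_differences (illustrative values).
result = {
    "spark_only_rows": [{"COLUMN1": 5}],  # rows present only in the Spark-side table
    "snowpark_only_rows": [],             # rows present only in the Snowpark-side table
}

# Any one-sided rows, or an "error" key, count as a data mismatch.
has_failed = result.get("spark_only_rows") or result.get("snowpark_only_rows")
if has_failed or result.get("error"):
    print("Data mismatch:", result)
```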
{snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/test_df_check_custom_check_telemetry.json
RENAMED
@@ -1,6 +1,6 @@
 {
   "message": {
-    "data": "{\"function\": \"_check_dataframe_schema\", \"mode\": 1, \"status\": true, \"schema_types\": [\"
+    "data": "{\"function\": \"_check_dataframe_schema\", \"mode\": 1, \"status\": true, \"schema_types\": [\"int64\", \"float64\"]}",
     "driver_type": "PythonConnector",
     "driver_version": "3.12.4",
     "event_name": "DataFrame_Validator_Schema",
{snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/test_df_check_from_file_telemetry.json
RENAMED
@@ -1,6 +1,6 @@
 {
   "message": {
-    "data": "{\"function\": \"_check_dataframe_schema\", \"mode\": 1, \"status\": true, \"schema_types\": [\"
+    "data": "{\"function\": \"_check_dataframe_schema\", \"mode\": 1, \"status\": true, \"schema_types\": [\"int64\", \"float64\"]}",
     "driver_type": "PythonConnector",
     "driver_version": "3.12.4",
     "event_name": "DataFrame_Validator_Schema",
{snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/test_df_check_skip_check_telemetry.json
RENAMED
@@ -1,6 +1,6 @@
 {
   "message": {
-    "data": "{\"function\": \"_check_dataframe_schema\", \"mode\": 1, \"status\": true, \"schema_types\": [\"
+    "data": "{\"function\": \"_check_dataframe_schema\", \"mode\": 1, \"status\": true, \"schema_types\": [\"int64\", \"float64\"]}",
     "driver_type": "PythonConnector",
     "driver_version": "3.12.4",
     "event_name": "DataFrame_Validator_Schema",
{snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/test_df_check_telemetry.json
RENAMED
@@ -1,6 +1,6 @@
 {
   "message": {
-    "data": "{\"function\": \"_check_dataframe_schema\", \"mode\": 1, \"status\": true, \"schema_types\": [\"
+    "data": "{\"function\": \"_check_dataframe_schema\", \"mode\": 1, \"status\": true, \"schema_types\": [\"int64\", \"float64\"]}",
     "driver_type": "PythonConnector",
     "driver_version": "3.12.4",
     "event_name": "DataFrame_Validator_Schema",
{snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/test_input_telemetry.json
RENAMED
@@ -1,6 +1,6 @@
 {
   "message": {
-    "data": "{\"function\": \"check_input_schema\", \"schema_types\": [\"
+    "data": "{\"function\": \"check_input_schema\", \"schema_types\": [\"int64\", \"float64\"]}",
     "driver_type": "PythonConnector",
     "driver_version": "3.12.4",
     "event_name": "DataFrame_Validator_Schema",
{snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/telemetry_expected/test_output_telemetry.json
RENAMED
@@ -1,6 +1,6 @@
 {
   "message": {
-    "data": "{\"function\": \"check_output_schema\", \"schema_types\": [\"
+    "data": "{\"function\": \"check_output_schema\", \"schema_types\": [\"int64\", \"float64\", \"float64\"]}",
     "driver_type": "PythonConnector",
     "driver_version": "3.12.4",
     "event_name": "DataFrame_Validator_Schema",
{snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/integ/test_pandera.py
RENAMED
@@ -23,7 +23,7 @@ from unittest.mock import MagicMock, patch

 import pytest

-from numpy import int8
+from numpy import int8, int64
 from pandas import DataFrame as PandasDataFrame
 from pandera import Check, Column, DataFrameSchema
 from pytest import raises
@@ -78,7 +78,7 @@ def test_input(telemetry_output_path):

     in_schema = DataFrameSchema(
         {
-            "COLUMN1": Column(
+            "COLUMN1": Column(int64, Check(lambda x: 0 <= x <= 10, element_wise=True)),
             "COLUMN2": Column(float, Check(lambda x: x < -1.2, element_wise=True)),
         }
     )
@@ -161,7 +161,7 @@ def test_output(telemetry_output_path):
     out_schema = DataFrameSchema(
         {
             "COLUMN1": Column(
-
+                int64, Check.between(0, 10, include_max=True, include_min=True)
             ),
             "COLUMN2": Column(float, Check.less_than_or_equal_to(-1.2)),
             "COLUMN3": Column(float, Check.less_than(10)),
@@ -244,7 +244,7 @@ def test_df_check(telemetry_output_path):

     schema = DataFrameSchema(
         {
-            "COLUMN1": Column(
+            "COLUMN1": Column(int64, Check(lambda x: 0 <= x <= 10, element_wise=True)),
             "COLUMN2": Column(float, Check(lambda x: x < -1.2, element_wise=True)),
         }
     )
@@ -320,7 +320,7 @@ def test_df_check_from_file(telemetry_output_path):

     schema = DataFrameSchema(
         {
-            "COLUMN1": Column(
+            "COLUMN1": Column(int64, Check.between(0, 10)),
             "COLUMN2": Column(float, Check.between(-20.5, -1.0)),
         }
     )
@@ -409,7 +409,7 @@ def test_df_check_custom_check(telemetry_output_path):

     schema = DataFrameSchema(
         {
-            "COLUMN1": Column(
+            "COLUMN1": Column(int64, Check(lambda x: 0 <= x <= 10, element_wise=True)),
             "COLUMN2": Column(float, Check(lambda x: x < -1.2, element_wise=True)),
         }
     )
@@ -454,7 +454,7 @@ def test_df_check_skip_check(telemetry_output_path):

     schema = DataFrameSchema(
         {
-            "COLUMN1": Column(
+            "COLUMN1": Column(int64, Check.between(0, 10, element_wise=True)),
             "COLUMN2": Column(
                 float,
                 [
snowpark_checkpoints_validators-0.4.0/test/unit/test_pandera_validations.py
ADDED
@@ -0,0 +1,130 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from snowflake.snowpark_checkpoints.checkpoint import validate
+from pandera import DataFrameSchema, Column, Check
+import pandas as pd
+import pytz
+
+
+def test_pandera_validate_equivalent_dataframes():
+    schema = DataFrameSchema(
+        {
+            "a": Column(
+                int, checks=Check(lambda s: s > 0, element_wise=True), nullable=False
+            )
+        }
+    )
+    df = pd.DataFrame({"a": [1, 2, 3]})
+    result, validated_df = validate(schema, df)
+    assert result
+    pd.testing.assert_frame_equal(validated_df, df)
+
+
+def test_pandera_validate_object_vs_string():
+    schema = DataFrameSchema({"a": Column(str, nullable=False)})
+
+    df_object = pd.DataFrame({"a": pd.Series(["x", "y", "z"], dtype="object")})
+    result, validated_df = validate(schema, df_object)
+    assert result
+
+    df_int_as_string = pd.DataFrame({"a": ["1", "2", "3"]})
+    result, validated_df = validate(schema, df_int_as_string)
+    assert result
+
+    df_mixed = pd.DataFrame({"a": ["x", 1, "z"]})
+    result, validated_df = validate(schema, df_mixed)
+    assert not result
+
+
+def test_pandera_validate_int_vs_string():
+    schema = DataFrameSchema({"a": Column(int, nullable=False)})
+    df_valid_int = pd.DataFrame({"a": [1, 2, 3]})
+    result, _ = validate(schema, df_valid_int)
+    assert result
+
+    df_string_numbers = pd.DataFrame({"a": ["1", "2", "3"]})
+    result, failure_cases = validate(schema, df_string_numbers)
+    assert not result
+
+    df_mixed = pd.DataFrame({"a": [1, "2", 3]})
+    result, failure_cases = validate(schema, df_mixed)
+    assert not result
+
+
+def test_timestamp_ntz():
+    schema = DataFrameSchema({"ts": Column(pd.Timestamp, nullable=False)})
+
+    df = pd.DataFrame(
+        {
+            "ts": pd.to_datetime(
+                ["2024-01-01 10:00", "2024-01-02 11:00", "2024-01-03 12:00"]
+            )
+        }
+    )
+    result, validated_df = validate(schema, df)
+    assert result
+    assert validated_df["ts"].dt.tz is None
+
+
+def test_timestamp_utc_timezone():
+    schema = DataFrameSchema({"ts": Column(pd.Timestamp, nullable=False)})
+
+    df = pd.DataFrame(
+        {
+            "ts": pd.to_datetime(
+                [
+                    "2024-01-01 10:00+00:00",
+                    "2024-01-02 11:00+00:00",
+                    "2024-01-03 12:00+00:00",
+                ]
+            )
+        }
+    )
+
+    df["ts"] = df["ts"].dt.tz_convert("UTC").dt.tz_localize(None)
+
+    result, validated_df = validate(schema, df)
+    assert result
+    assert validated_df["ts"].dt.tz is None
+
+
+def convert_all_to_utc_naive(series: pd.Series) -> pd.Series:
+    def convert(ts):
+        if ts.tz is None:
+            ts = ts.tz_localize("UTC")
+        return ts.tz_convert("UTC").tz_localize(None)
+
+    return series.apply(convert)
+
+
+def test_timestamp_mixed_timezones_fails():
+    schema = DataFrameSchema({"ts": Column(pd.Timestamp, nullable=False)})
+    eastern = pytz.timezone("US/Eastern")
+    df = pd.DataFrame(
+        {
+            "ts": [
+                pd.Timestamp("2024-01-01 10:00"),
+                eastern.localize(pd.Timestamp("2024-01-02 11:00")),
+                pd.Timestamp("2024-01-03 12:00+00:00"),
+            ]
+        }
+    )
+
+    df["ts"] = convert_all_to_utc_naive(df["ts"])
+    result, validated_df = validate(schema, df)
+
+    assert result
+    assert validated_df["ts"].dt.tz is None
snowpark_checkpoints_validators-0.4.0/test/unit/test_snowpark_sampler.py
ADDED
@@ -0,0 +1,117 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pandas as pd
+from snowflake.snowpark_checkpoints.snowpark_sampler import (
+    to_pandas,
+    convert_all_to_utc_naive,
+)
+from snowflake.snowpark.types import (
+    BinaryType,
+    FloatType,
+    StringType,
+    TimestampType,
+)
+from snowflake.snowpark_checkpoints.utils.constants import (
+    PANDAS_FLOAT_TYPE,
+    PANDAS_LONG_TYPE,
+    PANDAS_STRING_TYPE,
+)
+
+
+class DummyDataType:
+    def __init__(self, name):
+        self._name = name
+
+    def typeName(self):
+        return self._name
+
+
+class DummyField:
+    def __init__(self, name, datatype):
+        self.name = name
+        self.datatype = datatype
+        self.typeName = datatype
+
+
+class DummySchema:
+    def __init__(self, fields):
+        self.fields = fields
+
+
+class DummySnowparkDF:
+    def __init__(self, pandas_df, fields):
+        self._pandas_df = pandas_df
+        self.schema = DummySchema(fields)
+
+    def toPandas(self):
+        return self._pandas_df
+
+
+def test_to_pandas_integer_conversion():
+    df = pd.DataFrame({"int_col": [1, None]}, dtype="float")
+    fields = [DummyField("int_col", DummyDataType("integer"))]
+    sp_df = DummySnowparkDF(df, fields)
+
+    result = to_pandas(sp_df)
+    assert result["int_col"].dtype == PANDAS_LONG_TYPE
+    assert result["int_col"].iloc[1] == 0
+
+
+def test_to_pandas_string_and_binary_conversion():
+    df = pd.DataFrame({"str_col": ["a", None], "bin_col": ["b", None]})
+    fields = [
+        DummyField("str_col", StringType()),
+        DummyField("bin_col", BinaryType()),
+    ]
+    sp_df = DummySnowparkDF(df, fields)
+
+    result = to_pandas(sp_df)
+    assert result["str_col"].dtype == PANDAS_STRING_TYPE
+    assert result["bin_col"].dtype == PANDAS_STRING_TYPE
+
+
+def test_to_pandas_float_conversion():
+    df = pd.DataFrame({"float_col": [1.1, None]}, dtype="float")
+    fields = [DummyField("float_col", FloatType())]
+    sp_df = DummySnowparkDF(df, fields)
+
+    result = to_pandas(sp_df)
+    assert result["float_col"].dtype == PANDAS_FLOAT_TYPE
+
+
+def test_to_pandas_timestamp_conversion():
+    utc_ts = pd.Timestamp("2023-01-01 12:00:00", tz="UTC")
+    naive_ts = pd.Timestamp("2023-01-02 12:00:00")
+    df = pd.DataFrame({"ts_col": [utc_ts, naive_ts]})
+    fields = [DummyField("ts_col", TimestampType())]
+    sp_df = DummySnowparkDF(df, fields)
+
+    result = to_pandas(sp_df)
+    assert pd.api.types.is_datetime64_any_dtype(result["ts_col"])
+    assert result["ts_col"].iloc[0].tzinfo is None
+    assert result["ts_col"].iloc[1].tzinfo is None
+
+
+def test_convert_all_to_utc_naive_behavior():
+    utc_ts = pd.Timestamp("2024-01-01 10:00:00", tz="UTC")
+    naive_ts = pd.Timestamp("2024-01-01 12:00:00")
+    none_val = pd.NaT
+    series = pd.Series([utc_ts, naive_ts, none_val])
+
+    result = convert_all_to_utc_naive(series)
+    assert result[0].tzinfo is None
+    assert result[1].tzinfo is None
+    assert pd.isna(result[2])
{snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/unit/test_utils_checks.py
RENAMED
@@ -282,6 +282,7 @@ def test_compare_data_match():
     job_context = MagicMock(spec=SnowparkJobContext)
     session = MagicMock()
     job_context.snowpark_session = session
+    job_context.job_name = checkpoint_name

     # Mock session.sql to return an empty DataFrame (indicating no mismatch)
     session.sql.return_value.count.return_value = 0
@@ -289,7 +290,6 @@ def test_compare_data_match():
     checkpoint_name = "test_checkpoint"
     validation_status = PASS_STATUS
     output_path = "test_output_path/utils/"
-
     with (
         patch("os.getcwd", return_value="/mocked/path"),
        patch("os.path.exists", return_value=False),
@@ -298,6 +298,14 @@ def test_compare_data_match():
         patch(
             "snowflake.snowpark_checkpoints.utils.utils_checks._update_validation_result"
         ) as mock_update_validation_result,
+        patch(
+            "snowflake.snowpark_checkpoints.utils.utils_checks.convert_timestamps_to_utc_date",
+            return_value=df,
+        ),
+        patch(
+            "snowflake.snowpark_checkpoints.utils.utils_checks.get_comparison_differences",
+            return_value={},
+        ) as mock_get_comparison_differences,
     ):
         # Call the function
         _check_compare_data(df, job_context, checkpoint_name, output_path)
@@ -309,11 +317,7 @@ def test_compare_data_match():
     df.write.save_as_table.assert_called_once_with(
         table_name=new_checkpoint_name, mode=OVERWRITE_MODE
     )
-
-        call(EXCEPT_HASH_AGG_QUERY, [checkpoint_name, new_checkpoint_name]),
-        call().count(),
-    ]
-    session.sql.assert_has_calls(calls)
+    mock_get_comparison_differences.assert_called_once()
     job_context._mark_pass.assert_called_once_with(
         checkpoint_name, DATAFRAME_EXECUTION_MODE
     )
@@ -344,6 +348,13 @@ def test_compare_data_mismatch():
         patch(
             "snowflake.snowpark_checkpoints.utils.utils_checks._update_validation_result"
         ) as mock_update_validation_result,
+        patch(
+            "snowflake.snowpark_checkpoints.utils.utils_checks.convert_timestamps_to_utc_date",
+            return_value=df,
+        ),
+        patch(
+            "snowflake.snowpark_checkpoints.utils.utils_checks.get_comparison_differences"
+        ) as mock_get_comparison_differences,
     ):
         # Call the function and expect a SchemaValidationError
         with raises(
@@ -359,11 +370,7 @@ def test_compare_data_mismatch():
     df.write.save_as_table.assert_called_once_with(
         table_name=new_checkpoint_name, mode=OVERWRITE_MODE
     )
-
-        call(EXCEPT_HASH_AGG_QUERY, [checkpoint_name, new_checkpoint_name]),
-        call().count(),
-    ]
-    session.sql.assert_has_calls(calls)
+    mock_get_comparison_differences.assert_called_once()
     job_context._mark_fail.assert_called()
     job_context._mark_pass.assert_not_called()

{snowpark_checkpoints_validators-0.3.2 → snowpark_checkpoints_validators-0.4.0}/test/unit/test_validation_result_metadata.py
RENAMED
@@ -16,7 +16,7 @@ from snowflake.snowpark_checkpoints.validation_results import (
 )
 from pandas import DataFrame as PandasDataFrame, testing as PandasTesting
 from pandera import DataFrameSchema, Column, Check
-from snowflake.snowpark_checkpoints.checkpoint import
+from snowflake.snowpark_checkpoints.checkpoint import validate


 @fixture()
@@ -204,7 +204,7 @@ def test_clean_with_no_file():

 def test_validate_valid_schema(sample_data):
     df, valid_schema, _ = sample_data
-    is_valid, result =
+    is_valid, result = validate(valid_schema, df)
     assert is_valid
     assert isinstance(result, PandasDataFrame)
     PandasTesting.assert_frame_equal(result, df)
@@ -212,7 +212,7 @@ def test_validate_valid_schema(sample_data):

 def test_validate_invalid_schema(sample_data):
     df, _, invalid_schema = sample_data
-    is_valid, result =
+    is_valid, result = validate(invalid_schema, df)
     assert not is_valid
     assert isinstance(result, PandasDataFrame)
     assert "failure_case" in result.columns