snowpark-checkpoints-validators 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,4 +13,4 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- __version__ = "0.3.2"
+ __version__ = "0.4.0"
@@ -332,7 +332,7 @@ def _check_dataframe_schema(
  pandera_schema_upper, sample_df = _process_sampling(
  df, pandera_schema, job_context, sample_frac, sample_number, sampling_strategy
  )
- is_valid, validation_result = _validate(pandera_schema_upper, sample_df)
+ is_valid, validation_result = validate(pandera_schema_upper, sample_df)
  if is_valid:
  LOGGER.info(
  "DataFrame schema validation passed for checkpoint '%s'",
@@ -438,7 +438,7 @@ def check_output_schema(
  sampler.process_args([snowpark_results])
  pandas_sample_args = sampler.get_sampled_pandas_args()

- is_valid, validation_result = _validate(
+ is_valid, validation_result = validate(
  pandera_schema, pandas_sample_args[0]
  )
  if is_valid:
@@ -565,7 +565,7 @@ def check_input_schema(
  )
  continue

- is_valid, validation_result = _validate(
+ is_valid, validation_result = validate(
  pandera_schema,
  arg,
  )
@@ -606,11 +606,31 @@ def check_input_schema(
  return check_input_with_decorator


- def _validate(
+ def validate(
  schema: Union[type[DataFrameModel], DataFrameSchema],
  df: PandasDataFrame,
  lazy: bool = True,
  ) -> tuple[bool, PandasDataFrame]:
+ """Validate a Pandas DataFrame against a given Pandera schema.
+
+ Args:
+ schema (Union[type[DataFrameModel], DataFrameSchema]):
+ The schema to validate against. Can be a Pandera `DataFrameSchema` or
+ a `DataFrameModel` class.
+ df (PandasDataFrame):
+ The Pandas DataFrame to be validated.
+ lazy (bool, optional):
+ If `True`, collect all validation errors before raising an exception.
+ If `False`, raise an exception as soon as the first error is encountered.
+ Defaults to `True`.
+
+ Returns:
+ tuple[bool, PandasDataFrame]:
+ A tuple containing:
+ - A boolean indicating whether validation passed.
+ - The validated DataFrame if successful, or the failure cases DataFrame if not.
+
+ """
  if not isinstance(schema, DataFrameSchema):
  schema = schema.to_schema()
  is_valid = True
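
The helper formerly named _validate is now exposed as validate and documents its contract in a docstring. Below is a minimal usage sketch; the import path is inferred from the checkpoint.py entry in the RECORD and should be read as an assumption rather than something this diff states.

    # Hypothetical usage of the newly public helper. The import path is an
    # assumption inferred from the RECORD entry for checkpoint.py.
    import pandas as pd
    import pandera as pa

    from snowflake.snowpark_checkpoints.checkpoint import validate

    schema = pa.DataFrameSchema({"ID": pa.Column(int), "NAME": pa.Column(str)})
    df = pd.DataFrame({"ID": [1, 2], "NAME": ["a", "b"]})

    # lazy=True (the default) collects every failure before reporting.
    is_valid, result = validate(schema, df)
    # is_valid is True and result is the validated DataFrame, or
    # is_valid is False and result holds the Pandera failure cases.
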
@@ -20,7 +20,21 @@ from typing import Optional
  import pandas

  from snowflake.snowpark import DataFrame as SnowparkDataFrame
+ from snowflake.snowpark.types import (
+ BinaryType,
+ BooleanType,
+ DateType,
+ FloatType,
+ StringType,
+ TimestampType,
+ )
  from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from snowflake.snowpark_checkpoints.utils.constants import (
+ INTEGER_TYPE_COLLECTION,
+ PANDAS_FLOAT_TYPE,
+ PANDAS_LONG_TYPE,
+ PANDAS_STRING_TYPE,
+ )


  LOGGER = logging.getLogger(__name__)
@@ -73,17 +87,17 @@ class SamplingAdapter:
  "Applying random sampling with fraction %s",
  self.sample_frac,
  )
- df_sample = arg.sample(frac=self.sample_frac).to_pandas()
+ df_sample = to_pandas(arg.sample(frac=self.sample_frac))
  else:
  LOGGER.info(
  "Applying random sampling with size %s", self.sample_number
  )
- df_sample = arg.sample(n=self.sample_number).to_pandas()
+ df_sample = to_pandas(arg.sample(n=self.sample_number))
  else:
  LOGGER.info(
  "Applying limit sampling with size %s", self.sample_number
  )
- df_sample = arg.limit(self.sample_number).to_pandas()
+ df_sample = to_pandas(arg.limit(self.sample_number))

  LOGGER.info(
  "Successfully sampled the DataFrame. Resulting DataFrame shape: %s",
@@ -122,3 +136,90 @@ class SamplingAdapter:
  else:
  pyspark_sample_args.append(arg)
  return pyspark_sample_args
+
+
+ def to_pandas(sampled_df: SnowparkDataFrame) -> pandas.DataFrame:
+ """Convert a Snowpark DataFrame to a Pandas DataFrame, handling missing values and type conversions."""
+ LOGGER.debug("Converting Snowpark DataFrame to Pandas DataFrame")
+ pandas_df = sampled_df.toPandas()
+ for field in sampled_df.schema.fields:
+ is_snowpark_integer = field.datatype.typeName() in INTEGER_TYPE_COLLECTION
+ is_snowpark_string = isinstance(field.datatype, StringType)
+ is_snowpark_binary = isinstance(field.datatype, BinaryType)
+ is_snowpark_timestamp = isinstance(field.datatype, TimestampType)
+ is_snowpark_float = isinstance(field.datatype, FloatType)
+ is_snowpark_boolean = isinstance(field.datatype, BooleanType)
+ is_snowpark_date = isinstance(field.datatype, DateType)
+ if is_snowpark_integer:
+ LOGGER.debug(
+ "Converting Spark integer column '%s' to Pandas nullable '%s' type",
+ field.name,
+ PANDAS_LONG_TYPE,
+ )
+ pandas_df[field.name] = (
+ pandas_df[field.name].astype(PANDAS_LONG_TYPE).fillna(0)
+ )
+ elif is_snowpark_string or is_snowpark_binary:
+ LOGGER.debug(
+ "Converting Spark string column '%s' to Pandas nullable '%s' type",
+ field.name,
+ PANDAS_STRING_TYPE,
+ )
+ pandas_df[field.name] = (
+ pandas_df[field.name].astype(PANDAS_STRING_TYPE).fillna("")
+ )
+ elif is_snowpark_timestamp:
+ LOGGER.debug(
+ "Converting Spark timestamp column '%s' to UTC naive Pandas datetime",
+ field.name,
+ )
+ pandas_df[field.name] = convert_all_to_utc_naive(
+ pandas_df[field.name]
+ ).fillna(pandas.NaT)
+ elif is_snowpark_float:
+ LOGGER.debug(
+ "Converting Spark float column '%s' to Pandas nullable float",
+ field.name,
+ )
+ pandas_df[field.name] = (
+ pandas_df[field.name].astype(PANDAS_FLOAT_TYPE).fillna(0.0)
+ )
+ elif is_snowpark_boolean:
+ LOGGER.debug(
+ "Converting Spark boolean column '%s' to Pandas nullable boolean",
+ field.name,
+ )
+ pandas_df[field.name] = (
+ pandas_df[field.name].astype("boolean").fillna(False)
+ )
+ elif is_snowpark_date:
+ LOGGER.debug(
+ "Converting Spark date column '%s' to Pandas nullable datetime",
+ field.name,
+ )
+ pandas_df[field.name] = pandas_df[field.name].fillna(pandas.NaT)
+
+ return pandas_df
+
+
+ def convert_all_to_utc_naive(series: pandas.Series) -> pandas.Series:
+ """Convert all timezone-aware or naive timestamps in a series to UTC naive.
+
+ Naive timestamps are assumed to be in UTC and localized accordingly.
+ Timezone-aware timestamps are converted to UTC and then made naive.
+
+ Args:
+ series (pandas.Series): A Pandas Series of `pd.Timestamp` objects,
+ either naive or timezone-aware.
+
+ Returns:
+ pandas.Series: A Series of UTC-normalized naive timestamps (`tzinfo=None`).
+
+ """
+
+ def convert(ts):
+ if ts.tz is None:
+ ts = ts.tz_localize("UTC")
+ return ts.tz_convert("UTC").tz_localize(None)
+
+ return series.apply(convert)
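
The sampler now routes conversions through the new to_pandas helper, which coerces integer, string, binary, float, boolean, date, and timestamp columns to nullable Pandas dtypes and fills nulls. The sketch below isolates the timestamp rule; convert_all_to_utc_naive is reproduced from the hunk above so the snippet runs without a Snowpark session.

    # Standalone sketch of the timestamp normalization: naive values are assumed
    # to be UTC, aware values are converted to UTC, and the result is tz-naive.
    import pandas as pd

    def convert_all_to_utc_naive(series: pd.Series) -> pd.Series:
        def convert(ts):
            if ts.tz is None:
                ts = ts.tz_localize("UTC")
            return ts.tz_convert("UTC").tz_localize(None)
        return series.apply(convert)

    mixed = pd.Series([
        pd.Timestamp("2024-01-01 12:00:00"),                   # naive, treated as UTC
        pd.Timestamp("2024-01-01 12:00:00", tz="US/Eastern"),  # aware, shifted to UTC
    ])
    print(convert_all_to_utc_naive(mixed))
    # 0   2024-01-01 12:00:00
    # 1   2024-01-01 17:00:00
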
@@ -133,3 +133,17 @@ VALIDATION_RESULTS_JSON_FILE_NAME: Final[str] = "checkpoint_validation_results.j
  SNOWFLAKE_CHECKPOINT_CONTRACT_FILE_PATH_ENV_VAR: Final[
  str
  ] = "SNOWFLAKE_CHECKPOINT_CONTRACT_FILE_PATH"
+
+ BYTE_COLUMN_TYPE = "byte"
+ INTEGER_COLUMN_TYPE = "integer"
+ LONG_COLUMN_TYPE = "long"
+ SHORT_COLUMN_TYPE = "short"
+ PANDAS_FLOAT_TYPE = "float64"
+ PANDAS_LONG_TYPE = "Int64"
+ PANDAS_STRING_TYPE = "string"
+ INTEGER_TYPE_COLLECTION = [
+ BYTE_COLUMN_TYPE,
+ INTEGER_COLUMN_TYPE,
+ LONG_COLUMN_TYPE,
+ SHORT_COLUMN_TYPE,
+ ]
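
The new constants pair the Snowpark integer type names with the nullable Pandas dtypes that to_pandas targets. A small pandas-only sketch of the integer routing, using the values defined in this hunk:

    # Sketch of the dtype routing these constants enable; mirrors the integer
    # branch of to_pandas in snowpark_sampler.py.
    import pandas as pd

    INTEGER_TYPE_COLLECTION = ["byte", "integer", "long", "short"]
    PANDAS_LONG_TYPE = "Int64"

    snowpark_type_name = "short"    # e.g. field.datatype.typeName()
    raw = pd.Series([1, None, 3])   # contains a null, so a plain int64 cast would fail

    if snowpark_type_name in INTEGER_TYPE_COLLECTION:
        raw = raw.astype(PANDAS_LONG_TYPE).fillna(0)

    print(raw.dtype)                # Int64 (nullable), null replaced by 0
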
@@ -27,6 +27,9 @@ import numpy as np
  from pandera import DataFrameSchema

  from snowflake.snowpark import DataFrame as SnowparkDataFrame
+ from snowflake.snowpark import Session
+ from snowflake.snowpark.functions import col, expr
+ from snowflake.snowpark.types import TimestampType
  from snowflake.snowpark_checkpoints.errors import SchemaValidationError
  from snowflake.snowpark_checkpoints.io_utils.io_file_manager import get_io_file_manager
  from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
@@ -42,7 +45,6 @@ from snowflake.snowpark_checkpoints.utils.constants import (
  DATAFRAME_EXECUTION_MODE,
  DATAFRAME_PANDERA_SCHEMA_KEY,
  DEFAULT_KEY,
- EXCEPT_HASH_AGG_QUERY,
  FAIL_STATUS,
  PASS_STATUS,
  SNOWPARK_CHECKPOINTS_OUTPUT_DIRECTORY_NAME,
@@ -120,14 +122,14 @@ def _process_sampling(
  pandera_schema_upper = pandera_schema
  new_columns: dict[Any, Any] = {}

- for col in pandera_schema.columns:
- new_columns[col.upper()] = pandera_schema.columns[col]
+ for column in pandera_schema.columns:
+ new_columns[column.upper()] = pandera_schema.columns[column]

  pandera_schema_upper = pandera_schema_upper.remove_columns(pandera_schema.columns)
  pandera_schema_upper = pandera_schema_upper.add_columns(new_columns)

  sample_df = sampler.get_sampled_pandas_args()[0]
- sample_df.index = np.ones(sample_df.count().iloc[0])
+ sample_df.index = np.ones(sample_df.count().iloc[0], dtype=int)

  return pandera_schema_upper, sample_df
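
The index assignment in _process_sampling now pins the synthetic index to integers; without dtype=int, np.ones returns floats and the index values become 1.0 instead of 1. A tiny before-and-after illustration:

    # Before/after sketch of the index fix in _process_sampling.
    import numpy as np
    import pandas as pd

    sample_df = pd.DataFrame({"A": [10, 20, 30]})
    n_rows = sample_df.count().iloc[0]            # 3

    sample_df.index = np.ones(n_rows)             # float index: 1.0, 1.0, 1.0
    print(sample_df.index.dtype)                  # float64

    sample_df.index = np.ones(n_rows, dtype=int)  # integer index: 1, 1, 1
    print(sample_df.index.dtype)                  # int64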
 
@@ -191,6 +193,7 @@ Please run the Snowpark checkpoint collector first."""
  schema_dict = checkpoint_schema_config.get(DATAFRAME_PANDERA_SCHEMA_KEY)
  schema_dict_str = json.dumps(schema_dict)
  schema = DataFrameSchema.from_json(schema_dict_str)
+ schema.coerce = False # Disable coercion to ensure strict validation

  if DATAFRAME_CUSTOM_DATA_KEY not in checkpoint_schema_config:
  LOGGER.info(
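
Loading the schema from JSON now explicitly disables coercion, so Pandera reports dtype mismatches instead of silently casting values to the expected type. The sketch below shows the effect with an illustrative schema and data that are not taken from the package.

    # Sketch: with coercion disabled, a dtype mismatch is reported rather than
    # silently cast away. Schema and data here are illustrative only.
    import pandas as pd
    import pandera as pa

    schema = pa.DataFrameSchema({"ID": pa.Column(int)})
    schema.coerce = False                     # same switch as in the hunk above

    df = pd.DataFrame({"ID": ["1", "2"]})     # strings, not ints

    try:
        schema.validate(df, lazy=True)
    except pa.errors.SchemaErrors as exc:
        print(exc.failure_cases)              # reports the wrong dtype for ID
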
@@ -270,6 +273,7 @@ def _compare_data(
  SchemaValidationError: If there is a data mismatch between the DataFrame and the checkpoint table.

  """
+ df = convert_timestamps_to_utc_date(df)
  new_table_name = CHECKPOINT_TABLE_NAME_FORMAT.format(checkpoint_name)
  LOGGER.info(
  "Writing Snowpark DataFrame to table: '%s' for checkpoint: '%s'",
@@ -283,12 +287,12 @@ def _compare_data(
  new_table_name,
  checkpoint_name,
  )
- expect_df = job_context.snowpark_session.sql(
- EXCEPT_HASH_AGG_QUERY, [checkpoint_name, new_table_name]
- )

- if expect_df.count() != 0:
- error_message = f"Data mismatch for checkpoint {checkpoint_name}"
+ session = job_context.snowpark_session
+ result = get_comparison_differences(session, checkpoint_name, new_table_name)
+ has_failed = result.get("spark_only_rows") or result.get("snowpark_only_rows")
+ if has_failed or result.get("error"):
+ error_message = f"Data mismatch for checkpoint {checkpoint_name}: {result}"
  job_context._mark_fail(
  error_message,
  checkpoint_name,
@@ -312,6 +316,80 @@ def _compare_data(
  return True, None


+ def get_comparison_differences(
+ session: Session, spark_table: str, snowpark_table: str
+ ) -> dict:
+ """Compare two tables and return the differences."""
+ try:
+ spark_raw_schema = session.table(spark_table).schema.names
+ snowpark_raw_schema = session.table(snowpark_table).schema.names
+
+ spark_normalized = {
+ col_name.strip('"').upper(): col_name for col_name in spark_raw_schema
+ }
+ snowpark_normalized = {
+ col_name.strip('"').upper(): col_name for col_name in snowpark_raw_schema
+ }
+
+ common_cols = sorted(
+ list(
+ set(spark_normalized.keys()).intersection(
+ set(snowpark_normalized.keys())
+ )
+ )
+ )
+
+ if not common_cols:
+ return {
+ "error": f"No common columns found between {spark_table} and {snowpark_table}",
+ }
+
+ cols_for_spark_selection = [
+ spark_normalized[norm_col_name] for norm_col_name in common_cols
+ ]
+ cols_for_snowpark_selection = [
+ snowpark_normalized[norm_col_name] for norm_col_name in common_cols
+ ]
+
+ spark_ordered = session.table(spark_table).select(
+ *[col(c) for c in cols_for_spark_selection]
+ )
+ snowpark_ordered = session.table(snowpark_table).select(
+ *[col(c) for c in cols_for_snowpark_selection]
+ )
+
+ spark_leftovers = spark_ordered.except_(snowpark_ordered).collect()
+ snowpark_leftovers = snowpark_ordered.except_(spark_ordered).collect()
+
+ spark_only_rows = [row.asDict() for row in spark_leftovers]
+ snowpark_only_rows = [row.asDict() for row in snowpark_leftovers]
+
+ return {
+ "spark_only_rows": spark_only_rows,
+ "snowpark_only_rows": snowpark_only_rows,
+ }
+
+ except Exception as e:
+ return {"error": f"An error occurred: {str(e)}"}
+
+
+ def convert_timestamps_to_utc_date(df):
+ """Convert and normalize all Snowpark timestamp columns to UTC.
+
+ This function ensures timestamps are consistent across environments for reliable comparison.
+ """
+ new_cols = []
+ for field in df.schema.fields:
+ if isinstance(field.datatype, TimestampType):
+ utc_midnight_ts = expr(
+ f"convert_timezone('UTC', cast(to_date({field.name}) as timestamp_tz))"
+ ).alias(field.name)
+ new_cols.append(utc_midnight_ts)
+ else:
+ new_cols.append(col(field.name))
+ return df.select(new_cols)
+
+
  def _find_frame_in(stack: list[inspect.FrameInfo]) -> tuple:
  """Find a specific frame in the provided stack trace.
 
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: snowpark-checkpoints-validators
- Version: 0.3.2
+ Version: 0.4.0
  Summary: Migration tools for Snowpark
  Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
  Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
@@ -1,10 +1,10 @@
  snowflake/snowpark_checkpoints/__init__.py,sha256=CfKakKzrSymSDP9zGSE2iK4RAHcHZSfL-zEG_8GnHnc,1509
- snowflake/snowpark_checkpoints/__version__.py,sha256=1W0aBeLTL5Svy-qrNkZc6gAKtQLDbncpMyN2SlnJhoU,632
- snowflake/snowpark_checkpoints/checkpoint.py,sha256=pU-HdpoS4SYzJU0qEaFzS5QBUE8K55Sn8K27zJe9_xM,24187
+ snowflake/snowpark_checkpoints/__version__.py,sha256=mZG_4eaVJdzo54iJo1tR3khnIA6lKjmN2lUgMoangNY,632
+ snowflake/snowpark_checkpoints/checkpoint.py,sha256=4IzS_wuONVQdxUnCixymb1HJr3eeiiGEzXfvi74I1Qc,25002
  snowflake/snowpark_checkpoints/errors.py,sha256=9KjzRf8bjDZTTNL4LeySJAwuucDOyz0Ka7EFBKWFpyg,1821
  snowflake/snowpark_checkpoints/job_context.py,sha256=RMK0g0HrbDVrOAvai4PgsGvsAn_GIo9aFmh-tWlyieY,4183
  snowflake/snowpark_checkpoints/singleton.py,sha256=7AgIHQBXVRvPBBCkmBplzkdrrm-xVWf_N8svzA2vF8E,836
- snowflake/snowpark_checkpoints/snowpark_sampler.py,sha256=Qxv-8nRGuf-ab3GoSUt8_MNL0ppjoBIMOFIMkqmwN5I,4668
+ snowflake/snowpark_checkpoints/snowpark_sampler.py,sha256=soew7FBnWqGp6VeBEFDakNbyjJD1imVJepGf6UmbFew,8426
  snowflake/snowpark_checkpoints/spark_migration.py,sha256=s2HqomYx76Hqn71g9TleBeHI3t1nirgfPvkggqQQdts,10253
  snowflake/snowpark_checkpoints/validation_result_metadata.py,sha256=5C8f1g-Grs2ydpXiZBLGt5n9cvEHBaw2-CDeb2vnhpg,5847
  snowflake/snowpark_checkpoints/validation_results.py,sha256=J8OcpNty6hQD8RbAy8xmA0UMbPWfXSmQnHYspWWSisk,1502
@@ -13,14 +13,14 @@ snowflake/snowpark_checkpoints/io_utils/io_default_strategy.py,sha256=VMfdqj4uDg
  snowflake/snowpark_checkpoints/io_utils/io_env_strategy.py,sha256=ltG_rxm0CkJFXpskOf__ByZw-C6B9LtycqlyB9EmaJI,3569
  snowflake/snowpark_checkpoints/io_utils/io_file_manager.py,sha256=YHrxRBzTlhIUrSFrsoWkRY_Qa-TXgDWglr00T98Tc5g,2485
  snowflake/snowpark_checkpoints/utils/__init__.py,sha256=I4srmZ8G1q9DU6Suo1S91aVfNvETyisKH95uvLAvEJ0,609
- snowflake/snowpark_checkpoints/utils/constants.py,sha256=M3vLdvKiVOhHMo0oPu4P42Wn_v6UDqmK6wHOGuoG6sY,4179
+ snowflake/snowpark_checkpoints/utils/constants.py,sha256=SscPXRhTKfT2moChXheMDJBs1A8YWKvjNuQkwV8FT38,4501
  snowflake/snowpark_checkpoints/utils/extra_config.py,sha256=xOYaG6MfsUCAHI0C_7qWF_m96xcLIZWwrgxY4UlpaZI,4325
  snowflake/snowpark_checkpoints/utils/logging_utils.py,sha256=yyi6X5DqKeTg0HRhvsH6ymYp2P0wbnyKIzI2RzrQS7k,2278
  snowflake/snowpark_checkpoints/utils/pandera_check_manager.py,sha256=tQIozLO-2kM8WZ-gGKfRwmXBx1cDPaIZB0qIcArp8xA,16100
  snowflake/snowpark_checkpoints/utils/supported_types.py,sha256=GrMX2tHdSFnK7LlPbZx20UufD6Br6TNVRkkBwIxdPy0,1433
  snowflake/snowpark_checkpoints/utils/telemetry.py,sha256=GfuyIaI3QG4a4_qWwyJHvWRM0GENunNexuEJ6IgscF4,32684
- snowflake/snowpark_checkpoints/utils/utils_checks.py,sha256=oQ1c4n-uAA2kFIpWIRPWhbCW8e-wwOIL8qDqLvr5Fok,14398
- snowpark_checkpoints_validators-0.3.2.dist-info/METADATA,sha256=COJncHytOF0_orQJPFUkPgcNKMaQWk5l-TYVb2nQBMg,12676
- snowpark_checkpoints_validators-0.3.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- snowpark_checkpoints_validators-0.3.2.dist-info/licenses/LICENSE,sha256=pmjhbh6uVhV5MBXOlou_UZgFP7CYVQITkCCdvfcS5lY,11340
- snowpark_checkpoints_validators-0.3.2.dist-info/RECORD,,
+ snowflake/snowpark_checkpoints/utils/utils_checks.py,sha256=5-EdkNnCjCYfwzdDLgVg0GykbsueXaGYhh5pOE1j0Z8,17325
+ snowpark_checkpoints_validators-0.4.0.dist-info/METADATA,sha256=LtNR7bV-MskVmJ-4CzqEWcFia5_wNT8cJV8JEbeHy5s,12676
+ snowpark_checkpoints_validators-0.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ snowpark_checkpoints_validators-0.4.0.dist-info/licenses/LICENSE,sha256=pmjhbh6uVhV5MBXOlou_UZgFP7CYVQITkCCdvfcS5lY,11340
+ snowpark_checkpoints_validators-0.4.0.dist-info/RECORD,,