snowpark-checkpoints-validators 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,6 +27,7 @@ from snowflake.snowpark_checkpoints.checkpoint import (
27
27
  check_input_schema,
28
28
  check_output_schema,
29
29
  validate_dataframe_checkpoint,
30
+ xvalidate_dataframe_checkpoint,
30
31
  )
31
32
  from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
32
33
  from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
@@ -13,4 +13,4 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
 
16
- __version__ = "0.3.1"
16
+ __version__ = "0.3.3"
@@ -21,6 +21,10 @@ import pandas
21
21
 
22
22
  from snowflake.snowpark import DataFrame as SnowparkDataFrame
23
23
  from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
24
+ from snowflake.snowpark_checkpoints.utils.constants import (
25
+ INTEGER_TYPE_COLLECTION,
26
+ PANDAS_LONG_TYPE,
27
+ )
24
28
 
25
29
 
26
30
  LOGGER = logging.getLogger(__name__)
@@ -73,17 +77,17 @@ class SamplingAdapter:
73
77
  "Applying random sampling with fraction %s",
74
78
  self.sample_frac,
75
79
  )
76
- df_sample = arg.sample(frac=self.sample_frac).to_pandas()
80
+ df_sample = to_pandas(arg.sample(frac=self.sample_frac))
77
81
  else:
78
82
  LOGGER.info(
79
83
  "Applying random sampling with size %s", self.sample_number
80
84
  )
81
- df_sample = arg.sample(n=self.sample_number).to_pandas()
85
+ df_sample = to_pandas(arg.sample(n=self.sample_number))
82
86
  else:
83
87
  LOGGER.info(
84
88
  "Applying limit sampling with size %s", self.sample_number
85
89
  )
86
- df_sample = arg.limit(self.sample_number).to_pandas()
90
+ df_sample = to_pandas(arg.limit(self.sample_number))
87
91
 
88
92
  LOGGER.info(
89
93
  "Successfully sampled the DataFrame. Resulting DataFrame shape: %s",
@@ -122,3 +126,26 @@ class SamplingAdapter:
122
126
  else:
123
127
  pyspark_sample_args.append(arg)
124
128
  return pyspark_sample_args
129
+
130
+
131
def to_pandas(sampled_df: SnowparkDataFrame) -> pandas.DataFrame:
    """Materialize a sampled Snowpark DataFrame as a pandas DataFrame.

    The Snowpark frame is collected with ``toPandas()`` and the result is
    passed through ``normalize_missing_values_pandas`` before being returned.

    Args:
        sampled_df: The Snowpark DataFrame to convert.

    Returns:
        The pandas DataFrame produced by ``normalize_missing_values_pandas``.
    """
    LOGGER.debug("Converting Snowpark DataFrame to Pandas DataFrame")
    return normalize_missing_values_pandas(sampled_df.toPandas())
137
+
138
+
139
def normalize_missing_values_pandas(df: pandas.DataFrame) -> pandas.DataFrame:
    """Replace missing values with a neutral default chosen per column dtype.

    Defaults by dtype family:

    * integer (numpy or pandas nullable, any width) -> ``0``
    * float (numpy or pandas nullable, any width) -> ``0.0``
    * boolean (numpy ``bool`` or pandas ``boolean``) -> ``False``
    * object / string -> ``""``

    Columns of any other dtype (for example datetimes) are left untouched.

    Args:
        df: The pandas DataFrame to normalize. It is not modified.

    Returns:
        A new DataFrame in which the missing values of the handled columns
        have been filled.
    """
    # Use pandas' own dtype predicates instead of comparing dtype objects with
    # type-name strings: they recognise every width and the nullable extension
    # dtypes, and avoid accidental substring / identity comparisons.
    dtype_checks = pandas.api.types
    fill_values = {}
    for col, dtype in df.dtypes.items():
        if dtype_checks.is_integer_dtype(dtype):
            fill_values[col] = 0
        elif dtype_checks.is_float_dtype(dtype):
            fill_values[col] = 0.0
        elif dtype_checks.is_bool_dtype(dtype):
            fill_values[col] = False
        elif dtype_checks.is_object_dtype(dtype) or dtype_checks.is_string_dtype(
            dtype
        ):
            fill_values[col] = ""
    return df.fillna(value=fill_values)
@@ -133,3 +133,16 @@ VALIDATION_RESULTS_JSON_FILE_NAME: Final[str] = "checkpoint_validation_results.j
133
133
  SNOWFLAKE_CHECKPOINT_CONTRACT_FILE_PATH_ENV_VAR: Final[
134
134
  str
135
135
  ] = "SNOWFLAKE_CHECKPOINT_CONTRACT_FILE_PATH"
136
+
137
# Names of the integral column types.
# NOTE(review): presumably these are the type names used in the checkpoint
# schema for integer-like columns -- confirm against the producer of that schema.
BYTE_COLUMN_TYPE: Final[str] = "byte"
INTEGER_COLUMN_TYPE: Final[str] = "integer"
LONG_COLUMN_TYPE: Final[str] = "long"
SHORT_COLUMN_TYPE: Final[str] = "short"

# Name of the pandas nullable 64-bit integer dtype.
PANDAS_LONG_TYPE: Final[str] = "Int64"

# Every integral column type name defined above, grouped for membership tests.
INTEGER_TYPE_COLLECTION: Final[list[str]] = [
    BYTE_COLUMN_TYPE,
    INTEGER_COLUMN_TYPE,
    LONG_COLUMN_TYPE,
    SHORT_COLUMN_TYPE,
]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: snowpark-checkpoints-validators
3
- Version: 0.3.1
3
+ Version: 0.3.3
4
4
  Summary: Migration tools for Snowpark
5
5
  Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
6
6
  Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
@@ -1,10 +1,10 @@
1
- snowflake/snowpark_checkpoints/__init__.py,sha256=V2HtQkoek-2twos_Qit-ZRS9FrSsbQ58nVf0uuGnyyk,1473
2
- snowflake/snowpark_checkpoints/__version__.py,sha256=uSRs7fRupFeQ-z3PtU_6qh6ry8YBaSAnEIAvLhJKUR8,632
1
+ snowflake/snowpark_checkpoints/__init__.py,sha256=CfKakKzrSymSDP9zGSE2iK4RAHcHZSfL-zEG_8GnHnc,1509
2
+ snowflake/snowpark_checkpoints/__version__.py,sha256=Ui7rlwZptAHMmUJyQDko698T0mUizyxibZ43mcDDaqk,632
3
3
  snowflake/snowpark_checkpoints/checkpoint.py,sha256=pU-HdpoS4SYzJU0qEaFzS5QBUE8K55Sn8K27zJe9_xM,24187
4
4
  snowflake/snowpark_checkpoints/errors.py,sha256=9KjzRf8bjDZTTNL4LeySJAwuucDOyz0Ka7EFBKWFpyg,1821
5
5
  snowflake/snowpark_checkpoints/job_context.py,sha256=RMK0g0HrbDVrOAvai4PgsGvsAn_GIo9aFmh-tWlyieY,4183
6
6
  snowflake/snowpark_checkpoints/singleton.py,sha256=7AgIHQBXVRvPBBCkmBplzkdrrm-xVWf_N8svzA2vF8E,836
7
- snowflake/snowpark_checkpoints/snowpark_sampler.py,sha256=Qxv-8nRGuf-ab3GoSUt8_MNL0ppjoBIMOFIMkqmwN5I,4668
7
+ snowflake/snowpark_checkpoints/snowpark_sampler.py,sha256=-J1yf071UNGKxmuntGS3P2EtC2xaycYoMbYiqdDsQYw,5832
8
8
  snowflake/snowpark_checkpoints/spark_migration.py,sha256=s2HqomYx76Hqn71g9TleBeHI3t1nirgfPvkggqQQdts,10253
9
9
  snowflake/snowpark_checkpoints/validation_result_metadata.py,sha256=5C8f1g-Grs2ydpXiZBLGt5n9cvEHBaw2-CDeb2vnhpg,5847
10
10
  snowflake/snowpark_checkpoints/validation_results.py,sha256=J8OcpNty6hQD8RbAy8xmA0UMbPWfXSmQnHYspWWSisk,1502
@@ -13,14 +13,14 @@ snowflake/snowpark_checkpoints/io_utils/io_default_strategy.py,sha256=VMfdqj4uDg
13
13
  snowflake/snowpark_checkpoints/io_utils/io_env_strategy.py,sha256=ltG_rxm0CkJFXpskOf__ByZw-C6B9LtycqlyB9EmaJI,3569
14
14
  snowflake/snowpark_checkpoints/io_utils/io_file_manager.py,sha256=YHrxRBzTlhIUrSFrsoWkRY_Qa-TXgDWglr00T98Tc5g,2485
15
15
  snowflake/snowpark_checkpoints/utils/__init__.py,sha256=I4srmZ8G1q9DU6Suo1S91aVfNvETyisKH95uvLAvEJ0,609
16
- snowflake/snowpark_checkpoints/utils/constants.py,sha256=M3vLdvKiVOhHMo0oPu4P42Wn_v6UDqmK6wHOGuoG6sY,4179
16
+ snowflake/snowpark_checkpoints/utils/constants.py,sha256=_bidRtxp0QzvU3OwQUmAhbcyqEAtHWU0t7TtXzTcsyY,4442
17
17
  snowflake/snowpark_checkpoints/utils/extra_config.py,sha256=xOYaG6MfsUCAHI0C_7qWF_m96xcLIZWwrgxY4UlpaZI,4325
18
18
  snowflake/snowpark_checkpoints/utils/logging_utils.py,sha256=yyi6X5DqKeTg0HRhvsH6ymYp2P0wbnyKIzI2RzrQS7k,2278
19
19
  snowflake/snowpark_checkpoints/utils/pandera_check_manager.py,sha256=tQIozLO-2kM8WZ-gGKfRwmXBx1cDPaIZB0qIcArp8xA,16100
20
20
  snowflake/snowpark_checkpoints/utils/supported_types.py,sha256=GrMX2tHdSFnK7LlPbZx20UufD6Br6TNVRkkBwIxdPy0,1433
21
21
  snowflake/snowpark_checkpoints/utils/telemetry.py,sha256=GfuyIaI3QG4a4_qWwyJHvWRM0GENunNexuEJ6IgscF4,32684
22
22
  snowflake/snowpark_checkpoints/utils/utils_checks.py,sha256=oQ1c4n-uAA2kFIpWIRPWhbCW8e-wwOIL8qDqLvr5Fok,14398
23
- snowpark_checkpoints_validators-0.3.1.dist-info/METADATA,sha256=mtI8xnknt0g9McBdjcbNqOHRCDlJs2GtB1A1zcuH_00,12676
24
- snowpark_checkpoints_validators-0.3.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
25
- snowpark_checkpoints_validators-0.3.1.dist-info/licenses/LICENSE,sha256=pmjhbh6uVhV5MBXOlou_UZgFP7CYVQITkCCdvfcS5lY,11340
26
- snowpark_checkpoints_validators-0.3.1.dist-info/RECORD,,
23
+ snowpark_checkpoints_validators-0.3.3.dist-info/METADATA,sha256=aKQzgDOXoeJHxYrhCjtIBnLocT0EeB-0ZFydGmnWtdY,12676
24
+ snowpark_checkpoints_validators-0.3.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
25
+ snowpark_checkpoints_validators-0.3.3.dist-info/licenses/LICENSE,sha256=pmjhbh6uVhV5MBXOlou_UZgFP7CYVQITkCCdvfcS5lY,11340
26
+ snowpark_checkpoints_validators-0.3.3.dist-info/RECORD,,