snowpark-checkpoints-validators 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_checkpoints/__init__.py +1 -0
- snowflake/snowpark_checkpoints/__version__.py +1 -1
- snowflake/snowpark_checkpoints/snowpark_sampler.py +30 -3
- snowflake/snowpark_checkpoints/utils/constants.py +13 -0
- {snowpark_checkpoints_validators-0.3.1.dist-info → snowpark_checkpoints_validators-0.3.3.dist-info}/METADATA +1 -1
- {snowpark_checkpoints_validators-0.3.1.dist-info → snowpark_checkpoints_validators-0.3.3.dist-info}/RECORD +8 -8
- {snowpark_checkpoints_validators-0.3.1.dist-info → snowpark_checkpoints_validators-0.3.3.dist-info}/WHEEL +0 -0
- {snowpark_checkpoints_validators-0.3.1.dist-info → snowpark_checkpoints_validators-0.3.3.dist-info}/licenses/LICENSE +0 -0
@@ -27,6 +27,7 @@ from snowflake.snowpark_checkpoints.checkpoint import (
|
|
27
27
|
check_input_schema,
|
28
28
|
check_output_schema,
|
29
29
|
validate_dataframe_checkpoint,
|
30
|
+
xvalidate_dataframe_checkpoint,
|
30
31
|
)
|
31
32
|
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
32
33
|
from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
|
@@ -21,6 +21,10 @@ import pandas
|
|
21
21
|
|
22
22
|
from snowflake.snowpark import DataFrame as SnowparkDataFrame
|
23
23
|
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
24
|
+
from snowflake.snowpark_checkpoints.utils.constants import (
|
25
|
+
INTEGER_TYPE_COLLECTION,
|
26
|
+
PANDAS_LONG_TYPE,
|
27
|
+
)
|
24
28
|
|
25
29
|
|
26
30
|
LOGGER = logging.getLogger(__name__)
|
@@ -73,17 +77,17 @@ class SamplingAdapter:
|
|
73
77
|
"Applying random sampling with fraction %s",
|
74
78
|
self.sample_frac,
|
75
79
|
)
|
76
|
-
df_sample = arg.sample(frac=self.sample_frac)
|
80
|
+
df_sample = to_pandas(arg.sample(frac=self.sample_frac))
|
77
81
|
else:
|
78
82
|
LOGGER.info(
|
79
83
|
"Applying random sampling with size %s", self.sample_number
|
80
84
|
)
|
81
|
-
df_sample = arg.sample(n=self.sample_number)
|
85
|
+
df_sample = to_pandas(arg.sample(n=self.sample_number))
|
82
86
|
else:
|
83
87
|
LOGGER.info(
|
84
88
|
"Applying limit sampling with size %s", self.sample_number
|
85
89
|
)
|
86
|
-
df_sample = arg.limit(self.sample_number)
|
90
|
+
df_sample = to_pandas(arg.limit(self.sample_number))
|
87
91
|
|
88
92
|
LOGGER.info(
|
89
93
|
"Successfully sampled the DataFrame. Resulting DataFrame shape: %s",
|
@@ -122,3 +126,26 @@ class SamplingAdapter:
|
|
122
126
|
else:
|
123
127
|
pyspark_sample_args.append(arg)
|
124
128
|
return pyspark_sample_args
|
129
|
+
|
130
|
+
|
131
|
+
def to_pandas(sampled_df: SnowparkDataFrame) -> pandas.DataFrame:
    """Materialize a Snowpark DataFrame as a pandas DataFrame with normalized missing values.

    Args:
        sampled_df: The Snowpark DataFrame to convert.

    Returns:
        The output of ``sampled_df.toPandas()`` after it has been passed
        through ``normalize_missing_values_pandas``.

    """
    LOGGER.debug("Converting Snowpark DataFrame to Pandas DataFrame")
    return normalize_missing_values_pandas(sampled_df.toPandas())
|
137
|
+
|
138
|
+
|
139
|
+
def normalize_missing_values_pandas(df: pandas.DataFrame) -> pandas.DataFrame:
    """Fill missing values with a default chosen from each column's dtype.

    The default is selected per column as follows:

    * dtype equal to an entry of ``INTEGER_TYPE_COLLECTION``, or whose name
      is exactly ``PANDAS_LONG_TYPE`` -> ``0``
    * ``float64`` -> ``0.0``
    * ``bool`` or ``boolean`` -> ``False``
    * ``object`` -> ``""``

    Columns with any other dtype are not given a fill value and are passed
    through unchanged.

    Args:
        df: The pandas DataFrame whose missing values should be normalized.

    Returns:
        The DataFrame produced by ``df.fillna`` with the per-column defaults.

    """
    fill_values = {}
    for col, dtype in df.dtypes.items():
        # Compare the dtype name for equality. ``in`` against a plain string
        # is a substring test, which would also accept fragments of the name.
        if dtype in INTEGER_TYPE_COLLECTION or str(dtype) == PANDAS_LONG_TYPE:
            fill_values[col] = 0
        # ``DataFrame.dtypes`` yields dtype instances, never the Python
        # classes ``float``/``bool``/``object``/``str``, so only the
        # comparisons against dtype names are meaningful here.
        elif dtype == "float64":
            fill_values[col] = 0.0
        elif dtype == "bool" or dtype == "boolean":
            fill_values[col] = False
        elif dtype == "object":
            fill_values[col] = ""
    return df.fillna(value=fill_values)
|
@@ -133,3 +133,16 @@ VALIDATION_RESULTS_JSON_FILE_NAME: Final[str] = "checkpoint_validation_results.j
|
|
133
133
|
SNOWFLAKE_CHECKPOINT_CONTRACT_FILE_PATH_ENV_VAR: Final[
|
134
134
|
str
|
135
135
|
] = "SNOWFLAKE_CHECKPOINT_CONTRACT_FILE_PATH"
|
136
|
+
|
137
|
+
# Names of the integral column types.
# NOTE(review): these look like Spark simple type names - confirm against the
# schema contract this package reads.
BYTE_COLUMN_TYPE: Final[str] = "byte"
INTEGER_COLUMN_TYPE: Final[str] = "integer"
LONG_COLUMN_TYPE: Final[str] = "long"
SHORT_COLUMN_TYPE: Final[str] = "short"

# "Int64" is the name of pandas' nullable 64-bit integer extension dtype.
PANDAS_LONG_TYPE: Final[str] = "Int64"

# Every integral column type name above, grouped for membership checks.
INTEGER_TYPE_COLLECTION: Final[list[str]] = [
    BYTE_COLUMN_TYPE,
    INTEGER_COLUMN_TYPE,
    LONG_COLUMN_TYPE,
    SHORT_COLUMN_TYPE,
]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: snowpark-checkpoints-validators
|
3
|
-
Version: 0.3.1
|
3
|
+
Version: 0.3.3
|
4
4
|
Summary: Migration tools for Snowpark
|
5
5
|
Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
|
6
6
|
Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
|
@@ -1,10 +1,10 @@
|
|
1
|
-
snowflake/snowpark_checkpoints/__init__.py,sha256=
|
2
|
-
snowflake/snowpark_checkpoints/__version__.py,sha256=
|
1
|
+
snowflake/snowpark_checkpoints/__init__.py,sha256=CfKakKzrSymSDP9zGSE2iK4RAHcHZSfL-zEG_8GnHnc,1509
|
2
|
+
snowflake/snowpark_checkpoints/__version__.py,sha256=Ui7rlwZptAHMmUJyQDko698T0mUizyxibZ43mcDDaqk,632
|
3
3
|
snowflake/snowpark_checkpoints/checkpoint.py,sha256=pU-HdpoS4SYzJU0qEaFzS5QBUE8K55Sn8K27zJe9_xM,24187
|
4
4
|
snowflake/snowpark_checkpoints/errors.py,sha256=9KjzRf8bjDZTTNL4LeySJAwuucDOyz0Ka7EFBKWFpyg,1821
|
5
5
|
snowflake/snowpark_checkpoints/job_context.py,sha256=RMK0g0HrbDVrOAvai4PgsGvsAn_GIo9aFmh-tWlyieY,4183
|
6
6
|
snowflake/snowpark_checkpoints/singleton.py,sha256=7AgIHQBXVRvPBBCkmBplzkdrrm-xVWf_N8svzA2vF8E,836
|
7
|
-
snowflake/snowpark_checkpoints/snowpark_sampler.py,sha256
|
7
|
+
snowflake/snowpark_checkpoints/snowpark_sampler.py,sha256=-J1yf071UNGKxmuntGS3P2EtC2xaycYoMbYiqdDsQYw,5832
|
8
8
|
snowflake/snowpark_checkpoints/spark_migration.py,sha256=s2HqomYx76Hqn71g9TleBeHI3t1nirgfPvkggqQQdts,10253
|
9
9
|
snowflake/snowpark_checkpoints/validation_result_metadata.py,sha256=5C8f1g-Grs2ydpXiZBLGt5n9cvEHBaw2-CDeb2vnhpg,5847
|
10
10
|
snowflake/snowpark_checkpoints/validation_results.py,sha256=J8OcpNty6hQD8RbAy8xmA0UMbPWfXSmQnHYspWWSisk,1502
|
@@ -13,14 +13,14 @@ snowflake/snowpark_checkpoints/io_utils/io_default_strategy.py,sha256=VMfdqj4uDg
|
|
13
13
|
snowflake/snowpark_checkpoints/io_utils/io_env_strategy.py,sha256=ltG_rxm0CkJFXpskOf__ByZw-C6B9LtycqlyB9EmaJI,3569
|
14
14
|
snowflake/snowpark_checkpoints/io_utils/io_file_manager.py,sha256=YHrxRBzTlhIUrSFrsoWkRY_Qa-TXgDWglr00T98Tc5g,2485
|
15
15
|
snowflake/snowpark_checkpoints/utils/__init__.py,sha256=I4srmZ8G1q9DU6Suo1S91aVfNvETyisKH95uvLAvEJ0,609
|
16
|
-
snowflake/snowpark_checkpoints/utils/constants.py,sha256=
|
16
|
+
snowflake/snowpark_checkpoints/utils/constants.py,sha256=_bidRtxp0QzvU3OwQUmAhbcyqEAtHWU0t7TtXzTcsyY,4442
|
17
17
|
snowflake/snowpark_checkpoints/utils/extra_config.py,sha256=xOYaG6MfsUCAHI0C_7qWF_m96xcLIZWwrgxY4UlpaZI,4325
|
18
18
|
snowflake/snowpark_checkpoints/utils/logging_utils.py,sha256=yyi6X5DqKeTg0HRhvsH6ymYp2P0wbnyKIzI2RzrQS7k,2278
|
19
19
|
snowflake/snowpark_checkpoints/utils/pandera_check_manager.py,sha256=tQIozLO-2kM8WZ-gGKfRwmXBx1cDPaIZB0qIcArp8xA,16100
|
20
20
|
snowflake/snowpark_checkpoints/utils/supported_types.py,sha256=GrMX2tHdSFnK7LlPbZx20UufD6Br6TNVRkkBwIxdPy0,1433
|
21
21
|
snowflake/snowpark_checkpoints/utils/telemetry.py,sha256=GfuyIaI3QG4a4_qWwyJHvWRM0GENunNexuEJ6IgscF4,32684
|
22
22
|
snowflake/snowpark_checkpoints/utils/utils_checks.py,sha256=oQ1c4n-uAA2kFIpWIRPWhbCW8e-wwOIL8qDqLvr5Fok,14398
|
23
|
-
snowpark_checkpoints_validators-0.3.
|
24
|
-
snowpark_checkpoints_validators-0.3.
|
25
|
-
snowpark_checkpoints_validators-0.3.
|
26
|
-
snowpark_checkpoints_validators-0.3.
|
23
|
+
snowpark_checkpoints_validators-0.3.3.dist-info/METADATA,sha256=aKQzgDOXoeJHxYrhCjtIBnLocT0EeB-0ZFydGmnWtdY,12676
|
24
|
+
snowpark_checkpoints_validators-0.3.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
25
|
+
snowpark_checkpoints_validators-0.3.3.dist-info/licenses/LICENSE,sha256=pmjhbh6uVhV5MBXOlou_UZgFP7CYVQITkCCdvfcS5lY,11340
|
26
|
+
snowpark_checkpoints_validators-0.3.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|