snowpark-checkpoints-collectors 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_checkpoints_collector/__version__.py +1 -1
- snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +19 -1
- {snowpark_checkpoints_collectors-0.3.2.dist-info → snowpark_checkpoints_collectors-0.3.3.dist-info}/METADATA +1 -1
- {snowpark_checkpoints_collectors-0.3.2.dist-info → snowpark_checkpoints_collectors-0.3.3.dist-info}/RECORD +6 -6
- {snowpark_checkpoints_collectors-0.3.2.dist-info → snowpark_checkpoints_collectors-0.3.3.dist-info}/WHEEL +0 -0
- {snowpark_checkpoints_collectors-0.3.2.dist-info → snowpark_checkpoints_collectors-0.3.3.dist-info}/licenses/LICENSE +0 -0
@@ -23,9 +23,9 @@ import pandera as pa
|
|
23
23
|
|
24
24
|
from pyspark.sql import DataFrame as SparkDataFrame
|
25
25
|
from pyspark.sql.functions import col
|
26
|
+
from pyspark.sql.types import BooleanType, FloatType, IntegerType, StructField
|
26
27
|
from pyspark.sql.types import DoubleType as SparkDoubleType
|
27
28
|
from pyspark.sql.types import StringType as SparkStringType
|
28
|
-
from pyspark.sql.types import StructField
|
29
29
|
|
30
30
|
from snowflake.snowpark_checkpoints_collector.collection_common import (
|
31
31
|
CHECKPOINT_JSON_OUTPUT_FILE_NAME_FORMAT,
|
@@ -72,6 +72,14 @@ from snowflake.snowpark_checkpoints_collector.utils.telemetry import report_tele
|
|
72
72
|
|
73
73
|
LOGGER = logging.getLogger(__name__)
|
74
74
|
|
75
|
+
default_null_types = {
|
76
|
+
IntegerType(): 0,
|
77
|
+
FloatType(): 0.0,
|
78
|
+
SparkDoubleType(): 0.0,
|
79
|
+
SparkStringType(): "",
|
80
|
+
BooleanType(): False,
|
81
|
+
}
|
82
|
+
|
75
83
|
|
76
84
|
@log
|
77
85
|
def collect_dataframe_checkpoint(
|
@@ -253,6 +261,7 @@ def _collect_dataframe_checkpoint_mode_schema(
|
|
253
261
|
column_type_dict: dict[str, any],
|
254
262
|
output_path: Optional[str] = None,
|
255
263
|
) -> None:
|
264
|
+
df = normalize_missing_values(df)
|
256
265
|
sampled_df = df.sample(sample)
|
257
266
|
if sampled_df.isEmpty():
|
258
267
|
LOGGER.warning("Sampled DataFrame is empty. Collecting full DataFrame.")
|
@@ -327,6 +336,15 @@ def _collect_dataframe_checkpoint_mode_schema(
|
|
327
336
|
)
|
328
337
|
|
329
338
|
|
339
|
+
def normalize_missing_values(df: SparkDataFrame) -> SparkDataFrame:
|
340
|
+
"""Normalize missing values in a PySpark DataFrame to ensure consistent handling of NA values."""
|
341
|
+
for field in df.schema.fields:
|
342
|
+
default_value = default_null_types.get(field.dataType, None)
|
343
|
+
if default_value is not None:
|
344
|
+
df = df.fillna({field.name: default_value})
|
345
|
+
return df
|
346
|
+
|
347
|
+
|
330
348
|
def _get_spark_column_types(df: SparkDataFrame) -> dict[str, StructField]:
|
331
349
|
schema = df.schema
|
332
350
|
column_type_collection = {}
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: snowpark-checkpoints-collectors
|
3
|
-
Version: 0.3.2
|
3
|
+
Version: 0.3.3
|
4
4
|
Summary: Snowpark column and table statistics collection
|
5
5
|
Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
|
6
6
|
Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
|
@@ -1,8 +1,8 @@
|
|
1
1
|
snowflake/snowpark_checkpoints_collector/__init__.py,sha256=g4NemuA6Mj4O2jkK0yLQ8sEV3owHiiJnBEz_OWvlW1I,1179
|
2
|
-
snowflake/snowpark_checkpoints_collector/__version__.py,sha256=
|
2
|
+
snowflake/snowpark_checkpoints_collector/__version__.py,sha256=Ui7rlwZptAHMmUJyQDko698T0mUizyxibZ43mcDDaqk,632
|
3
3
|
snowflake/snowpark_checkpoints_collector/collection_common.py,sha256=ff5vYffrTRjoJXZQvVQBaOlegAUj_vXBbl1IZidz8Qo,4510
|
4
4
|
snowflake/snowpark_checkpoints_collector/singleton.py,sha256=7AgIHQBXVRvPBBCkmBplzkdrrm-xVWf_N8svzA2vF8E,836
|
5
|
-
snowflake/snowpark_checkpoints_collector/summary_stats_collector.py,sha256=
|
5
|
+
snowflake/snowpark_checkpoints_collector/summary_stats_collector.py,sha256=eh--VdfPbpdrD0fyyB8bHMtKAwxDEQgRPOf5IaR6iL0,17824
|
6
6
|
snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py,sha256=jZzx29WzrjH7C_6ZsBGoe4PxbW_oM4uIjySS1axIM34,1000
|
7
7
|
snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py,sha256=XelL7LughZpKl1B_6bJoKOc_PqQg3UleX6zdgVXqTus,2926
|
8
8
|
snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py,sha256=EY6WIIXRbvkTYC4bQn7jFALHh7D2PirVoiLZ5Kq8dNs,2659
|
@@ -37,7 +37,7 @@ snowflake/snowpark_checkpoints_collector/utils/extra_config.py,sha256=3kVf6WVA-E
|
|
37
37
|
snowflake/snowpark_checkpoints_collector/utils/file_utils.py,sha256=5ztlNCv9GdSktUvtdfydv86cCFcmSXCdD4axZXJrOQQ,5125
|
38
38
|
snowflake/snowpark_checkpoints_collector/utils/logging_utils.py,sha256=yyi6X5DqKeTg0HRhvsH6ymYp2P0wbnyKIzI2RzrQS7k,2278
|
39
39
|
snowflake/snowpark_checkpoints_collector/utils/telemetry.py,sha256=ueN9vM8j5YNax7jMcnEj_UrgGkoeMv_hJHVKjN7hiJE,32161
|
40
|
-
snowpark_checkpoints_collectors-0.3.
|
41
|
-
snowpark_checkpoints_collectors-0.3.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
42
|
-
snowpark_checkpoints_collectors-0.3.2.dist-info/licenses/LICENSE,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
|
43
|
-
snowpark_checkpoints_collectors-0.3.2.dist-info/RECORD,,
|
40
|
+
snowpark_checkpoints_collectors-0.3.3.dist-info/METADATA,sha256=3YCACSMQwOCF5e1lf8sX765aYYDBw38ukiEPbxb-7iA,6613
|
41
|
+
snowpark_checkpoints_collectors-0.3.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
42
|
+
snowpark_checkpoints_collectors-0.3.3.dist-info/licenses/LICENSE,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
|
43
|
+
snowpark_checkpoints_collectors-0.3.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|