snowpark-checkpoints-collectors 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,4 +13,4 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
 
16
- __version__ = "0.3.2"
16
+ __version__ = "0.3.3"
@@ -23,9 +23,9 @@ import pandera as pa
23
23
 
24
24
  from pyspark.sql import DataFrame as SparkDataFrame
25
25
  from pyspark.sql.functions import col
26
+ from pyspark.sql.types import BooleanType, FloatType, IntegerType, StructField
26
27
  from pyspark.sql.types import DoubleType as SparkDoubleType
27
28
  from pyspark.sql.types import StringType as SparkStringType
28
- from pyspark.sql.types import StructField
29
29
 
30
30
  from snowflake.snowpark_checkpoints_collector.collection_common import (
31
31
  CHECKPOINT_JSON_OUTPUT_FILE_NAME_FORMAT,
@@ -72,6 +72,14 @@ from snowflake.snowpark_checkpoints_collector.utils.telemetry import report_tele
72
72
 
73
73
  LOGGER = logging.getLogger(__name__)
74
74
 
75
# Replacement value used for nulls, keyed by the Spark data type of the column.
# Column types that are not listed here are not filled.
default_null_types = {
    # Numeric types fall back to zero.
    IntegerType(): 0,
    FloatType(): 0.0,
    SparkDoubleType(): 0.0,
    # Strings fall back to the empty string.
    SparkStringType(): "",
    # Booleans fall back to False.
    BooleanType(): False,
}
82
+
75
83
 
76
84
  @log
77
85
  def collect_dataframe_checkpoint(
@@ -253,6 +261,7 @@ def _collect_dataframe_checkpoint_mode_schema(
253
261
  column_type_dict: dict[str, any],
254
262
  output_path: Optional[str] = None,
255
263
  ) -> None:
264
+ df = normalize_missing_values(df)
256
265
  sampled_df = df.sample(sample)
257
266
  if sampled_df.isEmpty():
258
267
  LOGGER.warning("Sampled DataFrame is empty. Collecting full DataFrame.")
@@ -327,6 +336,15 @@ def _collect_dataframe_checkpoint_mode_schema(
327
336
  )
328
337
 
329
338
 
339
def normalize_missing_values(df: SparkDataFrame) -> SparkDataFrame:
    """Replace nulls in supported columns with a type-specific default value.

    Every column whose Spark data type is a key of ``default_null_types`` has
    its null values replaced with the mapped default. Columns of any other
    type are left untouched.

    Args:
        df: The PySpark DataFrame to normalize.

    Returns:
        A DataFrame with nulls filled in the supported columns, or ``df``
        itself when no column has a supported type.

    """
    fill_values = {}
    for field in df.schema.fields:
        default_value = default_null_types.get(field.dataType)
        if default_value is not None:
            fill_values[field.name] = default_value

    if not fill_values:
        # No column has a supported type: hand back the input unchanged.
        return df

    # Fill all columns with a single call instead of chaining one ``fillna``
    # (and therefore one new DataFrame) per column.
    return df.fillna(fill_values)
346
+
347
+
330
348
  def _get_spark_column_types(df: SparkDataFrame) -> dict[str, StructField]:
331
349
  schema = df.schema
332
350
  column_type_collection = {}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: snowpark-checkpoints-collectors
3
- Version: 0.3.2
3
+ Version: 0.3.3
4
4
  Summary: Snowpark column and table statistics collection
5
5
  Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
6
6
  Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
@@ -1,8 +1,8 @@
1
1
  snowflake/snowpark_checkpoints_collector/__init__.py,sha256=g4NemuA6Mj4O2jkK0yLQ8sEV3owHiiJnBEz_OWvlW1I,1179
2
- snowflake/snowpark_checkpoints_collector/__version__.py,sha256=1W0aBeLTL5Svy-qrNkZc6gAKtQLDbncpMyN2SlnJhoU,632
2
+ snowflake/snowpark_checkpoints_collector/__version__.py,sha256=Ui7rlwZptAHMmUJyQDko698T0mUizyxibZ43mcDDaqk,632
3
3
  snowflake/snowpark_checkpoints_collector/collection_common.py,sha256=ff5vYffrTRjoJXZQvVQBaOlegAUj_vXBbl1IZidz8Qo,4510
4
4
  snowflake/snowpark_checkpoints_collector/singleton.py,sha256=7AgIHQBXVRvPBBCkmBplzkdrrm-xVWf_N8svzA2vF8E,836
5
- snowflake/snowpark_checkpoints_collector/summary_stats_collector.py,sha256=kRJpVRE9Iy_uqeIPT-__Aan-YLWxQbgSjkJ3w4LpvCc,17214
5
+ snowflake/snowpark_checkpoints_collector/summary_stats_collector.py,sha256=eh--VdfPbpdrD0fyyB8bHMtKAwxDEQgRPOf5IaR6iL0,17824
6
6
  snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py,sha256=jZzx29WzrjH7C_6ZsBGoe4PxbW_oM4uIjySS1axIM34,1000
7
7
  snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py,sha256=XelL7LughZpKl1B_6bJoKOc_PqQg3UleX6zdgVXqTus,2926
8
8
  snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py,sha256=EY6WIIXRbvkTYC4bQn7jFALHh7D2PirVoiLZ5Kq8dNs,2659
@@ -37,7 +37,7 @@ snowflake/snowpark_checkpoints_collector/utils/extra_config.py,sha256=3kVf6WVA-E
37
37
  snowflake/snowpark_checkpoints_collector/utils/file_utils.py,sha256=5ztlNCv9GdSktUvtdfydv86cCFcmSXCdD4axZXJrOQQ,5125
38
38
  snowflake/snowpark_checkpoints_collector/utils/logging_utils.py,sha256=yyi6X5DqKeTg0HRhvsH6ymYp2P0wbnyKIzI2RzrQS7k,2278
39
39
  snowflake/snowpark_checkpoints_collector/utils/telemetry.py,sha256=ueN9vM8j5YNax7jMcnEj_UrgGkoeMv_hJHVKjN7hiJE,32161
40
- snowpark_checkpoints_collectors-0.3.2.dist-info/METADATA,sha256=ueYk6-aMlhiKfvH0CZbqjiEjlxUP1VQwKDejX28ju30,6613
41
- snowpark_checkpoints_collectors-0.3.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
42
- snowpark_checkpoints_collectors-0.3.2.dist-info/licenses/LICENSE,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
43
- snowpark_checkpoints_collectors-0.3.2.dist-info/RECORD,,
40
+ snowpark_checkpoints_collectors-0.3.3.dist-info/METADATA,sha256=3YCACSMQwOCF5e1lf8sX765aYYDBw38ukiEPbxb-7iA,6613
41
+ snowpark_checkpoints_collectors-0.3.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
42
+ snowpark_checkpoints_collectors-0.3.3.dist-info/licenses/LICENSE,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
43
+ snowpark_checkpoints_collectors-0.3.3.dist-info/RECORD,,