snowpark-checkpoints-collectors 0.3.1__tar.gz → 0.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/PKG-INFO +1 -1
  2. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/__version__.py +1 -1
  3. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +19 -1
  4. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_1.py +1 -1
  5. snowpark_checkpoints_collectors-0.3.3/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +1 -0
  6. snowpark_checkpoints_collectors-0.3.3/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +1 -0
  7. snowpark_checkpoints_collectors-0.3.3/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values.json +1 -0
  8. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/unit/test_summary_stats_collector.py +79 -0
  9. snowpark_checkpoints_collectors-0.3.1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +0 -1
  10. snowpark_checkpoints_collectors-0.3.1/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +0 -1
  11. snowpark_checkpoints_collectors-0.3.1/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values.json +0 -1
  12. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/.gitignore +0 -0
  13. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/CHANGELOG.md +0 -0
  14. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/LICENSE +0 -0
  15. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/README.md +0 -0
  16. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/pyproject.toml +0 -0
  17. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/snowpark-testdf-schema.json +0 -0
  18. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/__init__.py +0 -0
  19. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/collection_common.py +0 -0
  20. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +0 -0
  21. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +0 -0
  22. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +0 -0
  23. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +0 -0
  24. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +0 -0
  25. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +0 -0
  26. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +0 -0
  27. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +0 -0
  28. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +0 -0
  29. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +0 -0
  30. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +0 -0
  31. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +0 -0
  32. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +0 -0
  33. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +0 -0
  34. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +0 -0
  35. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +0 -0
  36. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +0 -0
  37. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +0 -0
  38. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +0 -0
  39. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +0 -0
  40. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +0 -0
  41. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +0 -0
  42. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +0 -0
  43. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/io_utils/__init__.py +0 -0
  44. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/io_utils/io_default_strategy.py +0 -0
  45. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/io_utils/io_env_strategy.py +0 -0
  46. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/io_utils/io_file_manager.py +0 -0
  47. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/singleton.py +0 -0
  48. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +0 -0
  49. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +0 -0
  50. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +0 -0
  51. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/utils/extra_config.py +0 -0
  52. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/utils/file_utils.py +0 -0
  53. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/utils/logging_utils.py +0 -0
  54. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +0 -0
  55. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/.coveragerc +0 -0
  56. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/telemetry_compare_utils.py +0 -0
  57. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_checkpoint_name.py +0 -0
  58. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +0 -0
  59. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +0 -0
  60. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +0 -0
  61. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +0 -0
  62. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +0 -0
  63. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column.json +0 -0
  64. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +0 -0
  65. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema.json +0 -0
  66. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +0 -0
  67. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_1_expected/test_full_df.json +0 -0
  68. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +0 -0
  69. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type_telemetry.json +0 -0
  70. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +0 -0
  71. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_1_expected/test_io_strategy.json +0 -0
  72. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_2.py +0 -0
  73. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_parquet_directory _telemetry.json +0 -0
  74. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_telemetry.json +0 -0
  75. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collect_df_mode_2_expected/test_collect_empty_dataframe_with_schema_telemetry.json +0 -0
  76. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_collection_result_file.py +0 -0
  77. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/integ/test_snow_connection_int.py +0 -0
  78. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/unit/io_utils/test_default_strategy.py +0 -0
  79. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/unit/test_checkpoint_name_utils.py +0 -0
  80. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/unit/test_collection_point_result.py +0 -0
  81. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/unit/test_collection_point_result_manager.py +0 -0
  82. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/unit/test_column_collection.py +0 -0
  83. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/unit/test_extra_config.py +0 -0
  84. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/unit/test_file_utils.py +0 -0
  85. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/unit/test_logger.py +0 -0
  86. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/unit/test_logging_utils.py +0 -0
  87. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/unit/test_pandera_column_check_manager.py +0 -0
  88. {snowpark_checkpoints_collectors-0.3.1 → snowpark_checkpoints_collectors-0.3.3}/test/unit/test_snow_connection.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: snowpark-checkpoints-collectors
3
- Version: 0.3.1
3
+ Version: 0.3.3
4
4
  Summary: Snowpark column and table statistics collection
5
5
  Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
6
6
  Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
@@ -13,4 +13,4 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
 
16
- __version__ = "0.3.1"
16
+ __version__ = "0.3.3"
@@ -23,9 +23,9 @@ import pandera as pa
23
23
 
24
24
  from pyspark.sql import DataFrame as SparkDataFrame
25
25
  from pyspark.sql.functions import col
26
+ from pyspark.sql.types import BooleanType, FloatType, IntegerType, StructField
26
27
  from pyspark.sql.types import DoubleType as SparkDoubleType
27
28
  from pyspark.sql.types import StringType as SparkStringType
28
- from pyspark.sql.types import StructField
29
29
 
30
30
  from snowflake.snowpark_checkpoints_collector.collection_common import (
31
31
  CHECKPOINT_JSON_OUTPUT_FILE_NAME_FORMAT,
@@ -72,6 +72,14 @@ from snowflake.snowpark_checkpoints_collector.utils.telemetry import report_tele
72
72
 
73
73
  LOGGER = logging.getLogger(__name__)
74
74
 
75
+ default_null_types = {
76
+ IntegerType(): 0,
77
+ FloatType(): 0.0,
78
+ SparkDoubleType(): 0.0,
79
+ SparkStringType(): "",
80
+ BooleanType(): False,
81
+ }
82
+
75
83
 
76
84
  @log
77
85
  def collect_dataframe_checkpoint(
@@ -253,6 +261,7 @@ def _collect_dataframe_checkpoint_mode_schema(
253
261
  column_type_dict: dict[str, any],
254
262
  output_path: Optional[str] = None,
255
263
  ) -> None:
264
+ df = normalize_missing_values(df)
256
265
  sampled_df = df.sample(sample)
257
266
  if sampled_df.isEmpty():
258
267
  LOGGER.warning("Sampled DataFrame is empty. Collecting full DataFrame.")
@@ -327,6 +336,15 @@ def _collect_dataframe_checkpoint_mode_schema(
327
336
  )
328
337
 
329
338
 
339
+ def normalize_missing_values(df: SparkDataFrame) -> SparkDataFrame:
340
+ """Normalize missing values in a PySpark DataFrame to ensure consistent handling of NA values."""
341
+ for field in df.schema.fields:
342
+ default_value = default_null_types.get(field.dataType, None)
343
+ if default_value is not None:
344
+ df = df.fillna({field.name: default_value})
345
+ return df
346
+
347
+
330
348
  def _get_spark_column_types(df: SparkDataFrame) -> dict[str, StructField]:
331
349
  schema = df.schema
332
350
  column_type_collection = {}
@@ -378,7 +378,7 @@ def test_collect_dataframe_with_null_values(
378
378
  )
379
379
 
380
380
  validate_checkpoint_file_output(output_path, checkpoint_name)
381
- assert "Converting column 'age' to 'Int64' type" in caplog.messages
381
+ assert "Collecting column 'age' of type 'integer'" in caplog.messages
382
382
 
383
383
 
384
384
  def test_collect_sampled_dataframe(spark_session, output_path):
@@ -0,0 +1 @@
1
+ {"pandera_schema": {"schema_type": "dataframe", "version": "0.20.4", "columns": {"a": {"title": null, "description": null, "dtype": "bool", "nullable": false, "checks": {"isin": [true, false]}, "unique": false, "coerce": false, "required": true, "regex": false}, "b": {"title": null, "description": null, "dtype": "Int64", "nullable": true, "checks": {"in_range": {"min_value": 1, "max_value": 1, "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "c": {"title": null, "description": null, "dtype": "object", "nullable": true, "checks": null, "unique": false, "coerce": false, "required": true, "regex": false}, "d": {"title": null, "description": null, "dtype": "timedelta64[ns]", "nullable": true, "checks": {"in_range": {"min_value": 1123200000000000, "max_value": 1123200000000000, "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "e": {"title": null, "description": null, "dtype": "float64", "nullable": false, "checks": {"in_range": {"min_value": 0.0, "max_value": 2.1, "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "f": {"title": null, "description": null, "dtype": "float32", "nullable": false, "checks": {"in_range": {"min_value": 0.0, "max_value": 3.109999895095825, "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "g": {"title": null, "description": null, "dtype": "int32", "nullable": false, "checks": {"in_range": {"min_value": 0, "max_value": 4, "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "h": {"title": null, "description": null, "dtype": "Int64", "nullable": true, "checks": {"in_range": {"min_value": 5, "max_value": 5, "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "i": {"title": 
null, "description": null, "dtype": "Int64", "nullable": true, "checks": {"in_range": {"min_value": 6, "max_value": 6, "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "j": {"title": null, "description": null, "dtype": "object", "nullable": false, "checks": {"str_length": {"min_value": 0, "max_value": 7}}, "unique": false, "coerce": false, "required": true, "regex": false}, "m": {"title": null, "description": null, "dtype": "datetime64[ns]", "nullable": true, "checks": {"in_range": {"min_value": "2000-01-01 12:53:00", "max_value": "2000-01-01 12:53:00", "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "n": {"title": null, "description": null, "dtype": "datetime64[ns]", "nullable": true, "checks": {"in_range": {"min_value": "2000-01-01 12:00:00", "max_value": "2000-01-01 12:00:00", "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "p": {"title": null, "description": null, "dtype": "object", "nullable": true, "checks": null, "unique": false, "coerce": false, "required": true, "regex": false}, "q": {"title": null, "description": null, "dtype": "object", "nullable": true, "checks": null, "unique": false, "coerce": false, "required": true, "regex": false}, "r": {"title": null, "description": null, "dtype": "object", "nullable": true, "checks": null, "unique": false, "coerce": false, "required": true, "regex": false}, "s": {"title": null, "description": null, "dtype": "object", "nullable": true, "checks": null, "unique": false, "coerce": false, "required": true, "regex": false}, "t": {"title": null, "description": null, "dtype": "object", "nullable": true, "checks": null, "unique": false, "coerce": false, "required": true, "regex": false}}, "checks": null, "index": [{"title": null, "description": null, "dtype": "int64", "nullable": false, "checks": {"greater_than_or_equal_to": 0.0, 
"less_than_or_equal_to": 2.0}, "name": null, "unique": false, "coerce": false}], "dtype": null, "coerce": true, "strict": false, "name": null, "ordered": false, "unique": null, "report_duplicates": "all", "unique_column_names": false, "add_missing_columns": false, "title": null, "description": null}, "custom_data": {"columns": [{"name": "a", "type": "boolean", "nullable": true, "rows_count": 3, "rows_not_null_count": 3, "rows_null_count": 0, "true_count": 1, "false_count": 2}, {"name": "b", "type": "byte", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "min": 1, "max": 1, "mean": 1.0, "decimal_precision": 0, "margin_error": 0.0}, {"name": "c", "type": "date", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "min": "2000-01-01", "max": "2000-01-01", "format": "%Y-%m-%d"}, {"name": "d", "type": "daytimeinterval", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "min": "13 days, 0:00:00", "max": "13 days, 0:00:00"}, {"name": "e", "type": "double", "nullable": true, "rows_count": 3, "rows_not_null_count": 3, "rows_null_count": 0, "min": 0.0, "max": 2.1, "mean": 1.4000000000000001, "decimal_precision": 1, "margin_error": 1.2124355652982142}, {"name": "f", "type": "float", "nullable": true, "rows_count": 3, "rows_not_null_count": 3, "rows_null_count": 0, "min": 0.0, "max": 3.109999895095825, "mean": 2.073333263397217, "decimal_precision": 15, "margin_error": 1.7955592766132826}, {"name": "g", "type": "integer", "nullable": true, "rows_count": 3, "rows_not_null_count": 3, "rows_null_count": 0, "min": 0, "max": 4, "mean": 2.6666666666666665, "decimal_precision": 0, "margin_error": 2.309401076758503}, {"name": "h", "type": "long", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "min": 5, "max": 5, "mean": 5.0, "decimal_precision": 0, "margin_error": 0.0}, {"name": "i", "type": "short", "nullable": true, "rows_count": 3, 
"rows_not_null_count": 2, "rows_null_count": 1, "min": 6, "max": 6, "mean": 6.0, "decimal_precision": 0, "margin_error": 0.0}, {"name": "j", "type": "string", "nullable": true, "rows_count": 3, "rows_not_null_count": 3, "rows_null_count": 0, "min_length": 0, "max_length": 7}, {"name": "m", "type": "timestamp", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "min": "2000-01-01 12:53:00", "max": "2000-01-01 12:53:00", "format": "%Y-%m-%dT%H:%M:%S%z"}, {"name": "n", "type": "timestamp_ntz", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "min": "2000-01-01 12:00:00", "max": "2000-01-01 12:00:00", "format": "%Y-%m-%dH:%M:%S"}, {"name": "o", "type": "decimal", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "min": "3.1415161718190", "max": "3.1415161718190", "mean": "3.14151617181900000", "decimal_precision": 13}, {"name": "p", "type": "array", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "value_type": "string", "allow_null": true, "null_value_proportion": 10.0, "max_size": 5, "min_size": 0, "mean_size": 3.3333333333333335, "is_unique_size": false}, {"name": "q", "type": "binary", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "max_size": 6, "min_size": 0, "mean_size": 2.6666666666666665, "is_unique_size": false}, {"name": "r", "type": "map", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "key_type": "string", "value_type": "string", "allow_null": true, "null_value_proportion": 0.0, "max_size": 5, "min_size": 0, "mean_size": 3, "is_unique_size": false}, {"name": "s", "type": "void", "nullable": true, "rows_count": 3, "rows_not_null_count": 0, "rows_null_count": 3}, {"name": "t", "type": "struct", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "metadata": [{"name": "inner1", "type": "string", "nullable": false, "rows_count": 
3, "rows_not_null_count": 2, "rows_null_count": 1}, {"name": "inner2", "type": "long", "nullable": true, "rows_count": 3, "rows_not_null_count": 1, "rows_null_count": 2}]}]}}
@@ -0,0 +1 @@
1
+ {"pandera_schema": {"schema_type": "dataframe", "version": "0.20.4", "columns": {"name": {"title": null, "description": null, "dtype": "object", "nullable": false, "checks": {"str_length": {"min_value": 0, "max_value": 6}}, "unique": false, "coerce": false, "required": true, "regex": false}, "age": {"title": null, "description": null, "dtype": "int32", "nullable": false, "checks": {"in_range": {"min_value": 0, "max_value": 51, "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "active": {"title": null, "description": null, "dtype": "bool", "nullable": false, "checks": {"isin": [true, false]}, "unique": false, "coerce": false, "required": true, "regex": false}}, "checks": null, "index": [{"title": null, "description": null, "dtype": "int64", "nullable": false, "checks": {"greater_than_or_equal_to": 0.0, "less_than_or_equal_to": 4.0}, "name": null, "unique": false, "coerce": false}], "dtype": null, "coerce": true, "strict": false, "name": null, "ordered": false, "unique": null, "report_duplicates": "all", "unique_column_names": false, "add_missing_columns": false, "title": null, "description": null}, "custom_data": {"columns": [{"name": "name", "type": "string", "nullable": true, "rows_count": 5, "rows_not_null_count": 5, "rows_null_count": 0, "min_length": 0, "max_length": 6}, {"name": "age", "type": "integer", "nullable": true, "rows_count": 5, "rows_not_null_count": 5, "rows_null_count": 0, "min": 0, "max": 51, "mean": 21.8, "decimal_precision": 0, "margin_error": 22.241852440837743}, {"name": "active", "type": "boolean", "nullable": true, "rows_count": 5, "rows_not_null_count": 5, "rows_null_count": 0, "true_count": 2, "false_count": 3}]}}
@@ -0,0 +1 @@
1
+ {"pandera_schema": {"schema_type": "dataframe", "version": "0.20.4", "columns": {"Description": {"title": null, "description": null, "dtype": "object", "nullable": false, "checks": {"str_length": {"min_value": 0, "max_value": 0}}, "unique": false, "coerce": false, "required": true, "regex": false}, "Price": {"title": null, "description": null, "dtype": "float64", "nullable": false, "checks": {"in_range": {"min_value": 0.0, "max_value": 0.0, "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "Active": {"title": null, "description": null, "dtype": "bool", "nullable": false, "checks": {"isin": [true, false]}, "unique": false, "coerce": false, "required": true, "regex": false}}, "checks": null, "index": [{"title": null, "description": null, "dtype": "int64", "nullable": false, "checks": {"greater_than_or_equal_to": 0.0, "less_than_or_equal_to": 0.0}, "name": null, "unique": false, "coerce": false}], "dtype": null, "coerce": true, "strict": false, "name": null, "ordered": false, "unique": null, "report_duplicates": "all", "unique_column_names": false, "add_missing_columns": false, "title": null, "description": null}, "custom_data": {"columns": [{"name": "Description", "type": "string", "nullable": true, "rows_count": 1, "rows_not_null_count": 1, "rows_null_count": 0, "min_length": 0, "max_length": 0}, {"name": "Price", "type": "double", "nullable": true, "rows_count": 1, "rows_not_null_count": 1, "rows_null_count": 0, "min": 0.0, "max": 0.0, "mean": 0.0, "decimal_precision": 1, "margin_error": null}, {"name": "Active", "type": "boolean", "nullable": true, "rows_count": 1, "rows_not_null_count": 1, "rows_null_count": 0, "true_count": 0, "false_count": 1}]}}
@@ -22,10 +22,22 @@ from typing import get_type_hints
22
22
  from unittest.mock import MagicMock, patch
23
23
 
24
24
  import pytest
25
+ from pyspark.sql import SparkSession
26
+ from pyspark.sql.types import (
27
+ StructField,
28
+ StructType,
29
+ IntegerType,
30
+ FloatType,
31
+ DoubleType,
32
+ StringType,
33
+ BooleanType,
34
+ TimestampType,
35
+ )
25
36
 
26
37
  from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
27
38
  collect_dataframe_checkpoint,
28
39
  generate_parquet_for_spark_df,
40
+ normalize_missing_values,
29
41
  xcollect_dataframe_checkpoint,
30
42
  )
31
43
 
@@ -98,3 +110,70 @@ def test_skip_collector_return_type_commutability():
98
110
  assert (
99
111
  collect_return == x_collect_return
100
112
  ), "The return type of collect_dataframe_checkpoint and xcollect_dataframe_checkpoint must be the same."
113
+
114
+
115
+ @pytest.fixture(scope="module")
116
+ def spark():
117
+ spark = SparkSession.builder.master("local[1]").appName("pytest").getOrCreate()
118
+ yield spark
119
+ spark.stop()
120
+
121
+
122
+ def test_normalize_missing_values_integer_and_float(spark):
123
+ schema = StructType(
124
+ [
125
+ StructField("a", IntegerType()),
126
+ StructField("b", FloatType()),
127
+ StructField("c", DoubleType()),
128
+ ]
129
+ )
130
+ data = [
131
+ (None, None, None),
132
+ (1, 2.0, 3.0),
133
+ ]
134
+ df = spark.createDataFrame(data, schema)
135
+ result_df = normalize_missing_values(df)
136
+ result = result_df.collect()
137
+ assert result[0]["a"] == 0
138
+ assert result[0]["b"] == 0.0
139
+ assert result[0]["c"] == 0.0
140
+ assert result[1]["a"] == 1
141
+ assert result[1]["b"] == 2.0
142
+ assert result[1]["c"] == 3.0
143
+
144
+
145
+ def test_normalize_missing_values_string_and_bool(spark):
146
+ schema = StructType(
147
+ [
148
+ StructField("s", StringType(), True),
149
+ StructField("b", BooleanType(), True),
150
+ ]
151
+ )
152
+ data = [
153
+ (None, None),
154
+ ("foo", True),
155
+ ]
156
+ df = spark.createDataFrame(data, schema)
157
+ result_df = normalize_missing_values(df)
158
+ result = result_df.collect()
159
+ assert result[0]["s"] == ""
160
+ assert result[0]["b"] is False
161
+ assert result[1]["s"] == "foo"
162
+ assert result[1]["b"] is True
163
+
164
+
165
+ def test_normalize_missing_values_unhandled_type(spark):
166
+ schema = StructType(
167
+ [
168
+ StructField("t", TimestampType(), True),
169
+ StructField("i", IntegerType(), True),
170
+ ]
171
+ )
172
+ data = [
173
+ (None, None),
174
+ ]
175
+ df = spark.createDataFrame(data, schema)
176
+ result_df = normalize_missing_values(df)
177
+ result = result_df.collect()
178
+ assert result[0]["t"] is None
179
+ assert result[0]["i"] == 0
@@ -1 +0,0 @@
1
- {"pandera_schema": {"schema_type": "dataframe", "version": "0.20.4", "columns": {"a": {"title": null, "description": null, "dtype": "object", "nullable": true, "checks": {"isin": [true, false]}, "unique": false, "coerce": false, "required": true, "regex": false}, "b": {"title": null, "description": null, "dtype": "Int64", "nullable": true, "checks": {"in_range": {"min_value": 1, "max_value": 1, "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "c": {"title": null, "description": null, "dtype": "object", "nullable": true, "checks": null, "unique": false, "coerce": false, "required": true, "regex": false}, "d": {"title": null, "description": null, "dtype": "timedelta64[ns]", "nullable": true, "checks": {"in_range": {"min_value": 1123200000000000, "max_value": 1123200000000000, "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "e": {"title": null, "description": null, "dtype": "float64", "nullable": true, "checks": {"in_range": {"min_value": 2.1, "max_value": 2.1, "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "f": {"title": null, "description": null, "dtype": "float32", "nullable": true, "checks": {"in_range": {"min_value": 3.109999895095825, "max_value": 3.109999895095825, "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "g": {"title": null, "description": null, "dtype": "Int64", "nullable": true, "checks": {"in_range": {"min_value": 4, "max_value": 4, "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "h": {"title": null, "description": null, "dtype": "Int64", "nullable": true, "checks": {"in_range": {"min_value": 5, "max_value": 5, "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "i": 
{"title": null, "description": null, "dtype": "Int64", "nullable": true, "checks": {"in_range": {"min_value": 6, "max_value": 6, "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "j": {"title": null, "description": null, "dtype": "object", "nullable": true, "checks": {"str_length": {"min_value": 7, "max_value": 7}}, "unique": false, "coerce": false, "required": true, "regex": false}, "m": {"title": null, "description": null, "dtype": "datetime64[ns]", "nullable": true, "checks": {"in_range": {"min_value": "2000-01-01 12:53:00", "max_value": "2000-01-01 12:53:00", "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "n": {"title": null, "description": null, "dtype": "datetime64[ns]", "nullable": true, "checks": {"in_range": {"min_value": "2000-01-01 12:00:00", "max_value": "2000-01-01 12:00:00", "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "p": {"title": null, "description": null, "dtype": "object", "nullable": true, "checks": null, "unique": false, "coerce": false, "required": true, "regex": false}, "q": {"title": null, "description": null, "dtype": "object", "nullable": true, "checks": null, "unique": false, "coerce": false, "required": true, "regex": false}, "r": {"title": null, "description": null, "dtype": "object", "nullable": true, "checks": null, "unique": false, "coerce": false, "required": true, "regex": false}, "s": {"title": null, "description": null, "dtype": "object", "nullable": true, "checks": null, "unique": false, "coerce": false, "required": true, "regex": false}, "t": {"title": null, "description": null, "dtype": "object", "nullable": true, "checks": null, "unique": false, "coerce": false, "required": true, "regex": false}}, "checks": null, "index": [{"title": null, "description": null, "dtype": "int64", "nullable": false, "checks": 
{"greater_than_or_equal_to": 0.0, "less_than_or_equal_to": 2.0}, "name": null, "unique": false, "coerce": false}], "dtype": null, "coerce": true, "strict": false, "name": null, "ordered": false, "unique": null, "report_duplicates": "all", "unique_column_names": false, "add_missing_columns": false, "title": null, "description": null}, "custom_data": {"columns": [{"name": "a", "type": "boolean", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "true_count": 1, "false_count": 1}, {"name": "b", "type": "byte", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "min": 1, "max": 1, "mean": 1, "decimal_precision": 0, "margin_error": 0.0}, {"name": "c", "type": "date", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "min": "2000-01-01", "max": "2000-01-01", "format": "%Y-%m-%d"}, {"name": "d", "type": "daytimeinterval", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "min": "13 days, 0:00:00", "max": "13 days, 0:00:00"}, {"name": "e", "type": "double", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "min": 2.1, "max": 2.1, "mean": 2.1, "decimal_precision": 1, "margin_error": 0.0}, {"name": "f", "type": "float", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "min": 3.109999895095825, "max": 3.109999895095825, "mean": 3.109999895095825, "decimal_precision": 15, "margin_error": 0.0}, {"name": "g", "type": "integer", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "min": 4, "max": 4, "mean": 4, "decimal_precision": 0, "margin_error": 0.0}, {"name": "h", "type": "long", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "min": 5, "max": 5, "mean": 5, "decimal_precision": 0, "margin_error": 0.0}, {"name": "i", "type": "short", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, 
"min": 6, "max": 6, "mean": 6, "decimal_precision": 0, "margin_error": 0.0}, {"name": "j", "type": "string", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "min_length": 7, "max_length": 7}, {"name": "m", "type": "timestamp", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "min": "2000-01-01 12:53:00", "max": "2000-01-01 12:53:00", "format": "%Y-%m-%dT%H:%M:%S%z"}, {"name": "n", "type": "timestamp_ntz", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "min": "2000-01-01 12:00:00", "max": "2000-01-01 12:00:00", "format": "%Y-%m-%dH:%M:%S"}, {"name": "o", "type": "decimal", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "min": "3.1415161718190", "max": "3.1415161718190", "mean": "3.14151617181900000", "decimal_precision": 13}, {"name": "p", "type": "array", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "value_type": "string", "allow_null": true, "null_value_proportion": 10.0, "max_size": 5, "min_size": 0, "mean_size": 3.3333333333333335, "is_unique_size": false}, {"name": "q", "type": "binary", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "max_size": 6, "min_size": 0, "mean_size": 2.6666666666666665, "is_unique_size": false}, {"name": "r", "type": "map", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "key_type": "string", "value_type": "string", "allow_null": true, "null_value_proportion": 0.0, "max_size": 5, "min_size": 0, "mean_size": 3, "is_unique_size": false}, {"name": "s", "type": "void", "nullable": true, "rows_count": 3, "rows_not_null_count": 0, "rows_null_count": 3}, {"name": "t", "type": "struct", "nullable": true, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 1, "metadata": [{"name": "inner1", "type": "string", "nullable": false, "rows_count": 3, "rows_not_null_count": 2, "rows_null_count": 
1}, {"name": "inner2", "type": "long", "nullable": true, "rows_count": 3, "rows_not_null_count": 1, "rows_null_count": 2}]}]}}
@@ -1 +0,0 @@
1
- {"pandera_schema": {"schema_type": "dataframe", "version": "0.20.4", "columns": {"name": {"title": null, "description": null, "dtype": "object", "nullable": true, "checks": {"str_length": {"min_value": 4, "max_value": 6}}, "unique": false, "coerce": false, "required": true, "regex": false}, "age": {"title": null, "description": null, "dtype": "Int64", "nullable": true, "checks": {"in_range": {"min_value": 23, "max_value": 51, "include_min": true, "include_max": true}}, "unique": false, "coerce": false, "required": true, "regex": false}, "active": {"title": null, "description": null, "dtype": "object", "nullable": true, "checks": {"isin": [true, false]}, "unique": false, "coerce": false, "required": true, "regex": false}}, "checks": null, "index": [{"title": null, "description": null, "dtype": "int64", "nullable": false, "checks": {"greater_than_or_equal_to": 0.0, "less_than_or_equal_to": 4.0}, "name": null, "unique": false, "coerce": false}], "dtype": null, "coerce": true, "strict": false, "name": null, "ordered": false, "unique": null, "report_duplicates": "all", "unique_column_names": false, "add_missing_columns": false, "title": null, "description": null}, "custom_data": {"columns": [{"name": "name", "type": "string", "nullable": true, "rows_count": 5, "rows_not_null_count": 4, "rows_null_count": 1, "min_length": 4, "max_length": 6}, {"name": "age", "type": "integer", "nullable": true, "rows_count": 5, "rows_not_null_count": 3, "rows_null_count": 2, "min": 23, "max": 51, "mean": 36.333333333333336, "decimal_precision": 0, "margin_error": 14.047538337136986}, {"name": "active", "type": "boolean", "nullable": true, "rows_count": 5, "rows_not_null_count": 4, "rows_null_count": 1, "true_count": 2, "false_count": 2}]}}
@@ -1 +0,0 @@
1
- {"pandera_schema": {"schema_type": "dataframe", "version": "0.20.4", "columns": {"Description": {"title": null, "description": null, "dtype": "object", "nullable": true, "checks": null, "unique": false, "coerce": false, "required": true, "regex": false}, "Price": {"title": null, "description": null, "dtype": "float64", "nullable": true, "checks": null, "unique": false, "coerce": false, "required": true, "regex": false}, "Active": {"title": null, "description": null, "dtype": "object", "nullable": true, "checks": null, "unique": false, "coerce": false, "required": true, "regex": false}}, "checks": null, "index": [{"title": null, "description": null, "dtype": "int64", "nullable": false, "checks": {"greater_than_or_equal_to": 0.0, "less_than_or_equal_to": 0.0}, "name": null, "unique": false, "coerce": false}], "dtype": null, "coerce": true, "strict": false, "name": null, "ordered": false, "unique": null, "report_duplicates": "all", "unique_column_names": false, "add_missing_columns": false, "title": null, "description": null}, "custom_data": {"columns": [{"name": "Description", "type": "string", "nullable": true, "rows_count": 1, "rows_not_null_count": 0, "rows_null_count": 1}, {"name": "Price", "type": "double", "nullable": true, "rows_count": 1, "rows_not_null_count": 0, "rows_null_count": 1}, {"name": "Active", "type": "boolean", "nullable": true, "rows_count": 1, "rows_not_null_count": 0, "rows_null_count": 1}]}}