snowpark-checkpoints-collectors 0.1.0rc1__tar.gz → 0.1.0rc2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/.gitignore +3 -0
  2. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/PKG-INFO +89 -18
  3. snowpark_checkpoints_collectors-0.1.0rc2/README.md +102 -0
  4. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/__init__.py +3 -2
  5. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/collection_common.py +10 -0
  6. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +1 -1
  7. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +18 -18
  8. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +22 -16
  9. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +17 -11
  10. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +18 -11
  11. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +7 -7
  12. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +15 -8
  13. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +15 -8
  14. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +22 -10
  15. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +9 -7
  16. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +25 -17
  17. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +5 -5
  18. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +24 -11
  19. snowpark_checkpoints_collectors-0.1.0rc2/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +59 -0
  20. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +10 -8
  21. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +18 -8
  22. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +18 -8
  23. snowpark_checkpoints_collectors-0.1.0rc2/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +212 -0
  24. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +94 -45
  25. snowpark_checkpoints_collectors-0.1.0rc2/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +49 -0
  26. snowpark_checkpoints_collectors-0.1.0rc2/test/integ/telemetry_compare_utils.py +52 -0
  27. snowpark_checkpoints_collectors-0.1.0rc2/test/integ/test_checkpoint_name.py +51 -0
  28. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/integ/test_collect_df_mode_1.py +40 -47
  29. snowpark_checkpoints_collectors-0.1.0rc2/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +1 -0
  30. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +1 -1
  31. snowpark_checkpoints_collectors-0.1.0rc2/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +1 -0
  32. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +1 -1
  33. snowpark_checkpoints_collectors-0.1.0rc2/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +1 -0
  34. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +1 -1
  35. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +1 -1
  36. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +1 -1
  37. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +1 -1
  38. snowpark_checkpoints_collectors-0.1.0rc2/test/integ/test_collect_df_mode_1_expected/test_full_df.json +1 -0
  39. snowpark_checkpoints_collectors-0.1.0rc2/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +1 -0
  40. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type_telemetry.json +1 -1
  41. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +1 -1
  42. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/integ/test_collect_df_mode_2.py +56 -8
  43. snowpark_checkpoints_collectors-0.1.0rc2/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_parquet_directory _telemetry.json +17 -0
  44. snowpark_checkpoints_collectors-0.1.0rc2/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_telemetry.json +17 -0
  45. snowpark_checkpoints_collectors-0.1.0rc2/test/integ/test_collect_df_mode_2_expected/test_collect_empty_dataframe_with_schema_telemetry.json +17 -0
  46. snowpark_checkpoints_collectors-0.1.0rc2/test/integ/test_collect_df_mode_2_expected/test_collect_invalid_mode_telemetry.json +17 -0
  47. snowpark_checkpoints_collectors-0.1.0rc2/test/integ/test_collect_df_mode_2_expected/test_generate_parquet_for_spark_df_telemetry.json +17 -0
  48. snowpark_checkpoints_collectors-0.1.0rc2/test/integ/test_collect_df_mode_2_expected/test_spark_df_mode_dataframe_telemetry.json +17 -0
  49. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/integ/test_collection_result_file.py +1 -1
  50. snowpark_checkpoints_collectors-0.1.0rc2/test/unit/test_checkpoint_name_utils.py +47 -0
  51. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/unit/test_collection_point_result_manager.py +1 -1
  52. snowpark_checkpoints_collectors-0.1.0rc2/test/unit/test_column_collection.py +466 -0
  53. snowpark_checkpoints_collectors-0.1.0rc2/test/unit/test_pandera_column_check_manager.py +194 -0
  54. snowpark_checkpoints_collectors-0.1.0rc1/README.md +0 -31
  55. snowpark_checkpoints_collectors-0.1.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +0 -35
  56. snowpark_checkpoints_collectors-0.1.0rc1/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +0 -168
  57. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +0 -1
  58. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +0 -1
  59. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +0 -1
  60. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df.json +0 -1
  61. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +0 -1
  62. snowpark_checkpoints_collectors-0.1.0rc1/test/unit/test_column_collection.py +0 -669
  63. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/CHANGELOG.md +0 -0
  64. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/LICENSE +0 -0
  65. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/pyproject.toml +0 -0
  66. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/snowpark-testdf-schema.json +0 -0
  67. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +0 -0
  68. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +0 -0
  69. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +0 -0
  70. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +0 -0
  71. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +0 -0
  72. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/singleton.py +0 -0
  73. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +0 -0
  74. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +0 -0
  75. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/utils/extra_config.py +0 -0
  76. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/utils/file_utils.py +0 -0
  77. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +0 -0
  78. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/.coveragerc +0 -0
  79. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values.json +0 -0
  80. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column.json +0 -0
  81. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema.json +0 -0
  82. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/integ/test_snow_connection_int.py +0 -0
  83. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/unit/test_collection_point_result.py +0 -0
  84. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/unit/test_extra_config.py +0 -0
  85. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/unit/test_file_utils.py +0 -0
  86. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/unit/test_snow_connection.py +0 -0
  87. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc2}/test/unit/test_summary_stats_collector.py +0 -0

.gitignore
@@ -4,10 +4,13 @@
 
 # demos
 snowpark-checkpoints-output/
+Demos/Demos/
+Demos/snowpark-checkpoints-output/
 
 # env
 wheelvenv/
 
+
 # version
 !__version__.py
 

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: snowpark-checkpoints-collectors
-Version: 0.1.0rc1
+Version: 0.1.0rc2
 Summary: Snowpark column and table statistics collection
 Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
 Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
@@ -243,7 +243,7 @@ Requires-Dist: setuptools>=70.0.0; extra == 'development'
 Requires-Dist: twine==5.1.1; extra == 'development'
 Description-Content-Type: text/markdown
 
-# Data Collection from Spark Pipelines
+# snowpark-checkpoints-collectors
 
 ---
 **NOTE**
@@ -252,25 +252,96 @@ This package is on Private Preview.
 
 ---
 
-The `snowpark-checkpoints-collector` package can collect
-schema and check information from a spark pipeline and
-record those results into a set of JSON files corresponding to different intermediate dataframes. These files can be inspected manually
-and handed over to teams implementing the snowpark pipeline. The `snowpark-checkpoints-collector` package is designed to have minimal
-dependencies and the generated files are meant to be inspected by security
-teams.
+**snowpark-checkpoints-collector** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
+## Features
 
-On the snowpark side the `snowpark-checkpoints` package can use these files to perform schema and data validation checks against snowpark dataframes at the same, intermediate logical "checkpoints".
+- Schema inference collected data mode (Schema): This is the default mode, which leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe. This mode also collects custom data from columns of the DataFrame based on the PySpark type.
+- DataFrame collected data mode (DataFrame): This mode collects the data of the PySpark dataframe. In this case, the mechanism saves all data of the given dataframe in parquet format. Using the default user Snowflake connection, it tries to upload the parquet files into the Snowflake temporal stage and create a table based on the information in the stage. The name of the file and the table is the same as the checkpoint.
 
-## collect_dataframe_schema
 
+
+## Functionalities
+
+### Collect DataFrame Checkpoint
+
+
+
+```python
+from pyspark.sql import DataFrame as SparkDataFrame
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from typing import Optional
+
+# Signature of the function
+def collect_dataframe_checkpoint(
+    df: SparkDataFrame,
+    checkpoint_name: str,
+    sample: Optional[float] = None,
+    mode: Optional[CheckpointMode] = None,
+    output_path: Optional[str] = None,
+) -> None:
+    ...
+```
+
+- `df`: The input Spark dataframe to collect.
+- `checkpoint_name`: Name of the checkpoint schema file or dataframe.
+- `sample`: Fraction of DataFrame to sample for schema inference, defaults to 1.0.
+- `mode`: The mode to execution the collection (Schema or Dataframe), defaults to CheckpointMode.Schema.
+- `output_path`: The output path to save the checkpoint, defaults to current working directory.
+
+
+## Usage Example
+
+### Schema mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+
+spark_session = SparkSession.builder.getOrCreate()
+sample_size = 1.0
+
+pyspark_df = spark_session.createDataFrame(
+    [("apple", 21), ("lemon", 34), ("banana", 50)], schema="fruit string, age integer"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_1",
+    sample=sample_size,
+    mode=CheckpointMode.SCHEMA,
+)
 ```
-from snowflake.snowpark_checkpoints_collector import collect_dataframe_schema;
-collect_dataframe_schema(df:SparkDataFrame,
-checkpoint_name,
-sample=0.1)
+
+
+### Dataframe mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from pyspark.sql.types import StructType, StructField, ByteType, StringType, IntegerType
+
+spark_schema = StructType(
+    [
+        StructField("BYTE", ByteType(), True),
+        StructField("STRING", StringType(), True),
+        StructField("INTEGER", IntegerType(), True)
+    ]
+)
+
+data = [(1, "apple", 21), (2, "lemon", 34), (3, "banana", 50)]
+
+spark_session = SparkSession.builder.getOrCreate()
+pyspark_df = spark_session.createDataFrame(data, schema=spark_schema).orderBy(
+    "INTEGER"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_2",
+    mode=CheckpointMode.DATAFRAME,
+)
 ```
 
-- df - the spark data frame to collect the schema from
-- checkpoint_name - the name of the "checkpoint". Generated JSON files
-will have the name "snowpark-[checkpoint_name]-schema.json"
-- sample - sample size of the spark data frame to use to generate the schema
+------

README.md (new file)
@@ -0,0 +1,102 @@
+# snowpark-checkpoints-collectors
+
+---
+**NOTE**
+
+This package is on Private Preview.
+
+---
+
+**snowpark-checkpoints-collector** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
+## Features
+
+- Schema inference collected data mode (Schema): This is the default mode, which leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe. This mode also collects custom data from columns of the DataFrame based on the PySpark type.
+- DataFrame collected data mode (DataFrame): This mode collects the data of the PySpark dataframe. In this case, the mechanism saves all data of the given dataframe in parquet format. Using the default user Snowflake connection, it tries to upload the parquet files into the Snowflake temporal stage and create a table based on the information in the stage. The name of the file and the table is the same as the checkpoint.
+
+
+
+## Functionalities
+
+### Collect DataFrame Checkpoint
+
+
+
+```python
+from pyspark.sql import DataFrame as SparkDataFrame
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from typing import Optional
+
+# Signature of the function
+def collect_dataframe_checkpoint(
+    df: SparkDataFrame,
+    checkpoint_name: str,
+    sample: Optional[float] = None,
+    mode: Optional[CheckpointMode] = None,
+    output_path: Optional[str] = None,
+) -> None:
+    ...
+```
+
+- `df`: The input Spark dataframe to collect.
+- `checkpoint_name`: Name of the checkpoint schema file or dataframe.
+- `sample`: Fraction of DataFrame to sample for schema inference, defaults to 1.0.
+- `mode`: The mode to execution the collection (Schema or Dataframe), defaults to CheckpointMode.Schema.
+- `output_path`: The output path to save the checkpoint, defaults to current working directory.
+
+
+## Usage Example
+
+### Schema mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+
+spark_session = SparkSession.builder.getOrCreate()
+sample_size = 1.0
+
+pyspark_df = spark_session.createDataFrame(
+    [("apple", 21), ("lemon", 34), ("banana", 50)], schema="fruit string, age integer"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_1",
+    sample=sample_size,
+    mode=CheckpointMode.SCHEMA,
+)
+```
+
+
+### Dataframe mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from pyspark.sql.types import StructType, StructField, ByteType, StringType, IntegerType
+
+spark_schema = StructType(
+    [
+        StructField("BYTE", ByteType(), True),
+        StructField("STRING", StringType(), True),
+        StructField("INTEGER", IntegerType(), True)
+    ]
+)
+
+data = [(1, "apple", 21), (2, "lemon", 34), (3, "banana", 50)]
+
+spark_session = SparkSession.builder.getOrCreate()
+pyspark_df = spark_session.createDataFrame(data, schema=spark_schema).orderBy(
+    "INTEGER"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_2",
+    mode=CheckpointMode.DATAFRAME,
+)
+```
+
+------

src/snowflake/snowpark_checkpoints_collector/__init__.py
@@ -2,9 +2,10 @@
 # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
 #
 
-__all__ = ["collect_dataframe_checkpoint", "Singleton"]
+__all__ = ["collect_dataframe_checkpoint", "CheckpointMode"]
 
-from snowflake.snowpark_checkpoints_collector.singleton import Singleton
 from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
     collect_dataframe_checkpoint,
 )
+
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
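
Taken together with the `collection_common.py` and `collection_point_result_manager.py` hunks below, the net effect on the 0.1.0rc2 import surface looks roughly like this (a hedged sketch; only the two import paths shown here appear in the diff):

```python
# 0.1.0rc2 public API: CheckpointMode is re-exported from the package root.
from snowflake.snowpark_checkpoints_collector import (
    CheckpointMode,
    collect_dataframe_checkpoint,
)

# Singleton is no longer re-exported; internal modules import it directly.
from snowflake.snowpark_checkpoints_collector.singleton import Singleton
```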

src/snowflake/snowpark_checkpoints_collector/collection_common.py
@@ -8,8 +8,13 @@ from enum import IntEnum
 
 
 class CheckpointMode(IntEnum):
+
+    """Enum class representing the collection mode."""
+
     SCHEMA = 1
+    """Collect automatic schema inference"""
     DATAFRAME = 2
+    """Export DataFrame as Parquet file to Snowflake"""
 
 
 # CONSTANTS
@@ -76,11 +81,13 @@ COLUMN_IS_UNIQUE_SIZE_KEY = "is_unique_size"
 COLUMN_KEY_TYPE_KEY = "key_type"
 COLUMN_MARGIN_ERROR_KEY = "margin_error"
 COLUMN_MAX_KEY = "max"
+COLUMN_MAX_LENGTH_KEY = "max_length"
 COLUMN_MAX_SIZE_KEY = "max_size"
 COLUMN_MEAN_KEY = "mean"
 COLUMN_MEAN_SIZE_KEY = "mean_size"
 COLUMN_METADATA_KEY = "metadata"
 COLUMN_MIN_KEY = "min"
+COLUMN_MIN_LENGTH_KEY = "min_length"
 COLUMN_MIN_SIZE_KEY = "min_size"
 COLUMN_NAME_KEY = "name"
 COLUMN_NULL_COUNT_KEY = "null_count"
@@ -90,6 +97,7 @@ COLUMN_ROWS_NULL_COUNT_KEY = "rows_null_count"
 COLUMN_SIZE_KEY = "size"
 COLUMN_TRUE_COUNT_KEY = "true_count"
 COLUMN_TYPE_KEY = "type"
+COLUMN_VALUE_KEY = "value"
 COLUMN_VALUE_TYPE_KEY = "value_type"
 COLUMNS_KEY = "columns"
 
@@ -121,6 +129,8 @@ UNKNOWN_SOURCE_FILE = "unknown"
 UNKNOWN_LINE_OF_CODE = -1
 BACKSLASH_TOKEN = "\\"
 SLASH_TOKEN = "/"
+PYSPARK_NONE_SIZE_VALUE = -1
+PANDAS_LONG_TYPE = "Int64"
 
 # ENVIRONMENT VARIABLES
 SNOWFLAKE_CHECKPOINT_CONTRACT_FILE_PATH_ENV_VAR = (
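
Because `CheckpointMode` subclasses `IntEnum`, its members interoperate with the plain integers 1 and 2; a minimal sketch of what the enum hunk above implies (illustrative only, not part of the diff):

```python
from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode

# IntEnum members compare equal to their underlying integer values.
assert CheckpointMode.SCHEMA == 1      # automatic schema inference, the default mode
assert CheckpointMode.DATAFRAME == 2   # export the DataFrame as Parquet to Snowflake
assert CheckpointMode(2) is CheckpointMode.DATAFRAME
```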

src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py
@@ -5,10 +5,10 @@ import json
 
 from typing import Optional
 
-from snowflake.snowpark_checkpoints_collector import Singleton
 from snowflake.snowpark_checkpoints_collector.collection_result.model import (
     CollectionPointResult,
 )
+from snowflake.snowpark_checkpoints_collector.singleton import Singleton
 from snowflake.snowpark_checkpoints_collector.utils import file_utils
 
 

src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py
@@ -1,7 +1,7 @@
 #
 # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
 #
-from pandas import Series
+from pyspark.sql import DataFrame as SparkDataFrame
 from pyspark.sql.types import StructField
 
 from snowflake.snowpark_checkpoints_collector.collection_common import (
@@ -88,14 +88,14 @@ class ColumnCollectorManager:
     """Manage class for column collector based on type."""
 
     def collect_column(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         """Collect the data of the column based on the column type.
 
         Args:
             clm_name (str): the name of the column.
             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
-            values (pandas.Series): the column values as Pandas.Series.
+            values (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
 
         Returns:
             dict[str, any]: The data collected.
@@ -112,7 +112,7 @@ class ColumnCollectorManager:
 
     @column_register(ARRAY_COLUMN_TYPE)
     def _collect_array_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = ArrayColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -120,7 +120,7 @@ class ColumnCollectorManager:
 
     @column_register(BINARY_COLUMN_TYPE)
     def _collect_binary_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = BinaryColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -128,7 +128,7 @@ class ColumnCollectorManager:
 
     @column_register(BOOLEAN_COLUMN_TYPE)
     def _collect_boolean_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = BooleanColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -136,7 +136,7 @@ class ColumnCollectorManager:
 
     @column_register(DATE_COLUMN_TYPE)
     def _collect_date_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = DateColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -144,7 +144,7 @@ class ColumnCollectorManager:
 
     @column_register(DAYTIMEINTERVAL_COLUMN_TYPE)
     def _collect_day_time_interval_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = DayTimeIntervalColumnCollector(
             clm_name, struct_field, values
@@ -154,7 +154,7 @@ class ColumnCollectorManager:
 
     @column_register(DECIMAL_COLUMN_TYPE)
     def _collect_decimal_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = DecimalColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -162,7 +162,7 @@ class ColumnCollectorManager:
 
     @column_register(MAP_COLUMN_TYPE)
     def _collect_map_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = MapColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -170,7 +170,7 @@ class ColumnCollectorManager:
 
     @column_register(NULL_COLUMN_TYPE)
     def _collect_null_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = NullColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -185,7 +185,7 @@ class ColumnCollectorManager:
         DOUBLE_COLUMN_TYPE,
     )
     def _collect_numeric_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = NumericColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -193,7 +193,7 @@ class ColumnCollectorManager:
 
     @column_register(STRING_COLUMN_TYPE)
     def _collect_string_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = StringColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -201,7 +201,7 @@ class ColumnCollectorManager:
 
     @column_register(STRUCT_COLUMN_TYPE)
     def _collect_struct_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = StructColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -209,7 +209,7 @@ class ColumnCollectorManager:
 
     @column_register(TIMESTAMP_COLUMN_TYPE)
     def _collect_timestamp_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = TimestampColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -217,21 +217,21 @@ class ColumnCollectorManager:
 
     @column_register(TIMESTAMP_NTZ_COLUMN_TYPE)
     def _collect_timestampntz_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = TimestampNTZColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
         return collected_data
 
     def collect_empty_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         """Collect the data of a empty column.
 
         Args:
             clm_name (str): the name of the column.
             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
-            values (pandas.Series): the column values as Pandas.Series.
+            values (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
 
         Returns:
             dict[str, any]: The data collected.
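
The only change across these hunks is the type of `values`: each collector now receives the column as a single-column PySpark DataFrame rather than a pandas Series. A hedged sketch of the difference at a call site (the `select`/`toPandas` projections are illustrative assumptions, not taken from the diff):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("apple", 21), (None, 34)], schema="fruit string, age integer"
)

# 0.1.0rc1 style: the column reached a collector as a pandas Series.
fruit_series = df.toPandas()["fruit"]

# 0.1.0rc2 style: the column stays in Spark as a one-column DataFrame.
fruit_df = df.select("fruit")
```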

src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py
@@ -3,7 +3,12 @@
 #
 from statistics import mean
 
-from pandas import Series
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.functions import array as spark_array
+from pyspark.sql.functions import coalesce as spark_coalesce
+from pyspark.sql.functions import col as spark_col
+from pyspark.sql.functions import explode as spark_explode
+from pyspark.sql.functions import size as spark_size
 from pyspark.sql.types import StructField
 
 from snowflake.snowpark_checkpoints_collector.collection_common import (
@@ -13,6 +18,8 @@ from snowflake.snowpark_checkpoints_collector.collection_common import (
     COLUMN_MEAN_SIZE_KEY,
     COLUMN_MIN_SIZE_KEY,
     COLUMN_NULL_VALUE_PROPORTION_KEY,
+    COLUMN_SIZE_KEY,
+    COLUMN_VALUE_KEY,
     COLUMN_VALUE_TYPE_KEY,
     CONTAINS_NULL_KEY,
     ELEMENT_TYPE_KEY,
@@ -30,22 +37,22 @@ class ArrayColumnCollector(ColumnCollectorBase):
         name (str): the name of the column.
         type (str): the type of the column.
         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
-        values (pandas.Series): the column values as Pandas.Series.
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
 
     """
 
     def __init__(
-        self, clm_name: str, struct_field: StructField, clm_values: Series
+        self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
     ) -> None:
         """Init ArrayColumnCollector.
 
         Args:
             clm_name (str): the name of the column.
             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
-            clm_values (pandas.Series): the column values as Pandas.Series.
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
 
         """
-        super().__init__(clm_name, struct_field, clm_values)
+        super().__init__(clm_name, struct_field, clm_df)
         self._array_size_collection = self._compute_array_size_collection()
 
     def get_custom_data(self) -> dict[str, any]:
@@ -73,23 +80,22 @@ class ArrayColumnCollector(ColumnCollectorBase):
         return custom_data_dict
 
     def _compute_array_size_collection(self) -> list[int]:
-        size_collection = []
-        for array in self.values:
-            if array is None:
-                continue
+        select_result = self.column_df.select(
+            spark_size(spark_coalesce(spark_col(self.name), spark_array([]))).alias(
+                COLUMN_SIZE_KEY
+            )
+        ).collect()
 
-            length = len(array)
-            size_collection.append(length)
+        size_collection = [row[COLUMN_SIZE_KEY] for row in select_result]
 
         return size_collection
 
     def _compute_null_value_proportion(self) -> float:
-        null_counter = 0
-        for array in self.values:
-            if array is None:
-                continue
+        select_result = self.column_df.select(
+            spark_explode(spark_col(self.name)).alias(COLUMN_VALUE_KEY)
+        )
 
-            null_counter += array.count(None)
+        null_counter = select_result.where(spark_col(COLUMN_VALUE_KEY).isNull()).count()
 
         total_values = sum(self._array_size_collection)
         null_value_proportion = (null_counter / total_values) * 100
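
A small standalone sketch of the Spark-side computation introduced above, run against a toy DataFrame (the column name `arr` and the sample rows are illustrative; only the `size`/`coalesce`/`array`/`explode` pattern comes from the hunk):

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import array, coalesce, col, explode, size

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(["a", None],), (None,), (["b"],)], "arr array<string>")

# size(coalesce(arr, array())) maps NULL rows to size 0; size(NULL) would otherwise
# return -1 under Spark's default settings (compare PYSPARK_NONE_SIZE_VALUE above).
sizes = [
    row["size"]
    for row in df.select(size(coalesce(col("arr"), array([]))).alias("size")).collect()
]

# explode() skips NULL arrays and emits one row per element, so counting the NULL
# rows it produces counts the NULL elements inside the arrays.
null_elements = (
    df.select(explode(col("arr")).alias("value"))
    .where(col("value").isNull())
    .count()
)

print(sizes, null_elements)  # e.g. [2, 0, 1] and 1
```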

src/snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py
@@ -3,7 +3,12 @@
 #
 from statistics import mean
 
-from pandas import Series
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.functions import coalesce as spark_coalesce
+from pyspark.sql.functions import col as spark_col
+from pyspark.sql.functions import length as spark_length
+from pyspark.sql.functions import lit as spark_lit
+from pyspark.sql.functions import to_binary as spark_to_binary
 from pyspark.sql.types import StructField
 
 from snowflake.snowpark_checkpoints_collector.collection_common import (
@@ -11,6 +16,7 @@ from snowflake.snowpark_checkpoints_collector.collection_common import (
     COLUMN_MAX_SIZE_KEY,
     COLUMN_MEAN_SIZE_KEY,
     COLUMN_MIN_SIZE_KEY,
+    COLUMN_SIZE_KEY,
 )
 from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
     ColumnCollectorBase,
@@ -25,22 +31,22 @@ class BinaryColumnCollector(ColumnCollectorBase):
         name (str): the name of the column.
         type (str): the type of the column.
         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
-        values (pandas.Series): the column values as Pandas.Series.
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
 
     """
 
     def __init__(
-        self, clm_name: str, struct_field: StructField, clm_values: Series
+        self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
    ) -> None:
         """Init BinaryColumnCollector.
 
         Args:
             clm_name (str): the name of the column.
             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
-            clm_values (pandas.Series): the column values as Pandas.Series.
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
 
         """
-        super().__init__(clm_name, struct_field, clm_values)
+        super().__init__(clm_name, struct_field, clm_df)
         self._binary_size_collection = self._compute_binary_size_collection()
 
     def get_custom_data(self) -> dict[str, any]:
@@ -59,12 +65,12 @@ class BinaryColumnCollector(ColumnCollectorBase):
         return custom_data_dict
 
     def _compute_binary_size_collection(self) -> list[int]:
-        size_collection = []
-        for binary in self.values:
-            if binary is None:
-                continue
+        select_result = self.column_df.select(
+            spark_length(
+                spark_coalesce(spark_col(self.name), spark_to_binary(spark_lit(b"")))
+            ).alias(COLUMN_SIZE_KEY)
+        ).collect()
 
-            length = len(binary)
-            size_collection.append(length)
+        size_collection = [row[COLUMN_SIZE_KEY] for row in select_result]
 
         return size_collection