snowpark-checkpoints-collectors 0.1.0rc1__tar.gz → 0.2.0rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/PKG-INFO +89 -18
  2. snowpark_checkpoints_collectors-0.2.0rc1/README.md +102 -0
  3. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/__init__.py +3 -2
  4. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/collection_common.py +5 -0
  5. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +1 -1
  6. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +1 -1
  7. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +1 -1
  8. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +43 -22
  9. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +66 -33
  10. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +49 -0
  11. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_checkpoint_name.py +51 -0
  12. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1.py +19 -6
  13. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +1 -0
  14. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +1 -0
  15. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +1 -0
  16. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df.json +1 -0
  17. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +1 -0
  18. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_2.py +3 -2
  19. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collection_result_file.py +1 -1
  20. snowpark_checkpoints_collectors-0.2.0rc1/test/unit/test_checkpoint_name_utils.py +47 -0
  21. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/unit/test_collection_point_result_manager.py +1 -1
  22. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/unit/test_column_collection.py +5 -5
  23. snowpark_checkpoints_collectors-0.1.0rc1/README.md +0 -31
  24. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +0 -1
  25. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +0 -1
  26. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +0 -1
  27. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df.json +0 -1
  28. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +0 -1
  29. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/.gitignore +0 -0
  30. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/CHANGELOG.md +0 -0
  31. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/LICENSE +0 -0
  32. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/pyproject.toml +0 -0
  33. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/snowpark-testdf-schema.json +0 -0
  34. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +0 -0
  35. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +0 -0
  36. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +0 -0
  37. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +0 -0
  38. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +0 -0
  39. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +0 -0
  40. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +0 -0
  41. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +0 -0
  42. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +0 -0
  43. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +0 -0
  44. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +0 -0
  45. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +0 -0
  46. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +0 -0
  47. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +0 -0
  48. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +0 -0
  49. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +0 -0
  50. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +0 -0
  51. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +0 -0
  52. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +0 -0
  53. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/singleton.py +0 -0
  54. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +0 -0
  55. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +0 -0
  56. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/utils/extra_config.py +0 -0
  57. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/utils/file_utils.py +0 -0
  58. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +0 -0
  59. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/.coveragerc +0 -0
  60. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +0 -0
  61. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +0 -0
  62. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +0 -0
  63. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values.json +0 -0
  64. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +0 -0
  65. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column.json +0 -0
  66. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +0 -0
  67. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema.json +0 -0
  68. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +0 -0
  69. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type_telemetry.json +0 -0
  70. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +0 -0
  71. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_snow_connection_int.py +0 -0
  72. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/unit/test_collection_point_result.py +0 -0
  73. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/unit/test_extra_config.py +0 -0
  74. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/unit/test_file_utils.py +0 -0
  75. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/unit/test_snow_connection.py +0 -0
  76. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/unit/test_summary_stats_collector.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: snowpark-checkpoints-collectors
-Version: 0.1.0rc1
+Version: 0.2.0rc1
 Summary: Snowpark column and table statistics collection
 Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
 Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
@@ -243,7 +243,7 @@ Requires-Dist: setuptools>=70.0.0; extra == 'development'
 Requires-Dist: twine==5.1.1; extra == 'development'
 Description-Content-Type: text/markdown
 
-# Data Collection from Spark Pipelines
+# snowpark-checkpoints-collectors
 
 ---
 **NOTE**
@@ -252,25 +252,96 @@ This package is on Private Preview.
 
 ---
 
-The `snowpark-checkpoints-collector` package can collect
-schema and check information from a spark pipeline and
-record those results into a set of JSON files corresponding to different intermediate dataframes. These files can be inspected manually
-and handed over to teams implementing the snowpark pipeline. The `snowpark-checkpoints-collector` package is designed to have minimal
-dependencies and the generated files are meant to be inspected by security
-teams.
+The **snowpark-checkpoints-collector** package offers a function for extracting information from PySpark DataFrames. That data can then be used to validate the converted Snowpark DataFrames and ensure behavioral equivalence.
+
+## Features
 
-On the snowpark side the `snowpark-checkpoints` package can use these files to perform schema and data validation checks against snowpark dataframes at the same, intermediate logical "checkpoints".
+- Schema inference collected data mode (Schema): The default mode. It leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified DataFrame, and it also collects custom data from the DataFrame's columns based on their PySpark types.
+- DataFrame collected data mode (DataFrame): This mode collects the data of the PySpark DataFrame and saves it all in Parquet format. Using the default user Snowflake connection, it tries to upload the Parquet files to a temporary Snowflake stage and create a table from the staged files. The file and the table take the name of the checkpoint.
 
-## collect_dataframe_schema
+## Functionalities
+
+### Collect DataFrame Checkpoint
+
+```python
+from pyspark.sql import DataFrame as SparkDataFrame
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from typing import Optional
+
+# Signature of the function
+def collect_dataframe_checkpoint(
+    df: SparkDataFrame,
+    checkpoint_name: str,
+    sample: Optional[float] = None,
+    mode: Optional[CheckpointMode] = None,
+    output_path: Optional[str] = None,
+) -> None:
+    ...
+```
+
+- `df`: The input Spark DataFrame to collect.
+- `checkpoint_name`: Name of the checkpoint schema file or DataFrame.
+- `sample`: Fraction of the DataFrame to sample for schema inference; defaults to 1.0.
+- `mode`: The collection mode to execute (Schema or DataFrame); defaults to CheckpointMode.SCHEMA.
+- `output_path`: The output path where the checkpoint is saved; defaults to the current working directory.
+
+## Usage Example
+
+### Schema mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+
+spark_session = SparkSession.builder.getOrCreate()
+sample_size = 1.0
+
+pyspark_df = spark_session.createDataFrame(
+    [("apple", 21), ("lemon", 34), ("banana", 50)], schema="fruit string, age integer"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_1",
+    sample=sample_size,
+    mode=CheckpointMode.SCHEMA,
+)
 ```
-from snowflake.snowpark_checkpoints_collector import collect_dataframe_schema;
-collect_dataframe_schema(df:SparkDataFrame,
-                         checkpoint_name,
-                         sample=0.1)
+
+### DataFrame mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from pyspark.sql.types import StructType, StructField, ByteType, StringType, IntegerType
+
+spark_schema = StructType(
+    [
+        StructField("BYTE", ByteType(), True),
+        StructField("STRING", StringType(), True),
+        StructField("INTEGER", IntegerType(), True)
+    ]
+)
+
+data = [(1, "apple", 21), (2, "lemon", 34), (3, "banana", 50)]
+
+spark_session = SparkSession.builder.getOrCreate()
+pyspark_df = spark_session.createDataFrame(data, schema=spark_schema).orderBy(
+    "INTEGER"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_2",
+    mode=CheckpointMode.DATAFRAME,
+)
 ```
 
-- df - the spark data frame to collect the schema from
-- checkpoint_name - the name of the "checkpoint". Generated JSON files
-will have the name "snowpark-[checkpoint_name]-schema.json"
-- sample - sample size of the spark data frame to use to generate the schema
+------
@@ -0,0 +1,102 @@
+# snowpark-checkpoints-collectors
+
+---
+**NOTE**
+
+This package is on Private Preview.
+
+---
+
+The **snowpark-checkpoints-collector** package offers a function for extracting information from PySpark DataFrames. That data can then be used to validate the converted Snowpark DataFrames and ensure behavioral equivalence.
+
+## Features
+
+- Schema inference collected data mode (Schema): The default mode. It leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified DataFrame, and it also collects custom data from the DataFrame's columns based on their PySpark types.
+- DataFrame collected data mode (DataFrame): This mode collects the data of the PySpark DataFrame and saves it all in Parquet format. Using the default user Snowflake connection, it tries to upload the Parquet files to a temporary Snowflake stage and create a table from the staged files. The file and the table take the name of the checkpoint.
+
+## Functionalities
+
+### Collect DataFrame Checkpoint
+
+```python
+from pyspark.sql import DataFrame as SparkDataFrame
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from typing import Optional
+
+# Signature of the function
+def collect_dataframe_checkpoint(
+    df: SparkDataFrame,
+    checkpoint_name: str,
+    sample: Optional[float] = None,
+    mode: Optional[CheckpointMode] = None,
+    output_path: Optional[str] = None,
+) -> None:
+    ...
+```
+
+- `df`: The input Spark DataFrame to collect.
+- `checkpoint_name`: Name of the checkpoint schema file or DataFrame.
+- `sample`: Fraction of the DataFrame to sample for schema inference; defaults to 1.0.
+- `mode`: The collection mode to execute (Schema or DataFrame); defaults to CheckpointMode.SCHEMA.
+- `output_path`: The output path where the checkpoint is saved; defaults to the current working directory.
+
+## Usage Example
+
+### Schema mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+
+spark_session = SparkSession.builder.getOrCreate()
+sample_size = 1.0
+
+pyspark_df = spark_session.createDataFrame(
+    [("apple", 21), ("lemon", 34), ("banana", 50)], schema="fruit string, age integer"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_1",
+    sample=sample_size,
+    mode=CheckpointMode.SCHEMA,
+)
+```
+
+### DataFrame mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from pyspark.sql.types import StructType, StructField, ByteType, StringType, IntegerType
+
+spark_schema = StructType(
+    [
+        StructField("BYTE", ByteType(), True),
+        StructField("STRING", StringType(), True),
+        StructField("INTEGER", IntegerType(), True)
+    ]
+)
+
+data = [(1, "apple", 21), (2, "lemon", 34), (3, "banana", 50)]
+
+spark_session = SparkSession.builder.getOrCreate()
+pyspark_df = spark_session.createDataFrame(data, schema=spark_schema).orderBy(
+    "INTEGER"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_2",
+    mode=CheckpointMode.DATAFRAME,
+)
+```
+
+------
@@ -2,9 +2,10 @@
 # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
 #
 
-__all__ = ["collect_dataframe_checkpoint", "Singleton"]
+__all__ = ["collect_dataframe_checkpoint", "CheckpointMode"]
 
-from snowflake.snowpark_checkpoints_collector.singleton import Singleton
 from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
     collect_dataframe_checkpoint,
 )
+
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
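After this change, `CheckpointMode` is part of the package's public surface and `Singleton` no longer is. A minimal sketch of the resulting import paths, assuming only what the hunks in this diff show:

```python
# Top-level names re-exported by the package as of 0.2.0rc1.
from snowflake.snowpark_checkpoints_collector import (
    CheckpointMode,
    collect_dataframe_checkpoint,
)

# Singleton is still importable, but now only from its own module.
from snowflake.snowpark_checkpoints_collector.singleton import Singleton
```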
@@ -8,8 +8,13 @@ from enum import IntEnum
 
 
 class CheckpointMode(IntEnum):
+
+    """Enum class representing the collection mode."""
+
     SCHEMA = 1
+    """Collect automatic schema inference"""
     DATAFRAME = 2
+    """Export DataFrame as Parquet file to Snowflake"""
 
 
 # CONSTANTS
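Because `CheckpointMode` is an `IntEnum`, its members interoperate with plain integers; a quick, self-contained illustration of the standard-library behavior:

```python
from enum import IntEnum

class CheckpointMode(IntEnum):  # local mirror of the enum above, for illustration
    SCHEMA = 1
    DATAFRAME = 2

assert CheckpointMode.SCHEMA == 1  # IntEnum members compare equal to raw ints
assert CheckpointMode(2) is CheckpointMode.DATAFRAME  # round-trip from a raw value
```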
@@ -5,10 +5,10 @@ import json
 
 from typing import Optional
 
-from snowflake.snowpark_checkpoints_collector import Singleton
 from snowflake.snowpark_checkpoints_collector.collection_result.model import (
     CollectionPointResult,
 )
+from snowflake.snowpark_checkpoints_collector.singleton import Singleton
 from snowflake.snowpark_checkpoints_collector.utils import file_utils
 
 
@@ -14,7 +14,7 @@ from snowflake.snowpark_checkpoints_collector.column_collection.model.column_col
 )
 
 
-FORMAT = "%Y-%m-%dH:%M:%S"
+FORMAT = "%Y-%m-%dT%H:%M:%S%z"
 
 
 class TimestampColumnCollector(ColumnCollectorBase):
@@ -14,7 +14,7 @@ from snowflake.snowpark_checkpoints_collector.column_collection.model.column_col
 )
 
 
-FORMAT = "%Y-%m-%dT%H:%M:%S%z"
+FORMAT = "%Y-%m-%dH:%M:%S"
 
 
 class TimestampNTZColumnCollector(ColumnCollectorBase):
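These two hunks swap the `FORMAT` constants between the timestamp collectors. A small standalone sketch of what each strftime pattern yields for the same instant; note that the second pattern contains a literal `H` rather than the `%H` directive:

```python
from datetime import datetime, timezone

ts = datetime(2024, 5, 1, 13, 45, 30, tzinfo=timezone.utc)

# Timezone-aware pattern now used by TimestampColumnCollector:
print(ts.strftime("%Y-%m-%dT%H:%M:%S%z"))  # 2024-05-01T13:45:30+0000

# Pattern now assigned to TimestampNTZColumnCollector; the hour is dropped
# because "H" here is a literal character, not the %H directive:
print(ts.strftime("%Y-%m-%dH:%M:%S"))  # 2024-05-01H:45:30
```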
@@ -1,6 +1,7 @@
 #
 # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
 #
+import pandas as pd
 
 from pandas import DataFrame as PandasDataFrame
 from pandera import Check, Column
@@ -62,23 +63,6 @@ def column_register(*args):
        return wrapper
 
 
-def _datetime_like_type_checks(
-    clm_name: str, pandas_df: PandasDataFrame, pandera_column: Column
-) -> None:
-    column_values = pandas_df[clm_name].dropna()
-    min_value = str(column_values.min())
-    max_value = str(column_values.max())
-    pandera_column.checks.append(
-        Check.between(
-            min_value=min_value,
-            max_value=max_value,
-            include_max=True,
-            include_min=True,
-            title=BETWEEN_CHECK_ERROR_MESSAGE_FORMAT.format(min_value, max_value),
-        )
-    )
-
-
 @collector_register
 class PanderaColumnChecksManager:
 
@@ -117,13 +101,24 @@ class PanderaColumnChecksManager:
     def _add_date_type_checks(
         self, clm_name: str, pandas_df: PandasDataFrame, pandera_column: Column
     ) -> None:
-        _datetime_like_type_checks(clm_name, pandas_df, pandera_column)
+        pass
 
     @column_register(DAYTIMEINTERVAL_COLUMN_TYPE)
     def _add_daytimeinterval_type_checks(
         self, clm_name: str, pandas_df: PandasDataFrame, pandera_column: Column
     ) -> None:
-        _datetime_like_type_checks(clm_name, pandas_df, pandera_column)
+        column_values = pandas_df[clm_name].dropna()
+        min_value = pd.to_timedelta(column_values.min())
+        max_value = pd.to_timedelta(column_values.max())
+        pandera_column.checks.append(
+            Check.between(
+                min_value=min_value,
+                max_value=max_value,
+                include_max=True,
+                include_min=True,
+                title=BETWEEN_CHECK_ERROR_MESSAGE_FORMAT.format(min_value, max_value),
+            )
+        )
 
     @column_register(
         BYTE_COLUMN_TYPE,
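The replacement body builds typed bounds with `pd.to_timedelta` instead of the stringified min/max the removed helper produced; a standalone sketch of why that matters for ordering:

```python
import pandas as pd

# Typed Timedelta bounds compare by duration; the old string bounds compared
# lexicographically, which mis-orders values such as "10 days" vs "2 days".
assert pd.to_timedelta("2 days") < pd.to_timedelta("10 days")
assert "10 days" < "2 days"  # lexicographic string comparison gets this wrong
```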
@@ -153,16 +148,42 @@ class PanderaColumnChecksManager:
     def _add_string_type_checks(
         self, clm_name: str, pandas_df: PandasDataFrame, pandera_column: Column
     ) -> None:
-        pass
+        column_values = pandas_df[clm_name].dropna()
+        colum_str_length = column_values.str.len()
+        min_length = colum_str_length.min().item()
+        max_length = colum_str_length.max().item()
+        pandera_column.checks.append(Check.str_length(min_length, max_length))
 
     @column_register(TIMESTAMP_COLUMN_TYPE)
     def _add_timestamp_type_checks(
         self, clm_name: str, pandas_df: PandasDataFrame, pandera_column: Column
     ) -> None:
-        _datetime_like_type_checks(clm_name, pandas_df, pandera_column)
+        column_values = pandas_df[clm_name].dropna()
+        min_value = pd.Timestamp(column_values.min())
+        max_value = pd.Timestamp(column_values.max())
+        pandera_column.checks.append(
+            Check.between(
+                min_value=min_value,
+                max_value=max_value,
+                include_max=True,
+                include_min=True,
+                title=BETWEEN_CHECK_ERROR_MESSAGE_FORMAT.format(min_value, max_value),
+            )
+        )
 
     @column_register(TIMESTAMP_NTZ_COLUMN_TYPE)
     def _add_timestamp_ntz_type_checks(
         self, clm_name: str, pandas_df: PandasDataFrame, pandera_column: Column
     ) -> None:
-        _datetime_like_type_checks(clm_name, pandas_df, pandera_column)
+        column_values = pandas_df[clm_name].dropna()
+        min_value = pd.Timestamp(column_values.min())
+        max_value = pd.Timestamp(column_values.max())
+        pandera_column.checks.append(
+            Check.between(
+                min_value=min_value,
+                max_value=max_value,
+                include_max=True,
+                include_min=True,
+                title=BETWEEN_CHECK_ERROR_MESSAGE_FORMAT.format(min_value, max_value),
+            )
+        )
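For context, a minimal sketch (made-up data, not from the package) of the kind of Pandera checks these collectors now emit — `Check.str_length` bounds for strings and `Check.between` bounds for timestamps:

```python
import pandas as pd
import pandera as pa
from pandera import Check, Column

df = pd.DataFrame(
    {
        "fruit": ["apple", "lemon", "banana"],
        "picked": pd.to_datetime(["2024-01-05", "2024-02-10", "2024-03-15"]),
    }
)

schema = pa.DataFrameSchema(
    {
        # Length bounded by the observed min/max lengths (5 and 6 here).
        "fruit": Column(str, checks=[Check.str_length(5, 6)]),
        # Values bounded by the observed min/max, inclusive on both ends.
        "picked": Column(
            checks=[
                Check.between(
                    min_value=pd.Timestamp("2024-01-05"),
                    max_value=pd.Timestamp("2024-03-15"),
                    include_min=True,
                    include_max=True,
                )
            ]
        ),
    }
)

schema.validate(df)  # passes; values outside the observed ranges would fail
```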
@@ -41,7 +41,10 @@ from snowflake.snowpark_checkpoints_collector.column_pandera_checks import (
 from snowflake.snowpark_checkpoints_collector.snow_connection_model import (
     SnowConnection,
 )
-from snowflake.snowpark_checkpoints_collector.utils import file_utils
+from snowflake.snowpark_checkpoints_collector.utils import (
+    checkpoint_name_utils,
+    file_utils,
+)
 from snowflake.snowpark_checkpoints_collector.utils.extra_config import (
     get_checkpoint_mode,
     get_checkpoint_sample,
@@ -71,52 +74,83 @@ def collect_dataframe_checkpoint(
 
     Raises:
         Exception: Invalid mode value.
+        Exception: Invalid checkpoint name. Checkpoint names must only contain alphanumeric characters and underscores.
 
     """
-    collection_point_file_path = file_utils.get_collection_point_source_file_path()
-    collection_point_line_of_code = file_utils.get_collection_point_line_of_code()
-    collection_point_result = CollectionPointResult(
-        collection_point_file_path, collection_point_line_of_code, checkpoint_name
-    )
-
     try:
-        if is_checkpoint_enabled(checkpoint_name):
+        normalized_checkpoint_name = checkpoint_name_utils.normalize_checkpoint_name(
+            checkpoint_name
+        )
+        is_valid_checkpoint_name = checkpoint_name_utils.is_valid_checkpoint_name(
+            normalized_checkpoint_name
+        )
+        if not is_valid_checkpoint_name:
+            raise Exception(
+                f"Invalid checkpoint name: {checkpoint_name}. Checkpoint names must only contain alphanumeric "
+                f"characters and underscores."
+            )
 
-            _sample = get_checkpoint_sample(checkpoint_name, sample)
+        if is_checkpoint_enabled(normalized_checkpoint_name):
 
-            if _is_empty_dataframe_without_schema(df):
-                raise Exception(
-                    "It is not possible to collect an empty DataFrame without schema"
-                )
+            collection_point_file_path = (
+                file_utils.get_collection_point_source_file_path()
+            )
+            collection_point_line_of_code = (
+                file_utils.get_collection_point_line_of_code()
+            )
+            collection_point_result = CollectionPointResult(
+                collection_point_file_path,
+                collection_point_line_of_code,
+                normalized_checkpoint_name,
+            )
 
-            _mode = get_checkpoint_mode(checkpoint_name, mode)
+            try:
 
-            if _mode == CheckpointMode.SCHEMA:
-                column_type_dict = _get_spark_column_types(df)
-                _collect_dataframe_checkpoint_mode_schema(
-                    checkpoint_name, df, _sample, column_type_dict, output_path
-                )
+                _sample = get_checkpoint_sample(normalized_checkpoint_name, sample)
 
-            elif _mode == CheckpointMode.DATAFRAME:
-                snow_connection = SnowConnection()
-                _collect_dataframe_checkpoint_mode_dataframe(
-                    checkpoint_name, df, snow_connection, output_path
-                )
+                if _is_empty_dataframe_without_schema(df):
+                    raise Exception(
+                        "It is not possible to collect an empty DataFrame without schema"
+                    )
+
+                _mode = get_checkpoint_mode(normalized_checkpoint_name, mode)
+
+                if _mode == CheckpointMode.SCHEMA:
+                    column_type_dict = _get_spark_column_types(df)
+                    _collect_dataframe_checkpoint_mode_schema(
+                        normalized_checkpoint_name,
+                        df,
+                        _sample,
+                        column_type_dict,
+                        output_path,
+                    )
+
+                elif _mode == CheckpointMode.DATAFRAME:
+                    snow_connection = SnowConnection()
+                    _collect_dataframe_checkpoint_mode_dataframe(
+                        normalized_checkpoint_name, df, snow_connection, output_path
+                    )
 
-            else:
-                raise Exception("Invalid mode value.")
+                else:
+                    raise Exception("Invalid mode value.")
 
-        collection_point_result.result = CollectionResult.PASS
+                collection_point_result.result = CollectionResult.PASS
+
+            except Exception as err:
+                collection_point_result.result = CollectionResult.FAIL
+                error_message = str(err)
+                raise Exception(error_message) from err
+
+            finally:
+                collection_point_result_manager = CollectionPointResultManager(
+                    output_path
+                )
+                collection_point_result_manager.add_result(collection_point_result)
 
     except Exception as err:
-        collection_point_result.result = CollectionResult.FAIL
         error_message = str(err)
         raise Exception(error_message) from err
 
-    finally:
-        collection_point_result_manager = CollectionPointResultManager(output_path)
-        collection_point_result_manager.add_result(collection_point_result)
-
 
 @report_telemetry(params_list=["column_type_dict"])
 def _collect_dataframe_checkpoint_mode_schema(
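Net effect of the restructuring: the checkpoint name is normalized and validated up front, the normalized name is threaded through every downstream call, and the result file is still written by the inner `finally` even when collection fails. A hedged sketch of the user-visible contract (names and data are illustrative):

```python
from pyspark.sql import SparkSession
from snowflake.snowpark_checkpoints_collector import (
    CheckpointMode,
    collect_dataframe_checkpoint,
)

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("apple", 21)], schema="fruit string, age integer")

# "demo checkpoint-1" is normalized to "demo_checkpoint_1" before collection,
# so the generated artifacts carry the underscored name.
collect_dataframe_checkpoint(
    df,
    checkpoint_name="demo checkpoint-1",
    mode=CheckpointMode.SCHEMA,
)

# A name that is still invalid after normalization (e.g. "6*invalid") raises
# before any collection work happens.
```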
@@ -239,7 +273,6 @@ def _get_pandera_infer_schema_as_dict(
 def _generate_json_checkpoint_file(
     checkpoint_name, dataframe_schema_contract, output_path: Optional[str] = None
 ) -> None:
-
     checkpoint_file_name = CHECKPOINT_JSON_OUTPUT_FILE_NAME_FORMAT.format(
         checkpoint_name
     )
@@ -0,0 +1,49 @@
+import re as regx
+
+
+CHECKPOINT_NAME_REGEX_PATTERN = r"^[a-zA-Z_][a-zA-Z0-9_]+"
+WHITESPACE_TOKEN = " "
+HYPHEN_TOKEN = "-"
+UNDERSCORE_TOKEN = "_"
+TOKEN_TO_REPLACE_COLLECTION = [WHITESPACE_TOKEN, HYPHEN_TOKEN]
+
+
+def normalize_checkpoint_name(checkpoint_name: str) -> str:
+    """Normalize the provided checkpoint name by replacing the whitespace and hyphen tokens with the underscore token.
+
+    Args:
+        checkpoint_name (str): The checkpoint name to normalize.
+
+    Returns:
+        str: the normalized checkpoint name.
+
+    """
+    normalized_checkpoint_name = checkpoint_name
+    for token in TOKEN_TO_REPLACE_COLLECTION:
+        normalized_checkpoint_name = normalized_checkpoint_name.replace(
+            token, UNDERSCORE_TOKEN
+        )
+
+    return normalized_checkpoint_name
+
+
+def is_valid_checkpoint_name(checkpoint_name: str) -> bool:
+    """Check if the provided checkpoint name is valid.
+
+    A valid checkpoint name must:
+    - Start with a letter (a-z, A-Z) or an underscore (_)
+    - Be followed by any combination of letters, digits (0-9) and underscores (_).
+
+    Args:
+        checkpoint_name (str): The checkpoint name to validate.
+
+    Returns:
+        bool: True if the checkpoint name is valid; otherwise, False.
+
+    """
+    matched = regx.findall(CHECKPOINT_NAME_REGEX_PATTERN, checkpoint_name)
+    if len(matched) == 0 or len(matched) > 1:
+        return False
+
+    is_valid = matched[0] == checkpoint_name
+    return is_valid
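A few quick probes of the validator as written. Note that the `+` quantifier in the pattern requires at least two characters, so a single-character name is rejected:

```python
from snowflake.snowpark_checkpoints_collector.utils import checkpoint_name_utils

assert checkpoint_name_utils.normalize_checkpoint_name("my checkpoint-1") == "my_checkpoint_1"
assert checkpoint_name_utils.is_valid_checkpoint_name("my_checkpoint_1")
assert not checkpoint_name_utils.is_valid_checkpoint_name("6invalid")   # leading digit
assert not checkpoint_name_utils.is_valid_checkpoint_name("has space")  # normalize first
assert not checkpoint_name_utils.is_valid_checkpoint_name("a")          # "+" needs 2+ chars
```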
@@ -0,0 +1,51 @@
+import os
+import tempfile
+
+import pytest
+from pyspark.sql import SparkSession
+from pyspark.sql.types import StructType
+
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.singleton import Singleton
+from pathlib import Path
+from snowflake.snowpark_checkpoints_collector.utils.telemetry import (
+    get_telemetry_manager,
+)
+
+telemetry_folder = "telemetry"
+
+
+@pytest.fixture
+def spark_session():
+    return SparkSession.builder.getOrCreate()
+
+
+@pytest.fixture
+def singleton():
+    Singleton._instances = {}
+
+
+@pytest.fixture(scope="function")
+def output_path():
+    folder = os.urandom(8).hex()
+    directory = Path(tempfile.gettempdir()).resolve() / folder
+    os.makedirs(directory)
+    return str(directory)
+
+
+def test_collect_dataframe_with_invalid_checkpoint_name(
+    spark_session, singleton, output_path
+):
+    checkpoint_name = "6*invalid"
+    data = []
+    columns = StructType()
+    pyspark_df = spark_session.createDataFrame(data=data, schema=columns)
+
+    with pytest.raises(Exception) as ex_info:
+        collect_dataframe_checkpoint(
+            pyspark_df, checkpoint_name=checkpoint_name, output_path=output_path
+        )
+    assert (
+        f"Invalid checkpoint name: {checkpoint_name}. Checkpoint names must only contain alphanumeric characters "
+        f"and underscores."
+    ) == str(ex_info.value)