snowpark-checkpoints-collectors 0.1.0rc1__tar.gz → 0.1.0rc3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/.gitignore +3 -0
  2. snowpark_checkpoints_collectors-0.1.0rc3/PKG-INFO +146 -0
  3. snowpark_checkpoints_collectors-0.1.0rc3/README.md +102 -0
  4. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/pyproject.toml +6 -5
  5. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/__init__.py +3 -2
  6. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/collection_common.py +10 -0
  7. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +1 -1
  8. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +18 -18
  9. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +22 -16
  10. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +17 -11
  11. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +18 -11
  12. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +7 -7
  13. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +15 -8
  14. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +15 -8
  15. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +22 -10
  16. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +9 -7
  17. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +25 -17
  18. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +5 -5
  19. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +24 -11
  20. snowpark_checkpoints_collectors-0.1.0rc3/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +59 -0
  21. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +10 -8
  22. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +18 -8
  23. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +18 -8
  24. snowpark_checkpoints_collectors-0.1.0rc3/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +212 -0
  25. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +94 -45
  26. snowpark_checkpoints_collectors-0.1.0rc3/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +50 -0
  27. snowpark_checkpoints_collectors-0.1.0rc3/test/integ/telemetry_compare_utils.py +69 -0
  28. snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_checkpoint_name.py +51 -0
  29. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collect_df_mode_1.py +40 -47
  30. snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +1 -0
  31. snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +18 -0
  32. snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +1 -0
  33. snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +18 -0
  34. snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +1 -0
  35. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +7 -6
  36. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +6 -5
  37. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +6 -5
  38. snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +18 -0
  39. snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_1_expected/test_full_df.json +1 -0
  40. snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +1 -0
  41. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type_telemetry.json +6 -5
  42. snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +18 -0
  43. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collect_df_mode_2.py +56 -8
  44. snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_parquet_directory _telemetry.json +18 -0
  45. snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_telemetry.json +18 -0
  46. snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_2_expected/test_collect_empty_dataframe_with_schema_telemetry.json +18 -0
  47. snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_2_expected/test_collect_invalid_mode_telemetry.json +18 -0
  48. snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_2_expected/test_generate_parquet_for_spark_df_telemetry.json +18 -0
  49. snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_2_expected/test_spark_df_mode_dataframe_telemetry.json +18 -0
  50. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collection_result_file.py +1 -1
  51. snowpark_checkpoints_collectors-0.1.0rc3/test/unit/test_checkpoint_name_utils.py +47 -0
  52. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/unit/test_collection_point_result_manager.py +1 -1
  53. snowpark_checkpoints_collectors-0.1.0rc3/test/unit/test_column_collection.py +466 -0
  54. snowpark_checkpoints_collectors-0.1.0rc3/test/unit/test_pandera_column_check_manager.py +194 -0
  55. snowpark_checkpoints_collectors-0.1.0rc1/PKG-INFO +0 -276
  56. snowpark_checkpoints_collectors-0.1.0rc1/README.md +0 -31
  57. snowpark_checkpoints_collectors-0.1.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +0 -35
  58. snowpark_checkpoints_collectors-0.1.0rc1/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +0 -168
  59. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +0 -1
  60. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +0 -17
  61. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +0 -1
  62. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +0 -17
  63. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +0 -1
  64. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +0 -17
  65. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df.json +0 -1
  66. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +0 -1
  67. snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +0 -17
  68. snowpark_checkpoints_collectors-0.1.0rc1/test/unit/test_column_collection.py +0 -669
  69. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/CHANGELOG.md +0 -0
  70. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/LICENSE +0 -0
  71. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/snowpark-testdf-schema.json +0 -0
  72. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +0 -0
  73. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +0 -0
  74. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +0 -0
  75. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +0 -0
  76. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +0 -0
  77. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/singleton.py +0 -0
  78. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +0 -0
  79. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +0 -0
  80. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/utils/extra_config.py +0 -0
  81. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/utils/file_utils.py +0 -0
  82. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +0 -0
  83. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/.coveragerc +0 -0
  84. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values.json +0 -0
  85. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column.json +0 -0
  86. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema.json +0 -0
  87. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_snow_connection_int.py +0 -0
  88. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/unit/test_collection_point_result.py +0 -0
  89. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/unit/test_extra_config.py +0 -0
  90. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/unit/test_file_utils.py +0 -0
  91. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/unit/test_snow_connection.py +0 -0
  92. {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/unit/test_summary_stats_collector.py +0 -0
.gitignore
@@ -4,10 +4,13 @@
 
 # demos
 snowpark-checkpoints-output/
+Demos/Demos/
+Demos/snowpark-checkpoints-output/
 
 # env
 wheelvenv/
 
+
 # version
 !__version__.py
 
snowpark_checkpoints_collectors-0.1.0rc3/PKG-INFO (new file)
@@ -0,0 +1,146 @@
+Metadata-Version: 2.4
+Name: snowpark-checkpoints-collectors
+Version: 0.1.0rc3
+Summary: Snowpark column and table statistics collection
+Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
+Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
+Author-email: "Snowflake, Inc." <snowflake-python-libraries-dl@snowflake.com>
+License: Apache License, Version 2.0
+License-File: LICENSE
+Keywords: Snowflake,Snowpark,analytics,cloud,database,db
+Classifier: Development Status :: 4 - Beta
+Classifier: Environment :: Console
+Classifier: Environment :: Other Environment
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Education
+Classifier: Intended Audience :: Information Technology
+Classifier: Intended Audience :: System Administrators
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: SQL
+Classifier: Topic :: Database
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Software Development
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: <3.12,>=3.9
+Requires-Dist: pandera[io]==0.20.4
+Requires-Dist: pyspark
+Requires-Dist: snowflake-connector-python
+Requires-Dist: snowflake-snowpark-python
+Provides-Extra: development
+Requires-Dist: coverage>=7.6.7; extra == 'development'
+Requires-Dist: deepdiff>=8.0.0; extra == 'development'
+Requires-Dist: hatchling==1.25.0; extra == 'development'
+Requires-Dist: pre-commit>=4.0.1; extra == 'development'
+Requires-Dist: pyarrow>=18.0.0; extra == 'development'
+Requires-Dist: pytest-cov>=6.0.0; extra == 'development'
+Requires-Dist: pytest>=8.3.3; extra == 'development'
+Requires-Dist: setuptools>=70.0.0; extra == 'development'
+Requires-Dist: twine==5.1.1; extra == 'development'
+Description-Content-Type: text/markdown
+
+# snowpark-checkpoints-collectors
+
+---
+**NOTE**
+
+This package is in Private Preview.
+
+---
+
+The **snowpark-checkpoints-collectors** package offers a function for extracting information from PySpark dataframes. That data can then be used to validate the converted Snowpark dataframes and confirm that behavioral equivalence has been achieved.
+## Features
+
+- Schema inference collected data mode (Schema): This is the default mode. It leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe, and it also collects custom data from the DataFrame's columns based on their PySpark types.
+- DataFrame collected data mode (DataFrame): This mode collects the data of the PySpark dataframe, saving it all in Parquet format. Using the default user Snowflake connection, it uploads the Parquet files to a Snowflake temporary stage and creates a table from the staged data. The file and the table are both named after the checkpoint.
+
+
+
+## Functionalities
+
+### Collect DataFrame Checkpoint
+
+
+
+```python
+from pyspark.sql import DataFrame as SparkDataFrame
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from typing import Optional
+
+# Signature of the function
+def collect_dataframe_checkpoint(
+    df: SparkDataFrame,
+    checkpoint_name: str,
+    sample: Optional[float] = None,
+    mode: Optional[CheckpointMode] = None,
+    output_path: Optional[str] = None,
+) -> None:
+    ...
+```
+
+- `df`: The input Spark dataframe to collect.
+- `checkpoint_name`: Name of the checkpoint schema file or dataframe.
+- `sample`: The fraction of the DataFrame to sample for schema inference; defaults to 1.0.
+- `mode`: The mode in which to execute the collection (Schema or DataFrame); defaults to CheckpointMode.SCHEMA.
+- `output_path`: The output path where the checkpoint is saved; defaults to the current working directory.
+
+
+## Usage Example
+
+### Schema mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+
+spark_session = SparkSession.builder.getOrCreate()
+sample_size = 1.0
+
+pyspark_df = spark_session.createDataFrame(
+    [("apple", 21), ("lemon", 34), ("banana", 50)], schema="fruit string, age integer"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_1",
+    sample=sample_size,
+    mode=CheckpointMode.SCHEMA,
+)
+```
+
+
+### Dataframe mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from pyspark.sql.types import StructType, StructField, ByteType, StringType, IntegerType
+
+spark_schema = StructType(
+    [
+        StructField("BYTE", ByteType(), True),
+        StructField("STRING", StringType(), True),
+        StructField("INTEGER", IntegerType(), True)
+    ]
+)
+
+data = [(1, "apple", 21), (2, "lemon", 34), (3, "banana", 50)]
+
+spark_session = SparkSession.builder.getOrCreate()
+pyspark_df = spark_session.createDataFrame(data, schema=spark_schema).orderBy(
+    "INTEGER"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_2",
+    mode=CheckpointMode.DATAFRAME,
+)
+```
+
+------
snowpark_checkpoints_collectors-0.1.0rc3/README.md (new file)
@@ -0,0 +1,102 @@
+# snowpark-checkpoints-collectors
+
+---
+**NOTE**
+
+This package is in Private Preview.
+
+---
+
+The **snowpark-checkpoints-collectors** package offers a function for extracting information from PySpark dataframes. That data can then be used to validate the converted Snowpark dataframes and confirm that behavioral equivalence has been achieved.
+## Features
+
+- Schema inference collected data mode (Schema): This is the default mode. It leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe, and it also collects custom data from the DataFrame's columns based on their PySpark types.
+- DataFrame collected data mode (DataFrame): This mode collects the data of the PySpark dataframe, saving it all in Parquet format. Using the default user Snowflake connection, it uploads the Parquet files to a Snowflake temporary stage and creates a table from the staged data. The file and the table are both named after the checkpoint.
+
+
+
+## Functionalities
+
+### Collect DataFrame Checkpoint
+
+
+
+```python
+from pyspark.sql import DataFrame as SparkDataFrame
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from typing import Optional
+
+# Signature of the function
+def collect_dataframe_checkpoint(
+    df: SparkDataFrame,
+    checkpoint_name: str,
+    sample: Optional[float] = None,
+    mode: Optional[CheckpointMode] = None,
+    output_path: Optional[str] = None,
+) -> None:
+    ...
+```
+
+- `df`: The input Spark dataframe to collect.
+- `checkpoint_name`: Name of the checkpoint schema file or dataframe.
+- `sample`: The fraction of the DataFrame to sample for schema inference; defaults to 1.0.
+- `mode`: The mode in which to execute the collection (Schema or DataFrame); defaults to CheckpointMode.SCHEMA.
+- `output_path`: The output path where the checkpoint is saved; defaults to the current working directory.
+
+
+## Usage Example
+
+### Schema mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+
+spark_session = SparkSession.builder.getOrCreate()
+sample_size = 1.0
+
+pyspark_df = spark_session.createDataFrame(
+    [("apple", 21), ("lemon", 34), ("banana", 50)], schema="fruit string, age integer"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_1",
+    sample=sample_size,
+    mode=CheckpointMode.SCHEMA,
+)
+```
+
+
+### Dataframe mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from pyspark.sql.types import StructType, StructField, ByteType, StringType, IntegerType
+
+spark_schema = StructType(
+    [
+        StructField("BYTE", ByteType(), True),
+        StructField("STRING", StringType(), True),
+        StructField("INTEGER", IntegerType(), True)
+    ]
+)
+
+data = [(1, "apple", 21), (2, "lemon", 34), (3, "banana", 50)]
+
+spark_session = SparkSession.builder.getOrCreate()
+pyspark_df = spark_session.createDataFrame(data, schema=spark_schema).orderBy(
+    "INTEGER"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_2",
+    mode=CheckpointMode.DATAFRAME,
+)
+```
+
+------
pyproject.toml
@@ -3,7 +3,9 @@ build-backend = "hatchling.build"
 requires = ["hatchling"]
 
 [project]
-authors = [{name = "Snowflake Inc."}]
+authors = [
+  {name = "Snowflake, Inc.", email = "snowflake-python-libraries-dl@snowflake.com"},
+]
 classifiers = [
   "Development Status :: 4 - Beta",
   "Environment :: Console",
@@ -30,6 +32,7 @@ dependencies = [
   "pandera[io]==0.20.4",
 ]
 description = "Snowpark column and table statistics collection"
+dynamic = ['version']
 keywords = [
   'Snowflake',
   'analytics',
@@ -38,11 +41,10 @@ keywords = [
   'db',
   'Snowpark',
 ]
-license = {file = "LICENSE"}
+license = {text = "Apache License, Version 2.0"}
 name = "snowpark-checkpoints-collectors"
 readme = "README.md"
 requires-python = '>=3.9,<3.12'
-dynamic = ['version']
 
 [project.optional-dependencies]
 development = [
@@ -113,7 +115,6 @@ exclude_lines = [
   "if __name__ == .__main__.:",
 ]
 
-
 [tool.hatch.envs.linter.scripts]
 check = [
   'ruff check --fix .',
@@ -121,7 +122,7 @@ check = [
 
 [tool.hatch.envs.test.scripts]
 check = [
-  "pip install -e ../snowpark-checkpoints-configuration" ,
+  "pip install -e ../snowpark-checkpoints-configuration",
  'pytest -v --junitxml=test/outcome/test-results.xml --cov=. --cov-config=test/.coveragerc --cov-report=xml:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.xml {args:test} --cov-report=term --cov-report=json:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.json',
 ]
 
src/snowflake/snowpark_checkpoints_collector/__init__.py
@@ -2,9 +2,10 @@
 # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
 #
 
-__all__ = ["collect_dataframe_checkpoint", "Singleton"]
+__all__ = ["collect_dataframe_checkpoint", "CheckpointMode"]
 
-from snowflake.snowpark_checkpoints_collector.singleton import Singleton
 from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
     collect_dataframe_checkpoint,
 )
+
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
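The package root now re-exports `CheckpointMode` alongside `collect_dataframe_checkpoint`, and no longer re-exports `Singleton`. A minimal sketch of that import surface, assuming the rc3 wheel is installed:

```python
# Minimal sketch of the rc3 public API: both exported names are importable from
# the package root; Singleton is no longer re-exported here.
from snowflake.snowpark_checkpoints_collector import (
    CheckpointMode,
    collect_dataframe_checkpoint,
)

print(int(CheckpointMode.SCHEMA), int(CheckpointMode.DATAFRAME))  # 1 2
print(callable(collect_dataframe_checkpoint))                     # True
```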
src/snowflake/snowpark_checkpoints_collector/collection_common.py
@@ -8,8 +8,13 @@ from enum import IntEnum
 
 
 class CheckpointMode(IntEnum):
+
+    """Enum class representing the collection mode."""
+
     SCHEMA = 1
+    """Collect automatic schema inference"""
     DATAFRAME = 2
+    """Export DataFrame as Parquet file to Snowflake"""
 
 
 # CONSTANTS
@@ -76,11 +81,13 @@ COLUMN_IS_UNIQUE_SIZE_KEY = "is_unique_size"
 COLUMN_KEY_TYPE_KEY = "key_type"
 COLUMN_MARGIN_ERROR_KEY = "margin_error"
 COLUMN_MAX_KEY = "max"
+COLUMN_MAX_LENGTH_KEY = "max_length"
 COLUMN_MAX_SIZE_KEY = "max_size"
 COLUMN_MEAN_KEY = "mean"
 COLUMN_MEAN_SIZE_KEY = "mean_size"
 COLUMN_METADATA_KEY = "metadata"
 COLUMN_MIN_KEY = "min"
+COLUMN_MIN_LENGTH_KEY = "min_length"
 COLUMN_MIN_SIZE_KEY = "min_size"
 COLUMN_NAME_KEY = "name"
 COLUMN_NULL_COUNT_KEY = "null_count"
@@ -90,6 +97,7 @@ COLUMN_ROWS_NULL_COUNT_KEY = "rows_null_count"
 COLUMN_SIZE_KEY = "size"
 COLUMN_TRUE_COUNT_KEY = "true_count"
 COLUMN_TYPE_KEY = "type"
+COLUMN_VALUE_KEY = "value"
 COLUMN_VALUE_TYPE_KEY = "value_type"
 COLUMNS_KEY = "columns"
 
@@ -121,6 +129,8 @@ UNKNOWN_SOURCE_FILE = "unknown"
 UNKNOWN_LINE_OF_CODE = -1
 BACKSLASH_TOKEN = "\\"
 SLASH_TOKEN = "/"
+PYSPARK_NONE_SIZE_VALUE = -1
+PANDAS_LONG_TYPE = "Int64"
 
 # ENVIRONMENT VARIABLES
 SNOWFLAKE_CHECKPOINT_CONTRACT_FILE_PATH_ENV_VAR = (
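The hunks above document the `CheckpointMode` enum and add several new constants. A small sketch that simply restates those values, assuming the rc3 module path shown above:

```python
# Sketch of the names added to collection_common in rc3, as read from the hunks
# above; the asserted values restate what the diff shows.
from snowflake.snowpark_checkpoints_collector.collection_common import (
    COLUMN_MAX_LENGTH_KEY,
    COLUMN_MIN_LENGTH_KEY,
    COLUMN_VALUE_KEY,
    PANDAS_LONG_TYPE,
    PYSPARK_NONE_SIZE_VALUE,
    CheckpointMode,
)

# IntEnum members compare equal to plain ints.
assert CheckpointMode.SCHEMA == 1 and CheckpointMode.DATAFRAME == 2
assert (COLUMN_MAX_LENGTH_KEY, COLUMN_MIN_LENGTH_KEY) == ("max_length", "min_length")
assert COLUMN_VALUE_KEY == "value"
assert PANDAS_LONG_TYPE == "Int64" and PYSPARK_NONE_SIZE_VALUE == -1
```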
src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py
@@ -5,10 +5,10 @@ import json
 
 from typing import Optional
 
-from snowflake.snowpark_checkpoints_collector import Singleton
 from snowflake.snowpark_checkpoints_collector.collection_result.model import (
     CollectionPointResult,
 )
+from snowflake.snowpark_checkpoints_collector.singleton import Singleton
 from snowflake.snowpark_checkpoints_collector.utils import file_utils
 
 
src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py
@@ -1,7 +1,7 @@
 #
 # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
 #
-from pandas import Series
+from pyspark.sql import DataFrame as SparkDataFrame
 from pyspark.sql.types import StructField
 
 from snowflake.snowpark_checkpoints_collector.collection_common import (
@@ -88,14 +88,14 @@ class ColumnCollectorManager:
     """Manage class for column collector based on type."""
 
     def collect_column(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         """Collect the data of the column based on the column type.
 
         Args:
             clm_name (str): the name of the column.
             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
-            values (pandas.Series): the column values as Pandas.Series.
+            values (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
 
         Returns:
             dict[str, any]: The data collected.
@@ -112,7 +112,7 @@ class ColumnCollectorManager:
 
     @column_register(ARRAY_COLUMN_TYPE)
     def _collect_array_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = ArrayColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -120,7 +120,7 @@ class ColumnCollectorManager:
 
     @column_register(BINARY_COLUMN_TYPE)
     def _collect_binary_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = BinaryColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -128,7 +128,7 @@ class ColumnCollectorManager:
 
     @column_register(BOOLEAN_COLUMN_TYPE)
     def _collect_boolean_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = BooleanColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -136,7 +136,7 @@ class ColumnCollectorManager:
 
     @column_register(DATE_COLUMN_TYPE)
     def _collect_date_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = DateColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -144,7 +144,7 @@ class ColumnCollectorManager:
 
     @column_register(DAYTIMEINTERVAL_COLUMN_TYPE)
     def _collect_day_time_interval_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = DayTimeIntervalColumnCollector(
             clm_name, struct_field, values
@@ -154,7 +154,7 @@ class ColumnCollectorManager:
 
     @column_register(DECIMAL_COLUMN_TYPE)
     def _collect_decimal_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = DecimalColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -162,7 +162,7 @@ class ColumnCollectorManager:
 
     @column_register(MAP_COLUMN_TYPE)
     def _collect_map_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = MapColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -170,7 +170,7 @@ class ColumnCollectorManager:
 
     @column_register(NULL_COLUMN_TYPE)
     def _collect_null_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
    ) -> dict[str, any]:
         column_collector = NullColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -185,7 +185,7 @@ class ColumnCollectorManager:
         DOUBLE_COLUMN_TYPE,
     )
     def _collect_numeric_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = NumericColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -193,7 +193,7 @@ class ColumnCollectorManager:
 
     @column_register(STRING_COLUMN_TYPE)
     def _collect_string_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = StringColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -201,7 +201,7 @@ class ColumnCollectorManager:
 
     @column_register(STRUCT_COLUMN_TYPE)
     def _collect_struct_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = StructColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -209,7 +209,7 @@ class ColumnCollectorManager:
 
     @column_register(TIMESTAMP_COLUMN_TYPE)
     def _collect_timestamp_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = TimestampColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -217,21 +217,21 @@ class ColumnCollectorManager:
 
     @column_register(TIMESTAMP_NTZ_COLUMN_TYPE)
     def _collect_timestampntz_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = TimestampNTZColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
         return collected_data
 
     def collect_empty_custom_data(
-        self, clm_name: str, struct_field: StructField, values: Series
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         """Collect the data of a empty column.
 
         Args:
             clm_name (str): the name of the column.
             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
-            values (pandas.Series): the column values as Pandas.Series.
+            values (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
 
         Returns:
             dict[str, any]: The data collected.
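Every collector entry point now takes the column as a single-column PySpark DataFrame rather than a pandas Series. A rough, hypothetical sketch of a caller under the new contract; the Spark session, the toy data, and the no-argument `ColumnCollectorManager()` constructor are assumptions, not taken from the diff:

```python
# Hypothetical sketch: feeding one column to the manager under the new
# SparkDataFrame-based signature. The session, toy data, and constructor call
# are illustrative assumptions.
from pyspark.sql import SparkSession

from snowflake.snowpark_checkpoints_collector.column_collection.column_collector_manager import (
    ColumnCollectorManager,
)

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("apple", 21), ("lemon", 34)], "fruit string, age int")

manager = ColumnCollectorManager()            # assumed no-arg constructor
struct_field = df.schema["fruit"]             # pyspark.sql.types.StructField
column_df = df.select("fruit")                # single-column DataFrame, not a pandas Series
collected = manager.collect_column("fruit", struct_field, column_df)
print(collected)                              # dict of collected column statistics
```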
src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py
@@ -3,7 +3,12 @@
 #
 from statistics import mean
 
-from pandas import Series
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.functions import array as spark_array
+from pyspark.sql.functions import coalesce as spark_coalesce
+from pyspark.sql.functions import col as spark_col
+from pyspark.sql.functions import explode as spark_explode
+from pyspark.sql.functions import size as spark_size
 from pyspark.sql.types import StructField
 
 from snowflake.snowpark_checkpoints_collector.collection_common import (
@@ -13,6 +18,8 @@ from snowflake.snowpark_checkpoints_collector.collection_common import (
     COLUMN_MEAN_SIZE_KEY,
     COLUMN_MIN_SIZE_KEY,
     COLUMN_NULL_VALUE_PROPORTION_KEY,
+    COLUMN_SIZE_KEY,
+    COLUMN_VALUE_KEY,
     COLUMN_VALUE_TYPE_KEY,
     CONTAINS_NULL_KEY,
     ELEMENT_TYPE_KEY,
@@ -30,22 +37,22 @@ class ArrayColumnCollector(ColumnCollectorBase):
         name (str): the name of the column.
         type (str): the type of the column.
         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
-        values (pandas.Series): the column values as Pandas.Series.
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
 
     """
 
     def __init__(
-        self, clm_name: str, struct_field: StructField, clm_values: Series
+        self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
     ) -> None:
         """Init ArrayColumnCollector.
 
         Args:
             clm_name (str): the name of the column.
             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
-            clm_values (pandas.Series): the column values as Pandas.Series.
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
 
         """
-        super().__init__(clm_name, struct_field, clm_values)
+        super().__init__(clm_name, struct_field, clm_df)
         self._array_size_collection = self._compute_array_size_collection()
 
     def get_custom_data(self) -> dict[str, any]:
@@ -73,23 +80,22 @@ class ArrayColumnCollector(ColumnCollectorBase):
         return custom_data_dict
 
     def _compute_array_size_collection(self) -> list[int]:
-        size_collection = []
-        for array in self.values:
-            if array is None:
-                continue
+        select_result = self.column_df.select(
+            spark_size(spark_coalesce(spark_col(self.name), spark_array([]))).alias(
+                COLUMN_SIZE_KEY
+            )
+        ).collect()
 
-            length = len(array)
-            size_collection.append(length)
+        size_collection = [row[COLUMN_SIZE_KEY] for row in select_result]
 
         return size_collection
 
     def _compute_null_value_proportion(self) -> float:
-        null_counter = 0
-        for array in self.values:
-            if array is None:
-                continue
+        select_result = self.column_df.select(
+            spark_explode(spark_col(self.name)).alias(COLUMN_VALUE_KEY)
+        )
 
-            null_counter += array.count(None)
+        null_counter = select_result.where(spark_col(COLUMN_VALUE_KEY).isNull()).count()
 
         total_values = sum(self._array_size_collection)
         null_value_proportion = (null_counter / total_values) * 100
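For context, the rewritten `_compute_array_size_collection` relies on `size(coalesce(col, array()))`, which makes a NULL array contribute a size of 0 instead of being skipped. A self-contained illustration of that expression, using assumed toy data:

```python
# Standalone illustration of the size/coalesce expression used above.
# The DataFrame contents here are assumed toy data.
from pyspark.sql import SparkSession
from pyspark.sql.functions import array, coalesce, col, size

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [([1, None, 3],), ([],), (None,)],
    "arr array<int>",
)

sizes = df.select(size(coalesce(col("arr"), array([]))).alias("size")).collect()
print([row["size"] for row in sizes])  # [3, 0, 0] -- the NULL array coalesces to []
```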