snowpark-checkpoints-collectors 0.1.0rc1__tar.gz → 0.2.0rc1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/PKG-INFO +89 -18
- snowpark_checkpoints_collectors-0.2.0rc1/README.md +102 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/__init__.py +3 -2
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/collection_common.py +5 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +1 -1
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +1 -1
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +1 -1
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +43 -22
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +66 -33
- snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +49 -0
- snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_checkpoint_name.py +51 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1.py +19 -6
- snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +1 -0
- snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +1 -0
- snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +1 -0
- snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df.json +1 -0
- snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +1 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_2.py +3 -2
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collection_result_file.py +1 -1
- snowpark_checkpoints_collectors-0.2.0rc1/test/unit/test_checkpoint_name_utils.py +47 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/unit/test_collection_point_result_manager.py +1 -1
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/unit/test_column_collection.py +5 -5
- snowpark_checkpoints_collectors-0.1.0rc1/README.md +0 -31
- snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +0 -1
- snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +0 -1
- snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +0 -1
- snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df.json +0 -1
- snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +0 -1
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/.gitignore +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/CHANGELOG.md +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/LICENSE +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/pyproject.toml +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/snowpark-testdf-schema.json +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/singleton.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/utils/extra_config.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/utils/file_utils.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/.coveragerc +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values.json +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column.json +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema.json +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/integ/test_snow_connection_int.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/unit/test_collection_point_result.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/unit/test_extra_config.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/unit/test_file_utils.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/unit/test_snow_connection.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/test/unit/test_summary_stats_collector.py +0 -0
{snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: snowpark-checkpoints-collectors
-Version: 0.1.0rc1
+Version: 0.2.0rc1
 Summary: Snowpark column and table statistics collection
 Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
 Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
@@ -243,7 +243,7 @@ Requires-Dist: setuptools>=70.0.0; extra == 'development'
 Requires-Dist: twine==5.1.1; extra == 'development'
 Description-Content-Type: text/markdown
 
-#
+# snowpark-checkpoints-collectors
 
 ---
 **NOTE**
@@ -252,25 +252,96 @@ This package is on Private Preview.
 
 ---
 
-
-
-record those results into a set of JSON files corresponding to different intermediate dataframes. These files can be inspected manually
-and handed over to teams implementing the snowpark pipeline. The `snowpark-checkpoints-collector` package is designed to have minimal
-dependencies and the generated files are meant to be inspected by security
-teams.
+**snowpark-checkpoints-collector** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
+## Features
 
-
+- Schema inference collected data mode (Schema): This is the default mode, which leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe. This mode also collects custom data from columns of the DataFrame based on the PySpark type.
+- DataFrame collected data mode (DataFrame): This mode collects the data of the PySpark dataframe. In this case, the mechanism saves all data of the given dataframe in parquet format. Using the default user Snowflake connection, it tries to upload the parquet files into the Snowflake temporal stage and create a table based on the information in the stage. The name of the file and the table is the same as the checkpoint.
 
-## collect_dataframe_schema
 
+
+## Functionalities
+
+### Collect DataFrame Checkpoint
+
+
+
+```python
+from pyspark.sql import DataFrame as SparkDataFrame
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from typing import Optional
+
+# Signature of the function
+def collect_dataframe_checkpoint(
+    df: SparkDataFrame,
+    checkpoint_name: str,
+    sample: Optional[float] = None,
+    mode: Optional[CheckpointMode] = None,
+    output_path: Optional[str] = None,
+) -> None:
+    ...
+```
+
+- `df`: The input Spark dataframe to collect.
+- `checkpoint_name`: Name of the checkpoint schema file or dataframe.
+- `sample`: Fraction of DataFrame to sample for schema inference, defaults to 1.0.
+- `mode`: The mode to execution the collection (Schema or Dataframe), defaults to CheckpointMode.Schema.
+- `output_path`: The output path to save the checkpoint, defaults to current working directory.
+
+
+## Usage Example
+
+### Schema mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+
+spark_session = SparkSession.builder.getOrCreate()
+sample_size = 1.0
+
+pyspark_df = spark_session.createDataFrame(
+    [("apple", 21), ("lemon", 34), ("banana", 50)], schema="fruit string, age integer"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_1",
+    sample=sample_size,
+    mode=CheckpointMode.SCHEMA,
+)
 ```
-
-
-
-
+
+
+### Dataframe mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from pyspark.sql.types import StructType, StructField, ByteType, StringType, IntegerType
+
+spark_schema = StructType(
+    [
+        StructField("BYTE", ByteType(), True),
+        StructField("STRING", StringType(), True),
+        StructField("INTEGER", IntegerType(), True)
+    ]
+)
+
+data = [(1, "apple", 21), (2, "lemon", 34), (3, "banana", 50)]
+
+spark_session = SparkSession.builder.getOrCreate()
+pyspark_df = spark_session.createDataFrame(data, schema=spark_schema).orderBy(
+    "INTEGER"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_2",
+    mode=CheckpointMode.DATAFRAME,
+)
 ```
 
-
-- checkpoint_name - the name of the "checkpoint". Generated JSON files
-will have the name "snowpark-[checkpoint_name]-schema.json"
-- sample - sample size of the spark data frame to use to generate the schema
+------
snowpark_checkpoints_collectors-0.2.0rc1/README.md
ADDED
@@ -0,0 +1,102 @@
+# snowpark-checkpoints-collectors
+
+---
+**NOTE**
+
+This package is on Private Preview.
+
+---
+
+**snowpark-checkpoints-collector** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
+## Features
+
+- Schema inference collected data mode (Schema): This is the default mode, which leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe. This mode also collects custom data from columns of the DataFrame based on the PySpark type.
+- DataFrame collected data mode (DataFrame): This mode collects the data of the PySpark dataframe. In this case, the mechanism saves all data of the given dataframe in parquet format. Using the default user Snowflake connection, it tries to upload the parquet files into the Snowflake temporal stage and create a table based on the information in the stage. The name of the file and the table is the same as the checkpoint.
+
+
+
+## Functionalities
+
+### Collect DataFrame Checkpoint
+
+
+
+```python
+from pyspark.sql import DataFrame as SparkDataFrame
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from typing import Optional
+
+# Signature of the function
+def collect_dataframe_checkpoint(
+    df: SparkDataFrame,
+    checkpoint_name: str,
+    sample: Optional[float] = None,
+    mode: Optional[CheckpointMode] = None,
+    output_path: Optional[str] = None,
+) -> None:
+    ...
+```
+
+- `df`: The input Spark dataframe to collect.
+- `checkpoint_name`: Name of the checkpoint schema file or dataframe.
+- `sample`: Fraction of DataFrame to sample for schema inference, defaults to 1.0.
+- `mode`: The mode to execution the collection (Schema or Dataframe), defaults to CheckpointMode.Schema.
+- `output_path`: The output path to save the checkpoint, defaults to current working directory.
+
+
+## Usage Example
+
+### Schema mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+
+spark_session = SparkSession.builder.getOrCreate()
+sample_size = 1.0
+
+pyspark_df = spark_session.createDataFrame(
+    [("apple", 21), ("lemon", 34), ("banana", 50)], schema="fruit string, age integer"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_1",
+    sample=sample_size,
+    mode=CheckpointMode.SCHEMA,
+)
+```
+
+
+### Dataframe mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from pyspark.sql.types import StructType, StructField, ByteType, StringType, IntegerType
+
+spark_schema = StructType(
+    [
+        StructField("BYTE", ByteType(), True),
+        StructField("STRING", StringType(), True),
+        StructField("INTEGER", IntegerType(), True)
+    ]
+)
+
+data = [(1, "apple", 21), (2, "lemon", 34), (3, "banana", 50)]
+
+spark_session = SparkSession.builder.getOrCreate()
+pyspark_df = spark_session.createDataFrame(data, schema=spark_schema).orderBy(
+    "INTEGER"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_2",
+    mode=CheckpointMode.DATAFRAME,
+)
+```
+
+------
{snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/__init__.py
RENAMED
@@ -2,9 +2,10 @@
 # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
 #
 
-__all__ = ["collect_dataframe_checkpoint", "
+__all__ = ["collect_dataframe_checkpoint", "CheckpointMode"]
 
-from snowflake.snowpark_checkpoints_collector.singleton import Singleton
 from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
     collect_dataframe_checkpoint,
 )
+
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
{snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/collection_common.py
RENAMED
@@ -8,8 +8,13 @@ from enum import IntEnum
 
 
 class CheckpointMode(IntEnum):
+
+    """Enum class representing the collection mode."""
+
     SCHEMA = 1
+    """Collect automatic schema inference"""
     DATAFRAME = 2
+    """Export DataFrame as Parquet file to Snowflake"""
 
 
 # CONSTANTS
{snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py
RENAMED
@@ -5,10 +5,10 @@ import json
 
 from typing import Optional
 
-from snowflake.snowpark_checkpoints_collector import Singleton
 from snowflake.snowpark_checkpoints_collector.collection_result.model import (
     CollectionPointResult,
 )
+from snowflake.snowpark_checkpoints_collector.singleton import Singleton
 from snowflake.snowpark_checkpoints_collector.utils import file_utils
 
 
{snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py
RENAMED
@@ -1,6 +1,7 @@
 #
 # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
 #
+import pandas as pd
 
 from pandas import DataFrame as PandasDataFrame
 from pandera import Check, Column
@@ -62,23 +63,6 @@ def column_register(*args):
     return wrapper
 
 
-def _datetime_like_type_checks(
-    clm_name: str, pandas_df: PandasDataFrame, pandera_column: Column
-) -> None:
-    column_values = pandas_df[clm_name].dropna()
-    min_value = str(column_values.min())
-    max_value = str(column_values.max())
-    pandera_column.checks.append(
-        Check.between(
-            min_value=min_value,
-            max_value=max_value,
-            include_max=True,
-            include_min=True,
-            title=BETWEEN_CHECK_ERROR_MESSAGE_FORMAT.format(min_value, max_value),
-        )
-    )
-
-
 @collector_register
 class PanderaColumnChecksManager:
 
@@ -117,13 +101,24 @@ class PanderaColumnChecksManager:
     def _add_date_type_checks(
         self, clm_name: str, pandas_df: PandasDataFrame, pandera_column: Column
     ) -> None:
-
+        pass
 
     @column_register(DAYTIMEINTERVAL_COLUMN_TYPE)
     def _add_daytimeinterval_type_checks(
         self, clm_name: str, pandas_df: PandasDataFrame, pandera_column: Column
     ) -> None:
-
+        column_values = pandas_df[clm_name].dropna()
+        min_value = pd.to_timedelta(column_values.min())
+        max_value = pd.to_timedelta(column_values.max())
+        pandera_column.checks.append(
+            Check.between(
+                min_value=min_value,
+                max_value=max_value,
+                include_max=True,
+                include_min=True,
+                title=BETWEEN_CHECK_ERROR_MESSAGE_FORMAT.format(min_value, max_value),
+            )
+        )
 
     @column_register(
         BYTE_COLUMN_TYPE,
@@ -153,16 +148,42 @@ class PanderaColumnChecksManager:
     def _add_string_type_checks(
         self, clm_name: str, pandas_df: PandasDataFrame, pandera_column: Column
     ) -> None:
-
+        column_values = pandas_df[clm_name].dropna()
+        colum_str_length = column_values.str.len()
+        min_length = colum_str_length.min().item()
+        max_length = colum_str_length.max().item()
+        pandera_column.checks.append(Check.str_length(min_length, max_length))
 
     @column_register(TIMESTAMP_COLUMN_TYPE)
     def _add_timestamp_type_checks(
         self, clm_name: str, pandas_df: PandasDataFrame, pandera_column: Column
     ) -> None:
-
+        column_values = pandas_df[clm_name].dropna()
+        min_value = pd.Timestamp(column_values.min())
+        max_value = pd.Timestamp(column_values.max())
+        pandera_column.checks.append(
+            Check.between(
+                min_value=min_value,
+                max_value=max_value,
+                include_max=True,
+                include_min=True,
+                title=BETWEEN_CHECK_ERROR_MESSAGE_FORMAT.format(min_value, max_value),
+            )
+        )
 
     @column_register(TIMESTAMP_NTZ_COLUMN_TYPE)
     def _add_timestamp_ntz_type_checks(
         self, clm_name: str, pandas_df: PandasDataFrame, pandera_column: Column
     ) -> None:
-
+        column_values = pandas_df[clm_name].dropna()
+        min_value = pd.Timestamp(column_values.min())
+        max_value = pd.Timestamp(column_values.max())
+        pandera_column.checks.append(
+            Check.between(
+                min_value=min_value,
+                max_value=max_value,
+                include_max=True,
+                include_min=True,
+                title=BETWEEN_CHECK_ERROR_MESSAGE_FORMAT.format(min_value, max_value),
+            )
+        )
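
The removed `_datetime_like_type_checks` helper built its `Check.between` bounds from stringified min/max values; the new per-type bodies use real `pd.Timestamp` and `pd.to_timedelta` objects, so the bounds compare chronologically rather than lexicographically. A standalone sketch of the kind of check the new timestamp body produces (illustrative only, not the package's code):

```python
import pandas as pd
import pandera as pa
from pandera import Check, Column

df = pd.DataFrame({"TS": pd.to_datetime(["2024-01-01", "2024-06-15", "2024-12-31"])})

# Mirror the new _add_timestamp_type_checks logic: drop nulls, then bound
# the column by its observed min/max as pd.Timestamp values.
values = df["TS"].dropna()
min_value, max_value = pd.Timestamp(values.min()), pd.Timestamp(values.max())

schema = pa.DataFrameSchema(
    {
        "TS": Column(
            "datetime64[ns]",
            checks=[Check.between(min_value, max_value, include_min=True, include_max=True)],
        )
    }
)
schema.validate(df)  # passes: every value lies within the collected bounds
```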
{snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.2.0rc1}/src/snowflake/snowpark_checkpoints_collector/summary_stats_collector.py
RENAMED
@@ -41,7 +41,10 @@ from snowflake.snowpark_checkpoints_collector.column_pandera_checks import (
 from snowflake.snowpark_checkpoints_collector.snow_connection_model import (
     SnowConnection,
 )
-from snowflake.snowpark_checkpoints_collector.utils import
+from snowflake.snowpark_checkpoints_collector.utils import (
+    checkpoint_name_utils,
+    file_utils,
+)
 from snowflake.snowpark_checkpoints_collector.utils.extra_config import (
     get_checkpoint_mode,
     get_checkpoint_sample,
@@ -71,52 +74,83 @@ def collect_dataframe_checkpoint(
 
     Raises:
         Exception: Invalid mode value.
+        Exception: Invalid checkpoint name. Checkpoint names must only contain alphanumeric characters and underscores.
 
     """
-    collection_point_file_path = file_utils.get_collection_point_source_file_path()
-    collection_point_line_of_code = file_utils.get_collection_point_line_of_code()
-    collection_point_result = CollectionPointResult(
-        collection_point_file_path, collection_point_line_of_code, checkpoint_name
-    )
-
     try:
-
+        normalized_checkpoint_name = checkpoint_name_utils.normalize_checkpoint_name(
+            checkpoint_name
+        )
+        is_valid_checkpoint_name = checkpoint_name_utils.is_valid_checkpoint_name(
+            normalized_checkpoint_name
+        )
+        if not is_valid_checkpoint_name:
+            raise Exception(
+                f"Invalid checkpoint name: {checkpoint_name}. Checkpoint names must only contain alphanumeric "
+                f"characters and underscores."
+            )
 
-
+        if is_checkpoint_enabled(normalized_checkpoint_name):
 
-
-
-
-
+            collection_point_file_path = (
+                file_utils.get_collection_point_source_file_path()
+            )
+            collection_point_line_of_code = (
+                file_utils.get_collection_point_line_of_code()
+            )
+            collection_point_result = CollectionPointResult(
+                collection_point_file_path,
+                collection_point_line_of_code,
+                normalized_checkpoint_name,
+            )
 
-
+            try:
 
-
-        column_type_dict = _get_spark_column_types(df)
-        _collect_dataframe_checkpoint_mode_schema(
-            checkpoint_name, df, _sample, column_type_dict, output_path
-        )
+                _sample = get_checkpoint_sample(normalized_checkpoint_name, sample)
 
-
-
-
-
-
+                if _is_empty_dataframe_without_schema(df):
+                    raise Exception(
+                        "It is not possible to collect an empty DataFrame without schema"
+                    )
+
+                _mode = get_checkpoint_mode(normalized_checkpoint_name, mode)
+
+                if _mode == CheckpointMode.SCHEMA:
+                    column_type_dict = _get_spark_column_types(df)
+                    _collect_dataframe_checkpoint_mode_schema(
+                        normalized_checkpoint_name,
+                        df,
+                        _sample,
+                        column_type_dict,
+                        output_path,
+                    )
+
+                elif _mode == CheckpointMode.DATAFRAME:
+                    snow_connection = SnowConnection()
+                    _collect_dataframe_checkpoint_mode_dataframe(
+                        normalized_checkpoint_name, df, snow_connection, output_path
+                    )
 
-
-
+                else:
+                    raise Exception("Invalid mode value.")
 
-
+                collection_point_result.result = CollectionResult.PASS
+
+            except Exception as err:
+                collection_point_result.result = CollectionResult.FAIL
+                error_message = str(err)
+                raise Exception(error_message) from err
+
+            finally:
+                collection_point_result_manager = CollectionPointResultManager(
+                    output_path
+                )
+                collection_point_result_manager.add_result(collection_point_result)
 
     except Exception as err:
-        collection_point_result.result = CollectionResult.FAIL
         error_message = str(err)
         raise Exception(error_message) from err
 
-    finally:
-        collection_point_result_manager = CollectionPointResultManager(output_path)
-        collection_point_result_manager.add_result(collection_point_result)
-
 
 @report_telemetry(params_list=["column_type_dict"])
 def _collect_dataframe_checkpoint_mode_schema(
@@ -239,7 +273,6 @@ def _get_pandera_infer_schema_as_dict(
 def _generate_json_checkpoint_file(
     checkpoint_name, dataframe_schema_contract, output_path: Optional[str] = None
 ) -> None:
-
     checkpoint_file_name = CHECKPOINT_JSON_OUTPUT_FILE_NAME_FORMAT.format(
         checkpoint_name
    )
snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py
ADDED
@@ -0,0 +1,49 @@
+import re as regx
+
+
+CHECKPOINT_NAME_REGEX_PATTERN = r"^[a-zA-Z_][a-zA-Z0-9_]+"
+WHITESPACE_TOKEN = " "
+HYPHEN_TOKEN = "-"
+UNDERSCORE_TOKEN = "_"
+TOKEN_TO_REPLACE_COLLECTION = [WHITESPACE_TOKEN, HYPHEN_TOKEN]
+
+
+def normalize_checkpoint_name(checkpoint_name: str) -> str:
+    """Normalize the provided checkpoint name by replacing: the whitespace and hyphen tokens by underscore token.
+
+    Args:
+        checkpoint_name (str): The checkpoint name to normalize.
+
+    Returns:
+        str: the checkpoint name normalized.
+
+    """
+    normalized_checkpoint_name = checkpoint_name
+    for token in TOKEN_TO_REPLACE_COLLECTION:
+        normalized_checkpoint_name = normalized_checkpoint_name.replace(
+            token, UNDERSCORE_TOKEN
+        )
+
+    return normalized_checkpoint_name
+
+
+def is_valid_checkpoint_name(checkpoint_name: str) -> bool:
+    """Check if the provided checkpoint name is valid.
+
+    A valid checkpoint name must:
+    - Start with a letter (a-z, A-Z) or an underscore (_)
+    - Be followed by any combination of letters, digits (0-9) and underscores (_).
+
+    Args:
+        checkpoint_name (str): The checkpoint name to validate.
+
+    Returns:
+        bool: True if the checkpoint name is valid; otherwise, False.
+
+    """
+    matched = regx.findall(CHECKPOINT_NAME_REGEX_PATTERN, checkpoint_name)
+    if len(matched) == 0 or len(matched) > 1:
+        return False
+
+    is_valid = matched[0] == checkpoint_name
+    return is_valid
snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_checkpoint_name.py
ADDED
@@ -0,0 +1,51 @@
+import os
+import tempfile
+
+import pytest
+from pyspark.sql import SparkSession
+from pyspark.sql.types import StructType
+
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.singleton import Singleton
+from pathlib import Path
+from snowflake.snowpark_checkpoints_collector.utils.telemetry import (
+    get_telemetry_manager,
+)
+
+telemetry_folder = "telemetry"
+
+
+@pytest.fixture
+def spark_session():
+    return SparkSession.builder.getOrCreate()
+
+
+@pytest.fixture
+def singleton():
+    Singleton._instances = {}
+
+
+@pytest.fixture(scope="function")
+def output_path():
+    folder = os.urandom(8).hex()
+    directory = Path(tempfile.gettempdir()).resolve() / folder
+    os.makedirs(directory)
+    return str(directory)
+
+
+def test_collect_dataframe_with_invalid_checkpoint_name(
+    spark_session, singleton, output_path
+):
+    checkpoint_name = "6*invalid"
+    data = []
+    columns = StructType()
+    pyspark_df = spark_session.createDataFrame(data=data, schema=columns)
+
+    with pytest.raises(Exception) as ex_info:
+        collect_dataframe_checkpoint(
+            pyspark_df, checkpoint_name=checkpoint_name, output_path=output_path
+        )
+    assert (
+        f"Invalid checkpoint name: {checkpoint_name}. Checkpoint names must only contain alphanumeric characters "
+        f"and underscores."
+    ) == str(ex_info.value)