snowpark-checkpoints-collectors 0.1.0rc1__tar.gz → 0.1.0rc3__tar.gz
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/.gitignore +3 -0
- snowpark_checkpoints_collectors-0.1.0rc3/PKG-INFO +146 -0
- snowpark_checkpoints_collectors-0.1.0rc3/README.md +102 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/pyproject.toml +6 -5
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/__init__.py +3 -2
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/collection_common.py +10 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +1 -1
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +18 -18
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +22 -16
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +17 -11
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +18 -11
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +7 -7
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +15 -8
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +15 -8
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +22 -10
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +9 -7
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +25 -17
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +5 -5
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +24 -11
- snowpark_checkpoints_collectors-0.1.0rc3/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +59 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +10 -8
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +18 -8
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +18 -8
- snowpark_checkpoints_collectors-0.1.0rc3/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +212 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +94 -45
- snowpark_checkpoints_collectors-0.1.0rc3/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +50 -0
- snowpark_checkpoints_collectors-0.1.0rc3/test/integ/telemetry_compare_utils.py +69 -0
- snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_checkpoint_name.py +51 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collect_df_mode_1.py +40 -47
- snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +1 -0
- snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +18 -0
- snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +1 -0
- snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +18 -0
- snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +1 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +7 -6
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +6 -5
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +6 -5
- snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +18 -0
- snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_1_expected/test_full_df.json +1 -0
- snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +1 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type_telemetry.json +6 -5
- snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +18 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collect_df_mode_2.py +56 -8
- snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_parquet_directory _telemetry.json +18 -0
- snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_telemetry.json +18 -0
- snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_2_expected/test_collect_empty_dataframe_with_schema_telemetry.json +18 -0
- snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_2_expected/test_collect_invalid_mode_telemetry.json +18 -0
- snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_2_expected/test_generate_parquet_for_spark_df_telemetry.json +18 -0
- snowpark_checkpoints_collectors-0.1.0rc3/test/integ/test_collect_df_mode_2_expected/test_spark_df_mode_dataframe_telemetry.json +18 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collection_result_file.py +1 -1
- snowpark_checkpoints_collectors-0.1.0rc3/test/unit/test_checkpoint_name_utils.py +47 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/unit/test_collection_point_result_manager.py +1 -1
- snowpark_checkpoints_collectors-0.1.0rc3/test/unit/test_column_collection.py +466 -0
- snowpark_checkpoints_collectors-0.1.0rc3/test/unit/test_pandera_column_check_manager.py +194 -0
- snowpark_checkpoints_collectors-0.1.0rc1/PKG-INFO +0 -276
- snowpark_checkpoints_collectors-0.1.0rc1/README.md +0 -31
- snowpark_checkpoints_collectors-0.1.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +0 -35
- snowpark_checkpoints_collectors-0.1.0rc1/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +0 -168
- snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +0 -1
- snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +0 -17
- snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +0 -1
- snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +0 -17
- snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +0 -1
- snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +0 -17
- snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df.json +0 -1
- snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +0 -1
- snowpark_checkpoints_collectors-0.1.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +0 -17
- snowpark_checkpoints_collectors-0.1.0rc1/test/unit/test_column_collection.py +0 -669
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/CHANGELOG.md +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/LICENSE +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/snowpark-testdf-schema.json +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/singleton.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/utils/extra_config.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/utils/file_utils.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/.coveragerc +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values.json +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column.json +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema.json +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/integ/test_snow_connection_int.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/unit/test_collection_point_result.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/unit/test_extra_config.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/unit/test_file_utils.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/unit/test_snow_connection.py +0 -0
- {snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/test/unit/test_summary_stats_collector.py +0 -0
snowpark_checkpoints_collectors-0.1.0rc3/PKG-INFO
ADDED
```diff
@@ -0,0 +1,146 @@
+Metadata-Version: 2.4
+Name: snowpark-checkpoints-collectors
+Version: 0.1.0rc3
+Summary: Snowpark column and table statistics collection
+Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
+Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
+Author-email: "Snowflake, Inc." <snowflake-python-libraries-dl@snowflake.com>
+License: Apache License, Version 2.0
+License-File: LICENSE
+Keywords: Snowflake,Snowpark,analytics,cloud,database,db
+Classifier: Development Status :: 4 - Beta
+Classifier: Environment :: Console
+Classifier: Environment :: Other Environment
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Education
+Classifier: Intended Audience :: Information Technology
+Classifier: Intended Audience :: System Administrators
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: SQL
+Classifier: Topic :: Database
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Software Development
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: <3.12,>=3.9
+Requires-Dist: pandera[io]==0.20.4
+Requires-Dist: pyspark
+Requires-Dist: snowflake-connector-python
+Requires-Dist: snowflake-snowpark-python
+Provides-Extra: development
+Requires-Dist: coverage>=7.6.7; extra == 'development'
+Requires-Dist: deepdiff>=8.0.0; extra == 'development'
+Requires-Dist: hatchling==1.25.0; extra == 'development'
+Requires-Dist: pre-commit>=4.0.1; extra == 'development'
+Requires-Dist: pyarrow>=18.0.0; extra == 'development'
+Requires-Dist: pytest-cov>=6.0.0; extra == 'development'
+Requires-Dist: pytest>=8.3.3; extra == 'development'
+Requires-Dist: setuptools>=70.0.0; extra == 'development'
+Requires-Dist: twine==5.1.1; extra == 'development'
+Description-Content-Type: text/markdown
+
+# snowpark-checkpoints-collectors
+
+---
+**NOTE**
+
+This package is on Private Preview.
+
+---
+
+**snowpark-checkpoints-collector** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
+## Features
+
+- Schema inference collected data mode (Schema): This is the default mode, which leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe. This mode also collects custom data from columns of the DataFrame based on the PySpark type.
+- DataFrame collected data mode (DataFrame): This mode collects the data of the PySpark dataframe. In this case, the mechanism saves all data of the given dataframe in parquet format. Using the default user Snowflake connection, it tries to upload the parquet files into the Snowflake temporal stage and create a table based on the information in the stage. The name of the file and the table is the same as the checkpoint.
+
+
+
+## Functionalities
+
+### Collect DataFrame Checkpoint
+
+
+
+```python
+from pyspark.sql import DataFrame as SparkDataFrame
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from typing import Optional
+
+# Signature of the function
+def collect_dataframe_checkpoint(
+    df: SparkDataFrame,
+    checkpoint_name: str,
+    sample: Optional[float] = None,
+    mode: Optional[CheckpointMode] = None,
+    output_path: Optional[str] = None,
+) -> None:
+    ...
+```
+
+- `df`: The input Spark dataframe to collect.
+- `checkpoint_name`: Name of the checkpoint schema file or dataframe.
+- `sample`: Fraction of DataFrame to sample for schema inference, defaults to 1.0.
+- `mode`: The mode to execution the collection (Schema or Dataframe), defaults to CheckpointMode.Schema.
+- `output_path`: The output path to save the checkpoint, defaults to current working directory.
+
+
+## Usage Example
+
+### Schema mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+
+spark_session = SparkSession.builder.getOrCreate()
+sample_size = 1.0
+
+pyspark_df = spark_session.createDataFrame(
+    [("apple", 21), ("lemon", 34), ("banana", 50)], schema="fruit string, age integer"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_1",
+    sample=sample_size,
+    mode=CheckpointMode.SCHEMA,
+)
+```
+
+
+### Dataframe mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from pyspark.sql.types import StructType, StructField, ByteType, StringType, IntegerType
+
+spark_schema = StructType(
+    [
+        StructField("BYTE", ByteType(), True),
+        StructField("STRING", StringType(), True),
+        StructField("INTEGER", IntegerType(), True)
+    ]
+)
+
+data = [(1, "apple", 21), (2, "lemon", 34), (3, "banana", 50)]
+
+spark_session = SparkSession.builder.getOrCreate()
+pyspark_df = spark_session.createDataFrame(data, schema=spark_schema).orderBy(
+    "INTEGER"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_2",
+    mode=CheckpointMode.DATAFRAME,
+)
+```
+
+------
```
snowpark_checkpoints_collectors-0.1.0rc3/README.md
ADDED
```diff
@@ -0,0 +1,102 @@
+# snowpark-checkpoints-collectors
+
+---
+**NOTE**
+
+This package is on Private Preview.
+
+---
+
+**snowpark-checkpoints-collector** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
+## Features
+
+- Schema inference collected data mode (Schema): This is the default mode, which leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe. This mode also collects custom data from columns of the DataFrame based on the PySpark type.
+- DataFrame collected data mode (DataFrame): This mode collects the data of the PySpark dataframe. In this case, the mechanism saves all data of the given dataframe in parquet format. Using the default user Snowflake connection, it tries to upload the parquet files into the Snowflake temporal stage and create a table based on the information in the stage. The name of the file and the table is the same as the checkpoint.
+
+
+
+## Functionalities
+
+### Collect DataFrame Checkpoint
+
+
+
+```python
+from pyspark.sql import DataFrame as SparkDataFrame
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from typing import Optional
+
+# Signature of the function
+def collect_dataframe_checkpoint(
+    df: SparkDataFrame,
+    checkpoint_name: str,
+    sample: Optional[float] = None,
+    mode: Optional[CheckpointMode] = None,
+    output_path: Optional[str] = None,
+) -> None:
+    ...
+```
+
+- `df`: The input Spark dataframe to collect.
+- `checkpoint_name`: Name of the checkpoint schema file or dataframe.
+- `sample`: Fraction of DataFrame to sample for schema inference, defaults to 1.0.
+- `mode`: The mode to execution the collection (Schema or Dataframe), defaults to CheckpointMode.Schema.
+- `output_path`: The output path to save the checkpoint, defaults to current working directory.
+
+
+## Usage Example
+
+### Schema mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+
+spark_session = SparkSession.builder.getOrCreate()
+sample_size = 1.0
+
+pyspark_df = spark_session.createDataFrame(
+    [("apple", 21), ("lemon", 34), ("banana", 50)], schema="fruit string, age integer"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_1",
+    sample=sample_size,
+    mode=CheckpointMode.SCHEMA,
+)
+```
+
+
+### Dataframe mode
+
+```python
+from pyspark.sql import SparkSession
+from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+from pyspark.sql.types import StructType, StructField, ByteType, StringType, IntegerType
+
+spark_schema = StructType(
+    [
+        StructField("BYTE", ByteType(), True),
+        StructField("STRING", StringType(), True),
+        StructField("INTEGER", IntegerType(), True)
+    ]
+)
+
+data = [(1, "apple", 21), (2, "lemon", 34), (3, "banana", 50)]
+
+spark_session = SparkSession.builder.getOrCreate()
+pyspark_df = spark_session.createDataFrame(data, schema=spark_schema).orderBy(
+    "INTEGER"
+)
+
+collect_dataframe_checkpoint(
+    pyspark_df,
+    checkpoint_name="collect_checkpoint_mode_2",
+    mode=CheckpointMode.DATAFRAME,
+)
+```
+
+------
```
{snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/pyproject.toml
RENAMED
```diff
@@ -3,7 +3,9 @@ build-backend = "hatchling.build"
 requires = ["hatchling"]
 
 [project]
-authors = [
+authors = [
+    {name = "Snowflake, Inc.", email = "snowflake-python-libraries-dl@snowflake.com"},
+]
 classifiers = [
     "Development Status :: 4 - Beta",
     "Environment :: Console",
@@ -30,6 +32,7 @@ dependencies = [
     "pandera[io]==0.20.4",
 ]
 description = "Snowpark column and table statistics collection"
+dynamic = ['version']
 keywords = [
     'Snowflake',
     'analytics',
@@ -38,11 +41,10 @@ keywords = [
     'db',
     'Snowpark',
 ]
-license = {
+license = {text = "Apache License, Version 2.0"}
 name = "snowpark-checkpoints-collectors"
 readme = "README.md"
 requires-python = '>=3.9,<3.12'
-dynamic = ['version']
 
 [project.optional-dependencies]
 development = [
@@ -113,7 +115,6 @@ exclude_lines = [
     "if __name__ == .__main__.:",
 ]
 
-
 [tool.hatch.envs.linter.scripts]
 check = [
     'ruff check --fix .',
@@ -121,7 +122,7 @@ check = [
 
 [tool.hatch.envs.test.scripts]
 check = [
-    "pip install -e ../snowpark-checkpoints-configuration"
+    "pip install -e ../snowpark-checkpoints-configuration",
     'pytest -v --junitxml=test/outcome/test-results.xml --cov=. --cov-config=test/.coveragerc --cov-report=xml:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.xml {args:test} --cov-report=term --cov-report=json:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.json',
 ]
 
```
{snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/__init__.py
RENAMED
```diff
@@ -2,9 +2,10 @@
 # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
 #
 
-__all__ = ["collect_dataframe_checkpoint", "
+__all__ = ["collect_dataframe_checkpoint", "CheckpointMode"]
 
-from snowflake.snowpark_checkpoints_collector.singleton import Singleton
 from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
     collect_dataframe_checkpoint,
 )
+
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
```
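With this change, `CheckpointMode` is re-exported from the package root alongside `collect_dataframe_checkpoint`, so callers no longer need to reach into `collection_common`. A minimal sketch of the resulting import surface; the Spark session, dataframe, and checkpoint name below are illustrative, not part of the package:

```python
from pyspark.sql import SparkSession

# Both names appear in the new __all__ shown above.
from snowflake.snowpark_checkpoints_collector import (
    CheckpointMode,
    collect_dataframe_checkpoint,
)

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("apple", 21)], schema="fruit string, age integer")

# Hypothetical checkpoint name; Schema mode is the documented default.
collect_dataframe_checkpoint(df, checkpoint_name="demo_checkpoint", mode=CheckpointMode.SCHEMA)
```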
{snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/collection_common.py
RENAMED
```diff
@@ -8,8 +8,13 @@ from enum import IntEnum
 
 
 class CheckpointMode(IntEnum):
+
+    """Enum class representing the collection mode."""
+
     SCHEMA = 1
+    """Collect automatic schema inference"""
     DATAFRAME = 2
+    """Export DataFrame as Parquet file to Snowflake"""
 
 
 # CONSTANTS
@@ -76,11 +81,13 @@ COLUMN_IS_UNIQUE_SIZE_KEY = "is_unique_size"
 COLUMN_KEY_TYPE_KEY = "key_type"
 COLUMN_MARGIN_ERROR_KEY = "margin_error"
 COLUMN_MAX_KEY = "max"
+COLUMN_MAX_LENGTH_KEY = "max_length"
 COLUMN_MAX_SIZE_KEY = "max_size"
 COLUMN_MEAN_KEY = "mean"
 COLUMN_MEAN_SIZE_KEY = "mean_size"
 COLUMN_METADATA_KEY = "metadata"
 COLUMN_MIN_KEY = "min"
+COLUMN_MIN_LENGTH_KEY = "min_length"
 COLUMN_MIN_SIZE_KEY = "min_size"
 COLUMN_NAME_KEY = "name"
 COLUMN_NULL_COUNT_KEY = "null_count"
@@ -90,6 +97,7 @@ COLUMN_ROWS_NULL_COUNT_KEY = "rows_null_count"
 COLUMN_SIZE_KEY = "size"
 COLUMN_TRUE_COUNT_KEY = "true_count"
 COLUMN_TYPE_KEY = "type"
+COLUMN_VALUE_KEY = "value"
 COLUMN_VALUE_TYPE_KEY = "value_type"
 COLUMNS_KEY = "columns"
 
@@ -121,6 +129,8 @@ UNKNOWN_SOURCE_FILE = "unknown"
 UNKNOWN_LINE_OF_CODE = -1
 BACKSLASH_TOKEN = "\\"
 SLASH_TOKEN = "/"
+PYSPARK_NONE_SIZE_VALUE = -1
+PANDAS_LONG_TYPE = "Int64"
 
 # ENVIRONMENT VARIABLES
 SNOWFLAKE_CHECKPOINT_CONTRACT_FILE_PATH_ENV_VAR = (
```
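The new `COLUMN_MIN_LENGTH_KEY` / `COLUMN_MAX_LENGTH_KEY` keys line up with the replaced `string_column_collector.py` listed above (+59 lines). As a rough illustration of the kind of statistic those keys name, here is a standalone PySpark snippet that computes minimum and maximum string lengths for a column; it is only a sketch of the idea, not the package's implementation, and the dataframe is made up:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, length
from pyspark.sql.functions import max as spark_max
from pyspark.sql.functions import min as spark_min

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("apple",), ("fig",), (None,)], schema="fruit string")

# min()/max() ignore NULL values, so only the non-null strings are measured.
row = df.select(
    spark_min(length(col("fruit"))).alias("min_length"),
    spark_max(length(col("fruit"))).alias("max_length"),
).collect()[0]

print(row["min_length"], row["max_length"])  # 3 5
```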
{snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py
RENAMED
```diff
@@ -5,10 +5,10 @@ import json
 
 from typing import Optional
 
-from snowflake.snowpark_checkpoints_collector import Singleton
 from snowflake.snowpark_checkpoints_collector.collection_result.model import (
     CollectionPointResult,
 )
+from snowflake.snowpark_checkpoints_collector.singleton import Singleton
 from snowflake.snowpark_checkpoints_collector.utils import file_utils
 
 
```
{snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py
RENAMED
```diff
@@ -1,7 +1,7 @@
 #
 # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
 #
-from
+from pyspark.sql import DataFrame as SparkDataFrame
 from pyspark.sql.types import StructField
 
 from snowflake.snowpark_checkpoints_collector.collection_common import (
@@ -88,14 +88,14 @@ class ColumnCollectorManager:
     """Manage class for column collector based on type."""
 
     def collect_column(
-        self, clm_name: str, struct_field: StructField, values:
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         """Collect the data of the column based on the column type.
 
         Args:
             clm_name (str): the name of the column.
             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
-            values (
+            values (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
 
         Returns:
             dict[str, any]: The data collected.
@@ -112,7 +112,7 @@ class ColumnCollectorManager:
 
     @column_register(ARRAY_COLUMN_TYPE)
     def _collect_array_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values:
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = ArrayColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -120,7 +120,7 @@ class ColumnCollectorManager:
 
     @column_register(BINARY_COLUMN_TYPE)
     def _collect_binary_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values:
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = BinaryColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -128,7 +128,7 @@ class ColumnCollectorManager:
 
     @column_register(BOOLEAN_COLUMN_TYPE)
     def _collect_boolean_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values:
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = BooleanColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -136,7 +136,7 @@ class ColumnCollectorManager:
 
     @column_register(DATE_COLUMN_TYPE)
     def _collect_date_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values:
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = DateColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -144,7 +144,7 @@ class ColumnCollectorManager:
 
     @column_register(DAYTIMEINTERVAL_COLUMN_TYPE)
     def _collect_day_time_interval_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values:
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = DayTimeIntervalColumnCollector(
             clm_name, struct_field, values
@@ -154,7 +154,7 @@ class ColumnCollectorManager:
 
     @column_register(DECIMAL_COLUMN_TYPE)
     def _collect_decimal_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values:
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = DecimalColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -162,7 +162,7 @@ class ColumnCollectorManager:
 
     @column_register(MAP_COLUMN_TYPE)
     def _collect_map_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values:
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = MapColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -170,7 +170,7 @@ class ColumnCollectorManager:
 
     @column_register(NULL_COLUMN_TYPE)
     def _collect_null_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values:
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = NullColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -185,7 +185,7 @@ class ColumnCollectorManager:
         DOUBLE_COLUMN_TYPE,
     )
     def _collect_numeric_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values:
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = NumericColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -193,7 +193,7 @@ class ColumnCollectorManager:
 
     @column_register(STRING_COLUMN_TYPE)
     def _collect_string_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values:
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = StringColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -201,7 +201,7 @@ class ColumnCollectorManager:
 
     @column_register(STRUCT_COLUMN_TYPE)
     def _collect_struct_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values:
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = StructColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -209,7 +209,7 @@ class ColumnCollectorManager:
 
     @column_register(TIMESTAMP_COLUMN_TYPE)
     def _collect_timestamp_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values:
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = TimestampColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
@@ -217,21 +217,21 @@ class ColumnCollectorManager:
 
     @column_register(TIMESTAMP_NTZ_COLUMN_TYPE)
     def _collect_timestampntz_type_custom_data(
-        self, clm_name: str, struct_field: StructField, values:
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         column_collector = TimestampNTZColumnCollector(clm_name, struct_field, values)
         collected_data = column_collector.get_data()
         return collected_data
 
     def collect_empty_custom_data(
-        self, clm_name: str, struct_field: StructField, values:
+        self, clm_name: str, struct_field: StructField, values: SparkDataFrame
     ) -> dict[str, any]:
         """Collect the data of a empty column.
 
         Args:
             clm_name (str): the name of the column.
             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
-            values (
+            values (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
 
         Returns:
             dict[str, any]: The data collected.
```
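These hunks only change the type hints, but they also show how the manager works: each `_collect_*_type_custom_data` method is registered for one or more PySpark type names via the `@column_register(...)` decorator and then dispatched by column type. The decorator's implementation is not part of this diff, so the following is only a sketch of that general registration pattern; the registry name `_COLLECTORS` and the toy `collect_array_column` function are made up for illustration:

```python
from typing import Any, Callable

# Hypothetical registry mapping a column type name to its collector function.
_COLLECTORS: dict[str, Callable[..., dict[str, Any]]] = {}


def column_register(*column_types: str):
    """Register a collector function under one or more column type names."""

    def wrapper(func: Callable[..., dict[str, Any]]) -> Callable[..., dict[str, Any]]:
        for column_type in column_types:
            _COLLECTORS[column_type] = func
        return func

    return wrapper


@column_register("array")
def collect_array_column(clm_name: str) -> dict[str, Any]:
    # A real collector would build an ArrayColumnCollector and call get_data().
    return {"name": clm_name, "type": "array"}


# Dispatch by the column's type name, as collect_column() does in the manager.
print(_COLLECTORS["array"]("FRUIT_LIST"))
```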
{snowpark_checkpoints_collectors-0.1.0rc1 → snowpark_checkpoints_collectors-0.1.0rc3}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py
RENAMED
```diff
@@ -3,7 +3,12 @@
 #
 from statistics import mean
 
-from
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.functions import array as spark_array
+from pyspark.sql.functions import coalesce as spark_coalesce
+from pyspark.sql.functions import col as spark_col
+from pyspark.sql.functions import explode as spark_explode
+from pyspark.sql.functions import size as spark_size
 from pyspark.sql.types import StructField
 
 from snowflake.snowpark_checkpoints_collector.collection_common import (
@@ -13,6 +18,8 @@ from snowflake.snowpark_checkpoints_collector.collection_common import (
     COLUMN_MEAN_SIZE_KEY,
     COLUMN_MIN_SIZE_KEY,
     COLUMN_NULL_VALUE_PROPORTION_KEY,
+    COLUMN_SIZE_KEY,
+    COLUMN_VALUE_KEY,
     COLUMN_VALUE_TYPE_KEY,
     CONTAINS_NULL_KEY,
     ELEMENT_TYPE_KEY,
@@ -30,22 +37,22 @@ class ArrayColumnCollector(ColumnCollectorBase):
         name (str): the name of the column.
         type (str): the type of the column.
         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
-
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
 
     """
 
     def __init__(
-        self, clm_name: str, struct_field: StructField,
+        self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
     ) -> None:
         """Init ArrayColumnCollector.
 
         Args:
             clm_name (str): the name of the column.
             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
-
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
 
         """
-        super().__init__(clm_name, struct_field,
+        super().__init__(clm_name, struct_field, clm_df)
         self._array_size_collection = self._compute_array_size_collection()
 
     def get_custom_data(self) -> dict[str, any]:
@@ -73,23 +80,22 @@ class ArrayColumnCollector(ColumnCollectorBase):
         return custom_data_dict
 
     def _compute_array_size_collection(self) -> list[int]:
-
-
-
-
+        select_result = self.column_df.select(
+            spark_size(spark_coalesce(spark_col(self.name), spark_array([]))).alias(
+                COLUMN_SIZE_KEY
+            )
+        ).collect()
 
-
-            size_collection.append(length)
+        size_collection = [row[COLUMN_SIZE_KEY] for row in select_result]
 
         return size_collection
 
     def _compute_null_value_proportion(self) -> float:
-
-
-
-                continue
+        select_result = self.column_df.select(
+            spark_explode(spark_col(self.name)).alias(COLUMN_VALUE_KEY)
+        )
 
-
+        null_counter = select_result.where(spark_col(COLUMN_VALUE_KEY).isNull()).count()
 
         total_values = sum(self._array_size_collection)
         null_value_proportion = (null_counter / total_values) * 100
```