snowpark-checkpoints-collectors 0.1.3__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/PKG-INFO +17 -3
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/README.md +12 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/pyproject.toml +5 -2
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/__init__.py +10 -2
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/__version__.py +1 -1
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +5 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +23 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +18 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +38 -9
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +111 -68
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/utils/extra_config.py +9 -2
- snowpark_checkpoints_collectors-0.2.0/src/snowflake/snowpark_checkpoints_collector/utils/logging_utils.py +67 -0
- snowpark_checkpoints_collectors-0.2.0/test/integ/test_checkpoint_name.py +74 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1.py +89 -65
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_2.py +34 -17
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collection_result_file.py +25 -10
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/unit/test_collection_point_result_manager.py +31 -6
- snowpark_checkpoints_collectors-0.2.0/test/unit/test_logger.py +132 -0
- snowpark_checkpoints_collectors-0.2.0/test/unit/test_logging_utils.py +132 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/unit/test_snow_connection.py +1 -1
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/unit/test_summary_stats_collector.py +32 -2
- snowpark_checkpoints_collectors-0.1.3/test/integ/test_checkpoint_name.py +0 -51
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/.gitignore +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/CHANGELOG.md +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/LICENSE +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/snowpark-testdf-schema.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/collection_common.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/singleton.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/utils/file_utils.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/.coveragerc +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/telemetry_compare_utils.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_full_df.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_parquet_directory _telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_2_expected/test_collect_empty_dataframe_with_schema_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_2_expected/test_collect_invalid_mode_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_2_expected/test_generate_parquet_for_spark_df_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_2_expected/test_spark_df_mode_dataframe_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_snow_connection_int.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/unit/test_checkpoint_name_utils.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/unit/test_collection_point_result.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/unit/test_column_collection.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/unit/test_extra_config.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/unit/test_file_utils.py +0 -0
- {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/unit/test_pandera_column_check_manager.py +0 -0
{snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: snowpark-checkpoints-collectors
-Version: 0.1.3
+Version: 0.2.0
 Summary: Snowpark column and table statistics collection
 Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
 Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
@@ -27,19 +27,21 @@ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: <3.12,>=3.9
 Requires-Dist: pandera[io]==0.20.4
-Requires-Dist: pyspark
 Requires-Dist: snowflake-connector-python
-Requires-Dist: snowflake-snowpark-python
+Requires-Dist: snowflake-snowpark-python>=1.23.0
 Provides-Extra: development
 Requires-Dist: coverage>=7.6.7; extra == 'development'
 Requires-Dist: deepdiff>=8.0.0; extra == 'development'
 Requires-Dist: hatchling==1.25.0; extra == 'development'
 Requires-Dist: pre-commit>=4.0.1; extra == 'development'
 Requires-Dist: pyarrow>=18.0.0; extra == 'development'
+Requires-Dist: pyspark>=3.5.0; extra == 'development'
 Requires-Dist: pytest-cov>=6.0.0; extra == 'development'
 Requires-Dist: pytest>=8.3.3; extra == 'development'
 Requires-Dist: setuptools>=70.0.0; extra == 'development'
 Requires-Dist: twine==5.1.1; extra == 'development'
+Provides-Extra: pyspark
+Requires-Dist: pyspark>=3.5.0; extra == 'pyspark'
 Description-Content-Type: text/markdown
 
 # snowpark-checkpoints-collectors
@@ -50,6 +52,18 @@ Description-Content-Type: text/markdown
 ---
 
 **snowpark-checkpoints-collector** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
+
+---
+## Install the library
+```bash
+pip install snowpark-checkpoints-collectors
+```
+This package requires PySpark to be installed in the same environment. If you do not have it, you can install PySpark alongside Snowpark Checkpoints by running the following command:
+```bash
+pip install "snowpark-checkpoints-collectors[pyspark]"
+```
+---
+
 ## Features
 
 - Schema inference collected data mode (Schema): This is the default mode, which leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe. This mode also collects custom data from columns of the DataFrame based on the PySpark type.
{snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/README.md
RENAMED
@@ -6,6 +6,18 @@
 ---
 
 **snowpark-checkpoints-collector** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
+
+---
+## Install the library
+```bash
+pip install snowpark-checkpoints-collectors
+```
+This package requires PySpark to be installed in the same environment. If you do not have it, you can install PySpark alongside Snowpark Checkpoints by running the following command:
+```bash
+pip install "snowpark-checkpoints-collectors[pyspark]"
+```
+---
+
 ## Features
 
 - Schema inference collected data mode (Schema): This is the default mode, which leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe. This mode also collects custom data from columns of the DataFrame based on the PySpark type.
{snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/pyproject.toml
RENAMED
@@ -26,9 +26,8 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Information Analysis",
 ]
 dependencies = [
-    "snowflake-snowpark-python",
+    "snowflake-snowpark-python>=1.23.0",
     "snowflake-connector-python",
-    "pyspark",
     "pandera[io]==0.20.4",
 ]
 description = "Snowpark column and table statistics collection"
@@ -47,6 +46,9 @@ readme = "README.md"
 requires-python = '>=3.9,<3.12'
 
 [project.optional-dependencies]
+pyspark = [
+    "pyspark>=3.5.0",
+]
 development = [
     "pytest>=8.3.3",
     "pytest-cov>=6.0.0",
@@ -57,6 +59,7 @@ development = [
     "setuptools>=70.0.0",
     "pyarrow>=18.0.0",
     "deepdiff>=8.0.0",
+    "pyspark>=3.5.0",
 ]
 
 [project.urls]
{snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/__init__.py
RENAMED
@@ -13,10 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
+
+
+# Add a NullHandler to prevent logging messages from being output to
+# sys.stderr if no logging configuration is provided.
+logging.getLogger(__name__).addHandler(logging.NullHandler())
+
+# ruff: noqa: E402
+
 __all__ = ["collect_dataframe_checkpoint", "CheckpointMode"]
 
+from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
 from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
     collect_dataframe_checkpoint,
 )
-
-from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
{snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py
RENAMED
@@ -12,7 +12,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import json
+import logging
 
 from typing import Optional
 
@@ -24,6 +26,7 @@ from snowflake.snowpark_checkpoints_collector.utils import file_utils
 
 
 RESULTS_KEY = "results"
+LOGGER = logging.getLogger(__name__)
 
 
 class CollectionPointResultManager(metaclass=Singleton):
@@ -49,6 +52,7 @@ class CollectionPointResultManager(metaclass=Singleton):
 
         """
         result_json = result.get_collection_result_data()
+        LOGGER.debug("Adding a new collection result: %s", result_json)
         self.result_collection.append(result_json)
         self._save_result()
 
@@ -65,5 +69,6 @@
 
     def _save_result(self) -> None:
         result_collection_json = self.to_json()
+        LOGGER.info("Saving collection results to '%s'", self.output_file_path)
         with open(self.output_file_path, "w") as f:
             f.write(result_collection_json)
{snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py
RENAMED
@@ -12,6 +12,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+import logging
+
 from pyspark.sql import DataFrame as SparkDataFrame
 from pyspark.sql.types import StructField
 
@@ -53,6 +56,9 @@ from snowflake.snowpark_checkpoints_collector.column_collection.model import (
 )
 
 
+LOGGER = logging.getLogger(__name__)
+
+
 def collector_register(cls):
     """Decorate a class with the collection type mechanism.
 
@@ -63,6 +69,7 @@
         The class to decorate.
 
     """
+    LOGGER.debug("Starting to register collectors from class %s", cls.__name__)
     cls._collectors = {}
     for method_name in dir(cls):
         method = getattr(cls, method_name)
@@ -70,6 +77,11 @@
             col_type_collection = method._column_type
             for col_type in col_type_collection:
                 cls._collectors[col_type] = method_name
+                LOGGER.debug(
+                    "Registered collector '%s' for column type '%s'",
+                    method_name,
+                    col_type,
+                )
     return cls
 
 
@@ -114,10 +126,21 @@ class ColumnCollectorManager:
         """
         clm_type = struct_field.dataType.typeName()
         if clm_type not in self._collectors:
+            LOGGER.debug(
+                "No collectors found for column '%s' of type '%s'. Skipping collection for this column.",
+                clm_name,
+                clm_type,
+            )
             return {}
 
        func_name = self._collectors[clm_type]
        func = getattr(self, func_name)
+        LOGGER.debug(
+            "Collecting custom data for column '%s' of type '%s' using collector method '%s'",
+            clm_name,
+            clm_type,
+            func_name,
+        )
         data = func(clm_name, struct_field, values)
         return data
 
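
The registration these debug lines trace is a two-decorator pattern: `column_register` stamps a method with the column types it handles, and `collector_register` scans the class to build a type-to-method dispatch table in `cls._collectors`. A stripped-down sketch of the mechanism (only `_collectors`, `_column_type`, and the decorator names are taken from the diff; the rest is illustrative):

```python
def column_register(*column_types):
    # Stamp the method with the PySpark column types it can collect.
    def decorator(method):
        method._column_type = column_types
        return method
    return decorator


def collector_register(cls):
    # Build the {column_type: method_name} dispatch table on the class.
    cls._collectors = {}
    for method_name in dir(cls):
        method = getattr(cls, method_name)
        if hasattr(method, "_column_type"):
            for col_type in method._column_type:
                cls._collectors[col_type] = method_name
    return cls


@collector_register
class DemoCollectorManager:
    @column_register("boolean")
    def _collect_boolean(self, clm_name, struct_field, values):
        return {"name": clm_name, "type": "boolean"}


assert DemoCollectorManager._collectors == {"boolean": "_collect_boolean"}
```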
{snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py
RENAMED
@@ -12,6 +12,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+import logging
+
 import pandas as pd
 
 from pandera import Check, Column
@@ -39,6 +42,9 @@ from snowflake.snowpark_checkpoints_collector.collection_common import (
 )
 
 
+LOGGER = logging.getLogger(__name__)
+
+
 def collector_register(cls):
     """Decorate a class with the checks mechanism.
 
@@ -49,6 +55,7 @@
         The class to decorate.
 
     """
+    LOGGER.debug("Starting to register checks from class %s", cls.__name__)
     cls._collectors = {}
     for method_name in dir(cls):
         method = getattr(cls, method_name)
@@ -56,6 +63,9 @@
             col_type_collection = method._column_type
             for col_type in col_type_collection:
                 cls._collectors[col_type] = method_name
+                LOGGER.debug(
+                    "Registered check '%s' for column type '%s'", method_name, col_type
+                )
     return cls
 
 
@@ -101,10 +111,18 @@ class PanderaColumnChecksManager:
 
         """
         if clm_type not in self._collectors:
+            LOGGER.debug(
+                "No Pandera checks found for column '%s' of type '%s'. Skipping checks for this column.",
+                clm_name,
+                clm_type,
+            )
             return
 
         func_name = self._collectors[clm_type]
         func = getattr(self, func_name)
+        LOGGER.debug(
+            "Adding Pandera checks to column '%s' of type '%s'", clm_name, clm_type
+        )
         func(clm_name, pyspark_df, pandera_column)
 
     @column_register(BOOLEAN_COLUMN_TYPE)
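
Each registered method attaches type-specific `pandera` checks to the `Column` being built for the inferred schema. The concrete checks live in the collector methods themselves; purely as an illustration of the `pandera` API in play (the `isin` check is an assumption, not read from the diff):

```python
import pandas as pd
from pandera import Check, Column, DataFrameSchema

# A boolean column with one collected check attached.
schema = DataFrameSchema(
    {"is_active": Column(bool, checks=[Check.isin([True, False])])}
)

# Passes silently; a violation raises pandera.errors.SchemaError.
schema.validate(pd.DataFrame({"is_active": [True, False, True]}))
```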
{snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py
RENAMED
@@ -12,7 +12,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import glob
+import logging
 import os.path
 import time
 
@@ -30,6 +32,7 @@ CREATE_STAGE_STATEMENT_FORMAT = "CREATE TEMP STAGE IF NOT EXISTS {}"
 REMOVE_STAGE_FOLDER_STATEMENT_FORMAT = "REMOVE {}"
 STAGE_PATH_FORMAT = "'@{}/{}'"
 PUT_FILE_IN_STAGE_STATEMENT_FORMAT = "PUT '{}' {} AUTO_COMPRESS=FALSE"
+LOGGER = logging.getLogger(__name__)
 
 
 class SnowConnection:
@@ -41,14 +44,16 @@ class SnowConnection:
 
     """
 
-    def __init__(self, session: Session = None) -> None:
+    def __init__(self, session: Optional[Session] = None) -> None:
         """Init SnowConnection.
 
         Args:
             session (Snowpark.Session): the Snowpark session.
 
         """
-        self.session =
+        self.session = (
+            session if session is not None else self._create_snowpark_session()
+        )
         self.stage_id = int(time.time())
 
     def create_snowflake_table_from_local_parquet(
@@ -84,8 +89,8 @@
                 stage_name, stage_path, input_path, is_parquet_file
             )
             self.create_table_from_parquet(table_name, stage_directory_path)
-
         finally:
+            LOGGER.info("Removing stage folder %s", stage_directory_path)
             self.session.sql(
                 REMOVE_STAGE_FOLDER_STATEMENT_FORMAT.format(stage_directory_path)
             ).collect()
@@ -98,6 +103,7 @@
 
         """
         create_stage_statement = CREATE_STAGE_STATEMENT_FORMAT.format(stage_name)
+        LOGGER.info("Creating temporal stage '%s'", stage_name)
         self.session.sql(create_stage_statement).collect()
 
     def load_files_to_stage(
@@ -105,7 +111,7 @@
         stage_name: str,
         folder_name: str,
         input_path: str,
-        filter_func: Callable = None,
+        filter_func: Optional[Callable] = None,
     ) -> None:
         """Load files to a stage in Snowflake.
 
@@ -116,6 +122,7 @@
             filter_func (Callable): the filter function to apply to the files.
 
         """
+        LOGGER.info("Starting to load files to '%s'", stage_name)
         input_path = (
             os.path.abspath(input_path)
             if not os.path.isabs(input_path)
@@ -126,16 +133,20 @@
             return os.path.isfile(name) and (filter_func(name) if filter_func else True)
 
         target_dir = os.path.join(input_path, "**", "*")
+        LOGGER.debug("Searching for files in '%s'", input_path)
         files_collection = glob.glob(target_dir, recursive=True)
 
         files = [file for file in files_collection if filter_files(file)]
+        files_count = len(files)
 
-        if
+        if files_count == 0:
             raise Exception(f"No files were found in the input directory: {input_path}")
 
+        LOGGER.debug("Found %s files in '%s'", files_count, input_path)
+
         for file in files:
             # if file is relative path, convert to absolute path
-            # if absolute path, then try to resolve as some Win32 paths are
+            # if absolute path, then try to resolve as some Win32 paths are not in LPN.
             file_full_path = (
                 str(os.path.abspath(file))
                 if not os.path.isabs(file)
@@ -150,6 +161,7 @@
             put_statement = PUT_FILE_IN_STAGE_STATEMENT_FORMAT.format(
                 normalize_file_path, stage_file_path
             )
+            LOGGER.info("Loading file '%s' to %s", file_full_path, stage_file_path)
             self.session.sql(put_statement).collect()
 
     def create_table_from_parquet(
@@ -165,8 +177,25 @@
             Exception: No parquet files were found in the stage
 
         """
-
-
-
+        LOGGER.info("Starting to create table '%s' from parquet files", table_name)
+        parquet_files = self.session.sql(
+            f"LIST {stage_directory_path} PATTERN='.*{DOT_PARQUET_EXTENSION}'"
+        ).collect()
+        parquet_files_count = len(parquet_files)
+        if parquet_files_count == 0:
+            raise Exception(
+                f"No parquet files were found in the stage: {stage_directory_path}"
+            )
+
+        LOGGER.info(
+            "Reading %s parquet files from %s",
+            parquet_files_count,
+            stage_directory_path,
+        )
         dataframe = self.session.read.parquet(path=stage_directory_path)
+        LOGGER.info("Creating table '%s' from parquet files", table_name)
         dataframe.write.save_as_table(table_name=table_name, mode="overwrite")
+
+    def _create_snowpark_session(self) -> Session:
+        LOGGER.info("Creating a Snowpark session using the default connection")
+        return Session.builder.getOrCreate()
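
The net effect of the `SnowConnection` changes: the session argument is now genuinely optional, with the new `_create_snowpark_session` helper falling back to `Session.builder.getOrCreate()`. A usage sketch (the re-export of `SnowConnection` from `snow_connection_model` is an assumption based on the file list above):

```python
from snowflake.snowpark import Session
# Import path taken from the file list; the re-export from the
# snow_connection_model package __init__ is assumed, not confirmed.
from snowflake.snowpark_checkpoints_collector.snow_connection_model import (
    SnowConnection,
)

# Explicit session, as in 0.1.3:
conn = SnowConnection(session=Session.builder.getOrCreate())

# New in 0.2.0: omit the session and SnowConnection creates one itself
# via Session.builder.getOrCreate() (logged at INFO level).
conn = SnowConnection()
```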