snowpark-checkpoints-collectors 0.1.3__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/PKG-INFO +17 -3
  2. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/README.md +12 -0
  3. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/pyproject.toml +5 -2
  4. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/__init__.py +10 -2
  5. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/__version__.py +1 -1
  6. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +5 -0
  7. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +23 -0
  8. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +18 -0
  9. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +38 -9
  10. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +111 -68
  11. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/utils/extra_config.py +9 -2
  12. snowpark_checkpoints_collectors-0.2.0/src/snowflake/snowpark_checkpoints_collector/utils/logging_utils.py +67 -0
  13. snowpark_checkpoints_collectors-0.2.0/test/integ/test_checkpoint_name.py +74 -0
  14. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1.py +89 -65
  15. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_2.py +34 -17
  16. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collection_result_file.py +25 -10
  17. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/unit/test_collection_point_result_manager.py +31 -6
  18. snowpark_checkpoints_collectors-0.2.0/test/unit/test_logger.py +132 -0
  19. snowpark_checkpoints_collectors-0.2.0/test/unit/test_logging_utils.py +132 -0
  20. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/unit/test_snow_connection.py +1 -1
  21. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/unit/test_summary_stats_collector.py +32 -2
  22. snowpark_checkpoints_collectors-0.1.3/test/integ/test_checkpoint_name.py +0 -51
  23. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/.gitignore +0 -0
  24. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/CHANGELOG.md +0 -0
  25. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/LICENSE +0 -0
  26. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/snowpark-testdf-schema.json +0 -0
  27. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/collection_common.py +0 -0
  28. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +0 -0
  29. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +0 -0
  30. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +0 -0
  31. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +0 -0
  32. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +0 -0
  33. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +0 -0
  34. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +0 -0
  35. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +0 -0
  36. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +0 -0
  37. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +0 -0
  38. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +0 -0
  39. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +0 -0
  40. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +0 -0
  41. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +0 -0
  42. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +0 -0
  43. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +0 -0
  44. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +0 -0
  45. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +0 -0
  46. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +0 -0
  47. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +0 -0
  48. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/singleton.py +0 -0
  49. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +0 -0
  50. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +0 -0
  51. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/utils/file_utils.py +0 -0
  52. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +0 -0
  53. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/.coveragerc +0 -0
  54. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/telemetry_compare_utils.py +0 -0
  55. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +0 -0
  56. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +0 -0
  57. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +0 -0
  58. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +0 -0
  59. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +0 -0
  60. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +0 -0
  61. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values.json +0 -0
  62. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +0 -0
  63. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column.json +0 -0
  64. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +0 -0
  65. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema.json +0 -0
  66. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +0 -0
  67. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_full_df.json +0 -0
  68. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +0 -0
  69. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type_telemetry.json +0 -0
  70. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +0 -0
  71. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_parquet_directory _telemetry.json +0 -0
  72. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_telemetry.json +0 -0
  73. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_2_expected/test_collect_empty_dataframe_with_schema_telemetry.json +0 -0
  74. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_2_expected/test_collect_invalid_mode_telemetry.json +0 -0
  75. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_2_expected/test_generate_parquet_for_spark_df_telemetry.json +0 -0
  76. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_collect_df_mode_2_expected/test_spark_df_mode_dataframe_telemetry.json +0 -0
  77. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/integ/test_snow_connection_int.py +0 -0
  78. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/unit/test_checkpoint_name_utils.py +0 -0
  79. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/unit/test_collection_point_result.py +0 -0
  80. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/unit/test_column_collection.py +0 -0
  81. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/unit/test_extra_config.py +0 -0
  82. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/unit/test_file_utils.py +0 -0
  83. {snowpark_checkpoints_collectors-0.1.3 → snowpark_checkpoints_collectors-0.2.0}/test/unit/test_pandera_column_check_manager.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: snowpark-checkpoints-collectors
- Version: 0.1.3
+ Version: 0.2.0
  Summary: Snowpark column and table statistics collection
  Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
  Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
@@ -27,19 +27,21 @@ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Requires-Python: <3.12,>=3.9
  Requires-Dist: pandera[io]==0.20.4
- Requires-Dist: pyspark
  Requires-Dist: snowflake-connector-python
- Requires-Dist: snowflake-snowpark-python==1.26.0
+ Requires-Dist: snowflake-snowpark-python>=1.23.0
  Provides-Extra: development
  Requires-Dist: coverage>=7.6.7; extra == 'development'
  Requires-Dist: deepdiff>=8.0.0; extra == 'development'
  Requires-Dist: hatchling==1.25.0; extra == 'development'
  Requires-Dist: pre-commit>=4.0.1; extra == 'development'
  Requires-Dist: pyarrow>=18.0.0; extra == 'development'
+ Requires-Dist: pyspark>=3.5.0; extra == 'development'
  Requires-Dist: pytest-cov>=6.0.0; extra == 'development'
  Requires-Dist: pytest>=8.3.3; extra == 'development'
  Requires-Dist: setuptools>=70.0.0; extra == 'development'
  Requires-Dist: twine==5.1.1; extra == 'development'
+ Provides-Extra: pyspark
+ Requires-Dist: pyspark>=3.5.0; extra == 'pyspark'
  Description-Content-Type: text/markdown

  # snowpark-checkpoints-collectors
@@ -50,6 +52,18 @@ Description-Content-Type: text/markdown
  ---

  **snowpark-checkpoints-collector** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
+
+ ---
+ ## Install the library
+ ```bash
+ pip install snowpark-checkpoints-collectors
+ ```
+ This package requires PySpark to be installed in the same environment. If you do not have it, you can install PySpark alongside Snowpark Checkpoints by running the following command:
+ ```bash
+ pip install "snowpark-checkpoints-collectors[pyspark]"
+ ```
+ ---
+
  ## Features

  - Schema inference collected data mode (Schema): This is the default mode, which leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe. This mode also collects custom data from columns of the DataFrame based on the PySpark type.
@@ -6,6 +6,18 @@
  ---

  **snowpark-checkpoints-collector** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
+
+ ---
+ ## Install the library
+ ```bash
+ pip install snowpark-checkpoints-collectors
+ ```
+ This package requires PySpark to be installed in the same environment. If you do not have it, you can install PySpark alongside Snowpark Checkpoints by running the following command:
+ ```bash
+ pip install "snowpark-checkpoints-collectors[pyspark]"
+ ```
+ ---
+
  ## Features

  - Schema inference collected data mode (Schema): This is the default mode, which leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe. This mode also collects custom data from columns of the DataFrame based on the PySpark type.
@@ -26,9 +26,8 @@ classifiers = [
  "Topic :: Scientific/Engineering :: Information Analysis",
  ]
  dependencies = [
- "snowflake-snowpark-python==1.26.0",
+ "snowflake-snowpark-python>=1.23.0",
  "snowflake-connector-python",
- "pyspark",
  "pandera[io]==0.20.4",
  ]
  description = "Snowpark column and table statistics collection"
@@ -47,6 +46,9 @@ readme = "README.md"
  requires-python = '>=3.9,<3.12'

  [project.optional-dependencies]
+ pyspark = [
+ "pyspark>=3.5.0",
+ ]
  development = [
  "pytest>=8.3.3",
  "pytest-cov>=6.0.0",
@@ -57,6 +59,7 @@ development = [
  "setuptools>=70.0.0",
  "pyarrow>=18.0.0",
  "deepdiff>=8.0.0",
+ "pyspark>=3.5.0",
  ]

  [project.urls]
@@ -13,10 +13,18 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

+ import logging
+
+
+ # Add a NullHandler to prevent logging messages from being output to
+ # sys.stderr if no logging configuration is provided.
+ logging.getLogger(__name__).addHandler(logging.NullHandler())
+
+ # ruff: noqa: E402
+
  __all__ = ["collect_dataframe_checkpoint", "CheckpointMode"]

+ from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
  from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
  collect_dataframe_checkpoint,
  )
-
- from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
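Aside: because of the `NullHandler` added above, all of the new log records introduced in this release are discarded unless the consuming application configures logging itself. A minimal sketch of opting in, using only the standard library (the logger name follows the package's module hierarchy):

```python
import logging

# Opt in globally: a root handler prints the collector's records to stderr.
logging.basicConfig(level=logging.INFO)

# Or scope the opt-in to this package's logger hierarchy only.
collector_logger = logging.getLogger("snowflake.snowpark_checkpoints_collector")
collector_logger.setLevel(logging.DEBUG)
collector_logger.addHandler(logging.StreamHandler())
```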
@@ -13,4 +13,4 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- __version__ = "0.1.3"
+ __version__ = "0.2.0"
@@ -12,7 +12,9 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+
  import json
+ import logging

  from typing import Optional

@@ -24,6 +26,7 @@ from snowflake.snowpark_checkpoints_collector.utils import file_utils


  RESULTS_KEY = "results"
+ LOGGER = logging.getLogger(__name__)


  class CollectionPointResultManager(metaclass=Singleton):
@@ -49,6 +52,7 @@ class CollectionPointResultManager(metaclass=Singleton):

  """
  result_json = result.get_collection_result_data()
+ LOGGER.debug("Adding a new collection result: %s", result_json)
  self.result_collection.append(result_json)
  self._save_result()

@@ -65,5 +69,6 @@

  def _save_result(self) -> None:
  result_collection_json = self.to_json()
+ LOGGER.info("Saving collection results to '%s'", self.output_file_path)
  with open(self.output_file_path, "w") as f:
  f.write(result_collection_json)
@@ -12,6 +12,9 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+
+ import logging
+
  from pyspark.sql import DataFrame as SparkDataFrame
  from pyspark.sql.types import StructField

@@ -53,6 +56,9 @@ from snowflake.snowpark_checkpoints_collector.column_collection.model import (
  )


+ LOGGER = logging.getLogger(__name__)
+
+
  def collector_register(cls):
  """Decorate a class with the collection type mechanism.

@@ -63,6 +69,7 @@ def collector_register(cls):
  The class to decorate.

  """
+ LOGGER.debug("Starting to register collectors from class %s", cls.__name__)
  cls._collectors = {}
  for method_name in dir(cls):
  method = getattr(cls, method_name)
@@ -70,6 +77,11 @@ def collector_register(cls):
  col_type_collection = method._column_type
  for col_type in col_type_collection:
  cls._collectors[col_type] = method_name
+ LOGGER.debug(
+ "Registered collector '%s' for column type '%s'",
+ method_name,
+ col_type,
+ )
  return cls


@@ -114,10 +126,21 @@ class ColumnCollectorManager:
  """
  clm_type = struct_field.dataType.typeName()
  if clm_type not in self._collectors:
+ LOGGER.debug(
+ "No collectors found for column '%s' of type '%s'. Skipping collection for this column.",
+ clm_name,
+ clm_type,
+ )
  return {}

  func_name = self._collectors[clm_type]
  func = getattr(self, func_name)
+ LOGGER.debug(
+ "Collecting custom data for column '%s' of type '%s' using collector method '%s'",
+ clm_name,
+ clm_type,
+ func_name,
+ )
  data = func(clm_name, struct_field, values)
  return data

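Aside: the debug lines above instrument a decorator-driven registry. The following self-contained sketch shows that pattern end to end; `column_register` and `DemoManager` are illustrative stand-ins rather than the package's own helpers, but the `_collectors` dict and `_column_type` attribute mirror what the diff shows:

```python
# Standalone sketch of the registration/dispatch mechanism the new
# debug logs instrument. All names here are hypothetical.
import logging

logging.basicConfig(level=logging.DEBUG)
LOGGER = logging.getLogger(__name__)


def column_register(*column_types):
    """Tag a method with the column types it handles."""
    def decorator(method):
        method._column_type = column_types  # same attribute the diff inspects
        return method
    return decorator


def collector_register(cls):
    """Scan the class and map each tagged method to its column types."""
    cls._collectors = {}
    for method_name in dir(cls):
        method = getattr(cls, method_name)
        if hasattr(method, "_column_type"):
            for col_type in method._column_type:
                cls._collectors[col_type] = method_name
                LOGGER.debug(
                    "Registered collector '%s' for column type '%s'",
                    method_name,
                    col_type,
                )
    return cls


@collector_register
class DemoManager:
    @column_register("boolean")
    def collect_boolean(self, name):
        return {"name": name, "type": "boolean"}

    def collect(self, name, col_type):
        # Dispatch through the registry, skipping unknown types.
        if col_type not in self._collectors:
            LOGGER.debug("No collectors found for column '%s'", name)
            return {}
        return getattr(self, self._collectors[col_type])(name)


print(DemoManager().collect("active", "boolean"))
```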
@@ -12,6 +12,9 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+
+ import logging
+
  import pandas as pd

  from pandera import Check, Column
@@ -39,6 +42,9 @@ from snowflake.snowpark_checkpoints_collector.collection_common import (
  )


+ LOGGER = logging.getLogger(__name__)
+
+
  def collector_register(cls):
  """Decorate a class with the checks mechanism.

@@ -49,6 +55,7 @@ def collector_register(cls):
  The class to decorate.

  """
+ LOGGER.debug("Starting to register checks from class %s", cls.__name__)
  cls._collectors = {}
  for method_name in dir(cls):
  method = getattr(cls, method_name)
@@ -56,6 +63,9 @@ def collector_register(cls):
  col_type_collection = method._column_type
  for col_type in col_type_collection:
  cls._collectors[col_type] = method_name
+ LOGGER.debug(
+ "Registered check '%s' for column type '%s'", method_name, col_type
+ )
  return cls


@@ -101,10 +111,18 @@ class PanderaColumnChecksManager:

  """
  if clm_type not in self._collectors:
+ LOGGER.debug(
+ "No Pandera checks found for column '%s' of type '%s'. Skipping checks for this column.",
+ clm_name,
+ clm_type,
+ )
  return

  func_name = self._collectors[clm_type]
  func = getattr(self, func_name)
+ LOGGER.debug(
+ "Adding Pandera checks to column '%s' of type '%s'", clm_name, clm_type
+ )
  func(clm_name, pyspark_df, pandera_column)

  @column_register(BOOLEAN_COLUMN_TYPE)
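Aside: for readers unfamiliar with what "adding Pandera checks to a column" produces, here is a minimal sketch against the pinned `pandera[io]==0.20.4` API. The columns and check values are illustrative, not the package's generated output:

```python
# Minimal Pandera sketch; column names and bounds are hypothetical.
import pandas as pd
from pandera import Check, Column, DataFrameSchema

schema = DataFrameSchema(
    {
        # the kind of check a @column_register(BOOLEAN_COLUMN_TYPE)
        # handler might attach to a boolean column
        "active": Column(bool, checks=[Check.isin([True, False])]),
        # a numeric range check for an integer column
        "age": Column(int, checks=[Check.in_range(0, 120)]),
    }
)

df = pd.DataFrame({"active": [True, False], "age": [31, 42]})
schema.validate(df)  # raises SchemaError if any check fails
```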
@@ -12,7 +12,9 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+
  import glob
+ import logging
  import os.path
  import time

@@ -30,6 +32,7 @@ CREATE_STAGE_STATEMENT_FORMAT = "CREATE TEMP STAGE IF NOT EXISTS {}"
  REMOVE_STAGE_FOLDER_STATEMENT_FORMAT = "REMOVE {}"
  STAGE_PATH_FORMAT = "'@{}/{}'"
  PUT_FILE_IN_STAGE_STATEMENT_FORMAT = "PUT '{}' {} AUTO_COMPRESS=FALSE"
+ LOGGER = logging.getLogger(__name__)


  class SnowConnection:
@@ -41,14 +44,16 @@ class SnowConnection:

  """

- def __init__(self, session: Session = None) -> None:
+ def __init__(self, session: Optional[Session] = None) -> None:
  """Init SnowConnection.

  Args:
  session (Snowpark.Session): the Snowpark session.

  """
- self.session = session if session is not None else Session.builder.getOrCreate()
+ self.session = (
+ session if session is not None else self._create_snowpark_session()
+ )
  self.stage_id = int(time.time())

  def create_snowflake_table_from_local_parquet(
@@ -84,8 +89,8 @@
  stage_name, stage_path, input_path, is_parquet_file
  )
  self.create_table_from_parquet(table_name, stage_directory_path)
-
  finally:
+ LOGGER.info("Removing stage folder %s", stage_directory_path)
  self.session.sql(
  REMOVE_STAGE_FOLDER_STATEMENT_FORMAT.format(stage_directory_path)
  ).collect()
@@ -98,6 +103,7 @@


  """
  create_stage_statement = CREATE_STAGE_STATEMENT_FORMAT.format(stage_name)
+ LOGGER.info("Creating temporal stage '%s'", stage_name)
  self.session.sql(create_stage_statement).collect()

  def load_files_to_stage(
@@ -105,7 +111,7 @@
  stage_name: str,
  folder_name: str,
  input_path: str,
- filter_func: Callable = None,
+ filter_func: Optional[Callable] = None,
  ) -> None:
  """Load files to a stage in Snowflake.

@@ -116,6 +122,7 @@
  filter_func (Callable): the filter function to apply to the files.

  """
+ LOGGER.info("Starting to load files to '%s'", stage_name)
  input_path = (
  os.path.abspath(input_path)
  if not os.path.isabs(input_path)
@@ -126,16 +133,20 @@
  return os.path.isfile(name) and (filter_func(name) if filter_func else True)

  target_dir = os.path.join(input_path, "**", "*")
+ LOGGER.debug("Searching for files in '%s'", input_path)
  files_collection = glob.glob(target_dir, recursive=True)

  files = [file for file in files_collection if filter_files(file)]
+ files_count = len(files)

- if len(files) == 0:
+ if files_count == 0:
  raise Exception(f"No files were found in the input directory: {input_path}")

+ LOGGER.debug("Found %s files in '%s'", files_count, input_path)
+
  for file in files:
  # if file is relative path, convert to absolute path
- # if absolute path, then try to resolve as some Win32 paths are not in LPN.
+ # if absolute path, then try to resolve as some Win32 paths are not in LPN.
  file_full_path = (
  str(os.path.abspath(file))
  if not os.path.isabs(file)
@@ -150,6 +161,7 @@
  put_statement = PUT_FILE_IN_STAGE_STATEMENT_FORMAT.format(
  normalize_file_path, stage_file_path
  )
+ LOGGER.info("Loading file '%s' to %s", file_full_path, stage_file_path)
  self.session.sql(put_statement).collect()

  def create_table_from_parquet(
@@ -165,8 +177,25 @@
  Exception: No parquet files were found in the stage

  """
- files = self.session.sql(f"LIST {stage_directory_path}").collect()
- if len(files) == 0:
- raise Exception("No parquet files were found in the stage.")
+ LOGGER.info("Starting to create table '%s' from parquet files", table_name)
+ parquet_files = self.session.sql(
+ f"LIST {stage_directory_path} PATTERN='.*{DOT_PARQUET_EXTENSION}'"
+ ).collect()
+ parquet_files_count = len(parquet_files)
+ if parquet_files_count == 0:
+ raise Exception(
+ f"No parquet files were found in the stage: {stage_directory_path}"
+ )
+
+ LOGGER.info(
+ "Reading %s parquet files from %s",
+ parquet_files_count,
+ stage_directory_path,
+ )
  dataframe = self.session.read.parquet(path=stage_directory_path)
+ LOGGER.info("Creating table '%s' from parquet files", table_name)
  dataframe.write.save_as_table(table_name=table_name, mode="overwrite")
+
+ def _create_snowpark_session(self) -> Session:
+ LOGGER.info("Creating a Snowpark session using the default connection")
+ return Session.builder.getOrCreate()
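Aside: pieced together, the statement templates and Snowpark calls above amount to the following stage round-trip. This is a sketch only; the stage, folder, file, and table names are illustrative, and it assumes a configured default Snowflake connection:

```python
# Hypothetical walk-through of the flow SnowConnection performs, using
# only Snowpark calls that appear in the diff above.
from snowflake.snowpark import Session

session = Session.builder.getOrCreate()  # what _create_snowpark_session wraps

# CREATE_STAGE_STATEMENT_FORMAT: a temporary stage scoped to the session.
session.sql("CREATE TEMP STAGE IF NOT EXISTS demo_stage").collect()

# PUT_FILE_IN_STAGE_STATEMENT_FORMAT: upload a local parquet file uncompressed.
session.sql(
    "PUT 'file:///tmp/data/part-0.parquet' '@demo_stage/1700000000' AUTO_COMPRESS=FALSE"
).collect()

# The new LIST ... PATTERN filters to parquet files before counting them.
files = session.sql("LIST '@demo_stage/1700000000' PATTERN='.*.parquet'").collect()
if len(files) == 0:
    raise Exception("No parquet files were found in the stage: '@demo_stage/1700000000'")

# Read the staged parquet files and materialize them as a table.
dataframe = session.read.parquet(path="'@demo_stage/1700000000'")
dataframe.write.save_as_table(table_name="DEMO_TABLE", mode="overwrite")
```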