snowpark-checkpoints-collectors 0.1.0rc2__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. snowflake/snowpark_checkpoints_collector/__init__.py +22 -0
  2. snowflake/snowpark_checkpoints_collector/collection_common.py +160 -0
  3. snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +24 -0
  4. snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +91 -0
  5. snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +69 -0
  6. snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +22 -0
  7. snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +253 -0
  8. snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +75 -0
  9. snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +113 -0
  10. snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +87 -0
  11. snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +71 -0
  12. snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +95 -0
  13. snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +74 -0
  14. snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +67 -0
  15. snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +92 -0
  16. snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +88 -0
  17. snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +120 -0
  18. snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +49 -0
  19. snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +108 -0
  20. snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +70 -0
  21. snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +102 -0
  22. snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +75 -0
  23. snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +75 -0
  24. snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +20 -0
  25. snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +223 -0
  26. snowflake/snowpark_checkpoints_collector/singleton.py +23 -0
  27. snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +20 -0
  28. snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +172 -0
  29. snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +366 -0
  30. snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +53 -0
  31. snowflake/snowpark_checkpoints_collector/utils/extra_config.py +112 -0
  32. snowflake/snowpark_checkpoints_collector/utils/file_utils.py +132 -0
  33. snowflake/snowpark_checkpoints_collector/utils/telemetry.py +889 -0
  34. snowpark_checkpoints_collectors-0.1.1.dist-info/METADATA +143 -0
  35. snowpark_checkpoints_collectors-0.1.1.dist-info/RECORD +37 -0
  36. {snowpark_checkpoints_collectors-0.1.0rc2.dist-info → snowpark_checkpoints_collectors-0.1.1.dist-info}/licenses/LICENSE +0 -25
  37. snowpark_checkpoints_collectors-0.1.0rc2.dist-info/METADATA +0 -347
  38. snowpark_checkpoints_collectors-0.1.0rc2.dist-info/RECORD +0 -4
  39. {snowpark_checkpoints_collectors-0.1.0rc2.dist-info → snowpark_checkpoints_collectors-0.1.1.dist-info}/WHEEL +0 -0
snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py
@@ -0,0 +1,88 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from pyspark.sql import DataFrame as SparkDataFrame
+ from pyspark.sql.types import StructField
+
+ from snowflake.snowpark_checkpoints_collector.collection_common import (
+     ARRAY_COLUMN_TYPE,
+     COLUMN_ALLOW_NULL_KEY,
+     COLUMN_KEY_TYPE_KEY,
+     COLUMN_METADATA_KEY,
+     COLUMN_VALUE_TYPE_KEY,
+     CONTAINS_NULL_KEY,
+     ELEMENT_TYPE_KEY,
+     FIELD_METADATA_KEY,
+     FIELDS_KEY,
+     KEY_TYPE_KEY,
+     MAP_COLUMN_TYPE,
+     STRUCT_COLUMN_TYPE,
+     VALUE_CONTAINS_NULL_KEY,
+     VALUE_TYPE_KEY,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+     ColumnCollectorBase,
+ )
+
+
+ class EmptyColumnCollector(ColumnCollectorBase):
+
+     """Class for collecting an empty column.
+
+     Attributes:
+         name (str): the name of the column.
+         type (str): the type of the column.
+         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+         column_df (pyspark.sql.DataFrame): the column values as a PySpark DataFrame.
+
+     """
+
+     def __init__(
+         self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+     ) -> None:
+         """Init EmptyColumnCollector.
+
+         Args:
+             clm_name (str): the name of the column.
+             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+             clm_df (pyspark.sql.DataFrame): the column values as a PySpark DataFrame.
+
+         """
+         super().__init__(clm_name, struct_field, clm_df)
+
+     def get_custom_data(self) -> dict[str, any]:
+         custom_data = {}
+         data_type_dict = self.struct_field.dataType.jsonValue()
+         if self.type == ARRAY_COLUMN_TYPE:
+             custom_data[COLUMN_VALUE_TYPE_KEY] = data_type_dict[ELEMENT_TYPE_KEY]
+             custom_data[COLUMN_ALLOW_NULL_KEY] = data_type_dict[CONTAINS_NULL_KEY]
+
+         elif self.type == MAP_COLUMN_TYPE:
+             custom_data[COLUMN_KEY_TYPE_KEY] = data_type_dict[KEY_TYPE_KEY]
+             custom_data[COLUMN_VALUE_TYPE_KEY] = data_type_dict[VALUE_TYPE_KEY]
+             custom_data[COLUMN_ALLOW_NULL_KEY] = data_type_dict[VALUE_CONTAINS_NULL_KEY]
+
+         elif self.type == STRUCT_COLUMN_TYPE:
+             field_metadata_collection = []
+             struct_field_json = self.struct_field.dataType.jsonValue()
+             for field in struct_field_json[FIELDS_KEY]:
+                 del field[FIELD_METADATA_KEY]
+                 field_metadata_collection.append(field)
+             custom_data[COLUMN_METADATA_KEY] = field_metadata_collection
+
+         else:
+             custom_data = {}
+
+         return custom_data
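
For reference, this is how the collector above can be driven end to end. The sketch below is illustrative only: the column name "tags" is invented, it assumes a local SparkSession with this wheel installed, and the exact output keys depend on the constants defined in collection_common.py.

    # Hypothetical usage of EmptyColumnCollector on an empty array column.
    from pyspark.sql import SparkSession
    from pyspark.sql.types import ArrayType, IntegerType, StructField, StructType

    from snowflake.snowpark_checkpoints_collector.column_collection.model.empty_column_collector import (
        EmptyColumnCollector,
    )

    spark = SparkSession.builder.getOrCreate()
    schema = StructType([StructField("tags", ArrayType(IntegerType(), True), True)])
    df = spark.createDataFrame([], schema)  # no rows, hence an "empty" column

    collector = EmptyColumnCollector("tags", schema["tags"], df.select("tags"))
    # For an array column this reports the element type and its nullability,
    # both read from StructField.dataType.jsonValue(); assumes the base class
    # derives `type` ("array") from the struct field.
    print(collector.get_custom_data())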
snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py
@@ -0,0 +1,120 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from statistics import mean
+
+ from pyspark.sql import DataFrame as SparkDataFrame
+ from pyspark.sql.functions import coalesce as spark_coalesce
+ from pyspark.sql.functions import col as spark_col
+ from pyspark.sql.functions import create_map as spark_create_map
+ from pyspark.sql.functions import explode as spark_explode
+ from pyspark.sql.functions import map_values as spark_map_values
+ from pyspark.sql.functions import size as spark_size
+ from pyspark.sql.types import StructField
+
+ from snowflake.snowpark_checkpoints_collector.collection_common import (
+     COLUMN_ALLOW_NULL_KEY,
+     COLUMN_IS_UNIQUE_SIZE_KEY,
+     COLUMN_KEY_TYPE_KEY,
+     COLUMN_MAX_SIZE_KEY,
+     COLUMN_MEAN_SIZE_KEY,
+     COLUMN_MIN_SIZE_KEY,
+     COLUMN_NULL_VALUE_PROPORTION_KEY,
+     COLUMN_SIZE_KEY,
+     COLUMN_VALUE_KEY,
+     COLUMN_VALUE_TYPE_KEY,
+     KEY_TYPE_KEY,
+     VALUE_CONTAINS_NULL_KEY,
+     VALUE_TYPE_KEY,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+     ColumnCollectorBase,
+ )
+
+
+ class MapColumnCollector(ColumnCollectorBase):
+
+     """Class for collecting a map type column.
+
+     Attributes:
+         name (str): the name of the column.
+         type (str): the type of the column.
+         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+         column_df (pyspark.sql.DataFrame): the column values as a PySpark DataFrame.
+
+     """
+
+     def __init__(
+         self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+     ) -> None:
+         """Init MapColumnCollector.
+
+         Args:
+             clm_name (str): the name of the column.
+             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+             clm_df (pyspark.sql.DataFrame): the column values as a PySpark DataFrame.
+
+         """
+         super().__init__(clm_name, struct_field, clm_df)
+         self._map_size_collection = self._compute_map_size_collection()
+
+     def get_custom_data(self) -> dict[str, any]:
+         data_type_dict = self.struct_field.dataType.jsonValue()
+         key_type = data_type_dict[KEY_TYPE_KEY]
+         value_type = data_type_dict[VALUE_TYPE_KEY]
+         allow_null = data_type_dict[VALUE_CONTAINS_NULL_KEY]
+         null_value_proportion = (
+             self._compute_null_value_proportion() if allow_null else 0.0
+         )
+         array_max_size = max(self._map_size_collection)
+         array_min_size = min(self._map_size_collection)
+         array_mean_size = mean(self._map_size_collection)
+         all_array_have_same_size = array_max_size == array_min_size
+
+         custom_data_dict = {
+             COLUMN_KEY_TYPE_KEY: key_type,
+             COLUMN_VALUE_TYPE_KEY: value_type,
+             COLUMN_ALLOW_NULL_KEY: allow_null,
+             COLUMN_NULL_VALUE_PROPORTION_KEY: null_value_proportion,
+             COLUMN_MAX_SIZE_KEY: array_max_size,
+             COLUMN_MIN_SIZE_KEY: array_min_size,
+             COLUMN_MEAN_SIZE_KEY: array_mean_size,
+             COLUMN_IS_UNIQUE_SIZE_KEY: all_array_have_same_size,
+         }
+
+         return custom_data_dict
+
+     def _compute_map_size_collection(self) -> list[int]:
+         select_result = self.column_df.select(
+             spark_size(
+                 spark_coalesce(spark_col(self.name), spark_create_map([]))
+             ).alias(COLUMN_SIZE_KEY)
+         ).collect()
+
+         size_collection = [row[COLUMN_SIZE_KEY] for row in select_result]
+
+         return size_collection
+
+     def _compute_null_value_proportion(self) -> float:
+         select_result = self.column_df.select(
+             spark_explode(spark_map_values(spark_col(self.name))).alias(
+                 COLUMN_VALUE_KEY
+             )
+         )
+
+         null_counter = select_result.where(spark_col(COLUMN_VALUE_KEY).isNull()).count()
+
+         total_values = sum(self._map_size_collection)
+         null_value_proportion = (null_counter / total_values) * 100
+         return null_value_proportion
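
A small worked example helps pin down the statistics this collector emits; the data below is invented. Note that, despite its name, _compute_null_value_proportion returns a percentage: the null/total ratio is multiplied by 100.

    # Illustrative run of MapColumnCollector over two map values.
    from pyspark.sql import SparkSession
    from pyspark.sql.types import IntegerType, MapType, StringType, StructField, StructType

    from snowflake.snowpark_checkpoints_collector.column_collection.model.map_column_collector import (
        MapColumnCollector,
    )

    spark = SparkSession.builder.getOrCreate()
    schema = StructType(
        [StructField("attrs", MapType(StringType(), IntegerType(), True), True)]
    )
    df = spark.createDataFrame([({"a": 1, "b": None},), ({"c": 2},)], schema)

    collector = MapColumnCollector("attrs", schema["attrs"], df.select("attrs"))
    data = collector.get_custom_data()
    # Map sizes are [2, 1]: max 2, min 1, mean 1.5, is-unique-size False.
    # One of the three values is null, so the "proportion" is ~33.33 (percent).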
snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py
@@ -0,0 +1,49 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from pyspark.sql import DataFrame as SparkDataFrame
+ from pyspark.sql.types import StructField
+
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+     ColumnCollectorBase,
+ )
+
+
+ class NullColumnCollector(ColumnCollectorBase):
+
+     """Class for collecting a null type column.
+
+     Attributes:
+         name (str): the name of the column.
+         type (str): the type of the column.
+         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+         column_df (pyspark.sql.DataFrame): the column values as a PySpark DataFrame.
+
+     """
+
+     def __init__(
+         self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+     ) -> None:
+         """Init NullColumnCollector.
+
+         Args:
+             clm_name (str): the name of the column.
+             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+             clm_df (pyspark.sql.DataFrame): the column values as a PySpark DataFrame.
+
+         """
+         super().__init__(clm_name, struct_field, clm_df)
+
+     def get_custom_data(self) -> dict[str, any]:
+         return {}
snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py
@@ -0,0 +1,108 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from pyspark.sql import DataFrame as SparkDataFrame
+ from pyspark.sql.functions import col as spark_col
+ from pyspark.sql.functions import max as spark_max
+ from pyspark.sql.functions import mean as spark_mean
+ from pyspark.sql.functions import min as spark_min
+ from pyspark.sql.functions import stddev as spark_sdt
+ from pyspark.sql.types import StructField
+
+ from snowflake.snowpark_checkpoints_collector.collection_common import (
+     COLUMN_DECIMAL_PRECISION_KEY,
+     COLUMN_MARGIN_ERROR_KEY,
+     COLUMN_MAX_KEY,
+     COLUMN_MEAN_KEY,
+     COLUMN_MIN_KEY,
+     INTEGER_TYPE_COLLECTION,
+     get_decimal_token,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+     ColumnCollectorBase,
+ )
+
+
+ class NumericColumnCollector(ColumnCollectorBase):
+
+     """Class for collecting a numeric type column.
+
+     Attributes:
+         name (str): the name of the column.
+         type (str): the type of the column.
+         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+         column_df (pyspark.sql.DataFrame): the column values as a PySpark DataFrame.
+         is_integer (bool): a flag indicating whether the values are integers.
+
+     """
+
+     def __init__(
+         self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+     ) -> None:
+         """Init NumericColumnCollector.
+
+         Args:
+             clm_name (str): the name of the column.
+             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+             clm_df (pyspark.sql.DataFrame): the column values as a PySpark DataFrame.
+
+         """
+         super().__init__(clm_name, struct_field, clm_df)
+         self.is_integer = self.type in INTEGER_TYPE_COLLECTION
+
+     def get_custom_data(self) -> dict[str, any]:
+         select_result = self.column_df.select(
+             spark_min(spark_col(self.name)).alias(COLUMN_MIN_KEY),
+             spark_max(spark_col(self.name)).alias(COLUMN_MAX_KEY),
+             spark_mean(spark_col(self.name)).alias(COLUMN_MEAN_KEY),
+             spark_sdt(spark_col(self.name)).alias(COLUMN_MARGIN_ERROR_KEY),
+         ).collect()[0]
+
+         min_value = select_result[COLUMN_MIN_KEY]
+         max_value = select_result[COLUMN_MAX_KEY]
+         mean_value = select_result[COLUMN_MEAN_KEY]
+         decimal_precision = self._compute_decimal_precision()
+         margin_error = select_result[COLUMN_MARGIN_ERROR_KEY]
+
+         custom_data_dict = {
+             COLUMN_MIN_KEY: min_value,
+             COLUMN_MAX_KEY: max_value,
+             COLUMN_MEAN_KEY: mean_value,
+             COLUMN_DECIMAL_PRECISION_KEY: decimal_precision,
+             COLUMN_MARGIN_ERROR_KEY: margin_error,
+         }
+
+         return custom_data_dict
+
+     def _compute_decimal_precision(self) -> int:
+         if self.is_integer:
+             return 0
+
+         decimal_part_index = 1
+         decimal_token = get_decimal_token()
+         max_decimal_digits_counted = 0
+
+         row_collection = self.column_df.dropna().collect()
+         for row in row_collection:
+             value = row[0]
+             value_str = str(value)
+             value_split_by_token = value_str.split(decimal_token)
+             decimal_part = value_split_by_token[decimal_part_index]
+             decimal_digits_counted = len(decimal_part)
+             max_decimal_digits_counted = max(
+                 decimal_digits_counted, max_decimal_digits_counted
+             )
+
+         return max_decimal_digits_counted
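
Worth calling out: _compute_decimal_precision collects every non-null value to the driver and counts digits in each value's string form, so its cost grows with row count, and it relies on the locale's decimal token (from get_decimal_token()) appearing in str(value). A small invented example:

    # Illustrative run of NumericColumnCollector over a double column,
    # assuming get_decimal_token() returns "." in this locale.
    from pyspark.sql import SparkSession
    from pyspark.sql.types import DoubleType, StructField, StructType

    from snowflake.snowpark_checkpoints_collector.column_collection.model.numeric_column_collector import (
        NumericColumnCollector,
    )

    spark = SparkSession.builder.getOrCreate()
    schema = StructType([StructField("price", DoubleType(), True)])
    df = spark.createDataFrame([(1.25,), (3.5,), (2.0,)], schema)

    collector = NumericColumnCollector("price", schema["price"], df.select("price"))
    data = collector.get_custom_data()
    # min 1.25, max 3.5, mean 2.25; decimal precision 2 (str "1.25" has two
    # digits after "."); the sample stddev lands under the margin-of-error key.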
snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py
@@ -0,0 +1,70 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from pyspark.sql import DataFrame as SparkDataFrame
+ from pyspark.sql.functions import col as spark_col
+ from pyspark.sql.functions import length as spark_length
+ from pyspark.sql.functions import max as spark_max
+ from pyspark.sql.functions import min as spark_min
+ from pyspark.sql.types import StructField
+
+ from snowflake.snowpark_checkpoints_collector.collection_common import (
+     COLUMN_MAX_LENGTH_KEY,
+     COLUMN_MIN_LENGTH_KEY,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+     ColumnCollectorBase,
+ )
+
+
+ class StringColumnCollector(ColumnCollectorBase):
+
+     """Class for collecting a string type column.
+
+     Attributes:
+         name (str): the name of the column.
+         type (str): the type of the column.
+         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+         column_df (pyspark.sql.DataFrame): the column values as a PySpark DataFrame.
+
+     """
+
+     def __init__(
+         self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+     ) -> None:
+         """Init StringColumnCollector.
+
+         Args:
+             clm_name (str): the name of the column.
+             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+             clm_df (pyspark.sql.DataFrame): the column values as a PySpark DataFrame.
+
+         """
+         super().__init__(clm_name, struct_field, clm_df)
+
+     def get_custom_data(self) -> dict[str, any]:
+         select_result = self.column_df.select(
+             spark_min(spark_length(spark_col(self.name))).alias(COLUMN_MIN_LENGTH_KEY),
+             spark_max(spark_length(spark_col(self.name))).alias(COLUMN_MAX_LENGTH_KEY),
+         ).collect()[0]
+
+         min_length = select_result[COLUMN_MIN_LENGTH_KEY]
+         max_length = select_result[COLUMN_MAX_LENGTH_KEY]
+
+         custom_data_dict = {
+             COLUMN_MIN_LENGTH_KEY: min_length,
+             COLUMN_MAX_LENGTH_KEY: max_length,
+         }
+
+         return custom_data_dict
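
The string collector reduces to two aggregates, as a quick invented example shows:

    # Illustrative run of StringColumnCollector: min/max character lengths.
    from pyspark.sql import SparkSession
    from pyspark.sql.types import StringType, StructField, StructType

    from snowflake.snowpark_checkpoints_collector.column_collection.model.string_column_collector import (
        StringColumnCollector,
    )

    spark = SparkSession.builder.getOrCreate()
    schema = StructType([StructField("city", StringType(), True)])
    df = spark.createDataFrame([("Oslo",), ("Montevideo",)], schema)

    collector = StringColumnCollector("city", schema["city"], df.select("city"))
    data = collector.get_custom_data()  # min length 4, max length 10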
snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py
@@ -0,0 +1,102 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from pyspark.sql import DataFrame as SparkDataFrame
+ from pyspark.sql.types import StructField
+
+ from snowflake.snowpark_checkpoints_collector.collection_common import (
+     COLUMN_COUNT_KEY,
+     COLUMN_METADATA_KEY,
+     COLUMN_ROWS_NOT_NULL_COUNT_KEY,
+     COLUMN_ROWS_NULL_COUNT_KEY,
+     FIELD_METADATA_KEY,
+     FIELDS_KEY,
+     NAME_KEY,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+     ColumnCollectorBase,
+ )
+
+
+ class StructColumnCollector(ColumnCollectorBase):
+
+     """Class for collecting a struct type column.
+
+     Attributes:
+         name (str): the name of the column.
+         type (str): the type of the column.
+         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+         column_df (pyspark.sql.DataFrame): the column values as a PySpark DataFrame.
+
+     """
+
+     def __init__(
+         self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+     ) -> None:
+         """Init StructColumnCollector.
+
+         Args:
+             clm_name (str): the name of the column.
+             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+             clm_df (pyspark.sql.DataFrame): the column values as a PySpark DataFrame.
+
+         """
+         super().__init__(clm_name, struct_field, clm_df)
+
+     def get_custom_data(self) -> dict[str, any]:
+         metadata = self._compute_struct_metadata()
+
+         custom_data_dict = {
+             COLUMN_METADATA_KEY: metadata,
+         }
+
+         return custom_data_dict
+
+     def _compute_struct_metadata(self) -> list[dict[str, any]]:
+         field_metadata_collection = []
+         struct_field_json = self.struct_field.dataType.jsonValue()
+         for field in struct_field_json[FIELDS_KEY]:
+             del field[FIELD_METADATA_KEY]
+             clm_name = field[NAME_KEY]
+             rows_count_dict = self._compute_rows_count_by_column(clm_name)
+             struct_field_custom_data = dict(field | rows_count_dict)
+             field_metadata_collection.append(struct_field_custom_data)
+
+         return field_metadata_collection
+
+     def _compute_rows_count_by_column(self, clm_name: str) -> dict[str, int]:
+         rows_count = 0
+         rows_not_null_count = 0
+         rows_null_count = 0
+         row_collection = self.column_df.collect()
+         for row in row_collection:
+             inner_row = row[0]
+             rows_count += 1
+             if inner_row is None:
+                 rows_null_count += 1
+                 continue
+
+             row_clm_value = inner_row[clm_name]
+             if row_clm_value is None:
+                 rows_null_count += 1
+             else:
+                 rows_not_null_count += 1
+
+         rows_count_dict = {
+             COLUMN_COUNT_KEY: rows_count,
+             COLUMN_ROWS_NOT_NULL_COUNT_KEY: rows_not_null_count,
+             COLUMN_ROWS_NULL_COUNT_KEY: rows_null_count,
+         }
+
+         return rows_count_dict
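
Like the numeric collector's precision pass, _compute_rows_count_by_column collects the whole column to the driver, once per struct field. An invented example of the per-field tallies:

    # Illustrative run of StructColumnCollector over a two-field struct.
    from pyspark.sql import SparkSession
    from pyspark.sql.types import IntegerType, StringType, StructField, StructType

    from snowflake.snowpark_checkpoints_collector.column_collection.model.struct_column_collector import (
        StructColumnCollector,
    )

    spark = SparkSession.builder.getOrCreate()
    inner = StructType(
        [StructField("x", IntegerType(), True), StructField("y", StringType(), True)]
    )
    schema = StructType([StructField("point", inner, True)])
    df = spark.createDataFrame([((1, "a"),), ((None, "b"),), (None,)], schema)

    collector = StructColumnCollector("point", schema["point"], df.select("point"))
    data = collector.get_custom_data()
    # Each metadata entry pairs a field's name/type with count 3 plus its null
    # tallies; a null struct row counts as null for every field, so "x" ends up
    # with 1 not-null and 2 null rows.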
snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py
@@ -0,0 +1,75 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from pyspark.sql import DataFrame as SparkDataFrame
+ from pyspark.sql.functions import col as spark_col
+ from pyspark.sql.functions import max as spark_max
+ from pyspark.sql.functions import min as spark_min
+ from pyspark.sql.types import StructField
+
+ from snowflake.snowpark_checkpoints_collector.collection_common import (
+     COLUMN_FORMAT_KEY,
+     COLUMN_MAX_KEY,
+     COLUMN_MIN_KEY,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+     ColumnCollectorBase,
+ )
+
+
+ FORMAT = "%Y-%m-%dT%H:%M:%S%z"
+
+
+ class TimestampColumnCollector(ColumnCollectorBase):
+
+     """Class for collecting a timestamp type column.
+
+     Attributes:
+         name (str): the name of the column.
+         type (str): the type of the column.
+         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+         column_df (pyspark.sql.DataFrame): the column values as a PySpark DataFrame.
+
+     """
+
+     def __init__(
+         self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+     ) -> None:
+         """Init TimestampColumnCollector.
+
+         Args:
+             clm_name (str): the name of the column.
+             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+             clm_df (pyspark.sql.DataFrame): the column values as a PySpark DataFrame.
+
+         """
+         super().__init__(clm_name, struct_field, clm_df)
+
+     def get_custom_data(self) -> dict[str, any]:
+         select_result = self.column_df.select(
+             spark_min(spark_col(self.name)).alias(COLUMN_MIN_KEY),
+             spark_max(spark_col(self.name)).alias(COLUMN_MAX_KEY),
+         ).collect()[0]
+
+         min_value = str(select_result[COLUMN_MIN_KEY])
+         max_value = str(select_result[COLUMN_MAX_KEY])
+
+         custom_data_dict = {
+             COLUMN_MIN_KEY: min_value,
+             COLUMN_MAX_KEY: max_value,
+             COLUMN_FORMAT_KEY: FORMAT,
+         }
+
+         return custom_data_dict
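
A short invented example of the timestamp collector's output shape; min/max are stringified with str(), and the FORMAT constant is reported alongside them for consumers:

    # Illustrative run of TimestampColumnCollector.
    from datetime import datetime

    from pyspark.sql import SparkSession
    from pyspark.sql.types import StructField, StructType, TimestampType

    from snowflake.snowpark_checkpoints_collector.column_collection.model.timestamp_column_collector import (
        TimestampColumnCollector,
    )

    spark = SparkSession.builder.getOrCreate()
    schema = StructType([StructField("ts", TimestampType(), True)])
    df = spark.createDataFrame(
        [(datetime(2025, 1, 1, 8, 30),), (datetime(2025, 6, 15, 23, 59),)], schema
    )

    collector = TimestampColumnCollector("ts", schema["ts"], df.select("ts"))
    data = collector.get_custom_data()
    # Yields '2025-01-01 08:30:00' / '2025-06-15 23:59:00' plus the format
    # string; the actual dict keys come from the COLUMN_*_KEY constants.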
snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py
@@ -0,0 +1,75 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from pyspark.sql import DataFrame as SparkDataFrame
+ from pyspark.sql.functions import col as spark_col
+ from pyspark.sql.functions import max as spark_max
+ from pyspark.sql.functions import min as spark_min
+ from pyspark.sql.types import StructField
+
+ from snowflake.snowpark_checkpoints_collector.collection_common import (
+     COLUMN_FORMAT_KEY,
+     COLUMN_MAX_KEY,
+     COLUMN_MIN_KEY,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+     ColumnCollectorBase,
+ )
+
+
+ FORMAT = "%Y-%m-%dT%H:%M:%S"
+
+
+ class TimestampNTZColumnCollector(ColumnCollectorBase):
+
+     """Class for collecting a timestamp ntz type column.
+
+     Attributes:
+         name (str): the name of the column.
+         type (str): the type of the column.
+         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+         column_df (pyspark.sql.DataFrame): the column values as a PySpark DataFrame.
+
+     """
+
+     def __init__(
+         self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+     ) -> None:
+         """Init TimestampNTZColumnCollector.
+
+         Args:
+             clm_name (str): the name of the column.
+             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+             clm_df (pyspark.sql.DataFrame): the column values as a PySpark DataFrame.
+
+         """
+         super().__init__(clm_name, struct_field, clm_df)
+
+     def get_custom_data(self) -> dict[str, any]:
+         select_result = self.column_df.select(
+             spark_min(spark_col(self.name)).alias(COLUMN_MIN_KEY),
+             spark_max(spark_col(self.name)).alias(COLUMN_MAX_KEY),
+         ).collect()[0]
+
+         min_value = str(select_result[COLUMN_MIN_KEY])
+         max_value = str(select_result[COLUMN_MAX_KEY])
+
+         custom_data_dict = {
+             COLUMN_MIN_KEY: min_value,
+             COLUMN_MAX_KEY: max_value,
+             COLUMN_FORMAT_KEY: FORMAT,
+         }
+
+         return custom_data_dict