snowpark-checkpoints-collectors 0.1.0rc2__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. snowflake/snowpark_checkpoints_collector/__init__.py +22 -0
  2. snowflake/snowpark_checkpoints_collector/collection_common.py +160 -0
  3. snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +24 -0
  4. snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +91 -0
  5. snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +69 -0
  6. snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +22 -0
  7. snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +253 -0
  8. snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +75 -0
  9. snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +113 -0
  10. snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +87 -0
  11. snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +71 -0
  12. snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +95 -0
  13. snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +74 -0
  14. snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +67 -0
  15. snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +92 -0
  16. snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +88 -0
  17. snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +120 -0
  18. snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +49 -0
  19. snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +108 -0
  20. snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +70 -0
  21. snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +102 -0
  22. snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +75 -0
  23. snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +75 -0
  24. snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +20 -0
  25. snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +223 -0
  26. snowflake/snowpark_checkpoints_collector/singleton.py +23 -0
  27. snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +20 -0
  28. snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +172 -0
  29. snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +366 -0
  30. snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +53 -0
  31. snowflake/snowpark_checkpoints_collector/utils/extra_config.py +112 -0
  32. snowflake/snowpark_checkpoints_collector/utils/file_utils.py +132 -0
  33. snowflake/snowpark_checkpoints_collector/utils/telemetry.py +889 -0
  34. snowpark_checkpoints_collectors-0.1.1.dist-info/METADATA +143 -0
  35. snowpark_checkpoints_collectors-0.1.1.dist-info/RECORD +37 -0
  36. {snowpark_checkpoints_collectors-0.1.0rc2.dist-info → snowpark_checkpoints_collectors-0.1.1.dist-info}/licenses/LICENSE +0 -25
  37. snowpark_checkpoints_collectors-0.1.0rc2.dist-info/METADATA +0 -347
  38. snowpark_checkpoints_collectors-0.1.0rc2.dist-info/RECORD +0 -4
  39. {snowpark_checkpoints_collectors-0.1.0rc2.dist-info → snowpark_checkpoints_collectors-0.1.1.dist-info}/WHEEL +0 -0
snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py
@@ -0,0 +1,75 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ __all__ = [
+     "ArrayColumnCollector",
+     "BinaryColumnCollector",
+     "BooleanColumnCollector",
+     "DateColumnCollector",
+     "DayTimeIntervalColumnCollector",
+     "DecimalColumnCollector",
+     "EmptyColumnCollector",
+     "MapColumnCollector",
+     "NumericColumnCollector",
+     "NullColumnCollector",
+     "StringColumnCollector",
+     "StructColumnCollector",
+     "TimestampColumnCollector",
+     "TimestampNTZColumnCollector",
+ ]
+
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.array_column_collector import (
+     ArrayColumnCollector,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.binary_column_collector import (
+     BinaryColumnCollector,
+ )
+
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.boolean_column_collector import (
+     BooleanColumnCollector,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.date_column_collector import (
+     DateColumnCollector,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.day_time_interval_column_collector import (
+     DayTimeIntervalColumnCollector,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.decimal_column_collector import (
+     DecimalColumnCollector,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.empty_column_collector import (
+     EmptyColumnCollector,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.map_column_collector import (
+     MapColumnCollector,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.null_column_collector import (
+     NullColumnCollector,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.numeric_column_collector import (
+     NumericColumnCollector,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.string_column_collector import (
+     StringColumnCollector,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.struct_column_collector import (
+     StructColumnCollector,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.timestamp_column_collector import (
+     TimestampColumnCollector,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.timestamp_ntz_column_collector import (
+     TimestampNTZColumnCollector,
+ )
snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py
@@ -0,0 +1,113 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from statistics import mean
+
+ from pyspark.sql import DataFrame as SparkDataFrame
+ from pyspark.sql.functions import array as spark_array
+ from pyspark.sql.functions import coalesce as spark_coalesce
+ from pyspark.sql.functions import col as spark_col
+ from pyspark.sql.functions import explode as spark_explode
+ from pyspark.sql.functions import size as spark_size
+ from pyspark.sql.types import StructField
+
+ from snowflake.snowpark_checkpoints_collector.collection_common import (
+     COLUMN_ALLOW_NULL_KEY,
+     COLUMN_IS_UNIQUE_SIZE_KEY,
+     COLUMN_MAX_SIZE_KEY,
+     COLUMN_MEAN_SIZE_KEY,
+     COLUMN_MIN_SIZE_KEY,
+     COLUMN_NULL_VALUE_PROPORTION_KEY,
+     COLUMN_SIZE_KEY,
+     COLUMN_VALUE_KEY,
+     COLUMN_VALUE_TYPE_KEY,
+     CONTAINS_NULL_KEY,
+     ELEMENT_TYPE_KEY,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+     ColumnCollectorBase,
+ )
+
+
+ class ArrayColumnCollector(ColumnCollectorBase):
+
+     """Class for collect an array type column.
+
+     Attributes:
+         name (str): the name of the column.
+         type (str): the type of the column.
+         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+         column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+     """
+
+     def __init__(
+         self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+     ) -> None:
+         """Init ArrayColumnCollector.
+
+         Args:
+             clm_name (str): the name of the column.
+             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+             clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+         """
+         super().__init__(clm_name, struct_field, clm_df)
+         self._array_size_collection = self._compute_array_size_collection()
+
+     def get_custom_data(self) -> dict[str, any]:
+         data_type_dict = self.struct_field.dataType.jsonValue()
+         array_type = data_type_dict[ELEMENT_TYPE_KEY]
+         allow_null = data_type_dict[CONTAINS_NULL_KEY]
+         null_value_proportion = (
+             self._compute_null_value_proportion() if allow_null else 0.0
+         )
+         array_max_size = max(self._array_size_collection)
+         array_min_size = min(self._array_size_collection)
+         array_mean_size = mean(self._array_size_collection)
+         all_array_have_same_size = array_max_size == array_min_size
+
+         custom_data_dict = {
+             COLUMN_VALUE_TYPE_KEY: array_type,
+             COLUMN_ALLOW_NULL_KEY: allow_null,
+             COLUMN_NULL_VALUE_PROPORTION_KEY: null_value_proportion,
+             COLUMN_MAX_SIZE_KEY: array_max_size,
+             COLUMN_MIN_SIZE_KEY: array_min_size,
+             COLUMN_MEAN_SIZE_KEY: array_mean_size,
+             COLUMN_IS_UNIQUE_SIZE_KEY: all_array_have_same_size,
+         }
+
+         return custom_data_dict
+
+     def _compute_array_size_collection(self) -> list[int]:
+         select_result = self.column_df.select(
+             spark_size(spark_coalesce(spark_col(self.name), spark_array([]))).alias(
+                 COLUMN_SIZE_KEY
+             )
+         ).collect()
+
+         size_collection = [row[COLUMN_SIZE_KEY] for row in select_result]
+
+         return size_collection
+
+     def _compute_null_value_proportion(self) -> float:
+         select_result = self.column_df.select(
+             spark_explode(spark_col(self.name)).alias(COLUMN_VALUE_KEY)
+         )
+
+         null_counter = select_result.where(spark_col(COLUMN_VALUE_KEY).isNull()).count()
+
+         total_values = sum(self._array_size_collection)
+         null_value_proportion = (null_counter / total_values) * 100
+         return null_value_proportion
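
A minimal usage sketch of the new collector, not part of the diff; it assumes pyspark and this package are installed and runnable locally:

from pyspark.sql import SparkSession

from snowflake.snowpark_checkpoints_collector.column_collection.model import (
    ArrayColumnCollector,
)

spark = SparkSession.builder.getOrCreate()
# Three rows: arrays of size 3 and 1, plus a NULL row that coalesce()
# turns into an empty array of size 0.
df = spark.createDataFrame([([1, 2, None],), ([3],), (None,)], "arr array<int>")

collector = ArrayColumnCollector("arr", df.schema["arr"], df.select("arr"))
print(collector.get_data())
# Size stats come out as max=3, min=0, mean=1.33...; the null-value
# proportion is (1 / 4) * 100 = 25.0, since one of the four collected
# array elements is NULL. Key names come from collection_common.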
snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py
@@ -0,0 +1,87 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from statistics import mean
+
+ from pyspark.sql import DataFrame as SparkDataFrame
+ from pyspark.sql.functions import coalesce as spark_coalesce
+ from pyspark.sql.functions import col as spark_col
+ from pyspark.sql.functions import length as spark_length
+ from pyspark.sql.functions import lit as spark_lit
+ from pyspark.sql.functions import to_binary as spark_to_binary
+ from pyspark.sql.types import StructField
+
+ from snowflake.snowpark_checkpoints_collector.collection_common import (
+     COLUMN_IS_UNIQUE_SIZE_KEY,
+     COLUMN_MAX_SIZE_KEY,
+     COLUMN_MEAN_SIZE_KEY,
+     COLUMN_MIN_SIZE_KEY,
+     COLUMN_SIZE_KEY,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+     ColumnCollectorBase,
+ )
+
+
+ class BinaryColumnCollector(ColumnCollectorBase):
+
+     """Class for collect a binary type column.
+
+     Attributes:
+         name (str): the name of the column.
+         type (str): the type of the column.
+         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+         column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+     """
+
+     def __init__(
+         self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+     ) -> None:
+         """Init BinaryColumnCollector.
+
+         Args:
+             clm_name (str): the name of the column.
+             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+             clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+         """
+         super().__init__(clm_name, struct_field, clm_df)
+         self._binary_size_collection = self._compute_binary_size_collection()
+
+     def get_custom_data(self) -> dict[str, any]:
+         max_size_value = max(self._binary_size_collection)
+         min_size_value = min(self._binary_size_collection)
+         mean_size_value = mean(self._binary_size_collection)
+         all_elements_have_same_size = max_size_value == min_size_value
+
+         custom_data_dict = {
+             COLUMN_MAX_SIZE_KEY: max_size_value,
+             COLUMN_MIN_SIZE_KEY: min_size_value,
+             COLUMN_MEAN_SIZE_KEY: mean_size_value,
+             COLUMN_IS_UNIQUE_SIZE_KEY: all_elements_have_same_size,
+         }
+
+         return custom_data_dict
+
+     def _compute_binary_size_collection(self) -> list[int]:
+         select_result = self.column_df.select(
+             spark_length(
+                 spark_coalesce(spark_col(self.name), spark_to_binary(spark_lit(b"")))
+             ).alias(COLUMN_SIZE_KEY)
+         ).collect()
+
+         size_collection = [row[COLUMN_SIZE_KEY] for row in select_result]
+
+         return size_collection
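
The same coalesce trick is used for binary values: NULL rows are replaced with b"" before length() is applied, so they enter the size statistics as 0. An illustrative sketch, under the same local-SparkSession assumptions as above:

from pyspark.sql import SparkSession

from snowflake.snowpark_checkpoints_collector.column_collection.model import (
    BinaryColumnCollector,
)

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(b"\x00\x01",), (b"\x00",), (None,)], "b binary")

collector = BinaryColumnCollector("b", df.schema["b"], df.select("b"))
print(collector.get_custom_data())
# Measured sizes are [2, 1, 0] -> max=2, min=0, mean=1.0, and the
# uniform-size flag is False because max != min.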
snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py
@@ -0,0 +1,71 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from pyspark.sql import DataFrame as SparkDataFrame
+ from pyspark.sql.types import StructField
+
+ from snowflake.snowpark_checkpoints_collector.collection_common import (
+     COLUMN_FALSE_COUNT_KEY,
+     COLUMN_TRUE_COUNT_KEY,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+     ColumnCollectorBase,
+ )
+
+
+ TRUE_KEY = "True"
+ FALSE_KEY = "False"
+ NONE_KEY = "None"
+
+
+ class BooleanColumnCollector(ColumnCollectorBase):
+
+     """Class for collect a boolean type column.
+
+     Attributes:
+         name (str): the name of the column.
+         type (str): the type of the column.
+         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+         column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+     """
+
+     def __init__(
+         self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+     ) -> None:
+         """Init BooleanColumnCollector.
+
+         Args:
+             clm_name (str): the name of the column.
+             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+             clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+         """
+         super().__init__(clm_name, struct_field, clm_df)
+
+     def get_custom_data(self) -> dict[str, any]:
+         count_dict = self._get_count_dict()
+
+         custom_data_dict = {
+             COLUMN_TRUE_COUNT_KEY: count_dict.get(TRUE_KEY, 0),
+             COLUMN_FALSE_COUNT_KEY: count_dict.get(FALSE_KEY, 0),
+         }
+
+         return custom_data_dict
+
+     def _get_count_dict(self) -> dict[str, int]:
+         select_result = self.column_df.groupby(self.name).count().collect()
+         count_dict = {str(row[0]): row[1] for row in select_result}
+         return count_dict
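
Because the group-by keys are stringified, NULL rows land under the "None" key and are deliberately left out of the true/false counts. A sketch, under the same assumptions as above (the output key names come from collection_common):

from pyspark.sql import SparkSession

from snowflake.snowpark_checkpoints_collector.column_collection.model import (
    BooleanColumnCollector,
)

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(True,), (True,), (False,), (None,)], "flag boolean")

collector = BooleanColumnCollector("flag", df.schema["flag"], df.select("flag"))
print(collector.get_custom_data())
# True count 2, False count 1; the single NULL row is grouped under
# "None" in the intermediate dict and excluded from the custom data.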
snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py
@@ -0,0 +1,95 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from abc import ABC, abstractmethod
+
+ from pyspark.sql import DataFrame as SparkDataFrame
+ from pyspark.sql.types import StructField
+
+ from snowflake.snowpark_checkpoints_collector.collection_common import (
+     COLUMN_COUNT_KEY,
+     COLUMN_IS_NULLABLE_KEY,
+     COLUMN_NAME_KEY,
+     COLUMN_ROWS_NOT_NULL_COUNT_KEY,
+     COLUMN_ROWS_NULL_COUNT_KEY,
+     COLUMN_TYPE_KEY,
+ )
+
+
+ class ColumnCollectorBase(ABC):
+
+     """Base class for column collector based on type.
+
+     Attributes:
+         name (str): the name of the column.
+         type (str): the type of the column.
+         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+         column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+     """
+
+     def __init__(
+         self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+     ) -> None:
+         """Init ColumnCollectorBase.
+
+         Args:
+             clm_name (str): the name of the column.
+             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+             clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+         """
+         self.name = clm_name
+         self.type = struct_field.dataType.typeName()
+         self.struct_field = struct_field
+         self.column_df = clm_df
+
+     @abstractmethod
+     def get_custom_data(self) -> dict[str, any]:
+         """Get the custom data of the column.
+
+         Returns:
+             dict[str, any]: The data collected.
+
+         """
+         pass
+
+     def _get_common_data(self) -> dict[str, any]:
+         column_size = self.column_df.count()
+         rows_not_null_count = self.column_df.dropna().count()
+         rows_null_count = column_size - rows_not_null_count
+
+         common_data_dict = {
+             COLUMN_NAME_KEY: self.name,
+             COLUMN_TYPE_KEY: self.type,
+             COLUMN_IS_NULLABLE_KEY: self.struct_field.nullable,
+             COLUMN_COUNT_KEY: column_size,
+             COLUMN_ROWS_NOT_NULL_COUNT_KEY: rows_not_null_count,
+             COLUMN_ROWS_NULL_COUNT_KEY: rows_null_count,
+         }
+
+         return common_data_dict
+
+     def get_data(self) -> dict[str, any]:
+         """Get the data collected of the column.
+
+         Returns:
+             dict[str, any]: The data collected.
+
+         """
+         common_data = self._get_common_data()
+         custom_data = self.get_custom_data()
+         column_data = dict(common_data | custom_data)
+         return column_data
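
The base class is a small template method: get_data() unions the shared name/type/count/null-count fields with whatever the subclass returns from get_custom_data(), via the dict | operator (Python 3.9+). A hypothetical subclass, shown only to illustrate the contract, needs just the custom part:

from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
    ColumnCollectorBase,
)


class ConstantColumnCollector(ColumnCollectorBase):
    """Hypothetical collector used only to illustrate the base contract."""

    def get_custom_data(self) -> dict[str, any]:
        # Sketch: report the first non-null value as a sample; the key
        # name "constant_value" is illustrative, not from the package.
        first_row = self.column_df.dropna().first()
        return {"constant_value": first_row[0] if first_row is not None else None}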
snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py
@@ -0,0 +1,74 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from pyspark.sql import DataFrame as SparkDataFrame
+ from pyspark.sql.functions import col as spark_col
+ from pyspark.sql.functions import max as spark_max
+ from pyspark.sql.functions import min as spark_min
+ from pyspark.sql.types import StructField
+
+ from snowflake.snowpark_checkpoints_collector.collection_common import (
+     COLUMN_FORMAT_KEY,
+     COLUMN_MAX_KEY,
+     COLUMN_MIN_KEY,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+     ColumnCollectorBase,
+ )
+
+
+ FORMAT = "%Y-%m-%d"
+
+
+ class DateColumnCollector(ColumnCollectorBase):
+
+     """Class for collect a date type column.
+
+     Attributes:
+         name (str): the name of the column.
+         type (str): the type of the column.
+         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+         column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+     """
+
+     def __init__(
+         self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+     ) -> None:
+         """Init DateColumnCollector.
+
+         Args:
+             clm_name (str): the name of the column.
+             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+             clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+         """
+         super().__init__(clm_name, struct_field, clm_df)
+
+     def get_custom_data(self) -> dict[str, any]:
+         select_result = self.column_df.select(
+             spark_min(spark_col(self.name)).alias(COLUMN_MIN_KEY),
+             spark_max(spark_col(self.name)).alias(COLUMN_MAX_KEY),
+         ).collect()[0]
+
+         min_value = str(select_result[COLUMN_MIN_KEY])
+         max_value = str(select_result[COLUMN_MAX_KEY])
+
+         custom_data_dict = {
+             COLUMN_MIN_KEY: min_value,
+             COLUMN_MAX_KEY: max_value,
+             COLUMN_FORMAT_KEY: FORMAT,
+         }
+
+         return custom_data_dict
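
Date collection (and day-time interval collection, below) follows the simplest pattern: one select computes min and max in a single pass, and both are stringified so they serialize cleanly next to the declared "%Y-%m-%d" format. A sketch, under the same assumptions as above:

import datetime

from pyspark.sql import SparkSession

from snowflake.snowpark_checkpoints_collector.column_collection.model import (
    DateColumnCollector,
)

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(datetime.date(2025, 1, 1),), (datetime.date(2025, 6, 30),)], "d date"
)

collector = DateColumnCollector("d", df.schema["d"], df.select("d"))
print(collector.get_custom_data())
# -> min "2025-01-01", max "2025-06-30", plus the format string.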
snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py
@@ -0,0 +1,67 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from pyspark.sql import DataFrame as SparkDataFrame
+ from pyspark.sql.functions import col as spark_col
+ from pyspark.sql.functions import max as spark_max
+ from pyspark.sql.functions import min as spark_min
+ from pyspark.sql.types import StructField
+
+ from snowflake.snowpark_checkpoints_collector.collection_common import (
+     COLUMN_MAX_KEY,
+     COLUMN_MIN_KEY,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+     ColumnCollectorBase,
+ )
+
+
+ class DayTimeIntervalColumnCollector(ColumnCollectorBase):
+
+     """Class for collect a date time interval type column.
+
+     Attributes:
+         name (str): the name of the column.
+         type (str): the type of the column.
+         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+         column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+     """
+
+     def __init__(
+         self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+     ) -> None:
+         """Init DayTimeIntervalColumnCollector.
+
+         Args:
+             clm_name (str): the name of the column.
+             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+             clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+         """
+         super().__init__(clm_name, struct_field, clm_df)
+
+     def get_custom_data(self) -> dict[str, any]:
+         select_result = self.column_df.select(
+             spark_min(spark_col(self.name)).alias(COLUMN_MIN_KEY),
+             spark_max(spark_col(self.name)).alias(COLUMN_MAX_KEY),
+         ).collect()[0]
+
+         min_value = str(select_result[COLUMN_MIN_KEY])
+         max_value = str(select_result[COLUMN_MAX_KEY])
+
+         custom_data_dict = {COLUMN_MIN_KEY: min_value, COLUMN_MAX_KEY: max_value}
+
+         return custom_data_dict
snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py
@@ -0,0 +1,92 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from pyspark.sql import DataFrame as SparkDataFrame
+ from pyspark.sql.functions import col as spark_col
+ from pyspark.sql.functions import max as spark_max
+ from pyspark.sql.functions import mean as spark_mean
+ from pyspark.sql.functions import min as spark_min
+ from pyspark.sql.types import StructField
+
+ from snowflake.snowpark_checkpoints_collector.collection_common import (
+     COLUMN_DECIMAL_PRECISION_KEY,
+     COLUMN_MAX_KEY,
+     COLUMN_MEAN_KEY,
+     COLUMN_MIN_KEY,
+     get_decimal_token,
+ )
+ from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+     ColumnCollectorBase,
+ )
+
+
+ class DecimalColumnCollector(ColumnCollectorBase):
+
+     """Class for collect a decimal type column.
+
+     Attributes:
+         name (str): the name of the column.
+         type (str): the type of the column.
+         struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+         column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+     """
+
+     def __init__(
+         self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+     ) -> None:
+         """Init DecimalColumnCollector.
+
+         Args:
+             clm_name (str): the name of the column.
+             struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+             clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+         """
+         super().__init__(clm_name, struct_field, clm_df)
+
+     def get_custom_data(self) -> dict[str, any]:
+         select_result = self.column_df.select(
+             spark_min(spark_col(self.name)).alias(COLUMN_MIN_KEY),
+             spark_max(spark_col(self.name)).alias(COLUMN_MAX_KEY),
+             spark_mean(spark_col(self.name)).alias(COLUMN_MEAN_KEY),
+         ).collect()[0]
+
+         min_value = str(select_result[COLUMN_MIN_KEY])
+         max_value = str(select_result[COLUMN_MAX_KEY])
+         mean_value = str(select_result[COLUMN_MEAN_KEY])
+         decimal_precision = self._compute_decimal_precision()
+
+         custom_data_dict = {
+             COLUMN_MIN_KEY: min_value,
+             COLUMN_MAX_KEY: max_value,
+             COLUMN_MEAN_KEY: mean_value,
+             COLUMN_DECIMAL_PRECISION_KEY: decimal_precision,
+         }
+
+         return custom_data_dict
+
+     def _compute_decimal_precision(self) -> int:
+         decimal_part_index = 1
+         decimal_token = get_decimal_token()
+         value = self.column_df.dropna().collect()[0][0]
+         value_str = str(value)
+         value_split_by_token = value_str.split(decimal_token)
+         if len(value_split_by_token) == 1:
+             return 0
+
+         decimal_part = value_split_by_token[decimal_part_index]
+         decimal_digits_counted = len(decimal_part)
+         return decimal_digits_counted
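
One subtlety worth noting: _compute_decimal_precision() derives the digit count from the string form of the first non-null value only (collect()[0][0]), split on the locale-aware token returned by get_decimal_token(). A sketch, under the same assumptions as above:

from decimal import Decimal

from pyspark.sql import SparkSession

from snowflake.snowpark_checkpoints_collector.column_collection.model import (
    DecimalColumnCollector,
)

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(Decimal("12.345"),), (Decimal("7.5"),)], "d decimal(10,3)"
)

collector = DecimalColumnCollector("d", df.schema["d"], df.select("d"))
print(collector.get_custom_data())
# Here str(Decimal("12.345")) -> "12.345": three digits follow the
# decimal token, so the reported decimal precision is 3.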