snowpark-checkpoints-collectors 0.1.0rc2__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_checkpoints_collector/__init__.py +22 -0
- snowflake/snowpark_checkpoints_collector/collection_common.py +160 -0
- snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +24 -0
- snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +91 -0
- snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +69 -0
- snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +22 -0
- snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +253 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +75 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +113 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +87 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +71 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +95 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +74 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +67 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +92 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +88 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +120 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +49 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +108 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +70 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +102 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +75 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +75 -0
- snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +20 -0
- snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +223 -0
- snowflake/snowpark_checkpoints_collector/singleton.py +23 -0
- snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +20 -0
- snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +172 -0
- snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +366 -0
- snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +53 -0
- snowflake/snowpark_checkpoints_collector/utils/extra_config.py +112 -0
- snowflake/snowpark_checkpoints_collector/utils/file_utils.py +132 -0
- snowflake/snowpark_checkpoints_collector/utils/telemetry.py +889 -0
- snowpark_checkpoints_collectors-0.1.1.dist-info/METADATA +143 -0
- snowpark_checkpoints_collectors-0.1.1.dist-info/RECORD +37 -0
- {snowpark_checkpoints_collectors-0.1.0rc2.dist-info → snowpark_checkpoints_collectors-0.1.1.dist-info}/licenses/LICENSE +0 -25
- snowpark_checkpoints_collectors-0.1.0rc2.dist-info/METADATA +0 -347
- snowpark_checkpoints_collectors-0.1.0rc2.dist-info/RECORD +0 -4
- {snowpark_checkpoints_collectors-0.1.0rc2.dist-info → snowpark_checkpoints_collectors-0.1.1.dist-info}/WHEEL +0 -0
snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py
@@ -0,0 +1,75 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    "ArrayColumnCollector",
+    "BinaryColumnCollector",
+    "BooleanColumnCollector",
+    "DateColumnCollector",
+    "DayTimeIntervalColumnCollector",
+    "DecimalColumnCollector",
+    "EmptyColumnCollector",
+    "MapColumnCollector",
+    "NumericColumnCollector",
+    "NullColumnCollector",
+    "StringColumnCollector",
+    "StructColumnCollector",
+    "TimestampColumnCollector",
+    "TimestampNTZColumnCollector",
+]
+
+from snowflake.snowpark_checkpoints_collector.column_collection.model.array_column_collector import (
+    ArrayColumnCollector,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.binary_column_collector import (
+    BinaryColumnCollector,
+)
+
+from snowflake.snowpark_checkpoints_collector.column_collection.model.boolean_column_collector import (
+    BooleanColumnCollector,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.date_column_collector import (
+    DateColumnCollector,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.day_time_interval_column_collector import (
+    DayTimeIntervalColumnCollector,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.decimal_column_collector import (
+    DecimalColumnCollector,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.empty_column_collector import (
+    EmptyColumnCollector,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.map_column_collector import (
+    MapColumnCollector,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.null_column_collector import (
+    NullColumnCollector,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.numeric_column_collector import (
+    NumericColumnCollector,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.string_column_collector import (
+    StringColumnCollector,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.struct_column_collector import (
+    StructColumnCollector,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.timestamp_column_collector import (
+    TimestampColumnCollector,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.timestamp_ntz_column_collector import (
+    TimestampNTZColumnCollector,
+)
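This `__init__.py` makes the model package the single import surface for every per-type collector. A minimal consumption sketch (hypothetical usage; assumes the wheel is installed):

from snowflake.snowpark_checkpoints_collector.column_collection.model import (
    ArrayColumnCollector,
    BooleanColumnCollector,
)

# Both names resolve to the concrete classes re-exported above, so callers
# never need to import the individual *_column_collector modules directly.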
snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py
@@ -0,0 +1,113 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from statistics import mean
+
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.functions import array as spark_array
+from pyspark.sql.functions import coalesce as spark_coalesce
+from pyspark.sql.functions import col as spark_col
+from pyspark.sql.functions import explode as spark_explode
+from pyspark.sql.functions import size as spark_size
+from pyspark.sql.types import StructField
+
+from snowflake.snowpark_checkpoints_collector.collection_common import (
+    COLUMN_ALLOW_NULL_KEY,
+    COLUMN_IS_UNIQUE_SIZE_KEY,
+    COLUMN_MAX_SIZE_KEY,
+    COLUMN_MEAN_SIZE_KEY,
+    COLUMN_MIN_SIZE_KEY,
+    COLUMN_NULL_VALUE_PROPORTION_KEY,
+    COLUMN_SIZE_KEY,
+    COLUMN_VALUE_KEY,
+    COLUMN_VALUE_TYPE_KEY,
+    CONTAINS_NULL_KEY,
+    ELEMENT_TYPE_KEY,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+    ColumnCollectorBase,
+)
+
+
+class ArrayColumnCollector(ColumnCollectorBase):
+
+    """Class for collect an array type column.
+
+    Attributes:
+        name (str): the name of the column.
+        type (str): the type of the column.
+        struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+    """
+
+    def __init__(
+        self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+    ) -> None:
+        """Init ArrayColumnCollector.
+
+        Args:
+            clm_name (str): the name of the column.
+            struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+        """
+        super().__init__(clm_name, struct_field, clm_df)
+        self._array_size_collection = self._compute_array_size_collection()
+
+    def get_custom_data(self) -> dict[str, any]:
+        data_type_dict = self.struct_field.dataType.jsonValue()
+        array_type = data_type_dict[ELEMENT_TYPE_KEY]
+        allow_null = data_type_dict[CONTAINS_NULL_KEY]
+        null_value_proportion = (
+            self._compute_null_value_proportion() if allow_null else 0.0
+        )
+        array_max_size = max(self._array_size_collection)
+        array_min_size = min(self._array_size_collection)
+        array_mean_size = mean(self._array_size_collection)
+        all_array_have_same_size = array_max_size == array_min_size
+
+        custom_data_dict = {
+            COLUMN_VALUE_TYPE_KEY: array_type,
+            COLUMN_ALLOW_NULL_KEY: allow_null,
+            COLUMN_NULL_VALUE_PROPORTION_KEY: null_value_proportion,
+            COLUMN_MAX_SIZE_KEY: array_max_size,
+            COLUMN_MIN_SIZE_KEY: array_min_size,
+            COLUMN_MEAN_SIZE_KEY: array_mean_size,
+            COLUMN_IS_UNIQUE_SIZE_KEY: all_array_have_same_size,
+        }
+
+        return custom_data_dict
+
+    def _compute_array_size_collection(self) -> list[int]:
+        select_result = self.column_df.select(
+            spark_size(spark_coalesce(spark_col(self.name), spark_array([]))).alias(
+                COLUMN_SIZE_KEY
+            )
+        ).collect()
+
+        size_collection = [row[COLUMN_SIZE_KEY] for row in select_result]
+
+        return size_collection
+
+    def _compute_null_value_proportion(self) -> float:
+        select_result = self.column_df.select(
+            spark_explode(spark_col(self.name)).alias(COLUMN_VALUE_KEY)
+        )
+
+        null_counter = select_result.where(spark_col(COLUMN_VALUE_KEY).isNull()).count()
+
+        total_values = sum(self._array_size_collection)
+        null_value_proportion = (null_counter / total_values) * 100
+        return null_value_proportion
+
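The size statistics above hinge on coalescing null arrays to an empty array so they count as size 0 instead of null. A hedged sketch of that expression in isolation (assumes a local SparkSession; not part of the package):

from pyspark.sql import SparkSession
from pyspark.sql.functions import array, coalesce, col, size

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([([1, 2],), (None,), ([3],)], "arr: array<int>")
# Same expression as _compute_array_size_collection: null rows become size 0.
df.select(size(coalesce(col("arr"), array([]))).alias("size")).show()
# expected sizes: 2, 0, 1 -> max=2, min=0, mean=1.0, is_unique_size=False

Note that despite the name, _compute_null_value_proportion multiplies by 100, so the recorded value is a percentage of null elements across all exploded array values.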
snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py
@@ -0,0 +1,87 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from statistics import mean
+
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.functions import coalesce as spark_coalesce
+from pyspark.sql.functions import col as spark_col
+from pyspark.sql.functions import length as spark_length
+from pyspark.sql.functions import lit as spark_lit
+from pyspark.sql.functions import to_binary as spark_to_binary
+from pyspark.sql.types import StructField
+
+from snowflake.snowpark_checkpoints_collector.collection_common import (
+    COLUMN_IS_UNIQUE_SIZE_KEY,
+    COLUMN_MAX_SIZE_KEY,
+    COLUMN_MEAN_SIZE_KEY,
+    COLUMN_MIN_SIZE_KEY,
+    COLUMN_SIZE_KEY,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+    ColumnCollectorBase,
+)
+
+
+class BinaryColumnCollector(ColumnCollectorBase):
+
+    """Class for collect a binary type column.
+
+    Attributes:
+        name (str): the name of the column.
+        type (str): the type of the column.
+        struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+    """
+
+    def __init__(
+        self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+    ) -> None:
+        """Init BinaryColumnCollector.
+
+        Args:
+            clm_name (str): the name of the column.
+            struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+        """
+        super().__init__(clm_name, struct_field, clm_df)
+        self._binary_size_collection = self._compute_binary_size_collection()
+
+    def get_custom_data(self) -> dict[str, any]:
+        max_size_value = max(self._binary_size_collection)
+        min_size_value = min(self._binary_size_collection)
+        mean_size_value = mean(self._binary_size_collection)
+        all_elements_have_same_size = max_size_value == min_size_value
+
+        custom_data_dict = {
+            COLUMN_MAX_SIZE_KEY: max_size_value,
+            COLUMN_MIN_SIZE_KEY: min_size_value,
+            COLUMN_MEAN_SIZE_KEY: mean_size_value,
+            COLUMN_IS_UNIQUE_SIZE_KEY: all_elements_have_same_size,
+        }
+
+        return custom_data_dict
+
+    def _compute_binary_size_collection(self) -> list[int]:
+        select_result = self.column_df.select(
+            spark_length(
+                spark_coalesce(spark_col(self.name), spark_to_binary(spark_lit(b"")))
+            ).alias(COLUMN_SIZE_KEY)
+        ).collect()
+
+        size_collection = [row[COLUMN_SIZE_KEY] for row in select_result]
+
+        return size_collection
+
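The per-row byte sizes come from length() over a null-safe coalesce, the same pattern as the array collector. A hedged sketch of the core expression (assumes a local SparkSession; it coalesces to a plain binary literal, whereas the collector additionally wraps the empty literal in to_binary):

from pyspark.sql import SparkSession
from pyspark.sql.functions import coalesce, col, length, lit

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(b"\x00\x01",), (None,), (b"\xff",)], "payload: binary")
# length() on a binary column returns the byte count; nulls are measured as
# zero-length rather than dropped.
df.select(length(coalesce(col("payload"), lit(b""))).alias("size")).show()
# expected sizes: 2, 0, 1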
snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py
@@ -0,0 +1,71 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.types import StructField
+
+from snowflake.snowpark_checkpoints_collector.collection_common import (
+    COLUMN_FALSE_COUNT_KEY,
+    COLUMN_TRUE_COUNT_KEY,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+    ColumnCollectorBase,
+)
+
+
+TRUE_KEY = "True"
+FALSE_KEY = "False"
+NONE_KEY = "None"
+
+
+class BooleanColumnCollector(ColumnCollectorBase):
+
+    """Class for collect a boolean type column.
+
+    Attributes:
+        name (str): the name of the column.
+        type (str): the type of the column.
+        struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+    """
+
+    def __init__(
+        self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+    ) -> None:
+        """Init BooleanColumnCollector.
+
+        Args:
+            clm_name (str): the name of the column.
+            struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+        """
+        super().__init__(clm_name, struct_field, clm_df)
+
+    def get_custom_data(self) -> dict[str, any]:
+        count_dict = self._get_count_dict()
+
+        custom_data_dict = {
+            COLUMN_TRUE_COUNT_KEY: count_dict.get(TRUE_KEY, 0),
+            COLUMN_FALSE_COUNT_KEY: count_dict.get(FALSE_KEY, 0),
+        }
+
+        return custom_data_dict
+
+    def _get_count_dict(self) -> dict[str, int]:
+        select_result = self.column_df.groupby(self.name).count().collect()
+        count_dict = {str(row[0]): row[1] for row in select_result}
+        return count_dict
+
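_get_count_dict stringifies the group key from a plain groupby().count(), which is why TRUE_KEY and FALSE_KEY are the strings "True" and "False". A small sketch of the same aggregation (assumes a local SparkSession; not part of the package):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(True,), (True,), (False,), (None,)], "flag: boolean")
rows = df.groupby("flag").count().collect()
count_dict = {str(row[0]): row[1] for row in rows}
print(count_dict)  # e.g. {'True': 2, 'False': 1, 'None': 1}
# NONE_KEY ("None") is defined above but never read in get_custom_data, so null
# counts surface only through the base class's rows_null_count.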
snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py
@@ -0,0 +1,95 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.types import StructField
+
+from snowflake.snowpark_checkpoints_collector.collection_common import (
+    COLUMN_COUNT_KEY,
+    COLUMN_IS_NULLABLE_KEY,
+    COLUMN_NAME_KEY,
+    COLUMN_ROWS_NOT_NULL_COUNT_KEY,
+    COLUMN_ROWS_NULL_COUNT_KEY,
+    COLUMN_TYPE_KEY,
+)
+
+
+class ColumnCollectorBase(ABC):
+
+    """Base class for column collector based on type.
+
+    Attributes:
+        name (str): the name of the column.
+        type (str): the type of the column.
+        struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+    """
+
+    def __init__(
+        self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+    ) -> None:
+        """Init ColumnCollectorBase.
+
+        Args:
+            clm_name (str): the name of the column.
+            struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+        """
+        self.name = clm_name
+        self.type = struct_field.dataType.typeName()
+        self.struct_field = struct_field
+        self.column_df = clm_df
+
+    @abstractmethod
+    def get_custom_data(self) -> dict[str, any]:
+        """Get the custom data of the column.
+
+        Returns:
+            dict[str, any]: The data collected.
+
+        """
+        pass
+
+    def _get_common_data(self) -> dict[str, any]:
+        column_size = self.column_df.count()
+        rows_not_null_count = self.column_df.dropna().count()
+        rows_null_count = column_size - rows_not_null_count
+
+        common_data_dict = {
+            COLUMN_NAME_KEY: self.name,
+            COLUMN_TYPE_KEY: self.type,
+            COLUMN_IS_NULLABLE_KEY: self.struct_field.nullable,
+            COLUMN_COUNT_KEY: column_size,
+            COLUMN_ROWS_NOT_NULL_COUNT_KEY: rows_not_null_count,
+            COLUMN_ROWS_NULL_COUNT_KEY: rows_null_count,
+        }
+
+        return common_data_dict
+
+    def get_data(self) -> dict[str, any]:
+        """Get the data collected of the column.
+
+        Returns:
+            dict[str, any]: The data collected.
+
+        """
+        common_data = self._get_common_data()
+        custom_data = self.get_custom_data()
+        column_data = dict(common_data | custom_data)
+        return column_data
+
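get_data() merges the shared statistics with each subclass's get_custom_data() via the dict union operator (Python 3.9+), so a custom key would shadow a common one. A hedged end-to-end sketch (hypothetical usage; assumes the wheel is installed and a local SparkSession):

from pyspark.sql import SparkSession
from snowflake.snowpark_checkpoints_collector.column_collection.model import (
    BooleanColumnCollector,
)

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(True,), (None,)], "flag: boolean")
# df.schema["flag"] is the StructField the constructor expects.
collector = BooleanColumnCollector("flag", df.schema["flag"], df.select("flag"))
print(collector.get_data())
# -> name/type/nullable plus count=2, rows_not_null_count=1, rows_null_count=1,
#    merged with the boolean-specific true/false counts.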
snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py
@@ -0,0 +1,74 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.functions import col as spark_col
+from pyspark.sql.functions import max as spark_max
+from pyspark.sql.functions import min as spark_min
+from pyspark.sql.types import StructField
+
+from snowflake.snowpark_checkpoints_collector.collection_common import (
+    COLUMN_FORMAT_KEY,
+    COLUMN_MAX_KEY,
+    COLUMN_MIN_KEY,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+    ColumnCollectorBase,
+)
+
+
+FORMAT = "%Y-%m-%d"
+
+
+class DateColumnCollector(ColumnCollectorBase):
+
+    """Class for collect a date type column.
+
+    Attributes:
+        name (str): the name of the column.
+        type (str): the type of the column.
+        struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+    """
+
+    def __init__(
+        self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+    ) -> None:
+        """Init DateColumnCollector.
+
+        Args:
+            clm_name (str): the name of the column.
+            struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+        """
+        super().__init__(clm_name, struct_field, clm_df)
+
+    def get_custom_data(self) -> dict[str, any]:
+        select_result = self.column_df.select(
+            spark_min(spark_col(self.name)).alias(COLUMN_MIN_KEY),
+            spark_max(spark_col(self.name)).alias(COLUMN_MAX_KEY),
+        ).collect()[0]
+
+        min_value = str(select_result[COLUMN_MIN_KEY])
+        max_value = str(select_result[COLUMN_MAX_KEY])
+
+        custom_data_dict = {
+            COLUMN_MIN_KEY: min_value,
+            COLUMN_MAX_KEY: max_value,
+            COLUMN_FORMAT_KEY: FORMAT,
+        }
+
+        return custom_data_dict
+
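The min/max come back from PySpark as datetime.date objects and are stringified, and str() on a date already emits ISO "%Y-%m-%d", so the hard-coded FORMAT constant matches what lands in the recorded values. A quick check of that in plain Python:

import datetime

d = datetime.date(2025, 1, 31)
assert str(d) == d.strftime("%Y-%m-%d") == "2025-01-31"
# str(date) is ISO 8601, exactly the FORMAT recorded alongside min/max.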
snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py
@@ -0,0 +1,67 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.functions import col as spark_col
+from pyspark.sql.functions import max as spark_max
+from pyspark.sql.functions import min as spark_min
+from pyspark.sql.types import StructField
+
+from snowflake.snowpark_checkpoints_collector.collection_common import (
+    COLUMN_MAX_KEY,
+    COLUMN_MIN_KEY,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+    ColumnCollectorBase,
+)
+
+
+class DayTimeIntervalColumnCollector(ColumnCollectorBase):
+
+    """Class for collect a date time interval type column.
+
+    Attributes:
+        name (str): the name of the column.
+        type (str): the type of the column.
+        struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+    """
+
+    def __init__(
+        self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+    ) -> None:
+        """Init DayTimeIntervalColumnCollector.
+
+        Args:
+            clm_name (str): the name of the column.
+            struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+        """
+        super().__init__(clm_name, struct_field, clm_df)
+
+    def get_custom_data(self) -> dict[str, any]:
+        select_result = self.column_df.select(
+            spark_min(spark_col(self.name)).alias(COLUMN_MIN_KEY),
+            spark_max(spark_col(self.name)).alias(COLUMN_MAX_KEY),
+        ).collect()[0]
+
+        min_value = str(select_result[COLUMN_MIN_KEY])
+        max_value = str(select_result[COLUMN_MAX_KEY])
+
+        custom_data_dict = {COLUMN_MIN_KEY: min_value, COLUMN_MAX_KEY: max_value}
+
+        return custom_data_dict
+
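Same min/max pattern as the date collector, but PySpark returns DayTimeIntervalType values as datetime.timedelta, so the recorded strings take timedelta's repr shape rather than a date format. A plain-Python illustration:

import datetime

td = datetime.timedelta(days=1, hours=2, seconds=3)
print(str(td))  # '1 day, 2:00:03' -> the shape of the recorded min/max strings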
snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py
@@ -0,0 +1,92 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.functions import col as spark_col
+from pyspark.sql.functions import max as spark_max
+from pyspark.sql.functions import mean as spark_mean
+from pyspark.sql.functions import min as spark_min
+from pyspark.sql.types import StructField
+
+from snowflake.snowpark_checkpoints_collector.collection_common import (
+    COLUMN_DECIMAL_PRECISION_KEY,
+    COLUMN_MAX_KEY,
+    COLUMN_MEAN_KEY,
+    COLUMN_MIN_KEY,
+    get_decimal_token,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+    ColumnCollectorBase,
+)
+
+
+class DecimalColumnCollector(ColumnCollectorBase):
+
+    """Class for collect a decimal type column.
+
+    Attributes:
+        name (str): the name of the column.
+        type (str): the type of the column.
+        struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+    """
+
+    def __init__(
+        self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+    ) -> None:
+        """Init DecimalColumnCollector.
+
+        Args:
+            clm_name (str): the name of the column.
+            struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+        """
+        super().__init__(clm_name, struct_field, clm_df)
+
+    def get_custom_data(self) -> dict[str, any]:
+        select_result = self.column_df.select(
+            spark_min(spark_col(self.name)).alias(COLUMN_MIN_KEY),
+            spark_max(spark_col(self.name)).alias(COLUMN_MAX_KEY),
+            spark_mean(spark_col(self.name)).alias(COLUMN_MEAN_KEY),
+        ).collect()[0]
+
+        min_value = str(select_result[COLUMN_MIN_KEY])
+        max_value = str(select_result[COLUMN_MAX_KEY])
+        mean_value = str(select_result[COLUMN_MEAN_KEY])
+        decimal_precision = self._compute_decimal_precision()
+
+        custom_data_dict = {
+            COLUMN_MIN_KEY: min_value,
+            COLUMN_MAX_KEY: max_value,
+            COLUMN_MEAN_KEY: mean_value,
+            COLUMN_DECIMAL_PRECISION_KEY: decimal_precision,
+        }
+
+        return custom_data_dict
+
+    def _compute_decimal_precision(self) -> int:
+        decimal_part_index = 1
+        decimal_token = get_decimal_token()
+        value = self.column_df.dropna().collect()[0][0]
+        value_str = str(value)
+        value_split_by_token = value_str.split(decimal_token)
+        if len(value_split_by_token) == 1:
+            return 0
+
+        decimal_part = value_split_by_token[decimal_part_index]
+        decimal_digits_counted = len(decimal_part)
+        return decimal_digits_counted
+
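_compute_decimal_precision infers the scale from a single non-null sample: it splits the stringified value on the decimal token returned by get_decimal_token() and counts the digits after it, returning 0 when no token is present. A plain-Python sketch of that logic (assuming "." as the token; compute_decimal_precision here is an illustrative stand-in, not a package function):

from decimal import Decimal

def compute_decimal_precision(value, decimal_token: str = ".") -> int:
    # Mirrors the collector: split on the token, count fractional digits.
    parts = str(value).split(decimal_token)
    return 0 if len(parts) == 1 else len(parts[1])

assert compute_decimal_precision(Decimal("12.3450")) == 4
assert compute_decimal_precision(Decimal("7")) == 0
# Because only the first non-null row is inspected, the scale of that single
# sample (trailing zeros included) decides the recorded precision.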