snowpark-checkpoints-collectors 0.1.0rc2__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- snowflake/snowpark_checkpoints_collector/__init__.py +22 -0
- snowflake/snowpark_checkpoints_collector/collection_common.py +160 -0
- snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +24 -0
- snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +91 -0
- snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +69 -0
- snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +22 -0
- snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +253 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +75 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +113 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +87 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +71 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +95 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +74 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +67 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +92 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +88 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +120 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +49 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +108 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +70 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +102 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +75 -0
- snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +75 -0
- snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +20 -0
- snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +223 -0
- snowflake/snowpark_checkpoints_collector/singleton.py +23 -0
- snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +20 -0
- snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +172 -0
- snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +366 -0
- snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +53 -0
- snowflake/snowpark_checkpoints_collector/utils/extra_config.py +112 -0
- snowflake/snowpark_checkpoints_collector/utils/file_utils.py +132 -0
- snowflake/snowpark_checkpoints_collector/utils/telemetry.py +889 -0
- snowpark_checkpoints_collectors-0.1.1.dist-info/METADATA +143 -0
- snowpark_checkpoints_collectors-0.1.1.dist-info/RECORD +37 -0
- {snowpark_checkpoints_collectors-0.1.0rc2.dist-info → snowpark_checkpoints_collectors-0.1.1.dist-info}/licenses/LICENSE +0 -25
- snowpark_checkpoints_collectors-0.1.0rc2.dist-info/METADATA +0 -347
- snowpark_checkpoints_collectors-0.1.0rc2.dist-info/RECORD +0 -4
- {snowpark_checkpoints_collectors-0.1.0rc2.dist-info → snowpark_checkpoints_collectors-0.1.1.dist-info}/WHEEL +0 -0
snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py
ADDED
@@ -0,0 +1,88 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.types import StructField
+
+from snowflake.snowpark_checkpoints_collector.collection_common import (
+    ARRAY_COLUMN_TYPE,
+    COLUMN_ALLOW_NULL_KEY,
+    COLUMN_KEY_TYPE_KEY,
+    COLUMN_METADATA_KEY,
+    COLUMN_VALUE_TYPE_KEY,
+    CONTAINS_NULL_KEY,
+    ELEMENT_TYPE_KEY,
+    FIELD_METADATA_KEY,
+    FIELDS_KEY,
+    KEY_TYPE_KEY,
+    MAP_COLUMN_TYPE,
+    STRUCT_COLUMN_TYPE,
+    VALUE_CONTAINS_NULL_KEY,
+    VALUE_TYPE_KEY,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+    ColumnCollectorBase,
+)
+
+
+class EmptyColumnCollector(ColumnCollectorBase):
+
+    """Class for collect an empty column.
+
+    Attributes:
+        name (str): the name of the column.
+        type (str): the type of the column.
+        struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+    """
+
+    def __init__(
+        self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+    ) -> None:
+        """Init EmptyColumnCollector.
+
+        Args:
+            clm_name (str): the name of the column.
+            struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+        """
+        super().__init__(clm_name, struct_field, clm_df)
+
+    def get_custom_data(self) -> dict[str, any]:
+        custom_data = {}
+        data_type_dict = self.struct_field.dataType.jsonValue()
+        if self.type == ARRAY_COLUMN_TYPE:
+            custom_data[COLUMN_VALUE_TYPE_KEY] = data_type_dict[ELEMENT_TYPE_KEY]
+            custom_data[COLUMN_ALLOW_NULL_KEY] = data_type_dict[CONTAINS_NULL_KEY]
+
+        elif self.type == MAP_COLUMN_TYPE:
+            custom_data[COLUMN_KEY_TYPE_KEY] = data_type_dict[KEY_TYPE_KEY]
+            custom_data[COLUMN_VALUE_TYPE_KEY] = data_type_dict[VALUE_TYPE_KEY]
+            custom_data[COLUMN_ALLOW_NULL_KEY] = data_type_dict[VALUE_CONTAINS_NULL_KEY]
+
+        elif self.type == STRUCT_COLUMN_TYPE:
+            field_metadata_collection = []
+            struct_field_json = self.struct_field.dataType.jsonValue()
+            for field in struct_field_json[FIELDS_KEY]:
+                del field[FIELD_METADATA_KEY]
+                field_metadata_collection.append(field)
+            custom_data[COLUMN_METADATA_KEY] = field_metadata_collection
+
+        else:
+            custom_data = {}
+
+        return custom_data
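For orientation, here is a minimal sketch (not part of the diff) of the schema JSON that get_custom_data walks; the sample fields are invented, but the jsonValue() output shapes are standard PySpark:

from pyspark.sql.types import ArrayType, IntegerType, MapType, StringType, StructField

# Array types serialize with the "elementType" and "containsNull" keys read above.
array_field = StructField("tags", ArrayType(StringType(), containsNull=True))
print(array_field.dataType.jsonValue())
# {'type': 'array', 'elementType': 'string', 'containsNull': True}

# Map types serialize with "keyType", "valueType", and "valueContainsNull".
map_field = StructField("scores", MapType(StringType(), IntegerType()))
print(map_field.dataType.jsonValue())
# {'type': 'map', 'keyType': 'string', 'valueType': 'integer', 'valueContainsNull': True}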
snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py
ADDED
@@ -0,0 +1,120 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from statistics import mean
+
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.functions import coalesce as spark_coalesce
+from pyspark.sql.functions import col as spark_col
+from pyspark.sql.functions import create_map as spark_create_map
+from pyspark.sql.functions import explode as spark_explode
+from pyspark.sql.functions import map_values as spark_map_values
+from pyspark.sql.functions import size as spark_size
+from pyspark.sql.types import StructField
+
+from snowflake.snowpark_checkpoints_collector.collection_common import (
+    COLUMN_ALLOW_NULL_KEY,
+    COLUMN_IS_UNIQUE_SIZE_KEY,
+    COLUMN_KEY_TYPE_KEY,
+    COLUMN_MAX_SIZE_KEY,
+    COLUMN_MEAN_SIZE_KEY,
+    COLUMN_MIN_SIZE_KEY,
+    COLUMN_NULL_VALUE_PROPORTION_KEY,
+    COLUMN_SIZE_KEY,
+    COLUMN_VALUE_KEY,
+    COLUMN_VALUE_TYPE_KEY,
+    KEY_TYPE_KEY,
+    VALUE_CONTAINS_NULL_KEY,
+    VALUE_TYPE_KEY,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+    ColumnCollectorBase,
+)
+
+
+class MapColumnCollector(ColumnCollectorBase):
+
+    """Class for collect a map type column.
+
+    Attributes:
+        name (str): the name of the column.
+        type (str): the type of the column.
+        struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+    """
+
+    def __init__(
+        self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+    ) -> None:
+        """Init MapColumnCollector.
+
+        Args:
+            clm_name (str): the name of the column.
+            struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+        """
+        super().__init__(clm_name, struct_field, clm_df)
+        self._map_size_collection = self._compute_map_size_collection()
+
+    def get_custom_data(self) -> dict[str, any]:
+        data_type_dict = self.struct_field.dataType.jsonValue()
+        key_type = data_type_dict[KEY_TYPE_KEY]
+        value_type = data_type_dict[VALUE_TYPE_KEY]
+        allow_null = data_type_dict[VALUE_CONTAINS_NULL_KEY]
+        null_value_proportion = (
+            self._compute_null_value_proportion() if allow_null else 0.0
+        )
+        array_max_size = max(self._map_size_collection)
+        array_min_size = min(self._map_size_collection)
+        array_mean_size = mean(self._map_size_collection)
+        all_array_have_same_size = array_max_size == array_min_size
+
+        custom_data_dict = {
+            COLUMN_KEY_TYPE_KEY: key_type,
+            COLUMN_VALUE_TYPE_KEY: value_type,
+            COLUMN_ALLOW_NULL_KEY: allow_null,
+            COLUMN_NULL_VALUE_PROPORTION_KEY: null_value_proportion,
+            COLUMN_MAX_SIZE_KEY: array_max_size,
+            COLUMN_MIN_SIZE_KEY: array_min_size,
+            COLUMN_MEAN_SIZE_KEY: array_mean_size,
+            COLUMN_IS_UNIQUE_SIZE_KEY: all_array_have_same_size,
+        }
+
+        return custom_data_dict
+
+    def _compute_map_size_collection(self) -> list[int]:
+        select_result = self.column_df.select(
+            spark_size(
+                spark_coalesce(spark_col(self.name), spark_create_map([]))
+            ).alias(COLUMN_SIZE_KEY)
+        ).collect()
+
+        size_collection = [row[COLUMN_SIZE_KEY] for row in select_result]
+
+        return size_collection
+
+    def _compute_null_value_proportion(self) -> float:
+        select_result = self.column_df.select(
+            spark_explode(spark_map_values(spark_col(self.name))).alias(
+                COLUMN_VALUE_KEY
+            )
+        )
+
+        null_counter = select_result.where(spark_col(COLUMN_VALUE_KEY).isNull()).count()
+
+        total_values = sum(self._map_size_collection)
+        null_value_proportion = (null_counter / total_values) * 100
+        return null_value_proportion
+
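As a quick illustration of what _compute_map_size_collection computes, here is a standalone sketch with invented data. Coalescing null maps to an empty map makes them count as size 0, rather than the -1 that size() returns for nulls under Spark's default settings:

from pyspark.sql import SparkSession
from pyspark.sql.functions import coalesce, col, create_map, size

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [({"a": 1},), ({"a": 1, "b": None},), (None,)],
    "m map<string, int>",
)
# Null maps become empty maps before size() is applied.
rows = df.select(size(coalesce(col("m"), create_map([]))).alias("size")).collect()
print([row["size"] for row in rows])  # [1, 2, 0]

Note also that _compute_null_value_proportion returns a percentage (the ratio is multiplied by 100), not a 0-1 fraction.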
snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py
ADDED
@@ -0,0 +1,49 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.types import StructField
+
+from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+    ColumnCollectorBase,
+)
+
+
+class NullColumnCollector(ColumnCollectorBase):
+
+    """Class for collect a null type column.
+
+    Attributes:
+        name (str): the name of the column.
+        type (str): the type of the column.
+        struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+    """
+
+    def __init__(
+        self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+    ) -> None:
+        """Init NullColumnCollector.
+
+        Args:
+            clm_name (str): the name of the column.
+            struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+        """
+        super().__init__(clm_name, struct_field, clm_df)
+
+    def get_custom_data(self) -> dict[str, any]:
+        return {}
snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py
ADDED
@@ -0,0 +1,108 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.functions import col as spark_col
+from pyspark.sql.functions import max as spark_max
+from pyspark.sql.functions import mean as spark_mean
+from pyspark.sql.functions import min as spark_min
+from pyspark.sql.functions import stddev as spark_sdt
+from pyspark.sql.types import StructField
+
+from snowflake.snowpark_checkpoints_collector.collection_common import (
+    COLUMN_DECIMAL_PRECISION_KEY,
+    COLUMN_MARGIN_ERROR_KEY,
+    COLUMN_MAX_KEY,
+    COLUMN_MEAN_KEY,
+    COLUMN_MIN_KEY,
+    INTEGER_TYPE_COLLECTION,
+    get_decimal_token,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+    ColumnCollectorBase,
+)
+
+
+class NumericColumnCollector(ColumnCollectorBase):
+
+    """Class for collect an empty column.
+
+    Attributes:
+        name (str): the name of the column.
+        type (str): the type of the column.
+        struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+        is_integer (boolean): a flag to indicate if values are integer or do not.
+
+    """
+
+    def __init__(
+        self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+    ) -> None:
+        """Init NumericColumnCollector.
+
+        Args:
+            clm_name (str): the name of the column.
+            struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+        """
+        super().__init__(clm_name, struct_field, clm_df)
+        self.is_integer = self.type in INTEGER_TYPE_COLLECTION
+
+    def get_custom_data(self) -> dict[str, any]:
+        select_result = self.column_df.select(
+            spark_min(spark_col(self.name)).alias(COLUMN_MIN_KEY),
+            spark_max(spark_col(self.name)).alias(COLUMN_MAX_KEY),
+            spark_mean(spark_col(self.name)).alias(COLUMN_MEAN_KEY),
+            spark_sdt(spark_col(self.name)).alias(COLUMN_MARGIN_ERROR_KEY),
+        ).collect()[0]
+
+        min_value = select_result[COLUMN_MIN_KEY]
+        max_value = select_result[COLUMN_MAX_KEY]
+        mean_value = select_result[COLUMN_MEAN_KEY]
+        decimal_precision = self._compute_decimal_precision()
+        margin_error = select_result[COLUMN_MARGIN_ERROR_KEY]
+
+        custom_data_dict = {
+            COLUMN_MIN_KEY: min_value,
+            COLUMN_MAX_KEY: max_value,
+            COLUMN_MEAN_KEY: mean_value,
+            COLUMN_DECIMAL_PRECISION_KEY: decimal_precision,
+            COLUMN_MARGIN_ERROR_KEY: margin_error,
+        }

+        return custom_data_dict
+
+    def _compute_decimal_precision(self) -> int:
+        if self.is_integer:
+            return 0
+
+        decimal_part_index = 1
+        decimal_token = get_decimal_token()
+        max_decimal_digits_counted = 0
+
+        row_collection = self.column_df.dropna().collect()
+        for row in row_collection:
+            value = row[0]
+            value_str = str(value)
+            value_split_by_token = value_str.split(decimal_token)
+            decimal_part = value_split_by_token[decimal_part_index]
+            decimal_digits_counted = len(decimal_part)
+            max_decimal_digits_counted = max(
+                decimal_digits_counted, max_decimal_digits_counted
+            )
+
+        return max_decimal_digits_counted
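The decimal-precision scan is easy to follow in isolation. A pure-Python sketch of the same loop, assuming get_decimal_token() returns the decimal separator (e.g. "."):

values = [1.5, 2.25, 10.125]
decimal_token = "."
max_decimal_digits_counted = 0
for value in values:
    # Split "10.125" into ["10", "125"] and count the digits after the separator.
    decimal_part = str(value).split(decimal_token)[1]
    max_decimal_digits_counted = max(len(decimal_part), max_decimal_digits_counted)
print(max_decimal_digits_counted)  # 3, from 10.125

The method drops nulls first and short-circuits to 0 for integer types, so the split only ever sees non-null fractional values.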
snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py
ADDED
@@ -0,0 +1,70 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.functions import col as spark_col
+from pyspark.sql.functions import length as spark_length
+from pyspark.sql.functions import max as spark_max
+from pyspark.sql.functions import min as spark_min
+from pyspark.sql.types import StructField
+
+from snowflake.snowpark_checkpoints_collector.collection_common import (
+    COLUMN_MAX_LENGTH_KEY,
+    COLUMN_MIN_LENGTH_KEY,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+    ColumnCollectorBase,
+)
+
+
+class StringColumnCollector(ColumnCollectorBase):
+
+    """Class for collect a string type column.
+
+    Attributes:
+        name (str): the name of the column.
+        type (str): the type of the column.
+        struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+    """
+
+    def __init__(
+        self, clm_name, struct_field: StructField, clm_df: SparkDataFrame
+    ) -> None:
+        """Init StringColumnCollector.
+
+        Args:
+            clm_name (str): the name of the column.
+            struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+        """
+        super().__init__(clm_name, struct_field, clm_df)
+
+    def get_custom_data(self) -> dict[str, any]:
+        select_result = self.column_df.select(
+            spark_min(spark_length(spark_col(self.name))).alias(COLUMN_MIN_LENGTH_KEY),
+            spark_max(spark_length(spark_col(self.name))).alias(COLUMN_MAX_LENGTH_KEY),
+        ).collect()[0]
+
+        min_length = select_result[COLUMN_MIN_LENGTH_KEY]
+        max_length = select_result[COLUMN_MAX_LENGTH_KEY]
+
+        custom_data_dict = {
+            COLUMN_MIN_LENGTH_KEY: min_length,
+            COLUMN_MAX_LENGTH_KEY: max_length,
+        }
+
+        return custom_data_dict
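A standalone sketch of the min/max length query with invented data; length() returns null for null strings, and min/max skip nulls, so null values do not affect the recorded bounds:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, length, max as spark_max, min as spark_min

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a",), ("abc",), (None,)], "s string")
row = df.select(
    spark_min(length(col("s"))).alias("min_length"),
    spark_max(length(col("s"))).alias("max_length"),
).collect()[0]
print(row["min_length"], row["max_length"])  # 1 3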
snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py
ADDED
@@ -0,0 +1,102 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.types import StructField
+
+from snowflake.snowpark_checkpoints_collector.collection_common import (
+    COLUMN_COUNT_KEY,
+    COLUMN_METADATA_KEY,
+    COLUMN_ROWS_NOT_NULL_COUNT_KEY,
+    COLUMN_ROWS_NULL_COUNT_KEY,
+    FIELD_METADATA_KEY,
+    FIELDS_KEY,
+    NAME_KEY,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+    ColumnCollectorBase,
+)
+
+
+class StructColumnCollector(ColumnCollectorBase):
+
+    """Class for collect a struct type column.
+
+    Attributes:
+        name (str): the name of the column.
+        type (str): the type of the column.
+        struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+    """
+
+    def __init__(
+        self, clm_name: str, struct_field: StructField, clm_df: SparkDataFrame
+    ) -> None:
+        """Init StructColumnCollector.
+
+        Args:
+            clm_name (str): the name of the column.
+            struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+        """
+        super().__init__(clm_name, struct_field, clm_df)
+
+    def get_custom_data(self) -> dict[str, any]:
+        metadata = self._compute_struct_metadata()
+
+        custom_data_dict = {
+            COLUMN_METADATA_KEY: metadata,
+        }
+
+        return custom_data_dict
+
+    def _compute_struct_metadata(self) -> list[dict[str, any]]:
+        field_metadata_collection = []
+        struct_field_json = self.struct_field.dataType.jsonValue()
+        for field in struct_field_json[FIELDS_KEY]:
+            del field[FIELD_METADATA_KEY]
+            clm_name = field[NAME_KEY]
+            rows_count_dict = self._compute_rows_count_by_column(clm_name)
+            struct_field_custom_data = dict(field | rows_count_dict)
+            field_metadata_collection.append(struct_field_custom_data)
+
+        return field_metadata_collection
+
+    def _compute_rows_count_by_column(self, clm_name: str) -> dict[str, int]:
+        rows_count = 0
+        rows_not_null_count = 0
+        rows_null_count = 0
+        row_collection = self.column_df.collect()
+        for row in row_collection:
+            inner_row = row[0]
+            rows_count += 1
+            if inner_row is None:
+                rows_null_count += 1
+                continue
+
+            row_clm_value = inner_row[clm_name]
+            if row_clm_value is None:
+                rows_null_count += 1
+            else:
+                rows_not_null_count += 1
+
+        rows_count_dict = {
+            COLUMN_COUNT_KEY: rows_count,
+            COLUMN_ROWS_NOT_NULL_COUNT_KEY: rows_not_null_count,
+            COLUMN_ROWS_NULL_COUNT_KEY: rows_null_count,
+        }
+
+        return rows_count_dict
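_compute_rows_count_by_column collects the column to the driver and counts a null struct as a null row for every nested field. A sketch of that rule with invented data:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [((1, "x"),), ((None, "y"),), (None,)],
    "c struct<a:int, b:string>",
)
# Field "a" is counted null when either the struct itself or the nested value is null.
rows = df.collect()
null_count = sum(1 for row in rows if row[0] is None or row[0]["a"] is None)
print(len(rows), null_count)  # 3 2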
snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py
ADDED
@@ -0,0 +1,75 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.functions import col as spark_col
+from pyspark.sql.functions import max as spark_max
+from pyspark.sql.functions import min as spark_min
+from pyspark.sql.types import StructField
+
+from snowflake.snowpark_checkpoints_collector.collection_common import (
+    COLUMN_FORMAT_KEY,
+    COLUMN_MAX_KEY,
+    COLUMN_MIN_KEY,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+    ColumnCollectorBase,
+)
+
+
+FORMAT = "%Y-%m-%dT%H:%M:%S%z"
+
+
+class TimestampColumnCollector(ColumnCollectorBase):
+
+    """Class for collect a timestamp type column.
+
+    Attributes:
+        name (str): the name of the column.
+        type (str): the type of the column.
+        struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+    """
+
+    def __init__(
+        self, clm_name, struct_field: StructField, clm_df: SparkDataFrame
+    ) -> None:
+        """Init TimestampColumnCollector.
+
+        Args:
+            clm_name (str): the name of the column.
+            struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+        """
+        super().__init__(clm_name, struct_field, clm_df)
+
+    def get_custom_data(self) -> dict[str, any]:
+        select_result = self.column_df.select(
+            spark_min(spark_col(self.name)).alias(COLUMN_MIN_KEY),
+            spark_max(spark_col(self.name)).alias(COLUMN_MAX_KEY),
+        ).collect()[0]
+
+        min_value = str(select_result[COLUMN_MIN_KEY])
+        max_value = str(select_result[COLUMN_MAX_KEY])
+
+        custom_data_dict = {
+            COLUMN_MIN_KEY: min_value,
+            COLUMN_MAX_KEY: max_value,
+            COLUMN_FORMAT_KEY: FORMAT,
+        }
+
+        return custom_data_dict
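The collector records str(min) and str(max) alongside the strftime-style FORMAT constant. A quick sketch of those two renderings for a sample timezone-aware timestamp (the value is invented):

from datetime import datetime, timezone

ts = datetime(2025, 1, 2, 3, 4, 5, tzinfo=timezone.utc)
print(str(ts))                              # 2025-01-02 03:04:05+00:00
print(ts.strftime("%Y-%m-%dT%H:%M:%S%z"))   # 2025-01-02T03:04:05+0000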
snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py
ADDED
@@ -0,0 +1,75 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pyspark.sql import DataFrame as SparkDataFrame
+from pyspark.sql.functions import col as spark_col
+from pyspark.sql.functions import max as spark_max
+from pyspark.sql.functions import min as spark_min
+from pyspark.sql.types import StructField
+
+from snowflake.snowpark_checkpoints_collector.collection_common import (
+    COLUMN_FORMAT_KEY,
+    COLUMN_MAX_KEY,
+    COLUMN_MIN_KEY,
+)
+from snowflake.snowpark_checkpoints_collector.column_collection.model.column_collector_base import (
+    ColumnCollectorBase,
+)
+
+
+FORMAT = "%Y-%m-%dH:%M:%S"
+
+
+class TimestampNTZColumnCollector(ColumnCollectorBase):
+
+    """Class for collect a timestamp ntz type column.
+
+    Attributes:
+        name (str): the name of the column.
+        type (str): the type of the column.
+        struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+        column_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+    """
+
+    def __init__(
+        self, clm_name, struct_field: StructField, clm_df: SparkDataFrame
+    ) -> None:
+        """Init TimestampNTZColumnCollector.
+
+        Args:
+            clm_name (str): the name of the column.
+            struct_field (pyspark.sql.types.StructField): the struct field of the column type.
+            clm_df (pyspark.sql.DataFrame): the column values as PySpark DataFrame.
+
+        """
+        super().__init__(clm_name, struct_field, clm_df)
+
+    def get_custom_data(self) -> dict[str, any]:
+        select_result = self.column_df.select(
+            spark_min(spark_col(self.name)).alias(COLUMN_MIN_KEY),
+            spark_max(spark_col(self.name)).alias(COLUMN_MAX_KEY),
+        ).collect()[0]
+
+        min_value = str(select_result[COLUMN_MIN_KEY])
+        max_value = str(select_result[COLUMN_MAX_KEY])
+
+        custom_data_dict = {
+            COLUMN_MIN_KEY: min_value,
+            COLUMN_MAX_KEY: max_value,
+            COLUMN_FORMAT_KEY: FORMAT,
+        }
+
+        return custom_data_dict