acryl-datahub-cloud 0.3.12rc4__py3-none-any.whl → 0.3.12rc5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub-cloud might be problematic. Click here for more details.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/sdk/__init__.py +5 -1
- acryl_datahub_cloud/sdk/assertion/__init__.py +0 -0
- acryl_datahub_cloud/sdk/{assertion.py → assertion/assertion_base.py} +401 -169
- acryl_datahub_cloud/sdk/assertion/smart_column_metric_assertion.py +224 -0
- acryl_datahub_cloud/sdk/assertion/types.py +18 -0
- acryl_datahub_cloud/sdk/assertion_input/assertion_input.py +10 -3
- acryl_datahub_cloud/sdk/assertion_input/smart_column_metric_assertion_input.py +8 -12
- acryl_datahub_cloud/sdk/assertions_client.py +726 -1
- acryl_datahub_cloud/sdk/entities/assertion.py +4 -0
- {acryl_datahub_cloud-0.3.12rc4.dist-info → acryl_datahub_cloud-0.3.12rc5.dist-info}/METADATA +45 -45
- {acryl_datahub_cloud-0.3.12rc4.dist-info → acryl_datahub_cloud-0.3.12rc5.dist-info}/RECORD +15 -12
- {acryl_datahub_cloud-0.3.12rc4.dist-info → acryl_datahub_cloud-0.3.12rc5.dist-info}/WHEEL +0 -0
- {acryl_datahub_cloud-0.3.12rc4.dist-info → acryl_datahub_cloud-0.3.12rc5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_cloud-0.3.12rc4.dist-info → acryl_datahub_cloud-0.3.12rc5.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from typing import Optional, Union
|
|
4
|
+
|
|
5
|
+
from typing_extensions import Self
|
|
6
|
+
|
|
7
|
+
from acryl_datahub_cloud.sdk.assertion.assertion_base import (
|
|
8
|
+
AssertionMode,
|
|
9
|
+
_AssertionPublic,
|
|
10
|
+
_HasColumnMetricFunctionality,
|
|
11
|
+
_HasSchedule,
|
|
12
|
+
_HasSmartFunctionality,
|
|
13
|
+
)
|
|
14
|
+
from acryl_datahub_cloud.sdk.assertion_input.assertion_input import (
|
|
15
|
+
ASSERTION_MONITOR_DEFAULT_TRAINING_LOOKBACK_WINDOW_DAYS,
|
|
16
|
+
DEFAULT_DETECTION_MECHANISM,
|
|
17
|
+
DEFAULT_SCHEDULE,
|
|
18
|
+
DEFAULT_SENSITIVITY,
|
|
19
|
+
AssertionIncidentBehavior,
|
|
20
|
+
DetectionMechanism,
|
|
21
|
+
ExclusionWindowTypes,
|
|
22
|
+
InferenceSensitivity,
|
|
23
|
+
_DetectionMechanismTypes,
|
|
24
|
+
)
|
|
25
|
+
from acryl_datahub_cloud.sdk.assertion_input.smart_column_metric_assertion_input import (
|
|
26
|
+
MetricInputType,
|
|
27
|
+
OperatorInputType,
|
|
28
|
+
RangeInputType,
|
|
29
|
+
RangeTypeInputType,
|
|
30
|
+
ValueInputType,
|
|
31
|
+
ValueTypeInputType,
|
|
32
|
+
)
|
|
33
|
+
from acryl_datahub_cloud.sdk.entities.assertion import Assertion
|
|
34
|
+
from acryl_datahub_cloud.sdk.entities.monitor import Monitor
|
|
35
|
+
from datahub.metadata import schema_classes as models
|
|
36
|
+
from datahub.metadata.urns import AssertionUrn, CorpUserUrn, DatasetUrn, TagUrn
|
|
37
|
+
|
|
38
|
+
logger = logging.getLogger(__name__)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class SmartColumnMetricAssertion(
    _HasColumnMetricFunctionality,
    _HasSmartFunctionality,
    _HasSchedule,
    _AssertionPublic,
):
    """
    Smart assertion over a column (field) metric of a dataset.

    It validates the value of a common field / column metric (e.g. an
    aggregation) such as null count + percentage, min, max, median, and more.
    It uses AI to infer the assertion parameters.
    """

    def __init__(
        self,
        *,
        urn: AssertionUrn,
        dataset_urn: DatasetUrn,
        column_name: str,
        metric_type: MetricInputType,
        operator: OperatorInputType,
        # Depending on the operator, value, range (and corresponding type) or no parameters are required:
        value: Optional[ValueInputType] = None,
        value_type: Optional[ValueTypeInputType] = None,
        range: Optional[RangeInputType] = None,
        range_type: Optional[RangeTypeInputType] = None,
        # TODO: Evaluate these params:
        display_name: str,
        mode: AssertionMode,
        schedule: models.CronScheduleClass = DEFAULT_SCHEDULE,
        sensitivity: InferenceSensitivity = DEFAULT_SENSITIVITY,
        exclusion_windows: list[ExclusionWindowTypes],
        training_data_lookback_days: int = ASSERTION_MONITOR_DEFAULT_TRAINING_LOOKBACK_WINDOW_DAYS,
        incident_behavior: list[AssertionIncidentBehavior],
        detection_mechanism: Optional[
            _DetectionMechanismTypes
        ] = DEFAULT_DETECTION_MECHANISM,
        tags: list[TagUrn],
        created_by: Optional[CorpUserUrn] = None,
        created_at: Union[datetime, None] = None,
        updated_by: Optional[CorpUserUrn] = None,
        updated_at: Optional[datetime] = None,
    ):
        """
        Initialize a smart column metric assertion.

        All arguments are keyword-only.

        Args:
            urn: The URN of the assertion.
            dataset_urn: The URN of the dataset to monitor.
            column_name: The name of the column the metric refers to.
            metric_type: The column metric to validate.
            operator: The operator applied to the metric.
            value: The single value used by the operator, when the operator requires one.
            value_type: The type of ``value``.
            range: The range used by the operator, when the operator requires one.
            range_type: The type of ``range``.
            display_name: The display name of the assertion.
            mode: The mode of the assertion (active/inactive).
            schedule: The cron schedule of the assertion; defaults to DEFAULT_SCHEDULE.
            sensitivity: The sensitivity of the assertion (low/medium/high).
            exclusion_windows: The exclusion windows to apply to the assertion.
            training_data_lookback_days: The number of days of data to use for training.
            incident_behavior: The behavior when incidents occur.
            detection_mechanism: The mechanism used to detect changes.
            tags: The tags to apply to the assertion.
            created_by: The URN of the user who created the assertion.
            created_at: The timestamp when the assertion was created.
            updated_by: The URN of the user who last updated the assertion.
            updated_at: The timestamp when the assertion was last updated.
        """
        # Each base class takes a different set of arguments, so the
        # initializers are invoked explicitly instead of through super().
        _AssertionPublic.__init__(
            self,
            urn=urn,
            dataset_urn=dataset_urn,
            display_name=display_name,
            mode=mode,
            tags=tags,
            incident_behavior=incident_behavior,
            detection_mechanism=detection_mechanism,
            created_by=created_by,
            created_at=created_at,
            updated_by=updated_by,
            updated_at=updated_at,
        )
        _HasSmartFunctionality.__init__(
            self,
            sensitivity=sensitivity,
            exclusion_windows=exclusion_windows,
            training_data_lookback_days=training_data_lookback_days,
        )
        _HasSchedule.__init__(
            self,
            schedule=schedule,
        )
        _HasColumnMetricFunctionality.__init__(
            self,
            column_name=column_name,
            metric_type=metric_type,
            operator=operator,
            value=value,
            value_type=value_type,
            range=range,
            range_type=range_type,
        )

    @classmethod
    def _from_entities(cls, assertion: Assertion, monitor: Monitor) -> Self:
        """
        Build a SmartColumnMetricAssertion from its Assertion and Monitor entities.

        Every constructor argument is read from one of the two entities through
        the ``_get_*`` helpers available on the class; the display name falls
        back to an empty string when the assertion has no description.

        Args:
            assertion: The Assertion entity.
            monitor: The Monitor entity.

        Returns:
            A SmartColumnMetricAssertion instance.
        """
        return cls(
            # Identity.
            urn=assertion.urn,
            dataset_urn=assertion.dataset,
            # Column metric definition (read from the assertion).
            column_name=cls._get_column_name(assertion),
            metric_type=cls._get_metric_type(assertion),
            operator=cls._get_operator(assertion),
            value=cls._get_value(assertion),
            value_type=cls._get_value_type(assertion),
            range=cls._get_range(assertion),
            range_type=cls._get_range_type(assertion),
            display_name=assertion.description or "",
            # Monitor-driven settings.
            mode=cls._get_mode(monitor),
            schedule=cls._get_schedule(monitor),
            sensitivity=cls._get_sensitivity(monitor),
            exclusion_windows=cls._get_exclusion_windows(monitor),
            training_data_lookback_days=cls._get_training_data_lookback_days(monitor),
            incident_behavior=cls._get_incident_behavior(assertion),
            detection_mechanism=cls._get_detection_mechanism(assertion, monitor),
            tags=cls._get_tags(assertion),
            # Audit information.
            created_by=cls._get_created_by(assertion),
            created_at=cls._get_created_at(assertion),
            updated_by=cls._get_updated_by(assertion),
            updated_at=cls._get_updated_at(assertion),
        )

    @staticmethod
    def _get_detection_mechanism(
        assertion: Assertion,
        monitor: Monitor,
        default: Optional[_DetectionMechanismTypes] = DEFAULT_DETECTION_MECHANISM,
    ) -> Optional[_DetectionMechanismTypes]:
        """
        Get the detection mechanism for column metric assertions.

        The mechanism is chosen from the source type found in the monitor's
        ``datasetFieldParameters``:

        - ALL_ROWS_QUERY -> ``DetectionMechanism.ALL_ROWS_QUERY`` with the
          assertion's additional filter.
        - CHANGED_ROWS_QUERY -> ``DetectionMechanism.CHANGED_ROWS_QUERY`` with the
          path of ``changedRowsField`` as the column name, plus the additional filter.
        - DATAHUB_DATASET_PROFILE ->
          ``DetectionMechanism.ALL_ROWS_QUERY_DATAHUB_DATASET_PROFILE``.

        Args:
            assertion: The Assertion entity.
            monitor: The Monitor entity.
            default: The value returned whenever the mechanism cannot be determined.

        Returns:
            The detection mechanism, or ``default`` when the validated detection
            context is None, ``datasetFieldParameters`` is missing,
            ``changedRowsField`` is missing for CHANGED_ROWS_QUERY, or the source
            type is not supported (the last three cases log a warning).
        """
        parameters = _AssertionPublic._get_validated_detection_context(
            monitor,
            assertion,
            models.AssertionEvaluationParametersTypeClass.DATASET_FIELD,
            models.FieldAssertionInfoClass,
            default,
        )
        if parameters is None:
            return default

        field_parameters = parameters.datasetFieldParameters
        if field_parameters is None:
            logger.warning(
                f"Monitor does not have datasetFieldParameters, defaulting detection mechanism to {default}"
            )
            return default

        source_type = field_parameters.sourceType

        if source_type == models.DatasetFieldAssertionSourceTypeClass.ALL_ROWS_QUERY:
            row_filter = _AssertionPublic._get_additional_filter(assertion)
            return DetectionMechanism.ALL_ROWS_QUERY(additional_filter=row_filter)

        if source_type == models.DatasetFieldAssertionSourceTypeClass.CHANGED_ROWS_QUERY:
            changed_rows_field = field_parameters.changedRowsField
            if changed_rows_field is None:
                logger.warning(
                    f"Monitor has CHANGED_ROWS_QUERY source type but no changedRowsField, defaulting detection mechanism to {default}"
                )
                return default
            changed_rows_column = changed_rows_field.path
            row_filter = _AssertionPublic._get_additional_filter(assertion)
            return DetectionMechanism.CHANGED_ROWS_QUERY(
                column_name=changed_rows_column, additional_filter=row_filter
            )

        if (
            source_type
            == models.DatasetFieldAssertionSourceTypeClass.DATAHUB_DATASET_PROFILE
        ):
            return DetectionMechanism.ALL_ROWS_QUERY_DATAHUB_DATASET_PROFILE

        # Any other source type is not handled here.
        logger.warning(
            f"Unsupported DatasetFieldAssertionSourceType {source_type}, defaulting detection mechanism to {default}"
        )
        return default
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from typing import Union
|
|
2
|
+
|
|
3
|
+
from acryl_datahub_cloud.sdk.assertion.assertion_base import (
|
|
4
|
+
FreshnessAssertion,
|
|
5
|
+
SmartFreshnessAssertion,
|
|
6
|
+
SmartVolumeAssertion,
|
|
7
|
+
)
|
|
8
|
+
from acryl_datahub_cloud.sdk.assertion.smart_column_metric_assertion import (
|
|
9
|
+
SmartColumnMetricAssertion,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
# Type alias: a value of this type is an instance of any one of the assertion
# classes listed below.
# TODO: Add other assertion types here as we add them.
AssertionTypes = Union[
    SmartFreshnessAssertion,
    SmartVolumeAssertion,
    FreshnessAssertion,
    SmartColumnMetricAssertion,
]
|
|
@@ -44,7 +44,7 @@ DEFAULT_NAME_PREFIX = "New Assertion"
|
|
|
44
44
|
DEFAULT_NAME_SUFFIX_LENGTH = 8
|
|
45
45
|
|
|
46
46
|
|
|
47
|
-
DEFAULT_HOURLY_SCHEDULE = models.CronScheduleClass(
|
|
47
|
+
DEFAULT_HOURLY_SCHEDULE: models.CronScheduleClass = models.CronScheduleClass(
|
|
48
48
|
cron="0 * * * *", # Every hour, matches the UI default
|
|
49
49
|
timezone=str(
|
|
50
50
|
tzlocal.get_localzone()
|
|
@@ -59,6 +59,13 @@ DEFAULT_DAILY_SCHEDULE = models.CronScheduleClass(
|
|
|
59
59
|
), # User local timezone, matches the UI default
|
|
60
60
|
)
|
|
61
61
|
|
|
62
|
+
# Schedule that runs every 6 hours, in the user's local timezone as reported by
# tzlocal. Both the cron expression and the timezone match the UI default.
DEFAULT_EVERY_SIX_HOURS_SCHEDULE = models.CronScheduleClass(
    cron="0 */6 * * *",
    timezone=str(tzlocal.get_localzone()),
)
|
|
68
|
+
|
|
62
69
|
|
|
63
70
|
class AbstractDetectionMechanism(BaseModel, ABC):
|
|
64
71
|
type: str
|
|
@@ -181,7 +188,7 @@ class DetectionMechanism:
|
|
|
181
188
|
HIGH_WATERMARK_COLUMN = _HighWatermarkColumn
|
|
182
189
|
DATAHUB_OPERATION = _DataHubOperation()
|
|
183
190
|
QUERY = _Query
|
|
184
|
-
ALL_ROWS_QUERY = _AllRowsQuery
|
|
191
|
+
ALL_ROWS_QUERY = _AllRowsQuery
|
|
185
192
|
CHANGED_ROWS_QUERY = _ChangedRowsQuery
|
|
186
193
|
ALL_ROWS_QUERY_DATAHUB_DATASET_PROFILE = _AllRowsQueryDataHubDatasetProfile()
|
|
187
194
|
DATASET_PROFILE = _DatasetProfile()
|
|
@@ -710,7 +717,7 @@ def _try_parse_and_validate_schema_classes_enum(
|
|
|
710
717
|
if isinstance(value, enum_class):
|
|
711
718
|
return value
|
|
712
719
|
assert isinstance(value, str)
|
|
713
|
-
if value not in get_enum_options(enum_class):
|
|
720
|
+
if value.upper() not in get_enum_options(enum_class):
|
|
714
721
|
raise SDKUsageError(
|
|
715
722
|
f"Invalid value for {enum_class.__name__}: {value}, valid options are {get_enum_options(enum_class)}"
|
|
716
723
|
)
|
|
@@ -3,11 +3,10 @@ from datetime import datetime
|
|
|
3
3
|
from typing import Optional, Union
|
|
4
4
|
|
|
5
5
|
from acryl_datahub_cloud.sdk.assertion_input.assertion_input import (
|
|
6
|
-
|
|
6
|
+
DEFAULT_EVERY_SIX_HOURS_SCHEDULE,
|
|
7
7
|
HIGH_WATERMARK_ALLOWED_FIELD_TYPES,
|
|
8
8
|
AssertionIncidentBehavior,
|
|
9
9
|
AssertionInfoInputType,
|
|
10
|
-
DetectionMechanism,
|
|
11
10
|
DetectionMechanismInputTypes,
|
|
12
11
|
ExclusionWindowInputTypes,
|
|
13
12
|
FieldSpecType,
|
|
@@ -183,8 +182,8 @@ RangeTypeInputType = Union[
|
|
|
183
182
|
RangeTypeParsedType = tuple[ValueTypeInputType, ValueTypeInputType]
|
|
184
183
|
OperatorInputType = Union[str, models.AssertionStdOperatorClass]
|
|
185
184
|
|
|
186
|
-
DEFAULT_DETECTION_MECHANISM_SMART_COLUMN_METRIC_ASSERTION = (
|
|
187
|
-
|
|
185
|
+
DEFAULT_DETECTION_MECHANISM_SMART_COLUMN_METRIC_ASSERTION: _AllRowsQuery = (
|
|
186
|
+
_AllRowsQuery()
|
|
188
187
|
)
|
|
189
188
|
|
|
190
189
|
|
|
@@ -512,7 +511,7 @@ class _SmartColumnMetricAssertionInput(_AssertionInput, _HasSmartAssertionInputs
|
|
|
512
511
|
A CronScheduleClass with appropriate schedule settings.
|
|
513
512
|
"""
|
|
514
513
|
if self.schedule is None:
|
|
515
|
-
return
|
|
514
|
+
return DEFAULT_EVERY_SIX_HOURS_SCHEDULE
|
|
516
515
|
|
|
517
516
|
return models.CronScheduleClass(
|
|
518
517
|
cron=self.schedule.cron,
|
|
@@ -815,13 +814,10 @@ def _try_parse_and_validate_value_type(
|
|
|
815
814
|
) -> models.AssertionStdParameterTypeClass:
|
|
816
815
|
if value_type is None:
|
|
817
816
|
raise SDKUsageError("Value type is required")
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
f"Invalid value type: {value_type}, valid options are {get_enum_options(models.AssertionStdParameterTypeClass)}"
|
|
823
|
-
)
|
|
824
|
-
return getattr(models.AssertionStdParameterTypeClass, value_type)
|
|
817
|
+
|
|
818
|
+
return _try_parse_and_validate_schema_classes_enum(
|
|
819
|
+
value_type, models.AssertionStdParameterTypeClass
|
|
820
|
+
)
|
|
825
821
|
|
|
826
822
|
|
|
827
823
|
def _try_parse_and_validate_value(
|