acryl-datahub-cloud 0.3.12rc4__py3-none-any.whl → 0.3.12rc5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub-cloud might be problematic. Click here for more details.

@@ -0,0 +1,224 @@
1
+ import logging
2
+ from datetime import datetime
3
+ from typing import Optional, Union
4
+
5
+ from typing_extensions import Self
6
+
7
+ from acryl_datahub_cloud.sdk.assertion.assertion_base import (
8
+ AssertionMode,
9
+ _AssertionPublic,
10
+ _HasColumnMetricFunctionality,
11
+ _HasSchedule,
12
+ _HasSmartFunctionality,
13
+ )
14
+ from acryl_datahub_cloud.sdk.assertion_input.assertion_input import (
15
+ ASSERTION_MONITOR_DEFAULT_TRAINING_LOOKBACK_WINDOW_DAYS,
16
+ DEFAULT_DETECTION_MECHANISM,
17
+ DEFAULT_SCHEDULE,
18
+ DEFAULT_SENSITIVITY,
19
+ AssertionIncidentBehavior,
20
+ DetectionMechanism,
21
+ ExclusionWindowTypes,
22
+ InferenceSensitivity,
23
+ _DetectionMechanismTypes,
24
+ )
25
+ from acryl_datahub_cloud.sdk.assertion_input.smart_column_metric_assertion_input import (
26
+ MetricInputType,
27
+ OperatorInputType,
28
+ RangeInputType,
29
+ RangeTypeInputType,
30
+ ValueInputType,
31
+ ValueTypeInputType,
32
+ )
33
+ from acryl_datahub_cloud.sdk.entities.assertion import Assertion
34
+ from acryl_datahub_cloud.sdk.entities.monitor import Monitor
35
+ from datahub.metadata import schema_classes as models
36
+ from datahub.metadata.urns import AssertionUrn, CorpUserUrn, DatasetUrn, TagUrn
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+
41
class SmartColumnMetricAssertion(
    _HasColumnMetricFunctionality,
    _HasSmartFunctionality,
    _HasSchedule,
    _AssertionPublic,
):
    """
    Smart assertion over a metric of a single dataset column (field).

    It validates the value of a common field / column metric (e.g. an
    aggregation) such as null count + percentage, min, max, median, and more.
    The assertion parameters are inferred using AI.
    """

    def __init__(
        self,
        *,
        urn: AssertionUrn,
        dataset_urn: DatasetUrn,
        column_name: str,
        metric_type: MetricInputType,
        operator: OperatorInputType,
        # Which of value / range (and their types) is needed depends on the operator;
        # some operators need none of them.
        value: Optional[ValueInputType] = None,
        value_type: Optional[ValueTypeInputType] = None,
        range: Optional[RangeInputType] = None,
        range_type: Optional[RangeTypeInputType] = None,
        # TODO: Evaluate these params:
        display_name: str,
        mode: AssertionMode,
        schedule: models.CronScheduleClass = DEFAULT_SCHEDULE,
        sensitivity: InferenceSensitivity = DEFAULT_SENSITIVITY,
        exclusion_windows: list[ExclusionWindowTypes],
        training_data_lookback_days: int = ASSERTION_MONITOR_DEFAULT_TRAINING_LOOKBACK_WINDOW_DAYS,
        incident_behavior: list[AssertionIncidentBehavior],
        detection_mechanism: Optional[
            _DetectionMechanismTypes
        ] = DEFAULT_DETECTION_MECHANISM,
        tags: list[TagUrn],
        created_by: Optional[CorpUserUrn] = None,
        created_at: Union[datetime, None] = None,
        updated_by: Optional[CorpUserUrn] = None,
        updated_at: Optional[datetime] = None,
    ):
        """
        Initialize a smart column metric assertion.

        All arguments are keyword-only.

        Args:
            urn: The URN of the assertion.
            dataset_urn: The URN of the dataset to monitor.
            column_name: The name of the column the metric is computed on.
            metric_type: The column metric to evaluate.
            operator: The operator used to evaluate the metric.
            value: The single comparison value, for operators that need one.
            value_type: The type of `value`.
            range: The comparison range, for operators that need one.
            range_type: The type of `range`.
            display_name: The display name of the assertion.
            mode: The mode of the assertion (active/inactive).
            schedule: The cron schedule of the assertion (defaults to DEFAULT_SCHEDULE).
            sensitivity: The sensitivity of the assertion (low/medium/high).
            exclusion_windows: The exclusion windows to apply to the assertion.
            training_data_lookback_days: The number of days of data to use for training.
            incident_behavior: The behavior when incidents occur.
            detection_mechanism: The mechanism used to detect changes.
            tags: The tags to apply to the assertion.
            created_by: The URN of the user who created the assertion.
            created_at: The timestamp when the assertion was created.
            updated_by: The URN of the user who last updated the assertion.
            updated_at: The timestamp when the assertion was last updated.
        """
        # Every base is initialised explicitly (not via super()), each one
        # receiving only the keyword arguments it owns.

        # Common assertion state: identity, display, tags and audit stamps.
        _AssertionPublic.__init__(
            self,
            urn=urn,
            dataset_urn=dataset_urn,
            display_name=display_name,
            mode=mode,
            tags=tags,
            incident_behavior=incident_behavior,
            detection_mechanism=detection_mechanism,
            created_by=created_by,
            created_at=created_at,
            updated_by=updated_by,
            updated_at=updated_at,
        )

        # Settings specific to "smart" (inferred) assertions.
        _HasSmartFunctionality.__init__(
            self,
            sensitivity=sensitivity,
            exclusion_windows=exclusion_windows,
            training_data_lookback_days=training_data_lookback_days,
        )

        # Evaluation schedule.
        _HasSchedule.__init__(self, schedule=schedule)

        # Column metric definition: what is measured and how it is compared.
        _HasColumnMetricFunctionality.__init__(
            self,
            column_name=column_name,
            metric_type=metric_type,
            operator=operator,
            value=value,
            value_type=value_type,
            range=range,
            range_type=range_type,
        )

    @classmethod
    def _from_entities(cls, assertion: Assertion, monitor: Monitor) -> Self:
        """
        Build a SmartColumnMetricAssertion out of its backing entities.

        Each constructor argument is read from either the assertion or the
        monitor through the matching `_get_*` helper.

        Args:
            assertion: The Assertion entity.
            monitor: The Monitor entity.

        Returns:
            A SmartColumnMetricAssertion instance.
        """
        return cls(
            urn=assertion.urn,
            dataset_urn=assertion.dataset,
            # Column metric definition, read from the assertion.
            column_name=cls._get_column_name(assertion),
            metric_type=cls._get_metric_type(assertion),
            operator=cls._get_operator(assertion),
            value=cls._get_value(assertion),
            value_type=cls._get_value_type(assertion),
            range=cls._get_range(assertion),
            range_type=cls._get_range_type(assertion),
            # A missing description becomes an empty display name.
            display_name=assertion.description or "",
            # Monitor-driven settings.
            mode=cls._get_mode(monitor),
            schedule=cls._get_schedule(monitor),
            sensitivity=cls._get_sensitivity(monitor),
            exclusion_windows=cls._get_exclusion_windows(monitor),
            training_data_lookback_days=cls._get_training_data_lookback_days(monitor),
            incident_behavior=cls._get_incident_behavior(assertion),
            detection_mechanism=cls._get_detection_mechanism(assertion, monitor),
            tags=cls._get_tags(assertion),
            # Audit stamps.
            created_by=cls._get_created_by(assertion),
            created_at=cls._get_created_at(assertion),
            updated_by=cls._get_updated_by(assertion),
            updated_at=cls._get_updated_at(assertion),
        )

    @staticmethod
    def _get_detection_mechanism(
        assertion: Assertion,
        monitor: Monitor,
        default: Optional[_DetectionMechanismTypes] = DEFAULT_DETECTION_MECHANISM,
    ) -> Optional[_DetectionMechanismTypes]:
        """
        Get the detection mechanism for column metric assertions.

        The monitor's dataset field parameters decide the result by source type:
        ALL_ROWS_QUERY and CHANGED_ROWS_QUERY produce the matching detection
        mechanism (carrying the assertion's additional filter), and
        DATAHUB_DATASET_PROFILE maps to ALL_ROWS_QUERY_DATAHUB_DATASET_PROFILE.
        Whenever the needed information is missing or the source type is not
        handled, `default` is returned (with a warning logged, except when the
        validated detection context itself is None).

        Args:
            assertion: The Assertion entity.
            monitor: The Monitor entity.
            default: The value to fall back to.

        Returns:
            The resolved detection mechanism, or `default`.
        """
        parameters = _AssertionPublic._get_validated_detection_context(
            monitor,
            assertion,
            models.AssertionEvaluationParametersTypeClass.DATASET_FIELD,
            models.FieldAssertionInfoClass,
            default,
        )
        if parameters is None:
            return default

        field_parameters = parameters.datasetFieldParameters
        if field_parameters is None:
            logger.warning(
                f"Monitor does not have datasetFieldParameters, defaulting detection mechanism to {default}"
            )
            return default

        source_type = field_parameters.sourceType
        known_sources = models.DatasetFieldAssertionSourceTypeClass

        if source_type == known_sources.ALL_ROWS_QUERY:
            return DetectionMechanism.ALL_ROWS_QUERY(
                additional_filter=_AssertionPublic._get_additional_filter(assertion)
            )

        if source_type == known_sources.CHANGED_ROWS_QUERY:
            changed_rows_field = field_parameters.changedRowsField
            if changed_rows_field is None:
                logger.warning(
                    f"Monitor has CHANGED_ROWS_QUERY source type but no changedRowsField, defaulting detection mechanism to {default}"
                )
                return default
            return DetectionMechanism.CHANGED_ROWS_QUERY(
                column_name=changed_rows_field.path,
                additional_filter=_AssertionPublic._get_additional_filter(assertion),
            )

        if source_type == known_sources.DATAHUB_DATASET_PROFILE:
            return DetectionMechanism.ALL_ROWS_QUERY_DATAHUB_DATASET_PROFILE

        # Any other source type is not handled here.
        logger.warning(
            f"Unsupported DatasetFieldAssertionSourceType {source_type}, defaulting detection mechanism to {default}"
        )
        return default
@@ -0,0 +1,18 @@
1
+ from typing import Union
2
+
3
+ from acryl_datahub_cloud.sdk.assertion.assertion_base import (
4
+ FreshnessAssertion,
5
+ SmartFreshnessAssertion,
6
+ SmartVolumeAssertion,
7
+ )
8
+ from acryl_datahub_cloud.sdk.assertion.smart_column_metric_assertion import (
9
+ SmartColumnMetricAssertion,
10
+ )
11
+
12
# Type alias covering the assertion classes imported above.
# TODO: Add other assertion types here as we add them.
AssertionTypes = Union[
    SmartFreshnessAssertion,
    SmartVolumeAssertion,
    FreshnessAssertion,
    SmartColumnMetricAssertion,
]
@@ -44,7 +44,7 @@ DEFAULT_NAME_PREFIX = "New Assertion"
44
44
  DEFAULT_NAME_SUFFIX_LENGTH = 8
45
45
 
46
46
 
47
- DEFAULT_HOURLY_SCHEDULE = models.CronScheduleClass(
47
+ DEFAULT_HOURLY_SCHEDULE: models.CronScheduleClass = models.CronScheduleClass(
48
48
  cron="0 * * * *", # Every hour, matches the UI default
49
49
  timezone=str(
50
50
  tzlocal.get_localzone()
@@ -59,6 +59,13 @@ DEFAULT_DAILY_SCHEDULE = models.CronScheduleClass(
59
59
  ), # User local timezone, matches the UI default
60
60
  )
61
61
 
62
+ DEFAULT_EVERY_SIX_HOURS_SCHEDULE = models.CronScheduleClass(
63
+ cron="0 */6 * * *", # Every 6 hours, matches the UI default
64
+ timezone=str(
65
+ tzlocal.get_localzone()
66
+ ), # User local timezone, matches the UI default
67
+ )
68
+
62
69
 
63
70
  class AbstractDetectionMechanism(BaseModel, ABC):
64
71
  type: str
@@ -181,7 +188,7 @@ class DetectionMechanism:
181
188
  HIGH_WATERMARK_COLUMN = _HighWatermarkColumn
182
189
  DATAHUB_OPERATION = _DataHubOperation()
183
190
  QUERY = _Query
184
- ALL_ROWS_QUERY = _AllRowsQuery()
191
+ ALL_ROWS_QUERY = _AllRowsQuery
185
192
  CHANGED_ROWS_QUERY = _ChangedRowsQuery
186
193
  ALL_ROWS_QUERY_DATAHUB_DATASET_PROFILE = _AllRowsQueryDataHubDatasetProfile()
187
194
  DATASET_PROFILE = _DatasetProfile()
@@ -710,7 +717,7 @@ def _try_parse_and_validate_schema_classes_enum(
710
717
  if isinstance(value, enum_class):
711
718
  return value
712
719
  assert isinstance(value, str)
713
- if value not in get_enum_options(enum_class):
720
+ if value.upper() not in get_enum_options(enum_class):
714
721
  raise SDKUsageError(
715
722
  f"Invalid value for {enum_class.__name__}: {value}, valid options are {get_enum_options(enum_class)}"
716
723
  )
@@ -3,11 +3,10 @@ from datetime import datetime
3
3
  from typing import Optional, Union
4
4
 
5
5
  from acryl_datahub_cloud.sdk.assertion_input.assertion_input import (
6
- DEFAULT_HOURLY_SCHEDULE,
6
+ DEFAULT_EVERY_SIX_HOURS_SCHEDULE,
7
7
  HIGH_WATERMARK_ALLOWED_FIELD_TYPES,
8
8
  AssertionIncidentBehavior,
9
9
  AssertionInfoInputType,
10
- DetectionMechanism,
11
10
  DetectionMechanismInputTypes,
12
11
  ExclusionWindowInputTypes,
13
12
  FieldSpecType,
@@ -183,8 +182,8 @@ RangeTypeInputType = Union[
183
182
  RangeTypeParsedType = tuple[ValueTypeInputType, ValueTypeInputType]
184
183
  OperatorInputType = Union[str, models.AssertionStdOperatorClass]
185
184
 
186
- DEFAULT_DETECTION_MECHANISM_SMART_COLUMN_METRIC_ASSERTION = (
187
- DetectionMechanism.ALL_ROWS_QUERY
185
+ DEFAULT_DETECTION_MECHANISM_SMART_COLUMN_METRIC_ASSERTION: _AllRowsQuery = (
186
+ _AllRowsQuery()
188
187
  )
189
188
 
190
189
 
@@ -512,7 +511,7 @@ class _SmartColumnMetricAssertionInput(_AssertionInput, _HasSmartAssertionInputs
512
511
  A CronScheduleClass with appropriate schedule settings.
513
512
  """
514
513
  if self.schedule is None:
515
- return DEFAULT_HOURLY_SCHEDULE
514
+ return DEFAULT_EVERY_SIX_HOURS_SCHEDULE
516
515
 
517
516
  return models.CronScheduleClass(
518
517
  cron=self.schedule.cron,
@@ -815,13 +814,10 @@ def _try_parse_and_validate_value_type(
815
814
  ) -> models.AssertionStdParameterTypeClass:
816
815
  if value_type is None:
817
816
  raise SDKUsageError("Value type is required")
818
- if isinstance(value_type, models.AssertionStdParameterTypeClass):
819
- return value_type
820
- if value_type not in get_enum_options(models.AssertionStdParameterTypeClass):
821
- raise SDKUsageError(
822
- f"Invalid value type: {value_type}, valid options are {get_enum_options(models.AssertionStdParameterTypeClass)}"
823
- )
824
- return getattr(models.AssertionStdParameterTypeClass, value_type)
817
+
818
+ return _try_parse_and_validate_schema_classes_enum(
819
+ value_type, models.AssertionStdParameterTypeClass
820
+ )
825
821
 
826
822
 
827
823
  def _try_parse_and_validate_value(