acryl-datahub-cloud 0.3.12rc3__py3-none-any.whl → 0.3.12rc5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub-cloud might be problematic. Click here for more details.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/datahub_forms_notifications/forms_notifications_source.py +37 -2
- acryl_datahub_cloud/metadata/schema.avsc +9 -0
- acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +9 -0
- acryl_datahub_cloud/sdk/__init__.py +10 -2
- acryl_datahub_cloud/sdk/assertion/__init__.py +0 -0
- acryl_datahub_cloud/sdk/{assertion.py → assertion/assertion_base.py} +614 -231
- acryl_datahub_cloud/sdk/assertion/smart_column_metric_assertion.py +224 -0
- acryl_datahub_cloud/sdk/assertion/types.py +18 -0
- acryl_datahub_cloud/sdk/assertion_input/__init__.py +0 -0
- acryl_datahub_cloud/sdk/{assertion_input.py → assertion_input/assertion_input.py} +437 -147
- acryl_datahub_cloud/sdk/assertion_input/freshness_assertion_input.py +261 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_column_metric_assertion_input.py +943 -0
- acryl_datahub_cloud/sdk/assertions_client.py +1281 -70
- acryl_datahub_cloud/sdk/entities/assertion.py +8 -1
- {acryl_datahub_cloud-0.3.12rc3.dist-info → acryl_datahub_cloud-0.3.12rc5.dist-info}/METADATA +41 -41
- {acryl_datahub_cloud-0.3.12rc3.dist-info → acryl_datahub_cloud-0.3.12rc5.dist-info}/RECORD +20 -14
- {acryl_datahub_cloud-0.3.12rc3.dist-info → acryl_datahub_cloud-0.3.12rc5.dist-info}/WHEEL +0 -0
- {acryl_datahub_cloud-0.3.12rc3.dist-info → acryl_datahub_cloud-0.3.12rc5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_cloud-0.3.12rc3.dist-info → acryl_datahub_cloud-0.3.12rc5.dist-info}/top_level.txt +0 -0
|
@@ -4,21 +4,38 @@ import logging
|
|
|
4
4
|
from datetime import datetime, timezone
|
|
5
5
|
from typing import TYPE_CHECKING, Any, Optional, Union
|
|
6
6
|
|
|
7
|
-
from acryl_datahub_cloud.sdk.assertion import (
|
|
7
|
+
from acryl_datahub_cloud.sdk.assertion.assertion_base import (
|
|
8
8
|
AssertionMode,
|
|
9
|
+
FreshnessAssertion,
|
|
9
10
|
SmartFreshnessAssertion,
|
|
10
11
|
SmartVolumeAssertion,
|
|
11
12
|
_AssertionPublic,
|
|
12
13
|
)
|
|
13
|
-
from acryl_datahub_cloud.sdk.
|
|
14
|
+
from acryl_datahub_cloud.sdk.assertion.smart_column_metric_assertion import (
|
|
15
|
+
SmartColumnMetricAssertion,
|
|
16
|
+
)
|
|
17
|
+
from acryl_datahub_cloud.sdk.assertion_input.assertion_input import (
|
|
14
18
|
AssertionIncidentBehavior,
|
|
15
19
|
DetectionMechanismInputTypes,
|
|
16
20
|
ExclusionWindowInputTypes,
|
|
17
21
|
InferenceSensitivity,
|
|
22
|
+
TimeWindowSizeInputTypes,
|
|
18
23
|
_AssertionInput,
|
|
19
24
|
_SmartFreshnessAssertionInput,
|
|
20
25
|
_SmartVolumeAssertionInput,
|
|
21
26
|
)
|
|
27
|
+
from acryl_datahub_cloud.sdk.assertion_input.freshness_assertion_input import (
|
|
28
|
+
_FreshnessAssertionInput,
|
|
29
|
+
)
|
|
30
|
+
from acryl_datahub_cloud.sdk.assertion_input.smart_column_metric_assertion_input import (
|
|
31
|
+
MetricInputType,
|
|
32
|
+
OperatorInputType,
|
|
33
|
+
RangeInputType,
|
|
34
|
+
RangeTypeInputType,
|
|
35
|
+
ValueInputType,
|
|
36
|
+
ValueTypeInputType,
|
|
37
|
+
_SmartColumnMetricAssertionInput,
|
|
38
|
+
)
|
|
22
39
|
from acryl_datahub_cloud.sdk.entities.assertion import Assertion, TagsInputType
|
|
23
40
|
from acryl_datahub_cloud.sdk.entities.monitor import Monitor
|
|
24
41
|
from acryl_datahub_cloud.sdk.errors import SDKUsageError
|
|
@@ -171,7 +188,7 @@ class AssertionsClient:
|
|
|
171
188
|
# 3. Merge the assertion input with the existing assertion and monitor entities or create a new assertion
|
|
172
189
|
# if the assertion does not exist:
|
|
173
190
|
merged_assertion_input_or_created_assertion = (
|
|
174
|
-
self.
|
|
191
|
+
self._retrieve_and_merge_smart_freshness_assertion_and_monitor(
|
|
175
192
|
assertion_input=assertion_input,
|
|
176
193
|
dataset_urn=dataset_urn,
|
|
177
194
|
urn=urn,
|
|
@@ -212,7 +229,7 @@ class AssertionsClient:
|
|
|
212
229
|
|
|
213
230
|
return SmartFreshnessAssertion._from_entities(assertion_entity, monitor_entity)
|
|
214
231
|
|
|
215
|
-
def
|
|
232
|
+
def _retrieve_and_merge_smart_freshness_assertion_and_monitor(
|
|
216
233
|
self,
|
|
217
234
|
assertion_input: _SmartFreshnessAssertionInput,
|
|
218
235
|
dataset_urn: Union[str, DatasetUrn],
|
|
@@ -277,7 +294,7 @@ class AssertionsClient:
|
|
|
277
294
|
)
|
|
278
295
|
|
|
279
296
|
# 4. Merge the existing assertion with the validated input:
|
|
280
|
-
merged_assertion_input = self.
|
|
297
|
+
merged_assertion_input = self._merge_smart_freshness_input(
|
|
281
298
|
dataset_urn=dataset_urn,
|
|
282
299
|
urn=urn,
|
|
283
300
|
display_name=display_name,
|
|
@@ -350,6 +367,7 @@ class AssertionsClient:
|
|
|
350
367
|
incident_behavior=incident_behavior,
|
|
351
368
|
tags=tags,
|
|
352
369
|
created_by=updated_by,
|
|
370
|
+
schedule=schedule,
|
|
353
371
|
)
|
|
354
372
|
|
|
355
373
|
# 3. Check for any issues e.g. different dataset urns
|
|
@@ -363,7 +381,7 @@ class AssertionsClient:
|
|
|
363
381
|
)
|
|
364
382
|
|
|
365
383
|
# 4. Merge the existing assertion with the validated input:
|
|
366
|
-
merged_assertion_input = self.
|
|
384
|
+
merged_assertion_input = self._merge_smart_volume_input(
|
|
367
385
|
dataset_urn=dataset_urn,
|
|
368
386
|
urn=urn,
|
|
369
387
|
display_name=display_name,
|
|
@@ -384,6 +402,93 @@ class AssertionsClient:
|
|
|
384
402
|
|
|
385
403
|
return merged_assertion_input
|
|
386
404
|
|
|
405
|
+
def _retrieve_and_merge_freshness_assertion_and_monitor(
|
|
406
|
+
self,
|
|
407
|
+
assertion_input: _FreshnessAssertionInput,
|
|
408
|
+
dataset_urn: Union[str, DatasetUrn],
|
|
409
|
+
urn: Union[str, AssertionUrn],
|
|
410
|
+
display_name: Optional[str],
|
|
411
|
+
enabled: Optional[bool],
|
|
412
|
+
detection_mechanism: DetectionMechanismInputTypes,
|
|
413
|
+
incident_behavior: Optional[
|
|
414
|
+
Union[AssertionIncidentBehavior, list[AssertionIncidentBehavior]]
|
|
415
|
+
],
|
|
416
|
+
tags: Optional[TagsInputType],
|
|
417
|
+
updated_by: Optional[Union[str, CorpUserUrn]],
|
|
418
|
+
now_utc: datetime,
|
|
419
|
+
schedule: Optional[Union[str, models.CronScheduleClass]],
|
|
420
|
+
freshness_schedule_check_type: Optional[
|
|
421
|
+
Union[str, models.FreshnessAssertionScheduleTypeClass]
|
|
422
|
+
] = None,
|
|
423
|
+
lookback_window: Optional[TimeWindowSizeInputTypes] = None,
|
|
424
|
+
) -> Union[FreshnessAssertion, _FreshnessAssertionInput]:
|
|
425
|
+
# 1. Retrieve any existing assertion and monitor entities:
|
|
426
|
+
maybe_assertion_entity, monitor_urn, maybe_monitor_entity = (
|
|
427
|
+
self._retrieve_assertion_and_monitor(assertion_input)
|
|
428
|
+
)
|
|
429
|
+
|
|
430
|
+
# 2.1 If the assertion and monitor entities exist, create an assertion object from them:
|
|
431
|
+
if maybe_assertion_entity and maybe_monitor_entity:
|
|
432
|
+
existing_assertion = FreshnessAssertion._from_entities(
|
|
433
|
+
maybe_assertion_entity, maybe_monitor_entity
|
|
434
|
+
)
|
|
435
|
+
# 2.2 If the assertion exists but the monitor does not, create a placeholder monitor entity to be able to create the assertion:
|
|
436
|
+
elif maybe_assertion_entity and not maybe_monitor_entity:
|
|
437
|
+
monitor_mode = (
|
|
438
|
+
"ACTIVE" if enabled else "INACTIVE" if enabled is not None else "ACTIVE"
|
|
439
|
+
)
|
|
440
|
+
existing_assertion = FreshnessAssertion._from_entities(
|
|
441
|
+
maybe_assertion_entity,
|
|
442
|
+
Monitor(id=monitor_urn, info=("ASSERTION", monitor_mode)),
|
|
443
|
+
)
|
|
444
|
+
# 2.3 If the assertion does not exist, create a new assertion with a generated urn and return the assertion input:
|
|
445
|
+
elif not maybe_assertion_entity:
|
|
446
|
+
logger.info(
|
|
447
|
+
f"No existing assertion entity found for assertion urn {urn}, creating a new assertion with a generated urn"
|
|
448
|
+
)
|
|
449
|
+
return self._create_freshness_assertion(
|
|
450
|
+
dataset_urn=dataset_urn,
|
|
451
|
+
display_name=display_name,
|
|
452
|
+
detection_mechanism=detection_mechanism,
|
|
453
|
+
incident_behavior=incident_behavior,
|
|
454
|
+
tags=tags,
|
|
455
|
+
created_by=updated_by,
|
|
456
|
+
schedule=schedule,
|
|
457
|
+
freshness_schedule_check_type=freshness_schedule_check_type,
|
|
458
|
+
lookback_window=lookback_window,
|
|
459
|
+
)
|
|
460
|
+
|
|
461
|
+
# 3. Check for any issues e.g. different dataset urns
|
|
462
|
+
if (
|
|
463
|
+
existing_assertion
|
|
464
|
+
and hasattr(existing_assertion, "dataset_urn")
|
|
465
|
+
and existing_assertion.dataset_urn != assertion_input.dataset_urn
|
|
466
|
+
):
|
|
467
|
+
raise SDKUsageError(
|
|
468
|
+
f"Dataset URN mismatch, existing assertion: {existing_assertion.dataset_urn} != new assertion: {dataset_urn}"
|
|
469
|
+
)
|
|
470
|
+
|
|
471
|
+
# 4. Merge the existing assertion with the validated input:
|
|
472
|
+
merged_assertion_input = self._merge_freshness_input(
|
|
473
|
+
dataset_urn=dataset_urn,
|
|
474
|
+
urn=urn,
|
|
475
|
+
display_name=display_name,
|
|
476
|
+
enabled=enabled,
|
|
477
|
+
detection_mechanism=detection_mechanism,
|
|
478
|
+
incident_behavior=incident_behavior,
|
|
479
|
+
tags=tags,
|
|
480
|
+
now_utc=now_utc,
|
|
481
|
+
assertion_input=assertion_input,
|
|
482
|
+
maybe_assertion_entity=maybe_assertion_entity,
|
|
483
|
+
maybe_monitor_entity=maybe_monitor_entity,
|
|
484
|
+
existing_assertion=existing_assertion,
|
|
485
|
+
schedule=schedule,
|
|
486
|
+
freshness_schedule_check_type=freshness_schedule_check_type,
|
|
487
|
+
lookback_window=lookback_window,
|
|
488
|
+
)
|
|
489
|
+
|
|
490
|
+
return merged_assertion_input
|
|
491
|
+
|
|
387
492
|
def _retrieve_assertion_and_monitor(
|
|
388
493
|
self,
|
|
389
494
|
assertion_input: _AssertionInput,
|
|
@@ -423,7 +528,7 @@ class AssertionsClient:
|
|
|
423
528
|
|
|
424
529
|
return maybe_assertion_entity, monitor_urn, maybe_monitor_entity
|
|
425
530
|
|
|
426
|
-
def
|
|
531
|
+
def _merge_smart_freshness_input(
|
|
427
532
|
self,
|
|
428
533
|
dataset_urn: Union[str, DatasetUrn],
|
|
429
534
|
urn: Union[str, AssertionUrn],
|
|
@@ -554,7 +659,129 @@ class AssertionsClient:
|
|
|
554
659
|
|
|
555
660
|
return merged_assertion_input
|
|
556
661
|
|
|
557
|
-
def
|
|
662
|
+
def _merge_freshness_input(
|
|
663
|
+
self,
|
|
664
|
+
dataset_urn: Union[str, DatasetUrn],
|
|
665
|
+
urn: Union[str, AssertionUrn],
|
|
666
|
+
display_name: Optional[str],
|
|
667
|
+
enabled: Optional[bool],
|
|
668
|
+
detection_mechanism: DetectionMechanismInputTypes,
|
|
669
|
+
incident_behavior: Optional[
|
|
670
|
+
Union[AssertionIncidentBehavior, list[AssertionIncidentBehavior]]
|
|
671
|
+
],
|
|
672
|
+
tags: Optional[TagsInputType],
|
|
673
|
+
now_utc: datetime,
|
|
674
|
+
assertion_input: _FreshnessAssertionInput,
|
|
675
|
+
maybe_assertion_entity: Optional[Assertion],
|
|
676
|
+
maybe_monitor_entity: Optional[Monitor],
|
|
677
|
+
existing_assertion: FreshnessAssertion,
|
|
678
|
+
schedule: Optional[Union[str, models.CronScheduleClass]],
|
|
679
|
+
freshness_schedule_check_type: Optional[
|
|
680
|
+
Union[str, models.FreshnessAssertionScheduleTypeClass]
|
|
681
|
+
] = None,
|
|
682
|
+
lookback_window: Optional[TimeWindowSizeInputTypes] = None,
|
|
683
|
+
) -> _FreshnessAssertionInput:
|
|
684
|
+
"""Merge the input with the existing assertion and monitor entities.
|
|
685
|
+
|
|
686
|
+
Args:
|
|
687
|
+
dataset_urn: The urn of the dataset to be monitored.
|
|
688
|
+
urn: The urn of the assertion.
|
|
689
|
+
display_name: The display name of the assertion.
|
|
690
|
+
enabled: Whether the assertion is enabled.
|
|
691
|
+
incident_behavior: The incident behavior to be applied to the assertion.
|
|
692
|
+
tags: The tags to be applied to the assertion.
|
|
693
|
+
now_utc: The current UTC time from when the function is called.
|
|
694
|
+
assertion_input: The validated input to the function.
|
|
695
|
+
maybe_assertion_entity: The existing assertion entity from the DataHub instance.
|
|
696
|
+
maybe_monitor_entity: The existing monitor entity from the DataHub instance.
|
|
697
|
+
existing_assertion: The existing assertion from the DataHub instance.
|
|
698
|
+
schedule: The schedule to be applied to the assertion.
|
|
699
|
+
freshness_schedule_check_type: The freshness schedule check type to be applied to the assertion.
|
|
700
|
+
lookback_window: The lookback window to be applied to the assertion.
|
|
701
|
+
|
|
702
|
+
Returns:
|
|
703
|
+
The merged assertion input.
|
|
704
|
+
"""
|
|
705
|
+
merged_assertion_input = _FreshnessAssertionInput(
|
|
706
|
+
urn=urn,
|
|
707
|
+
entity_client=self.client.entities,
|
|
708
|
+
dataset_urn=dataset_urn,
|
|
709
|
+
display_name=_merge_field(
|
|
710
|
+
display_name,
|
|
711
|
+
"display_name",
|
|
712
|
+
assertion_input,
|
|
713
|
+
existing_assertion,
|
|
714
|
+
maybe_assertion_entity.description if maybe_assertion_entity else None,
|
|
715
|
+
),
|
|
716
|
+
enabled=_merge_field(
|
|
717
|
+
enabled,
|
|
718
|
+
"enabled",
|
|
719
|
+
assertion_input,
|
|
720
|
+
existing_assertion,
|
|
721
|
+
existing_assertion.mode == AssertionMode.ACTIVE
|
|
722
|
+
if existing_assertion
|
|
723
|
+
else None,
|
|
724
|
+
),
|
|
725
|
+
schedule=_merge_field(
|
|
726
|
+
schedule,
|
|
727
|
+
"schedule",
|
|
728
|
+
assertion_input,
|
|
729
|
+
existing_assertion,
|
|
730
|
+
existing_assertion.schedule if existing_assertion else None,
|
|
731
|
+
),
|
|
732
|
+
freshness_schedule_check_type=_merge_field(
|
|
733
|
+
freshness_schedule_check_type,
|
|
734
|
+
"freshness_schedule_check_type",
|
|
735
|
+
assertion_input,
|
|
736
|
+
existing_assertion,
|
|
737
|
+
existing_assertion._freshness_schedule_check_type
|
|
738
|
+
if existing_assertion
|
|
739
|
+
else None,
|
|
740
|
+
),
|
|
741
|
+
lookback_window=_merge_field(
|
|
742
|
+
lookback_window,
|
|
743
|
+
"lookback_window",
|
|
744
|
+
assertion_input,
|
|
745
|
+
existing_assertion,
|
|
746
|
+
existing_assertion.lookback_window if existing_assertion else None,
|
|
747
|
+
),
|
|
748
|
+
detection_mechanism=_merge_field(
|
|
749
|
+
detection_mechanism,
|
|
750
|
+
"detection_mechanism",
|
|
751
|
+
assertion_input,
|
|
752
|
+
existing_assertion,
|
|
753
|
+
FreshnessAssertion._get_detection_mechanism(
|
|
754
|
+
maybe_assertion_entity, maybe_monitor_entity, default=None
|
|
755
|
+
)
|
|
756
|
+
if maybe_assertion_entity and maybe_monitor_entity
|
|
757
|
+
else None,
|
|
758
|
+
),
|
|
759
|
+
incident_behavior=_merge_field(
|
|
760
|
+
incident_behavior,
|
|
761
|
+
"incident_behavior",
|
|
762
|
+
assertion_input,
|
|
763
|
+
existing_assertion,
|
|
764
|
+
FreshnessAssertion._get_incident_behavior(maybe_assertion_entity)
|
|
765
|
+
if maybe_assertion_entity
|
|
766
|
+
else None,
|
|
767
|
+
),
|
|
768
|
+
tags=_merge_field(
|
|
769
|
+
tags,
|
|
770
|
+
"tags",
|
|
771
|
+
assertion_input,
|
|
772
|
+
existing_assertion,
|
|
773
|
+
maybe_assertion_entity.tags if maybe_assertion_entity else None,
|
|
774
|
+
),
|
|
775
|
+
created_by=existing_assertion.created_by
|
|
776
|
+
or DEFAULT_CREATED_BY, # Override with the existing assertion's created_by or the default created_by if not set
|
|
777
|
+
created_at=existing_assertion.created_at
|
|
778
|
+
or now_utc, # Override with the existing assertion's created_at or now if not set
|
|
779
|
+
updated_by=assertion_input.updated_by, # Override with the input's updated_by
|
|
780
|
+
updated_at=assertion_input.updated_at, # Override with the input's updated_at (now)
|
|
781
|
+
)
|
|
782
|
+
return merged_assertion_input
|
|
783
|
+
|
|
784
|
+
def _merge_smart_volume_input(
|
|
558
785
|
self,
|
|
559
786
|
dataset_urn: Union[str, DatasetUrn],
|
|
560
787
|
urn: Union[str, AssertionUrn],
|
|
@@ -919,46 +1146,35 @@ class AssertionsClient:
|
|
|
919
1146
|
# raise e
|
|
920
1147
|
return SmartVolumeAssertion._from_entities(assertion_entity, monitor_entity)
|
|
921
1148
|
|
|
922
|
-
def
|
|
1149
|
+
def _create_freshness_assertion(
|
|
923
1150
|
self,
|
|
924
1151
|
*,
|
|
925
1152
|
dataset_urn: Union[str, DatasetUrn],
|
|
926
|
-
urn: Optional[Union[str, AssertionUrn]] = None,
|
|
927
1153
|
display_name: Optional[str] = None,
|
|
928
|
-
enabled:
|
|
1154
|
+
enabled: bool = True,
|
|
1155
|
+
freshness_schedule_check_type: Optional[
|
|
1156
|
+
Union[str, models.FreshnessAssertionScheduleTypeClass]
|
|
1157
|
+
] = None,
|
|
1158
|
+
lookback_window: Optional[TimeWindowSizeInputTypes] = None,
|
|
929
1159
|
detection_mechanism: DetectionMechanismInputTypes = None,
|
|
930
|
-
sensitivity: Optional[Union[str, InferenceSensitivity]] = None,
|
|
931
|
-
exclusion_windows: Optional[ExclusionWindowInputTypes] = None,
|
|
932
|
-
training_data_lookback_days: Optional[int] = None,
|
|
933
1160
|
incident_behavior: Optional[
|
|
934
1161
|
Union[AssertionIncidentBehavior, list[AssertionIncidentBehavior]]
|
|
935
1162
|
] = None,
|
|
936
1163
|
tags: Optional[TagsInputType] = None,
|
|
937
|
-
|
|
1164
|
+
created_by: Optional[Union[str, CorpUserUrn]] = None,
|
|
938
1165
|
schedule: Optional[Union[str, models.CronScheduleClass]] = None,
|
|
939
|
-
) ->
|
|
940
|
-
"""
|
|
1166
|
+
) -> FreshnessAssertion:
|
|
1167
|
+
"""Create a freshness assertion.
|
|
941
1168
|
|
|
942
1169
|
Note: keyword arguments are required.
|
|
943
1170
|
|
|
944
|
-
|
|
945
|
-
it will be created. If it does exist, it will be updated. Existing assertion fields will
|
|
946
|
-
be updated if the input value is not None. If the input value is None, the existing value
|
|
947
|
-
will be preserved. If the input value can be un-set e.g. by passing an empty list or
|
|
948
|
-
empty string.
|
|
949
|
-
|
|
950
|
-
Schedule behavior:
|
|
951
|
-
- Create case: Uses default hourly schedule (\"0 * * * *\") or provided schedule
|
|
952
|
-
- Update case: Different than `sync_smart_freshness_assertion`, schedule is updated.
|
|
1171
|
+
The created assertion will use the default daily schedule ("0 0 * * *").
|
|
953
1172
|
|
|
954
1173
|
Args:
|
|
955
1174
|
dataset_urn: The urn of the dataset to be monitored.
|
|
956
|
-
|
|
957
|
-
will be
|
|
958
|
-
|
|
959
|
-
will be generated.
|
|
960
|
-
enabled: Whether the assertion is enabled. If not provided, the existing value
|
|
961
|
-
will be preserved.
|
|
1175
|
+
display_name: The display name of the assertion. If not provided, a random display
|
|
1176
|
+
name will be generated.
|
|
1177
|
+
enabled: Whether the assertion is enabled. Defaults to True.
|
|
962
1178
|
detection_mechanism: The detection mechanism to be used for the assertion. Information
|
|
963
1179
|
schema is recommended. Valid values are:
|
|
964
1180
|
- "information_schema" or DetectionMechanism.INFORMATION_SCHEMA
|
|
@@ -969,33 +1185,15 @@ class AssertionsClient:
|
|
|
969
1185
|
"additional_filter": "last_modified > '2021-01-01'",
|
|
970
1186
|
} or DetectionMechanism.LAST_MODIFIED_COLUMN(column_name='last_modified',
|
|
971
1187
|
additional_filter='last_modified > 2021-01-01')
|
|
972
|
-
- {
|
|
973
|
-
"type": "high_watermark_column",
|
|
974
|
-
"column_name": "id",
|
|
975
|
-
"additional_filter": "id > 1000",
|
|
976
|
-
} or DetectionMechanism.HIGH_WATERMARK_COLUMN(column_name='id',
|
|
977
|
-
additional_filter='id > 1000')
|
|
978
1188
|
- "datahub_operation" or DetectionMechanism.DATAHUB_OPERATION
|
|
979
|
-
|
|
980
|
-
- "
|
|
981
|
-
- "
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
"end": "datetime(2025, 1, 2, 0, 0, 0)",
|
|
988
|
-
}
|
|
989
|
-
- from string datetimes: {
|
|
990
|
-
"start": "2025-01-01T00:00:00",
|
|
991
|
-
"end": "2025-01-02T00:00:00",
|
|
992
|
-
}
|
|
993
|
-
- from FixedRangeExclusionWindow objects: FixedRangeExclusionWindow(
|
|
994
|
-
start=datetime(2025, 1, 1, 0, 0, 0),
|
|
995
|
-
end=datetime(2025, 1, 2, 0, 0, 0)
|
|
996
|
-
)
|
|
997
|
-
training_data_lookback_days: The training data lookback days to be applied to the
|
|
998
|
-
assertion as an integer.
|
|
1189
|
+
freshness_schedule_check_type: The freshness schedule check type to be applied to the assertion. Valid values are:
|
|
1190
|
+
- "since_the_last_check" or models.FreshnessAssertionScheduleTypeClass.SINCE_THE_LAST_CHECK
|
|
1191
|
+
- "cron" or models.FreshnessAssertionScheduleTypeClass.CRON
|
|
1192
|
+
lookback_window: The lookback window to be applied to the assertion. Valid values are:
|
|
1193
|
+
- from models.TimeWindowSize objects: models.TimeWindowSizeClass(
|
|
1194
|
+
unit=models.CalendarIntervalClass.DAY,
|
|
1195
|
+
multiple=1)
|
|
1196
|
+
- from TimeWindowSize objects: TimeWindowSize(unit='DAY', multiple=1)
|
|
999
1197
|
incident_behavior: The incident behavior to be applied to the assertion. Valid values are:
|
|
1000
1198
|
- "raise_on_fail" or AssertionIncidentBehavior.RAISE_ON_FAIL
|
|
1001
1199
|
- "resolve_on_pass" or AssertionIncidentBehavior.RESOLVE_ON_PASS
|
|
@@ -1003,7 +1201,7 @@ class AssertionsClient:
|
|
|
1003
1201
|
- a list of strings (strings will be converted to TagUrn objects)
|
|
1004
1202
|
- a list of TagUrn objects
|
|
1005
1203
|
- a list of TagAssociationClass objects
|
|
1006
|
-
|
|
1204
|
+
created_by: Optional urn of the user who created the assertion. The format is
|
|
1007
1205
|
"urn:li:corpuser:<username>", which you can find on the Users & Groups page.
|
|
1008
1206
|
The default is the datahub system user.
|
|
1009
1207
|
TODO: Retrieve the SDK user as the default instead of the datahub system user.
|
|
@@ -1014,22 +1212,157 @@ class AssertionsClient:
|
|
|
1014
1212
|
cron and timezone. Use `from datahub.metadata import schema_classes as models` to import the class.
|
|
1015
1213
|
|
|
1016
1214
|
Returns:
|
|
1017
|
-
|
|
1215
|
+
FreshnessAssertion: The created assertion.
|
|
1018
1216
|
"""
|
|
1019
1217
|
_print_experimental_warning()
|
|
1020
1218
|
now_utc = datetime.now(timezone.utc)
|
|
1021
|
-
|
|
1022
|
-
if updated_by is None:
|
|
1219
|
+
if created_by is None:
|
|
1023
1220
|
logger.warning(
|
|
1024
|
-
f"
|
|
1221
|
+
f"Created by is not set, using {DEFAULT_CREATED_BY} as a placeholder"
|
|
1025
1222
|
)
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1223
|
+
created_by = DEFAULT_CREATED_BY
|
|
1224
|
+
assertion_input = _FreshnessAssertionInput(
|
|
1225
|
+
urn=None,
|
|
1226
|
+
entity_client=self.client.entities,
|
|
1227
|
+
dataset_urn=dataset_urn,
|
|
1228
|
+
display_name=display_name,
|
|
1229
|
+
enabled=enabled,
|
|
1230
|
+
detection_mechanism=detection_mechanism,
|
|
1231
|
+
freshness_schedule_check_type=freshness_schedule_check_type,
|
|
1232
|
+
lookback_window=lookback_window,
|
|
1233
|
+
incident_behavior=incident_behavior,
|
|
1234
|
+
tags=tags,
|
|
1235
|
+
created_by=created_by,
|
|
1236
|
+
created_at=now_utc,
|
|
1237
|
+
updated_by=created_by,
|
|
1238
|
+
updated_at=now_utc,
|
|
1239
|
+
schedule=schedule,
|
|
1240
|
+
)
|
|
1241
|
+
assertion_entity, monitor_entity = (
|
|
1242
|
+
assertion_input.to_assertion_and_monitor_entities()
|
|
1243
|
+
)
|
|
1244
|
+
# If assertion creation fails, we won't try to create the monitor
|
|
1245
|
+
self.client.entities.create(assertion_entity)
|
|
1246
|
+
# TODO: Wrap monitor creation in a try-except and delete the assertion if monitor creation fails (once delete is implemented https://linear.app/acryl-data/issue/OBS-1350/add-delete-method-to-entity-clientpy)
|
|
1247
|
+
# try:
|
|
1248
|
+
self.client.entities.create(monitor_entity)
|
|
1249
|
+
# except Exception as e:
|
|
1250
|
+
# logger.error(f"Error creating monitor: {e}")
|
|
1251
|
+
# self.client.entities.delete(assertion_entity)
|
|
1252
|
+
# raise e
|
|
1253
|
+
return FreshnessAssertion._from_entities(assertion_entity, monitor_entity)
|
|
1254
|
+
|
|
1255
|
+
def sync_smart_volume_assertion(
|
|
1256
|
+
self,
|
|
1257
|
+
*,
|
|
1258
|
+
dataset_urn: Union[str, DatasetUrn],
|
|
1259
|
+
urn: Optional[Union[str, AssertionUrn]] = None,
|
|
1260
|
+
display_name: Optional[str] = None,
|
|
1261
|
+
enabled: Optional[bool] = None,
|
|
1262
|
+
detection_mechanism: DetectionMechanismInputTypes = None,
|
|
1263
|
+
sensitivity: Optional[Union[str, InferenceSensitivity]] = None,
|
|
1264
|
+
exclusion_windows: Optional[ExclusionWindowInputTypes] = None,
|
|
1265
|
+
training_data_lookback_days: Optional[int] = None,
|
|
1266
|
+
incident_behavior: Optional[
|
|
1267
|
+
Union[AssertionIncidentBehavior, list[AssertionIncidentBehavior]]
|
|
1268
|
+
] = None,
|
|
1269
|
+
tags: Optional[TagsInputType] = None,
|
|
1270
|
+
updated_by: Optional[Union[str, CorpUserUrn]] = None,
|
|
1271
|
+
schedule: Optional[Union[str, models.CronScheduleClass]] = None,
|
|
1272
|
+
) -> SmartVolumeAssertion:
|
|
1273
|
+
"""Upsert and merge a smart volume assertion.
|
|
1274
|
+
|
|
1275
|
+
Note: keyword arguments are required.
|
|
1276
|
+
|
|
1277
|
+
Upsert and merge is a combination of create and update. If the assertion does not exist,
|
|
1278
|
+
it will be created. If it does exist, it will be updated. Existing assertion fields will
|
|
1279
|
+
be updated if the input value is not None. If the input value is None, the existing value
|
|
1280
|
+
will be preserved. If the input value can be un-set e.g. by passing an empty list or
|
|
1281
|
+
empty string.
|
|
1282
|
+
|
|
1283
|
+
Schedule behavior:
|
|
1284
|
+
- Create case: Uses default hourly schedule (\"0 * * * *\") or provided schedule
|
|
1285
|
+
- Update case: Different than `sync_smart_freshness_assertion`, schedule is updated.
|
|
1286
|
+
|
|
1287
|
+
Args:
|
|
1288
|
+
dataset_urn: The urn of the dataset to be monitored.
|
|
1289
|
+
urn: The urn of the assertion. If not provided, a urn will be generated and the assertion
|
|
1290
|
+
will be _created_ in the DataHub instance.
|
|
1291
|
+
display_name: The display name of the assertion. If not provided, a random display name
|
|
1292
|
+
will be generated.
|
|
1293
|
+
enabled: Whether the assertion is enabled. If not provided, the existing value
|
|
1294
|
+
will be preserved.
|
|
1295
|
+
detection_mechanism: The detection mechanism to be used for the assertion. Information
|
|
1296
|
+
schema is recommended. Valid values are:
|
|
1297
|
+
- "information_schema" or DetectionMechanism.INFORMATION_SCHEMA
|
|
1298
|
+
- "audit_log" or DetectionMechanism.AUDIT_LOG
|
|
1299
|
+
- {
|
|
1300
|
+
"type": "last_modified_column",
|
|
1301
|
+
"column_name": "last_modified",
|
|
1302
|
+
"additional_filter": "last_modified > '2021-01-01'",
|
|
1303
|
+
} or DetectionMechanism.LAST_MODIFIED_COLUMN(column_name='last_modified',
|
|
1304
|
+
additional_filter='last_modified > 2021-01-01')
|
|
1305
|
+
- {
|
|
1306
|
+
"type": "high_watermark_column",
|
|
1307
|
+
"column_name": "id",
|
|
1308
|
+
"additional_filter": "id > 1000",
|
|
1309
|
+
} or DetectionMechanism.HIGH_WATERMARK_COLUMN(column_name='id',
|
|
1310
|
+
additional_filter='id > 1000')
|
|
1311
|
+
- "datahub_operation" or DetectionMechanism.DATAHUB_OPERATION
|
|
1312
|
+
sensitivity: The sensitivity to be applied to the assertion. Valid values are:
|
|
1313
|
+
- "low" or InferenceSensitivity.LOW
|
|
1314
|
+
- "medium" or InferenceSensitivity.MEDIUM
|
|
1315
|
+
- "high" or InferenceSensitivity.HIGH
|
|
1316
|
+
exclusion_windows: The exclusion windows to be applied to the assertion, currently only
|
|
1317
|
+
fixed range exclusion windows are supported. Valid values are:
|
|
1318
|
+
- from datetime.datetime objects: {
|
|
1319
|
+
"start": "datetime(2025, 1, 1, 0, 0, 0)",
|
|
1320
|
+
"end": "datetime(2025, 1, 2, 0, 0, 0)",
|
|
1321
|
+
}
|
|
1322
|
+
- from string datetimes: {
|
|
1323
|
+
"start": "2025-01-01T00:00:00",
|
|
1324
|
+
"end": "2025-01-02T00:00:00",
|
|
1325
|
+
}
|
|
1326
|
+
- from FixedRangeExclusionWindow objects: FixedRangeExclusionWindow(
|
|
1327
|
+
start=datetime(2025, 1, 1, 0, 0, 0),
|
|
1328
|
+
end=datetime(2025, 1, 2, 0, 0, 0)
|
|
1329
|
+
)
|
|
1330
|
+
training_data_lookback_days: The training data lookback days to be applied to the
|
|
1331
|
+
assertion as an integer.
|
|
1332
|
+
incident_behavior: The incident behavior to be applied to the assertion. Valid values are:
|
|
1333
|
+
- "raise_on_fail" or AssertionIncidentBehavior.RAISE_ON_FAIL
|
|
1334
|
+
- "resolve_on_pass" or AssertionIncidentBehavior.RESOLVE_ON_PASS
|
|
1335
|
+
tags: The tags to be applied to the assertion. Valid values are:
|
|
1336
|
+
- a list of strings (strings will be converted to TagUrn objects)
|
|
1337
|
+
- a list of TagUrn objects
|
|
1338
|
+
- a list of TagAssociationClass objects
|
|
1339
|
+
updated_by: Optional urn of the user who updated the assertion. The format is
|
|
1340
|
+
"urn:li:corpuser:<username>", which you can find on the Users & Groups page.
|
|
1341
|
+
The default is the datahub system user.
|
|
1342
|
+
TODO: Retrieve the SDK user as the default instead of the datahub system user.
|
|
1343
|
+
schedule: Optional cron formatted schedule for the assertion. If not provided, a default
|
|
1344
|
+
schedule will be used. The schedule determines when the assertion will be evaluated.
|
|
1345
|
+
The format is a cron expression, e.g. "0 * * * *" for every hour using UTC timezone.
|
|
1346
|
+
Alternatively, a models.CronScheduleClass object can be provided with string parameters
|
|
1347
|
+
cron and timezone. Use `from datahub.metadata import schema_classes as models` to import the class.
|
|
1348
|
+
|
|
1349
|
+
Returns:
|
|
1350
|
+
SmartVolumeAssertion: The created or updated assertion.
|
|
1351
|
+
"""
|
|
1352
|
+
_print_experimental_warning()
|
|
1353
|
+
now_utc = datetime.now(timezone.utc)
|
|
1354
|
+
|
|
1355
|
+
if updated_by is None:
|
|
1356
|
+
logger.warning(
|
|
1357
|
+
f"updated_by is not set, using {DEFAULT_CREATED_BY} as a placeholder"
|
|
1358
|
+
)
|
|
1359
|
+
updated_by = DEFAULT_CREATED_BY
|
|
1360
|
+
|
|
1361
|
+
# 1. If urn is not set, create a new assertion
|
|
1362
|
+
if urn is None:
|
|
1363
|
+
logger.info("URN is not set, creating a new assertion")
|
|
1364
|
+
return self._create_smart_volume_assertion(
|
|
1365
|
+
dataset_urn=dataset_urn,
|
|
1033
1366
|
display_name=display_name,
|
|
1034
1367
|
enabled=enabled if enabled is not None else True,
|
|
1035
1368
|
detection_mechanism=detection_mechanism,
|
|
@@ -1106,6 +1439,884 @@ class AssertionsClient:
|
|
|
1106
1439
|
|
|
1107
1440
|
return SmartVolumeAssertion._from_entities(assertion_entity, monitor_entity)
|
|
1108
1441
|
|
|
1442
|
+
def sync_smart_column_metric_assertion(
    self,
    *,
    dataset_urn: Union[str, DatasetUrn],
    column_name: str,
    metric_type: MetricInputType,
    operator: OperatorInputType,
    value: Optional[ValueInputType] = None,
    value_type: Optional[ValueTypeInputType] = None,
    range: Optional[RangeInputType] = None,
    range_type: Optional[RangeTypeInputType] = None,
    urn: Optional[Union[str, AssertionUrn]] = None,
    display_name: Optional[str] = None,
    enabled: Optional[bool] = None,
    detection_mechanism: DetectionMechanismInputTypes = None,
    sensitivity: Optional[Union[str, InferenceSensitivity]] = None,
    exclusion_windows: Optional[ExclusionWindowInputTypes] = None,
    training_data_lookback_days: Optional[int] = None,
    incident_behavior: Optional[
        Union[AssertionIncidentBehavior, list[AssertionIncidentBehavior]]
    ] = None,
    tags: Optional[TagsInputType] = None,
    updated_by: Optional[Union[str, CorpUserUrn]] = None,
    schedule: Optional[Union[str, models.CronScheduleClass]] = None,
) -> SmartColumnMetricAssertion:
    """Create or update (upsert and merge) a smart column metric assertion.

    Note: all arguments must be passed by keyword.

    If ``urn`` is not provided, a new assertion is created. If ``urn`` is
    provided, the existing assertion and monitor are retrieved and merged
    with the inputs; if no assertion exists for that urn, a new assertion is
    created instead.

    When updating, an input that is not None replaces the existing value,
    and an input that is None preserves the existing value. A value can be
    un-set by passing an empty value, e.g. an empty list or an empty string.

    Args:
        dataset_urn: The urn of the dataset to be monitored. (Required)
        column_name: The name of the column to be monitored. (Required)
        metric_type: The type of the metric to be monitored. (Required)
        operator: The operator to be used for the assertion. (Required)
        value: The value to be used for the assertion. (Required if the
            operator requires a value)
        value_type: The type of the value to be used for the assertion.
            (Required if the operator requires a value)
        range: The range to be used for the assertion. (Required if the
            operator requires a range)
        range_type: The type of the range to be used for the assertion.
            (Required if the operator requires a range)
        urn: The urn of the assertion. If not provided, a urn will be
            generated and the assertion will be _created_ in the DataHub
            instance.
        display_name: The display name of the assertion. If not provided, a
            random display name will be generated.
        enabled: Whether the assertion is enabled. If not provided, the
            existing value is preserved on update; a newly created
            assertion is enabled.
        detection_mechanism: The detection mechanism to be used for the
            assertion. Valid values are:
            - All rows query datahub dataset profile:
                - "all_rows_query_datahub_dataset_profile" or
                  DetectionMechanism.ALL_ROWS_QUERY_DATAHUB_DATASET_PROFILE
            - All rows query:
                - "all_rows_query" or DetectionMechanism.ALL_ROWS_QUERY
                - with optional additional filter:
                  DetectionMechanism.ALL_ROWS_QUERY(additional_filter='last_modified > 2021-01-01')
                - Or as a dict: {
                    "type": "all_rows_query",
                    "additional_filter": "last_modified > '2021-01-01'", # optional
                }
            - Changed rows query:
                - For changed rows query, you need to pass a supported
                  column type (Number, Date or Time)
                - DetectionMechanism.CHANGED_ROWS_QUERY(column_name='last_modified')
                - With optional additional filter:
                  DetectionMechanism.CHANGED_ROWS_QUERY(column_name='last_modified', additional_filter='last_modified > 2021-01-01')
                - Or as a dict: {
                    "type": "changed_rows_query",
                    "column_name": "last_modified",
                    "additional_filter": "last_modified > '2021-01-01'", # optional
                }
        sensitivity: The sensitivity to be applied to the assertion. Valid
            values are:
            - "low" or InferenceSensitivity.LOW
            - "medium" or InferenceSensitivity.MEDIUM
            - "high" or InferenceSensitivity.HIGH
        exclusion_windows: The exclusion windows to be applied to the
            assertion; currently only fixed range exclusion windows are
            supported. Valid values are:
            - from datetime.datetime objects: {
                "start": "datetime(2025, 1, 1, 0, 0, 0)",
                "end": "datetime(2025, 1, 2, 0, 0, 0)",
            }
            - from string datetimes: {
                "start": "2025-01-01T00:00:00",
                "end": "2025-01-02T00:00:00",
            }
            - from FixedRangeExclusionWindow objects: FixedRangeExclusionWindow(
                start=datetime(2025, 1, 1, 0, 0, 0),
                end=datetime(2025, 1, 2, 0, 0, 0)
            )
        training_data_lookback_days: The training data lookback days to be
            applied to the assertion, as an integer.
        incident_behavior: The incident behavior to be applied to the
            assertion. Valid values are:
            - "raise_on_fail" or AssertionIncidentBehavior.RAISE_ON_FAIL
            - "resolve_on_pass" or AssertionIncidentBehavior.RESOLVE_ON_PASS
        tags: The tags to be applied to the assertion. Valid values are:
            - a list of strings (strings will be converted to TagUrn objects)
            - a list of TagUrn objects
            - a list of TagAssociationClass objects
        updated_by: Optional urn of the user who updated the assertion. The
            format is "urn:li:corpuser:<username>", which you can find on
            the Users & Groups page. The default is the datahub system user.
            TODO: Retrieve the SDK user as the default instead of the
            datahub system user.
        schedule: Optional cron formatted schedule for the assertion. If not
            provided, a default schedule of every 6 hours will be used. The
            schedule determines when the assertion will be evaluated. The
            format is a cron expression, e.g. "0 * * * *" for every hour
            using UTC timezone. Alternatively, a models.CronScheduleClass
            object can be provided with string parameters cron and timezone.
            Use `from datahub.metadata import schema_classes as models` to
            import the class.

    Returns:
        SmartColumnMetricAssertion: The created or updated assertion.
    """
    _print_experimental_warning()
    now_utc = datetime.now(timezone.utc)

    if updated_by is None:
        logger.warning(
            f"updated_by is not set, using {DEFAULT_CREATED_BY} as a placeholder"
        )
        updated_by = DEFAULT_CREATED_BY

    # Create path: without a urn there is nothing to merge with.
    if urn is None:
        logger.info("URN is not set, creating a new assertion")
        return self._create_smart_column_metric_assertion(
            dataset_urn=dataset_urn,
            column_name=column_name,
            metric_type=metric_type,
            operator=operator,
            value=value,
            value_type=value_type,
            range=range,
            range_type=range_type,
            display_name=display_name,
            # A brand-new assertion is enabled unless the caller says otherwise.
            enabled=True if enabled is None else enabled,
            detection_mechanism=detection_mechanism,
            sensitivity=sensitivity,
            exclusion_windows=exclusion_windows,
            training_data_lookback_days=training_data_lookback_days,
            incident_behavior=incident_behavior,
            tags=tags,
            created_by=updated_by,
            schedule=schedule,
        )

    # Update path, step 1: build the input object first so that invalid
    # arguments are rejected before any entity is retrieved or written.
    # created_by / created_at are placeholders here; the merge step replaces
    # them with the values of the existing assertion.
    validated_input = _SmartColumnMetricAssertionInput(
        urn=urn,
        entity_client=self.client.entities,
        dataset_urn=dataset_urn,
        column_name=column_name,
        metric_type=metric_type,
        operator=operator,
        value=value,
        value_type=value_type,
        range=range,
        range_type=range_type,
        display_name=display_name,
        detection_mechanism=detection_mechanism,
        sensitivity=sensitivity,
        exclusion_windows=exclusion_windows,
        training_data_lookback_days=training_data_lookback_days,
        incident_behavior=incident_behavior,
        tags=tags,
        created_by=updated_by,
        created_at=now_utc,
        updated_by=updated_by,
        updated_at=now_utc,
        schedule=schedule,
    )

    # Step 2: merge with the existing assertion and monitor entities. The
    # helper returns an already-created assertion when no assertion exists
    # for the given urn, and a merged input otherwise.
    merge_outcome = (
        self._retrieve_and_merge_smart_column_metric_assertion_and_monitor(
            assertion_input=validated_input,
            dataset_urn=dataset_urn,
            column_name=column_name,
            metric_type=metric_type,
            operator=operator,
            value=value,
            value_type=value_type,
            range=range,
            range_type=range_type,
            urn=urn,
            display_name=display_name,
            enabled=enabled,
            detection_mechanism=detection_mechanism,
            sensitivity=sensitivity,
            exclusion_windows=exclusion_windows,
            training_data_lookback_days=training_data_lookback_days,
            incident_behavior=incident_behavior,
            tags=tags,
            updated_by=updated_by,
            now_utc=now_utc,
            schedule=schedule,
        )
    )

    # The merge step already created the assertion: nothing left to upsert.
    if isinstance(merge_outcome, _AssertionPublic):
        # Narrow the type for the type checker; the helper is declared to
        # return a SmartColumnMetricAssertion on this path.
        assert isinstance(merge_outcome, SmartColumnMetricAssertion)
        return merge_outcome

    # Step 3: upsert the assertion first, then the monitor. If the assertion
    # upsert raises, the monitor upsert is never attempted.
    assertion_entity, monitor_entity = (
        merge_outcome.to_assertion_and_monitor_entities()
    )
    self.client.entities.upsert(assertion_entity)
    # TODO: Wrap monitor upsert in a try-except and delete the assertion if monitor upsert fails (once delete is implemented https://linear.app/acryl-data/issue/OBS-1350/add-delete-method-to-entity-clientpy)
    self.client.entities.upsert(monitor_entity)

    return SmartColumnMetricAssertion._from_entities(
        assertion_entity, monitor_entity
    )
|
|
1667
|
+
|
|
1668
|
+
def _create_smart_column_metric_assertion(
    self,
    *,
    dataset_urn: Union[str, DatasetUrn],
    column_name: str,
    metric_type: MetricInputType,
    operator: OperatorInputType,
    value: Optional[ValueInputType] = None,
    value_type: Optional[ValueTypeInputType] = None,
    range: Optional[RangeInputType] = None,
    range_type: Optional[RangeTypeInputType] = None,
    display_name: Optional[str] = None,
    enabled: bool = True,
    detection_mechanism: DetectionMechanismInputTypes = None,
    sensitivity: Optional[Union[str, InferenceSensitivity]] = None,
    exclusion_windows: Optional[ExclusionWindowInputTypes] = None,
    training_data_lookback_days: Optional[int] = None,
    incident_behavior: Optional[
        Union[AssertionIncidentBehavior, list[AssertionIncidentBehavior]]
    ] = None,
    tags: Optional[TagsInputType] = None,
    created_by: Optional[Union[str, CorpUserUrn]] = None,
    schedule: Optional[Union[str, models.CronScheduleClass]] = None,
) -> SmartColumnMetricAssertion:
    """Create a new smart column metric assertion and its monitor.

    Note: all arguments must be passed by keyword.

    The assertion is always created with a generated urn (no urn is passed
    to the input object). The assertion entity is created first and the
    monitor entity second.

    Args:
        dataset_urn: The urn of the dataset to be monitored. (Required)
        column_name: The name of the column to be monitored. (Required)
        metric_type: The type of the metric to be monitored. (Required)
        operator: The operator to be used for the assertion. (Required)
        value: The value to be used for the assertion. (Required if the
            operator requires a value)
        value_type: The type of the value to be used for the assertion.
            (Required if the operator requires a value)
        range: The range to be used for the assertion. (Required if the
            operator requires a range)
        range_type: The type of the range to be used for the assertion.
            (Required if the operator requires a range)
        display_name: The display name of the assertion. If not provided, a
            random display name will be generated.
        enabled: Whether the assertion is enabled. Defaults to True.
        detection_mechanism: The detection mechanism to be used for the
            assertion. Valid values are:
            - "all_rows_query_datahub_dataset_profile" or
              DetectionMechanism.ALL_ROWS_QUERY_DATAHUB_DATASET_PROFILE
            - "all_rows_query" or DetectionMechanism.ALL_ROWS_QUERY,
              optionally with a filter:
              DetectionMechanism.ALL_ROWS_QUERY(additional_filter='last_modified > 2021-01-01')
              or as a dict: {
                "type": "all_rows_query",
                "additional_filter": "last_modified > '2021-01-01'", # optional
              }
            - DetectionMechanism.CHANGED_ROWS_QUERY(column_name='last_modified'),
              which needs a supported column type (Number, Date or Time),
              optionally with a filter:
              DetectionMechanism.CHANGED_ROWS_QUERY(column_name='last_modified', additional_filter='last_modified > 2021-01-01')
              or as a dict: {
                "type": "changed_rows_query",
                "column_name": "last_modified",
                "additional_filter": "last_modified > '2021-01-01'", # optional
              }
        sensitivity: The sensitivity to be applied to the assertion. Valid
            values are:
            - "low" or InferenceSensitivity.LOW
            - "medium" or InferenceSensitivity.MEDIUM
            - "high" or InferenceSensitivity.HIGH
        exclusion_windows: The exclusion windows to be applied to the
            assertion; currently only fixed range exclusion windows are
            supported. Valid values are:
            - from datetime.datetime objects: {
                "start": "datetime(2025, 1, 1, 0, 0, 0)",
                "end": "datetime(2025, 1, 2, 0, 0, 0)",
            }
            - from string datetimes: {
                "start": "2025-01-01T00:00:00",
                "end": "2025-01-02T00:00:00",
            }
            - from FixedRangeExclusionWindow objects: FixedRangeExclusionWindow(
                start=datetime(2025, 1, 1, 0, 0, 0),
                end=datetime(2025, 1, 2, 0, 0, 0)
            )
        training_data_lookback_days: The training data lookback days to be
            applied to the assertion, as an integer.
        incident_behavior: The incident behavior to be applied to the
            assertion. Valid values are:
            - "raise_on_fail" or AssertionIncidentBehavior.RAISE_ON_FAIL
            - "resolve_on_pass" or AssertionIncidentBehavior.RESOLVE_ON_PASS
        tags: The tags to be applied to the assertion. Valid values are:
            - a list of strings (strings will be converted to TagUrn objects)
            - a list of TagUrn objects
            - a list of TagAssociationClass objects
        created_by: Optional urn of the user who created the assertion. The
            format is "urn:li:corpuser:<username>", which you can find on
            the Users & Groups page. The default is the datahub system user.
            TODO: Retrieve the SDK user as the default instead of the
            datahub system user.
        schedule: Optional cron formatted schedule for the assertion. If not
            provided, a default schedule will be used. The schedule
            determines when the assertion will be evaluated. The format is a
            cron expression, e.g. "0 * * * *" for every hour using UTC
            timezone. Alternatively, a models.CronScheduleClass object can
            be provided with string parameters cron and timezone. Use
            `from datahub.metadata import schema_classes as models` to
            import the class.

    Returns:
        SmartColumnMetricAssertion: The created assertion.
    """
    _print_experimental_warning()
    creation_time = datetime.now(timezone.utc)

    if created_by is None:
        logger.warning(
            f"Created by is not set, using {DEFAULT_CREATED_BY} as a placeholder"
        )
        created_by = DEFAULT_CREATED_BY

    # On creation the creator is also the last updater, and both timestamps
    # are the same instant.
    new_assertion_input = _SmartColumnMetricAssertionInput(
        urn=None,
        entity_client=self.client.entities,
        dataset_urn=dataset_urn,
        column_name=column_name,
        metric_type=metric_type,
        operator=operator,
        value=value,
        value_type=value_type,
        range=range,
        range_type=range_type,
        display_name=display_name,
        enabled=enabled,
        detection_mechanism=detection_mechanism,
        sensitivity=sensitivity,
        exclusion_windows=exclusion_windows,
        training_data_lookback_days=training_data_lookback_days,
        incident_behavior=incident_behavior,
        tags=tags,
        created_by=created_by,
        created_at=creation_time,
        updated_by=created_by,
        updated_at=creation_time,
        schedule=schedule,
    )
    assertion_entity, monitor_entity = (
        new_assertion_input.to_assertion_and_monitor_entities()
    )

    # Assertion first: if its creation raises, the monitor is never created.
    self.client.entities.create(assertion_entity)
    # TODO: Wrap monitor creation in a try-except and delete the assertion if monitor creation fails (once delete is implemented https://linear.app/acryl-data/issue/OBS-1350/add-delete-method-to-entity-clientpy)
    self.client.entities.create(monitor_entity)

    return SmartColumnMetricAssertion._from_entities(
        assertion_entity, monitor_entity
    )
|
|
1812
|
+
|
|
1813
|
+
def _retrieve_and_merge_smart_column_metric_assertion_and_monitor(
    self,
    assertion_input: _SmartColumnMetricAssertionInput,
    dataset_urn: Union[str, DatasetUrn],
    column_name: str,
    metric_type: MetricInputType,
    operator: OperatorInputType,
    value: Optional[ValueInputType],
    value_type: Optional[ValueTypeInputType],
    range: Optional[RangeInputType],
    range_type: Optional[RangeTypeInputType],
    urn: Union[str, AssertionUrn],
    display_name: Optional[str],
    enabled: Optional[bool],
    detection_mechanism: DetectionMechanismInputTypes,
    sensitivity: Optional[Union[str, InferenceSensitivity]],
    exclusion_windows: Optional[ExclusionWindowInputTypes],
    training_data_lookback_days: Optional[int],
    incident_behavior: Optional[
        Union[AssertionIncidentBehavior, list[AssertionIncidentBehavior]]
    ],
    tags: Optional[TagsInputType],
    updated_by: Optional[Union[str, CorpUserUrn]],
    now_utc: datetime,
    schedule: Optional[Union[str, models.CronScheduleClass]],
) -> Union[SmartColumnMetricAssertion, _SmartColumnMetricAssertionInput]:
    """Retrieve the existing assertion/monitor and merge the inputs into them.

    Three cases are handled:

    - No assertion entity exists for ``urn``: a new assertion is created
      (with a generated urn) and returned directly.
    - The assertion exists but its monitor does not: a placeholder monitor
      is used to build the existing-assertion view, then the inputs are
      merged.
    - Both exist: the inputs are merged with the existing assertion.

    Args:
        assertion_input: The already validated input built from the raw
            arguments.
        dataset_urn, column_name, metric_type, operator, value, value_type,
        range, range_type, display_name, enabled, detection_mechanism,
        sensitivity, exclusion_windows, training_data_lookback_days,
        incident_behavior, tags, schedule: The raw caller-supplied values,
            forwarded to the create or merge step.
        urn: The urn of the assertion to look up.
        updated_by: The user performing the update; used as ``created_by``
            when a new assertion has to be created.
        now_utc: The timestamp of the sync call.

    Returns:
        A ``SmartColumnMetricAssertion`` if a new assertion was created,
        otherwise the merged ``_SmartColumnMetricAssertionInput`` that the
        caller still has to upsert.

    Raises:
        SDKUsageError: If the existing assertion belongs to a different
            dataset than the one given in the input.
    """
    # 1. Retrieve any existing assertion and monitor entities:
    maybe_assertion_entity, monitor_urn, maybe_monitor_entity = (
        self._retrieve_assertion_and_monitor(assertion_input)
    )

    # 2.1 If the assertion does not exist, create a new assertion with a
    # generated urn and return it; there is nothing to merge with.
    if not maybe_assertion_entity:
        logger.info(
            f"No existing assertion entity found for assertion urn {urn}, creating a new assertion with a generated urn"
        )
        return self._create_smart_column_metric_assertion(
            dataset_urn=dataset_urn,
            column_name=column_name,
            metric_type=metric_type,
            operator=operator,
            value=value,
            value_type=value_type,
            range=range,
            range_type=range_type,
            schedule=schedule,
            display_name=display_name,
            # Forward the caller's choice; previously it was dropped here and
            # the assertion was always created enabled. None keeps the same
            # "enabled by default" behaviour as the urn-less create path.
            enabled=True if enabled is None else enabled,
            detection_mechanism=detection_mechanism,
            sensitivity=sensitivity,
            exclusion_windows=exclusion_windows,
            training_data_lookback_days=training_data_lookback_days,
            incident_behavior=incident_behavior,
            tags=tags,
            created_by=updated_by,
        )

    # 2.2 If the assertion and monitor entities exist, create an assertion
    # object from them:
    if maybe_monitor_entity:
        existing_assertion = SmartColumnMetricAssertion._from_entities(
            maybe_assertion_entity, maybe_monitor_entity
        )
    # 2.3 If the assertion exists but the monitor does not, create a
    # placeholder monitor entity to be able to create the assertion object:
    else:
        # An unspecified `enabled` is treated as enabled for the placeholder.
        placeholder_enabled = True if enabled is None else bool(enabled)
        monitor_mode = "ACTIVE" if placeholder_enabled else "INACTIVE"
        existing_assertion = SmartColumnMetricAssertion._from_entities(
            maybe_assertion_entity,
            Monitor(id=monitor_urn, info=("ASSERTION", monitor_mode)),
        )

    # 3. Check for any issues e.g. different dataset urns
    if (
        existing_assertion
        and hasattr(existing_assertion, "dataset_urn")
        and existing_assertion.dataset_urn != assertion_input.dataset_urn
    ):
        raise SDKUsageError(
            f"Dataset URN mismatch, existing assertion: {existing_assertion.dataset_urn} != new assertion: {dataset_urn}"
        )

    # 4. Merge the existing assertion with the validated input:
    return self._merge_smart_column_metric_input(
        dataset_urn=dataset_urn,
        column_name=column_name,
        metric_type=metric_type,
        operator=operator,
        value=value,
        value_type=value_type,
        range=range,
        range_type=range_type,
        urn=urn,
        display_name=display_name,
        enabled=enabled,
        schedule=schedule,
        detection_mechanism=detection_mechanism,
        sensitivity=sensitivity,
        exclusion_windows=exclusion_windows,
        training_data_lookback_days=training_data_lookback_days,
        incident_behavior=incident_behavior,
        tags=tags,
        now_utc=now_utc,
        assertion_input=assertion_input,
        maybe_assertion_entity=maybe_assertion_entity,
        maybe_monitor_entity=maybe_monitor_entity,
        existing_assertion=existing_assertion,
    )
|
|
1921
|
+
|
|
1922
|
+
def _merge_smart_column_metric_input(
    self,
    dataset_urn: Union[str, DatasetUrn],
    column_name: str,
    metric_type: MetricInputType,
    operator: OperatorInputType,
    value: Optional[ValueInputType],
    value_type: Optional[ValueTypeInputType],
    range: Optional[RangeInputType],
    range_type: Optional[RangeTypeInputType],
    urn: Union[str, AssertionUrn],
    display_name: Optional[str],
    enabled: Optional[bool],
    detection_mechanism: DetectionMechanismInputTypes,
    sensitivity: Optional[Union[str, InferenceSensitivity]],
    exclusion_windows: Optional[ExclusionWindowInputTypes],
    training_data_lookback_days: Optional[int],
    incident_behavior: Optional[
        Union[AssertionIncidentBehavior, list[AssertionIncidentBehavior]]
    ],
    tags: Optional[TagsInputType],
    schedule: Optional[Union[str, models.CronScheduleClass]],
    now_utc: datetime,
    assertion_input: _SmartColumnMetricAssertionInput,
    maybe_assertion_entity: Optional[Assertion],
    maybe_monitor_entity: Optional[Monitor],
    existing_assertion: SmartColumnMetricAssertion,
) -> _SmartColumnMetricAssertionInput:
    """Merge the input with the existing assertion and monitor entities.

    Builds a new ``_SmartColumnMetricAssertionInput`` in which every
    mergeable field is resolved through ``_merge_field`` from three sources:
    the raw caller-supplied value, the validated input, and the value
    currently stored on the existing entities. ``urn`` and ``dataset_urn``
    are passed through unmerged; the audit fields are set explicitly at the
    end.

    Args:
        dataset_urn: The urn of the dataset to be monitored.
        column_name: The name of the column to be monitored.
        metric_type: The type of the metric to be monitored.
        operator: The operator to be used for the assertion.
        value: The value to be used for the assertion.
        value_type: The type of the value to be used for the assertion.
        range: The range to be used for the assertion.
        range_type: The type of the range to be used for the assertion.
        urn: The urn of the assertion.
        display_name: The display name of the assertion.
        enabled: Whether the assertion is enabled.
        detection_mechanism: The detection mechanism to be used for the assertion.
        sensitivity: The sensitivity to be applied to the assertion.
        exclusion_windows: The exclusion windows to be applied to the assertion.
        training_data_lookback_days: The training data lookback days to be applied to the assertion.
        incident_behavior: The incident behavior to be applied to the assertion.
        tags: The tags to be applied to the assertion.
        schedule: The schedule of the assertion, as a cron string or a
            models.CronScheduleClass object.
        now_utc: The current UTC time from when the function is called.
        assertion_input: The validated input to the function.
        maybe_assertion_entity: The existing assertion entity from the DataHub instance.
        maybe_monitor_entity: The existing monitor entity from the DataHub instance.
        existing_assertion: The existing assertion from the DataHub instance.

    Returns:
        The merged assertion input.
    """
    merged_assertion_input = _SmartColumnMetricAssertionInput(
        urn=urn,
        entity_client=self.client.entities,
        dataset_urn=dataset_urn,
        # Fields whose existing value is read from the assertion entity via
        # the SmartColumnMetricAssertion getters:
        column_name=_merge_field(
            input_field_value=column_name,
            input_field_name="column_name",
            validated_assertion_input=assertion_input,
            validated_existing_assertion=existing_assertion,
            existing_entity_value=SmartColumnMetricAssertion._get_column_name(
                maybe_assertion_entity
            )
            if maybe_assertion_entity
            else None,
        ),
        metric_type=_merge_field(
            input_field_value=metric_type,
            input_field_name="metric_type",
            validated_assertion_input=assertion_input,
            validated_existing_assertion=existing_assertion,
            existing_entity_value=SmartColumnMetricAssertion._get_metric_type(
                maybe_assertion_entity
            )
            if maybe_assertion_entity
            else None,
        ),
        operator=_merge_field(
            input_field_value=operator,
            input_field_name="operator",
            validated_assertion_input=assertion_input,
            validated_existing_assertion=existing_assertion,
            existing_entity_value=SmartColumnMetricAssertion._get_operator(
                maybe_assertion_entity
            )
            if maybe_assertion_entity
            else None,
        ),
        value=_merge_field(
            input_field_value=value,
            input_field_name="value",
            validated_assertion_input=assertion_input,
            validated_existing_assertion=existing_assertion,
            existing_entity_value=SmartColumnMetricAssertion._get_value(
                maybe_assertion_entity
            )
            if maybe_assertion_entity
            else None,
        ),
        value_type=_merge_field(
            input_field_value=value_type,
            input_field_name="value_type",
            validated_assertion_input=assertion_input,
            validated_existing_assertion=existing_assertion,
            existing_entity_value=SmartColumnMetricAssertion._get_value_type(
                maybe_assertion_entity
            )
            if maybe_assertion_entity
            else None,
        ),
        range=_merge_field(
            input_field_value=range,
            input_field_name="range",
            validated_assertion_input=assertion_input,
            validated_existing_assertion=existing_assertion,
            existing_entity_value=SmartColumnMetricAssertion._get_range(
                maybe_assertion_entity
            )
            if maybe_assertion_entity
            else None,
        ),
        range_type=_merge_field(
            input_field_value=range_type,
            input_field_name="range_type",
            validated_assertion_input=assertion_input,
            validated_existing_assertion=existing_assertion,
            existing_entity_value=SmartColumnMetricAssertion._get_range_type(
                maybe_assertion_entity
            )
            if maybe_assertion_entity
            else None,
        ),
        # The display name is read from the assertion entity's `description`.
        display_name=_merge_field(
            input_field_value=display_name,
            input_field_name="display_name",
            validated_assertion_input=assertion_input,
            validated_existing_assertion=existing_assertion,
            existing_entity_value=maybe_assertion_entity.description
            if maybe_assertion_entity
            else None,
        ),
        # `enabled` is derived from the existing assertion's mode: it counts
        # as enabled only when the mode is ACTIVE.
        enabled=_merge_field(
            input_field_value=enabled,
            input_field_name="enabled",
            validated_assertion_input=assertion_input,
            validated_existing_assertion=existing_assertion,
            existing_entity_value=existing_assertion.mode == AssertionMode.ACTIVE
            if existing_assertion
            else None,
        ),
        schedule=_merge_field(
            input_field_value=schedule,
            input_field_name="schedule",
            validated_assertion_input=assertion_input,
            validated_existing_assertion=existing_assertion,
            existing_entity_value=existing_assertion.schedule
            if existing_assertion
            else None,
        ),
        # The detection mechanism getter takes both entities, so an existing
        # value is only looked up when both are present.
        detection_mechanism=_merge_field(
            input_field_value=detection_mechanism,
            input_field_name="detection_mechanism",
            validated_assertion_input=assertion_input,
            validated_existing_assertion=existing_assertion,
            existing_entity_value=SmartColumnMetricAssertion._get_detection_mechanism(
                maybe_assertion_entity, maybe_monitor_entity, default=None
            )
            if maybe_assertion_entity and maybe_monitor_entity
            else None,
        ),
        # Fields whose existing value is read from the monitor entity:
        sensitivity=_merge_field(
            input_field_value=sensitivity,
            input_field_name="sensitivity",
            validated_assertion_input=assertion_input,
            validated_existing_assertion=existing_assertion,
            existing_entity_value=maybe_monitor_entity.sensitivity
            if maybe_monitor_entity
            else None,
        ),
        exclusion_windows=_merge_field(
            input_field_value=exclusion_windows,
            input_field_name="exclusion_windows",
            validated_assertion_input=assertion_input,
            validated_existing_assertion=existing_assertion,
            existing_entity_value=maybe_monitor_entity.exclusion_windows
            if maybe_monitor_entity
            else None,
        ),
        training_data_lookback_days=_merge_field(
            input_field_value=training_data_lookback_days,
            input_field_name="training_data_lookback_days",
            validated_assertion_input=assertion_input,
            validated_existing_assertion=existing_assertion,
            existing_entity_value=maybe_monitor_entity.training_data_lookback_days
            if maybe_monitor_entity
            else None,
        ),
        incident_behavior=_merge_field(
            input_field_value=incident_behavior,
            input_field_name="incident_behavior",
            validated_assertion_input=assertion_input,
            validated_existing_assertion=existing_assertion,
            existing_entity_value=SmartColumnMetricAssertion._get_incident_behavior(
                maybe_assertion_entity
            )
            if maybe_assertion_entity
            else None,
        ),
        tags=_merge_field(
            input_field_value=tags,
            input_field_name="tags",
            validated_assertion_input=assertion_input,
            validated_existing_assertion=existing_assertion,
            existing_entity_value=maybe_assertion_entity.tags
            if maybe_assertion_entity
            else None,
        ),
        # Audit fields are not merged through _merge_field.
        # created_by / created_at: keep the existing assertion's values,
        # falling back to the default creator / the current time when the
        # existing value is not set.
        created_by=existing_assertion.created_by
        or DEFAULT_CREATED_BY,
        created_at=existing_assertion.created_at
        or now_utc,
        # updated_by / updated_at: always taken from the validated input of
        # this call.
        updated_by=assertion_input.updated_by,
        updated_at=assertion_input.updated_at,
    )

    return merged_assertion_input
|
|
2154
|
+
|
|
2155
|
+
def sync_freshness_assertion(
    self,
    *,
    dataset_urn: Union[str, DatasetUrn],
    urn: Optional[Union[str, AssertionUrn]] = None,
    display_name: Optional[str] = None,
    enabled: Optional[bool] = None,
    detection_mechanism: DetectionMechanismInputTypes = None,
    incident_behavior: Optional[
        Union[AssertionIncidentBehavior, list[AssertionIncidentBehavior]]
    ] = None,
    tags: Optional[TagsInputType] = None,
    updated_by: Optional[Union[str, CorpUserUrn]] = None,
    freshness_schedule_check_type: Optional[
        Union[str, models.FreshnessAssertionScheduleTypeClass]
    ] = None,
    schedule: Optional[Union[str, models.CronScheduleClass]] = None,
    lookback_window: Optional[TimeWindowSizeInputTypes] = None,
) -> FreshnessAssertion:
    """Create a freshness assertion, or update an existing one by merging.

    Note: every argument must be passed by keyword.

    When ``urn`` is None a new assertion is created immediately. Otherwise
    the input is validated and merged with the stored assertion and monitor;
    if no assertion exists for that urn, the merge step creates one and it
    is returned as-is. For an existing assertion, an input of None keeps the
    stored value and any other input replaces it. To un-set a value, pass an
    empty list or an empty string.

    Schedule behavior:
        - Create case: the provided schedule, or the default daily schedule
          ("0 0 * * *") when none is given.
        - Update case: the provided schedule, or the existing schedule when
          none is given.

    Args:
        dataset_urn: The urn of the dataset to be monitored.
        urn: The urn of the assertion. If omitted, a urn is generated and
            the assertion is _created_ in the DataHub instance.
        display_name: The display name of the assertion. If omitted, a
            random display name is generated.
        enabled: Whether the assertion is enabled. If omitted, a new
            assertion is created enabled and an existing assertion keeps
            its current value.
        detection_mechanism: The detection mechanism used by the assertion.
            Information schema is recommended. Valid values are:
            - "information_schema" or DetectionMechanism.INFORMATION_SCHEMA
            - "audit_log" or DetectionMechanism.AUDIT_LOG
            - {
                "type": "last_modified_column",
                "column_name": "last_modified",
                "additional_filter": "last_modified > '2021-01-01'",
              } or DetectionMechanism.LAST_MODIFIED_COLUMN(column_name='last_modified',
              additional_filter='last_modified > 2021-01-01')
            - {
                "type": "high_watermark_column",
                "column_name": "id",
                "additional_filter": "id > 1000",
              } or DetectionMechanism.HIGH_WATERMARK_COLUMN(column_name='id',
              additional_filter='id > 1000')
            - "datahub_operation" or DetectionMechanism.DATAHUB_OPERATION
        incident_behavior: The incident behavior applied to the assertion.
            Valid values are:
            - "raise_on_fail" or AssertionIncidentBehavior.RAISE_ON_FAIL
            - "resolve_on_pass" or AssertionIncidentBehavior.RESOLVE_ON_PASS
        tags: The tags applied to the assertion. Valid values are:
            - a list of strings (strings are converted to TagUrn objects)
            - a list of TagUrn objects
            - a list of TagAssociationClass objects
        updated_by: Optional urn of the user who updated the assertion, in
            the form "urn:li:corpuser:<username>" (see the Users & Groups
            page). Defaults to the datahub system user.
            TODO: Retrieve the SDK user as the default instead of the datahub system user.
        freshness_schedule_check_type: Optional schedule check type, given
            as a string or a models.FreshnessAssertionScheduleTypeClass.
            Forwarded unchanged to the input validation and merge steps;
            the accepted values are defined there, not in this method.
        schedule: Optional cron schedule that determines when the assertion
            is evaluated, e.g. "0 * * * *" for every hour (UTC timezone).
            Alternatively a models.CronScheduleClass built from the string
            parameters cron and timezone. Use
            `from datahub.metadata import schema_classes as models` to import the class.
        lookback_window: Optional lookback window for the assertion.
            Forwarded unchanged to the input validation and merge steps;
            the accepted forms are defined by TimeWindowSizeInputTypes.

    Returns:
        FreshnessAssertion: The created or updated assertion.
    """
    _print_experimental_warning()
    now_utc = datetime.now(timezone.utc)

    # Fall back to the placeholder author when the caller gave none.
    if updated_by is None:
        logger.warning(
            f"updated_by is not set, using {DEFAULT_CREATED_BY} as a placeholder"
        )
        updated_by = DEFAULT_CREATED_BY

    # Step 1: without a urn there is nothing to merge with, so create directly.
    if urn is None:
        logger.info("URN is not set, creating a new assertion")
        return self._create_freshness_assertion(
            dataset_urn=dataset_urn,
            display_name=display_name,
            enabled=True if enabled is None else enabled,
            detection_mechanism=detection_mechanism,
            incident_behavior=incident_behavior,
            tags=tags,
            created_by=updated_by,
            schedule=schedule,
            freshness_schedule_check_type=freshness_schedule_check_type,
            lookback_window=lookback_window,
        )

    # Step 2: a urn was given, so validate the caller's input first.
    # created_by / created_at are provisional here; the merge step replaces
    # them with the values of the existing assertion.
    validated_input = _FreshnessAssertionInput(
        urn=urn,
        entity_client=self.client.entities,
        dataset_urn=dataset_urn,
        display_name=display_name,
        detection_mechanism=detection_mechanism,
        incident_behavior=incident_behavior,
        tags=tags,
        created_by=updated_by,
        created_at=now_utc,
        updated_by=updated_by,
        updated_at=now_utc,
        schedule=schedule,
        freshness_schedule_check_type=freshness_schedule_check_type,
        lookback_window=lookback_window,
    )

    # Step 3: merge the input with the existing assertion and monitor
    # entities; the merge creates a new assertion if none exists yet.
    merge_outcome = self._retrieve_and_merge_freshness_assertion_and_monitor(
        assertion_input=validated_input,
        dataset_urn=dataset_urn,
        urn=urn,
        display_name=display_name,
        enabled=enabled,
        detection_mechanism=detection_mechanism,
        incident_behavior=incident_behavior,
        tags=tags,
        updated_by=updated_by,
        now_utc=now_utc,
        schedule=schedule,
        freshness_schedule_check_type=freshness_schedule_check_type,
        lookback_window=lookback_window,
    )

    # The merge already created the assertion: hand it back untouched.
    if isinstance(merge_outcome, _AssertionPublic):
        created_assertion = merge_outcome
        # Narrow the type: this code path only ever yields a FreshnessAssertion.
        assert isinstance(created_assertion, FreshnessAssertion)
        return created_assertion

    # Step 4: upsert the assertion entity, then the monitor entity. The order
    # matters: if the assertion upsert raises, the monitor is never upserted.
    # TODO: Once delete is implemented, catch a failing monitor upsert, log it,
    # delete the assertion entity again and re-raise
    # (https://linear.app/acryl-data/issue/OBS-1350/add-delete-method-to-entity-clientpy)
    assertion_entity, monitor_entity = (
        merge_outcome.to_assertion_and_monitor_entities()
    )
    for entity in (assertion_entity, monitor_entity):
        self.client.entities.upsert(entity)

    return FreshnessAssertion._from_entities(assertion_entity, monitor_entity)
|
|
2319
|
+
|
|
1109
2320
|
|
|
1110
2321
|
def _merge_field(
|
|
1111
2322
|
input_field_value: Any,
|