acryl-datahub-cloud 0.3.12rc1__py3-none-any.whl → 0.3.12rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub-cloud might be problematic. Click here for more details.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/datahub_forms_notifications/forms_notifications_source.py +524 -0
- acryl_datahub_cloud/datahub_forms_notifications/get_search_results_total.gql +14 -0
- acryl_datahub_cloud/datahub_forms_notifications/query.py +17 -0
- acryl_datahub_cloud/datahub_forms_notifications/scroll_forms_for_notification.gql +29 -0
- acryl_datahub_cloud/datahub_forms_notifications/send_form_notification_request.gql +5 -0
- acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +48 -8
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +49 -40
- acryl_datahub_cloud/metadata/_urns/urn_defs.py +1842 -1786
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/form/__init__.py +4 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/notification/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
- acryl_datahub_cloud/metadata/schema.avsc +24747 -23945
- acryl_datahub_cloud/metadata/schema_classes.py +1031 -631
- acryl_datahub_cloud/metadata/schemas/ApplicationKey.avsc +31 -0
- acryl_datahub_cloud/metadata/schemas/ApplicationProperties.avsc +72 -0
- acryl_datahub_cloud/metadata/schemas/Applications.avsc +38 -0
- acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +31 -7
- acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +27 -6
- acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +31 -7
- acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +14 -0
- acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +12 -1
- acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataProductProperties.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/FormAssignmentStatus.avsc +36 -0
- acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +6 -0
- acryl_datahub_cloud/metadata/schemas/FormKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/FormNotifications.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/FormSettings.avsc +3 -0
- acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +12 -1
- acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +27 -6
- acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +1 -0
- acryl_datahub_cloud/notifications/__init__.py +0 -0
- acryl_datahub_cloud/notifications/notification_recipient_builder.py +399 -0
- acryl_datahub_cloud/sdk/__init__.py +25 -0
- acryl_datahub_cloud/{_sdk_extras → sdk}/assertion.py +202 -45
- acryl_datahub_cloud/{_sdk_extras → sdk}/assertion_input.py +344 -83
- acryl_datahub_cloud/{_sdk_extras → sdk}/assertions_client.py +635 -199
- acryl_datahub_cloud/sdk/entities/__init__.py +0 -0
- acryl_datahub_cloud/{_sdk_extras → sdk}/entities/assertion.py +1 -1
- acryl_datahub_cloud/{_sdk_extras → sdk}/subscription_client.py +146 -33
- {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc3.dist-info}/METADATA +48 -43
- {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc3.dist-info}/RECORD +69 -54
- {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc3.dist-info}/entry_points.txt +1 -0
- acryl_datahub_cloud/_sdk_extras/__init__.py +0 -19
- /acryl_datahub_cloud/{_sdk_extras/entities → datahub_forms_notifications}/__init__.py +0 -0
- /acryl_datahub_cloud/{_sdk_extras → sdk}/entities/monitor.py +0 -0
- /acryl_datahub_cloud/{_sdk_extras → sdk}/entities/subscription.py +0 -0
- /acryl_datahub_cloud/{_sdk_extras → sdk}/errors.py +0 -0
- /acryl_datahub_cloud/{_sdk_extras → sdk}/resolver_client.py +0 -0
- {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc3.dist-info}/WHEEL +0 -0
- {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc3.dist-info}/top_level.txt +0 -0
|
@@ -11,17 +11,20 @@ from enum import Enum
|
|
|
11
11
|
from typing import Literal, Optional, TypeAlias, Union
|
|
12
12
|
|
|
13
13
|
import pydantic
|
|
14
|
+
import pytz
|
|
15
|
+
import tzlocal
|
|
14
16
|
from avrogen.dict_wrapper import DictWrapper
|
|
17
|
+
from croniter import croniter
|
|
15
18
|
from pydantic import BaseModel, Extra, ValidationError
|
|
16
19
|
|
|
17
|
-
from acryl_datahub_cloud.
|
|
20
|
+
from acryl_datahub_cloud.sdk.entities.assertion import (
|
|
18
21
|
Assertion,
|
|
19
22
|
AssertionActionsInputType,
|
|
20
23
|
AssertionInfoInputType,
|
|
21
24
|
TagsInputType,
|
|
22
25
|
)
|
|
23
|
-
from acryl_datahub_cloud.
|
|
24
|
-
from acryl_datahub_cloud.
|
|
26
|
+
from acryl_datahub_cloud.sdk.entities.monitor import Monitor
|
|
27
|
+
from acryl_datahub_cloud.sdk.errors import (
|
|
25
28
|
SDKNotYetSupportedError,
|
|
26
29
|
SDKUsageError,
|
|
27
30
|
SDKUsageErrorWithExamples,
|
|
@@ -39,6 +42,13 @@ ASSERTION_MONITOR_DEFAULT_TRAINING_LOOKBACK_WINDOW_DAYS = 60
|
|
|
39
42
|
DEFAULT_NAME_PREFIX = "New Assertion"
|
|
40
43
|
DEFAULT_NAME_SUFFIX_LENGTH = 8
|
|
41
44
|
|
|
45
|
+
DEFAULT_SCHEDULE = models.CronScheduleClass(
|
|
46
|
+
cron="0 * * * *", # Every hour, matches the UI default
|
|
47
|
+
timezone=str(
|
|
48
|
+
tzlocal.get_localzone()
|
|
49
|
+
), # User local timezone, matches the UI default
|
|
50
|
+
)
|
|
51
|
+
|
|
42
52
|
|
|
43
53
|
class AbstractDetectionMechanism(BaseModel, ABC):
|
|
44
54
|
type: str
|
|
@@ -85,6 +95,16 @@ class _DataHubOperation(AbstractDetectionMechanism):
|
|
|
85
95
|
type: Literal["datahub_operation"] = "datahub_operation"
|
|
86
96
|
|
|
87
97
|
|
|
98
|
+
class _Query(AbstractDetectionMechanism):
|
|
99
|
+
# COUNT(*) query
|
|
100
|
+
type: Literal["query"] = "query"
|
|
101
|
+
additional_filter: Optional[str] = None
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class _DatasetProfile(AbstractDetectionMechanism):
|
|
105
|
+
type: Literal["dataset_profile"] = "dataset_profile"
|
|
106
|
+
|
|
107
|
+
|
|
88
108
|
# Keep these two lists in sync:
|
|
89
109
|
_DETECTION_MECHANISM_CONCRETE_TYPES = (
|
|
90
110
|
_InformationSchema,
|
|
@@ -92,6 +112,8 @@ _DETECTION_MECHANISM_CONCRETE_TYPES = (
|
|
|
92
112
|
_LastModifiedColumn,
|
|
93
113
|
_HighWatermarkColumn,
|
|
94
114
|
_DataHubOperation,
|
|
115
|
+
_Query,
|
|
116
|
+
_DatasetProfile,
|
|
95
117
|
)
|
|
96
118
|
_DetectionMechanismTypes = Union[
|
|
97
119
|
_InformationSchema,
|
|
@@ -99,8 +121,16 @@ _DetectionMechanismTypes = Union[
|
|
|
99
121
|
_LastModifiedColumn,
|
|
100
122
|
_HighWatermarkColumn,
|
|
101
123
|
_DataHubOperation,
|
|
124
|
+
_Query,
|
|
125
|
+
_DatasetProfile,
|
|
102
126
|
]
|
|
103
127
|
|
|
128
|
+
_DETECTION_MECHANISM_TYPES_WITH_ADDITIONAL_FILTER = (
|
|
129
|
+
_LastModifiedColumn,
|
|
130
|
+
_HighWatermarkColumn,
|
|
131
|
+
_Query,
|
|
132
|
+
)
|
|
133
|
+
|
|
104
134
|
|
|
105
135
|
class DetectionMechanism:
|
|
106
136
|
# To have a more enum-like user experience even with sub parameters, we define the detection mechanisms as class attributes.
|
|
@@ -110,6 +140,8 @@ class DetectionMechanism:
|
|
|
110
140
|
LAST_MODIFIED_COLUMN = _LastModifiedColumn
|
|
111
141
|
HIGH_WATERMARK_COLUMN = _HighWatermarkColumn
|
|
112
142
|
DATAHUB_OPERATION = _DataHubOperation()
|
|
143
|
+
QUERY = _Query
|
|
144
|
+
DATASET_PROFILE = _DatasetProfile()
|
|
113
145
|
|
|
114
146
|
_DETECTION_MECHANISM_EXAMPLES = {
|
|
115
147
|
"Information Schema from string": "information_schema",
|
|
@@ -130,6 +162,14 @@ class DetectionMechanism:
|
|
|
130
162
|
"High Watermark Column from DetectionMechanism": "DetectionMechanism.HIGH_WATERMARK_COLUMN(column_name='id', additional_filter='id > 1000')",
|
|
131
163
|
"DataHub Operation from string": "datahub_operation",
|
|
132
164
|
"DataHub Operation from DetectionMechanism": "DetectionMechanism.DATAHUB_OPERATION",
|
|
165
|
+
"Query from string": "query",
|
|
166
|
+
"Query from dict": {
|
|
167
|
+
"type": "query",
|
|
168
|
+
"additional_filter": "id > 1000",
|
|
169
|
+
},
|
|
170
|
+
"Query from DetectionMechanism (with optional additional filter)": "DetectionMechanism.QUERY(additional_filter='id > 1000')",
|
|
171
|
+
"Dataset Profile from string": "dataset_profile",
|
|
172
|
+
"Dataset Profile from DetectionMechanism": "DetectionMechanism.DATASET_PROFILE",
|
|
133
173
|
}
|
|
134
174
|
|
|
135
175
|
@staticmethod
|
|
@@ -496,6 +536,64 @@ def _try_parse_training_data_lookback_days(
|
|
|
496
536
|
return training_data_lookback_days
|
|
497
537
|
|
|
498
538
|
|
|
539
|
+
def _validate_cron_schedule(schedule: str, timezone: str) -> None:
|
|
540
|
+
"""We are using the POSIX.1-2017 standard for cron expressions.
|
|
541
|
+
|
|
542
|
+
Note: We are using the croniter library for cron parsing which is different from executor, which uses apscheduler, so there is a risk of mismatch here.
|
|
543
|
+
"""
|
|
544
|
+
try:
|
|
545
|
+
# Validate timezone - pytz.timezone() raises UnknownTimeZoneError for invalid timezones
|
|
546
|
+
# Skip timezone validation when empty
|
|
547
|
+
if timezone:
|
|
548
|
+
pytz.timezone(timezone)
|
|
549
|
+
|
|
550
|
+
# Validate 5-field cron expression only (POSIX.1-2017 standard)
|
|
551
|
+
fields = schedule.strip().split()
|
|
552
|
+
if len(fields) != 5:
|
|
553
|
+
raise ValueError("POSIX.1-2017 requires exactly 5 fields")
|
|
554
|
+
|
|
555
|
+
# POSIX.1-2017 specific validation: Sunday must be 0, not 7
|
|
556
|
+
# However croniter accepts 7 as Sunday, so custom check is needed here.
|
|
557
|
+
# Check the day-of-week field (5th field, index 4)
|
|
558
|
+
dow_field = fields[4]
|
|
559
|
+
if "7" in dow_field:
|
|
560
|
+
# Check if 7 appears as a standalone value or in ranges
|
|
561
|
+
import re
|
|
562
|
+
|
|
563
|
+
# Match 7 as standalone, in lists, or in ranges
|
|
564
|
+
if re.search(r"\b7\b|7-|,7,|^7,|,7$|-7\b", dow_field):
|
|
565
|
+
raise ValueError(
|
|
566
|
+
"POSIX.1-2017 standard: Sunday must be represented as 0, not 7"
|
|
567
|
+
)
|
|
568
|
+
|
|
569
|
+
# Validate cron expression - croniter constructor validates the expression
|
|
570
|
+
croniter(schedule)
|
|
571
|
+
|
|
572
|
+
except Exception as e:
|
|
573
|
+
raise SDKUsageError(
|
|
574
|
+
f"Invalid cron expression or timezone: {schedule} {timezone}, please use a POSIX.1-2017 compatible cron expression and timezone."
|
|
575
|
+
) from e
|
|
576
|
+
|
|
577
|
+
|
|
578
|
+
def _try_parse_schedule(
|
|
579
|
+
schedule: Optional[Union[str, models.CronScheduleClass]],
|
|
580
|
+
) -> Optional[models.CronScheduleClass]:
|
|
581
|
+
if schedule is None:
|
|
582
|
+
return None
|
|
583
|
+
if isinstance(schedule, str):
|
|
584
|
+
_validate_cron_schedule(schedule, "UTC")
|
|
585
|
+
return models.CronScheduleClass(
|
|
586
|
+
cron=schedule,
|
|
587
|
+
timezone="UTC",
|
|
588
|
+
)
|
|
589
|
+
if isinstance(schedule, models.CronScheduleClass):
|
|
590
|
+
_validate_cron_schedule(schedule.cron, schedule.timezone)
|
|
591
|
+
return schedule
|
|
592
|
+
|
|
593
|
+
|
|
594
|
+
FieldSpecType = Union[models.FreshnessFieldSpecClass, models.SchemaFieldSpecClass]
|
|
595
|
+
|
|
596
|
+
|
|
499
597
|
class _AssertionInput(ABC):
|
|
500
598
|
def __init__(
|
|
501
599
|
self,
|
|
@@ -509,6 +607,7 @@ class _AssertionInput(ABC):
|
|
|
509
607
|
] = None, # Can be None if the assertion is not yet created
|
|
510
608
|
display_name: Optional[str] = None,
|
|
511
609
|
enabled: bool = True,
|
|
610
|
+
schedule: Optional[Union[str, models.CronScheduleClass]] = None,
|
|
512
611
|
detection_mechanism: DetectionMechanismInputTypes = None,
|
|
513
612
|
sensitivity: Optional[Union[str, InferenceSensitivity]] = None,
|
|
514
613
|
exclusion_windows: Optional[ExclusionWindowInputTypes] = None,
|
|
@@ -553,7 +652,7 @@ class _AssertionInput(ABC):
|
|
|
553
652
|
else _generate_default_name(DEFAULT_NAME_PREFIX, DEFAULT_NAME_SUFFIX_LENGTH)
|
|
554
653
|
)
|
|
555
654
|
self.enabled = enabled
|
|
556
|
-
|
|
655
|
+
self.schedule = _try_parse_schedule(schedule)
|
|
557
656
|
self.detection_mechanism = DetectionMechanism.parse(detection_mechanism)
|
|
558
657
|
self.sensitivity = InferenceSensitivity.parse(sensitivity)
|
|
559
658
|
self.exclusion_windows = _try_parse_exclusion_window(exclusion_windows)
|
|
@@ -656,10 +755,7 @@ class _AssertionInput(ABC):
|
|
|
656
755
|
"""
|
|
657
756
|
if not isinstance(
|
|
658
757
|
self.detection_mechanism,
|
|
659
|
-
|
|
660
|
-
DetectionMechanism.LAST_MODIFIED_COLUMN,
|
|
661
|
-
DetectionMechanism.HIGH_WATERMARK_COLUMN,
|
|
662
|
-
),
|
|
758
|
+
_DETECTION_MECHANISM_TYPES_WITH_ADDITIONAL_FILTER,
|
|
663
759
|
):
|
|
664
760
|
return None
|
|
665
761
|
|
|
@@ -672,12 +768,6 @@ class _AssertionInput(ABC):
|
|
|
672
768
|
sql=additional_filter,
|
|
673
769
|
)
|
|
674
770
|
|
|
675
|
-
@abstractmethod
|
|
676
|
-
def _create_assertion_info(
|
|
677
|
-
self, filter: Optional[models.DatasetFilterClass]
|
|
678
|
-
) -> AssertionInfoInputType:
|
|
679
|
-
pass
|
|
680
|
-
|
|
681
771
|
def _convert_tags(self) -> Optional[TagsInputType]:
|
|
682
772
|
"""
|
|
683
773
|
Convert the tags input into a standardized format.
|
|
@@ -800,30 +890,6 @@ class _AssertionInput(ABC):
|
|
|
800
890
|
)
|
|
801
891
|
return exclusion_windows
|
|
802
892
|
|
|
803
|
-
@abstractmethod
|
|
804
|
-
def _convert_assertion_source_type_and_field(
|
|
805
|
-
self,
|
|
806
|
-
) -> tuple[str, Optional[models.FreshnessFieldSpecClass]]:
|
|
807
|
-
"""
|
|
808
|
-
Convert detection mechanism into source type and field specification for freshness assertions.
|
|
809
|
-
|
|
810
|
-
Returns:
|
|
811
|
-
A tuple of (source_type, field) where field may be None.
|
|
812
|
-
Note that the source_type is a string, not a models.DatasetFreshnessSourceTypeClass since
|
|
813
|
-
the source type is not a enum in the code generated from the DatasetFreshnessSourceType enum in the PDL.
|
|
814
|
-
|
|
815
|
-
Raises:
|
|
816
|
-
SDKNotYetSupportedError: If the detection mechanism is not supported.
|
|
817
|
-
SDKUsageError: If the field (column) is not found in the dataset,
|
|
818
|
-
and the detection mechanism requires a field. Also if the field
|
|
819
|
-
is not an allowed type for the detection mechanism.
|
|
820
|
-
"""
|
|
821
|
-
pass
|
|
822
|
-
|
|
823
|
-
@abstractmethod
|
|
824
|
-
def _convert_schedule(self) -> models.CronScheduleClass:
|
|
825
|
-
pass
|
|
826
|
-
|
|
827
893
|
def _convert_sensitivity(self) -> models.AssertionMonitorSensitivityClass:
|
|
828
894
|
"""
|
|
829
895
|
Convert sensitivity into an AssertionMonitorSensitivityClass.
|
|
@@ -835,13 +901,68 @@ class _AssertionInput(ABC):
|
|
|
835
901
|
level=InferenceSensitivity.to_int(self.sensitivity),
|
|
836
902
|
)
|
|
837
903
|
|
|
904
|
+
def _get_schema_field_spec(self, column_name: str) -> models.SchemaFieldSpecClass:
|
|
905
|
+
"""
|
|
906
|
+
Get the schema field spec for the detection mechanism if needed.
|
|
907
|
+
"""
|
|
908
|
+
# Only fetch the dataset if it's not already cached.
|
|
909
|
+
# Also we only fetch the dataset if it's needed for the detection mechanism.
|
|
910
|
+
if self.cached_dataset is None:
|
|
911
|
+
self.cached_dataset = self.entity_client.get(self.dataset_urn)
|
|
912
|
+
|
|
913
|
+
# Handle case where dataset doesn't exist
|
|
914
|
+
if self.cached_dataset is None:
|
|
915
|
+
raise SDKUsageError(
|
|
916
|
+
f"Dataset {self.dataset_urn} not found. Cannot validate column {column_name}."
|
|
917
|
+
)
|
|
918
|
+
|
|
919
|
+
# TODO: Make a public accessor for _schema_dict in the SDK
|
|
920
|
+
schema_fields = self.cached_dataset._schema_dict()
|
|
921
|
+
field = schema_fields.get(column_name)
|
|
922
|
+
if field:
|
|
923
|
+
return models.SchemaFieldSpecClass(
|
|
924
|
+
path=field.fieldPath,
|
|
925
|
+
type=field.type.type.__class__.__name__,
|
|
926
|
+
nativeType=field.nativeDataType,
|
|
927
|
+
)
|
|
928
|
+
else:
|
|
929
|
+
raise SDKUsageError(
|
|
930
|
+
msg=f"Column {column_name} not found in dataset {self.dataset_urn}",
|
|
931
|
+
)
|
|
932
|
+
|
|
933
|
+
def _validate_field_type(
|
|
934
|
+
self,
|
|
935
|
+
field_spec: models.SchemaFieldSpecClass,
|
|
936
|
+
column_name: str,
|
|
937
|
+
allowed_types: list[DictWrapper],
|
|
938
|
+
field_type_name: str,
|
|
939
|
+
) -> None:
|
|
940
|
+
"""
|
|
941
|
+
Validate that a field has an allowed type.
|
|
942
|
+
|
|
943
|
+
Args:
|
|
944
|
+
field_spec: The field specification to validate
|
|
945
|
+
column_name: The name of the column for error messages
|
|
946
|
+
allowed_types: List of allowed field types
|
|
947
|
+
field_type_name: Human-readable name of the field type for error messages
|
|
948
|
+
|
|
949
|
+
Raises:
|
|
950
|
+
SDKUsageError: If the field has an invalid type
|
|
951
|
+
"""
|
|
952
|
+
allowed_type_names = [t.__class__.__name__ for t in allowed_types]
|
|
953
|
+
if field_spec.type not in allowed_type_names:
|
|
954
|
+
raise SDKUsageError(
|
|
955
|
+
msg=f"Column {column_name} with type {field_spec.type} does not have an allowed type for a {field_type_name} in dataset {self.dataset_urn}. "
|
|
956
|
+
f"Allowed types are {allowed_type_names}.",
|
|
957
|
+
)
|
|
958
|
+
|
|
838
959
|
def _create_monitor_info(
|
|
839
960
|
self,
|
|
840
961
|
assertion_urn: AssertionUrn,
|
|
841
962
|
status: models.MonitorStatusClass,
|
|
842
963
|
schedule: models.CronScheduleClass,
|
|
843
964
|
source_type: Union[str, models.DatasetFreshnessSourceTypeClass],
|
|
844
|
-
field: Optional[
|
|
965
|
+
field: Optional[FieldSpecType],
|
|
845
966
|
sensitivity: models.AssertionMonitorSensitivityClass,
|
|
846
967
|
exclusion_windows: list[models.AssertionExclusionWindowClass],
|
|
847
968
|
) -> models.MonitorInfoClass:
|
|
@@ -851,7 +972,7 @@ class _AssertionInput(ABC):
|
|
|
851
972
|
Args:
|
|
852
973
|
status: The monitor status.
|
|
853
974
|
schedule: The monitor schedule.
|
|
854
|
-
source_type: The
|
|
975
|
+
source_type: The source type.
|
|
855
976
|
field: Optional field specification.
|
|
856
977
|
sensitivity: The monitor sensitivity.
|
|
857
978
|
exclusion_windows: List of exclusion windows.
|
|
@@ -867,12 +988,8 @@ class _AssertionInput(ABC):
|
|
|
867
988
|
models.AssertionEvaluationSpecClass(
|
|
868
989
|
assertion=str(assertion_urn),
|
|
869
990
|
schedule=schedule,
|
|
870
|
-
parameters=
|
|
871
|
-
|
|
872
|
-
datasetFreshnessParameters=models.DatasetFreshnessAssertionParametersClass(
|
|
873
|
-
sourceType=source_type,
|
|
874
|
-
field=field,
|
|
875
|
-
),
|
|
991
|
+
parameters=self._get_assertion_evaluation_parameters(
|
|
992
|
+
str(source_type), field
|
|
876
993
|
),
|
|
877
994
|
)
|
|
878
995
|
],
|
|
@@ -886,36 +1003,34 @@ class _AssertionInput(ABC):
|
|
|
886
1003
|
),
|
|
887
1004
|
)
|
|
888
1005
|
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
if self.cached_dataset is None:
|
|
896
|
-
self.cached_dataset = self.entity_client.get(self.dataset_urn)
|
|
1006
|
+
@abstractmethod
|
|
1007
|
+
def _create_assertion_info(
|
|
1008
|
+
self, filter: Optional[models.DatasetFilterClass]
|
|
1009
|
+
) -> AssertionInfoInputType:
|
|
1010
|
+
"""Create assertion info specific to the assertion type."""
|
|
1011
|
+
pass
|
|
897
1012
|
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
return models.SchemaFieldSpecClass(
|
|
903
|
-
path=field.fieldPath,
|
|
904
|
-
type=field.type.type.__class__.__name__,
|
|
905
|
-
nativeType=field.nativeDataType,
|
|
906
|
-
)
|
|
907
|
-
else:
|
|
908
|
-
raise SDKUsageError(
|
|
909
|
-
msg=f"Column {column_name} not found in dataset {self.dataset_urn}",
|
|
910
|
-
)
|
|
1013
|
+
@abstractmethod
|
|
1014
|
+
def _convert_schedule(self) -> models.CronScheduleClass:
|
|
1015
|
+
"""Convert schedule to appropriate format for the assertion type."""
|
|
1016
|
+
pass
|
|
911
1017
|
|
|
1018
|
+
@abstractmethod
|
|
1019
|
+
def _get_assertion_evaluation_parameters(
|
|
1020
|
+
self, source_type: str, field: Optional[FieldSpecType]
|
|
1021
|
+
) -> models.AssertionEvaluationParametersClass:
|
|
1022
|
+
"""Get evaluation parameters specific to the assertion type."""
|
|
1023
|
+
pass
|
|
1024
|
+
|
|
1025
|
+
@abstractmethod
|
|
1026
|
+
def _convert_assertion_source_type_and_field(
|
|
1027
|
+
self,
|
|
1028
|
+
) -> tuple[str, Optional[FieldSpecType]]:
|
|
1029
|
+
"""Convert detection mechanism to source type and field spec."""
|
|
1030
|
+
pass
|
|
912
1031
|
|
|
913
|
-
class _SmartFreshnessAssertionInput(_AssertionInput):
|
|
914
|
-
DEFAULT_SCHEDULE = models.CronScheduleClass(
|
|
915
|
-
cron="0 0 * * *",
|
|
916
|
-
timezone="UTC",
|
|
917
|
-
)
|
|
918
1032
|
|
|
1033
|
+
class _SmartFreshnessAssertionInput(_AssertionInput):
|
|
919
1034
|
def __init__(
|
|
920
1035
|
self,
|
|
921
1036
|
*,
|
|
@@ -926,6 +1041,7 @@ class _SmartFreshnessAssertionInput(_AssertionInput):
|
|
|
926
1041
|
urn: Optional[Union[str, AssertionUrn]] = None,
|
|
927
1042
|
display_name: Optional[str] = None,
|
|
928
1043
|
enabled: bool = True,
|
|
1044
|
+
schedule: Optional[Union[str, models.CronScheduleClass]] = None,
|
|
929
1045
|
detection_mechanism: DetectionMechanismInputTypes = None,
|
|
930
1046
|
sensitivity: Optional[Union[str, InferenceSensitivity]] = None,
|
|
931
1047
|
exclusion_windows: Optional[ExclusionWindowInputTypes] = None,
|
|
@@ -945,6 +1061,9 @@ class _SmartFreshnessAssertionInput(_AssertionInput):
|
|
|
945
1061
|
urn=urn,
|
|
946
1062
|
display_name=display_name,
|
|
947
1063
|
enabled=enabled,
|
|
1064
|
+
schedule=schedule
|
|
1065
|
+
if schedule is not None
|
|
1066
|
+
else DEFAULT_SCHEDULE, # Use provided schedule or default for create case
|
|
948
1067
|
detection_mechanism=detection_mechanism,
|
|
949
1068
|
sensitivity=sensitivity,
|
|
950
1069
|
exclusion_windows=exclusion_windows,
|
|
@@ -973,29 +1092,51 @@ class _SmartFreshnessAssertionInput(_AssertionInput):
|
|
|
973
1092
|
return models.FreshnessAssertionInfoClass(
|
|
974
1093
|
type=models.FreshnessAssertionTypeClass.DATASET_CHANGE, # Currently only dataset change is supported
|
|
975
1094
|
entity=str(self.dataset_urn),
|
|
976
|
-
# schedule (optional,
|
|
1095
|
+
# schedule (optional, must be left empty for smart freshness assertions - managed by the AI inference engine)
|
|
977
1096
|
filter=filter,
|
|
978
1097
|
)
|
|
979
1098
|
|
|
980
1099
|
def _convert_schedule(self) -> models.CronScheduleClass:
|
|
981
1100
|
"""Create a schedule for a smart freshness assertion.
|
|
982
1101
|
|
|
983
|
-
|
|
1102
|
+
For create case, uses DEFAULT_SCHEDULE. For update case, preserves existing schedule.
|
|
984
1103
|
|
|
985
1104
|
Returns:
|
|
986
1105
|
A CronScheduleClass with appropriate schedule settings.
|
|
987
1106
|
"""
|
|
988
|
-
|
|
1107
|
+
assert self.schedule is not None, (
|
|
1108
|
+
"Schedule should never be None due to constructor logic"
|
|
1109
|
+
)
|
|
1110
|
+
return self.schedule
|
|
1111
|
+
|
|
1112
|
+
def _get_assertion_evaluation_parameters(
|
|
1113
|
+
self, source_type: str, field: Optional[FieldSpecType]
|
|
1114
|
+
) -> models.AssertionEvaluationParametersClass:
|
|
1115
|
+
# Ensure field is either None or FreshnessFieldSpecClass
|
|
1116
|
+
freshness_field = None
|
|
1117
|
+
if field is not None:
|
|
1118
|
+
if not isinstance(field, models.FreshnessFieldSpecClass):
|
|
1119
|
+
raise SDKUsageError(
|
|
1120
|
+
f"Expected FreshnessFieldSpecClass for freshness assertion, got {type(field).__name__}"
|
|
1121
|
+
)
|
|
1122
|
+
freshness_field = field
|
|
1123
|
+
|
|
1124
|
+
return models.AssertionEvaluationParametersClass(
|
|
1125
|
+
type=models.AssertionEvaluationParametersTypeClass.DATASET_FRESHNESS,
|
|
1126
|
+
datasetFreshnessParameters=models.DatasetFreshnessAssertionParametersClass(
|
|
1127
|
+
sourceType=source_type, field=freshness_field
|
|
1128
|
+
),
|
|
1129
|
+
)
|
|
989
1130
|
|
|
990
1131
|
def _convert_assertion_source_type_and_field(
|
|
991
1132
|
self,
|
|
992
|
-
) -> tuple[str, Optional[
|
|
1133
|
+
) -> tuple[str, Optional[FieldSpecType]]:
|
|
993
1134
|
"""
|
|
994
1135
|
Convert detection mechanism into source type and field specification for freshness assertions.
|
|
995
1136
|
|
|
996
1137
|
Returns:
|
|
997
1138
|
A tuple of (source_type, field) where field may be None.
|
|
998
|
-
Note that the source_type is a string, not a models.DatasetFreshnessSourceTypeClass since
|
|
1139
|
+
Note that the source_type is a string, not a models.DatasetFreshnessSourceTypeClass (or other assertion source type) since
|
|
999
1140
|
the source type is not a enum in the code generated from the DatasetFreshnessSourceType enum in the PDL.
|
|
1000
1141
|
|
|
1001
1142
|
Raises:
|
|
@@ -1060,15 +1201,135 @@ class _SmartFreshnessAssertionInput(_AssertionInput):
|
|
|
1060
1201
|
)
|
|
1061
1202
|
|
|
1062
1203
|
field_spec = self._get_schema_field_spec(column_name)
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
msg=f"Column {column_name} with type {field_spec.type} does not have an allowed type for a {field_type_name} in dataset {self.dataset_urn}. "
|
|
1067
|
-
f"Allowed types are {allowed_type_names}.",
|
|
1068
|
-
)
|
|
1204
|
+
self._validate_field_type(
|
|
1205
|
+
field_spec, column_name, allowed_types, field_type_name
|
|
1206
|
+
)
|
|
1069
1207
|
return models.FreshnessFieldSpecClass(
|
|
1070
1208
|
path=field_spec.path,
|
|
1071
1209
|
type=field_spec.type,
|
|
1072
1210
|
nativeType=field_spec.nativeType,
|
|
1073
1211
|
kind=kind,
|
|
1074
1212
|
)
|
|
1213
|
+
|
|
1214
|
+
|
|
1215
|
+
class _SmartVolumeAssertionInput(_AssertionInput):
|
|
1216
|
+
def __init__(
|
|
1217
|
+
self,
|
|
1218
|
+
*,
|
|
1219
|
+
# Required fields
|
|
1220
|
+
dataset_urn: Union[str, DatasetUrn],
|
|
1221
|
+
entity_client: EntityClient, # Needed to get the schema field spec for the detection mechanism if needed
|
|
1222
|
+
# Optional fields
|
|
1223
|
+
urn: Optional[Union[str, AssertionUrn]] = None,
|
|
1224
|
+
display_name: Optional[str] = None,
|
|
1225
|
+
enabled: bool = True,
|
|
1226
|
+
schedule: Optional[Union[str, models.CronScheduleClass]] = None,
|
|
1227
|
+
detection_mechanism: DetectionMechanismInputTypes = None,
|
|
1228
|
+
sensitivity: Optional[Union[str, InferenceSensitivity]] = None,
|
|
1229
|
+
exclusion_windows: Optional[ExclusionWindowInputTypes] = None,
|
|
1230
|
+
training_data_lookback_days: Optional[int] = None,
|
|
1231
|
+
incident_behavior: Optional[
|
|
1232
|
+
Union[AssertionIncidentBehavior, list[AssertionIncidentBehavior]]
|
|
1233
|
+
] = None,
|
|
1234
|
+
tags: Optional[TagsInputType] = None,
|
|
1235
|
+
created_by: Union[str, CorpUserUrn],
|
|
1236
|
+
created_at: datetime,
|
|
1237
|
+
updated_by: Union[str, CorpUserUrn],
|
|
1238
|
+
updated_at: datetime,
|
|
1239
|
+
):
|
|
1240
|
+
super().__init__(
|
|
1241
|
+
dataset_urn=dataset_urn,
|
|
1242
|
+
entity_client=entity_client,
|
|
1243
|
+
urn=urn,
|
|
1244
|
+
display_name=display_name,
|
|
1245
|
+
enabled=enabled,
|
|
1246
|
+
schedule=schedule,
|
|
1247
|
+
detection_mechanism=detection_mechanism,
|
|
1248
|
+
sensitivity=sensitivity,
|
|
1249
|
+
exclusion_windows=exclusion_windows,
|
|
1250
|
+
training_data_lookback_days=training_data_lookback_days,
|
|
1251
|
+
incident_behavior=incident_behavior,
|
|
1252
|
+
tags=tags,
|
|
1253
|
+
source_type=models.AssertionSourceTypeClass.INFERRED, # Smart assertions are of type inferred, not native
|
|
1254
|
+
created_by=created_by,
|
|
1255
|
+
created_at=created_at,
|
|
1256
|
+
updated_by=updated_by,
|
|
1257
|
+
updated_at=updated_at,
|
|
1258
|
+
)
|
|
1259
|
+
|
|
1260
|
+
def _create_assertion_info(
|
|
1261
|
+
self, filter: Optional[models.DatasetFilterClass]
|
|
1262
|
+
) -> AssertionInfoInputType:
|
|
1263
|
+
"""
|
|
1264
|
+
Create a VolumeAssertionInfoClass for a smart volume assertion.
|
|
1265
|
+
|
|
1266
|
+
Args:
|
|
1267
|
+
filter: Optional filter to apply to the assertion.
|
|
1268
|
+
|
|
1269
|
+
Returns:
|
|
1270
|
+
A VolumeAssertionInfoClass configured for smart volume.
|
|
1271
|
+
"""
|
|
1272
|
+
return models.VolumeAssertionInfoClass(
|
|
1273
|
+
type=models.VolumeAssertionTypeClass.ROW_COUNT_TOTAL, # Currently only ROW_COUNT_TOTAL is supported for smart volume
|
|
1274
|
+
entity=str(self.dataset_urn),
|
|
1275
|
+
filter=filter,
|
|
1276
|
+
)
|
|
1277
|
+
|
|
1278
|
+
def _convert_schedule(self) -> models.CronScheduleClass:
|
|
1279
|
+
"""Create a schedule for a smart freshness assertion.
|
|
1280
|
+
|
|
1281
|
+
Since the schedule is not used for smart freshness assertions, we return a default schedule.
|
|
1282
|
+
|
|
1283
|
+
Returns:
|
|
1284
|
+
A CronScheduleClass with appropriate schedule settings.
|
|
1285
|
+
"""
|
|
1286
|
+
if self.schedule is None:
|
|
1287
|
+
return DEFAULT_SCHEDULE
|
|
1288
|
+
|
|
1289
|
+
return models.CronScheduleClass(
|
|
1290
|
+
cron=self.schedule.cron,
|
|
1291
|
+
timezone=self.schedule.timezone,
|
|
1292
|
+
)
|
|
1293
|
+
|
|
1294
|
+
def _get_assertion_evaluation_parameters(
|
|
1295
|
+
self, source_type: str, field: Optional[FieldSpecType]
|
|
1296
|
+
) -> models.AssertionEvaluationParametersClass:
|
|
1297
|
+
return models.AssertionEvaluationParametersClass(
|
|
1298
|
+
type=models.AssertionEvaluationParametersTypeClass.DATASET_VOLUME,
|
|
1299
|
+
datasetVolumeParameters=models.DatasetVolumeAssertionParametersClass(
|
|
1300
|
+
sourceType=source_type,
|
|
1301
|
+
),
|
|
1302
|
+
)
|
|
1303
|
+
|
|
1304
|
+
def _convert_assertion_source_type_and_field(
|
|
1305
|
+
self,
|
|
1306
|
+
) -> tuple[str, Optional[FieldSpecType]]:
|
|
1307
|
+
"""
|
|
1308
|
+
Convert detection mechanism into source type and field specification for volume assertions.
|
|
1309
|
+
|
|
1310
|
+
Returns:
|
|
1311
|
+
A tuple of (source_type, field) where field may be None.
|
|
1312
|
+
Note that the source_type is a string, not a models.DatasetFreshnessSourceTypeClass (or other assertion source type) since
|
|
1313
|
+
the source type is not a enum in the code generated from the DatasetFreshnessSourceType enum in the PDL.
|
|
1314
|
+
|
|
1315
|
+
Raises:
|
|
1316
|
+
SDKNotYetSupportedError: If the detection mechanism is not supported.
|
|
1317
|
+
SDKUsageError: If the field (column) is not found in the dataset,
|
|
1318
|
+
and the detection mechanism requires a field. Also if the field
|
|
1319
|
+
is not an allowed type for the detection mechanism.
|
|
1320
|
+
"""
|
|
1321
|
+
source_type = models.DatasetVolumeSourceTypeClass.INFORMATION_SCHEMA
|
|
1322
|
+
field = None
|
|
1323
|
+
|
|
1324
|
+
if isinstance(self.detection_mechanism, _Query):
|
|
1325
|
+
source_type = models.DatasetVolumeSourceTypeClass.QUERY
|
|
1326
|
+
elif isinstance(self.detection_mechanism, _InformationSchema):
|
|
1327
|
+
source_type = models.DatasetVolumeSourceTypeClass.INFORMATION_SCHEMA
|
|
1328
|
+
elif isinstance(self.detection_mechanism, _DatasetProfile):
|
|
1329
|
+
source_type = models.DatasetVolumeSourceTypeClass.DATAHUB_DATASET_PROFILE
|
|
1330
|
+
else:
|
|
1331
|
+
raise SDKNotYetSupportedError(
|
|
1332
|
+
f"Detection mechanism {self.detection_mechanism} not yet supported for smart volume assertions"
|
|
1333
|
+
)
|
|
1334
|
+
|
|
1335
|
+
return source_type, field
|