acryl-datahub-cloud 0.3.12rc1__py3-none-any.whl → 0.3.12rc4__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub-cloud might be problematic. Click here for more details.

Files changed (74)
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/datahub_forms_notifications/forms_notifications_source.py +559 -0
  3. acryl_datahub_cloud/datahub_forms_notifications/get_search_results_total.gql +14 -0
  4. acryl_datahub_cloud/datahub_forms_notifications/query.py +17 -0
  5. acryl_datahub_cloud/datahub_forms_notifications/scroll_forms_for_notification.gql +29 -0
  6. acryl_datahub_cloud/datahub_forms_notifications/send_form_notification_request.gql +5 -0
  7. acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +48 -8
  8. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +49 -40
  9. acryl_datahub_cloud/metadata/_urns/urn_defs.py +1842 -1786
  10. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  11. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/form/__init__.py +4 -0
  12. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/notification/__init__.py +19 -0
  13. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
  14. acryl_datahub_cloud/metadata/schema.avsc +24861 -24050
  15. acryl_datahub_cloud/metadata/schema_classes.py +1031 -631
  16. acryl_datahub_cloud/metadata/schemas/ApplicationKey.avsc +31 -0
  17. acryl_datahub_cloud/metadata/schemas/ApplicationProperties.avsc +72 -0
  18. acryl_datahub_cloud/metadata/schemas/Applications.avsc +38 -0
  19. acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +40 -7
  20. acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +27 -6
  21. acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +31 -7
  22. acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +14 -0
  23. acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +1 -0
  24. acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +12 -1
  25. acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
  26. acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +2 -1
  27. acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +2 -1
  28. acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +1 -0
  29. acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -0
  30. acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  31. acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +1 -0
  32. acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +1 -0
  33. acryl_datahub_cloud/metadata/schemas/DataProductProperties.avsc +1 -1
  34. acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +1 -0
  35. acryl_datahub_cloud/metadata/schemas/FormAssignmentStatus.avsc +36 -0
  36. acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +6 -0
  37. acryl_datahub_cloud/metadata/schemas/FormKey.avsc +2 -1
  38. acryl_datahub_cloud/metadata/schemas/FormNotifications.avsc +69 -0
  39. acryl_datahub_cloud/metadata/schemas/FormSettings.avsc +3 -0
  40. acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +22 -0
  41. acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +1 -0
  42. acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +1 -0
  43. acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  44. acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +1 -0
  45. acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +1 -0
  46. acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  47. acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +12 -1
  48. acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +27 -6
  49. acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +1 -0
  50. acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +1 -0
  51. acryl_datahub_cloud/notifications/__init__.py +0 -0
  52. acryl_datahub_cloud/notifications/notification_recipient_builder.py +399 -0
  53. acryl_datahub_cloud/sdk/__init__.py +29 -0
  54. acryl_datahub_cloud/{_sdk_extras → sdk}/assertion.py +501 -193
  55. acryl_datahub_cloud/sdk/assertion_input/__init__.py +0 -0
  56. acryl_datahub_cloud/{_sdk_extras → sdk/assertion_input}/assertion_input.py +733 -189
  57. acryl_datahub_cloud/sdk/assertion_input/freshness_assertion_input.py +261 -0
  58. acryl_datahub_cloud/sdk/assertion_input/smart_column_metric_assertion_input.py +947 -0
  59. acryl_datahub_cloud/sdk/assertions_client.py +1639 -0
  60. acryl_datahub_cloud/sdk/entities/__init__.py +0 -0
  61. acryl_datahub_cloud/{_sdk_extras → sdk}/entities/assertion.py +5 -2
  62. acryl_datahub_cloud/{_sdk_extras → sdk}/subscription_client.py +146 -33
  63. {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc4.dist-info}/METADATA +48 -43
  64. {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc4.dist-info}/RECORD +72 -54
  65. {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc4.dist-info}/entry_points.txt +1 -0
  66. acryl_datahub_cloud/_sdk_extras/__init__.py +0 -19
  67. acryl_datahub_cloud/_sdk_extras/assertions_client.py +0 -717
  68. /acryl_datahub_cloud/{_sdk_extras/entities → datahub_forms_notifications}/__init__.py +0 -0
  69. /acryl_datahub_cloud/{_sdk_extras → sdk}/entities/monitor.py +0 -0
  70. /acryl_datahub_cloud/{_sdk_extras → sdk}/entities/subscription.py +0 -0
  71. /acryl_datahub_cloud/{_sdk_extras → sdk}/errors.py +0 -0
  72. /acryl_datahub_cloud/{_sdk_extras → sdk}/resolver_client.py +0 -0
  73. {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc4.dist-info}/WHEEL +0 -0
  74. {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc4.dist-info}/top_level.txt +0 -0
@@ -6,22 +6,26 @@ validate and represent the input for creating an Assertion in DataHub.
6
6
  import random
7
7
  import string
8
8
  from abc import ABC, abstractmethod
9
+ from dataclasses import dataclass
9
10
  from datetime import datetime
10
11
  from enum import Enum
11
- from typing import Literal, Optional, TypeAlias, Union
12
+ from typing import Callable, Literal, Optional, Type, TypeAlias, TypeVar, Union
12
13
 
13
14
  import pydantic
15
+ import pytz
16
+ import tzlocal
14
17
  from avrogen.dict_wrapper import DictWrapper
18
+ from croniter import croniter
15
19
  from pydantic import BaseModel, Extra, ValidationError
16
20
 
17
- from acryl_datahub_cloud._sdk_extras.entities.assertion import (
21
+ from acryl_datahub_cloud.sdk.entities.assertion import (
18
22
  Assertion,
19
23
  AssertionActionsInputType,
20
24
  AssertionInfoInputType,
21
25
  TagsInputType,
22
26
  )
23
- from acryl_datahub_cloud._sdk_extras.entities.monitor import Monitor
24
- from acryl_datahub_cloud._sdk_extras.errors import (
27
+ from acryl_datahub_cloud.sdk.entities.monitor import Monitor
28
+ from acryl_datahub_cloud.sdk.errors import (
25
29
  SDKNotYetSupportedError,
26
30
  SDKUsageError,
27
31
  SDKUsageErrorWithExamples,
@@ -40,6 +44,22 @@ DEFAULT_NAME_PREFIX = "New Assertion"
40
44
  DEFAULT_NAME_SUFFIX_LENGTH = 8
41
45
 
42
46
 
47
+ DEFAULT_HOURLY_SCHEDULE = models.CronScheduleClass(
48
+ cron="0 * * * *", # Every hour, matches the UI default
49
+ timezone=str(
50
+ tzlocal.get_localzone()
51
+ ), # User local timezone, matches the UI default
52
+ )
53
+ DEFAULT_SCHEDULE: models.CronScheduleClass = DEFAULT_HOURLY_SCHEDULE
54
+
55
+ DEFAULT_DAILY_SCHEDULE = models.CronScheduleClass(
56
+ cron="0 0 * * *", # Every day at midnight, matches the UI default
57
+ timezone=str(
58
+ tzlocal.get_localzone()
59
+ ), # User local timezone, matches the UI default
60
+ )
61
+
62
+
43
63
  class AbstractDetectionMechanism(BaseModel, ABC):
44
64
  type: str
45
65
 
@@ -85,6 +105,36 @@ class _DataHubOperation(AbstractDetectionMechanism):
85
105
  type: Literal["datahub_operation"] = "datahub_operation"
86
106
 
87
107
 
108
+ class _Query(AbstractDetectionMechanism):
109
+ # COUNT(*) query
110
+ type: Literal["query"] = "query"
111
+ additional_filter: Optional[str] = None
112
+
113
+
114
+ class _AllRowsQuery(AbstractDetectionMechanism):
115
+ # For column-based assertions, this is the default detection mechanism.
116
+ type: Literal["all_rows_query"] = "all_rows_query"
117
+ additional_filter: Optional[str] = None
118
+
119
+
120
+ class _AllRowsQueryDataHubDatasetProfile(AbstractDetectionMechanism):
121
+ # Used for column-based assertions.
122
+ type: Literal["all_rows_query_datahub_dataset_profile"] = (
123
+ "all_rows_query_datahub_dataset_profile"
124
+ )
125
+
126
+
127
+ class _ChangedRowsQuery(AbstractDetectionMechanism):
128
+ # Used for column-based assertions.
129
+ type: Literal["changed_rows_query"] = "changed_rows_query"
130
+ column_name: str
131
+ additional_filter: Optional[str] = None
132
+
133
+
134
+ class _DatasetProfile(AbstractDetectionMechanism):
135
+ type: Literal["dataset_profile"] = "dataset_profile"
136
+
137
+
88
138
  # Keep these two lists in sync:
89
139
  _DETECTION_MECHANISM_CONCRETE_TYPES = (
90
140
  _InformationSchema,
@@ -92,6 +142,11 @@ _DETECTION_MECHANISM_CONCRETE_TYPES = (
92
142
  _LastModifiedColumn,
93
143
  _HighWatermarkColumn,
94
144
  _DataHubOperation,
145
+ _Query,
146
+ _DatasetProfile,
147
+ _AllRowsQuery,
148
+ _ChangedRowsQuery,
149
+ _AllRowsQueryDataHubDatasetProfile,
95
150
  )
96
151
  _DetectionMechanismTypes = Union[
97
152
  _InformationSchema,
@@ -99,8 +154,23 @@ _DetectionMechanismTypes = Union[
99
154
  _LastModifiedColumn,
100
155
  _HighWatermarkColumn,
101
156
  _DataHubOperation,
157
+ _Query,
158
+ _DatasetProfile,
159
+ _AllRowsQuery,
160
+ _ChangedRowsQuery,
161
+ _AllRowsQueryDataHubDatasetProfile,
102
162
  ]
103
163
 
164
+ _DETECTION_MECHANISM_TYPES_WITH_ADDITIONAL_FILTER = (
165
+ _LastModifiedColumn,
166
+ _HighWatermarkColumn,
167
+ _Query,
168
+ _AllRowsQuery,
169
+ _ChangedRowsQuery,
170
+ )
171
+
172
+ DEFAULT_DETECTION_MECHANISM: _DetectionMechanismTypes = _InformationSchema()
173
+
104
174
 
105
175
  class DetectionMechanism:
106
176
  # To have a more enum-like user experience even with sub parameters, we define the detection mechanisms as class attributes.
@@ -110,6 +180,11 @@ class DetectionMechanism:
110
180
  LAST_MODIFIED_COLUMN = _LastModifiedColumn
111
181
  HIGH_WATERMARK_COLUMN = _HighWatermarkColumn
112
182
  DATAHUB_OPERATION = _DataHubOperation()
183
+ QUERY = _Query
184
+ ALL_ROWS_QUERY = _AllRowsQuery()
185
+ CHANGED_ROWS_QUERY = _ChangedRowsQuery
186
+ ALL_ROWS_QUERY_DATAHUB_DATASET_PROFILE = _AllRowsQueryDataHubDatasetProfile()
187
+ DATASET_PROFILE = _DatasetProfile()
113
188
 
114
189
  _DETECTION_MECHANISM_EXAMPLES = {
115
190
  "Information Schema from string": "information_schema",
@@ -130,6 +205,26 @@ class DetectionMechanism:
130
205
  "High Watermark Column from DetectionMechanism": "DetectionMechanism.HIGH_WATERMARK_COLUMN(column_name='id', additional_filter='id > 1000')",
131
206
  "DataHub Operation from string": "datahub_operation",
132
207
  "DataHub Operation from DetectionMechanism": "DetectionMechanism.DATAHUB_OPERATION",
208
+ "Query from string": "query",
209
+ "Query from dict": {
210
+ "type": "query",
211
+ "additional_filter": "id > 1000",
212
+ },
213
+ "Query from DetectionMechanism (with optional additional filter)": "DetectionMechanism.QUERY(additional_filter='id > 1000')",
214
+ "Dataset Profile from string": "dataset_profile",
215
+ "Dataset Profile from DetectionMechanism": "DetectionMechanism.DATASET_PROFILE",
216
+ "All Rows Query from string": "all_rows_query",
217
+ "All Rows Query from DetectionMechanism": "DetectionMechanism.ALL_ROWS_QUERY",
218
+ "All Rows Query from DetectionMechanism (with optional additional filter)": "DetectionMechanism.ALL_ROWS_QUERY(additional_filter='id > 1000')",
219
+ "Changed Rows Query from dict (with optional additional filter)": {
220
+ "type": "changed_rows_query",
221
+ "column_name": "id",
222
+ "additional_filter": "id > 1000",
223
+ },
224
+ "Changed Rows Query from DetectionMechanism": "DetectionMechanism.CHANGED_ROWS_QUERY(column_name='id')",
225
+ "Changed Rows Query from DetectionMechanism (with optional additional filter)": "DetectionMechanism.CHANGED_ROWS_QUERY(column_name='id', additional_filter='id > 1000')",
226
+ "All Rows Query DataHub Dataset Profile from string": "all_rows_query_datahub_dataset_profile",
227
+ "All Rows Query DataHub Dataset Profile from DetectionMechanism": "DetectionMechanism.ALL_ROWS_QUERY_DATAHUB_DATASET_PROFILE",
133
228
  }
134
229
 
135
230
  @staticmethod
@@ -137,9 +232,10 @@ class DetectionMechanism:
137
232
  detection_mechanism_config: Optional[
138
233
  Union[str, dict[str, str], _DetectionMechanismTypes]
139
234
  ] = None,
235
+ default_detection_mechanism: _DetectionMechanismTypes = DEFAULT_DETECTION_MECHANISM,
140
236
  ) -> _DetectionMechanismTypes:
141
237
  if detection_mechanism_config is None:
142
- return DEFAULT_DETECTION_MECHANISM
238
+ return default_detection_mechanism
143
239
  if isinstance(detection_mechanism_config, _DETECTION_MECHANISM_CONCRETE_TYPES):
144
240
  return detection_mechanism_config
145
241
  elif isinstance(detection_mechanism_config, str):
@@ -220,8 +316,6 @@ class DetectionMechanism:
220
316
  ) from e
221
317
 
222
318
 
223
- DEFAULT_DETECTION_MECHANISM = DetectionMechanism.INFORMATION_SCHEMA
224
-
225
319
  DetectionMechanismInputTypes: TypeAlias = Union[
226
320
  str, dict[str, str], _DetectionMechanismTypes, None
227
321
  ]
@@ -288,7 +382,59 @@ class InferenceSensitivity(Enum):
288
382
  }[sensitivity]
289
383
 
290
384
 
291
- DEFAULT_SENSITIVITY = InferenceSensitivity.MEDIUM
385
+ DEFAULT_SENSITIVITY: InferenceSensitivity = InferenceSensitivity.MEDIUM
386
+
387
+ TIME_WINDOW_SIZE_EXAMPLES = {
388
+ "Time window size from models.TimeWindowSizeClass": "models.TimeWindowSizeClass(unit='MINUTE', multiple=10)",
389
+ "Time window size from object": "TimeWindowSize(unit='MINUTE', multiple=10)",
390
+ }
391
+
392
+
393
+ class CalendarInterval(Enum):
394
+ MINUTE = "MINUTE"
395
+ HOUR = "HOUR"
396
+ DAY = "DAY"
397
+
398
+
399
+ class TimeWindowSize(BaseModel):
400
+ unit: Union[CalendarInterval, str]
401
+ multiple: int
402
+
403
+
404
+ TimeWindowSizeInputTypes: TypeAlias = Union[
405
+ models.TimeWindowSizeClass,
406
+ models.FixedIntervalScheduleClass,
407
+ TimeWindowSize,
408
+ ]
409
+
410
+
411
+ def _try_parse_time_window_size(
412
+ config: TimeWindowSizeInputTypes,
413
+ ) -> models.TimeWindowSizeClass:
414
+ if isinstance(config, models.TimeWindowSizeClass):
415
+ return config
416
+ elif isinstance(config, models.FixedIntervalScheduleClass):
417
+ return models.TimeWindowSizeClass(
418
+ unit=_try_parse_and_validate_schema_classes_enum(
419
+ config.unit, models.CalendarIntervalClass
420
+ ),
421
+ multiple=config.multiple,
422
+ )
423
+ elif isinstance(config, TimeWindowSize):
424
+ return models.TimeWindowSizeClass(
425
+ unit=_try_parse_and_validate_schema_classes_enum(
426
+ _try_parse_and_validate_schema_classes_enum(
427
+ config.unit, CalendarInterval
428
+ ).value,
429
+ models.CalendarIntervalClass,
430
+ ),
431
+ multiple=config.multiple,
432
+ )
433
+ else:
434
+ raise SDKUsageErrorWithExamples(
435
+ msg=f"Invalid time window size: {config}",
436
+ examples=TIME_WINDOW_SIZE_EXAMPLES,
437
+ )
292
438
 
293
439
 
294
440
  class FixedRangeExclusionWindow(BaseModel):
@@ -496,6 +642,219 @@ def _try_parse_training_data_lookback_days(
496
642
  return training_data_lookback_days
497
643
 
498
644
 
645
+ def _validate_cron_schedule(schedule: str, timezone: str) -> None:
646
+ """We are using the POSIX.1-2017 standard for cron expressions.
647
+
648
+ Note: We are using the croniter library for cron parsing which is different from executor, which uses apscheduler, so there is a risk of mismatch here.
649
+ """
650
+ try:
651
+ # Validate timezone - pytz.timezone() raises UnknownTimeZoneError for invalid timezones
652
+ # Skip timezone validation when empty
653
+ if timezone:
654
+ pytz.timezone(timezone)
655
+
656
+ # Validate 5-field cron expression only (POSIX.1-2017 standard)
657
+ fields = schedule.strip().split()
658
+ if len(fields) != 5:
659
+ raise ValueError("POSIX.1-2017 requires exactly 5 fields")
660
+
661
+ # POSIX.1-2017 specific validation: Sunday must be 0, not 7
662
+ # However croniter accepts 7 as Sunday, so custom check is needed here.
663
+ # Check the day-of-week field (5th field, index 4)
664
+ dow_field = fields[4]
665
+ if "7" in dow_field:
666
+ # Check if 7 appears as a standalone value or in ranges
667
+ import re
668
+
669
+ # Match 7 as standalone, in lists, or in ranges
670
+ if re.search(r"\b7\b|7-|,7,|^7,|,7$|-7\b", dow_field):
671
+ raise ValueError(
672
+ "POSIX.1-2017 standard: Sunday must be represented as 0, not 7"
673
+ )
674
+
675
+ # Validate cron expression - croniter constructor validates the expression
676
+ croniter(schedule)
677
+
678
+ except Exception as e:
679
+ raise SDKUsageError(
680
+ f"Invalid cron expression or timezone: {schedule} {timezone}, please use a POSIX.1-2017 compatible cron expression and timezone."
681
+ ) from e
682
+
683
+
684
+ def _try_parse_schedule(
685
+ schedule: Optional[Union[str, models.CronScheduleClass]],
686
+ ) -> Optional[models.CronScheduleClass]:
687
+ if schedule is None:
688
+ return None
689
+ if isinstance(schedule, str):
690
+ _validate_cron_schedule(schedule, "UTC")
691
+ return models.CronScheduleClass(
692
+ cron=schedule,
693
+ timezone="UTC",
694
+ )
695
+ if isinstance(schedule, models.CronScheduleClass):
696
+ _validate_cron_schedule(schedule.cron, schedule.timezone)
697
+ return schedule
698
+
699
+
700
+ FieldSpecType = Union[models.FreshnessFieldSpecClass, models.SchemaFieldSpecClass]
701
+
702
+
703
+ T = TypeVar("T")
704
+
705
+
706
+ def _try_parse_and_validate_schema_classes_enum(
707
+ value: Union[str, T],
708
+ enum_class: Type[T],
709
+ ) -> T:
710
+ if isinstance(value, enum_class):
711
+ return value
712
+ assert isinstance(value, str)
713
+ if value not in get_enum_options(enum_class):
714
+ raise SDKUsageError(
715
+ f"Invalid value for {enum_class.__name__}: {value}, valid options are {get_enum_options(enum_class)}"
716
+ )
717
+ return getattr(enum_class, value.upper())
718
+
719
+
720
+ @dataclass(frozen=True)
721
+ class DatasetSourceType:
722
+ """
723
+ DatasetSourceType is used to represent a dataset source type.
724
+ It is used to check if a source type is valid for a dataset type and assertion type.
725
+
726
+ Args:
727
+ source_type: The source type (e.g. information schema, field value, etc. aka detection mechanism)
728
+ platform: The platform of the dataset as a string OR "all" for all platforms.
729
+ assertion_type: The assertion type as a models.AssertionTypeClass string e.g. models.AssertionTypeClass.FRESHNESS OR "all" for all assertion types.
730
+
731
+ Example:
732
+ DatasetSourceType(
733
+ source_type=_InformationSchema,
734
+ platform="databricks",
735
+ assertion_type="all",
736
+ )
737
+ This means that the source type _InformationSchema is invalid for the dataset type "databricks" and assertion type "all".
738
+ "all" in this example means that the source type is invalid for all assertion types.
739
+ """
740
+
741
+ source_type: Type[_DetectionMechanismTypes]
742
+ platform: str
743
+ assertion_type: Union[models.AssertionTypeClass, str]
744
+
745
+
746
+ INVALID_SOURCE_TYPES = {
747
+ # Add exceptions here if a source type (detection mechanism) is invalid for a dataset type and assertion type.
748
+ DatasetSourceType(
749
+ source_type=_InformationSchema,
750
+ platform="databricks",
751
+ assertion_type="all",
752
+ )
753
+ }
754
+
755
+
756
+ def _is_source_type_valid(
757
+ dataset_source_type: DatasetSourceType,
758
+ invalid_source_types: set[DatasetSourceType] = INVALID_SOURCE_TYPES,
759
+ ) -> bool:
760
+ for invalid in invalid_source_types:
761
+ if invalid.source_type == dataset_source_type.source_type:
762
+ # If both platform and assertion type are "all", the source type is invalid for all combinations
763
+ if invalid.platform == "all" and invalid.assertion_type == "all":
764
+ return False
765
+ # If platform matches and assertion type is "all", the source type is invalid for all assertion types on that platform
766
+ if (
767
+ invalid.platform == dataset_source_type.platform
768
+ and invalid.assertion_type == "all"
769
+ ):
770
+ return False
771
+ # If platform is "all" and assertion type matches, the source type is invalid for all platforms for that assertion type
772
+ if (
773
+ invalid.platform == "all"
774
+ and invalid.assertion_type == dataset_source_type.assertion_type
775
+ ):
776
+ return False
777
+ # If both platform and assertion type match exactly, the source type is invalid
778
+ if (
779
+ invalid.platform == dataset_source_type.platform
780
+ and invalid.assertion_type == dataset_source_type.assertion_type
781
+ ):
782
+ return False
783
+ return True
784
+
785
+
786
+ class _HasSmartAssertionInputs:
787
+ """
788
+ A class that contains the common inputs for smart assertions.
789
+ This is used to avoid code duplication in the smart assertion inputs.
790
+
791
+ Args:
792
+ sensitivity: The sensitivity to be applied to the assertion.
793
+ exclusion_windows: The exclusion windows to be applied to the assertion. If not provided, no exclusion windows will be applied.
794
+ training_data_lookback_days: The training data lookback days to be applied to the assertion.
795
+ """
796
+
797
+ def __init__(
798
+ self,
799
+ *,
800
+ sensitivity: Optional[Union[str, InferenceSensitivity]] = None,
801
+ exclusion_windows: Optional[ExclusionWindowInputTypes] = None,
802
+ training_data_lookback_days: Optional[int] = None,
803
+ ):
804
+ self.sensitivity = InferenceSensitivity.parse(sensitivity)
805
+ self.exclusion_windows = _try_parse_exclusion_window(exclusion_windows)
806
+ self.training_data_lookback_days = _try_parse_training_data_lookback_days(
807
+ training_data_lookback_days
808
+ )
809
+
810
+ def _convert_exclusion_windows(
811
+ self,
812
+ ) -> list[models.AssertionExclusionWindowClass]:
813
+ """
814
+ Convert exclusion windows into AssertionExclusionWindowClass objects including generating display names for them.
815
+
816
+ Returns:
817
+ A list of AssertionExclusionWindowClass objects.
818
+
819
+ Raises:
820
+ SDKUsageErrorWithExamples: If an exclusion window is of an invalid type.
821
+ """
822
+ exclusion_windows: list[models.AssertionExclusionWindowClass] = []
823
+ if self.exclusion_windows:
824
+ for window in self.exclusion_windows:
825
+ if not isinstance(window, FixedRangeExclusionWindow):
826
+ raise SDKUsageErrorWithExamples(
827
+ msg=f"Invalid exclusion window type: {window}",
828
+ examples=FIXED_RANGE_EXCLUSION_WINDOW_EXAMPLES,
829
+ )
830
+ # To match the UI, we generate a display name for the exclusion window.
831
+ # See here for the UI code: https://github.com/acryldata/datahub-fork/blob/acryl-main/datahub-web-react/src/app/entityV2/shared/tabs/Dataset/Validations/assertion/builder/steps/inferred/common/ExclusionWindowAdjuster.tsx#L31
832
+ # Copied here for reference: displayName: `${dayjs(startTime).format('MMM D, h:mm A')} - ${dayjs(endTime).format('MMM D, h:mm A')}`,
833
+ generated_display_name = f"{window.start.strftime('%b %-d, %-I:%M %p')} - {window.end.strftime('%b %-d, %-I:%M %p')}"
834
+ exclusion_windows.append(
835
+ models.AssertionExclusionWindowClass(
836
+ type=models.AssertionExclusionWindowTypeClass.FIXED_RANGE, # Currently only fixed range is supported
837
+ displayName=generated_display_name,
838
+ fixedRange=models.AbsoluteTimeWindowClass(
839
+ startTimeMillis=make_ts_millis(window.start),
840
+ endTimeMillis=make_ts_millis(window.end),
841
+ ),
842
+ )
843
+ )
844
+ return exclusion_windows
845
+
846
+ def _convert_sensitivity(self) -> models.AssertionMonitorSensitivityClass:
847
+ """
848
+ Convert sensitivity into an AssertionMonitorSensitivityClass.
849
+
850
+ Returns:
851
+ An AssertionMonitorSensitivityClass with the appropriate sensitivity.
852
+ """
853
+ return models.AssertionMonitorSensitivityClass(
854
+ level=InferenceSensitivity.to_int(self.sensitivity),
855
+ )
856
+
857
+
499
858
  class _AssertionInput(ABC):
500
859
  def __init__(
501
860
  self,
@@ -509,10 +868,8 @@ class _AssertionInput(ABC):
509
868
  ] = None, # Can be None if the assertion is not yet created
510
869
  display_name: Optional[str] = None,
511
870
  enabled: bool = True,
871
+ schedule: Optional[Union[str, models.CronScheduleClass]] = None,
512
872
  detection_mechanism: DetectionMechanismInputTypes = None,
513
- sensitivity: Optional[Union[str, InferenceSensitivity]] = None,
514
- exclusion_windows: Optional[ExclusionWindowInputTypes] = None,
515
- training_data_lookback_days: Optional[int] = None,
516
873
  incident_behavior: Optional[
517
874
  Union[AssertionIncidentBehavior, list[AssertionIncidentBehavior]]
518
875
  ] = None,
@@ -522,6 +879,7 @@ class _AssertionInput(ABC):
522
879
  created_at: datetime,
523
880
  updated_by: Union[str, CorpUserUrn],
524
881
  updated_at: datetime,
882
+ default_detection_mechanism: _DetectionMechanismTypes = DEFAULT_DETECTION_MECHANISM,
525
883
  ):
526
884
  """
527
885
  Create an AssertionInput object.
@@ -533,9 +891,6 @@ class _AssertionInput(ABC):
533
891
  display_name: The display name of the assertion. If not provided, a random display name will be generated.
534
892
  enabled: Whether the assertion is enabled. Defaults to True.
535
893
  detection_mechanism: The detection mechanism to be used for the assertion.
536
- sensitivity: The sensitivity to be applied to the assertion.
537
- exclusion_windows: The exclusion windows to be applied to the assertion. If not provided, no exclusion windows will be applied.
538
- training_data_lookback_days: The training data lookback days to be applied to the assertion.
539
894
  incident_behavior: The incident behavior to be applied to the assertion.
540
895
  tags: The tags to be applied to the assertion.
541
896
  source_type: The source type of the assertion. Defaults to models.AssertionSourceTypeClass.NATIVE.
@@ -553,13 +908,20 @@ class _AssertionInput(ABC):
553
908
  else _generate_default_name(DEFAULT_NAME_PREFIX, DEFAULT_NAME_SUFFIX_LENGTH)
554
909
  )
555
910
  self.enabled = enabled
556
-
557
- self.detection_mechanism = DetectionMechanism.parse(detection_mechanism)
558
- self.sensitivity = InferenceSensitivity.parse(sensitivity)
559
- self.exclusion_windows = _try_parse_exclusion_window(exclusion_windows)
560
- self.training_data_lookback_days = _try_parse_training_data_lookback_days(
561
- training_data_lookback_days
911
+ self.schedule = _try_parse_schedule(schedule)
912
+ self.detection_mechanism = DetectionMechanism.parse(
913
+ detection_mechanism, default_detection_mechanism
562
914
  )
915
+ if not _is_source_type_valid(
916
+ DatasetSourceType(
917
+ source_type=type(self.detection_mechanism),
918
+ platform=self.dataset_urn.platform,
919
+ assertion_type=self._assertion_type(),
920
+ )
921
+ ):
922
+ raise SDKUsageError(
923
+ f"Invalid source type: {self.detection_mechanism} for dataset type: {self.dataset_urn.platform} and assertion type: {self._assertion_type()}"
924
+ )
563
925
  self.incident_behavior = _try_parse_incident_behavior(incident_behavior)
564
926
  self.tags = tags
565
927
  if source_type not in get_enum_options(models.AssertionSourceTypeClass):
@@ -571,7 +933,6 @@ class _AssertionInput(ABC):
571
933
  self.created_at = created_at
572
934
  self.updated_by = updated_by
573
935
  self.updated_at = updated_at
574
-
575
936
  self.cached_dataset: Optional[Dataset] = None
576
937
 
577
938
  def to_assertion_and_monitor_entities(self) -> tuple[Assertion, Monitor]:
@@ -656,10 +1017,7 @@ class _AssertionInput(ABC):
656
1017
  """
657
1018
  if not isinstance(
658
1019
  self.detection_mechanism,
659
- (
660
- DetectionMechanism.LAST_MODIFIED_COLUMN,
661
- DetectionMechanism.HIGH_WATERMARK_COLUMN,
662
- ),
1020
+ _DETECTION_MECHANISM_TYPES_WITH_ADDITIONAL_FILTER,
663
1021
  ):
664
1022
  return None
665
1023
 
@@ -672,12 +1030,6 @@ class _AssertionInput(ABC):
672
1030
  sql=additional_filter,
673
1031
  )
674
1032
 
675
- @abstractmethod
676
- def _create_assertion_info(
677
- self, filter: Optional[models.DatasetFilterClass]
678
- ) -> AssertionInfoInputType:
679
- pass
680
-
681
1033
  def _convert_tags(self) -> Optional[TagsInputType]:
682
1034
  """
683
1035
  Convert the tags input into a standardized format.
@@ -746,8 +1098,6 @@ class _AssertionInput(ABC):
746
1098
  schedule=self._convert_schedule(),
747
1099
  source_type=source_type,
748
1100
  field=field,
749
- sensitivity=self._convert_sensitivity(),
750
- exclusion_windows=self._convert_exclusion_windows(),
751
1101
  ),
752
1102
  )
753
1103
 
@@ -764,86 +1114,69 @@ class _AssertionInput(ABC):
764
1114
  else models.MonitorModeClass.INACTIVE,
765
1115
  )
766
1116
 
767
- def _convert_exclusion_windows(
768
- self,
769
- ) -> list[models.AssertionExclusionWindowClass]:
1117
+ def _get_schema_field_spec(self, column_name: str) -> models.SchemaFieldSpecClass:
770
1118
  """
771
- Convert exclusion windows into AssertionExclusionWindowClass objects including generating display names for them.
1119
+ Get the schema field spec for the detection mechanism if needed.
1120
+ """
1121
+ # Only fetch the dataset if it's not already cached.
1122
+ # Also we only fetch the dataset if it's needed for the detection mechanism.
1123
+ if self.cached_dataset is None:
1124
+ self.cached_dataset = self.entity_client.get(self.dataset_urn)
772
1125
 
773
- Returns:
774
- A list of AssertionExclusionWindowClass objects.
1126
+ # Handle case where dataset doesn't exist
1127
+ if self.cached_dataset is None:
1128
+ raise SDKUsageError(
1129
+ f"Dataset {self.dataset_urn} not found. Cannot validate column {column_name}."
1130
+ )
775
1131
 
776
- Raises:
777
- SDKUsageErrorWithExamples: If an exclusion window is of an invalid type.
778
- """
779
- exclusion_windows: list[models.AssertionExclusionWindowClass] = []
780
- if self.exclusion_windows:
781
- for window in self.exclusion_windows:
782
- if not isinstance(window, FixedRangeExclusionWindow):
783
- raise SDKUsageErrorWithExamples(
784
- msg=f"Invalid exclusion window type: {window}",
785
- examples=FIXED_RANGE_EXCLUSION_WINDOW_EXAMPLES,
786
- )
787
- # To match the UI, we generate a display name for the exclusion window.
788
- # See here for the UI code: https://github.com/acryldata/datahub-fork/blob/acryl-main/datahub-web-react/src/app/entityV2/shared/tabs/Dataset/Validations/assertion/builder/steps/inferred/common/ExclusionWindowAdjuster.tsx#L31
789
- # Copied here for reference: displayName: `${dayjs(startTime).format('MMM D, h:mm A')} - ${dayjs(endTime).format('MMM D, h:mm A')}`,
790
- generated_display_name = f"{window.start.strftime('%b %-d, %-I:%M %p')} - {window.end.strftime('%b %-d, %-I:%M %p')}"
791
- exclusion_windows.append(
792
- models.AssertionExclusionWindowClass(
793
- type=models.AssertionExclusionWindowTypeClass.FIXED_RANGE, # Currently only fixed range is supported
794
- displayName=generated_display_name,
795
- fixedRange=models.AbsoluteTimeWindowClass(
796
- startTimeMillis=make_ts_millis(window.start),
797
- endTimeMillis=make_ts_millis(window.end),
798
- ),
799
- )
800
- )
801
- return exclusion_windows
1132
+ # TODO: Make a public accessor for _schema_dict in the SDK
1133
+ schema_fields = self.cached_dataset._schema_dict()
1134
+ field = schema_fields.get(column_name)
1135
+ if field:
1136
+ return models.SchemaFieldSpecClass(
1137
+ path=field.fieldPath,
1138
+ type=field.type.type.__class__.__name__,
1139
+ nativeType=field.nativeDataType,
1140
+ )
1141
+ else:
1142
+ raise SDKUsageError(
1143
+ msg=f"Column {column_name} not found in dataset {self.dataset_urn}",
1144
+ )
802
1145
 
803
- @abstractmethod
804
- def _convert_assertion_source_type_and_field(
1146
+ def _validate_field_type(
805
1147
  self,
806
- ) -> tuple[str, Optional[models.FreshnessFieldSpecClass]]:
1148
+ field_spec: models.SchemaFieldSpecClass,
1149
+ column_name: str,
1150
+ allowed_types: list[DictWrapper],
1151
+ field_type_name: str,
1152
+ ) -> None:
807
1153
  """
808
- Convert detection mechanism into source type and field specification for freshness assertions.
1154
+ Validate that a field has an allowed type.
809
1155
 
810
- Returns:
811
- A tuple of (source_type, field) where field may be None.
812
- Note that the source_type is a string, not a models.DatasetFreshnessSourceTypeClass since
813
- the source type is not a enum in the code generated from the DatasetFreshnessSourceType enum in the PDL.
1156
+ Args:
1157
+ field_spec: The field specification to validate
1158
+ column_name: The name of the column for error messages
1159
+ allowed_types: List of allowed field types
1160
+ field_type_name: Human-readable name of the field type for error messages
814
1161
 
815
1162
  Raises:
816
- SDKNotYetSupportedError: If the detection mechanism is not supported.
817
- SDKUsageError: If the field (column) is not found in the dataset,
818
- and the detection mechanism requires a field. Also if the field
819
- is not an allowed type for the detection mechanism.
1163
+ SDKUsageError: If the field has an invalid type
820
1164
  """
821
- pass
1165
+ allowed_type_names = [t.__class__.__name__ for t in allowed_types]
1166
+ if field_spec.type not in allowed_type_names:
1167
+ raise SDKUsageError(
1168
+ msg=f"Column {column_name} with type {field_spec.type} does not have an allowed type for a {field_type_name} in dataset {self.dataset_urn}. "
1169
+ f"Allowed types are {allowed_type_names}.",
1170
+ )
822
1171
 
823
1172
  @abstractmethod
824
- def _convert_schedule(self) -> models.CronScheduleClass:
825
- pass
826
-
827
- def _convert_sensitivity(self) -> models.AssertionMonitorSensitivityClass:
828
- """
829
- Convert sensitivity into an AssertionMonitorSensitivityClass.
830
-
831
- Returns:
832
- An AssertionMonitorSensitivityClass with the appropriate sensitivity.
833
- """
834
- return models.AssertionMonitorSensitivityClass(
835
- level=InferenceSensitivity.to_int(self.sensitivity),
836
- )
837
-
838
1173
  def _create_monitor_info(
839
1174
  self,
840
1175
  assertion_urn: AssertionUrn,
841
1176
  status: models.MonitorStatusClass,
842
1177
  schedule: models.CronScheduleClass,
843
1178
  source_type: Union[str, models.DatasetFreshnessSourceTypeClass],
844
- field: Optional[models.FreshnessFieldSpecClass],
845
- sensitivity: models.AssertionMonitorSensitivityClass,
846
- exclusion_windows: list[models.AssertionExclusionWindowClass],
1179
+ field: Optional[FieldSpecType],
847
1180
  ) -> models.MonitorInfoClass:
848
1181
  """
849
1182
  Create a MonitorInfoClass with all the necessary components.
@@ -851,71 +1184,94 @@ class _AssertionInput(ABC):
851
1184
  Args:
852
1185
  status: The monitor status.
853
1186
  schedule: The monitor schedule.
854
- source_type: The freshness source type.
1187
+ source_type: The source type.
855
1188
  field: Optional field specification.
856
- sensitivity: The monitor sensitivity.
857
- exclusion_windows: List of exclusion windows.
858
-
859
1189
  Returns:
860
1190
  A MonitorInfoClass configured with all the provided components.
861
1191
  """
862
- return models.MonitorInfoClass(
863
- type=models.MonitorTypeClass.ASSERTION,
864
- status=status,
865
- assertionMonitor=models.AssertionMonitorClass(
866
- assertions=[
867
- models.AssertionEvaluationSpecClass(
868
- assertion=str(assertion_urn),
869
- schedule=schedule,
870
- parameters=models.AssertionEvaluationParametersClass(
871
- type=models.AssertionEvaluationParametersTypeClass.DATASET_FRESHNESS,
872
- datasetFreshnessParameters=models.DatasetFreshnessAssertionParametersClass(
873
- sourceType=source_type,
874
- field=field,
875
- ),
876
- ),
877
- )
878
- ],
879
- settings=models.AssertionMonitorSettingsClass(
880
- adjustmentSettings=models.AssertionAdjustmentSettingsClass(
881
- sensitivity=sensitivity,
882
- exclusionWindows=exclusion_windows,
883
- trainingDataLookbackWindowDays=self.training_data_lookback_days,
884
- ),
885
- ),
886
- ),
887
- )
1192
+ pass
888
1193
 
889
- def _get_schema_field_spec(self, column_name: str) -> models.SchemaFieldSpecClass:
890
- """
891
- Get the schema field spec for the detection mechanism if needed.
1194
+ @abstractmethod
1195
+ def _assertion_type(self) -> str:
1196
+ """Get the assertion type."""
1197
+ pass
1198
+
1199
+ @abstractmethod
1200
+ def _create_assertion_info(
1201
+ self, filter: Optional[models.DatasetFilterClass]
1202
+ ) -> AssertionInfoInputType:
1203
+ """Create assertion info specific to the assertion type."""
1204
+ pass
1205
+
1206
+ @abstractmethod
1207
+ def _convert_schedule(self) -> models.CronScheduleClass:
1208
+ """Convert schedule to appropriate format for the assertion type."""
1209
+ pass
1210
+
1211
+ @abstractmethod
1212
+ def _get_assertion_evaluation_parameters(
1213
+ self, source_type: str, field: Optional[FieldSpecType]
1214
+ ) -> models.AssertionEvaluationParametersClass:
1215
+ """Get evaluation parameters specific to the assertion type."""
1216
+ pass
1217
+
1218
+ @abstractmethod
1219
+ def _convert_assertion_source_type_and_field(
1220
+ self,
1221
+ ) -> tuple[str, Optional[FieldSpecType]]:
1222
+ """Convert detection mechanism to source type and field spec."""
1223
+ pass
1224
+
1225
+
1226
+ class _HasFreshnessFeatures:
1227
+ def _create_field_spec(
1228
+ self,
1229
+ column_name: str,
1230
+ allowed_types: list[DictWrapper], # TODO: Use the type from the PDL
1231
+ field_type_name: str,
1232
+ kind: str,
1233
+ get_schema_field_spec: Callable[[str], models.SchemaFieldSpecClass],
1234
+ validate_field_type: Callable[
1235
+ [models.SchemaFieldSpecClass, str, list[DictWrapper], str], None
1236
+ ],
1237
+ ) -> models.FreshnessFieldSpecClass:
892
1238
  """
893
- # Only fetch the dataset if it's not already cached.
894
- # Also we only fetch the dataset if it's needed for the detection mechanism.
895
- if self.cached_dataset is None:
896
- self.cached_dataset = self.entity_client.get(self.dataset_urn)
1239
+ Create a field specification for a column, validating its type.
897
1240
 
898
- # TODO: Make a public accessor for _schema_dict in the SDK
899
- schema_fields = self.cached_dataset._schema_dict()
900
- field = schema_fields.get(column_name)
901
- if field:
902
- return models.SchemaFieldSpecClass(
903
- path=field.fieldPath,
904
- type=field.type.type.__class__.__name__,
905
- nativeType=field.nativeDataType,
906
- )
907
- else:
1241
+ Args:
1242
+ column_name: The name of the column to create a spec for
1243
+ allowed_types: List of allowed field types
1244
+ field_type_name: Human-readable name of the field type for error messages
1245
+ kind: The kind of field to create
1246
+
1247
+ Returns:
1248
+ A FreshnessFieldSpecClass for the column
1249
+
1250
+ Raises:
1251
+ SDKUsageError: If the column is not found or has an invalid type
1252
+ """
1253
+ SUPPORTED_KINDS = [
1254
+ models.FreshnessFieldKindClass.LAST_MODIFIED,
1255
+ models.FreshnessFieldKindClass.HIGH_WATERMARK,
1256
+ ]
1257
+ if kind not in SUPPORTED_KINDS:
908
1258
  raise SDKUsageError(
909
- msg=f"Column {column_name} not found in dataset {self.dataset_urn}",
1259
+ msg=f"Invalid kind: {kind}. Must be one of {SUPPORTED_KINDS}",
910
1260
  )
911
1261
 
1262
+ field_spec = get_schema_field_spec(column_name)
1263
+ validate_field_type(field_spec, column_name, allowed_types, field_type_name)
1264
+ return models.FreshnessFieldSpecClass(
1265
+ path=field_spec.path,
1266
+ type=field_spec.type,
1267
+ nativeType=field_spec.nativeType,
1268
+ kind=kind,
1269
+ )
912
1270
 
913
- class _SmartFreshnessAssertionInput(_AssertionInput):
914
- DEFAULT_SCHEDULE = models.CronScheduleClass(
915
- cron="0 0 * * *",
916
- timezone="UTC",
917
- )
918
1271
 
1272
+ class _SmartFreshnessAssertionInput(
1273
+ _AssertionInput, _HasSmartAssertionInputs, _HasFreshnessFeatures
1274
+ ):
919
1275
  def __init__(
920
1276
  self,
921
1277
  *,
@@ -926,6 +1282,7 @@ class _SmartFreshnessAssertionInput(_AssertionInput):
926
1282
  urn: Optional[Union[str, AssertionUrn]] = None,
927
1283
  display_name: Optional[str] = None,
928
1284
  enabled: bool = True,
1285
+ schedule: Optional[Union[str, models.CronScheduleClass]] = None,
929
1286
  detection_mechanism: DetectionMechanismInputTypes = None,
930
1287
  sensitivity: Optional[Union[str, InferenceSensitivity]] = None,
931
1288
  exclusion_windows: Optional[ExclusionWindowInputTypes] = None,
@@ -939,16 +1296,17 @@ class _SmartFreshnessAssertionInput(_AssertionInput):
939
1296
  updated_by: Union[str, CorpUserUrn],
940
1297
  updated_at: datetime,
941
1298
  ):
942
- super().__init__(
1299
+ _AssertionInput.__init__(
1300
+ self,
943
1301
  dataset_urn=dataset_urn,
944
1302
  entity_client=entity_client,
945
1303
  urn=urn,
946
1304
  display_name=display_name,
947
1305
  enabled=enabled,
1306
+ schedule=schedule
1307
+ if schedule is not None
1308
+ else DEFAULT_HOURLY_SCHEDULE, # Use provided schedule or default for create case
948
1309
  detection_mechanism=detection_mechanism,
949
- sensitivity=sensitivity,
950
- exclusion_windows=exclusion_windows,
951
- training_data_lookback_days=training_data_lookback_days,
952
1310
  incident_behavior=incident_behavior,
953
1311
  tags=tags,
954
1312
  source_type=models.AssertionSourceTypeClass.INFERRED, # Smart assertions are of type inferred, not native
@@ -957,6 +1315,16 @@ class _SmartFreshnessAssertionInput(_AssertionInput):
957
1315
  updated_by=updated_by,
958
1316
  updated_at=updated_at,
959
1317
  )
1318
+ _HasSmartAssertionInputs.__init__(
1319
+ self,
1320
+ sensitivity=sensitivity,
1321
+ exclusion_windows=exclusion_windows,
1322
+ training_data_lookback_days=training_data_lookback_days,
1323
+ )
1324
+
1325
+ def _assertion_type(self) -> str:
1326
+ """Get the assertion type."""
1327
+ return models.AssertionTypeClass.FRESHNESS
960
1328
 
961
1329
  def _create_assertion_info(
962
1330
  self, filter: Optional[models.DatasetFilterClass]
@@ -973,29 +1341,51 @@ class _SmartFreshnessAssertionInput(_AssertionInput):
973
1341
  return models.FreshnessAssertionInfoClass(
974
1342
  type=models.FreshnessAssertionTypeClass.DATASET_CHANGE, # Currently only dataset change is supported
975
1343
  entity=str(self.dataset_urn),
976
- # schedule (optional, not used for smart freshness assertions)
1344
+ # schedule (optional, must be left empty for smart freshness assertions - managed by the AI inference engine)
977
1345
  filter=filter,
978
1346
  )
979
1347
 
980
1348
  def _convert_schedule(self) -> models.CronScheduleClass:
981
1349
  """Create a schedule for a smart freshness assertion.
982
1350
 
983
- Since the schedule is not used for smart freshness assertions, we return a default schedule.
1351
+ For create case, uses DEFAULT_HOURLY_SCHEDULE. For update case, preserves existing schedule.
984
1352
 
985
1353
  Returns:
986
1354
  A CronScheduleClass with appropriate schedule settings.
987
1355
  """
988
- return self.DEFAULT_SCHEDULE
1356
+ assert self.schedule is not None, (
1357
+ "Schedule should never be None due to constructor logic"
1358
+ )
1359
+ return self.schedule
1360
+
1361
+ def _get_assertion_evaluation_parameters(
1362
+ self, source_type: str, field: Optional[FieldSpecType]
1363
+ ) -> models.AssertionEvaluationParametersClass:
1364
+ # Ensure field is either None or FreshnessFieldSpecClass
1365
+ freshness_field = None
1366
+ if field is not None:
1367
+ if not isinstance(field, models.FreshnessFieldSpecClass):
1368
+ raise SDKUsageError(
1369
+ f"Expected FreshnessFieldSpecClass for freshness assertion, got {type(field).__name__}"
1370
+ )
1371
+ freshness_field = field
1372
+
1373
+ return models.AssertionEvaluationParametersClass(
1374
+ type=models.AssertionEvaluationParametersTypeClass.DATASET_FRESHNESS,
1375
+ datasetFreshnessParameters=models.DatasetFreshnessAssertionParametersClass(
1376
+ sourceType=source_type, field=freshness_field
1377
+ ),
1378
+ )
989
1379
 
990
1380
  def _convert_assertion_source_type_and_field(
991
1381
  self,
992
- ) -> tuple[str, Optional[models.FreshnessFieldSpecClass]]:
1382
+ ) -> tuple[str, Optional[FieldSpecType]]:
993
1383
  """
994
1384
  Convert detection mechanism into source type and field specification for freshness assertions.
995
1385
 
996
1386
  Returns:
997
1387
  A tuple of (source_type, field) where field may be None.
998
- Note that the source_type is a string, not a models.DatasetFreshnessSourceTypeClass since
1388
+ Note that the source_type is a string, not a models.DatasetFreshnessSourceTypeClass (or other assertion source type) since
999
1389
  the source type is not a enum in the code generated from the DatasetFreshnessSourceType enum in the PDL.
1000
1390
 
1001
1391
  Raises:
@@ -1014,6 +1404,8 @@ class _SmartFreshnessAssertionInput(_AssertionInput):
1014
1404
  LAST_MODIFIED_ALLOWED_FIELD_TYPES,
1015
1405
  "last modified column",
1016
1406
  models.FreshnessFieldKindClass.LAST_MODIFIED,
1407
+ self._get_schema_field_spec,
1408
+ self._validate_field_type,
1017
1409
  )
1018
1410
  elif isinstance(self.detection_mechanism, _InformationSchema):
1019
1411
  source_type = models.DatasetFreshnessSourceTypeClass.INFORMATION_SCHEMA
@@ -1028,47 +1420,199 @@ class _SmartFreshnessAssertionInput(_AssertionInput):
1028
1420
 
1029
1421
  return source_type, field
1030
1422
 
1031
- def _create_field_spec(
1423
+ def _create_monitor_info(
1032
1424
  self,
1033
- column_name: str,
1034
- allowed_types: list[DictWrapper], # TODO: Use the type from the PDL
1035
- field_type_name: str,
1036
- kind: str,
1037
- ) -> models.FreshnessFieldSpecClass:
1425
+ assertion_urn: AssertionUrn,
1426
+ status: models.MonitorStatusClass,
1427
+ schedule: models.CronScheduleClass,
1428
+ source_type: Union[str, models.DatasetFreshnessSourceTypeClass],
1429
+ field: Optional[FieldSpecType],
1430
+ ) -> models.MonitorInfoClass:
1038
1431
  """
1039
- Create a field specification for a column, validating its type.
1432
+ Create a MonitorInfoClass with all the necessary components.
1433
+ """
1434
+ return models.MonitorInfoClass(
1435
+ type=models.MonitorTypeClass.ASSERTION,
1436
+ status=status,
1437
+ assertionMonitor=models.AssertionMonitorClass(
1438
+ assertions=[
1439
+ models.AssertionEvaluationSpecClass(
1440
+ assertion=str(assertion_urn),
1441
+ schedule=schedule,
1442
+ parameters=self._get_assertion_evaluation_parameters(
1443
+ str(source_type), field
1444
+ ),
1445
+ ),
1446
+ ],
1447
+ settings=models.AssertionMonitorSettingsClass(
1448
+ adjustmentSettings=models.AssertionAdjustmentSettingsClass(
1449
+ sensitivity=self._convert_sensitivity(),
1450
+ exclusionWindows=self._convert_exclusion_windows(),
1451
+ trainingDataLookbackWindowDays=self.training_data_lookback_days,
1452
+ ),
1453
+ ),
1454
+ ),
1455
+ )
1456
+
1457
+
1458
+ class _SmartVolumeAssertionInput(_AssertionInput, _HasSmartAssertionInputs):
1459
+ def __init__(
1460
+ self,
1461
+ *,
1462
+ # Required fields
1463
+ dataset_urn: Union[str, DatasetUrn],
1464
+ entity_client: EntityClient, # Needed to get the schema field spec for the detection mechanism if needed
1465
+ # Optional fields
1466
+ urn: Optional[Union[str, AssertionUrn]] = None,
1467
+ display_name: Optional[str] = None,
1468
+ enabled: bool = True,
1469
+ schedule: Optional[Union[str, models.CronScheduleClass]] = None,
1470
+ detection_mechanism: DetectionMechanismInputTypes = None,
1471
+ sensitivity: Optional[Union[str, InferenceSensitivity]] = None,
1472
+ exclusion_windows: Optional[ExclusionWindowInputTypes] = None,
1473
+ training_data_lookback_days: Optional[int] = None,
1474
+ incident_behavior: Optional[
1475
+ Union[AssertionIncidentBehavior, list[AssertionIncidentBehavior]]
1476
+ ] = None,
1477
+ tags: Optional[TagsInputType] = None,
1478
+ created_by: Union[str, CorpUserUrn],
1479
+ created_at: datetime,
1480
+ updated_by: Union[str, CorpUserUrn],
1481
+ updated_at: datetime,
1482
+ ):
1483
+ _AssertionInput.__init__(
1484
+ self,
1485
+ dataset_urn=dataset_urn,
1486
+ entity_client=entity_client,
1487
+ urn=urn,
1488
+ display_name=display_name,
1489
+ enabled=enabled,
1490
+ schedule=schedule,
1491
+ detection_mechanism=detection_mechanism,
1492
+ incident_behavior=incident_behavior,
1493
+ tags=tags,
1494
+ source_type=models.AssertionSourceTypeClass.INFERRED, # Smart assertions are of type inferred, not native
1495
+ created_by=created_by,
1496
+ created_at=created_at,
1497
+ updated_by=updated_by,
1498
+ updated_at=updated_at,
1499
+ )
1500
+ _HasSmartAssertionInputs.__init__(
1501
+ self,
1502
+ sensitivity=sensitivity,
1503
+ exclusion_windows=exclusion_windows,
1504
+ training_data_lookback_days=training_data_lookback_days,
1505
+ )
1506
+
1507
+ def _create_assertion_info(
1508
+ self, filter: Optional[models.DatasetFilterClass]
1509
+ ) -> AssertionInfoInputType:
1510
+ """
1511
+ Create a VolumeAssertionInfoClass for a smart volume assertion.
1040
1512
 
1041
1513
  Args:
1042
- column_name: The name of the column to create a spec for
1043
- allowed_types: List of allowed field types
1044
- field_type_name: Human-readable name of the field type for error messages
1045
- kind: The kind of field to create
1514
+ filter: Optional filter to apply to the assertion.
1046
1515
 
1047
1516
  Returns:
1048
- A FreshnessFieldSpecClass for the column
1517
+ A VolumeAssertionInfoClass configured for smart volume.
1518
+ """
1519
+ return models.VolumeAssertionInfoClass(
1520
+ type=models.VolumeAssertionTypeClass.ROW_COUNT_TOTAL, # Currently only ROW_COUNT_TOTAL is supported for smart volume
1521
+ entity=str(self.dataset_urn),
1522
+ filter=filter,
1523
+ )
1524
+
1525
+ def _convert_schedule(self) -> models.CronScheduleClass:
1526
+ """Create a schedule for a smart volume assertion.
1527
+
1528
+ Returns:
1529
+ A CronScheduleClass with appropriate schedule settings.
1530
+ """
1531
+ if self.schedule is None:
1532
+ return DEFAULT_HOURLY_SCHEDULE
1533
+
1534
+ return models.CronScheduleClass(
1535
+ cron=self.schedule.cron,
1536
+ timezone=self.schedule.timezone,
1537
+ )
1538
+
1539
+ def _get_assertion_evaluation_parameters(
1540
+ self, source_type: str, field: Optional[FieldSpecType]
1541
+ ) -> models.AssertionEvaluationParametersClass:
1542
+ return models.AssertionEvaluationParametersClass(
1543
+ type=models.AssertionEvaluationParametersTypeClass.DATASET_VOLUME,
1544
+ datasetVolumeParameters=models.DatasetVolumeAssertionParametersClass(
1545
+ sourceType=source_type,
1546
+ ),
1547
+ )
1548
+
1549
+ def _convert_assertion_source_type_and_field(
1550
+ self,
1551
+ ) -> tuple[str, Optional[FieldSpecType]]:
1552
+ """
1553
+ Convert detection mechanism into source type and field specification for volume assertions.
1554
+
1555
+ Returns:
1556
+ A tuple of (source_type, field) where field may be None.
1557
+ Note that the source_type is a string, not a models.DatasetFreshnessSourceTypeClass (or other assertion source type) since
1558
+ the source type is not a enum in the code generated from the DatasetFreshnessSourceType enum in the PDL.
1049
1559
 
1050
1560
  Raises:
1051
- SDKUsageError: If the column is not found or has an invalid type
1561
+ SDKNotYetSupportedError: If the detection mechanism is not supported.
1562
+ SDKUsageError: If the field (column) is not found in the dataset,
1563
+ and the detection mechanism requires a field. Also if the field
1564
+ is not an allowed type for the detection mechanism.
1052
1565
  """
1053
- SUPPORTED_KINDS = [
1054
- models.FreshnessFieldKindClass.LAST_MODIFIED,
1055
- models.FreshnessFieldKindClass.HIGH_WATERMARK,
1056
- ]
1057
- if kind not in SUPPORTED_KINDS:
1058
- raise SDKUsageError(
1059
- msg=f"Invalid kind: {kind}. Must be one of {SUPPORTED_KINDS}",
1060
- )
1566
+ source_type = models.DatasetVolumeSourceTypeClass.INFORMATION_SCHEMA
1567
+ field = None
1061
1568
 
1062
- field_spec = self._get_schema_field_spec(column_name)
1063
- allowed_type_names = [t.__class__.__name__ for t in allowed_types]
1064
- if field_spec.type not in allowed_type_names:
1065
- raise SDKUsageError(
1066
- msg=f"Column {column_name} with type {field_spec.type} does not have an allowed type for a {field_type_name} in dataset {self.dataset_urn}. "
1067
- f"Allowed types are {allowed_type_names}.",
1569
+ if isinstance(self.detection_mechanism, _Query):
1570
+ source_type = models.DatasetVolumeSourceTypeClass.QUERY
1571
+ elif isinstance(self.detection_mechanism, _InformationSchema):
1572
+ source_type = models.DatasetVolumeSourceTypeClass.INFORMATION_SCHEMA
1573
+ elif isinstance(self.detection_mechanism, _DatasetProfile):
1574
+ source_type = models.DatasetVolumeSourceTypeClass.DATAHUB_DATASET_PROFILE
1575
+ else:
1576
+ raise SDKNotYetSupportedError(
1577
+ f"Detection mechanism {self.detection_mechanism} not yet supported for smart volume assertions"
1068
1578
  )
1069
- return models.FreshnessFieldSpecClass(
1070
- path=field_spec.path,
1071
- type=field_spec.type,
1072
- nativeType=field_spec.nativeType,
1073
- kind=kind,
1579
+
1580
+ return source_type, field
1581
+
1582
+ def _create_monitor_info(
1583
+ self,
1584
+ assertion_urn: AssertionUrn,
1585
+ status: models.MonitorStatusClass,
1586
+ schedule: models.CronScheduleClass,
1587
+ source_type: Union[str, models.DatasetFreshnessSourceTypeClass],
1588
+ field: Optional[FieldSpecType],
1589
+ ) -> models.MonitorInfoClass:
1590
+ """
1591
+ Create a MonitorInfoClass with all the necessary components.
1592
+ """
1593
+ return models.MonitorInfoClass(
1594
+ type=models.MonitorTypeClass.ASSERTION,
1595
+ status=status,
1596
+ assertionMonitor=models.AssertionMonitorClass(
1597
+ assertions=[
1598
+ models.AssertionEvaluationSpecClass(
1599
+ assertion=str(assertion_urn),
1600
+ schedule=schedule,
1601
+ parameters=self._get_assertion_evaluation_parameters(
1602
+ str(source_type), field
1603
+ ),
1604
+ ),
1605
+ ],
1606
+ settings=models.AssertionMonitorSettingsClass(
1607
+ adjustmentSettings=models.AssertionAdjustmentSettingsClass(
1608
+ sensitivity=self._convert_sensitivity(),
1609
+ exclusionWindows=self._convert_exclusion_windows(),
1610
+ trainingDataLookbackWindowDays=self.training_data_lookback_days,
1611
+ ),
1612
+ ),
1613
+ ),
1074
1614
  )
1615
+
1616
+ def _assertion_type(self) -> str:
1617
+ """Get the assertion type."""
1618
+ return models.AssertionTypeClass.VOLUME