acryl-datahub-cloud 0.3.12.1rc3__py3-none-any.whl → 0.3.12.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub-cloud might be problematic. Click here for more details.

@@ -0,0 +1,965 @@
1
+ import json
2
+ from datetime import datetime
3
+ from typing import TYPE_CHECKING, Optional, Union
4
+
5
+ if TYPE_CHECKING:
6
+ pass
7
+
8
+ from acryl_datahub_cloud.sdk.assertion_input.assertion_input import (
9
+ DEFAULT_EVERY_SIX_HOURS_SCHEDULE,
10
+ HIGH_WATERMARK_ALLOWED_FIELD_TYPES,
11
+ NO_PARAMETER_OPERATORS,
12
+ RANGE_OPERATORS,
13
+ SINGLE_VALUE_OPERATORS,
14
+ AssertionIncidentBehaviorInputTypes,
15
+ AssertionInfoInputType,
16
+ DetectionMechanismInputTypes,
17
+ FieldSpecType,
18
+ _AllRowsQuery,
19
+ _AllRowsQueryDataHubDatasetProfile,
20
+ _AssertionInput,
21
+ _ChangedRowsQuery,
22
+ _DatasetProfile,
23
+ _try_parse_and_validate_schema_classes_enum,
24
+ )
25
+ from acryl_datahub_cloud.sdk.assertion_input.column_metric_constants import (
26
+ ALLOWED_COLUMN_TYPES_FOR_COLUMN_METRIC_ASSERTION,
27
+ FIELD_METRIC_TYPE_CONFIG,
28
+ FIELD_VALUES_OPERATOR_CONFIG,
29
+ MetricInputType,
30
+ OperatorInputType,
31
+ RangeInputType,
32
+ RangeTypeInputType,
33
+ RangeTypeParsedType,
34
+ ValueInputType,
35
+ ValueType,
36
+ ValueTypeInputType,
37
+ )
38
+ from acryl_datahub_cloud.sdk.entities.assertion import TagsInputType
39
+ from acryl_datahub_cloud.sdk.errors import (
40
+ SDKNotYetSupportedError,
41
+ SDKUsageError,
42
+ )
43
+ from datahub.emitter.enum_helpers import get_enum_options
44
+ from datahub.metadata import schema_classes as models
45
+ from datahub.metadata.urns import AssertionUrn, CorpUserUrn, DatasetUrn
46
+ from datahub.sdk.entity_client import EntityClient
47
+
48
# New unified criteria parameters type.
# A column metric assertion accepts exactly one of the shapes listed below as
# its `criteria_parameters` argument.
ColumnMetricAssertionParameters = Union[
    None,  # For operators that don't require parameters (NULL, NOT_NULL)
    ValueInputType,  # Single value
    RangeInputType,  # Range as tuple
]

# Passed to the parent initializer as `default_detection_mechanism` by
# _ColumnMetricAssertionInput.__init__.
DEFAULT_DETECTION_MECHANISM_COLUMN_METRIC_ASSERTION: _AllRowsQuery = _AllRowsQuery()
56
+
57
+
58
+ class _ColumnMetricAssertionInput(_AssertionInput):
59
+ """
60
+ Input used to create a column metric assertion.
61
+
62
+ This assertion is used to validate the value of a common field / column metric (e.g. aggregation) such as null count + percentage,
63
+ min, max, median, and more.
64
+
65
+ Example using the entity models, not comprehensive for all options:
66
+
67
+ ```python
68
+ models.AssertionInfoClass(
69
+ type=models.AssertionTypeClass.FIELD,
70
+ fieldAssertion=FieldAssertionInfoClass(
71
+ type=models.FieldAssertionTypeClass.FIELD_METRIC,
72
+ entity=str(self.dataset_urn),
73
+ filter=DatasetFilterClass(
74
+ type=models.DatasetFilterTypeClass.SQL,
75
+ sql="SELECT * FROM dataset WHERE column_name = 'value'", # Example filter
76
+ ),
77
+ fieldMetricAssertion=FieldMetricAssertionClass(
78
+ field=SchemaFieldSpecClass(
79
+ path="column_name", # The column name to validate
80
+ type="string", # The type of the column
81
+ nativeType="string", # The native type of the column
82
+ ),
83
+ metric=models.FieldMetricTypeClass.NULL_COUNT_PERCENTAGE, # The metric to validate
84
+ operator=models.AssertionStdOperatorClass.GREATER_THAN, # The operator to use
85
+ parameters=models.AssertionStdParametersClass(
86
+ value=models.AssertionStdParameterClass(
87
+ value=10, # The value to validate
88
+ type=models.AssertionStdParameterTypeClass.NUMBER, # The type of the value
89
+ ),
90
+ ),
91
+ ),
92
+ ),
93
+ source=models.AssertionSourceClass(
94
+ type=models.AssertionSourceTypeClass.NATIVE, # Column metric assertions are of type native
95
+ created=AuditStampClass(
96
+ time=1717929600,
97
+ actor="urn:li:corpuser:jdoe", # The actor who created the assertion
98
+ ),
99
+ ),
100
+ lastUpdated=AuditStampClass(
101
+ time=1717929600,
102
+ actor="urn:li:corpuser:jdoe", # The actor who last updated the assertion
103
+ ),
104
+ description="This assertion validates the null count percentage of the column 'column_name' is greater than 10.", # Optional description of the assertion
105
+ )
106
+ ```
107
+
108
+ ```python
109
+ models.MonitorInfoClass(
110
+ type=models.MonitorTypeClass.ASSERTION,
111
+ status=models.MonitorStatusClass(
112
+ mode=models.MonitorModeClass.ACTIVE, # Active or Inactive
113
+ ),
114
+ assertionMonitor=AssertionMonitorClass(
115
+ assertions=AssertionEvaluationSpecClass(
116
+ assertion="urn:li:assertion:123", # The assertion to monitor
117
+ schedule=models.CronScheduleClass(
118
+ cron="0 0 * * *", # The cron schedule
119
+ timezone="America/New_York", # The timezone
120
+ ),
121
+ parameters=models.AssertionEvaluationParametersClass(
122
+ type=models.AssertionEvaluationParametersTypeClass.DATASET_FIELD,
123
+ datasetFieldParameters=models.DatasetFieldAssertionParametersClass(
124
+ sourceType=models.DatasetFieldAssertionSourceTypeClass.CHANGED_ROWS_QUERY, # This can be ALL_ROWS_QUERY, CHANGED_ROWS_QUERY or DATAHUB_DATASET_PROFILE
125
+ changedRowsField=models.FreshnessFieldSpecClass(
126
+ path="column_name",
127
+ type="string",
128
+ nativeType="string",
129
+ kind=models.FreshnessFieldKindClass.HIGH_WATERMARK, # This can be LAST_MODIFIED or HIGH_WATERMARK
130
+ ),
131
+ ),
132
+ ),
133
+ ),
134
+ ),
135
+ )
136
+ ```
137
+ """
138
+
139
def __init__(
    self,
    *,
    # Required parameters
    dataset_urn: Union[str, DatasetUrn],
    entity_client: EntityClient,
    column_name: str,
    metric_type: MetricInputType,
    operator: OperatorInputType,
    # Criteria parameters
    criteria_parameters: Optional[ColumnMetricAssertionParameters] = None,
    urn: Optional[Union[str, AssertionUrn]] = None,
    display_name: Optional[str] = None,
    enabled: bool = True,
    schedule: Optional[Union[str, models.CronScheduleClass]] = None,
    detection_mechanism: DetectionMechanismInputTypes = None,
    incident_behavior: Optional[AssertionIncidentBehaviorInputTypes] = None,
    tags: Optional[TagsInputType] = None,
    created_by: Union[str, CorpUserUrn],
    created_at: datetime,
    updated_by: Union[str, CorpUserUrn],
    updated_at: datetime,
    gms_criteria_type_info: Optional[tuple] = None,
):
    """
    Initialize a column metric assertion input.

    Common parameters are forwarded to ``_AssertionInput.__init__``. The
    metric type, column name, operator and criteria parameters are then
    parsed and validated here, in that order, followed by the
    field-type/operator and field-type/metric compatibility checks.

    Args:
        dataset_urn: The dataset urn.
        entity_client: The entity client.
        column_name: The name of the column to validate.
        metric_type: The metric type to validate.
        operator: The operator to use.
        criteria_parameters: The criteria parameters (single value, range tuple, or None). Type will be automatically inferred.
        urn: The urn of the assertion.
        display_name: The display name of the assertion.
        enabled: Whether the assertion is enabled.
        schedule: The schedule of the assertion.
        detection_mechanism: The detection mechanism of the assertion.
        incident_behavior: The incident behavior of the assertion. Accepts strings, enum values, lists, or None.
        tags: The tags of the assertion.
        created_by: The creator of the assertion.
        created_at: The creation time of the assertion.
        updated_by: The updater of the assertion.
        updated_at: The update time of the assertion.
        gms_criteria_type_info: Optional tuple carrying explicit parameter
            type information. When it is not None, only the element at
            index 1 is read here and it is used as the type(s) of
            ``criteria_parameters`` instead of inferring them from the
            Python values.
    """
    # Parent will handle validation of common parameters:
    _AssertionInput.__init__(
        self,
        dataset_urn=dataset_urn,
        entity_client=entity_client,
        urn=urn,
        display_name=display_name,
        enabled=enabled,
        schedule=schedule,
        detection_mechanism=detection_mechanism,
        incident_behavior=incident_behavior,
        tags=tags,
        source_type=models.AssertionSourceTypeClass.NATIVE,  # Column metric assertions are of type native
        created_by=created_by,
        created_at=created_at,
        updated_by=updated_by,
        updated_at=updated_at,
        default_detection_mechanism=DEFAULT_DETECTION_MECHANISM_COLUMN_METRIC_ASSERTION,
    )

    # Column metric assertions (non-smart) don't use exclusion_windows, sensitivity or training_data_lookback_days

    # Validate Column Metric Assertion specific parameters
    self.metric_type = _try_parse_and_validate_schema_classes_enum(
        metric_type, models.FieldMetricTypeClass
    )
    self.column_name = self._try_parse_and_validate_column_name_is_valid_type(
        column_name
    )
    self.operator = _try_parse_and_validate_schema_classes_enum(
        operator, models.AssertionStdOperatorClass
    )

    # Initialize instance variables with proper type annotations
    self.criteria_parameters: Optional[ColumnMetricAssertionParameters] = None
    self.criteria_type: Optional[Union[ValueTypeInputType, RangeTypeInputType]] = (
        None
    )

    # Process criteria parameters with GMS type information if available
    # (self.operator must already be set: the processing helpers read it).
    if gms_criteria_type_info is not None:
        self._process_criteria_parameters_with_gms_type(
            criteria_parameters, gms_criteria_type_info[1]
        )
    else:
        self._process_criteria_parameters(criteria_parameters)

    # Validate compatibility:
    self._validate_field_type_and_operator_compatibility(
        self.column_name, self.operator
    )
    self._validate_field_type_and_metric_type_compatibility(
        self.column_name, self.metric_type
    )
239
+
240
def _infer_criteria_type_from_parameters(
    self,
    criteria_parameters: Optional[ColumnMetricAssertionParameters],
) -> Optional[Union[ValueTypeInputType, RangeTypeInputType]]:
    """
    Derive the criteria type from the Python type(s) of the parameters.

    Args:
        criteria_parameters: None, a single value, or a 2-tuple range.

    Returns:
        None when no parameters are given, a single inferred type for a
        single value, or a ``(type, type)`` pair for a range tuple.

    Raises:
        SDKUsageError: If a tuple is given that does not have exactly
            two elements.
    """
    if criteria_parameters is None:
        return None

    # Anything that is not a tuple is treated as a single value.
    if not isinstance(criteria_parameters, tuple):
        return self._infer_single_value_type(criteria_parameters)

    if len(criteria_parameters) != 2:
        raise SDKUsageError("Range parameters must be a tuple of exactly 2 values")

    # Each bound of the range gets its own inferred type.
    lower_bound, upper_bound = criteria_parameters
    return (
        self._infer_single_value_type(lower_bound),
        self._infer_single_value_type(upper_bound),
    )
269
+
270
def _infer_single_value_type(self, value: ValueInputType) -> ValueTypeInputType:
    """
    Map a single Python value to a ``ValueType`` member.

    Args:
        value: The value whose type should be inferred.

    Returns:
        ``ValueType.STRING`` for ``str``, ``ValueType.NUMBER`` for ``int``
        or ``float`` (this includes ``bool``, which subclasses ``int``),
        and ``ValueType.UNKNOWN`` for anything else.
    """
    if isinstance(value, str):
        return ValueType.STRING
    if isinstance(value, (int, float)):
        return ValueType.NUMBER
    # Fallback for every other Python type.
    return ValueType.UNKNOWN
287
+
288
def _process_criteria_parameters_with_gms_type(
    self,
    criteria_parameters: Optional[ColumnMetricAssertionParameters],
    gms_type_info: Optional[Union[models.AssertionStdParameterTypeClass, tuple]],
) -> None:
    """
    Process criteria_parameters, preferring explicit type information from GMS.

    The explicit type information is only used when its shape matches the
    parameters (a tuple for a range, a non-tuple for a single value) and it
    is truthy; otherwise the types are inferred from the Python values.

    Args:
        criteria_parameters: None, a single value, or a range tuple.
        gms_type_info: A single parameter type, a tuple of types, or None.
    """
    if criteria_parameters is None:
        self._process_none_parameters()
        return

    type_info_is_tuple = isinstance(gms_type_info, tuple)

    if isinstance(criteria_parameters, tuple):
        # Range parameters: explicit types are usable only if given as a tuple.
        if gms_type_info and type_info_is_tuple:
            self._process_range_parameters_with_types(
                criteria_parameters, gms_type_info
            )
        else:
            self._process_range_parameters(criteria_parameters)
        return

    # Single value: an explicit type is usable only if it is not a tuple.
    if gms_type_info and not type_info_is_tuple:
        self._process_single_value_parameters_with_type(
            criteria_parameters, gms_type_info
        )
    else:
        self._process_single_value_parameters(criteria_parameters)
312
+
313
def _process_criteria_parameters(
    self,
    criteria_parameters: Optional[ColumnMetricAssertionParameters],
) -> None:
    """
    Dispatch criteria_parameters to the matching handler, inferring types.

    None goes to the no-parameter handler, a tuple to the range handler and
    anything else to the single-value handler.
    """
    if criteria_parameters is None:
        self._process_none_parameters()
        return
    if isinstance(criteria_parameters, tuple):
        self._process_range_parameters(criteria_parameters)
        return
    self._process_single_value_parameters(criteria_parameters)
324
+
325
def _process_none_parameters(self) -> None:
    """
    Record that no criteria parameters were supplied.

    No operator validation happens here; per the original note it is handled
    at the client level so that both creation and update scenarios can be
    treated appropriately.
    """
    self.criteria_parameters = self.criteria_type = None
331
+
332
def _process_range_parameters(self, criteria_parameters: tuple) -> None:
    """
    Validate and store a range tuple, inferring the type of each bound.

    Args:
        criteria_parameters: The range given as a tuple.

    Raises:
        SDKUsageError: If the current operator does not take a range.
    """
    operator_takes_range = _is_range_required_for_operator(self.operator)
    if not operator_takes_range:
        raise SDKUsageError(
            f"Operator {self.operator} does not support range parameters. "
            "Provide a single value instead of a tuple."
        )

    # Infer the per-bound types, then validate the types before the values.
    range_type = _try_parse_and_validate_range_type(
        self._infer_criteria_type_from_parameters(criteria_parameters)
    )
    range_values = _try_parse_and_validate_range(
        criteria_parameters, range_type, self.operator
    )

    # Only store once both validations have succeeded.
    self.criteria_parameters = range_values
    self.criteria_type = range_type
357
+
358
def _process_single_value_parameters(
    self, criteria_parameters: Union[str, int, float]
) -> None:
    """
    Validate and store a single-value criteria parameter, inferring its type.

    Args:
        criteria_parameters: The single value supplied by the caller.

    Raises:
        SDKUsageError: If the current operator takes no parameters, does not
            take a single value, or the inferred type is a tuple type.
    """
    if _is_no_parameter_operator(self.operator):
        raise SDKUsageError(
            f"Value parameters should not be provided for operator {self.operator}"
        )
    if not _is_value_required_for_operator(self.operator):
        raise SDKUsageError(
            f"Operator {self.operator} does not support value parameters. "
            "Use criteria_parameters=None or omit criteria_parameters."
        )

    # From here on the operator is known to require a value, so the former
    # second `_is_value_required_for_operator` check and its "store raw
    # parameters" fallback were unreachable and have been removed.
    inferred_value_type = self._infer_criteria_type_from_parameters(
        criteria_parameters
    )

    # A tuple type would mean a range slipped through as a single value.
    if isinstance(inferred_value_type, tuple):
        raise SDKUsageError("Single value type expected, not a tuple type")

    validated_value_type = _try_parse_and_validate_value_type(inferred_value_type)
    validated_value = _try_parse_and_validate_value(
        criteria_parameters, validated_value_type
    )

    # Store validated parameters
    self.criteria_parameters = validated_value
    self.criteria_type = validated_value_type
398
+
399
def _process_single_value_parameters_with_type(
    self,
    criteria_parameters: Union[str, int, float],
    gms_type: models.AssertionStdParameterTypeClass,
) -> None:
    """
    Validate and store a single value using the explicit type from GMS.

    Args:
        criteria_parameters: The single value to validate.
        gms_type: The parameter type to validate the value against; no
            inference from the Python type takes place.

    Raises:
        SDKUsageError: If the current operator takes no parameters or does
            not take a single value.
    """
    current_operator = self.operator
    if _is_no_parameter_operator(current_operator):
        raise SDKUsageError(
            f"Value parameters should not be provided for operator {self.operator}"
        )
    if not _is_value_required_for_operator(current_operator):
        raise SDKUsageError(
            f"Operator {self.operator} does not support value parameters. "
            "Use criteria_parameters=None or omit criteria_parameters."
        )

    # The type is validated first because value validation depends on it.
    value_type = _try_parse_and_validate_value_type(gms_type)
    value = _try_parse_and_validate_value(criteria_parameters, value_type)

    self.criteria_parameters = value
    self.criteria_type = value_type
425
+
426
def _process_range_parameters_with_types(
    self,
    criteria_parameters: tuple,
    gms_types: tuple,
) -> None:
    """
    Validate and store a range using the explicit per-bound types from GMS.

    Args:
        criteria_parameters: The range as a ``(min, max)`` tuple.
        gms_types: The matching ``(min_type, max_type)`` tuple.

    Raises:
        SDKUsageError: If the current operator takes no parameters or no
            range, or ``criteria_parameters`` does not have exactly 2 items.
    """
    if _is_no_parameter_operator(self.operator):
        raise SDKUsageError(
            f"Range parameters should not be provided for operator {self.operator}"
        )
    if not _is_range_required_for_operator(self.operator):
        raise SDKUsageError(
            f"Operator {self.operator} does not support range parameters. "
            "Use a single value or criteria_parameters=None."
        )
    if len(criteria_parameters) != 2:
        raise SDKUsageError("Range parameters must be a tuple of exactly 2 values")

    lower_value, upper_value = criteria_parameters
    lower_gms_type, upper_gms_type = gms_types

    # Both types are validated before either value, as value validation
    # needs the validated type.
    lower_type = _try_parse_and_validate_value_type(lower_gms_type)
    upper_type = _try_parse_and_validate_value_type(upper_gms_type)
    lower = _try_parse_and_validate_value(lower_value, lower_type)
    upper = _try_parse_and_validate_value(upper_value, upper_type)

    self.criteria_parameters = (lower, upper)
    self.criteria_type = (lower_type, upper_type)
463
+
464
def _create_monitor_info(
    self,
    assertion_urn: AssertionUrn,
    status: models.MonitorStatusClass,
    schedule: models.CronScheduleClass,
) -> models.MonitorInfoClass:
    """
    Assemble the MonitorInfoClass for this assertion.

    Args:
        assertion_urn: Urn of the assertion being monitored.
        status: Monitor status to attach.
        schedule: Evaluation schedule to attach.

    Returns:
        A monitor of type ASSERTION holding a single evaluation spec whose
        parameters come from the configured detection mechanism.
    """
    source_type, field = self._convert_assertion_source_type_and_field()
    evaluation_parameters = self._get_assertion_evaluation_parameters(
        str(source_type), field
    )
    evaluation_spec = models.AssertionEvaluationSpecClass(
        assertion=str(assertion_urn),
        schedule=schedule,
        parameters=evaluation_parameters,
    )
    assertion_monitor = models.AssertionMonitorClass(
        assertions=[evaluation_spec],
        settings=None,
    )
    return models.MonitorInfoClass(
        type=models.MonitorTypeClass.ASSERTION,
        status=status,
        assertionMonitor=assertion_monitor,
    )
490
+
491
def _create_assertion_info(
    self, filter: Optional[models.DatasetFilterClass]
) -> AssertionInfoInputType:
    """
    Build the FieldAssertionInfoClass describing this column metric assertion.

    Args:
        filter: Optional dataset filter to attach to the assertion.

    Returns:
        A FieldAssertionInfoClass of type FIELD_METRIC for this dataset,
        with ``fieldValuesAssertion`` left unset.
    """
    # Resolve the column's field spec first, then the operator parameters.
    column_field_spec = self._get_schema_field_spec(self.column_name)
    std_parameters = self._create_assertion_parameters()

    metric_assertion = models.FieldMetricAssertionClass(
        field=column_field_spec,
        metric=self.metric_type,
        operator=self.operator,
        parameters=std_parameters,
    )

    return models.FieldAssertionInfoClass(
        type=models.FieldAssertionTypeClass.FIELD_METRIC,
        entity=str(self.dataset_urn),
        filter=filter,
        fieldMetricAssertion=metric_assertion,
        fieldValuesAssertion=None,  # This is a field metric assertion, not a field values one
    )
522
+
523
def _convert_schedule(self) -> models.CronScheduleClass:
    """
    Produce the cron schedule for this assertion.

    Returns:
        A new CronScheduleClass copying ``cron`` and ``timezone`` from
        ``self.schedule``, or the module-level
        DEFAULT_EVERY_SIX_HOURS_SCHEDULE object when no schedule is set.
    """
    if self.schedule is not None:
        return models.CronScheduleClass(
            cron=self.schedule.cron,
            timezone=self.schedule.timezone,
        )
    return DEFAULT_EVERY_SIX_HOURS_SCHEDULE
537
+
538
def _convert_schema_field_spec_to_freshness_field_spec(
    self, field_spec: models.SchemaFieldSpecClass
) -> models.FreshnessFieldSpecClass:
    """
    Build a FreshnessFieldSpecClass from a SchemaFieldSpecClass.

    The path, type and native type are copied over; the kind is always
    HIGH_WATERMARK.
    """
    freshness_field_spec = models.FreshnessFieldSpecClass(
        path=field_spec.path,
        type=field_spec.type,
        nativeType=field_spec.nativeType,
        kind=models.FreshnessFieldKindClass.HIGH_WATERMARK,
    )
    return freshness_field_spec
550
+
551
def _get_assertion_evaluation_parameters(
    self, source_type: str, field: Optional[FieldSpecType]
) -> models.AssertionEvaluationParametersClass:
    """
    Build the DATASET_FIELD evaluation parameters for the monitor.

    Args:
        source_type: The dataset field assertion source type, as a string.
        field: Optional field spec; a SchemaFieldSpecClass is converted to
            a FreshnessFieldSpecClass before being used as the changed-rows
            field.

    Returns:
        AssertionEvaluationParametersClass wrapping the dataset field
        parameters.
    """
    changed_rows_field = field
    if isinstance(changed_rows_field, models.SchemaFieldSpecClass):
        changed_rows_field = self._convert_schema_field_spec_to_freshness_field_spec(
            changed_rows_field
        )
    if changed_rows_field is not None:
        # Internal invariant: any field that reaches the monitor is a freshness spec.
        assert isinstance(changed_rows_field, models.FreshnessFieldSpecClass), (
            "Field must be FreshnessFieldSpecClass for monitor info"
        )

    dataset_field_parameters = models.DatasetFieldAssertionParametersClass(
        sourceType=source_type,
        changedRowsField=changed_rows_field,
    )
    return models.AssertionEvaluationParametersClass(
        type=models.AssertionEvaluationParametersTypeClass.DATASET_FIELD,
        datasetFieldParameters=dataset_field_parameters,
    )
571
+
572
def _convert_assertion_source_type_and_field(
    self,
) -> tuple[str, Optional[FieldSpecType]]:
    """
    Convert detection mechanism into source type and field specification for column metric assertions.

    Returns:
        A tuple of (source_type, field) where field may be None.
        Note that the source_type is a string, not a models.DatasetFieldAssertionSourceTypeClass (or other assertion source type) since
        the source type is not an enum in the code generated from the DatasetFieldSourceType enum in the PDL.

    Raises:
        SDKNotYetSupportedError: If the detection mechanism is not supported.
        SDKUsageError: If the field (column) is not found in the dataset,
            and the detection mechanism requires a field. Also if the field
            is not an allowed type for the detection mechanism.
    """
    mechanism = self.detection_mechanism
    source_types = models.DatasetFieldAssertionSourceTypeClass

    if isinstance(mechanism, _ChangedRowsQuery):
        # The changed-rows query needs a validated high watermark column.
        column_name = self._try_parse_and_validate_column_name_is_valid_type(
            mechanism.column_name,  # The high watermark column name
            allowed_column_types=HIGH_WATERMARK_ALLOWED_FIELD_TYPES,
        )
        return (
            source_types.CHANGED_ROWS_QUERY,
            self._get_schema_field_spec(column_name),
        )

    if isinstance(mechanism, _AllRowsQuery):
        # For query-based detection, we don't need a field specification
        # as the query itself defines what data to analyze
        return source_types.ALL_ROWS_QUERY, None

    if isinstance(mechanism, (_AllRowsQueryDataHubDatasetProfile, _DatasetProfile)):
        # Note: This is only valid on the all rows query
        return source_types.DATAHUB_DATASET_PROFILE, None

    # The supported-type names are only needed for the error message, so the
    # placeholder instances are built here instead of on every call.
    supported_detection_mechanisms = [
        _AllRowsQuery().type,
        _AllRowsQueryDataHubDatasetProfile().type,
        _ChangedRowsQuery(column_name="").type,
    ]
    raise SDKNotYetSupportedError(
        f"Detection mechanism {self.detection_mechanism} is not supported for column metric assertions, please use a supported detection mechanism: {', '.join(supported_detection_mechanisms)}"
    )
622
+
623
def _create_assertion_parameters(self) -> models.AssertionStdParametersClass:
    """
    Build the standard assertion parameters for the configured operator.

    Single-value operators produce a ``value`` parameter, range operators
    produce ``minValue``/``maxValue``, and no-parameter operators produce an
    empty parameters object. Values are serialized with ``str()``.

    Returns:
        An AssertionStdParametersClass with the appropriate parameters.

    Raises:
        SDKUsageError: If the stored parameters or types do not fit the
            operator, or the operator belongs to none of the known groups.
    """
    params = self.criteria_parameters
    param_type = self.criteria_type

    if self.operator in SINGLE_VALUE_OPERATORS:
        if params is None or isinstance(params, tuple):
            raise SDKUsageError(
                f"Single value is required for operator {self.operator}"
            )
        if param_type is None or isinstance(param_type, tuple):
            raise SDKUsageError(
                f"Single value type is required for operator {self.operator}"
            )
        single = models.AssertionStdParameterClass(
            value=str(params),
            type=param_type,
        )
        return models.AssertionStdParametersClass(value=single)

    if self.operator in RANGE_OPERATORS:
        if not isinstance(params, tuple):
            raise SDKUsageError(
                f"Range parameters are required for operator {self.operator}"
            )
        if not isinstance(param_type, tuple):
            raise SDKUsageError(
                f"Range type is required for operator {self.operator}"
            )
        lower = models.AssertionStdParameterClass(
            value=str(params[0]),
            type=param_type[0],
        )
        upper = models.AssertionStdParameterClass(
            value=str(params[1]),
            type=param_type[1],
        )
        return models.AssertionStdParametersClass(minValue=lower, maxValue=upper)

    if self.operator in NO_PARAMETER_OPERATORS:
        return models.AssertionStdParametersClass()

    raise SDKUsageError(f"Unsupported operator type: {self.operator}")
673
+
674
def _try_parse_and_validate_column_name_is_valid_type(
    self,
    column_name: str,
    allowed_column_types: list[
        models.DictWrapper
    ] = ALLOWED_COLUMN_TYPES_FOR_COLUMN_METRIC_ASSERTION,
) -> str:
    """
    Check that a column exists and has one of the allowed field types.

    The column's field spec is looked up and handed to
    ``self._validate_field_type`` together with the allowed types. See also
    getEligibleFieldColumns and related functions in the frontend.

    Args:
        column_name: Name of the column to check.
        allowed_column_types: Field types the column may have; defaults to
            the types allowed for column metric assertions.

    Returns:
        The column name, unchanged.
    """
    self._validate_field_type(
        self._get_schema_field_spec(column_name),
        column_name,
        allowed_column_types,
        "column metric assertion",
    )
    return column_name
693
+
694
def _assertion_type(self) -> str:
    """Get the assertion type, which is always ``models.AssertionTypeClass.FIELD`` here."""
    return models.AssertionTypeClass.FIELD
697
+
698
def _validate_field_type_and_operator_compatibility(
    self, column_name: str, operator: models.AssertionStdOperatorClass
) -> None:
    """Check that the operator is allowed for the column's field type.

    The allowed operators per field type come from
    FIELD_VALUES_OPERATOR_CONFIG (mirrors the frontend config of the same
    name); a field type missing from it allows no operators.

    Args:
        column_name: The name of the column to validate.
        operator: The operator to validate against.

    Raises:
        SDKUsageError: If the field type is not compatible with the operator.
    """
    field_type = self._get_schema_field_spec(column_name).type
    permitted_operators = FIELD_VALUES_OPERATOR_CONFIG.get(field_type, [])
    if operator in permitted_operators:
        return

    permitted_text = ", ".join(str(op) for op in permitted_operators)
    raise SDKUsageError(
        f"Operator {operator} is not allowed for field type {field_type} for column '{column_name}'. Allowed operators: {permitted_text}"
    )
718
+
719
def _validate_field_type_and_metric_type_compatibility(
    self, column_name: str, metric_type: models.FieldMetricTypeClass
) -> None:
    """Check that the metric type is allowed for the column's field type.

    The allowed metric types per field type come from
    FIELD_METRIC_TYPE_CONFIG (mirrors the frontend config of the same name).

    Args:
        column_name: The name of the column to validate.
        metric_type: The metric type to validate.

    Raises:
        SDKUsageError: If the field type has no entry in the config, or the
            metric type is not among the types allowed for it.
    """
    field_type = self._get_schema_field_spec(column_name).type

    if field_type in FIELD_METRIC_TYPE_CONFIG:
        permitted_metrics = FIELD_METRIC_TYPE_CONFIG[field_type]
        if metric_type in permitted_metrics:
            return
        permitted_text = ", ".join(str(mt) for mt in permitted_metrics)
        raise SDKUsageError(
            f"Metric type {metric_type} is not allowed for field type {field_type}. Allowed metric types: {permitted_text}"
        )

    raise SDKUsageError(
        f"Column {column_name} is of type {field_type}, which is not supported for column metric assertions"
    )
746
+
747
+
748
def _try_parse_and_validate_value_type(
    value_type: Optional[ValueTypeInputType],
) -> models.AssertionStdParameterTypeClass:
    """
    Parse a value type into an AssertionStdParameterTypeClass member.

    Args:
        value_type: The value type to parse; must not be None.

    Returns:
        Whatever ``_try_parse_and_validate_schema_classes_enum`` returns for
        the given value type and ``models.AssertionStdParameterTypeClass``.

    Raises:
        SDKUsageError: If ``value_type`` is None.
    """
    if value_type is not None:
        return _try_parse_and_validate_schema_classes_enum(
            value_type, models.AssertionStdParameterTypeClass
        )
    raise SDKUsageError("Value type is required")
757
+
758
+
759
def _deserialize_json_value(value: ValueInputType) -> ValueInputType:
    """
    Deserialize a value that might be a JSON-encoded scalar.

    Only JSON strings and numbers are unwrapped. If the input decodes to
    anything else (``true``/``false``, ``null``, an array or an object) the
    original string is returned unchanged, so the result is always a
    ``str``, ``int`` or ``float`` as the return annotation promises.
    Previously e.g. ``"true"`` became ``True``, which then passed the number
    check because ``bool`` subclasses ``int``.

    Args:
        value: The value to deserialize, potentially a JSON string.

    Returns:
        The decoded string/number, or the original value if it is not a
        string, is not valid JSON, or decodes to a non-scalar/boolean.
    """
    if not isinstance(value, str):
        return value

    try:
        decoded = json.loads(value)
    except json.JSONDecodeError:
        return value

    # bool must be excluded explicitly: isinstance(True, int) is True.
    if isinstance(decoded, bool) or not isinstance(decoded, (str, int, float)):
        return value
    return decoded
775
+
776
+
777
def _convert_string_to_number(value: str) -> Union[int, float]:
    """
    Convert a string to a number (int or float).

    Integer parsing is tried first and float parsing is the fallback, so
    forms without a decimal point such as ``"1e3"`` are accepted (the
    previous ``"." in value`` test sent them to ``int()``, which rejected
    them). Non-finite results (``nan``, ``inf``, overflow) are rejected.

    Args:
        value: The string value to convert.

    Returns:
        An ``int`` when the string is an integer literal, otherwise a finite
        ``float``.

    Raises:
        ValueError: If the string cannot be converted to a finite number.
    """
    import math  # local import: only this helper needs it

    try:
        return int(value)
    except ValueError:
        pass  # not an integer literal; fall back to float parsing below

    number = float(value)
    if not math.isfinite(number):
        raise ValueError(f"could not convert string to a finite number: {value!r}")
    return number
793
+
794
+
795
def _validate_number_type(
    value: ValueInputType, original_value: ValueInputType
) -> ValueInputType:
    """
    Validate and convert a value to a number type.

    Booleans are rejected even though ``bool`` subclasses ``int``, and
    non-finite floats (NaN, infinity) are rejected because they are not
    usable as a threshold. Strings are converted with
    ``_convert_string_to_number``.

    Args:
        value: The deserialized value to validate.
        original_value: The original input value for error messages.

    Returns:
        The validated number value.

    Raises:
        SDKUsageError: If the value is not, or cannot be converted to, a
            finite number.
    """
    import math  # local import: only this helper needs it

    # Must come before the int/float check: isinstance(True, int) is True.
    if isinstance(value, bool):
        raise SDKUsageError(f"Invalid value: {original_value}, must be a number")

    if isinstance(value, (int, float)):
        # isfinite is only applied to floats; very large ints would overflow it.
        if isinstance(value, float) and not math.isfinite(value):
            raise SDKUsageError(f"Invalid value: {original_value}, must be a number")
        return value

    if isinstance(value, str):
        try:
            return _convert_string_to_number(value)
        except ValueError as e:
            raise SDKUsageError(
                f"Invalid value: {original_value}, must be a number"
            ) from e

    raise SDKUsageError(f"Invalid value: {original_value}, must be a number")
823
+
824
+
825
def _validate_string_type(
    value: ValueInputType, original_value: ValueInputType
) -> ValueInputType:
    """
    Ensure a value is a string and hand it back unchanged.

    Args:
        value: The deserialized value to validate.
        original_value: The original input value, used in the error message.

    Returns:
        ``value`` itself when it is a ``str``.

    Raises:
        SDKUsageError: If ``value`` is not a ``str``.
    """
    if isinstance(value, str):
        return value
    raise SDKUsageError(f"Invalid value: {original_value}, must be a string")
844
+
845
+
846
def _validate_unsupported_types(value_type: ValueTypeInputType) -> None:
    """
    Reject value types that column metric assertions cannot use.

    Args:
        value_type: The value type to check.

    Raises:
        SDKNotYetSupportedError: If the value type is LIST or SET.
        SDKUsageError: If the value type is not NUMBER, STRING or UNKNOWN.
    """
    parameter_types = models.AssertionStdParameterTypeClass

    if value_type in (parameter_types.LIST, parameter_types.SET):
        raise SDKNotYetSupportedError(
            "List and set value types are not supported for column metric assertions"
        )

    accepted_types = {
        parameter_types.NUMBER,
        parameter_types.STRING,
        parameter_types.UNKNOWN,
    }
    if value_type in accepted_types:
        return

    raise SDKUsageError(
        f"Invalid value type: {value_type}, valid options are {get_enum_options(models.AssertionStdParameterTypeClass)}"
    )
875
+
876
+
877
def _try_parse_and_validate_value(
    value: Optional[ValueInputType],
    value_type: ValueTypeInputType,
) -> ValueInputType:
    """
    Parse a value and check it against the expected parameter type.

    The value is first passed through ``_deserialize_json_value`` and the
    result is then validated according to ``value_type``:

    - NUMBER: validated by ``_validate_number_type``.
    - STRING: validated by ``_validate_string_type``.
    - UNKNOWN: returned as deserialized, without further checks.
    - anything else: checked by ``_validate_unsupported_types``.

    Args:
        value: The value to parse and validate.
        value_type: The expected type of the value.

    Returns:
        The validated and potentially converted value.

    Raises:
        SDKUsageError: If the value is None, invalid, or cannot be converted.
        SDKNotYetSupportedError: If the value type is not supported.
    """
    if value is None:
        raise SDKUsageError("Value parameter is required for the chosen operator")

    # Deserialize first; the raw input is kept for the validators' error messages.
    parsed_value = _deserialize_json_value(value)
    parameter_types = models.AssertionStdParameterTypeClass

    if value_type == parameter_types.NUMBER:
        return _validate_number_type(parsed_value, value)

    if value_type == parameter_types.STRING:
        return _validate_string_type(parsed_value, value)

    if value_type == parameter_types.UNKNOWN:
        # Accept any type for unknown
        return parsed_value

    _validate_unsupported_types(value_type)
    return parsed_value
911
+
912
+
913
def _is_range_required_for_operator(
    operator: models.AssertionStdOperatorClass,
) -> bool:
    """Return whether ``operator`` is a member of ``RANGE_OPERATORS``."""
    needs_range = operator in RANGE_OPERATORS
    return needs_range
915
+
916
+
917
def _is_value_required_for_operator(
    operator: models.AssertionStdOperatorClass,
) -> bool:
    """Return whether ``operator`` is a member of ``SINGLE_VALUE_OPERATORS``."""
    needs_single_value = operator in SINGLE_VALUE_OPERATORS
    return needs_single_value
919
+
920
+
921
def _is_no_parameter_operator(
    operator: models.AssertionStdOperatorClass,
) -> bool:
    """Return whether ``operator`` is a member of ``NO_PARAMETER_OPERATORS``."""
    takes_no_parameter = operator in NO_PARAMETER_OPERATORS
    return takes_no_parameter
923
+
924
+
925
def _try_parse_and_validate_range_type(
    range_type: Optional[RangeTypeInputType] = None,
) -> RangeTypeParsedType:
    """
    Normalize a range type into a ``(start_type, end_type)`` pair.

    Args:
        range_type: ``None``, a single type applied to both ends of the range,
            or a tuple whose first two items are the start and end types.

    Returns:
        A 2-tuple of parameter types. ``None`` yields ``(UNKNOWN, UNKNOWN)``;
        every other input is passed through
        ``_try_parse_and_validate_schema_classes_enum``.
    """
    parameter_types = models.AssertionStdParameterTypeClass

    if range_type is None:
        return (parameter_types.UNKNOWN, parameter_types.UNKNOWN)

    if not isinstance(range_type, tuple):
        # Single value, we assume the same type for start and end:
        shared_type = _try_parse_and_validate_schema_classes_enum(
            range_type, parameter_types
        )
        return shared_type, shared_type

    start_type = _try_parse_and_validate_schema_classes_enum(
        range_type[0], parameter_types
    )
    end_type = _try_parse_and_validate_schema_classes_enum(
        range_type[1], parameter_types
    )
    return (start_type, end_type)
947
+
948
+
949
def _try_parse_and_validate_range(
    range: Optional[RangeInputType],
    range_type: RangeTypeParsedType,
    operator: models.AssertionStdOperatorClass,
) -> RangeInputType:
    """
    Validate both ends of a range against their expected types.

    Args:
        range: The (start, end) pair to validate. A missing range is always
            rejected.
        range_type: The (start_type, end_type) pair; each end of ``range`` is
            validated against the type at the same position.
        operator: The operator the range belongs to; used for the
            "range required" check and in the error message.

    Returns:
        A ``(start, end)`` tuple with both values validated by
        ``_try_parse_and_validate_value``.

    Raises:
        SDKUsageError: If ``range`` is None, or if ``range_type`` is None for
            an operator that requires a range. Errors raised by
            ``_try_parse_and_validate_value`` propagate unchanged.
    """
    range_is_missing = range is None
    range_type_is_missing = range_type is None

    if (
        range_is_missing or range_type_is_missing
    ) and _is_range_required_for_operator(operator):
        raise SDKUsageError(f"Range is required for operator {operator}")

    # A missing range is rejected even for operators that do not require one.
    if range is None:
        raise SDKUsageError(f"Range is required for operator {operator}")

    validated_start = _try_parse_and_validate_value(range[0], range_type[0])
    validated_end = _try_parse_and_validate_value(range[1], range_type[1])

    return (validated_start, validated_end)