acryl-datahub-cloud 0.3.10.4__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub-cloud might be problematic.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/_sdk_extras/__init__.py +4 -0
- acryl_datahub_cloud/_sdk_extras/assertion.py +15 -0
- acryl_datahub_cloud/_sdk_extras/assertions_client.py +23 -0
- acryl_datahub_cloud/acryl_cs_issues/acryl_customer.py +1 -1
- acryl_datahub_cloud/action_request/action_request_owner_source.py +1 -2
- acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +3 -7
- acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +9 -5
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +14 -32
- acryl_datahub_cloud/metadata/_urns/urn_defs.py +1849 -1793
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/monitor/__init__.py +6 -0
- acryl_datahub_cloud/metadata/schema.avsc +25538 -25429
- acryl_datahub_cloud/metadata/schema_classes.py +861 -676
- acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +25 -0
- acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +25 -0
- acryl_datahub_cloud/metadata/schemas/DataContractKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/DataTransformLogic.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- acryl_datahub_cloud/metadata/schemas/MetadataChangeLog.avsc +3 -0
- acryl_datahub_cloud/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- acryl_datahub_cloud/metadata/schemas/MonitorAnomalyEvent.avsc +36 -26
- acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +58 -0
- acryl_datahub_cloud/metadata/schemas/QueryProperties.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/SystemMetadata.avsc +86 -0
- {acryl_datahub_cloud-0.3.10.4.dist-info → acryl_datahub_cloud-0.3.11.dist-info}/METADATA +43 -43
- {acryl_datahub_cloud-0.3.10.4.dist-info → acryl_datahub_cloud-0.3.11.dist-info}/RECORD +33 -28
- {acryl_datahub_cloud-0.3.10.4.dist-info → acryl_datahub_cloud-0.3.11.dist-info}/WHEEL +1 -1
- {acryl_datahub_cloud-0.3.10.4.dist-info → acryl_datahub_cloud-0.3.11.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_cloud-0.3.10.4.dist-info → acryl_datahub_cloud-0.3.11.dist-info}/top_level.txt +0 -0

acryl_datahub_cloud/_sdk_extras/assertion.py
@@ -0,0 +1,15 @@
+"""
+This file contains the Assertion class, which is used to represent an assertion in DataHub.
+
+The Assertion class is currently not implemented, this is a placeholder for future implementation.
+"""
+
+from typing import Union
+
+from datahub.metadata.urns import AssertionUrn
+
+
+class Assertion:
+    def __init__(self, urn: Union[str, AssertionUrn]):
+        print(f"The Assertion class is currently not implemented. Urn provided: {urn}")
+        self.urn = AssertionUrn(urn)

acryl_datahub_cloud/_sdk_extras/assertions_client.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Union
+
+from acryl_datahub_cloud._sdk_extras.assertion import Assertion
+from datahub.metadata.urns import AssertionUrn
+
+if TYPE_CHECKING:
+    from datahub.sdk.main_client import DataHubClient
+
+
+class AssertionsClient:
+    def __init__(self, client: DataHubClient):
+        self.client = client
+
+    def get_assertions(
+        self, urn: Union[str, list[str], AssertionUrn, list[AssertionUrn]]
+    ) -> list[Assertion]:
+        print(
+            "get_assertions is not implemented, this is a placeholder. Returning empty list."
+        )
+        print(f"urn provided: {urn}")
+        return []
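
Note: both new modules are explicit placeholders. A minimal sketch of how this surface could be exercised, assuming the DataHubClient.from_env() constructor from the datahub SDK (the urn value is illustrative):

    from datahub.sdk.main_client import DataHubClient

    from acryl_datahub_cloud._sdk_extras.assertions_client import AssertionsClient

    # from_env() reads server/token configuration from the environment.
    client = DataHubClient.from_env()
    assertions_client = AssertionsClient(client)

    # The stubs only print placeholder messages; get_assertions always
    # returns an empty list.
    result = assertions_client.get_assertions("urn:li:assertion:example-id")
    assert result == []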

acryl_datahub_cloud/acryl_cs_issues/acryl_customer.py
@@ -219,6 +219,7 @@ class AcrylCustomer:
         self._emitted_containers: Dict[str, bool] = {}
 
     def _get_owner_from_assignee(self, assignee: ExternalUser) -> str:
+        assert assignee.email, "Assignee must have an email"
         owner_urn_options = [
             urn
             for urn in self.graph.get_urns_by_filter(
@@ -233,7 +234,6 @@ class AcrylCustomer:
             )
         ]
         if not owner_urn_options:
-            assert assignee.email, "Assignee must have an email"
             owner_urn = "urn:li:corpuser:" + assignee.email
             self.graph.emit(
                 MetadataChangeProposalWrapper(
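
Note: hoisting the assert to the top of _get_owner_from_assignee makes a missing email fail fast on every call, instead of only on the fallback path taken when no existing owner urn is found.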

acryl_datahub_cloud/action_request/action_request_owner_source.py
@@ -2,6 +2,7 @@ import logging
 from typing import Dict, Iterable, List, Optional
 
 from datahub.configuration import ConfigModel
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -14,10 +15,8 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     ActionRequestInfoClass,
 )
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.utilities.urns.urn import guess_entity_type
 
-
 logger = logging.getLogger(__name__)
 
 

acryl_datahub_cloud/datahub_reporting/datahub_dataset.py
@@ -163,7 +163,7 @@ class DataHubBasedS3Dataset:
             self.schema = row.arrow_schema()
         else:
             # hail mary: infer schema from the first row and cast everything to string
-            self.schema = pa.schema([(key, pa.string()) for key in row
+            self.schema = pa.schema([(key, pa.string()) for key in row])
             self.stringify_row = True
 
         self._initialize_local_file()
@@ -172,7 +172,7 @@ class DataHubBasedS3Dataset:
             self.schema,
             compression=self.config.file_compression,
         )
-        if isinstance(row, BaseModel
+        if isinstance(row, (BaseModel, BaseModelRow)):
             # for anything extending BaseModel, we want to use the dict representation
             write_row: Dict[str, Any] = row.dict()
         elif isinstance(row, dict):
@@ -274,11 +274,7 @@ class DataHubBasedS3Dataset:
         self, duckdb_columns: List[Tuple[str, str]]
     ) -> SchemaMetadataClass:
         def get_type_from_dtype(dtype: str) -> SchemaFieldDataTypeClass:
-            if "int" in dtype:
-                return SchemaFieldDataTypeClass(type=NumberTypeClass())
-            elif "float" in dtype:
-                return SchemaFieldDataTypeClass(type=NumberTypeClass())
-            elif "number" in dtype:
+            if "int" in dtype or "float" in dtype or "number" in dtype:
                 return SchemaFieldDataTypeClass(type=NumberTypeClass())
             elif "bool" in dtype:
                 return SchemaFieldDataTypeClass(type=BooleanTypeClass())

acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py
@@ -5,6 +5,9 @@ from enum import Enum
 from typing import Any, Callable, Dict, Iterable, List, Optional
 
 import pandas as pd
+from pydantic import BaseModel
+
+from acryl_datahub_cloud.elasticsearch.graph_service import BaseModelRow
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.graph.client import DataHubGraph
 from datahub.ingestion.graph.filters import RawSearchFilterRule
@@ -16,9 +19,6 @@ from datahub.metadata.schema_classes import (
     FormStateClass,
     FormTypeClass,
 )
-from pydantic import BaseModel
-
-from acryl_datahub_cloud.elasticsearch.graph_service import BaseModelRow
 
 logger = logging.getLogger(__name__)
 
@@ -257,6 +257,7 @@ class DataHubFormReportingData(FormData):
             for prompt_id, response_time in zip(
                 search_row.completedFormsCompletedPromptIds,
                 search_row.completedFormsCompletedPromptResponseTimes,
+                strict=False,
             )
             if prompt_id in form_prompts
         }
@@ -289,9 +290,10 @@ class DataHubFormReportingData(FormData):
         on_asset_scanned: Optional[Callable[[str], Any]] = None,
         on_form_scanned: Optional[Callable[[str], Any]] = None,
     ) -> Iterable[FormReportingRow]:
-        extra_fields = [f for f in self.DataHubDatasetSearchRow.__fields__
+        extra_fields = [f for f in self.DataHubDatasetSearchRow.__fields__]
+        # TODO: Replace with the new search/filter SDK.
         result = self.graph.get_results_by_filter(
-            extra_or_filters=self.get_form_existence_or_filters(),
+            extra_or_filters=[{"and": self.get_form_existence_or_filters()}],
             extra_source_fields=extra_fields,
             skip_cache=True,
         )
@@ -388,6 +390,7 @@ class DataHubFormReportingData(FormData):
                 for (p, p_response_time) in zip(
                     search_row.incompleteFormsCompletedPromptIds,
                     search_row.incompleteFormsCompletedPromptResponseTimes,
+                    strict=False,
                 )
                 if p in form_prompts
             ]:
@@ -485,6 +488,7 @@ class DataHubFormReportingData(FormData):
                 for (p, p_response_time) in zip(
                     search_row.completedFormsCompletedPromptIds,
                     search_row.completedFormsCompletedPromptResponseTimes,
+                    strict=False,
                 )
                 if p in form_prompts
             ]:
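
Note: zip() has accepted a strict keyword since Python 3.10, and strict=False is the default; passing it explicitly documents that mismatched lengths are tolerated. For example:

    ids = ["p1", "p2", "p3"]
    times = [100, 200]  # mismatched length

    # strict=False (the default) silently truncates to the shorter input...
    assert list(zip(ids, times, strict=False)) == [("p1", 100), ("p2", 200)]

    # ...whereas strict=True would raise ValueError on the length mismatch.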

acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py
@@ -395,18 +395,10 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 "last_modified_at": (
                     doc["_source"]["lastModifiedAt"]
                     if "lastModifiedAt" in doc["_source"]
-                    else (
-                        doc["_source"]["lastModifiedAt"]
-                        if "lastModifiedAt" in doc["_source"]
-                        else None
-                    )
+                    else (doc["_source"].get("lastModifiedAt", None))
                 ),
                 "platform": doc["_source"]["platform"],
-                "removed": (
-                    doc["_source"]["removed"]
-                    if "removed" in doc["_source"]
-                    else False
-                ),
+                "removed": (doc["_source"].get("removed", False)),
             }
 
         time_taken = timer.elapsed_seconds()
@@ -509,11 +501,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 "eventGranularity": doc["_source"].get("eventGranularity"),
                 "totalSqlQueries": doc["_source"].get("totalSqlQueries", 0),
                 "uniqueUserCount": doc["_source"].get("uniqueUserCount", 0),
-                "userCounts": (
-                    doc["_source"]["event"]["userCounts"]
-                    if "userCounts" in doc["_source"]["event"]
-                    else None
-                ),
+                "userCounts": (doc["_source"]["event"].get("userCounts", None)),
                 "platform": platform,
             }
         except KeyError as e:
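
Note: these rewrites rely on the equivalence between a membership-tested subscript and dict.get with a default:

    source = {"platform": "snowflake"}

    # The conditional form and .get() produce the same result.
    removed_old = source["removed"] if "removed" in source else False
    removed_new = source.get("removed", False)
    assert removed_old == removed_new
    assert removed_new is False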
@@ -525,7 +513,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         time_taken = timer.elapsed_seconds()
         logger.info(f"DatasetUsage processing took {time_taken:.3f} seconds")
 
-    def search_score(
+    def search_score( # noqa: C901
         self, urn: str, last_update_time: int, usage_percentile: int
     ) -> SearchRankingMultipliers:
         usage_search_score_multiplier = 1.0
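
Note: C901 is the mccabe "function is too complex" check surfaced through flake8/ruff; the suppression accepts the method's complexity rather than splitting it up.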
@@ -622,10 +610,10 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             [endpoint],
             http_auth=(user, password),
             use_ssl=(
-
-
-
-
+                bool(
+                    self.config.search_index
+                    and self.config.search_index.use_ssl
+                )
             ),
         )
 
@@ -639,10 +627,10 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             [endpoint],
             http_auth=(user, password),
             use_ssl=(
-
-
-
-
+                bool(
+                    self.config.search_index
+                    and self.config.search_index.use_ssl
+                )
             ),
         )
 
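
Note: the and operator returns one of its operands rather than a strict boolean, so bool() normalizes the value handed to use_ssl when search_index is unset:

    search_index = None  # search index not configured

    # `and` short-circuits to the falsy operand itself...
    assert (search_index and True) is None
    # ...so bool() is needed to produce a real False for use_ssl.
    assert bool(search_index and True) is False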
@@ -737,7 +725,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             polars.Duration(): pa.duration("ns"),
         }
 
-        if polars_dtype in [type(key) for key in type_mapping
+        if polars_dtype in [type(key) for key in type_mapping]:
             return type_mapping[polars_dtype]
         elif polars_dtype == polars.Categorical:
             return pa.dictionary(index_type=pa.int32(), value_type=pa.string())
@@ -1006,12 +994,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     def generate_mcp_from_lazyframe(
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
-        num = 0
         for row in lazy_frame.collect(
             streaming=self.config.experimental_full_streaming
         ).to_struct():
-            num += 1
-
             if "siblings" in row and row["siblings"]:
                 logger.info(f"Siblings found for urn: {row['urn']} -> row['siblings']")
 
@@ -1101,10 +1086,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     def generate_query_usage_mcp_from_lazyframe(
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
-        num = 0
         for row in lazy_frame.collect().iter_rows(named=True):
-            num += 1
-
             query_usage_features = QueryUsageFeaturesClass(
                 queryCountLast30Days=int(row.get("totalSqlQueries", 0) or 0),
                 queryCountTotal=None, # This is not implemented
@@ -1287,7 +1269,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             .is_not_null()
             # We only want to downrank datasets that have a search score multiplier greater than 1. 1 is the minimum score of a dataset
             .and_(polars.col("combinedSearchRankingMultiplier").ne(1))
-        )
+            )
             .filter(polars.col("removed") == False) # noqa: E712
             .drop(["removed"])
             .drop(["last_modified_at"])
|