acryl-datahub 1.0.0.1rc6__py3-none-any.whl → 1.0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/METADATA +2557 -2557
- {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/RECORD +81 -79
- datahub/_version.py +1 -1
- datahub/api/entities/datajob/dataflow.py +15 -0
- datahub/api/entities/datajob/datajob.py +17 -0
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataset/dataset.py +2 -2
- datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
- datahub/cli/ingest_cli.py +4 -4
- datahub/cli/migrate.py +6 -6
- datahub/configuration/common.py +1 -1
- datahub/emitter/mcp_builder.py +4 -0
- datahub/errors.py +4 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/source.py +6 -2
- datahub/ingestion/api/source_helpers.py +35 -2
- datahub/ingestion/graph/client.py +122 -7
- datahub/ingestion/graph/filters.py +41 -16
- datahub/ingestion/run/pipeline.py +0 -6
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/cassandra/cassandra.py +1 -10
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/fivetran/fivetran.py +1 -0
- datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
- datahub/ingestion/source/hex/constants.py +5 -0
- datahub/ingestion/source/hex/hex.py +150 -22
- datahub/ingestion/source/hex/mapper.py +28 -2
- datahub/ingestion/source/hex/model.py +10 -2
- datahub/ingestion/source/hex/query_fetcher.py +300 -0
- datahub/ingestion/source/iceberg/iceberg.py +106 -18
- datahub/ingestion/source/kafka/kafka.py +1 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
- datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
- datahub/ingestion/source/looker/looker_source.py +2 -3
- datahub/ingestion/source/mlflow.py +6 -7
- datahub/ingestion/source/mode.py +2 -2
- datahub/ingestion/source/nifi.py +3 -3
- datahub/ingestion/source/openapi.py +3 -3
- datahub/ingestion/source/openapi_parser.py +8 -8
- datahub/ingestion/source/powerbi/config.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +16 -3
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/sigma/sigma.py +6 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
- datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
- datahub/ingestion/source/sql/trino.py +4 -3
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/superset.py +108 -81
- datahub/ingestion/source/tableau/tableau.py +4 -4
- datahub/ingestion/source/tableau/tableau_common.py +2 -2
- datahub/ingestion/source/unity/source.py +1 -1
- datahub/ingestion/source/vertexai/vertexai.py +7 -7
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/_schema_classes.py +47 -2
- datahub/metadata/_urns/urn_defs.py +56 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +121 -85
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- datahub/metadata/schemas/QueryProperties.avsc +4 -2
- datahub/metadata/schemas/SystemMetadata.avsc +86 -0
- datahub/sdk/search_client.py +81 -8
- datahub/sdk/search_filters.py +73 -11
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +6 -6
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/superset.py
CHANGED

@@ -1,5 +1,6 @@
 import json
 import logging
+import os
 from dataclasses import dataclass, field
 from datetime import datetime
 from functools import lru_cache

@@ -100,6 +101,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
 from datahub.utilities import config_clean
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
+from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

 logger = logging.getLogger(__name__)

@@ -210,6 +212,11 @@ class SupersetConfig(
         default=10, description="Timeout of single API call to superset."
     )

+    max_threads: int = Field(
+        default_factory=lambda: os.cpu_count() or 40,
+        description="Max parallelism for API calls. Defaults to cpuCount or 40",
+    )
+
     # TODO: Check and remove this if no longer needed.
     # Config database_alias is removed from sql sources.
     database_alias: Dict[str, str] = Field(

@@ -339,6 +346,7 @@ class SupersetSource(StatefulIngestionSourceBase):

         if response.status_code != 200:
             logger.warning(f"Failed to get {entity_type} data: {response.text}")
+            continue

         payload = response.json()
         # Update total_items with the actual count from the response
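The new `max_threads` field above caps the number of parallel Superset API calls and defaults to `os.cpu_count() or 40`. A hedged sketch of overriding it from a recipe passed to the Python pipeline runner; every value other than `max_threads` (URL, credentials, sink) is an illustrative placeholder, not taken from this diff:

    from datahub.ingestion.run.pipeline import Pipeline

    recipe = {
        "source": {
            "type": "superset",
            "config": {
                "connect_uri": "http://localhost:8088",  # placeholder Superset URL
                "username": "admin",                     # placeholder credentials
                "password": "admin",
                "max_threads": 8,  # new in this release; default is os.cpu_count() or 40
            },
        },
        "sink": {"type": "console"},
    }

    Pipeline.create(recipe).run()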
@@ -501,33 +509,41 @@ class SupersetSource(StatefulIngestionSourceBase):

         return dashboard_snapshot

-    def
-
-
-
-
-
-
-
-                        f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
-                    )
-                    continue
-
-                dashboard_snapshot = self.construct_dashboard_from_api_data(
-                    dashboard_data
-                )
-            except Exception as e:
-                self.report.warning(
-                    f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
+    def _process_dashboard(self, dashboard_data: Any) -> Iterable[MetadataWorkUnit]:
+        dashboard_title = ""
+        try:
+            dashboard_id = str(dashboard_data.get("id"))
+            dashboard_title = dashboard_data.get("dashboard_title", "")
+            if not self.config.dashboard_pattern.allowed(dashboard_title):
+                self.report.report_dropped(
+                    f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
                 )
-
-
-
-
-
-
-
+                return
+            dashboard_snapshot = self.construct_dashboard_from_api_data(dashboard_data)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
+            )
+            return
+        mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
+        yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
+        yield from self._get_domain_wu(
+            title=dashboard_title, entity_urn=dashboard_snapshot.urn
+        )
+
+    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
+        dashboard_data_list = [
+            (dashboard_data,)
+            for dashboard_data in self.paginate_entity_api_results(
+                "dashboard/", PAGE_SIZE
             )
+        ]
+
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_dashboard,
+            args_list=dashboard_data_list,
+            max_workers=self.config.max_threads,
+        )

     def build_input_fields(
         self,

@@ -762,40 +778,46 @@ class SupersetSource(StatefulIngestionSourceBase):
             entity_urn=chart_urn,
         )

-    def
-
-
-
-
-
-
-
-
+    def _process_chart(self, chart_data: Any) -> Iterable[MetadataWorkUnit]:
+        chart_name = ""
+        try:
+            chart_id = str(chart_data.get("id"))
+            chart_name = chart_data.get("slice_name", "")
+            if not self.config.chart_pattern.allowed(chart_name):
+                self.report.report_dropped(
+                    f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
+                )
+                return
+            if self.config.dataset_pattern != AllowDenyPattern.allow_all():
+                datasource_id = chart_data.get("datasource_id")
+                if datasource_id:
+                    dataset_response = self.get_dataset_info(datasource_id)
+                    dataset_name = dataset_response.get("result", {}).get(
+                        "table_name", ""
                     )
-
-
-
-
-
-                if datasource_id:
-                    dataset_response = self.get_dataset_info(datasource_id)
-                    dataset_name = dataset_response.get("result", {}).get(
-                        "table_name", ""
+                    if dataset_name and not self.config.dataset_pattern.allowed(
+                        dataset_name
+                    ):
+                        self.report.warning(
+                            f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
                         )
+            yield from self.construct_chart_from_chart_data(chart_data)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
+            )
+            return

-
-
-
-
-
-
-
-
-
-
-                    f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
-                )
-                continue
+    def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
+        chart_data_list = [
+            (chart_data,)
+            for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE)
+        ]
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_chart,
+            args_list=chart_data_list,
+            max_workers=self.config.max_threads,
+        )

     def gen_schema_fields(self, column_data: List[Dict[str, str]]) -> List[SchemaField]:
         schema_fields: List[SchemaField] = []

@@ -1023,33 +1045,38 @@ class SupersetSource(StatefulIngestionSourceBase):

         return dataset_snapshot

-    def
-
-
-
-
-
-
-                self.report.report_dropped(
-                    f"Dataset '{dataset_name}' filtered by dataset_pattern"
-                )
-                continue
-
-            dataset_snapshot = self.construct_dataset_from_dataset_data(
-                dataset_data
-            )
-            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-        except Exception as e:
-            self.report.warning(
-                f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
+    def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
+        dataset_name = ""
+        try:
+            dataset_name = dataset_data.get("table_name", "")
+            if not self.config.dataset_pattern.allowed(dataset_name):
+                self.report.report_dropped(
+                    f"Dataset '{dataset_name}' filtered by dataset_pattern"
                 )
-
-
-
-
-
-
+                return
+            dataset_snapshot = self.construct_dataset_from_dataset_data(dataset_data)
+            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
            )
+            return
+        yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
+        yield from self._get_domain_wu(
+            title=dataset_data.get("table_name", ""),
+            entity_urn=dataset_snapshot.urn,
+        )
+
+    def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
+        dataset_data_list = [
+            (dataset_data,)
+            for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE)
+        ]
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_dataset,
+            args_list=dataset_data_list,
+            max_workers=self.config.max_threads,
+        )

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         if self.config.ingest_dashboards:
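The three refactors above share one pattern: the per-entity for-loop becomes a `_process_*` worker that handles a single API result, and `emit_*_mces` wraps every result in a one-element args tuple and fans the work out through `ThreadedIteratorExecutor.process(worker_func=..., args_list=..., max_workers=self.config.max_threads)`. A minimal standard-library sketch of the same fan-out idea; `process_item` and `threaded_fanout` are illustrative names, not DataHub APIs:

    import concurrent.futures
    from typing import Callable, Iterable, Iterator, Sequence, Tuple


    def process_item(item: int) -> Iterable[str]:
        # Stand-in worker, analogous to SupersetSource._process_dashboard:
        # it receives one API result and yields zero or more work units.
        yield f"workunit-{item}"


    def threaded_fanout(
        worker: Callable[..., Iterable[str]],
        args_list: Sequence[Tuple],
        max_workers: int,
    ) -> Iterator[str]:
        # Same call shape as ThreadedIteratorExecutor.process(worker_func=..., args_list=..., max_workers=...):
        # each args tuple becomes one worker invocation; results are flattened into a single stream.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = [pool.submit(lambda a=args: list(worker(*a))) for args in args_list]
            for future in concurrent.futures.as_completed(futures):
                yield from future.result()


    if __name__ == "__main__":
        items = [(i,) for i in range(10)]  # analogous to [(dashboard_data,) for ...]
        for wu in threaded_fanout(process_item, items, max_workers=4):
            print(wu)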
datahub/ingestion/source/tableau/tableau.py
CHANGED

@@ -1623,7 +1623,7 @@ class TableauSiteSource:
         # if multiple project has name C. Ideal solution is to use projectLuidWithin to avoid duplicate project,
         # however Tableau supports projectLuidWithin in Tableau Cloud June 2022 / Server 2022.3 and later.
         project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
-        if project_luid not in self.tableau_project_registry
+        if project_luid not in self.tableau_project_registry:
             wrk_name: Optional[str] = workbook.get(c.NAME)
             wrk_id: Optional[str] = workbook.get(c.ID)
             prj_name: Optional[str] = workbook.get(c.PROJECT_NAME)

@@ -2253,7 +2253,7 @@ class TableauSiteSource:
         # It is possible due to https://github.com/tableau/server-client-python/issues/1210
         if (
             ds.get(c.LUID)
-            and ds[c.LUID] not in self.datasource_project_map
+            and ds[c.LUID] not in self.datasource_project_map
             and self.report.get_all_datasources_query_failed
         ):
             logger.debug(

@@ -2265,7 +2265,7 @@ class TableauSiteSource:

         if (
             ds.get(c.LUID)
-            and ds[c.LUID] in self.datasource_project_map
+            and ds[c.LUID] in self.datasource_project_map
             and self.datasource_project_map[ds[c.LUID]] in self.tableau_project_registry
         ):
             return self.datasource_project_map[ds[c.LUID]]

@@ -3252,7 +3252,7 @@ class TableauSiteSource:

         parent_key = None
         project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
-        if project_luid and project_luid in self.tableau_project_registry
+        if project_luid and project_luid in self.tableau_project_registry:
             parent_key = self.gen_project_key(project_luid)
         else:
             workbook_id: Optional[str] = workbook.get(c.ID)
datahub/ingestion/source/tableau/tableau_common.py
CHANGED

@@ -774,7 +774,7 @@ def get_overridden_info(
    if (
        lineage_overrides is not None
        and lineage_overrides.platform_override_map is not None
-       and original_platform in lineage_overrides.platform_override_map
+       and original_platform in lineage_overrides.platform_override_map
    ):
        platform = lineage_overrides.platform_override_map[original_platform]

@@ -782,7 +782,7 @@ def get_overridden_info(
        lineage_overrides is not None
        and lineage_overrides.database_override_map is not None
        and upstream_db is not None
-       and upstream_db in lineage_overrides.database_override_map
+       and upstream_db in lineage_overrides.database_override_map
    ):
        upstream_db = lineage_overrides.database_override_map[upstream_db]

datahub/ingestion/source/unity/source.py
CHANGED

@@ -1003,7 +1003,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             generate_usage_statistics=False,
             generate_operations=False,
         )
-        for dataset_name in self.view_definitions
+        for dataset_name in self.view_definitions:
             view_ref, view_definition = self.view_definitions[dataset_name]
             result = self._run_sql_parser(
                 view_ref,
datahub/ingestion/source/vertexai/vertexai.py
CHANGED

@@ -22,7 +22,11 @@ from google.oauth2 import service_account

 import datahub.emitter.mce_builder as builder
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import
+from datahub.emitter.mcp_builder import (
+    ExperimentKey,
+    ProjectIdKey,
+    gen_containers,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,

@@ -96,10 +100,6 @@ class ModelMetadata:
     endpoints: Optional[List[Endpoint]] = None


-class ContainerKeyWithId(ContainerKey):
-    id: str
-
-
 @platform_name("Vertex AI", id="vertexai")
 @config_class(VertexAIConfig)
 @support_status(SupportStatus.TESTING)

@@ -173,7 +173,7 @@ class VertexAISource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         yield from gen_containers(
             parent_container_key=self._get_project_container(),
-            container_key=
+            container_key=ExperimentKey(
                 platform=self.platform,
                 id=self._make_vertexai_experiment_name(experiment.name),
             ),

@@ -309,7 +309,7 @@ class VertexAISource(Source):
     def _gen_experiment_run_mcps(
         self, experiment: Experiment, run: ExperimentRun
     ) -> Iterable[MetadataChangeProposalWrapper]:
-        experiment_key =
+        experiment_key = ExperimentKey(
             platform=self.platform,
             id=self._make_vertexai_experiment_name(experiment.name),
         )
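The Vertex AI hunks replace the source-local `ContainerKeyWithId` with the shared `ExperimentKey` from `datahub.emitter.mcp_builder`. A hedged sketch of building such a key outside the source; the experiment id is a placeholder, and the `.as_urn()` call assumes the standard `ContainerKey` helper:

    from datahub.emitter.mcp_builder import ExperimentKey

    # Mirrors the call shape in the hunks above: a platform plus a source-generated experiment id.
    key = ExperimentKey(platform="vertexai", id="my-project.example-experiment")
    print(key.as_urn())  # expected to print a urn:li:container:<guid>-style URN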
datahub/ingestion/transformer/add_dataset_dataproduct.py
CHANGED

@@ -54,7 +54,7 @@ class AddDatasetDataProduct(DatasetDataproductTransformer):
         data_products_container: Dict[str, DataProductPatchBuilder] = {}
         logger.debug("Generating dataproducts")
         is_container = self.config.is_container
-        for entity_urn in self.entity_map
+        for entity_urn in self.entity_map:
             data_product_urn = self.config.get_data_product_to_add(entity_urn)
             if data_product_urn:
                 if data_product_urn not in data_products:

datahub/ingestion/transformer/add_dataset_ownership.py
CHANGED

@@ -86,7 +86,7 @@ class AddDatasetOwnership(OwnershipTransformer):
         logger.debug("Generating Ownership for containers")
         ownership_container_mapping: Dict[str, List[OwnerClass]] = {}
         for entity_urn, data_ownerships in (
-            (urn, self.config.get_owners_to_add(urn)) for urn in self.entity_map
+            (urn, self.config.get_owners_to_add(urn)) for urn in self.entity_map
         ):
             if not data_ownerships:
                 continue

datahub/ingestion/transformer/dataset_domain.py
CHANGED

@@ -125,7 +125,7 @@ class AddDatasetDomain(DatasetDomainTransformer):
             return domain_mcps

         for entity_urn, domain_to_add in (
-            (urn, self.config.get_domains_to_add(urn)) for urn in self.entity_map
+            (urn, self.config.get_domains_to_add(urn)) for urn in self.entity_map
         ):
             if not domain_to_add or not domain_to_add.domains:
                 continue
datahub/lite/lite_util.py
CHANGED

@@ -99,7 +99,7 @@ def get_datahub_lite(config_dict: dict, read_only: bool = False) -> "DataHubLite
         lite_class = lite_registry.get(lite_type)
     except KeyError as e:
         raise Exception(
-            f"Failed to find a registered lite implementation for {lite_type}. Valid values are {[k for k in lite_registry.mapping
+            f"Failed to find a registered lite implementation for {lite_type}. Valid values are {[k for k in lite_registry.mapping]}"
         ) from e

     lite_specific_config = lite_class.get_config_class().parse_obj(

@@ -127,7 +127,7 @@ def get_datahub_lite(config_dict: dict, read_only: bool = False) -> "DataHubLite
         return lite
     else:
         raise Exception(
-            f"Failed to find a registered forwarding sink for type {lite_local_config.forward_to.type}. Valid values are {[k for k in sink_registry.mapping
+            f"Failed to find a registered forwarding sink for type {lite_local_config.forward_to.type}. Valid values are {[k for k in sink_registry.mapping]}"
         )
     else:
         return lite
datahub/metadata/_schema_classes.py
CHANGED

@@ -15442,6 +15442,35 @@ class DataHubIngestionSourceKeyClass(_Aspect):
         self._inner_dict['id'] = value


+class DataHubOpenAPISchemaKeyClass(_Aspect):
+    """Key for a Query"""
+
+
+    ASPECT_NAME = 'dataHubOpenAPISchemaKey'
+    ASPECT_INFO = {'keyForEntity': 'dataHubOpenAPISchema', 'entityCategory': 'internal', 'entityAspects': ['systemMetadata'], 'entityDoc': 'Contains aspects which are used in OpenAPI requests/responses which are not otherwise present in the data model.'}
+    RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.DataHubOpenAPISchemaKey")
+
+    def __init__(self,
+        id: str,
+    ):
+        super().__init__()
+
+        self.id = id
+
+    def _restore_defaults(self) -> None:
+        self.id = str()
+
+
+    @property
+    def id(self) -> str:
+        """A unique id for the DataHub OpenAPI schema."""
+        return self._inner_dict.get('id')  # type: ignore
+
+    @id.setter
+    def id(self, value: str) -> None:
+        self._inner_dict['id'] = value
+
+
 class DataHubPersonaKeyClass(_Aspect):
     """Key for a persona type"""

@@ -20128,10 +20157,14 @@ class PlatformEventHeaderClass(DictWrapper):
         self._inner_dict['timestampMillis'] = value


-class SystemMetadataClass(
+class SystemMetadataClass(_Aspect):
     """Metadata associated with each metadata change that is processed by the system"""
-
+
+
+    ASPECT_NAME = 'systemMetadata'
+    ASPECT_INFO = {}
     RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.mxe.SystemMetadata")
+
     def __init__(self,
                  lastObserved: Optional[Union[int, None]]=None,
                  runId: Optional[Union[str, None]]=None,
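The hunk above promotes `SystemMetadataClass` to a first-class aspect (`ASPECT_NAME = 'systemMetadata'`), matching the new `SystemMetadata.avsc` and the `systemMetadata` entry added to `AspectBag` below. A small hedged sketch of how system metadata is typically attached to an emitted proposal; the dataset URN, aspect, and run id are placeholders, not taken from this diff:

    import time

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.metadata.schema_classes import StatusClass, SystemMetadataClass

    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)",
        aspect=StatusClass(removed=False),
        systemMetadata=SystemMetadataClass(
            lastObserved=int(time.time() * 1000),  # epoch millis
            runId="manual-backfill-2024",          # placeholder run id
        ),
    )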
@@ -21738,6 +21771,9 @@ class QueryLanguageClass(object):
     SQL = "SQL"
     """A SQL Query"""

+    UNKNOWN = "UNKNOWN"
+    """Unknown query language"""
+


 class QueryPropertiesClass(_Aspect):

@@ -26135,6 +26171,7 @@ __SCHEMA_TYPES = {
     'com.linkedin.pegasus2avro.metadata.key.DataHubActionKey': DataHubActionKeyClass,
     'com.linkedin.pegasus2avro.metadata.key.DataHubConnectionKey': DataHubConnectionKeyClass,
     'com.linkedin.pegasus2avro.metadata.key.DataHubIngestionSourceKey': DataHubIngestionSourceKeyClass,
+    'com.linkedin.pegasus2avro.metadata.key.DataHubOpenAPISchemaKey': DataHubOpenAPISchemaKeyClass,
     'com.linkedin.pegasus2avro.metadata.key.DataHubPersonaKey': DataHubPersonaKeyClass,
     'com.linkedin.pegasus2avro.metadata.key.DataHubPolicyKey': DataHubPolicyKeyClass,
     'com.linkedin.pegasus2avro.metadata.key.DataHubRetentionKey': DataHubRetentionKeyClass,

@@ -26620,6 +26657,7 @@ __SCHEMA_TYPES = {
     'DataHubActionKey': DataHubActionKeyClass,
     'DataHubConnectionKey': DataHubConnectionKeyClass,
     'DataHubIngestionSourceKey': DataHubIngestionSourceKeyClass,
+    'DataHubOpenAPISchemaKey': DataHubOpenAPISchemaKeyClass,
     'DataHubPersonaKey': DataHubPersonaKeyClass,
     'DataHubPolicyKey': DataHubPolicyKeyClass,
     'DataHubRetentionKey': DataHubRetentionKeyClass,

@@ -26879,6 +26917,7 @@ ASPECT_CLASSES: List[Type[_Aspect]] = [
     ContainerClass,
     ContainerPropertiesClass,
     EditableContainerPropertiesClass,
+    SystemMetadataClass,
     DataHubSecretValueClass,
     DataHubUpgradeRequestClass,
     DataHubUpgradeResultClass,

@@ -26935,6 +26974,7 @@ ASPECT_CLASSES: List[Type[_Aspect]] = [
     MLModelKeyClass,
     NotebookKeyClass,
     RoleKeyClass,
+    DataHubOpenAPISchemaKeyClass,
     GlobalSettingsKeyClass,
     DatasetKeyClass,
     ChartKeyClass,

@@ -27102,6 +27142,7 @@ class AspectBag(TypedDict, total=False):
     container: ContainerClass
     containerProperties: ContainerPropertiesClass
     editableContainerProperties: EditableContainerPropertiesClass
+    systemMetadata: SystemMetadataClass
     dataHubSecretValue: DataHubSecretValueClass
     dataHubUpgradeRequest: DataHubUpgradeRequestClass
     dataHubUpgradeResult: DataHubUpgradeResultClass

@@ -27158,6 +27199,7 @@ class AspectBag(TypedDict, total=False):
     mlModelKey: MLModelKeyClass
     notebookKey: NotebookKeyClass
     roleKey: RoleKeyClass
+    dataHubOpenAPISchemaKey: DataHubOpenAPISchemaKeyClass
     globalSettingsKey: GlobalSettingsKeyClass
     datasetKey: DatasetKeyClass
     chartKey: ChartKeyClass

@@ -27292,6 +27334,7 @@ KEY_ASPECTS: Dict[str, Type[_Aspect]] = {
     'mlModel': MLModelKeyClass,
     'notebook': NotebookKeyClass,
     'role': RoleKeyClass,
+    'dataHubOpenAPISchema': DataHubOpenAPISchemaKeyClass,
     'globalSettings': GlobalSettingsKeyClass,
     'dataset': DatasetKeyClass,
     'chart': ChartKeyClass,

@@ -27352,6 +27395,7 @@ ENTITY_TYPE_NAMES: List[str] = [
     'mlModel',
     'notebook',
     'role',
+    'dataHubOpenAPISchema',
     'globalSettings',
     'dataset',
     'chart',

@@ -27411,6 +27455,7 @@ EntityTypeName = Literal[
     'mlModel',
     'notebook',
     'role',
+    'dataHubOpenAPISchema',
     'globalSettings',
     'dataset',
     'chart',
datahub/metadata/_urns/urn_defs.py
CHANGED

@@ -594,6 +594,62 @@ class RoleUrn(_SpecificUrn):
     def id(self) -> str:
         return self._entity_ids[0]

+if TYPE_CHECKING:
+    from datahub.metadata.schema_classes import DataHubOpenAPISchemaKeyClass
+
+class DataHubOpenAPISchemaUrn(_SpecificUrn):
+    ENTITY_TYPE: ClassVar[Literal["dataHubOpenAPISchema"]] = "dataHubOpenAPISchema"
+    _URN_PARTS: ClassVar[int] = 1
+
+    def __init__(self, id: Union["DataHubOpenAPISchemaUrn", str], *, _allow_coercion: bool = True) -> None:
+        if _allow_coercion:
+            # Field coercion logic (if any is required).
+            if isinstance(id, str):
+                if id.startswith('urn:li:'):
+                    try:
+                        id = DataHubOpenAPISchemaUrn.from_string(id)
+                    except InvalidUrnError:
+                        raise InvalidUrnError(f'Expecting a DataHubOpenAPISchemaUrn but got {id}')
+                else:
+                    id = UrnEncoder.encode_string(id)
+
+        # Validation logic.
+        if not id:
+            raise InvalidUrnError("DataHubOpenAPISchemaUrn id cannot be empty")
+        if isinstance(id, DataHubOpenAPISchemaUrn):
+            id = id.id
+        elif isinstance(id, Urn):
+            raise InvalidUrnError(f'Expecting a DataHubOpenAPISchemaUrn but got {id}')
+        if UrnEncoder.contains_reserved_char(id):
+            raise InvalidUrnError(f'DataHubOpenAPISchemaUrn id contains reserved characters')
+
+        super().__init__(self.ENTITY_TYPE, [id])
+
+    @classmethod
+    def _parse_ids(cls, entity_ids: List[str]) -> "DataHubOpenAPISchemaUrn":
+        if len(entity_ids) != cls._URN_PARTS:
+            raise InvalidUrnError(f"DataHubOpenAPISchemaUrn should have {cls._URN_PARTS} parts, got {len(entity_ids)}: {entity_ids}")
+        return cls(id=entity_ids[0], _allow_coercion=False)
+
+    @classmethod
+    def underlying_key_aspect_type(cls) -> Type["DataHubOpenAPISchemaKeyClass"]:
+        from datahub.metadata.schema_classes import DataHubOpenAPISchemaKeyClass
+
+        return DataHubOpenAPISchemaKeyClass
+
+    def to_key_aspect(self) -> "DataHubOpenAPISchemaKeyClass":
+        from datahub.metadata.schema_classes import DataHubOpenAPISchemaKeyClass
+
+        return DataHubOpenAPISchemaKeyClass(id=self.id)
+
+    @classmethod
+    def from_key_aspect(cls, key_aspect: "DataHubOpenAPISchemaKeyClass") -> "DataHubOpenAPISchemaUrn":
+        return cls(id=key_aspect.id)
+
+    @property
+    def id(self) -> str:
+        return self._entity_ids[0]
+
 if TYPE_CHECKING:
     from datahub.metadata.schema_classes import GlobalSettingsKeyClass

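The new `DataHubOpenAPISchemaUrn` follows the usual `_SpecificUrn` contract, so the standard key-aspect round trip applies. A hedged sketch of using it; the id value and the exact printed URN form are illustrative:

    from datahub.metadata._urns.urn_defs import DataHubOpenAPISchemaUrn

    urn = DataHubOpenAPISchemaUrn("datahub-openapi-schema")
    print(urn)                 # expected: urn:li:dataHubOpenAPISchema:datahub-openapi-schema
    key = urn.to_key_aspect()  # DataHubOpenAPISchemaKeyClass(id="datahub-openapi-schema")
    round_tripped = DataHubOpenAPISchemaUrn.from_key_aspect(key)
    assert str(round_tripped) == str(urn)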
datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py
CHANGED

@@ -19,6 +19,7 @@ from ......schema_classes import DataHubAccessTokenKeyClass
 from ......schema_classes import DataHubActionKeyClass
 from ......schema_classes import DataHubConnectionKeyClass
 from ......schema_classes import DataHubIngestionSourceKeyClass
+from ......schema_classes import DataHubOpenAPISchemaKeyClass
 from ......schema_classes import DataHubPersonaKeyClass
 from ......schema_classes import DataHubPolicyKeyClass
 from ......schema_classes import DataHubRetentionKeyClass

@@ -72,6 +73,7 @@ DataHubAccessTokenKey = DataHubAccessTokenKeyClass
 DataHubActionKey = DataHubActionKeyClass
 DataHubConnectionKey = DataHubConnectionKeyClass
 DataHubIngestionSourceKey = DataHubIngestionSourceKeyClass
+DataHubOpenAPISchemaKey = DataHubOpenAPISchemaKeyClass
 DataHubPersonaKey = DataHubPersonaKeyClass
 DataHubPolicyKey = DataHubPolicyKeyClass
 DataHubRetentionKey = DataHubRetentionKeyClass