acryl-datahub 1.1.0rc4__py3-none-any.whl → 1.1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/METADATA +2414 -2412
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/RECORD +87 -70
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +9 -8
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +170 -0
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/delete_cli.py +4 -4
- datahub/cli/ingest_cli.py +9 -1
- datahub/emitter/mce_builder.py +3 -1
- datahub/emitter/response_helper.py +86 -1
- datahub/emitter/rest_emitter.py +1 -1
- datahub/ingestion/graph/client.py +3 -3
- datahub/ingestion/source/apply/datahub_apply.py +4 -4
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +22 -10
- datahub/ingestion/source/data_lake_common/object_store.py +644 -0
- datahub/ingestion/source/datahub/config.py +11 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +186 -33
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_common.py +30 -11
- datahub/ingestion/source/gcs/gcs_source.py +22 -7
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/hex/query_fetcher.py +9 -3
- datahub/ingestion/source/openapi.py +12 -0
- datahub/ingestion/source/openapi_parser.py +56 -37
- datahub/ingestion/source/s3/source.py +65 -6
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -21
- datahub/ingestion/source/snowflake/snowflake_query.py +0 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +17 -6
- datahub/ingestion/source/sql/athena.py +1 -0
- datahub/ingestion/source/sql/hive.py +2 -3
- datahub/ingestion/source/sql/sql_common.py +98 -34
- datahub/ingestion/source/sql/sql_types.py +5 -2
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +117 -0
- datahub/ingestion/source/unity/source.py +167 -15
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/metadata/_internal_schema_classes.py +667 -522
- datahub/metadata/_urns/urn_defs.py +1804 -1748
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/schema.avsc +17358 -17584
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +1 -0
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +1 -0
- datahub/metadata/schemas/MLModelKey.avsc +1 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +6 -0
- datahub/sdk/_all_entities.py +11 -0
- datahub/sdk/_shared.py +118 -1
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +342 -0
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +90 -2
- datahub/sdk/lineage_client.py +681 -82
- datahub/sdk/main_client.py +27 -8
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/sdk_v2_helpers.py +18 -0
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/top_level.txt +0 -0
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import re
|
|
3
|
+
import time
|
|
3
4
|
from concurrent.futures import ThreadPoolExecutor
|
|
4
5
|
from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
|
|
5
6
|
from urllib.parse import urljoin
|
|
6
7
|
|
|
8
|
+
from datahub.api.entities.external.external_entities import PlatformResourceRepository
|
|
9
|
+
from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
|
|
7
10
|
from datahub.emitter.mce_builder import (
|
|
8
11
|
make_data_platform_urn,
|
|
9
12
|
make_dataplatform_instance_urn,
|
|
@@ -78,6 +81,7 @@ from datahub.ingestion.source.unity.proxy_types import (
|
|
|
78
81
|
Catalog,
|
|
79
82
|
Column,
|
|
80
83
|
CustomCatalogType,
|
|
84
|
+
HiveTableType,
|
|
81
85
|
Metastore,
|
|
82
86
|
Notebook,
|
|
83
87
|
NotebookId,
|
|
@@ -87,8 +91,17 @@ from datahub.ingestion.source.unity.proxy_types import (
|
|
|
87
91
|
TableReference,
|
|
88
92
|
)
|
|
89
93
|
from datahub.ingestion.source.unity.report import UnityCatalogReport
|
|
94
|
+
from datahub.ingestion.source.unity.tag_entities import (
|
|
95
|
+
UnityCatalogTagPlatformResource,
|
|
96
|
+
UnityCatalogTagPlatformResourceId,
|
|
97
|
+
)
|
|
90
98
|
from datahub.ingestion.source.unity.usage import UnityCatalogUsageExtractor
|
|
91
|
-
from datahub.metadata.com.linkedin.pegasus2avro.common import
|
|
99
|
+
from datahub.metadata.com.linkedin.pegasus2avro.common import (
|
|
100
|
+
GlobalTags,
|
|
101
|
+
MetadataAttribution,
|
|
102
|
+
Siblings,
|
|
103
|
+
TagAssociation,
|
|
104
|
+
)
|
|
92
105
|
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
|
|
93
106
|
DatasetLineageType,
|
|
94
107
|
FineGrainedLineage,
|
|
@@ -116,6 +129,7 @@ from datahub.metadata.schema_classes import (
|
|
|
116
129
|
UpstreamClass,
|
|
117
130
|
UpstreamLineageClass,
|
|
118
131
|
)
|
|
132
|
+
from datahub.metadata.urns import TagUrn
|
|
119
133
|
from datahub.sql_parsing.schema_resolver import SchemaResolver
|
|
120
134
|
from datahub.sql_parsing.sqlglot_lineage import (
|
|
121
135
|
SqlParsingResult,
|
|
@@ -162,6 +176,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
162
176
|
platform: str = "databricks"
|
|
163
177
|
platform_instance_name: Optional[str]
|
|
164
178
|
sql_parser_schema_resolver: Optional[SchemaResolver] = None
|
|
179
|
+
platform_resource_repository: Optional[PlatformResourceRepository] = None
|
|
165
180
|
|
|
166
181
|
def get_report(self) -> UnityCatalogReport:
|
|
167
182
|
return self.report
|
|
@@ -211,6 +226,10 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
211
226
|
|
|
212
227
|
# Global map of tables, for profiling
|
|
213
228
|
self.tables: FileBackedDict[Table] = FileBackedDict()
|
|
229
|
+
if self.ctx.graph:
|
|
230
|
+
self.platform_resource_repository = PlatformResourceRepository(
|
|
231
|
+
self.ctx.graph
|
|
232
|
+
)
|
|
214
233
|
|
|
215
234
|
def init_hive_metastore_proxy(self):
|
|
216
235
|
self.hive_metastore_proxy: Optional[HiveMetastoreProxy] = None
|
|
@@ -506,13 +525,42 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
506
525
|
yield from self.add_table_to_dataset_container(dataset_urn, schema)
|
|
507
526
|
|
|
508
527
|
table_props = self._create_table_property_aspect(table)
|
|
528
|
+
tags = None
|
|
529
|
+
if not isinstance(table.table_type, HiveTableType) and self.config.include_tags:
|
|
530
|
+
try:
|
|
531
|
+
table_tags = self._get_table_tags(
|
|
532
|
+
table.ref.catalog, table.ref.schema, table.ref.table
|
|
533
|
+
)
|
|
534
|
+
if table_tags:
|
|
535
|
+
logger.debug(f"Table tags for {table.ref}: {table_tags}")
|
|
536
|
+
attribution = MetadataAttribution(
|
|
537
|
+
# source="unity-catalog",
|
|
538
|
+
actor="urn:li:corpuser:datahub",
|
|
539
|
+
time=int(time.time() * 1000),
|
|
540
|
+
)
|
|
541
|
+
tags = GlobalTags(
|
|
542
|
+
tags=[
|
|
543
|
+
TagAssociation(
|
|
544
|
+
tag=tag.to_datahub_tag_urn().urn(),
|
|
545
|
+
attribution=attribution,
|
|
546
|
+
)
|
|
547
|
+
for tag in table_tags
|
|
548
|
+
]
|
|
549
|
+
)
|
|
550
|
+
|
|
551
|
+
yield from self.gen_platform_resources(table_tags)
|
|
552
|
+
|
|
553
|
+
except Exception as e:
|
|
554
|
+
logger.exception(f"Error fetching table {table.ref} tags", exc_info=e)
|
|
509
555
|
|
|
510
556
|
view_props = None
|
|
511
557
|
if table.view_definition:
|
|
512
558
|
view_props = self._create_view_property_aspect(table)
|
|
513
559
|
|
|
514
560
|
sub_type = self._create_table_sub_type_aspect(table)
|
|
515
|
-
schema_metadata = self._create_schema_metadata_aspect(table)
|
|
561
|
+
schema_metadata, platform_resources = self._create_schema_metadata_aspect(table)
|
|
562
|
+
yield from platform_resources
|
|
563
|
+
|
|
516
564
|
domain = self._get_domain_aspect(dataset_name=table.ref.qualified_table_name)
|
|
517
565
|
ownership = self._create_table_ownership_aspect(table)
|
|
518
566
|
data_platform_instance = self._create_data_platform_instance_aspect()
|
|
@@ -585,6 +633,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
585
633
|
domain,
|
|
586
634
|
data_platform_instance,
|
|
587
635
|
lineage,
|
|
636
|
+
tags,
|
|
588
637
|
],
|
|
589
638
|
)
|
|
590
639
|
]
|
|
@@ -718,6 +767,13 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
718
767
|
|
|
719
768
|
def gen_schema_containers(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
|
|
720
769
|
domain_urn = self._gen_domain_urn(f"{schema.catalog.name}.{schema.name}")
|
|
770
|
+
schema_tags = self.unity_catalog_api_proxy.get_schema_tags(
|
|
771
|
+
schema.catalog.name
|
|
772
|
+
).get(f"{schema.catalog.name}.{schema.name}", [])
|
|
773
|
+
if schema_tags:
|
|
774
|
+
logger.debug(f"Schema tags for {schema.name}: {schema_tags}")
|
|
775
|
+
# Generate platform resources for schema tags
|
|
776
|
+
yield from self.gen_platform_resources(schema_tags)
|
|
721
777
|
|
|
722
778
|
schema_container_key = self.gen_schema_key(schema)
|
|
723
779
|
yield from gen_containers(
|
|
@@ -729,6 +785,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
729
785
|
description=schema.comment,
|
|
730
786
|
owner_urn=self.get_owner_urn(schema.owner),
|
|
731
787
|
external_url=f"{self.external_url_base}/{schema.catalog.name}/{schema.name}",
|
|
788
|
+
tags=[tag.to_datahub_tag_urn().urn() for tag in schema_tags]
|
|
789
|
+
if schema_tags
|
|
790
|
+
else None,
|
|
732
791
|
)
|
|
733
792
|
|
|
734
793
|
def gen_metastore_containers(
|
|
@@ -749,6 +808,13 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
749
808
|
|
|
750
809
|
def gen_catalog_containers(self, catalog: Catalog) -> Iterable[MetadataWorkUnit]:
|
|
751
810
|
domain_urn = self._gen_domain_urn(catalog.name)
|
|
811
|
+
catalog_tags = self.unity_catalog_api_proxy.get_catalog_tags(catalog.name).get(
|
|
812
|
+
catalog.name, []
|
|
813
|
+
)
|
|
814
|
+
if catalog_tags:
|
|
815
|
+
logger.debug(f"Schema tags for {catalog.name}: {catalog_tags}")
|
|
816
|
+
# Generate platform resources for catalog tags
|
|
817
|
+
yield from self.gen_platform_resources(catalog_tags)
|
|
752
818
|
|
|
753
819
|
catalog_container_key = self.gen_catalog_key(catalog)
|
|
754
820
|
yield from gen_containers(
|
|
@@ -764,6 +830,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
764
830
|
description=catalog.comment,
|
|
765
831
|
owner_urn=self.get_owner_urn(catalog.owner),
|
|
766
832
|
external_url=f"{self.external_url_base}/{catalog.name}",
|
|
833
|
+
tags=[tag.to_datahub_tag_urn().urn() for tag in catalog_tags]
|
|
834
|
+
if catalog_tags
|
|
835
|
+
else None,
|
|
767
836
|
)
|
|
768
837
|
|
|
769
838
|
def gen_schema_key(self, schema: Schema) -> ContainerKey:
|
|
@@ -832,6 +901,30 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
832
901
|
dataset_urn=dataset_urn,
|
|
833
902
|
)
|
|
834
903
|
|
|
904
|
+
def _get_catalog_tags(
    self, catalog: str, schema: str, table: str
) -> List[UnityCatalogTag]:
    """Return the tags attached to ``catalog`` itself.

    ``schema`` and ``table`` are accepted but not used by this lookup.
    An empty list is returned when the proxy has no entry for the catalog.
    """
    tags_by_catalog = self.unity_catalog_api_proxy.get_catalog_tags(catalog)
    lookup_key = f"{catalog}"
    return tags_by_catalog.get(lookup_key, [])
|
|
909
|
+
|
|
910
|
+
def _get_schema_tags(
    self, catalog: str, schema: str, table: str
) -> List[UnityCatalogTag]:
    """Return the tags attached to the schema ``catalog.schema``.

    ``table`` is accepted but not used by this lookup. An empty list is
    returned when the proxy has no entry for the schema.
    """
    tags_by_schema = self.unity_catalog_api_proxy.get_schema_tags(catalog)
    lookup_key = f"{catalog}.{schema}"
    return tags_by_schema.get(lookup_key, [])
|
|
915
|
+
|
|
916
|
+
def _get_table_tags(
    self, catalog: str, schema: str, table: str
) -> List[UnityCatalogTag]:
    """Return the tags attached to the table ``catalog.schema.table``.

    The proxy is queried per catalog and the result is indexed by the fully
    qualified table name; an empty list is returned when there is no entry.
    """
    tags_by_table = self.unity_catalog_api_proxy.get_table_tags(catalog)
    lookup_key = f"{catalog}.{schema}.{table}"
    return tags_by_table.get(lookup_key, [])
|
|
921
|
+
|
|
922
|
+
def _get_column_tags(
    self, catalog: str, schema: str, table: str, column: str
) -> List[UnityCatalogTag]:
    """Return the tags attached to the column ``catalog.schema.table.column``.

    The proxy is queried per catalog and the result is indexed by the fully
    qualified column name; an empty list is returned when there is no entry.
    """
    tags_by_column = self.unity_catalog_api_proxy.get_column_tags(catalog)
    lookup_key = f"{catalog}.{schema}.{table}.{column}"
    return tags_by_column.get(lookup_key, [])
|
|
927
|
+
|
|
835
928
|
def _create_table_property_aspect(self, table: Table) -> DatasetPropertiesClass:
|
|
836
929
|
custom_properties: dict = {}
|
|
837
930
|
if table.storage_location is not None:
|
|
@@ -921,30 +1014,88 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
921
1014
|
materialized=False, viewLanguage="SQL", viewLogic=table.view_definition
|
|
922
1015
|
)
|
|
923
1016
|
|
|
924
|
-
def
|
|
925
|
-
|
|
1017
|
+
def gen_platform_resources(
    self, tags: List[UnityCatalogTag]
) -> Iterable[MetadataWorkUnit]:
    """Yield platform-resource work units that link Unity Catalog tags to DataHub tag URNs.

    Nothing is yielded unless both ``self.ctx.graph`` and
    ``self.platform_resource_repository`` are set. For every tag, the
    corresponding platform resource is looked up (or created in memory) and,
    only when the tag's DataHub URN is not yet among its linked resources,
    the URN is added and the resource's MCPs are emitted.
    """
    if not (self.ctx.graph and self.platform_resource_repository):
        return

    for uc_tag in tags:
        resource_id = UnityCatalogTagPlatformResourceId.from_tag(
            platform_instance=self.platform_instance_name,
            platform_resource_repository=self.platform_resource_repository,
            tag=uc_tag,
        )
        logger.debug(f"Created platform resource {resource_id}")

        tag_resource = UnityCatalogTagPlatformResource.get_from_datahub(
            resource_id, self.platform_resource_repository, False
        )
        tag_urn = uc_tag.to_datahub_tag_urn().urn()
        linked = tag_resource.datahub_linked_resources()
        if tag_urn in linked.urns:
            # Already linked: nothing new to emit for this tag.
            continue

        linked.add(tag_urn)
        resource = tag_resource.as_platform_resource()
        for mcp in resource.to_mcps():
            yield MetadataWorkUnit(
                id=f"platform_resource-{resource.id}",
                mcp=mcp,
            )
|
|
1045
|
+
|
|
1046
|
+
def _create_schema_metadata_aspect(
    self, table: Table
) -> Tuple[SchemaMetadataClass, Iterable[MetadataWorkUnit]]:
    """Build the schema metadata aspect for ``table`` plus tag platform resources.

    Returns a pair: the ``SchemaMetadataClass`` built from the table's columns,
    and the iterable produced by ``gen_platform_resources`` for the distinct
    column tags seen. Column tags are only fetched when
    ``self.config.include_tags`` is truthy; otherwise each field is created
    with ``None`` for its tags.
    """
    fields: List[SchemaFieldClass] = []
    seen_tags: Set[UnityCatalogTag] = set()

    for col in table.columns:
        col_tag_urns: Optional[List[TagUrn]] = None
        if self.config.include_tags:
            col_tags = self._get_column_tags(
                table.ref.catalog, table.ref.schema, table.ref.table, col.name
            )
            seen_tags.update(col_tags)
            col_tag_urns = [t.to_datahub_tag_urn() for t in col_tags]
        fields.extend(self._create_schema_field(col, col_tag_urns))

    platform_resources = self.gen_platform_resources(list(seen_tags))
    schema_metadata = SchemaMetadataClass(
        schemaName=table.id,
        platform=make_data_platform_urn(self.platform),
        fields=fields,
        hash="",
        version=0,
        platformSchema=MySqlDDLClass(tableSchema=""),
    )
    return schema_metadata, platform_resources
|
|
938
1073
|
|
|
939
1074
|
@staticmethod
|
|
940
|
-
def _create_schema_field(
|
|
1075
|
+
def _create_schema_field(
|
|
1076
|
+
column: Column, tags: Optional[List[TagUrn]]
|
|
1077
|
+
) -> List[SchemaFieldClass]:
|
|
941
1078
|
_COMPLEX_TYPE = re.compile("^(struct|array)")
|
|
942
|
-
|
|
1079
|
+
global_tags: Optional[GlobalTags] = None
|
|
943
1080
|
if _COMPLEX_TYPE.match(column.type_text.lower()):
|
|
944
1081
|
return get_schema_fields_for_hive_column(
|
|
945
1082
|
column.name, column.type_text.lower(), description=column.comment
|
|
946
1083
|
)
|
|
947
1084
|
else:
|
|
1085
|
+
if tags is not None:
|
|
1086
|
+
logger.debug(f"Column tags are: {tags}")
|
|
1087
|
+
attribution = MetadataAttribution(
|
|
1088
|
+
source="urn:li:dataPlatform:unity-catalog",
|
|
1089
|
+
actor="urn:li:corpuser:datahub",
|
|
1090
|
+
time=int(time.time() * 1000),
|
|
1091
|
+
)
|
|
1092
|
+
global_tags = GlobalTags(
|
|
1093
|
+
tags=[
|
|
1094
|
+
TagAssociation(tag=tag.urn(), attribution=attribution)
|
|
1095
|
+
for tag in tags
|
|
1096
|
+
]
|
|
1097
|
+
)
|
|
1098
|
+
|
|
948
1099
|
return [
|
|
949
1100
|
SchemaFieldClass(
|
|
950
1101
|
fieldPath=column.name,
|
|
@@ -954,6 +1105,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
954
1105
|
nativeDataType=column.type_text,
|
|
955
1106
|
nullable=column.nullable,
|
|
956
1107
|
description=column.comment,
|
|
1108
|
+
globalTags=global_tags if tags else None,
|
|
957
1109
|
)
|
|
958
1110
|
]
|
|
959
1111
|
|
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
from datahub.api.entities.external.external_entities import (
|
|
7
|
+
ExternalEntity,
|
|
8
|
+
ExternalEntityId,
|
|
9
|
+
LinkedResourceSet,
|
|
10
|
+
PlatformResourceRepository,
|
|
11
|
+
)
|
|
12
|
+
from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
|
|
13
|
+
from datahub.api.entities.platformresource.platform_resource import (
|
|
14
|
+
PlatformResource,
|
|
15
|
+
PlatformResourceKey,
|
|
16
|
+
PlatformResourceSearchFields,
|
|
17
|
+
)
|
|
18
|
+
from datahub.ingestion.graph.client import DataHubGraph
|
|
19
|
+
from datahub.metadata.urns import TagUrn
|
|
20
|
+
from datahub.utilities.search_utils import ElasticDocumentQuery
|
|
21
|
+
from datahub.utilities.urns.urn import Urn
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class UnityCatalogTagSyncContext(BaseModel):
    # Context handed to tag lookups. Its only field is the optional platform
    # instance, which UnityCatalogTagPlatformResourceId.search_by_urn compares
    # against the platform instance of stored tag ids.
    platform_instance: Optional[str] = None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
    """
    Unique identifier for a Unity Catalog tag stored as a DataHub platform resource.

    The platform resource key is built from the tag key/value pair and the
    optional platform instance (see ``to_platform_resource_key``).
    """

    tag_key: str
    tag_value: Optional[str] = None
    platform_instance: Optional[str]
    # Both flags are set to True by search_by_urn when the id is loaded from
    # an existing platform resource.
    exists_in_unity_catalog: bool = False
    persisted: bool = False

    def __hash__(self) -> int:
        # Hash on the platform resource key id so instances can be used in
        # sets and as dict keys.
        return hash(self.to_platform_resource_key().id)

    # this is a hack to make sure the property is a string and not private pydantic field
    @staticmethod
    def _RESOURCE_TYPE() -> str:
        return "UnityCatalogTagPlatformResource"

    def to_platform_resource_key(self) -> PlatformResourceKey:
        """Return the platform resource key for this tag (primary key ``"<key>:<value>"``)."""
        return PlatformResourceKey(
            platform="databricks",
            resource_type=str(UnityCatalogTagPlatformResourceId._RESOURCE_TYPE()),
            primary_key=f"{self.tag_key}:{self.tag_value}",
            platform_instance=self.platform_instance,
        )

    @classmethod
    def from_tag(
        cls,
        tag: UnityCatalogTag,
        platform_instance: Optional[str],
        platform_resource_repository: PlatformResourceRepository,
        exists_in_unity_catalog: bool = False,
    ) -> "UnityCatalogTagPlatformResourceId":
        """
        Creates a UnityCatalogTagPlatformResourceId from a UnityCatalogTag.

        An id already stored for the tag's DataHub URN and the given platform
        instance is returned as-is; otherwise a new, non-persisted id is built
        from the tag's original key and value.
        """

        existing_platform_resource = cls.search_by_urn(
            tag.to_datahub_tag_urn().urn(),
            platform_resource_repository=platform_resource_repository,
            tag_sync_context=UnityCatalogTagSyncContext(
                platform_instance=platform_instance
            ),
        )
        if existing_platform_resource:
            logger.info(
                f"Found existing UnityCatalogTagPlatformResourceId for tag {tag.key.original}: {existing_platform_resource}"
            )
            return existing_platform_resource

        return UnityCatalogTagPlatformResourceId(
            tag_key=tag.key.original,
            tag_value=tag.value.original if tag.value is not None else None,
            platform_instance=platform_instance,
            exists_in_unity_catalog=exists_in_unity_catalog,
            persisted=False,
        )

    @classmethod
    def search_by_urn(
        cls,
        urn: str,
        platform_resource_repository: PlatformResourceRepository,
        tag_sync_context: UnityCatalogTagSyncContext,
    ) -> Optional["UnityCatalogTagPlatformResourceId"]:
        """
        Look up a stored tag id whose secondary keys contain ``urn``.

        Only resources of this class's resource type are searched, and only an
        id whose platform instance equals the one in ``tag_sync_context`` is
        returned (marked as existing and persisted). Returns ``None`` when no
        such id is found.
        """
        mapped_tags = list(
            platform_resource_repository.search_by_filter(
                ElasticDocumentQuery.create_from(
                    (
                        PlatformResourceSearchFields.RESOURCE_TYPE,
                        str(UnityCatalogTagPlatformResourceId._RESOURCE_TYPE()),
                    ),
                    (PlatformResourceSearchFields.SECONDARY_KEYS, urn),
                )
            )
        )
        logger.info(
            f"Found {len(mapped_tags)} mapped tags for URN {urn}. {mapped_tags}"
        )
        for platform_resource in mapped_tags:
            if not (
                platform_resource.resource_info
                and platform_resource.resource_info.value
            ):
                logger.warning(
                    f"Platform resource {platform_resource} does not have a resource_info value"
                )
                continue

            unity_catalog_tag = UnityCatalogTagPlatformResource(
                **platform_resource.resource_info.value.as_pydantic_object(
                    UnityCatalogTagPlatformResource
                ).dict()
            )
            if (
                unity_catalog_tag.id.platform_instance
                == tag_sync_context.platform_instance
            ):
                unity_catalog_tag_id = unity_catalog_tag.id
                unity_catalog_tag_id.exists_in_unity_catalog = True
                unity_catalog_tag_id.persisted = True
                return unity_catalog_tag_id

        # If we reach here, it means we did not find a mapped tag for the URN
        logger.info(
            f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new UnityCatalogTagPlatformResourceId."
        )
        return None

    @classmethod
    def from_datahub_urn(
        cls,
        urn: str,
        platform_resource_repository: PlatformResourceRepository,
        tag_sync_context: UnityCatalogTagSyncContext,
        graph: DataHubGraph,
    ) -> "UnityCatalogTagPlatformResourceId":
        """
        Creates a UnityCatalogTagPlatformResourceId from a DataHub URN.

        Raises:
            ValueError: if the URN is not a tag URN (raised by
                ``generate_tag_id``) or no id could be generated for it.
        """
        # First we check if we already have a mapped platform resource for this
        # urn that is of the type UnityCatalogTagPlatformResource
        # If we do, we can use it to create the UnityCatalogTagPlatformResourceId
        # Else, we need to generate a new UnityCatalogTagPlatformResourceId
        existing_platform_resource_id = cls.search_by_urn(
            urn, platform_resource_repository, tag_sync_context
        )
        if existing_platform_resource_id:
            logger.info(
                f"Found existing UnityCatalogTagPlatformResourceId for URN {urn}: {existing_platform_resource_id}"
            )
            return existing_platform_resource_id

        # Otherwise, we need to create a new UnityCatalogTagPlatformResourceId
        new_unity_catalog_tag_id = cls.generate_tag_id(graph, tag_sync_context, urn)
        if new_unity_catalog_tag_id:
            # we then check if this tag has already been ingested as a platform
            # resource in the platform resource repository
            resource_key = platform_resource_repository.get(
                new_unity_catalog_tag_id.to_platform_resource_key()
            )
            if resource_key:
                logger.info(
                    f"Tag {new_unity_catalog_tag_id} already exists in platform resource repository with {resource_key}"
                )
                new_unity_catalog_tag_id.exists_in_unity_catalog = (
                    True  # TODO: Check if this is a safe assumption
                )
            return new_unity_catalog_tag_id
        # Message previously named "SnowflakeTagId", which this class is not.
        raise ValueError(
            f"Unable to create UnityCatalogTagPlatformResourceId from DataHub URN: {urn}"
        )

    @classmethod
    def generate_tag_id(
        cls, graph: DataHubGraph, tag_sync_context: UnityCatalogTagSyncContext, urn: str
    ) -> "UnityCatalogTagPlatformResourceId":
        """
        Build a new id from a DataHub tag URN.

        ``graph`` is accepted but not used here.

        Raises:
            ValueError: if ``urn`` is not of entity type ``tag``.
        """
        parsed_urn = Urn.from_string(urn)
        entity_type = parsed_urn.entity_type
        if entity_type != "tag":
            raise ValueError(f"Unsupported entity type {entity_type} for URN {urn}")
        return UnityCatalogTagPlatformResourceId.from_datahub_tag(
            TagUrn.from_string(urn), tag_sync_context
        )

    @classmethod
    def from_datahub_tag(
        cls, tag_urn: TagUrn, tag_sync_context: UnityCatalogTagSyncContext
    ) -> "UnityCatalogTagPlatformResourceId":
        """Build a new id (not yet known to exist in Unity Catalog) from a DataHub tag URN."""
        uc_tag = UnityCatalogTag.from_urn(tag_urn)

        return UnityCatalogTagPlatformResourceId(
            tag_key=str(uc_tag.key),
            tag_value=str(uc_tag.value) if uc_tag.value is not None else None,
            platform_instance=tag_sync_context.platform_instance,
            exists_in_unity_catalog=False,
        )
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
class UnityCatalogTagPlatformResource(BaseModel, ExternalEntity):
    """
    A Unity Catalog tag together with the DataHub URNs it is linked to.

    Instances are stored as the value of a platform resource (see
    ``as_platform_resource``) and loaded back with ``get_from_datahub``.
    """

    datahub_urns: LinkedResourceSet
    managed_by_datahub: bool
    id: UnityCatalogTagPlatformResourceId
    allowed_values: Optional[List[str]]

    def get_id(self) -> ExternalEntityId:
        return self.id

    def is_managed_by_datahub(self) -> bool:
        return self.managed_by_datahub

    def datahub_linked_resources(self) -> LinkedResourceSet:
        return self.datahub_urns

    def as_platform_resource(self) -> PlatformResource:
        """Wrap this tag in a platform resource keyed by its id, with the linked URNs as secondary keys."""
        return PlatformResource.create(
            key=self.id.to_platform_resource_key(),
            secondary_keys=list(self.datahub_urns.urns),
            value=self,
        )

    @classmethod
    def get_from_datahub(
        cls,
        unity_catalog_tag_id: UnityCatalogTagPlatformResourceId,
        platform_resource_repository: PlatformResourceRepository,
        managed_by_datahub: bool = False,
    ) -> "UnityCatalogTagPlatformResource":
        """
        Load the stored resource for ``unity_catalog_tag_id``, or build an empty one.

        The repository is searched by resource type and primary key. When no
        usable stored resource is found, a new instance with no linked URNs is
        returned, using ``managed_by_datahub`` as given.
        """
        # Search with the same primary key that to_platform_resource_key()
        # stores ("<key>:<value>"). The search previously used
        # "<key>/<value>", which differs from the stored key.
        primary_key = unity_catalog_tag_id.to_platform_resource_key().primary_key
        platform_resources = list(
            platform_resource_repository.search_by_filter(
                ElasticDocumentQuery.create_from(
                    (
                        PlatformResourceSearchFields.RESOURCE_TYPE,
                        str(UnityCatalogTagPlatformResourceId._RESOURCE_TYPE()),
                    ),
                    (
                        PlatformResourceSearchFields.PRIMARY_KEY,
                        primary_key,
                    ),
                )
            )
        )

        # A single hit is accepted without comparing platform instances;
        # with any other number of hits the platform instance must match.
        single_hit = len(platform_resources) == 1
        for platform_resource in platform_resources:
            if not (
                platform_resource.resource_info
                and platform_resource.resource_info.value
            ):
                continue
            unity_catalog_tag = UnityCatalogTagPlatformResource(
                **platform_resource.resource_info.value.as_pydantic_object(
                    UnityCatalogTagPlatformResource
                ).dict()
            )
            if single_hit or (
                unity_catalog_tag.id.platform_instance
                == unity_catalog_tag_id.platform_instance
            ):
                return unity_catalog_tag

        return cls(
            id=unity_catalog_tag_id,
            datahub_urns=LinkedResourceSet(urns=[]),
            managed_by_datahub=managed_by_datahub,
            allowed_values=None,
        )
|