acryl-datahub 1.2.0.4rc3__py3-none-any.whl → 1.2.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5.dist-info}/METADATA +2454 -2454
- {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5.dist-info}/RECORD +43 -41
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/external/external_entities.py +500 -15
- datahub/ingestion/source/aws/glue.py +18 -14
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/tag_entities.py +82 -104
- datahub/ingestion/source/common/subtypes.py +1 -0
- datahub/ingestion/source/hex/api.py +2 -0
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/looker/looker_common.py +26 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -1
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +49 -6
- datahub/ingestion/source/snowflake/snowflake_query.py +50 -5
- datahub/ingestion/source/snowflake/snowflake_schema.py +173 -9
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +16 -3
- datahub/ingestion/source/snowflake/snowflake_v2.py +4 -1
- datahub/ingestion/source/sql/mssql/source.py +2 -25
- datahub/ingestion/source/sql/mysql.py +54 -0
- datahub/ingestion/source/sql/postgres.py +5 -134
- datahub/ingestion/source/sql/sql_common.py +137 -0
- datahub/ingestion/source/superset.py +140 -56
- datahub/ingestion/source/unity/config.py +11 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +20 -6
- datahub/ingestion/source/unity/report.py +9 -1
- datahub/ingestion/source/unity/source.py +51 -16
- datahub/ingestion/source/unity/tag_entities.py +49 -147
- datahub/metadata/_internal_schema_classes.py +514 -514
- datahub/metadata/_urns/urn_defs.py +1684 -1684
- datahub/metadata/schema.avsc +16680 -16281
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5.dist-info}/top_level.txt +0 -0
|
@@ -141,6 +141,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
|
|
|
141
141
|
report: UnityCatalogReport,
|
|
142
142
|
hive_metastore_proxy: Optional[HiveMetastoreProxy] = None,
|
|
143
143
|
lineage_data_source: LineageDataSource = LineageDataSource.AUTO,
|
|
144
|
+
databricks_api_page_size: int = 0,
|
|
144
145
|
):
|
|
145
146
|
self._workspace_client = WorkspaceClient(
|
|
146
147
|
host=workspace_url,
|
|
@@ -152,6 +153,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
|
|
|
152
153
|
self.report = report
|
|
153
154
|
self.hive_metastore_proxy = hive_metastore_proxy
|
|
154
155
|
self.lineage_data_source = lineage_data_source
|
|
156
|
+
self.databricks_api_page_size = databricks_api_page_size
|
|
155
157
|
self._sql_connection_params = {
|
|
156
158
|
"server_hostname": self._workspace_client.config.host.replace(
|
|
157
159
|
"https://", ""
|
|
@@ -161,7 +163,11 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
|
|
|
161
163
|
}
|
|
162
164
|
|
|
163
165
|
def check_basic_connectivity(self) -> bool:
|
|
164
|
-
return bool(
|
|
166
|
+
return bool(
|
|
167
|
+
self._workspace_client.catalogs.list(
|
|
168
|
+
include_browse=True, max_results=self.databricks_api_page_size
|
|
169
|
+
)
|
|
170
|
+
)
|
|
165
171
|
|
|
166
172
|
def assigned_metastore(self) -> Optional[Metastore]:
|
|
167
173
|
response = self._workspace_client.metastores.summary()
|
|
@@ -171,7 +177,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
|
|
|
171
177
|
if self.hive_metastore_proxy:
|
|
172
178
|
yield self.hive_metastore_proxy.hive_metastore_catalog(metastore)
|
|
173
179
|
|
|
174
|
-
response = self._workspace_client.catalogs.list(
|
|
180
|
+
response = self._workspace_client.catalogs.list(
|
|
181
|
+
include_browse=True, max_results=self.databricks_api_page_size
|
|
182
|
+
)
|
|
175
183
|
if not response:
|
|
176
184
|
logger.info("Catalogs not found")
|
|
177
185
|
return
|
|
@@ -203,7 +211,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
|
|
|
203
211
|
yield from self.hive_metastore_proxy.hive_metastore_schemas(catalog)
|
|
204
212
|
return
|
|
205
213
|
response = self._workspace_client.schemas.list(
|
|
206
|
-
catalog_name=catalog.name,
|
|
214
|
+
catalog_name=catalog.name,
|
|
215
|
+
include_browse=True,
|
|
216
|
+
max_results=self.databricks_api_page_size,
|
|
207
217
|
)
|
|
208
218
|
if not response:
|
|
209
219
|
logger.info(f"Schemas not found for catalog {catalog.id}")
|
|
@@ -225,6 +235,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
|
|
|
225
235
|
catalog_name=schema.catalog.name,
|
|
226
236
|
schema_name=schema.name,
|
|
227
237
|
include_browse=True,
|
|
238
|
+
max_results=self.databricks_api_page_size,
|
|
228
239
|
)
|
|
229
240
|
if not response:
|
|
230
241
|
logger.info(f"Tables not found for schema {schema.id}")
|
|
@@ -257,7 +268,10 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
|
|
|
257
268
|
return group_list
|
|
258
269
|
|
|
259
270
|
def workspace_notebooks(self) -> Iterable[Notebook]:
|
|
260
|
-
|
|
271
|
+
workspace_objects_iter = self._workspace_client.workspace.list(
|
|
272
|
+
"/", recursive=True, max_results=self.databricks_api_page_size
|
|
273
|
+
)
|
|
274
|
+
for obj in workspace_objects_iter:
|
|
261
275
|
if obj.object_type == ObjectType.NOTEBOOK and obj.object_id and obj.path:
|
|
262
276
|
yield Notebook(
|
|
263
277
|
id=obj.object_id,
|
|
@@ -299,7 +313,6 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
|
|
|
299
313
|
def _query_history(
|
|
300
314
|
self,
|
|
301
315
|
filter_by: QueryFilterWithStatementTypes,
|
|
302
|
-
max_results: int = 1000,
|
|
303
316
|
include_metrics: bool = False,
|
|
304
317
|
) -> Iterable[QueryInfo]:
|
|
305
318
|
"""Manual implementation of the query_history.list() endpoint.
|
|
@@ -311,9 +324,10 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
|
|
|
311
324
|
"""
|
|
312
325
|
method = "GET"
|
|
313
326
|
path = "/api/2.0/sql/history/queries"
|
|
327
|
+
|
|
314
328
|
body: Dict[str, Any] = {
|
|
315
329
|
"include_metrics": include_metrics,
|
|
316
|
-
"max_results":
|
|
330
|
+
"max_results": self.databricks_api_page_size, # Max batch size
|
|
317
331
|
}
|
|
318
332
|
|
|
319
333
|
response: dict = self._workspace_client.api_client.do( # type: ignore
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from dataclasses import dataclass, field
|
|
2
|
-
from typing import Optional, Tuple
|
|
2
|
+
from typing import TYPE_CHECKING, Optional, Tuple
|
|
3
3
|
|
|
4
4
|
from datahub.ingestion.api.report import EntityFilterReport, Report
|
|
5
5
|
from datahub.ingestion.source.sql.sql_report import SQLSourceReport
|
|
@@ -7,6 +7,11 @@ from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
|
|
|
7
7
|
from datahub.utilities.lossy_collections import LossyDict, LossyList
|
|
8
8
|
from datahub.utilities.perf_timer import PerfTimer
|
|
9
9
|
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from datahub.ingestion.source.unity.platform_resource_repository import (
|
|
12
|
+
UnityCatalogPlatformResourceRepository,
|
|
13
|
+
)
|
|
14
|
+
|
|
10
15
|
|
|
11
16
|
@dataclass
|
|
12
17
|
class UnityCatalogUsagePerfReport(Report):
|
|
@@ -61,3 +66,6 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
|
|
|
61
66
|
num_tables_missing_name: int = 0
|
|
62
67
|
num_columns_missing_name: int = 0
|
|
63
68
|
num_queries_missing_info: int = 0
|
|
69
|
+
|
|
70
|
+
# Platform resource repository for automatic cache statistics via SupportsAsObj
|
|
71
|
+
tag_urn_resolver_cache: Optional["UnityCatalogPlatformResourceRepository"] = None
|
|
@@ -4,7 +4,6 @@ import time
|
|
|
4
4
|
from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
|
|
5
5
|
from urllib.parse import urljoin
|
|
6
6
|
|
|
7
|
-
from datahub.api.entities.external.external_entities import PlatformResourceRepository
|
|
8
7
|
from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
|
|
9
8
|
from datahub.emitter.mce_builder import (
|
|
10
9
|
UNKNOWN_USER,
|
|
@@ -77,6 +76,9 @@ from datahub.ingestion.source.unity.hive_metastore_proxy import (
|
|
|
77
76
|
HIVE_METASTORE,
|
|
78
77
|
HiveMetastoreProxy,
|
|
79
78
|
)
|
|
79
|
+
from datahub.ingestion.source.unity.platform_resource_repository import (
|
|
80
|
+
UnityCatalogPlatformResourceRepository,
|
|
81
|
+
)
|
|
80
82
|
from datahub.ingestion.source.unity.proxy import UnityCatalogApiProxy
|
|
81
83
|
from datahub.ingestion.source.unity.proxy_types import (
|
|
82
84
|
DATA_TYPE_REGISTRY,
|
|
@@ -187,7 +189,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
187
189
|
platform: str = "databricks"
|
|
188
190
|
platform_instance_name: Optional[str]
|
|
189
191
|
sql_parser_schema_resolver: Optional[SchemaResolver] = None
|
|
190
|
-
platform_resource_repository: Optional[
|
|
192
|
+
platform_resource_repository: Optional[UnityCatalogPlatformResourceRepository] = (
|
|
193
|
+
None
|
|
194
|
+
)
|
|
191
195
|
|
|
192
196
|
def get_report(self) -> UnityCatalogReport:
|
|
193
197
|
return self.report
|
|
@@ -207,6 +211,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
207
211
|
report=self.report,
|
|
208
212
|
hive_metastore_proxy=self.hive_metastore_proxy,
|
|
209
213
|
lineage_data_source=config.lineage_data_source,
|
|
214
|
+
databricks_api_page_size=config.databricks_api_page_size,
|
|
210
215
|
)
|
|
211
216
|
|
|
212
217
|
self.external_url_base = urljoin(self.config.workspace_url, "/explore/data")
|
|
@@ -239,9 +244,15 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
239
244
|
# Global map of tables, for profiling
|
|
240
245
|
self.tables: FileBackedDict[Table] = FileBackedDict()
|
|
241
246
|
if self.ctx.graph:
|
|
242
|
-
self.platform_resource_repository =
|
|
243
|
-
self.ctx.graph
|
|
247
|
+
self.platform_resource_repository = UnityCatalogPlatformResourceRepository(
|
|
248
|
+
self.ctx.graph, platform_instance=self.platform_instance_name
|
|
244
249
|
)
|
|
250
|
+
else:
|
|
251
|
+
self.platform_resource_repository = None
|
|
252
|
+
|
|
253
|
+
# Include platform resource repository in report for automatic cache statistics
|
|
254
|
+
if self.config.include_tags and self.platform_resource_repository:
|
|
255
|
+
self.report.tag_urn_resolver_cache = self.platform_resource_repository
|
|
245
256
|
|
|
246
257
|
def init_hive_metastore_proxy(self):
|
|
247
258
|
self.hive_metastore_proxy: Optional[HiveMetastoreProxy] = None
|
|
@@ -1079,25 +1090,49 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
1079
1090
|
materialized=False, viewLanguage="SQL", viewLogic=table.view_definition
|
|
1080
1091
|
)
|
|
1081
1092
|
|
|
1093
|
+
def get_or_create_from_unity_tag(
|
|
1094
|
+
self,
|
|
1095
|
+
unity_tag: UnityCatalogTag,
|
|
1096
|
+
platform_instance: Optional[str],
|
|
1097
|
+
managed_by_datahub: bool = False,
|
|
1098
|
+
) -> UnityCatalogTagPlatformResource:
|
|
1099
|
+
"""
|
|
1100
|
+
Optimized helper to get or create a Unity Catalog tag platform resource.
|
|
1101
|
+
This eliminates the duplicate search by skipping the from_tag method which was
|
|
1102
|
+
doing a redundant search before get_from_datahub.
|
|
1103
|
+
"""
|
|
1104
|
+
# Create the platform resource ID directly without the from_tag search
|
|
1105
|
+
platform_resource_id = UnityCatalogTagPlatformResourceId(
|
|
1106
|
+
tag_key=unity_tag.key.raw_text,
|
|
1107
|
+
tag_value=unity_tag.value.raw_text if unity_tag.value is not None else None,
|
|
1108
|
+
platform_instance=platform_instance,
|
|
1109
|
+
exists_in_unity_catalog=True, # We got it from Unity Catalog
|
|
1110
|
+
persisted=False,
|
|
1111
|
+
)
|
|
1112
|
+
|
|
1113
|
+
# Use the repository's get_entity_from_datahub method which handles
|
|
1114
|
+
# searching and caching internally - this is the ONLY search we need
|
|
1115
|
+
if self.platform_resource_repository is None:
|
|
1116
|
+
raise ValueError("Platform resource repository not initialized")
|
|
1117
|
+
return self.platform_resource_repository.get_entity_from_datahub(
|
|
1118
|
+
platform_resource_id,
|
|
1119
|
+
managed_by_datahub,
|
|
1120
|
+
)
|
|
1121
|
+
|
|
1082
1122
|
def gen_platform_resources(
|
|
1083
1123
|
self, tags: List[UnityCatalogTag]
|
|
1084
1124
|
) -> Iterable[MetadataWorkUnit]:
|
|
1085
1125
|
if self.ctx.graph and self.platform_resource_repository:
|
|
1086
1126
|
for tag in tags:
|
|
1087
1127
|
try:
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1128
|
+
# Use optimized helper method that combines ID creation and entity retrieval
|
|
1129
|
+
unity_catalog_tag = self.get_or_create_from_unity_tag(
|
|
1130
|
+
tag,
|
|
1131
|
+
self.platform_instance_name,
|
|
1132
|
+
managed_by_datahub=False,
|
|
1092
1133
|
)
|
|
1093
|
-
logger.debug(
|
|
1094
|
-
|
|
1095
|
-
unity_catalog_tag = (
|
|
1096
|
-
UnityCatalogTagPlatformResource.get_from_datahub(
|
|
1097
|
-
platform_resource_id,
|
|
1098
|
-
self.platform_resource_repository,
|
|
1099
|
-
False,
|
|
1100
|
-
)
|
|
1134
|
+
logger.debug(
|
|
1135
|
+
f"Retrieved/created platform resource for tag {tag.key.raw_text}"
|
|
1101
1136
|
)
|
|
1102
1137
|
if (
|
|
1103
1138
|
tag.to_datahub_tag_urn().urn()
|
|
@@ -1,5 +1,10 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import List, Optional
|
|
2
|
+
from typing import TYPE_CHECKING, List, Optional
|
|
3
|
+
|
|
4
|
+
if TYPE_CHECKING:
|
|
5
|
+
from datahub.ingestion.source.unity.platform_resource_repository import (
|
|
6
|
+
UnityCatalogPlatformResourceRepository,
|
|
7
|
+
)
|
|
3
8
|
|
|
4
9
|
from pydantic import BaseModel
|
|
5
10
|
|
|
@@ -7,17 +12,14 @@ from datahub.api.entities.external.external_entities import (
|
|
|
7
12
|
ExternalEntity,
|
|
8
13
|
ExternalEntityId,
|
|
9
14
|
LinkedResourceSet,
|
|
10
|
-
PlatformResourceRepository,
|
|
11
15
|
)
|
|
12
16
|
from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
|
|
13
17
|
from datahub.api.entities.platformresource.platform_resource import (
|
|
14
18
|
PlatformResource,
|
|
15
19
|
PlatformResourceKey,
|
|
16
|
-
PlatformResourceSearchFields,
|
|
17
20
|
)
|
|
18
21
|
from datahub.ingestion.graph.client import DataHubGraph
|
|
19
22
|
from datahub.metadata.urns import TagUrn
|
|
20
|
-
from datahub.utilities.search_utils import ElasticDocumentQuery
|
|
21
23
|
from datahub.utilities.urns.urn import Urn
|
|
22
24
|
|
|
23
25
|
|
|
@@ -29,9 +31,9 @@ class UnityCatalogTagSyncContext(BaseModel):
|
|
|
29
31
|
logger = logging.getLogger(__name__)
|
|
30
32
|
|
|
31
33
|
|
|
32
|
-
class UnityCatalogTagPlatformResourceId(
|
|
34
|
+
class UnityCatalogTagPlatformResourceId(ExternalEntityId):
|
|
33
35
|
"""
|
|
34
|
-
A
|
|
36
|
+
A Unity Catalog tag platform resource ID.
|
|
35
37
|
"""
|
|
36
38
|
|
|
37
39
|
tag_key: str
|
|
@@ -40,9 +42,6 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
|
|
|
40
42
|
exists_in_unity_catalog: bool = False
|
|
41
43
|
persisted: bool = False
|
|
42
44
|
|
|
43
|
-
def __hash__(self) -> int:
|
|
44
|
-
return hash(self.to_platform_resource_key().id)
|
|
45
|
-
|
|
46
45
|
# this is a hack to make sure the property is a string and not private pydantic field
|
|
47
46
|
@staticmethod
|
|
48
47
|
def _RESOURCE_TYPE() -> str:
|
|
@@ -57,26 +56,21 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
|
|
|
57
56
|
)
|
|
58
57
|
|
|
59
58
|
@classmethod
|
|
60
|
-
def
|
|
59
|
+
def get_or_create_from_tag(
|
|
61
60
|
cls,
|
|
62
61
|
tag: UnityCatalogTag,
|
|
63
|
-
|
|
64
|
-
platform_resource_repository: PlatformResourceRepository,
|
|
62
|
+
platform_resource_repository: "UnityCatalogPlatformResourceRepository",
|
|
65
63
|
exists_in_unity_catalog: bool = False,
|
|
66
64
|
) -> "UnityCatalogTagPlatformResourceId":
|
|
67
65
|
"""
|
|
68
66
|
Creates a UnityCatalogTagPlatformResourceId from a UnityCatalogTag.
|
|
69
67
|
"""
|
|
70
68
|
|
|
71
|
-
existing_platform_resource =
|
|
72
|
-
tag.to_datahub_tag_urn().urn()
|
|
73
|
-
platform_resource_repository=platform_resource_repository,
|
|
74
|
-
tag_sync_context=UnityCatalogTagSyncContext(
|
|
75
|
-
platform_instance=platform_instance
|
|
76
|
-
),
|
|
69
|
+
existing_platform_resource = platform_resource_repository.search_entity_by_urn(
|
|
70
|
+
tag.to_datahub_tag_urn().urn()
|
|
77
71
|
)
|
|
78
72
|
if existing_platform_resource:
|
|
79
|
-
logger.
|
|
73
|
+
logger.debug(
|
|
80
74
|
f"Found existing UnityCatalogTagPlatformResourceId for tag {tag.key.raw_text}: {existing_platform_resource}"
|
|
81
75
|
)
|
|
82
76
|
return existing_platform_resource
|
|
@@ -84,105 +78,46 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
|
|
|
84
78
|
return UnityCatalogTagPlatformResourceId(
|
|
85
79
|
tag_key=tag.key.raw_text,
|
|
86
80
|
tag_value=tag.value.raw_text if tag.value is not None else None,
|
|
87
|
-
platform_instance=platform_instance,
|
|
81
|
+
platform_instance=platform_resource_repository.platform_instance,
|
|
88
82
|
exists_in_unity_catalog=exists_in_unity_catalog,
|
|
89
83
|
persisted=False,
|
|
90
84
|
)
|
|
91
85
|
|
|
92
|
-
@classmethod
|
|
93
|
-
def search_by_urn(
|
|
94
|
-
cls,
|
|
95
|
-
urn: str,
|
|
96
|
-
platform_resource_repository: PlatformResourceRepository,
|
|
97
|
-
tag_sync_context: UnityCatalogTagSyncContext,
|
|
98
|
-
) -> Optional["UnityCatalogTagPlatformResourceId"]:
|
|
99
|
-
mapped_tags = [
|
|
100
|
-
t
|
|
101
|
-
for t in platform_resource_repository.search_by_filter(
|
|
102
|
-
ElasticDocumentQuery.create_from(
|
|
103
|
-
(
|
|
104
|
-
PlatformResourceSearchFields.RESOURCE_TYPE,
|
|
105
|
-
str(UnityCatalogTagPlatformResourceId._RESOURCE_TYPE()),
|
|
106
|
-
),
|
|
107
|
-
(PlatformResourceSearchFields.SECONDARY_KEYS, urn),
|
|
108
|
-
)
|
|
109
|
-
)
|
|
110
|
-
]
|
|
111
|
-
logger.info(
|
|
112
|
-
f"Found {len(mapped_tags)} mapped tags for URN {urn}. {mapped_tags}"
|
|
113
|
-
)
|
|
114
|
-
if len(mapped_tags) > 0:
|
|
115
|
-
for platform_resource in mapped_tags:
|
|
116
|
-
if (
|
|
117
|
-
platform_resource.resource_info
|
|
118
|
-
and platform_resource.resource_info.value
|
|
119
|
-
):
|
|
120
|
-
unity_catalog_tag = UnityCatalogTagPlatformResource(
|
|
121
|
-
**platform_resource.resource_info.value.as_pydantic_object(
|
|
122
|
-
UnityCatalogTagPlatformResource
|
|
123
|
-
).dict()
|
|
124
|
-
)
|
|
125
|
-
if (
|
|
126
|
-
unity_catalog_tag.id.platform_instance
|
|
127
|
-
== tag_sync_context.platform_instance
|
|
128
|
-
):
|
|
129
|
-
unity_catalog_tag_id = unity_catalog_tag.id
|
|
130
|
-
unity_catalog_tag_id.exists_in_unity_catalog = True
|
|
131
|
-
unity_catalog_tag_id.persisted = True
|
|
132
|
-
return unity_catalog_tag_id
|
|
133
|
-
else:
|
|
134
|
-
logger.warning(
|
|
135
|
-
f"Platform resource {platform_resource} does not have a resource_info value"
|
|
136
|
-
)
|
|
137
|
-
continue
|
|
138
|
-
|
|
139
|
-
# If we reach here, it means we did not find a mapped tag for the URN
|
|
140
|
-
logger.info(
|
|
141
|
-
f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new UnityCatalogTagPlatformResourceId."
|
|
142
|
-
)
|
|
143
|
-
return None
|
|
144
|
-
|
|
145
86
|
@classmethod
|
|
146
87
|
def from_datahub_urn(
|
|
147
88
|
cls,
|
|
148
89
|
urn: str,
|
|
149
|
-
platform_resource_repository: PlatformResourceRepository,
|
|
150
90
|
tag_sync_context: UnityCatalogTagSyncContext,
|
|
91
|
+
platform_resource_repository: "UnityCatalogPlatformResourceRepository",
|
|
151
92
|
graph: DataHubGraph,
|
|
152
93
|
) -> "UnityCatalogTagPlatformResourceId":
|
|
153
94
|
"""
|
|
154
95
|
Creates a UnityCatalogTagPlatformResourceId from a DataHub URN.
|
|
155
96
|
"""
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
# If we do, we can use it to create the UnityCatalogTagPlatformResourceId
|
|
159
|
-
# Else, we need to generate a new UnityCatalogTagPlatformResourceId
|
|
160
|
-
existing_platform_resource_id = cls.search_by_urn(
|
|
161
|
-
urn, platform_resource_repository, tag_sync_context
|
|
97
|
+
existing_platform_resource_id = (
|
|
98
|
+
platform_resource_repository.search_entity_by_urn(urn)
|
|
162
99
|
)
|
|
163
100
|
if existing_platform_resource_id:
|
|
164
|
-
logger.info(
|
|
165
|
-
f"Found existing UnityCatalogTagPlatformResourceId for URN {urn}: {existing_platform_resource_id}"
|
|
166
|
-
)
|
|
167
101
|
return existing_platform_resource_id
|
|
168
102
|
|
|
169
|
-
# Otherwise, we need to create a new UnityCatalogTagPlatformResourceId
|
|
170
103
|
new_unity_catalog_tag_id = cls.generate_tag_id(graph, tag_sync_context, urn)
|
|
171
104
|
if new_unity_catalog_tag_id:
|
|
172
|
-
# we then check if this tag has already been ingested as a platform
|
|
173
|
-
# resource in the platform resource repository
|
|
174
105
|
resource_key = platform_resource_repository.get(
|
|
175
106
|
new_unity_catalog_tag_id.to_platform_resource_key()
|
|
176
107
|
)
|
|
177
108
|
if resource_key:
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
109
|
+
# Create a new ID with the correct state instead of mutating
|
|
110
|
+
return UnityCatalogTagPlatformResourceId(
|
|
111
|
+
tag_key=new_unity_catalog_tag_id.tag_key,
|
|
112
|
+
tag_value=new_unity_catalog_tag_id.tag_value,
|
|
113
|
+
platform_instance=new_unity_catalog_tag_id.platform_instance,
|
|
114
|
+
exists_in_unity_catalog=True, # This tag exists in Unity Catalog
|
|
115
|
+
persisted=new_unity_catalog_tag_id.persisted,
|
|
183
116
|
)
|
|
184
117
|
return new_unity_catalog_tag_id
|
|
185
|
-
raise ValueError(
|
|
118
|
+
raise ValueError(
|
|
119
|
+
f"Unable to create Unity Catalog tag ID from DataHub URN: {urn}"
|
|
120
|
+
)
|
|
186
121
|
|
|
187
122
|
@classmethod
|
|
188
123
|
def generate_tag_id(
|
|
@@ -191,14 +126,11 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
|
|
|
191
126
|
parsed_urn = Urn.from_string(urn)
|
|
192
127
|
entity_type = parsed_urn.entity_type
|
|
193
128
|
if entity_type == "tag":
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
TagUrn.from_string(urn), tag_sync_context
|
|
197
|
-
)
|
|
129
|
+
return UnityCatalogTagPlatformResourceId.from_datahub_tag(
|
|
130
|
+
TagUrn.from_string(urn), tag_sync_context
|
|
198
131
|
)
|
|
199
132
|
else:
|
|
200
133
|
raise ValueError(f"Unsupported entity type {entity_type} for URN {urn}")
|
|
201
|
-
return new_unity_catalog_tag_id
|
|
202
134
|
|
|
203
135
|
@classmethod
|
|
204
136
|
def from_datahub_tag(
|
|
@@ -214,7 +146,7 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
|
|
|
214
146
|
)
|
|
215
147
|
|
|
216
148
|
|
|
217
|
-
class UnityCatalogTagPlatformResource(
|
|
149
|
+
class UnityCatalogTagPlatformResource(ExternalEntity):
|
|
218
150
|
datahub_urns: LinkedResourceSet
|
|
219
151
|
managed_by_datahub: bool
|
|
220
152
|
id: UnityCatalogTagPlatformResourceId
|
|
@@ -237,58 +169,28 @@ class UnityCatalogTagPlatformResource(BaseModel, ExternalEntity):
|
|
|
237
169
|
)
|
|
238
170
|
|
|
239
171
|
@classmethod
|
|
240
|
-
def
|
|
172
|
+
def create_default(
|
|
241
173
|
cls,
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
managed_by_datahub: bool = False,
|
|
174
|
+
entity_id: ExternalEntityId,
|
|
175
|
+
managed_by_datahub: bool,
|
|
245
176
|
) -> "UnityCatalogTagPlatformResource":
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
]
|
|
262
|
-
if len(platform_resources) == 1:
|
|
263
|
-
platform_resource: PlatformResource = platform_resources[0]
|
|
264
|
-
if (
|
|
265
|
-
platform_resource.resource_info
|
|
266
|
-
and platform_resource.resource_info.value
|
|
267
|
-
):
|
|
268
|
-
unity_catalog_tag = UnityCatalogTagPlatformResource(
|
|
269
|
-
**platform_resource.resource_info.value.as_pydantic_object(
|
|
270
|
-
UnityCatalogTagPlatformResource
|
|
271
|
-
).dict()
|
|
272
|
-
)
|
|
273
|
-
return unity_catalog_tag
|
|
274
|
-
else:
|
|
275
|
-
for platform_resource in platform_resources:
|
|
276
|
-
if (
|
|
277
|
-
platform_resource.resource_info
|
|
278
|
-
and platform_resource.resource_info.value
|
|
279
|
-
):
|
|
280
|
-
unity_catalog_tag = UnityCatalogTagPlatformResource(
|
|
281
|
-
**platform_resource.resource_info.value.as_pydantic_object(
|
|
282
|
-
UnityCatalogTagPlatformResource
|
|
283
|
-
).dict()
|
|
284
|
-
)
|
|
285
|
-
if (
|
|
286
|
-
unity_catalog_tag.id.platform_instance
|
|
287
|
-
== unity_catalog_tag_id.platform_instance
|
|
288
|
-
):
|
|
289
|
-
return unity_catalog_tag
|
|
177
|
+
"""Create a default Unity Catalog tag entity when none found in DataHub."""
|
|
178
|
+
# Type narrowing: we know this will be a UnityCatalogTagPlatformResourceId
|
|
179
|
+
assert isinstance(entity_id, UnityCatalogTagPlatformResourceId), (
|
|
180
|
+
f"Expected UnityCatalogTagPlatformResourceId, got {type(entity_id)}"
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
# Create a new entity ID with correct default state instead of mutating
|
|
184
|
+
default_entity_id = UnityCatalogTagPlatformResourceId(
|
|
185
|
+
tag_key=entity_id.tag_key,
|
|
186
|
+
tag_value=entity_id.tag_value,
|
|
187
|
+
platform_instance=entity_id.platform_instance,
|
|
188
|
+
exists_in_unity_catalog=False, # New entities don't exist in Unity Catalog yet
|
|
189
|
+
persisted=False, # New entities are not persisted yet
|
|
190
|
+
)
|
|
191
|
+
|
|
290
192
|
return cls(
|
|
291
|
-
id=
|
|
193
|
+
id=default_entity_id,
|
|
292
194
|
datahub_urns=LinkedResourceSet(urns=[]),
|
|
293
195
|
managed_by_datahub=managed_by_datahub,
|
|
294
196
|
allowed_values=None,
|