acryl-datahub 1.2.0.4rc4__py3-none-any.whl → 1.2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. See the package registry's advisory page for more details.

Files changed (41)
  1. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5.dist-info}/METADATA +2509 -2509
  2. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5.dist-info}/RECORD +41 -39
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +1 -1
  5. datahub/api/entities/external/external_entities.py +500 -15
  6. datahub/ingestion/source/aws/glue.py +18 -14
  7. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  8. datahub/ingestion/source/aws/tag_entities.py +82 -104
  9. datahub/ingestion/source/common/subtypes.py +1 -0
  10. datahub/ingestion/source/hex/api.py +2 -0
  11. datahub/ingestion/source/hex/mapper.py +16 -2
  12. datahub/ingestion/source/hex/model.py +2 -0
  13. datahub/ingestion/source/looker/looker_common.py +26 -0
  14. datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -1
  15. datahub/ingestion/source/snowflake/constants.py +1 -0
  16. datahub/ingestion/source/snowflake/snowflake_config.py +8 -1
  17. datahub/ingestion/source/snowflake/snowflake_queries.py +49 -6
  18. datahub/ingestion/source/snowflake/snowflake_query.py +50 -5
  19. datahub/ingestion/source/snowflake/snowflake_schema.py +173 -9
  20. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -2
  21. datahub/ingestion/source/snowflake/snowflake_utils.py +16 -3
  22. datahub/ingestion/source/snowflake/snowflake_v2.py +4 -1
  23. datahub/ingestion/source/sql/mssql/source.py +2 -25
  24. datahub/ingestion/source/sql/mysql.py +54 -0
  25. datahub/ingestion/source/sql/postgres.py +5 -134
  26. datahub/ingestion/source/sql/sql_common.py +137 -0
  27. datahub/ingestion/source/superset.py +140 -56
  28. datahub/ingestion/source/unity/config.py +11 -0
  29. datahub/ingestion/source/unity/connection_test.py +1 -0
  30. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  31. datahub/ingestion/source/unity/proxy.py +20 -6
  32. datahub/ingestion/source/unity/report.py +9 -1
  33. datahub/ingestion/source/unity/source.py +51 -16
  34. datahub/ingestion/source/unity/tag_entities.py +49 -147
  35. datahub/metadata/_internal_schema_classes.py +1 -1
  36. datahub/metadata/schema.avsc +4 -2
  37. datahub/metadata/schemas/Operation.avsc +4 -2
  38. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5.dist-info}/WHEEL +0 -0
  39. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5.dist-info}/entry_points.txt +0 -0
  40. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5.dist-info}/licenses/LICENSE +0 -0
  41. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5.dist-info}/top_level.txt +0 -0
@@ -141,6 +141,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
141
141
  report: UnityCatalogReport,
142
142
  hive_metastore_proxy: Optional[HiveMetastoreProxy] = None,
143
143
  lineage_data_source: LineageDataSource = LineageDataSource.AUTO,
144
+ databricks_api_page_size: int = 0,
144
145
  ):
145
146
  self._workspace_client = WorkspaceClient(
146
147
  host=workspace_url,
@@ -152,6 +153,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
152
153
  self.report = report
153
154
  self.hive_metastore_proxy = hive_metastore_proxy
154
155
  self.lineage_data_source = lineage_data_source
156
+ self.databricks_api_page_size = databricks_api_page_size
155
157
  self._sql_connection_params = {
156
158
  "server_hostname": self._workspace_client.config.host.replace(
157
159
  "https://", ""
@@ -161,7 +163,11 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
161
163
  }
162
164
 
163
165
  def check_basic_connectivity(self) -> bool:
164
- return bool(self._workspace_client.catalogs.list(include_browse=True))
166
+ return bool(
167
+ self._workspace_client.catalogs.list(
168
+ include_browse=True, max_results=self.databricks_api_page_size
169
+ )
170
+ )
165
171
 
166
172
  def assigned_metastore(self) -> Optional[Metastore]:
167
173
  response = self._workspace_client.metastores.summary()
@@ -171,7 +177,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
171
177
  if self.hive_metastore_proxy:
172
178
  yield self.hive_metastore_proxy.hive_metastore_catalog(metastore)
173
179
 
174
- response = self._workspace_client.catalogs.list(include_browse=True)
180
+ response = self._workspace_client.catalogs.list(
181
+ include_browse=True, max_results=self.databricks_api_page_size
182
+ )
175
183
  if not response:
176
184
  logger.info("Catalogs not found")
177
185
  return
@@ -203,7 +211,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
203
211
  yield from self.hive_metastore_proxy.hive_metastore_schemas(catalog)
204
212
  return
205
213
  response = self._workspace_client.schemas.list(
206
- catalog_name=catalog.name, include_browse=True
214
+ catalog_name=catalog.name,
215
+ include_browse=True,
216
+ max_results=self.databricks_api_page_size,
207
217
  )
208
218
  if not response:
209
219
  logger.info(f"Schemas not found for catalog {catalog.id}")
@@ -225,6 +235,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
225
235
  catalog_name=schema.catalog.name,
226
236
  schema_name=schema.name,
227
237
  include_browse=True,
238
+ max_results=self.databricks_api_page_size,
228
239
  )
229
240
  if not response:
230
241
  logger.info(f"Tables not found for schema {schema.id}")
@@ -257,7 +268,10 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
257
268
  return group_list
258
269
 
259
270
  def workspace_notebooks(self) -> Iterable[Notebook]:
260
- for obj in self._workspace_client.workspace.list("/", recursive=True):
271
+ workspace_objects_iter = self._workspace_client.workspace.list(
272
+ "/", recursive=True, max_results=self.databricks_api_page_size
273
+ )
274
+ for obj in workspace_objects_iter:
261
275
  if obj.object_type == ObjectType.NOTEBOOK and obj.object_id and obj.path:
262
276
  yield Notebook(
263
277
  id=obj.object_id,
@@ -299,7 +313,6 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
299
313
  def _query_history(
300
314
  self,
301
315
  filter_by: QueryFilterWithStatementTypes,
302
- max_results: int = 1000,
303
316
  include_metrics: bool = False,
304
317
  ) -> Iterable[QueryInfo]:
305
318
  """Manual implementation of the query_history.list() endpoint.
@@ -311,9 +324,10 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
311
324
  """
312
325
  method = "GET"
313
326
  path = "/api/2.0/sql/history/queries"
327
+
314
328
  body: Dict[str, Any] = {
315
329
  "include_metrics": include_metrics,
316
- "max_results": max_results, # Max batch size
330
+ "max_results": self.databricks_api_page_size, # Max batch size
317
331
  }
318
332
 
319
333
  response: dict = self._workspace_client.api_client.do( # type: ignore
@@ -1,5 +1,5 @@
1
1
  from dataclasses import dataclass, field
2
- from typing import Optional, Tuple
2
+ from typing import TYPE_CHECKING, Optional, Tuple
3
3
 
4
4
  from datahub.ingestion.api.report import EntityFilterReport, Report
5
5
  from datahub.ingestion.source.sql.sql_report import SQLSourceReport
@@ -7,6 +7,11 @@ from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
7
7
  from datahub.utilities.lossy_collections import LossyDict, LossyList
8
8
  from datahub.utilities.perf_timer import PerfTimer
9
9
 
10
+ if TYPE_CHECKING:
11
+ from datahub.ingestion.source.unity.platform_resource_repository import (
12
+ UnityCatalogPlatformResourceRepository,
13
+ )
14
+
10
15
 
11
16
  @dataclass
12
17
  class UnityCatalogUsagePerfReport(Report):
@@ -61,3 +66,6 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
61
66
  num_tables_missing_name: int = 0
62
67
  num_columns_missing_name: int = 0
63
68
  num_queries_missing_info: int = 0
69
+
70
+ # Platform resource repository for automatic cache statistics via SupportsAsObj
71
+ tag_urn_resolver_cache: Optional["UnityCatalogPlatformResourceRepository"] = None
@@ -4,7 +4,6 @@ import time
4
4
  from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
5
5
  from urllib.parse import urljoin
6
6
 
7
- from datahub.api.entities.external.external_entities import PlatformResourceRepository
8
7
  from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
9
8
  from datahub.emitter.mce_builder import (
10
9
  UNKNOWN_USER,
@@ -77,6 +76,9 @@ from datahub.ingestion.source.unity.hive_metastore_proxy import (
77
76
  HIVE_METASTORE,
78
77
  HiveMetastoreProxy,
79
78
  )
79
+ from datahub.ingestion.source.unity.platform_resource_repository import (
80
+ UnityCatalogPlatformResourceRepository,
81
+ )
80
82
  from datahub.ingestion.source.unity.proxy import UnityCatalogApiProxy
81
83
  from datahub.ingestion.source.unity.proxy_types import (
82
84
  DATA_TYPE_REGISTRY,
@@ -187,7 +189,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
187
189
  platform: str = "databricks"
188
190
  platform_instance_name: Optional[str]
189
191
  sql_parser_schema_resolver: Optional[SchemaResolver] = None
190
- platform_resource_repository: Optional[PlatformResourceRepository] = None
192
+ platform_resource_repository: Optional[UnityCatalogPlatformResourceRepository] = (
193
+ None
194
+ )
191
195
 
192
196
  def get_report(self) -> UnityCatalogReport:
193
197
  return self.report
@@ -207,6 +211,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
207
211
  report=self.report,
208
212
  hive_metastore_proxy=self.hive_metastore_proxy,
209
213
  lineage_data_source=config.lineage_data_source,
214
+ databricks_api_page_size=config.databricks_api_page_size,
210
215
  )
211
216
 
212
217
  self.external_url_base = urljoin(self.config.workspace_url, "/explore/data")
@@ -239,9 +244,15 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
239
244
  # Global map of tables, for profiling
240
245
  self.tables: FileBackedDict[Table] = FileBackedDict()
241
246
  if self.ctx.graph:
242
- self.platform_resource_repository = PlatformResourceRepository(
243
- self.ctx.graph
247
+ self.platform_resource_repository = UnityCatalogPlatformResourceRepository(
248
+ self.ctx.graph, platform_instance=self.platform_instance_name
244
249
  )
250
+ else:
251
+ self.platform_resource_repository = None
252
+
253
+ # Include platform resource repository in report for automatic cache statistics
254
+ if self.config.include_tags and self.platform_resource_repository:
255
+ self.report.tag_urn_resolver_cache = self.platform_resource_repository
245
256
 
246
257
  def init_hive_metastore_proxy(self):
247
258
  self.hive_metastore_proxy: Optional[HiveMetastoreProxy] = None
@@ -1079,25 +1090,49 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
1079
1090
  materialized=False, viewLanguage="SQL", viewLogic=table.view_definition
1080
1091
  )
1081
1092
 
1093
+ def get_or_create_from_unity_tag(
1094
+ self,
1095
+ unity_tag: UnityCatalogTag,
1096
+ platform_instance: Optional[str],
1097
+ managed_by_datahub: bool = False,
1098
+ ) -> UnityCatalogTagPlatformResource:
1099
+ """
1100
+ Optimized helper to get or create a Unity Catalog tag platform resource.
1101
+ This eliminates the duplicate search by skipping the from_tag method which was
1102
+ doing a redundant search before get_from_datahub.
1103
+ """
1104
+ # Create the platform resource ID directly without the from_tag search
1105
+ platform_resource_id = UnityCatalogTagPlatformResourceId(
1106
+ tag_key=unity_tag.key.raw_text,
1107
+ tag_value=unity_tag.value.raw_text if unity_tag.value is not None else None,
1108
+ platform_instance=platform_instance,
1109
+ exists_in_unity_catalog=True, # We got it from Unity Catalog
1110
+ persisted=False,
1111
+ )
1112
+
1113
+ # Use the repository's get_entity_from_datahub method which handles
1114
+ # searching and caching internally - this is the ONLY search we need
1115
+ if self.platform_resource_repository is None:
1116
+ raise ValueError("Platform resource repository not initialized")
1117
+ return self.platform_resource_repository.get_entity_from_datahub(
1118
+ platform_resource_id,
1119
+ managed_by_datahub,
1120
+ )
1121
+
1082
1122
  def gen_platform_resources(
1083
1123
  self, tags: List[UnityCatalogTag]
1084
1124
  ) -> Iterable[MetadataWorkUnit]:
1085
1125
  if self.ctx.graph and self.platform_resource_repository:
1086
1126
  for tag in tags:
1087
1127
  try:
1088
- platform_resource_id = UnityCatalogTagPlatformResourceId.from_tag(
1089
- platform_instance=self.platform_instance_name,
1090
- platform_resource_repository=self.platform_resource_repository,
1091
- tag=tag,
1128
+ # Use optimized helper method that combines ID creation and entity retrieval
1129
+ unity_catalog_tag = self.get_or_create_from_unity_tag(
1130
+ tag,
1131
+ self.platform_instance_name,
1132
+ managed_by_datahub=False,
1092
1133
  )
1093
- logger.debug(f"Created platform resource {platform_resource_id}")
1094
-
1095
- unity_catalog_tag = (
1096
- UnityCatalogTagPlatformResource.get_from_datahub(
1097
- platform_resource_id,
1098
- self.platform_resource_repository,
1099
- False,
1100
- )
1134
+ logger.debug(
1135
+ f"Retrieved/created platform resource for tag {tag.key.raw_text}"
1101
1136
  )
1102
1137
  if (
1103
1138
  tag.to_datahub_tag_urn().urn()
@@ -1,5 +1,10 @@
1
1
  import logging
2
- from typing import List, Optional
2
+ from typing import TYPE_CHECKING, List, Optional
3
+
4
+ if TYPE_CHECKING:
5
+ from datahub.ingestion.source.unity.platform_resource_repository import (
6
+ UnityCatalogPlatformResourceRepository,
7
+ )
3
8
 
4
9
  from pydantic import BaseModel
5
10
 
@@ -7,17 +12,14 @@ from datahub.api.entities.external.external_entities import (
7
12
  ExternalEntity,
8
13
  ExternalEntityId,
9
14
  LinkedResourceSet,
10
- PlatformResourceRepository,
11
15
  )
12
16
  from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
13
17
  from datahub.api.entities.platformresource.platform_resource import (
14
18
  PlatformResource,
15
19
  PlatformResourceKey,
16
- PlatformResourceSearchFields,
17
20
  )
18
21
  from datahub.ingestion.graph.client import DataHubGraph
19
22
  from datahub.metadata.urns import TagUrn
20
- from datahub.utilities.search_utils import ElasticDocumentQuery
21
23
  from datahub.utilities.urns.urn import Urn
22
24
 
23
25
 
@@ -29,9 +31,9 @@ class UnityCatalogTagSyncContext(BaseModel):
29
31
  logger = logging.getLogger(__name__)
30
32
 
31
33
 
32
- class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
34
+ class UnityCatalogTagPlatformResourceId(ExternalEntityId):
33
35
  """
34
- A SnowflakeTagId is a unique identifier for a Snowflake tag.
36
+ A Unity Catalog tag platform resource ID.
35
37
  """
36
38
 
37
39
  tag_key: str
@@ -40,9 +42,6 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
40
42
  exists_in_unity_catalog: bool = False
41
43
  persisted: bool = False
42
44
 
43
- def __hash__(self) -> int:
44
- return hash(self.to_platform_resource_key().id)
45
-
46
45
  # this is a hack to make sure the property is a string and not private pydantic field
47
46
  @staticmethod
48
47
  def _RESOURCE_TYPE() -> str:
@@ -57,26 +56,21 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
57
56
  )
58
57
 
59
58
  @classmethod
60
- def from_tag(
59
+ def get_or_create_from_tag(
61
60
  cls,
62
61
  tag: UnityCatalogTag,
63
- platform_instance: Optional[str],
64
- platform_resource_repository: PlatformResourceRepository,
62
+ platform_resource_repository: "UnityCatalogPlatformResourceRepository",
65
63
  exists_in_unity_catalog: bool = False,
66
64
  ) -> "UnityCatalogTagPlatformResourceId":
67
65
  """
68
66
  Creates a UnityCatalogTagPlatformResourceId from a UnityCatalogTag.
69
67
  """
70
68
 
71
- existing_platform_resource = cls.search_by_urn(
72
- tag.to_datahub_tag_urn().urn(),
73
- platform_resource_repository=platform_resource_repository,
74
- tag_sync_context=UnityCatalogTagSyncContext(
75
- platform_instance=platform_instance
76
- ),
69
+ existing_platform_resource = platform_resource_repository.search_entity_by_urn(
70
+ tag.to_datahub_tag_urn().urn()
77
71
  )
78
72
  if existing_platform_resource:
79
- logger.info(
73
+ logger.debug(
80
74
  f"Found existing UnityCatalogTagPlatformResourceId for tag {tag.key.raw_text}: {existing_platform_resource}"
81
75
  )
82
76
  return existing_platform_resource
@@ -84,105 +78,46 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
84
78
  return UnityCatalogTagPlatformResourceId(
85
79
  tag_key=tag.key.raw_text,
86
80
  tag_value=tag.value.raw_text if tag.value is not None else None,
87
- platform_instance=platform_instance,
81
+ platform_instance=platform_resource_repository.platform_instance,
88
82
  exists_in_unity_catalog=exists_in_unity_catalog,
89
83
  persisted=False,
90
84
  )
91
85
 
92
- @classmethod
93
- def search_by_urn(
94
- cls,
95
- urn: str,
96
- platform_resource_repository: PlatformResourceRepository,
97
- tag_sync_context: UnityCatalogTagSyncContext,
98
- ) -> Optional["UnityCatalogTagPlatformResourceId"]:
99
- mapped_tags = [
100
- t
101
- for t in platform_resource_repository.search_by_filter(
102
- ElasticDocumentQuery.create_from(
103
- (
104
- PlatformResourceSearchFields.RESOURCE_TYPE,
105
- str(UnityCatalogTagPlatformResourceId._RESOURCE_TYPE()),
106
- ),
107
- (PlatformResourceSearchFields.SECONDARY_KEYS, urn),
108
- )
109
- )
110
- ]
111
- logger.info(
112
- f"Found {len(mapped_tags)} mapped tags for URN {urn}. {mapped_tags}"
113
- )
114
- if len(mapped_tags) > 0:
115
- for platform_resource in mapped_tags:
116
- if (
117
- platform_resource.resource_info
118
- and platform_resource.resource_info.value
119
- ):
120
- unity_catalog_tag = UnityCatalogTagPlatformResource(
121
- **platform_resource.resource_info.value.as_pydantic_object(
122
- UnityCatalogTagPlatformResource
123
- ).dict()
124
- )
125
- if (
126
- unity_catalog_tag.id.platform_instance
127
- == tag_sync_context.platform_instance
128
- ):
129
- unity_catalog_tag_id = unity_catalog_tag.id
130
- unity_catalog_tag_id.exists_in_unity_catalog = True
131
- unity_catalog_tag_id.persisted = True
132
- return unity_catalog_tag_id
133
- else:
134
- logger.warning(
135
- f"Platform resource {platform_resource} does not have a resource_info value"
136
- )
137
- continue
138
-
139
- # If we reach here, it means we did not find a mapped tag for the URN
140
- logger.info(
141
- f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new UnityCatalogTagPlatformResourceId."
142
- )
143
- return None
144
-
145
86
  @classmethod
146
87
  def from_datahub_urn(
147
88
  cls,
148
89
  urn: str,
149
- platform_resource_repository: PlatformResourceRepository,
150
90
  tag_sync_context: UnityCatalogTagSyncContext,
91
+ platform_resource_repository: "UnityCatalogPlatformResourceRepository",
151
92
  graph: DataHubGraph,
152
93
  ) -> "UnityCatalogTagPlatformResourceId":
153
94
  """
154
95
  Creates a UnityCatalogTagPlatformResourceId from a DataHub URN.
155
96
  """
156
- # First we check if we already have a mapped platform resource for this
157
- # urn that is of the type UnityCatalogTagPlatformResource
158
- # If we do, we can use it to create the UnityCatalogTagPlatformResourceId
159
- # Else, we need to generate a new UnityCatalogTagPlatformResourceId
160
- existing_platform_resource_id = cls.search_by_urn(
161
- urn, platform_resource_repository, tag_sync_context
97
+ existing_platform_resource_id = (
98
+ platform_resource_repository.search_entity_by_urn(urn)
162
99
  )
163
100
  if existing_platform_resource_id:
164
- logger.info(
165
- f"Found existing UnityCatalogTagPlatformResourceId for URN {urn}: {existing_platform_resource_id}"
166
- )
167
101
  return existing_platform_resource_id
168
102
 
169
- # Otherwise, we need to create a new UnityCatalogTagPlatformResourceId
170
103
  new_unity_catalog_tag_id = cls.generate_tag_id(graph, tag_sync_context, urn)
171
104
  if new_unity_catalog_tag_id:
172
- # we then check if this tag has already been ingested as a platform
173
- # resource in the platform resource repository
174
105
  resource_key = platform_resource_repository.get(
175
106
  new_unity_catalog_tag_id.to_platform_resource_key()
176
107
  )
177
108
  if resource_key:
178
- logger.info(
179
- f"Tag {new_unity_catalog_tag_id} already exists in platform resource repository with {resource_key}"
180
- )
181
- new_unity_catalog_tag_id.exists_in_unity_catalog = (
182
- True # TODO: Check if this is a safe assumption
109
+ # Create a new ID with the correct state instead of mutating
110
+ return UnityCatalogTagPlatformResourceId(
111
+ tag_key=new_unity_catalog_tag_id.tag_key,
112
+ tag_value=new_unity_catalog_tag_id.tag_value,
113
+ platform_instance=new_unity_catalog_tag_id.platform_instance,
114
+ exists_in_unity_catalog=True, # This tag exists in Unity Catalog
115
+ persisted=new_unity_catalog_tag_id.persisted,
183
116
  )
184
117
  return new_unity_catalog_tag_id
185
- raise ValueError(f"Unable to create SnowflakeTagId from DataHub URN: {urn}")
118
+ raise ValueError(
119
+ f"Unable to create Unity Catalog tag ID from DataHub URN: {urn}"
120
+ )
186
121
 
187
122
  @classmethod
188
123
  def generate_tag_id(
@@ -191,14 +126,11 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
191
126
  parsed_urn = Urn.from_string(urn)
192
127
  entity_type = parsed_urn.entity_type
193
128
  if entity_type == "tag":
194
- new_unity_catalog_tag_id = (
195
- UnityCatalogTagPlatformResourceId.from_datahub_tag(
196
- TagUrn.from_string(urn), tag_sync_context
197
- )
129
+ return UnityCatalogTagPlatformResourceId.from_datahub_tag(
130
+ TagUrn.from_string(urn), tag_sync_context
198
131
  )
199
132
  else:
200
133
  raise ValueError(f"Unsupported entity type {entity_type} for URN {urn}")
201
- return new_unity_catalog_tag_id
202
134
 
203
135
  @classmethod
204
136
  def from_datahub_tag(
@@ -214,7 +146,7 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
214
146
  )
215
147
 
216
148
 
217
- class UnityCatalogTagPlatformResource(BaseModel, ExternalEntity):
149
+ class UnityCatalogTagPlatformResource(ExternalEntity):
218
150
  datahub_urns: LinkedResourceSet
219
151
  managed_by_datahub: bool
220
152
  id: UnityCatalogTagPlatformResourceId
@@ -237,58 +169,28 @@ class UnityCatalogTagPlatformResource(BaseModel, ExternalEntity):
237
169
  )
238
170
 
239
171
  @classmethod
240
- def get_from_datahub(
172
+ def create_default(
241
173
  cls,
242
- unity_catalog_tag_id: UnityCatalogTagPlatformResourceId,
243
- platform_resource_repository: PlatformResourceRepository,
244
- managed_by_datahub: bool = False,
174
+ entity_id: ExternalEntityId,
175
+ managed_by_datahub: bool,
245
176
  ) -> "UnityCatalogTagPlatformResource":
246
- # Search for linked DataHub URNs
247
- platform_resources = [
248
- r
249
- for r in platform_resource_repository.search_by_filter(
250
- ElasticDocumentQuery.create_from(
251
- (
252
- PlatformResourceSearchFields.RESOURCE_TYPE,
253
- str(UnityCatalogTagPlatformResourceId._RESOURCE_TYPE()),
254
- ),
255
- (
256
- PlatformResourceSearchFields.PRIMARY_KEY,
257
- f"{unity_catalog_tag_id.tag_key}/{unity_catalog_tag_id.tag_value}",
258
- ),
259
- )
260
- )
261
- ]
262
- if len(platform_resources) == 1:
263
- platform_resource: PlatformResource = platform_resources[0]
264
- if (
265
- platform_resource.resource_info
266
- and platform_resource.resource_info.value
267
- ):
268
- unity_catalog_tag = UnityCatalogTagPlatformResource(
269
- **platform_resource.resource_info.value.as_pydantic_object(
270
- UnityCatalogTagPlatformResource
271
- ).dict()
272
- )
273
- return unity_catalog_tag
274
- else:
275
- for platform_resource in platform_resources:
276
- if (
277
- platform_resource.resource_info
278
- and platform_resource.resource_info.value
279
- ):
280
- unity_catalog_tag = UnityCatalogTagPlatformResource(
281
- **platform_resource.resource_info.value.as_pydantic_object(
282
- UnityCatalogTagPlatformResource
283
- ).dict()
284
- )
285
- if (
286
- unity_catalog_tag.id.platform_instance
287
- == unity_catalog_tag_id.platform_instance
288
- ):
289
- return unity_catalog_tag
177
+ """Create a default Unity Catalog tag entity when none found in DataHub."""
178
+ # Type narrowing: we know this will be a UnityCatalogTagPlatformResourceId
179
+ assert isinstance(entity_id, UnityCatalogTagPlatformResourceId), (
180
+ f"Expected UnityCatalogTagPlatformResourceId, got {type(entity_id)}"
181
+ )
182
+
183
+ # Create a new entity ID with correct default state instead of mutating
184
+ default_entity_id = UnityCatalogTagPlatformResourceId(
185
+ tag_key=entity_id.tag_key,
186
+ tag_value=entity_id.tag_value,
187
+ platform_instance=entity_id.platform_instance,
188
+ exists_in_unity_catalog=False, # New entities don't exist in Unity Catalog yet
189
+ persisted=False, # New entities are not persisted yet
190
+ )
191
+
290
192
  return cls(
291
- id=unity_catalog_tag_id,
193
+ id=default_entity_id,
292
194
  datahub_urns=LinkedResourceSet(urns=[]),
293
195
  managed_by_datahub=managed_by_datahub,
294
196
  allowed_values=None,
@@ -5800,7 +5800,7 @@ class OperationTypeClass(object):
5800
5800
  """Asset was dropped"""
5801
5801
 
5802
5802
  CUSTOM = "CUSTOM"
5803
- """Custom asset operation"""
5803
+ """Custom asset operation. If this is set, ensure customOperationType is filled out."""
5804
5804
 
5805
5805
  UNKNOWN = "UNKNOWN"
5806
5806
 
@@ -9841,7 +9841,7 @@
9841
9841
  "symbolDocs": {
9842
9842
  "ALTER": "Asset was altered",
9843
9843
  "CREATE": "Asset was created",
9844
- "CUSTOM": "Custom asset operation",
9844
+ "CUSTOM": "Custom asset operation. If this is set, ensure customOperationType is filled out.",
9845
9845
  "DELETE": "Rows were deleted",
9846
9846
  "DROP": "Asset was dropped",
9847
9847
  "INSERT": "Rows were inserted",
@@ -9941,7 +9941,9 @@
9941
9941
  "fieldName": "lastOperationTime",
9942
9942
  "fieldType": "DATETIME"
9943
9943
  },
9944
- "TimeseriesField": {},
9944
+ "TimeseriesField": {
9945
+ "fieldType": "DATETIME"
9946
+ },
9945
9947
  "type": "long",
9946
9948
  "name": "lastUpdatedTimestamp",
9947
9949
  "doc": "The time at which the operation occurred. Would be better named 'operationTime'"
@@ -150,7 +150,7 @@
150
150
  "symbolDocs": {
151
151
  "ALTER": "Asset was altered",
152
152
  "CREATE": "Asset was created",
153
- "CUSTOM": "Custom asset operation",
153
+ "CUSTOM": "Custom asset operation. If this is set, ensure customOperationType is filled out.",
154
154
  "DELETE": "Rows were deleted",
155
155
  "DROP": "Asset was dropped",
156
156
  "INSERT": "Rows were inserted",
@@ -250,7 +250,9 @@
250
250
  "fieldName": "lastOperationTime",
251
251
  "fieldType": "DATETIME"
252
252
  },
253
- "TimeseriesField": {},
253
+ "TimeseriesField": {
254
+ "fieldType": "DATETIME"
255
+ },
254
256
  "type": "long",
255
257
  "name": "lastUpdatedTimestamp",
256
258
  "doc": "The time at which the operation occurred. Would be better named 'operationTime'"