acryl-datahub 1.2.0.4rc3__py3-none-any.whl → 1.2.0.5rc1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (40):
  1. {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/METADATA +2604 -2604
  2. {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/RECORD +40 -38
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +1 -1
  5. datahub/api/entities/external/external_entities.py +500 -15
  6. datahub/ingestion/source/aws/glue.py +18 -14
  7. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  8. datahub/ingestion/source/aws/tag_entities.py +82 -104
  9. datahub/ingestion/source/common/subtypes.py +1 -0
  10. datahub/ingestion/source/hex/api.py +2 -0
  11. datahub/ingestion/source/hex/mapper.py +16 -2
  12. datahub/ingestion/source/hex/model.py +2 -0
  13. datahub/ingestion/source/looker/looker_common.py +26 -0
  14. datahub/ingestion/source/snowflake/constants.py +1 -0
  15. datahub/ingestion/source/snowflake/snowflake_query.py +50 -5
  16. datahub/ingestion/source/snowflake/snowflake_schema.py +173 -9
  17. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -2
  18. datahub/ingestion/source/snowflake/snowflake_utils.py +16 -3
  19. datahub/ingestion/source/snowflake/snowflake_v2.py +3 -1
  20. datahub/ingestion/source/sql/mssql/source.py +2 -25
  21. datahub/ingestion/source/sql/mysql.py +54 -0
  22. datahub/ingestion/source/sql/postgres.py +5 -134
  23. datahub/ingestion/source/sql/sql_common.py +137 -0
  24. datahub/ingestion/source/superset.py +140 -56
  25. datahub/ingestion/source/unity/config.py +11 -0
  26. datahub/ingestion/source/unity/connection_test.py +1 -0
  27. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  28. datahub/ingestion/source/unity/proxy.py +20 -6
  29. datahub/ingestion/source/unity/report.py +9 -1
  30. datahub/ingestion/source/unity/source.py +51 -16
  31. datahub/ingestion/source/unity/tag_entities.py +49 -147
  32. datahub/metadata/_internal_schema_classes.py +514 -514
  33. datahub/metadata/_urns/urn_defs.py +1684 -1684
  34. datahub/metadata/schema.avsc +16680 -16281
  35. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  36. datahub/metadata/schemas/Operation.avsc +4 -2
  37. {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/WHEEL +0 -0
  38. {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/entry_points.txt +0 -0
  39. {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/licenses/LICENSE +0 -0
  40. {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/top_level.txt +0 -0
@@ -4,7 +4,6 @@ import time
4
4
  from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
5
5
  from urllib.parse import urljoin
6
6
 
7
- from datahub.api.entities.external.external_entities import PlatformResourceRepository
8
7
  from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
9
8
  from datahub.emitter.mce_builder import (
10
9
  UNKNOWN_USER,
@@ -77,6 +76,9 @@ from datahub.ingestion.source.unity.hive_metastore_proxy import (
77
76
  HIVE_METASTORE,
78
77
  HiveMetastoreProxy,
79
78
  )
79
+ from datahub.ingestion.source.unity.platform_resource_repository import (
80
+ UnityCatalogPlatformResourceRepository,
81
+ )
80
82
  from datahub.ingestion.source.unity.proxy import UnityCatalogApiProxy
81
83
  from datahub.ingestion.source.unity.proxy_types import (
82
84
  DATA_TYPE_REGISTRY,
@@ -187,7 +189,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
187
189
  platform: str = "databricks"
188
190
  platform_instance_name: Optional[str]
189
191
  sql_parser_schema_resolver: Optional[SchemaResolver] = None
190
- platform_resource_repository: Optional[PlatformResourceRepository] = None
192
+ platform_resource_repository: Optional[UnityCatalogPlatformResourceRepository] = (
193
+ None
194
+ )
191
195
 
192
196
  def get_report(self) -> UnityCatalogReport:
193
197
  return self.report
@@ -207,6 +211,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
207
211
  report=self.report,
208
212
  hive_metastore_proxy=self.hive_metastore_proxy,
209
213
  lineage_data_source=config.lineage_data_source,
214
+ databricks_api_page_size=config.databricks_api_page_size,
210
215
  )
211
216
 
212
217
  self.external_url_base = urljoin(self.config.workspace_url, "/explore/data")
@@ -239,9 +244,15 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
239
244
  # Global map of tables, for profiling
240
245
  self.tables: FileBackedDict[Table] = FileBackedDict()
241
246
  if self.ctx.graph:
242
- self.platform_resource_repository = PlatformResourceRepository(
243
- self.ctx.graph
247
+ self.platform_resource_repository = UnityCatalogPlatformResourceRepository(
248
+ self.ctx.graph, platform_instance=self.platform_instance_name
244
249
  )
250
+ else:
251
+ self.platform_resource_repository = None
252
+
253
+ # Include platform resource repository in report for automatic cache statistics
254
+ if self.config.include_tags and self.platform_resource_repository:
255
+ self.report.tag_urn_resolver_cache = self.platform_resource_repository
245
256
 
246
257
  def init_hive_metastore_proxy(self):
247
258
  self.hive_metastore_proxy: Optional[HiveMetastoreProxy] = None
@@ -1079,25 +1090,49 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
1079
1090
  materialized=False, viewLanguage="SQL", viewLogic=table.view_definition
1080
1091
  )
1081
1092
 
1093
+ def get_or_create_from_unity_tag(
1094
+ self,
1095
+ unity_tag: UnityCatalogTag,
1096
+ platform_instance: Optional[str],
1097
+ managed_by_datahub: bool = False,
1098
+ ) -> UnityCatalogTagPlatformResource:
1099
+ """
1100
+ Optimized helper to get or create a Unity Catalog tag platform resource.
1101
+ This eliminates the duplicate search by skipping the from_tag method which was
1102
+ doing a redundant search before get_from_datahub.
1103
+ """
1104
+ # Create the platform resource ID directly without the from_tag search
1105
+ platform_resource_id = UnityCatalogTagPlatformResourceId(
1106
+ tag_key=unity_tag.key.raw_text,
1107
+ tag_value=unity_tag.value.raw_text if unity_tag.value is not None else None,
1108
+ platform_instance=platform_instance,
1109
+ exists_in_unity_catalog=True, # We got it from Unity Catalog
1110
+ persisted=False,
1111
+ )
1112
+
1113
+ # Use the repository's get_entity_from_datahub method which handles
1114
+ # searching and caching internally - this is the ONLY search we need
1115
+ if self.platform_resource_repository is None:
1116
+ raise ValueError("Platform resource repository not initialized")
1117
+ return self.platform_resource_repository.get_entity_from_datahub(
1118
+ platform_resource_id,
1119
+ managed_by_datahub,
1120
+ )
1121
+
1082
1122
  def gen_platform_resources(
1083
1123
  self, tags: List[UnityCatalogTag]
1084
1124
  ) -> Iterable[MetadataWorkUnit]:
1085
1125
  if self.ctx.graph and self.platform_resource_repository:
1086
1126
  for tag in tags:
1087
1127
  try:
1088
- platform_resource_id = UnityCatalogTagPlatformResourceId.from_tag(
1089
- platform_instance=self.platform_instance_name,
1090
- platform_resource_repository=self.platform_resource_repository,
1091
- tag=tag,
1128
+ # Use optimized helper method that combines ID creation and entity retrieval
1129
+ unity_catalog_tag = self.get_or_create_from_unity_tag(
1130
+ tag,
1131
+ self.platform_instance_name,
1132
+ managed_by_datahub=False,
1092
1133
  )
1093
- logger.debug(f"Created platform resource {platform_resource_id}")
1094
-
1095
- unity_catalog_tag = (
1096
- UnityCatalogTagPlatformResource.get_from_datahub(
1097
- platform_resource_id,
1098
- self.platform_resource_repository,
1099
- False,
1100
- )
1134
+ logger.debug(
1135
+ f"Retrieved/created platform resource for tag {tag.key.raw_text}"
1101
1136
  )
1102
1137
  if (
1103
1138
  tag.to_datahub_tag_urn().urn()
@@ -1,5 +1,10 @@
1
1
  import logging
2
- from typing import List, Optional
2
+ from typing import TYPE_CHECKING, List, Optional
3
+
4
+ if TYPE_CHECKING:
5
+ from datahub.ingestion.source.unity.platform_resource_repository import (
6
+ UnityCatalogPlatformResourceRepository,
7
+ )
3
8
 
4
9
  from pydantic import BaseModel
5
10
 
@@ -7,17 +12,14 @@ from datahub.api.entities.external.external_entities import (
7
12
  ExternalEntity,
8
13
  ExternalEntityId,
9
14
  LinkedResourceSet,
10
- PlatformResourceRepository,
11
15
  )
12
16
  from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
13
17
  from datahub.api.entities.platformresource.platform_resource import (
14
18
  PlatformResource,
15
19
  PlatformResourceKey,
16
- PlatformResourceSearchFields,
17
20
  )
18
21
  from datahub.ingestion.graph.client import DataHubGraph
19
22
  from datahub.metadata.urns import TagUrn
20
- from datahub.utilities.search_utils import ElasticDocumentQuery
21
23
  from datahub.utilities.urns.urn import Urn
22
24
 
23
25
 
@@ -29,9 +31,9 @@ class UnityCatalogTagSyncContext(BaseModel):
29
31
  logger = logging.getLogger(__name__)
30
32
 
31
33
 
32
- class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
34
+ class UnityCatalogTagPlatformResourceId(ExternalEntityId):
33
35
  """
34
- A SnowflakeTagId is a unique identifier for a Snowflake tag.
36
+ A Unity Catalog tag platform resource ID.
35
37
  """
36
38
 
37
39
  tag_key: str
@@ -40,9 +42,6 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
40
42
  exists_in_unity_catalog: bool = False
41
43
  persisted: bool = False
42
44
 
43
- def __hash__(self) -> int:
44
- return hash(self.to_platform_resource_key().id)
45
-
46
45
  # this is a hack to make sure the property is a string and not private pydantic field
47
46
  @staticmethod
48
47
  def _RESOURCE_TYPE() -> str:
@@ -57,26 +56,21 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
57
56
  )
58
57
 
59
58
  @classmethod
60
- def from_tag(
59
+ def get_or_create_from_tag(
61
60
  cls,
62
61
  tag: UnityCatalogTag,
63
- platform_instance: Optional[str],
64
- platform_resource_repository: PlatformResourceRepository,
62
+ platform_resource_repository: "UnityCatalogPlatformResourceRepository",
65
63
  exists_in_unity_catalog: bool = False,
66
64
  ) -> "UnityCatalogTagPlatformResourceId":
67
65
  """
68
66
  Creates a UnityCatalogTagPlatformResourceId from a UnityCatalogTag.
69
67
  """
70
68
 
71
- existing_platform_resource = cls.search_by_urn(
72
- tag.to_datahub_tag_urn().urn(),
73
- platform_resource_repository=platform_resource_repository,
74
- tag_sync_context=UnityCatalogTagSyncContext(
75
- platform_instance=platform_instance
76
- ),
69
+ existing_platform_resource = platform_resource_repository.search_entity_by_urn(
70
+ tag.to_datahub_tag_urn().urn()
77
71
  )
78
72
  if existing_platform_resource:
79
- logger.info(
73
+ logger.debug(
80
74
  f"Found existing UnityCatalogTagPlatformResourceId for tag {tag.key.raw_text}: {existing_platform_resource}"
81
75
  )
82
76
  return existing_platform_resource
@@ -84,105 +78,46 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
84
78
  return UnityCatalogTagPlatformResourceId(
85
79
  tag_key=tag.key.raw_text,
86
80
  tag_value=tag.value.raw_text if tag.value is not None else None,
87
- platform_instance=platform_instance,
81
+ platform_instance=platform_resource_repository.platform_instance,
88
82
  exists_in_unity_catalog=exists_in_unity_catalog,
89
83
  persisted=False,
90
84
  )
91
85
 
92
- @classmethod
93
- def search_by_urn(
94
- cls,
95
- urn: str,
96
- platform_resource_repository: PlatformResourceRepository,
97
- tag_sync_context: UnityCatalogTagSyncContext,
98
- ) -> Optional["UnityCatalogTagPlatformResourceId"]:
99
- mapped_tags = [
100
- t
101
- for t in platform_resource_repository.search_by_filter(
102
- ElasticDocumentQuery.create_from(
103
- (
104
- PlatformResourceSearchFields.RESOURCE_TYPE,
105
- str(UnityCatalogTagPlatformResourceId._RESOURCE_TYPE()),
106
- ),
107
- (PlatformResourceSearchFields.SECONDARY_KEYS, urn),
108
- )
109
- )
110
- ]
111
- logger.info(
112
- f"Found {len(mapped_tags)} mapped tags for URN {urn}. {mapped_tags}"
113
- )
114
- if len(mapped_tags) > 0:
115
- for platform_resource in mapped_tags:
116
- if (
117
- platform_resource.resource_info
118
- and platform_resource.resource_info.value
119
- ):
120
- unity_catalog_tag = UnityCatalogTagPlatformResource(
121
- **platform_resource.resource_info.value.as_pydantic_object(
122
- UnityCatalogTagPlatformResource
123
- ).dict()
124
- )
125
- if (
126
- unity_catalog_tag.id.platform_instance
127
- == tag_sync_context.platform_instance
128
- ):
129
- unity_catalog_tag_id = unity_catalog_tag.id
130
- unity_catalog_tag_id.exists_in_unity_catalog = True
131
- unity_catalog_tag_id.persisted = True
132
- return unity_catalog_tag_id
133
- else:
134
- logger.warning(
135
- f"Platform resource {platform_resource} does not have a resource_info value"
136
- )
137
- continue
138
-
139
- # If we reach here, it means we did not find a mapped tag for the URN
140
- logger.info(
141
- f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new UnityCatalogTagPlatformResourceId."
142
- )
143
- return None
144
-
145
86
  @classmethod
146
87
  def from_datahub_urn(
147
88
  cls,
148
89
  urn: str,
149
- platform_resource_repository: PlatformResourceRepository,
150
90
  tag_sync_context: UnityCatalogTagSyncContext,
91
+ platform_resource_repository: "UnityCatalogPlatformResourceRepository",
151
92
  graph: DataHubGraph,
152
93
  ) -> "UnityCatalogTagPlatformResourceId":
153
94
  """
154
95
  Creates a UnityCatalogTagPlatformResourceId from a DataHub URN.
155
96
  """
156
- # First we check if we already have a mapped platform resource for this
157
- # urn that is of the type UnityCatalogTagPlatformResource
158
- # If we do, we can use it to create the UnityCatalogTagPlatformResourceId
159
- # Else, we need to generate a new UnityCatalogTagPlatformResourceId
160
- existing_platform_resource_id = cls.search_by_urn(
161
- urn, platform_resource_repository, tag_sync_context
97
+ existing_platform_resource_id = (
98
+ platform_resource_repository.search_entity_by_urn(urn)
162
99
  )
163
100
  if existing_platform_resource_id:
164
- logger.info(
165
- f"Found existing UnityCatalogTagPlatformResourceId for URN {urn}: {existing_platform_resource_id}"
166
- )
167
101
  return existing_platform_resource_id
168
102
 
169
- # Otherwise, we need to create a new UnityCatalogTagPlatformResourceId
170
103
  new_unity_catalog_tag_id = cls.generate_tag_id(graph, tag_sync_context, urn)
171
104
  if new_unity_catalog_tag_id:
172
- # we then check if this tag has already been ingested as a platform
173
- # resource in the platform resource repository
174
105
  resource_key = platform_resource_repository.get(
175
106
  new_unity_catalog_tag_id.to_platform_resource_key()
176
107
  )
177
108
  if resource_key:
178
- logger.info(
179
- f"Tag {new_unity_catalog_tag_id} already exists in platform resource repository with {resource_key}"
180
- )
181
- new_unity_catalog_tag_id.exists_in_unity_catalog = (
182
- True # TODO: Check if this is a safe assumption
109
+ # Create a new ID with the correct state instead of mutating
110
+ return UnityCatalogTagPlatformResourceId(
111
+ tag_key=new_unity_catalog_tag_id.tag_key,
112
+ tag_value=new_unity_catalog_tag_id.tag_value,
113
+ platform_instance=new_unity_catalog_tag_id.platform_instance,
114
+ exists_in_unity_catalog=True, # This tag exists in Unity Catalog
115
+ persisted=new_unity_catalog_tag_id.persisted,
183
116
  )
184
117
  return new_unity_catalog_tag_id
185
- raise ValueError(f"Unable to create SnowflakeTagId from DataHub URN: {urn}")
118
+ raise ValueError(
119
+ f"Unable to create Unity Catalog tag ID from DataHub URN: {urn}"
120
+ )
186
121
 
187
122
  @classmethod
188
123
  def generate_tag_id(
@@ -191,14 +126,11 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
191
126
  parsed_urn = Urn.from_string(urn)
192
127
  entity_type = parsed_urn.entity_type
193
128
  if entity_type == "tag":
194
- new_unity_catalog_tag_id = (
195
- UnityCatalogTagPlatformResourceId.from_datahub_tag(
196
- TagUrn.from_string(urn), tag_sync_context
197
- )
129
+ return UnityCatalogTagPlatformResourceId.from_datahub_tag(
130
+ TagUrn.from_string(urn), tag_sync_context
198
131
  )
199
132
  else:
200
133
  raise ValueError(f"Unsupported entity type {entity_type} for URN {urn}")
201
- return new_unity_catalog_tag_id
202
134
 
203
135
  @classmethod
204
136
  def from_datahub_tag(
@@ -214,7 +146,7 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
214
146
  )
215
147
 
216
148
 
217
- class UnityCatalogTagPlatformResource(BaseModel, ExternalEntity):
149
+ class UnityCatalogTagPlatformResource(ExternalEntity):
218
150
  datahub_urns: LinkedResourceSet
219
151
  managed_by_datahub: bool
220
152
  id: UnityCatalogTagPlatformResourceId
@@ -237,58 +169,28 @@ class UnityCatalogTagPlatformResource(BaseModel, ExternalEntity):
237
169
  )
238
170
 
239
171
  @classmethod
240
- def get_from_datahub(
172
+ def create_default(
241
173
  cls,
242
- unity_catalog_tag_id: UnityCatalogTagPlatformResourceId,
243
- platform_resource_repository: PlatformResourceRepository,
244
- managed_by_datahub: bool = False,
174
+ entity_id: ExternalEntityId,
175
+ managed_by_datahub: bool,
245
176
  ) -> "UnityCatalogTagPlatformResource":
246
- # Search for linked DataHub URNs
247
- platform_resources = [
248
- r
249
- for r in platform_resource_repository.search_by_filter(
250
- ElasticDocumentQuery.create_from(
251
- (
252
- PlatformResourceSearchFields.RESOURCE_TYPE,
253
- str(UnityCatalogTagPlatformResourceId._RESOURCE_TYPE()),
254
- ),
255
- (
256
- PlatformResourceSearchFields.PRIMARY_KEY,
257
- f"{unity_catalog_tag_id.tag_key}/{unity_catalog_tag_id.tag_value}",
258
- ),
259
- )
260
- )
261
- ]
262
- if len(platform_resources) == 1:
263
- platform_resource: PlatformResource = platform_resources[0]
264
- if (
265
- platform_resource.resource_info
266
- and platform_resource.resource_info.value
267
- ):
268
- unity_catalog_tag = UnityCatalogTagPlatformResource(
269
- **platform_resource.resource_info.value.as_pydantic_object(
270
- UnityCatalogTagPlatformResource
271
- ).dict()
272
- )
273
- return unity_catalog_tag
274
- else:
275
- for platform_resource in platform_resources:
276
- if (
277
- platform_resource.resource_info
278
- and platform_resource.resource_info.value
279
- ):
280
- unity_catalog_tag = UnityCatalogTagPlatformResource(
281
- **platform_resource.resource_info.value.as_pydantic_object(
282
- UnityCatalogTagPlatformResource
283
- ).dict()
284
- )
285
- if (
286
- unity_catalog_tag.id.platform_instance
287
- == unity_catalog_tag_id.platform_instance
288
- ):
289
- return unity_catalog_tag
177
+ """Create a default Unity Catalog tag entity when none found in DataHub."""
178
+ # Type narrowing: we know this will be a UnityCatalogTagPlatformResourceId
179
+ assert isinstance(entity_id, UnityCatalogTagPlatformResourceId), (
180
+ f"Expected UnityCatalogTagPlatformResourceId, got {type(entity_id)}"
181
+ )
182
+
183
+ # Create a new entity ID with correct default state instead of mutating
184
+ default_entity_id = UnityCatalogTagPlatformResourceId(
185
+ tag_key=entity_id.tag_key,
186
+ tag_value=entity_id.tag_value,
187
+ platform_instance=entity_id.platform_instance,
188
+ exists_in_unity_catalog=False, # New entities don't exist in Unity Catalog yet
189
+ persisted=False, # New entities are not persisted yet
190
+ )
191
+
290
192
  return cls(
291
- id=unity_catalog_tag_id,
193
+ id=default_entity_id,
292
194
  datahub_urns=LinkedResourceSet(urns=[]),
293
195
  managed_by_datahub=managed_by_datahub,
294
196
  allowed_values=None,