acryl-datahub 1.2.0.4rc4__py3-none-any.whl → 1.2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (41)
  1. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5.dist-info}/METADATA +2509 -2509
  2. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5.dist-info}/RECORD +41 -39
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +1 -1
  5. datahub/api/entities/external/external_entities.py +500 -15
  6. datahub/ingestion/source/aws/glue.py +18 -14
  7. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  8. datahub/ingestion/source/aws/tag_entities.py +82 -104
  9. datahub/ingestion/source/common/subtypes.py +1 -0
  10. datahub/ingestion/source/hex/api.py +2 -0
  11. datahub/ingestion/source/hex/mapper.py +16 -2
  12. datahub/ingestion/source/hex/model.py +2 -0
  13. datahub/ingestion/source/looker/looker_common.py +26 -0
  14. datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -1
  15. datahub/ingestion/source/snowflake/constants.py +1 -0
  16. datahub/ingestion/source/snowflake/snowflake_config.py +8 -1
  17. datahub/ingestion/source/snowflake/snowflake_queries.py +49 -6
  18. datahub/ingestion/source/snowflake/snowflake_query.py +50 -5
  19. datahub/ingestion/source/snowflake/snowflake_schema.py +173 -9
  20. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -2
  21. datahub/ingestion/source/snowflake/snowflake_utils.py +16 -3
  22. datahub/ingestion/source/snowflake/snowflake_v2.py +4 -1
  23. datahub/ingestion/source/sql/mssql/source.py +2 -25
  24. datahub/ingestion/source/sql/mysql.py +54 -0
  25. datahub/ingestion/source/sql/postgres.py +5 -134
  26. datahub/ingestion/source/sql/sql_common.py +137 -0
  27. datahub/ingestion/source/superset.py +140 -56
  28. datahub/ingestion/source/unity/config.py +11 -0
  29. datahub/ingestion/source/unity/connection_test.py +1 -0
  30. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  31. datahub/ingestion/source/unity/proxy.py +20 -6
  32. datahub/ingestion/source/unity/report.py +9 -1
  33. datahub/ingestion/source/unity/source.py +51 -16
  34. datahub/ingestion/source/unity/tag_entities.py +49 -147
  35. datahub/metadata/_internal_schema_classes.py +1 -1
  36. datahub/metadata/schema.avsc +4 -2
  37. datahub/metadata/schemas/Operation.avsc +4 -2
  38. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5.dist-info}/WHEEL +0 -0
  39. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5.dist-info}/entry_points.txt +0 -0
  40. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5.dist-info}/licenses/LICENSE +0 -0
  41. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5.dist-info}/top_level.txt +0 -0
datahub/api/entities/external/external_entities.py
@@ -1,55 +1,517 @@
 import logging
-from abc import abstractmethod
+import threading
+import typing
+from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from enum import Enum
-from typing import Iterable, List, Optional, Union
+from typing import (
+    Generic,
+    Iterable,
+    List,
+    Optional,
+    Protocol,
+    Type,
+    TypeVar,
+    Union,
+    cast,
+)
 
 import cachetools
 from pydantic import BaseModel
+from typing_extensions import get_original_bases
 
 from datahub.api.entities.platformresource.platform_resource import (
     PlatformResource,
     PlatformResourceKey,
+    PlatformResourceSearchFields,
 )
+from datahub.ingestion.api.report import SupportsAsObj
 from datahub.ingestion.graph.client import DataHubGraph
 from datahub.metadata.urns import PlatformResourceUrn, Urn
 from datahub.utilities.search_utils import ElasticDocumentQuery
 
 logger = logging.getLogger(__name__)
 
+# Type variables for generic repository
+TExternalEntityId = TypeVar("TExternalEntityId", bound="ExternalEntityId")
+TExternalEntity = TypeVar("TExternalEntity", bound="ExternalEntity")
+
+
+@dataclass(frozen=True)
+class UrnCacheKey:
+    """Typed compound key for URN search cache.
+
+    This eliminates fragile string parsing and provides type safety for cache operations.
+    Using dataclass with frozen=True makes it immutable and hashable.
+    """
+
+    urn: str
+    platform_instance: Optional[str]
+
+    def __str__(self) -> str:
+        """String representation for debugging purposes only."""
+        return (
+            f"UrnCacheKey(urn={self.urn}, platform_instance={self.platform_instance})"
+        )
+
+
+class SyncContext(Protocol):
+    """Protocol defining the interface for platform-specific sync context objects.
+
+    All sync context objects must have a platform_instance attribute that can be None.
+    """
+
+    platform_instance: Optional[str]
+
+
+class PlatformResourceRepository(
+    SupportsAsObj, ABC, Generic[TExternalEntityId, TExternalEntity]
+):
+    CACHE_SIZE = 1000
+
+    # Subclasses should override this with their specific entity class
+    entity_class: Type[TExternalEntity]
 
-class PlatformResourceRepository:
-    def __init__(self, graph: DataHubGraph):
+    def __init__(self, graph: DataHubGraph, platform_instance: Optional[str] = None):
         self.graph = graph
-        self.cache: cachetools.TTLCache = cachetools.TTLCache(maxsize=1000, ttl=60 * 5)
+        self.platform_instance = platform_instance
+
+        # Extract the entity class from generic type parameters
+        # self.entity_class = typing.get_args(self.__class__.__orig_bases__[0])[1]
+        self.entity_class = typing.get_args(get_original_bases(self.__class__)[0])[1]
+
+        # Two-tier cache architecture for efficient external entity management
+        # URN search cache: maps UrnCacheKey -> ExternalEntityId
+        self.urn_search_cache: cachetools.LRUCache[
+            UrnCacheKey, Optional[TExternalEntityId]
+        ] = cachetools.LRUCache(maxsize=PlatformResourceRepository.CACHE_SIZE)
+        # External entity cache: maps platform_resource_key.id -> ExternalEntity
+        self.external_entity_cache: cachetools.LRUCache[str, TExternalEntity] = (
+            cachetools.LRUCache(maxsize=PlatformResourceRepository.CACHE_SIZE)
+        )
+
+        # Statistics tracking - simple integers following DataHub report patterns
+        self.urn_search_cache_hits = 0
+        self.urn_search_cache_misses = 0
+        self.external_entity_cache_hits = 0
+        self.external_entity_cache_misses = 0
+
+        # Error tracking for cache operations
+        self.cache_update_errors = 0
+        self.cache_invalidation_errors = 0
+        self.entity_creation_errors = 0
+        self.cache_key_parsing_errors = 0
+
+        # Thread safety infrastructure
+        # Use RLock to allow recursive acquisition within the same thread
+        self._cache_lock = threading.RLock()
 
     def search_by_filter(
-        self, filter: ElasticDocumentQuery, add_to_cache: bool = True
+        self, query: ElasticDocumentQuery, add_to_cache: bool = True
     ) -> Iterable[PlatformResource]:
-        results = PlatformResource.search_by_filters(self.graph, filter)
+        results = PlatformResource.search_by_filters(self.graph, query)
+        # Note: add_to_cache parameter is kept for API compatibility but ignored
+        # since we no longer cache raw PlatformResource objects
         for platform_resource in results:
-            if add_to_cache:
-                self.cache[platform_resource.id] = platform_resource
             yield platform_resource
 
     def create(self, platform_resource: PlatformResource) -> None:
+        """Create platform resource in DataHub with atomic cache operations.
+
+        This method ensures thread-safe, atomic updates across both caches.
+        """
+        # First, perform the DataHub ingestion outside the cache lock
         platform_resource.to_datahub(self.graph)
-        self.cache[platform_resource.id] = platform_resource
+
+        # Now perform atomic cache operations
+        with self._cache_lock:
+            # Cache the transformed entity with correct flags after ingestion and update related caches
+            if (
+                platform_resource.resource_info
+                and platform_resource.resource_info.value
+            ):
+                try:
+                    # Extract the original entity from the serialized resource value
+                    entity_obj = (
+                        platform_resource.resource_info.value.as_pydantic_object(
+                            self.entity_class
+                        )
+                    )
+                    entity = self.entity_class(**entity_obj.dict())
+
+                    # Create updated entity ID with persisted=True
+                    entity_id = entity.get_id()
+                    if hasattr(entity_id, "dict"):
+                        entity_id_data = entity_id.dict()
+                        entity_id_data["persisted"] = True
+
+                        # Create new entity ID with updated flags
+                        updated_entity_id = type(entity_id)(**entity_id_data)
+
+                        # Update the entity with the new ID (immutable update)
+                        entity_data = entity.dict()  # type: ignore[attr-defined]
+                        entity_data["id"] = updated_entity_id
+                        updated_entity = type(entity)(**entity_data)
+
+                        # Cache the updated entity in the external entity cache
+                        # Use the same cache key that get_entity_from_datahub uses
+                        updated_platform_resource_key = (
+                            updated_entity_id.to_platform_resource_key()
+                        )
+                        self.external_entity_cache[updated_platform_resource_key.id] = (
+                            updated_entity
+                        )
+
+                        # Update URN search cache for any URNs associated with this entity
+                        # This ensures that future URN searches will find the newly created entity
+                        if (
+                            platform_resource.resource_info
+                            and platform_resource.resource_info.secondary_keys
+                        ):
+                            for (
+                                secondary_key
+                            ) in platform_resource.resource_info.secondary_keys:
+                                # Create typed compound cache key
+                                urn_cache_key = UrnCacheKey(
+                                    urn=secondary_key,
+                                    platform_instance=self.platform_instance,
+                                )
+                                # Cache the updated entity ID so URN searches will find it
+                                self.urn_search_cache[urn_cache_key] = cast(
+                                    TExternalEntityId, updated_entity_id
+                                )
+
+                        # Also check if there are any None cache entries that should be invalidated
+                        # Look for cache entries that were previously searched but not found
+                        stale_cache_keys = []
+                        if (
+                            platform_resource.resource_info
+                            and platform_resource.resource_info.secondary_keys
+                        ):
+                            for cache_key, cached_value in list(
+                                self.urn_search_cache.items()
+                            ):
+                                if cached_value is None:
+                                    # Direct attribute access on typed key - no parsing needed!
+                                    try:
+                                        # Check if this cache key refers to this entity
+                                        if (
+                                            cache_key.platform_instance
+                                            == self.platform_instance
+                                            and cache_key.urn
+                                            in platform_resource.resource_info.secondary_keys
+                                        ):
+                                            stale_cache_keys.append(cache_key)
+                                    except Exception as cache_key_error:
+                                        # Track cache key processing errors and log them
+                                        self.cache_key_parsing_errors += 1
+                                        logger.warning(
+                                            f"Failed to process cache key '{cache_key}' during stale cache invalidation: {cache_key_error}"
+                                        )
+                                        continue
+
+                        # Remove stale None cache entries and replace with the actual entity ID
+                        for stale_key in stale_cache_keys:
+                            try:
+                                del self.urn_search_cache[stale_key]
+                                self.urn_search_cache[stale_key] = cast(
+                                    TExternalEntityId, updated_entity_id
+                                )
+                            except Exception as invalidation_error:
+                                # Track cache invalidation errors
+                                self.cache_invalidation_errors += 1
+                                logger.warning(
+                                    f"Failed to invalidate stale cache entry '{stale_key}': {invalidation_error}"
+                                )
+
+                except Exception as cache_error:
+                    # Track cache update errors and log them
+                    self.cache_update_errors += 1
+                    logger.error(
+                        f"Failed to update caches after entity creation for resource {platform_resource.id}: {cache_error}"
+                    )
 
     def get(self, key: PlatformResourceKey) -> Optional[PlatformResource]:
-        return self.cache.get(key.id)
+        """Retrieve platform resource by performing a direct DataHub query.
+
+        Note: This method no longer uses caching since we eliminated the
+        platform_resource_cache in favor of the more useful external_entity_cache.
+        """
+        # Query DataHub directly for the platform resource
+        platform_resources = list(
+            self.search_by_filter(
+                ElasticDocumentQuery.create_from(
+                    (
+                        PlatformResourceSearchFields.RESOURCE_TYPE,
+                        self.get_resource_type(),
+                    ),
+                    (PlatformResourceSearchFields.PRIMARY_KEY, key.primary_key),
+                ),
+                add_to_cache=False,
+            )
+        )
+
+        # Find matching resource by ID
+        for platform_resource in platform_resources:
+            if platform_resource.id == key.id:
+                return platform_resource
+        return None
 
     def delete(self, key: PlatformResourceKey) -> None:
+        """Thread-safe atomic deletion from DataHub and all caches."""
+        # First, perform the DataHub deletion outside the cache lock
         self.graph.delete_entity(urn=PlatformResourceUrn(key.id).urn(), hard=True)
-        del self.cache[key.id]
 
+        # Now perform atomic cache cleanup
+        with self._cache_lock:
+            # Clear external entity cache
+            if key.id in self.external_entity_cache:
+                del self.external_entity_cache[key.id]
+
+            # Note: We intentionally do not clear URN search cache entries here
+            # The URN cache will naturally expire via LRU eviction, and clearing
+            # stale entries would require expensive O(n) iteration over all cache keys.
+            # Stale cache entries pointing to deleted entities will return None on
+            # subsequent lookups, which is the correct behavior.
 
-class ExternalEntityId:
+    def get_resource_type(self) -> str:
+        """Get the platform-specific resource type for filtering.
+
+        Returns the entity class name, which matches the resource type.
+
+        Returns:
+            Resource type string (e.g., 'UnityCatalogTagPlatformResource')
+        """
+        return self.entity_class.__name__
+
+    def create_default_entity(
+        self, entity_id: TExternalEntityId, managed_by_datahub: bool
+    ) -> TExternalEntity:
+        """Create a default entity when none found in DataHub.
+
+        This method delegates to the entity class's create_default class method
+        to avoid circular dependencies and ensure entity creation logic stays
+        with the entity class.
+
+        Args:
+            entity_id: The external entity ID
+            managed_by_datahub: Whether the entity is managed by DataHub
+
+        Returns:
+            Default entity instance
+        """
+        # Call the abstract create_default method on the entity class
+        return cast(
+            TExternalEntity,
+            self.entity_class.create_default(entity_id, managed_by_datahub),
+        )
+
+    def search_entity_by_urn(self, urn: str) -> Optional[TExternalEntityId]:
+        """Search for existing external entity by URN with thread-safe caching.
+
+        Args:
+            urn: The URN to search for
+
+        Returns:
+            External entity ID if found, None otherwise
+        """
+        # Create typed compound cache key
+        cache_key = UrnCacheKey(urn=urn, platform_instance=self.platform_instance)
+
+        # Thread-safe cache check
+        with self._cache_lock:
+            if cache_key in self.urn_search_cache:
+                cached_result = self.urn_search_cache[cache_key]
+                # Update statistics within cache lock following DataHub patterns
+                self.urn_search_cache_hits += 1
+                logger.debug(f"Cache hit for URN search: {cache_key}")
+                return cached_result
+
+            self.urn_search_cache_misses += 1
+
+        logger.debug(
+            f"Cache miss for URN {urn} with platform instance {self.platform_instance}"
+        )
+
+        mapped_entities = [
+            t
+            for t in self.search_by_filter(
+                ElasticDocumentQuery.create_from(
+                    (
+                        PlatformResourceSearchFields.RESOURCE_TYPE,
+                        self.get_resource_type(),
+                    ),
+                    (PlatformResourceSearchFields.SECONDARY_KEYS, urn),
+                ),
+                add_to_cache=False,  # We'll cache the result ourselves
+            )
+        ]
+
+        result = None
+        if len(mapped_entities) > 0:
+            for platform_resource in mapped_entities:
+                if (
+                    platform_resource.resource_info
+                    and platform_resource.resource_info.value
+                ):
+                    entity_obj = (
+                        platform_resource.resource_info.value.as_pydantic_object(
+                            self.entity_class
+                        )
+                    )
+                    entity = self.entity_class(**entity_obj.dict())
+                    # Check if platform instance matches
+                    entity_id = entity.get_id()
+                    if entity_id.platform_instance == self.platform_instance:
+                        # Create a new entity ID with the correct state instead of mutating
+                        # All our entity IDs are Pydantic models, so we can use dict() method
+                        entity_data = entity_id.dict()
+                        entity_data["persisted"] = (
+                            True  # This entity was found in DataHub
+                        )
+                        result = cast(TExternalEntityId, type(entity_id)(**entity_data))
+                        break
+
+        # Thread-safe cache update of the result (even if None)
+        with self._cache_lock:
+            self.urn_search_cache[cache_key] = result
+            return result
+
+    def get_entity_from_datahub(
+        self, entity_id: TExternalEntityId, managed_by_datahub: bool = False
+    ) -> TExternalEntity:
+        """Get external entity from DataHub with caching.
+
+        Args:
+            entity_id: The external entity ID to retrieve
+            managed_by_datahub: Whether the entity is managed by DataHub
+
+        Returns:
+            External entity if found or created
+        """
+        platform_resource_key = entity_id.to_platform_resource_key()
+        cache_key = platform_resource_key.id
+
+        # Thread-safe cache check
+        with self._cache_lock:
+            cached_result = self.external_entity_cache.get(cache_key)
+            if cached_result is not None:
+                # Update statistics within cache lock following DataHub patterns
+                self.external_entity_cache_hits += 1
+                logger.debug(f"Cache hit for get_entity_from_datahub: {cache_key}")
+                return cached_result
+
+            # Cache miss - update statistics within cache lock
+            self.external_entity_cache_misses += 1
+            logger.debug(f"Cache miss for get_entity_from_datahub {entity_id}")
+
+        platform_resources = [
+            r
+            for r in self.search_by_filter(
+                ElasticDocumentQuery.create_from(
+                    (
+                        PlatformResourceSearchFields.RESOURCE_TYPE,
+                        self.get_resource_type(),
+                    ),
+                    (
+                        PlatformResourceSearchFields.PRIMARY_KEY,
+                        platform_resource_key.primary_key,
+                    ),
+                ),
+                add_to_cache=False,  # We'll cache the result ourselves
+            )
+        ]
+
+        result = None
+
+        if len(platform_resources) == 1:
+            platform_resource = platform_resources[0]
+            if (
+                platform_resource.resource_info
+                and platform_resource.resource_info.value
+            ):
+                entity_obj = platform_resource.resource_info.value.as_pydantic_object(
+                    self.entity_class
+                )
+                result = self.entity_class(**entity_obj.dict())
+        elif len(platform_resources) > 1:
+            # Handle multiple matches - find the one with matching platform instance
+            target_platform_instance = entity_id.platform_instance
+            for platform_resource in platform_resources:
+                if (
+                    platform_resource.resource_info
+                    and platform_resource.resource_info.value
+                ):
+                    entity_obj = (
+                        platform_resource.resource_info.value.as_pydantic_object(
+                            self.entity_class
+                        )
+                    )
+                    entity = self.entity_class(**entity_obj.dict())
+                    if entity.get_id().platform_instance == target_platform_instance:
+                        result = entity
+                        break
+
+        if result is None:
+            try:
+                result = self.create_default_entity(entity_id, managed_by_datahub)
+            except Exception as create_error:
+                # Track entity creation errors
+                self.entity_creation_errors += 1
+                logger.error(
+                    f"Failed to create default entity for {entity_id}: {create_error}"
+                )
+                raise
+
+        # Thread-safe cache update
+        with self._cache_lock:
+            self.external_entity_cache[cache_key] = result
+            return result
+
+    def as_obj(self) -> dict:
+        """Implementation of SupportsAsObj protocol for automatic report serialization.
+
+        Returns cache statistics and error metrics on demand when the repository is included in a report.
+        This eliminates the need for manual cache statistics collection.
+
+        Returns:
+            Dictionary containing cache statistics and error metrics with structure:
+            {
+                "search_by_urn_cache": {"hits": int, "misses": int, "current_size": int, "max_size": int},
+                "external_entity_cache": {"hits": int, "misses": int, "current_size": int, "max_size": int},
+                "errors": {"cache_updates": int, "cache_invalidations": int, "entity_creations": int, "cache_key_parsing": int}
+            }
+        """
+        return {
+            "search_by_urn_cache": {
+                "hits": self.urn_search_cache_hits,
+                "misses": self.urn_search_cache_misses,
+                "current_size": len(self.urn_search_cache),
+                "max_size": int(self.urn_search_cache.maxsize),
+            },
+            "external_entity_cache": {
+                "hits": self.external_entity_cache_hits,
+                "misses": self.external_entity_cache_misses,
+                "current_size": len(self.external_entity_cache),
+                "max_size": int(self.external_entity_cache.maxsize),
+            },
+            "errors": {
+                "cache_updates": self.cache_update_errors,
+                "cache_invalidations": self.cache_invalidation_errors,
+                "entity_creations": self.entity_creation_errors,
+                "cache_key_parsing": self.cache_key_parsing_errors,
+            },
+        }
+
+
+class ExternalEntityId(BaseModel):
     """
     ExternalEntityId is a unique
     identifier for an ExternalEntity.
     """
 
+    platform_instance: Optional[str] = None
+
     @abstractmethod
     def to_platform_resource_key(self) -> PlatformResourceKey:
         """
@@ -165,7 +627,7 @@ class LinkedResourceSet(BaseModel):
         return False
 
 
-class ExternalEntity:
+class ExternalEntity(BaseModel):
     """
     An ExternalEntity is a representation of an entity that external to DataHub
     but could be linked to one or more DataHub entities.
@@ -200,8 +662,24 @@ class ExternalEntity:
         """
         pass
 
+    @classmethod
+    @abstractmethod
+    def create_default(
+        cls, entity_id: "ExternalEntityId", managed_by_datahub: bool
+    ) -> "ExternalEntity":
+        """
+        Create a default entity instance when none found in DataHub.
+
+        Args:
+            entity_id: The external entity ID (concrete implementations can expect their specific types)
+            managed_by_datahub: Whether the entity is managed by DataHub
+
+        Returns:
+            Default entity instance
+        """
+        pass
+
 
-@dataclass
 class MissingExternalEntity(ExternalEntity):
     id: ExternalEntityId
 
@@ -217,6 +695,13 @@ class MissingExternalEntity(ExternalEntity):
     def get_id(self) -> ExternalEntityId:
         return self.id
 
+    @classmethod
+    def create_default(
+        cls, entity_id: ExternalEntityId, managed_by_datahub: bool
+    ) -> "MissingExternalEntity":
+        """Create a missing external entity."""
+        return cls(id=entity_id)
+
 
 class ExternalSystem:
     @abstractmethod
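
The repository's __init__ above resolves entity_class from the generic parameters of the subclass declaration instead of requiring every subclass to assign it by hand. A self-contained sketch of that mechanism; Repo, TagId, TagEntity, and TagRepo are illustrative stand-ins, not names from this diff:

    import typing
    from typing import Generic, TypeVar

    from typing_extensions import get_original_bases

    TId = TypeVar("TId")
    TEntity = TypeVar("TEntity")


    class Repo(Generic[TId, TEntity]):
        def __init__(self) -> None:
            # get_original_bases returns the bases exactly as written in the
            # class statement (e.g. Repo[TagId, TagEntity]); typing.get_args
            # then recovers the concrete type arguments at runtime.
            self.entity_class = typing.get_args(get_original_bases(type(self))[0])[1]


    class TagId: ...


    class TagEntity: ...


    class TagRepo(Repo[TagId, TagEntity]):
        pass


    assert TagRepo().entity_class is TagEntity

This is the same trick PlatformResourceRepository uses, which is why the Glue repository in the new file below needs no entity_class assignment of its own.
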
datahub/ingestion/source/aws/glue.py
@@ -25,9 +25,6 @@ from pydantic import validator
 from pydantic.fields import Field
 
 from datahub.api.entities.dataset.dataset import Dataset
-from datahub.api.entities.external.external_entities import (
-    PlatformResourceRepository,
-)
 from datahub.api.entities.external.lake_formation_external_entites import (
     LakeFormationTag,
 )
@@ -63,13 +60,15 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws import s3_util
 from datahub.ingestion.source.aws.aws_common import AwsSourceConfig
+from datahub.ingestion.source.aws.platform_resource_repository import (
+    GluePlatformResourceRepository,
+)
 from datahub.ingestion.source.aws.s3_util import (
     is_s3_uri,
     make_s3_urn,
     make_s3_urn_for_lineage,
 )
 from datahub.ingestion.source.aws.tag_entities import (
-    LakeFormationTagPlatformResource,
     LakeFormationTagPlatformResourceId,
 )
 from datahub.ingestion.source.common.subtypes import (
@@ -357,10 +356,14 @@ class GlueSource(StatefulIngestionSourceBase):
         self.extract_transforms = config.extract_transforms
         self.env = config.env
 
-        self.platform_resource_repository: Optional[PlatformResourceRepository] = None
+        self.platform_resource_repository: Optional[
+            "GluePlatformResourceRepository"
+        ] = None
         if self.ctx.graph:
-            self.platform_resource_repository = PlatformResourceRepository(
-                self.ctx.graph
+            self.platform_resource_repository = GluePlatformResourceRepository(
+                self.ctx.graph,
+                platform_instance=self.source_config.platform_instance,
+                catalog=self.source_config.catalog_id,
             )
 
     def get_database_lf_tags(
@@ -1179,16 +1182,17 @@ class GlueSource(StatefulIngestionSourceBase):
         self, tag: LakeFormationTag
     ) -> Iterable[MetadataWorkUnit]:
         if self.ctx.graph and self.platform_resource_repository:
-            platform_resource_id = LakeFormationTagPlatformResourceId.from_tag(
-                platform_instance=self.source_config.platform_instance,
-                platform_resource_repository=self.platform_resource_repository,
-                catalog=tag.catalog,
-                tag=tag,
+            platform_resource_id = (
+                LakeFormationTagPlatformResourceId.get_or_create_from_tag(
+                    tag=tag,
+                    platform_resource_repository=self.platform_resource_repository,
+                    catalog_id=tag.catalog,
+                )
             )
             logger.info(f"Created platform resource {platform_resource_id}")
 
-            lf_tag = LakeFormationTagPlatformResource.get_from_datahub(
-                platform_resource_id, self.platform_resource_repository, False
+            lf_tag = self.platform_resource_repository.get_entity_from_datahub(
+                platform_resource_id, False
             )
             if (
                 tag.to_datahub_tag_urn().urn()
datahub/ingestion/source/aws/platform_resource_repository.py (new file)
@@ -0,0 +1,30 @@
+import logging
+from typing import Optional
+
+from datahub.api.entities.external.external_entities import (
+    PlatformResourceRepository,
+)
+from datahub.ingestion.graph.client import DataHubGraph
+from datahub.ingestion.source.aws.tag_entities import (
+    LakeFormationTagPlatformResource,
+    LakeFormationTagPlatformResourceId,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class GluePlatformResourceRepository(
+    PlatformResourceRepository[
+        LakeFormationTagPlatformResourceId, LakeFormationTagPlatformResource
+    ]
+):
+    """AWS Glue-specific platform resource repository with tag-related operations."""
+
+    def __init__(
+        self,
+        graph: DataHubGraph,
+        platform_instance: Optional[str] = None,
+        catalog: Optional[str] = None,
+    ):
+        super().__init__(graph, platform_instance)
+        self.catalog = catalog
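
A minimal usage sketch of the new repository, assuming a reachable DataHub GMS endpoint; the server URL, platform instance, and catalog values are illustrative, and DatahubClientConfig is assumed importable from datahub.ingestion.graph.client:

    from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
    from datahub.ingestion.source.aws.platform_resource_repository import (
        GluePlatformResourceRepository,
    )

    # DataHubGraph connects when constructed, so this needs a live endpoint.
    graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

    repo = GluePlatformResourceRepository(
        graph,
        platform_instance="prod-account",  # illustrative
        catalog="123456789012",  # illustrative AWS catalog id
    )

    # The generic parameters bind entity_class automatically, so searches are
    # filtered by this resource type.
    assert repo.get_resource_type() == "LakeFormationTagPlatformResource"

    # URN lookups are memoized per (urn, platform_instance), negative results
    # included; as_obj() exposes the hit/miss counters for ingestion reports.
    repo.search_entity_by_urn("urn:li:tag:pii")
    repo.search_entity_by_urn("urn:li:tag:pii")  # served from the cache
    print(repo.as_obj())
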