acryl-datahub 0.15.0.1rc2__py3-none-any.whl → 0.15.0.1rc4__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.

This release of acryl-datahub has been flagged as potentially problematic.

@@ -1,7 +1,10 @@
 import json
 import logging
 import re
-from typing import Any, Dict, Iterable, List, Optional, Union
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from urllib.parse import urlparse
 
 from pydantic.class_validators import validator
 from pydantic.fields import Field
@@ -11,7 +14,12 @@ from pyhive import hive # noqa: F401
 from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveDialect, HiveTimestamp
 from sqlalchemy.engine.reflection import Inspector
 
-from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
+from datahub.emitter.mce_builder import (
+    make_data_platform_urn,
+    make_dataplatform_instance_urn,
+    make_dataset_urn_with_platform_instance,
+    make_schema_field_urn,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -29,14 +37,24 @@ from datahub.ingestion.source.sql.two_tier_sql_source import (
     TwoTierSQLAlchemyConfig,
     TwoTierSQLAlchemySource,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.schema import (
+from datahub.metadata.schema_classes import (
+    DataPlatformInstanceClass,
+    DatasetLineageTypeClass,
+    DatasetPropertiesClass,
     DateTypeClass,
+    FineGrainedLineageClass,
+    FineGrainedLineageDownstreamTypeClass,
+    FineGrainedLineageUpstreamTypeClass,
     NullTypeClass,
     NumberTypeClass,
-    SchemaField,
+    OtherSchemaClass,
+    SchemaFieldClass,
+    SchemaMetadataClass,
     TimeTypeClass,
+    UpstreamClass,
+    UpstreamLineageClass,
+    ViewPropertiesClass,
 )
-from datahub.metadata.schema_classes import ViewPropertiesClass
 from datahub.utilities import config_clean
 from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
 
@@ -46,6 +64,511 @@ register_custom_type(HiveDate, DateTypeClass)
 register_custom_type(HiveTimestamp, TimeTypeClass)
 register_custom_type(HiveDecimal, NumberTypeClass)
 
+
+class StoragePlatform(Enum):
+    """Enumeration of storage platforms supported for lineage"""
+
+    S3 = "s3"
+    AZURE = "abs"
+    GCS = "gcs"
+    DBFS = "dbfs"
+    LOCAL = "file"
+    HDFS = "hdfs"
+
+
+# Mapping of URL schemes to storage platforms
+STORAGE_SCHEME_MAPPING = {
+    # S3 and derivatives
+    "s3": StoragePlatform.S3,
+    "s3a": StoragePlatform.S3,
+    "s3n": StoragePlatform.S3,
+    # Azure and derivatives
+    "abfs": StoragePlatform.AZURE,
+    "abfss": StoragePlatform.AZURE,
+    "adl": StoragePlatform.AZURE,
+    "adls": StoragePlatform.AZURE,
+    "wasb": StoragePlatform.AZURE,
+    "wasbs": StoragePlatform.AZURE,
+    # GCS and derivatives
+    "gs": StoragePlatform.GCS,
+    "gcs": StoragePlatform.GCS,
+    # DBFS
+    "dbfs": StoragePlatform.DBFS,
+    # Local filesystem
+    "file": StoragePlatform.LOCAL,
+    # HDFS
+    "hdfs": StoragePlatform.HDFS,
+}
+
+
+class StoragePathParser:
+    """Parser for storage paths with platform-specific logic"""
+
+    @staticmethod
+    def parse_storage_location(location: str) -> Optional[Tuple[StoragePlatform, str]]:
+        """
+        Parse a storage location into platform and normalized path.
+
+        Args:
+            location: Storage location URI (e.g., s3://bucket/path, abfss://container@account.dfs.core.windows.net/path)
+
+        Returns:
+            Tuple of (StoragePlatform, normalized_path) if valid, None if invalid
+        """
+
+        try:
+            # Handle special case for local files with no scheme
+            if location.startswith("/"):
+                return StoragePlatform.LOCAL, location
+
+            # Parse the URI
+            parsed = urlparse(location)
+            scheme = parsed.scheme.lower()
+
+            if not scheme:
+                return None
+
+            # Look up the platform
+            platform = STORAGE_SCHEME_MAPPING.get(scheme)
+            if not platform:
+                return None
+
+            # Get normalized path based on platform
+            if platform == StoragePlatform.S3:
+                # For S3, combine bucket and path
+                path = f"{parsed.netloc}/{parsed.path.lstrip('/')}"
+
+            elif platform == StoragePlatform.AZURE:
+                if scheme in ("abfs", "abfss"):
+                    # Format: abfss://container@account.dfs.core.windows.net/path
+                    container = parsed.netloc.split("@")[0]
+                    path = f"{container}/{parsed.path.lstrip('/')}"
+                else:
+                    # Handle other Azure schemes
+                    path = f"{parsed.netloc}/{parsed.path.lstrip('/')}"
+
+            elif platform == StoragePlatform.GCS:
+                # For GCS, combine bucket and path
+                path = f"{parsed.netloc}/{parsed.path.lstrip('/')}"
+
+            elif platform == StoragePlatform.DBFS:
+                # For DBFS, use path as-is
+                path = parsed.path.lstrip("/")
+
+            elif platform == StoragePlatform.LOCAL:
+                # For local files, use full path
+                path = f"{parsed.netloc}/{parsed.path.lstrip('/')}"
+
+            elif platform == StoragePlatform.HDFS:
+                # For HDFS, use full path
+                path = f"{parsed.netloc}/{parsed.path.lstrip('/')}"
+
+            else:
+                return None
+
+            # Clean up the path
+            path = path.rstrip("/")  # Remove trailing slashes
+            path = re.sub(r"/+", "/", path)  # Normalize multiple slashes
+            path = f"/{path}"
+
+            return platform, path
+
+        except Exception as exp:
+            logger.warning(f"Failed to parse storage location {location}: {exp}")
+            return None
+
+    @staticmethod
+    def get_platform_name(platform: StoragePlatform) -> str:
+        """Get the platform name to use in URNs"""
+
+        platform_names = {
+            StoragePlatform.S3: "s3",
+            StoragePlatform.AZURE: "adls",
+            StoragePlatform.GCS: "gcs",
+            StoragePlatform.DBFS: "dbfs",
+            StoragePlatform.LOCAL: "file",
+            StoragePlatform.HDFS: "hdfs",
+        }
+        return platform_names[platform]
+
+
+class HiveStorageLineageConfig:
+    """Configuration for Hive storage lineage."""
+
+    def __init__(
+        self,
+        emit_storage_lineage: bool,
+        hive_storage_lineage_direction: str,
+        include_column_lineage: bool,
+        storage_platform_instance: Optional[str],
+    ):
+        if hive_storage_lineage_direction.lower() not in ["upstream", "downstream"]:
+            raise ValueError(
+                "hive_storage_lineage_direction must be either upstream or downstream"
+            )
+
+        self.emit_storage_lineage = emit_storage_lineage
+        self.hive_storage_lineage_direction = hive_storage_lineage_direction.lower()
+        self.include_column_lineage = include_column_lineage
+        self.storage_platform_instance = storage_platform_instance
+
+
+@dataclass
+class HiveStorageSourceReport:
+    """Report for tracking storage lineage statistics"""
+
+    storage_locations_scanned: int = 0
+    filtered_locations: List[str] = Field(default_factory=list)
+    failed_locations: List[str] = Field(default_factory=list)
+
+    def report_location_scanned(self) -> None:
+        self.storage_locations_scanned += 1
+
+    def report_location_filtered(self, location: str) -> None:
+        self.filtered_locations.append(location)
+
+    def report_location_failed(self, location: str) -> None:
+        self.failed_locations.append(location)
+
+
+class HiveStorageLineage:
+    """Handles storage lineage for Hive tables"""
+
+    def __init__(
+        self,
+        config: HiveStorageLineageConfig,
+        env: str,
+        convert_urns_to_lowercase: bool = False,
+    ):
+        self.config = config
+        self.env = env
+        self.convert_urns_to_lowercase = convert_urns_to_lowercase
+        self.report = HiveStorageSourceReport()
+
+    def _make_dataset_platform_instance(
+        self,
+        platform: str,
+        instance: Optional[str],
+    ) -> DataPlatformInstanceClass:
+        """Create DataPlatformInstance aspect"""
+
+        return DataPlatformInstanceClass(
+            platform=make_data_platform_urn(platform),
+            instance=make_dataplatform_instance_urn(platform, instance)
+            if instance
+            else None,
+        )
+
+    def _make_storage_dataset_urn(
+        self,
+        storage_location: str,
+    ) -> Optional[Tuple[str, str]]:
+        """
+        Create storage dataset URN from location.
+        Returns tuple of (urn, platform) if successful, None otherwise.
+        """
+
+        platform_instance = None
+        storage_info = StoragePathParser.parse_storage_location(storage_location)
+        if not storage_info:
+            logger.debug(f"Could not parse storage location: {storage_location}")
+            return None
+
+        platform, path = storage_info
+        platform_name = StoragePathParser.get_platform_name(platform)
+
+        if self.convert_urns_to_lowercase:
+            platform_name = platform_name.lower()
+            path = path.lower()
+            if self.config.storage_platform_instance:
+                platform_instance = self.config.storage_platform_instance.lower()
+
+        try:
+            storage_urn = make_dataset_urn_with_platform_instance(
+                platform=platform_name,
+                name=path,
+                env=self.env,
+                platform_instance=platform_instance,
+            )
+            return storage_urn, platform_name
+        except Exception as exp:
+            logger.error(f"Failed to create URN for {platform_name}:{path}: {exp}")
+            return None
+
+    def _get_fine_grained_lineages(
+        self,
+        dataset_urn: str,
+        storage_urn: str,
+        dataset_schema: SchemaMetadataClass,
+        storage_schema: SchemaMetadataClass,
+    ) -> Iterable[FineGrainedLineageClass]:
+        """Generate column-level lineage between dataset and storage"""
+
+        if not self.config.include_column_lineage:
+            return
+
+        for dataset_field in dataset_schema.fields:
+            dataset_path = dataset_field.fieldPath
+
+            # Find matching field in storage schema
+            matching_field = next(
+                (f for f in storage_schema.fields if f.fieldPath == dataset_path),
+                None,
+            )
+
+            if matching_field:
+                if self.config.hive_storage_lineage_direction == "upstream":
+                    yield FineGrainedLineageClass(
+                        upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                        upstreams=[
+                            make_schema_field_urn(
+                                parent_urn=storage_urn,
+                                field_path=matching_field.fieldPath,
+                            )
+                        ],
+                        downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                        downstreams=[
+                            make_schema_field_urn(
+                                parent_urn=dataset_urn,
+                                field_path=dataset_path,
+                            )
+                        ],
+                    )
+                else:
+                    yield FineGrainedLineageClass(
+                        upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                        upstreams=[
+                            make_schema_field_urn(
+                                parent_urn=dataset_urn,
+                                field_path=dataset_path,
+                            )
+                        ],
+                        downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                        downstreams=[
+                            make_schema_field_urn(
+                                parent_urn=storage_urn,
+                                field_path=matching_field.fieldPath,
+                            )
+                        ],
+                    )
+
+    def _create_lineage_mcp(
+        self,
+        source_urn: str,
+        target_urn: str,
+        fine_grained_lineages: Optional[Iterable[FineGrainedLineageClass]] = None,
+    ) -> Iterable[MetadataWorkUnit]:
+        """Create lineage MCP between source and target datasets"""
+
+        lineages_list = (
+            list(fine_grained_lineages) if fine_grained_lineages is not None else None
+        )
+
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(dataset=source_urn, type=DatasetLineageTypeClass.COPY)
+            ],
+            fineGrainedLineages=lineages_list,
+        )
+
+        yield MetadataWorkUnit(
+            id=f"{source_urn}-{target_urn}-lineage",
+            mcp=MetadataChangeProposalWrapper(
+                entityUrn=target_urn, aspect=upstream_lineage
+            ),
+        )
+
+    def get_storage_dataset_mcp(
+        self,
+        storage_location: str,
+        platform_instance: Optional[str] = None,
+        schema_metadata: Optional[SchemaMetadataClass] = None,
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        Generate MCPs for storage dataset if needed.
+        This creates the storage dataset entity in DataHub.
+        """
+
+        storage_info = StoragePathParser.parse_storage_location(
+            storage_location,
+        )
+        if not storage_info:
+            return
+
+        platform, path = storage_info
+        platform_name = StoragePathParser.get_platform_name(platform)
+
+        if self.convert_urns_to_lowercase:
+            platform_name = platform_name.lower()
+            path = path.lower()
+            if self.config.storage_platform_instance:
+                platform_instance = self.config.storage_platform_instance.lower()
+
+        try:
+            storage_urn = make_dataset_urn_with_platform_instance(
+                platform=platform_name,
+                name=path,
+                env=self.env,
+                platform_instance=platform_instance,
+            )
+
+            # Dataset properties
+            props = DatasetPropertiesClass(name=path)
+            yield MetadataWorkUnit(
+                id=f"storage-{storage_urn}-props",
+                mcp=MetadataChangeProposalWrapper(
+                    entityUrn=storage_urn,
+                    aspect=props,
+                ),
+            )
+
+            # Platform instance
+            platform_instance_aspect = self._make_dataset_platform_instance(
+                platform=platform_name,
+                instance=platform_instance,
+            )
+            yield MetadataWorkUnit(
+                id=f"storage-{storage_urn}-platform",
+                mcp=MetadataChangeProposalWrapper(
+                    entityUrn=storage_urn, aspect=platform_instance_aspect
+                ),
+            )
+
+            # Schema if available
+            if schema_metadata:
+                storage_schema = SchemaMetadataClass(
+                    schemaName=f"{platform.value}_schema",
+                    platform=f"urn:li:dataPlatform:{platform.value}",
+                    version=0,
+                    fields=schema_metadata.fields,
+                    hash="",
+                    platformSchema=OtherSchemaClass(rawSchema=""),
+                )
+                yield MetadataWorkUnit(
+                    id=f"storage-{storage_urn}-schema",
+                    mcp=MetadataChangeProposalWrapper(
+                        entityUrn=storage_urn, aspect=storage_schema
+                    ),
+                )
+
+        except Exception as e:
+            logger.error(
+                f"Failed to create storage dataset MCPs for {storage_location}: {e}"
+            )
+            return
+
+    def get_lineage_mcp(
+        self,
+        dataset_urn: str,
+        table: Dict[str, Any],
+        dataset_schema: Optional[SchemaMetadataClass] = None,
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        Generate lineage MCP for a Hive table to its storage location.
+
+        Args:
+            dataset_urn: URN of the Hive dataset
+            table: Hive table dictionary containing metadata
+            dataset_schema: Optional schema metadata for the Hive dataset
+
+        Returns:
+            MetadataWorkUnit containing the lineage MCP if successful
+        """
+
+        platform_instance = None
+
+        if not self.config.emit_storage_lineage:
+            return
+
+        # Get storage location from table
+        storage_location = table.get("StorageDescriptor", {}).get("Location")
+        if not storage_location:
+            return
+
+        # Create storage dataset URN
+        storage_info = self._make_storage_dataset_urn(storage_location)
+        if not storage_info:
+            self.report.report_location_failed(storage_location)
+            return
+
+        storage_urn, storage_platform = storage_info
+        self.report.report_location_scanned()
+
+        if self.config.storage_platform_instance:
+            platform_instance = self.config.storage_platform_instance.lower()
+
+        # Create storage dataset entity
+        yield from self.get_storage_dataset_mcp(
+            storage_location=storage_location,
+            platform_instance=platform_instance,
+            schema_metadata=dataset_schema,
+        )
+
+        # Get storage schema if available (implement based on storage system)
+        storage_schema = (
+            self._get_storage_schema(storage_location, dataset_schema)
+            if dataset_schema
+            else None
+        )
+
+        # Generate fine-grained lineage if schemas available
+        fine_grained_lineages = (
+            None
+            if not (dataset_schema and storage_schema)
+            else self._get_fine_grained_lineages(
+                dataset_urn, storage_urn, dataset_schema, storage_schema
+            )
+        )
+
+        # Create lineage MCP
+        if self.config.hive_storage_lineage_direction == "upstream":
+            yield from self._create_lineage_mcp(
+                source_urn=storage_urn,
+                target_urn=dataset_urn,
+                fine_grained_lineages=fine_grained_lineages,
+            )
+        else:
+            yield from self._create_lineage_mcp(
+                source_urn=dataset_urn,
+                target_urn=storage_urn,
+                fine_grained_lineages=fine_grained_lineages,
+            )
+
+    def _get_storage_schema(
+        self,
+        storage_location: str,
+        table_schema: Optional[SchemaMetadataClass] = None,
+    ) -> Optional[SchemaMetadataClass]:
+        """
+        Get schema metadata for storage location.
+        Currently supports:
+        - Delta tables
+        - Parquet files
+        - Spark tables
+
+        Returns:
+            SchemaMetadataClass if schema can be inferred, None otherwise
+        """
+
+        if not table_schema:
+            return None
+
+        storage_info = StoragePathParser.parse_storage_location(storage_location)
+        if not storage_info:
+            return None
+
+        platform, _ = storage_info
+
+        return SchemaMetadataClass(
+            schemaName=f"{platform.value}_schema",
+            platform=f"urn:li:dataPlatform:{platform.value}",
+            version=0,
+            fields=table_schema.fields,
+            hash="",
+            platformSchema=OtherSchemaClass(rawSchema=""),
+        )
+
+
 try:
     from databricks_dbapi.sqlalchemy_dialects.hive import DatabricksPyhiveDialect
     from pyhive.sqlalchemy_hive import _type_map
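
The added StoragePathParser normalizes storage URIs (s3://, abfss://, gs://, dbfs:/, hdfs://, bare local paths) into a (platform, path) pair, which HiveStorageLineage then uses to mint storage dataset URNs. The standalone sketch below mirrors that normalization for a few representative schemes so the mapping is easy to verify; it is not part of the package, and the bucket, container, and table names are illustrative.

```python
from urllib.parse import urlparse

# Illustrative subset of the scheme table added in the diff above.
SCHEME_TO_PLATFORM = {
    "s3": "s3",
    "s3a": "s3",
    "abfss": "adls",
    "gs": "gcs",
    "hdfs": "hdfs",
}


def sketch_parse(location: str):
    """Rough mirror of StoragePathParser.parse_storage_location for a few schemes."""
    if location.startswith("/"):  # bare local path, no scheme
        return "file", location
    parsed = urlparse(location)
    platform = SCHEME_TO_PLATFORM.get(parsed.scheme.lower())
    if platform is None:
        return None
    # abfss URIs carry the container before the '@'; other schemes use netloc as the bucket.
    netloc = parsed.netloc.split("@")[0] if platform == "adls" else parsed.netloc
    path = f"/{netloc}/{parsed.path.lstrip('/')}".rstrip("/")
    return platform, path


print(sketch_parse("s3a://warehouse-bucket/db.db/events"))
# ('s3', '/warehouse-bucket/db.db/events')
print(sketch_parse("abfss://container@account.dfs.core.windows.net/tables/events"))
# ('adls', '/container/tables/events')
```
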
@@ -94,8 +617,8 @@ try:
     DatabricksPyhiveDialect.get_columns = dbapi_get_columns_patched
 except ModuleNotFoundError:
     pass
-except Exception as e:
-    logger.warning(f"Failed to patch method due to {e}")
+except Exception as exp:
+    logger.warning(f"Failed to patch method due to {exp}")
 
 
 @reflection.cache # type: ignore
@@ -126,10 +649,48 @@ class HiveConfig(TwoTierSQLAlchemyConfig):
     # defaults
     scheme: str = Field(default="hive", hidden_from_docs=True)
 
+    # Overriding as table location lineage is richer implementation here than with include_table_location_lineage
+    include_table_location_lineage: bool = Field(default=False, hidden_from_docs=True)
+
+    emit_storage_lineage: bool = Field(
+        default=False,
+        description="Whether to emit storage-to-Hive lineage",
+    )
+    hive_storage_lineage_direction: str = Field(
+        default="upstream",
+        description="If 'upstream', storage is upstream to Hive. If 'downstream' storage is downstream to Hive",
+    )
+    include_column_lineage: bool = Field(
+        default=True,
+        description="When enabled, column-level lineage will be extracted from storage",
+    )
+    storage_platform_instance: Optional[str] = Field(
+        default=None,
+        description="Platform instance for the storage system",
+    )
+
     @validator("host_port")
     def clean_host_port(cls, v):
         return config_clean.remove_protocol(v)
 
+    @validator("hive_storage_lineage_direction")
+    def _validate_direction(cls, v: str) -> str:
+        """Validate the lineage direction."""
+        if v.lower() not in ["upstream", "downstream"]:
+            raise ValueError(
+                "storage_lineage_direction must be either upstream or downstream"
+            )
+        return v.lower()
+
+    def get_storage_lineage_config(self) -> HiveStorageLineageConfig:
+        """Convert base config parameters to HiveStorageLineageConfig"""
+        return HiveStorageLineageConfig(
+            emit_storage_lineage=self.emit_storage_lineage,
+            hive_storage_lineage_direction=self.hive_storage_lineage_direction,
+            include_column_lineage=self.include_column_lineage,
+            storage_platform_instance=self.storage_platform_instance,
+        )
+
 
 @platform_name("Hive")
 @config_class(HiveConfig)
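
The new HiveConfig fields above are plain pydantic fields, and get_storage_lineage_config() folds them into the HiveStorageLineageConfig consumed by HiveStorageLineage. Below is a hedged sketch of setting them, assuming acryl-datahub is installed with the hive extra and that the module path is datahub.ingestion.source.sql.hive (inferred from the imports in this diff); the host and platform-instance names are placeholders.

```python
from datahub.ingestion.source.sql.hive import HiveConfig  # module path inferred from the diff

config = HiveConfig.parse_obj(
    {
        "host_port": "hive-server:10000",            # placeholder connection details
        "emit_storage_lineage": True,                 # enable the new feature
        "hive_storage_lineage_direction": "upstream",
        "include_column_lineage": True,
        "storage_platform_instance": "prod-s3",       # hypothetical instance name
    }
)

# The helper added in this diff converts the flat fields into the dedicated
# HiveStorageLineageConfig used by HiveStorageLineage.
lineage_config = config.get_storage_lineage_config()
assert lineage_config.hive_storage_lineage_direction == "upstream"
```
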
@@ -151,12 +712,49 @@ class HiveSource(TwoTierSQLAlchemySource):
 
     def __init__(self, config, ctx):
         super().__init__(config, ctx, "hive")
+        self.storage_lineage = HiveStorageLineage(
+            config=config.get_storage_lineage_config(),
+            env=config.env,
+            convert_urns_to_lowercase=config.convert_urns_to_lowercase,
+        )
 
     @classmethod
     def create(cls, config_dict, ctx):
        config = HiveConfig.parse_obj(config_dict)
        return cls(config, ctx)
 
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        """Generate workunits for tables and their storage lineage."""
+        for wu in super().get_workunits_internal():
+            yield wu
+
+            if not isinstance(wu, MetadataWorkUnit):
+                continue
+
+            # Get dataset URN and required aspects using workunit methods
+            try:
+                dataset_urn = wu.get_urn()
+                dataset_props = wu.get_aspect_of_type(DatasetPropertiesClass)
+                schema_metadata = wu.get_aspect_of_type(SchemaMetadataClass)
+            except Exception as exp:
+                logger.warning(f"Failed to process workunit {wu.id}: {exp}")
+                continue
+
+            # Only proceed if we have the necessary properties
+            if dataset_props and dataset_props.customProperties:
+                table = {
+                    "StorageDescriptor": {
+                        "Location": dataset_props.customProperties.get("Location")
+                    }
+                }
+
+                if table.get("StorageDescriptor", {}).get("Location"):
+                    yield from self.storage_lineage.get_lineage_mcp(
+                        dataset_urn=dataset_urn,
+                        table=table,
+                        dataset_schema=schema_metadata,
+                    )
+
     def get_schema_names(self, inspector):
         assert isinstance(self.config, HiveConfig)
         # This condition restricts the ingestion to the specified database.
@@ -173,7 +771,7 @@ class HiveSource(TwoTierSQLAlchemySource):
         pk_constraints: Optional[Dict[Any, Any]] = None,
         partition_keys: Optional[List[str]] = None,
         tags: Optional[List[str]] = None,
-    ) -> List[SchemaField]:
+    ) -> List[SchemaFieldClass]:
         fields = super().get_schema_fields_for_column(
             dataset_name,
             column,
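
For context, a minimal end-to-end run of the Hive source with the new options might look like the sketch below. Pipeline and the console sink are standard DataHub ingestion APIs; the connection details are placeholders, and the storage lineage behaviour shown is only what the diff above suggests for this release.

```python
from datahub.ingestion.run.pipeline import Pipeline

# Hedged sketch: run the hive source with the new storage lineage options
# and print the emitted metadata to the console sink.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "hive",
            "config": {
                "host_port": "hive-server:10000",      # placeholder
                "emit_storage_lineage": True,
                "hive_storage_lineage_direction": "upstream",
                "include_column_lineage": True,
                "storage_platform_instance": "prod-s3",  # hypothetical
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
pipeline.raise_from_status()
```
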