acryl-datahub 0.15.0.1rc2__py3-none-any.whl → 0.15.0.1rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc2.dist-info → acryl_datahub-0.15.0.1rc3.dist-info}/METADATA +2459 -2459
- {acryl_datahub-0.15.0.1rc2.dist-info → acryl_datahub-0.15.0.1rc3.dist-info}/RECORD +12 -12
- datahub/__init__.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +231 -27
- datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
- datahub/ingestion/source/sql/hive.py +606 -8
- datahub/ingestion/source/sql/mssql/job_models.py +26 -0
- datahub/ingestion/source/sql/mssql/source.py +10 -0
- datahub/ingestion/source/tableau/tableau.py +11 -7
- {acryl_datahub-0.15.0.1rc2.dist-info → acryl_datahub-0.15.0.1rc3.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.1rc2.dist-info → acryl_datahub-0.15.0.1rc3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc2.dist-info → acryl_datahub-0.15.0.1rc3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/hive.py

@@ -1,7 +1,10 @@
 import json
 import logging
 import re
-from
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from urllib.parse import urlparse

 from pydantic.class_validators import validator
 from pydantic.fields import Field
@@ -11,7 +14,12 @@ from pyhive import hive # noqa: F401
 from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveDialect, HiveTimestamp
 from sqlalchemy.engine.reflection import Inspector

-from datahub.emitter.mce_builder import
+from datahub.emitter.mce_builder import (
+    make_data_platform_urn,
+    make_dataplatform_instance_urn,
+    make_dataset_urn_with_platform_instance,
+    make_schema_field_urn,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -29,14 +37,24 @@ from datahub.ingestion.source.sql.two_tier_sql_source import (
     TwoTierSQLAlchemyConfig,
     TwoTierSQLAlchemySource,
 )
-from datahub.metadata.
+from datahub.metadata.schema_classes import (
+    DataPlatformInstanceClass,
+    DatasetLineageTypeClass,
+    DatasetPropertiesClass,
     DateTypeClass,
+    FineGrainedLineageClass,
+    FineGrainedLineageDownstreamTypeClass,
+    FineGrainedLineageUpstreamTypeClass,
     NullTypeClass,
     NumberTypeClass,
-
+    OtherSchemaClass,
+    SchemaFieldClass,
+    SchemaMetadataClass,
     TimeTypeClass,
+    UpstreamClass,
+    UpstreamLineageClass,
+    ViewPropertiesClass,
 )
-from datahub.metadata.schema_classes import ViewPropertiesClass
 from datahub.utilities import config_clean
 from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column

@@ -46,6 +64,511 @@ register_custom_type(HiveDate, DateTypeClass)
 register_custom_type(HiveTimestamp, TimeTypeClass)
 register_custom_type(HiveDecimal, NumberTypeClass)

+
+class StoragePlatform(Enum):
+    """Enumeration of storage platforms supported for lineage"""
+
+    S3 = "s3"
+    AZURE = "abs"
+    GCS = "gcs"
+    DBFS = "dbfs"
+    LOCAL = "file"
+    HDFS = "hdfs"
+
+
+# Mapping of URL schemes to storage platforms
+STORAGE_SCHEME_MAPPING = {
+    # S3 and derivatives
+    "s3": StoragePlatform.S3,
+    "s3a": StoragePlatform.S3,
+    "s3n": StoragePlatform.S3,
+    # Azure and derivatives
+    "abfs": StoragePlatform.AZURE,
+    "abfss": StoragePlatform.AZURE,
+    "adl": StoragePlatform.AZURE,
+    "adls": StoragePlatform.AZURE,
+    "wasb": StoragePlatform.AZURE,
+    "wasbs": StoragePlatform.AZURE,
+    # GCS and derivatives
+    "gs": StoragePlatform.GCS,
+    "gcs": StoragePlatform.GCS,
+    # DBFS
+    "dbfs": StoragePlatform.DBFS,
+    # Local filesystem
+    "file": StoragePlatform.LOCAL,
+    # HDFS
+    "hdfs": StoragePlatform.HDFS,
+}
+
+
+class StoragePathParser:
+    """Parser for storage paths with platform-specific logic"""
+
+    @staticmethod
+    def parse_storage_location(location: str) -> Optional[Tuple[StoragePlatform, str]]:
+        """
+        Parse a storage location into platform and normalized path.
+
+        Args:
+            location: Storage location URI (e.g., s3://bucket/path, abfss://container@account.dfs.core.windows.net/path)
+
+        Returns:
+            Tuple of (StoragePlatform, normalized_path) if valid, None if invalid
+        """
+
+        try:
+            # Handle special case for local files with no scheme
+            if location.startswith("/"):
+                return StoragePlatform.LOCAL, location
+
+            # Parse the URI
+            parsed = urlparse(location)
+            scheme = parsed.scheme.lower()
+
+            if not scheme:
+                return None
+
+            # Look up the platform
+            platform = STORAGE_SCHEME_MAPPING.get(scheme)
+            if not platform:
+                return None
+
+            # Get normalized path based on platform
+            if platform == StoragePlatform.S3:
+                # For S3, combine bucket and path
+                path = f"{parsed.netloc}/{parsed.path.lstrip('/')}"
+
+            elif platform == StoragePlatform.AZURE:
+                if scheme in ("abfs", "abfss"):
+                    # Format: abfss://container@account.dfs.core.windows.net/path
+                    container = parsed.netloc.split("@")[0]
+                    path = f"{container}/{parsed.path.lstrip('/')}"
+                else:
+                    # Handle other Azure schemes
+                    path = f"{parsed.netloc}/{parsed.path.lstrip('/')}"
+
+            elif platform == StoragePlatform.GCS:
+                # For GCS, combine bucket and path
+                path = f"{parsed.netloc}/{parsed.path.lstrip('/')}"
+
+            elif platform == StoragePlatform.DBFS:
+                # For DBFS, use path as-is
+                path = parsed.path.lstrip("/")
+
+            elif platform == StoragePlatform.LOCAL:
+                # For local files, use full path
+                path = f"{parsed.netloc}/{parsed.path.lstrip('/')}"
+
+            elif platform == StoragePlatform.HDFS:
+                # For HDFS, use full path
+                path = f"{parsed.netloc}/{parsed.path.lstrip('/')}"
+
+            else:
+                return None
+
+            # Clean up the path
+            path = path.rstrip("/")  # Remove trailing slashes
+            path = re.sub(r"/+", "/", path)  # Normalize multiple slashes
+            path = f"/{path}"
+
+            return platform, path
+
+        except Exception as exp:
+            logger.warning(f"Failed to parse storage location {location}: {exp}")
+            return None
+
+    @staticmethod
+    def get_platform_name(platform: StoragePlatform) -> str:
+        """Get the platform name to use in URNs"""
+
+        platform_names = {
+            StoragePlatform.S3: "s3",
+            StoragePlatform.AZURE: "adls",
+            StoragePlatform.GCS: "gcs",
+            StoragePlatform.DBFS: "dbfs",
+            StoragePlatform.LOCAL: "file",
+            StoragePlatform.HDFS: "hdfs",
+        }
+        return platform_names[platform]
+
+
+class HiveStorageLineageConfig:
+    """Configuration for Hive storage lineage."""
+
+    def __init__(
+        self,
+        emit_storage_lineage: bool,
+        hive_storage_lineage_direction: str,
+        include_column_lineage: bool,
+        storage_platform_instance: Optional[str],
+    ):
+        if hive_storage_lineage_direction.lower() not in ["upstream", "downstream"]:
+            raise ValueError(
+                "hive_storage_lineage_direction must be either upstream or downstream"
+            )
+
+        self.emit_storage_lineage = emit_storage_lineage
+        self.hive_storage_lineage_direction = hive_storage_lineage_direction.lower()
+        self.include_column_lineage = include_column_lineage
+        self.storage_platform_instance = storage_platform_instance
+
+
+@dataclass
+class HiveStorageSourceReport:
+    """Report for tracking storage lineage statistics"""
+
+    storage_locations_scanned: int = 0
+    filtered_locations: List[str] = Field(default_factory=list)
+    failed_locations: List[str] = Field(default_factory=list)
+
+    def report_location_scanned(self) -> None:
+        self.storage_locations_scanned += 1
+
+    def report_location_filtered(self, location: str) -> None:
+        self.filtered_locations.append(location)
+
+    def report_location_failed(self, location: str) -> None:
+        self.failed_locations.append(location)
+
+
+class HiveStorageLineage:
+    """Handles storage lineage for Hive tables"""
+
+    def __init__(
+        self,
+        config: HiveStorageLineageConfig,
+        env: str,
+        convert_urns_to_lowercase: bool = False,
+    ):
+        self.config = config
+        self.env = env
+        self.convert_urns_to_lowercase = convert_urns_to_lowercase
+        self.report = HiveStorageSourceReport()
+
+    def _make_dataset_platform_instance(
+        self,
+        platform: str,
+        instance: Optional[str],
+    ) -> DataPlatformInstanceClass:
+        """Create DataPlatformInstance aspect"""
+
+        return DataPlatformInstanceClass(
+            platform=make_data_platform_urn(platform),
+            instance=make_dataplatform_instance_urn(platform, instance)
+            if instance
+            else None,
+        )
+
+    def _make_storage_dataset_urn(
+        self,
+        storage_location: str,
+    ) -> Optional[Tuple[str, str]]:
+        """
+        Create storage dataset URN from location.
+        Returns tuple of (urn, platform) if successful, None otherwise.
+        """
+
+        platform_instance = None
+        storage_info = StoragePathParser.parse_storage_location(storage_location)
+        if not storage_info:
+            logger.debug(f"Could not parse storage location: {storage_location}")
+            return None
+
+        platform, path = storage_info
+        platform_name = StoragePathParser.get_platform_name(platform)
+
+        if self.convert_urns_to_lowercase:
+            platform_name = platform_name.lower()
+            path = path.lower()
+            if self.config.storage_platform_instance:
+                platform_instance = self.config.storage_platform_instance.lower()
+
+        try:
+            storage_urn = make_dataset_urn_with_platform_instance(
+                platform=platform_name,
+                name=path,
+                env=self.env,
+                platform_instance=platform_instance,
+            )
+            return storage_urn, platform_name
+        except Exception as exp:
+            logger.error(f"Failed to create URN for {platform_name}:{path}: {exp}")
+            return None
+
+    def _get_fine_grained_lineages(
+        self,
+        dataset_urn: str,
+        storage_urn: str,
+        dataset_schema: SchemaMetadataClass,
+        storage_schema: SchemaMetadataClass,
+    ) -> Iterable[FineGrainedLineageClass]:
+        """Generate column-level lineage between dataset and storage"""
+
+        if not self.config.include_column_lineage:
+            return
+
+        for dataset_field in dataset_schema.fields:
+            dataset_path = dataset_field.fieldPath
+
+            # Find matching field in storage schema
+            matching_field = next(
+                (f for f in storage_schema.fields if f.fieldPath == dataset_path),
+                None,
+            )
+
+            if matching_field:
+                if self.config.hive_storage_lineage_direction == "upstream":
+                    yield FineGrainedLineageClass(
+                        upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                        upstreams=[
+                            make_schema_field_urn(
+                                parent_urn=storage_urn,
+                                field_path=matching_field.fieldPath,
+                            )
+                        ],
+                        downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                        downstreams=[
+                            make_schema_field_urn(
+                                parent_urn=dataset_urn,
+                                field_path=dataset_path,
+                            )
+                        ],
+                    )
+                else:
+                    yield FineGrainedLineageClass(
+                        upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                        upstreams=[
+                            make_schema_field_urn(
+                                parent_urn=dataset_urn,
+                                field_path=dataset_path,
+                            )
+                        ],
+                        downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                        downstreams=[
+                            make_schema_field_urn(
+                                parent_urn=storage_urn,
+                                field_path=matching_field.fieldPath,
+                            )
+                        ],
+                    )
+
+    def _create_lineage_mcp(
+        self,
+        source_urn: str,
+        target_urn: str,
+        fine_grained_lineages: Optional[Iterable[FineGrainedLineageClass]] = None,
+    ) -> Iterable[MetadataWorkUnit]:
+        """Create lineage MCP between source and target datasets"""
+
+        lineages_list = (
+            list(fine_grained_lineages) if fine_grained_lineages is not None else None
+        )
+
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(dataset=source_urn, type=DatasetLineageTypeClass.COPY)
+            ],
+            fineGrainedLineages=lineages_list,
+        )
+
+        yield MetadataWorkUnit(
+            id=f"{source_urn}-{target_urn}-lineage",
+            mcp=MetadataChangeProposalWrapper(
+                entityUrn=target_urn, aspect=upstream_lineage
+            ),
+        )
+
+    def get_storage_dataset_mcp(
+        self,
+        storage_location: str,
+        platform_instance: Optional[str] = None,
+        schema_metadata: Optional[SchemaMetadataClass] = None,
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        Generate MCPs for storage dataset if needed.
+        This creates the storage dataset entity in DataHub.
+        """
+
+        storage_info = StoragePathParser.parse_storage_location(
+            storage_location,
+        )
+        if not storage_info:
+            return
+
+        platform, path = storage_info
+        platform_name = StoragePathParser.get_platform_name(platform)
+
+        if self.convert_urns_to_lowercase:
+            platform_name = platform_name.lower()
+            path = path.lower()
+            if self.config.storage_platform_instance:
+                platform_instance = self.config.storage_platform_instance.lower()
+
+        try:
+            storage_urn = make_dataset_urn_with_platform_instance(
+                platform=platform_name,
+                name=path,
+                env=self.env,
+                platform_instance=platform_instance,
+            )
+
+            # Dataset properties
+            props = DatasetPropertiesClass(name=path)
+            yield MetadataWorkUnit(
+                id=f"storage-{storage_urn}-props",
+                mcp=MetadataChangeProposalWrapper(
+                    entityUrn=storage_urn,
+                    aspect=props,
+                ),
+            )
+
+            # Platform instance
+            platform_instance_aspect = self._make_dataset_platform_instance(
+                platform=platform_name,
+                instance=platform_instance,
+            )
+            yield MetadataWorkUnit(
+                id=f"storage-{storage_urn}-platform",
+                mcp=MetadataChangeProposalWrapper(
+                    entityUrn=storage_urn, aspect=platform_instance_aspect
+                ),
+            )
+
+            # Schema if available
+            if schema_metadata:
+                storage_schema = SchemaMetadataClass(
+                    schemaName=f"{platform.value}_schema",
+                    platform=f"urn:li:dataPlatform:{platform.value}",
+                    version=0,
+                    fields=schema_metadata.fields,
+                    hash="",
+                    platformSchema=OtherSchemaClass(rawSchema=""),
+                )
+                yield MetadataWorkUnit(
+                    id=f"storage-{storage_urn}-schema",
+                    mcp=MetadataChangeProposalWrapper(
+                        entityUrn=storage_urn, aspect=storage_schema
+                    ),
+                )
+
+        except Exception as e:
+            logger.error(
+                f"Failed to create storage dataset MCPs for {storage_location}: {e}"
+            )
+            return
+
+    def get_lineage_mcp(
+        self,
+        dataset_urn: str,
+        table: Dict[str, Any],
+        dataset_schema: Optional[SchemaMetadataClass] = None,
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        Generate lineage MCP for a Hive table to its storage location.
+
+        Args:
+            dataset_urn: URN of the Hive dataset
+            table: Hive table dictionary containing metadata
+            dataset_schema: Optional schema metadata for the Hive dataset
+
+        Returns:
+            MetadataWorkUnit containing the lineage MCP if successful
+        """
+
+        platform_instance = None
+
+        if not self.config.emit_storage_lineage:
+            return
+
+        # Get storage location from table
+        storage_location = table.get("StorageDescriptor", {}).get("Location")
+        if not storage_location:
+            return
+
+        # Create storage dataset URN
+        storage_info = self._make_storage_dataset_urn(storage_location)
+        if not storage_info:
+            self.report.report_location_failed(storage_location)
+            return
+
+        storage_urn, storage_platform = storage_info
+        self.report.report_location_scanned()
+
+        if self.config.storage_platform_instance:
+            platform_instance = self.config.storage_platform_instance.lower()
+
+        # Create storage dataset entity
+        yield from self.get_storage_dataset_mcp(
+            storage_location=storage_location,
+            platform_instance=platform_instance,
+            schema_metadata=dataset_schema,
+        )
+
+        # Get storage schema if available (implement based on storage system)
+        storage_schema = (
+            self._get_storage_schema(storage_location, dataset_schema)
+            if dataset_schema
+            else None
+        )
+
+        # Generate fine-grained lineage if schemas available
+        fine_grained_lineages = (
+            None
+            if not (dataset_schema and storage_schema)
+            else self._get_fine_grained_lineages(
+                dataset_urn, storage_urn, dataset_schema, storage_schema
+            )
+        )
+
+        # Create lineage MCP
+        if self.config.hive_storage_lineage_direction == "upstream":
+            yield from self._create_lineage_mcp(
+                source_urn=storage_urn,
+                target_urn=dataset_urn,
+                fine_grained_lineages=fine_grained_lineages,
+            )
+        else:
+            yield from self._create_lineage_mcp(
+                source_urn=dataset_urn,
+                target_urn=storage_urn,
+                fine_grained_lineages=fine_grained_lineages,
+            )
+
+    def _get_storage_schema(
+        self,
+        storage_location: str,
+        table_schema: Optional[SchemaMetadataClass] = None,
+    ) -> Optional[SchemaMetadataClass]:
+        """
+        Get schema metadata for storage location.
+        Currently supports:
+        - Delta tables
+        - Parquet files
+        - Spark tables
+
+        Returns:
+            SchemaMetadataClass if schema can be inferred, None otherwise
+        """
+
+        if not table_schema:
+            return None
+
+        storage_info = StoragePathParser.parse_storage_location(storage_location)
+        if not storage_info:
+            return None
+
+        platform, _ = storage_info
+
+        return SchemaMetadataClass(
+            schemaName=f"{platform.value}_schema",
+            platform=f"urn:li:dataPlatform:{platform.value}",
+            version=0,
+            fields=table_schema.fields,
+            hash="",
+            platformSchema=OtherSchemaClass(rawSchema=""),
+        )
+
+
 try:
     from databricks_dbapi.sqlalchemy_dialects.hive import DatabricksPyhiveDialect
     from pyhive.sqlalchemy_hive import _type_map
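For orientation only (not part of the diff above): a minimal sketch of how the new StoragePathParser appears to behave, based on the hunk above. It assumes the rc3 wheel is installed so the classes are importable from the hive module; the example URIs are arbitrary placeholders.

from datahub.ingestion.source.sql.hive import StoragePathParser, StoragePlatform

# S3-style URI: bucket and key are folded into a single normalized path.
result = StoragePathParser.parse_storage_location("s3://my-bucket/warehouse/db/table")
# Expected, per the code above: (StoragePlatform.S3, "/my-bucket/warehouse/db/table")
print(result)

# ABFSS URI: only the container name (before "@") is kept in the path.
result = StoragePathParser.parse_storage_location(
    "abfss://container@account.dfs.core.windows.net/data"
)
# Expected, per the code above: (StoragePlatform.AZURE, "/container/data")
print(result)

# The URN platform name can differ from the enum value (Azure maps to "adls").
print(StoragePathParser.get_platform_name(StoragePlatform.AZURE))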
@@ -94,8 +617,8 @@ try:
     DatabricksPyhiveDialect.get_columns = dbapi_get_columns_patched
 except ModuleNotFoundError:
     pass
-except Exception as
-    logger.warning(f"Failed to patch method due to {
+except Exception as exp:
+    logger.warning(f"Failed to patch method due to {exp}")


 @reflection.cache # type: ignore
@@ -126,10 +649,48 @@ class HiveConfig(TwoTierSQLAlchemyConfig):
     # defaults
     scheme: str = Field(default="hive", hidden_from_docs=True)

+    # Overriding as table location lineage is richer implementation here than with include_table_location_lineage
+    include_table_location_lineage: bool = Field(default=False, hidden_from_docs=True)
+
+    emit_storage_lineage: bool = Field(
+        default=False,
+        description="Whether to emit storage-to-Hive lineage",
+    )
+    hive_storage_lineage_direction: str = Field(
+        default="upstream",
+        description="If 'upstream', storage is upstream to Hive. If 'downstream' storage is downstream to Hive",
+    )
+    include_column_lineage: bool = Field(
+        default=True,
+        description="When enabled, column-level lineage will be extracted from storage",
+    )
+    storage_platform_instance: Optional[str] = Field(
+        default=None,
+        description="Platform instance for the storage system",
+    )
+
     @validator("host_port")
     def clean_host_port(cls, v):
         return config_clean.remove_protocol(v)

+    @validator("hive_storage_lineage_direction")
+    def _validate_direction(cls, v: str) -> str:
+        """Validate the lineage direction."""
+        if v.lower() not in ["upstream", "downstream"]:
+            raise ValueError(
+                "storage_lineage_direction must be either upstream or downstream"
+            )
+        return v.lower()
+
+    def get_storage_lineage_config(self) -> HiveStorageLineageConfig:
+        """Convert base config parameters to HiveStorageLineageConfig"""
+        return HiveStorageLineageConfig(
+            emit_storage_lineage=self.emit_storage_lineage,
+            hive_storage_lineage_direction=self.hive_storage_lineage_direction,
+            include_column_lineage=self.include_column_lineage,
+            storage_platform_instance=self.storage_platform_instance,
+        )
+

 @platform_name("Hive")
 @config_class(HiveConfig)
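For orientation only (not part of the diff): a minimal sketch of how the new HiveConfig options would be parsed and converted into the storage lineage config, assuming the rc3 wheel is installed. The host_port and platform instance values are placeholders.

from datahub.ingestion.source.sql.hive import HiveConfig

config = HiveConfig.parse_obj(
    {
        "host_port": "localhost:10000",  # placeholder connection info
        "emit_storage_lineage": True,
        "hive_storage_lineage_direction": "upstream",  # storage becomes upstream of Hive
        "include_column_lineage": True,
        "storage_platform_instance": "prod-s3",  # placeholder instance name
    }
)

# HiveSource.__init__ passes this object into HiveStorageLineage (see the next hunk).
lineage_config = config.get_storage_lineage_config()
print(lineage_config.emit_storage_lineage, lineage_config.hive_storage_lineage_direction)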
@@ -151,12 +712,49 @@ class HiveSource(TwoTierSQLAlchemySource):

     def __init__(self, config, ctx):
         super().__init__(config, ctx, "hive")
+        self.storage_lineage = HiveStorageLineage(
+            config=config.get_storage_lineage_config(),
+            env=config.env,
+            convert_urns_to_lowercase=config.convert_urns_to_lowercase,
+        )

     @classmethod
     def create(cls, config_dict, ctx):
         config = HiveConfig.parse_obj(config_dict)
         return cls(config, ctx)

+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        """Generate workunits for tables and their storage lineage."""
+        for wu in super().get_workunits_internal():
+            yield wu
+
+            if not isinstance(wu, MetadataWorkUnit):
+                continue
+
+            # Get dataset URN and required aspects using workunit methods
+            try:
+                dataset_urn = wu.get_urn()
+                dataset_props = wu.get_aspect_of_type(DatasetPropertiesClass)
+                schema_metadata = wu.get_aspect_of_type(SchemaMetadataClass)
+            except Exception as exp:
+                logger.warning(f"Failed to process workunit {wu.id}: {exp}")
+                continue
+
+            # Only proceed if we have the necessary properties
+            if dataset_props and dataset_props.customProperties:
+                table = {
+                    "StorageDescriptor": {
+                        "Location": dataset_props.customProperties.get("Location")
+                    }
+                }
+
+                if table.get("StorageDescriptor", {}).get("Location"):
+                    yield from self.storage_lineage.get_lineage_mcp(
+                        dataset_urn=dataset_urn,
+                        table=table,
+                        dataset_schema=schema_metadata,
+                    )
+
     def get_schema_names(self, inspector):
         assert isinstance(self.config, HiveConfig)
         # This condition restricts the ingestion to the specified database.
@@ -173,7 +771,7 @@ class HiveSource(TwoTierSQLAlchemySource):
         pk_constraints: Optional[Dict[Any, Any]] = None,
         partition_keys: Optional[List[str]] = None,
         tags: Optional[List[str]] = None,
-    ) -> List[
+    ) -> List[SchemaFieldClass]:
         fields = super().get_schema_fields_for_column(
             dataset_name,
             column,