acryl-datahub 1.2.0.4rc1__py3-none-any.whl → 1.2.0.4rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/METADATA +2397 -2396
- {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/RECORD +42 -41
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +3 -3
- datahub/api/entities/external/restricted_text.py +3 -3
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/quickstart_versioning.py +1 -1
- datahub/cli/specific/assertions_cli.py +37 -2
- datahub/cli/specific/datacontract_cli.py +54 -4
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +1 -1
- datahub/ingestion/api/report.py +21 -2
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/aws/tag_entities.py +2 -2
- datahub/ingestion/source/data_lake_common/path_spec.py +6 -3
- datahub/ingestion/source/dbt/dbt_cloud.py +6 -3
- datahub/ingestion/source/fivetran/fivetran_log_api.py +4 -3
- datahub/ingestion/source/grafana/models.py +6 -0
- datahub/ingestion/source/hex/hex.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +4 -4
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/slack/slack.py +7 -14
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -4
- datahub/ingestion/source/tableau/tableau.py +1 -1
- datahub/ingestion/source/unity/config.py +36 -1
- datahub/ingestion/source/unity/proxy.py +332 -46
- datahub/ingestion/source/unity/proxy_types.py +12 -2
- datahub/ingestion/source/unity/source.py +91 -34
- datahub/ingestion/source/unity/tag_entities.py +2 -2
- datahub/ingestion/source/usage/starburst_trino_usage.py +2 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/sdk/search_client.py +3 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataset.py +37 -59
- datahub/utilities/server_config_util.py +2 -1
- {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/unity/proxy.py

@@ -30,10 +30,14 @@ from databricks.sdk.service.sql import (
 from databricks.sdk.service.workspace import ObjectType
 from databricks.sql import connect
 from databricks.sql.types import Row
+from typing_extensions import assert_never

 from datahub._version import nice_version_name
 from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
 from datahub.emitter.mce_builder import parse_ts_millis
+from datahub.ingestion.source.unity.config import (
+    LineageDataSource,
+)
 from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
 from datahub.ingestion.source.unity.proxy_profiling import (
     UnityCatalogProxyProfilingMixin,
@@ -46,6 +50,7 @@ from datahub.ingestion.source.unity.proxy_types import (
     ExternalTableReference,
     Metastore,
     Notebook,
+    NotebookReference,
     Query,
     Schema,
     ServicePrincipal,
@@ -53,9 +58,14 @@ from datahub.ingestion.source.unity.proxy_types import (
     TableReference,
 )
 from datahub.ingestion.source.unity.report import UnityCatalogReport
+from datahub.utilities.file_backed_collections import FileBackedDict

 logger: logging.Logger = logging.getLogger(__name__)

+# It is enough to keep the cache size to 1, since we only process one catalog at a time
+# We need to change this if we want to support parallel processing of multiple catalogs
+_MAX_CONCURRENT_CATALOGS = 1
+

 @dataclasses.dataclass
 class TableInfoWithGeneration(TableInfo):
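Note (illustrative, not part of the package diff): the @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS)) decorator used on the per-catalog helpers below memoizes only the most recently fetched catalog. A minimal, self-contained sketch of that caching behavior, using a hypothetical function and values but the real cachetools API:

from cachetools import FIFOCache, cached

_MAX_CONCURRENT_CATALOGS = 1  # mirrors the constant introduced above
calls = []

@cached(FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
def fetch_lineage(catalog: str) -> dict:
    # Hypothetical stand-in for an expensive per-catalog SQL query.
    calls.append(catalog)
    return {"catalog": catalog}

fetch_lineage("sales")      # computed
fetch_lineage("sales")      # served from the FIFO cache
fetch_lineage("marketing")  # evicts "sales" (the cache holds one catalog), computed
assert calls == ["sales", "marketing"]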
@@ -91,6 +101,32 @@ class QueryFilterWithStatementTypes(QueryFilter):
         return v


+@dataclasses.dataclass
+class TableUpstream:
+    table_name: str
+    source_type: str
+    last_updated: Optional[datetime] = None
+
+
+@dataclasses.dataclass
+class ExternalUpstream:
+    path: str
+    source_type: str
+    last_updated: Optional[datetime] = None
+
+
+@dataclasses.dataclass
+class TableLineageInfo:
+    upstreams: List[TableUpstream] = dataclasses.field(default_factory=list)
+    external_upstreams: List[ExternalUpstream] = dataclasses.field(default_factory=list)
+    upstream_notebooks: List[NotebookReference] = dataclasses.field(
+        default_factory=list
+    )
+    downstream_notebooks: List[NotebookReference] = dataclasses.field(
+        default_factory=list
+    )
+
+
 class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
     _workspace_client: WorkspaceClient
     _workspace_url: str
@@ -104,6 +140,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         warehouse_id: Optional[str],
         report: UnityCatalogReport,
         hive_metastore_proxy: Optional[HiveMetastoreProxy] = None,
+        lineage_data_source: LineageDataSource = LineageDataSource.AUTO,
     ):
         self._workspace_client = WorkspaceClient(
             host=workspace_url,
@@ -114,6 +151,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         self.warehouse_id = warehouse_id or ""
         self.report = report
         self.hive_metastore_proxy = hive_metastore_proxy
+        self.lineage_data_source = lineage_data_source
         self._sql_connection_params = {
             "server_hostname": self._workspace_client.config.host.replace(
                 "https://", ""
@@ -293,16 +331,142 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
                 method, path, body={**body, "page_token": response["next_page_token"]}
             )

-
-
-
+    def _build_datetime_where_conditions(
+        self, start_time: Optional[datetime] = None, end_time: Optional[datetime] = None
+    ) -> str:
+        """Build datetime filtering conditions for lineage queries."""
+        conditions = []
+        if start_time:
+            conditions.append(f"event_time >= '{start_time.isoformat()}'")
+        if end_time:
+            conditions.append(f"event_time <= '{end_time.isoformat()}'")
+        return " AND " + " AND ".join(conditions) if conditions else ""
+
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
+    def get_catalog_table_lineage_via_system_tables(
+        self,
+        catalog: str,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+    ) -> FileBackedDict[TableLineageInfo]:
+        """Get table lineage for all tables in a catalog using system tables."""
+        logger.info(f"Fetching table lineage for catalog: {catalog}")
+        try:
+            additional_where = self._build_datetime_where_conditions(
+                start_time, end_time
+            )
+
+            query = f"""
+                SELECT
+                    entity_type, entity_id,
+                    source_table_full_name, source_type,
+                    target_table_full_name, target_type,
+                    max(event_time) as last_updated
+                FROM system.access.table_lineage
+                WHERE
+                    (target_table_catalog = %s or source_table_catalog = %s)
+                    {additional_where}
+                GROUP BY
+                    entity_type, entity_id,
+                    source_table_full_name, source_type,
+                    target_table_full_name, target_type
+            """
+            rows = self._execute_sql_query(query, [catalog, catalog])
+
+            result_dict: FileBackedDict[TableLineageInfo] = FileBackedDict()
+            for row in rows:
+                entity_type = row["entity_type"]
+                entity_id = row["entity_id"]
+                source_full_name = row["source_table_full_name"]
+                target_full_name = row["target_table_full_name"]
+                source_type = row["source_type"]
+                last_updated = row["last_updated"]
+
+                # Initialize TableLineageInfo for both source and target tables if they're in our catalog
+                for table_name in [source_full_name, target_full_name]:
+                    if (
+                        table_name
+                        and table_name.startswith(f"{catalog}.")
+                        and table_name not in result_dict
+                    ):
+                        result_dict[table_name] = TableLineageInfo()
+
+                # Process upstream relationships (target table gets upstreams)
+                if target_full_name and target_full_name.startswith(f"{catalog}."):
+                    # Handle table upstreams
+                    if (
+                        source_type in ["TABLE", "VIEW"]
+                        and source_full_name != target_full_name
+                    ):
+                        upstream = TableUpstream(
+                            table_name=source_full_name,
+                            source_type=source_type,
+                            last_updated=last_updated,
+                        )
+                        result_dict[target_full_name].upstreams.append(upstream)
+
+                    # Handle external upstreams (PATH type)
+                    elif source_type == "PATH":
+                        external_upstream = ExternalUpstream(
+                            path=source_full_name,
+                            source_type=source_type,
+                            last_updated=last_updated,
+                        )
+                        result_dict[target_full_name].external_upstreams.append(
+                            external_upstream
+                        )
+
+                    # Handle upstream notebooks (notebook -> table)
+                    elif entity_type == "NOTEBOOK":
+                        notebook_ref = NotebookReference(
+                            id=entity_id,
+                            last_updated=last_updated,
+                        )
+                        result_dict[target_full_name].upstream_notebooks.append(
+                            notebook_ref
+                        )
+
+                # Process downstream relationships (source table gets downstream notebooks)
+                if (
+                    entity_type == "NOTEBOOK"
+                    and source_full_name
+                    and source_full_name.startswith(f"{catalog}.")
+                ):
+                    notebook_ref = NotebookReference(
+                        id=entity_id,
+                        last_updated=last_updated,
+                    )
+                    result_dict[source_full_name].downstream_notebooks.append(
+                        notebook_ref
+                    )
+
+            return result_dict
+        except Exception as e:
+            logger.warning(
+                f"Error getting table lineage for catalog {catalog}: {e}",
+                exc_info=True,
+            )
+            return FileBackedDict()
+
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
+    def get_catalog_column_lineage_via_system_tables(
+        self,
+        catalog: str,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+    ) -> FileBackedDict[Dict[str, dict]]:
+        """Get column lineage for all tables in a catalog using system tables."""
         logger.info(f"Fetching column lineage for catalog: {catalog}")
         try:
-            query = f"""
+            additional_where = self._build_datetime_where_conditions(
+                start_time, end_time
+            )
+
+            query = f"""
                 SELECT
                     source_table_catalog, source_table_schema, source_table_name, source_column_name, source_type,
                     target_table_schema, target_table_name, target_column_name,
-                    max(event_time)
+                    max(event_time) as last_updated
                 FROM system.access.column_lineage
                 WHERE
                     target_table_catalog = %s
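Note (illustrative, not part of the package diff): to make the new datetime filter concrete, here is a standalone restatement of _build_datetime_where_conditions with the same logic as the method above, plus the SQL fragment it appends to the lineage queries for sample dates:

from datetime import datetime
from typing import Optional

def build_datetime_where_conditions(
    start_time: Optional[datetime] = None, end_time: Optional[datetime] = None
) -> str:
    # Same logic as the method added above, restated without the class around it.
    conditions = []
    if start_time:
        conditions.append(f"event_time >= '{start_time.isoformat()}'")
    if end_time:
        conditions.append(f"event_time <= '{end_time.isoformat()}'")
    return " AND " + " AND ".join(conditions) if conditions else ""

print(repr(build_datetime_where_conditions(datetime(2024, 1, 1), datetime(2024, 1, 31))))
# " AND event_time >= '2024-01-01T00:00:00' AND event_time <= '2024-01-31T00:00:00'"
print(repr(build_datetime_where_conditions()))
# ''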
@@ -313,13 +477,14 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
                     AND source_table_schema IS NOT NULL
                     AND source_table_name IS NOT NULL
                     AND source_column_name IS NOT NULL
+                    {additional_where}
                 GROUP BY
-                    source_table_catalog, source_table_schema, source_table_name, source_column_name,
+                    source_table_catalog, source_table_schema, source_table_name, source_column_name, source_type,
                     target_table_schema, target_table_name, target_column_name
             """
-            rows = self._execute_sql_query(query,
+            rows = self._execute_sql_query(query, [catalog])

-            result_dict:
+            result_dict: FileBackedDict[Dict[str, dict]] = FileBackedDict()
             for row in rows:
                 result_dict.setdefault(row["target_table_schema"], {}).setdefault(
                     row["target_table_name"], {}
@@ -330,6 +495,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
                         "schema_name": row["source_table_schema"],
                         "table_name": row["source_table_name"],
                         "name": row["source_column_name"],
+                        "last_updated": row["last_updated"],
                     }
                 )

@@ -339,9 +505,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
                 f"Error getting column lineage for catalog {catalog}: {e}",
                 exc_info=True,
             )
-            return
+            return FileBackedDict()

-    def list_lineages_by_table(
+    def list_lineages_by_table_via_http_api(
         self, table_name: str, include_entity_lineage: bool
     ) -> dict:
         """List table lineage by table name."""
@@ -355,7 +521,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             },
         )

-    def list_lineages_by_column(self, table_name: str, column_name: str) -> list:
+    def list_lineages_by_column_via_http_api(
+        self, table_name: str, column_name: str
+    ) -> list:
         """List column lineage by table name and column name."""
         logger.debug(f"Getting column lineage for {table_name}.{column_name}")
         try:
@@ -374,55 +542,173 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             )
             return []

-    def table_lineage(self, table: Table, include_entity_lineage: bool) -> None:
+    def table_lineage(
+        self,
+        table: Table,
+        include_entity_lineage: bool,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+    ) -> None:
         if table.schema.catalog.type == CustomCatalogType.HIVE_METASTORE_CATALOG:
             # Lineage is not available for Hive Metastore Tables.
             return None
-        # Lineage endpoint doesn't exists on 2.1 version
-        try:
-            response: dict = self.list_lineages_by_table(
-                table_name=table.ref.qualified_table_name,
-                include_entity_lineage=include_entity_lineage,
-            )
-
-            for item in response.get("upstreams") or []:
-                if "tableInfo" in item:
-                    table_ref = TableReference.create_from_lineage(
-                        item["tableInfo"], table.schema.catalog.metastore
-                    )
-                    if table_ref:
-                        table.upstreams[table_ref] = {}
-                elif "fileInfo" in item:
-                    external_ref = ExternalTableReference.create_from_lineage(
-                        item["fileInfo"]
-                    )
-                    if external_ref:
-                        table.external_upstreams.add(external_ref)

-
-
+        try:
+            # Determine lineage data source based on config
+            use_system_tables = False
+            if self.lineage_data_source == LineageDataSource.SYSTEM_TABLES:
+                use_system_tables = True
+            elif self.lineage_data_source == LineageDataSource.API:
+                use_system_tables = False
+            elif self.lineage_data_source == LineageDataSource.AUTO:
+                # Use the newer system tables if we have a SQL warehouse, otherwise fall back
+                # to the older (and slower) HTTP API.
+                use_system_tables = bool(self.warehouse_id)
+            else:
+                assert_never(self.lineage_data_source)

-
-
-
+            if use_system_tables:
+                self._process_system_table_lineage(table, start_time, end_time)
+            else:
+                self._process_table_lineage_via_http_api(table, include_entity_lineage)
         except Exception as e:
             logger.warning(
                 f"Error getting lineage on table {table.ref}: {e}", exc_info=True
             )

+    def _process_system_table_lineage(
+        self,
+        table: Table,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+    ) -> None:
+        """Process table lineage using system.access.table_lineage table."""
+        catalog_lineage = self.get_catalog_table_lineage_via_system_tables(
+            table.ref.catalog, start_time, end_time
+        )
+        table_full_name = table.ref.qualified_table_name
+
+        lineage_info = catalog_lineage.get(table_full_name, TableLineageInfo())
+
+        # Process table upstreams
+        for upstream in lineage_info.upstreams:
+            upstream_table_name = upstream.table_name
+            # Parse catalog.schema.table format
+            parts = upstream_table_name.split(".")
+            if len(parts) == 3:
+                catalog_name, schema_name, table_name = parts[0], parts[1], parts[2]
+                table_ref = TableReference(
+                    metastore=table.schema.catalog.metastore.id
+                    if table.schema.catalog.metastore
+                    else None,
+                    catalog=catalog_name,
+                    schema=schema_name,
+                    table=table_name,
+                    last_updated=upstream.last_updated,
+                )
+                table.upstreams[table_ref] = {}
+            else:
+                logger.warning(
+                    f"Unexpected upstream table format: {upstream_table_name} for table {table_full_name}"
+                )
+                continue
+
+        # Process external upstreams
+        for external_upstream in lineage_info.external_upstreams:
+            external_ref = ExternalTableReference(
+                path=external_upstream.path,
+                has_permission=True,
+                name=None,
+                type=None,
+                storage_location=external_upstream.path,
+                last_updated=external_upstream.last_updated,
+            )
+            table.external_upstreams.add(external_ref)
+
+        # Process upstream notebook lineage
+        for notebook_ref in lineage_info.upstream_notebooks:
+            existing_ref = table.upstream_notebooks.get(notebook_ref.id)
+            if existing_ref is None or (
+                notebook_ref.last_updated
+                and existing_ref.last_updated
+                and notebook_ref.last_updated > existing_ref.last_updated
+            ):
+                table.upstream_notebooks[notebook_ref.id] = notebook_ref
+
+        # Process downstream notebook lineage
+        for notebook_ref in lineage_info.downstream_notebooks:
+            existing_ref = table.downstream_notebooks.get(notebook_ref.id)
+            if existing_ref is None or (
+                notebook_ref.last_updated
+                and existing_ref.last_updated
+                and notebook_ref.last_updated > existing_ref.last_updated
+            ):
+                table.downstream_notebooks[notebook_ref.id] = notebook_ref
+
+    def _process_table_lineage_via_http_api(
+        self, table: Table, include_entity_lineage: bool
+    ) -> None:
+        """Process table lineage using the HTTP API (legacy fallback)."""
+        response: dict = self.list_lineages_by_table_via_http_api(
+            table_name=table.ref.qualified_table_name,
+            include_entity_lineage=include_entity_lineage,
+        )
+
+        for item in response.get("upstreams") or []:
+            if "tableInfo" in item:
+                table_ref = TableReference.create_from_lineage(
+                    item["tableInfo"], table.schema.catalog.metastore
+                )
+                if table_ref:
+                    table.upstreams[table_ref] = {}
+            elif "fileInfo" in item:
+                external_ref = ExternalTableReference.create_from_lineage(
+                    item["fileInfo"]
+                )
+                if external_ref:
+                    table.external_upstreams.add(external_ref)
+
+            for notebook in item.get("notebookInfos") or []:
+                notebook_ref = NotebookReference(
+                    id=notebook["notebook_id"],
+                )
+                table.upstream_notebooks[notebook_ref.id] = notebook_ref
+
+        for item in response.get("downstreams") or []:
+            for notebook in item.get("notebookInfos") or []:
+                notebook_ref = NotebookReference(
+                    id=notebook["notebook_id"],
+                )
+                table.downstream_notebooks[notebook_ref.id] = notebook_ref
+
     def get_column_lineage(
         self,
         table: Table,
         column_names: List[str],
         *,
         max_workers: Optional[int] = None,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
     ) -> None:
         try:
-            #
-
-            if self.
+            # Determine lineage data source based on config
+            use_system_tables = False
+            if self.lineage_data_source == LineageDataSource.SYSTEM_TABLES:
+                use_system_tables = True
+            elif self.lineage_data_source == LineageDataSource.API:
+                use_system_tables = False
+            elif self.lineage_data_source == LineageDataSource.AUTO:
+                # Use the newer system tables if we have a SQL warehouse, otherwise fall back
+                # to the older (and slower) HTTP API.
+                use_system_tables = bool(self.warehouse_id)
+            else:
+                assert_never(self.lineage_data_source)
+
+            if use_system_tables:
                 lineage = (
-                    self.
+                    self.get_catalog_column_lineage_via_system_tables(
+                        table.ref.catalog, start_time, end_time
+                    )
                     .get(table.ref.schema, {})
                     .get(table.ref.table, {})
                 )
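Note (illustrative, not part of the package diff): both table_lineage and get_column_lineage above pick the lineage backend with the same rule. A standalone sketch of that selection, assuming LineageDataSource is an enum with AUTO, SYSTEM_TABLES, and API members; the member values below are made up, the real definition lives in datahub/ingestion/source/unity/config.py, which is not shown here:

from enum import Enum
from typing import Optional

class LineageDataSource(Enum):
    # Assumed values; only the member names appear in the diff above.
    AUTO = "auto"
    SYSTEM_TABLES = "system_tables"
    API = "api"

def use_system_tables(source: LineageDataSource, warehouse_id: Optional[str]) -> bool:
    if source is LineageDataSource.SYSTEM_TABLES:
        return True
    if source is LineageDataSource.API:
        return False
    # AUTO: prefer system tables when a SQL warehouse is configured,
    # otherwise fall back to the slower HTTP API.
    return bool(warehouse_id)

assert use_system_tables(LineageDataSource.AUTO, "abc123") is True
assert use_system_tables(LineageDataSource.AUTO, None) is False
assert use_system_tables(LineageDataSource.API, "abc123") is False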
@@ -430,7 +716,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
                 with ThreadPoolExecutor(max_workers=max_workers) as executor:
                     futures = [
                         executor.submit(
-                            self.list_lineages_by_column,
+                            self.list_lineages_by_column_via_http_api,
                            table.ref.qualified_table_name,
                            column_name,
                        )
@@ -608,7 +894,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             logger.warning(f"Failed to execute SQL query: {e}")
             return []

-    @cached(cachetools.FIFOCache(maxsize=1))
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
     def get_schema_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
         """Optimized version using databricks-sql"""
         logger.info(f"Fetching schema tags for catalog: `{catalog}`")
@@ -631,7 +917,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):

         return result_dict

-    @cached(cachetools.FIFOCache(maxsize=1))
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
     def get_catalog_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
         """Optimized version using databricks-sql"""
         logger.info(f"Fetching table tags for catalog: `{catalog}`")
@@ -653,7 +939,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):

         return result_dict

-    @cached(cachetools.FIFOCache(maxsize=1))
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
     def get_table_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
         """Optimized version using databricks-sql"""
         logger.info(f"Fetching table tags for catalog: `{catalog}`")
@@ -676,7 +962,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):

         return result_dict

-    @cached(cachetools.FIFOCache(maxsize=1))
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
     def get_column_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
         """Optimized version using databricks-sql"""
         logger.info(f"Fetching column tags for catalog: `{catalog}`")
datahub/ingestion/source/unity/proxy_types.py

@@ -148,6 +148,7 @@ class TableReference:
     catalog: str
     schema: str
     table: str
+    last_updated: Optional[datetime] = None

     @classmethod
     def create(cls, table: "Table") -> "TableReference":
@@ -172,6 +173,7 @@ class TableReference:
                 d["catalog_name"],
                 d["schema_name"],
                 d.get("table_name", d["name"]),  # column vs table query output
+                last_updated=d.get("last_updated"),
             )
         except Exception as e:
             logger.warning(f"Failed to create TableReference from {d}: {e}")
@@ -199,6 +201,7 @@ class ExternalTableReference:
     name: Optional[str]
     type: Optional[SecurableType]
     storage_location: Optional[str]
+    last_updated: Optional[datetime] = None

     @classmethod
     def create_from_lineage(cls, d: dict) -> Optional["ExternalTableReference"]:
@@ -215,12 +218,19 @@ class ExternalTableReference:
                 name=d.get("securable_name"),
                 type=securable_type,
                 storage_location=d.get("storage_location"),
+                last_updated=d.get("last_updated"),
             )
         except Exception as e:
             logger.warning(f"Failed to create ExternalTableReference from {d}: {e}")
             return None


+@dataclass(frozen=True, order=True)
+class NotebookReference:
+    id: int
+    last_updated: Optional[datetime] = None
+
+
 @dataclass
 class Table(CommonProperty):
     schema: Schema
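Note (illustrative, not part of the package diff): NotebookReference is frozen (hashable) and keyed by notebook id in the new Table.upstream_notebooks and downstream_notebooks dicts below. A small self-contained sketch of the keep-the-most-recent merge that _process_system_table_lineage in proxy.py applies to these references:

from dataclasses import dataclass
from datetime import datetime
from typing import Dict, Optional

@dataclass(frozen=True, order=True)
class NotebookReference:
    id: int
    last_updated: Optional[datetime] = None

def merge(refs: Dict[int, NotebookReference], ref: NotebookReference) -> None:
    # Keep the reference with the most recent last_updated per notebook id.
    existing = refs.get(ref.id)
    if existing is None or (
        ref.last_updated
        and existing.last_updated
        and ref.last_updated > existing.last_updated
    ):
        refs[ref.id] = ref

refs: Dict[int, NotebookReference] = {}
merge(refs, NotebookReference(id=42, last_updated=datetime(2024, 1, 1)))
merge(refs, NotebookReference(id=42, last_updated=datetime(2024, 2, 1)))
assert refs[42].last_updated == datetime(2024, 2, 1)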
@@ -239,8 +249,8 @@ class Table(CommonProperty):
     properties: Dict[str, str]
     upstreams: Dict[TableReference, Dict[str, List[str]]] = field(default_factory=dict)
     external_upstreams: Set[ExternalTableReference] = field(default_factory=set)
-    upstream_notebooks:
-    downstream_notebooks:
+    upstream_notebooks: Dict[int, NotebookReference] = field(default_factory=dict)
+    downstream_notebooks: Dict[int, NotebookReference] = field(default_factory=dict)

     ref: TableReference = field(init=False)