acryl-datahub 1.2.0.4rc1__py3-none-any.whl → 1.2.0.4rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29):
  1. {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc2.dist-info}/METADATA +2520 -2520
  2. {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc2.dist-info}/RECORD +29 -29
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataset/dataset.py +3 -3
  5. datahub/api/entities/external/restricted_text.py +3 -3
  6. datahub/api/entities/forms/forms.py +3 -3
  7. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  8. datahub/cli/quickstart_versioning.py +1 -1
  9. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +1 -1
  10. datahub/ingestion/source/abs/config.py +1 -1
  11. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  12. datahub/ingestion/source/aws/tag_entities.py +2 -2
  13. datahub/ingestion/source/data_lake_common/path_spec.py +1 -2
  14. datahub/ingestion/source/hex/hex.py +1 -1
  15. datahub/ingestion/source/iceberg/iceberg.py +4 -4
  16. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  17. datahub/ingestion/source/redshift/datashares.py +1 -1
  18. datahub/ingestion/source/slack/slack.py +7 -14
  19. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -4
  20. datahub/ingestion/source/unity/config.py +36 -1
  21. datahub/ingestion/source/unity/proxy.py +332 -46
  22. datahub/ingestion/source/unity/proxy_types.py +12 -2
  23. datahub/ingestion/source/unity/source.py +91 -34
  24. datahub/ingestion/source/unity/tag_entities.py +2 -2
  25. datahub/ingestion/source/usage/starburst_trino_usage.py +2 -2
  26. {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc2.dist-info}/WHEEL +0 -0
  27. {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc2.dist-info}/entry_points.txt +0 -0
  28. {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc2.dist-info}/licenses/LICENSE +0 -0
  29. {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/unity/proxy.py
@@ -30,10 +30,14 @@ from databricks.sdk.service.sql import (
 from databricks.sdk.service.workspace import ObjectType
 from databricks.sql import connect
 from databricks.sql.types import Row
+from typing_extensions import assert_never

 from datahub._version import nice_version_name
 from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
 from datahub.emitter.mce_builder import parse_ts_millis
+from datahub.ingestion.source.unity.config import (
+    LineageDataSource,
+)
 from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
 from datahub.ingestion.source.unity.proxy_profiling import (
     UnityCatalogProxyProfilingMixin,
@@ -46,6 +50,7 @@ from datahub.ingestion.source.unity.proxy_types import (
     ExternalTableReference,
     Metastore,
     Notebook,
+    NotebookReference,
     Query,
     Schema,
     ServicePrincipal,
@@ -53,9 +58,14 @@ from datahub.ingestion.source.unity.proxy_types import (
     TableReference,
 )
 from datahub.ingestion.source.unity.report import UnityCatalogReport
+from datahub.utilities.file_backed_collections import FileBackedDict

 logger: logging.Logger = logging.getLogger(__name__)

+# It is enough to keep the cache size to 1, since we only process one catalog at a time
+# We need to change this if we want to support parallel processing of multiple catalogs
+_MAX_CONCURRENT_CATALOGS = 1
+

 @dataclasses.dataclass
 class TableInfoWithGeneration(TableInfo):
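The new `_MAX_CONCURRENT_CATALOGS` constant documents why the per-catalog caches below use a FIFO cache of size 1: catalogs are ingested one at a time, so only the current catalog's result needs to stay cached. A minimal standalone sketch of that behaviour with cachetools (illustrative only, not code from this package):

    import cachetools
    from cachetools import cached

    _MAX_CONCURRENT_CATALOGS = 1
    calls = []

    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
    def fetch_catalog_lineage(catalog: str) -> dict:
        calls.append(catalog)        # only runs on a cache miss
        return {"catalog": catalog}  # stand-in for the expensive system-table query

    fetch_catalog_lineage("sales")    # miss: executes the function
    fetch_catalog_lineage("sales")    # hit: served from the size-1 cache
    fetch_catalog_lineage("finance")  # miss: evicts "sales" (FIFO, maxsize=1)
    assert calls == ["sales", "finance"]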
@@ -91,6 +101,32 @@ class QueryFilterWithStatementTypes(QueryFilter):
         return v


+@dataclasses.dataclass
+class TableUpstream:
+    table_name: str
+    source_type: str
+    last_updated: Optional[datetime] = None
+
+
+@dataclasses.dataclass
+class ExternalUpstream:
+    path: str
+    source_type: str
+    last_updated: Optional[datetime] = None
+
+
+@dataclasses.dataclass
+class TableLineageInfo:
+    upstreams: List[TableUpstream] = dataclasses.field(default_factory=list)
+    external_upstreams: List[ExternalUpstream] = dataclasses.field(default_factory=list)
+    upstream_notebooks: List[NotebookReference] = dataclasses.field(
+        default_factory=list
+    )
+    downstream_notebooks: List[NotebookReference] = dataclasses.field(
+        default_factory=list
+    )
+
+
 class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
     _workspace_client: WorkspaceClient
     _workspace_url: str
@@ -104,6 +140,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         warehouse_id: Optional[str],
         report: UnityCatalogReport,
         hive_metastore_proxy: Optional[HiveMetastoreProxy] = None,
+        lineage_data_source: LineageDataSource = LineageDataSource.AUTO,
     ):
         self._workspace_client = WorkspaceClient(
             host=workspace_url,
@@ -114,6 +151,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         self.warehouse_id = warehouse_id or ""
         self.report = report
         self.hive_metastore_proxy = hive_metastore_proxy
+        self.lineage_data_source = lineage_data_source
         self._sql_connection_params = {
             "server_hostname": self._workspace_client.config.host.replace(
                 "https://", ""
@@ -293,16 +331,142 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
                 method, path, body={**body, "page_token": response["next_page_token"]}
             )

-    @cached(cachetools.FIFOCache(maxsize=100))
-    def get_catalog_column_lineage(self, catalog: str) -> Dict[str, Dict[str, dict]]:
-        """Get column lineage for all tables in a catalog."""
+    def _build_datetime_where_conditions(
+        self, start_time: Optional[datetime] = None, end_time: Optional[datetime] = None
+    ) -> str:
+        """Build datetime filtering conditions for lineage queries."""
+        conditions = []
+        if start_time:
+            conditions.append(f"event_time >= '{start_time.isoformat()}'")
+        if end_time:
+            conditions.append(f"event_time <= '{end_time.isoformat()}'")
+        return " AND " + " AND ".join(conditions) if conditions else ""
+
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
+    def get_catalog_table_lineage_via_system_tables(
+        self,
+        catalog: str,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+    ) -> FileBackedDict[TableLineageInfo]:
+        """Get table lineage for all tables in a catalog using system tables."""
+        logger.info(f"Fetching table lineage for catalog: {catalog}")
+        try:
+            additional_where = self._build_datetime_where_conditions(
+                start_time, end_time
+            )
+
+            query = f"""
+                SELECT
+                    entity_type, entity_id,
+                    source_table_full_name, source_type,
+                    target_table_full_name, target_type,
+                    max(event_time) as last_updated
+                FROM system.access.table_lineage
+                WHERE
+                    (target_table_catalog = %s or source_table_catalog = %s)
+                    {additional_where}
+                GROUP BY
+                    entity_type, entity_id,
+                    source_table_full_name, source_type,
+                    target_table_full_name, target_type
+            """
+            rows = self._execute_sql_query(query, [catalog, catalog])
+
+            result_dict: FileBackedDict[TableLineageInfo] = FileBackedDict()
+            for row in rows:
+                entity_type = row["entity_type"]
+                entity_id = row["entity_id"]
+                source_full_name = row["source_table_full_name"]
+                target_full_name = row["target_table_full_name"]
+                source_type = row["source_type"]
+                last_updated = row["last_updated"]
+
+                # Initialize TableLineageInfo for both source and target tables if they're in our catalog
+                for table_name in [source_full_name, target_full_name]:
+                    if (
+                        table_name
+                        and table_name.startswith(f"{catalog}.")
+                        and table_name not in result_dict
+                    ):
+                        result_dict[table_name] = TableLineageInfo()
+
+                # Process upstream relationships (target table gets upstreams)
+                if target_full_name and target_full_name.startswith(f"{catalog}."):
+                    # Handle table upstreams
+                    if (
+                        source_type in ["TABLE", "VIEW"]
+                        and source_full_name != target_full_name
+                    ):
+                        upstream = TableUpstream(
+                            table_name=source_full_name,
+                            source_type=source_type,
+                            last_updated=last_updated,
+                        )
+                        result_dict[target_full_name].upstreams.append(upstream)
+
+                    # Handle external upstreams (PATH type)
+                    elif source_type == "PATH":
+                        external_upstream = ExternalUpstream(
+                            path=source_full_name,
+                            source_type=source_type,
+                            last_updated=last_updated,
+                        )
+                        result_dict[target_full_name].external_upstreams.append(
+                            external_upstream
+                        )
+
+                    # Handle upstream notebooks (notebook -> table)
+                    elif entity_type == "NOTEBOOK":
+                        notebook_ref = NotebookReference(
+                            id=entity_id,
+                            last_updated=last_updated,
+                        )
+                        result_dict[target_full_name].upstream_notebooks.append(
+                            notebook_ref
+                        )
+
+                # Process downstream relationships (source table gets downstream notebooks)
+                if (
+                    entity_type == "NOTEBOOK"
+                    and source_full_name
+                    and source_full_name.startswith(f"{catalog}.")
+                ):
+                    notebook_ref = NotebookReference(
+                        id=entity_id,
+                        last_updated=last_updated,
+                    )
+                    result_dict[source_full_name].downstream_notebooks.append(
+                        notebook_ref
+                    )
+
+            return result_dict
+        except Exception as e:
+            logger.warning(
+                f"Error getting table lineage for catalog {catalog}: {e}",
+                exc_info=True,
+            )
+            return FileBackedDict()
+
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
+    def get_catalog_column_lineage_via_system_tables(
+        self,
+        catalog: str,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+    ) -> FileBackedDict[Dict[str, dict]]:
+        """Get column lineage for all tables in a catalog using system tables."""
         logger.info(f"Fetching column lineage for catalog: {catalog}")
         try:
-            query = """
+            additional_where = self._build_datetime_where_conditions(
+                start_time, end_time
+            )
+
+            query = f"""
                 SELECT
                     source_table_catalog, source_table_schema, source_table_name, source_column_name, source_type,
                     target_table_schema, target_table_name, target_column_name,
-                    max(event_time)
+                    max(event_time) as last_updated
                 FROM system.access.column_lineage
                 WHERE
                     target_table_catalog = %s
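The new `_build_datetime_where_conditions` helper returns either an empty string or an `AND`-prefixed fragment that is appended after the existing filters via `{additional_where}`. A standalone sketch of the same logic (mirrored from the diff above, not imported from the package), showing the emitted SQL fragment:

    from datetime import datetime, timezone
    from typing import List, Optional

    def build_datetime_where_conditions(
        start_time: Optional[datetime] = None, end_time: Optional[datetime] = None
    ) -> str:
        conditions: List[str] = []
        if start_time:
            conditions.append(f"event_time >= '{start_time.isoformat()}'")
        if end_time:
            conditions.append(f"event_time <= '{end_time.isoformat()}'")
        # Parenthesized by Python precedence as: (prefix + joined) if conditions else ""
        return " AND " + " AND ".join(conditions) if conditions else ""

    start = datetime(2024, 1, 1, tzinfo=timezone.utc)
    end = datetime(2024, 1, 31, tzinfo=timezone.utc)
    print(build_datetime_where_conditions(start, end))
    # " AND event_time >= '2024-01-01T00:00:00+00:00' AND event_time <= '2024-01-31T00:00:00+00:00'"
    print(repr(build_datetime_where_conditions()))  # "" -> no extra filtering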
@@ -313,13 +477,14 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
                     AND source_table_schema IS NOT NULL
                     AND source_table_name IS NOT NULL
                     AND source_column_name IS NOT NULL
+                    {additional_where}
                 GROUP BY
-                    source_table_catalog, source_table_schema, source_table_name, source_column_name, source_type,
+                    source_table_catalog, source_table_schema, source_table_name, source_column_name, source_type,
                     target_table_schema, target_table_name, target_column_name
             """
-            rows = self._execute_sql_query(query, (catalog,))
+            rows = self._execute_sql_query(query, [catalog])

-            result_dict: Dict[str, Dict[str, dict]] = {}
+            result_dict: FileBackedDict[Dict[str, dict]] = FileBackedDict()
             for row in rows:
                 result_dict.setdefault(row["target_table_schema"], {}).setdefault(
                     row["target_table_name"], {}
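Both system-table lineage methods now return a `FileBackedDict` rather than a plain in-memory dict, so a whole catalog's lineage can be cached without holding every entry in memory (entries spill to a temporary SQLite file while keeping a dict-like interface). A minimal, illustrative usage sketch, assuming only the operations the diff itself relies on (`[]`, `in`, `get`):

    from datahub.utilities.file_backed_collections import FileBackedDict

    lineage: FileBackedDict[dict] = FileBackedDict()
    lineage["main.sales.orders"] = {"upstreams": ["main.raw.orders_staging"]}

    assert "main.sales.orders" in lineage
    assert lineage.get("main.sales.unknown", {}) == {}
    for table_name, info in lineage.items():
        print(table_name, info["upstreams"])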
@@ -330,6 +495,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
                         "schema_name": row["source_table_schema"],
                         "table_name": row["source_table_name"],
                         "name": row["source_column_name"],
+                        "last_updated": row["last_updated"],
                     }
                 )

@@ -339,9 +505,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
                 f"Error getting column lineage for catalog {catalog}: {e}",
                 exc_info=True,
             )
-            return {}
+            return FileBackedDict()

-    def list_lineages_by_table(
+    def list_lineages_by_table_via_http_api(
         self, table_name: str, include_entity_lineage: bool
     ) -> dict:
         """List table lineage by table name."""
@@ -355,7 +521,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             },
         )

-    def list_lineages_by_column(self, table_name: str, column_name: str) -> list:
+    def list_lineages_by_column_via_http_api(
+        self, table_name: str, column_name: str
+    ) -> list:
         """List column lineage by table name and column name."""
         logger.debug(f"Getting column lineage for {table_name}.{column_name}")
         try:
@@ -374,55 +542,173 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             )
             return []

-    def table_lineage(self, table: Table, include_entity_lineage: bool) -> None:
+    def table_lineage(
+        self,
+        table: Table,
+        include_entity_lineage: bool,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+    ) -> None:
         if table.schema.catalog.type == CustomCatalogType.HIVE_METASTORE_CATALOG:
             # Lineage is not available for Hive Metastore Tables.
             return None
-        # Lineage endpoint doesn't exists on 2.1 version
-        try:
-            response: dict = self.list_lineages_by_table(
-                table_name=table.ref.qualified_table_name,
-                include_entity_lineage=include_entity_lineage,
-            )
-
-            for item in response.get("upstreams") or []:
-                if "tableInfo" in item:
-                    table_ref = TableReference.create_from_lineage(
-                        item["tableInfo"], table.schema.catalog.metastore
-                    )
-                    if table_ref:
-                        table.upstreams[table_ref] = {}
-                elif "fileInfo" in item:
-                    external_ref = ExternalTableReference.create_from_lineage(
-                        item["fileInfo"]
-                    )
-                    if external_ref:
-                        table.external_upstreams.add(external_ref)

-                for notebook in item.get("notebookInfos") or []:
-                    table.upstream_notebooks.add(notebook["notebook_id"])
+        try:
+            # Determine lineage data source based on config
+            use_system_tables = False
+            if self.lineage_data_source == LineageDataSource.SYSTEM_TABLES:
+                use_system_tables = True
+            elif self.lineage_data_source == LineageDataSource.API:
+                use_system_tables = False
+            elif self.lineage_data_source == LineageDataSource.AUTO:
+                # Use the newer system tables if we have a SQL warehouse, otherwise fall back
+                # to the older (and slower) HTTP API.
+                use_system_tables = bool(self.warehouse_id)
+            else:
+                assert_never(self.lineage_data_source)

-            for item in response.get("downstreams") or []:
-                for notebook in item.get("notebookInfos") or []:
-                    table.downstream_notebooks.add(notebook["notebook_id"])
+            if use_system_tables:
+                self._process_system_table_lineage(table, start_time, end_time)
+            else:
+                self._process_table_lineage_via_http_api(table, include_entity_lineage)
         except Exception as e:
             logger.warning(
                 f"Error getting lineage on table {table.ref}: {e}", exc_info=True
             )

+    def _process_system_table_lineage(
+        self,
+        table: Table,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+    ) -> None:
+        """Process table lineage using system.access.table_lineage table."""
+        catalog_lineage = self.get_catalog_table_lineage_via_system_tables(
+            table.ref.catalog, start_time, end_time
+        )
+        table_full_name = table.ref.qualified_table_name
+
+        lineage_info = catalog_lineage.get(table_full_name, TableLineageInfo())
+
+        # Process table upstreams
+        for upstream in lineage_info.upstreams:
+            upstream_table_name = upstream.table_name
+            # Parse catalog.schema.table format
+            parts = upstream_table_name.split(".")
+            if len(parts) == 3:
+                catalog_name, schema_name, table_name = parts[0], parts[1], parts[2]
+                table_ref = TableReference(
+                    metastore=table.schema.catalog.metastore.id
+                    if table.schema.catalog.metastore
+                    else None,
+                    catalog=catalog_name,
+                    schema=schema_name,
+                    table=table_name,
+                    last_updated=upstream.last_updated,
+                )
+                table.upstreams[table_ref] = {}
+            else:
+                logger.warning(
+                    f"Unexpected upstream table format: {upstream_table_name} for table {table_full_name}"
+                )
+                continue
+
+        # Process external upstreams
+        for external_upstream in lineage_info.external_upstreams:
+            external_ref = ExternalTableReference(
+                path=external_upstream.path,
+                has_permission=True,
+                name=None,
+                type=None,
+                storage_location=external_upstream.path,
+                last_updated=external_upstream.last_updated,
+            )
+            table.external_upstreams.add(external_ref)
+
+        # Process upstream notebook lineage
+        for notebook_ref in lineage_info.upstream_notebooks:
+            existing_ref = table.upstream_notebooks.get(notebook_ref.id)
+            if existing_ref is None or (
+                notebook_ref.last_updated
+                and existing_ref.last_updated
+                and notebook_ref.last_updated > existing_ref.last_updated
+            ):
+                table.upstream_notebooks[notebook_ref.id] = notebook_ref
+
+        # Process downstream notebook lineage
+        for notebook_ref in lineage_info.downstream_notebooks:
+            existing_ref = table.downstream_notebooks.get(notebook_ref.id)
+            if existing_ref is None or (
+                notebook_ref.last_updated
+                and existing_ref.last_updated
+                and notebook_ref.last_updated > existing_ref.last_updated
+            ):
+                table.downstream_notebooks[notebook_ref.id] = notebook_ref
+
+    def _process_table_lineage_via_http_api(
+        self, table: Table, include_entity_lineage: bool
+    ) -> None:
+        """Process table lineage using the HTTP API (legacy fallback)."""
+        response: dict = self.list_lineages_by_table_via_http_api(
+            table_name=table.ref.qualified_table_name,
+            include_entity_lineage=include_entity_lineage,
+        )
+
+        for item in response.get("upstreams") or []:
+            if "tableInfo" in item:
+                table_ref = TableReference.create_from_lineage(
+                    item["tableInfo"], table.schema.catalog.metastore
+                )
+                if table_ref:
+                    table.upstreams[table_ref] = {}
+            elif "fileInfo" in item:
+                external_ref = ExternalTableReference.create_from_lineage(
+                    item["fileInfo"]
+                )
+                if external_ref:
+                    table.external_upstreams.add(external_ref)
+
+            for notebook in item.get("notebookInfos") or []:
+                notebook_ref = NotebookReference(
+                    id=notebook["notebook_id"],
+                )
+                table.upstream_notebooks[notebook_ref.id] = notebook_ref
+
+        for item in response.get("downstreams") or []:
+            for notebook in item.get("notebookInfos") or []:
+                notebook_ref = NotebookReference(
+                    id=notebook["notebook_id"],
+                )
+                table.downstream_notebooks[notebook_ref.id] = notebook_ref
+
     def get_column_lineage(
         self,
         table: Table,
         column_names: List[str],
         *,
         max_workers: Optional[int] = None,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
     ) -> None:
         try:
-            # use the newer system tables if we have a SQL warehouse, otherwise fall back
-            # and use the older (and much slower) HTTP API.
-            if self.warehouse_id:
+            # Determine lineage data source based on config
+            use_system_tables = False
+            if self.lineage_data_source == LineageDataSource.SYSTEM_TABLES:
+                use_system_tables = True
+            elif self.lineage_data_source == LineageDataSource.API:
+                use_system_tables = False
+            elif self.lineage_data_source == LineageDataSource.AUTO:
+                # Use the newer system tables if we have a SQL warehouse, otherwise fall back
+                # to the older (and slower) HTTP API.
+                use_system_tables = bool(self.warehouse_id)
+            else:
+                assert_never(self.lineage_data_source)
+
+            if use_system_tables:
                 lineage = (
-                    self.get_catalog_column_lineage(table.ref.catalog)
+                    self.get_catalog_column_lineage_via_system_tables(
+                        table.ref.catalog, start_time, end_time
+                    )
                     .get(table.ref.schema, {})
                     .get(table.ref.table, {})
                 )
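Both `table_lineage` and `get_column_lineage` now share the same dispatch: an explicit `lineage_data_source` setting wins, and AUTO keeps the previous behaviour of checking for a configured SQL warehouse. A minimal sketch of that selection logic; the `LineageDataSource` member values and the standalone `use_system_tables` helper are assumptions for illustration (only the member names appear in this diff):

    from enum import Enum
    from typing import Optional
    from typing_extensions import assert_never

    class LineageDataSource(Enum):  # member values assumed; names match the diff
        AUTO = "auto"
        SYSTEM_TABLES = "system_tables"
        API = "api"

    def use_system_tables(
        lineage_data_source: LineageDataSource, warehouse_id: Optional[str]
    ) -> bool:
        if lineage_data_source == LineageDataSource.SYSTEM_TABLES:
            return True
        elif lineage_data_source == LineageDataSource.API:
            return False
        elif lineage_data_source == LineageDataSource.AUTO:
            # Prefer system tables when a SQL warehouse is configured; otherwise
            # fall back to the slower HTTP lineage API.
            return bool(warehouse_id)
        else:
            assert_never(lineage_data_source)

    assert use_system_tables(LineageDataSource.AUTO, "abc123") is True
    assert use_system_tables(LineageDataSource.AUTO, None) is False
    assert use_system_tables(LineageDataSource.API, "abc123") is False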
@@ -430,7 +716,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
                 with ThreadPoolExecutor(max_workers=max_workers) as executor:
                     futures = [
                         executor.submit(
-                            self.list_lineages_by_column,
+                            self.list_lineages_by_column_via_http_api,
                             table.ref.qualified_table_name,
                             column_name,
                         )
@@ -608,7 +894,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
            logger.warning(f"Failed to execute SQL query: {e}")
            return []

-    @cached(cachetools.FIFOCache(maxsize=100))
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
     def get_schema_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
         """Optimized version using databricks-sql"""
         logger.info(f"Fetching schema tags for catalog: `{catalog}`")
@@ -631,7 +917,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):

         return result_dict

-    @cached(cachetools.FIFOCache(maxsize=100))
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
     def get_catalog_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
         """Optimized version using databricks-sql"""
         logger.info(f"Fetching table tags for catalog: `{catalog}`")
@@ -653,7 +939,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):

         return result_dict

-    @cached(cachetools.FIFOCache(maxsize=100))
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
     def get_table_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
         """Optimized version using databricks-sql"""
         logger.info(f"Fetching table tags for catalog: `{catalog}`")
@@ -676,7 +962,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):

         return result_dict

-    @cached(cachetools.FIFOCache(maxsize=100))
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
     def get_column_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
         """Optimized version using databricks-sql"""
         logger.info(f"Fetching column tags for catalog: `{catalog}`")
datahub/ingestion/source/unity/proxy_types.py
@@ -148,6 +148,7 @@ class TableReference:
     catalog: str
     schema: str
     table: str
+    last_updated: Optional[datetime] = None

     @classmethod
     def create(cls, table: "Table") -> "TableReference":
@@ -172,6 +173,7 @@ class TableReference:
                 d["catalog_name"],
                 d["schema_name"],
                 d.get("table_name", d["name"]),  # column vs table query output
+                last_updated=d.get("last_updated"),
             )
         except Exception as e:
             logger.warning(f"Failed to create TableReference from {d}: {e}")
@@ -199,6 +201,7 @@ class ExternalTableReference:
     name: Optional[str]
     type: Optional[SecurableType]
     storage_location: Optional[str]
+    last_updated: Optional[datetime] = None

     @classmethod
     def create_from_lineage(cls, d: dict) -> Optional["ExternalTableReference"]:
@@ -215,12 +218,19 @@ class ExternalTableReference:
                 name=d.get("securable_name"),
                 type=securable_type,
                 storage_location=d.get("storage_location"),
+                last_updated=d.get("last_updated"),
             )
         except Exception as e:
             logger.warning(f"Failed to create ExternalTableReference from {d}: {e}")
             return None


+@dataclass(frozen=True, order=True)
+class NotebookReference:
+    id: int
+    last_updated: Optional[datetime] = None
+
+
 @dataclass
 class Table(CommonProperty):
     schema: Schema
@@ -239,8 +249,8 @@ class Table(CommonProperty):
     properties: Dict[str, str]
     upstreams: Dict[TableReference, Dict[str, List[str]]] = field(default_factory=dict)
     external_upstreams: Set[ExternalTableReference] = field(default_factory=set)
-    upstream_notebooks: Set[NotebookId] = field(default_factory=set)
-    downstream_notebooks: Set[NotebookId] = field(default_factory=set)
+    upstream_notebooks: Dict[int, NotebookReference] = field(default_factory=dict)
+    downstream_notebooks: Dict[int, NotebookReference] = field(default_factory=dict)

     ref: TableReference = field(init=False)
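With `upstream_notebooks` and `downstream_notebooks` now keyed by notebook id, repeated lineage rows for the same notebook collapse to a single `NotebookReference`, keeping the newest `last_updated`. A standalone sketch of that merge rule (mirroring `_process_system_table_lineage` above; the `merge_notebook_refs` helper is illustrative, not part of the package):

    from dataclasses import dataclass
    from datetime import datetime
    from typing import Dict, Iterable, Optional

    @dataclass(frozen=True, order=True)
    class NotebookReference:
        id: int
        last_updated: Optional[datetime] = None

    def merge_notebook_refs(
        refs: Iterable[NotebookReference], into: Dict[int, NotebookReference]
    ) -> None:
        for ref in refs:
            existing = into.get(ref.id)
            # Keep the stored reference unless the incoming one is strictly newer.
            if existing is None or (
                ref.last_updated
                and existing.last_updated
                and ref.last_updated > existing.last_updated
            ):
                into[ref.id] = ref

    upstream_notebooks: Dict[int, NotebookReference] = {}
    merge_notebook_refs(
        [
            NotebookReference(id=7, last_updated=datetime(2024, 1, 1)),
            NotebookReference(id=7, last_updated=datetime(2024, 2, 1)),   # newer wins
            NotebookReference(id=7, last_updated=datetime(2023, 12, 1)),  # older ignored
        ],
        upstream_notebooks,
    )
    assert upstream_notebooks[7].last_updated == datetime(2024, 2, 1)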