acryl-datahub 1.2.0.4rc4__py3-none-any.whl → 1.2.0.5rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (38)
  1. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/METADATA +2410 -2410
  2. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/RECORD +38 -36
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +1 -1
  5. datahub/api/entities/external/external_entities.py +500 -15
  6. datahub/ingestion/source/aws/glue.py +18 -14
  7. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  8. datahub/ingestion/source/aws/tag_entities.py +82 -104
  9. datahub/ingestion/source/common/subtypes.py +1 -0
  10. datahub/ingestion/source/hex/api.py +2 -0
  11. datahub/ingestion/source/hex/mapper.py +16 -2
  12. datahub/ingestion/source/hex/model.py +2 -0
  13. datahub/ingestion/source/looker/looker_common.py +26 -0
  14. datahub/ingestion/source/snowflake/constants.py +1 -0
  15. datahub/ingestion/source/snowflake/snowflake_query.py +50 -5
  16. datahub/ingestion/source/snowflake/snowflake_schema.py +173 -9
  17. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -2
  18. datahub/ingestion/source/snowflake/snowflake_utils.py +16 -3
  19. datahub/ingestion/source/snowflake/snowflake_v2.py +3 -1
  20. datahub/ingestion/source/sql/mssql/source.py +2 -25
  21. datahub/ingestion/source/sql/mysql.py +54 -0
  22. datahub/ingestion/source/sql/postgres.py +5 -134
  23. datahub/ingestion/source/sql/sql_common.py +137 -0
  24. datahub/ingestion/source/superset.py +140 -56
  25. datahub/ingestion/source/unity/config.py +11 -0
  26. datahub/ingestion/source/unity/connection_test.py +1 -0
  27. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  28. datahub/ingestion/source/unity/proxy.py +20 -6
  29. datahub/ingestion/source/unity/report.py +9 -1
  30. datahub/ingestion/source/unity/source.py +51 -16
  31. datahub/ingestion/source/unity/tag_entities.py +49 -147
  32. datahub/metadata/_internal_schema_classes.py +1 -1
  33. datahub/metadata/schema.avsc +4 -2
  34. datahub/metadata/schemas/Operation.avsc +4 -2
  35. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/WHEEL +0 -0
  36. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/entry_points.txt +0 -0
  37. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/licenses/LICENSE +0 -0
  38. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/sql_common.py
@@ -27,6 +27,7 @@ from sqlalchemy.exc import ProgrammingError
 from sqlalchemy.sql import sqltypes as types
 from sqlalchemy.types import TypeDecorator, TypeEngine
 
+from datahub.configuration.common import AllowDenyPattern
 from datahub.emitter.mce_builder import (
     make_data_platform_urn,
     make_dataplatform_instance_urn,
@@ -71,6 +72,11 @@ from datahub.ingestion.source.sql.sql_utils import (
 from datahub.ingestion.source.sql.sqlalchemy_data_reader import (
     SqlAlchemyTableDataReader,
 )
+from datahub.ingestion.source.sql.stored_procedures.base import (
+    BaseProcedure,
+    generate_procedure_container_workunits,
+    generate_procedure_workunits,
+)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
 )
@@ -531,6 +537,24 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         if self.config.include_views:
             yield from self.loop_views(inspector, schema, self.config)
 
+        if getattr(self.config, "include_stored_procedures", False):
+            try:
+                yield from self.loop_stored_procedures(inspector, schema, self.config)
+            except NotImplementedError as e:
+                self.report.warning(
+                    title="Stored procedures not supported",
+                    message="The current SQL dialect does not support stored procedures.",
+                    context=f"{database}.{schema}",
+                    exc=e,
+                )
+            except Exception as e:
+                self.report.failure(
+                    title="Failed to list stored procedures for schema",
+                    message="An error occurred while listing procedures for the schema.",
+                    context=f"{database}.{schema}",
+                    exc=e,
+                )
+
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),
@@ -1437,3 +1461,116 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
 
     def get_report(self):
         return self.report
+
+    def loop_stored_procedures(
+        self,
+        inspector: Inspector,
+        schema: str,
+        config: Union[SQLCommonConfig, Type[SQLCommonConfig]],
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        Loop schema data for get stored procedures as dataJob-s.
+        """
+        db_name = self.get_db_name(inspector)
+
+        procedures = self.fetch_procedures_for_schema(inspector, schema, db_name)
+        if procedures:
+            yield from self._process_procedures(procedures, db_name, schema)
+
+    def fetch_procedures_for_schema(
+        self, inspector: Inspector, schema: str, db_name: str
+    ) -> List[BaseProcedure]:
+        try:
+            raw_procedures: List[BaseProcedure] = self.get_procedures_for_schema(
+                inspector, schema, db_name
+            )
+            procedures: List[BaseProcedure] = []
+            for procedure in raw_procedures:
+                procedure_qualified_name = self.get_identifier(
+                    schema=schema,
+                    entity=procedure.name,
+                    inspector=inspector,
+                )
+
+                procedure_pattern = getattr(
+                    self.config, "procedure_pattern", AllowDenyPattern.allow_all()
+                )
+                if not procedure_pattern.allowed(procedure_qualified_name):
+                    self.report.report_dropped(procedure_qualified_name)
+                else:
+                    procedures.append(procedure)
+            return procedures
+        except NotImplementedError:
+            raise
+        except Exception as e:
+            self.report.warning(
+                title="Failed to get procedures for schema",
+                message="An error occurred while fetching procedures for the schema.",
+                context=f"{db_name}.{schema}",
+                exc=e,
+            )
+            return []
+
+    def get_procedures_for_schema(
+        self, inspector: Inspector, schema: str, db_name: str
+    ) -> List[BaseProcedure]:
+        raise NotImplementedError(
+            "Subclasses must implement the 'get_procedures_for_schema' method."
+        )
+
+    def _process_procedures(
+        self,
+        procedures: List[BaseProcedure],
+        db_name: str,
+        schema: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        if procedures:
+            yield from generate_procedure_container_workunits(
+                database_key=gen_database_key(
+                    database=db_name,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+                schema_key=gen_schema_key(
+                    db_name=db_name,
+                    schema=schema,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+            )
+        for procedure in procedures:
+            yield from self._process_procedure(procedure, schema, db_name)
+
+    def _process_procedure(
+        self,
+        procedure: BaseProcedure,
+        schema: str,
+        db_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        try:
+            yield from generate_procedure_workunits(
+                procedure=procedure,
+                database_key=gen_database_key(
+                    database=db_name,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+                schema_key=gen_schema_key(
+                    db_name=db_name,
+                    schema=schema,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+                schema_resolver=self.get_schema_resolver(),
+            )
+        except Exception as e:
+            self.report.warning(
+                title="Failed to emit stored procedure",
+                message="An error occurred while emitting stored procedure",
+                context=procedure.name,
+                exc=e,
+            )
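Note on the stored-procedure loop above: fetch_procedures_for_schema filters each procedure's qualified name through an optional procedure_pattern config field, falling back to AllowDenyPattern.allow_all() when a dialect's config does not define one, and dialect sources are expected to override get_procedures_for_schema (the base implementation raises NotImplementedError). A minimal sketch of how that allow/deny filter behaves; the pattern and procedure names below are invented for illustration and are not taken from the diff:

    from datahub.configuration.common import AllowDenyPattern

    # Deny-list one family of procedures; everything else stays allowed by default.
    pattern = AllowDenyPattern(deny=["analytics\\.internal_.*"])

    assert pattern.allowed("analytics.refresh_sales")         # would be kept and emitted
    assert not pattern.allowed("analytics.internal_cleanup")  # would be report_dropped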
datahub/ingestion/source/superset.py
@@ -8,6 +8,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import dateutil.parser as dp
 import requests
+import sqlglot
 from pydantic import BaseModel
 from pydantic.class_validators import root_validator, validator
 from pydantic.fields import Field
@@ -75,6 +76,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaFieldDataType,
     SchemaMetadata,
     StringTypeClass,
+    TimeTypeClass,
 )
 from datahub.metadata.schema_classes import (
     AuditStampClass,
@@ -131,8 +133,11 @@ FIELD_TYPE_MAPPING = {
     "STRING": StringTypeClass,
     "FLOAT": NumberTypeClass,
     "DATETIME": DateTypeClass,
+    "TIMESTAMP": TimeTypeClass,
     "BOOLEAN": BooleanTypeClass,
     "SQL": StringTypeClass,
+    "NUMERIC": NumberTypeClass,
+    "TEXT": StringTypeClass,
 }
 
 
@@ -633,74 +638,130 @@ class SupersetSource(StatefulIngestionSourceBase):
 
         return input_fields
 
-    def construct_chart_cll(
-        self,
-        chart_data: dict,
-        datasource_urn: Union[str, None],
-        datasource_id: Union[Any, int],
-    ) -> List[InputField]:
-        column_data: List[Union[str, dict]] = chart_data.get("form_data", {}).get(
-            "all_columns", []
-        )
+    def _extract_columns_from_sql(self, sql_expr: Optional[str]) -> List[str]:
+        if not sql_expr:
+            return []
 
-        # the second field represents whether its a SQL expression,
-        # false being just regular column and true being SQL col
-        chart_column_data: List[Tuple[str, bool]] = [
-            (column, False)
-            if isinstance(column, str)
-            else (column.get("label", ""), True)
-            for column in column_data
-        ]
+        try:
+            parsed_expr = sqlglot.parse_one(sql_expr)
 
-        dataset_columns: List[Tuple[str, str, str]] = []
+            column_refs = set()
+            for node in parsed_expr.walk():
+                if isinstance(node, sqlglot.exp.Column):
+                    column_name = node.name
+                    column_refs.add(column_name)
 
-        # parses the superset dataset's column info, to build type and description info
-        if datasource_id:
-            dataset_info = self.get_dataset_info(datasource_id).get("result", {})
-            dataset_column_info = dataset_info.get("columns", [])
-            dataset_metric_info = dataset_info.get("metrics", [])
+            return list(column_refs)
+        except Exception as e:
+            self.report.warning(f"Failed to parse SQL expression '{sql_expr}': {e}")
+            return []
 
-            for column in dataset_column_info:
-                col_name = column.get("column_name", "")
-                col_type = column.get("type", "")
-                col_description = column.get("description", "")
+    def _process_column_item(
+        self, item: Union[str, dict], unique_columns: Dict[str, bool]
+    ) -> None:
+        """Process a single column item and add to unique_columns."""
 
-                # if missing column name or column type, cannot construct the column,
-                # so we skip this column, missing description is fine
-                if col_name == "" or col_type == "":
-                    logger.info(f"could not construct column lineage for {column}")
-                    continue
+        def add_column(col_name: str, is_sql: bool) -> None:
+            if not col_name:
+                return
+            # Always set to False if any non-SQL seen, else keep as is_sql
+            unique_columns[col_name] = unique_columns.get(col_name, True) and is_sql
+
+        if isinstance(item, str):
+            add_column(item, False)
+        elif isinstance(item, dict):
+            if item.get("expressionType") == "SIMPLE":
+                # For metrics with SIMPLE expression type
+                add_column(item.get("column", {}).get("column_name", ""), False)
+            elif item.get("expressionType") == "SQL":
+                sql_expr = item.get("sqlExpression")
+                column_refs = self._extract_columns_from_sql(sql_expr)
+                for col in column_refs:
+                    add_column(col, False)
+                if not column_refs:
+                    add_column(item.get("label", ""), True)
+
+    def _collect_all_unique_columns(self, form_data: dict) -> Dict[str, bool]:
+        """Collect all unique column names from form_data, distinguishing SQL vs non-SQL."""
+        unique_columns: Dict[str, bool] = {}
+
+        # Process regular columns
+        for column in form_data.get("all_columns", []):
+            self._process_column_item(column, unique_columns)
+
+        # Process metrics
+        # For charts with a single metric, the metric is stored in the form_data as a string in the 'metric' key
+        # For charts with multiple metrics, the metrics are stored in the form_data as a list of strings in the 'metrics' key
+        if "metric" in form_data:
+            metrics_data = [form_data.get("metric")]
+        else:
+            metrics_data = form_data.get("metrics", [])
 
-                dataset_columns.append((col_name, col_type, col_description))
+        for metric in metrics_data:
+            if metric is not None:
+                self._process_column_item(metric, unique_columns)
 
-            for metric in dataset_metric_info:
-                metric_name = metric.get("metric_name", "")
-                metric_type = metric.get("metric_type", "")
-                metric_description = metric.get("description", "")
+        # Process group by columns
+        for group in form_data.get("groupby", []):
+            self._process_column_item(group, unique_columns)
 
-                if metric_name == "" or metric_type == "":
-                    logger.info(f"could not construct metric lineage for {metric}")
-                    continue
+        # Process x-axis columns
+        x_axis_data = form_data.get("x_axis")
+        if x_axis_data is not None:
+            self._process_column_item(x_axis_data, unique_columns)
 
-                dataset_columns.append((metric_name, metric_type, metric_description))
-        else:
-            # if no datasource id, cannot build cll, just return
+        return unique_columns
+
+    def _fetch_dataset_columns(
+        self, datasource_id: Union[Any, int]
+    ) -> List[Tuple[str, str, str]]:
+        """Fetch dataset columns and metrics from Superset API."""
+        if not datasource_id:
             logger.warning(
                 "no datasource id was found, cannot build column level lineage"
            )
             return []
 
+        dataset_info = self.get_dataset_info(datasource_id).get("result", {})
+        dataset_column_info = dataset_info.get("columns", [])
+        dataset_metric_info = dataset_info.get("metrics", [])
+
+        dataset_columns: List[Tuple[str, str, str]] = []
+        for column in dataset_column_info:
+            col_name = column.get("column_name", "")
+            col_type = column.get("type", "")
+            col_description = column.get("description", "")
+
+            if col_name == "" or col_type == "":
+                logger.info(f"could not construct column lineage for {column}")
+                continue
+
+            dataset_columns.append((col_name, col_type, col_description))
+
+        for metric in dataset_metric_info:
+            metric_name = metric.get("metric_name", "")
+            metric_type = metric.get("metric_type", "")
+            metric_description = metric.get("description", "")
+
+            if metric_name == "" or metric_type == "":
+                logger.info(f"could not construct metric lineage for {metric}")
+                continue
+
+            dataset_columns.append((metric_name, metric_type, metric_description))
+
+        return dataset_columns
+
+    def _match_chart_columns_with_dataset(
+        self,
+        unique_chart_columns: Dict[str, bool],
+        dataset_columns: List[Tuple[str, str, str]],
+    ) -> List[Tuple[str, str, str]]:
+        """Match chart columns with dataset columns, preserving SQL/non-SQL status."""
         chart_columns: List[Tuple[str, str, str]] = []
-        for chart_col in chart_column_data:
-            chart_col_name, is_sql = chart_col
+
+        for chart_col_name, is_sql in unique_chart_columns.items():
             if is_sql:
-                chart_columns.append(
-                    (
-                        chart_col_name,
-                        "SQL",
-                        "",
-                    )
-                )
+                chart_columns.append((chart_col_name, "SQL", ""))
                 continue
 
             # find matching upstream column
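Note on the new _extract_columns_from_sql helper above: it walks a sqlglot parse tree and collects sqlglot.exp.Column nodes. A standalone sketch of the same pattern, assuming a sqlglot version whose Expression.walk() yields nodes directly (as the new code does); the sample expression and expected output are invented for illustration:

    import sqlglot

    sql_expr = "CASE WHEN status = 'done' THEN amount ELSE 0 END"

    # Collect every column reference in the expression, deduplicated via a set.
    column_refs = set()
    for node in sqlglot.parse_one(sql_expr).walk():
        if isinstance(node, sqlglot.exp.Column):
            column_refs.add(node.name)

    print(sorted(column_refs))  # ['amount', 'status']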
@@ -711,13 +772,36 @@ class SupersetSource(StatefulIngestionSourceBase):
                 if dataset_col_name == chart_col_name:
                     chart_columns.append(
                         (chart_col_name, dataset_col_type, dataset_col_description)
-                    )  # column name, column type, description
+                    )
                     break
-
-            # if no matching upstream column was found
-            if len(chart_columns) == 0 or chart_columns[-1][0] != chart_col_name:
+            else:
                 chart_columns.append((chart_col_name, "", ""))
 
+        return chart_columns
+
+    def construct_chart_cll(
+        self,
+        chart_data: dict,
+        datasource_urn: Union[str, None],
+        datasource_id: Union[Any, int],
+    ) -> List[InputField]:
+        """Construct column-level lineage for a chart."""
+        form_data = chart_data.get("form_data", {})
+
+        # Extract and process all columns in one go
+        unique_columns = self._collect_all_unique_columns(form_data)
+
+        # Fetch dataset columns
+        dataset_columns = self._fetch_dataset_columns(datasource_id)
+        if not dataset_columns:
+            return []
+
+        # Match chart columns with dataset columns
+        chart_columns = self._match_chart_columns_with_dataset(
+            unique_columns, dataset_columns
+        )
+
+        # Build input fields
         return self.build_input_fields(chart_columns, datasource_urn)
 
     def construct_chart_from_chart_data(
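Note on _match_chart_columns_with_dataset above: the refactor replaces the old "check the last appended element" bookkeeping with Python's for/else, where the else branch runs only when the inner loop finishes without hitting break. A minimal illustration with made-up sample data:

    dataset_columns = [("price", "NUMERIC", "unit price"), ("ds", "TIMESTAMP", "")]

    chart_columns = []
    for chart_col_name in ["price", "order_id"]:
        for name, col_type, description in dataset_columns:
            if name == chart_col_name:
                chart_columns.append((chart_col_name, col_type, description))
                break
        else:  # no matching dataset column was found
            chart_columns.append((chart_col_name, "", ""))

    print(chart_columns)  # [('price', 'NUMERIC', 'unit price'), ('order_id', '', '')]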
datahub/ingestion/source/unity/config.py
@@ -275,6 +275,17 @@ class UnityCatalogSourceConfig(
         hidden_from_docs=True,
     )
 
+    databricks_api_page_size: int = pydantic.Field(
+        default=0,
+        ge=0,
+        description=(
+            "Page size for Databricks API calls when listing resources (catalogs, schemas, tables, etc.). "
+            "When set to 0 (default), uses server-side configured page length (recommended). "
+            "When set to a positive value, the page length is the minimum of this value and the server configured value. "
+            "Must be a non-negative integer."
+        ),
+    )
+
     include_usage_statistics: bool = Field(
         default=True,
         description="Generate usage statistics.",
datahub/ingestion/source/unity/connection_test.py
@@ -19,6 +19,7 @@ class UnityCatalogConnectionTest:
             self.config.token,
             self.config.profiling.warehouse_id,
             report=self.report,
+            databricks_api_page_size=self.config.databricks_api_page_size,
         )
 
     def get_connection_test(self) -> TestConnectionReport:
datahub/ingestion/source/unity/platform_resource_repository.py (new file)
@@ -0,0 +1,19 @@
+import logging
+
+from datahub.api.entities.external.external_entities import (
+    PlatformResourceRepository,
+)
+from datahub.ingestion.source.unity.tag_entities import (
+    UnityCatalogTagPlatformResource,
+    UnityCatalogTagPlatformResourceId,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class UnityCatalogPlatformResourceRepository(
+    PlatformResourceRepository[
+        UnityCatalogTagPlatformResourceId, UnityCatalogTagPlatformResource
+    ]
+):
+    """Unity Catalog-specific platform resource repository with tag-related operations."""
datahub/ingestion/source/unity/proxy.py
@@ -141,6 +141,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         report: UnityCatalogReport,
         hive_metastore_proxy: Optional[HiveMetastoreProxy] = None,
         lineage_data_source: LineageDataSource = LineageDataSource.AUTO,
+        databricks_api_page_size: int = 0,
     ):
         self._workspace_client = WorkspaceClient(
             host=workspace_url,
@@ -152,6 +153,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         self.report = report
         self.hive_metastore_proxy = hive_metastore_proxy
         self.lineage_data_source = lineage_data_source
+        self.databricks_api_page_size = databricks_api_page_size
         self._sql_connection_params = {
             "server_hostname": self._workspace_client.config.host.replace(
                 "https://", ""
@@ -161,7 +163,11 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         }
 
     def check_basic_connectivity(self) -> bool:
-        return bool(self._workspace_client.catalogs.list(include_browse=True))
+        return bool(
+            self._workspace_client.catalogs.list(
+                include_browse=True, max_results=self.databricks_api_page_size
+            )
+        )
 
     def assigned_metastore(self) -> Optional[Metastore]:
         response = self._workspace_client.metastores.summary()
@@ -171,7 +177,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         if self.hive_metastore_proxy:
             yield self.hive_metastore_proxy.hive_metastore_catalog(metastore)
 
-        response = self._workspace_client.catalogs.list(include_browse=True)
+        response = self._workspace_client.catalogs.list(
+            include_browse=True, max_results=self.databricks_api_page_size
+        )
         if not response:
             logger.info("Catalogs not found")
             return
@@ -203,7 +211,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             yield from self.hive_metastore_proxy.hive_metastore_schemas(catalog)
             return
         response = self._workspace_client.schemas.list(
-            catalog_name=catalog.name, include_browse=True
+            catalog_name=catalog.name,
+            include_browse=True,
+            max_results=self.databricks_api_page_size,
         )
         if not response:
             logger.info(f"Schemas not found for catalog {catalog.id}")
@@ -225,6 +235,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             catalog_name=schema.catalog.name,
             schema_name=schema.name,
             include_browse=True,
+            max_results=self.databricks_api_page_size,
         )
         if not response:
             logger.info(f"Tables not found for schema {schema.id}")
@@ -257,7 +268,10 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         return group_list
 
     def workspace_notebooks(self) -> Iterable[Notebook]:
-        for obj in self._workspace_client.workspace.list("/", recursive=True):
+        workspace_objects_iter = self._workspace_client.workspace.list(
+            "/", recursive=True, max_results=self.databricks_api_page_size
+        )
+        for obj in workspace_objects_iter:
             if obj.object_type == ObjectType.NOTEBOOK and obj.object_id and obj.path:
                 yield Notebook(
                     id=obj.object_id,
@@ -299,7 +313,6 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
     def _query_history(
         self,
        filter_by: QueryFilterWithStatementTypes,
-        max_results: int = 1000,
         include_metrics: bool = False,
     ) -> Iterable[QueryInfo]:
         """Manual implementation of the query_history.list() endpoint.
@@ -311,9 +324,10 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         """
         method = "GET"
         path = "/api/2.0/sql/history/queries"
+
         body: Dict[str, Any] = {
             "include_metrics": include_metrics,
-            "max_results": max_results,  # Max batch size
+            "max_results": self.databricks_api_page_size,  # Max batch size
         }
 
         response: dict = self._workspace_client.api_client.do(  # type: ignore
datahub/ingestion/source/unity/report.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass, field
-from typing import Optional, Tuple
+from typing import TYPE_CHECKING, Optional, Tuple
 
 from datahub.ingestion.api.report import EntityFilterReport, Report
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
@@ -7,6 +7,11 @@ from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.perf_timer import PerfTimer
 
+if TYPE_CHECKING:
+    from datahub.ingestion.source.unity.platform_resource_repository import (
+        UnityCatalogPlatformResourceRepository,
+    )
+
 
 @dataclass
 class UnityCatalogUsagePerfReport(Report):
@@ -61,3 +66,6 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
     num_tables_missing_name: int = 0
     num_columns_missing_name: int = 0
     num_queries_missing_info: int = 0
+
+    # Platform resource repository for automatic cache statistics via SupportsAsObj
+    tag_urn_resolver_cache: Optional["UnityCatalogPlatformResourceRepository"] = None