acryl-datahub 1.2.0.4rc4__py3-none-any.whl → 1.2.0.5rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/METADATA +2410 -2410
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/RECORD +38 -36
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/external/external_entities.py +500 -15
- datahub/ingestion/source/aws/glue.py +18 -14
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/tag_entities.py +82 -104
- datahub/ingestion/source/common/subtypes.py +1 -0
- datahub/ingestion/source/hex/api.py +2 -0
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/looker/looker_common.py +26 -0
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_query.py +50 -5
- datahub/ingestion/source/snowflake/snowflake_schema.py +173 -9
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +16 -3
- datahub/ingestion/source/snowflake/snowflake_v2.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +2 -25
- datahub/ingestion/source/sql/mysql.py +54 -0
- datahub/ingestion/source/sql/postgres.py +5 -134
- datahub/ingestion/source/sql/sql_common.py +137 -0
- datahub/ingestion/source/superset.py +140 -56
- datahub/ingestion/source/unity/config.py +11 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +20 -6
- datahub/ingestion/source/unity/report.py +9 -1
- datahub/ingestion/source/unity/source.py +51 -16
- datahub/ingestion/source/unity/tag_entities.py +49 -147
- datahub/metadata/_internal_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +4 -2
- datahub/metadata/schemas/Operation.avsc +4 -2
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/sql_common.py

```diff
@@ -27,6 +27,7 @@ from sqlalchemy.exc import ProgrammingError
 from sqlalchemy.sql import sqltypes as types
 from sqlalchemy.types import TypeDecorator, TypeEngine
 
+from datahub.configuration.common import AllowDenyPattern
 from datahub.emitter.mce_builder import (
     make_data_platform_urn,
     make_dataplatform_instance_urn,
@@ -71,6 +72,11 @@ from datahub.ingestion.source.sql.sql_utils import (
 from datahub.ingestion.source.sql.sqlalchemy_data_reader import (
     SqlAlchemyTableDataReader,
 )
+from datahub.ingestion.source.sql.stored_procedures.base import (
+    BaseProcedure,
+    generate_procedure_container_workunits,
+    generate_procedure_workunits,
+)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
 )
@@ -531,6 +537,24 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         if self.config.include_views:
             yield from self.loop_views(inspector, schema, self.config)
 
+        if getattr(self.config, "include_stored_procedures", False):
+            try:
+                yield from self.loop_stored_procedures(inspector, schema, self.config)
+            except NotImplementedError as e:
+                self.report.warning(
+                    title="Stored procedures not supported",
+                    message="The current SQL dialect does not support stored procedures.",
+                    context=f"{database}.{schema}",
+                    exc=e,
+                )
+            except Exception as e:
+                self.report.failure(
+                    title="Failed to list stored procedures for schema",
+                    message="An error occurred while listing procedures for the schema.",
+                    context=f"{database}.{schema}",
+                    exc=e,
+                )
+
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),
@@ -1437,3 +1461,116 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
 
     def get_report(self):
         return self.report
+
+    def loop_stored_procedures(
+        self,
+        inspector: Inspector,
+        schema: str,
+        config: Union[SQLCommonConfig, Type[SQLCommonConfig]],
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        Loop schema data for get stored procedures as dataJob-s.
+        """
+        db_name = self.get_db_name(inspector)
+
+        procedures = self.fetch_procedures_for_schema(inspector, schema, db_name)
+        if procedures:
+            yield from self._process_procedures(procedures, db_name, schema)
+
+    def fetch_procedures_for_schema(
+        self, inspector: Inspector, schema: str, db_name: str
+    ) -> List[BaseProcedure]:
+        try:
+            raw_procedures: List[BaseProcedure] = self.get_procedures_for_schema(
+                inspector, schema, db_name
+            )
+            procedures: List[BaseProcedure] = []
+            for procedure in raw_procedures:
+                procedure_qualified_name = self.get_identifier(
+                    schema=schema,
+                    entity=procedure.name,
+                    inspector=inspector,
+                )
+
+                procedure_pattern = getattr(
+                    self.config, "procedure_pattern", AllowDenyPattern.allow_all()
+                )
+                if not procedure_pattern.allowed(procedure_qualified_name):
+                    self.report.report_dropped(procedure_qualified_name)
+                else:
+                    procedures.append(procedure)
+            return procedures
+        except NotImplementedError:
+            raise
+        except Exception as e:
+            self.report.warning(
+                title="Failed to get procedures for schema",
+                message="An error occurred while fetching procedures for the schema.",
+                context=f"{db_name}.{schema}",
+                exc=e,
+            )
+            return []
+
+    def get_procedures_for_schema(
+        self, inspector: Inspector, schema: str, db_name: str
+    ) -> List[BaseProcedure]:
+        raise NotImplementedError(
+            "Subclasses must implement the 'get_procedures_for_schema' method."
+        )
+
+    def _process_procedures(
+        self,
+        procedures: List[BaseProcedure],
+        db_name: str,
+        schema: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        if procedures:
+            yield from generate_procedure_container_workunits(
+                database_key=gen_database_key(
+                    database=db_name,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+                schema_key=gen_schema_key(
+                    db_name=db_name,
+                    schema=schema,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+            )
+        for procedure in procedures:
+            yield from self._process_procedure(procedure, schema, db_name)
+
+    def _process_procedure(
+        self,
+        procedure: BaseProcedure,
+        schema: str,
+        db_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        try:
+            yield from generate_procedure_workunits(
+                procedure=procedure,
+                database_key=gen_database_key(
+                    database=db_name,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+                schema_key=gen_schema_key(
+                    db_name=db_name,
+                    schema=schema,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+                schema_resolver=self.get_schema_resolver(),
+            )
+        except Exception as e:
+            self.report.warning(
+                title="Failed to emit stored procedure",
+                message="An error occurred while emitting stored procedure",
+                context=procedure.name,
+                exc=e,
+            )
```
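The new `fetch_procedures_for_schema` hook filters procedures through an optional `procedure_pattern` (an `AllowDenyPattern`) before they are emitted as DataJobs. A minimal sketch of that filtering behavior, using made-up allow/deny regexes and procedure names rather than anything shipped in this release:

```python
# Illustration of the AllowDenyPattern check used in fetch_procedures_for_schema.
# The regexes and procedure names below are hypothetical examples.
from datahub.configuration.common import AllowDenyPattern

procedure_pattern = AllowDenyPattern(
    allow=[r"analytics\..*"],  # keep procedures in the analytics schema
    deny=[r".*\.tmp_.*"],      # drop temporary helpers anywhere
)

for qualified_name in [
    "analytics.refresh_daily_sales",
    "analytics.tmp_backfill",
    "staging.load_events",
]:
    # Mirrors the diff: disallowed names go to report_dropped, the rest are
    # passed on to the procedure work-unit generators.
    verdict = "kept" if procedure_pattern.allowed(qualified_name) else "dropped"
    print(f"{qualified_name}: {verdict}")
```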
datahub/ingestion/source/superset.py

```diff
@@ -8,6 +8,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import dateutil.parser as dp
 import requests
+import sqlglot
 from pydantic import BaseModel
 from pydantic.class_validators import root_validator, validator
 from pydantic.fields import Field
@@ -75,6 +76,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaFieldDataType,
     SchemaMetadata,
     StringTypeClass,
+    TimeTypeClass,
 )
 from datahub.metadata.schema_classes import (
     AuditStampClass,
@@ -131,8 +133,11 @@ FIELD_TYPE_MAPPING = {
     "STRING": StringTypeClass,
     "FLOAT": NumberTypeClass,
     "DATETIME": DateTypeClass,
+    "TIMESTAMP": TimeTypeClass,
     "BOOLEAN": BooleanTypeClass,
     "SQL": StringTypeClass,
+    "NUMERIC": NumberTypeClass,
+    "TEXT": StringTypeClass,
 }
 
 
@@ -633,74 +638,130 @@ class SupersetSource(StatefulIngestionSourceBase):
 
         return input_fields
 
-    def construct_chart_cll(
-        self,
-        chart_data: dict,
-        datasource_urn: Union[str, None],
-        datasource_id: Union[Any, int],
-    ) -> List[InputField]:
-        column_data: List[Union[str, dict]] = chart_data.get("form_data", {}).get(
-            "all_columns", []
-        )
+    def _extract_columns_from_sql(self, sql_expr: Optional[str]) -> List[str]:
+        if not sql_expr:
+            return []
 
-        chart_column_data: List[Tuple[str, bool]] = [
-            (column, False)
-            if isinstance(column, str)
-            else (column.get("label", ""), True)
-            for column in column_data
-        ]
+        try:
+            parsed_expr = sqlglot.parse_one(sql_expr)
 
+            column_refs = set()
+            for node in parsed_expr.walk():
+                if isinstance(node, sqlglot.exp.Column):
+                    column_name = node.name
+                    column_refs.add(column_name)
 
-        dataset_metric_info = dataset_info.get("metrics", [])
+            return list(column_refs)
+        except Exception as e:
+            self.report.warning(f"Failed to parse SQL expression '{sql_expr}': {e}")
+            return []
 
+    def _process_column_item(
+        self, item: Union[str, dict], unique_columns: Dict[str, bool]
+    ) -> None:
+        """Process a single column item and add to unique_columns."""
 
+        def add_column(col_name: str, is_sql: bool) -> None:
+            if not col_name:
+                return
+            # Always set to False if any non-SQL seen, else keep as is_sql
+            unique_columns[col_name] = unique_columns.get(col_name, True) and is_sql
+
+        if isinstance(item, str):
+            add_column(item, False)
+        elif isinstance(item, dict):
+            if item.get("expressionType") == "SIMPLE":
+                # For metrics with SIMPLE expression type
+                add_column(item.get("column", {}).get("column_name", ""), False)
+            elif item.get("expressionType") == "SQL":
+                sql_expr = item.get("sqlExpression")
+                column_refs = self._extract_columns_from_sql(sql_expr)
+                for col in column_refs:
+                    add_column(col, False)
+                if not column_refs:
+                    add_column(item.get("label", ""), True)
+
+    def _collect_all_unique_columns(self, form_data: dict) -> Dict[str, bool]:
+        """Collect all unique column names from form_data, distinguishing SQL vs non-SQL."""
+        unique_columns: Dict[str, bool] = {}
+
+        # Process regular columns
+        for column in form_data.get("all_columns", []):
+            self._process_column_item(column, unique_columns)
+
+        # Process metrics
+        # For charts with a single metric, the metric is stored in the form_data as a string in the 'metric' key
+        # For charts with multiple metrics, the metrics are stored in the form_data as a list of strings in the 'metrics' key
+        if "metric" in form_data:
+            metrics_data = [form_data.get("metric")]
+        else:
+            metrics_data = form_data.get("metrics", [])
 
+        for metric in metrics_data:
+            if metric is not None:
+                self._process_column_item(metric, unique_columns)
 
-            metric_description = metric.get("description", "")
+        # Process group by columns
+        for group in form_data.get("groupby", []):
+            self._process_column_item(group, unique_columns)
 
+        # Process x-axis columns
+        x_axis_data = form_data.get("x_axis")
+        if x_axis_data is not None:
+            self._process_column_item(x_axis_data, unique_columns)
 
+        return unique_columns
+
+    def _fetch_dataset_columns(
+        self, datasource_id: Union[Any, int]
+    ) -> List[Tuple[str, str, str]]:
+        """Fetch dataset columns and metrics from Superset API."""
+        if not datasource_id:
             logger.warning(
                 "no datasource id was found, cannot build column level lineage"
             )
             return []
 
+        dataset_info = self.get_dataset_info(datasource_id).get("result", {})
+        dataset_column_info = dataset_info.get("columns", [])
+        dataset_metric_info = dataset_info.get("metrics", [])
+
+        dataset_columns: List[Tuple[str, str, str]] = []
+        for column in dataset_column_info:
+            col_name = column.get("column_name", "")
+            col_type = column.get("type", "")
+            col_description = column.get("description", "")
+
+            if col_name == "" or col_type == "":
+                logger.info(f"could not construct column lineage for {column}")
+                continue
+
+            dataset_columns.append((col_name, col_type, col_description))
+
+        for metric in dataset_metric_info:
+            metric_name = metric.get("metric_name", "")
+            metric_type = metric.get("metric_type", "")
+            metric_description = metric.get("description", "")
+
+            if metric_name == "" or metric_type == "":
+                logger.info(f"could not construct metric lineage for {metric}")
+                continue
+
+            dataset_columns.append((metric_name, metric_type, metric_description))
+
+        return dataset_columns
+
+    def _match_chart_columns_with_dataset(
+        self,
+        unique_chart_columns: Dict[str, bool],
+        dataset_columns: List[Tuple[str, str, str]],
+    ) -> List[Tuple[str, str, str]]:
+        """Match chart columns with dataset columns, preserving SQL/non-SQL status."""
         chart_columns: List[Tuple[str, str, str]] = []
+
+        for chart_col_name, is_sql in unique_chart_columns.items():
             if is_sql:
-                chart_columns.append(
-                    (
-                        chart_col_name,
-                        "SQL",
-                        "",
-                    )
-                )
+                chart_columns.append((chart_col_name, "SQL", ""))
                 continue
 
             # find matching upstream column
@@ -711,13 +772,36 @@ class SupersetSource(StatefulIngestionSourceBase):
                 if dataset_col_name == chart_col_name:
                     chart_columns.append(
                         (chart_col_name, dataset_col_type, dataset_col_description)
-                    )
+                    )
                     break
-
-            # if no matching upstream column was found
-            if len(chart_columns) == 0 or chart_columns[-1][0] != chart_col_name:
+            else:
                 chart_columns.append((chart_col_name, "", ""))
 
+        return chart_columns
+
+    def construct_chart_cll(
+        self,
+        chart_data: dict,
+        datasource_urn: Union[str, None],
+        datasource_id: Union[Any, int],
+    ) -> List[InputField]:
+        """Construct column-level lineage for a chart."""
+        form_data = chart_data.get("form_data", {})
+
+        # Extract and process all columns in one go
+        unique_columns = self._collect_all_unique_columns(form_data)
+
+        # Fetch dataset columns
+        dataset_columns = self._fetch_dataset_columns(datasource_id)
+        if not dataset_columns:
+            return []
+
+        # Match chart columns with dataset columns
+        chart_columns = self._match_chart_columns_with_dataset(
+            unique_columns, dataset_columns
+        )
+
+        # Build input fields
        return self.build_input_fields(chart_columns, datasource_urn)
 
     def construct_chart_from_chart_data(
```
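The new `_extract_columns_from_sql` helper walks a sqlglot parse tree and collects every column reference from a Superset SQL metric or column expression. A standalone sketch of the same idea, with a made-up expression (using `find_all`, which is equivalent to walking the tree and keeping `Column` nodes):

```python
import sqlglot
from sqlglot import exp

# A hypothetical Superset SQL metric expression, not taken from the diff.
sql_expr = "SUM(amount) / NULLIF(COUNT(order_id), 0)"

parsed = sqlglot.parse_one(sql_expr)

# Collect the referenced column names, as the new helper does.
columns = sorted({node.name for node in parsed.find_all(exp.Column)})
print(columns)  # ['amount', 'order_id']
```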
datahub/ingestion/source/unity/config.py

```diff
@@ -275,6 +275,17 @@ class UnityCatalogSourceConfig(
         hidden_from_docs=True,
     )
 
+    databricks_api_page_size: int = pydantic.Field(
+        default=0,
+        ge=0,
+        description=(
+            "Page size for Databricks API calls when listing resources (catalogs, schemas, tables, etc.). "
+            "When set to 0 (default), uses server-side configured page length (recommended). "
+            "When set to a positive value, the page length is the minimum of this value and the server configured value. "
+            "Must be a non-negative integer."
+        ),
+    )
+
     include_usage_statistics: bool = Field(
         default=True,
         description="Generate usage statistics.",
```
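For reference, a hypothetical recipe fragment showing where the new option would be set; only `databricks_api_page_size` is new in this release, and the credential values are placeholders:

```python
from datahub.ingestion.run.pipeline import Pipeline

# Hypothetical unity-catalog recipe; workspace_url/token are placeholders.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "unity-catalog",
            "config": {
                "workspace_url": "https://example.cloud.databricks.com",
                "token": "<personal-access-token>",
                # 0 (default) defers to the server-side page length; a positive
                # value is capped by the server-configured maximum.
                "databricks_api_page_size": 500,
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
```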
datahub/ingestion/source/unity/platform_resource_repository.py (new file)

```diff
@@ -0,0 +1,19 @@
+import logging
+
+from datahub.api.entities.external.external_entities import (
+    PlatformResourceRepository,
+)
+from datahub.ingestion.source.unity.tag_entities import (
+    UnityCatalogTagPlatformResource,
+    UnityCatalogTagPlatformResourceId,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class UnityCatalogPlatformResourceRepository(
+    PlatformResourceRepository[
+        UnityCatalogTagPlatformResourceId, UnityCatalogTagPlatformResource
+    ]
+):
+    """Unity Catalog-specific platform resource repository with tag-related operations."""
```
datahub/ingestion/source/unity/proxy.py

```diff
@@ -141,6 +141,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         report: UnityCatalogReport,
         hive_metastore_proxy: Optional[HiveMetastoreProxy] = None,
         lineage_data_source: LineageDataSource = LineageDataSource.AUTO,
+        databricks_api_page_size: int = 0,
     ):
         self._workspace_client = WorkspaceClient(
             host=workspace_url,
@@ -152,6 +153,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         self.report = report
         self.hive_metastore_proxy = hive_metastore_proxy
         self.lineage_data_source = lineage_data_source
+        self.databricks_api_page_size = databricks_api_page_size
         self._sql_connection_params = {
             "server_hostname": self._workspace_client.config.host.replace(
                 "https://", ""
@@ -161,7 +163,11 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         }
 
     def check_basic_connectivity(self) -> bool:
-        return bool(
+        return bool(
+            self._workspace_client.catalogs.list(
+                include_browse=True, max_results=self.databricks_api_page_size
+            )
+        )
 
     def assigned_metastore(self) -> Optional[Metastore]:
         response = self._workspace_client.metastores.summary()
@@ -171,7 +177,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         if self.hive_metastore_proxy:
             yield self.hive_metastore_proxy.hive_metastore_catalog(metastore)
 
-        response = self._workspace_client.catalogs.list(
+        response = self._workspace_client.catalogs.list(
+            include_browse=True, max_results=self.databricks_api_page_size
+        )
         if not response:
             logger.info("Catalogs not found")
             return
@@ -203,7 +211,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             yield from self.hive_metastore_proxy.hive_metastore_schemas(catalog)
             return
         response = self._workspace_client.schemas.list(
-            catalog_name=catalog.name,
+            catalog_name=catalog.name,
+            include_browse=True,
+            max_results=self.databricks_api_page_size,
         )
         if not response:
             logger.info(f"Schemas not found for catalog {catalog.id}")
@@ -225,6 +235,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             catalog_name=schema.catalog.name,
             schema_name=schema.name,
             include_browse=True,
+            max_results=self.databricks_api_page_size,
         )
         if not response:
             logger.info(f"Tables not found for schema {schema.id}")
@@ -257,7 +268,10 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         return group_list
 
     def workspace_notebooks(self) -> Iterable[Notebook]:
+        workspace_objects_iter = self._workspace_client.workspace.list(
+            "/", recursive=True, max_results=self.databricks_api_page_size
+        )
+        for obj in workspace_objects_iter:
             if obj.object_type == ObjectType.NOTEBOOK and obj.object_id and obj.path:
                 yield Notebook(
                     id=obj.object_id,
@@ -299,7 +313,6 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
     def _query_history(
         self,
         filter_by: QueryFilterWithStatementTypes,
-        max_results: int = 1000,
         include_metrics: bool = False,
     ) -> Iterable[QueryInfo]:
         """Manual implementation of the query_history.list() endpoint.
@@ -311,9 +324,10 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         """
         method = "GET"
         path = "/api/2.0/sql/history/queries"
+
         body: Dict[str, Any] = {
             "include_metrics": include_metrics,
-            "max_results":
+            "max_results": self.databricks_api_page_size,  # Max batch size
         }
 
         response: dict = self._workspace_client.api_client.do(  # type: ignore
```
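The same call pattern can be exercised directly with the Databricks SDK; a small sketch with placeholder credentials, mirroring `check_basic_connectivity` as changed in this diff (the `max_results` keyword is assumed to be available in the SDK version this release pins):

```python
from databricks.sdk import WorkspaceClient

# Placeholders; in the source these come from the connection config.
w = WorkspaceClient(host="https://example.cloud.databricks.com", token="<token>")

page_size = 0  # same default as databricks_api_page_size: let the server decide
for catalog in w.catalogs.list(include_browse=True, max_results=page_size):
    print(catalog.name)
```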
datahub/ingestion/source/unity/report.py

```diff
@@ -1,5 +1,5 @@
 from dataclasses import dataclass, field
-from typing import Optional, Tuple
+from typing import TYPE_CHECKING, Optional, Tuple
 
 from datahub.ingestion.api.report import EntityFilterReport, Report
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
@@ -7,6 +7,11 @@ from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.perf_timer import PerfTimer
 
+if TYPE_CHECKING:
+    from datahub.ingestion.source.unity.platform_resource_repository import (
+        UnityCatalogPlatformResourceRepository,
+    )
+
 
 @dataclass
 class UnityCatalogUsagePerfReport(Report):
@@ -61,3 +66,6 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
     num_tables_missing_name: int = 0
     num_columns_missing_name: int = 0
     num_queries_missing_info: int = 0
+
+    # Platform resource repository for automatic cache statistics via SupportsAsObj
+    tag_urn_resolver_cache: Optional["UnityCatalogPlatformResourceRepository"] = None
```