acryl-datahub 1.1.1rc3__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their respective public registries, and is provided for informational purposes only.
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2559 -2532
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +226 -190
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +12 -16
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/response_helper.py +86 -1
- datahub/emitter/rest_emitter.py +71 -13
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/config.py +11 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +187 -35
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/openapi.py +12 -0
- datahub/ingestion/source/openapi_parser.py +56 -37
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +43 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +33 -8
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1446 -559
- datahub/metadata/_urns/urn_defs.py +1721 -1553
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +18055 -17802
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/sdk/__init__.py +6 -0
- datahub/sdk/_all_entities.py +11 -0
- datahub/sdk/_shared.py +118 -1
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +90 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/vertica.py:

```diff
@@ -25,6 +25,10 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.data_reader import DataReader
+from datahub.ingestion.source.common.subtypes import (
+    DatasetSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
@@ -41,7 +45,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     DatasetLineageTypeClass,
     DatasetPropertiesClass,
     SubTypesClass,
@@ -113,10 +116,14 @@ class VerticaConfig(BasicSQLAlchemyConfig):
 @capability(
     SourceCapability.LINEAGE_COARSE,
     "Enabled by default, can be disabled via configuration `include_view_lineage` and `include_projection_lineage`",
+    subtype_modifier=[
+        SourceCapabilityModifier.VIEW,
+        SourceCapabilityModifier.PROJECTIONS,
+    ],
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class VerticaSource(SQLAlchemySource):
@@ -493,11 +500,8 @@
         if dpi_aspect:
             yield dpi_aspect
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-
-            aspect=SubTypesClass(typeNames=["Projections"]),
+            aspect=SubTypesClass(typeNames=[DatasetSubTypes.PROJECTIONS]),
         ).as_workunit()

         if self.config.domain:
```
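A cleanup that recurs throughout this release: `MetadataChangeProposalWrapper` infers `entityType` from the URN and defaults `changeType` to UPSERT, so explicit arguments like the ones removed above are redundant. A minimal sketch (the URN and subtype value are illustrative):

```python
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import SubTypesClass

# entityType is derived from the dataset URN and changeType defaults to
# UPSERT, so neither needs to be passed explicitly.
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:vertica,db.schema.proj,PROD)",
    aspect=SubTypesClass(typeNames=["Projections"]),
)
wu = mcp.as_workunit()  # wraps the proposal as a MetadataWorkUnit
```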
datahub/ingestion/source/sql_queries.py:

```diff
@@ -66,7 +66,7 @@ class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
         description="The default schema to use for unqualified table names",
         default=None,
     )
-
+    override_dialect: Optional[str] = Field(
         description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
         default=None,
     )
@@ -181,7 +181,7 @@ class SqlQueriesSource(Source):
             schema_resolver=self.schema_resolver,
             default_db=self.config.default_db,
             default_schema=self.config.default_schema,
-
+            override_dialect=self.config.override_dialect,
         )
         if result.debug_info.table_error:
             logger.info(f"Error parsing table lineage, {result.debug_info.table_error}")
```
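A hedged sketch of how the new option could be set on the source config; the `query_file` and `platform` values are illustrative placeholders for a normal recipe:

```python
from datahub.ingestion.source.sql_queries import SqlQueriesSourceConfig

# Force sqlglot to parse every query as Snowflake SQL instead of relying on
# automatic dialect detection. query_file and platform are placeholders.
config = SqlQueriesSourceConfig.parse_obj(
    {
        "query_file": "./queries.json",
        "platform": "snowflake",
        "override_dialect": "snowflake",
    }
)
```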
datahub/ingestion/source/state/stateful_ingestion_base.py:

```diff
@@ -179,7 +179,7 @@ class StatefulIngestionReport(SourceReport):

 @capability(
     SourceCapability.DELETION_DETECTION,
-    "
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class StatefulIngestionSourceBase(Source):
```
datahub/ingestion/source/superset.py:

```diff
@@ -272,7 +272,7 @@ def get_filter_name(filter_obj):
 @config_class(SupersetConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(
-    SourceCapability.DELETION_DETECTION, "
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 @capability(SourceCapability.DOMAINS, "Enabled by `domain` config to assign domain_key")
 @capability(SourceCapability.LINEAGE_COARSE, "Supported by default")
@@ -658,6 +658,7 @@ class SupersetSource(StatefulIngestionSourceBase):
         if datasource_id:
             dataset_info = self.get_dataset_info(datasource_id).get("result", {})
             dataset_column_info = dataset_info.get("columns", [])
+            dataset_metric_info = dataset_info.get("metrics", [])

             for column in dataset_column_info:
                 col_name = column.get("column_name", "")
@@ -671,6 +672,17 @@
                     continue

                 dataset_columns.append((col_name, col_type, col_description))
+
+            for metric in dataset_metric_info:
+                metric_name = metric.get("metric_name", "")
+                metric_type = metric.get("metric_type", "")
+                metric_description = metric.get("description", "")
+
+                if metric_name == "" or metric_type == "":
+                    logger.info(f"could not construct metric lineage for {metric}")
+                    continue
+
+                dataset_columns.append((metric_name, metric_type, metric_description))
         else:
             # if no datasource id, cannot build cll, just return
             logger.warning(
@@ -972,19 +984,44 @@
             schema_fields.append(field)
         return schema_fields

+    def gen_metric_schema_fields(
+        self, metric_data: List[Dict[str, Any]]
+    ) -> List[SchemaField]:
+        schema_fields: List[SchemaField] = []
+        for metric in metric_data:
+            metric_type = metric.get("metric_type", "")
+            data_type = resolve_sql_type(metric_type)
+            if data_type is None:
+                data_type = NullType()
+
+            field = SchemaField(
+                fieldPath=metric.get("metric_name", ""),
+                type=SchemaFieldDataType(data_type),
+                nativeDataType=metric_type or "",
+                description=metric.get("description", ""),
+                nullable=True,
+            )
+            schema_fields.append(field)
+        return schema_fields
+
     def gen_schema_metadata(
         self,
         dataset_response: dict,
     ) -> SchemaMetadata:
         dataset_response = dataset_response.get("result", {})
         column_data = dataset_response.get("columns", [])
+        metric_data = dataset_response.get("metrics", [])
+
+        column_fields = self.gen_schema_fields(column_data)
+        metric_fields = self.gen_metric_schema_fields(metric_data)
+
         schema_metadata = SchemaMetadata(
             schemaName=dataset_response.get("table_name", ""),
             platform=make_data_platform_urn(self.platform),
             version=0,
             hash="",
             platformSchema=MySqlDDL(tableSchema=""),
-            fields=
+            fields=column_fields + metric_fields,
         )
         return schema_metadata

@@ -1049,6 +1086,8 @@ class SupersetSource(StatefulIngestionSourceBase):
         # To generate column level lineage, we can manually decode the metadata
         # to produce the ColumnLineageInfo
         columns = dataset_response.get("result", {}).get("columns", [])
+        metrics = dataset_response.get("result", {}).get("metrics", [])
+
        fine_grained_lineages: List[FineGrainedLineageClass] = []

        for column in columns:
@@ -1067,6 +1106,22 @@
                 )
             )

+        for metric in metrics:
+            metric_name = metric.get("metric_name", "")
+            if not metric_name:
+                continue
+
+            downstream = [make_schema_field_urn(datasource_urn, metric_name)]
+            upstreams = [make_schema_field_urn(upstream_dataset, metric_name)]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
         upstream_lineage = UpstreamLineageClass(
             upstreams=[
                 UpstreamClass(
@@ -1087,7 +1142,7 @@
         datasource_urn = self.get_datasource_urn_from_id(
             dataset_response, self.platform
         )
-        dataset_url = f"{self.config.display_uri}{
+        dataset_url = f"{self.config.display_uri}/explore/?datasource_type=table&datasource_id={dataset.id}"

         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dataset_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
         now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
```
datahub/ingestion/source/tableau/tableau.py:

```diff
@@ -80,6 +80,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import (
     BIContainerSubTypes,
     DatasetSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -148,7 +149,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
 )
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
-    ChangeTypeClass,
     ChartInfoClass,
     ChartUsageStatisticsClass,
     DashboardInfoClass,
@@ -528,6 +528,14 @@ class TableauConfig(
         default=False,
         description="Ingest details for tables external to (not embedded in) tableau as entities.",
     )
+    emit_all_published_datasources: bool = Field(
+        default=False,
+        description="Ingest all published data sources. When False (default), only ingest published data sources that belong to an ingested workbook.",
+    )
+    emit_all_embedded_datasources: bool = Field(
+        default=False,
+        description="Ingest all embedded data sources. When False (default), only ingest embedded data sources that belong to an ingested workbook.",
+    )

     env: str = Field(
         default=builder.DEFAULT_ENV,
```
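A sketch of a recipe fragment (expressed as a Python dict) that turns on the two new flags; the `connect_uri` and credential wiring are placeholders, not part of this diff:

```python
# Opt in to ingesting every published and embedded data source, not just
# those referenced by an ingested workbook. connect_uri is a placeholder.
tableau_config = {
    "connect_uri": "https://tableau.example.com",
    "emit_all_published_datasources": True,
    "emit_all_embedded_datasources": True,
}
```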
datahub/ingestion/source/tableau/tableau.py (continued):

```diff
@@ -861,16 +869,29 @@ def report_user_role(report: TableauSourceReport, server: Server) -> None:
 @platform_name("Tableau")
 @config_class(TableauConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.TABLEAU_PROJECT,
+        SourceCapabilityModifier.TABLEAU_SITE,
+        SourceCapabilityModifier.TABLEAU_WORKBOOK,
+    ],
+)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Requires transformer", supported=False)
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 @capability(
     SourceCapability.USAGE_STATS,
     "Dashboard/Chart view counts, enabled using extract_usage_stats config",
+    subtype_modifier=[
+        SourceCapabilityModifier.DASHBOARD,
+        SourceCapabilityModifier.CHART,
+    ],
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Enabled by default
+    "Enabled by default via stateful ingestion.",
 )
 @capability(SourceCapability.OWNERSHIP, "Requires recipe configuration")
 @capability(SourceCapability.TAGS, "Requires recipe configuration")
@@ -879,6 +900,7 @@ def report_user_role(report: TableauSourceReport, server: Server) -> None:
     SourceCapability.LINEAGE_FINE,
     "Enabled by default, configure using `extract_column_level_lineage`",
 )
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class TableauSource(StatefulIngestionSourceBase, TestableSource):
     platform = "tableau"

@@ -2174,32 +2196,32 @@ class TableauSiteSource:
             else []
         )

-
-
-
-
-
-
+        tableau_table_list = csql.get(c.TABLES, [])
+        if self.config.force_extraction_of_lineage_from_custom_sql_queries or (
+            not tableau_table_list
+            and self.config.extract_lineage_from_unsupported_custom_sql_queries
+        ):
+            if not tableau_table_list:
+                # custom sql tables may contain unsupported sql, causing incomplete lineage
+                # we extract the lineage from the raw queries
+                logger.debug(
+                    "Parsing TLL & CLL from custom sql (tableau metadata incomplete)"
+                )
+            else:
+                # The Tableau SQL parser is much worse than our sqlglot based parser,
+                # so relying on metadata parsed by Tableau from SQL queries can be
+                # less accurate. This option allows us to ignore Tableau's parser and
+                # only use our own.
+                logger.debug("Parsing TLL & CLL from custom sql (forced)")
+
             yield from self._create_lineage_from_unsupported_csql(
                 csql_urn, csql, columns
             )
         else:
-
-
-
-
-            yield from self._create_lineage_to_upstream_tables(
-                csql_urn, tables, datasource
-            )
-            elif (
-                self.config.extract_lineage_from_unsupported_custom_sql_queries
-            ):
-                logger.debug("Extracting TLL & CLL from custom sql")
-                # custom sql tables may contain unsupported sql, causing incomplete lineage
-                # we extract the lineage from the raw queries
-                yield from self._create_lineage_from_unsupported_csql(
-                    csql_urn, csql, columns
-                )
+            # lineage from custom sql -> datasets/tables #
+            yield from self._create_lineage_to_upstream_tables(
+                csql_urn, tableau_table_list, datasource
+            )

         # Schema Metadata
         schema_metadata = self.get_schema_metadata_for_custom_sql(columns)
@@ -2237,7 +2259,6 @@ class TableauSiteSource:
         yield self.get_metadata_change_event(dataset_snapshot)
         yield self.get_metadata_change_proposal(
             dataset_snapshot.urn,
-            aspect_name=c.SUB_TYPES,
             aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW, c.CUSTOM_SQL]),
         )

@@ -2402,7 +2423,6 @@ class TableauSiteSource:
         upstream_lineage = UpstreamLineage(upstreams=upstream_tables)
         yield self.get_metadata_change_proposal(
             csql_urn,
-            aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
         self.report.num_tables_with_upstream_lineage += 1
@@ -2588,7 +2608,6 @@ class TableauSiteSource:
         )
         yield self.get_metadata_change_proposal(
             csql_urn,
-            aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
         self.report.num_tables_with_upstream_lineage += 1
@@ -2634,14 +2653,10 @@ class TableauSiteSource:
     def get_metadata_change_proposal(
         self,
         urn: str,
-        aspect_name: str,
         aspect: Union["UpstreamLineage", "SubTypesClass"],
     ) -> MetadataWorkUnit:
         return MetadataChangeProposalWrapper(
-            entityType=c.DATASET,
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=urn,
-            aspectName=aspect_name,
             aspect=aspect,
         ).as_workunit()

```
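With `aspect_name`, `entityType`, and `changeType` all dropped from this helper, `MetadataChangeProposalWrapper` now fills in each of them itself: the aspect name is derived from the aspect object, the entity type from the URN, and the change type defaults to UPSERT, as sketched after the Vertica hunks above.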
datahub/ingestion/source/tableau/tableau.py (continued):

```diff
@@ -2749,7 +2764,6 @@ class TableauSiteSource:
         )
         yield self.get_metadata_change_proposal(
             datasource_urn,
-            aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
         self.report.num_tables_with_upstream_lineage += 1
@@ -2768,7 +2782,6 @@ class TableauSiteSource:
         yield self.get_metadata_change_event(dataset_snapshot)
         yield self.get_metadata_change_proposal(
             dataset_snapshot.urn,
-            aspect_name=c.SUB_TYPES,
             aspect=SubTypesClass(
                 typeNames=(
                     ["Embedded Data Source"]
@@ -2854,7 +2867,11 @@ class TableauSiteSource:
         return datasource

     def emit_published_datasources(self) -> Iterable[MetadataWorkUnit]:
-        datasource_filter =
+        datasource_filter = (
+            {}
+            if self.config.emit_all_published_datasources
+            else {c.ID_WITH_IN: self.datasource_ids_being_used}
+        )

         for datasource in self.get_connection_objects(
             query=published_datasource_graphql_query,
@@ -3547,7 +3564,11 @@ class TableauSiteSource:
         return browse_paths

     def emit_embedded_datasources(self) -> Iterable[MetadataWorkUnit]:
-        datasource_filter =
+        datasource_filter = (
+            {}
+            if self.config.emit_all_embedded_datasources
+            else {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}
+        )

         for datasource in self.get_connection_objects(
             query=embedded_datasource_graphql_query,
@@ -3659,7 +3680,7 @@ class TableauSiteSource:
             container_key=project_key,
             name=project_.name,
             description=project_.description,
-            sub_types=[
+            sub_types=[BIContainerSubTypes.TABLEAU_PROJECT],
             parent_container_key=parent_project_key,
         )

@@ -3677,7 +3698,7 @@ class TableauSiteSource:
         yield from gen_containers(
             container_key=self.gen_site_key(self.site_id),
             name=self.site.name or "Default",
-            sub_types=[
+            sub_types=[BIContainerSubTypes.TABLEAU_SITE],
         )

     def _fetch_groups(self):
```
datahub/ingestion/source/tableau/tableau_common.py:

```diff
@@ -579,10 +579,12 @@ def get_platform(connection_type: str) -> str:
         platform = "oracle"
     elif connection_type in ("tbio", "teradata"):
         platform = "teradata"
-    elif connection_type in ("sqlserver"):
+    elif connection_type in ("sqlserver",):
         platform = "mssql"
-    elif connection_type in ("athena"):
+    elif connection_type in ("athena",):
         platform = "athena"
+    elif connection_type in ("googlebigquery",):
+        platform = "bigquery"
     elif connection_type.endswith("_jdbc"):
         # e.g. convert trino_jdbc -> trino
         platform = connection_type[: -len("_jdbc")]
```
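The trailing commas in the new tuples are the actual fix: `("sqlserver")` is just a parenthesized string, so `in` performs a substring test, whereas `("sqlserver",)` is a one-element tuple and `in` performs membership. A quick illustration:

```python
# Without the comma the parentheses are a no-op and `in` does a substring
# check on the string; with the comma it is a 1-tuple membership test.
assert "sql" in ("sqlserver")          # substring match: True
assert "sql" not in ("sqlserver",)     # "sql" is not an element of the tuple
assert "sqlserver" in ("sqlserver",)   # exact membership: True
```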
datahub/ingestion/source/tableau/tableau_constant.py:

```diff
@@ -50,7 +50,6 @@ TABLES = "tables"
 DESCRIPTION = "description"
 SQL = "SQL"
 QUERY = "query"
-SUB_TYPES = "subTypes"
 VIEW = "view"
 CUSTOM_SQL = "Custom SQL"
 REMOTE_TYPE = "remoteType"
@@ -58,7 +57,6 @@ UNKNOWN = "UNKNOWN"
 PUBLISHED_DATA_SOURCE = "PublishedDatasource"
 LUID = "luid"
 EMBEDDED_DATA_SOURCE = "EmbeddedDatasource"
-UPSTREAM_LINEAGE = "upstreamLineage"
 OWNER = "owner"
 USERNAME = "username"
 HAS_EXTRACTS = "hasExtracts"
@@ -78,8 +76,6 @@ CHART = "chart"
 DASHBOARD = "dashboard"
 DASHBOARDS_CONNECTION = "dashboardsConnection"
 EMBEDDED_DATA_SOURCES_CONNECTION = "embeddedDatasourcesConnection"
-PROJECT = "Project"
-SITE = "Site"
 IS_UNSUPPORTED_CUSTOM_SQL = "isUnsupportedCustomSql"
 SITE_PERMISSION = "sitePermission"
 ROLE_SITE_ADMIN_EXPLORER = "SiteAdministratorExplorer"
```
datahub/ingestion/source/unity/config.py:

```diff
@@ -229,6 +229,11 @@ class UnityCatalogSourceConfig(
         description="Option to enable/disable ownership generation for metastores, catalogs, schemas, and tables.",
     )

+    include_tags: bool = pydantic.Field(
+        default=True,
+        description="Option to enable/disable column/table tag extraction.",
+    )
+
     _rename_table_ownership = pydantic_renamed_field(
         "include_table_ownership", "include_ownership"
     )
```
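A minimal sketch of disabling the new tag extraction in a Unity Catalog recipe (dict form; `workspace_url` and `token` are the source's usual connection fields, shown here with placeholder values):

```python
# Tag extraction defaults to True; set include_tags to False to skip the
# information_schema tag queries entirely. Connection values are placeholders.
unity_config = {
    "workspace_url": "https://adb-1234567890123456.7.azuredatabricks.net",
    "token": "<personal-access-token>",
    "include_tags": False,
}
```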
datahub/ingestion/source/unity/proxy.py:

```diff
@@ -8,6 +8,8 @@ from datetime import datetime
 from typing import Any, Dict, Iterable, List, Optional, Union, cast
 from unittest.mock import patch

+import cachetools
+from cachetools import cached
 from databricks.sdk import WorkspaceClient
 from databricks.sdk.service.catalog import (
     CatalogInfo,
@@ -25,8 +27,10 @@ from databricks.sdk.service.sql import (
     QueryStatus,
 )
 from databricks.sdk.service.workspace import ObjectType
+from databricks.sql import connect

 from datahub._version import nice_version_name
+from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
 from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
 from datahub.ingestion.source.unity.proxy_profiling import (
@@ -108,6 +112,13 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         self.warehouse_id = warehouse_id or ""
         self.report = report
         self.hive_metastore_proxy = hive_metastore_proxy
+        self._sql_connection_params = {
+            "server_hostname": self._workspace_client.config.host.replace(
+                "https://", ""
+            ),
+            "http_path": f"/sql/1.0/warehouses/{self.warehouse_id}",
+            "access_token": self._workspace_client.config.token,
+        }

     def check_basic_connectivity(self) -> bool:
         return bool(self._workspace_client.catalogs.list(include_browse=True))
@@ -492,3 +503,110 @@
             executed_as_user_id=info.executed_as_user_id,
             executed_as_user_name=info.executed_as_user_name,
         )
+
+    def _execute_sql_query(self, query: str) -> List[List[str]]:
+        """Execute SQL query using databricks-sql connector for better performance"""
+        try:
+            with (
+                connect(**self._sql_connection_params) as connection,
+                connection.cursor() as cursor,
+            ):
+                cursor.execute(query)
+                return cursor.fetchall()
+
+        except Exception as e:
+            logger.warning(f"Failed to execute SQL query: {e}")
+            return []
+
+    @cached(cachetools.FIFOCache(maxsize=100))
+    def get_schema_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching schema tags for catalog: {catalog}")
+
+        query = f"SELECT * FROM {catalog}.information_schema.schema_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, schema_name, tag_name, tag_value = row
+            schema_key = f"{catalog_name}.{schema_name}"
+
+            if schema_key not in result_dict:
+                result_dict[schema_key] = []
+
+            result_dict[schema_key].append(
+                UnityCatalogTag(key=tag_name, value=tag_value)
+            )
+
+        return result_dict
+
+    @cached(cachetools.FIFOCache(maxsize=100))
+    def get_catalog_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching table tags for catalog: {catalog}")
+
+        query = f"SELECT * FROM {catalog}.information_schema.catalog_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, tag_name, tag_value = row
+
+            if catalog_name not in result_dict:
+                result_dict[catalog_name] = []
+
+            result_dict[catalog_name].append(
+                UnityCatalogTag(key=tag_name, value=tag_value)
+            )
+
+        return result_dict
+
+    @cached(cachetools.FIFOCache(maxsize=100))
+    def get_table_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching table tags for catalog: {catalog}")
+
+        query = f"SELECT * FROM {catalog}.information_schema.table_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, schema_name, table_name, tag_name, tag_value = row
+            table_key = f"{catalog_name}.{schema_name}.{table_name}"
+
+            if table_key not in result_dict:
+                result_dict[table_key] = []
+
+            result_dict[table_key].append(
+                UnityCatalogTag(key=tag_name, value=tag_value if tag_value else None)
+            )
+
+        return result_dict
+
+    @cached(cachetools.FIFOCache(maxsize=100))
+    def get_column_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching column tags for catalog: {catalog}")
+
+        query = f"SELECT * FROM {catalog}.information_schema.column_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, schema_name, table_name, column_name, tag_name, tag_value = (
+                row
+            )
+            column_key = f"{catalog_name}.{schema_name}.{table_name}.{column_name}"
+
+            if column_key not in result_dict:
+                result_dict[column_key] = []
+
+            result_dict[column_key].append(
+                UnityCatalogTag(key=tag_name, value=tag_value if tag_value else None)
+            )
+
+        return result_dict
```