acryl-datahub 1.1.0rc3__py3-none-any.whl → 1.1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/METADATA +2532 -2530
- {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/RECORD +87 -70
- {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +9 -8
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +170 -0
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/delete_cli.py +4 -4
- datahub/cli/ingest_cli.py +9 -1
- datahub/emitter/mce_builder.py +3 -1
- datahub/emitter/response_helper.py +86 -1
- datahub/emitter/rest_emitter.py +1 -1
- datahub/ingestion/graph/client.py +3 -3
- datahub/ingestion/source/apply/datahub_apply.py +4 -4
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +22 -10
- datahub/ingestion/source/data_lake_common/object_store.py +644 -0
- datahub/ingestion/source/datahub/config.py +11 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +186 -33
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_common.py +30 -11
- datahub/ingestion/source/gcs/gcs_source.py +22 -7
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/hex/query_fetcher.py +9 -3
- datahub/ingestion/source/openapi.py +12 -0
- datahub/ingestion/source/openapi_parser.py +56 -37
- datahub/ingestion/source/s3/source.py +65 -6
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -21
- datahub/ingestion/source/snowflake/snowflake_query.py +0 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +17 -6
- datahub/ingestion/source/sql/athena.py +1 -0
- datahub/ingestion/source/sql/hive.py +2 -3
- datahub/ingestion/source/sql/sql_common.py +98 -34
- datahub/ingestion/source/sql/sql_types.py +5 -2
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +117 -0
- datahub/ingestion/source/unity/source.py +167 -15
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/metadata/_internal_schema_classes.py +667 -522
- datahub/metadata/_urns/urn_defs.py +1804 -1748
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/schema.avsc +17358 -17584
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +1 -0
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +1 -0
- datahub/metadata/schemas/MLModelKey.avsc +1 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +6 -0
- datahub/sdk/_all_entities.py +11 -0
- datahub/sdk/_shared.py +118 -1
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +342 -0
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +90 -2
- datahub/sdk/lineage_client.py +681 -82
- datahub/sdk/main_client.py +27 -8
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/sdk_v2_helpers.py +18 -0
- {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/sql_common.py

@@ -76,33 +76,36 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.
-from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
-from datahub.metadata.com.linkedin.pegasus2avro.schema import (
+from datahub.metadata.schema_classes import (
     ArrayTypeClass,
     BooleanTypeClass,
     BytesTypeClass,
+    DataPlatformInstanceClass,
+    DatasetLineageTypeClass,
+    DatasetPropertiesClass,
+    DatasetSnapshotClass,
     DateTypeClass,
     EnumTypeClass,
-
-
+    FineGrainedLineageClass,
+    FineGrainedLineageDownstreamTypeClass,
+    FineGrainedLineageUpstreamTypeClass,
+    ForeignKeyConstraintClass,
+    GlobalTagsClass,
+    MetadataChangeEventClass,
+    MySqlDDLClass,
     NullTypeClass,
     NumberTypeClass,
     RecordTypeClass,
-
-
-
+    SchemaFieldClass,
+    SchemaFieldDataTypeClass,
+    SchemaMetadataClass,
+    StatusClass,
     StringTypeClass,
-    TimeTypeClass,
-)
-from datahub.metadata.schema_classes import (
-    DataPlatformInstanceClass,
-    DatasetLineageTypeClass,
-    DatasetPropertiesClass,
-    GlobalTagsClass,
     SubTypesClass,
     TagAssociationClass,
+    TimeTypeClass,
+    UpstreamClass,
+    UpstreamLineageClass,
     ViewPropertiesClass,
 )
 from datahub.sql_parsing.schema_resolver import SchemaResolver
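For orientation: after this change every aspect class used by the SQL common source comes from datahub.metadata.schema_classes rather than the pegasus2avro modules. A minimal sketch of two of those classes in use (the tag names are invented for the example):

from datahub.emitter.mce_builder import make_tag_urn
from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass

# Build a GlobalTags aspect the same way the source attaches column/table tags.
tags_aspect = GlobalTagsClass(
    tags=[TagAssociationClass(tag=make_tag_urn(t)) for t in ["pii", "gold"]]
)
print(tags_aspect.tags[0].tag)  # urn:li:tag:pii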
@@ -112,6 +115,7 @@ from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.sqlalchemy_type_converter import (
     get_native_data_type_for_sqlalchemy_type,
 )
+from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path

 if TYPE_CHECKING:
     from datahub.ingestion.source.ge_data_profiler import (
@@ -198,7 +202,7 @@ def make_sqlalchemy_type(name: str) -> Type[TypeEngine]:

 def get_column_type(
     sql_report: SQLSourceReport, dataset_name: str, column_type: Any
-) ->
+) -> SchemaFieldDataTypeClass:
     """
     Maps SQLAlchemy types (https://docs.sqlalchemy.org/en/13/core/type_basics.html) to corresponding schema types
     """
@@ -223,7 +227,7 @@ def get_column_type(
         )
         TypeClass = NullTypeClass

-    return
+    return SchemaFieldDataTypeClass(type=TypeClass())


 def get_schema_metadata(
@@ -232,10 +236,10 @@ def get_schema_metadata(
     platform: str,
     columns: List[dict],
     pk_constraints: Optional[dict] = None,
-    foreign_keys: Optional[List[
-    canonical_schema: Optional[List[
+    foreign_keys: Optional[List[ForeignKeyConstraintClass]] = None,
+    canonical_schema: Optional[List[SchemaFieldClass]] = None,
     simplify_nested_field_paths: bool = False,
-) ->
+) -> SchemaMetadataClass:
     if (
         simplify_nested_field_paths
         and canonical_schema is not None
@@ -243,12 +247,12 @@ def get_schema_metadata(
     ):
         canonical_schema = downgrade_schema_from_v2(canonical_schema)

-    schema_metadata =
+    schema_metadata = SchemaMetadataClass(
         schemaName=dataset_name,
         platform=make_data_platform_urn(platform),
         version=0,
         hash="",
-        platformSchema=
+        platformSchema=MySqlDDLClass(tableSchema=""),
         fields=canonical_schema or [],
     )
     if foreign_keys is not None and foreign_keys != []:
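To make the new return type concrete, here is a small, self-contained sketch that builds the same kind of SchemaMetadataClass aspect that get_schema_metadata() now returns; the platform, dataset, and column names are placeholders:

from datahub.emitter.mce_builder import make_data_platform_urn
from datahub.metadata.schema_classes import (
    MySqlDDLClass,
    NumberTypeClass,
    SchemaFieldClass,
    SchemaFieldDataTypeClass,
    SchemaMetadataClass,
    StringTypeClass,
)

# Two illustrative columns, typed the same way get_column_type() types them.
fields = [
    SchemaFieldClass(
        fieldPath="id",
        type=SchemaFieldDataTypeClass(type=NumberTypeClass()),
        nativeDataType="INTEGER",
    ),
    SchemaFieldClass(
        fieldPath="name",
        type=SchemaFieldDataTypeClass(type=StringTypeClass()),
        nativeDataType="VARCHAR(255)",
    ),
]

schema_metadata = SchemaMetadataClass(
    schemaName="public.customers",
    platform=make_data_platform_urn("postgres"),
    version=0,
    hash="",
    platformSchema=MySqlDDLClass(tableSchema=""),
    fields=fields,
)
print(len(schema_metadata.fields))  # 2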
@@ -590,7 +594,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         schema: str,
         fk_dict: Dict[str, str],
         inspector: Inspector,
-    ) ->
+    ) -> ForeignKeyConstraintClass:
         referred_schema: Optional[str] = fk_dict.get("referred_schema")

         if not referred_schema:
@@ -617,7 +621,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             for f in fk_dict["referred_columns"]
         ]

-        return
+        return ForeignKeyConstraintClass(
             fk_dict["name"], foreign_fields, source_fields, foreign_dataset
         )

@@ -714,7 +718,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             self.config.platform_instance,
             self.config.env,
         )
-        dataset_snapshot =
+        dataset_snapshot = DatasetSnapshotClass(
             urn=dataset_urn,
             aspects=[StatusClass(removed=False)],
         )
@@ -742,6 +746,30 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             tags=extra_tags,
             partition_keys=partitions,
         )
+
+        if self.config.include_table_location_lineage and location_urn:
+            self.aggregator.add_known_lineage_mapping(
+                upstream_urn=location_urn,
+                downstream_urn=dataset_snapshot.urn,
+                lineage_type=DatasetLineageTypeClass.COPY,
+            )
+            external_upstream_table = UpstreamClass(
+                dataset=location_urn,
+                type=DatasetLineageTypeClass.COPY,
+            )
+
+            yield MetadataChangeProposalWrapper(
+                entityUrn=dataset_snapshot.urn,
+                aspect=UpstreamLineageClass(
+                    upstreams=[external_upstream_table],
+                    fineGrainedLineages=self.get_fine_grained_lineages(
+                        dataset_urn=dataset_snapshot.urn,
+                        upstream_dataset_urn=location_urn,
+                        schema_fields=schema_fields,
+                    ),
+                ),
+            ).as_workunit()
+
         schema_metadata = get_schema_metadata(
             self.report,
             dataset_name,
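The block above emits a COPY lineage edge from the storage location to the table it backs. A standalone sketch of the aspect it produces, outside the source class (both URNs are illustrative placeholders):

from datahub.emitter.mce_builder import make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
    DatasetLineageTypeClass,
    UpstreamClass,
    UpstreamLineageClass,
)

# Hypothetical storage location and the table ingested from it.
location_urn = make_dataset_urn("s3", "my-bucket/raw/customers", env="PROD")
table_urn = make_dataset_urn("hive", "warehouse.customers", env="PROD")

mcp = MetadataChangeProposalWrapper(
    entityUrn=table_urn,
    aspect=UpstreamLineageClass(
        upstreams=[
            UpstreamClass(dataset=location_urn, type=DatasetLineageTypeClass.COPY)
        ]
    ),
)
print(mcp.entityUrn)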
@@ -762,7 +790,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         yield from self.add_table_to_schema_container(
             dataset_urn=dataset_urn, db_name=db_name, schema=schema
         )
-        mce =
+        mce = MetadataChangeEventClass(proposedSnapshot=dataset_snapshot)
         yield SqlWorkUnit(id=dataset_name, mce=mce)
         dpi_aspect = self.get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
         if dpi_aspect:
@@ -797,7 +825,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         schema: str,
         table: str,
         data_reader: Optional[DataReader],
-        schema_metadata:
+        schema_metadata: SchemaMetadataClass,
     ) -> None:
         try:
             if (
@@ -908,7 +936,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):

     def _get_foreign_keys(
         self, dataset_urn: str, inspector: Inspector, schema: str, table: str
-    ) -> List[
+    ) -> List[ForeignKeyConstraintClass]:
         try:
             foreign_keys = [
                 self.get_foreign_key_metadata(dataset_urn, schema, fk_rec, inspector)
@@ -922,6 +950,42 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                 foreign_keys = []
             return foreign_keys

+    def get_fine_grained_lineages(
+        self,
+        dataset_urn: str,
+        upstream_dataset_urn: str,
+        schema_fields: List[SchemaFieldClass],
+    ) -> Optional[List[FineGrainedLineageClass]]:
+        fine_grained_lineages: List[FineGrainedLineageClass] = []
+
+        for schema_field in schema_fields:
+            try:
+                field_path_v1 = get_simple_field_path_from_v2_field_path(
+                    schema_field.fieldPath
+                )
+                fine_grained_lineages.append(
+                    FineGrainedLineageClass(
+                        downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                        downstreams=[make_schema_field_urn(dataset_urn, field_path_v1)],
+                        upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                        upstreams=[
+                            make_schema_field_urn(
+                                upstream_dataset_urn,
+                                get_simple_field_path_from_v2_field_path(
+                                    schema_field.fieldPath
+                                ),
+                            )
+                        ],
+                    )
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Error processing field path for {dataset_urn}: {str(e)}"
+                )
+                continue
+
+        return fine_grained_lineages if fine_grained_lineages else None
+
     def get_schema_fields(
         self,
         dataset_name:
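A hedged, standalone illustration of the shape this helper produces: one FineGrainedLineageClass per downstream column, pointing back at the same-named field of the upstream location dataset. Field names and URNs are invented for the example:

from datahub.emitter.mce_builder import make_dataset_urn, make_schema_field_urn
from datahub.metadata.schema_classes import (
    FineGrainedLineageClass,
    FineGrainedLineageDownstreamTypeClass,
    FineGrainedLineageUpstreamTypeClass,
)

upstream = make_dataset_urn("s3", "my-bucket/raw/customers")
downstream = make_dataset_urn("hive", "warehouse.customers")

# One FIELD -> FIELD_SET mapping per downstream column.
lineages = [
    FineGrainedLineageClass(
        downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
        downstreams=[make_schema_field_urn(downstream, field)],
        upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
        upstreams=[make_schema_field_urn(upstream, field)],
    )
    for field in ["id", "name"]
]
print(len(lineages))  # one entry per downstream column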
@@ -930,7 +994,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         pk_constraints: Optional[dict] = None,
         partition_keys: Optional[List[str]] = None,
         tags: Optional[Dict[str, List[str]]] = None,
-    ) -> List[
+    ) -> List[SchemaFieldClass]:
         canonical_schema = []
         for column in columns:
             column_tags: Optional[List[str]] = None
@@ -955,14 +1019,14 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         pk_constraints: Optional[dict] = None,
         partition_keys: Optional[List[str]] = None,
         tags: Optional[List[str]] = None,
-    ) -> List[
+    ) -> List[SchemaFieldClass]:
         gtc: Optional[GlobalTagsClass] = None
         if tags:
             tags_str = [make_tag_urn(t) for t in tags]
             tags_tac = [TagAssociationClass(t) for t in tags_str]
             gtc = GlobalTagsClass(tags_tac)
         full_type = column.get("full_type")
-        field =
+        field = SchemaFieldClass(
             fieldPath=column["name"],
             type=get_column_type(self.report, dataset_name, column["type"]),
             nativeDataType=(
@@ -1092,7 +1156,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             default_schema=default_schema,
         )

-        dataset_snapshot =
+        dataset_snapshot = DatasetSnapshotClass(
             urn=dataset_urn,
             aspects=[StatusClass(removed=False)],
         )
@@ -1111,7 +1175,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             dataset_snapshot.aspects.append(dataset_properties)
         if schema_metadata:
             dataset_snapshot.aspects.append(schema_metadata)
-        mce =
+        mce = MetadataChangeEventClass(proposedSnapshot=dataset_snapshot)
         yield SqlWorkUnit(id=dataset_name, mce=mce)
         dpi_aspect = self.get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
         if dpi_aspect:
datahub/ingestion/source/sql/sql_types.py

@@ -284,6 +284,8 @@ SNOWFLAKE_TYPES_MAP: Dict[str, Any] = {
     "INTEGER": NumberType,
     "BIGINT": NumberType,
     "SMALLINT": NumberType,
+    "TINYINT": NumberType,
+    "BYTEINT": NumberType,
     "FLOAT": NumberType,
     "FLOAT4": NumberType,
     "FLOAT8": NumberType,
@@ -291,6 +293,7 @@ SNOWFLAKE_TYPES_MAP: Dict[str, Any] = {
     "DOUBLE PRECISION": NumberType,
     "REAL": NumberType,
     "VARCHAR": StringType,
+    "CHARACTER VARYING": StringType,
     "CHAR": StringType,
     "CHARACTER": StringType,
     "STRING": StringType,
@@ -313,8 +316,8 @@


 def resolve_snowflake_modified_type(type_string: str) -> Any:
-    # Match types with precision and scale, e.g., 'DECIMAL(38,0)'
-    match = re.match(r"([a-
+    # Match types with precision and scale, e.g., 'DECIMAL(38,0)' or TIME(3)
+    match = re.match(r"([a-z A-Z_]+)\(\d+(,(\s+)?\d+)?\)", type_string)
     if match:
         modified_type_base = match.group(1)  # Extract the base type
         return SNOWFLAKE_TYPES_MAP.get(modified_type_base)
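The widened pattern now tolerates spaces and underscores in the base type, so parameterized names such as CHARACTER VARYING(16777216) resolve as well. A quick check of what the regex captures, using the pattern from the diff verbatim:

import re

pattern = r"([a-z A-Z_]+)\(\d+(,(\s+)?\d+)?\)"

for type_string in ["DECIMAL(38,0)", "TIME(3)", "CHARACTER VARYING(16777216)"]:
    match = re.match(pattern, type_string)
    assert match is not None
    print(type_string, "->", match.group(1))
# DECIMAL(38,0) -> DECIMAL
# TIME(3) -> TIME
# CHARACTER VARYING(16777216) -> CHARACTER VARYING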
datahub/ingestion/source/unity/config.py

@@ -229,6 +229,11 @@ class UnityCatalogSourceConfig(
         description="Option to enable/disable ownership generation for metastores, catalogs, schemas, and tables.",
     )

+    include_tags: bool = pydantic.Field(
+        default=True,
+        description="Option to enable/disable column/table tag extraction.",
+    )
+
     _rename_table_ownership = pydantic_renamed_field(
         "include_table_ownership", "include_ownership"
     )
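For reference, a hedged sketch of how the new flag might be set in an ingestion recipe; it assumes the usual workspace_url/token connection fields, and every value is a placeholder. Because the default is True, the flag only needs to appear when turning tag extraction off.

# Hypothetical recipe fragment for the unity-catalog source.
unity_catalog_recipe = {
    "source": {
        "type": "unity-catalog",
        "config": {
            "workspace_url": "https://<workspace>.cloud.databricks.com",  # placeholder
            "token": "<personal-access-token>",  # placeholder
            "include_tags": False,  # disable column/table tag extraction
        },
    },
    "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
}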
datahub/ingestion/source/unity/proxy.py

@@ -8,6 +8,8 @@ from datetime import datetime
 from typing import Any, Dict, Iterable, List, Optional, Union, cast
 from unittest.mock import patch

+import cachetools
+from cachetools import cached
 from databricks.sdk import WorkspaceClient
 from databricks.sdk.service.catalog import (
     CatalogInfo,
@@ -25,8 +27,10 @@ from databricks.sdk.service.sql import (
     QueryStatus,
 )
 from databricks.sdk.service.workspace import ObjectType
+from databricks.sql import connect

 from datahub._version import nice_version_name
+from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
 from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
 from datahub.ingestion.source.unity.proxy_profiling import (
@@ -108,6 +112,13 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         self.warehouse_id = warehouse_id or ""
         self.report = report
         self.hive_metastore_proxy = hive_metastore_proxy
+        self._sql_connection_params = {
+            "server_hostname": self._workspace_client.config.host.replace(
+                "https://", ""
+            ),
+            "http_path": f"/sql/1.0/warehouses/{self.warehouse_id}",
+            "access_token": self._workspace_client.config.token,
+        }

     def check_basic_connectivity(self) -> bool:
         return bool(self._workspace_client.catalogs.list(include_browse=True))
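Those stored parameters map one-to-one onto the keyword arguments of databricks.sql.connect, which the new tag queries below use instead of the REST client. A minimal sketch; hostname, warehouse id, and token are placeholders that need real values:

from databricks.sql import connect

connection_params = {
    "server_hostname": "dbc-12345678-abcd.cloud.databricks.com",  # host without the https:// prefix
    "http_path": "/sql/1.0/warehouses/0123456789abcdef",
    "access_token": "<personal-access-token>",
}

with connect(**connection_params) as connection, connection.cursor() as cursor:
    cursor.execute("SELECT 1")
    print(cursor.fetchall())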
@@ -492,3 +503,109 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             executed_as_user_id=info.executed_as_user_id,
             executed_as_user_name=info.executed_as_user_name,
         )
+
+    def _execute_sql_query(self, query: str) -> List[List[str]]:
+        """Execute SQL query using databricks-sql connector for better performance"""
+        try:
+            with connect(
+                **self._sql_connection_params
+            ) as connection, connection.cursor() as cursor:
+                cursor.execute(query)
+                return cursor.fetchall()
+
+        except Exception as e:
+            logger.warning(f"Failed to execute SQL query: {e}")
+            return []
+
+    @cached(cachetools.FIFOCache(maxsize=100))
+    def get_schema_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching schema tags for catalog: {catalog}")
+
+        query = f"SELECT * FROM {catalog}.information_schema.schema_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, schema_name, tag_name, tag_value = row
+            schema_key = f"{catalog_name}.{schema_name}"
+
+            if schema_key not in result_dict:
+                result_dict[schema_key] = []
+
+            result_dict[schema_key].append(
+                UnityCatalogTag(key=tag_name, value=tag_value)
+            )
+
+        return result_dict
+
+    @cached(cachetools.FIFOCache(maxsize=100))
+    def get_catalog_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching table tags for catalog: {catalog}")
+
+        query = f"SELECT * FROM {catalog}.information_schema.catalog_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, tag_name, tag_value = row
+
+            if catalog_name not in result_dict:
+                result_dict[catalog_name] = []
+
+            result_dict[catalog_name].append(
+                UnityCatalogTag(key=tag_name, value=tag_value)
+            )
+
+        return result_dict
+
+    @cached(cachetools.FIFOCache(maxsize=100))
+    def get_table_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching table tags for catalog: {catalog}")
+
+        query = f"SELECT * FROM {catalog}.information_schema.table_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, schema_name, table_name, tag_name, tag_value = row
+            table_key = f"{catalog_name}.{schema_name}.{table_name}"
+
+            if table_key not in result_dict:
+                result_dict[table_key] = []
+
+            result_dict[table_key].append(
+                UnityCatalogTag(key=tag_name, value=tag_value if tag_value else None)
+            )
+
+        return result_dict
+
+    @cached(cachetools.FIFOCache(maxsize=100))
+    def get_column_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching column tags for catalog: {catalog}")
+
+        query = f"SELECT * FROM {catalog}.information_schema.column_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, schema_name, table_name, column_name, tag_name, tag_value = (
+                row
+            )
+            column_key = f"{catalog_name}.{schema_name}.{table_name}.{column_name}"
+
+            if column_key not in result_dict:
+                result_dict[column_key] = []
+
+            result_dict[column_key].append(
+                UnityCatalogTag(key=tag_name, value=tag_value if tag_value else None)
+            )
+
+        return result_dict
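Taken together, each helper runs a single information_schema query per catalog and groups the rows by fully-qualified name, with a cachetools FIFO cache (maxsize 100) so repeated calls for the same catalog are served from memory. A standalone sketch of the same grouping logic, with invented rows in place of a live query:

from collections import defaultdict
from typing import Dict, List, Tuple

def group_table_tags(
    rows: List[Tuple[str, str, str, str, str]],
) -> Dict[str, List[Tuple[str, str]]]:
    """Group information_schema.table_tags rows by catalog.schema.table."""
    grouped: Dict[str, List[Tuple[str, str]]] = defaultdict(list)
    for catalog_name, schema_name, table_name, tag_name, tag_value in rows:
        grouped[f"{catalog_name}.{schema_name}.{table_name}"].append(
            (tag_name, tag_value)
        )
    return dict(grouped)

# Example rows shaped like the table_tags query results (values invented):
rows = [
    ("main", "sales", "orders", "domain", "commerce"),
    ("main", "sales", "orders", "tier", "gold"),
]
print(group_table_tags(rows))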