acryl-datahub 1.2.0.7rc2__py3-none-any.whl → 1.2.0.7rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc4.dist-info}/METADATA +2754 -2749
- {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc4.dist-info}/RECORD +30 -30
- datahub/_version.py +1 -1
- datahub/ingestion/autogenerated/capability_summary.json +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +9 -6
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/redshift.py +19 -106
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -0
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +62 -3
- datahub/ingestion/source/unity/config.py +74 -9
- datahub/ingestion/source/unity/proxy.py +167 -5
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +24 -0
- datahub/ingestion/source/unity/report.py +5 -0
- datahub/ingestion/source/unity/source.py +111 -1
- datahub/ingestion/source/usage/usage_common.py +1 -0
- datahub/metadata/_internal_schema_classes.py +5 -5
- datahub/metadata/schema.avsc +66 -60
- datahub/metadata/schemas/LogicalParent.avsc +104 -100
- datahub/metadata/schemas/SchemaFieldKey.avsc +3 -1
- datahub/sdk/chart.py +36 -22
- datahub/sdk/dashboard.py +38 -62
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc4.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/redshift/redshift.py

@@ -1,5 +1,4 @@
 import functools
-import itertools
 import logging
 from collections import defaultdict
 from typing import Dict, Iterable, List, Optional, Type, Union

@@ -52,8 +51,7 @@ from datahub.ingestion.source.common.subtypes import (
 from datahub.ingestion.source.redshift.config import RedshiftConfig
 from datahub.ingestion.source.redshift.datashares import RedshiftDatasharesHelper
 from datahub.ingestion.source.redshift.exception import handle_redshift_exceptions_yield
-from datahub.ingestion.source.redshift.lineage import RedshiftLineageExtractor
-from datahub.ingestion.source.redshift.lineage_v2 import RedshiftSqlLineageV2
+from datahub.ingestion.source.redshift.lineage import RedshiftSqlLineage
 from datahub.ingestion.source.redshift.profile import RedshiftProfiler
 from datahub.ingestion.source.redshift.redshift_data_reader import RedshiftDataReader
 from datahub.ingestion.source.redshift.redshift_schema import (

@@ -72,7 +70,6 @@ from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_container,
     gen_database_key,
-    gen_lineage,
     gen_schema_container,
     gen_schema_key,
     get_dataplatform_instance_aspect,

@@ -116,7 +113,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
 )
 from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass
 from datahub.utilities import memory_footprint
-from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.mapping import Constants
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -423,40 +419,25 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                 memory_footprint.total_size(self.db_views)
             )

-            … (10 removed lines not rendered in the diff view)
-                )
-
-            with self.report.new_stage(LINEAGE_EXTRACTION):
-                yield from self.extract_lineage_v2(
-                    connection=connection,
-                    database=database,
-                    lineage_extractor=lineage_extractor,
-                )
-
-            all_tables = self.get_all_tables()
-        else:
-            yield from self.process_schemas(connection, database)
+        with RedshiftSqlLineage(
+            config=self.config,
+            report=self.report,
+            context=self.ctx,
+            database=database,
+            redundant_run_skip_handler=self.redundant_lineage_run_skip_handler,
+        ) as lineage_extractor:
+            yield from lineage_extractor.aggregator.register_schemas_from_stream(
+                self.process_schemas(connection, database)
+            )

-            … (removed line not rendered in the diff view)
+            with self.report.new_stage(LINEAGE_EXTRACTION):
+                yield from self.extract_lineage_v2(
+                    connection=connection,
+                    database=database,
+                    lineage_extractor=lineage_extractor,
+                )

-            if (
-                self.config.include_table_lineage
-                or self.config.include_view_lineage
-                or self.config.include_copy_lineage
-            ):
-                with self.report.new_stage(LINEAGE_EXTRACTION):
-                    yield from self.extract_lineage(
-                        connection=connection, all_tables=all_tables, database=database
-                    )
+        all_tables = self.get_all_tables()

         if self.config.include_usage_statistics:
             with self.report.new_stage(USAGE_EXTRACTION_INGESTION):
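The hunk above folds the old v1/v2 split into a single context-managed extractor whose SQL-parsing aggregator observes schema metadata as it streams past, so the lineage stage that follows can resolve tables and columns without re-reading them from the warehouse. Below is a minimal, self-contained sketch of that stream-registration pattern; SchemaRegistry, the dict-shaped work units, and extract_lineage are illustrative stand-ins, not DataHub APIs.

from dataclasses import dataclass, field
from typing import Dict, Iterable, Iterator, List


@dataclass
class SchemaRegistry:
    schemas: Dict[str, List[str]] = field(default_factory=dict)

    def register_schemas_from_stream(self, stream: Iterable[dict]) -> Iterator[dict]:
        # Yield every work unit unchanged, remembering schema work units as they pass.
        for workunit in stream:
            if workunit.get("type") == "schema":
                self.schemas[workunit["table"]] = workunit["columns"]
            yield workunit


def extract_lineage(registry: SchemaRegistry) -> Iterator[str]:
    # A later stage can consult the schemas captured during the first pass.
    for table, columns in registry.schemas.items():
        yield f"{table}: {len(columns)} columns known for column-level lineage"


registry = SchemaRegistry()
stream = [
    {"type": "schema", "table": "dev.public.orders", "columns": ["id", "amount"]},
    {"type": "container", "name": "public"},
]
emitted = list(registry.register_schemas_from_stream(stream))
print(len(emitted))  # 2 -- nothing is dropped from the stream
print(list(extract_lineage(registry)))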
@@ -968,45 +949,11 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):

         self.report.usage_extraction_sec[database] = timer.elapsed_seconds(digits=2)

-    def extract_lineage(
-        self,
-        connection: redshift_connector.Connection,
-        database: str,
-        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
-    ) -> Iterable[MetadataWorkUnit]:
-        if not self._should_ingest_lineage():
-            return
-
-        lineage_extractor = RedshiftLineageExtractor(
-            config=self.config,
-            report=self.report,
-            context=self.ctx,
-            redundant_run_skip_handler=self.redundant_lineage_run_skip_handler,
-        )
-
-        with PerfTimer() as timer:
-            lineage_extractor.populate_lineage(
-                database=database, connection=connection, all_tables=all_tables
-            )
-
-            self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
-                digits=2
-            )
-            yield from self.generate_lineage(
-                database, lineage_extractor=lineage_extractor
-            )
-
-        if self.redundant_lineage_run_skip_handler:
-            # Update the checkpoint state for this run.
-            self.redundant_lineage_run_skip_handler.update_state(
-                self.config.start_time, self.config.end_time
-            )
-
     def extract_lineage_v2(
         self,
         connection: redshift_connector.Connection,
         database: str,
-        lineage_extractor: RedshiftSqlLineageV2,
+        lineage_extractor: RedshiftSqlLineage,
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.include_share_lineage:
             outbound_shares = self.data_dictionary.get_outbound_datashares(connection)
@@ -1069,40 +1016,6 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):

         return True

-    def generate_lineage(
-        self, database: str, lineage_extractor: RedshiftLineageExtractor
-    ) -> Iterable[MetadataWorkUnit]:
-        logger.info(f"Generate lineage for {database}")
-        for schema in deduplicate_list(
-            itertools.chain(self.db_tables[database], self.db_views[database])
-        ):
-            if (
-                database not in self.db_schemas
-                or schema not in self.db_schemas[database]
-            ):
-                logger.warning(
-                    f"Either database {database} or {schema} exists in the lineage but was not discovered earlier. Something went wrong."
-                )
-                continue
-
-            table_or_view: Union[RedshiftTable, RedshiftView]
-            for table_or_view in (
-                []
-                + self.db_tables[database].get(schema, [])
-                + self.db_views[database].get(schema, [])
-            ):
-                datahub_dataset_name = f"{database}.{schema}.{table_or_view.name}"
-                dataset_urn = self.gen_dataset_urn(datahub_dataset_name)
-
-                lineage_info = lineage_extractor.get_lineage(
-                    table_or_view,
-                    dataset_urn,
-                    self.db_schemas[database][schema],
-                )
-                if lineage_info:
-                    # incremental lineage generation is taken care by auto_incremental_lineage
-                    yield from gen_lineage(dataset_urn, lineage_info)
-
     def add_config_to_report(self):
         self.report.stateful_lineage_ingestion_enabled = (
             self.config.enable_stateful_lineage_ingestion

datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -441,13 +441,16 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         tables = self.fetch_tables_for_schema(
             snowflake_schema, db_name, schema_name
         )
+        if self.config.include_views:
+            views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name)
+
+        if self.config.include_tables:
             db_tables[schema_name] = tables
             yield from self._process_tables(
                 tables, snowflake_schema, db_name, schema_name
             )

         if self.config.include_views:
-            views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name)
             yield from self._process_views(
                 views, snowflake_schema, db_name, schema_name
             )

datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -199,6 +199,7 @@ class SnowflakeV2Source(
             ),
             generate_usage_statistics=False,
             generate_operations=False,
+            generate_queries=self.config.include_queries,
             format_queries=self.config.format_sql_queries,
             is_temp_table=self._is_temp_table,
             is_allowed_table=self._is_allowed_table,

datahub/ingestion/source/sql/mssql/job_models.py

@@ -134,7 +134,9 @@ class StoredProcedure:

     @property
     def escape_full_name(self) -> str:
-        return f"[{self.db}].[{self.schema}].[{self.formatted_name}]"
+        return f"[{self.db}].[{self.schema}].[{self.formatted_name}]".replace(
+            "'", r"''"
+        )

     def to_base_procedure(self) -> BaseProcedure:
         return BaseProcedure(
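The change above doubles single quotes inside the bracketed T-SQL full name, the standard escaping when the identifier is later embedded in a quoted string. A quick illustration, using a made-up procedure name:

full_name = "[analytics].[dbo].[load_customer's_orders]"
escaped = full_name.replace("'", r"''")
print(escaped)  # [analytics].[dbo].[load_customer''s_orders]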

datahub/ingestion/source/sql/mssql/source.py

@@ -10,6 +10,7 @@ from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine.base import Connection
 from sqlalchemy.engine.reflection import Inspector
 from sqlalchemy.exc import ProgrammingError, ResourceClosedError
+from sqlalchemy.sql import quoted_name

 import datahub.metadata.schema_classes as models
 from datahub.configuration.common import AllowDenyPattern

@@ -130,10 +131,14 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         "match the entire table name in database.schema.table format. Defaults are to set in such a way "
         "to ignore the temporary staging tables created by known ETL tools.",
     )
+    quote_schemas: bool = Field(
+        default=False,
+        description="Represent a schema identifiers combined with quoting preferences. See [sqlalchemy quoted_name docs](https://docs.sqlalchemy.org/en/20/core/sqlelement.html#sqlalchemy.sql.expression.quoted_name).",
+    )

     @pydantic.validator("uri_args")
     def passwords_match(cls, v, values, **kwargs):
-        if values["use_odbc"] and "driver" not in v:
+        if values["use_odbc"] and not values["sqlalchemy_uri"] and "driver" not in v:
             raise ValueError("uri_args must contain a 'driver' option")
         elif not values["use_odbc"] and v:
             raise ValueError("uri_args is not supported when ODBC is disabled")
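For context on what the new quote_schemas option toggles: wrapping a schema name in SQLAlchemy's quoted_name(..., quote=True) forces the identifier preparer to quote it, preserving case and special characters. A small illustration using the generic dialect (which quotes with double quotes; the MSSQL dialect would use brackets); the schema names are examples.

from sqlalchemy.engine import default
from sqlalchemy.sql import quoted_name

preparer = default.DefaultDialect().identifier_preparer

print(preparer.quote("dbo"))                     # dbo   -- left unquoted
print(preparer.quote(quoted_name("dbo", True)))  # "dbo" -- quoting is forced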
@@ -159,7 +164,15 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
             uri_opts=uri_opts,
         )
         if self.use_odbc:
-            … (removed line not rendered in the diff view)
+            final_uri_args = self.uri_args.copy()
+            if final_uri_args and current_db:
+                final_uri_args.update({"database": current_db})
+
+            uri = (
+                f"{uri}?{urllib.parse.urlencode(final_uri_args)}"
+                if final_uri_args
+                else uri
+            )
         return uri

     @property
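To see how the ODBC query string is assembled after this change: the configured uri_args are copied and the per-database override is applied before urlencoding. The driver, host, and database values below are placeholders.

import urllib.parse

uri = "mssql+pyodbc://user:password@myserver"
uri_args = {"driver": "ODBC Driver 18 for SQL Server", "TrustServerCertificate": "yes"}
current_db = "AdventureWorks"

final_uri_args = uri_args.copy()
if final_uri_args and current_db:
    final_uri_args.update({"database": current_db})

uri = f"{uri}?{urllib.parse.urlencode(final_uri_args)}" if final_uri_args else uri
print(uri)
# mssql+pyodbc://user:password@myserver?driver=ODBC+Driver+18+for+SQL+Server&TrustServerCertificate=yes&database=AdventureWorks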
@@ -923,7 +936,11 @@ class SQLServerSource(SQLAlchemySource):
         logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **self.config.options)

-        if …
+        if (
+            self.config.database
+            and self.config.database != ""
+            or (self.config.sqlalchemy_uri and self.config.sqlalchemy_uri != "")
+        ):
             inspector = inspect(engine)
             yield inspector
         else:
@@ -1020,3 +1037,45 @@ class SQLServerSource(SQLAlchemySource):
             if self.config.convert_urns_to_lowercase
             else table_ref_str
         )
+
+    def get_allowed_schemas(self, inspector: Inspector, db_name: str) -> Iterable[str]:
+        for schema in super().get_allowed_schemas(inspector, db_name):
+            if self.config.quote_schemas:
+                yield quoted_name(schema, True)
+            else:
+                yield schema
+
+    def get_db_name(self, inspector: Inspector) -> str:
+        engine = inspector.engine
+
+        try:
+            if (
+                engine
+                and hasattr(engine, "url")
+                and hasattr(engine.url, "database")
+                and engine.url.database
+            ):
+                return str(engine.url.database).strip('"')
+
+            if (
+                engine
+                and hasattr(engine, "url")
+                and hasattr(engine.url, "query")
+                and "odbc_connect" in engine.url.query
+            ):
+                # According to the ODBC connection keywords: https://learn.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver17#supported-dsnconnection-string-keywords-and-connection-attributes
+                database = re.search(
+                    r"DATABASE=([^;]*);",
+                    urllib.parse.unquote_plus(str(engine.url.query["odbc_connect"])),
+                    flags=re.IGNORECASE,
+                )
+
+                if database and database.group(1):
+                    return database.group(1)
+
+            return ""
+
+        except Exception as e:
+            raise RuntimeError(
+                "Unable to get database name from Sqlalchemy inspector"
+            ) from e
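The regex fallback added in get_db_name, applied to a made-up odbc_connect value: the database is pulled out of the DATABASE=...; keyword of the raw ODBC connection string.

import re
import urllib.parse

odbc_connect = urllib.parse.quote_plus(
    "DRIVER={ODBC Driver 18 for SQL Server};SERVER=myserver;DATABASE=AdventureWorks;UID=user;PWD=secret;"
)

match = re.search(
    r"DATABASE=([^;]*);",
    urllib.parse.unquote_plus(odbc_connect),
    flags=re.IGNORECASE,
)
print(match.group(1) if match else "")  # AdventureWorks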

datahub/ingestion/source/unity/config.py

@@ -35,6 +35,10 @@ from datahub.utilities.global_warning_util import add_global_warning

 logger = logging.getLogger(__name__)

+# Configuration default constants
+INCLUDE_TAGS_DEFAULT = True
+INCLUDE_HIVE_METASTORE_DEFAULT = True
+

 class LineageDataSource(ConfigEnum):
     AUTO = "AUTO"
@@ -137,10 +141,18 @@ class UnityCatalogSourceConfig(
     )
     warehouse_id: Optional[str] = pydantic.Field(
         default=None,
-        description=…
+        description=(
+            "SQL Warehouse id, for running queries. Must be explicitly provided to enable SQL-based features. "
+            "Required for the following features that need SQL access: "
+            "1) Tag extraction (include_tags=True) - queries system.information_schema.tags "
+            "2) Hive Metastore catalog (include_hive_metastore=True) - queries legacy hive_metastore catalog "
+            "3) System table lineage (lineage_data_source=SYSTEM_TABLES) - queries system.access.table_lineage/column_lineage "
+            "4) Data profiling (profiling.enabled=True) - runs SELECT/ANALYZE queries on tables. "
+            "When warehouse_id is missing, these features will be automatically disabled (with warnings) to allow ingestion to continue."
+        ),
     )
     include_hive_metastore: bool = pydantic.Field(
-        default=…
+        default=INCLUDE_HIVE_METASTORE_DEFAULT,
         description="Whether to ingest legacy `hive_metastore` catalog. This requires executing queries on SQL warehouse.",
     )
     workspace_name: Optional[str] = pydantic.Field(
@@ -236,8 +248,12 @@
     )

     include_tags: bool = pydantic.Field(
-        default=…
-        description=…
+        default=INCLUDE_TAGS_DEFAULT,
+        description=(
+            "Option to enable/disable column/table tag extraction. "
+            "Requires warehouse_id to be set since tag extraction needs to query system.information_schema.tags. "
+            "If warehouse_id is not provided, this will be automatically disabled to allow ingestion to continue."
+        ),
     )

     _rename_table_ownership = pydantic_renamed_field(
@@ -310,8 +326,62 @@
         description="Details about the delta lake, incase to emit siblings",
     )

+    include_ml_model_aliases: bool = pydantic.Field(
+        default=False,
+        description="Whether to include ML model aliases in the ingestion.",
+    )
+
+    ml_model_max_results: int = pydantic.Field(
+        default=1000,
+        ge=0,
+        description="Maximum number of ML models to ingest.",
+    )
+
+    _forced_disable_tag_extraction: bool = pydantic.PrivateAttr(default=False)
+    _forced_disable_hive_metastore_extraction = pydantic.PrivateAttr(default=False)
+
     scheme: str = DATABRICKS

+    def __init__(self, **data):
+        # First, let the parent handle the root validators and field processing
+        super().__init__(**data)
+
+        # After model creation, check if we need to auto-disable features
+        # based on the final warehouse_id value (which may have been set by root validators)
+        include_tags_original = data.get("include_tags", INCLUDE_TAGS_DEFAULT)
+        include_hive_metastore_original = data.get(
+            "include_hive_metastore", INCLUDE_HIVE_METASTORE_DEFAULT
+        )
+
+        # Track what we're force-disabling
+        forced_disable_tag_extraction = False
+        forced_disable_hive_metastore_extraction = False
+
+        # Check if features should be auto-disabled based on final warehouse_id
+        if include_tags_original and not self.warehouse_id:
+            forced_disable_tag_extraction = True
+            self.include_tags = False  # Modify the model attribute directly
+            logger.warning(
+                "warehouse_id is not set but include_tags=True. "
+                "Automatically disabling tag extraction since it requires SQL queries. "
+                "Set warehouse_id to enable tag extraction."
+            )
+
+        if include_hive_metastore_original and not self.warehouse_id:
+            forced_disable_hive_metastore_extraction = True
+            self.include_hive_metastore = False  # Modify the model attribute directly
+            logger.warning(
+                "warehouse_id is not set but include_hive_metastore=True. "
+                "Automatically disabling hive metastore extraction since it requires SQL queries. "
+                "Set warehouse_id to enable hive metastore extraction."
+            )
+
+        # Set private attributes
+        self._forced_disable_tag_extraction = forced_disable_tag_extraction
+        self._forced_disable_hive_metastore_extraction = (
+            forced_disable_hive_metastore_extraction
+        )
+
     def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
         uri_opts = {"http_path": f"/sql/1.0/warehouses/{self.warehouse_id}"}
         if database:
@@ -381,11 +451,6 @@
                 "When `warehouse_id` is set, it must match the `warehouse_id` in `profiling`."
             )

-        if values.get("include_hive_metastore") and not values.get("warehouse_id"):
-            raise ValueError(
-                "When `include_hive_metastore` is set, `warehouse_id` must be set."
-            )
-
         if values.get("warehouse_id") and profiling and not profiling.warehouse_id:
             profiling.warehouse_id = values["warehouse_id"]

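The removed validator above used to hard-fail when include_hive_metastore was set without a warehouse_id; together with the new __init__, the config now downgrades SQL-dependent features instead of rejecting the recipe. A minimal, self-contained sketch of that validate-then-downgrade pattern follows; this is not the DataHub class, and the field names are reused only for illustration.

import logging
from typing import Optional

from pydantic import BaseModel, PrivateAttr

logger = logging.getLogger(__name__)

INCLUDE_TAGS_DEFAULT = True


class SketchConfig(BaseModel):
    warehouse_id: Optional[str] = None
    include_tags: bool = INCLUDE_TAGS_DEFAULT

    _forced_disable_tag_extraction: bool = PrivateAttr(default=False)

    def __init__(self, **data):
        # Normal validation first, then downgrade instead of raising.
        super().__init__(**data)
        if self.include_tags and not self.warehouse_id:
            self.include_tags = False
            self._forced_disable_tag_extraction = True
            logger.warning("warehouse_id missing; disabling tag extraction")


cfg = SketchConfig()
print(cfg.include_tags)                    # False -- auto-disabled, not an error
print(cfg._forced_disable_tag_extraction)  # True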