acryl-datahub 1.3.1__py3-none-any.whl → 1.3.1.1rc1__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/METADATA +2501 -2501
- {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/RECORD +193 -193
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +2 -2
- datahub/api/entities/corpgroup/corpgroup.py +11 -6
- datahub/api/entities/corpuser/corpuser.py +11 -11
- datahub/api/entities/dataproduct/dataproduct.py +47 -27
- datahub/api/entities/dataset/dataset.py +32 -21
- datahub/api/entities/external/lake_formation_external_entites.py +5 -6
- datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
- datahub/api/entities/forms/forms.py +16 -14
- datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
- datahub/cli/check_cli.py +2 -2
- datahub/cli/config_utils.py +3 -3
- datahub/cli/lite_cli.py +9 -7
- datahub/cli/migrate.py +4 -4
- datahub/cli/quickstart_versioning.py +3 -3
- datahub/cli/specific/group_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +1 -1
- datahub/cli/specific/user_cli.py +1 -1
- datahub/configuration/common.py +14 -2
- datahub/configuration/connection_resolver.py +2 -2
- datahub/configuration/git.py +47 -30
- datahub/configuration/import_resolver.py +2 -2
- datahub/configuration/kafka.py +4 -3
- datahub/configuration/time_window_config.py +26 -26
- datahub/configuration/validate_field_deprecation.py +2 -2
- datahub/configuration/validate_field_removal.py +2 -2
- datahub/configuration/validate_field_rename.py +2 -2
- datahub/configuration/validate_multiline_string.py +2 -1
- datahub/emitter/kafka_emitter.py +3 -1
- datahub/emitter/rest_emitter.py +2 -4
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/report.py +1 -1
- datahub/ingestion/api/sink.py +1 -1
- datahub/ingestion/api/source.py +1 -1
- datahub/ingestion/glossary/datahub_classifier.py +11 -8
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
- datahub/ingestion/reporting/file_reporter.py +5 -4
- datahub/ingestion/run/pipeline.py +6 -6
- datahub/ingestion/run/pipeline_config.py +12 -14
- datahub/ingestion/run/sink_callback.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +6 -4
- datahub/ingestion/source/abs/config.py +19 -19
- datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/abs/source.py +2 -2
- datahub/ingestion/source/aws/aws_common.py +1 -1
- datahub/ingestion/source/aws/glue.py +6 -4
- datahub/ingestion/source/aws/sagemaker.py +1 -1
- datahub/ingestion/source/azure/azure_common.py +8 -12
- datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
- datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
- datahub/ingestion/source/datahub/config.py +8 -8
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
- datahub/ingestion/source/dbt/dbt_common.py +39 -37
- datahub/ingestion/source/dbt/dbt_core.py +10 -12
- datahub/ingestion/source/debug/datahub_debug.py +1 -1
- datahub/ingestion/source/delta_lake/config.py +6 -4
- datahub/ingestion/source/dremio/dremio_config.py +10 -6
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/elastic_search.py +4 -3
- datahub/ingestion/source/excel/source.py +1 -1
- datahub/ingestion/source/feast.py +1 -1
- datahub/ingestion/source/file.py +5 -4
- datahub/ingestion/source/fivetran/config.py +17 -16
- datahub/ingestion/source/fivetran/fivetran.py +2 -2
- datahub/ingestion/source/gc/datahub_gc.py +1 -1
- datahub/ingestion/source/gcs/gcs_source.py +8 -10
- datahub/ingestion/source/ge_profiling_config.py +8 -5
- datahub/ingestion/source/grafana/grafana_api.py +2 -2
- datahub/ingestion/source/grafana/grafana_config.py +4 -3
- datahub/ingestion/source/grafana/grafana_source.py +1 -1
- datahub/ingestion/source/grafana/models.py +23 -5
- datahub/ingestion/source/hex/api.py +7 -5
- datahub/ingestion/source/hex/hex.py +4 -3
- datahub/ingestion/source/iceberg/iceberg.py +1 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +10 -10
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +7 -5
- datahub/ingestion/source/looker/looker_config.py +21 -20
- datahub/ingestion/source/looker/lookml_config.py +47 -47
- datahub/ingestion/source/metabase.py +8 -8
- datahub/ingestion/source/metadata/business_glossary.py +2 -2
- datahub/ingestion/source/metadata/lineage.py +13 -8
- datahub/ingestion/source/mlflow.py +1 -1
- datahub/ingestion/source/mode.py +6 -4
- datahub/ingestion/source/mongodb.py +4 -3
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +17 -23
- datahub/ingestion/source/openapi.py +6 -8
- datahub/ingestion/source/powerbi/config.py +33 -32
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
- datahub/ingestion/source/powerbi/powerbi.py +1 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
- datahub/ingestion/source/preset.py +8 -8
- datahub/ingestion/source/pulsar.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
- datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +18 -20
- datahub/ingestion/source/redshift/redshift.py +2 -2
- datahub/ingestion/source/redshift/usage.py +23 -3
- datahub/ingestion/source/s3/config.py +83 -62
- datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/s3/source.py +8 -5
- datahub/ingestion/source/sac/sac.py +5 -4
- datahub/ingestion/source/salesforce.py +3 -2
- datahub/ingestion/source/schema/json_schema.py +2 -2
- datahub/ingestion/source/sigma/data_classes.py +3 -2
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/sigma/sigma_api.py +7 -7
- datahub/ingestion/source/slack/slack.py +1 -1
- datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
- datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_queries.py +1 -1
- datahub/ingestion/source/sql/athena.py +1 -1
- datahub/ingestion/source/sql/clickhouse.py +4 -2
- datahub/ingestion/source/sql/cockroachdb.py +1 -1
- datahub/ingestion/source/sql/druid.py +1 -1
- datahub/ingestion/source/sql/hana.py +1 -1
- datahub/ingestion/source/sql/hive.py +7 -5
- datahub/ingestion/source/sql/hive_metastore.py +1 -1
- datahub/ingestion/source/sql/mssql/source.py +13 -6
- datahub/ingestion/source/sql/mysql.py +1 -1
- datahub/ingestion/source/sql/oracle.py +17 -10
- datahub/ingestion/source/sql/postgres.py +2 -2
- datahub/ingestion/source/sql/presto.py +1 -1
- datahub/ingestion/source/sql/sql_config.py +8 -9
- datahub/ingestion/source/sql/sql_generic.py +1 -1
- datahub/ingestion/source/sql/teradata.py +1 -1
- datahub/ingestion/source/sql/trino.py +1 -1
- datahub/ingestion/source/sql/vertica.py +5 -4
- datahub/ingestion/source/sql_queries.py +11 -8
- datahub/ingestion/source/state/checkpoint.py +2 -2
- datahub/ingestion/source/state/entity_removal_state.py +2 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +9 -9
- datahub/ingestion/source/tableau/tableau.py +14 -16
- datahub/ingestion/source/unity/config.py +33 -34
- datahub/ingestion/source/unity/proxy.py +203 -0
- datahub/ingestion/source/unity/proxy_types.py +91 -0
- datahub/ingestion/source/unity/source.py +27 -2
- datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
- datahub/ingestion/source/usage/usage_common.py +5 -3
- datahub/ingestion/source_config/csv_enricher.py +7 -6
- datahub/ingestion/source_config/operation_config.py +7 -4
- datahub/ingestion/source_config/pulsar.py +11 -15
- datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
- datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
- datahub/ingestion/transformer/add_dataset_properties.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
- datahub/ingestion/transformer/add_dataset_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
- datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
- datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
- datahub/ingestion/transformer/mark_dataset_status.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
- datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/replace_external_url.py +2 -2
- datahub/ingestion/transformer/set_browse_path.py +1 -1
- datahub/ingestion/transformer/tags_to_terms.py +1 -1
- datahub/lite/duckdb_lite.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/sdk/search_filters.py +68 -40
- datahub/secret/datahub_secret_store.py +7 -4
- datahub/secret/file_secret_store.py +1 -1
- datahub/sql_parsing/sqlglot_lineage.py +5 -2
- datahub/testing/check_sql_parser_result.py +2 -2
- datahub/utilities/ingest_utils.py +1 -1
- {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,7 @@ from copy import deepcopy
 from datetime import datetime
 from typing import Dict, List, Optional
 
-from pydantic import BaseModel,
+from pydantic import BaseModel, model_validator
 
 from datahub.emitter.mcp_builder import ContainerKey
 
@@ -22,7 +22,8 @@ class Workspace(BaseModel):
     createdAt: datetime
     updatedAt: datetime
 
-    @
+    @model_validator(mode="before")
+    @classmethod
     def update_values(cls, values: Dict) -> Dict:
         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
         values = deepcopy(values)
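The deleted decorator line is truncated in this rendering, so the pydantic v1 form is not shown above. For reference only (not part of the package diff), here is a minimal, self-contained sketch, on a hypothetical WorkspaceLike model, of the pydantic v2 pattern the added lines use: a mode="before" model validator declared as a classmethod that receives and returns the raw input dict.

from copy import deepcopy
from typing import Dict

from pydantic import BaseModel, model_validator


class WorkspaceLike(BaseModel):
    name: str = "unnamed"

    @model_validator(mode="before")
    @classmethod
    def update_values(cls, values: Dict) -> Dict:
        # mode="before" sees the raw input before field parsing; copy it so the
        # caller's dict is never mutated (the same reason the diff deep-copies).
        values = deepcopy(values)
        values.setdefault("name", values.get("displayName", "unnamed"))
        return values


print(WorkspaceLike.model_validate({"displayName": "Finance"}).name)  # Finance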
@@ -150,7 +150,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = SigmaSourceConfig.
+        config = SigmaSourceConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     def _gen_workbook_key(self, workbook_id: str) -> WorkbookKey:
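The deleted call above is truncated in this rendering; the replacement uses pydantic v2's Model.model_validate(), which parses a plain dict into a validated config object and supersedes v1's parse_obj(). A minimal sketch with a hypothetical config class (not taken from the package):

from pydantic import BaseModel


class DemoSourceConfig(BaseModel):
    connect_uri: str
    platform_instance: str = "default"


# model_validate() raises pydantic.ValidationError on bad input and returns a
# typed, validated instance otherwise.
config = DemoSourceConfig.model_validate({"connect_uri": "https://sigma.example.com/api"})
print(config.platform_instance)  # default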
@@ -108,7 +108,7 @@ class SigmaAPI:
                 self.report.non_accessible_workspaces_count += 1
                 return None
             response.raise_for_status()
-            workspace = Workspace.
+            workspace = Workspace.model_validate(response.json())
             self.workspaces[workspace.workspaceId] = workspace
             return workspace
         except Exception as e:
@@ -127,7 +127,7 @@ class SigmaAPI:
             response_dict = response.json()
             for workspace_dict in response_dict[Constant.ENTRIES]:
                 self.workspaces[workspace_dict[Constant.WORKSPACEID]] = (
-                    Workspace.
+                    Workspace.model_validate(workspace_dict)
                 )
             if response_dict[Constant.NEXTPAGE]:
                 url = f"{workspace_url}&page={response_dict[Constant.NEXTPAGE]}"
@@ -197,7 +197,7 @@ class SigmaAPI:
             response.raise_for_status()
             response_dict = response.json()
             for file_dict in response_dict[Constant.ENTRIES]:
-                file = File.
+                file = File.model_validate(file_dict)
                 file.workspaceId = self.get_workspace_id_from_file_path(
                     file.parentId, file.path
                 )
@@ -225,7 +225,7 @@ class SigmaAPI:
             response.raise_for_status()
             response_dict = response.json()
             for dataset_dict in response_dict[Constant.ENTRIES]:
-                dataset = SigmaDataset.
+                dataset = SigmaDataset.model_validate(dataset_dict)
 
                 if dataset.datasetId not in dataset_files_metadata:
                     self.report.datasets.dropped(
@@ -354,7 +354,7 @@ class SigmaAPI:
                 element_dict[Constant.URL] = (
                     f"{workbook.url}?:nodeId={element_dict[Constant.ELEMENTID]}&:fullScreen=true"
                 )
-                element = Element.
+                element = Element.model_validate(element_dict)
                 if (
                     self.config.extract_lineage
                     and self.config.workbook_lineage_pattern.allowed(workbook.name)
@@ -379,7 +379,7 @@ class SigmaAPI:
             )
             response.raise_for_status()
             for page_dict in response.json()[Constant.ENTRIES]:
-                page = Page.
+                page = Page.model_validate(page_dict)
                 page.elements = self.get_page_elements(workbook, page)
                 pages.append(page)
             return pages
@@ -400,7 +400,7 @@ class SigmaAPI:
             response.raise_for_status()
             response_dict = response.json()
             for workbook_dict in response_dict[Constant.ENTRIES]:
-                workbook = Workbook.
+                workbook = Workbook.model_validate(workbook_dict)
 
                 if workbook.workbookId not in workbook_files_metadata:
                     # Due to a bug in the Sigma API, it seems like the /files endpoint does not
@@ -260,7 +260,7 @@ class SlackSource(StatefulIngestionSourceBase):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = SlackSourceConfig.
+        config = SlackSourceConfig.model_validate(config_dict)
         return cls(ctx, config)
 
     def get_slack_client(self) -> WebClient:
@@ -351,5 +351,5 @@ class SnaplogicSource(StatefulIngestionSourceBase):
 
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "SnaplogicSource":
-        config = SnaplogicConfig.
+        config = SnaplogicConfig.model_validate(config_dict)
         return cls(config, ctx)
@@ -91,7 +91,7 @@ class SnowflakeAssertionsHandler:
         self, result_row: dict, discovered_datasets: List[str]
     ) -> Optional[MetadataChangeProposalWrapper]:
         try:
-            result = DataQualityMonitoringResult.
+            result = DataQualityMonitoringResult.model_validate(result_row)
             assertion_guid = result.METRIC_NAME.split("__")[-1].lower()
             status = bool(result.VALUE)  # 1 if PASS, 0 if FAIL
             assertee = self.identifiers.get_dataset_identifier(
@@ -5,7 +5,7 @@ from enum import Enum
 from typing import Dict, List, Optional, Set
 
 import pydantic
-from pydantic import Field,
+from pydantic import Field, ValidationInfo, field_validator, model_validator
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.pattern_utils import UUID_REGEX
@@ -122,10 +122,10 @@ class SnowflakeFilterConfig(SQLFilterConfig):
         description="Whether `schema_pattern` is matched against fully qualified schema name `<catalog>.<schema>`.",
     )
 
-    @
-    def validate_legacy_schema_pattern(
-        schema_pattern: Optional[AllowDenyPattern] =
-        match_fully_qualified_names =
+    @model_validator(mode="after")
+    def validate_legacy_schema_pattern(self) -> "SnowflakeFilterConfig":
+        schema_pattern: Optional[AllowDenyPattern] = self.schema_pattern
+        match_fully_qualified_names = self.match_fully_qualified_names
 
         if (
             schema_pattern is not None
@@ -145,7 +145,7 @@ class SnowflakeFilterConfig(SQLFilterConfig):
             assert isinstance(schema_pattern, AllowDenyPattern)
             schema_pattern.deny.append(r".*INFORMATION_SCHEMA$")
 
-        return
+        return self
 
 
 class SnowflakeIdentifierConfig(
@@ -391,7 +391,8 @@ class SnowflakeV2Config(
         "This may be required in the case of _eg_ temporary tables being created in a different database than the ones in the database_name patterns.",
     )
 
-    @
+    @field_validator("convert_urns_to_lowercase", mode="after")
+    @classmethod
     def validate_convert_urns_to_lowercase(cls, v):
         if not v:
             add_global_warning(
@@ -400,30 +401,31 @@ class SnowflakeV2Config(
 
         return v
 
-    @
-
-
+    @field_validator("include_column_lineage", mode="after")
+    @classmethod
+    def validate_include_column_lineage(cls, v, info):
+        if not info.data.get("include_table_lineage") and v:
             raise ValueError(
                 "include_table_lineage must be True for include_column_lineage to be set."
             )
         return v
 
-    @
-    def validate_unsupported_configs(
-
-
+    @model_validator(mode="after")
+    def validate_unsupported_configs(self) -> "SnowflakeV2Config":
+        if (
+            hasattr(self, "include_read_operational_stats")
+            and self.include_read_operational_stats
+        ):
             raise ValueError(
                 "include_read_operational_stats is not supported. Set `include_read_operational_stats` to False.",
             )
 
-        include_technical_schema =
-        include_profiles =
-            values.get("profiling") is not None and values["profiling"].enabled
-        )
+        include_technical_schema = self.include_technical_schema
+        include_profiles = self.profiling is not None and self.profiling.enabled
         delete_detection_enabled = (
-
-            and
-            and
+            self.stateful_ingestion is not None
+            and self.stateful_ingestion.enabled
+            and self.stateful_ingestion.remove_stale_metadata
         )
 
         # TODO: Allow profiling irrespective of basic schema extraction,
@@ -435,13 +437,14 @@ class SnowflakeV2Config(
                 "Cannot perform Deletion Detection or Profiling without extracting snowflake technical schema. Set `include_technical_schema` to True or disable Deletion Detection and Profiling."
             )
 
-        return
+        return self
 
-    @
+    @field_validator("shares", mode="after")
+    @classmethod
     def validate_shares(
-        cls, shares: Optional[Dict[str, SnowflakeShareConfig]],
+        cls, shares: Optional[Dict[str, SnowflakeShareConfig]], info: ValidationInfo
     ) -> Optional[Dict[str, SnowflakeShareConfig]]:
-        current_platform_instance =
+        current_platform_instance = info.data.get("platform_instance")
 
         if shares:
             # Check: platform_instance should be present
@@ -479,11 +482,12 @@ class SnowflakeV2Config(
 
         return shares
 
-    @
-    def validate_queries_v2_stateful_ingestion(
-        if
-        if
-
+    @model_validator(mode="after")
+    def validate_queries_v2_stateful_ingestion(self) -> "SnowflakeV2Config":
+        if self.use_queries_v2:
+            if (
+                self.enable_stateful_lineage_ingestion
+                or self.enable_stateful_usage_ingestion
             ):
                 logger.warning(
                     "enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion are deprecated "
@@ -491,7 +495,7 @@ class SnowflakeV2Config(
                     "For queries v2, use enable_stateful_time_window instead to enable stateful ingestion "
                     "for the unified time window extraction (lineage + usage + operations + queries)."
                 )
-        return
+        return self
 
     def outbounds(self) -> Dict[str, Set[DatabaseId]]:
         """
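For reference only (hypothetical LineageConfig model, not taken from the package), a minimal sketch of the two pydantic v2 constructs the Snowflake config now leans on: a field_validator that reads previously validated fields through info.data, and a model_validator(mode="after") that checks the fully built instance and must return self.

from typing import Optional

from pydantic import BaseModel, ValidationInfo, field_validator, model_validator


class LineageConfig(BaseModel):
    include_table_lineage: bool = True
    include_column_lineage: bool = True
    platform_instance: Optional[str] = None

    @field_validator("include_column_lineage", mode="after")
    @classmethod
    def _requires_table_lineage(cls, v: bool, info: ValidationInfo) -> bool:
        # info.data holds the fields that were already validated (those declared
        # earlier in the model), mirroring the include_column_lineage check above.
        if v and not info.data.get("include_table_lineage"):
            raise ValueError(
                "include_table_lineage must be True for include_column_lineage to be set."
            )
        return v

    @model_validator(mode="after")
    def _normalize(self) -> "LineageConfig":
        # mode="after" validators receive the constructed model and must return it,
        # which is why the migrated validators now end with `return self`.
        if self.platform_instance is not None:
            self.platform_instance = self.platform_instance.strip()
        return self


LineageConfig.model_validate({"include_table_lineage": True, "platform_instance": " prod "})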
@@ -6,6 +6,7 @@ import pydantic
 import snowflake.connector
 from cryptography.hazmat.backends import default_backend
 from cryptography.hazmat.primitives import serialization
+from pydantic import field_validator, model_validator
 from snowflake.connector import SnowflakeConnection as NativeSnowflakeConnection
 from snowflake.connector.cursor import DictCursor
 from snowflake.connector.network import (
@@ -125,26 +126,28 @@ class SnowflakeConnectionConfig(ConfigModel):
 
     rename_host_port_to_account_id = pydantic_renamed_field("host_port", "account_id")  # type: ignore[pydantic-field]
 
-    @
-
+    @field_validator("account_id", mode="after")
+    @classmethod
+    def validate_account_id(cls, account_id: str, info: pydantic.ValidationInfo) -> str:
         account_id = remove_protocol(account_id)
         account_id = remove_trailing_slashes(account_id)
         # Get the domain from config, fallback to default
-        domain =
+        domain = info.data.get("snowflake_domain", DEFAULT_SNOWFLAKE_DOMAIN)
         snowflake_host_suffix = f".{domain}"
         account_id = remove_suffix(account_id, snowflake_host_suffix)
         return account_id
 
-    @
-
+    @field_validator("authentication_type", mode="before")
+    @classmethod
+    def authenticator_type_is_valid(cls, v: Any, info: pydantic.ValidationInfo) -> Any:
         if v not in _VALID_AUTH_TYPES:
             raise ValueError(
                 f"unsupported authenticator type '{v}' was provided,"
                 f" use one of {list(_VALID_AUTH_TYPES.keys())}"
             )
         if (
-
-            or
+            info.data.get("private_key") is not None
+            or info.data.get("private_key_path") is not None
         ) and v != "KEY_PAIR_AUTHENTICATOR":
             raise ValueError(
                 f"Either `private_key` and `private_key_path` is set but `authentication_type` is {v}. "
@@ -153,21 +156,22 @@ class SnowflakeConnectionConfig(ConfigModel):
         if v == "KEY_PAIR_AUTHENTICATOR":
             # If we are using key pair auth, we need the private key path and password to be set
             if (
-
-                and
+                info.data.get("private_key") is None
+                and info.data.get("private_key_path") is None
             ):
                 raise ValueError(
                     f"Both `private_key` and `private_key_path` are none. "
                     f"At least one should be set when using {v} authentication"
                 )
         elif v == "OAUTH_AUTHENTICATOR":
-            cls._check_oauth_config(
+            cls._check_oauth_config(info.data.get("oauth_config"))
         logger.info(f"using authenticator type '{v}'")
         return v
 
-    @
-
-
+    @field_validator("token", mode="before")
+    @classmethod
+    def validate_token_oauth_config(cls, v: Any, info: pydantic.ValidationInfo) -> Any:
+        auth_type = info.data.get("authentication_type")
         if auth_type == "OAUTH_AUTHENTICATOR_TOKEN":
             if not v:
                 raise ValueError("Token required for OAUTH_AUTHENTICATOR_TOKEN.")
@@ -177,6 +181,24 @@ class SnowflakeConnectionConfig(ConfigModel):
             )
         return v
 
+    @model_validator(mode="after")
+    def validate_authentication_config(self):
+        """Validate authentication configuration consistency."""
+        # Check token requirement for OAUTH_AUTHENTICATOR_TOKEN
+        if self.authentication_type == "OAUTH_AUTHENTICATOR_TOKEN":
+            if not self.token:
+                raise ValueError("Token required for OAUTH_AUTHENTICATOR_TOKEN.")
+
+        # Check private key authentication consistency
+        if self.private_key is not None or self.private_key_path is not None:
+            if self.authentication_type != "KEY_PAIR_AUTHENTICATOR":
+                raise ValueError(
+                    f"Either `private_key` and `private_key_path` is set but `authentication_type` is {self.authentication_type}. "
+                    f"Should be set to 'KEY_PAIR_AUTHENTICATOR' when using key pair authentication"
+                )
+
+        return self
+
     @staticmethod
     def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None:
         if oauth_config is None:
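For reference on why the connection config adds a whole-model validator on top of the per-field ones: inside a field_validator, info.data only contains fields declared before the field being validated, so checks that span arbitrary fields are more robust in a mode="after" model validator that sees every attribute on self. A minimal sketch with a hypothetical AuthConfig model (not part of the package diff):

from typing import Optional

from pydantic import BaseModel, model_validator


class AuthConfig(BaseModel):
    authentication_type: str = "DEFAULT_AUTHENTICATOR"
    token: Optional[str] = None
    private_key: Optional[str] = None

    @model_validator(mode="after")
    def _check_auth(self) -> "AuthConfig":
        # Every field is populated here, regardless of declaration order.
        if self.authentication_type == "OAUTH_AUTHENTICATOR_TOKEN" and not self.token:
            raise ValueError("Token required for OAUTH_AUTHENTICATOR_TOKEN.")
        if self.private_key is not None and self.authentication_type != "KEY_PAIR_AUTHENTICATOR":
            raise ValueError("private_key is set but authentication_type is not KEY_PAIR_AUTHENTICATOR.")
        return self


AuthConfig.model_validate({"authentication_type": "KEY_PAIR_AUTHENTICATOR", "private_key": "..."})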
@@ -14,7 +14,7 @@ from typing import (
     Type,
 )
 
-from pydantic import BaseModel, Field,
+from pydantic import BaseModel, Field, field_validator
 
 from datahub.configuration.datetimes import parse_absolute_time
 from datahub.ingestion.api.closeable import Closeable
@@ -70,7 +70,7 @@ def pydantic_parse_json(field: str) -> "V1Validator":
             return json.loads(v)
         return v
 
-    return
+    return field_validator(field, mode="before")(_parse_from_json)
 
 
 class UpstreamColumnNode(BaseModel):
@@ -379,7 +379,7 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
                 # To avoid that causing a pydantic error we are setting it to an empty list
                 # instead of a list with an empty object
                 db_row["QUERIES"] = "[]"
-            return UpstreamLineageEdge.
+            return UpstreamLineageEdge.model_validate(db_row)
         except Exception as e:
             self.report.num_upstream_lineage_edge_parsing_failed += 1
             upstream_tables = db_row.get("UPSTREAM_TABLES")
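The pydantic_parse_json helper above builds a mode="before" field validator dynamically. The same idea written with the plain decorator syntax, on a hypothetical UpstreamRow model, looks like this minimal sketch (for reference only, not part of the package diff):

import json
from typing import Any, Dict, List

from pydantic import BaseModel, field_validator


class UpstreamRow(BaseModel):
    QUERIES: List[Dict[str, Any]] = []

    @field_validator("QUERIES", mode="before")
    @classmethod
    def _parse_queries(cls, v: Any) -> Any:
        # The warehouse returns this column as a JSON-encoded string; decode it
        # before normal list validation runs.
        return json.loads(v) if isinstance(v, str) else v


row = UpstreamRow.model_validate({"QUERIES": '[{"query_id": "abc"}]'})
print(row.QUERIES)  # [{'query_id': 'abc'}]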
@@ -806,7 +806,7 @@ class SnowflakeQueriesSource(Source):
 
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> Self:
-        config = SnowflakeQueriesSourceConfig.
+        config = SnowflakeQueriesSourceConfig.model_validate(config_dict)
         return cls(ctx, config)
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
@@ -386,7 +386,7 @@ class AthenaSource(SQLAlchemySource):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = AthenaConfig.
+        config = AthenaConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     # overwrite this method to allow to specify the usage of a custom dialect
@@ -10,6 +10,7 @@ import clickhouse_sqlalchemy.types as custom_types
 import pydantic
 from clickhouse_sqlalchemy.drivers import base
 from clickhouse_sqlalchemy.drivers.base import ClickHouseDialect
+from pydantic import model_validator
 from pydantic.fields import Field
 from sqlalchemy import create_engine, text
 from sqlalchemy.engine import reflection
@@ -175,7 +176,8 @@ class ClickHouseConfig(
         return str(url)
 
     # pre = True because we want to take some decision before pydantic initialize the configuration to default values
-    @
+    @model_validator(mode="before")
+    @classmethod
     def projects_backward_compatibility(cls, values: Dict) -> Dict:
         secure = values.get("secure")
         protocol = values.get("protocol")
@@ -423,7 +425,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = ClickHouseConfig.
+        config = ClickHouseConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
@@ -6,7 +6,7 @@ from enum import Enum
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 from urllib.parse import urlparse
 
-from pydantic import
+from pydantic import field_validator
 from pydantic.fields import Field
 
 # This import verifies that the dependencies are available.
@@ -674,11 +674,13 @@ class HiveConfig(TwoTierSQLAlchemyConfig):
         description="Platform instance for the storage system",
     )
 
-    @
-
+    @field_validator("host_port", mode="after")
+    @classmethod
+    def clean_host_port(cls, v: str) -> str:
         return config_clean.remove_protocol(v)
 
-    @
+    @field_validator("hive_storage_lineage_direction", mode="after")
+    @classmethod
     def _validate_direction(cls, v: str) -> str:
         """Validate the lineage direction."""
         if v.lower() not in ["upstream", "downstream"]:
@@ -725,7 +727,7 @@ class HiveSource(TwoTierSQLAlchemySource):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = HiveConfig.
+        config = HiveConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
@@ -351,7 +351,7 @@ class HiveMetastoreSource(SQLAlchemySource):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = HiveMetastore.
+        config = HiveMetastore.model_validate(config_dict)
         return cls(config, ctx)
 
     def gen_database_containers(
@@ -3,8 +3,8 @@ import re
 import urllib.parse
 from typing import Any, Dict, Iterable, List, Optional, Tuple
 
-import pydantic
 import sqlalchemy.dialects.mssql
+from pydantic import ValidationInfo, field_validator
 from pydantic.fields import Field
 from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine.base import Connection
@@ -140,11 +140,18 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         description="Indicates if the SQL Server instance is running on AWS RDS. When None (default), automatic detection will be attempted using server name analysis.",
     )
 
-    @
-
-
+    @field_validator("uri_args", mode="after")
+    @classmethod
+    def passwords_match(
+        cls, v: Dict[str, Any], info: ValidationInfo, **kwargs: Any
+    ) -> Dict[str, Any]:
+        if (
+            info.data["use_odbc"]
+            and not info.data["sqlalchemy_uri"]
+            and "driver" not in v
+        ):
             raise ValueError("uri_args must contain a 'driver' option")
-        elif not
+        elif not info.data["use_odbc"] and v:
             raise ValueError("uri_args is not supported when ODBC is disabled")
         return v
 
@@ -314,7 +321,7 @@ class SQLServerSource(SQLAlchemySource):
 
     @classmethod
     def create(cls, config_dict: Dict, ctx: PipelineContext) -> "SQLServerSource":
-        config = SQLServerConfig.
+        config = SQLServerConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     # override to get table descriptions
@@ -150,7 +150,7 @@ class MySQLSource(TwoTierSQLAlchemySource):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = MySQLConfig.
+        config = MySQLConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     def _setup_rds_iam_event_listener(
@@ -10,8 +10,8 @@ from typing import Any, Dict, Iterable, List, NoReturn, Optional, Tuple, Union,
 from unittest.mock import patch
 
 import oracledb
-import pydantic
 import sqlalchemy.engine
+from pydantic import ValidationInfo, field_validator
 from pydantic.fields import Field
 from sqlalchemy import event, sql
 from sqlalchemy.dialects.oracle.base import ischema_names
@@ -101,25 +101,32 @@ class OracleConfig(BasicSQLAlchemyConfig):
         "On Linux, this value is ignored, as ldconfig or LD_LIBRARY_PATH will define the location.",
     )
 
-    @
-
-
+    @field_validator("service_name", mode="after")
+    @classmethod
+    def check_service_name(
+        cls, v: Optional[str], info: ValidationInfo
+    ) -> Optional[str]:
+        if info.data.get("database") and v:
             raise ValueError(
                 "specify one of 'database' and 'service_name', but not both"
             )
         return v
 
-    @
-
+    @field_validator("data_dictionary_mode", mode="after")
+    @classmethod
+    def check_data_dictionary_mode(cls, value: str) -> str:
         if value not in ("ALL", "DBA"):
             raise ValueError("Specify one of data dictionary views mode: 'ALL', 'DBA'.")
         return value
 
-    @
-
+    @field_validator("thick_mode_lib_dir", mode="before")
+    @classmethod
+    def check_thick_mode_lib_dir(
+        cls, v: Optional[str], info: ValidationInfo
+    ) -> Optional[str]:
         if (
             v is None
-            and
+            and info.data.get("enable_thick_mode")
             and (platform.system() == "Darwin" or platform.system() == "Windows")
         ):
             raise ValueError(
@@ -659,7 +666,7 @@ class OracleSource(SQLAlchemySource):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = OracleConfig.
+        config = OracleConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     def get_db_name(self, inspector: Inspector) -> str:
@@ -212,7 +212,7 @@ class PostgresSource(SQLAlchemySource):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = PostgresConfig.
+        config = PostgresConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     def _setup_rds_iam_event_listener(
@@ -288,7 +288,7 @@ class PostgresSource(SQLAlchemySource):
             return {}
 
         for row in results:
-            data.append(ViewLineageEntry.
+            data.append(ViewLineageEntry.model_validate(row))
 
         lineage_elements: Dict[Tuple[str, str], List[str]] = defaultdict(list)
         # Loop over the lineages in the JSON data.