acryl-datahub 1.3.0.1rc9__py3-none-any.whl → 1.3.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/METADATA +2550 -2543
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/RECORD +263 -261
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +2 -2
- datahub/api/entities/corpgroup/corpgroup.py +11 -6
- datahub/api/entities/corpuser/corpuser.py +11 -11
- datahub/api/entities/dataproduct/dataproduct.py +47 -27
- datahub/api/entities/dataset/dataset.py +32 -21
- datahub/api/entities/external/lake_formation_external_entites.py +5 -6
- datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
- datahub/api/entities/forms/forms.py +16 -14
- datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
- datahub/cli/check_cli.py +2 -2
- datahub/cli/config_utils.py +3 -3
- datahub/cli/lite_cli.py +9 -7
- datahub/cli/migrate.py +4 -4
- datahub/cli/quickstart_versioning.py +3 -3
- datahub/cli/specific/group_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +1 -1
- datahub/cli/specific/user_cli.py +1 -1
- datahub/configuration/common.py +14 -2
- datahub/configuration/connection_resolver.py +2 -2
- datahub/configuration/git.py +47 -30
- datahub/configuration/import_resolver.py +2 -2
- datahub/configuration/kafka.py +4 -3
- datahub/configuration/time_window_config.py +26 -26
- datahub/configuration/validate_field_deprecation.py +2 -2
- datahub/configuration/validate_field_removal.py +2 -2
- datahub/configuration/validate_field_rename.py +2 -2
- datahub/configuration/validate_multiline_string.py +2 -1
- datahub/emitter/kafka_emitter.py +3 -1
- datahub/emitter/rest_emitter.py +2 -4
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/report.py +1 -1
- datahub/ingestion/api/sink.py +1 -1
- datahub/ingestion/api/source.py +1 -1
- datahub/ingestion/glossary/datahub_classifier.py +11 -8
- datahub/ingestion/graph/client.py +5 -1
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
- datahub/ingestion/reporting/file_reporter.py +5 -4
- datahub/ingestion/run/pipeline.py +7 -6
- datahub/ingestion/run/pipeline_config.py +12 -14
- datahub/ingestion/run/sink_callback.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +6 -4
- datahub/ingestion/source/abs/config.py +19 -19
- datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/abs/source.py +2 -2
- datahub/ingestion/source/aws/aws_common.py +1 -1
- datahub/ingestion/source/aws/glue.py +6 -4
- datahub/ingestion/source/aws/sagemaker.py +1 -1
- datahub/ingestion/source/azure/azure_common.py +8 -12
- datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
- datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
- datahub/ingestion/source/datahub/config.py +8 -8
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
- datahub/ingestion/source/dbt/dbt_common.py +39 -37
- datahub/ingestion/source/dbt/dbt_core.py +10 -12
- datahub/ingestion/source/debug/datahub_debug.py +1 -1
- datahub/ingestion/source/delta_lake/config.py +6 -4
- datahub/ingestion/source/dremio/dremio_api.py +212 -78
- datahub/ingestion/source/dremio/dremio_config.py +10 -6
- datahub/ingestion/source/dremio/dremio_entities.py +55 -39
- datahub/ingestion/source/dremio/dremio_profiling.py +14 -3
- datahub/ingestion/source/dremio/dremio_source.py +24 -26
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/elastic_search.py +110 -32
- datahub/ingestion/source/excel/source.py +1 -1
- datahub/ingestion/source/feast.py +1 -1
- datahub/ingestion/source/file.py +5 -4
- datahub/ingestion/source/fivetran/config.py +17 -16
- datahub/ingestion/source/fivetran/fivetran.py +2 -2
- datahub/ingestion/source/gc/datahub_gc.py +1 -1
- datahub/ingestion/source/gcs/gcs_source.py +8 -10
- datahub/ingestion/source/ge_profiling_config.py +8 -5
- datahub/ingestion/source/grafana/grafana_api.py +2 -2
- datahub/ingestion/source/grafana/grafana_config.py +4 -3
- datahub/ingestion/source/grafana/grafana_source.py +1 -1
- datahub/ingestion/source/grafana/models.py +23 -5
- datahub/ingestion/source/hex/api.py +7 -5
- datahub/ingestion/source/hex/hex.py +4 -3
- datahub/ingestion/source/iceberg/iceberg.py +1 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +10 -10
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +7 -5
- datahub/ingestion/source/looker/looker_config.py +21 -20
- datahub/ingestion/source/looker/lookml_config.py +47 -47
- datahub/ingestion/source/metabase.py +8 -8
- datahub/ingestion/source/metadata/business_glossary.py +2 -2
- datahub/ingestion/source/metadata/lineage.py +13 -8
- datahub/ingestion/source/mlflow.py +1 -1
- datahub/ingestion/source/mode.py +6 -4
- datahub/ingestion/source/mongodb.py +4 -3
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +17 -23
- datahub/ingestion/source/openapi.py +6 -8
- datahub/ingestion/source/powerbi/config.py +33 -32
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
- datahub/ingestion/source/powerbi/powerbi.py +1 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
- datahub/ingestion/source/preset.py +8 -8
- datahub/ingestion/source/pulsar.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
- datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +18 -20
- datahub/ingestion/source/redshift/redshift.py +2 -2
- datahub/ingestion/source/redshift/usage.py +23 -3
- datahub/ingestion/source/s3/config.py +83 -62
- datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/s3/source.py +8 -5
- datahub/ingestion/source/sac/sac.py +5 -4
- datahub/ingestion/source/salesforce.py +3 -2
- datahub/ingestion/source/schema/json_schema.py +2 -2
- datahub/ingestion/source/sigma/data_classes.py +3 -2
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/sigma/sigma_api.py +7 -7
- datahub/ingestion/source/slack/slack.py +1 -1
- datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
- datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_queries.py +28 -4
- datahub/ingestion/source/sql/athena.py +1 -1
- datahub/ingestion/source/sql/clickhouse.py +4 -2
- datahub/ingestion/source/sql/cockroachdb.py +1 -1
- datahub/ingestion/source/sql/druid.py +1 -1
- datahub/ingestion/source/sql/hana.py +1 -1
- datahub/ingestion/source/sql/hive.py +7 -5
- datahub/ingestion/source/sql/hive_metastore.py +1 -1
- datahub/ingestion/source/sql/mssql/source.py +13 -6
- datahub/ingestion/source/sql/mysql.py +1 -1
- datahub/ingestion/source/sql/oracle.py +17 -10
- datahub/ingestion/source/sql/postgres.py +2 -2
- datahub/ingestion/source/sql/presto.py +1 -1
- datahub/ingestion/source/sql/sql_config.py +8 -9
- datahub/ingestion/source/sql/sql_generic.py +1 -1
- datahub/ingestion/source/sql/teradata.py +1 -1
- datahub/ingestion/source/sql/trino.py +1 -1
- datahub/ingestion/source/sql/vertica.py +5 -4
- datahub/ingestion/source/sql_queries.py +174 -22
- datahub/ingestion/source/state/checkpoint.py +2 -2
- datahub/ingestion/source/state/entity_removal_state.py +2 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +9 -9
- datahub/ingestion/source/tableau/tableau.py +14 -16
- datahub/ingestion/source/unity/azure_auth_config.py +15 -0
- datahub/ingestion/source/unity/config.py +51 -34
- datahub/ingestion/source/unity/connection.py +7 -1
- datahub/ingestion/source/unity/connection_test.py +1 -1
- datahub/ingestion/source/unity/proxy.py +216 -7
- datahub/ingestion/source/unity/proxy_types.py +91 -0
- datahub/ingestion/source/unity/source.py +29 -3
- datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
- datahub/ingestion/source/usage/usage_common.py +5 -3
- datahub/ingestion/source_config/csv_enricher.py +7 -6
- datahub/ingestion/source_config/operation_config.py +7 -4
- datahub/ingestion/source_config/pulsar.py +11 -15
- datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
- datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
- datahub/ingestion/transformer/add_dataset_properties.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
- datahub/ingestion/transformer/add_dataset_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
- datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
- datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
- datahub/ingestion/transformer/mark_dataset_status.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
- datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/replace_external_url.py +2 -2
- datahub/ingestion/transformer/set_browse_path.py +1 -1
- datahub/ingestion/transformer/tags_to_terms.py +1 -1
- datahub/lite/duckdb_lite.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/_internal_schema_classes.py +62 -2
- datahub/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +2 -0
- datahub/metadata/schema.avsc +271 -91
- datahub/metadata/schemas/ApplicationProperties.avsc +5 -2
- datahub/metadata/schemas/AssertionInfo.avsc +48 -5
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +8 -4
- datahub/metadata/schemas/ChartInfo.avsc +12 -5
- datahub/metadata/schemas/ContainerProperties.avsc +12 -5
- datahub/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
- datahub/metadata/schemas/CorpGroupInfo.avsc +7 -3
- datahub/metadata/schemas/CorpUserInfo.avsc +5 -2
- datahub/metadata/schemas/CorpUserSettings.avsc +4 -2
- datahub/metadata/schemas/DashboardInfo.avsc +16 -4
- datahub/metadata/schemas/DataFlowInfo.avsc +11 -5
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +4 -2
- datahub/metadata/schemas/DataJobInfo.avsc +9 -4
- datahub/metadata/schemas/DataPlatformInfo.avsc +3 -1
- datahub/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
- datahub/metadata/schemas/DataProductProperties.avsc +5 -2
- datahub/metadata/schemas/DataTypeInfo.avsc +5 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/DatasetProperties.avsc +12 -5
- datahub/metadata/schemas/DomainProperties.avsc +7 -3
- datahub/metadata/schemas/EditableContainerProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDashboardProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataJobProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDatasetProperties.avsc +2 -1
- datahub/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelProperties.avsc +2 -1
- datahub/metadata/schemas/EditableNotebookProperties.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +5 -3
- datahub/metadata/schemas/EntityTypeInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalTags.avsc +3 -2
- datahub/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermInfo.avsc +3 -1
- datahub/metadata/schemas/InputFields.avsc +3 -2
- datahub/metadata/schemas/MLFeatureKey.avsc +3 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +3 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +3 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLModelProperties.avsc +4 -2
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +3 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +124 -50
- datahub/metadata/schemas/NotebookInfo.avsc +5 -2
- datahub/metadata/schemas/Ownership.avsc +3 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -1
- datahub/metadata/schemas/RoleProperties.avsc +3 -1
- datahub/metadata/schemas/SchemaFieldInfo.avsc +3 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -2
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
- datahub/metadata/schemas/TagProperties.avsc +3 -1
- datahub/metadata/schemas/TestInfo.avsc +2 -1
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/_all_entities.py +2 -0
- datahub/sdk/search_filters.py +68 -40
- datahub/sdk/tag.py +112 -0
- datahub/secret/datahub_secret_store.py +7 -4
- datahub/secret/file_secret_store.py +1 -1
- datahub/sql_parsing/schema_resolver.py +29 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +15 -0
- datahub/sql_parsing/sqlglot_lineage.py +5 -2
- datahub/testing/check_sql_parser_result.py +2 -2
- datahub/utilities/ingest_utils.py +1 -1
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/redshift/usage.py

```diff
@@ -1,12 +1,12 @@
 import collections
 import logging
 import time
-from datetime import datetime
+from datetime import datetime, timezone
 from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import cachetools
-import pydantic.error_wrappers
 import redshift_connector
+from pydantic import ValidationError, field_validator
 from pydantic.fields import Field
 from pydantic.main import BaseModel
 
@@ -64,6 +64,26 @@ class RedshiftAccessEvent(BaseModel):
     starttime: datetime
     endtime: datetime
 
+    @field_validator("starttime", "endtime", mode="before")
+    @classmethod
+    def ensure_utc_datetime(cls, v):
+        """Ensure datetime fields are treated as UTC for consistency with Pydantic V1 behavior.
+
+        Pydantic V2 assumes local timezone for naive datetime strings, whereas Pydantic V1 assumed UTC.
+        This validator restores V1 behavior to maintain timestamp consistency.
+        """
+        if isinstance(v, str):
+            # Parse as naive datetime, then assume UTC (matching V1 behavior)
+            dt = datetime.fromisoformat(v)
+            if dt.tzinfo is None:
+                # Treat naive datetime as UTC (this was the V1 behavior)
+                dt = dt.replace(tzinfo=timezone.utc)
+            return dt
+        elif isinstance(v, datetime) and v.tzinfo is None:
+            # If we get a naive datetime object, assume UTC
+            return v.replace(tzinfo=timezone.utc)
+        return v
+
 
 class RedshiftUsageExtractor:
     """
@@ -291,7 +311,7 @@ class RedshiftUsageExtractor:
                     else None
                 ),
             )
-        except
+        except ValidationError as e:
            logging.warning(
                f"Validation error on access event creation from row {row}. The error was: {e} Skipping ...."
            )
```
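The hunk above swaps the Pydantic V1 `pydantic.error_wrappers` import for V2's `ValidationError` and adds a `mode="before"` field validator so that naive timestamps keep being read as UTC. A minimal standalone sketch of that coercion pattern (the model name is illustrative, not code from the package):

```python
# Sketch only: a "before" validator that pins naive datetimes to UTC,
# mirroring the validator added to RedshiftAccessEvent above.
from datetime import datetime, timezone

from pydantic import BaseModel, field_validator


class AccessEventSketch(BaseModel):  # hypothetical stand-in for RedshiftAccessEvent
    starttime: datetime

    @field_validator("starttime", mode="before")
    @classmethod
    def ensure_utc(cls, v):
        # Strings are parsed first; any naive result is then tagged as UTC.
        if isinstance(v, str):
            v = datetime.fromisoformat(v)
        if isinstance(v, datetime) and v.tzinfo is None:
            v = v.replace(tzinfo=timezone.utc)
        return v


event = AccessEventSketch(starttime="2024-01-01T12:00:00")
assert event.starttime.tzinfo == timezone.utc
```

With the sketch, a naive ISO string validates to a timezone-aware UTC datetime, which is the V1 parsing behavior the new validator restores.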
datahub/ingestion/source/s3/config.py

```diff
@@ -1,7 +1,7 @@
 import logging
-from typing import Any, Dict,
+from typing import Any, Dict, Optional, Union
 
-import
+from pydantic import ValidationInfo, field_validator, model_validator
 from pydantic.fields import Field
 
 from datahub.configuration.common import AllowDenyPattern
@@ -12,7 +12,6 @@ from datahub.configuration.validate_field_deprecation import pydantic_field_depr
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
 from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin
-from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
 from datahub.ingestion.source.s3.datalake_profiler_config import DataLakeProfilerConfig
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulStaleMetadataRemovalConfig,
@@ -117,69 +116,91 @@ class DataLakeSourceConfig(
             self.profiling.operation_config
         )
 
-    @
-
-
-    ) -> List[PathSpec]:
+    @field_validator("path_specs", mode="before")
+    @classmethod
+    def check_path_specs(cls, path_specs: Any, info: ValidationInfo) -> Any:
         if len(path_specs) == 0:
             raise ValueError("path_specs must not be empty")
 
-        #
-        guessed_platforms = {
-            "s3" if path_spec.is_s3 else "file" for path_spec in path_specs
-        }
-        if len(guessed_platforms) > 1:
-            raise ValueError(
-                f"Cannot have multiple platforms in path_specs: {guessed_platforms}"
-            )
-        guessed_platform = guessed_platforms.pop()
-
-        # Ensure s3 configs aren't used for file sources.
-        if guessed_platform != "s3" and (
-            values.get("use_s3_object_tags") or values.get("use_s3_bucket_tags")
-        ):
-            raise ValueError(
-                "Cannot grab s3 object/bucket tags when platform is not s3. Remove the flag or use s3."
-            )
-
-        # Infer platform if not specified.
-        if values.get("platform") and values["platform"] != guessed_platform:
-            raise ValueError(
-                f"All path_specs belong to {guessed_platform} platform, but platform is set to {values['platform']}"
-            )
-        else:
-            logger.debug(f'Setting config "platform": {guessed_platform}')
-            values["platform"] = guessed_platform
+        # Basic validation - path specs consistency and S3 config validation is now handled in model_validator
 
         return path_specs
 
-    @
-    def
-
-        platform = platform or inferred_platform
-        if not platform:
-            raise ValueError("platform must not be empty")
-
-        if platform != "s3" and values.get("use_s3_bucket_tags"):
-            raise ValueError(
-                "Cannot grab s3 bucket tags when platform is not s3. Remove the flag or ingest from s3."
-            )
-        if platform != "s3" and values.get("use_s3_object_tags"):
-            raise ValueError(
-                "Cannot grab s3 object tags when platform is not s3. Remove the flag or ingest from s3."
-            )
-        if platform != "s3" and values.get("use_s3_content_type"):
-            raise ValueError(
-                "Cannot grab s3 object content type when platform is not s3. Remove the flag or ingest from s3."
-            )
-
-        return platform
-
-    @pydantic.root_validator(skip_on_failure=True)
-    def ensure_profiling_pattern_is_passed_to_profiling(
-        cls, values: Dict[str, Any]
-    ) -> Dict[str, Any]:
-        profiling: Optional[DataLakeProfilerConfig] = values.get("profiling")
+    @model_validator(mode="after")
+    def ensure_profiling_pattern_is_passed_to_profiling(self) -> "DataLakeSourceConfig":
+        profiling = self.profiling
         if profiling is not None and profiling.enabled:
-            profiling._allow_deny_patterns =
-        return
+            profiling._allow_deny_patterns = self.profile_patterns
+        return self
+
+    @model_validator(mode="after")
+    def validate_platform_and_config_consistency(self) -> "DataLakeSourceConfig":
+        """Infer platform from path_specs and validate config consistency."""
+        # Track whether platform was explicitly provided
+        platform_was_explicit = bool(self.platform)
+
+        # Infer platform from path_specs if not explicitly set
+        if not self.platform and self.path_specs:
+            guessed_platforms = set()
+            for path_spec in self.path_specs:
+                if (
+                    hasattr(path_spec, "include")
+                    and path_spec.include
+                    and path_spec.include.startswith("s3://")
+                ):
+                    guessed_platforms.add("s3")
+                else:
+                    guessed_platforms.add("file")
+
+            # Ensure all path specs belong to the same platform
+            if len(guessed_platforms) > 1:
+                raise ValueError(
+                    f"Cannot have multiple platforms in path_specs: {guessed_platforms}"
+                )
+
+            if guessed_platforms:
+                guessed_platform = guessed_platforms.pop()
+                logger.debug(f"Inferred platform: {guessed_platform}")
+                self.platform = guessed_platform
+            else:
+                self.platform = "file"
+        elif not self.platform:
+            self.platform = "file"
+
+        # Validate platform consistency only when platform was inferred (not explicitly set)
+        # This allows sources like GCS to set platform="gcs" with s3:// URIs for correct container subtypes
+        if not platform_was_explicit and self.platform and self.path_specs:
+            expected_platforms = set()
+            for path_spec in self.path_specs:
+                if (
+                    hasattr(path_spec, "include")
+                    and path_spec.include
+                    and path_spec.include.startswith("s3://")
+                ):
+                    expected_platforms.add("s3")
+                else:
+                    expected_platforms.add("file")
+
+            if len(expected_platforms) == 1:
+                expected_platform = expected_platforms.pop()
+                if self.platform != expected_platform:
+                    raise ValueError(
+                        f"All path_specs belong to {expected_platform} platform, but platform was inferred as {self.platform}"
+                    )
+
+        # Validate S3-specific configurations
+        if self.platform != "s3":
+            if self.use_s3_bucket_tags:
+                raise ValueError(
+                    "Cannot grab s3 bucket tags when platform is not s3. Remove the flag or ingest from s3."
+                )
+            if self.use_s3_object_tags:
+                raise ValueError(
+                    "Cannot grab s3 object tags when platform is not s3. Remove the flag or ingest from s3."
+                )
+            if self.use_s3_content_type:
+                raise ValueError(
+                    "Cannot grab s3 object content type when platform is not s3. Remove the flag or ingest from s3."
+                )
+
+        return self
```
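Most of the remaining hunks follow the same Pydantic V1 to V2 migration: a class-level `@pydantic.root_validator` that received a `values` dict becomes an instance-level `@model_validator(mode="after")` that reads and mutates `self`. A minimal sketch of the pattern, with illustrative field names rather than the full `DataLakeSourceConfig`:

```python
# Sketch only: V1 root_validator -> V2 model_validator(mode="after").
from typing import List, Optional

from pydantic import BaseModel, model_validator


class SourceConfigSketch(BaseModel):  # hypothetical, trimmed-down config
    path_specs: List[str] = []
    platform: Optional[str] = None
    use_s3_bucket_tags: bool = False

    @model_validator(mode="after")
    def infer_platform(self) -> "SourceConfigSketch":
        # Runs after field validation; reads and assigns attributes on self.
        if not self.platform:
            self.platform = (
                "s3"
                if any(p.startswith("s3://") for p in self.path_specs)
                else "file"
            )
        if self.platform != "s3" and self.use_s3_bucket_tags:
            raise ValueError("use_s3_bucket_tags requires the s3 platform")
        return self


cfg = SourceConfigSketch(path_specs=["s3://bucket/table/*.parquet"])
assert cfg.platform == "s3"
```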
datahub/ingestion/source/s3/datalake_profiler_config.py

```diff
@@ -1,6 +1,7 @@
-from typing import
+from typing import Optional
 
 import pydantic
+from pydantic import model_validator
 from pydantic.fields import Field
 
 from datahub.configuration import ConfigModel
@@ -72,21 +73,18 @@ class DataLakeProfilerConfig(ConfigModel):
         description="Whether to profile for the sample values for all columns.",
     )
 
-    @
-    def ensure_field_level_settings_are_normalized(
-
-    ) -> Dict[str, Any]:
-        max_num_fields_to_profile_key = "max_number_of_fields_to_profile"
-        max_num_fields_to_profile = values.get(max_num_fields_to_profile_key)
+    @model_validator(mode="after")
+    def ensure_field_level_settings_are_normalized(self) -> "DataLakeProfilerConfig":
+        max_num_fields_to_profile = self.max_number_of_fields_to_profile
 
         # Disable all field-level metrics.
-        if
-            for
-                if
-
+        if self.profile_table_level_only:
+            for field_name in self.__fields__:
+                if field_name.startswith("include_field_"):
+                    setattr(self, field_name, False)
 
             assert max_num_fields_to_profile is None, (
-
+                "max_number_of_fields_to_profile should be set to None"
             )
 
-        return
+        return self
```
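The profiler-config hunk uses the same `mode="after"` style but normalizes settings by iterating the model's field names and calling `setattr`. The hunk keeps the legacy `self.__fields__` alias; the sketch below uses the V2 name `model_fields` (illustrative class, not the package's):

```python
# Sketch only: an "after" validator that flips every include_field_* flag off
# when table-level-only profiling is requested.
from pydantic import BaseModel, model_validator


class ProfilerConfigSketch(BaseModel):  # hypothetical, trimmed-down config
    profile_table_level_only: bool = False
    include_field_min_value: bool = True
    include_field_max_value: bool = True

    @model_validator(mode="after")
    def normalize_field_level_settings(self) -> "ProfilerConfigSketch":
        if self.profile_table_level_only:
            for field_name in type(self).model_fields:
                if field_name.startswith("include_field_"):
                    setattr(self, field_name, False)
        return self


cfg = ProfilerConfigSketch(profile_table_level_only=True)
assert cfg.include_field_min_value is False
```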
datahub/ingestion/source/s3/source.py

```diff
@@ -53,8 +53,11 @@ from datahub.ingestion.source.data_lake_common.data_lake_utils import (
 from datahub.ingestion.source.data_lake_common.object_store import (
     create_object_store_adapter,
 )
-from datahub.ingestion.source.data_lake_common.path_spec import
-
+from datahub.ingestion.source.data_lake_common.path_spec import (
+    FolderTraversalMethod,
+    PathSpec,
+)
+from datahub.ingestion.source.s3.config import DataLakeSourceConfig
 from datahub.ingestion.source.s3.report import DataLakeSourceReport
 from datahub.ingestion.source.schema_inference import avro, csv_tsv, json, parquet
 from datahub.ingestion.source.schema_inference.base import SchemaInferenceBase
@@ -261,7 +264,7 @@ class S3Source(StatefulIngestionSourceBase):
         )
 
         config_report = {
-            config_option: config.
+            config_option: config.model_dump().get(config_option)
             for config_option in config_options_to_report
         }
         config_report = {
@@ -278,7 +281,7 @@ class S3Source(StatefulIngestionSourceBase):
         telemetry.telemetry_instance.ping(
             "data_lake_profiling_config",
             {
-                config_flag: config.profiling.
+                config_flag: config.profiling.model_dump().get(config_flag)
                 for config_flag in profiling_flags_to_report
             },
         )
@@ -370,7 +373,7 @@ class S3Source(StatefulIngestionSourceBase):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = DataLakeSourceConfig.
+        config = DataLakeSourceConfig.model_validate(config_dict)
 
         return cls(config, ctx)
 
```
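Alongside the validator changes, the parsing and serialization entry points are renamed to the Pydantic V2 spellings: `Model.model_validate(...)` for building a config from a dict and `model.model_dump()` for dumping it back out (the removed V1 calls are truncated in this view). A small sketch with an illustrative config model:

```python
# Sketch only: Pydantic V2 spellings used in the hunks above.
#   Model.model_validate(data) builds a model from a dict.
#   model.model_dump() returns the model as a plain dict.
from pydantic import BaseModel


class ConfigSketch(BaseModel):  # hypothetical stand-in for a source config
    platform: str = "s3"
    env: str = "PROD"


config = ConfigSketch.model_validate({"platform": "file"})
report = {key: config.model_dump().get(key) for key in ("platform", "env")}
assert report == {"platform": "file", "env": "PROD"}
```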
datahub/ingestion/source/sac/sac.py

```diff
@@ -8,7 +8,7 @@ import pyodata
 import pyodata.v2.model
 import pyodata.v2.service
 from authlib.integrations.requests_client import OAuth2Session
-from pydantic import Field, SecretStr,
+from pydantic import Field, SecretStr, field_validator
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
@@ -159,7 +159,8 @@ class SACSourceConfig(
         description="Template for generating dataset urns of consumed queries, the placeholder {query} can be used within the template for inserting the name of the query",
     )
 
-    @
+    @field_validator("tenant_url", "token_url", mode="after")
+    @classmethod
     def remove_trailing_slash(cls, v):
         return config_clean.remove_trailing_slashes(v)
 
@@ -209,7 +210,7 @@ class SACSource(StatefulIngestionSourceBase, TestableSource):
 
     @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "SACSource":
-        config = SACSourceConfig.
+        config = SACSourceConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     @staticmethod
@@ -217,7 +218,7 @@ class SACSource(StatefulIngestionSourceBase, TestableSource):
         test_report = TestConnectionReport()
 
         try:
-            config = SACSourceConfig.
+            config = SACSourceConfig.model_validate(config_dict)
 
             # when creating the pyodata.Client, the metadata is automatically parsed and validated
             session, _ = SACSource.get_sac_connection(config)
```
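The `remove_trailing_slash` hunks here and in the Salesforce config below show the simplest variant: a `mode="after"` field validator that normalizes an already-validated string. A sketch with an illustrative model and an inlined stand-in for the `config_clean.remove_trailing_slashes` helper referenced above:

```python
# Sketch only: an "after" field validator that strips trailing slashes.
from pydantic import BaseModel, field_validator


class UrlConfigSketch(BaseModel):  # hypothetical, trimmed-down config
    tenant_url: str

    @field_validator("tenant_url", mode="after")
    @classmethod
    def remove_trailing_slash(cls, v: str) -> str:
        # Inlined stand-in for config_clean.remove_trailing_slashes(v).
        return v.rstrip("/")


assert UrlConfigSketch(tenant_url="https://example.com/").tenant_url == "https://example.com"
```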
datahub/ingestion/source/salesforce.py

```diff
@@ -7,7 +7,7 @@ from enum import Enum
 from typing import Any, Dict, Iterable, List, Literal, Optional, TypedDict
 
 import requests
-from pydantic import Field,
+from pydantic import Field, field_validator
 from simple_salesforce import Salesforce
 from simple_salesforce.exceptions import SalesforceAuthenticationFailed
 
@@ -172,7 +172,8 @@ class SalesforceConfig(
             self.profiling.operation_config
         )
 
-    @
+    @field_validator("instance_url", mode="after")
+    @classmethod
     def remove_trailing_slash(cls, v):
         return config_clean.remove_trailing_slashes(v)
 
```
datahub/ingestion/source/schema/json_schema.py

```diff
@@ -12,7 +12,7 @@ from urllib.parse import urlparse
 
 import jsonref
 import requests
-from pydantic import AnyHttpUrl, DirectoryPath, FilePath,
+from pydantic import AnyHttpUrl, DirectoryPath, FilePath, field_validator
 from pydantic.fields import Field
 
 import datahub.metadata.schema_classes as models
@@ -90,7 +90,7 @@ class JsonSchemaSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMix
         description="Use this if URI-s need to be modified during reference resolution. Simple string match - replace capabilities are supported.",
     )
 
-    @
+    @field_validator("path", mode="after")
     def download_http_url_to_temp_file(cls, v):
         if isinstance(v, AnyHttpUrl):
             try:
```
datahub/ingestion/source/sigma/data_classes.py

```diff
@@ -2,7 +2,7 @@ from copy import deepcopy
 from datetime import datetime
 from typing import Dict, List, Optional
 
-from pydantic import BaseModel,
+from pydantic import BaseModel, model_validator
 
 from datahub.emitter.mcp_builder import ContainerKey
 
@@ -22,7 +22,8 @@ class Workspace(BaseModel):
     createdAt: datetime
     updatedAt: datetime
 
-    @
+    @model_validator(mode="before")
+    @classmethod
     def update_values(cls, values: Dict) -> Dict:
         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
         values = deepcopy(values)
```
datahub/ingestion/source/sigma/sigma.py

```diff
@@ -150,7 +150,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = SigmaSourceConfig.
+        config = SigmaSourceConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     def _gen_workbook_key(self, workbook_id: str) -> WorkbookKey:
```
datahub/ingestion/source/sigma/sigma_api.py

```diff
@@ -108,7 +108,7 @@ class SigmaAPI:
                 self.report.non_accessible_workspaces_count += 1
                 return None
             response.raise_for_status()
-            workspace = Workspace.
+            workspace = Workspace.model_validate(response.json())
             self.workspaces[workspace.workspaceId] = workspace
             return workspace
         except Exception as e:
@@ -127,7 +127,7 @@ class SigmaAPI:
                 response_dict = response.json()
                 for workspace_dict in response_dict[Constant.ENTRIES]:
                     self.workspaces[workspace_dict[Constant.WORKSPACEID]] = (
-                        Workspace.
+                        Workspace.model_validate(workspace_dict)
                     )
                 if response_dict[Constant.NEXTPAGE]:
                     url = f"{workspace_url}&page={response_dict[Constant.NEXTPAGE]}"
@@ -197,7 +197,7 @@ class SigmaAPI:
                 response.raise_for_status()
                 response_dict = response.json()
                 for file_dict in response_dict[Constant.ENTRIES]:
-                    file = File.
+                    file = File.model_validate(file_dict)
                     file.workspaceId = self.get_workspace_id_from_file_path(
                         file.parentId, file.path
                     )
@@ -225,7 +225,7 @@ class SigmaAPI:
                 response.raise_for_status()
                 response_dict = response.json()
                 for dataset_dict in response_dict[Constant.ENTRIES]:
-                    dataset = SigmaDataset.
+                    dataset = SigmaDataset.model_validate(dataset_dict)
 
                     if dataset.datasetId not in dataset_files_metadata:
                         self.report.datasets.dropped(
@@ -354,7 +354,7 @@ class SigmaAPI:
                 element_dict[Constant.URL] = (
                     f"{workbook.url}?:nodeId={element_dict[Constant.ELEMENTID]}&:fullScreen=true"
                 )
-                element = Element.
+                element = Element.model_validate(element_dict)
                 if (
                     self.config.extract_lineage
                     and self.config.workbook_lineage_pattern.allowed(workbook.name)
@@ -379,7 +379,7 @@ class SigmaAPI:
             )
             response.raise_for_status()
             for page_dict in response.json()[Constant.ENTRIES]:
-                page = Page.
+                page = Page.model_validate(page_dict)
                 page.elements = self.get_page_elements(workbook, page)
                 pages.append(page)
             return pages
@@ -400,7 +400,7 @@ class SigmaAPI:
                 response.raise_for_status()
                 response_dict = response.json()
                 for workbook_dict in response_dict[Constant.ENTRIES]:
-                    workbook = Workbook.
+                    workbook = Workbook.model_validate(workbook_dict)
 
                     if workbook.workbookId not in workbook_files_metadata:
                         # Due to a bug in the Sigma API, it seems like the /files endpoint does not
```
datahub/ingestion/source/slack/slack.py

```diff
@@ -260,7 +260,7 @@ class SlackSource(StatefulIngestionSourceBase):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = SlackSourceConfig.
+        config = SlackSourceConfig.model_validate(config_dict)
         return cls(ctx, config)
 
     def get_slack_client(self) -> WebClient:
```
datahub/ingestion/source/snaplogic/snaplogic.py

```diff
@@ -351,5 +351,5 @@ class SnaplogicSource(StatefulIngestionSourceBase):
 
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "SnaplogicSource":
-        config = SnaplogicConfig.
+        config = SnaplogicConfig.model_validate(config_dict)
         return cls(config, ctx)
```
datahub/ingestion/source/snowflake/snowflake_assertion.py

```diff
@@ -91,7 +91,7 @@ class SnowflakeAssertionsHandler:
         self, result_row: dict, discovered_datasets: List[str]
     ) -> Optional[MetadataChangeProposalWrapper]:
         try:
-            result = DataQualityMonitoringResult.
+            result = DataQualityMonitoringResult.model_validate(result_row)
             assertion_guid = result.METRIC_NAME.split("__")[-1].lower()
             status = bool(result.VALUE)  # 1 if PASS, 0 if FAIL
             assertee = self.identifiers.get_dataset_identifier(
```
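The snowflake_config.py hunks below migrate several validators to the V2 `ValidationInfo` pattern: a field validator that needs other fields (for example `include_column_lineage` checking `include_table_lineage`, or `shares` checking `platform_instance`) reads the previously validated fields from `info.data` instead of the V1 `values` dict. A minimal sketch with illustrative fields, not the package's full config:

```python
# Sketch only: reading earlier fields via ValidationInfo.data in Pydantic V2.
from pydantic import BaseModel, ValidationInfo, field_validator


class LineageConfigSketch(BaseModel):  # hypothetical, trimmed-down config
    include_table_lineage: bool = True
    include_column_lineage: bool = True

    @field_validator("include_column_lineage", mode="after")
    @classmethod
    def check_column_lineage(cls, v: bool, info: ValidationInfo) -> bool:
        # info.data only contains fields declared before this one.
        if v and not info.data.get("include_table_lineage"):
            raise ValueError(
                "include_table_lineage must be True for include_column_lineage to be set."
            )
        return v


LineageConfigSketch(include_table_lineage=False, include_column_lineage=False)  # passes
```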
datahub/ingestion/source/snowflake/snowflake_config.py

```diff
@@ -5,7 +5,7 @@ from enum import Enum
 from typing import Dict, List, Optional, Set
 
 import pydantic
-from pydantic import Field,
+from pydantic import Field, ValidationInfo, field_validator, model_validator
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.pattern_utils import UUID_REGEX
@@ -122,10 +122,10 @@ class SnowflakeFilterConfig(SQLFilterConfig):
         description="Whether `schema_pattern` is matched against fully qualified schema name `<catalog>.<schema>`.",
     )
 
-    @
-    def validate_legacy_schema_pattern(
-        schema_pattern: Optional[AllowDenyPattern] =
-        match_fully_qualified_names =
+    @model_validator(mode="after")
+    def validate_legacy_schema_pattern(self) -> "SnowflakeFilterConfig":
+        schema_pattern: Optional[AllowDenyPattern] = self.schema_pattern
+        match_fully_qualified_names = self.match_fully_qualified_names
 
         if (
             schema_pattern is not None
@@ -145,7 +145,7 @@ class SnowflakeFilterConfig(SQLFilterConfig):
             assert isinstance(schema_pattern, AllowDenyPattern)
             schema_pattern.deny.append(r".*INFORMATION_SCHEMA$")
 
-        return
+        return self
 
 
 class SnowflakeIdentifierConfig(
@@ -391,7 +391,8 @@ class SnowflakeV2Config(
         "This may be required in the case of _eg_ temporary tables being created in a different database than the ones in the database_name patterns.",
     )
 
-    @
+    @field_validator("convert_urns_to_lowercase", mode="after")
+    @classmethod
     def validate_convert_urns_to_lowercase(cls, v):
         if not v:
             add_global_warning(
@@ -400,30 +401,31 @@ class SnowflakeV2Config(
 
         return v
 
-    @
-
-
+    @field_validator("include_column_lineage", mode="after")
+    @classmethod
+    def validate_include_column_lineage(cls, v, info):
+        if not info.data.get("include_table_lineage") and v:
             raise ValueError(
                 "include_table_lineage must be True for include_column_lineage to be set."
             )
         return v
 
-    @
-    def validate_unsupported_configs(
-
-
+    @model_validator(mode="after")
+    def validate_unsupported_configs(self) -> "SnowflakeV2Config":
+        if (
+            hasattr(self, "include_read_operational_stats")
+            and self.include_read_operational_stats
+        ):
             raise ValueError(
                 "include_read_operational_stats is not supported. Set `include_read_operational_stats` to False.",
             )
 
-        include_technical_schema =
-        include_profiles =
-            values.get("profiling") is not None and values["profiling"].enabled
-        )
+        include_technical_schema = self.include_technical_schema
+        include_profiles = self.profiling is not None and self.profiling.enabled
         delete_detection_enabled = (
-
-            and
-            and
+            self.stateful_ingestion is not None
+            and self.stateful_ingestion.enabled
+            and self.stateful_ingestion.remove_stale_metadata
         )
 
         # TODO: Allow profiling irrespective of basic schema extraction,
@@ -435,13 +437,14 @@ class SnowflakeV2Config(
                 "Cannot perform Deletion Detection or Profiling without extracting snowflake technical schema. Set `include_technical_schema` to True or disable Deletion Detection and Profiling."
             )
 
-        return
+        return self
 
-    @
+    @field_validator("shares", mode="after")
+    @classmethod
     def validate_shares(
-        cls, shares: Optional[Dict[str, SnowflakeShareConfig]],
+        cls, shares: Optional[Dict[str, SnowflakeShareConfig]], info: ValidationInfo
     ) -> Optional[Dict[str, SnowflakeShareConfig]]:
-        current_platform_instance =
+        current_platform_instance = info.data.get("platform_instance")
 
         if shares:
             # Check: platform_instance should be present
@@ -479,11 +482,12 @@ class SnowflakeV2Config(
 
         return shares
 
-    @
-    def validate_queries_v2_stateful_ingestion(
-        if
-        if
-
+    @model_validator(mode="after")
+    def validate_queries_v2_stateful_ingestion(self) -> "SnowflakeV2Config":
+        if self.use_queries_v2:
+            if (
+                self.enable_stateful_lineage_ingestion
+                or self.enable_stateful_usage_ingestion
             ):
                 logger.warning(
                     "enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion are deprecated "
@@ -491,7 +495,7 @@ class SnowflakeV2Config(
                     "For queries v2, use enable_stateful_time_window instead to enable stateful ingestion "
                     "for the unified time window extraction (lineage + usage + operations + queries)."
                 )
-        return
+        return self
 
     def outbounds(self) -> Dict[str, Set[DatabaseId]]:
         """
```