acryl-datahub 1.3.1__py3-none-any.whl → 1.3.1.1rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/METADATA +2501 -2501
- {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/RECORD +193 -193
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +2 -2
- datahub/api/entities/corpgroup/corpgroup.py +11 -6
- datahub/api/entities/corpuser/corpuser.py +11 -11
- datahub/api/entities/dataproduct/dataproduct.py +47 -27
- datahub/api/entities/dataset/dataset.py +32 -21
- datahub/api/entities/external/lake_formation_external_entites.py +5 -6
- datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
- datahub/api/entities/forms/forms.py +16 -14
- datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
- datahub/cli/check_cli.py +2 -2
- datahub/cli/config_utils.py +3 -3
- datahub/cli/lite_cli.py +9 -7
- datahub/cli/migrate.py +4 -4
- datahub/cli/quickstart_versioning.py +3 -3
- datahub/cli/specific/group_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +1 -1
- datahub/cli/specific/user_cli.py +1 -1
- datahub/configuration/common.py +14 -2
- datahub/configuration/connection_resolver.py +2 -2
- datahub/configuration/git.py +47 -30
- datahub/configuration/import_resolver.py +2 -2
- datahub/configuration/kafka.py +4 -3
- datahub/configuration/time_window_config.py +26 -26
- datahub/configuration/validate_field_deprecation.py +2 -2
- datahub/configuration/validate_field_removal.py +2 -2
- datahub/configuration/validate_field_rename.py +2 -2
- datahub/configuration/validate_multiline_string.py +2 -1
- datahub/emitter/kafka_emitter.py +3 -1
- datahub/emitter/rest_emitter.py +2 -4
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/report.py +1 -1
- datahub/ingestion/api/sink.py +1 -1
- datahub/ingestion/api/source.py +1 -1
- datahub/ingestion/glossary/datahub_classifier.py +11 -8
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
- datahub/ingestion/reporting/file_reporter.py +5 -4
- datahub/ingestion/run/pipeline.py +6 -6
- datahub/ingestion/run/pipeline_config.py +12 -14
- datahub/ingestion/run/sink_callback.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +6 -4
- datahub/ingestion/source/abs/config.py +19 -19
- datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/abs/source.py +2 -2
- datahub/ingestion/source/aws/aws_common.py +1 -1
- datahub/ingestion/source/aws/glue.py +6 -4
- datahub/ingestion/source/aws/sagemaker.py +1 -1
- datahub/ingestion/source/azure/azure_common.py +8 -12
- datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
- datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
- datahub/ingestion/source/datahub/config.py +8 -8
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
- datahub/ingestion/source/dbt/dbt_common.py +39 -37
- datahub/ingestion/source/dbt/dbt_core.py +10 -12
- datahub/ingestion/source/debug/datahub_debug.py +1 -1
- datahub/ingestion/source/delta_lake/config.py +6 -4
- datahub/ingestion/source/dremio/dremio_config.py +10 -6
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/elastic_search.py +4 -3
- datahub/ingestion/source/excel/source.py +1 -1
- datahub/ingestion/source/feast.py +1 -1
- datahub/ingestion/source/file.py +5 -4
- datahub/ingestion/source/fivetran/config.py +17 -16
- datahub/ingestion/source/fivetran/fivetran.py +2 -2
- datahub/ingestion/source/gc/datahub_gc.py +1 -1
- datahub/ingestion/source/gcs/gcs_source.py +8 -10
- datahub/ingestion/source/ge_profiling_config.py +8 -5
- datahub/ingestion/source/grafana/grafana_api.py +2 -2
- datahub/ingestion/source/grafana/grafana_config.py +4 -3
- datahub/ingestion/source/grafana/grafana_source.py +1 -1
- datahub/ingestion/source/grafana/models.py +23 -5
- datahub/ingestion/source/hex/api.py +7 -5
- datahub/ingestion/source/hex/hex.py +4 -3
- datahub/ingestion/source/iceberg/iceberg.py +1 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +10 -10
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +7 -5
- datahub/ingestion/source/looker/looker_config.py +21 -20
- datahub/ingestion/source/looker/lookml_config.py +47 -47
- datahub/ingestion/source/metabase.py +8 -8
- datahub/ingestion/source/metadata/business_glossary.py +2 -2
- datahub/ingestion/source/metadata/lineage.py +13 -8
- datahub/ingestion/source/mlflow.py +1 -1
- datahub/ingestion/source/mode.py +6 -4
- datahub/ingestion/source/mongodb.py +4 -3
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +17 -23
- datahub/ingestion/source/openapi.py +6 -8
- datahub/ingestion/source/powerbi/config.py +33 -32
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
- datahub/ingestion/source/powerbi/powerbi.py +1 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
- datahub/ingestion/source/preset.py +8 -8
- datahub/ingestion/source/pulsar.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
- datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +18 -20
- datahub/ingestion/source/redshift/redshift.py +2 -2
- datahub/ingestion/source/redshift/usage.py +23 -3
- datahub/ingestion/source/s3/config.py +83 -62
- datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/s3/source.py +8 -5
- datahub/ingestion/source/sac/sac.py +5 -4
- datahub/ingestion/source/salesforce.py +3 -2
- datahub/ingestion/source/schema/json_schema.py +2 -2
- datahub/ingestion/source/sigma/data_classes.py +3 -2
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/sigma/sigma_api.py +7 -7
- datahub/ingestion/source/slack/slack.py +1 -1
- datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
- datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_queries.py +1 -1
- datahub/ingestion/source/sql/athena.py +1 -1
- datahub/ingestion/source/sql/clickhouse.py +4 -2
- datahub/ingestion/source/sql/cockroachdb.py +1 -1
- datahub/ingestion/source/sql/druid.py +1 -1
- datahub/ingestion/source/sql/hana.py +1 -1
- datahub/ingestion/source/sql/hive.py +7 -5
- datahub/ingestion/source/sql/hive_metastore.py +1 -1
- datahub/ingestion/source/sql/mssql/source.py +13 -6
- datahub/ingestion/source/sql/mysql.py +1 -1
- datahub/ingestion/source/sql/oracle.py +17 -10
- datahub/ingestion/source/sql/postgres.py +2 -2
- datahub/ingestion/source/sql/presto.py +1 -1
- datahub/ingestion/source/sql/sql_config.py +8 -9
- datahub/ingestion/source/sql/sql_generic.py +1 -1
- datahub/ingestion/source/sql/teradata.py +1 -1
- datahub/ingestion/source/sql/trino.py +1 -1
- datahub/ingestion/source/sql/vertica.py +5 -4
- datahub/ingestion/source/sql_queries.py +11 -8
- datahub/ingestion/source/state/checkpoint.py +2 -2
- datahub/ingestion/source/state/entity_removal_state.py +2 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +9 -9
- datahub/ingestion/source/tableau/tableau.py +14 -16
- datahub/ingestion/source/unity/config.py +33 -34
- datahub/ingestion/source/unity/proxy.py +203 -0
- datahub/ingestion/source/unity/proxy_types.py +91 -0
- datahub/ingestion/source/unity/source.py +27 -2
- datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
- datahub/ingestion/source/usage/usage_common.py +5 -3
- datahub/ingestion/source_config/csv_enricher.py +7 -6
- datahub/ingestion/source_config/operation_config.py +7 -4
- datahub/ingestion/source_config/pulsar.py +11 -15
- datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
- datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
- datahub/ingestion/transformer/add_dataset_properties.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
- datahub/ingestion/transformer/add_dataset_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
- datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
- datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
- datahub/ingestion/transformer/mark_dataset_status.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
- datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/replace_external_url.py +2 -2
- datahub/ingestion/transformer/set_browse_path.py +1 -1
- datahub/ingestion/transformer/tags_to_terms.py +1 -1
- datahub/lite/duckdb_lite.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/sdk/search_filters.py +68 -40
- datahub/secret/datahub_secret_store.py +7 -4
- datahub/secret/file_secret_store.py +1 -1
- datahub/sql_parsing/sqlglot_lineage.py +5 -2
- datahub/testing/check_sql_parser_result.py +2 -2
- datahub/utilities/ingest_utils.py +1 -1
- {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/sql_config.py

@@ -3,7 +3,7 @@ from abc import abstractmethod
 from typing import Any, Dict, Optional
 
 import pydantic
-from pydantic import Field
+from pydantic import Field, model_validator
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import (
@@ -49,7 +49,8 @@ class SQLFilterConfig(ConfigModel):
         description="Regex patterns for views to filter in ingestion. Note: Defaults to table_pattern if not specified. Specify regex to match the entire view name in database.schema.view format. e.g. to match all views starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
     )
 
-    @
+    @model_validator(mode="before")
+    @classmethod
     def view_pattern_is_table_pattern_unless_specified(
         cls, values: Dict[str, Any]
     ) -> Dict[str, Any]:
@@ -120,11 +121,9 @@ class SQLCommonConfig(
             self.profiling.operation_config
         )
 
-    @
-    def ensure_profiling_pattern_is_passed_to_profiling(
-
-    ) -> Dict[str, Any]:
-        profiling: Optional[GEProfilingConfig] = values.get("profiling")
+    @model_validator(mode="after")
+    def ensure_profiling_pattern_is_passed_to_profiling(self):
+        profiling = self.profiling
         # Note: isinstance() check is required here as unity-catalog source reuses
         # SQLCommonConfig with different profiling config than GEProfilingConfig
         if (
@@ -132,8 +131,8 @@ class SQLCommonConfig(
             and isinstance(profiling, GEProfilingConfig)
             and profiling.enabled
         ):
-            profiling._allow_deny_patterns =
-        return
+            profiling._allow_deny_patterns = self.profile_pattern
+        return self
 
     @abstractmethod
     def get_sql_alchemy_url(self):
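The sql_config.py hunks above are representative of the whole release: pydantic v1 root validators that received a `values` dict are rewritten as pydantic v2 `@model_validator(mode="after")` methods that read and mutate `self` and must return it. A minimal runnable sketch of that pattern, using toy class and field names rather than the package's own:

from pydantic import BaseModel, model_validator

class ProfilingConfig(BaseModel):
    enabled: bool = False
    pattern: str = ".*"

class SourceConfig(BaseModel):
    profile_pattern: str = ".*"
    profiling: ProfilingConfig = ProfilingConfig()

    # v2 style: runs after field parsing, receives the constructed model,
    # may mutate fields, and must return self.
    @model_validator(mode="after")
    def push_pattern_down(self) -> "SourceConfig":
        if self.profiling.enabled:
            self.profiling.pattern = self.profile_pattern
        return self

cfg = SourceConfig(profile_pattern="foo.*", profiling={"enabled": True})
assert cfg.profiling.pattern == "foo.*"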
datahub/ingestion/source/sql/teradata.py

@@ -860,7 +860,7 @@ ORDER by DataBaseName, TableName;
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = TeradataConfig.
+        config = TeradataConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     def _init_schema_resolver(self) -> SchemaResolver:
datahub/ingestion/source/sql/trino.py

@@ -413,7 +413,7 @@ class TrinoSource(SQLAlchemySource):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = TrinoConfig.
+        config = TrinoConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     def get_schema_fields_for_column(
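These `create` hunks are the simplest case of the migration: pydantic v1's `parse_obj` becomes v2's `model_validate`, both of which construct a validated model from a plain dict. A hedged sketch of the equivalence on a toy config class (not the package's actual TrinoConfig):

from pydantic import BaseModel

class ToyConfig(BaseModel):
    host_port: str
    database: str = "default"

config_dict = {"host_port": "localhost:8080"}

# pydantic v2 spelling, as used in the new code:
cfg = ToyConfig.model_validate(config_dict)

# the pydantic v1 spelling it replaces would have been:
# cfg = ToyConfig.parse_obj(config_dict)
assert cfg.host_port == "localhost:8080" and cfg.database == "default"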
datahub/ingestion/source/sql/vertica.py

@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tupl
 
 import pydantic
 import pytest
-from pydantic import
+from pydantic import field_validator
 from vertica_sqlalchemy_dialect.base import VerticaInspector
 
 from datahub.configuration.common import AllowDenyPattern
@@ -105,8 +105,9 @@ class VerticaConfig(BasicSQLAlchemyConfig):
     # defaults
     scheme: str = pydantic.Field(default="vertica+vertica_python")
 
-    @
-
+    @field_validator("host_port", mode="after")
+    @classmethod
+    def clean_host_port(cls, v: str) -> str:
         return config_clean.remove_protocol(v)
 
 
@@ -138,7 +139,7 @@ class VerticaSource(SQLAlchemySource):
 
     @classmethod
    def create(cls, config_dict: Dict, ctx: PipelineContext) -> "VerticaSource":
-        config = VerticaConfig.
+        config = VerticaConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
datahub/ingestion/source/sql_queries.py

@@ -5,10 +5,10 @@ import re
 from dataclasses import dataclass, field
 from datetime import datetime
 from functools import partial
-from typing import ClassVar, Iterable, List, Optional, Union, cast
+from typing import Any, ClassVar, Iterable, List, Optional, Union, cast
 
 import smart_open
-from pydantic import BaseModel, Field,
+from pydantic import BaseModel, Field, field_validator
 
 from datahub.configuration.common import HiddenFromDocs
 from datahub.configuration.datetimes import parse_user_datetime
@@ -450,19 +450,22 @@ class QueryEntry(BaseModel):
     class Config:
         arbitrary_types_allowed = True
 
-    @
-
+    @field_validator("timestamp", mode="before")
+    @classmethod
+    def parse_timestamp(cls, v: Any) -> Any:
         return None if v is None else parse_user_datetime(str(v))
 
-    @
-
+    @field_validator("user", mode="before")
+    @classmethod
+    def parse_user(cls, v: Any) -> Any:
         if v is None:
             return None
 
         return v if isinstance(v, CorpUserUrn) else CorpUserUrn(v)
 
-    @
-
+    @field_validator("downstream_tables", "upstream_tables", mode="before")
+    @classmethod
+    def parse_tables(cls, v: Any) -> Any:
         if not v:
             return []
 
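The QueryEntry hunk shows the field-level counterpart: the removed decorators are truncated in this view, but they were presumably v1 `@validator(..., pre=True)`, which becomes v2 `@field_validator(..., mode="before")` stacked with `@classmethod`. With `mode="before"` the validator sees the raw input before pydantic's own coercion, which is what lets `parse_timestamp` accept strings, datetimes, or None. A small sketch under those assumptions, with a simplified parser standing in for `parse_user_datetime`:

from datetime import datetime
from typing import Any, Optional

from pydantic import BaseModel, field_validator

class ToyQueryEntry(BaseModel):
    timestamp: Optional[datetime] = None

    # mode="before": v is the raw value (str, datetime, None, ...) as passed in,
    # seen before pydantic attempts its own datetime coercion.
    @field_validator("timestamp", mode="before")
    @classmethod
    def parse_timestamp(cls, v: Any) -> Any:
        return None if v is None else datetime.fromisoformat(str(v))

assert ToyQueryEntry(timestamp="2024-01-01T00:00:00").timestamp == datetime(2024, 1, 1)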
datahub/ingestion/source/state/checkpoint.py

@@ -163,7 +163,7 @@ class Checkpoint(Generic[StateType]):
         )
         state_as_dict["version"] = checkpoint_aspect.state.formatVersion
         state_as_dict["serde"] = checkpoint_aspect.state.serde
-        return state_class.
+        return state_class.model_validate(state_as_dict)
 
     @staticmethod
     def _from_base85_json_bytes(
@@ -179,7 +179,7 @@ class Checkpoint(Generic[StateType]):
         state_as_dict = json.loads(state_uncompressed.decode("utf-8"))
         state_as_dict["version"] = checkpoint_aspect.state.formatVersion
         state_as_dict["serde"] = checkpoint_aspect.state.serde
-        return state_class.
+        return state_class.model_validate(state_as_dict)
 
     def to_checkpoint_aspect(
         self, max_allowed_state_size: int
datahub/ingestion/source/state/entity_removal_state.py

@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Tuple, Type
 
 import pydantic
+from pydantic import model_validator
 
 from datahub.emitter.mce_builder import make_assertion_urn, make_container_urn
 from datahub.ingestion.source.state.checkpoint import CheckpointStateBase
@@ -59,7 +60,7 @@ def pydantic_state_migrator(mapping: Dict[str, str]) -> "V1RootValidator":
 
         return values
 
-    return
+    return model_validator(mode="before")(_validate_field_rename)
 
 
 class GenericCheckpointState(CheckpointStateBase):
datahub/ingestion/source/state/stateful_ingestion_base.py

@@ -3,7 +3,7 @@ from dataclasses import dataclass
 from typing import Any, Dict, Generic, Optional, Type, TypeVar
 
 import pydantic
-from pydantic import
+from pydantic import model_validator
 from pydantic.fields import Field
 
 from datahub.configuration.common import (
@@ -73,14 +73,14 @@ class StatefulIngestionConfig(ConfigModel):
         description="If set to True, ignores the current checkpoint state.",
     )
 
-    @
-    def validate_config(
-        if
-        if
-
+    @model_validator(mode="after")
+    def validate_config(self) -> "StatefulIngestionConfig":
+        if self.enabled:
+            if self.state_provider is None:
+                self.state_provider = DynamicTypedStateProviderConfig(
                     type="datahub", config={}
                 )
-        return
+        return self
 
 
 CustomConfig = TypeVar("CustomConfig", bound=StatefulIngestionConfig)
@@ -110,17 +110,19 @@ class StatefulLineageConfigMixin(ConfigModel):
         "store_last_lineage_extraction_timestamp", "enable_stateful_lineage_ingestion"
     )
 
-    @
-    def lineage_stateful_option_validator(
-
-
-        if
-
-
-
-
-
-
+    @model_validator(mode="after")
+    def lineage_stateful_option_validator(self) -> "StatefulLineageConfigMixin":
+        try:
+            sti = getattr(self, "stateful_ingestion", None)
+            if not sti or not getattr(sti, "enabled", False):
+                if getattr(self, "enable_stateful_lineage_ingestion", False):
+                    logger.warning(
+                        "Stateful ingestion is disabled, disabling enable_stateful_lineage_ingestion config option as well"
+                    )
+                    self.enable_stateful_lineage_ingestion = False
+        except (AttributeError, RecursionError) as e:
+            logger.debug(f"Skipping stateful lineage validation due to: {e}")
+        return self
 
 
 class StatefulProfilingConfigMixin(ConfigModel):
@@ -135,16 +137,19 @@ class StatefulProfilingConfigMixin(ConfigModel):
         "store_last_profiling_timestamps", "enable_stateful_profiling"
     )
 
-    @
-    def profiling_stateful_option_validator(
-
-
-        if
-
-
-
-
-
+    @model_validator(mode="after")
+    def profiling_stateful_option_validator(self) -> "StatefulProfilingConfigMixin":
+        try:
+            sti = getattr(self, "stateful_ingestion", None)
+            if not sti or not getattr(sti, "enabled", False):
+                if getattr(self, "enable_stateful_profiling", False):
+                    logger.warning(
+                        "Stateful ingestion is disabled, disabling enable_stateful_profiling config option as well"
+                    )
+                    self.enable_stateful_profiling = False
+        except (AttributeError, RecursionError) as e:
+            logger.debug(f"Skipping stateful profiling validation due to: {e}")
+        return self
 
 
 class StatefulUsageConfigMixin(BaseTimeWindowConfig):
@@ -161,16 +166,21 @@ class StatefulUsageConfigMixin(BaseTimeWindowConfig):
         "store_last_usage_extraction_timestamp", "enable_stateful_usage_ingestion"
     )
 
-    @
-    def last_usage_extraction_stateful_option_validator(
-
-
-
-
-
-        )
-
-
+    @model_validator(mode="after")
+    def last_usage_extraction_stateful_option_validator(
+        self,
+    ) -> "StatefulUsageConfigMixin":
+        try:
+            sti = getattr(self, "stateful_ingestion", None)
+            if not sti or not getattr(sti, "enabled", False):
+                if getattr(self, "enable_stateful_usage_ingestion", False):
+                    logger.warning(
+                        "Stateful ingestion is disabled, disabling enable_stateful_usage_ingestion config option as well"
+                    )
+                    self.enable_stateful_usage_ingestion = False
+        except (AttributeError, RecursionError) as e:
+            logger.debug(f"Skipping stateful usage validation due to: {e}")
+        return self
 
 
 class StatefulTimeWindowConfigMixin(BaseTimeWindowConfig):
@@ -185,16 +195,16 @@ class StatefulTimeWindowConfigMixin(BaseTimeWindowConfig):
         "and queries together from a single audit log and uses a unified time window.",
     )
 
-    @
-    def time_window_stateful_option_validator(
-        sti =
-        if not sti or not sti
-            if
+    @model_validator(mode="after")
+    def time_window_stateful_option_validator(self) -> "StatefulTimeWindowConfigMixin":
+        sti = getattr(self, "stateful_ingestion", None)
+        if not sti or not getattr(sti, "enabled", False):
+            if getattr(self, "enable_stateful_time_window", False):
                 logger.warning(
                     "Stateful ingestion is disabled, disabling enable_stateful_time_window config option as well"
                 )
-
-            return
+                self.enable_stateful_time_window = False
+        return self
 
 
 @dataclass
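The rewritten mixin validators all share one defensive shape: `mode="after"`, `getattr` with defaults instead of direct attribute access, and `AttributeError`/`RecursionError` downgraded to a debug log, presumably because these mixins are composed into larger configs where `stateful_ingestion` may not exist. A self-contained sketch of that shape (toy classes and simplified log messages, not the package's own):

import logging
from typing import Optional

from pydantic import BaseModel, model_validator

logger = logging.getLogger(__name__)

class ToyStatefulConfig(BaseModel):
    enabled: bool = False

class ToyLineageMixin(BaseModel):
    stateful_ingestion: Optional[ToyStatefulConfig] = None
    enable_stateful_lineage_ingestion: bool = True

    @model_validator(mode="after")
    def lineage_stateful_option_validator(self) -> "ToyLineageMixin":
        try:
            # getattr with defaults: tolerate configs that lack these fields
            sti = getattr(self, "stateful_ingestion", None)
            if not sti or not getattr(sti, "enabled", False):
                if getattr(self, "enable_stateful_lineage_ingestion", False):
                    logger.warning("stateful ingestion disabled; clearing lineage flag")
                    self.enable_stateful_lineage_ingestion = False
        except (AttributeError, RecursionError) as e:
            logger.debug(f"skipping stateful lineage validation due to: {e}")
        return self

assert ToyLineageMixin().enable_stateful_lineage_ingestion is False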
datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py

@@ -40,7 +40,7 @@ class DatahubIngestionCheckpointingProvider(IngestionCheckpointingProviderBase):
     def create(
         cls, config_dict: Dict[str, Any], ctx: PipelineContext
     ) -> "DatahubIngestionCheckpointingProvider":
-        config = DatahubIngestionStateProviderConfig.
+        config = DatahubIngestionStateProviderConfig.model_validate(config_dict)
         if config.datahub_api is not None:
             return cls(DataHubGraph(config.datahub_api))
         elif ctx.graph:
datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py

@@ -32,7 +32,7 @@ class FileIngestionCheckpointingProvider(IngestionCheckpointingProviderBase):
     def create(
         cls, config_dict: Dict[str, Any], ctx: PipelineContext
     ) -> "FileIngestionCheckpointingProvider":
-        config = FileIngestionStateProviderConfig.
+        config = FileIngestionStateProviderConfig.model_validate(config_dict)
         return cls(config)
 
     def get_latest_checkpoint(
datahub/ingestion/source/superset.py

@@ -9,7 +9,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 import dateutil.parser as dp
 import requests
 import sqlglot
-from pydantic import BaseModel,
+from pydantic import BaseModel, field_validator, model_validator
 from pydantic.fields import Field
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
@@ -246,16 +246,16 @@ class SupersetConfig(
         # This is required to allow preset configs to get parsed
         extra = "allow"
 
-    @
-
+    @field_validator("connect_uri", "display_uri", mode="after")
+    @classmethod
+    def remove_trailing_slash(cls, v: str) -> str:
         return config_clean.remove_trailing_slashes(v)
 
-    @
-    def default_display_uri_to_connect_uri(
-
-
-
-        return values
+    @model_validator(mode="after")
+    def default_display_uri_to_connect_uri(self) -> "SupersetConfig":
+        if self.display_uri is None:
+            self.display_uri = self.connect_uri
+        return self
 
 
 def get_metric_name(metric):
datahub/ingestion/source/tableau/tableau.py

@@ -25,7 +25,7 @@ from urllib.parse import quote, urlparse
 
 import dateutil.parser as dp
 import tableauserverclient as TSC
-from pydantic import
+from pydantic import field_validator, model_validator
 from pydantic.fields import Field
 from requests.adapters import HTTPAdapter
 from tableauserverclient import (
@@ -257,8 +257,9 @@ class TableauConnectionConfig(ConfigModel):
         description="When enabled, extracts column-level lineage from Tableau Datasources",
     )
 
-    @
-
+    @field_validator("connect_uri", mode="after")
+    @classmethod
+    def remove_trailing_slash(cls, v: str) -> str:
         return config_clean.remove_trailing_slashes(v)
 
     def get_tableau_auth(
@@ -652,8 +653,9 @@ class TableauConfig(
         "fetch_size",
     )
 
-    #
-    @
+    # mode = "before" because we want to take some decision before pydantic initialize the configuration to default values
+    @model_validator(mode="before")
+    @classmethod
     def projects_backward_compatibility(cls, values: Dict) -> Dict:
         # In-place update of the input dict would cause state contamination. This was discovered through test failures
         # in test_hex.py where the same dict is reused.
@@ -683,27 +685,23 @@ class TableauConfig(
 
         return values
 
-    @
-    def validate_config_values(
-        tags_for_hidden_assets = values.get("tags_for_hidden_assets")
-        ingest_tags = values.get("ingest_tags")
+    @model_validator(mode="after")
+    def validate_config_values(self) -> "TableauConfig":
         if (
-            not ingest_tags
-            and tags_for_hidden_assets
-            and len(tags_for_hidden_assets) > 0
+            not self.ingest_tags
+            and self.tags_for_hidden_assets
+            and len(self.tags_for_hidden_assets) > 0
         ):
             raise ValueError(
                 "tags_for_hidden_assets is only allowed with ingest_tags enabled. Be aware that this will overwrite tags entered from the UI."
             )
 
-        use_email_as_username
-        ingest_owner = values.get("ingest_owner")
-        if use_email_as_username and not ingest_owner:
+        if self.use_email_as_username and not self.ingest_owner:
             raise ValueError(
                 "use_email_as_username requires ingest_owner to be enabled."
            )
 
-        return
+        return self
 
 
 class WorkbookKey(ContainerKey):
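The comment carried in the `projects_backward_compatibility` hunk above explains why that one validator stays `mode="before"`: a before-validator receives the raw input mapping before pydantic applies defaults, so it can still tell whether the user actually supplied a key. A toy sketch of that distinction (illustrative names and simplified remapping logic, not the package's actual rules):

from typing import Any, Dict, List, Optional

from pydantic import BaseModel, model_validator

class ToyProjectsConfig(BaseModel):
    projects: Optional[List[str]] = None
    project_pattern: Optional[str] = None

    @model_validator(mode="before")
    @classmethod
    def projects_backward_compatibility(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        # Copy first: in-place mutation of the caller's dict would leak state
        # across instantiations (the same pitfall the hunk's comment notes).
        values = dict(values)
        # Defaults are not applied yet, so a missing key means "not configured".
        if "projects" in values and "project_pattern" not in values:
            values["project_pattern"] = "|".join(values["projects"])
        return values

assert ToyProjectsConfig(projects=["a", "b"]).project_pattern == "a|b"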
datahub/ingestion/source/unity/config.py

@@ -1,10 +1,10 @@
 import logging
 import os
 from datetime import datetime, timedelta, timezone
-from typing import
+from typing import Dict, List, Optional, Union
 
 import pydantic
-from pydantic import Field
+from pydantic import Field, field_validator, model_validator
 from typing_extensions import Literal
 
 from datahub.configuration.common import (
@@ -397,13 +397,15 @@ class UnityCatalogSourceConfig(
         default=None, description="Unity Catalog Stateful Ingestion Config."
     )
 
-    @
+    @field_validator("start_time", mode="after")
+    @classmethod
     def within_thirty_days(cls, v: datetime) -> datetime:
         if (datetime.now(timezone.utc) - v).days > 30:
             raise ValueError("Query history is only maintained for 30 days.")
         return v
 
-    @
+    @field_validator("workspace_url", mode="after")
+    @classmethod
     def workspace_url_should_start_with_http_scheme(cls, workspace_url: str) -> str:
         if not workspace_url.lower().startswith(("http://", "https://")):
             raise ValueError(
@@ -411,7 +413,8 @@ class UnityCatalogSourceConfig(
             )
         return workspace_url
 
-    @
+    @field_validator("include_metastore", mode="after")
+    @classmethod
     def include_metastore_warning(cls, v: bool) -> bool:
         if v:
             msg = (
@@ -424,60 +427,56 @@ class UnityCatalogSourceConfig(
             add_global_warning(msg)
         return v
 
-    @
-    def set_warehouse_id_from_profiling(
-        profiling
-
-
-        if not values.get("warehouse_id") and profiling and profiling.warehouse_id:
-            values["warehouse_id"] = profiling.warehouse_id
+    @model_validator(mode="after")
+    def set_warehouse_id_from_profiling(self):
+        profiling = self.profiling
+        if not self.warehouse_id and profiling and profiling.warehouse_id:
+            self.warehouse_id = profiling.warehouse_id
         if (
-
+            self.warehouse_id
             and profiling
             and profiling.warehouse_id
-            and
+            and self.warehouse_id != profiling.warehouse_id
         ):
             raise ValueError(
                 "When `warehouse_id` is set, it must match the `warehouse_id` in `profiling`."
             )
 
-        if
-            profiling.warehouse_id =
+        if self.warehouse_id and profiling and not profiling.warehouse_id:
+            profiling.warehouse_id = self.warehouse_id
 
         if profiling and profiling.enabled and not profiling.warehouse_id:
             raise ValueError("warehouse_id must be set when profiling is enabled.")
 
-        return
+        return self
 
-    @
-    def validate_lineage_data_source_with_warehouse(
-
-    ) -> Dict[str, Any]:
-        lineage_data_source = values.get("lineage_data_source", LineageDataSource.AUTO)
-        warehouse_id = values.get("warehouse_id")
+    @model_validator(mode="after")
+    def validate_lineage_data_source_with_warehouse(self):
+        lineage_data_source = self.lineage_data_source or LineageDataSource.AUTO
 
-        if
+        if (
+            lineage_data_source == LineageDataSource.SYSTEM_TABLES
+            and not self.warehouse_id
+        ):
             raise ValueError(
                 f"lineage_data_source='{LineageDataSource.SYSTEM_TABLES.value}' requires warehouse_id to be set"
             )
 
-        return
+        return self
 
-    @
-    def validate_usage_data_source_with_warehouse(
-
-    ) -> Dict[str, Any]:
-        usage_data_source = values.get("usage_data_source", UsageDataSource.AUTO)
-        warehouse_id = values.get("warehouse_id")
+    @model_validator(mode="after")
+    def validate_usage_data_source_with_warehouse(self):
+        usage_data_source = self.usage_data_source or UsageDataSource.AUTO
 
-        if usage_data_source == UsageDataSource.SYSTEM_TABLES and not warehouse_id:
+        if usage_data_source == UsageDataSource.SYSTEM_TABLES and not self.warehouse_id:
             raise ValueError(
                 f"usage_data_source='{UsageDataSource.SYSTEM_TABLES.value}' requires warehouse_id to be set"
            )
 
-        return
+        return self
 
-    @
+    @field_validator("schema_pattern", mode="before")
+    @classmethod
     def schema_pattern_should__always_deny_information_schema(
         cls, v: AllowDenyPattern
     ) -> AllowDenyPattern: