acryl-datahub 1.3.0.1rc9__py3-none-any.whl → 1.3.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/METADATA +2550 -2543
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/RECORD +263 -261
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +2 -2
- datahub/api/entities/corpgroup/corpgroup.py +11 -6
- datahub/api/entities/corpuser/corpuser.py +11 -11
- datahub/api/entities/dataproduct/dataproduct.py +47 -27
- datahub/api/entities/dataset/dataset.py +32 -21
- datahub/api/entities/external/lake_formation_external_entites.py +5 -6
- datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
- datahub/api/entities/forms/forms.py +16 -14
- datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
- datahub/cli/check_cli.py +2 -2
- datahub/cli/config_utils.py +3 -3
- datahub/cli/lite_cli.py +9 -7
- datahub/cli/migrate.py +4 -4
- datahub/cli/quickstart_versioning.py +3 -3
- datahub/cli/specific/group_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +1 -1
- datahub/cli/specific/user_cli.py +1 -1
- datahub/configuration/common.py +14 -2
- datahub/configuration/connection_resolver.py +2 -2
- datahub/configuration/git.py +47 -30
- datahub/configuration/import_resolver.py +2 -2
- datahub/configuration/kafka.py +4 -3
- datahub/configuration/time_window_config.py +26 -26
- datahub/configuration/validate_field_deprecation.py +2 -2
- datahub/configuration/validate_field_removal.py +2 -2
- datahub/configuration/validate_field_rename.py +2 -2
- datahub/configuration/validate_multiline_string.py +2 -1
- datahub/emitter/kafka_emitter.py +3 -1
- datahub/emitter/rest_emitter.py +2 -4
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/report.py +1 -1
- datahub/ingestion/api/sink.py +1 -1
- datahub/ingestion/api/source.py +1 -1
- datahub/ingestion/glossary/datahub_classifier.py +11 -8
- datahub/ingestion/graph/client.py +5 -1
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
- datahub/ingestion/reporting/file_reporter.py +5 -4
- datahub/ingestion/run/pipeline.py +7 -6
- datahub/ingestion/run/pipeline_config.py +12 -14
- datahub/ingestion/run/sink_callback.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +6 -4
- datahub/ingestion/source/abs/config.py +19 -19
- datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/abs/source.py +2 -2
- datahub/ingestion/source/aws/aws_common.py +1 -1
- datahub/ingestion/source/aws/glue.py +6 -4
- datahub/ingestion/source/aws/sagemaker.py +1 -1
- datahub/ingestion/source/azure/azure_common.py +8 -12
- datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
- datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
- datahub/ingestion/source/datahub/config.py +8 -8
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
- datahub/ingestion/source/dbt/dbt_common.py +39 -37
- datahub/ingestion/source/dbt/dbt_core.py +10 -12
- datahub/ingestion/source/debug/datahub_debug.py +1 -1
- datahub/ingestion/source/delta_lake/config.py +6 -4
- datahub/ingestion/source/dremio/dremio_api.py +212 -78
- datahub/ingestion/source/dremio/dremio_config.py +10 -6
- datahub/ingestion/source/dremio/dremio_entities.py +55 -39
- datahub/ingestion/source/dremio/dremio_profiling.py +14 -3
- datahub/ingestion/source/dremio/dremio_source.py +24 -26
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/elastic_search.py +110 -32
- datahub/ingestion/source/excel/source.py +1 -1
- datahub/ingestion/source/feast.py +1 -1
- datahub/ingestion/source/file.py +5 -4
- datahub/ingestion/source/fivetran/config.py +17 -16
- datahub/ingestion/source/fivetran/fivetran.py +2 -2
- datahub/ingestion/source/gc/datahub_gc.py +1 -1
- datahub/ingestion/source/gcs/gcs_source.py +8 -10
- datahub/ingestion/source/ge_profiling_config.py +8 -5
- datahub/ingestion/source/grafana/grafana_api.py +2 -2
- datahub/ingestion/source/grafana/grafana_config.py +4 -3
- datahub/ingestion/source/grafana/grafana_source.py +1 -1
- datahub/ingestion/source/grafana/models.py +23 -5
- datahub/ingestion/source/hex/api.py +7 -5
- datahub/ingestion/source/hex/hex.py +4 -3
- datahub/ingestion/source/iceberg/iceberg.py +1 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +10 -10
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +7 -5
- datahub/ingestion/source/looker/looker_config.py +21 -20
- datahub/ingestion/source/looker/lookml_config.py +47 -47
- datahub/ingestion/source/metabase.py +8 -8
- datahub/ingestion/source/metadata/business_glossary.py +2 -2
- datahub/ingestion/source/metadata/lineage.py +13 -8
- datahub/ingestion/source/mlflow.py +1 -1
- datahub/ingestion/source/mode.py +6 -4
- datahub/ingestion/source/mongodb.py +4 -3
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +17 -23
- datahub/ingestion/source/openapi.py +6 -8
- datahub/ingestion/source/powerbi/config.py +33 -32
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
- datahub/ingestion/source/powerbi/powerbi.py +1 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
- datahub/ingestion/source/preset.py +8 -8
- datahub/ingestion/source/pulsar.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
- datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +18 -20
- datahub/ingestion/source/redshift/redshift.py +2 -2
- datahub/ingestion/source/redshift/usage.py +23 -3
- datahub/ingestion/source/s3/config.py +83 -62
- datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/s3/source.py +8 -5
- datahub/ingestion/source/sac/sac.py +5 -4
- datahub/ingestion/source/salesforce.py +3 -2
- datahub/ingestion/source/schema/json_schema.py +2 -2
- datahub/ingestion/source/sigma/data_classes.py +3 -2
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/sigma/sigma_api.py +7 -7
- datahub/ingestion/source/slack/slack.py +1 -1
- datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
- datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_queries.py +28 -4
- datahub/ingestion/source/sql/athena.py +1 -1
- datahub/ingestion/source/sql/clickhouse.py +4 -2
- datahub/ingestion/source/sql/cockroachdb.py +1 -1
- datahub/ingestion/source/sql/druid.py +1 -1
- datahub/ingestion/source/sql/hana.py +1 -1
- datahub/ingestion/source/sql/hive.py +7 -5
- datahub/ingestion/source/sql/hive_metastore.py +1 -1
- datahub/ingestion/source/sql/mssql/source.py +13 -6
- datahub/ingestion/source/sql/mysql.py +1 -1
- datahub/ingestion/source/sql/oracle.py +17 -10
- datahub/ingestion/source/sql/postgres.py +2 -2
- datahub/ingestion/source/sql/presto.py +1 -1
- datahub/ingestion/source/sql/sql_config.py +8 -9
- datahub/ingestion/source/sql/sql_generic.py +1 -1
- datahub/ingestion/source/sql/teradata.py +1 -1
- datahub/ingestion/source/sql/trino.py +1 -1
- datahub/ingestion/source/sql/vertica.py +5 -4
- datahub/ingestion/source/sql_queries.py +174 -22
- datahub/ingestion/source/state/checkpoint.py +2 -2
- datahub/ingestion/source/state/entity_removal_state.py +2 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +9 -9
- datahub/ingestion/source/tableau/tableau.py +14 -16
- datahub/ingestion/source/unity/azure_auth_config.py +15 -0
- datahub/ingestion/source/unity/config.py +51 -34
- datahub/ingestion/source/unity/connection.py +7 -1
- datahub/ingestion/source/unity/connection_test.py +1 -1
- datahub/ingestion/source/unity/proxy.py +216 -7
- datahub/ingestion/source/unity/proxy_types.py +91 -0
- datahub/ingestion/source/unity/source.py +29 -3
- datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
- datahub/ingestion/source/usage/usage_common.py +5 -3
- datahub/ingestion/source_config/csv_enricher.py +7 -6
- datahub/ingestion/source_config/operation_config.py +7 -4
- datahub/ingestion/source_config/pulsar.py +11 -15
- datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
- datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
- datahub/ingestion/transformer/add_dataset_properties.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
- datahub/ingestion/transformer/add_dataset_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
- datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
- datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
- datahub/ingestion/transformer/mark_dataset_status.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
- datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/replace_external_url.py +2 -2
- datahub/ingestion/transformer/set_browse_path.py +1 -1
- datahub/ingestion/transformer/tags_to_terms.py +1 -1
- datahub/lite/duckdb_lite.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/_internal_schema_classes.py +62 -2
- datahub/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +2 -0
- datahub/metadata/schema.avsc +271 -91
- datahub/metadata/schemas/ApplicationProperties.avsc +5 -2
- datahub/metadata/schemas/AssertionInfo.avsc +48 -5
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +8 -4
- datahub/metadata/schemas/ChartInfo.avsc +12 -5
- datahub/metadata/schemas/ContainerProperties.avsc +12 -5
- datahub/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
- datahub/metadata/schemas/CorpGroupInfo.avsc +7 -3
- datahub/metadata/schemas/CorpUserInfo.avsc +5 -2
- datahub/metadata/schemas/CorpUserSettings.avsc +4 -2
- datahub/metadata/schemas/DashboardInfo.avsc +16 -4
- datahub/metadata/schemas/DataFlowInfo.avsc +11 -5
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +4 -2
- datahub/metadata/schemas/DataJobInfo.avsc +9 -4
- datahub/metadata/schemas/DataPlatformInfo.avsc +3 -1
- datahub/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
- datahub/metadata/schemas/DataProductProperties.avsc +5 -2
- datahub/metadata/schemas/DataTypeInfo.avsc +5 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/DatasetProperties.avsc +12 -5
- datahub/metadata/schemas/DomainProperties.avsc +7 -3
- datahub/metadata/schemas/EditableContainerProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDashboardProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataJobProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDatasetProperties.avsc +2 -1
- datahub/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelProperties.avsc +2 -1
- datahub/metadata/schemas/EditableNotebookProperties.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +5 -3
- datahub/metadata/schemas/EntityTypeInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalTags.avsc +3 -2
- datahub/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermInfo.avsc +3 -1
- datahub/metadata/schemas/InputFields.avsc +3 -2
- datahub/metadata/schemas/MLFeatureKey.avsc +3 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +3 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +3 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLModelProperties.avsc +4 -2
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +3 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +124 -50
- datahub/metadata/schemas/NotebookInfo.avsc +5 -2
- datahub/metadata/schemas/Ownership.avsc +3 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -1
- datahub/metadata/schemas/RoleProperties.avsc +3 -1
- datahub/metadata/schemas/SchemaFieldInfo.avsc +3 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -2
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
- datahub/metadata/schemas/TagProperties.avsc +3 -1
- datahub/metadata/schemas/TestInfo.avsc +2 -1
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/_all_entities.py +2 -0
- datahub/sdk/search_filters.py +68 -40
- datahub/sdk/tag.py +112 -0
- datahub/secret/datahub_secret_store.py +7 -4
- datahub/secret/file_secret_store.py +1 -1
- datahub/sql_parsing/schema_resolver.py +29 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +15 -0
- datahub/sql_parsing/sqlglot_lineage.py +5 -2
- datahub/testing/check_sql_parser_result.py +2 -2
- datahub/utilities/ingest_utils.py +1 -1
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/top_level.txt +0 -0

datahub/ingestion/reporting/file_reporter.py
@@ -2,7 +2,7 @@ import json
 import logging
 from typing import Any, Dict

-from pydantic import
+from pydantic import field_validator

 from datahub.configuration.common import ConfigModel
 from datahub.ingestion.api.common import PipelineContext
@@ -16,8 +16,9 @@ class FileReporterConfig(ConfigModel):
     filename: str
     format: str = "json"

-    @
-
+    @field_validator("format", mode="after")
+    @classmethod
+    def only_json_supported(cls, v: str) -> str:
         if v and v.lower() != "json":
             raise ValueError(
                 f"Format {v} is not yet supported. Only json is supported at this time"
@@ -33,7 +34,7 @@ class FileReporter(PipelineRunListener):
         ctx: PipelineContext,
         sink: Sink,
     ) -> PipelineRunListener:
-        reporter_config = FileReporterConfig.
+        reporter_config = FileReporterConfig.model_validate(config_dict)
         return cls(reporter_config)

     def __init__(self, reporter_config: FileReporterConfig) -> None:
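
Across these hunks the package moves from the Pydantic v1 validator API to v2. A minimal, self-contained sketch of the v2 field-validator pattern the new code adopts; the ReportConfig class and its fields below are illustrative stand-ins, not taken from the package:

from pydantic import BaseModel, field_validator


class ReportConfig(BaseModel):
    filename: str
    format: str = "json"

    # Pydantic v2 field validators are declared with @field_validator and
    # must be classmethods; mode="after" runs on the already-parsed value.
    @field_validator("format", mode="after")
    @classmethod
    def only_json_supported(cls, v: str) -> str:
        if v and v.lower() != "json":
            raise ValueError(f"Format {v} is not yet supported")
        return v


ReportConfig(filename="report.json")  # accepted, default format "json"
# ReportConfig(filename="report.csv", format="csv") would raise a ValidationError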

datahub/ingestion/run/pipeline.py
@@ -215,7 +215,7 @@ class Pipeline:
         sink_class = sink_registry.get(self.sink_type)

         with _add_init_error_context(f"configure the sink ({self.sink_type})"):
-            sink_config = self.config.sink.
+            sink_config = self.config.sink.model_dump().get("config") or {}
             self.sink = exit_stack.enter_context(
                 sink_class.create(sink_config, self.ctx)
             )
@@ -245,7 +245,7 @@
         ):
             self.source = inner_exit_stack.enter_context(
                 source_class.create(
-                    self.config.source.
+                    self.config.source.model_dump().get("config", {}), self.ctx
                 )
             )
             logger.debug(
@@ -288,7 +288,7 @@
         for transformer in self.config.transformers:
             transformer_type = transformer.type
             transformer_class = transform_registry.get(transformer_type)
-            transformer_config = transformer.
+            transformer_config = transformer.model_dump().get("config", {})
             self.transformers.append(
                 transformer_class.create(transformer_config, self.ctx)
             )
@@ -310,12 +310,12 @@
             reporter.type for reporter in self.config.reporting
         ]:
             self.config.reporting.append(
-                ReporterConfig.
+                ReporterConfig.model_validate({"type": "datahub"})
             )
         elif report_to:
             # we assume this is a file name, and add the file reporter
             self.config.reporting.append(
-                ReporterConfig.
+                ReporterConfig.model_validate(
                     {"type": "file", "config": {"filename": report_to}}
                 )
             )
@@ -323,7 +323,7 @@
         for reporter in self.config.reporting:
             reporter_type = reporter.type
             reporter_class = reporting_provider_registry.get(reporter_type)
-            reporter_config_dict = reporter.
+            reporter_config_dict = reporter.model_dump().get("config", {})
             try:
                 self.reporters.append(
                     reporter_class.create(
@@ -558,6 +558,7 @@

             self.process_commits()
             self.final_status = PipelineStatus.COMPLETED
+
         except (SystemExit, KeyboardInterrupt):
             self.final_status = PipelineStatus.CANCELLED
             logger.error("Caught error", exc_info=True)
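
The pipeline hunks swap the Pydantic v1 dict-dumping call for v2's model_dump() when pulling the nested "config" section out of a typed block. A small sketch of that pattern; DynamicTypedConfig here is a simplified stand-in for the package's class of the same name:

from typing import Optional

from pydantic import BaseModel


class DynamicTypedConfig(BaseModel):  # simplified stand-in
    type: str
    config: Optional[dict] = None


sink = DynamicTypedConfig(type="datahub-rest", config={"server": "http://localhost:8080"})

# Pydantic v2 replaces .dict() with .model_dump(); the nested "config"
# section is then read from the dumped dict, falling back to {}.
sink_config = sink.model_dump().get("config") or {}
assert sink_config == {"server": "http://localhost:8080"}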

datahub/ingestion/run/pipeline_config.py
@@ -2,9 +2,9 @@ import datetime
 import logging
 import random
 import string
-from typing import
+from typing import Dict, List, Optional

-from pydantic import Field,
+from pydantic import Field, model_validator

 from datahub.configuration.common import ConfigModel, DynamicTypedConfig, HiddenFromDocs
 from datahub.ingestion.graph.config import DatahubClientConfig
@@ -96,30 +96,28 @@ class PipelineConfig(ConfigModel):
         None  # the raw dict that was parsed to construct this config
     )

-    @
-    def run_id_should_be_semantic(
-
-    ) -> str:
-        if v == DEFAULT_RUN_ID:
+    @model_validator(mode="after")
+    def run_id_should_be_semantic(self) -> "PipelineConfig":
+        if self.run_id == DEFAULT_RUN_ID:
             source_type = None
-            if
-                source_type =
+            if hasattr(self.source, "type"):
+                source_type = self.source.type

-
+            self.run_id = _generate_run_id(source_type)
         else:
-            assert
-
+            assert self.run_id is not None
+        return self

     @classmethod
     def from_dict(
         cls, resolved_dict: dict, raw_dict: Optional[dict] = None
     ) -> "PipelineConfig":
-        config = cls.
+        config = cls.model_validate(resolved_dict)
         config._raw_dict = raw_dict
         return config

     def get_raw_dict(self) -> Dict:
         result = self._raw_dict
         if result is None:
-            result = self.
+            result = self.model_dump()
         return result
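
The PipelineConfig change is the standard v1 root_validator to v2 model_validator(mode="after") migration: the validator now receives the constructed instance, may mutate it, and returns self. A runnable sketch of the same pattern; JobConfig and default_run_id are illustrative names, not from the package:

from typing import Optional

from pydantic import BaseModel, model_validator


class JobConfig(BaseModel):
    source_type: str
    run_id: Optional[str] = None

    # An "after" model validator runs on the fully built instance; it can
    # fill derived fields and must return the instance it received.
    @model_validator(mode="after")
    def default_run_id(self) -> "JobConfig":
        if self.run_id is None:
            self.run_id = f"{self.source_type}-run"
        return self


assert JobConfig(source_type="bigquery").run_id == "bigquery-run"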

datahub/ingestion/run/sink_callback.py
@@ -39,7 +39,7 @@ class LoggingCallback(WriteCallback):
 class DeadLetterQueueCallback(WriteCallback, Closeable):
     def __init__(self, ctx: PipelineContext, config: Optional[FileSinkConfig]) -> None:
         if not config:
-            config = FileSinkConfig.
+            config = FileSinkConfig.model_validate({"filename": "failed_events.json"})
         self.file_sink: FileSink = FileSink(ctx, config)
         self.file_sink_lock = threading.Lock()
         self.logging_callback = LoggingCallback(name="failure-queue")

datahub/ingestion/sink/datahub_rest.py
@@ -9,6 +9,7 @@ from enum import auto
 from typing import List, Optional, Tuple, Union

 import pydantic
+from pydantic import field_validator

 from datahub.configuration.common import (
     ConfigEnum,
@@ -63,8 +64,8 @@ class RestSinkMode(ConfigEnum):
     ASYNC_BATCH = auto()


-_DEFAULT_REST_SINK_MODE = pydantic.
-
+_DEFAULT_REST_SINK_MODE = pydantic.TypeAdapter(RestSinkMode).validate_python(
+    get_rest_sink_default_mode() or RestSinkMode.ASYNC_BATCH
 )


@@ -80,8 +81,9 @@ class DatahubRestSinkConfig(DatahubClientConfig):
     # Only applies in async batch mode.
     max_per_batch: pydantic.PositiveInt = 100

-    @
-
+    @field_validator("max_per_batch", mode="before")
+    @classmethod
+    def validate_max_per_batch(cls, v: int) -> int:
         if v > BATCH_INGEST_MAX_PAYLOAD_LENGTH:
             raise ValueError(
                 f"max_per_batch must be less than or equal to {BATCH_INGEST_MAX_PAYLOAD_LENGTH}"
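
The _DEFAULT_REST_SINK_MODE change uses pydantic.TypeAdapter, the v2 replacement for parse_obj_as, to validate a plain value against a type outside of any model. A minimal sketch; the enum below is a plain str Enum stand-in for the package's ConfigEnum:

from enum import Enum

import pydantic


class RestSinkMode(str, Enum):  # stand-in for the package's ConfigEnum
    SYNC = "SYNC"
    ASYNC = "ASYNC"
    ASYNC_BATCH = "ASYNC_BATCH"


# TypeAdapter wraps an arbitrary type and exposes validate_python(), so an
# environment-provided value (or a fallback) can be coerced to the enum.
raw = None  # e.g. the result of reading an env var
mode = pydantic.TypeAdapter(RestSinkMode).validate_python(raw or RestSinkMode.ASYNC_BATCH)
assert mode is RestSinkMode.ASYNC_BATCH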

datahub/ingestion/source/abs/config.py
@@ -1,7 +1,7 @@
 import logging
 from typing import Any, Dict, List, Optional, Union

-import
+from pydantic import ValidationInfo, field_validator, model_validator
 from pydantic.fields import Field

 from datahub.configuration.common import AllowDenyPattern
@@ -105,9 +105,10 @@ class DataLakeSourceConfig(
             self.profiling.operation_config
         )

-    @
+    @field_validator("path_specs", mode="before")
+    @classmethod
     def check_path_specs_and_infer_platform(
-        cls, path_specs: List[PathSpec],
+        cls, path_specs: List[PathSpec], info: ValidationInfo
     ) -> List[PathSpec]:
         if len(path_specs) == 0:
             raise ValueError("path_specs must not be empty")
@@ -124,38 +125,37 @@

         # Ensure abs configs aren't used for file sources.
         if guessed_platform != "abs" and (
-
-            or
-            or
+            info.data.get("use_abs_container_properties")
+            or info.data.get("use_abs_blob_tags")
+            or info.data.get("use_abs_blob_properties")
         ):
             raise ValueError(
                 "Cannot grab abs blob/container tags when platform is not abs. Remove the flag or use abs."
             )

         # Infer platform if not specified.
-        if
+        if info.data.get("platform") and info.data["platform"] != guessed_platform:
             raise ValueError(
-                f"All path_specs belong to {guessed_platform} platform, but platform is set to {
+                f"All path_specs belong to {guessed_platform} platform, but platform is set to {info.data['platform']}"
             )
         else:
             logger.debug(f'Setting config "platform": {guessed_platform}')
-
+            info.data["platform"] = guessed_platform

         return path_specs

-    @
-
-
+    @field_validator("platform", mode="before")
+    @classmethod
+    def platform_not_empty(cls, platform: Any, info: ValidationInfo) -> str:
+        inferred_platform = info.data.get("platform")  # we may have inferred it above
         platform = platform or inferred_platform
         if not platform:
             raise ValueError("platform must not be empty")
         return platform

-    @
-    def ensure_profiling_pattern_is_passed_to_profiling(
-
-    ) -> Dict[str, Any]:
-        profiling: Optional[DataLakeProfilerConfig] = values.get("profiling")
+    @model_validator(mode="after")
+    def ensure_profiling_pattern_is_passed_to_profiling(self) -> "DataLakeSourceConfig":
+        profiling = self.profiling
         if profiling is not None and profiling.enabled:
-            profiling._allow_deny_patterns =
-        return
+            profiling._allow_deny_patterns = self.profile_patterns
+        return self
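
In Pydantic v2 a field validator no longer receives a `values` dict; earlier fields are read through the ValidationInfo argument's .data mapping instead, which is what the path_specs validator above now does. A small, self-contained sketch of that cross-field access; LakeConfig and its fields are illustrative only:

from typing import List

from pydantic import BaseModel, ValidationInfo, field_validator


class LakeConfig(BaseModel):
    platform: str = ""
    path_specs: List[str]

    # Fields are validated in declaration order, so info.data already holds
    # the validated "platform" value when "path_specs" is being checked.
    @field_validator("path_specs", mode="before")
    @classmethod
    def check_path_specs(cls, v: List[str], info: ValidationInfo) -> List[str]:
        if not v:
            raise ValueError("path_specs must not be empty")
        if info.data.get("platform") == "file" and v[0].startswith("s3://"):
            raise ValueError("s3 path_specs are not valid for the file platform")
        return v


LakeConfig(platform="s3", path_specs=["s3://bucket/data/*.parquet"])  # validates fine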

datahub/ingestion/source/abs/datalake_profiler_config.py
@@ -1,6 +1,7 @@
-from typing import
+from typing import Optional

 import pydantic
+from pydantic import model_validator
 from pydantic.fields import Field

 from datahub.configuration import ConfigModel
@@ -72,21 +73,18 @@ class DataLakeProfilerConfig(ConfigModel):
         description="Whether to profile for the sample values for all columns.",
     )

-    @
-    def ensure_field_level_settings_are_normalized(
-
-    ) -> Dict[str, Any]:
-        max_num_fields_to_profile_key = "max_number_of_fields_to_profile"
-        max_num_fields_to_profile = values.get(max_num_fields_to_profile_key)
+    @model_validator(mode="after")
+    def ensure_field_level_settings_are_normalized(self) -> "DataLakeProfilerConfig":
+        max_num_fields_to_profile = self.max_number_of_fields_to_profile

         # Disable all field-level metrics.
-        if
-            for
-                if
-
+        if self.profile_table_level_only:
+            for field_name in self.__fields__:
+                if field_name.startswith("include_field_"):
+                    setattr(self, field_name, False)

             assert max_num_fields_to_profile is None, (
-
+                "max_number_of_fields_to_profile should be set to None"
             )

-        return
+        return self

datahub/ingestion/source/abs/source.py
@@ -149,7 +149,7 @@ class ABSSource(StatefulIngestionSourceBase):
         self.report = DataLakeSourceReport()
         self.profiling_times_taken = []
         config_report = {
-            config_option: config.
+            config_option: config.model_dump().get(config_option)
             for config_option in config_options_to_report
         }
         config_report = {
@@ -164,7 +164,7 @@ class ABSSource(StatefulIngestionSourceBase):

     @classmethod
     def create(cls, config_dict, ctx):
-        config = DataLakeSourceConfig.
+        config = DataLakeSourceConfig.model_validate(config_dict)

         return cls(config, ctx)


datahub/ingestion/source/aws/aws_common.py
@@ -246,7 +246,7 @@ def assume_role(
         **dict(
             RoleSessionName="DatahubIngestionSource",
         ),
-        **{k: v for k, v in role.
+        **{k: v for k, v in role.model_dump().items() if v is not None},
     }

     assumed_role_object = sts_client.assume_role(

datahub/ingestion/source/aws/glue.py
@@ -21,7 +21,7 @@ from urllib.parse import urlparse

 import botocore.exceptions
 import yaml
-from pydantic import
+from pydantic import field_validator
 from pydantic.fields import Field

 from datahub.api.entities.dataset.dataset import Dataset
@@ -221,7 +221,8 @@ class GlueSourceConfig(
     def lakeformation_client(self):
         return self.get_lakeformation_client()

-    @
+    @field_validator("glue_s3_lineage_direction", mode="after")
+    @classmethod
     def check_direction(cls, v: str) -> str:
         if v.lower() not in ["upstream", "downstream"]:
             raise ValueError(
@@ -229,7 +230,8 @@
         )
         return v.lower()

-    @
+    @field_validator("platform", mode="after")
+    @classmethod
     def platform_validator(cls, v: str) -> str:
         if not v or v in VALID_PLATFORMS:
             return v
@@ -473,7 +475,7 @@ class GlueSource(StatefulIngestionSourceBase):

     @classmethod
     def create(cls, config_dict, ctx):
-        config = GlueSourceConfig.
+        config = GlueSourceConfig.model_validate(config_dict)
         return cls(config, ctx)

     @property

datahub/ingestion/source/aws/sagemaker.py
@@ -66,7 +66,7 @@ class SagemakerSource(StatefulIngestionSourceBase):

     @classmethod
     def create(cls, config_dict, ctx):
-        config = SagemakerSourceConfig.
+        config = SagemakerSourceConfig.model_validate(config_dict)
         return cls(config, ctx)

     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
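
The create() factories across these sources all pick up the same v2 rename: parse-from-dict becomes model_validate(). A tiny sketch of that call; SourceConfig is an illustrative stand-in:

from pydantic import BaseModel


class SourceConfig(BaseModel):  # illustrative stand-in
    platform: str
    env: str = "PROD"


# model_validate() is the Pydantic v2 entry point for building a model from
# an untyped dict, e.g. the "config" block of an ingestion recipe.
config = SourceConfig.model_validate({"platform": "s3"})
assert config.env == "PROD"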

datahub/ingestion/source/azure/azure_common.py
@@ -1,9 +1,9 @@
-from typing import
+from typing import Optional, Union

 from azure.identity import ClientSecretCredential
 from azure.storage.blob import BlobServiceClient
 from azure.storage.filedatalake import DataLakeServiceClient, FileSystemClient
-from pydantic import Field,
+from pydantic import Field, model_validator

 from datahub.configuration import ConfigModel
 from datahub.configuration.common import ConfigurationError
@@ -81,18 +81,14 @@ class AzureConnectionConfig(ConfigModel):
         )
         return self.sas_token if self.sas_token is not None else self.account_key

-    @
-    def _check_credential_values(
+    @model_validator(mode="after")
+    def _check_credential_values(self) -> "AzureConnectionConfig":
         if (
-
-            or
-            or (
-                values.get("client_id")
-                and values.get("client_secret")
-                and values.get("tenant_id")
-            )
+            self.account_key
+            or self.sas_token
+            or (self.client_id and self.client_secret and self.tenant_id)
         ):
-            return
+            return self
         raise ConfigurationError(
             "credentials missing, requires one combination of account_key or sas_token or (client_id and client_secret and tenant_id)"
         )

datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -211,7 +211,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):

     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "BigqueryV2Source":
-        config = BigQueryV2Config.
+        config = BigQueryV2Config.model_validate(config_dict)
         return cls(ctx, config)

     @staticmethod

datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -2,9 +2,16 @@ import logging
 import re
 from copy import deepcopy
 from datetime import timedelta
-from typing import Dict, List, Optional, Union
-
-from pydantic import
+from typing import Any, Dict, List, Optional, Union
+
+from pydantic import (
+    Field,
+    PositiveInt,
+    PrivateAttr,
+    ValidationInfo,
+    field_validator,
+    model_validator,
+)

 from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.env_vars import get_bigquery_schema_parallelism
@@ -63,8 +70,9 @@ class BigQueryBaseConfig(ConfigModel):
         description="The regex pattern to match sharded tables and group as one table. This is a very low level config parameter, only change if you know what you are doing, ",
     )

-    @
-
+    @field_validator("sharded_table_pattern", mode="after")
+    @classmethod
+    def sharded_table_pattern_is_a_valid_regexp(cls, v: str) -> str:
         try:
             re.compile(v)
         except Exception as e:
@@ -73,7 +81,8 @@
             ) from e
         return v

-    @
+    @model_validator(mode="before")
+    @classmethod
     def project_id_backward_compatibility_configs_set(cls, values: Dict) -> Dict:
         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
         values = deepcopy(values)
@@ -188,12 +197,11 @@ class BigQueryFilterConfig(SQLFilterConfig):
         default=AllowDenyPattern.allow_all(),
     )

-    @
-    def backward_compatibility_configs_set(
-
-
-
-        schema_pattern = values.get("schema_pattern")
+    @model_validator(mode="after")
+    def backward_compatibility_configs_set(self) -> Any:
+        dataset_pattern = self.dataset_pattern
+        schema_pattern = self.schema_pattern
+
         if (
             dataset_pattern == AllowDenyPattern.allow_all()
             and schema_pattern != AllowDenyPattern.allow_all()
@@ -202,7 +210,7 @@ class BigQueryFilterConfig(SQLFilterConfig):
                 "dataset_pattern is not set but schema_pattern is set, using schema_pattern as dataset_pattern. "
                 "schema_pattern will be deprecated, please use dataset_pattern instead."
             )
-
+            self.dataset_pattern = schema_pattern
             dataset_pattern = schema_pattern
         elif (
             dataset_pattern != AllowDenyPattern.allow_all()
@@ -213,7 +221,7 @@ class BigQueryFilterConfig(SQLFilterConfig):
                 " please use dataset_pattern only."
             )

-        match_fully_qualified_names =
+        match_fully_qualified_names = self.match_fully_qualified_names

         if (
             dataset_pattern is not None
@@ -243,7 +251,7 @@ class BigQueryFilterConfig(SQLFilterConfig):
                 " of the form `<project_id>.<dataset_name>`."
             )

-        return
+        return self


 class BigQueryIdentifierConfig(
@@ -478,7 +486,8 @@ class BigQueryV2Config(
     _include_view_column_lineage = pydantic_removed_field("include_view_column_lineage")
     _lineage_parse_view_ddl = pydantic_removed_field("lineage_parse_view_ddl")

-    @
+    @model_validator(mode="before")
+    @classmethod
     def set_include_schema_metadata(cls, values: Dict) -> Dict:
         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
         values = deepcopy(values)
@@ -498,30 +507,33 @@ class BigQueryV2Config(

         return values

-    @
+    @model_validator(mode="before")
+    @classmethod
     def profile_default_settings(cls, values: Dict) -> Dict:
         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
         values = deepcopy(values)
         # Extra default SQLAlchemy option for better connection pooling and threading.
         # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
-        values
+        values.setdefault("options", {}).setdefault("max_overflow", -1)

         return values

-    @
+    @field_validator("bigquery_audit_metadata_datasets", mode="after")
+    @classmethod
     def validate_bigquery_audit_metadata_datasets(
-        cls, v: Optional[List[str]],
+        cls, v: Optional[List[str]], info: ValidationInfo
     ) -> Optional[List[str]]:
-        if
+        if info.data.get("use_exported_bigquery_audit_metadata"):
             assert v and len(v) > 0, (
                 "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata: True`."
             )

         return v

-    @
-
-
+    @field_validator("upstream_lineage_in_report", mode="after")
+    @classmethod
+    def validate_upstream_lineage_in_report(cls, v: bool, info: ValidationInfo) -> bool:
+        if v and info.data.get("use_queries_v2", True):
             logging.warning(
                 "`upstream_lineage_in_report` is enabled but will be ignored because `use_queries_v2` is enabled."
                 "This debugging feature only works with the legacy lineage approach (`use_queries_v2: false`)."
@@ -529,11 +541,12 @@ class BigQueryV2Config(

         return v

-    @
-    def validate_queries_v2_stateful_ingestion(
-        if
-            if
-
+    @model_validator(mode="after")
+    def validate_queries_v2_stateful_ingestion(self) -> "BigQueryV2Config":
+        if self.use_queries_v2:
+            if (
+                self.enable_stateful_lineage_ingestion
+                or self.enable_stateful_usage_ingestion
             ):
                 logger.warning(
                     "enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion are deprecated "
@@ -541,7 +554,7 @@ class BigQueryV2Config(
                     "For queries v2, use enable_stateful_time_window instead to enable stateful ingestion "
                     "for the unified time window extraction (lineage + usage + operations + queries)."
                 )
-        return
+        return self

     def get_table_pattern(self, pattern: List[str]) -> str:
         return "|".join(pattern) if pattern else ""
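
The BigQuery config mixes both v2 model-validator modes: mode="before" classmethods keep rewriting the raw input dict for backward compatibility, while mode="after" validators work on the typed instance and return self. A compact sketch of the two modes side by side; FilterConfig and its fields are illustrative, not the package's class:

from typing import Dict, Optional

from pydantic import BaseModel, model_validator


class FilterConfig(BaseModel):  # illustrative stand-in
    dataset_pattern: Optional[str] = None
    schema_pattern: Optional[str] = None

    # mode="before" validators run on the raw input and stay classmethods,
    # which suits backward-compatibility shims that rewrite incoming keys.
    @model_validator(mode="before")
    @classmethod
    def accept_legacy_key(cls, values: Dict) -> Dict:
        if "schema_pattern" in values and "dataset_pattern" not in values:
            values = {**values, "dataset_pattern": values["schema_pattern"]}
        return values

    # mode="after" validators run on the built instance and return self,
    # which suits checks that need the fully parsed, typed fields.
    @model_validator(mode="after")
    def require_some_pattern(self) -> "FilterConfig":
        if self.dataset_pattern is None:
            raise ValueError("dataset_pattern (or legacy schema_pattern) must be set")
        return self


assert FilterConfig(schema_pattern="prod.*").dataset_pattern == "prod.*"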

datahub/ingestion/source/bigquery_v2/bigquery_queries.py
@@ -80,7 +80,7 @@ class BigQueryQueriesSource(Source):

     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> Self:
-        config = BigQueryQueriesSourceConfig.
+        config = BigQueryQueriesSourceConfig.model_validate(config_dict)
         return cls(ctx, config)

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:

datahub/ingestion/source/cassandra/cassandra.py
@@ -109,7 +109,7 @@ class CassandraSource(StatefulIngestionSourceBase):

     @classmethod
     def create(cls, config_dict, ctx):
-        config = CassandraSourceConfig.
+        config = CassandraSourceConfig.model_validate(config_dict)
         return cls(ctx, config)

     def get_platform(self) -> str:

datahub/ingestion/source/common/gcp_credentials_config.py
@@ -1,8 +1,8 @@
 import json
 import tempfile
-from typing import
+from typing import Dict, Optional

-from pydantic import Field,
+from pydantic import Field, model_validator

 from datahub.configuration import ConfigModel
 from datahub.configuration.validate_multiline_string import pydantic_multiline_string
@@ -37,16 +37,16 @@ class GCPCredential(ConfigModel):

     _fix_private_key_newlines = pydantic_multiline_string("private_key")

-    @
-    def validate_config(
-        if
-
-            f"https://www.googleapis.com/robot/v1/metadata/x509/{
+    @model_validator(mode="after")
+    def validate_config(self) -> "GCPCredential":
+        if self.client_x509_cert_url is None:
+            self.client_x509_cert_url = (
+                f"https://www.googleapis.com/robot/v1/metadata/x509/{self.client_email}"
             )
-        return
+        return self

     def create_credential_temp_file(self, project_id: Optional[str] = None) -> str:
-        configs = self.
+        configs = self.model_dump()
         if project_id:
             configs["project_id"] = project_id
         with tempfile.NamedTemporaryFile(delete=False) as fp:
@@ -55,7 +55,7 @@ class GCPCredential(ConfigModel):
         return fp.name

     def to_dict(self, project_id: Optional[str] = None) -> Dict[str, str]:
-        configs = self.
+        configs = self.model_dump()
         if project_id:
             configs["project_id"] = project_id
         return configs