acryl-datahub 1.3.0.1rc9__py3-none-any.whl → 1.3.1.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry, and is provided for informational purposes only.
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/METADATA +2550 -2543
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/RECORD +263 -261
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +2 -2
- datahub/api/entities/corpgroup/corpgroup.py +11 -6
- datahub/api/entities/corpuser/corpuser.py +11 -11
- datahub/api/entities/dataproduct/dataproduct.py +47 -27
- datahub/api/entities/dataset/dataset.py +32 -21
- datahub/api/entities/external/lake_formation_external_entites.py +5 -6
- datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
- datahub/api/entities/forms/forms.py +16 -14
- datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
- datahub/cli/check_cli.py +2 -2
- datahub/cli/config_utils.py +3 -3
- datahub/cli/lite_cli.py +9 -7
- datahub/cli/migrate.py +4 -4
- datahub/cli/quickstart_versioning.py +3 -3
- datahub/cli/specific/group_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +1 -1
- datahub/cli/specific/user_cli.py +1 -1
- datahub/configuration/common.py +14 -2
- datahub/configuration/connection_resolver.py +2 -2
- datahub/configuration/git.py +47 -30
- datahub/configuration/import_resolver.py +2 -2
- datahub/configuration/kafka.py +4 -3
- datahub/configuration/time_window_config.py +26 -26
- datahub/configuration/validate_field_deprecation.py +2 -2
- datahub/configuration/validate_field_removal.py +2 -2
- datahub/configuration/validate_field_rename.py +2 -2
- datahub/configuration/validate_multiline_string.py +2 -1
- datahub/emitter/kafka_emitter.py +3 -1
- datahub/emitter/rest_emitter.py +2 -4
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/report.py +1 -1
- datahub/ingestion/api/sink.py +1 -1
- datahub/ingestion/api/source.py +1 -1
- datahub/ingestion/glossary/datahub_classifier.py +11 -8
- datahub/ingestion/graph/client.py +5 -1
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
- datahub/ingestion/reporting/file_reporter.py +5 -4
- datahub/ingestion/run/pipeline.py +7 -6
- datahub/ingestion/run/pipeline_config.py +12 -14
- datahub/ingestion/run/sink_callback.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +6 -4
- datahub/ingestion/source/abs/config.py +19 -19
- datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/abs/source.py +2 -2
- datahub/ingestion/source/aws/aws_common.py +1 -1
- datahub/ingestion/source/aws/glue.py +6 -4
- datahub/ingestion/source/aws/sagemaker.py +1 -1
- datahub/ingestion/source/azure/azure_common.py +8 -12
- datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
- datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
- datahub/ingestion/source/datahub/config.py +8 -8
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
- datahub/ingestion/source/dbt/dbt_common.py +39 -37
- datahub/ingestion/source/dbt/dbt_core.py +10 -12
- datahub/ingestion/source/debug/datahub_debug.py +1 -1
- datahub/ingestion/source/delta_lake/config.py +6 -4
- datahub/ingestion/source/dremio/dremio_api.py +212 -78
- datahub/ingestion/source/dremio/dremio_config.py +10 -6
- datahub/ingestion/source/dremio/dremio_entities.py +55 -39
- datahub/ingestion/source/dremio/dremio_profiling.py +14 -3
- datahub/ingestion/source/dremio/dremio_source.py +24 -26
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/elastic_search.py +110 -32
- datahub/ingestion/source/excel/source.py +1 -1
- datahub/ingestion/source/feast.py +1 -1
- datahub/ingestion/source/file.py +5 -4
- datahub/ingestion/source/fivetran/config.py +17 -16
- datahub/ingestion/source/fivetran/fivetran.py +2 -2
- datahub/ingestion/source/gc/datahub_gc.py +1 -1
- datahub/ingestion/source/gcs/gcs_source.py +8 -10
- datahub/ingestion/source/ge_profiling_config.py +8 -5
- datahub/ingestion/source/grafana/grafana_api.py +2 -2
- datahub/ingestion/source/grafana/grafana_config.py +4 -3
- datahub/ingestion/source/grafana/grafana_source.py +1 -1
- datahub/ingestion/source/grafana/models.py +23 -5
- datahub/ingestion/source/hex/api.py +7 -5
- datahub/ingestion/source/hex/hex.py +4 -3
- datahub/ingestion/source/iceberg/iceberg.py +1 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +10 -10
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +7 -5
- datahub/ingestion/source/looker/looker_config.py +21 -20
- datahub/ingestion/source/looker/lookml_config.py +47 -47
- datahub/ingestion/source/metabase.py +8 -8
- datahub/ingestion/source/metadata/business_glossary.py +2 -2
- datahub/ingestion/source/metadata/lineage.py +13 -8
- datahub/ingestion/source/mlflow.py +1 -1
- datahub/ingestion/source/mode.py +6 -4
- datahub/ingestion/source/mongodb.py +4 -3
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +17 -23
- datahub/ingestion/source/openapi.py +6 -8
- datahub/ingestion/source/powerbi/config.py +33 -32
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
- datahub/ingestion/source/powerbi/powerbi.py +1 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
- datahub/ingestion/source/preset.py +8 -8
- datahub/ingestion/source/pulsar.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
- datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +18 -20
- datahub/ingestion/source/redshift/redshift.py +2 -2
- datahub/ingestion/source/redshift/usage.py +23 -3
- datahub/ingestion/source/s3/config.py +83 -62
- datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/s3/source.py +8 -5
- datahub/ingestion/source/sac/sac.py +5 -4
- datahub/ingestion/source/salesforce.py +3 -2
- datahub/ingestion/source/schema/json_schema.py +2 -2
- datahub/ingestion/source/sigma/data_classes.py +3 -2
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/sigma/sigma_api.py +7 -7
- datahub/ingestion/source/slack/slack.py +1 -1
- datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
- datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_queries.py +28 -4
- datahub/ingestion/source/sql/athena.py +1 -1
- datahub/ingestion/source/sql/clickhouse.py +4 -2
- datahub/ingestion/source/sql/cockroachdb.py +1 -1
- datahub/ingestion/source/sql/druid.py +1 -1
- datahub/ingestion/source/sql/hana.py +1 -1
- datahub/ingestion/source/sql/hive.py +7 -5
- datahub/ingestion/source/sql/hive_metastore.py +1 -1
- datahub/ingestion/source/sql/mssql/source.py +13 -6
- datahub/ingestion/source/sql/mysql.py +1 -1
- datahub/ingestion/source/sql/oracle.py +17 -10
- datahub/ingestion/source/sql/postgres.py +2 -2
- datahub/ingestion/source/sql/presto.py +1 -1
- datahub/ingestion/source/sql/sql_config.py +8 -9
- datahub/ingestion/source/sql/sql_generic.py +1 -1
- datahub/ingestion/source/sql/teradata.py +1 -1
- datahub/ingestion/source/sql/trino.py +1 -1
- datahub/ingestion/source/sql/vertica.py +5 -4
- datahub/ingestion/source/sql_queries.py +174 -22
- datahub/ingestion/source/state/checkpoint.py +2 -2
- datahub/ingestion/source/state/entity_removal_state.py +2 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +9 -9
- datahub/ingestion/source/tableau/tableau.py +14 -16
- datahub/ingestion/source/unity/azure_auth_config.py +15 -0
- datahub/ingestion/source/unity/config.py +51 -34
- datahub/ingestion/source/unity/connection.py +7 -1
- datahub/ingestion/source/unity/connection_test.py +1 -1
- datahub/ingestion/source/unity/proxy.py +216 -7
- datahub/ingestion/source/unity/proxy_types.py +91 -0
- datahub/ingestion/source/unity/source.py +29 -3
- datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
- datahub/ingestion/source/usage/usage_common.py +5 -3
- datahub/ingestion/source_config/csv_enricher.py +7 -6
- datahub/ingestion/source_config/operation_config.py +7 -4
- datahub/ingestion/source_config/pulsar.py +11 -15
- datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
- datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
- datahub/ingestion/transformer/add_dataset_properties.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
- datahub/ingestion/transformer/add_dataset_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
- datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
- datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
- datahub/ingestion/transformer/mark_dataset_status.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
- datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/replace_external_url.py +2 -2
- datahub/ingestion/transformer/set_browse_path.py +1 -1
- datahub/ingestion/transformer/tags_to_terms.py +1 -1
- datahub/lite/duckdb_lite.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/_internal_schema_classes.py +62 -2
- datahub/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +2 -0
- datahub/metadata/schema.avsc +271 -91
- datahub/metadata/schemas/ApplicationProperties.avsc +5 -2
- datahub/metadata/schemas/AssertionInfo.avsc +48 -5
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +8 -4
- datahub/metadata/schemas/ChartInfo.avsc +12 -5
- datahub/metadata/schemas/ContainerProperties.avsc +12 -5
- datahub/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
- datahub/metadata/schemas/CorpGroupInfo.avsc +7 -3
- datahub/metadata/schemas/CorpUserInfo.avsc +5 -2
- datahub/metadata/schemas/CorpUserSettings.avsc +4 -2
- datahub/metadata/schemas/DashboardInfo.avsc +16 -4
- datahub/metadata/schemas/DataFlowInfo.avsc +11 -5
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +4 -2
- datahub/metadata/schemas/DataJobInfo.avsc +9 -4
- datahub/metadata/schemas/DataPlatformInfo.avsc +3 -1
- datahub/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
- datahub/metadata/schemas/DataProductProperties.avsc +5 -2
- datahub/metadata/schemas/DataTypeInfo.avsc +5 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/DatasetProperties.avsc +12 -5
- datahub/metadata/schemas/DomainProperties.avsc +7 -3
- datahub/metadata/schemas/EditableContainerProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDashboardProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataJobProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDatasetProperties.avsc +2 -1
- datahub/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelProperties.avsc +2 -1
- datahub/metadata/schemas/EditableNotebookProperties.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +5 -3
- datahub/metadata/schemas/EntityTypeInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalTags.avsc +3 -2
- datahub/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermInfo.avsc +3 -1
- datahub/metadata/schemas/InputFields.avsc +3 -2
- datahub/metadata/schemas/MLFeatureKey.avsc +3 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +3 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +3 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLModelProperties.avsc +4 -2
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +3 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +124 -50
- datahub/metadata/schemas/NotebookInfo.avsc +5 -2
- datahub/metadata/schemas/Ownership.avsc +3 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -1
- datahub/metadata/schemas/RoleProperties.avsc +3 -1
- datahub/metadata/schemas/SchemaFieldInfo.avsc +3 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -2
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
- datahub/metadata/schemas/TagProperties.avsc +3 -1
- datahub/metadata/schemas/TestInfo.avsc +2 -1
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/_all_entities.py +2 -0
- datahub/sdk/search_filters.py +68 -40
- datahub/sdk/tag.py +112 -0
- datahub/secret/datahub_secret_store.py +7 -4
- datahub/secret/file_secret_store.py +1 -1
- datahub/sql_parsing/schema_resolver.py +29 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +15 -0
- datahub/sql_parsing/sqlglot_lineage.py +5 -2
- datahub/testing/check_sql_parser_result.py +2 -2
- datahub/utilities/ingest_utils.py +1 -1
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/source/sql_queries.py
+++ b/datahub/ingestion/source/sql_queries.py
@@ -1,12 +1,14 @@
 import json
 import logging
 import os
-
+import re
+from dataclasses import dataclass, field
 from datetime import datetime
 from functools import partial
-from typing import ClassVar, Iterable, List, Optional, Union
+from typing import Any, ClassVar, Iterable, List, Optional, Union, cast

-
+import smart_open
+from pydantic import BaseModel, Field, field_validator

 from datahub.configuration.common import HiddenFromDocs
 from datahub.configuration.datetimes import parse_user_datetime
@@ -36,12 +38,13 @@ from datahub.ingestion.api.source import (
     SourceCapability,
     SourceReport,
 )
-from datahub.ingestion.api.source_helpers import auto_workunit_reporter
+from datahub.ingestion.api.source_helpers import auto_workunit, auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DataHubGraph
+from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
 from datahub.metadata.urns import CorpUserUrn, DatasetUrn
-from datahub.sql_parsing.schema_resolver import SchemaResolver
+from datahub.sql_parsing.schema_resolver import SchemaResolver, SchemaResolverReport
 from datahub.sql_parsing.sql_parsing_aggregator import (
     KnownQueryLineageInfo,
     ObservedQuery,
@@ -82,6 +85,24 @@ class SqlQueriesSourceConfig(
         None,
         description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
     )
+    temp_table_patterns: List[str] = Field(
+        description="Regex patterns for temporary tables to filter in lineage ingestion. "
+        "Specify regex to match the entire table name. This is useful for platforms like Athena "
+        "that don't have native temp tables but use naming patterns for fake temp tables.",
+        default=[],
+    )
+
+    enable_lazy_schema_loading: bool = Field(
+        default=True,
+        description="Enable lazy schema loading for better performance. When enabled, schemas are fetched on-demand "
+        "instead of bulk loading all schemas upfront, reducing startup time and memory usage.",
+    )
+
+    # AWS/S3 configuration
+    aws_config: Optional[AwsConnectionConfig] = Field(
+        default=None,
+        description="AWS configuration for S3 access. Required when query_file is an S3 URI (s3://).",
+    )


 @dataclass
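The three new fields above are ordinary pydantic fields, so they can be exercised from a normal recipe dict. The sketch below is illustrative only: the new field names come from the Field() declarations in this hunk, while "platform", "query_file", and the "aws_region" key of AwsConnectionConfig are assumptions about the surrounding source and may differ.

# Illustrative source-config sketch (not taken from the diff); all values are placeholders.
sql_queries_source_config = {
    "platform": "athena",
    "query_file": "s3://example-bucket/extracted_queries.jsonl",  # S3 URI -> aws_config is required
    "aws_config": {"aws_region": "us-east-1"},
    "temp_table_patterns": [r".*\.tmp_.*"],  # matched against the full table name
    "enable_lazy_schema_loading": True,  # fetch schemas on demand instead of bulk loading
}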
@@ -89,8 +110,13 @@ class SqlQueriesSourceReport(SourceReport):
     num_entries_processed: int = 0
     num_entries_failed: int = 0
     num_queries_aggregator_failures: int = 0
+    num_queries_processed_sequential: int = 0
+    num_temp_tables_detected: int = 0
+    temp_table_patterns_used: List[str] = field(default_factory=list)
+    peak_memory_usage_mb: float = 0.0

     sql_aggregator: Optional[SqlAggregatorReport] = None
+    schema_resolver_report: Optional[SchemaResolverReport] = None


 @platform_name("SQL Queries", id="sql-queries")
@@ -115,6 +141,18 @@ class SqlQueriesSource(Source):
     - upstream_tables (optional): string[] - Fallback list of tables the query reads from,
       used if the query can't be parsed.

+    **Lazy Schema Loading**:
+    - Fetches schemas on-demand during query parsing instead of bulk loading all schemas upfront
+    - Caches fetched schemas for future lookups to avoid repeated network requests
+    - Reduces initial startup time and memory usage significantly
+    - Automatically handles large platforms efficiently without memory issues
+
+    **Query Processing**:
+    - Loads the entire query file into memory at once
+    - Processes all queries sequentially before generating metadata work units
+    - Preserves temp table mappings and lineage relationships to ensure consistent lineage tracking
+    - Query deduplication is handled automatically by the SQL parsing aggregator
+
     ### Incremental Lineage
     When `incremental_lineage` is enabled, this source will emit lineage as patches rather than full overwrites.
     This allows you to add lineage edges without removing existing ones, which is useful for:
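The lazy-loading behavior described in the docstring above amounts to a memoized, on-demand lookup instead of a bulk prefetch. The sketch below is a generic illustration under that assumption, not DataHub's SchemaResolver implementation.

from typing import Callable, Dict, List, Optional

def lazy_schema_lookup(
    fetch_remote: Callable[[str], Optional[List[str]]],
) -> Callable[[str], Optional[List[str]]]:
    """Wrap a remote schema fetch so each table's schema is fetched at most once."""
    cache: Dict[str, Optional[List[str]]] = {}

    def lookup(table_name: str) -> Optional[List[str]]:
        if table_name not in cache:  # first request goes to the network
            cache[table_name] = fetch_remote(table_name)
        return cache[table_name]  # later requests are served from the cache

    return lookup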
@@ -124,6 +162,12 @@ class SqlQueriesSource(Source):

     Note: Incremental lineage only applies to UpstreamLineage aspects. Other aspects like queries and usage
     statistics will still be emitted normally.
+
+    ### Temporary Table Support
+    For platforms like Athena that don't have native temporary tables, you can use the `temp_table_patterns`
+    configuration to specify regex patterns that identify fake temporary tables. This allows the source to
+    process these tables like other sources that support native temp tables, enabling proper lineage tracking
+    across temporary table operations.
     """

     schema_resolver: Optional[SchemaResolver]
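The matching logic added later in this diff (see the is_temp_table hunk) uses re.match with re.IGNORECASE, so each pattern is anchored at the start of the table name and should cover the full name. A hypothetical check with made-up patterns and table names:

import re

temp_table_patterns = [r".*\.tmp_.*", r".*\.temp_.*"]  # example patterns, not defaults

def looks_like_temp_table(name: str) -> bool:
    return any(re.match(p, name, flags=re.IGNORECASE) for p in temp_table_patterns)

print(looks_like_temp_table("analytics.tmp_daily_orders"))  # True
print(looks_like_temp_table("analytics.dim_customers"))     # False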
@@ -141,13 +185,19 @@
         self.report = SqlQueriesSourceReport()

         if self.config.use_schema_resolver:
-            #
-
-
-
+            # Create schema resolver report for tracking
+            self.report.schema_resolver_report = SchemaResolverReport()
+
+            # Use lazy loading - schemas will be fetched on-demand and cached
+            logger.info(
+                "Using lazy schema loading - schemas will be fetched on-demand and cached"
+            )
+            self.schema_resolver = SchemaResolver(
                 platform=self.config.platform,
                 platform_instance=self.config.platform_instance,
                 env=self.config.env,
+                graph=self.graph,
+                report=self.report.schema_resolver_report,
             )
         else:
             self.schema_resolver = None
@@ -156,7 +206,9 @@
             platform=self.config.platform,
             platform_instance=self.config.platform_instance,
             env=self.config.env,
-            schema_resolver=self.schema_resolver
+            schema_resolver=cast(SchemaResolver, self.schema_resolver)
+            if self.schema_resolver
+            else None,
             eager_graph_load=False,
             generate_lineage=True,  # TODO: make this configurable
             generate_queries=True,  # TODO: make this configurable
@@ -165,7 +217,9 @@
             generate_usage_statistics=True,
             generate_operations=True,  # TODO: make this configurable
             usage_config=self.config.usage,
-            is_temp_table=
+            is_temp_table=self.is_temp_table
+            if self.config.temp_table_patterns
+            else None,
             is_allowed_table=None,
             format_queries=False,
         )
@@ -193,20 +247,73 @@
     ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
         logger.info(f"Parsing queries from {os.path.basename(self.config.query_file)}")

+        logger.info("Processing all queries in batch mode")
+        yield from self._process_queries_batch()
+
+    def _process_queries_batch(
+        self,
+    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
+        """Process all queries in memory (original behavior)."""
         with self.report.new_stage("Collecting queries from file"):
             queries = list(self._parse_query_file())
             logger.info(f"Collected {len(queries)} queries for processing")

         with self.report.new_stage("Processing queries through SQL parsing aggregator"):
-
-
+            logger.info("Using sequential processing")
+            self._process_queries_sequential(queries)

         with self.report.new_stage("Generating metadata work units"):
             logger.info("Generating workunits from SQL parsing aggregator")
-            yield from self.aggregator.gen_metadata()
+            yield from auto_workunit(self.aggregator.gen_metadata())

-    def
-        """
+    def _is_s3_uri(self, path: str) -> bool:
+        """Check if the path is an S3 URI."""
+        return path.startswith("s3://")
+
+    def _parse_s3_query_file(self) -> Iterable["QueryEntry"]:
+        """Parse query file from S3 using smart_open."""
+        if not self.config.aws_config:
+            raise ValueError("AWS configuration required for S3 file access")
+
+        logger.info(f"Reading query file from S3: {self.config.query_file}")
+
+        try:
+            # Use smart_open for efficient S3 streaming, similar to S3FileSystem
+            s3_client = self.config.aws_config.get_s3_client()
+
+            with smart_open.open(
+                self.config.query_file, mode="r", transport_params={"client": s3_client}
+            ) as file_stream:
+                for line in file_stream:
+                    if line.strip():
+                        try:
+                            query_dict = json.loads(line, strict=False)
+                            entry = QueryEntry.create(query_dict, config=self.config)
+                            self.report.num_entries_processed += 1
+                            if self.report.num_entries_processed % 1000 == 0:
+                                logger.info(
+                                    f"Processed {self.report.num_entries_processed} query entries from S3"
+                                )
+                            yield entry
+                        except Exception as e:
+                            self.report.num_entries_failed += 1
+                            self.report.warning(
+                                title="Error processing query from S3",
+                                message="Query skipped due to parsing error",
+                                context=line.strip(),
+                                exc=e,
+                            )
+        except Exception as e:
+            self.report.warning(
+                title="Error reading S3 file",
+                message="Failed to read S3 file",
+                context=self.config.query_file,
+                exc=e,
+            )
+            raise
+
+    def _parse_local_query_file(self) -> Iterable["QueryEntry"]:
+        """Parse local query file (existing logic)."""
         with open(self.config.query_file) as f:
             for line in f:
                 try:
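Outside the source class, the smart_open pattern used in _parse_s3_query_file looks like the standalone sketch below: hand an already-configured boto3 S3 client to transport_params and stream the newline-delimited JSON file line by line instead of downloading it whole. Bucket, key, and region are placeholders.

import json

import boto3
import smart_open

s3_client = boto3.client("s3", region_name="us-east-1")  # placeholder region
with smart_open.open(
    "s3://example-bucket/extracted_queries.jsonl",  # placeholder URI
    mode="r",
    transport_params={"client": s3_client},
) as stream:
    for line in stream:
        if line.strip():
            record = json.loads(line)  # one query entry per line
            print(record.get("query"))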
@@ -227,6 +334,30 @@
                     exc=e,
                 )

+    def _parse_query_file(self) -> Iterable["QueryEntry"]:
+        """Parse the query file and yield QueryEntry objects."""
+        if self._is_s3_uri(self.config.query_file):
+            yield from self._parse_s3_query_file()
+        else:
+            yield from self._parse_local_query_file()
+
+    def _process_queries_sequential(self, queries: List["QueryEntry"]) -> None:
+        """Process queries sequentially."""
+        total_queries = len(queries)
+        logger.info(f"Processing {total_queries} queries sequentially")
+
+        # Process each query sequentially
+        for i, query_entry in enumerate(queries):
+            self._add_query_to_aggregator(query_entry)
+            self.report.num_queries_processed_sequential += 1
+
+            # Simple progress reporting every 1000 queries
+            if (i + 1) % 1000 == 0:
+                progress_pct = ((i + 1) / total_queries) * 100
+                logger.info(
+                    f"Processed {i + 1}/{total_queries} queries ({progress_pct:.1f}%)"
+                )
+
     def _add_query_to_aggregator(self, query_entry: "QueryEntry") -> None:
         """Add a query to the SQL parsing aggregator."""
         try:
@@ -285,6 +416,24 @@
                 exc=e,
             )

+    def is_temp_table(self, name: str) -> bool:
+        """Check if a table name matches any of the configured temp table patterns."""
+        if not self.config.temp_table_patterns:
+            return False
+
+        try:
+            for pattern in self.config.temp_table_patterns:
+                if re.match(pattern, name, flags=re.IGNORECASE):
+                    logger.debug(
+                        f"Table '{name}' matched temp table pattern: {pattern}"
+                    )
+                    self.report.num_temp_tables_detected += 1
+                    return True
+        except re.error as e:
+            logger.warning(f"Invalid regex pattern '{pattern}': {e}")
+
+        return False
+

 class QueryEntry(BaseModel):
     query: str
@@ -301,19 +450,22 @@ class QueryEntry(BaseModel):
     class Config:
         arbitrary_types_allowed = True

-    @
-
+    @field_validator("timestamp", mode="before")
+    @classmethod
+    def parse_timestamp(cls, v: Any) -> Any:
         return None if v is None else parse_user_datetime(str(v))

-    @
-
+    @field_validator("user", mode="before")
+    @classmethod
+    def parse_user(cls, v: Any) -> Any:
         if v is None:
             return None

         return v if isinstance(v, CorpUserUrn) else CorpUserUrn(v)

-    @
-
+    @field_validator("downstream_tables", "upstream_tables", mode="before")
+    @classmethod
+    def parse_tables(cls, v: Any) -> Any:
         if not v:
             return []
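The replacement validators above follow the standard pydantic v2 shape: @field_validator(..., mode="before") stacked on @classmethod, running on the raw value before type coercion. A generic, self-contained illustration (not DataHub code, field names invented):

from typing import Any, List

from pydantic import BaseModel, field_validator


class Example(BaseModel):
    tables: List[str] = []

    @field_validator("tables", mode="before")
    @classmethod
    def split_csv(cls, v: Any) -> Any:
        # accept either a list or a comma-separated string
        return v.split(",") if isinstance(v, str) else (v or [])


print(Example(tables="a,b").tables)  # ['a', 'b']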
--- a/datahub/ingestion/source/state/checkpoint.py
+++ b/datahub/ingestion/source/state/checkpoint.py
@@ -163,7 +163,7 @@ class Checkpoint(Generic[StateType]):
         )
         state_as_dict["version"] = checkpoint_aspect.state.formatVersion
         state_as_dict["serde"] = checkpoint_aspect.state.serde
-        return state_class.
+        return state_class.model_validate(state_as_dict)

     @staticmethod
     def _from_base85_json_bytes(
@@ -179,7 +179,7 @@ class Checkpoint(Generic[StateType]):
         state_as_dict = json.loads(state_uncompressed.decode("utf-8"))
         state_as_dict["version"] = checkpoint_aspect.state.formatVersion
         state_as_dict["serde"] = checkpoint_aspect.state.serde
-        return state_class.
+        return state_class.model_validate(state_as_dict)

     def to_checkpoint_aspect(
         self, max_allowed_state_size: int
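Both checkpoint hunks switch deserialization to pydantic v2's model_validate, which builds a model instance from an untyped dict (the v1 equivalent was parse_obj). A minimal standalone illustration with invented fields:

from pydantic import BaseModel


class CheckpointLikeState(BaseModel):
    version: str = "1.0"
    serde: str = "utf-8"


state = CheckpointLikeState.model_validate({"version": "2.0", "serde": "base85"})
print(state.serde)  # base85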
--- a/datahub/ingestion/source/state/entity_removal_state.py
+++ b/datahub/ingestion/source/state/entity_removal_state.py
@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Tuple, Type

 import pydantic
+from pydantic import model_validator

 from datahub.emitter.mce_builder import make_assertion_urn, make_container_urn
 from datahub.ingestion.source.state.checkpoint import CheckpointStateBase
@@ -59,7 +60,7 @@ def pydantic_state_migrator(mapping: Dict[str, str]) -> "V1RootValidator":

         return values

-    return
+    return model_validator(mode="before")(_validate_field_rename)


 class GenericCheckpointState(CheckpointStateBase):
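pydantic_state_migrator now returns a model_validator(mode="before")-wrapped function. Written declaratively inside a model, the same kind of pre-validation field rename looks like the sketch below; the model and field names are invented for illustration, this is not the DataHub implementation.

from typing import Any, Dict

from pydantic import BaseModel, model_validator


class StateSketch(BaseModel):
    removed_urns: list = []

    @model_validator(mode="before")
    @classmethod
    def _migrate_legacy_field(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        # map a legacy field name onto the current one before validation
        if "encoded_urns" in values:
            values.setdefault("removed_urns", values.pop("encoded_urns"))
        return values


print(StateSketch.model_validate({"encoded_urns": ["urn:li:dataset:x"]}).removed_urns)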
--- a/datahub/ingestion/source/state/stateful_ingestion_base.py
+++ b/datahub/ingestion/source/state/stateful_ingestion_base.py
@@ -3,7 +3,7 @@ from dataclasses import dataclass
 from typing import Any, Dict, Generic, Optional, Type, TypeVar

 import pydantic
-from pydantic import
+from pydantic import model_validator
 from pydantic.fields import Field

 from datahub.configuration.common import (
@@ -73,14 +73,14 @@ class StatefulIngestionConfig(ConfigModel):
         description="If set to True, ignores the current checkpoint state.",
     )

-    @
-    def validate_config(
-        if
-            if
-
+    @model_validator(mode="after")
+    def validate_config(self) -> "StatefulIngestionConfig":
+        if self.enabled:
+            if self.state_provider is None:
+                self.state_provider = DynamicTypedStateProviderConfig(
                     type="datahub", config={}
                 )
-        return
+        return self


 CustomConfig = TypeVar("CustomConfig", bound=StatefulIngestionConfig)
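This hunk and the mixin hunks that follow migrate validators to pydantic v2's "after" style: the validator receives the constructed model as self, may adjust it, and must return self. A generic illustration with invented fields (not DataHub code):

from typing import Optional

from pydantic import BaseModel, model_validator


class StatefulConfigSketch(BaseModel):
    enabled: bool = False
    state_provider: Optional[str] = None

    @model_validator(mode="after")
    def _default_provider(self) -> "StatefulConfigSketch":
        if self.enabled and self.state_provider is None:
            self.state_provider = "datahub"  # fall back to the default provider
        return self


print(StatefulConfigSketch(enabled=True).state_provider)  # datahub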
@@ -110,17 +110,19 @@ class StatefulLineageConfigMixin(ConfigModel):
         "store_last_lineage_extraction_timestamp", "enable_stateful_lineage_ingestion"
     )

-    @
-    def lineage_stateful_option_validator(
-
-
-        if
-
-
-
-
-
-
+    @model_validator(mode="after")
+    def lineage_stateful_option_validator(self) -> "StatefulLineageConfigMixin":
+        try:
+            sti = getattr(self, "stateful_ingestion", None)
+            if not sti or not getattr(sti, "enabled", False):
+                if getattr(self, "enable_stateful_lineage_ingestion", False):
+                    logger.warning(
+                        "Stateful ingestion is disabled, disabling enable_stateful_lineage_ingestion config option as well"
+                    )
+                    self.enable_stateful_lineage_ingestion = False
+        except (AttributeError, RecursionError) as e:
+            logger.debug(f"Skipping stateful lineage validation due to: {e}")
+        return self


 class StatefulProfilingConfigMixin(ConfigModel):
@@ -135,16 +137,19 @@ class StatefulProfilingConfigMixin(ConfigModel):
         "store_last_profiling_timestamps", "enable_stateful_profiling"
     )

-    @
-    def profiling_stateful_option_validator(
-
-
-        if
-
-
-
-
-
+    @model_validator(mode="after")
+    def profiling_stateful_option_validator(self) -> "StatefulProfilingConfigMixin":
+        try:
+            sti = getattr(self, "stateful_ingestion", None)
+            if not sti or not getattr(sti, "enabled", False):
+                if getattr(self, "enable_stateful_profiling", False):
+                    logger.warning(
+                        "Stateful ingestion is disabled, disabling enable_stateful_profiling config option as well"
+                    )
+                    self.enable_stateful_profiling = False
+        except (AttributeError, RecursionError) as e:
+            logger.debug(f"Skipping stateful profiling validation due to: {e}")
+        return self


 class StatefulUsageConfigMixin(BaseTimeWindowConfig):
@@ -161,16 +166,21 @@ class StatefulUsageConfigMixin(BaseTimeWindowConfig):
         "store_last_usage_extraction_timestamp", "enable_stateful_usage_ingestion"
     )

-    @
-    def last_usage_extraction_stateful_option_validator(
-
-
-
-
-
-    )
-
-
+    @model_validator(mode="after")
+    def last_usage_extraction_stateful_option_validator(
+        self,
+    ) -> "StatefulUsageConfigMixin":
+        try:
+            sti = getattr(self, "stateful_ingestion", None)
+            if not sti or not getattr(sti, "enabled", False):
+                if getattr(self, "enable_stateful_usage_ingestion", False):
+                    logger.warning(
+                        "Stateful ingestion is disabled, disabling enable_stateful_usage_ingestion config option as well"
+                    )
+                    self.enable_stateful_usage_ingestion = False
+        except (AttributeError, RecursionError) as e:
+            logger.debug(f"Skipping stateful usage validation due to: {e}")
+        return self


 class StatefulTimeWindowConfigMixin(BaseTimeWindowConfig):
@@ -185,16 +195,16 @@ class StatefulTimeWindowConfigMixin(BaseTimeWindowConfig):
         "and queries together from a single audit log and uses a unified time window.",
     )

-    @
-    def time_window_stateful_option_validator(
-        sti =
-        if not sti or not sti
-            if
+    @model_validator(mode="after")
+    def time_window_stateful_option_validator(self) -> "StatefulTimeWindowConfigMixin":
+        sti = getattr(self, "stateful_ingestion", None)
+        if not sti or not getattr(sti, "enabled", False):
+            if getattr(self, "enable_stateful_time_window", False):
                 logger.warning(
                     "Stateful ingestion is disabled, disabling enable_stateful_time_window config option as well"
                 )
-
-        return
+                self.enable_stateful_time_window = False
+        return self


 @dataclass
--- a/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py
+++ b/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py
@@ -40,7 +40,7 @@ class DatahubIngestionCheckpointingProvider(IngestionCheckpointingProviderBase):
     def create(
         cls, config_dict: Dict[str, Any], ctx: PipelineContext
     ) -> "DatahubIngestionCheckpointingProvider":
-        config = DatahubIngestionStateProviderConfig.
+        config = DatahubIngestionStateProviderConfig.model_validate(config_dict)
         if config.datahub_api is not None:
             return cls(DataHubGraph(config.datahub_api))
         elif ctx.graph:
--- a/datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py
+++ b/datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py
@@ -32,7 +32,7 @@ class FileIngestionCheckpointingProvider(IngestionCheckpointingProviderBase):
     def create(
         cls, config_dict: Dict[str, Any], ctx: PipelineContext
     ) -> "FileIngestionCheckpointingProvider":
-        config = FileIngestionStateProviderConfig.
+        config = FileIngestionStateProviderConfig.model_validate(config_dict)
         return cls(config)

     def get_latest_checkpoint(
--- a/datahub/ingestion/source/superset.py
+++ b/datahub/ingestion/source/superset.py
@@ -9,7 +9,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 import dateutil.parser as dp
 import requests
 import sqlglot
-from pydantic import BaseModel,
+from pydantic import BaseModel, field_validator, model_validator
 from pydantic.fields import Field
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
@@ -246,16 +246,16 @@ class SupersetConfig(
         # This is required to allow preset configs to get parsed
         extra = "allow"

-    @
-
+    @field_validator("connect_uri", "display_uri", mode="after")
+    @classmethod
+    def remove_trailing_slash(cls, v: str) -> str:
         return config_clean.remove_trailing_slashes(v)

-    @
-    def default_display_uri_to_connect_uri(
-
-
-
-        return values
+    @model_validator(mode="after")
+    def default_display_uri_to_connect_uri(self) -> "SupersetConfig":
+        if self.display_uri is None:
+            self.display_uri = self.connect_uri
+        return self


 def get_metric_name(metric):
--- a/datahub/ingestion/source/tableau/tableau.py
+++ b/datahub/ingestion/source/tableau/tableau.py
@@ -25,7 +25,7 @@ from urllib.parse import quote, urlparse

 import dateutil.parser as dp
 import tableauserverclient as TSC
-from pydantic import
+from pydantic import field_validator, model_validator
 from pydantic.fields import Field
 from requests.adapters import HTTPAdapter
 from tableauserverclient import (
@@ -257,8 +257,9 @@ class TableauConnectionConfig(ConfigModel):
         description="When enabled, extracts column-level lineage from Tableau Datasources",
     )

-    @
-
+    @field_validator("connect_uri", mode="after")
+    @classmethod
+    def remove_trailing_slash(cls, v: str) -> str:
         return config_clean.remove_trailing_slashes(v)

     def get_tableau_auth(
@@ -652,8 +653,9 @@ class TableauConfig(
         "fetch_size",
     )

-    #
-    @
+    # mode = "before" because we want to take some decision before pydantic initialize the configuration to default values
+    @model_validator(mode="before")
+    @classmethod
     def projects_backward_compatibility(cls, values: Dict) -> Dict:
         # In-place update of the input dict would cause state contamination. This was discovered through test failures
         # in test_hex.py where the same dict is reused.
@@ -683,27 +685,23 @@ class TableauConfig(

         return values

-    @
-    def validate_config_values(
-        tags_for_hidden_assets = values.get("tags_for_hidden_assets")
-        ingest_tags = values.get("ingest_tags")
+    @model_validator(mode="after")
+    def validate_config_values(self) -> "TableauConfig":
         if (
-            not ingest_tags
-            and tags_for_hidden_assets
-            and len(tags_for_hidden_assets) > 0
+            not self.ingest_tags
+            and self.tags_for_hidden_assets
+            and len(self.tags_for_hidden_assets) > 0
         ):
             raise ValueError(
                 "tags_for_hidden_assets is only allowed with ingest_tags enabled. Be aware that this will overwrite tags entered from the UI."
             )

-        use_email_as_username
-        ingest_owner = values.get("ingest_owner")
-        if use_email_as_username and not ingest_owner:
+        if self.use_email_as_username and not self.ingest_owner:
             raise ValueError(
                 "use_email_as_username requires ingest_owner to be enabled."
             )

-        return
+        return self


 class WorkbookKey(ContainerKey):
--- /dev/null
+++ b/datahub/ingestion/source/unity/azure_auth_config.py
@@ -0,0 +1,15 @@
+from pydantic import Field, SecretStr
+
+from datahub.configuration import ConfigModel
+
+
+class AzureAuthConfig(ConfigModel):
+    client_secret: SecretStr = Field(
+        description="Azure application client secret used for authentication. This is a confidential credential that should be kept secure."
+    )
+    client_id: str = Field(
+        description="Azure application (client) ID. This is the unique identifier for the registered Azure AD application.",
+    )
+    tenant_id: str = Field(
+        description="Azure tenant (directory) ID. This identifies the Azure AD tenant where the application is registered.",
+    )
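A hypothetical usage of the new AzureAuthConfig (all values are placeholders): SecretStr keeps the client secret masked in reprs and logs, and the raw value is only available through get_secret_value().

from datahub.ingestion.source.unity.azure_auth_config import AzureAuthConfig

auth = AzureAuthConfig(
    client_id="00000000-0000-0000-0000-000000000000",
    tenant_id="11111111-1111-1111-1111-111111111111",
    client_secret="not-a-real-secret",
)
print(auth.client_secret)                     # shown masked, e.g. **********
print(auth.client_secret.get_secret_value())  # not-a-real-secret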