acryl-datahub 1.3.1__py3-none-any.whl → 1.3.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1.dist-info}/METADATA +2582 -2582
- {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1.dist-info}/RECORD +203 -201
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +2 -2
- datahub/api/entities/corpgroup/corpgroup.py +11 -6
- datahub/api/entities/corpuser/corpuser.py +11 -11
- datahub/api/entities/dataproduct/dataproduct.py +47 -27
- datahub/api/entities/dataset/dataset.py +32 -21
- datahub/api/entities/external/lake_formation_external_entites.py +5 -6
- datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
- datahub/api/entities/forms/forms.py +16 -14
- datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
- datahub/cli/check_cli.py +2 -2
- datahub/cli/config_utils.py +3 -3
- datahub/cli/lite_cli.py +9 -7
- datahub/cli/migrate.py +4 -4
- datahub/cli/quickstart_versioning.py +3 -3
- datahub/cli/specific/group_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +1 -1
- datahub/cli/specific/user_cli.py +1 -1
- datahub/configuration/common.py +14 -2
- datahub/configuration/connection_resolver.py +2 -2
- datahub/configuration/git.py +47 -30
- datahub/configuration/import_resolver.py +2 -2
- datahub/configuration/kafka.py +4 -3
- datahub/configuration/time_window_config.py +26 -26
- datahub/configuration/validate_field_deprecation.py +2 -2
- datahub/configuration/validate_field_removal.py +2 -2
- datahub/configuration/validate_field_rename.py +2 -2
- datahub/configuration/validate_multiline_string.py +2 -1
- datahub/emitter/kafka_emitter.py +3 -1
- datahub/emitter/rest_emitter.py +2 -4
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/report.py +1 -1
- datahub/ingestion/api/sink.py +1 -1
- datahub/ingestion/api/source.py +1 -1
- datahub/ingestion/glossary/datahub_classifier.py +11 -8
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
- datahub/ingestion/reporting/file_reporter.py +5 -4
- datahub/ingestion/run/pipeline.py +6 -6
- datahub/ingestion/run/pipeline_config.py +12 -14
- datahub/ingestion/run/sink_callback.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +6 -4
- datahub/ingestion/source/abs/config.py +19 -19
- datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/abs/source.py +2 -2
- datahub/ingestion/source/aws/aws_common.py +1 -1
- datahub/ingestion/source/aws/glue.py +6 -4
- datahub/ingestion/source/aws/sagemaker.py +1 -1
- datahub/ingestion/source/azure/azure_common.py +8 -12
- datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
- datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
- datahub/ingestion/source/datahub/config.py +8 -8
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
- datahub/ingestion/source/dbt/dbt_common.py +39 -37
- datahub/ingestion/source/dbt/dbt_core.py +10 -12
- datahub/ingestion/source/debug/datahub_debug.py +1 -1
- datahub/ingestion/source/delta_lake/config.py +6 -4
- datahub/ingestion/source/dremio/dremio_config.py +10 -6
- datahub/ingestion/source/dremio/dremio_source.py +15 -15
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/elastic_search.py +4 -3
- datahub/ingestion/source/excel/source.py +1 -1
- datahub/ingestion/source/feast.py +1 -1
- datahub/ingestion/source/file.py +5 -4
- datahub/ingestion/source/fivetran/config.py +17 -16
- datahub/ingestion/source/fivetran/fivetran.py +2 -2
- datahub/ingestion/source/gc/datahub_gc.py +1 -1
- datahub/ingestion/source/gcs/gcs_source.py +8 -10
- datahub/ingestion/source/ge_profiling_config.py +8 -5
- datahub/ingestion/source/grafana/grafana_api.py +2 -2
- datahub/ingestion/source/grafana/grafana_config.py +4 -3
- datahub/ingestion/source/grafana/grafana_source.py +1 -1
- datahub/ingestion/source/grafana/models.py +23 -5
- datahub/ingestion/source/hex/api.py +7 -5
- datahub/ingestion/source/hex/hex.py +4 -3
- datahub/ingestion/source/iceberg/iceberg.py +1 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +10 -10
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +7 -5
- datahub/ingestion/source/looker/looker_config.py +21 -20
- datahub/ingestion/source/looker/lookml_config.py +47 -47
- datahub/ingestion/source/metabase.py +8 -8
- datahub/ingestion/source/metadata/business_glossary.py +2 -2
- datahub/ingestion/source/metadata/lineage.py +13 -8
- datahub/ingestion/source/mlflow.py +1 -1
- datahub/ingestion/source/mode.py +6 -4
- datahub/ingestion/source/mongodb.py +4 -3
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +17 -23
- datahub/ingestion/source/openapi.py +6 -8
- datahub/ingestion/source/powerbi/config.py +33 -32
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
- datahub/ingestion/source/powerbi/powerbi.py +1 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
- datahub/ingestion/source/preset.py +8 -8
- datahub/ingestion/source/pulsar.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
- datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +18 -20
- datahub/ingestion/source/redshift/redshift.py +2 -2
- datahub/ingestion/source/redshift/usage.py +23 -3
- datahub/ingestion/source/s3/config.py +83 -62
- datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/s3/source.py +8 -5
- datahub/ingestion/source/sac/sac.py +5 -4
- datahub/ingestion/source/salesforce.py +3 -2
- datahub/ingestion/source/schema/json_schema.py +2 -2
- datahub/ingestion/source/sigma/data_classes.py +3 -2
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/sigma/sigma_api.py +7 -7
- datahub/ingestion/source/slack/slack.py +1 -1
- datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
- datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_queries.py +1 -1
- datahub/ingestion/source/sql/athena.py +1 -1
- datahub/ingestion/source/sql/clickhouse.py +4 -2
- datahub/ingestion/source/sql/cockroachdb.py +1 -1
- datahub/ingestion/source/sql/druid.py +1 -1
- datahub/ingestion/source/sql/hana.py +1 -1
- datahub/ingestion/source/sql/hive.py +7 -5
- datahub/ingestion/source/sql/hive_metastore.py +1 -1
- datahub/ingestion/source/sql/mssql/source.py +13 -6
- datahub/ingestion/source/sql/mysql.py +1 -1
- datahub/ingestion/source/sql/oracle.py +17 -10
- datahub/ingestion/source/sql/postgres.py +2 -2
- datahub/ingestion/source/sql/presto.py +1 -1
- datahub/ingestion/source/sql/sql_config.py +8 -9
- datahub/ingestion/source/sql/sql_generic.py +1 -1
- datahub/ingestion/source/sql/teradata.py +1 -1
- datahub/ingestion/source/sql/trino.py +1 -1
- datahub/ingestion/source/sql/vertica.py +5 -4
- datahub/ingestion/source/sql_queries.py +11 -8
- datahub/ingestion/source/state/checkpoint.py +2 -2
- datahub/ingestion/source/state/entity_removal_state.py +2 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +9 -9
- datahub/ingestion/source/tableau/tableau.py +14 -16
- datahub/ingestion/source/unity/azure_auth_config.py +15 -0
- datahub/ingestion/source/unity/config.py +51 -34
- datahub/ingestion/source/unity/connection.py +7 -1
- datahub/ingestion/source/unity/connection_test.py +1 -1
- datahub/ingestion/source/unity/proxy.py +216 -7
- datahub/ingestion/source/unity/proxy_types.py +91 -0
- datahub/ingestion/source/unity/source.py +29 -3
- datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
- datahub/ingestion/source/usage/usage_common.py +5 -3
- datahub/ingestion/source_config/csv_enricher.py +7 -6
- datahub/ingestion/source_config/operation_config.py +7 -4
- datahub/ingestion/source_config/pulsar.py +11 -15
- datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
- datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
- datahub/ingestion/transformer/add_dataset_properties.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
- datahub/ingestion/transformer/add_dataset_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
- datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
- datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
- datahub/ingestion/transformer/mark_dataset_status.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
- datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/replace_external_url.py +2 -2
- datahub/ingestion/transformer/set_browse_path.py +1 -1
- datahub/ingestion/transformer/tags_to_terms.py +1 -1
- datahub/lite/duckdb_lite.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/schema.avsc +7 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +6 -1
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/_all_entities.py +2 -0
- datahub/sdk/search_filters.py +68 -40
- datahub/sdk/tag.py +112 -0
- datahub/secret/datahub_secret_store.py +7 -4
- datahub/secret/file_secret_store.py +1 -1
- datahub/sql_parsing/sqlglot_lineage.py +5 -2
- datahub/testing/check_sql_parser_result.py +2 -2
- datahub/utilities/ingest_utils.py +1 -1
- {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/data_lake_common/path_spec.py

@@ -3,11 +3,11 @@ import logging
 import os
 import re
 from enum import Enum
-from typing import
+from typing import List, Optional, Tuple, Union

 import parse
-import pydantic
 from cached_property import cached_property
+from pydantic import field_validator, model_validator
 from pydantic.fields import Field
 from wcmatch import pathlib

@@ -65,7 +65,8 @@ class SortKey(ConfigModel):
         description="The date format to use when sorting. This is used to parse the date from the key. The format should follow the java [SimpleDateFormat](https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html) format.",
     )

-    @
+    @field_validator("date_format", mode="before")
+    @classmethod
     def convert_date_format_to_python_format(cls, v: Optional[str]) -> Optional[str]:
         if v is None:
             return None
@@ -86,7 +87,7 @@ class PathSpec(ConfigModel):
         arbitrary_types_allowed = True

     include: str = Field(
-        description="Path to table. Name variable `{table}` is used to mark the folder with dataset. In absence of `{table}`, file level dataset will be created. Check below examples for more details."
+        description="Path to table. Name variable `{table}` is used to mark the folder with dataset. In absence of `{table}`, file level dataset will be created. Check below examples for more details.",
     )
     exclude: Optional[List[str]] = Field(
         [],
@@ -260,20 +261,80 @@
     ) -> Union[None, parse.Result, parse.Match]:
         return self.compiled_folder_include.parse(path)

-    @
-    def
-
-
+    @model_validator(mode="after")
+    def validate_path_spec_comprehensive(self):
+        """
+        Comprehensive model validator that handles multiple interdependent validations.
+
+        Consolidates related validation logic to avoid order dependencies between multiple
+        model validators and ensures reliable cross-field validation. This approach is
+        preferred over multiple separate validators when:
+
+        1. Validations depend on multiple fields (e.g., sample_files depends on include)
+        2. One validation modifies a field that another validation checks
+        3. Field validators can't reliably access other field values or defaults
+        4. Order of execution between validators is important but undefined
+
+        By combining related validations, we ensure they execute in the correct sequence
+        and have access to all field values after Pydantic has processed defaults.
+        """
+        # Handle autodetect_partitions logic first
+        if self.autodetect_partitions:
+            include = self.include
+            if include.endswith("/"):
+                include = include[:-1]
+            if include.endswith("{table}"):
+                self.include = include + "/**"
+                # Allow double stars when we add them for autodetect_partitions
+                self.allow_double_stars = True
+
+        # Handle table_name logic
+        if self.table_name is None and "{table}" in self.include:
+            self.table_name = "{table}"
+        elif self.table_name is not None:
+            parsable_include = PathSpec.get_parsable_include(self.include)
+            compiled_include = parse.compile(parsable_include)
+            if not all(
+                x in compiled_include.named_fields
+                for x in parse.compile(self.table_name).named_fields
+            ):
+                raise ValueError(
+                    f"Not all named variables used in path_spec.table_name {self.table_name} are specified in path_spec.include {self.include}"
+                )
+
+        # Handle sample_files logic - turn off sampling for non-cloud URIs
+        is_s3 = is_s3_uri(self.include)
+        is_gcs = is_gcs_uri(self.include)
+        is_abs = is_abs_uri(self.include)
+        if not is_s3 and not is_gcs and not is_abs:
+            # Sampling only makes sense on s3 and gcs currently
+            self.sample_files = False
+
+        # Validate double stars
+        if "**" in self.include and not self.allow_double_stars:
+            raise ValueError("path_spec.include cannot contain '**'")

+        # Validate file extension
+        include_ext = os.path.splitext(self.include)[1].strip(".")
+        if not include_ext:
+            include_ext = (
+                "*"  # if no extension is provided, we assume all files are allowed
+            )
         if (
-
-            and
-            and not
+            include_ext not in self.file_types
+            and include_ext not in ["*", ""]
+            and not self.default_extension
+            and include_ext not in SUPPORTED_COMPRESSIONS
         ):
-            raise ValueError(
-
+            raise ValueError(
+                f"file type specified ({include_ext}) in path_spec.include is not in specified file "
+                f'types. Please select one from {self.file_types} or specify ".*" to allow all types'
+            )

-
+        return self
+
+    @field_validator("file_types", mode="before")
+    @classmethod
     def validate_file_types(cls, v: Optional[List[str]]) -> List[str]:
         if v is None:
             return SUPPORTED_FILE_TYPES
@@ -285,50 +346,24 @@ class PathSpec(ConfigModel):
             )
         return v

-    @
-
+    @field_validator("default_extension", mode="after")
+    @classmethod
+    def validate_default_extension(cls, v: Optional[str]) -> Optional[str]:
         if v is not None and v not in SUPPORTED_FILE_TYPES:
             raise ValueError(
                 f"default extension {v} not in supported default file extension. Please specify one from {SUPPORTED_FILE_TYPES}"
             )
         return v

-    @
-
-
-        is_gcs = is_gcs_uri(values.get("include") or "")
-        is_abs = is_abs_uri(values.get("include") or "")
-        if not is_s3 and not is_gcs and not is_abs:
-            # Sampling only makes sense on s3 and gcs currently
-            v = False
-        return v
-
-    @pydantic.validator("exclude", each_item=True)
-    def no_named_fields_in_exclude(cls, v: str) -> str:
-        if len(parse.compile(v).named_fields) != 0:
-            raise ValueError(
-                f"path_spec.exclude {v} should not contain any named variables"
-            )
-        return v
-
-    @pydantic.validator("table_name", always=True)
-    def table_name_in_include(cls, v, values):
-        if "include" not in values:
-            return v
-
-        parsable_include = PathSpec.get_parsable_include(values["include"])
-        compiled_include = parse.compile(parsable_include)
-
+    @field_validator("exclude", mode="after")
+    @classmethod
+    def no_named_fields_in_exclude(cls, v: Optional[List[str]]) -> Optional[List[str]]:
         if v is None:
-
-
-
-        if not all(
-            x in compiled_include.named_fields
-            for x in parse.compile(v).named_fields
-        ):
+            return v
+        for item in v:
+            if len(parse.compile(item).named_fields) != 0:
                 raise ValueError(
-                    f"
+                    f"path_spec.exclude {item} should not contain any named variables"
                 )
         return v

@@ -479,45 +514,6 @@ class PathSpec(ConfigModel):
         logger.debug(f"Setting _glob_include: {glob_include}")
         return glob_include

-    @pydantic.root_validator(skip_on_failure=True)
-    @staticmethod
-    def validate_path_spec(values: Dict) -> Dict[str, Any]:
-        # validate that main fields are populated
-        required_fields = ["include", "file_types", "default_extension"]
-        for f in required_fields:
-            if f not in values:
-                logger.debug(
-                    f"Failed to validate because {f} wasn't populated correctly"
-                )
-                return values
-
-        if values["include"] and values["autodetect_partitions"]:
-            include = values["include"]
-            if include.endswith("/"):
-                include = include[:-1]
-
-            if include.endswith("{table}"):
-                values["include"] = include + "/**"
-
-        include_ext = os.path.splitext(values["include"])[1].strip(".")
-        if not include_ext:
-            include_ext = (
-                "*"  # if no extension is provided, we assume all files are allowed
-            )
-
-        if (
-            include_ext not in values["file_types"]
-            and include_ext not in ["*", ""]
-            and not values["default_extension"]
-            and include_ext not in SUPPORTED_COMPRESSIONS
-        ):
-            raise ValueError(
-                f"file type specified ({include_ext}) in path_spec.include is not in specified file "
-                f'types. Please select one from {values.get("file_types")} or specify ".*" to allow all types'
-            )
-
-        return values
-
     def _extract_table_name(self, named_vars: dict) -> str:
         if self.table_name is None:
             raise ValueError("path_spec.table_name is not set")
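The path_spec.py hunks above fold several pydantic v1 validators (@pydantic.validator, @pydantic.root_validator) into one pydantic v2 @model_validator(mode="after") that runs on the fully constructed instance, so cross-field checks no longer depend on validator ordering or on a partially populated values dict. A minimal sketch of that pattern, with hypothetical field names rather than the real PathSpec model:

from typing import List, Optional

from pydantic import BaseModel, model_validator


class ExamplePathSpec(BaseModel):
    # Hypothetical fields; the real PathSpec has many more.
    include: str
    file_types: List[str] = ["csv", "parquet"]
    sample_files: bool = True
    table_name: Optional[str] = None

    @model_validator(mode="after")
    def validate_cross_field_rules(self) -> "ExamplePathSpec":
        # All fields, including defaults, are populated here, so interdependent
        # checks can run in one place and in a fixed order.
        if self.table_name is None and "{table}" in self.include:
            self.table_name = "{table}"
        if not self.include.startswith(("s3://", "gs://")):
            # In this sketch, sampling only makes sense for cloud URIs.
            self.sample_files = False
        return self

In v1 the same logic needed a root validator and careful ordering; the v2 "after" validator always sees the complete, default-filled model.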
datahub/ingestion/source/datahub/config.py

@@ -2,7 +2,7 @@ import os
 from typing import Optional, Set

 import pydantic
-from pydantic import Field,
+from pydantic import Field, model_validator

 from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.kafka import KafkaConsumerConnectionConfig
@@ -132,20 +132,20 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         default=True, description="Copy system metadata from the source system"
     )

-    @
-    def check_ingesting_data(
+    @model_validator(mode="after")
+    def check_ingesting_data(self):
         if (
-            not
-            and not
-            and not
+            not self.database_connection
+            and not self.kafka_connection
+            and not self.pull_from_datahub_api
         ):
             raise ValueError(
                 "Your current config will not ingest any data."
                 " Please specify at least one of `database_connection` or `kafka_connection`, ideally both."
             )
-        return
+        return self

-    @pydantic.
+    @pydantic.field_validator("database_connection")
     def validate_mysql_scheme(
         cls, v: SQLAlchemyConnectionConfig
     ) -> SQLAlchemyConnectionConfig:
datahub/ingestion/source/datahub/datahub_source.py

@@ -62,7 +62,7 @@ class DataHubSource(StatefulIngestionSourceBase):

     @classmethod
     def create(cls, config_dict: Dict, ctx: PipelineContext) -> "DataHubSource":
-        config: DataHubSourceConfig = DataHubSourceConfig.
+        config: DataHubSourceConfig = DataHubSourceConfig.model_validate(config_dict)
         return cls(config, ctx)

     def get_report(self) -> SourceReport:
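The create classmethod above switches config construction to model_validate, pydantic v2's replacement for the v1 parse_obj entry point; the dbt and debug sources below make the same change. A small sketch with a hypothetical config class:

from pydantic import BaseModel


class ExampleSourceConfig(BaseModel):
    # Illustrative fields only.
    server: str
    timeout_sec: int = 30


config_dict = {"server": "http://localhost:8080"}

# pydantic v2: validate a plain dict into a typed config instance.
config = ExampleSourceConfig.model_validate(config_dict)
assert config.timeout_sec == 30  # defaults are applied during validation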
datahub/ingestion/source/dbt/dbt_cloud.py

@@ -1,4 +1,5 @@
 import logging
+from copy import deepcopy
 from datetime import datetime
 from json import JSONDecodeError
 from typing import Dict, List, Literal, Optional, Tuple
@@ -6,7 +7,7 @@ from urllib.parse import urlparse

 import dateutil.parser
 import requests
-from pydantic import Field,
+from pydantic import Field, model_validator

 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -68,8 +69,13 @@ class DBTCloudConfig(DBTCommonConfig):
         description='Where should the "View in dbt" link point to - either the "Explore" UI or the dbt Cloud IDE',
     )

-    @
+    @model_validator(mode="before")
+    @classmethod
     def set_metadata_endpoint(cls, values: dict) -> dict:
+        # In-place update of the input dict would cause state contamination.
+        # So a deepcopy is performed first.
+        values = deepcopy(values)
+
         if values.get("access_url") and not values.get("metadata_endpoint"):
             metadata_endpoint = infer_metadata_endpoint(values["access_url"])
             if metadata_endpoint is None:
@@ -271,7 +277,7 @@ class DBTCloudSource(DBTSourceBase, TestableSource):

     @classmethod
     def create(cls, config_dict, ctx):
-        config = DBTCloudConfig.
+        config = DBTCloudConfig.model_validate(config_dict)
         return cls(config, ctx)

     @staticmethod
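The set_metadata_endpoint hunk above copies the raw input dict before mutating it: a mode="before" model validator receives the dict the caller passed in, so in-place edits would leak back into the caller's object (the "state contamination" the new comment refers to). A minimal sketch of the same precaution, using hypothetical names:

from copy import deepcopy
from typing import Optional

from pydantic import BaseModel, model_validator


class ExampleCloudConfig(BaseModel):
    # Hypothetical fields, not the real DBTCloudConfig.
    access_url: str
    metadata_endpoint: Optional[str] = None

    @model_validator(mode="before")
    @classmethod
    def derive_metadata_endpoint(cls, values: dict) -> dict:
        # Work on a copy so the caller's dict is left untouched.
        values = deepcopy(values)
        if values.get("access_url") and not values.get("metadata_endpoint"):
            values["metadata_endpoint"] = values["access_url"].rstrip("/") + "/api"
        return values


raw = {"access_url": "https://cloud.example.com"}
cfg = ExampleCloudConfig.model_validate(raw)
assert "metadata_endpoint" not in raw  # the original input is unchanged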
datahub/ingestion/source/dbt/dbt_common.py

@@ -1,6 +1,7 @@
 import logging
 import re
 from abc import abstractmethod
+from copy import deepcopy
 from dataclasses import dataclass, field
 from datetime import datetime
 from enum import auto
@@ -8,7 +9,7 @@ from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union

 import more_itertools
 import pydantic
-from pydantic import
+from pydantic import field_validator, model_validator
 from pydantic.fields import Field

 from datahub.api.entities.dataprocess.dataprocess_instance import (
@@ -194,22 +195,26 @@ class DBTEntitiesEnabled(ConfigModel):
         "Only supported with dbt core.",
     )

-    @
-    def process_only_directive(
+    @model_validator(mode="after")
+    def process_only_directive(self) -> "DBTEntitiesEnabled":
         # Checks that at most one is set to ONLY, and then sets the others to NO.
-
-
+        only_values = [
+            k for k, v in self.model_dump().items() if v == EmitDirective.ONLY
+        ]
         if len(only_values) > 1:
             raise ValueError(
                 f"Cannot have more than 1 type of entity emission set to ONLY. Found {only_values}"
             )

         if len(only_values) == 1:
-
-
-
+            # Set all fields to NO first
+            for field_name in self.model_dump():
+                setattr(self, field_name, EmitDirective.NO)

-
+            # Set the ONLY one to YES
+            setattr(self, only_values[0], EmitDirective.YES)
+
+        return self

     def _node_type_allow_map(self):
         # Node type comes from dbt's node types.
@@ -412,7 +417,8 @@ class DBTCommonConfig(
         "This ensures that lineage is generated reliably, but will lose any documentation associated only with the source.",
     )

-    @
+    @field_validator("target_platform", mode="after")
+    @classmethod
     def validate_target_platform_value(cls, target_platform: str) -> str:
         if target_platform.lower() == DBT_PLATFORM:
             raise ValueError(
@@ -421,15 +427,21 @@ class DBTCommonConfig(
         )
         return target_platform

-    @
+    @model_validator(mode="before")
+    @classmethod
     def set_convert_column_urns_to_lowercase_default_for_snowflake(
         cls, values: dict
     ) -> dict:
+        # In-place update of the input dict would cause state contamination.
+        # So a deepcopy is performed first.
+        values = deepcopy(values)
+
         if values.get("target_platform", "").lower() == "snowflake":
             values.setdefault("convert_column_urns_to_lowercase", True)
         return values

-    @
+    @field_validator("write_semantics", mode="after")
+    @classmethod
     def validate_write_semantics(cls, write_semantics: str) -> str:
         if write_semantics.lower() not in {"patch", "override"}:
             raise ValueError(
@@ -439,10 +451,9 @@ class DBTCommonConfig(
         )
         return write_semantics

-    @
-
-
-    ) -> Dict[str, Any]:
+    @field_validator("meta_mapping", mode="after")
+    @classmethod
+    def meta_mapping_validator(cls, meta_mapping: Dict[str, Any]) -> Dict[str, Any]:
         for k, v in meta_mapping.items():
             if "match" not in v:
                 raise ValueError(
@@ -458,44 +469,35 @@ class DBTCommonConfig(
             mce_builder.validate_ownership_type(owner_category)
         return meta_mapping

-    @
-    def validate_include_column_lineage(
-
-    ) -> bool:
-        if include_column_lineage and not values.get("infer_dbt_schemas"):
+    @model_validator(mode="after")
+    def validate_include_column_lineage(self) -> "DBTCommonConfig":
+        if self.include_column_lineage and not self.infer_dbt_schemas:
             raise ValueError(
                 "`infer_dbt_schemas` must be enabled to use `include_column_lineage`"
             )

-        return
-
-    @validator("skip_sources_in_lineage", always=True)
-    def validate_skip_sources_in_lineage(
-        cls, skip_sources_in_lineage: bool, values: Dict
-    ) -> bool:
-        entities_enabled: Optional[DBTEntitiesEnabled] = values.get("entities_enabled")
-        prefer_sql_parser_lineage: Optional[bool] = values.get(
-            "prefer_sql_parser_lineage"
-        )
+        return self

-
+    @model_validator(mode="after")
+    def validate_skip_sources_in_lineage(self) -> "DBTCommonConfig":
+        if self.prefer_sql_parser_lineage and not self.skip_sources_in_lineage:
             raise ValueError(
                 "`prefer_sql_parser_lineage` requires that `skip_sources_in_lineage` is enabled."
             )

         if (
-            skip_sources_in_lineage
-            and entities_enabled
-            and entities_enabled.sources == EmitDirective.YES
+            self.skip_sources_in_lineage
+            and self.entities_enabled
+            and self.entities_enabled.sources == EmitDirective.YES
             # When `prefer_sql_parser_lineage` is enabled, it's ok to have `skip_sources_in_lineage` enabled
             # without also disabling sources.
-            and not prefer_sql_parser_lineage
+            and not self.prefer_sql_parser_lineage
         ):
             raise ValueError(
                 "When `skip_sources_in_lineage` is enabled, `entities_enabled.sources` must be set to NO."
             )

-        return
+        return self


 @dataclass
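Most single-field checks in dbt_common.py follow the same mechanical migration: @pydantic.validator("field") becomes @field_validator("field", mode="after") stacked on @classmethod, still receiving and returning the field value. A short illustrative sketch (made-up model, mirroring validate_write_semantics above):

from pydantic import BaseModel, field_validator


class ExampleDbtConfig(BaseModel):
    write_semantics: str = "PATCH"

    @field_validator("write_semantics", mode="after")
    @classmethod
    def check_write_semantics(cls, v: str) -> str:
        # Runs after the field itself has been parsed as a str.
        if v.lower() not in {"patch", "override"}:
            raise ValueError("write_semantics must be PATCH or OVERRIDE")
        return v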
datahub/ingestion/source/dbt/dbt_core.py

@@ -9,7 +9,7 @@ from urllib.parse import urlparse
 import dateutil.parser
 import requests
 from packaging import version
-from pydantic import BaseModel, Field,
+from pydantic import BaseModel, Field, model_validator

 from datahub.configuration.git import GitReference
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
@@ -99,26 +99,24 @@ class DBTCoreConfig(DBTCommonConfig):

     _github_info_deprecated = pydantic_renamed_field("github_info", "git_info")

-    @
-    def aws_connection_needed_if_s3_uris_present(
-        cls, aws_connection: Optional[AwsConnectionConfig], values: Dict, **kwargs: Any
-    ) -> Optional[AwsConnectionConfig]:
+    @model_validator(mode="after")
+    def aws_connection_needed_if_s3_uris_present(self) -> "DBTCoreConfig":
         # first check if there are fields that contain s3 uris
         uris = [
-
+            getattr(self, f, None)
             for f in [
                 "manifest_path",
                 "catalog_path",
                 "sources_path",
             ]
-        ] +
+        ] + (self.run_results_paths or [])
         s3_uris = [uri for uri in uris if is_s3_uri(uri or "")]

-        if s3_uris and aws_connection is None:
+        if s3_uris and self.aws_connection is None:
             raise ValueError(
                 f"Please provide aws_connection configuration, since s3 uris have been provided {s3_uris}"
             )
-        return
+        return self


 def get_columns(
@@ -426,13 +424,13 @@ def load_run_results(
         )
         return all_nodes

-    dbt_metadata = DBTRunMetadata.
+    dbt_metadata = DBTRunMetadata.model_validate(test_results_json.get("metadata", {}))

     all_nodes_map: Dict[str, DBTNode] = {x.dbt_name: x for x in all_nodes}

     results = test_results_json.get("results", [])
     for result in results:
-        run_result = DBTRunResult.
+        run_result = DBTRunResult.model_validate(result)
         id = run_result.unique_id

         if id.startswith("test."):
@@ -477,7 +475,7 @@ class DBTCoreSource(DBTSourceBase, TestableSource):

     @classmethod
     def create(cls, config_dict, ctx):
-        config = DBTCoreConfig.
+        config = DBTCoreConfig.model_validate(config_dict)
         return cls(config, ctx)

     @staticmethod
datahub/ingestion/source/debug/datahub_debug.py

@@ -46,7 +46,7 @@ class DataHubDebugSource(Source):

     @classmethod
     def create(cls, config_dict, ctx):
-        config = DataHubDebugSourceConfig.
+        config = DataHubDebugSourceConfig.model_validate(config_dict)
         return cls(ctx, config)

     def perform_dns_probe(self, url: str) -> None:
datahub/ingestion/source/delta_lake/config.py

@@ -1,9 +1,8 @@
 import logging
 from typing import Optional

-import pydantic
 from cached_property import cached_property
-from pydantic import Field
+from pydantic import Field, field_validator
 from typing_extensions import Literal

 from datahub.configuration.common import AllowDenyPattern, ConfigModel
@@ -98,8 +97,11 @@ class DeltaLakeSourceConfig(

         return complete_path

-    @
-
+    @field_validator("version_history_lookback", mode="after")
+    @classmethod
+    def negative_version_history_implies_no_limit(
+        cls, v: Optional[int]
+    ) -> Optional[int]:
         if v and v < 0:
             return None
         return v
datahub/ingestion/source/dremio/dremio_config.py

@@ -2,7 +2,7 @@ import os
 from typing import List, Literal, Optional

 import certifi
-from pydantic import Field,
+from pydantic import Field, ValidationInfo, field_validator

 from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
@@ -78,8 +78,9 @@ class DremioConnectionConfig(ConfigModel):
         description="ID of Dremio Cloud Project. Found in Project Settings in the Dremio Cloud UI",
     )

-    @
-
+    @field_validator("authentication_method", mode="after")
+    @classmethod
+    def validate_auth_method(cls, value: str) -> str:
         allowed_methods = ["password", "PAT"]
         if value not in allowed_methods:
             raise ValueError(
@@ -87,9 +88,12 @@ class DremioConnectionConfig(ConfigModel):
             )
         return value

-    @
-
-
+    @field_validator("password", mode="after")
+    @classmethod
+    def validate_password(
+        cls, value: Optional[str], info: ValidationInfo
+    ) -> Optional[str]:
+        if info.data.get("authentication_method") == "PAT" and not value:
             raise ValueError(
                 "Password (Personal Access Token) is required when using PAT authentication",
             )