acryl-datahub 1.3.1__py3-none-any.whl → 1.3.1.1rc1__py3-none-any.whl

This diff shows the contents of publicly available package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.

Note: this version of acryl-datahub has been flagged as a potentially problematic release.

Files changed (193)
  1. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/METADATA +2501 -2501
  2. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/RECORD +193 -193
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +2 -2
  5. datahub/api/entities/corpgroup/corpgroup.py +11 -6
  6. datahub/api/entities/corpuser/corpuser.py +11 -11
  7. datahub/api/entities/dataproduct/dataproduct.py +47 -27
  8. datahub/api/entities/dataset/dataset.py +32 -21
  9. datahub/api/entities/external/lake_formation_external_entites.py +5 -6
  10. datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
  11. datahub/api/entities/forms/forms.py +16 -14
  12. datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
  13. datahub/cli/check_cli.py +2 -2
  14. datahub/cli/config_utils.py +3 -3
  15. datahub/cli/lite_cli.py +9 -7
  16. datahub/cli/migrate.py +4 -4
  17. datahub/cli/quickstart_versioning.py +3 -3
  18. datahub/cli/specific/group_cli.py +1 -1
  19. datahub/cli/specific/structuredproperties_cli.py +1 -1
  20. datahub/cli/specific/user_cli.py +1 -1
  21. datahub/configuration/common.py +14 -2
  22. datahub/configuration/connection_resolver.py +2 -2
  23. datahub/configuration/git.py +47 -30
  24. datahub/configuration/import_resolver.py +2 -2
  25. datahub/configuration/kafka.py +4 -3
  26. datahub/configuration/time_window_config.py +26 -26
  27. datahub/configuration/validate_field_deprecation.py +2 -2
  28. datahub/configuration/validate_field_removal.py +2 -2
  29. datahub/configuration/validate_field_rename.py +2 -2
  30. datahub/configuration/validate_multiline_string.py +2 -1
  31. datahub/emitter/kafka_emitter.py +3 -1
  32. datahub/emitter/rest_emitter.py +2 -4
  33. datahub/ingestion/api/decorators.py +1 -1
  34. datahub/ingestion/api/report.py +1 -1
  35. datahub/ingestion/api/sink.py +1 -1
  36. datahub/ingestion/api/source.py +1 -1
  37. datahub/ingestion/glossary/datahub_classifier.py +11 -8
  38. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
  39. datahub/ingestion/reporting/file_reporter.py +5 -4
  40. datahub/ingestion/run/pipeline.py +6 -6
  41. datahub/ingestion/run/pipeline_config.py +12 -14
  42. datahub/ingestion/run/sink_callback.py +1 -1
  43. datahub/ingestion/sink/datahub_rest.py +6 -4
  44. datahub/ingestion/source/abs/config.py +19 -19
  45. datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
  46. datahub/ingestion/source/abs/source.py +2 -2
  47. datahub/ingestion/source/aws/aws_common.py +1 -1
  48. datahub/ingestion/source/aws/glue.py +6 -4
  49. datahub/ingestion/source/aws/sagemaker.py +1 -1
  50. datahub/ingestion/source/azure/azure_common.py +8 -12
  51. datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
  52. datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
  53. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
  54. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  55. datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
  56. datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
  57. datahub/ingestion/source/datahub/config.py +8 -8
  58. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  59. datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
  60. datahub/ingestion/source/dbt/dbt_common.py +39 -37
  61. datahub/ingestion/source/dbt/dbt_core.py +10 -12
  62. datahub/ingestion/source/debug/datahub_debug.py +1 -1
  63. datahub/ingestion/source/delta_lake/config.py +6 -4
  64. datahub/ingestion/source/dremio/dremio_config.py +10 -6
  65. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  66. datahub/ingestion/source/elastic_search.py +4 -3
  67. datahub/ingestion/source/excel/source.py +1 -1
  68. datahub/ingestion/source/feast.py +1 -1
  69. datahub/ingestion/source/file.py +5 -4
  70. datahub/ingestion/source/fivetran/config.py +17 -16
  71. datahub/ingestion/source/fivetran/fivetran.py +2 -2
  72. datahub/ingestion/source/gc/datahub_gc.py +1 -1
  73. datahub/ingestion/source/gcs/gcs_source.py +8 -10
  74. datahub/ingestion/source/ge_profiling_config.py +8 -5
  75. datahub/ingestion/source/grafana/grafana_api.py +2 -2
  76. datahub/ingestion/source/grafana/grafana_config.py +4 -3
  77. datahub/ingestion/source/grafana/grafana_source.py +1 -1
  78. datahub/ingestion/source/grafana/models.py +23 -5
  79. datahub/ingestion/source/hex/api.py +7 -5
  80. datahub/ingestion/source/hex/hex.py +4 -3
  81. datahub/ingestion/source/iceberg/iceberg.py +1 -1
  82. datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
  83. datahub/ingestion/source/identity/azure_ad.py +1 -1
  84. datahub/ingestion/source/identity/okta.py +10 -10
  85. datahub/ingestion/source/kafka/kafka.py +1 -1
  86. datahub/ingestion/source/ldap.py +1 -1
  87. datahub/ingestion/source/looker/looker_common.py +7 -5
  88. datahub/ingestion/source/looker/looker_config.py +21 -20
  89. datahub/ingestion/source/looker/lookml_config.py +47 -47
  90. datahub/ingestion/source/metabase.py +8 -8
  91. datahub/ingestion/source/metadata/business_glossary.py +2 -2
  92. datahub/ingestion/source/metadata/lineage.py +13 -8
  93. datahub/ingestion/source/mlflow.py +1 -1
  94. datahub/ingestion/source/mode.py +6 -4
  95. datahub/ingestion/source/mongodb.py +4 -3
  96. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  97. datahub/ingestion/source/nifi.py +17 -23
  98. datahub/ingestion/source/openapi.py +6 -8
  99. datahub/ingestion/source/powerbi/config.py +33 -32
  100. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
  101. datahub/ingestion/source/powerbi/powerbi.py +1 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
  103. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
  104. datahub/ingestion/source/preset.py +8 -8
  105. datahub/ingestion/source/pulsar.py +1 -1
  106. datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
  107. datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
  108. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  109. datahub/ingestion/source/redshift/config.py +18 -20
  110. datahub/ingestion/source/redshift/redshift.py +2 -2
  111. datahub/ingestion/source/redshift/usage.py +23 -3
  112. datahub/ingestion/source/s3/config.py +83 -62
  113. datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
  114. datahub/ingestion/source/s3/source.py +8 -5
  115. datahub/ingestion/source/sac/sac.py +5 -4
  116. datahub/ingestion/source/salesforce.py +3 -2
  117. datahub/ingestion/source/schema/json_schema.py +2 -2
  118. datahub/ingestion/source/sigma/data_classes.py +3 -2
  119. datahub/ingestion/source/sigma/sigma.py +1 -1
  120. datahub/ingestion/source/sigma/sigma_api.py +7 -7
  121. datahub/ingestion/source/slack/slack.py +1 -1
  122. datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
  123. datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
  124. datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
  125. datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
  126. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
  127. datahub/ingestion/source/snowflake/snowflake_queries.py +1 -1
  128. datahub/ingestion/source/sql/athena.py +1 -1
  129. datahub/ingestion/source/sql/clickhouse.py +4 -2
  130. datahub/ingestion/source/sql/cockroachdb.py +1 -1
  131. datahub/ingestion/source/sql/druid.py +1 -1
  132. datahub/ingestion/source/sql/hana.py +1 -1
  133. datahub/ingestion/source/sql/hive.py +7 -5
  134. datahub/ingestion/source/sql/hive_metastore.py +1 -1
  135. datahub/ingestion/source/sql/mssql/source.py +13 -6
  136. datahub/ingestion/source/sql/mysql.py +1 -1
  137. datahub/ingestion/source/sql/oracle.py +17 -10
  138. datahub/ingestion/source/sql/postgres.py +2 -2
  139. datahub/ingestion/source/sql/presto.py +1 -1
  140. datahub/ingestion/source/sql/sql_config.py +8 -9
  141. datahub/ingestion/source/sql/sql_generic.py +1 -1
  142. datahub/ingestion/source/sql/teradata.py +1 -1
  143. datahub/ingestion/source/sql/trino.py +1 -1
  144. datahub/ingestion/source/sql/vertica.py +5 -4
  145. datahub/ingestion/source/sql_queries.py +11 -8
  146. datahub/ingestion/source/state/checkpoint.py +2 -2
  147. datahub/ingestion/source/state/entity_removal_state.py +2 -1
  148. datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
  149. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
  150. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  151. datahub/ingestion/source/superset.py +9 -9
  152. datahub/ingestion/source/tableau/tableau.py +14 -16
  153. datahub/ingestion/source/unity/config.py +33 -34
  154. datahub/ingestion/source/unity/proxy.py +203 -0
  155. datahub/ingestion/source/unity/proxy_types.py +91 -0
  156. datahub/ingestion/source/unity/source.py +27 -2
  157. datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
  158. datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
  159. datahub/ingestion/source/usage/usage_common.py +5 -3
  160. datahub/ingestion/source_config/csv_enricher.py +7 -6
  161. datahub/ingestion/source_config/operation_config.py +7 -4
  162. datahub/ingestion/source_config/pulsar.py +11 -15
  163. datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
  164. datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
  165. datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
  166. datahub/ingestion/transformer/add_dataset_properties.py +2 -2
  167. datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
  168. datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
  169. datahub/ingestion/transformer/add_dataset_tags.py +3 -3
  170. datahub/ingestion/transformer/add_dataset_terms.py +3 -3
  171. datahub/ingestion/transformer/dataset_domain.py +3 -3
  172. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
  173. datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
  174. datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
  175. datahub/ingestion/transformer/mark_dataset_status.py +1 -1
  176. datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
  177. datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
  178. datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
  179. datahub/ingestion/transformer/replace_external_url.py +2 -2
  180. datahub/ingestion/transformer/set_browse_path.py +1 -1
  181. datahub/ingestion/transformer/tags_to_terms.py +1 -1
  182. datahub/lite/duckdb_lite.py +1 -1
  183. datahub/lite/lite_util.py +2 -2
  184. datahub/sdk/search_filters.py +68 -40
  185. datahub/secret/datahub_secret_store.py +7 -4
  186. datahub/secret/file_secret_store.py +1 -1
  187. datahub/sql_parsing/sqlglot_lineage.py +5 -2
  188. datahub/testing/check_sql_parser_result.py +2 -2
  189. datahub/utilities/ingest_utils.py +1 -1
  190. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/WHEEL +0 -0
  191. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/entry_points.txt +0 -0
  192. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/licenses/LICENSE +0 -0
  193. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/top_level.txt +0 -0

datahub/ingestion/run/pipeline.py

@@ -215,7 +215,7 @@ class Pipeline:
  sink_class = sink_registry.get(self.sink_type)

  with _add_init_error_context(f"configure the sink ({self.sink_type})"):
- sink_config = self.config.sink.dict().get("config") or {}
+ sink_config = self.config.sink.model_dump().get("config") or {}
  self.sink = exit_stack.enter_context(
  sink_class.create(sink_config, self.ctx)
  )
@@ -245,7 +245,7 @@ class Pipeline:
  ):
  self.source = inner_exit_stack.enter_context(
  source_class.create(
- self.config.source.dict().get("config", {}), self.ctx
+ self.config.source.model_dump().get("config", {}), self.ctx
  )
  )
  logger.debug(
@@ -288,7 +288,7 @@ class Pipeline:
  for transformer in self.config.transformers:
  transformer_type = transformer.type
  transformer_class = transform_registry.get(transformer_type)
- transformer_config = transformer.dict().get("config", {})
+ transformer_config = transformer.model_dump().get("config", {})
  self.transformers.append(
  transformer_class.create(transformer_config, self.ctx)
  )
@@ -310,12 +310,12 @@ class Pipeline:
  reporter.type for reporter in self.config.reporting
  ]:
  self.config.reporting.append(
- ReporterConfig.parse_obj({"type": "datahub"})
+ ReporterConfig.model_validate({"type": "datahub"})
  )
  elif report_to:
  # we assume this is a file name, and add the file reporter
  self.config.reporting.append(
- ReporterConfig.parse_obj(
+ ReporterConfig.model_validate(
  {"type": "file", "config": {"filename": report_to}}
  )
  )
@@ -323,7 +323,7 @@ class Pipeline:
  for reporter in self.config.reporting:
  reporter_type = reporter.type
  reporter_class = reporting_provider_registry.get(reporter_type)
- reporter_config_dict = reporter.dict().get("config", {})
+ reporter_config_dict = reporter.model_dump().get("config", {})
  try:
  self.reporters.append(
  reporter_class.create(
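
The changes to pipeline.py are mechanical renames from the Pydantic v1 API to its v2 equivalents: .dict() becomes .model_dump() and .parse_obj() becomes .model_validate(). A minimal sketch of the pattern, using a hypothetical SinkEntry model rather than the package's real DynamicTypedConfig classes:

from pydantic import BaseModel

class SinkEntry(BaseModel):
    # Hypothetical stand-in for the typed sink/source config entries.
    type: str
    config: dict = {}

# v1: SinkEntry.parse_obj(...) / entry.dict()
# v2: SinkEntry.model_validate(...) / entry.model_dump()
entry = SinkEntry.model_validate(
    {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}}
)
assert entry.model_dump().get("config") == {"server": "http://localhost:8080"}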

datahub/ingestion/run/pipeline_config.py

@@ -2,9 +2,9 @@ import datetime
  import logging
  import random
  import string
- from typing import Any, Dict, List, Optional
+ from typing import Dict, List, Optional

- from pydantic import Field, validator
+ from pydantic import Field, model_validator

  from datahub.configuration.common import ConfigModel, DynamicTypedConfig, HiddenFromDocs
  from datahub.ingestion.graph.config import DatahubClientConfig
@@ -96,30 +96,28 @@ class PipelineConfig(ConfigModel):
  None # the raw dict that was parsed to construct this config
  )

- @validator("run_id", pre=True, always=True)
- def run_id_should_be_semantic(
- cls, v: Optional[str], values: Dict[str, Any], **kwargs: Any
- ) -> str:
- if v == DEFAULT_RUN_ID:
+ @model_validator(mode="after")
+ def run_id_should_be_semantic(self) -> "PipelineConfig":
+ if self.run_id == DEFAULT_RUN_ID:
  source_type = None
- if "source" in values and hasattr(values["source"], "type"):
- source_type = values["source"].type
+ if hasattr(self.source, "type"):
+ source_type = self.source.type

- return _generate_run_id(source_type)
+ self.run_id = _generate_run_id(source_type)
  else:
- assert v is not None
- return v
+ assert self.run_id is not None
+ return self

  @classmethod
  def from_dict(
  cls, resolved_dict: dict, raw_dict: Optional[dict] = None
  ) -> "PipelineConfig":
- config = cls.parse_obj(resolved_dict)
+ config = cls.model_validate(resolved_dict)
  config._raw_dict = raw_dict
  return config

  def get_raw_dict(self) -> Dict:
  result = self._raw_dict
  if result is None:
- result = self.dict()
+ result = self.model_dump()
  return result
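
The run_id validator shows the larger migration pattern in this release: Pydantic v1 validators that received a values dict are rewritten as v2 model_validator(mode="after") methods, which run on the fully constructed instance, may mutate its fields, and must return self. A rough standalone sketch of that shape (the run-id format below is illustrative, not the package's _generate_run_id logic):

from pydantic import BaseModel, model_validator

DEFAULT_RUN_ID = "__DEFAULT_RUN_ID"

class PipelineConfigSketch(BaseModel):
    # Hypothetical, simplified stand-in for PipelineConfig.
    source_type: str
    run_id: str = DEFAULT_RUN_ID

    @model_validator(mode="after")
    def run_id_should_be_semantic(self) -> "PipelineConfigSketch":
        # Runs after field validation on the built instance; mutate and return self.
        if self.run_id == DEFAULT_RUN_ID:
            self.run_id = f"{self.source_type}-generated"  # illustrative format
        return self

assert PipelineConfigSketch(source_type="mysql").run_id == "mysql-generated"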

datahub/ingestion/run/sink_callback.py

@@ -39,7 +39,7 @@ class LoggingCallback(WriteCallback):
  class DeadLetterQueueCallback(WriteCallback, Closeable):
  def __init__(self, ctx: PipelineContext, config: Optional[FileSinkConfig]) -> None:
  if not config:
- config = FileSinkConfig.parse_obj({"filename": "failed_events.json"})
+ config = FileSinkConfig.model_validate({"filename": "failed_events.json"})
  self.file_sink: FileSink = FileSink(ctx, config)
  self.file_sink_lock = threading.Lock()
  self.logging_callback = LoggingCallback(name="failure-queue")

datahub/ingestion/sink/datahub_rest.py

@@ -9,6 +9,7 @@ from enum import auto
  from typing import List, Optional, Tuple, Union

  import pydantic
+ from pydantic import field_validator

  from datahub.configuration.common import (
  ConfigEnum,
@@ -63,8 +64,8 @@ class RestSinkMode(ConfigEnum):
  ASYNC_BATCH = auto()


- _DEFAULT_REST_SINK_MODE = pydantic.parse_obj_as(
- RestSinkMode, get_rest_sink_default_mode() or RestSinkMode.ASYNC_BATCH
+ _DEFAULT_REST_SINK_MODE = pydantic.TypeAdapter(RestSinkMode).validate_python(
+ get_rest_sink_default_mode() or RestSinkMode.ASYNC_BATCH
  )


@@ -80,8 +81,9 @@ class DatahubRestSinkConfig(DatahubClientConfig):
  # Only applies in async batch mode.
  max_per_batch: pydantic.PositiveInt = 100

- @pydantic.validator("max_per_batch", always=True)
- def validate_max_per_batch(cls, v):
+ @field_validator("max_per_batch", mode="before")
+ @classmethod
+ def validate_max_per_batch(cls, v: int) -> int:
  if v > BATCH_INGEST_MAX_PAYLOAD_LENGTH:
  raise ValueError(
  f"max_per_batch must be less than or equal to {BATCH_INGEST_MAX_PAYLOAD_LENGTH}"
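
datahub_rest.py also replaces the removed v1 helper pydantic.parse_obj_as with the v2 TypeAdapter API, which validates arbitrary Python values against a type. A small sketch with a stand-in enum (the real RestSinkMode is a ConfigEnum built with auto()):

from enum import Enum

from pydantic import TypeAdapter

class RestSinkModeSketch(str, Enum):
    # Illustrative stand-in enum with explicit string values.
    SYNC = "SYNC"
    ASYNC = "ASYNC"
    ASYNC_BATCH = "ASYNC_BATCH"

# v1: pydantic.parse_obj_as(RestSinkModeSketch, "ASYNC_BATCH")
# v2: build a TypeAdapter for the type, then validate plain Python values against it.
adapter = TypeAdapter(RestSinkModeSketch)
assert adapter.validate_python("ASYNC_BATCH") is RestSinkModeSketch.ASYNC_BATCH
assert adapter.validate_python(RestSinkModeSketch.ASYNC) is RestSinkModeSketch.ASYNC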

datahub/ingestion/source/abs/config.py

@@ -1,7 +1,7 @@
  import logging
  from typing import Any, Dict, List, Optional, Union

- import pydantic
+ from pydantic import ValidationInfo, field_validator, model_validator
  from pydantic.fields import Field

  from datahub.configuration.common import AllowDenyPattern
@@ -105,9 +105,10 @@ class DataLakeSourceConfig(
  self.profiling.operation_config
  )

- @pydantic.validator("path_specs", always=True)
+ @field_validator("path_specs", mode="before")
+ @classmethod
  def check_path_specs_and_infer_platform(
- cls, path_specs: List[PathSpec], values: Dict
+ cls, path_specs: List[PathSpec], info: ValidationInfo
  ) -> List[PathSpec]:
  if len(path_specs) == 0:
  raise ValueError("path_specs must not be empty")
@@ -124,38 +125,37 @@ class DataLakeSourceConfig(

  # Ensure abs configs aren't used for file sources.
  if guessed_platform != "abs" and (
- values.get("use_abs_container_properties")
- or values.get("use_abs_blob_tags")
- or values.get("use_abs_blob_properties")
+ info.data.get("use_abs_container_properties")
+ or info.data.get("use_abs_blob_tags")
+ or info.data.get("use_abs_blob_properties")
  ):
  raise ValueError(
  "Cannot grab abs blob/container tags when platform is not abs. Remove the flag or use abs."
  )

  # Infer platform if not specified.
- if values.get("platform") and values["platform"] != guessed_platform:
+ if info.data.get("platform") and info.data["platform"] != guessed_platform:
  raise ValueError(
- f"All path_specs belong to {guessed_platform} platform, but platform is set to {values['platform']}"
+ f"All path_specs belong to {guessed_platform} platform, but platform is set to {info.data['platform']}"
  )
  else:
  logger.debug(f'Setting config "platform": {guessed_platform}')
- values["platform"] = guessed_platform
+ info.data["platform"] = guessed_platform

  return path_specs

- @pydantic.validator("platform", always=True)
- def platform_not_empty(cls, platform: Any, values: dict) -> str:
- inferred_platform = values.get("platform") # we may have inferred it above
+ @field_validator("platform", mode="before")
+ @classmethod
+ def platform_not_empty(cls, platform: Any, info: ValidationInfo) -> str:
+ inferred_platform = info.data.get("platform") # we may have inferred it above
  platform = platform or inferred_platform
  if not platform:
  raise ValueError("platform must not be empty")
  return platform

- @pydantic.root_validator(skip_on_failure=True)
- def ensure_profiling_pattern_is_passed_to_profiling(
- cls, values: Dict[str, Any]
- ) -> Dict[str, Any]:
- profiling: Optional[DataLakeProfilerConfig] = values.get("profiling")
+ @model_validator(mode="after")
+ def ensure_profiling_pattern_is_passed_to_profiling(self) -> "DataLakeSourceConfig":
+ profiling = self.profiling
  if profiling is not None and profiling.enabled:
- profiling._allow_deny_patterns = values["profile_patterns"]
- return values
+ profiling._allow_deny_patterns = self.profile_patterns
+ return self
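
In Pydantic v2, field validators no longer receive the values dict; cross-field access goes through the ValidationInfo argument, whose data attribute holds only the fields declared and validated before the current one. A small sketch of that pattern, independent of the real DataLakeSourceConfig (the config.py change above also writes back into info.data to preserve the v1 platform-inference behaviour):

from typing import List, Optional

from pydantic import BaseModel, ValidationInfo, field_validator

class PathSpecSketch(BaseModel):
    # Hypothetical, simplified stand-in: platform is declared before path_specs,
    # so it is already validated when the path_specs validator runs.
    platform: Optional[str] = None
    path_specs: List[str]

    @field_validator("path_specs", mode="before")
    @classmethod
    def check_path_specs(cls, v: List[str], info: ValidationInfo) -> List[str]:
        if not v:
            raise ValueError("path_specs must not be empty")
        guessed = "s3" if v[0].startswith("s3://") else "file"
        # info.data contains previously validated fields (here: platform).
        declared = info.data.get("platform")
        if declared and declared != guessed:
            raise ValueError(f"path_specs look like {guessed}, but platform is set to {declared}")
        return v

PathSpecSketch(path_specs=["s3://bucket/table/*.parquet"])        # ok
# PathSpecSketch(platform="file", path_specs=["s3://bucket/x"])   # would raise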

datahub/ingestion/source/abs/datalake_profiler_config.py

@@ -1,6 +1,7 @@
- from typing import Any, Dict, Optional
+ from typing import Optional

  import pydantic
+ from pydantic import model_validator
  from pydantic.fields import Field

  from datahub.configuration import ConfigModel
@@ -72,21 +73,18 @@ class DataLakeProfilerConfig(ConfigModel):
  description="Whether to profile for the sample values for all columns.",
  )

- @pydantic.root_validator(skip_on_failure=True)
- def ensure_field_level_settings_are_normalized(
- cls: "DataLakeProfilerConfig", values: Dict[str, Any]
- ) -> Dict[str, Any]:
- max_num_fields_to_profile_key = "max_number_of_fields_to_profile"
- max_num_fields_to_profile = values.get(max_num_fields_to_profile_key)
+ @model_validator(mode="after")
+ def ensure_field_level_settings_are_normalized(self) -> "DataLakeProfilerConfig":
+ max_num_fields_to_profile = self.max_number_of_fields_to_profile

  # Disable all field-level metrics.
- if values.get("profile_table_level_only"):
- for field_level_metric in cls.__fields__:
- if field_level_metric.startswith("include_field_"):
- values.setdefault(field_level_metric, False)
+ if self.profile_table_level_only:
+ for field_name in self.__fields__:
+ if field_name.startswith("include_field_"):
+ setattr(self, field_name, False)

  assert max_num_fields_to_profile is None, (
- f"{max_num_fields_to_profile_key} should be set to None"
+ "max_number_of_fields_to_profile should be set to None"
  )

- return values
+ return self

datahub/ingestion/source/abs/source.py

@@ -149,7 +149,7 @@ class ABSSource(StatefulIngestionSourceBase):
  self.report = DataLakeSourceReport()
  self.profiling_times_taken = []
  config_report = {
- config_option: config.dict().get(config_option)
+ config_option: config.model_dump().get(config_option)
  for config_option in config_options_to_report
  }
  config_report = {
@@ -164,7 +164,7 @@ class ABSSource(StatefulIngestionSourceBase):

  @classmethod
  def create(cls, config_dict, ctx):
- config = DataLakeSourceConfig.parse_obj(config_dict)
+ config = DataLakeSourceConfig.model_validate(config_dict)

  return cls(config, ctx)


datahub/ingestion/source/aws/aws_common.py

@@ -246,7 +246,7 @@ def assume_role(
  **dict(
  RoleSessionName="DatahubIngestionSource",
  ),
- **{k: v for k, v in role.dict().items() if v is not None},
+ **{k: v for k, v in role.model_dump().items() if v is not None},
  }

  assumed_role_object = sts_client.assume_role(

datahub/ingestion/source/aws/glue.py

@@ -21,7 +21,7 @@ from urllib.parse import urlparse

  import botocore.exceptions
  import yaml
- from pydantic import validator
+ from pydantic import field_validator
  from pydantic.fields import Field

  from datahub.api.entities.dataset.dataset import Dataset
@@ -221,7 +221,8 @@ class GlueSourceConfig(
  def lakeformation_client(self):
  return self.get_lakeformation_client()

- @validator("glue_s3_lineage_direction")
+ @field_validator("glue_s3_lineage_direction", mode="after")
+ @classmethod
  def check_direction(cls, v: str) -> str:
  if v.lower() not in ["upstream", "downstream"]:
  raise ValueError(
@@ -229,7 +230,8 @@ class GlueSourceConfig(
  )
  return v.lower()

- @validator("platform")
+ @field_validator("platform", mode="after")
+ @classmethod
  def platform_validator(cls, v: str) -> str:
  if not v or v in VALID_PLATFORMS:
  return v
@@ -473,7 +475,7 @@ class GlueSource(StatefulIngestionSourceBase):

  @classmethod
  def create(cls, config_dict, ctx):
- config = GlueSourceConfig.parse_obj(config_dict)
+ config = GlueSourceConfig.model_validate(config_dict)
  return cls(config, ctx)

  @property

datahub/ingestion/source/aws/sagemaker.py

@@ -66,7 +66,7 @@ class SagemakerSource(StatefulIngestionSourceBase):

  @classmethod
  def create(cls, config_dict, ctx):
- config = SagemakerSourceConfig.parse_obj(config_dict)
+ config = SagemakerSourceConfig.model_validate(config_dict)
  return cls(config, ctx)

  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:

datahub/ingestion/source/azure/azure_common.py

@@ -1,9 +1,9 @@
- from typing import Dict, Optional, Union
+ from typing import Optional, Union

  from azure.identity import ClientSecretCredential
  from azure.storage.blob import BlobServiceClient
  from azure.storage.filedatalake import DataLakeServiceClient, FileSystemClient
- from pydantic import Field, root_validator
+ from pydantic import Field, model_validator

  from datahub.configuration import ConfigModel
  from datahub.configuration.common import ConfigurationError
@@ -81,18 +81,14 @@ class AzureConnectionConfig(ConfigModel):
  )
  return self.sas_token if self.sas_token is not None else self.account_key

- @root_validator(skip_on_failure=True)
- def _check_credential_values(cls, values: Dict) -> Dict:
+ @model_validator(mode="after")
+ def _check_credential_values(self) -> "AzureConnectionConfig":
  if (
- values.get("account_key")
- or values.get("sas_token")
- or (
- values.get("client_id")
- and values.get("client_secret")
- and values.get("tenant_id")
- )
+ self.account_key
+ or self.sas_token
+ or (self.client_id and self.client_secret and self.tenant_id)
  ):
- return values
+ return self
  raise ConfigurationError(
  "credentials missing, requires one combination of account_key or sas_token or (client_id and client_secret and tenant_id)"
  )

datahub/ingestion/source/bigquery_v2/bigquery.py

@@ -211,7 +211,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):

  @classmethod
  def create(cls, config_dict: dict, ctx: PipelineContext) -> "BigqueryV2Source":
- config = BigQueryV2Config.parse_obj(config_dict)
+ config = BigQueryV2Config.model_validate(config_dict)
  return cls(ctx, config)

  @staticmethod

datahub/ingestion/source/bigquery_v2/bigquery_config.py

@@ -2,9 +2,16 @@ import logging
  import re
  from copy import deepcopy
  from datetime import timedelta
- from typing import Dict, List, Optional, Union
-
- from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator
+ from typing import Any, Dict, List, Optional, Union
+
+ from pydantic import (
+ Field,
+ PositiveInt,
+ PrivateAttr,
+ ValidationInfo,
+ field_validator,
+ model_validator,
+ )

  from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
  from datahub.configuration.env_vars import get_bigquery_schema_parallelism
@@ -63,8 +70,9 @@ class BigQueryBaseConfig(ConfigModel):
  description="The regex pattern to match sharded tables and group as one table. This is a very low level config parameter, only change if you know what you are doing, ",
  )

- @validator("sharded_table_pattern")
- def sharded_table_pattern_is_a_valid_regexp(cls, v):
+ @field_validator("sharded_table_pattern", mode="after")
+ @classmethod
+ def sharded_table_pattern_is_a_valid_regexp(cls, v: str) -> str:
  try:
  re.compile(v)
  except Exception as e:
@@ -73,7 +81,8 @@ class BigQueryBaseConfig(ConfigModel):
  ) from e
  return v

- @root_validator(pre=True)
+ @model_validator(mode="before")
+ @classmethod
  def project_id_backward_compatibility_configs_set(cls, values: Dict) -> Dict:
  # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
  values = deepcopy(values)
@@ -188,12 +197,11 @@ class BigQueryFilterConfig(SQLFilterConfig):
  default=AllowDenyPattern.allow_all(),
  )

- @root_validator(pre=False, skip_on_failure=True)
- def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
- # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
- values = deepcopy(values)
- dataset_pattern: Optional[AllowDenyPattern] = values.get("dataset_pattern")
- schema_pattern = values.get("schema_pattern")
+ @model_validator(mode="after")
+ def backward_compatibility_configs_set(self) -> Any:
+ dataset_pattern = self.dataset_pattern
+ schema_pattern = self.schema_pattern
+
  if (
  dataset_pattern == AllowDenyPattern.allow_all()
  and schema_pattern != AllowDenyPattern.allow_all()
@@ -202,7 +210,7 @@ class BigQueryFilterConfig(SQLFilterConfig):
  "dataset_pattern is not set but schema_pattern is set, using schema_pattern as dataset_pattern. "
  "schema_pattern will be deprecated, please use dataset_pattern instead."
  )
- values["dataset_pattern"] = schema_pattern
+ self.dataset_pattern = schema_pattern
  dataset_pattern = schema_pattern
  elif (
  dataset_pattern != AllowDenyPattern.allow_all()
@@ -213,7 +221,7 @@ class BigQueryFilterConfig(SQLFilterConfig):
  " please use dataset_pattern only."
  )

- match_fully_qualified_names = values.get("match_fully_qualified_names")
+ match_fully_qualified_names = self.match_fully_qualified_names

  if (
  dataset_pattern is not None
@@ -243,7 +251,7 @@ class BigQueryFilterConfig(SQLFilterConfig):
  " of the form `<project_id>.<dataset_name>`."
  )

- return values
+ return self


  class BigQueryIdentifierConfig(
@@ -478,7 +486,8 @@ class BigQueryV2Config(
  _include_view_column_lineage = pydantic_removed_field("include_view_column_lineage")
  _lineage_parse_view_ddl = pydantic_removed_field("lineage_parse_view_ddl")

- @root_validator(pre=True)
+ @model_validator(mode="before")
+ @classmethod
  def set_include_schema_metadata(cls, values: Dict) -> Dict:
  # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
  values = deepcopy(values)
@@ -498,30 +507,33 @@ class BigQueryV2Config(

  return values

- @root_validator(skip_on_failure=True)
+ @model_validator(mode="before")
+ @classmethod
  def profile_default_settings(cls, values: Dict) -> Dict:
  # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
  values = deepcopy(values)
  # Extra default SQLAlchemy option for better connection pooling and threading.
  # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
- values["options"].setdefault("max_overflow", -1)
+ values.setdefault("options", {}).setdefault("max_overflow", -1)

  return values

- @validator("bigquery_audit_metadata_datasets")
+ @field_validator("bigquery_audit_metadata_datasets", mode="after")
+ @classmethod
  def validate_bigquery_audit_metadata_datasets(
- cls, v: Optional[List[str]], values: Dict
+ cls, v: Optional[List[str]], info: ValidationInfo
  ) -> Optional[List[str]]:
- if values.get("use_exported_bigquery_audit_metadata"):
+ if info.data.get("use_exported_bigquery_audit_metadata"):
  assert v and len(v) > 0, (
  "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata: True`."
  )

  return v

- @validator("upstream_lineage_in_report")
- def validate_upstream_lineage_in_report(cls, v: bool, values: Dict) -> bool:
- if v and values.get("use_queries_v2", True):
+ @field_validator("upstream_lineage_in_report", mode="after")
+ @classmethod
+ def validate_upstream_lineage_in_report(cls, v: bool, info: ValidationInfo) -> bool:
+ if v and info.data.get("use_queries_v2", True):
  logging.warning(
  "`upstream_lineage_in_report` is enabled but will be ignored because `use_queries_v2` is enabled."
  "This debugging feature only works with the legacy lineage approach (`use_queries_v2: false`)."
@@ -529,11 +541,12 @@ class BigQueryV2Config(

  return v

- @root_validator(pre=False, skip_on_failure=True)
- def validate_queries_v2_stateful_ingestion(cls, values: Dict) -> Dict:
- if values.get("use_queries_v2"):
- if values.get("enable_stateful_lineage_ingestion") or values.get(
- "enable_stateful_usage_ingestion"
+ @model_validator(mode="after")
+ def validate_queries_v2_stateful_ingestion(self) -> "BigQueryV2Config":
+ if self.use_queries_v2:
+ if (
+ self.enable_stateful_lineage_ingestion
+ or self.enable_stateful_usage_ingestion
  ):
  logger.warning(
  "enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion are deprecated "
@@ -541,7 +554,7 @@ class BigQueryV2Config(
  "For queries v2, use enable_stateful_time_window instead to enable stateful ingestion "
  "for the unified time window extraction (lineage + usage + operations + queries)."
  )
- return values
+ return self

  def get_table_pattern(self, pattern: List[str]) -> str:
  return "|".join(pattern) if pattern else ""
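
bigquery_config.py uses both v2 model-validator styles: mode="before" validators are classmethods that receive and return the raw input dict, while mode="after" validators operate on the constructed instance. A minimal sketch of the mode="before" shape, with an illustrative option name rather than the package's full BigQueryV2Config:

from typing import Any, Dict

from pydantic import BaseModel, model_validator

class BigQuerySketchConfig(BaseModel):
    # Hypothetical, simplified stand-in holding extra SQLAlchemy options.
    options: Dict[str, Any] = {}

    @model_validator(mode="before")
    @classmethod
    def profile_default_settings(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        # Runs on the raw input before field validation, so "options" may be absent.
        values = dict(values)
        values.setdefault("options", {}).setdefault("max_overflow", -1)
        return values

assert BigQuerySketchConfig().options == {"max_overflow": -1}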

datahub/ingestion/source/bigquery_v2/bigquery_queries.py

@@ -80,7 +80,7 @@ class BigQueryQueriesSource(Source):

  @classmethod
  def create(cls, config_dict: dict, ctx: PipelineContext) -> Self:
- config = BigQueryQueriesSourceConfig.parse_obj(config_dict)
+ config = BigQueryQueriesSourceConfig.model_validate(config_dict)
  return cls(ctx, config)

  def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:

datahub/ingestion/source/cassandra/cassandra.py

@@ -109,7 +109,7 @@ class CassandraSource(StatefulIngestionSourceBase):

  @classmethod
  def create(cls, config_dict, ctx):
- config = CassandraSourceConfig.parse_obj(config_dict)
+ config = CassandraSourceConfig.model_validate(config_dict)
  return cls(ctx, config)

  def get_platform(self) -> str:

datahub/ingestion/source/common/gcp_credentials_config.py

@@ -1,8 +1,8 @@
  import json
  import tempfile
- from typing import Any, Dict, Optional
+ from typing import Dict, Optional

- from pydantic import Field, root_validator
+ from pydantic import Field, model_validator

  from datahub.configuration import ConfigModel
  from datahub.configuration.validate_multiline_string import pydantic_multiline_string
@@ -37,16 +37,16 @@ class GCPCredential(ConfigModel):

  _fix_private_key_newlines = pydantic_multiline_string("private_key")

- @root_validator(skip_on_failure=True)
- def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
- if values.get("client_x509_cert_url") is None:
- values["client_x509_cert_url"] = (
- f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}"
+ @model_validator(mode="after")
+ def validate_config(self) -> "GCPCredential":
+ if self.client_x509_cert_url is None:
+ self.client_x509_cert_url = (
+ f"https://www.googleapis.com/robot/v1/metadata/x509/{self.client_email}"
  )
- return values
+ return self

  def create_credential_temp_file(self, project_id: Optional[str] = None) -> str:
- configs = self.dict()
+ configs = self.model_dump()
  if project_id:
  configs["project_id"] = project_id
  with tempfile.NamedTemporaryFile(delete=False) as fp:
@@ -55,7 +55,7 @@ class GCPCredential(ConfigModel):
  return fp.name

  def to_dict(self, project_id: Optional[str] = None) -> Dict[str, str]:
- configs = self.dict()
+ configs = self.model_dump()
  if project_id:
  configs["project_id"] = project_id
  return configs