acryl-datahub 1.3.1__py3-none-any.whl → 1.3.1.1rc1__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (193)
  1. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/METADATA +2501 -2501
  2. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/RECORD +193 -193
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +2 -2
  5. datahub/api/entities/corpgroup/corpgroup.py +11 -6
  6. datahub/api/entities/corpuser/corpuser.py +11 -11
  7. datahub/api/entities/dataproduct/dataproduct.py +47 -27
  8. datahub/api/entities/dataset/dataset.py +32 -21
  9. datahub/api/entities/external/lake_formation_external_entites.py +5 -6
  10. datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
  11. datahub/api/entities/forms/forms.py +16 -14
  12. datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
  13. datahub/cli/check_cli.py +2 -2
  14. datahub/cli/config_utils.py +3 -3
  15. datahub/cli/lite_cli.py +9 -7
  16. datahub/cli/migrate.py +4 -4
  17. datahub/cli/quickstart_versioning.py +3 -3
  18. datahub/cli/specific/group_cli.py +1 -1
  19. datahub/cli/specific/structuredproperties_cli.py +1 -1
  20. datahub/cli/specific/user_cli.py +1 -1
  21. datahub/configuration/common.py +14 -2
  22. datahub/configuration/connection_resolver.py +2 -2
  23. datahub/configuration/git.py +47 -30
  24. datahub/configuration/import_resolver.py +2 -2
  25. datahub/configuration/kafka.py +4 -3
  26. datahub/configuration/time_window_config.py +26 -26
  27. datahub/configuration/validate_field_deprecation.py +2 -2
  28. datahub/configuration/validate_field_removal.py +2 -2
  29. datahub/configuration/validate_field_rename.py +2 -2
  30. datahub/configuration/validate_multiline_string.py +2 -1
  31. datahub/emitter/kafka_emitter.py +3 -1
  32. datahub/emitter/rest_emitter.py +2 -4
  33. datahub/ingestion/api/decorators.py +1 -1
  34. datahub/ingestion/api/report.py +1 -1
  35. datahub/ingestion/api/sink.py +1 -1
  36. datahub/ingestion/api/source.py +1 -1
  37. datahub/ingestion/glossary/datahub_classifier.py +11 -8
  38. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
  39. datahub/ingestion/reporting/file_reporter.py +5 -4
  40. datahub/ingestion/run/pipeline.py +6 -6
  41. datahub/ingestion/run/pipeline_config.py +12 -14
  42. datahub/ingestion/run/sink_callback.py +1 -1
  43. datahub/ingestion/sink/datahub_rest.py +6 -4
  44. datahub/ingestion/source/abs/config.py +19 -19
  45. datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
  46. datahub/ingestion/source/abs/source.py +2 -2
  47. datahub/ingestion/source/aws/aws_common.py +1 -1
  48. datahub/ingestion/source/aws/glue.py +6 -4
  49. datahub/ingestion/source/aws/sagemaker.py +1 -1
  50. datahub/ingestion/source/azure/azure_common.py +8 -12
  51. datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
  52. datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
  53. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
  54. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  55. datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
  56. datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
  57. datahub/ingestion/source/datahub/config.py +8 -8
  58. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  59. datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
  60. datahub/ingestion/source/dbt/dbt_common.py +39 -37
  61. datahub/ingestion/source/dbt/dbt_core.py +10 -12
  62. datahub/ingestion/source/debug/datahub_debug.py +1 -1
  63. datahub/ingestion/source/delta_lake/config.py +6 -4
  64. datahub/ingestion/source/dremio/dremio_config.py +10 -6
  65. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  66. datahub/ingestion/source/elastic_search.py +4 -3
  67. datahub/ingestion/source/excel/source.py +1 -1
  68. datahub/ingestion/source/feast.py +1 -1
  69. datahub/ingestion/source/file.py +5 -4
  70. datahub/ingestion/source/fivetran/config.py +17 -16
  71. datahub/ingestion/source/fivetran/fivetran.py +2 -2
  72. datahub/ingestion/source/gc/datahub_gc.py +1 -1
  73. datahub/ingestion/source/gcs/gcs_source.py +8 -10
  74. datahub/ingestion/source/ge_profiling_config.py +8 -5
  75. datahub/ingestion/source/grafana/grafana_api.py +2 -2
  76. datahub/ingestion/source/grafana/grafana_config.py +4 -3
  77. datahub/ingestion/source/grafana/grafana_source.py +1 -1
  78. datahub/ingestion/source/grafana/models.py +23 -5
  79. datahub/ingestion/source/hex/api.py +7 -5
  80. datahub/ingestion/source/hex/hex.py +4 -3
  81. datahub/ingestion/source/iceberg/iceberg.py +1 -1
  82. datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
  83. datahub/ingestion/source/identity/azure_ad.py +1 -1
  84. datahub/ingestion/source/identity/okta.py +10 -10
  85. datahub/ingestion/source/kafka/kafka.py +1 -1
  86. datahub/ingestion/source/ldap.py +1 -1
  87. datahub/ingestion/source/looker/looker_common.py +7 -5
  88. datahub/ingestion/source/looker/looker_config.py +21 -20
  89. datahub/ingestion/source/looker/lookml_config.py +47 -47
  90. datahub/ingestion/source/metabase.py +8 -8
  91. datahub/ingestion/source/metadata/business_glossary.py +2 -2
  92. datahub/ingestion/source/metadata/lineage.py +13 -8
  93. datahub/ingestion/source/mlflow.py +1 -1
  94. datahub/ingestion/source/mode.py +6 -4
  95. datahub/ingestion/source/mongodb.py +4 -3
  96. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  97. datahub/ingestion/source/nifi.py +17 -23
  98. datahub/ingestion/source/openapi.py +6 -8
  99. datahub/ingestion/source/powerbi/config.py +33 -32
  100. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
  101. datahub/ingestion/source/powerbi/powerbi.py +1 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
  103. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
  104. datahub/ingestion/source/preset.py +8 -8
  105. datahub/ingestion/source/pulsar.py +1 -1
  106. datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
  107. datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
  108. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  109. datahub/ingestion/source/redshift/config.py +18 -20
  110. datahub/ingestion/source/redshift/redshift.py +2 -2
  111. datahub/ingestion/source/redshift/usage.py +23 -3
  112. datahub/ingestion/source/s3/config.py +83 -62
  113. datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
  114. datahub/ingestion/source/s3/source.py +8 -5
  115. datahub/ingestion/source/sac/sac.py +5 -4
  116. datahub/ingestion/source/salesforce.py +3 -2
  117. datahub/ingestion/source/schema/json_schema.py +2 -2
  118. datahub/ingestion/source/sigma/data_classes.py +3 -2
  119. datahub/ingestion/source/sigma/sigma.py +1 -1
  120. datahub/ingestion/source/sigma/sigma_api.py +7 -7
  121. datahub/ingestion/source/slack/slack.py +1 -1
  122. datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
  123. datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
  124. datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
  125. datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
  126. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
  127. datahub/ingestion/source/snowflake/snowflake_queries.py +1 -1
  128. datahub/ingestion/source/sql/athena.py +1 -1
  129. datahub/ingestion/source/sql/clickhouse.py +4 -2
  130. datahub/ingestion/source/sql/cockroachdb.py +1 -1
  131. datahub/ingestion/source/sql/druid.py +1 -1
  132. datahub/ingestion/source/sql/hana.py +1 -1
  133. datahub/ingestion/source/sql/hive.py +7 -5
  134. datahub/ingestion/source/sql/hive_metastore.py +1 -1
  135. datahub/ingestion/source/sql/mssql/source.py +13 -6
  136. datahub/ingestion/source/sql/mysql.py +1 -1
  137. datahub/ingestion/source/sql/oracle.py +17 -10
  138. datahub/ingestion/source/sql/postgres.py +2 -2
  139. datahub/ingestion/source/sql/presto.py +1 -1
  140. datahub/ingestion/source/sql/sql_config.py +8 -9
  141. datahub/ingestion/source/sql/sql_generic.py +1 -1
  142. datahub/ingestion/source/sql/teradata.py +1 -1
  143. datahub/ingestion/source/sql/trino.py +1 -1
  144. datahub/ingestion/source/sql/vertica.py +5 -4
  145. datahub/ingestion/source/sql_queries.py +11 -8
  146. datahub/ingestion/source/state/checkpoint.py +2 -2
  147. datahub/ingestion/source/state/entity_removal_state.py +2 -1
  148. datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
  149. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
  150. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  151. datahub/ingestion/source/superset.py +9 -9
  152. datahub/ingestion/source/tableau/tableau.py +14 -16
  153. datahub/ingestion/source/unity/config.py +33 -34
  154. datahub/ingestion/source/unity/proxy.py +203 -0
  155. datahub/ingestion/source/unity/proxy_types.py +91 -0
  156. datahub/ingestion/source/unity/source.py +27 -2
  157. datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
  158. datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
  159. datahub/ingestion/source/usage/usage_common.py +5 -3
  160. datahub/ingestion/source_config/csv_enricher.py +7 -6
  161. datahub/ingestion/source_config/operation_config.py +7 -4
  162. datahub/ingestion/source_config/pulsar.py +11 -15
  163. datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
  164. datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
  165. datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
  166. datahub/ingestion/transformer/add_dataset_properties.py +2 -2
  167. datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
  168. datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
  169. datahub/ingestion/transformer/add_dataset_tags.py +3 -3
  170. datahub/ingestion/transformer/add_dataset_terms.py +3 -3
  171. datahub/ingestion/transformer/dataset_domain.py +3 -3
  172. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
  173. datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
  174. datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
  175. datahub/ingestion/transformer/mark_dataset_status.py +1 -1
  176. datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
  177. datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
  178. datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
  179. datahub/ingestion/transformer/replace_external_url.py +2 -2
  180. datahub/ingestion/transformer/set_browse_path.py +1 -1
  181. datahub/ingestion/transformer/tags_to_terms.py +1 -1
  182. datahub/lite/duckdb_lite.py +1 -1
  183. datahub/lite/lite_util.py +2 -2
  184. datahub/sdk/search_filters.py +68 -40
  185. datahub/secret/datahub_secret_store.py +7 -4
  186. datahub/secret/file_secret_store.py +1 -1
  187. datahub/sql_parsing/sqlglot_lineage.py +5 -2
  188. datahub/testing/check_sql_parser_result.py +2 -2
  189. datahub/utilities/ingest_utils.py +1 -1
  190. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/WHEEL +0 -0
  191. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/entry_points.txt +0 -0
  192. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/licenses/LICENSE +0 -0
  193. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/data_lake_common/path_spec.py

@@ -3,11 +3,11 @@ import logging
 import os
 import re
 from enum import Enum
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Union
 
 import parse
-import pydantic
 from cached_property import cached_property
+from pydantic import field_validator, model_validator
 from pydantic.fields import Field
 from wcmatch import pathlib
 
@@ -65,7 +65,8 @@ class SortKey(ConfigModel):
         description="The date format to use when sorting. This is used to parse the date from the key. The format should follow the java [SimpleDateFormat](https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html) format.",
     )
 
-    @pydantic.validator("date_format", always=True)
+    @field_validator("date_format", mode="before")
+    @classmethod
     def convert_date_format_to_python_format(cls, v: Optional[str]) -> Optional[str]:
         if v is None:
             return None
@@ -86,7 +87,7 @@ class PathSpec(ConfigModel):
         arbitrary_types_allowed = True
 
     include: str = Field(
-        description="Path to table. Name variable `{table}` is used to mark the folder with dataset. In absence of `{table}`, file level dataset will be created. Check below examples for more details."
+        description="Path to table. Name variable `{table}` is used to mark the folder with dataset. In absence of `{table}`, file level dataset will be created. Check below examples for more details.",
     )
     exclude: Optional[List[str]] = Field(
         [],
@@ -260,20 +261,80 @@ class PathSpec(ConfigModel):
     ) -> Union[None, parse.Result, parse.Match]:
         return self.compiled_folder_include.parse(path)
 
-    @pydantic.root_validator(skip_on_failure=True)
-    def validate_no_double_stars(cls, values: Dict) -> Dict:
-        if "include" not in values:
-            return values
+    @model_validator(mode="after")
+    def validate_path_spec_comprehensive(self):
+        """
+        Comprehensive model validator that handles multiple interdependent validations.
+
+        Consolidates related validation logic to avoid order dependencies between multiple
+        model validators and ensures reliable cross-field validation. This approach is
+        preferred over multiple separate validators when:
+
+        1. Validations depend on multiple fields (e.g., sample_files depends on include)
+        2. One validation modifies a field that another validation checks
+        3. Field validators can't reliably access other field values or defaults
+        4. Order of execution between validators is important but undefined
+
+        By combining related validations, we ensure they execute in the correct sequence
+        and have access to all field values after Pydantic has processed defaults.
+        """
+        # Handle autodetect_partitions logic first
+        if self.autodetect_partitions:
+            include = self.include
+            if include.endswith("/"):
+                include = include[:-1]
+            if include.endswith("{table}"):
+                self.include = include + "/**"
+                # Allow double stars when we add them for autodetect_partitions
+                self.allow_double_stars = True
+
+        # Handle table_name logic
+        if self.table_name is None and "{table}" in self.include:
+            self.table_name = "{table}"
+        elif self.table_name is not None:
+            parsable_include = PathSpec.get_parsable_include(self.include)
+            compiled_include = parse.compile(parsable_include)
+            if not all(
+                x in compiled_include.named_fields
+                for x in parse.compile(self.table_name).named_fields
+            ):
+                raise ValueError(
+                    f"Not all named variables used in path_spec.table_name {self.table_name} are specified in path_spec.include {self.include}"
+                )
+
+        # Handle sample_files logic - turn off sampling for non-cloud URIs
+        is_s3 = is_s3_uri(self.include)
+        is_gcs = is_gcs_uri(self.include)
+        is_abs = is_abs_uri(self.include)
+        if not is_s3 and not is_gcs and not is_abs:
+            # Sampling only makes sense on s3 and gcs currently
+            self.sample_files = False
+
+        # Validate double stars
+        if "**" in self.include and not self.allow_double_stars:
+            raise ValueError("path_spec.include cannot contain '**'")
 
+        # Validate file extension
+        include_ext = os.path.splitext(self.include)[1].strip(".")
+        if not include_ext:
+            include_ext = (
+                "*"  # if no extension is provided, we assume all files are allowed
+            )
         if (
-            values.get("include")
-            and "**" in values["include"]
-            and not values.get("allow_double_stars")
+            include_ext not in self.file_types
+            and include_ext not in ["*", ""]
+            and not self.default_extension
+            and include_ext not in SUPPORTED_COMPRESSIONS
        ):
-            raise ValueError("path_spec.include cannot contain '**'")
-        return values
+            raise ValueError(
+                f"file type specified ({include_ext}) in path_spec.include is not in specified file "
+                f'types. Please select one from {self.file_types} or specify ".*" to allow all types'
            )
 
-    @pydantic.validator("file_types", always=True)
+        return self
+
+    @field_validator("file_types", mode="before")
+    @classmethod
     def validate_file_types(cls, v: Optional[List[str]]) -> List[str]:
         if v is None:
             return SUPPORTED_FILE_TYPES
@@ -285,50 +346,24 @@ class PathSpec(ConfigModel):
                     )
             return v
 
-    @pydantic.validator("default_extension")
-    def validate_default_extension(cls, v):
+    @field_validator("default_extension", mode="after")
+    @classmethod
+    def validate_default_extension(cls, v: Optional[str]) -> Optional[str]:
         if v is not None and v not in SUPPORTED_FILE_TYPES:
             raise ValueError(
                 f"default extension {v} not in supported default file extension. Please specify one from {SUPPORTED_FILE_TYPES}"
             )
         return v
 
-    @pydantic.validator("sample_files", always=True)
-    def turn_off_sampling_for_non_s3(cls, v, values):
-        is_s3 = is_s3_uri(values.get("include") or "")
-        is_gcs = is_gcs_uri(values.get("include") or "")
-        is_abs = is_abs_uri(values.get("include") or "")
-        if not is_s3 and not is_gcs and not is_abs:
-            # Sampling only makes sense on s3 and gcs currently
-            v = False
-        return v
-
-    @pydantic.validator("exclude", each_item=True)
-    def no_named_fields_in_exclude(cls, v: str) -> str:
-        if len(parse.compile(v).named_fields) != 0:
-            raise ValueError(
-                f"path_spec.exclude {v} should not contain any named variables"
-            )
-        return v
-
-    @pydantic.validator("table_name", always=True)
-    def table_name_in_include(cls, v, values):
-        if "include" not in values:
-            return v
-
-        parsable_include = PathSpec.get_parsable_include(values["include"])
-        compiled_include = parse.compile(parsable_include)
-
+    @field_validator("exclude", mode="after")
+    @classmethod
+    def no_named_fields_in_exclude(cls, v: Optional[List[str]]) -> Optional[List[str]]:
         if v is None:
-            if "{table}" in values["include"]:
-                v = "{table}"
-        else:
-            if not all(
-                x in compiled_include.named_fields
-                for x in parse.compile(v).named_fields
-            ):
+            return v
+        for item in v:
+            if len(parse.compile(item).named_fields) != 0:
                 raise ValueError(
-                    f"Not all named variables used in path_spec.table_name {v} are specified in path_spec.include {values['include']}"
+                    f"path_spec.exclude {item} should not contain any named variables"
                 )
         return v
 
@@ -479,45 +514,6 @@ class PathSpec(ConfigModel):
         logger.debug(f"Setting _glob_include: {glob_include}")
         return glob_include
 
-    @pydantic.root_validator(skip_on_failure=True)
-    @staticmethod
-    def validate_path_spec(values: Dict) -> Dict[str, Any]:
-        # validate that main fields are populated
-        required_fields = ["include", "file_types", "default_extension"]
-        for f in required_fields:
-            if f not in values:
-                logger.debug(
-                    f"Failed to validate because {f} wasn't populated correctly"
-                )
-                return values
-
-        if values["include"] and values["autodetect_partitions"]:
-            include = values["include"]
-            if include.endswith("/"):
-                include = include[:-1]
-
-            if include.endswith("{table}"):
-                values["include"] = include + "/**"
-
-        include_ext = os.path.splitext(values["include"])[1].strip(".")
-        if not include_ext:
-            include_ext = (
-                "*"  # if no extension is provided, we assume all files are allowed
-            )
-
-        if (
-            include_ext not in values["file_types"]
-            and include_ext not in ["*", ""]
-            and not values["default_extension"]
-            and include_ext not in SUPPORTED_COMPRESSIONS
-        ):
-            raise ValueError(
-                f"file type specified ({include_ext}) in path_spec.include is not in specified file "
-                f'types. Please select one from {values.get("file_types")} or specify ".*" to allow all types'
-            )
-
-        return values
-
     def _extract_table_name(self, named_vars: dict) -> str:
         if self.table_name is None:
             raise ValueError("path_spec.table_name is not set")

datahub/ingestion/source/datahub/config.py

@@ -2,7 +2,7 @@ import os
 from typing import Optional, Set
 
 import pydantic
-from pydantic import Field, root_validator
+from pydantic import Field, model_validator
 
 from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.kafka import KafkaConsumerConnectionConfig
@@ -132,20 +132,20 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         default=True, description="Copy system metadata from the source system"
     )
 
-    @root_validator(skip_on_failure=True)
-    def check_ingesting_data(cls, values):
+    @model_validator(mode="after")
+    def check_ingesting_data(self):
         if (
-            not values.get("database_connection")
-            and not values.get("kafka_connection")
-            and not values.get("pull_from_datahub_api")
+            not self.database_connection
+            and not self.kafka_connection
+            and not self.pull_from_datahub_api
         ):
             raise ValueError(
                 "Your current config will not ingest any data."
                 " Please specify at least one of `database_connection` or `kafka_connection`, ideally both."
             )
-        return values
+        return self
 
-    @pydantic.validator("database_connection")
+    @pydantic.field_validator("database_connection")
     def validate_mysql_scheme(
         cls, v: SQLAlchemyConnectionConfig
     ) -> SQLAlchemyConnectionConfig:

datahub/ingestion/source/datahub/datahub_source.py

@@ -62,7 +62,7 @@ class DataHubSource(StatefulIngestionSourceBase):
 
     @classmethod
     def create(cls, config_dict: Dict, ctx: PipelineContext) -> "DataHubSource":
-        config: DataHubSourceConfig = DataHubSourceConfig.parse_obj(config_dict)
+        config: DataHubSourceConfig = DataHubSourceConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     def get_report(self) -> SourceReport:
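
The create classmethods here and in the sources below switch from pydantic v1's parse_obj to v2's model_validate, which behaves the same for dict input. A tiny illustration with a made-up config class:

from pydantic import BaseModel


class ExampleSourceConfig(BaseModel):
    # Hypothetical stand-in for a source config class.
    host: str
    port: int = 9002


config = ExampleSourceConfig.model_validate({"host": "localhost"})  # v2 spelling of parse_obj
print(config.port)  # prints: 9002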

datahub/ingestion/source/dbt/dbt_cloud.py

@@ -1,4 +1,5 @@
 import logging
+from copy import deepcopy
 from datetime import datetime
 from json import JSONDecodeError
 from typing import Dict, List, Literal, Optional, Tuple
@@ -6,7 +7,7 @@ from urllib.parse import urlparse
 
 import dateutil.parser
 import requests
-from pydantic import Field, root_validator
+from pydantic import Field, model_validator
 
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -68,8 +69,13 @@ class DBTCloudConfig(DBTCommonConfig):
         description='Where should the "View in dbt" link point to - either the "Explore" UI or the dbt Cloud IDE',
     )
 
-    @root_validator(pre=True)
+    @model_validator(mode="before")
+    @classmethod
     def set_metadata_endpoint(cls, values: dict) -> dict:
+        # In-place update of the input dict would cause state contamination.
+        # So a deepcopy is performed first.
+        values = deepcopy(values)
+
         if values.get("access_url") and not values.get("metadata_endpoint"):
             metadata_endpoint = infer_metadata_endpoint(values["access_url"])
             if metadata_endpoint is None:
@@ -271,7 +277,7 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = DBTCloudConfig.parse_obj(config_dict)
+        config = DBTCloudConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     @staticmethod
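
The new comment in set_metadata_endpoint explains the deepcopy: a mode="before" model validator receives the caller's raw dict, and mutating it in place would leak derived values back to the caller. A hedged sketch of the same pattern under pydantic v2, with invented field names and endpoint logic:

from copy import deepcopy

from pydantic import BaseModel, model_validator


class EndpointConfig(BaseModel):
    # Hypothetical fields; only the deepcopy-before-mutation pattern mirrors the diff.
    access_url: str
    metadata_endpoint: str = ""

    @model_validator(mode="before")
    @classmethod
    def derive_metadata_endpoint(cls, values: dict) -> dict:
        # Copy first so the caller's dict is never mutated.
        values = deepcopy(values)
        if values.get("access_url") and not values.get("metadata_endpoint"):
            values["metadata_endpoint"] = values["access_url"].rstrip("/") + "/graphql"
        return values


raw = {"access_url": "https://cloud.example.com"}
EndpointConfig.model_validate(raw)
print(raw)  # still {'access_url': 'https://cloud.example.com'}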

datahub/ingestion/source/dbt/dbt_common.py

@@ -1,6 +1,7 @@
 import logging
 import re
 from abc import abstractmethod
+from copy import deepcopy
 from dataclasses import dataclass, field
 from datetime import datetime
 from enum import auto
@@ -8,7 +9,7 @@ from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
 
 import more_itertools
 import pydantic
-from pydantic import root_validator, validator
+from pydantic import field_validator, model_validator
 from pydantic.fields import Field
 
 from datahub.api.entities.dataprocess.dataprocess_instance import (
@@ -194,22 +195,26 @@ class DBTEntitiesEnabled(ConfigModel):
         "Only supported with dbt core.",
     )
 
-    @root_validator(skip_on_failure=True)
-    def process_only_directive(cls, values):
+    @model_validator(mode="after")
+    def process_only_directive(self) -> "DBTEntitiesEnabled":
         # Checks that at most one is set to ONLY, and then sets the others to NO.
-
-        only_values = [k for k in values if values.get(k) == EmitDirective.ONLY]
+        only_values = [
+            k for k, v in self.model_dump().items() if v == EmitDirective.ONLY
+        ]
         if len(only_values) > 1:
             raise ValueError(
                 f"Cannot have more than 1 type of entity emission set to ONLY. Found {only_values}"
             )
 
         if len(only_values) == 1:
-            for k in values:
-                values[k] = EmitDirective.NO
-            values[only_values[0]] = EmitDirective.YES
+            # Set all fields to NO first
+            for field_name in self.model_dump():
+                setattr(self, field_name, EmitDirective.NO)
 
-        return values
+            # Set the ONLY one to YES
+            setattr(self, only_values[0], EmitDirective.YES)
+
+        return self
 
     def _node_type_allow_map(self):
         # Node type comes from dbt's node types.
@@ -412,7 +417,8 @@ class DBTCommonConfig(
         "This ensures that lineage is generated reliably, but will lose any documentation associated only with the source.",
     )
 
-    @validator("target_platform")
+    @field_validator("target_platform", mode="after")
+    @classmethod
     def validate_target_platform_value(cls, target_platform: str) -> str:
         if target_platform.lower() == DBT_PLATFORM:
             raise ValueError(
@@ -421,15 +427,21 @@ class DBTCommonConfig(
         )
         return target_platform
 
-    @root_validator(pre=True)
+    @model_validator(mode="before")
+    @classmethod
     def set_convert_column_urns_to_lowercase_default_for_snowflake(
         cls, values: dict
     ) -> dict:
+        # In-place update of the input dict would cause state contamination.
+        # So a deepcopy is performed first.
+        values = deepcopy(values)
+
         if values.get("target_platform", "").lower() == "snowflake":
             values.setdefault("convert_column_urns_to_lowercase", True)
         return values
 
-    @validator("write_semantics")
+    @field_validator("write_semantics", mode="after")
+    @classmethod
     def validate_write_semantics(cls, write_semantics: str) -> str:
         if write_semantics.lower() not in {"patch", "override"}:
             raise ValueError(
@@ -439,10 +451,9 @@ class DBTCommonConfig(
         )
         return write_semantics
 
-    @validator("meta_mapping")
-    def meta_mapping_validator(
-        cls, meta_mapping: Dict[str, Any], values: Dict, **kwargs: Any
-    ) -> Dict[str, Any]:
+    @field_validator("meta_mapping", mode="after")
+    @classmethod
+    def meta_mapping_validator(cls, meta_mapping: Dict[str, Any]) -> Dict[str, Any]:
         for k, v in meta_mapping.items():
             if "match" not in v:
                 raise ValueError(
@@ -458,44 +469,35 @@ class DBTCommonConfig(
         mce_builder.validate_ownership_type(owner_category)
         return meta_mapping
 
-    @validator("include_column_lineage")
-    def validate_include_column_lineage(
-        cls, include_column_lineage: bool, values: Dict
-    ) -> bool:
-        if include_column_lineage and not values.get("infer_dbt_schemas"):
+    @model_validator(mode="after")
+    def validate_include_column_lineage(self) -> "DBTCommonConfig":
+        if self.include_column_lineage and not self.infer_dbt_schemas:
             raise ValueError(
                 "`infer_dbt_schemas` must be enabled to use `include_column_lineage`"
             )
 
-        return include_column_lineage
-
-    @validator("skip_sources_in_lineage", always=True)
-    def validate_skip_sources_in_lineage(
-        cls, skip_sources_in_lineage: bool, values: Dict
-    ) -> bool:
-        entities_enabled: Optional[DBTEntitiesEnabled] = values.get("entities_enabled")
-        prefer_sql_parser_lineage: Optional[bool] = values.get(
-            "prefer_sql_parser_lineage"
-        )
+        return self
 
-        if prefer_sql_parser_lineage and not skip_sources_in_lineage:
+    @model_validator(mode="after")
+    def validate_skip_sources_in_lineage(self) -> "DBTCommonConfig":
+        if self.prefer_sql_parser_lineage and not self.skip_sources_in_lineage:
             raise ValueError(
                 "`prefer_sql_parser_lineage` requires that `skip_sources_in_lineage` is enabled."
             )
 
         if (
-            skip_sources_in_lineage
-            and entities_enabled
-            and entities_enabled.sources == EmitDirective.YES
+            self.skip_sources_in_lineage
+            and self.entities_enabled
+            and self.entities_enabled.sources == EmitDirective.YES
             # When `prefer_sql_parser_lineage` is enabled, it's ok to have `skip_sources_in_lineage` enabled
             # without also disabling sources.
-            and not prefer_sql_parser_lineage
+            and not self.prefer_sql_parser_lineage
         ):
             raise ValueError(
                 "When `skip_sources_in_lineage` is enabled, `entities_enabled.sources` must be set to NO."
            )
 
-        return skip_sources_in_lineage
+        return self
 
 
 @dataclass
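
process_only_directive now reads its own fields through model_dump() and rewrites them with setattr, since a v2 "after" validator operates on the constructed instance rather than a values dict. A condensed sketch of that technique with a made-up model:

from enum import Enum

from pydantic import BaseModel, model_validator


class Directive(str, Enum):
    YES = "YES"
    NO = "NO"
    ONLY = "ONLY"


class EmissionFlags(BaseModel):
    # Invented flags; the ONLY-handling mirrors the approach in the diff.
    models: Directive = Directive.YES
    sources: Directive = Directive.YES

    @model_validator(mode="after")
    def apply_only(self) -> "EmissionFlags":
        only = [k for k, v in self.model_dump().items() if v == Directive.ONLY]
        if len(only) > 1:
            raise ValueError(f"At most one field may be ONLY, found {only}")
        if only:
            for name in self.model_dump():
                setattr(self, name, Directive.NO)
            setattr(self, only[0], Directive.YES)
        return self


print(EmissionFlags(models=Directive.ONLY).sources)  # prints: Directive.NO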

datahub/ingestion/source/dbt/dbt_core.py

@@ -9,7 +9,7 @@ from urllib.parse import urlparse
 import dateutil.parser
 import requests
 from packaging import version
-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, Field, model_validator
 
 from datahub.configuration.git import GitReference
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
@@ -99,26 +99,24 @@ class DBTCoreConfig(DBTCommonConfig):
 
     _github_info_deprecated = pydantic_renamed_field("github_info", "git_info")
 
-    @validator("aws_connection", always=True)
-    def aws_connection_needed_if_s3_uris_present(
-        cls, aws_connection: Optional[AwsConnectionConfig], values: Dict, **kwargs: Any
-    ) -> Optional[AwsConnectionConfig]:
+    @model_validator(mode="after")
+    def aws_connection_needed_if_s3_uris_present(self) -> "DBTCoreConfig":
         # first check if there are fields that contain s3 uris
         uris = [
-            values.get(f)
+            getattr(self, f, None)
             for f in [
                 "manifest_path",
                 "catalog_path",
                 "sources_path",
             ]
-        ] + values.get("run_results_paths", [])
+        ] + (self.run_results_paths or [])
         s3_uris = [uri for uri in uris if is_s3_uri(uri or "")]
 
-        if s3_uris and aws_connection is None:
+        if s3_uris and self.aws_connection is None:
             raise ValueError(
                 f"Please provide aws_connection configuration, since s3 uris have been provided {s3_uris}"
             )
-        return aws_connection
+        return self
 
 
 def get_columns(
@@ -426,13 +424,13 @@ def load_run_results(
         )
         return all_nodes
 
-    dbt_metadata = DBTRunMetadata.parse_obj(test_results_json.get("metadata", {}))
+    dbt_metadata = DBTRunMetadata.model_validate(test_results_json.get("metadata", {}))
 
    all_nodes_map: Dict[str, DBTNode] = {x.dbt_name: x for x in all_nodes}
 
    results = test_results_json.get("results", [])
    for result in results:
-        run_result = DBTRunResult.parse_obj(result)
+        run_result = DBTRunResult.model_validate(result)
         id = run_result.unique_id
 
         if id.startswith("test."):
@@ -477,7 +475,7 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = DBTCoreConfig.parse_obj(config_dict)
+        config = DBTCoreConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     @staticmethod
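
aws_connection_needed_if_s3_uris_present becomes an "after" model validator that inspects several path fields by name and only requires credentials when one of them is an S3 URI. A simplified sketch of that conditional-requirement pattern, using invented fields in place of the real AwsConnectionConfig:

from typing import Optional

from pydantic import BaseModel, model_validator


class ArtifactPaths(BaseModel):
    # Hypothetical subset of a dbt-style config; aws_access_key_id stands in
    # for the real aws_connection object.
    manifest_path: Optional[str] = None
    catalog_path: Optional[str] = None
    aws_access_key_id: Optional[str] = None

    @model_validator(mode="after")
    def require_credentials_for_s3(self) -> "ArtifactPaths":
        paths = [getattr(self, f, None) for f in ("manifest_path", "catalog_path")]
        s3_paths = [p for p in paths if (p or "").startswith("s3://")]
        if s3_paths and self.aws_access_key_id is None:
            raise ValueError(f"AWS credentials are required for s3 paths: {s3_paths}")
        return self


ArtifactPaths(manifest_path="target/manifest.json")  # ok: no s3 paths, no credentials needed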

datahub/ingestion/source/debug/datahub_debug.py

@@ -46,7 +46,7 @@ class DataHubDebugSource(Source):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = DataHubDebugSourceConfig.parse_obj(config_dict)
+        config = DataHubDebugSourceConfig.model_validate(config_dict)
         return cls(ctx, config)
 
     def perform_dns_probe(self, url: str) -> None:

datahub/ingestion/source/delta_lake/config.py

@@ -1,9 +1,8 @@
 import logging
 from typing import Optional
 
-import pydantic
 from cached_property import cached_property
-from pydantic import Field
+from pydantic import Field, field_validator
 from typing_extensions import Literal
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
@@ -98,8 +97,11 @@ class DeltaLakeSourceConfig(
 
         return complete_path
 
-    @pydantic.validator("version_history_lookback")
-    def negative_version_history_implies_no_limit(cls, v):
+    @field_validator("version_history_lookback", mode="after")
+    @classmethod
+    def negative_version_history_implies_no_limit(
+        cls, v: Optional[int]
+    ) -> Optional[int]:
         if v and v < 0:
             return None
         return v

datahub/ingestion/source/dremio/dremio_config.py

@@ -2,7 +2,7 @@ import os
 from typing import List, Literal, Optional
 
 import certifi
-from pydantic import Field, validator
+from pydantic import Field, ValidationInfo, field_validator
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
@@ -78,8 +78,9 @@ class DremioConnectionConfig(ConfigModel):
         description="ID of Dremio Cloud Project. Found in Project Settings in the Dremio Cloud UI",
     )
 
-    @validator("authentication_method")
-    def validate_auth_method(cls, value):
+    @field_validator("authentication_method", mode="after")
+    @classmethod
+    def validate_auth_method(cls, value: str) -> str:
         allowed_methods = ["password", "PAT"]
         if value not in allowed_methods:
             raise ValueError(
@@ -87,9 +88,12 @@ class DremioConnectionConfig(ConfigModel):
             )
         return value
 
-    @validator("password")
-    def validate_password(cls, value, values):
-        if values.get("authentication_method") == "PAT" and not value:
+    @field_validator("password", mode="after")
+    @classmethod
+    def validate_password(
+        cls, value: Optional[str], info: ValidationInfo
+    ) -> Optional[str]:
+        if info.data.get("authentication_method") == "PAT" and not value:
             raise ValueError(
                 "Password (Personal Access Token) is required when using PAT authentication",
             )
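
In validate_password, the v1 values argument is replaced by pydantic v2's ValidationInfo: fields declared and validated earlier are available through info.data. A small sketch with invented fields:

from typing import Optional

from pydantic import BaseModel, ValidationInfo, field_validator


class AuthConfig(BaseModel):
    # Hypothetical fields; only the info.data access pattern mirrors the diff.
    authentication_method: str = "password"
    password: Optional[str] = None

    @field_validator("password", mode="after")
    @classmethod
    def require_token_for_pat(
        cls, v: Optional[str], info: ValidationInfo
    ) -> Optional[str]:
        # info.data holds fields declared (and already validated) before this one.
        if info.data.get("authentication_method") == "PAT" and not v:
            raise ValueError("A personal access token is required for PAT authentication")
        return v


AuthConfig(authentication_method="password")  # ok: password only required for PAT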

datahub/ingestion/source/dynamodb/dynamodb.py

@@ -200,7 +200,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
 
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "DynamoDBSource":
-        config = DynamoDBConfig.parse_obj(config_dict)
+        config = DynamoDBConfig.model_validate(config_dict)
         return cls(ctx, config, "dynamodb")
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:

datahub/ingestion/source/elastic_search.py

@@ -8,7 +8,7 @@ from hashlib import md5
 from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, Type, Union
 
 from elasticsearch import Elasticsearch
-from pydantic import validator
+from pydantic import field_validator
 from pydantic.fields import Field
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
@@ -330,7 +330,8 @@ class ElasticsearchSourceConfig(
             self.profiling.operation_config
         )
 
-    @validator("host")
+    @field_validator("host", mode="after")
+    @classmethod
     def host_colon_port_comma(cls, host_val: str) -> str:
         for entry in host_val.split(","):
             entry = remove_protocol(entry)
@@ -382,7 +383,7 @@ class ElasticsearchSource(StatefulIngestionSourceBase):
     def create(
         cls, config_dict: Dict[str, Any], ctx: PipelineContext
     ) -> "ElasticsearchSource":
-        config = ElasticsearchSourceConfig.parse_obj(config_dict)
+        config = ElasticsearchSourceConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:

datahub/ingestion/source/excel/source.py

@@ -156,7 +156,7 @@ class ExcelSource(StatefulIngestionSourceBase):
 
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "ExcelSource":
-        config = ExcelSourceConfig.parse_obj(config_dict)
+        config = ExcelSourceConfig.model_validate(config_dict)
         return cls(ctx, config)
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: