acryl-datahub 1.3.1__py3-none-any.whl → 1.3.1.1__py3-none-any.whl

This diff compares the contents of two publicly available package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (203)
  1. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1.dist-info}/METADATA +2582 -2582
  2. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1.dist-info}/RECORD +203 -201
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +2 -2
  5. datahub/api/entities/corpgroup/corpgroup.py +11 -6
  6. datahub/api/entities/corpuser/corpuser.py +11 -11
  7. datahub/api/entities/dataproduct/dataproduct.py +47 -27
  8. datahub/api/entities/dataset/dataset.py +32 -21
  9. datahub/api/entities/external/lake_formation_external_entites.py +5 -6
  10. datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
  11. datahub/api/entities/forms/forms.py +16 -14
  12. datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
  13. datahub/cli/check_cli.py +2 -2
  14. datahub/cli/config_utils.py +3 -3
  15. datahub/cli/lite_cli.py +9 -7
  16. datahub/cli/migrate.py +4 -4
  17. datahub/cli/quickstart_versioning.py +3 -3
  18. datahub/cli/specific/group_cli.py +1 -1
  19. datahub/cli/specific/structuredproperties_cli.py +1 -1
  20. datahub/cli/specific/user_cli.py +1 -1
  21. datahub/configuration/common.py +14 -2
  22. datahub/configuration/connection_resolver.py +2 -2
  23. datahub/configuration/git.py +47 -30
  24. datahub/configuration/import_resolver.py +2 -2
  25. datahub/configuration/kafka.py +4 -3
  26. datahub/configuration/time_window_config.py +26 -26
  27. datahub/configuration/validate_field_deprecation.py +2 -2
  28. datahub/configuration/validate_field_removal.py +2 -2
  29. datahub/configuration/validate_field_rename.py +2 -2
  30. datahub/configuration/validate_multiline_string.py +2 -1
  31. datahub/emitter/kafka_emitter.py +3 -1
  32. datahub/emitter/rest_emitter.py +2 -4
  33. datahub/ingestion/api/decorators.py +1 -1
  34. datahub/ingestion/api/report.py +1 -1
  35. datahub/ingestion/api/sink.py +1 -1
  36. datahub/ingestion/api/source.py +1 -1
  37. datahub/ingestion/glossary/datahub_classifier.py +11 -8
  38. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
  39. datahub/ingestion/reporting/file_reporter.py +5 -4
  40. datahub/ingestion/run/pipeline.py +6 -6
  41. datahub/ingestion/run/pipeline_config.py +12 -14
  42. datahub/ingestion/run/sink_callback.py +1 -1
  43. datahub/ingestion/sink/datahub_rest.py +6 -4
  44. datahub/ingestion/source/abs/config.py +19 -19
  45. datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
  46. datahub/ingestion/source/abs/source.py +2 -2
  47. datahub/ingestion/source/aws/aws_common.py +1 -1
  48. datahub/ingestion/source/aws/glue.py +6 -4
  49. datahub/ingestion/source/aws/sagemaker.py +1 -1
  50. datahub/ingestion/source/azure/azure_common.py +8 -12
  51. datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
  52. datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
  53. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
  54. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  55. datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
  56. datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
  57. datahub/ingestion/source/datahub/config.py +8 -8
  58. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  59. datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
  60. datahub/ingestion/source/dbt/dbt_common.py +39 -37
  61. datahub/ingestion/source/dbt/dbt_core.py +10 -12
  62. datahub/ingestion/source/debug/datahub_debug.py +1 -1
  63. datahub/ingestion/source/delta_lake/config.py +6 -4
  64. datahub/ingestion/source/dremio/dremio_config.py +10 -6
  65. datahub/ingestion/source/dremio/dremio_source.py +15 -15
  66. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  67. datahub/ingestion/source/elastic_search.py +4 -3
  68. datahub/ingestion/source/excel/source.py +1 -1
  69. datahub/ingestion/source/feast.py +1 -1
  70. datahub/ingestion/source/file.py +5 -4
  71. datahub/ingestion/source/fivetran/config.py +17 -16
  72. datahub/ingestion/source/fivetran/fivetran.py +2 -2
  73. datahub/ingestion/source/gc/datahub_gc.py +1 -1
  74. datahub/ingestion/source/gcs/gcs_source.py +8 -10
  75. datahub/ingestion/source/ge_profiling_config.py +8 -5
  76. datahub/ingestion/source/grafana/grafana_api.py +2 -2
  77. datahub/ingestion/source/grafana/grafana_config.py +4 -3
  78. datahub/ingestion/source/grafana/grafana_source.py +1 -1
  79. datahub/ingestion/source/grafana/models.py +23 -5
  80. datahub/ingestion/source/hex/api.py +7 -5
  81. datahub/ingestion/source/hex/hex.py +4 -3
  82. datahub/ingestion/source/iceberg/iceberg.py +1 -1
  83. datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
  84. datahub/ingestion/source/identity/azure_ad.py +1 -1
  85. datahub/ingestion/source/identity/okta.py +10 -10
  86. datahub/ingestion/source/kafka/kafka.py +1 -1
  87. datahub/ingestion/source/ldap.py +1 -1
  88. datahub/ingestion/source/looker/looker_common.py +7 -5
  89. datahub/ingestion/source/looker/looker_config.py +21 -20
  90. datahub/ingestion/source/looker/lookml_config.py +47 -47
  91. datahub/ingestion/source/metabase.py +8 -8
  92. datahub/ingestion/source/metadata/business_glossary.py +2 -2
  93. datahub/ingestion/source/metadata/lineage.py +13 -8
  94. datahub/ingestion/source/mlflow.py +1 -1
  95. datahub/ingestion/source/mode.py +6 -4
  96. datahub/ingestion/source/mongodb.py +4 -3
  97. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  98. datahub/ingestion/source/nifi.py +17 -23
  99. datahub/ingestion/source/openapi.py +6 -8
  100. datahub/ingestion/source/powerbi/config.py +33 -32
  101. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
  102. datahub/ingestion/source/powerbi/powerbi.py +1 -1
  103. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
  104. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
  105. datahub/ingestion/source/preset.py +8 -8
  106. datahub/ingestion/source/pulsar.py +1 -1
  107. datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
  108. datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
  109. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  110. datahub/ingestion/source/redshift/config.py +18 -20
  111. datahub/ingestion/source/redshift/redshift.py +2 -2
  112. datahub/ingestion/source/redshift/usage.py +23 -3
  113. datahub/ingestion/source/s3/config.py +83 -62
  114. datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
  115. datahub/ingestion/source/s3/source.py +8 -5
  116. datahub/ingestion/source/sac/sac.py +5 -4
  117. datahub/ingestion/source/salesforce.py +3 -2
  118. datahub/ingestion/source/schema/json_schema.py +2 -2
  119. datahub/ingestion/source/sigma/data_classes.py +3 -2
  120. datahub/ingestion/source/sigma/sigma.py +1 -1
  121. datahub/ingestion/source/sigma/sigma_api.py +7 -7
  122. datahub/ingestion/source/slack/slack.py +1 -1
  123. datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
  124. datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
  125. datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
  126. datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
  127. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
  128. datahub/ingestion/source/snowflake/snowflake_queries.py +1 -1
  129. datahub/ingestion/source/sql/athena.py +1 -1
  130. datahub/ingestion/source/sql/clickhouse.py +4 -2
  131. datahub/ingestion/source/sql/cockroachdb.py +1 -1
  132. datahub/ingestion/source/sql/druid.py +1 -1
  133. datahub/ingestion/source/sql/hana.py +1 -1
  134. datahub/ingestion/source/sql/hive.py +7 -5
  135. datahub/ingestion/source/sql/hive_metastore.py +1 -1
  136. datahub/ingestion/source/sql/mssql/source.py +13 -6
  137. datahub/ingestion/source/sql/mysql.py +1 -1
  138. datahub/ingestion/source/sql/oracle.py +17 -10
  139. datahub/ingestion/source/sql/postgres.py +2 -2
  140. datahub/ingestion/source/sql/presto.py +1 -1
  141. datahub/ingestion/source/sql/sql_config.py +8 -9
  142. datahub/ingestion/source/sql/sql_generic.py +1 -1
  143. datahub/ingestion/source/sql/teradata.py +1 -1
  144. datahub/ingestion/source/sql/trino.py +1 -1
  145. datahub/ingestion/source/sql/vertica.py +5 -4
  146. datahub/ingestion/source/sql_queries.py +11 -8
  147. datahub/ingestion/source/state/checkpoint.py +2 -2
  148. datahub/ingestion/source/state/entity_removal_state.py +2 -1
  149. datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
  150. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
  151. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  152. datahub/ingestion/source/superset.py +9 -9
  153. datahub/ingestion/source/tableau/tableau.py +14 -16
  154. datahub/ingestion/source/unity/azure_auth_config.py +15 -0
  155. datahub/ingestion/source/unity/config.py +51 -34
  156. datahub/ingestion/source/unity/connection.py +7 -1
  157. datahub/ingestion/source/unity/connection_test.py +1 -1
  158. datahub/ingestion/source/unity/proxy.py +216 -7
  159. datahub/ingestion/source/unity/proxy_types.py +91 -0
  160. datahub/ingestion/source/unity/source.py +29 -3
  161. datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
  162. datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
  163. datahub/ingestion/source/usage/usage_common.py +5 -3
  164. datahub/ingestion/source_config/csv_enricher.py +7 -6
  165. datahub/ingestion/source_config/operation_config.py +7 -4
  166. datahub/ingestion/source_config/pulsar.py +11 -15
  167. datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
  168. datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
  169. datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
  170. datahub/ingestion/transformer/add_dataset_properties.py +2 -2
  171. datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
  172. datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
  173. datahub/ingestion/transformer/add_dataset_tags.py +3 -3
  174. datahub/ingestion/transformer/add_dataset_terms.py +3 -3
  175. datahub/ingestion/transformer/dataset_domain.py +3 -3
  176. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
  177. datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
  178. datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
  179. datahub/ingestion/transformer/mark_dataset_status.py +1 -1
  180. datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
  181. datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
  182. datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
  183. datahub/ingestion/transformer/replace_external_url.py +2 -2
  184. datahub/ingestion/transformer/set_browse_path.py +1 -1
  185. datahub/ingestion/transformer/tags_to_terms.py +1 -1
  186. datahub/lite/duckdb_lite.py +1 -1
  187. datahub/lite/lite_util.py +2 -2
  188. datahub/metadata/schema.avsc +7 -2
  189. datahub/metadata/schemas/QuerySubjects.avsc +1 -1
  190. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +6 -1
  191. datahub/sdk/__init__.py +1 -0
  192. datahub/sdk/_all_entities.py +2 -0
  193. datahub/sdk/search_filters.py +68 -40
  194. datahub/sdk/tag.py +112 -0
  195. datahub/secret/datahub_secret_store.py +7 -4
  196. datahub/secret/file_secret_store.py +1 -1
  197. datahub/sql_parsing/sqlglot_lineage.py +5 -2
  198. datahub/testing/check_sql_parser_result.py +2 -2
  199. datahub/utilities/ingest_utils.py +1 -1
  200. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1.dist-info}/WHEEL +0 -0
  201. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1.dist-info}/entry_points.txt +0 -0
  202. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1.dist-info}/licenses/LICENSE +0 -0
  203. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/s3/config.py

@@ -1,7 +1,7 @@
 import logging
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, Optional, Union
 
-import pydantic
+from pydantic import ValidationInfo, field_validator, model_validator
 from pydantic.fields import Field
 
 from datahub.configuration.common import AllowDenyPattern
@@ -12,7 +12,6 @@ from datahub.configuration.validate_field_deprecation import pydantic_field_depr
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
 from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin
-from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
 from datahub.ingestion.source.s3.datalake_profiler_config import DataLakeProfilerConfig
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulStaleMetadataRemovalConfig,
@@ -117,69 +116,91 @@ class DataLakeSourceConfig(
             self.profiling.operation_config
         )
 
-    @pydantic.validator("path_specs", always=True)
-    def check_path_specs_and_infer_platform(
-        cls, path_specs: List[PathSpec], values: Dict
-    ) -> List[PathSpec]:
+    @field_validator("path_specs", mode="before")
+    @classmethod
+    def check_path_specs(cls, path_specs: Any, info: ValidationInfo) -> Any:
         if len(path_specs) == 0:
             raise ValueError("path_specs must not be empty")
 
-        # Check that all path specs have the same platform.
-        guessed_platforms = {
-            "s3" if path_spec.is_s3 else "file" for path_spec in path_specs
-        }
-        if len(guessed_platforms) > 1:
-            raise ValueError(
-                f"Cannot have multiple platforms in path_specs: {guessed_platforms}"
-            )
-        guessed_platform = guessed_platforms.pop()
-
-        # Ensure s3 configs aren't used for file sources.
-        if guessed_platform != "s3" and (
-            values.get("use_s3_object_tags") or values.get("use_s3_bucket_tags")
-        ):
-            raise ValueError(
-                "Cannot grab s3 object/bucket tags when platform is not s3. Remove the flag or use s3."
-            )
-
-        # Infer platform if not specified.
-        if values.get("platform") and values["platform"] != guessed_platform:
-            raise ValueError(
-                f"All path_specs belong to {guessed_platform} platform, but platform is set to {values['platform']}"
-            )
-        else:
-            logger.debug(f'Setting config "platform": {guessed_platform}')
-            values["platform"] = guessed_platform
+        # Basic validation - path specs consistency and S3 config validation is now handled in model_validator
 
         return path_specs
 
-    @pydantic.validator("platform", always=True)
-    def platform_valid(cls, platform: Any, values: dict) -> str:
-        inferred_platform = values.get("platform")  # we may have inferred it above
-        platform = platform or inferred_platform
-        if not platform:
-            raise ValueError("platform must not be empty")
-
-        if platform != "s3" and values.get("use_s3_bucket_tags"):
-            raise ValueError(
-                "Cannot grab s3 bucket tags when platform is not s3. Remove the flag or ingest from s3."
-            )
-        if platform != "s3" and values.get("use_s3_object_tags"):
-            raise ValueError(
-                "Cannot grab s3 object tags when platform is not s3. Remove the flag or ingest from s3."
-            )
-        if platform != "s3" and values.get("use_s3_content_type"):
-            raise ValueError(
-                "Cannot grab s3 object content type when platform is not s3. Remove the flag or ingest from s3."
-            )
-
-        return platform
-
-    @pydantic.root_validator(skip_on_failure=True)
-    def ensure_profiling_pattern_is_passed_to_profiling(
-        cls, values: Dict[str, Any]
-    ) -> Dict[str, Any]:
-        profiling: Optional[DataLakeProfilerConfig] = values.get("profiling")
+    @model_validator(mode="after")
+    def ensure_profiling_pattern_is_passed_to_profiling(self) -> "DataLakeSourceConfig":
+        profiling = self.profiling
         if profiling is not None and profiling.enabled:
-            profiling._allow_deny_patterns = values["profile_patterns"]
-        return values
+            profiling._allow_deny_patterns = self.profile_patterns
+        return self
+
+    @model_validator(mode="after")
+    def validate_platform_and_config_consistency(self) -> "DataLakeSourceConfig":
+        """Infer platform from path_specs and validate config consistency."""
+        # Track whether platform was explicitly provided
+        platform_was_explicit = bool(self.platform)
+
+        # Infer platform from path_specs if not explicitly set
+        if not self.platform and self.path_specs:
+            guessed_platforms = set()
+            for path_spec in self.path_specs:
+                if (
+                    hasattr(path_spec, "include")
+                    and path_spec.include
+                    and path_spec.include.startswith("s3://")
+                ):
+                    guessed_platforms.add("s3")
+                else:
+                    guessed_platforms.add("file")
+
+            # Ensure all path specs belong to the same platform
+            if len(guessed_platforms) > 1:
+                raise ValueError(
+                    f"Cannot have multiple platforms in path_specs: {guessed_platforms}"
+                )
+
+            if guessed_platforms:
+                guessed_platform = guessed_platforms.pop()
+                logger.debug(f"Inferred platform: {guessed_platform}")
+                self.platform = guessed_platform
+            else:
+                self.platform = "file"
+        elif not self.platform:
+            self.platform = "file"
+
+        # Validate platform consistency only when platform was inferred (not explicitly set)
+        # This allows sources like GCS to set platform="gcs" with s3:// URIs for correct container subtypes
+        if not platform_was_explicit and self.platform and self.path_specs:
+            expected_platforms = set()
+            for path_spec in self.path_specs:
+                if (
+                    hasattr(path_spec, "include")
+                    and path_spec.include
+                    and path_spec.include.startswith("s3://")
+                ):
+                    expected_platforms.add("s3")
+                else:
+                    expected_platforms.add("file")
+
+            if len(expected_platforms) == 1:
+                expected_platform = expected_platforms.pop()
+                if self.platform != expected_platform:
+                    raise ValueError(
+                        f"All path_specs belong to {expected_platform} platform, but platform was inferred as {self.platform}"
+                    )
+
+        # Validate S3-specific configurations
+        if self.platform != "s3":
+            if self.use_s3_bucket_tags:
+                raise ValueError(
+                    "Cannot grab s3 bucket tags when platform is not s3. Remove the flag or ingest from s3."
+                )
+            if self.use_s3_object_tags:
+                raise ValueError(
                    "Cannot grab s3 object tags when platform is not s3. Remove the flag or ingest from s3."
                )
            if self.use_s3_content_type:
                raise ValueError(
                    "Cannot grab s3 object content type when platform is not s3. Remove the flag or ingest from s3."
                )

        return self
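
The hunks above replace pydantic v1 decorators (@pydantic.validator(..., always=True), @pydantic.root_validator) with their pydantic v2 equivalents (@field_validator, @model_validator(mode="after")). A minimal, hedged sketch of that pattern, using a hypothetical ExampleLakeConfig model rather than the actual DataHub classes:

# Illustrative only - not part of the package diff.
from typing import List, Optional

from pydantic import BaseModel, ValidationInfo, field_validator, model_validator


class ExampleLakeConfig(BaseModel):
    path_specs: List[str]
    platform: Optional[str] = None
    use_s3_bucket_tags: bool = False

    @field_validator("path_specs", mode="before")
    @classmethod
    def check_not_empty(cls, v, info: ValidationInfo):
        # Per-field check, analogous to check_path_specs above.
        if not v:
            raise ValueError("path_specs must not be empty")
        return v

    @model_validator(mode="after")
    def infer_and_validate_platform(self) -> "ExampleLakeConfig":
        # Whole-model check: every field is a plain attribute on self.
        if self.platform is None:
            self.platform = (
                "s3" if all(p.startswith("s3://") for p in self.path_specs) else "file"
            )
        if self.platform != "s3" and self.use_s3_bucket_tags:
            raise ValueError("use_s3_bucket_tags requires platform == 's3'")
        return self

With this shape, ExampleLakeConfig(path_specs=["s3://bucket/*.csv"]) infers platform="s3", mirroring the inference the real validator performs.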

datahub/ingestion/source/s3/datalake_profiler_config.py

@@ -1,6 +1,7 @@
-from typing import Any, Dict, Optional
+from typing import Optional
 
 import pydantic
+from pydantic import model_validator
 from pydantic.fields import Field
 
 from datahub.configuration import ConfigModel
@@ -72,21 +73,18 @@ class DataLakeProfilerConfig(ConfigModel):
         description="Whether to profile for the sample values for all columns.",
     )
 
-    @pydantic.root_validator(skip_on_failure=True)
-    def ensure_field_level_settings_are_normalized(
-        cls: "DataLakeProfilerConfig", values: Dict[str, Any]
-    ) -> Dict[str, Any]:
-        max_num_fields_to_profile_key = "max_number_of_fields_to_profile"
-        max_num_fields_to_profile = values.get(max_num_fields_to_profile_key)
+    @model_validator(mode="after")
+    def ensure_field_level_settings_are_normalized(self) -> "DataLakeProfilerConfig":
+        max_num_fields_to_profile = self.max_number_of_fields_to_profile
 
         # Disable all field-level metrics.
-        if values.get("profile_table_level_only"):
-            for field_level_metric in cls.__fields__:
-                if field_level_metric.startswith("include_field_"):
-                    values.setdefault(field_level_metric, False)
+        if self.profile_table_level_only:
+            for field_name in self.__fields__:
+                if field_name.startswith("include_field_"):
+                    setattr(self, field_name, False)
 
         assert max_num_fields_to_profile is None, (
-            f"{max_num_fields_to_profile_key} should be set to None"
+            "max_number_of_fields_to_profile should be set to None"
        )
 
-        return values
+        return self

datahub/ingestion/source/s3/source.py

@@ -53,8 +53,11 @@ from datahub.ingestion.source.data_lake_common.data_lake_utils import (
 from datahub.ingestion.source.data_lake_common.object_store import (
     create_object_store_adapter,
 )
-from datahub.ingestion.source.data_lake_common.path_spec import FolderTraversalMethod
-from datahub.ingestion.source.s3.config import DataLakeSourceConfig, PathSpec
+from datahub.ingestion.source.data_lake_common.path_spec import (
+    FolderTraversalMethod,
+    PathSpec,
+)
+from datahub.ingestion.source.s3.config import DataLakeSourceConfig
 from datahub.ingestion.source.s3.report import DataLakeSourceReport
 from datahub.ingestion.source.schema_inference import avro, csv_tsv, json, parquet
 from datahub.ingestion.source.schema_inference.base import SchemaInferenceBase
@@ -261,7 +264,7 @@ class S3Source(StatefulIngestionSourceBase):
         )
 
         config_report = {
-            config_option: config.dict().get(config_option)
+            config_option: config.model_dump().get(config_option)
             for config_option in config_options_to_report
         }
         config_report = {
@@ -278,7 +281,7 @@ class S3Source(StatefulIngestionSourceBase):
         telemetry.telemetry_instance.ping(
             "data_lake_profiling_config",
             {
-                config_flag: config.profiling.dict().get(config_flag)
+                config_flag: config.profiling.model_dump().get(config_flag)
                 for config_flag in profiling_flags_to_report
             },
         )
@@ -370,7 +373,7 @@ class S3Source(StatefulIngestionSourceBase):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = DataLakeSourceConfig.parse_obj(config_dict)
+        config = DataLakeSourceConfig.model_validate(config_dict)
 
         return cls(config, ctx)
 

datahub/ingestion/source/sac/sac.py

@@ -8,7 +8,7 @@ import pyodata
 import pyodata.v2.model
 import pyodata.v2.service
 from authlib.integrations.requests_client import OAuth2Session
-from pydantic import Field, SecretStr, validator
+from pydantic import Field, SecretStr, field_validator
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
@@ -159,7 +159,8 @@ class SACSourceConfig(
         description="Template for generating dataset urns of consumed queries, the placeholder {query} can be used within the template for inserting the name of the query",
     )
 
-    @validator("tenant_url", "token_url")
+    @field_validator("tenant_url", "token_url", mode="after")
+    @classmethod
     def remove_trailing_slash(cls, v):
         return config_clean.remove_trailing_slashes(v)
 
@@ -209,7 +210,7 @@ class SACSource(StatefulIngestionSourceBase, TestableSource):
 
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "SACSource":
-        config = SACSourceConfig.parse_obj(config_dict)
+        config = SACSourceConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     @staticmethod
@@ -217,7 +218,7 @@ class SACSource(StatefulIngestionSourceBase, TestableSource):
         test_report = TestConnectionReport()
 
         try:
-            config = SACSourceConfig.parse_obj(config_dict)
+            config = SACSourceConfig.model_validate(config_dict)
 
             # when creating the pyodata.Client, the metadata is automatically parsed and validated
             session, _ = SACSource.get_sac_connection(config)
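
The sac.py hunks above (and the salesforce.py and json_schema.py hunks that follow) apply the same single-field validator migration: v1 @validator("field") becomes v2 @field_validator("field", mode="after") stacked with @classmethod. A hedged, stand-alone sketch with a hypothetical UrlConfig model:

# Illustrative only - not part of the package diff.
from pydantic import BaseModel, field_validator


class UrlConfig(BaseModel):
    instance_url: str

    # pydantic v1:  @validator("instance_url")
    # pydantic v2:  @field_validator("instance_url", mode="after") plus @classmethod
    @field_validator("instance_url", mode="after")
    @classmethod
    def remove_trailing_slash(cls, v: str) -> str:
        return v.rstrip("/")


assert UrlConfig(instance_url="https://example.invalid/").instance_url == "https://example.invalid"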

datahub/ingestion/source/salesforce.py

@@ -7,7 +7,7 @@ from enum import Enum
 from typing import Any, Dict, Iterable, List, Literal, Optional, TypedDict
 
 import requests
-from pydantic import Field, validator
+from pydantic import Field, field_validator
 from simple_salesforce import Salesforce
 from simple_salesforce.exceptions import SalesforceAuthenticationFailed
 
@@ -172,7 +172,8 @@ class SalesforceConfig(
             self.profiling.operation_config
         )
 
-    @validator("instance_url")
+    @field_validator("instance_url", mode="after")
+    @classmethod
     def remove_trailing_slash(cls, v):
         return config_clean.remove_trailing_slashes(v)
 

datahub/ingestion/source/schema/json_schema.py

@@ -12,7 +12,7 @@ from urllib.parse import urlparse
 
 import jsonref
 import requests
-from pydantic import AnyHttpUrl, DirectoryPath, FilePath, validator
+from pydantic import AnyHttpUrl, DirectoryPath, FilePath, field_validator
 from pydantic.fields import Field
 
 import datahub.metadata.schema_classes as models
@@ -90,7 +90,7 @@ class JsonSchemaSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMix
         description="Use this if URI-s need to be modified during reference resolution. Simple string match - replace capabilities are supported.",
     )
 
-    @validator("path")
+    @field_validator("path", mode="after")
     def download_http_url_to_temp_file(cls, v):
         if isinstance(v, AnyHttpUrl):
             try:

datahub/ingestion/source/sigma/data_classes.py

@@ -2,7 +2,7 @@ from copy import deepcopy
 from datetime import datetime
 from typing import Dict, List, Optional
 
-from pydantic import BaseModel, root_validator
+from pydantic import BaseModel, model_validator
 
 from datahub.emitter.mcp_builder import ContainerKey
 
@@ -22,7 +22,8 @@ class Workspace(BaseModel):
     createdAt: datetime
     updatedAt: datetime
 
-    @root_validator(pre=True)
+    @model_validator(mode="before")
+    @classmethod
     def update_values(cls, values: Dict) -> Dict:
         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
         values = deepcopy(values)
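
The hunk above shows the pre-validator variant of the migration: v1 @root_validator(pre=True) becomes a classmethod @model_validator(mode="before") that receives and returns the raw input mapping. A minimal, hypothetical sketch of that shape:

# Illustrative only - not part of the package diff.
from copy import deepcopy
from typing import Any, Dict

from pydantic import BaseModel, model_validator


class ExampleWorkspace(BaseModel):
    workspaceId: str
    name: str = "unknown"

    @model_validator(mode="before")
    @classmethod
    def normalize_raw_payload(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        # Work on a copy so the caller's dict is not mutated.
        values = deepcopy(values)
        values.setdefault("name", values.get("workspaceId", "unknown"))
        return values


ws = ExampleWorkspace.model_validate({"workspaceId": "ws-1"})
print(ws.name)  # "ws-1"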

datahub/ingestion/source/sigma/sigma.py

@@ -150,7 +150,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = SigmaSourceConfig.parse_obj(config_dict)
+        config = SigmaSourceConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     def _gen_workbook_key(self, workbook_id: str) -> WorkbookKey:

datahub/ingestion/source/sigma/sigma_api.py

@@ -108,7 +108,7 @@ class SigmaAPI:
                 self.report.non_accessible_workspaces_count += 1
                 return None
             response.raise_for_status()
-            workspace = Workspace.parse_obj(response.json())
+            workspace = Workspace.model_validate(response.json())
             self.workspaces[workspace.workspaceId] = workspace
             return workspace
         except Exception as e:
@@ -127,7 +127,7 @@ class SigmaAPI:
             response_dict = response.json()
             for workspace_dict in response_dict[Constant.ENTRIES]:
                 self.workspaces[workspace_dict[Constant.WORKSPACEID]] = (
-                    Workspace.parse_obj(workspace_dict)
+                    Workspace.model_validate(workspace_dict)
                 )
             if response_dict[Constant.NEXTPAGE]:
                 url = f"{workspace_url}&page={response_dict[Constant.NEXTPAGE]}"
@@ -197,7 +197,7 @@ class SigmaAPI:
             response.raise_for_status()
             response_dict = response.json()
             for file_dict in response_dict[Constant.ENTRIES]:
-                file = File.parse_obj(file_dict)
+                file = File.model_validate(file_dict)
                 file.workspaceId = self.get_workspace_id_from_file_path(
                     file.parentId, file.path
                 )
@@ -225,7 +225,7 @@ class SigmaAPI:
             response.raise_for_status()
             response_dict = response.json()
             for dataset_dict in response_dict[Constant.ENTRIES]:
-                dataset = SigmaDataset.parse_obj(dataset_dict)
+                dataset = SigmaDataset.model_validate(dataset_dict)
 
                 if dataset.datasetId not in dataset_files_metadata:
                     self.report.datasets.dropped(
@@ -354,7 +354,7 @@ class SigmaAPI:
                 element_dict[Constant.URL] = (
                     f"{workbook.url}?:nodeId={element_dict[Constant.ELEMENTID]}&:fullScreen=true"
                 )
-                element = Element.parse_obj(element_dict)
+                element = Element.model_validate(element_dict)
                 if (
                     self.config.extract_lineage
                     and self.config.workbook_lineage_pattern.allowed(workbook.name)
@@ -379,7 +379,7 @@ class SigmaAPI:
             )
             response.raise_for_status()
             for page_dict in response.json()[Constant.ENTRIES]:
-                page = Page.parse_obj(page_dict)
+                page = Page.model_validate(page_dict)
                 page.elements = self.get_page_elements(workbook, page)
                 pages.append(page)
             return pages
@@ -400,7 +400,7 @@ class SigmaAPI:
             response.raise_for_status()
             response_dict = response.json()
             for workbook_dict in response_dict[Constant.ENTRIES]:
-                workbook = Workbook.parse_obj(workbook_dict)
+                workbook = Workbook.model_validate(workbook_dict)
 
                 if workbook.workbookId not in workbook_files_metadata:
                     # Due to a bug in the Sigma API, it seems like the /files endpoint does not
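
The s3/source.py, sigma, slack, snaplogic, and snowflake hunks in this diff all apply the same method rename: pydantic v1 parse_obj() and dict() become model_validate() and model_dump() in pydantic v2. A small, hedged sketch with a hypothetical ReportRow model:

# Illustrative only - not part of the package diff.
from pydantic import BaseModel


class ReportRow(BaseModel):
    name: str
    enabled: bool = False


row = ReportRow.model_validate({"name": "profiling", "enabled": True})  # was ReportRow.parse_obj(...)
as_dict = row.model_dump()  # was row.dict()
print(as_dict["enabled"])  # True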

datahub/ingestion/source/slack/slack.py

@@ -260,7 +260,7 @@ class SlackSource(StatefulIngestionSourceBase):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = SlackSourceConfig.parse_obj(config_dict)
+        config = SlackSourceConfig.model_validate(config_dict)
         return cls(ctx, config)
 
     def get_slack_client(self) -> WebClient:

datahub/ingestion/source/snaplogic/snaplogic.py

@@ -351,5 +351,5 @@ class SnaplogicSource(StatefulIngestionSourceBase):
 
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "SnaplogicSource":
-        config = SnaplogicConfig.parse_obj(config_dict)
+        config = SnaplogicConfig.model_validate(config_dict)
         return cls(config, ctx)

datahub/ingestion/source/snowflake/snowflake_assertion.py

@@ -91,7 +91,7 @@ class SnowflakeAssertionsHandler:
         self, result_row: dict, discovered_datasets: List[str]
     ) -> Optional[MetadataChangeProposalWrapper]:
         try:
-            result = DataQualityMonitoringResult.parse_obj(result_row)
+            result = DataQualityMonitoringResult.model_validate(result_row)
             assertion_guid = result.METRIC_NAME.split("__")[-1].lower()
             status = bool(result.VALUE)  # 1 if PASS, 0 if FAIL
             assertee = self.identifiers.get_dataset_identifier(
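
The snowflake_config.py hunks that follow also move cross-field checks from the v1 values dict to ValidationInfo.data, where a field_validator can read fields that were validated earlier in the model. A hedged sketch of that pattern with a hypothetical LineageFlags model:

# Illustrative only - not part of the package diff.
from pydantic import BaseModel, ValidationInfo, field_validator


class LineageFlags(BaseModel):
    include_table_lineage: bool = True
    include_column_lineage: bool = True  # validated after include_table_lineage

    @field_validator("include_column_lineage", mode="after")
    @classmethod
    def require_table_lineage(cls, v: bool, info: ValidationInfo) -> bool:
        # info.data holds the already-validated fields declared above this one.
        if v and not info.data.get("include_table_lineage"):
            raise ValueError(
                "include_table_lineage must be True for include_column_lineage to be set."
            )
        return v


LineageFlags(include_table_lineage=False, include_column_lineage=False)  # ok
# LineageFlags(include_table_lineage=False, include_column_lineage=True) raises a ValidationError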

datahub/ingestion/source/snowflake/snowflake_config.py

@@ -5,7 +5,7 @@ from enum import Enum
 from typing import Dict, List, Optional, Set
 
 import pydantic
-from pydantic import Field, root_validator, validator
+from pydantic import Field, ValidationInfo, field_validator, model_validator
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.pattern_utils import UUID_REGEX
@@ -122,10 +122,10 @@ class SnowflakeFilterConfig(SQLFilterConfig):
         description="Whether `schema_pattern` is matched against fully qualified schema name `<catalog>.<schema>`.",
     )
 
-    @root_validator(pre=False, skip_on_failure=True)
-    def validate_legacy_schema_pattern(cls, values: Dict) -> Dict:
-        schema_pattern: Optional[AllowDenyPattern] = values.get("schema_pattern")
-        match_fully_qualified_names = values.get("match_fully_qualified_names")
+    @model_validator(mode="after")
+    def validate_legacy_schema_pattern(self) -> "SnowflakeFilterConfig":
+        schema_pattern: Optional[AllowDenyPattern] = self.schema_pattern
+        match_fully_qualified_names = self.match_fully_qualified_names
 
         if (
             schema_pattern is not None
@@ -145,7 +145,7 @@ class SnowflakeFilterConfig(SQLFilterConfig):
             assert isinstance(schema_pattern, AllowDenyPattern)
             schema_pattern.deny.append(r".*INFORMATION_SCHEMA$")
 
-        return values
+        return self
 
 
 class SnowflakeIdentifierConfig(
@@ -391,7 +391,8 @@ class SnowflakeV2Config(
         "This may be required in the case of _eg_ temporary tables being created in a different database than the ones in the database_name patterns.",
     )
 
-    @validator("convert_urns_to_lowercase")
+    @field_validator("convert_urns_to_lowercase", mode="after")
+    @classmethod
     def validate_convert_urns_to_lowercase(cls, v):
         if not v:
             add_global_warning(
@@ -400,30 +401,31 @@ class SnowflakeV2Config(
 
         return v
 
-    @validator("include_column_lineage")
-    def validate_include_column_lineage(cls, v, values):
-        if not values.get("include_table_lineage") and v:
+    @field_validator("include_column_lineage", mode="after")
+    @classmethod
+    def validate_include_column_lineage(cls, v, info):
+        if not info.data.get("include_table_lineage") and v:
             raise ValueError(
                 "include_table_lineage must be True for include_column_lineage to be set."
             )
         return v
 
-    @root_validator(pre=False, skip_on_failure=True)
-    def validate_unsupported_configs(cls, values: Dict) -> Dict:
-        value = values.get("include_read_operational_stats")
-        if value is not None and value:
+    @model_validator(mode="after")
+    def validate_unsupported_configs(self) -> "SnowflakeV2Config":
+        if (
+            hasattr(self, "include_read_operational_stats")
+            and self.include_read_operational_stats
+        ):
             raise ValueError(
                 "include_read_operational_stats is not supported. Set `include_read_operational_stats` to False.",
             )
 
-        include_technical_schema = values.get("include_technical_schema")
-        include_profiles = (
-            values.get("profiling") is not None and values["profiling"].enabled
-        )
+        include_technical_schema = self.include_technical_schema
+        include_profiles = self.profiling is not None and self.profiling.enabled
         delete_detection_enabled = (
-            values.get("stateful_ingestion") is not None
-            and values["stateful_ingestion"].enabled
-            and values["stateful_ingestion"].remove_stale_metadata
+            self.stateful_ingestion is not None
+            and self.stateful_ingestion.enabled
+            and self.stateful_ingestion.remove_stale_metadata
         )
 
         # TODO: Allow profiling irrespective of basic schema extraction,
@@ -435,13 +437,14 @@ class SnowflakeV2Config(
                 "Cannot perform Deletion Detection or Profiling without extracting snowflake technical schema. Set `include_technical_schema` to True or disable Deletion Detection and Profiling."
             )
 
-        return values
+        return self
 
-    @validator("shares")
+    @field_validator("shares", mode="after")
+    @classmethod
     def validate_shares(
-        cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict
+        cls, shares: Optional[Dict[str, SnowflakeShareConfig]], info: ValidationInfo
     ) -> Optional[Dict[str, SnowflakeShareConfig]]:
-        current_platform_instance = values.get("platform_instance")
+        current_platform_instance = info.data.get("platform_instance")
 
         if shares:
             # Check: platform_instance should be present
@@ -479,11 +482,12 @@ class SnowflakeV2Config(
 
         return shares
 
-    @root_validator(pre=False, skip_on_failure=True)
-    def validate_queries_v2_stateful_ingestion(cls, values: Dict) -> Dict:
-        if values.get("use_queries_v2"):
-            if values.get("enable_stateful_lineage_ingestion") or values.get(
-                "enable_stateful_usage_ingestion"
+    @model_validator(mode="after")
+    def validate_queries_v2_stateful_ingestion(self) -> "SnowflakeV2Config":
+        if self.use_queries_v2:
+            if (
+                self.enable_stateful_lineage_ingestion
+                or self.enable_stateful_usage_ingestion
             ):
                 logger.warning(
                     "enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion are deprecated "
@@ -491,7 +495,7 @@ class SnowflakeV2Config(
                     "For queries v2, use enable_stateful_time_window instead to enable stateful ingestion "
                     "for the unified time window extraction (lineage + usage + operations + queries)."
                 )
-        return values
+        return self
 
     def outbounds(self) -> Dict[str, Set[DatabaseId]]:
         """