acryl-datahub 1.3.1__py3-none-any.whl → 1.3.1.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (193)
  1. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/METADATA +2501 -2501
  2. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/RECORD +193 -193
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +2 -2
  5. datahub/api/entities/corpgroup/corpgroup.py +11 -6
  6. datahub/api/entities/corpuser/corpuser.py +11 -11
  7. datahub/api/entities/dataproduct/dataproduct.py +47 -27
  8. datahub/api/entities/dataset/dataset.py +32 -21
  9. datahub/api/entities/external/lake_formation_external_entites.py +5 -6
  10. datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
  11. datahub/api/entities/forms/forms.py +16 -14
  12. datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
  13. datahub/cli/check_cli.py +2 -2
  14. datahub/cli/config_utils.py +3 -3
  15. datahub/cli/lite_cli.py +9 -7
  16. datahub/cli/migrate.py +4 -4
  17. datahub/cli/quickstart_versioning.py +3 -3
  18. datahub/cli/specific/group_cli.py +1 -1
  19. datahub/cli/specific/structuredproperties_cli.py +1 -1
  20. datahub/cli/specific/user_cli.py +1 -1
  21. datahub/configuration/common.py +14 -2
  22. datahub/configuration/connection_resolver.py +2 -2
  23. datahub/configuration/git.py +47 -30
  24. datahub/configuration/import_resolver.py +2 -2
  25. datahub/configuration/kafka.py +4 -3
  26. datahub/configuration/time_window_config.py +26 -26
  27. datahub/configuration/validate_field_deprecation.py +2 -2
  28. datahub/configuration/validate_field_removal.py +2 -2
  29. datahub/configuration/validate_field_rename.py +2 -2
  30. datahub/configuration/validate_multiline_string.py +2 -1
  31. datahub/emitter/kafka_emitter.py +3 -1
  32. datahub/emitter/rest_emitter.py +2 -4
  33. datahub/ingestion/api/decorators.py +1 -1
  34. datahub/ingestion/api/report.py +1 -1
  35. datahub/ingestion/api/sink.py +1 -1
  36. datahub/ingestion/api/source.py +1 -1
  37. datahub/ingestion/glossary/datahub_classifier.py +11 -8
  38. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
  39. datahub/ingestion/reporting/file_reporter.py +5 -4
  40. datahub/ingestion/run/pipeline.py +6 -6
  41. datahub/ingestion/run/pipeline_config.py +12 -14
  42. datahub/ingestion/run/sink_callback.py +1 -1
  43. datahub/ingestion/sink/datahub_rest.py +6 -4
  44. datahub/ingestion/source/abs/config.py +19 -19
  45. datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
  46. datahub/ingestion/source/abs/source.py +2 -2
  47. datahub/ingestion/source/aws/aws_common.py +1 -1
  48. datahub/ingestion/source/aws/glue.py +6 -4
  49. datahub/ingestion/source/aws/sagemaker.py +1 -1
  50. datahub/ingestion/source/azure/azure_common.py +8 -12
  51. datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
  52. datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
  53. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
  54. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  55. datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
  56. datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
  57. datahub/ingestion/source/datahub/config.py +8 -8
  58. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  59. datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
  60. datahub/ingestion/source/dbt/dbt_common.py +39 -37
  61. datahub/ingestion/source/dbt/dbt_core.py +10 -12
  62. datahub/ingestion/source/debug/datahub_debug.py +1 -1
  63. datahub/ingestion/source/delta_lake/config.py +6 -4
  64. datahub/ingestion/source/dremio/dremio_config.py +10 -6
  65. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  66. datahub/ingestion/source/elastic_search.py +4 -3
  67. datahub/ingestion/source/excel/source.py +1 -1
  68. datahub/ingestion/source/feast.py +1 -1
  69. datahub/ingestion/source/file.py +5 -4
  70. datahub/ingestion/source/fivetran/config.py +17 -16
  71. datahub/ingestion/source/fivetran/fivetran.py +2 -2
  72. datahub/ingestion/source/gc/datahub_gc.py +1 -1
  73. datahub/ingestion/source/gcs/gcs_source.py +8 -10
  74. datahub/ingestion/source/ge_profiling_config.py +8 -5
  75. datahub/ingestion/source/grafana/grafana_api.py +2 -2
  76. datahub/ingestion/source/grafana/grafana_config.py +4 -3
  77. datahub/ingestion/source/grafana/grafana_source.py +1 -1
  78. datahub/ingestion/source/grafana/models.py +23 -5
  79. datahub/ingestion/source/hex/api.py +7 -5
  80. datahub/ingestion/source/hex/hex.py +4 -3
  81. datahub/ingestion/source/iceberg/iceberg.py +1 -1
  82. datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
  83. datahub/ingestion/source/identity/azure_ad.py +1 -1
  84. datahub/ingestion/source/identity/okta.py +10 -10
  85. datahub/ingestion/source/kafka/kafka.py +1 -1
  86. datahub/ingestion/source/ldap.py +1 -1
  87. datahub/ingestion/source/looker/looker_common.py +7 -5
  88. datahub/ingestion/source/looker/looker_config.py +21 -20
  89. datahub/ingestion/source/looker/lookml_config.py +47 -47
  90. datahub/ingestion/source/metabase.py +8 -8
  91. datahub/ingestion/source/metadata/business_glossary.py +2 -2
  92. datahub/ingestion/source/metadata/lineage.py +13 -8
  93. datahub/ingestion/source/mlflow.py +1 -1
  94. datahub/ingestion/source/mode.py +6 -4
  95. datahub/ingestion/source/mongodb.py +4 -3
  96. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  97. datahub/ingestion/source/nifi.py +17 -23
  98. datahub/ingestion/source/openapi.py +6 -8
  99. datahub/ingestion/source/powerbi/config.py +33 -32
  100. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
  101. datahub/ingestion/source/powerbi/powerbi.py +1 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
  103. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
  104. datahub/ingestion/source/preset.py +8 -8
  105. datahub/ingestion/source/pulsar.py +1 -1
  106. datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
  107. datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
  108. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  109. datahub/ingestion/source/redshift/config.py +18 -20
  110. datahub/ingestion/source/redshift/redshift.py +2 -2
  111. datahub/ingestion/source/redshift/usage.py +23 -3
  112. datahub/ingestion/source/s3/config.py +83 -62
  113. datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
  114. datahub/ingestion/source/s3/source.py +8 -5
  115. datahub/ingestion/source/sac/sac.py +5 -4
  116. datahub/ingestion/source/salesforce.py +3 -2
  117. datahub/ingestion/source/schema/json_schema.py +2 -2
  118. datahub/ingestion/source/sigma/data_classes.py +3 -2
  119. datahub/ingestion/source/sigma/sigma.py +1 -1
  120. datahub/ingestion/source/sigma/sigma_api.py +7 -7
  121. datahub/ingestion/source/slack/slack.py +1 -1
  122. datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
  123. datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
  124. datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
  125. datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
  126. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
  127. datahub/ingestion/source/snowflake/snowflake_queries.py +1 -1
  128. datahub/ingestion/source/sql/athena.py +1 -1
  129. datahub/ingestion/source/sql/clickhouse.py +4 -2
  130. datahub/ingestion/source/sql/cockroachdb.py +1 -1
  131. datahub/ingestion/source/sql/druid.py +1 -1
  132. datahub/ingestion/source/sql/hana.py +1 -1
  133. datahub/ingestion/source/sql/hive.py +7 -5
  134. datahub/ingestion/source/sql/hive_metastore.py +1 -1
  135. datahub/ingestion/source/sql/mssql/source.py +13 -6
  136. datahub/ingestion/source/sql/mysql.py +1 -1
  137. datahub/ingestion/source/sql/oracle.py +17 -10
  138. datahub/ingestion/source/sql/postgres.py +2 -2
  139. datahub/ingestion/source/sql/presto.py +1 -1
  140. datahub/ingestion/source/sql/sql_config.py +8 -9
  141. datahub/ingestion/source/sql/sql_generic.py +1 -1
  142. datahub/ingestion/source/sql/teradata.py +1 -1
  143. datahub/ingestion/source/sql/trino.py +1 -1
  144. datahub/ingestion/source/sql/vertica.py +5 -4
  145. datahub/ingestion/source/sql_queries.py +11 -8
  146. datahub/ingestion/source/state/checkpoint.py +2 -2
  147. datahub/ingestion/source/state/entity_removal_state.py +2 -1
  148. datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
  149. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
  150. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  151. datahub/ingestion/source/superset.py +9 -9
  152. datahub/ingestion/source/tableau/tableau.py +14 -16
  153. datahub/ingestion/source/unity/config.py +33 -34
  154. datahub/ingestion/source/unity/proxy.py +203 -0
  155. datahub/ingestion/source/unity/proxy_types.py +91 -0
  156. datahub/ingestion/source/unity/source.py +27 -2
  157. datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
  158. datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
  159. datahub/ingestion/source/usage/usage_common.py +5 -3
  160. datahub/ingestion/source_config/csv_enricher.py +7 -6
  161. datahub/ingestion/source_config/operation_config.py +7 -4
  162. datahub/ingestion/source_config/pulsar.py +11 -15
  163. datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
  164. datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
  165. datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
  166. datahub/ingestion/transformer/add_dataset_properties.py +2 -2
  167. datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
  168. datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
  169. datahub/ingestion/transformer/add_dataset_tags.py +3 -3
  170. datahub/ingestion/transformer/add_dataset_terms.py +3 -3
  171. datahub/ingestion/transformer/dataset_domain.py +3 -3
  172. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
  173. datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
  174. datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
  175. datahub/ingestion/transformer/mark_dataset_status.py +1 -1
  176. datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
  177. datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
  178. datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
  179. datahub/ingestion/transformer/replace_external_url.py +2 -2
  180. datahub/ingestion/transformer/set_browse_path.py +1 -1
  181. datahub/ingestion/transformer/tags_to_terms.py +1 -1
  182. datahub/lite/duckdb_lite.py +1 -1
  183. datahub/lite/lite_util.py +2 -2
  184. datahub/sdk/search_filters.py +68 -40
  185. datahub/secret/datahub_secret_store.py +7 -4
  186. datahub/secret/file_secret_store.py +1 -1
  187. datahub/sql_parsing/sqlglot_lineage.py +5 -2
  188. datahub/testing/check_sql_parser_result.py +2 -2
  189. datahub/utilities/ingest_utils.py +1 -1
  190. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/WHEEL +0 -0
  191. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/entry_points.txt +0 -0
  192. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/licenses/LICENSE +0 -0
  193. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/preset.py
@@ -2,7 +2,7 @@ import logging
 from typing import Dict, Optional
 
 import requests
-from pydantic import root_validator, validator
+from pydantic import field_validator, model_validator
 from pydantic.fields import Field
 
 from datahub.emitter.mce_builder import DEFAULT_ENV
@@ -55,16 +55,16 @@ class PresetConfig(SupersetConfig):
         description="Can be used to change mapping for database names in superset to what you have in datahub",
     )
 
-    @validator("connect_uri", "display_uri")
+    @field_validator("connect_uri", "display_uri", mode="after")
+    @classmethod
     def remove_trailing_slash(cls, v):
         return config_clean.remove_trailing_slashes(v)
 
-    @root_validator(skip_on_failure=True)
-    def default_display_uri_to_connect_uri(cls, values):
-        base = values.get("display_uri")
-        if base is None:
-            values["display_uri"] = values.get("connect_uri")
-        return values
+    @model_validator(mode="after")
+    def default_display_uri_to_connect_uri(self) -> "PresetConfig":
+        if self.display_uri is None:
+            self.display_uri = self.connect_uri
+        return self
 
 
 @platform_name("Preset")
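
Note on the hunks above: they show the Pydantic v1 to v2 validator migration that repeats throughout this release. Per-field @validator becomes @field_validator plus @classmethod, and the dict-based @root_validator(skip_on_failure=True) becomes @model_validator(mode="after"), which runs on the constructed model and returns self. A minimal, self-contained sketch of the pattern; the ServiceConfig model and its fields are illustrative, not DataHub code:

from typing import Optional

from pydantic import BaseModel, field_validator, model_validator


class ServiceConfig(BaseModel):  # hypothetical model, not the real PresetConfig
    connect_uri: str
    display_uri: Optional[str] = None

    # Pydantic v1: @validator("connect_uri", "display_uri")
    @field_validator("connect_uri", "display_uri", mode="after")
    @classmethod
    def remove_trailing_slash(cls, v):
        # Per-field validator; runs after type coercion.
        return v.rstrip("/") if isinstance(v, str) else v

    # Pydantic v1: @root_validator(skip_on_failure=True) receiving a values dict
    @model_validator(mode="after")
    def default_display_uri_to_connect_uri(self) -> "ServiceConfig":
        # Whole-model validator; mutate attributes on self and return self.
        if self.display_uri is None:
            self.display_uri = self.connect_uri
        return self


print(ServiceConfig(connect_uri="https://example.acryl.io/").display_uri)
# https://example.acryl.io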

datahub/ingestion/source/pulsar.py
@@ -235,7 +235,7 @@ class PulsarSource(StatefulIngestionSourceBase):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = PulsarSourceConfig.parse_obj(config_dict)
+        config = PulsarSourceConfig.model_validate(config_dict)
 
         # Do not include each individual partition for partitioned topics,
         if config.exclude_individual_partitions:
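
Note: parse_obj is the Pydantic v1 constructor; this release switches to the v2 name model_validate (and, in later hunks, .dict() to model_dump()). A short sketch with a made-up model, not the real PulsarSourceConfig:

from pydantic import BaseModel


class TopicSourceConfig(BaseModel):  # hypothetical stand-in
    tenant: str = "public"
    exclude_individual_partitions: bool = True


raw = {"tenant": "analytics", "exclude_individual_partitions": False}

# Pydantic v1 spellings (kept only as deprecated aliases in v2):
#   config = TopicSourceConfig.parse_obj(raw); config.dict()
# Pydantic v2 spellings used throughout this release:
config = TopicSourceConfig.model_validate(raw)
print(config.model_dump())
# {'tenant': 'analytics', 'exclude_individual_partitions': False}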

datahub/ingestion/source/qlik_sense/data_classes.py
@@ -3,7 +3,7 @@ from datetime import datetime
 from enum import Enum
 from typing import Dict, List, Optional, Type, Union
 
-from pydantic import BaseModel, ConfigDict, Field, root_validator
+from pydantic import BaseModel, ConfigDict, Field, model_validator
 
 from datahub.emitter.mcp_builder import ContainerKey
 from datahub.ingestion.source.qlik_sense.config import QLIK_DATETIME_FORMAT, Constant
@@ -92,7 +92,8 @@ class Space(_QlikBaseModel):
     updatedAt: datetime
     ownerId: Optional[str] = None
 
-    @root_validator(pre=True)
+    @model_validator(mode="before")
+    @classmethod
     def update_values(cls, values: Dict) -> Dict:
         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
         values = deepcopy(values)
@@ -121,7 +122,8 @@ class SchemaField(_QlikBaseModel):
     primaryKey: Optional[bool] = None
     nullable: Optional[bool] = None
 
-    @root_validator(pre=True)
+    @model_validator(mode="before")
+    @classmethod
     def update_values(cls, values: Dict) -> Dict:
         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
         values = deepcopy(values)
@@ -138,7 +140,8 @@ class QlikDataset(Item):
     itemId: str
     datasetSchema: List[SchemaField]
 
-    @root_validator(pre=True)
+    @model_validator(mode="before")
+    @classmethod
     def update_values(cls, values: Dict) -> Dict:
         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
         values = deepcopy(values)
@@ -174,7 +177,8 @@ class Chart(_QlikBaseModel):
     qDimension: List[AxisProperty]
     qMeasure: List[AxisProperty]
 
-    @root_validator(pre=True)
+    @model_validator(mode="before")
+    @classmethod
     def update_values(cls, values: Dict) -> Dict:
         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
         values = deepcopy(values)
@@ -193,7 +197,8 @@ class Sheet(_QlikBaseModel):
     updatedAt: datetime
     charts: List[Chart] = []
 
-    @root_validator(pre=True)
+    @model_validator(mode="before")
+    @classmethod
     def update_values(cls, values: Dict) -> Dict:
         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
         values = deepcopy(values)
@@ -220,7 +225,8 @@ class QlikTable(_QlikBaseModel):
     databaseName: Optional[str] = None
     schemaName: Optional[str] = None
 
-    @root_validator(pre=True)
+    @model_validator(mode="before")
+    @classmethod
    def update_values(cls, values: Dict) -> Dict:
         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
         values = deepcopy(values)
@@ -239,7 +245,8 @@ class App(Item):
     sheets: List[Sheet] = []
     tables: List[QlikTable] = []
 
-    @root_validator(pre=True)
+    @model_validator(mode="before")
+    @classmethod
     def update_values(cls, values: Dict) -> Dict:
         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
         values = deepcopy(values)
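
Note: the Qlik models above all move from @root_validator(pre=True) to @model_validator(mode="before") with @classmethod, i.e. a validator that rewrites the raw input before field parsing. A minimal sketch with a hypothetical model (SpaceLike and the qName key are illustrative only):

from copy import deepcopy
from typing import Any, Dict

from pydantic import BaseModel, model_validator


class SpaceLike(BaseModel):  # hypothetical stand-in for the Qlik data classes
    id: str
    name: str = ""

    @model_validator(mode="before")
    @classmethod
    def update_values(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        # "before" validators see the raw input; copying it keeps the caller's
        # dict untouched, which is why the diff adds deepcopy() in these models.
        values = deepcopy(values)
        if "qName" in values:
            values["name"] = values.pop("qName")
        return values


print(SpaceLike.model_validate({"id": "space-1", "qName": "Sales"}).name)  # Sales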

datahub/ingestion/source/qlik_sense/qlik_api.py
@@ -56,7 +56,7 @@ class QlikAPI:
                 response.raise_for_status()
                 response_dict = response.json()
                 for space_dict in response_dict[Constant.DATA]:
-                    space = Space.parse_obj(space_dict)
+                    space = Space.model_validate(space_dict)
                     spaces.append(space)
                     self.spaces[space.id] = space.name
                 if Constant.NEXT in response_dict[Constant.LINKS]:
@@ -64,7 +64,7 @@ class QlikAPI:
                 else:
                     break
             # Add personal space entity
-            spaces.append(Space.parse_obj(PERSONAL_SPACE_DICT))
+            spaces.append(Space.model_validate(PERSONAL_SPACE_DICT))
             self.spaces[PERSONAL_SPACE_DICT[Constant.ID]] = PERSONAL_SPACE_DICT[
                 Constant.NAME
             ]
@@ -78,7 +78,7 @@ class QlikAPI:
             response.raise_for_status()
             response_dict = response.json()
             response_dict[Constant.ITEMID] = item_id
-            return QlikDataset.parse_obj(response_dict)
+            return QlikDataset.model_validate(response_dict)
         except Exception as e:
             self._log_http_error(
                 message=f"Unable to fetch dataset with id {dataset_id}. Exception: {e}"
@@ -119,7 +119,7 @@ class QlikAPI:
                     f"Chart with id {chart_id} of sheet {sheet_id} does not have hypercube. q_layout: {q_layout}"
                 )
                 return None
-            return Chart.parse_obj(q_layout)
+            return Chart.model_validate(q_layout)
         except Exception as e:
             self._log_http_error(
                 message=f"Unable to fetch chart {chart_id} of sheet {sheet_id}. Exception: {e}"
@@ -140,7 +140,7 @@ class QlikAPI:
             if Constant.OWNERID not in sheet_dict[Constant.QMETA]:
                 # That means sheet is private sheet
                 return None
-            sheet = Sheet.parse_obj(sheet_dict[Constant.QMETA])
+            sheet = Sheet.model_validate(sheet_dict[Constant.QMETA])
             if Constant.QCHILDLIST not in sheet_dict:
                 logger.warning(
                     f"Sheet {sheet.title} with id {sheet_id} does not have any charts. sheet_dict: {sheet_dict}"
@@ -222,7 +222,7 @@ class QlikAPI:
                 return []
             response = websocket_connection.websocket_send_request(method="GetLayout")
             for table_dict in response[Constant.QLAYOUT][Constant.TABLES]:
-                tables.append(QlikTable.parse_obj(table_dict))
+                tables.append(QlikTable.model_validate(table_dict))
             websocket_connection.handle.pop()
             self._add_qri_of_tables(tables, app_id)
         except Exception as e:
@@ -270,7 +270,7 @@ class QlikAPI:
             response = websocket_connection.websocket_send_request(
                 method="GetAppLayout"
             )
-            app = App.parse_obj(response[Constant.QLAYOUT])
+            app = App.model_validate(response[Constant.QLAYOUT])
             app.sheets = self._get_app_sheets(websocket_connection, app_id)
             app.tables = self._get_app_used_tables(websocket_connection, app_id)
             websocket_connection.close_websocket()

datahub/ingestion/source/qlik_sense/qlik_sense.py
@@ -148,7 +148,7 @@ class QlikSenseSource(StatefulIngestionSourceBase, TestableSource):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = QlikSourceConfig.parse_obj(config_dict)
+        config = QlikSourceConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     def _gen_space_key(self, space_id: str) -> SpaceKey:

datahub/ingestion/source/redshift/config.py
@@ -3,7 +3,7 @@ from copy import deepcopy
 from enum import Enum
 from typing import Any, Dict, List, Optional
 
-from pydantic import root_validator
+from pydantic import model_validator
 from pydantic.fields import Field
 
 from datahub.configuration import ConfigModel
@@ -182,7 +182,8 @@ class RedshiftConfig(
         description="Whether to skip EXTERNAL tables.",
     )
 
-    @root_validator(pre=True)
+    @model_validator(mode="before")
+    @classmethod
     def check_email_is_set_on_usage(cls, values):
         if values.get("include_usage_statistics"):
             assert "email_domain" in values and values["email_domain"], (
@@ -190,31 +191,28 @@ class RedshiftConfig(
             )
         return values
 
-    @root_validator(skip_on_failure=True)
-    def check_database_is_set(cls, values):
-        assert values.get("database"), "database must be set"
-        return values
-
-    @root_validator(skip_on_failure=True)
-    def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
-        match_fully_qualified_names = values.get("match_fully_qualified_names")
-
-        schema_pattern: Optional[AllowDenyPattern] = values.get("schema_pattern")
+    @model_validator(mode="after")
+    def check_database_is_set(self) -> "RedshiftConfig":
+        assert self.database, "database must be set"
+        return self
 
+    @model_validator(mode="after")
+    def backward_compatibility_configs_set(self) -> "RedshiftConfig":
         if (
-            schema_pattern is not None
-            and schema_pattern != AllowDenyPattern.allow_all()
-            and match_fully_qualified_names is not None
-            and not match_fully_qualified_names
+            self.schema_pattern is not None
+            and self.schema_pattern != AllowDenyPattern.allow_all()
+            and self.match_fully_qualified_names is not None
+            and not self.match_fully_qualified_names
         ):
             logger.warning(
                 "Please update `schema_pattern` to match against fully qualified schema name `<database_name>.<schema_name>` and set config `match_fully_qualified_names : True`."
                 "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
                 "The config option `match_fully_qualified_names` will be deprecated in future and the default behavior will assume `match_fully_qualified_names: True`."
             )
-        return values
+        return self
 
-    @root_validator(skip_on_failure=True)
+    @model_validator(mode="before")
+    @classmethod
     def connection_config_compatibility_set(cls, values: Dict) -> Dict:
         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
         values = deepcopy(values)
@@ -231,8 +229,8 @@ class RedshiftConfig(
         if "options" in values and "connect_args" in values["options"]:
             values["extra_client_options"] = values["options"]["connect_args"]
 
-        if values["extra_client_options"]:
-            if values["options"]:
+        if values.get("extra_client_options"):
+            if values.get("options"):
                 values["options"]["connect_args"] = values["extra_client_options"]
             else:
                 values["options"] = {"connect_args": values["extra_client_options"]}

datahub/ingestion/source/redshift/redshift.py
@@ -236,7 +236,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         RedshiftConfig.Config.extra = (
             pydantic.Extra.allow
         ) # we are okay with extra fields during this stage
-        config = RedshiftConfig.parse_obj(config_dict)
+        config = RedshiftConfig.model_validate(config_dict)
         # source = RedshiftSource(config, report)
         connection: redshift_connector.Connection = (
             RedshiftSource.get_redshift_connection(config)
@@ -316,7 +316,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = RedshiftConfig.parse_obj(config_dict)
+        config = RedshiftConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     @staticmethod

datahub/ingestion/source/redshift/usage.py
@@ -1,12 +1,12 @@
 import collections
 import logging
 import time
-from datetime import datetime
+from datetime import datetime, timezone
 from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import cachetools
-import pydantic.error_wrappers
 import redshift_connector
+from pydantic import ValidationError, field_validator
 from pydantic.fields import Field
 from pydantic.main import BaseModel
 
@@ -64,6 +64,26 @@ class RedshiftAccessEvent(BaseModel):
     starttime: datetime
     endtime: datetime
 
+    @field_validator("starttime", "endtime", mode="before")
+    @classmethod
+    def ensure_utc_datetime(cls, v):
+        """Ensure datetime fields are treated as UTC for consistency with Pydantic V1 behavior.
+
+        Pydantic V2 assumes local timezone for naive datetime strings, whereas Pydantic V1 assumed UTC.
+        This validator restores V1 behavior to maintain timestamp consistency.
+        """
+        if isinstance(v, str):
+            # Parse as naive datetime, then assume UTC (matching V1 behavior)
+            dt = datetime.fromisoformat(v)
+            if dt.tzinfo is None:
+                # Treat naive datetime as UTC (this was the V1 behavior)
+                dt = dt.replace(tzinfo=timezone.utc)
+            return dt
+        elif isinstance(v, datetime) and v.tzinfo is None:
+            # If we get a naive datetime object, assume UTC
+            return v.replace(tzinfo=timezone.utc)
+        return v
+
 
 class RedshiftUsageExtractor:
     """
@@ -291,7 +311,7 @@ class RedshiftUsageExtractor:
                     else None
                 ),
             )
-        except pydantic.error_wrappers.ValidationError as e:
+        except ValidationError as e:
            logging.warning(
                f"Validation error on access event creation from row {row}. The error was: {e} Skipping ...."
            )
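
Note: the new ensure_utc_datetime validator pins naive timestamps to UTC before field parsing. A trimmed-down sketch of the resulting behavior; AccessEvent is a stand-in, not the real RedshiftAccessEvent:

from datetime import datetime, timezone

from pydantic import BaseModel, field_validator


class AccessEvent(BaseModel):  # hypothetical, trimmed stand-in
    starttime: datetime

    @field_validator("starttime", mode="before")
    @classmethod
    def ensure_utc(cls, v):
        # Naive inputs are pinned to UTC, matching Pydantic v1's old behavior.
        if isinstance(v, str):
            v = datetime.fromisoformat(v)
        if isinstance(v, datetime) and v.tzinfo is None:
            v = v.replace(tzinfo=timezone.utc)
        return v


event = AccessEvent(starttime="2024-05-01 12:30:00")
print(event.starttime.tzinfo)  # UTC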

datahub/ingestion/source/s3/config.py
@@ -1,7 +1,7 @@
 import logging
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, Optional, Union
 
-import pydantic
+from pydantic import ValidationInfo, field_validator, model_validator
 from pydantic.fields import Field
 
 from datahub.configuration.common import AllowDenyPattern
@@ -12,7 +12,6 @@ from datahub.configuration.validate_field_deprecation import pydantic_field_depr
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
 from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin
-from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
 from datahub.ingestion.source.s3.datalake_profiler_config import DataLakeProfilerConfig
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulStaleMetadataRemovalConfig,
@@ -117,69 +116,91 @@ class DataLakeSourceConfig(
             self.profiling.operation_config
         )
 
-    @pydantic.validator("path_specs", always=True)
-    def check_path_specs_and_infer_platform(
-        cls, path_specs: List[PathSpec], values: Dict
-    ) -> List[PathSpec]:
+    @field_validator("path_specs", mode="before")
+    @classmethod
+    def check_path_specs(cls, path_specs: Any, info: ValidationInfo) -> Any:
         if len(path_specs) == 0:
             raise ValueError("path_specs must not be empty")
 
-        # Check that all path specs have the same platform.
-        guessed_platforms = {
-            "s3" if path_spec.is_s3 else "file" for path_spec in path_specs
-        }
-        if len(guessed_platforms) > 1:
-            raise ValueError(
-                f"Cannot have multiple platforms in path_specs: {guessed_platforms}"
-            )
-        guessed_platform = guessed_platforms.pop()
-
-        # Ensure s3 configs aren't used for file sources.
-        if guessed_platform != "s3" and (
-            values.get("use_s3_object_tags") or values.get("use_s3_bucket_tags")
-        ):
-            raise ValueError(
-                "Cannot grab s3 object/bucket tags when platform is not s3. Remove the flag or use s3."
-            )
-
-        # Infer platform if not specified.
-        if values.get("platform") and values["platform"] != guessed_platform:
-            raise ValueError(
-                f"All path_specs belong to {guessed_platform} platform, but platform is set to {values['platform']}"
-            )
-        else:
-            logger.debug(f'Setting config "platform": {guessed_platform}')
-            values["platform"] = guessed_platform
+        # Basic validation - path specs consistency and S3 config validation is now handled in model_validator
 
         return path_specs
 
-    @pydantic.validator("platform", always=True)
-    def platform_valid(cls, platform: Any, values: dict) -> str:
-        inferred_platform = values.get("platform") # we may have inferred it above
-        platform = platform or inferred_platform
-        if not platform:
-            raise ValueError("platform must not be empty")
-
-        if platform != "s3" and values.get("use_s3_bucket_tags"):
-            raise ValueError(
-                "Cannot grab s3 bucket tags when platform is not s3. Remove the flag or ingest from s3."
-            )
-        if platform != "s3" and values.get("use_s3_object_tags"):
-            raise ValueError(
-                "Cannot grab s3 object tags when platform is not s3. Remove the flag or ingest from s3."
-            )
-        if platform != "s3" and values.get("use_s3_content_type"):
-            raise ValueError(
-                "Cannot grab s3 object content type when platform is not s3. Remove the flag or ingest from s3."
-            )
-
-        return platform
-
-    @pydantic.root_validator(skip_on_failure=True)
-    def ensure_profiling_pattern_is_passed_to_profiling(
-        cls, values: Dict[str, Any]
-    ) -> Dict[str, Any]:
-        profiling: Optional[DataLakeProfilerConfig] = values.get("profiling")
+    @model_validator(mode="after")
+    def ensure_profiling_pattern_is_passed_to_profiling(self) -> "DataLakeSourceConfig":
+        profiling = self.profiling
         if profiling is not None and profiling.enabled:
-            profiling._allow_deny_patterns = values["profile_patterns"]
-        return values
+            profiling._allow_deny_patterns = self.profile_patterns
+        return self
+
+    @model_validator(mode="after")
+    def validate_platform_and_config_consistency(self) -> "DataLakeSourceConfig":
+        """Infer platform from path_specs and validate config consistency."""
+        # Track whether platform was explicitly provided
+        platform_was_explicit = bool(self.platform)
+
+        # Infer platform from path_specs if not explicitly set
+        if not self.platform and self.path_specs:
+            guessed_platforms = set()
+            for path_spec in self.path_specs:
+                if (
+                    hasattr(path_spec, "include")
+                    and path_spec.include
+                    and path_spec.include.startswith("s3://")
+                ):
+                    guessed_platforms.add("s3")
+                else:
+                    guessed_platforms.add("file")
+
+            # Ensure all path specs belong to the same platform
+            if len(guessed_platforms) > 1:
+                raise ValueError(
+                    f"Cannot have multiple platforms in path_specs: {guessed_platforms}"
+                )
+
+            if guessed_platforms:
+                guessed_platform = guessed_platforms.pop()
+                logger.debug(f"Inferred platform: {guessed_platform}")
+                self.platform = guessed_platform
+            else:
+                self.platform = "file"
+        elif not self.platform:
+            self.platform = "file"
+
+        # Validate platform consistency only when platform was inferred (not explicitly set)
+        # This allows sources like GCS to set platform="gcs" with s3:// URIs for correct container subtypes
+        if not platform_was_explicit and self.platform and self.path_specs:
+            expected_platforms = set()
+            for path_spec in self.path_specs:
+                if (
+                    hasattr(path_spec, "include")
+                    and path_spec.include
+                    and path_spec.include.startswith("s3://")
+                ):
+                    expected_platforms.add("s3")
+                else:
+                    expected_platforms.add("file")
+
+            if len(expected_platforms) == 1:
+                expected_platform = expected_platforms.pop()
+                if self.platform != expected_platform:
+                    raise ValueError(
+                        f"All path_specs belong to {expected_platform} platform, but platform was inferred as {self.platform}"
+                    )
+
+        # Validate S3-specific configurations
+        if self.platform != "s3":
+            if self.use_s3_bucket_tags:
+                raise ValueError(
+                    "Cannot grab s3 bucket tags when platform is not s3. Remove the flag or ingest from s3."
+                )
+            if self.use_s3_object_tags:
+                raise ValueError(
+                    "Cannot grab s3 object tags when platform is not s3. Remove the flag or ingest from s3."
                )
+            if self.use_s3_content_type:
+                raise ValueError(
+                    "Cannot grab s3 object content type when platform is not s3. Remove the flag or ingest from s3."
+                )
+
+        return self
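
Note: the S3 config now infers platform and checks the S3-only flags in a single @model_validator(mode="after") instead of two field validators. A heavily simplified sketch of the idea; LakeConfig and its plain-string path_specs are illustrative (the real config uses PathSpec objects and more flags):

from typing import List, Optional

from pydantic import BaseModel, model_validator


class LakeConfig(BaseModel):  # hypothetical, stripped-down stand-in
    path_specs: List[str]
    platform: Optional[str] = None
    use_s3_bucket_tags: bool = False

    @model_validator(mode="after")
    def infer_and_check_platform(self) -> "LakeConfig":
        if not self.platform:
            platforms = {
                "s3" if p.startswith("s3://") else "file" for p in self.path_specs
            }
            if len(platforms) > 1:
                raise ValueError(f"Cannot have multiple platforms in path_specs: {platforms}")
            self.platform = platforms.pop() if platforms else "file"
        if self.platform != "s3" and self.use_s3_bucket_tags:
            raise ValueError("Cannot grab s3 bucket tags when platform is not s3.")
        return self


print(LakeConfig(path_specs=["s3://bucket/data/*.parquet"]).platform)  # s3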

datahub/ingestion/source/s3/datalake_profiler_config.py
@@ -1,6 +1,7 @@
-from typing import Any, Dict, Optional
+from typing import Optional
 
 import pydantic
+from pydantic import model_validator
 from pydantic.fields import Field
 
 from datahub.configuration import ConfigModel
@@ -72,21 +73,18 @@ class DataLakeProfilerConfig(ConfigModel):
         description="Whether to profile for the sample values for all columns.",
     )
 
-    @pydantic.root_validator(skip_on_failure=True)
-    def ensure_field_level_settings_are_normalized(
-        cls: "DataLakeProfilerConfig", values: Dict[str, Any]
-    ) -> Dict[str, Any]:
-        max_num_fields_to_profile_key = "max_number_of_fields_to_profile"
-        max_num_fields_to_profile = values.get(max_num_fields_to_profile_key)
+    @model_validator(mode="after")
+    def ensure_field_level_settings_are_normalized(self) -> "DataLakeProfilerConfig":
+        max_num_fields_to_profile = self.max_number_of_fields_to_profile
 
         # Disable all field-level metrics.
-        if values.get("profile_table_level_only"):
-            for field_level_metric in cls.__fields__:
-                if field_level_metric.startswith("include_field_"):
-                    values.setdefault(field_level_metric, False)
+        if self.profile_table_level_only:
+            for field_name in self.__fields__:
+                if field_name.startswith("include_field_"):
+                    setattr(self, field_name, False)
 
         assert max_num_fields_to_profile is None, (
-            f"{max_num_fields_to_profile_key} should be set to None"
+            "max_number_of_fields_to_profile should be set to None"
         )
 
-        return values
+        return self
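
Note: the profiler config now normalizes field-level settings by assigning attributes on self inside an after-validator. A small sketch of the same idea; it uses model_fields, the Pydantic v2 name for what the diff reaches through self.__fields__, and the ProfilerConfig model is hypothetical:

from pydantic import BaseModel, model_validator


class ProfilerConfig(BaseModel):  # hypothetical, trimmed-down profiler config
    profile_table_level_only: bool = False
    include_field_min_value: bool = True
    include_field_max_value: bool = True

    @model_validator(mode="after")
    def normalize_field_level_settings(self) -> "ProfilerConfig":
        if self.profile_table_level_only:
            for name in type(self).model_fields:
                if name.startswith("include_field_"):
                    # Disable every field-level metric when only table-level
                    # profiling was requested.
                    setattr(self, name, False)
        return self


print(ProfilerConfig(profile_table_level_only=True).include_field_min_value)  # False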

datahub/ingestion/source/s3/source.py
@@ -53,8 +53,11 @@ from datahub.ingestion.source.data_lake_common.data_lake_utils import (
 from datahub.ingestion.source.data_lake_common.object_store import (
     create_object_store_adapter,
 )
-from datahub.ingestion.source.data_lake_common.path_spec import FolderTraversalMethod
-from datahub.ingestion.source.s3.config import DataLakeSourceConfig, PathSpec
+from datahub.ingestion.source.data_lake_common.path_spec import (
+    FolderTraversalMethod,
+    PathSpec,
+)
+from datahub.ingestion.source.s3.config import DataLakeSourceConfig
 from datahub.ingestion.source.s3.report import DataLakeSourceReport
 from datahub.ingestion.source.schema_inference import avro, csv_tsv, json, parquet
 from datahub.ingestion.source.schema_inference.base import SchemaInferenceBase
@@ -261,7 +264,7 @@ class S3Source(StatefulIngestionSourceBase):
         )
 
         config_report = {
-            config_option: config.dict().get(config_option)
+            config_option: config.model_dump().get(config_option)
             for config_option in config_options_to_report
         }
         config_report = {
@@ -278,7 +281,7 @@ class S3Source(StatefulIngestionSourceBase):
             telemetry.telemetry_instance.ping(
                 "data_lake_profiling_config",
                 {
-                    config_flag: config.profiling.dict().get(config_flag)
+                    config_flag: config.profiling.model_dump().get(config_flag)
                     for config_flag in profiling_flags_to_report
                 },
             )
@@ -370,7 +373,7 @@ class S3Source(StatefulIngestionSourceBase):
 
     @classmethod
     def create(cls, config_dict, ctx):
-        config = DataLakeSourceConfig.parse_obj(config_dict)
+        config = DataLakeSourceConfig.model_validate(config_dict)
 
         return cls(config, ctx)
 

datahub/ingestion/source/sac/sac.py
@@ -8,7 +8,7 @@ import pyodata
 import pyodata.v2.model
 import pyodata.v2.service
 from authlib.integrations.requests_client import OAuth2Session
-from pydantic import Field, SecretStr, validator
+from pydantic import Field, SecretStr, field_validator
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
@@ -159,7 +159,8 @@ class SACSourceConfig(
         description="Template for generating dataset urns of consumed queries, the placeholder {query} can be used within the template for inserting the name of the query",
     )
 
-    @validator("tenant_url", "token_url")
+    @field_validator("tenant_url", "token_url", mode="after")
+    @classmethod
     def remove_trailing_slash(cls, v):
         return config_clean.remove_trailing_slashes(v)
 
@@ -209,7 +210,7 @@ class SACSource(StatefulIngestionSourceBase, TestableSource):
 
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "SACSource":
-        config = SACSourceConfig.parse_obj(config_dict)
+        config = SACSourceConfig.model_validate(config_dict)
         return cls(config, ctx)
 
     @staticmethod
@@ -217,7 +218,7 @@ class SACSource(StatefulIngestionSourceBase, TestableSource):
         test_report = TestConnectionReport()
 
         try:
-            config = SACSourceConfig.parse_obj(config_dict)
+            config = SACSourceConfig.model_validate(config_dict)
 
             # when creating the pyodata.Client, the metadata is automatically parsed and validated
             session, _ = SACSource.get_sac_connection(config)

datahub/ingestion/source/salesforce.py
@@ -7,7 +7,7 @@ from enum import Enum
 from typing import Any, Dict, Iterable, List, Literal, Optional, TypedDict
 
 import requests
-from pydantic import Field, validator
+from pydantic import Field, field_validator
 from simple_salesforce import Salesforce
 from simple_salesforce.exceptions import SalesforceAuthenticationFailed
 
@@ -172,7 +172,8 @@ class SalesforceConfig(
             self.profiling.operation_config
         )
 
-    @validator("instance_url")
+    @field_validator("instance_url", mode="after")
+    @classmethod
     def remove_trailing_slash(cls, v):
         return config_clean.remove_trailing_slashes(v)
 

datahub/ingestion/source/schema/json_schema.py
@@ -12,7 +12,7 @@ from urllib.parse import urlparse
 
 import jsonref
 import requests
-from pydantic import AnyHttpUrl, DirectoryPath, FilePath, validator
+from pydantic import AnyHttpUrl, DirectoryPath, FilePath, field_validator
 from pydantic.fields import Field
 
 import datahub.metadata.schema_classes as models
@@ -90,7 +90,7 @@ class JsonSchemaSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMix
         description="Use this if URI-s need to be modified during reference resolution. Simple string match - replace capabilities are supported.",
     )
 
-    @validator("path")
+    @field_validator("path", mode="after")
     def download_http_url_to_temp_file(cls, v):
         if isinstance(v, AnyHttpUrl):
             try: