acryl-datahub 1.3.1__py3-none-any.whl → 1.3.1.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (193)
  1. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/METADATA +2501 -2501
  2. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/RECORD +193 -193
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +2 -2
  5. datahub/api/entities/corpgroup/corpgroup.py +11 -6
  6. datahub/api/entities/corpuser/corpuser.py +11 -11
  7. datahub/api/entities/dataproduct/dataproduct.py +47 -27
  8. datahub/api/entities/dataset/dataset.py +32 -21
  9. datahub/api/entities/external/lake_formation_external_entites.py +5 -6
  10. datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
  11. datahub/api/entities/forms/forms.py +16 -14
  12. datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
  13. datahub/cli/check_cli.py +2 -2
  14. datahub/cli/config_utils.py +3 -3
  15. datahub/cli/lite_cli.py +9 -7
  16. datahub/cli/migrate.py +4 -4
  17. datahub/cli/quickstart_versioning.py +3 -3
  18. datahub/cli/specific/group_cli.py +1 -1
  19. datahub/cli/specific/structuredproperties_cli.py +1 -1
  20. datahub/cli/specific/user_cli.py +1 -1
  21. datahub/configuration/common.py +14 -2
  22. datahub/configuration/connection_resolver.py +2 -2
  23. datahub/configuration/git.py +47 -30
  24. datahub/configuration/import_resolver.py +2 -2
  25. datahub/configuration/kafka.py +4 -3
  26. datahub/configuration/time_window_config.py +26 -26
  27. datahub/configuration/validate_field_deprecation.py +2 -2
  28. datahub/configuration/validate_field_removal.py +2 -2
  29. datahub/configuration/validate_field_rename.py +2 -2
  30. datahub/configuration/validate_multiline_string.py +2 -1
  31. datahub/emitter/kafka_emitter.py +3 -1
  32. datahub/emitter/rest_emitter.py +2 -4
  33. datahub/ingestion/api/decorators.py +1 -1
  34. datahub/ingestion/api/report.py +1 -1
  35. datahub/ingestion/api/sink.py +1 -1
  36. datahub/ingestion/api/source.py +1 -1
  37. datahub/ingestion/glossary/datahub_classifier.py +11 -8
  38. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
  39. datahub/ingestion/reporting/file_reporter.py +5 -4
  40. datahub/ingestion/run/pipeline.py +6 -6
  41. datahub/ingestion/run/pipeline_config.py +12 -14
  42. datahub/ingestion/run/sink_callback.py +1 -1
  43. datahub/ingestion/sink/datahub_rest.py +6 -4
  44. datahub/ingestion/source/abs/config.py +19 -19
  45. datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
  46. datahub/ingestion/source/abs/source.py +2 -2
  47. datahub/ingestion/source/aws/aws_common.py +1 -1
  48. datahub/ingestion/source/aws/glue.py +6 -4
  49. datahub/ingestion/source/aws/sagemaker.py +1 -1
  50. datahub/ingestion/source/azure/azure_common.py +8 -12
  51. datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
  52. datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
  53. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
  54. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  55. datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
  56. datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
  57. datahub/ingestion/source/datahub/config.py +8 -8
  58. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  59. datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
  60. datahub/ingestion/source/dbt/dbt_common.py +39 -37
  61. datahub/ingestion/source/dbt/dbt_core.py +10 -12
  62. datahub/ingestion/source/debug/datahub_debug.py +1 -1
  63. datahub/ingestion/source/delta_lake/config.py +6 -4
  64. datahub/ingestion/source/dremio/dremio_config.py +10 -6
  65. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  66. datahub/ingestion/source/elastic_search.py +4 -3
  67. datahub/ingestion/source/excel/source.py +1 -1
  68. datahub/ingestion/source/feast.py +1 -1
  69. datahub/ingestion/source/file.py +5 -4
  70. datahub/ingestion/source/fivetran/config.py +17 -16
  71. datahub/ingestion/source/fivetran/fivetran.py +2 -2
  72. datahub/ingestion/source/gc/datahub_gc.py +1 -1
  73. datahub/ingestion/source/gcs/gcs_source.py +8 -10
  74. datahub/ingestion/source/ge_profiling_config.py +8 -5
  75. datahub/ingestion/source/grafana/grafana_api.py +2 -2
  76. datahub/ingestion/source/grafana/grafana_config.py +4 -3
  77. datahub/ingestion/source/grafana/grafana_source.py +1 -1
  78. datahub/ingestion/source/grafana/models.py +23 -5
  79. datahub/ingestion/source/hex/api.py +7 -5
  80. datahub/ingestion/source/hex/hex.py +4 -3
  81. datahub/ingestion/source/iceberg/iceberg.py +1 -1
  82. datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
  83. datahub/ingestion/source/identity/azure_ad.py +1 -1
  84. datahub/ingestion/source/identity/okta.py +10 -10
  85. datahub/ingestion/source/kafka/kafka.py +1 -1
  86. datahub/ingestion/source/ldap.py +1 -1
  87. datahub/ingestion/source/looker/looker_common.py +7 -5
  88. datahub/ingestion/source/looker/looker_config.py +21 -20
  89. datahub/ingestion/source/looker/lookml_config.py +47 -47
  90. datahub/ingestion/source/metabase.py +8 -8
  91. datahub/ingestion/source/metadata/business_glossary.py +2 -2
  92. datahub/ingestion/source/metadata/lineage.py +13 -8
  93. datahub/ingestion/source/mlflow.py +1 -1
  94. datahub/ingestion/source/mode.py +6 -4
  95. datahub/ingestion/source/mongodb.py +4 -3
  96. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  97. datahub/ingestion/source/nifi.py +17 -23
  98. datahub/ingestion/source/openapi.py +6 -8
  99. datahub/ingestion/source/powerbi/config.py +33 -32
  100. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
  101. datahub/ingestion/source/powerbi/powerbi.py +1 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
  103. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
  104. datahub/ingestion/source/preset.py +8 -8
  105. datahub/ingestion/source/pulsar.py +1 -1
  106. datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
  107. datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
  108. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  109. datahub/ingestion/source/redshift/config.py +18 -20
  110. datahub/ingestion/source/redshift/redshift.py +2 -2
  111. datahub/ingestion/source/redshift/usage.py +23 -3
  112. datahub/ingestion/source/s3/config.py +83 -62
  113. datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
  114. datahub/ingestion/source/s3/source.py +8 -5
  115. datahub/ingestion/source/sac/sac.py +5 -4
  116. datahub/ingestion/source/salesforce.py +3 -2
  117. datahub/ingestion/source/schema/json_schema.py +2 -2
  118. datahub/ingestion/source/sigma/data_classes.py +3 -2
  119. datahub/ingestion/source/sigma/sigma.py +1 -1
  120. datahub/ingestion/source/sigma/sigma_api.py +7 -7
  121. datahub/ingestion/source/slack/slack.py +1 -1
  122. datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
  123. datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
  124. datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
  125. datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
  126. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
  127. datahub/ingestion/source/snowflake/snowflake_queries.py +1 -1
  128. datahub/ingestion/source/sql/athena.py +1 -1
  129. datahub/ingestion/source/sql/clickhouse.py +4 -2
  130. datahub/ingestion/source/sql/cockroachdb.py +1 -1
  131. datahub/ingestion/source/sql/druid.py +1 -1
  132. datahub/ingestion/source/sql/hana.py +1 -1
  133. datahub/ingestion/source/sql/hive.py +7 -5
  134. datahub/ingestion/source/sql/hive_metastore.py +1 -1
  135. datahub/ingestion/source/sql/mssql/source.py +13 -6
  136. datahub/ingestion/source/sql/mysql.py +1 -1
  137. datahub/ingestion/source/sql/oracle.py +17 -10
  138. datahub/ingestion/source/sql/postgres.py +2 -2
  139. datahub/ingestion/source/sql/presto.py +1 -1
  140. datahub/ingestion/source/sql/sql_config.py +8 -9
  141. datahub/ingestion/source/sql/sql_generic.py +1 -1
  142. datahub/ingestion/source/sql/teradata.py +1 -1
  143. datahub/ingestion/source/sql/trino.py +1 -1
  144. datahub/ingestion/source/sql/vertica.py +5 -4
  145. datahub/ingestion/source/sql_queries.py +11 -8
  146. datahub/ingestion/source/state/checkpoint.py +2 -2
  147. datahub/ingestion/source/state/entity_removal_state.py +2 -1
  148. datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
  149. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
  150. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  151. datahub/ingestion/source/superset.py +9 -9
  152. datahub/ingestion/source/tableau/tableau.py +14 -16
  153. datahub/ingestion/source/unity/config.py +33 -34
  154. datahub/ingestion/source/unity/proxy.py +203 -0
  155. datahub/ingestion/source/unity/proxy_types.py +91 -0
  156. datahub/ingestion/source/unity/source.py +27 -2
  157. datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
  158. datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
  159. datahub/ingestion/source/usage/usage_common.py +5 -3
  160. datahub/ingestion/source_config/csv_enricher.py +7 -6
  161. datahub/ingestion/source_config/operation_config.py +7 -4
  162. datahub/ingestion/source_config/pulsar.py +11 -15
  163. datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
  164. datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
  165. datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
  166. datahub/ingestion/transformer/add_dataset_properties.py +2 -2
  167. datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
  168. datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
  169. datahub/ingestion/transformer/add_dataset_tags.py +3 -3
  170. datahub/ingestion/transformer/add_dataset_terms.py +3 -3
  171. datahub/ingestion/transformer/dataset_domain.py +3 -3
  172. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
  173. datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
  174. datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
  175. datahub/ingestion/transformer/mark_dataset_status.py +1 -1
  176. datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
  177. datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
  178. datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
  179. datahub/ingestion/transformer/replace_external_url.py +2 -2
  180. datahub/ingestion/transformer/set_browse_path.py +1 -1
  181. datahub/ingestion/transformer/tags_to_terms.py +1 -1
  182. datahub/lite/duckdb_lite.py +1 -1
  183. datahub/lite/lite_util.py +2 -2
  184. datahub/sdk/search_filters.py +68 -40
  185. datahub/secret/datahub_secret_store.py +7 -4
  186. datahub/secret/file_secret_store.py +1 -1
  187. datahub/sql_parsing/sqlglot_lineage.py +5 -2
  188. datahub/testing/check_sql_parser_result.py +2 -2
  189. datahub/utilities/ingest_utils.py +1 -1
  190. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/WHEEL +0 -0
  191. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/entry_points.txt +0 -0
  192. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/licenses/LICENSE +0 -0
  193. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/top_level.txt +0 -0
datahub/cli/migrate.py CHANGED
@@ -318,13 +318,13 @@ def migrate_containers(
         try:
             newKey: Union[SchemaKey, DatabaseKey, ProjectIdKey, BigQueryDatasetKey]
             if subType == "Schema":
-                newKey = SchemaKey.parse_obj(customProperties)
+                newKey = SchemaKey.model_validate(customProperties)
             elif subType == "Database":
-                newKey = DatabaseKey.parse_obj(customProperties)
+                newKey = DatabaseKey.model_validate(customProperties)
             elif subType == "Project":
-                newKey = ProjectIdKey.parse_obj(customProperties)
+                newKey = ProjectIdKey.model_validate(customProperties)
             elif subType == "Dataset":
-                newKey = BigQueryDatasetKey.parse_obj(customProperties)
+                newKey = BigQueryDatasetKey.model_validate(customProperties)
             else:
                 log.warning(f"Invalid subtype {subType}. Skipping")
                 continue
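
Note: almost every hunk in this diff is the same Pydantic v1-to-v2 API migration. As a minimal sketch (hypothetical model, not taken from the DataHub codebase, assuming pydantic>=2), the renamed entry points look like this:

from pydantic import BaseModel


class ContainerKey(BaseModel):  # hypothetical stand-in for SchemaKey, DatabaseKey, etc.
    platform: str
    instance: str


props = {"platform": "bigquery", "instance": "prod"}

# Pydantic v1 spelling (still importable in v2, but deprecated):
#   key = ContainerKey.parse_obj(props); payload = key.dict()
# Pydantic v2 spelling used throughout this release:
key = ContainerKey.model_validate(props)  # replaces parse_obj()
payload = key.model_dump()                # replaces dict()
assert payload == props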

datahub/cli/quickstart_versioning.py CHANGED
@@ -80,7 +80,7 @@ class QuickstartVersionMappingConfig(BaseModel):
             path = os.path.expanduser(LOCAL_QUICKSTART_MAPPING_FILE)
             with open(path) as f:
                 config_raw = yaml.safe_load(f)
-            return cls.parse_obj(config_raw)
+            return cls.model_validate(config_raw)
 
         config_raw = None
         try:
@@ -110,7 +110,7 @@ class QuickstartVersionMappingConfig(BaseModel):
                 }
             )
 
-        config = cls.parse_obj(config_raw)
+        config = cls.model_validate(config_raw)
 
         # If stable is not defined in the config, we need to fetch the latest version from github.
         if config.quickstart_version_map.get("stable") is None:
@@ -177,7 +177,7 @@ def save_quickstart_config(
     path = os.path.expanduser(path)
     os.makedirs(os.path.dirname(path), exist_ok=True)
     with open(path, "w") as f:
-        yaml.dump(config.dict(), f)
+        yaml.dump(config.model_dump(), f)
     logger.info(f"Saved quickstart config to {path}.")
 
 

datahub/cli/specific/group_cli.py CHANGED
@@ -42,7 +42,7 @@ def upsert(file: Path, override_editable: bool) -> None:
     with get_default_graph(ClientMode.CLI) as emitter:
         for group_config in group_configs:
             try:
-                datahub_group = CorpGroup.parse_obj(group_config)
+                datahub_group = CorpGroup.model_validate(group_config)
                 for mcp in datahub_group.generate_mcp(
                     generation_config=CorpGroupGenerationConfig(
                         override_editable=override_editable, datahub_graph=emitter

datahub/cli/specific/structuredproperties_cli.py CHANGED
@@ -85,7 +85,7 @@ def list(details: bool, to_file: str) -> None:
         with open(file, "r") as fp:
             existing_objects = yaml.load(fp)  # this is a list of dicts
             existing_objects = [
-                StructuredProperties.parse_obj(obj) for obj in existing_objects
+                StructuredProperties.model_validate(obj) for obj in existing_objects
             ]
             objects = [obj for obj in objects]
             # do a positional update of the existing objects

datahub/cli/specific/user_cli.py CHANGED
@@ -42,7 +42,7 @@ def upsert(file: Path, override_editable: bool) -> None:
     with get_default_graph(ClientMode.CLI) as emitter:
         for user_config in user_configs:
             try:
-                datahub_user: CorpUser = CorpUser.parse_obj(user_config)
+                datahub_user: CorpUser = CorpUser.model_validate(user_config)
 
                 emitter.emit_all(
                     datahub_user.generate_mcp(

datahub/configuration/common.py CHANGED
@@ -140,6 +140,18 @@ class ConfigModel(BaseModel):
 
     @classmethod
     def parse_obj_allow_extras(cls, obj: Any) -> Self:
+        """Parse an object while allowing extra fields.
+
+        'parse_obj' in Pydantic v1 is equivalent to 'model_validate' in Pydantic v2.
+        However, 'parse_obj_allow_extras' in v1 is not directly available in v2.
+
+        `model_validate(..., strict=False)` does not work because it still raises errors on extra fields;
+        strict=False only affects type coercion and validation strictness, not extra field handling.
+
+        This method temporarily modifies the model's configuration to allow extra fields
+
+        TODO: Do we really need to support this behaviour? Consider removing this method in future.
+        """
         if PYDANTIC_VERSION_2:
             try:
                 with unittest.mock.patch.dict(
@@ -148,12 +160,12 @@ class ConfigModel(BaseModel):
                     clear=False,
                 ):
                     cls.model_rebuild(force=True)  # type: ignore
-                    return cls.parse_obj(obj)
+                    return cls.model_validate(obj)
             finally:
                 cls.model_rebuild(force=True)  # type: ignore
         else:
             with unittest.mock.patch.object(cls.Config, "extra", pydantic.Extra.allow):
-                return cls.parse_obj(obj)
+                return cls.model_validate(obj)
 
 
 class PermissiveConfigModel(ConfigModel):
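
Note on the docstring added above: in Pydantic v2 the rejection of unknown keys is governed by the model's `extra` setting, not by `strict`, which is why the helper patches the model config rather than passing strict=False. A hedged sketch with a hypothetical model (not DataHub code):

import pydantic


class StrictConfig(pydantic.BaseModel):
    model_config = pydantic.ConfigDict(extra="forbid")
    name: str


try:
    StrictConfig.model_validate({"name": "x", "unknown": 1})
except pydantic.ValidationError:
    pass  # extra key rejected; strict=False would not change this


# Allowing extras requires changing the model's config, which is what
# parse_obj_allow_extras patches temporarily via unittest.mock.
class LenientConfig(StrictConfig):
    model_config = pydantic.ConfigDict(extra="allow")


obj = LenientConfig.model_validate({"name": "x", "unknown": 1})  # accepted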

datahub/configuration/connection_resolver.py CHANGED
@@ -1,6 +1,6 @@
 from typing import TYPE_CHECKING, Type
 
-import pydantic
+from pydantic import model_validator
 
 from datahub.ingestion.api.global_context import get_graph_context
 
@@ -40,4 +40,4 @@ def auto_connection_resolver(
     # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264
     # This hack ensures that multiple validators do not overwrite each other.
     _resolve_connection.__name__ = f"{_resolve_connection.__name__}_{connection_field}"
-    return pydantic.root_validator(pre=True, allow_reuse=True)(_resolve_connection)
+    return model_validator(mode="before")(_resolve_connection)
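
Note: this file illustrates the recurring replacement of root_validator(pre=True, allow_reuse=True) with model_validator(mode="before"). A minimal sketch of the v2 idiom on a hypothetical model (not the DataHub helper itself):

from typing import Any

from pydantic import BaseModel, model_validator


class MyConfig(BaseModel):  # hypothetical; not the DataHub helper itself
    host: str
    port: int = 9092

    # v1: @root_validator(pre=True, allow_reuse=True)
    # v2: a "before" model validator sees the raw input before field validation.
    @model_validator(mode="before")
    @classmethod
    def fill_default_host(cls, values: Any) -> Any:
        if isinstance(values, dict) and "host" not in values:
            values = {**values, "host": "localhost"}
        return values


assert MyConfig.model_validate({"port": 1234}).host == "localhost"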

datahub/configuration/git.py CHANGED
@@ -1,7 +1,14 @@
 import pathlib
+from copy import deepcopy
 from typing import Any, Dict, Optional, Union
 
-from pydantic import Field, FilePath, SecretStr, validator
+from pydantic import (
+    Field,
+    FilePath,
+    SecretStr,
+    field_validator,
+    model_validator,
+)
 
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
@@ -41,7 +48,8 @@ class GitReference(ConfigModel):
         transform=lambda url: _GITHUB_URL_TEMPLATE,
     )
 
-    @validator("repo", pre=True)
+    @field_validator("repo", mode="before")
+    @classmethod
     def simplify_repo_url(cls, repo: str) -> str:
         if repo.startswith("github.com/") or repo.startswith("gitlab.com"):
             repo = f"https://{repo}"
@@ -53,21 +61,22 @@ class GitReference(ConfigModel):
 
         return repo
 
-    @validator("url_template", always=True)
-    def infer_url_template(cls, url_template: Optional[str], values: dict) -> str:
-        if url_template is not None:
-            return url_template
+    @model_validator(mode="after")
+    def infer_url_template(self) -> "GitReference":
+        if self.url_template is not None:
+            return self
 
-        repo: str = values["repo"]
-        if repo.startswith(_GITHUB_PREFIX):
-            return _GITHUB_URL_TEMPLATE
-        elif repo.startswith(_GITLAB_PREFIX):
-            return _GITLAB_URL_TEMPLATE
+        if self.repo.startswith(_GITHUB_PREFIX):
+            self.url_template = _GITHUB_URL_TEMPLATE
+        elif self.repo.startswith(_GITLAB_PREFIX):
+            self.url_template = _GITLAB_URL_TEMPLATE
         else:
             raise ValueError(
                 "Unable to infer URL template from repo. Please set url_template manually."
             )
 
+        return self
+
     def get_url_for_file_path(self, file_path: str) -> str:
         assert self.url_template
         if self.url_subdir:
@@ -98,35 +107,43 @@ class GitInfo(GitReference):
 
     _fix_deploy_key_newlines = pydantic_multiline_string("deploy_key")
 
-    @validator("deploy_key", pre=True, always=True)
+    @model_validator(mode="before")
+    @classmethod
     def deploy_key_filled_from_deploy_key_file(
-        cls, v: Optional[SecretStr], values: Dict[str, Any]
-    ) -> Optional[SecretStr]:
-        if v is None:
+        cls, values: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        # In-place update of the input dict would cause state contamination.
+        # So a deepcopy is performed first.
+        values = deepcopy(values)
+
+        if values.get("deploy_key") is None:
             deploy_key_file = values.get("deploy_key_file")
             if deploy_key_file is not None:
                 with open(deploy_key_file) as fp:
                     deploy_key = SecretStr(fp.read())
-                    return deploy_key
-        return v
-
-    @validator("repo_ssh_locator", always=True)
-    def infer_repo_ssh_locator(
-        cls, repo_ssh_locator: Optional[str], values: dict
-    ) -> str:
-        if repo_ssh_locator is not None:
-            return repo_ssh_locator
-
-        repo: str = values["repo"]
-        if repo.startswith(_GITHUB_PREFIX):
-            return f"git@github.com:{repo[len(_GITHUB_PREFIX) :]}.git"
-        elif repo.startswith(_GITLAB_PREFIX):
-            return f"git@gitlab.com:{repo[len(_GITLAB_PREFIX) :]}.git"
+                    values["deploy_key"] = deploy_key
+        return values
+
+    @model_validator(mode="after")
+    def infer_repo_ssh_locator(self) -> "GitInfo":
+        if self.repo_ssh_locator is not None:
+            return self
+
+        if self.repo.startswith(_GITHUB_PREFIX):
+            self.repo_ssh_locator = (
+                f"git@github.com:{self.repo[len(_GITHUB_PREFIX) :]}.git"
+            )
+        elif self.repo.startswith(_GITLAB_PREFIX):
+            self.repo_ssh_locator = (
+                f"git@gitlab.com:{self.repo[len(_GITLAB_PREFIX) :]}.git"
+            )
         else:
             raise ValueError(
                 "Unable to infer repo_ssh_locator from repo. Please set repo_ssh_locator manually."
             )
 
+        return self
+
     @property
     def branch_for_clone(self) -> Optional[str]:
         # If branch was manually set, we should use it. Otherwise return None.
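
Note: the pattern above, a v1 @validator(..., always=True) reading sibling fields via `values`, becomes a v2 @model_validator(mode="after") that reads and assigns attributes on the constructed model. A hedged sketch with a hypothetical model; the URL template string is illustrative only:

from typing import Optional

from pydantic import BaseModel, model_validator


class RepoRef(BaseModel):  # hypothetical stand-in for GitReference
    repo: str
    url_template: Optional[str] = None

    # v1: @validator("url_template", always=True) with access to `values`
    # v2: an "after" validator runs on the built model and may assign fields directly.
    @model_validator(mode="after")
    def infer_url_template(self) -> "RepoRef":
        if self.url_template is None and self.repo.startswith("https://github.com/"):
            # Illustrative template only; the real _GITHUB_URL_TEMPLATE may differ.
            self.url_template = "{repo_url}/blob/{branch}/{file_path}"
        return self


ref = RepoRef(repo="https://github.com/acryldata/datahub")
assert ref.url_template is not None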

datahub/configuration/import_resolver.py CHANGED
@@ -1,6 +1,6 @@
 from typing import TYPE_CHECKING, Type, TypeVar, Union
 
-import pydantic
+from pydantic import field_validator
 
 from datahub.ingestion.api.registry import import_path
 
@@ -15,4 +15,4 @@ def _pydantic_resolver(cls: Type, v: Union[str, _T]) -> _T:
 
 
 def pydantic_resolve_key(field: str) -> "V1Validator":
-    return pydantic.validator(field, pre=True, allow_reuse=True)(_pydantic_resolver)
+    return field_validator(field, mode="before")(_pydantic_resolver)

datahub/configuration/kafka.py CHANGED
@@ -1,4 +1,4 @@
-from pydantic import Field, validator
+from pydantic import Field, field_validator
 
 from datahub.configuration.common import ConfigModel, ConfigurationError
 from datahub.configuration.env_vars import (
@@ -42,7 +42,8 @@ class _KafkaConnectionConfig(ConfigModel):
         description="The request timeout used when interacting with the Kafka APIs.",
     )
 
-    @validator("bootstrap")
+    @field_validator("bootstrap", mode="after")
+    @classmethod
    def bootstrap_host_colon_port_comma(cls, val: str) -> str:
         for entry in val.split(","):
             validate_host_port(entry)
@@ -57,7 +58,7 @@ class KafkaConsumerConnectionConfig(_KafkaConnectionConfig):
         description="Extra consumer config serialized as JSON. These options will be passed into Kafka's DeserializingConsumer. See https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#deserializingconsumer and https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md .",
     )
 
-    @validator("consumer_config")
+    @field_validator("consumer_config", mode="after")
     @classmethod
     def resolve_callback(cls, value: dict) -> dict:
         if CallableConsumerConfig.is_callable_config(value):
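
Note: a plain @validator becomes @field_validator, with the mode spelled out and @classmethod stacked beneath it, as in the bootstrap check above. A minimal sketch on a hypothetical model:

from pydantic import BaseModel, field_validator


class ConnectionConfig(BaseModel):  # hypothetical; mirrors the bootstrap check above
    bootstrap: str = "localhost:9092"

    # v1: @validator("bootstrap")
    # v2: field_validator, explicit mode="after", stacked with @classmethod.
    @field_validator("bootstrap", mode="after")
    @classmethod
    def host_colon_port_comma(cls, val: str) -> str:
        for entry in val.split(","):
            host, sep, port = entry.partition(":")
            if not host or not sep or not port.isdigit():
                raise ValueError(f"invalid host:port entry: {entry!r}")
        return val


ConnectionConfig(bootstrap="broker1:9092,broker2:9092")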

datahub/configuration/time_window_config.py CHANGED
@@ -1,10 +1,9 @@
 import enum
 from datetime import datetime, timedelta, timezone
-from typing import Any, Dict, List
+from typing import Any, List
 
 import humanfriendly
-import pydantic
-from pydantic.fields import Field
+from pydantic import Field, ValidationInfo, field_validator, model_validator
 
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.datetimes import parse_absolute_time, parse_relative_timespan
@@ -52,45 +51,46 @@ class BaseTimeWindowConfig(ConfigModel):
         description="Earliest date of lineage/usage to consider. Default: Last full day in UTC (or hour, depending on `bucket_duration`). You can also specify relative time with respect to end_time such as '-7 days' Or '-7d'.",
     )  # type: ignore
 
-    @pydantic.validator("start_time", pre=True, always=True)
-    def default_start_time(
-        cls, v: Any, values: Dict[str, Any], **kwargs: Any
-    ) -> datetime:
-        if v is None:
-            return get_time_bucket(
-                values["end_time"]
-                - get_bucket_duration_delta(values["bucket_duration"]),
-                values["bucket_duration"],
-            )
-        elif isinstance(v, str):
+    @field_validator("start_time", mode="before")
+    @classmethod
+    def parse_start_time(cls, v: Any, info: ValidationInfo) -> Any:
+        if isinstance(v, str):
             # This is where start_time str is resolved to datetime
             try:
                 delta = parse_relative_timespan(v)
                 assert delta < timedelta(0), (
                     "Relative start time should start with minus sign (-) e.g. '-2 days'."
                 )
-                assert abs(delta) >= get_bucket_duration_delta(
-                    values["bucket_duration"]
-                ), (
+                bucket_duration = info.data.get("bucket_duration", BucketDuration.DAY)
+                assert abs(delta) >= get_bucket_duration_delta(bucket_duration), (
                     "Relative start time should be in terms of configured bucket duration. e.g '-2 days' or '-2 hours'."
                 )
 
-                # The end_time's default value is not yet populated, in which case
-                # we can just manually generate it here.
-                if "end_time" not in values:
-                    values["end_time"] = datetime.now(tz=timezone.utc)
+                # We need end_time, but it might not be set yet
+                # In that case, we'll use the default
+                end_time = info.data.get("end_time")
+                if end_time is None:
+                    end_time = datetime.now(tz=timezone.utc)
 
-                return get_time_bucket(
-                    values["end_time"] + delta, values["bucket_duration"]
-                )
+                return get_time_bucket(end_time + delta, bucket_duration)
             except humanfriendly.InvalidTimespan:
                 # We do not floor start_time to the bucket start time if absolute start time is specified.
                 # If user has specified absolute start time in recipe, it's most likely that he means it.
                 return parse_absolute_time(v)
-
         return v
 
-    @pydantic.validator("start_time", "end_time")
+    @model_validator(mode="after")
+    def default_start_time(self) -> "BaseTimeWindowConfig":
+        # Only calculate start_time if it was None (not provided by user)
+        if self.start_time is None:
+            self.start_time = get_time_bucket(
+                self.end_time - get_bucket_duration_delta(self.bucket_duration),
+                self.bucket_duration,
+            )
+        return self
+
+    @field_validator("start_time", "end_time", mode="after")
+    @classmethod
     def ensure_timestamps_in_utc(cls, v: datetime) -> datetime:
         if v.tzinfo is None:
             raise ValueError(
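
Note: v2 field validators no longer receive a `values` dict; previously validated fields are exposed via ValidationInfo.data, and cross-field defaulting moves into a model_validator(mode="after"), which is the split applied above. A hedged sketch with a hypothetical model:

from datetime import datetime, timedelta, timezone
from typing import Any, Optional

from pydantic import BaseModel, Field, ValidationInfo, field_validator, model_validator


class Window(BaseModel):  # hypothetical; mirrors the start_time/end_time split above
    end_time: datetime = Field(default_factory=lambda: datetime.now(tz=timezone.utc))
    start_time: Optional[datetime] = None

    @field_validator("start_time", mode="before")
    @classmethod
    def parse_relative(cls, v: Any, info: ValidationInfo) -> Any:
        # info.data holds fields already validated (declaration order), replacing v1's `values`.
        if isinstance(v, str) and v.startswith("-") and v.endswith("d"):
            end_time = info.data.get("end_time") or datetime.now(tz=timezone.utc)
            return end_time - timedelta(days=int(v[1:-1]))
        return v

    @model_validator(mode="after")
    def default_start_time(self) -> "Window":
        if self.start_time is None:
            self.start_time = self.end_time - timedelta(days=1)
        return self


assert Window(start_time="-7d").start_time is not None
assert Window().start_time is not None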

datahub/configuration/validate_field_deprecation.py CHANGED
@@ -1,7 +1,7 @@
 import warnings
 from typing import TYPE_CHECKING, Any, Optional, Type
 
-import pydantic
+from pydantic import model_validator
 
 from datahub.configuration.common import ConfigurationWarning
 from datahub.utilities.global_warning_util import add_global_warning
@@ -34,4 +34,4 @@ def pydantic_field_deprecated(
     # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264
     # This hack ensures that multiple field deprecated do not overwrite each other.
     _validate_deprecated.__name__ = f"{_validate_deprecated.__name__}_{field}"
-    return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_deprecated)
+    return model_validator(mode="before")(_validate_deprecated)

datahub/configuration/validate_field_removal.py CHANGED
@@ -1,7 +1,7 @@
 import warnings
 from typing import TYPE_CHECKING, Type
 
-import pydantic
+from pydantic import model_validator
 
 from datahub.configuration.common import ConfigurationWarning
 
@@ -31,4 +31,4 @@ def pydantic_removed_field(
     # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264
     # This hack ensures that multiple field removals do not overwrite each other.
     _validate_field_removal.__name__ = f"{_validate_field_removal.__name__}_{field}"
-    return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_removal)
+    return model_validator(mode="before")(_validate_field_removal)

datahub/configuration/validate_field_rename.py CHANGED
@@ -1,7 +1,7 @@
 import warnings
 from typing import TYPE_CHECKING, Callable, Type, TypeVar
 
-import pydantic
+from pydantic import model_validator
 
 from datahub.configuration.common import ConfigurationWarning
 from datahub.utilities.global_warning_util import add_global_warning
@@ -52,4 +52,4 @@ def pydantic_renamed_field(
     # validator with pre=True gets all the values that were passed in.
     # Given that a renamed field doesn't show up in the fields list, we can't use
     # the field-level validator, even with a different field name.
-    return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_rename)
+    return model_validator(mode="before")(_validate_field_rename)

datahub/configuration/validate_multiline_string.py CHANGED
@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING, Optional, Type, Union
 
 import pydantic
+from pydantic import field_validator
 
 if TYPE_CHECKING:
     from pydantic.deprecated.class_validators import V1Validator
@@ -31,4 +32,4 @@ def pydantic_multiline_string(field: str) -> "V1Validator":
     # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264
     # This hack ensures that multiple field deprecated do not overwrite each other.
     _validate_field.__name__ = f"{_validate_field.__name__}_{field}"
-    return pydantic.validator(field, pre=True, allow_reuse=True)(_validate_field)
+    return field_validator(field, mode="before")(_validate_field)

datahub/emitter/kafka_emitter.py CHANGED
@@ -6,6 +6,7 @@ from confluent_kafka import SerializingProducer
 from confluent_kafka.schema_registry import SchemaRegistryClient
 from confluent_kafka.schema_registry.avro import AvroSerializer
 from confluent_kafka.serialization import SerializationContext, StringSerializer
+from pydantic import field_validator
 
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.kafka import KafkaProducerConnectionConfig
@@ -49,7 +50,8 @@ class KafkaEmitterConfig(ConfigModel):
         },
     )
 
-    @pydantic.validator("topic_routes")
+    @field_validator("topic_routes", mode="after")
+    @classmethod
     def validate_topic_routes(cls, v: Dict[str, str]) -> Dict[str, str]:
         assert MCE_KEY in v, f"topic_routes must contain a route for {MCE_KEY}"
         assert MCP_KEY in v, f"topic_routes must contain a route for {MCP_KEY}"

datahub/emitter/rest_emitter.py CHANGED
@@ -145,8 +145,7 @@ class EmitMode(ConfigEnum):
     ASYNC_WAIT = auto()
 
 
-_DEFAULT_EMIT_MODE = pydantic.parse_obj_as(
-    EmitMode,
+_DEFAULT_EMIT_MODE = pydantic.TypeAdapter(EmitMode).validate_python(
     get_emit_mode() or EmitMode.SYNC_PRIMARY,
 )
 
@@ -156,8 +155,7 @@ class RestSinkEndpoint(ConfigEnum):
     OPENAPI = auto()
 
 
-DEFAULT_REST_EMITTER_ENDPOINT = pydantic.parse_obj_as(
-    RestSinkEndpoint,
+DEFAULT_REST_EMITTER_ENDPOINT = pydantic.TypeAdapter(RestSinkEndpoint).validate_python(
     get_rest_emitter_default_endpoint() or RestSinkEndpoint.RESTLI,
 )
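
Note: pydantic.parse_obj_as is deprecated in Pydantic v2; TypeAdapter is the replacement used in the two hunks above. A minimal sketch with a hypothetical enum:

from enum import Enum

import pydantic


class Endpoint(str, Enum):  # hypothetical stand-in for RestSinkEndpoint
    RESTLI = "RESTLI"
    OPENAPI = "OPENAPI"


# v1: pydantic.parse_obj_as(Endpoint, "OPENAPI")
# v2: TypeAdapter wraps an arbitrary type and exposes validate_python()/validate_json().
endpoint = pydantic.TypeAdapter(Endpoint).validate_python("OPENAPI")
assert endpoint is Endpoint.OPENAPI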
 

datahub/ingestion/api/decorators.py CHANGED
@@ -17,7 +17,7 @@ def config_class(config_cls: Type) -> Callable[[Type], Type]:
     """Adds a get_config_class method to the decorated class"""
 
     def default_create(cls: Type, config_dict: Dict, ctx: PipelineContext) -> Type:
-        config = config_cls.parse_obj(config_dict)
+        config = config_cls.model_validate(config_dict)
         return cls(config=config, ctx=ctx)
 
     def wrapper(cls: Type) -> Type:

datahub/ingestion/api/report.py CHANGED
@@ -65,7 +65,7 @@ class Report(SupportsAsObj):
         if isinstance(some_val, SupportsAsObj):
             return some_val.as_obj()
         elif isinstance(some_val, pydantic.BaseModel):
-            return Report.to_pure_python_obj(some_val.dict())
+            return Report.to_pure_python_obj(some_val.model_dump())
         elif dataclasses.is_dataclass(some_val) and not isinstance(some_val, type):
             # The `is_dataclass` function returns `True` for both instances and classes.
             # We need an extra check to ensure an instance was passed in.

datahub/ingestion/api/sink.py CHANGED
@@ -123,7 +123,7 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
 
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "Self":
-        return cls(ctx, cls.get_config_class().parse_obj(config_dict))
+        return cls(ctx, cls.get_config_class().model_validate(config_dict))
 
     def handle_work_unit_start(self, workunit: WorkUnit) -> None:
         """Called at the start of each new workunit.

datahub/ingestion/api/source.py CHANGED
@@ -480,7 +480,7 @@ class Extractor(Generic[WorkUnitType, ExtractorConfig], Closeable, metaclass=ABC
         config_class = self.get_config_class()
 
         self.ctx = ctx
-        self.config = config_class.parse_obj(config_dict)
+        self.config = config_class.model_validate(config_dict)
 
     @abstractmethod
     def get_records(self, workunit: WorkUnitType) -> Iterable[RecordEnvelope]:

datahub/ingestion/glossary/datahub_classifier.py CHANGED
@@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional
 from datahub_classify.helper_classes import ColumnInfo
 from datahub_classify.infotype_predictor import predict_infotypes
 from datahub_classify.reference_input import input1 as default_config
-from pydantic import validator
+from pydantic import field_validator
 from pydantic.fields import Field
 
 from datahub.configuration.common import ConfigModel
@@ -90,7 +90,7 @@ class InfoTypeConfig(ConfigModel):
 
 
 DEFAULT_CLASSIFIER_CONFIG = {
-    k: InfoTypeConfig.parse_obj(v) for k, v in default_config.items()
+    k: InfoTypeConfig.model_validate(v) for k, v in default_config.items()
 }
 
 
@@ -114,8 +114,11 @@ class DataHubClassifierConfig(ConfigModel):
         description="Minimum number of non-null column values required to process `values` prediction factor.",
     )
 
-    @validator("info_types_config")
-    def input_config_selectively_overrides_default_config(cls, info_types_config):
+    @field_validator("info_types_config", mode="after")
+    @classmethod
+    def input_config_selectively_overrides_default_config(
+        cls, info_types_config: Dict[str, Any]
+    ) -> Dict[str, Any]:
         for infotype, infotype_config in DEFAULT_CLASSIFIER_CONFIG.items():
             if infotype not in info_types_config:
                 # if config for some info type is not provided by user, use default config for that info type.
@@ -125,7 +128,7 @@ class DataHubClassifierConfig(ConfigModel):
                 # use default config for that prediction factor.
                 for factor, weight in (
                     info_types_config[infotype]
-                    .Prediction_Factors_and_Weights.dict()
+                    .Prediction_Factors_and_Weights.model_dump()
                     .items()
                 ):
                     if (
@@ -146,7 +149,7 @@ class DataHubClassifierConfig(ConfigModel):
             for (
                 factor,
                 weight,
-            ) in custom_infotype_config.Prediction_Factors_and_Weights.dict().items():
+            ) in custom_infotype_config.Prediction_Factors_and_Weights.model_dump().items():
                 if weight > 0:
                     assert getattr(custom_infotype_config, factor) is not None, (
                         f"Missing Configuration for Prediction Factor {factor} for Custom Info Type {custom_infotype}"
@@ -173,7 +176,7 @@ class DataHubClassifier(Classifier):
     def create(cls, config_dict: Optional[Dict[str, Any]]) -> "DataHubClassifier":
         # This could be replaced by parsing to particular class, if required
         if config_dict is not None:
-            config = DataHubClassifierConfig.parse_obj(config_dict)
+            config = DataHubClassifierConfig.model_validate(config_dict)
         else:
             config = DataHubClassifierConfig()
         return cls(config)
@@ -183,7 +186,7 @@ class DataHubClassifier(Classifier):
             column_infos=columns,
             confidence_level_threshold=self.config.confidence_level_threshold,
             global_config={
-                k: v.dict() for k, v in self.config.info_types_config.items()
+                k: v.model_dump() for k, v in self.config.info_types_config.items()
             },
             infotypes=self.config.info_types,
             minimum_values_threshold=self.config.minimum_values_threshold,

datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py CHANGED
@@ -82,7 +82,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
         ctx: PipelineContext,
         sink: Sink,
     ) -> PipelineRunListener:
-        reporter_config = DatahubIngestionRunSummaryProviderConfig.parse_obj(
+        reporter_config = DatahubIngestionRunSummaryProviderConfig.model_validate(
             config_dict or {}
         )
         if reporter_config.sink:

datahub/ingestion/reporting/file_reporter.py CHANGED
@@ -2,7 +2,7 @@ import json
 import logging
 from typing import Any, Dict
 
-from pydantic import validator
+from pydantic import field_validator
 
 from datahub.configuration.common import ConfigModel
 from datahub.ingestion.api.common import PipelineContext
@@ -16,8 +16,9 @@ class FileReporterConfig(ConfigModel):
     filename: str
     format: str = "json"
 
-    @validator("format")
-    def only_json_supported(cls, v):
+    @field_validator("format", mode="after")
+    @classmethod
+    def only_json_supported(cls, v: str) -> str:
         if v and v.lower() != "json":
             raise ValueError(
                 f"Format {v} is not yet supported. Only json is supported at this time"
@@ -33,7 +34,7 @@ class FileReporter(PipelineRunListener):
         ctx: PipelineContext,
         sink: Sink,
     ) -> PipelineRunListener:
-        reporter_config = FileReporterConfig.parse_obj(config_dict)
+        reporter_config = FileReporterConfig.model_validate(config_dict)
         return cls(reporter_config)
 
     def __init__(self, reporter_config: FileReporterConfig) -> None: