acryl-datahub 1.3.0.1rc9__py3-none-any.whl → 1.3.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic; additional details are available in the registry's advisory for this release.

Files changed (263)
  1. {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/METADATA +2550 -2543
  2. {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/RECORD +263 -261
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +2 -2
  5. datahub/api/entities/corpgroup/corpgroup.py +11 -6
  6. datahub/api/entities/corpuser/corpuser.py +11 -11
  7. datahub/api/entities/dataproduct/dataproduct.py +47 -27
  8. datahub/api/entities/dataset/dataset.py +32 -21
  9. datahub/api/entities/external/lake_formation_external_entites.py +5 -6
  10. datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
  11. datahub/api/entities/forms/forms.py +16 -14
  12. datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
  13. datahub/cli/check_cli.py +2 -2
  14. datahub/cli/config_utils.py +3 -3
  15. datahub/cli/lite_cli.py +9 -7
  16. datahub/cli/migrate.py +4 -4
  17. datahub/cli/quickstart_versioning.py +3 -3
  18. datahub/cli/specific/group_cli.py +1 -1
  19. datahub/cli/specific/structuredproperties_cli.py +1 -1
  20. datahub/cli/specific/user_cli.py +1 -1
  21. datahub/configuration/common.py +14 -2
  22. datahub/configuration/connection_resolver.py +2 -2
  23. datahub/configuration/git.py +47 -30
  24. datahub/configuration/import_resolver.py +2 -2
  25. datahub/configuration/kafka.py +4 -3
  26. datahub/configuration/time_window_config.py +26 -26
  27. datahub/configuration/validate_field_deprecation.py +2 -2
  28. datahub/configuration/validate_field_removal.py +2 -2
  29. datahub/configuration/validate_field_rename.py +2 -2
  30. datahub/configuration/validate_multiline_string.py +2 -1
  31. datahub/emitter/kafka_emitter.py +3 -1
  32. datahub/emitter/rest_emitter.py +2 -4
  33. datahub/ingestion/api/decorators.py +1 -1
  34. datahub/ingestion/api/report.py +1 -1
  35. datahub/ingestion/api/sink.py +1 -1
  36. datahub/ingestion/api/source.py +1 -1
  37. datahub/ingestion/glossary/datahub_classifier.py +11 -8
  38. datahub/ingestion/graph/client.py +5 -1
  39. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
  40. datahub/ingestion/reporting/file_reporter.py +5 -4
  41. datahub/ingestion/run/pipeline.py +7 -6
  42. datahub/ingestion/run/pipeline_config.py +12 -14
  43. datahub/ingestion/run/sink_callback.py +1 -1
  44. datahub/ingestion/sink/datahub_rest.py +6 -4
  45. datahub/ingestion/source/abs/config.py +19 -19
  46. datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
  47. datahub/ingestion/source/abs/source.py +2 -2
  48. datahub/ingestion/source/aws/aws_common.py +1 -1
  49. datahub/ingestion/source/aws/glue.py +6 -4
  50. datahub/ingestion/source/aws/sagemaker.py +1 -1
  51. datahub/ingestion/source/azure/azure_common.py +8 -12
  52. datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
  53. datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
  54. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
  55. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  56. datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
  57. datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
  58. datahub/ingestion/source/datahub/config.py +8 -8
  59. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  60. datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
  61. datahub/ingestion/source/dbt/dbt_common.py +39 -37
  62. datahub/ingestion/source/dbt/dbt_core.py +10 -12
  63. datahub/ingestion/source/debug/datahub_debug.py +1 -1
  64. datahub/ingestion/source/delta_lake/config.py +6 -4
  65. datahub/ingestion/source/dremio/dremio_api.py +212 -78
  66. datahub/ingestion/source/dremio/dremio_config.py +10 -6
  67. datahub/ingestion/source/dremio/dremio_entities.py +55 -39
  68. datahub/ingestion/source/dremio/dremio_profiling.py +14 -3
  69. datahub/ingestion/source/dremio/dremio_source.py +24 -26
  70. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  71. datahub/ingestion/source/elastic_search.py +110 -32
  72. datahub/ingestion/source/excel/source.py +1 -1
  73. datahub/ingestion/source/feast.py +1 -1
  74. datahub/ingestion/source/file.py +5 -4
  75. datahub/ingestion/source/fivetran/config.py +17 -16
  76. datahub/ingestion/source/fivetran/fivetran.py +2 -2
  77. datahub/ingestion/source/gc/datahub_gc.py +1 -1
  78. datahub/ingestion/source/gcs/gcs_source.py +8 -10
  79. datahub/ingestion/source/ge_profiling_config.py +8 -5
  80. datahub/ingestion/source/grafana/grafana_api.py +2 -2
  81. datahub/ingestion/source/grafana/grafana_config.py +4 -3
  82. datahub/ingestion/source/grafana/grafana_source.py +1 -1
  83. datahub/ingestion/source/grafana/models.py +23 -5
  84. datahub/ingestion/source/hex/api.py +7 -5
  85. datahub/ingestion/source/hex/hex.py +4 -3
  86. datahub/ingestion/source/iceberg/iceberg.py +1 -1
  87. datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +10 -10
  90. datahub/ingestion/source/kafka/kafka.py +1 -1
  91. datahub/ingestion/source/ldap.py +1 -1
  92. datahub/ingestion/source/looker/looker_common.py +7 -5
  93. datahub/ingestion/source/looker/looker_config.py +21 -20
  94. datahub/ingestion/source/looker/lookml_config.py +47 -47
  95. datahub/ingestion/source/metabase.py +8 -8
  96. datahub/ingestion/source/metadata/business_glossary.py +2 -2
  97. datahub/ingestion/source/metadata/lineage.py +13 -8
  98. datahub/ingestion/source/mlflow.py +1 -1
  99. datahub/ingestion/source/mode.py +6 -4
  100. datahub/ingestion/source/mongodb.py +4 -3
  101. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  102. datahub/ingestion/source/nifi.py +17 -23
  103. datahub/ingestion/source/openapi.py +6 -8
  104. datahub/ingestion/source/powerbi/config.py +33 -32
  105. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
  106. datahub/ingestion/source/powerbi/powerbi.py +1 -1
  107. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
  108. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
  109. datahub/ingestion/source/preset.py +8 -8
  110. datahub/ingestion/source/pulsar.py +1 -1
  111. datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
  112. datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
  113. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  114. datahub/ingestion/source/redshift/config.py +18 -20
  115. datahub/ingestion/source/redshift/redshift.py +2 -2
  116. datahub/ingestion/source/redshift/usage.py +23 -3
  117. datahub/ingestion/source/s3/config.py +83 -62
  118. datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
  119. datahub/ingestion/source/s3/source.py +8 -5
  120. datahub/ingestion/source/sac/sac.py +5 -4
  121. datahub/ingestion/source/salesforce.py +3 -2
  122. datahub/ingestion/source/schema/json_schema.py +2 -2
  123. datahub/ingestion/source/sigma/data_classes.py +3 -2
  124. datahub/ingestion/source/sigma/sigma.py +1 -1
  125. datahub/ingestion/source/sigma/sigma_api.py +7 -7
  126. datahub/ingestion/source/slack/slack.py +1 -1
  127. datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
  128. datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
  129. datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
  130. datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
  131. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
  132. datahub/ingestion/source/snowflake/snowflake_queries.py +28 -4
  133. datahub/ingestion/source/sql/athena.py +1 -1
  134. datahub/ingestion/source/sql/clickhouse.py +4 -2
  135. datahub/ingestion/source/sql/cockroachdb.py +1 -1
  136. datahub/ingestion/source/sql/druid.py +1 -1
  137. datahub/ingestion/source/sql/hana.py +1 -1
  138. datahub/ingestion/source/sql/hive.py +7 -5
  139. datahub/ingestion/source/sql/hive_metastore.py +1 -1
  140. datahub/ingestion/source/sql/mssql/source.py +13 -6
  141. datahub/ingestion/source/sql/mysql.py +1 -1
  142. datahub/ingestion/source/sql/oracle.py +17 -10
  143. datahub/ingestion/source/sql/postgres.py +2 -2
  144. datahub/ingestion/source/sql/presto.py +1 -1
  145. datahub/ingestion/source/sql/sql_config.py +8 -9
  146. datahub/ingestion/source/sql/sql_generic.py +1 -1
  147. datahub/ingestion/source/sql/teradata.py +1 -1
  148. datahub/ingestion/source/sql/trino.py +1 -1
  149. datahub/ingestion/source/sql/vertica.py +5 -4
  150. datahub/ingestion/source/sql_queries.py +174 -22
  151. datahub/ingestion/source/state/checkpoint.py +2 -2
  152. datahub/ingestion/source/state/entity_removal_state.py +2 -1
  153. datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
  154. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
  155. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  156. datahub/ingestion/source/superset.py +9 -9
  157. datahub/ingestion/source/tableau/tableau.py +14 -16
  158. datahub/ingestion/source/unity/azure_auth_config.py +15 -0
  159. datahub/ingestion/source/unity/config.py +51 -34
  160. datahub/ingestion/source/unity/connection.py +7 -1
  161. datahub/ingestion/source/unity/connection_test.py +1 -1
  162. datahub/ingestion/source/unity/proxy.py +216 -7
  163. datahub/ingestion/source/unity/proxy_types.py +91 -0
  164. datahub/ingestion/source/unity/source.py +29 -3
  165. datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
  166. datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
  167. datahub/ingestion/source/usage/usage_common.py +5 -3
  168. datahub/ingestion/source_config/csv_enricher.py +7 -6
  169. datahub/ingestion/source_config/operation_config.py +7 -4
  170. datahub/ingestion/source_config/pulsar.py +11 -15
  171. datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
  172. datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
  173. datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
  174. datahub/ingestion/transformer/add_dataset_properties.py +2 -2
  175. datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
  176. datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
  177. datahub/ingestion/transformer/add_dataset_tags.py +3 -3
  178. datahub/ingestion/transformer/add_dataset_terms.py +3 -3
  179. datahub/ingestion/transformer/dataset_domain.py +3 -3
  180. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
  181. datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
  182. datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
  183. datahub/ingestion/transformer/mark_dataset_status.py +1 -1
  184. datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
  185. datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
  186. datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
  187. datahub/ingestion/transformer/replace_external_url.py +2 -2
  188. datahub/ingestion/transformer/set_browse_path.py +1 -1
  189. datahub/ingestion/transformer/tags_to_terms.py +1 -1
  190. datahub/lite/duckdb_lite.py +1 -1
  191. datahub/lite/lite_util.py +2 -2
  192. datahub/metadata/_internal_schema_classes.py +62 -2
  193. datahub/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +2 -0
  194. datahub/metadata/schema.avsc +271 -91
  195. datahub/metadata/schemas/ApplicationProperties.avsc +5 -2
  196. datahub/metadata/schemas/AssertionInfo.avsc +48 -5
  197. datahub/metadata/schemas/BusinessAttributeInfo.avsc +8 -4
  198. datahub/metadata/schemas/ChartInfo.avsc +12 -5
  199. datahub/metadata/schemas/ContainerProperties.avsc +12 -5
  200. datahub/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
  201. datahub/metadata/schemas/CorpGroupInfo.avsc +7 -3
  202. datahub/metadata/schemas/CorpUserInfo.avsc +5 -2
  203. datahub/metadata/schemas/CorpUserSettings.avsc +4 -2
  204. datahub/metadata/schemas/DashboardInfo.avsc +16 -4
  205. datahub/metadata/schemas/DataFlowInfo.avsc +11 -5
  206. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +4 -2
  207. datahub/metadata/schemas/DataJobInfo.avsc +9 -4
  208. datahub/metadata/schemas/DataPlatformInfo.avsc +3 -1
  209. datahub/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
  210. datahub/metadata/schemas/DataProductProperties.avsc +5 -2
  211. datahub/metadata/schemas/DataTypeInfo.avsc +5 -0
  212. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  213. datahub/metadata/schemas/DatasetProperties.avsc +12 -5
  214. datahub/metadata/schemas/DomainProperties.avsc +7 -3
  215. datahub/metadata/schemas/EditableContainerProperties.avsc +2 -1
  216. datahub/metadata/schemas/EditableDashboardProperties.avsc +2 -1
  217. datahub/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
  218. datahub/metadata/schemas/EditableDataJobProperties.avsc +2 -1
  219. datahub/metadata/schemas/EditableDatasetProperties.avsc +2 -1
  220. datahub/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
  221. datahub/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
  222. datahub/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
  223. datahub/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
  224. datahub/metadata/schemas/EditableMLModelProperties.avsc +2 -1
  225. datahub/metadata/schemas/EditableNotebookProperties.avsc +2 -1
  226. datahub/metadata/schemas/EditableSchemaMetadata.avsc +5 -3
  227. datahub/metadata/schemas/EntityTypeInfo.avsc +5 -0
  228. datahub/metadata/schemas/GlobalTags.avsc +3 -2
  229. datahub/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
  230. datahub/metadata/schemas/GlossaryTermInfo.avsc +3 -1
  231. datahub/metadata/schemas/InputFields.avsc +3 -2
  232. datahub/metadata/schemas/MLFeatureKey.avsc +3 -1
  233. datahub/metadata/schemas/MLFeatureTableKey.avsc +3 -1
  234. datahub/metadata/schemas/MLModelDeploymentKey.avsc +3 -1
  235. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  236. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  237. datahub/metadata/schemas/MLModelProperties.avsc +4 -2
  238. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +3 -1
  239. datahub/metadata/schemas/MetadataChangeEvent.avsc +124 -50
  240. datahub/metadata/schemas/NotebookInfo.avsc +5 -2
  241. datahub/metadata/schemas/Ownership.avsc +3 -2
  242. datahub/metadata/schemas/QuerySubjects.avsc +1 -1
  243. datahub/metadata/schemas/RoleProperties.avsc +3 -1
  244. datahub/metadata/schemas/SchemaFieldInfo.avsc +3 -1
  245. datahub/metadata/schemas/SchemaMetadata.avsc +3 -2
  246. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
  247. datahub/metadata/schemas/TagProperties.avsc +3 -1
  248. datahub/metadata/schemas/TestInfo.avsc +2 -1
  249. datahub/sdk/__init__.py +1 -0
  250. datahub/sdk/_all_entities.py +2 -0
  251. datahub/sdk/search_filters.py +68 -40
  252. datahub/sdk/tag.py +112 -0
  253. datahub/secret/datahub_secret_store.py +7 -4
  254. datahub/secret/file_secret_store.py +1 -1
  255. datahub/sql_parsing/schema_resolver.py +29 -0
  256. datahub/sql_parsing/sql_parsing_aggregator.py +15 -0
  257. datahub/sql_parsing/sqlglot_lineage.py +5 -2
  258. datahub/testing/check_sql_parser_result.py +2 -2
  259. datahub/utilities/ingest_utils.py +1 -1
  260. {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/WHEEL +0 -0
  261. {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/entry_points.txt +0 -0
  262. {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/licenses/LICENSE +0 -0
  263. {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/top_level.txt +0 -0
datahub/sdk/__init__.py CHANGED
@@ -28,6 +28,7 @@ from datahub.sdk.main_client import DataHubClient
28
28
  from datahub.sdk.mlmodel import MLModel
29
29
  from datahub.sdk.mlmodelgroup import MLModelGroup
30
30
  from datahub.sdk.search_filters import Filter, FilterDsl
31
+ from datahub.sdk.tag import Tag
31
32
 
32
33
  # We want to print out the warning if people do `from datahub.sdk import X`.
33
34
  # But we don't want to print out warnings if they're doing a more direct
@@ -9,6 +9,7 @@ from datahub.sdk.dataset import Dataset
9
9
  from datahub.sdk.entity import Entity
10
10
  from datahub.sdk.mlmodel import MLModel
11
11
  from datahub.sdk.mlmodelgroup import MLModelGroup
12
+ from datahub.sdk.tag import Tag
12
13
 
13
14
  # Base entity classes that don't have circular dependencies
14
15
  # Those that do are imported in the EntityClient where needed
@@ -22,6 +23,7 @@ ENTITY_CLASSES_LIST: List[Type[Entity]] = [
22
23
  DataJob,
23
24
  Dashboard,
24
25
  Chart,
26
+ Tag,
25
27
  ]
26
28
 
27
29
  # Create the mapping of entity types to classes
@@ -16,6 +16,7 @@ from typing import (
16
16
  )
17
17
 
18
18
  import pydantic
19
+ from pydantic import field_validator
19
20
 
20
21
  from datahub.configuration.common import ConfigModel
21
22
  from datahub.configuration.pydantic_migration_helpers import (
@@ -102,7 +103,8 @@ class _EntitySubtypeFilter(_BaseFilter):
102
103
  description="The entity subtype to filter on. Can be 'Table', 'View', 'Source', etc. depending on the native platform's concepts.",
103
104
  )
104
105
 
105
- @pydantic.validator("entity_subtype", pre=True)
106
+ @field_validator("entity_subtype", mode="before")
107
+ @classmethod
106
108
  def validate_entity_subtype(cls, v: str) -> List[str]:
107
109
  return [v] if not isinstance(v, list) else v
108
110
 
@@ -141,10 +143,13 @@ class _PlatformFilter(_BaseFilter):
141
143
  platform: List[str]
142
144
  # TODO: Add validator to convert string -> list of strings
143
145
 
144
- @pydantic.validator("platform", each_item=True)
145
- def validate_platform(cls, v: str) -> str:
146
+ @field_validator("platform", mode="before")
147
+ @classmethod
148
+ def validate_platform(cls, v):
146
149
  # Subtle - we use the constructor instead of the from_string method
147
150
  # because coercion is acceptable here.
151
+ if isinstance(v, list):
152
+ return [str(DataPlatformUrn(item)) for item in v]
148
153
  return str(DataPlatformUrn(v))
149
154
 
150
155
  def _build_rule(self) -> SearchFilterRule:
@@ -161,8 +166,11 @@ class _PlatformFilter(_BaseFilter):
161
166
  class _DomainFilter(_BaseFilter):
162
167
  domain: List[str]
163
168
 
164
- @pydantic.validator("domain", each_item=True)
165
- def validate_domain(cls, v: str) -> str:
169
+ @field_validator("domain", mode="before")
170
+ @classmethod
171
+ def validate_domain(cls, v):
172
+ if isinstance(v, list):
173
+ return [str(DomainUrn.from_string(item)) for item in v]
166
174
  return str(DomainUrn.from_string(v))
167
175
 
168
176
  def _build_rule(self) -> SearchFilterRule:
@@ -183,8 +191,11 @@ class _ContainerFilter(_BaseFilter):
183
191
  description="If true, only entities that are direct descendants of the container will be returned.",
184
192
  )
185
193
 
186
- @pydantic.validator("container", each_item=True)
187
- def validate_container(cls, v: str) -> str:
194
+ @field_validator("container", mode="before")
195
+ @classmethod
196
+ def validate_container(cls, v):
197
+ if isinstance(v, list):
198
+ return [str(ContainerUrn.from_string(item)) for item in v]
188
199
  return str(ContainerUrn.from_string(v))
189
200
 
190
201
  @classmethod
@@ -249,17 +260,25 @@ class _OwnerFilter(_BaseFilter):
249
260
  description="The owner to filter on. Should be user or group URNs.",
250
261
  )
251
262
 
252
- @pydantic.validator("owner", each_item=True)
253
- def validate_owner(cls, v: str) -> str:
254
- if not v.startswith("urn:li:"):
255
- raise ValueError(f"Owner must be a valid User or Group URN, got: {v}")
256
- _type = guess_entity_type(v)
257
- if _type == CorpUserUrn.ENTITY_TYPE:
258
- return str(CorpUserUrn.from_string(v))
259
- elif _type == CorpGroupUrn.ENTITY_TYPE:
260
- return str(CorpGroupUrn.from_string(v))
261
- else:
262
- raise ValueError(f"Owner must be a valid User or Group URN, got: {v}")
263
+ @field_validator("owner", mode="before")
264
+ @classmethod
265
+ def validate_owner(cls, v):
266
+ validated = []
267
+ for owner in v:
268
+ if not owner.startswith("urn:li:"):
269
+ raise ValueError(
270
+ f"Owner must be a valid User or Group URN, got: {owner}"
271
+ )
272
+ _type = guess_entity_type(owner)
273
+ if _type == CorpUserUrn.ENTITY_TYPE:
274
+ validated.append(str(CorpUserUrn.from_string(owner)))
275
+ elif _type == CorpGroupUrn.ENTITY_TYPE:
276
+ validated.append(str(CorpGroupUrn.from_string(owner)))
277
+ else:
278
+ raise ValueError(
279
+ f"Owner must be a valid User or Group URN, got: {owner}"
280
+ )
281
+ return validated
263
282
 
264
283
  def _build_rule(self) -> SearchFilterRule:
265
284
  return SearchFilterRule(
@@ -279,17 +298,21 @@ class _GlossaryTermFilter(_BaseFilter):
279
298
  description="The glossary term to filter on. Should be glossary term URNs.",
280
299
  )
281
300
 
282
- @pydantic.validator("glossary_term", each_item=True)
283
- def validate_glossary_term(cls, v: str) -> str:
284
- if not v.startswith("urn:li:"):
285
- raise ValueError(f"Glossary term must be a valid URN, got: {v}")
286
- # Validate that it's a glossary term URN
287
- _type = guess_entity_type(v)
288
- if _type != "glossaryTerm":
289
- raise ValueError(
290
- f"Glossary term must be a valid glossary term URN, got: {v}"
291
- )
292
- return v
301
+ @field_validator("glossary_term", mode="before")
302
+ @classmethod
303
+ def validate_glossary_term(cls, v):
304
+ validated = []
305
+ for term in v:
306
+ if not term.startswith("urn:li:"):
307
+ raise ValueError(f"Glossary term must be a valid URN, got: {term}")
308
+ # Validate that it's a glossary term URN
309
+ _type = guess_entity_type(term)
310
+ if _type != "glossaryTerm":
311
+ raise ValueError(
312
+ f"Glossary term must be a valid glossary term URN, got: {term}"
313
+ )
314
+ validated.append(term)
315
+ return validated
293
316
 
294
317
  def _build_rule(self) -> SearchFilterRule:
295
318
  return SearchFilterRule(
@@ -309,15 +332,19 @@ class _TagFilter(_BaseFilter):
309
332
  description="The tag to filter on. Should be tag URNs.",
310
333
  )
311
334
 
312
- @pydantic.validator("tag", each_item=True)
313
- def validate_tag(cls, v: str) -> str:
314
- if not v.startswith("urn:li:"):
315
- raise ValueError(f"Tag must be a valid URN, got: {v}")
316
- # Validate that it's a tag URN
317
- _type = guess_entity_type(v)
318
- if _type != "tag":
319
- raise ValueError(f"Tag must be a valid tag URN, got: {v}")
320
- return v
335
+ @field_validator("tag", mode="before")
336
+ @classmethod
337
+ def validate_tag(cls, v):
338
+ validated = []
339
+ for tag in v:
340
+ if not tag.startswith("urn:li:"):
341
+ raise ValueError(f"Tag must be a valid URN, got: {tag}")
342
+ # Validate that it's a tag URN
343
+ _type = guess_entity_type(tag)
344
+ if _type != "tag":
345
+ raise ValueError(f"Tag must be a valid tag URN, got: {tag}")
346
+ validated.append(tag)
347
+ return validated
321
348
 
322
349
  def _build_rule(self) -> SearchFilterRule:
323
350
  return SearchFilterRule(
@@ -426,7 +453,8 @@ class _Not(_BaseFilter):
426
453
 
427
454
  not_: "Filter" = pydantic.Field(alias="not")
428
455
 
429
- @pydantic.validator("not_", pre=False)
456
+ @field_validator("not_", mode="after")
457
+ @classmethod
430
458
  def validate_not(cls, v: "Filter") -> "Filter":
431
459
  inner_filter = v.compile()
432
460
  if len(inner_filter) != 1:
@@ -571,7 +599,7 @@ def load_filters(obj: Any) -> Filter:
571
599
  if PYDANTIC_VERSION_2:
572
600
  return pydantic.TypeAdapter(Filter).validate_python(obj) # type: ignore
573
601
  else:
574
- return pydantic.parse_obj_as(Filter, obj) # type: ignore
602
+ return pydantic.TypeAdapter(Filter).validate_python(obj) # type: ignore
575
603
 
576
604
 
577
605
  # We need FilterDsl for two reasons:
datahub/sdk/tag.py ADDED
@@ -0,0 +1,112 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Optional, Type
4
+
5
+ from typing_extensions import Self
6
+
7
+ import datahub.metadata.schema_classes as models
8
+ from datahub.metadata.urns import TagUrn, Urn
9
+ from datahub.sdk._shared import (
10
+ HasOwnership,
11
+ OwnersInputType,
12
+ )
13
+ from datahub.sdk.entity import Entity, ExtraAspectsType
14
+
15
+
16
+ class Tag(
17
+ HasOwnership,
18
+ Entity,
19
+ ):
20
+ __slots__ = ()
21
+
22
+ @classmethod
23
+ def get_urn_type(cls) -> Type[TagUrn]:
24
+ return TagUrn
25
+
26
+ def __init__(
27
+ self,
28
+ *,
29
+ # Identity.
30
+ name: str,
31
+ # Tag properties.
32
+ display_name: Optional[str] = None,
33
+ description: Optional[str] = None,
34
+ color: Optional[str] = None,
35
+ # Standard aspects.
36
+ owners: Optional[OwnersInputType] = None,
37
+ extra_aspects: ExtraAspectsType = None,
38
+ ):
39
+ """Initialize a new Tag instance."""
40
+ urn = TagUrn(name=name)
41
+ super().__init__(urn)
42
+ self._set_extra_aspects(extra_aspects)
43
+
44
+ self._ensure_tag_props(
45
+ display_name=display_name or name,
46
+ description=description,
47
+ color=color,
48
+ )
49
+
50
+ if owners is not None:
51
+ self.set_owners(owners)
52
+
53
+ @classmethod
54
+ def _new_from_graph(cls, urn: Urn, current_aspects: models.AspectBag) -> Self:
55
+ assert isinstance(urn, TagUrn)
56
+ entity = cls(name=urn.name)
57
+ return entity._init_from_graph(current_aspects)
58
+
59
+ @property
60
+ def urn(self) -> TagUrn:
61
+ assert isinstance(self._urn, TagUrn)
62
+ return self._urn
63
+
64
+ def _ensure_tag_props(
65
+ self,
66
+ *,
67
+ display_name: Optional[str] = None,
68
+ description: Optional[str] = None,
69
+ color: Optional[str] = None,
70
+ ) -> models.TagPropertiesClass:
71
+ existing_props = self._get_aspect(models.TagPropertiesClass)
72
+ if existing_props is not None:
73
+ if display_name is not None:
74
+ existing_props.name = display_name
75
+ if description is not None:
76
+ existing_props.description = description
77
+ if color is not None:
78
+ existing_props.colorHex = color
79
+ return existing_props
80
+
81
+ return self._setdefault_aspect(
82
+ models.TagPropertiesClass(
83
+ name=display_name or self.urn.name,
84
+ description=description,
85
+ colorHex=color,
86
+ )
87
+ )
88
+
89
+ @property
90
+ def name(self) -> str:
91
+ return self.urn.name
92
+
93
+ @property
94
+ def display_name(self) -> str:
95
+ return self._ensure_tag_props().name
96
+
97
+ def set_display_name(self, display_name: str) -> None:
98
+ self._ensure_tag_props(display_name=display_name)
99
+
100
+ @property
101
+ def description(self) -> Optional[str]:
102
+ return self._ensure_tag_props().description
103
+
104
+ def set_description(self, description: str) -> None:
105
+ self._ensure_tag_props(description=description)
106
+
107
+ @property
108
+ def color(self) -> Optional[str]:
109
+ return self._ensure_tag_props().colorHex
110
+
111
+ def set_color(self, color: str) -> None:
112
+ self._ensure_tag_props(color=color)
@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  from typing import Any, Dict, List, Optional, Union
3
3
 
4
- from pydantic import BaseModel, validator
4
+ from pydantic import BaseModel, field_validator
5
5
 
6
6
  from datahub.ingestion.graph.client import DataHubGraph
7
7
  from datahub.ingestion.graph.config import DatahubClientConfig
@@ -18,8 +18,11 @@ class DataHubSecretStoreConfig(BaseModel):
18
18
  class Config:
19
19
  arbitrary_types_allowed = True
20
20
 
21
- @validator("graph_client")
22
- def check_graph_connection(cls, v: DataHubGraph) -> DataHubGraph:
21
+ @field_validator("graph_client", mode="after")
22
+ @classmethod
23
+ def check_graph_connection(
24
+ cls, v: Optional[DataHubGraph]
25
+ ) -> Optional[DataHubGraph]:
23
26
  if v is not None:
24
27
  v.test_connection()
25
28
  return v
@@ -63,7 +66,7 @@ class DataHubSecretStore(SecretStore):
63
66
 
64
67
  @classmethod
65
68
  def create(cls, config: Any) -> "DataHubSecretStore":
66
- config = DataHubSecretStoreConfig.parse_obj(config)
69
+ config = DataHubSecretStoreConfig.model_validate(config)
67
70
  return cls(config)
68
71
 
69
72
  def close(self) -> None:
@@ -45,5 +45,5 @@ class FileSecretStore(SecretStore):
45
45
 
46
46
  @classmethod
47
47
  def create(cls, config: Any) -> "FileSecretStore":
48
- config = FileSecretStoreConfig.parse_obj(config)
48
+ config = FileSecretStoreConfig.model_validate(config)
49
49
  return cls(config)
@@ -1,5 +1,6 @@
1
1
  import contextlib
2
2
  import pathlib
3
+ from dataclasses import dataclass
3
4
  from typing import Dict, List, Optional, Protocol, Set, Tuple
4
5
 
5
6
  from typing_extensions import TypedDict
@@ -22,6 +23,14 @@ from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_fie
22
23
  SchemaInfo = Dict[str, str]
23
24
 
24
25
 
26
+ @dataclass
27
+ class SchemaResolverReport:
28
+ """Report class for tracking SchemaResolver cache performance."""
29
+
30
+ num_schema_cache_hits: int = 0
31
+ num_schema_cache_misses: int = 0
32
+
33
+
25
34
  class GraphQLSchemaField(TypedDict):
26
35
  fieldPath: str
27
36
  nativeDataType: str
@@ -53,6 +62,7 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
53
62
  env: str = DEFAULT_ENV,
54
63
  graph: Optional[DataHubGraph] = None,
55
64
  _cache_filename: Optional[pathlib.Path] = None,
65
+ report: Optional[SchemaResolverReport] = None,
56
66
  ):
57
67
  # Also supports platform with an urn prefix.
58
68
  self._platform = DataPlatformUrn(platform).platform_name
@@ -60,6 +70,7 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
60
70
  self.env = env
61
71
 
62
72
  self.graph = graph
73
+ self.report = report
63
74
 
64
75
  # Init cache, potentially restoring from a previous run.
65
76
  shared_conn = None
@@ -132,12 +143,14 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
132
143
 
133
144
  schema_info = self._resolve_schema_info(urn)
134
145
  if schema_info:
146
+ self._track_cache_hit()
135
147
  return urn, schema_info
136
148
 
137
149
  urn_lower = self.get_urn_for_table(table, lower=True)
138
150
  if urn_lower != urn:
139
151
  schema_info = self._resolve_schema_info(urn_lower)
140
152
  if schema_info:
153
+ self._track_cache_hit()
141
154
  return urn_lower, schema_info
142
155
 
143
156
  # Our treatment of platform instances when lowercasing urns
@@ -152,8 +165,12 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
152
165
  if urn_mixed not in {urn, urn_lower}:
153
166
  schema_info = self._resolve_schema_info(urn_mixed)
154
167
  if schema_info:
168
+ self._track_cache_hit()
155
169
  return urn_mixed, schema_info
156
170
 
171
+ # Track cache miss for the final attempt
172
+ self._track_cache_miss()
173
+
157
174
  if self._prefers_urn_lower():
158
175
  return urn_lower, None
159
176
  else:
@@ -165,6 +182,16 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
165
182
  def has_urn(self, urn: str) -> bool:
166
183
  return self._schema_cache.get(urn) is not None
167
184
 
185
+ def _track_cache_hit(self) -> None:
186
+ """Track a cache hit if reporting is enabled."""
187
+ if self.report is not None:
188
+ self.report.num_schema_cache_hits += 1
189
+
190
+ def _track_cache_miss(self) -> None:
191
+ """Track a cache miss if reporting is enabled."""
192
+ if self.report is not None:
193
+ self.report.num_schema_cache_misses += 1
194
+
168
195
  def _resolve_schema_info(self, urn: str) -> Optional[SchemaInfo]:
169
196
  if urn in self._schema_cache:
170
197
  return self._schema_cache[urn]
@@ -261,6 +288,8 @@ class _SchemaResolverWithExtras(SchemaResolverInterface):
261
288
  table, lower=self._base_resolver._prefers_urn_lower()
262
289
  )
263
290
  if urn in self._extra_schemas:
291
+ # Track cache hit for extra schemas
292
+ self._base_resolver._track_cache_hit()
264
293
  return urn, self._extra_schemas[urn]
265
294
  return self._base_resolver.resolve_table(table)
266
295
 
@@ -168,6 +168,12 @@ class QueryMetadata:
168
168
  query_subject_urns.add(upstream)
169
169
  if include_fields:
170
170
  for column in sorted(self.column_usage.get(upstream, [])):
171
+ # Skip empty column names to avoid creating invalid URNs
172
+ if not column or not column.strip():
173
+ logger.warning(
174
+ f"Skipping empty upstream column name for query {self.query_id} on upstream {upstream}"
175
+ )
176
+ continue
171
177
  query_subject_urns.add(
172
178
  builder.make_schema_field_urn(upstream, column)
173
179
  )
@@ -175,6 +181,15 @@ class QueryMetadata:
175
181
  query_subject_urns.add(downstream_urn)
176
182
  if include_fields:
177
183
  for column_lineage in self.column_lineage:
184
+ # Skip empty downstream columns to avoid creating invalid URNs
185
+ if (
186
+ not column_lineage.downstream.column
187
+ or not column_lineage.downstream.column.strip()
188
+ ):
189
+ logger.warning(
190
+ f"Skipping empty downstream column name for query {self.query_id} on downstream {downstream_urn}"
191
+ )
192
+ continue
178
193
  query_subject_urns.add(
179
194
  builder.make_schema_field_urn(
180
195
  downstream_urn, column_lineage.downstream.column
@@ -28,6 +28,7 @@ import sqlglot.optimizer.optimizer
28
28
  import sqlglot.optimizer.qualify
29
29
  import sqlglot.optimizer.qualify_columns
30
30
  import sqlglot.optimizer.unnest_subqueries
31
+ from pydantic import field_validator
31
32
 
32
33
  from datahub.cli.env_utils import get_boolean_env_variable
33
34
  from datahub.ingestion.graph.client import DataHubGraph
@@ -141,7 +142,8 @@ class DownstreamColumnRef(_ParserBaseModel):
141
142
  column_type: Optional[SchemaFieldDataTypeClass] = None
142
143
  native_column_type: Optional[str] = None
143
144
 
144
- @pydantic.validator("column_type", pre=True)
145
+ @field_validator("column_type", mode="before")
146
+ @classmethod
145
147
  def _load_column_type(
146
148
  cls, v: Optional[Union[dict, SchemaFieldDataTypeClass]]
147
149
  ) -> Optional[SchemaFieldDataTypeClass]:
@@ -215,7 +217,8 @@ class SqlParsingDebugInfo(_ParserBaseModel):
215
217
  def error(self) -> Optional[Exception]:
216
218
  return self.table_error or self.column_error
217
219
 
218
- @pydantic.validator("table_error", "column_error")
220
+ @field_validator("table_error", "column_error", mode="before")
221
+ @classmethod
219
222
  def remove_variables_from_error(cls, v: Optional[Exception]) -> Optional[Exception]:
220
223
  if v and v.__traceback__:
221
224
  # Remove local variables from the traceback to avoid memory leaks.
@@ -60,8 +60,8 @@ def assert_sql_result_with_resolver(
60
60
  expected = SqlParsingResult.parse_raw(expected_file.read_text())
61
61
 
62
62
  full_diff = deepdiff.DeepDiff(
63
- expected.dict(),
64
- res.dict(),
63
+ expected.model_dump(),
64
+ res.model_dump(),
65
65
  exclude_regex_paths=[
66
66
  r"root.column_lineage\[\d+\].logic",
67
67
  ],
@@ -48,7 +48,7 @@ def deploy_source_vars(
48
48
 
49
49
  deploy_options_raw = pipeline_config.pop("deployment", None)
50
50
  if deploy_options_raw is not None:
51
- deploy_options = DeployOptions.parse_obj(deploy_options_raw)
51
+ deploy_options = DeployOptions.model_validate(deploy_options_raw)
52
52
 
53
53
  if name:
54
54
  logger.info(f"Overriding deployment name {deploy_options.name} with {name}")