acryl-datahub 1.3.0.1rc9__py3-none-any.whl → 1.3.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (263)
  1. {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/METADATA +2550 -2543
  2. {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/RECORD +263 -261
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +2 -2
  5. datahub/api/entities/corpgroup/corpgroup.py +11 -6
  6. datahub/api/entities/corpuser/corpuser.py +11 -11
  7. datahub/api/entities/dataproduct/dataproduct.py +47 -27
  8. datahub/api/entities/dataset/dataset.py +32 -21
  9. datahub/api/entities/external/lake_formation_external_entites.py +5 -6
  10. datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
  11. datahub/api/entities/forms/forms.py +16 -14
  12. datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
  13. datahub/cli/check_cli.py +2 -2
  14. datahub/cli/config_utils.py +3 -3
  15. datahub/cli/lite_cli.py +9 -7
  16. datahub/cli/migrate.py +4 -4
  17. datahub/cli/quickstart_versioning.py +3 -3
  18. datahub/cli/specific/group_cli.py +1 -1
  19. datahub/cli/specific/structuredproperties_cli.py +1 -1
  20. datahub/cli/specific/user_cli.py +1 -1
  21. datahub/configuration/common.py +14 -2
  22. datahub/configuration/connection_resolver.py +2 -2
  23. datahub/configuration/git.py +47 -30
  24. datahub/configuration/import_resolver.py +2 -2
  25. datahub/configuration/kafka.py +4 -3
  26. datahub/configuration/time_window_config.py +26 -26
  27. datahub/configuration/validate_field_deprecation.py +2 -2
  28. datahub/configuration/validate_field_removal.py +2 -2
  29. datahub/configuration/validate_field_rename.py +2 -2
  30. datahub/configuration/validate_multiline_string.py +2 -1
  31. datahub/emitter/kafka_emitter.py +3 -1
  32. datahub/emitter/rest_emitter.py +2 -4
  33. datahub/ingestion/api/decorators.py +1 -1
  34. datahub/ingestion/api/report.py +1 -1
  35. datahub/ingestion/api/sink.py +1 -1
  36. datahub/ingestion/api/source.py +1 -1
  37. datahub/ingestion/glossary/datahub_classifier.py +11 -8
  38. datahub/ingestion/graph/client.py +5 -1
  39. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
  40. datahub/ingestion/reporting/file_reporter.py +5 -4
  41. datahub/ingestion/run/pipeline.py +7 -6
  42. datahub/ingestion/run/pipeline_config.py +12 -14
  43. datahub/ingestion/run/sink_callback.py +1 -1
  44. datahub/ingestion/sink/datahub_rest.py +6 -4
  45. datahub/ingestion/source/abs/config.py +19 -19
  46. datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
  47. datahub/ingestion/source/abs/source.py +2 -2
  48. datahub/ingestion/source/aws/aws_common.py +1 -1
  49. datahub/ingestion/source/aws/glue.py +6 -4
  50. datahub/ingestion/source/aws/sagemaker.py +1 -1
  51. datahub/ingestion/source/azure/azure_common.py +8 -12
  52. datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
  53. datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
  54. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
  55. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  56. datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
  57. datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
  58. datahub/ingestion/source/datahub/config.py +8 -8
  59. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  60. datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
  61. datahub/ingestion/source/dbt/dbt_common.py +39 -37
  62. datahub/ingestion/source/dbt/dbt_core.py +10 -12
  63. datahub/ingestion/source/debug/datahub_debug.py +1 -1
  64. datahub/ingestion/source/delta_lake/config.py +6 -4
  65. datahub/ingestion/source/dremio/dremio_api.py +212 -78
  66. datahub/ingestion/source/dremio/dremio_config.py +10 -6
  67. datahub/ingestion/source/dremio/dremio_entities.py +55 -39
  68. datahub/ingestion/source/dremio/dremio_profiling.py +14 -3
  69. datahub/ingestion/source/dremio/dremio_source.py +24 -26
  70. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  71. datahub/ingestion/source/elastic_search.py +110 -32
  72. datahub/ingestion/source/excel/source.py +1 -1
  73. datahub/ingestion/source/feast.py +1 -1
  74. datahub/ingestion/source/file.py +5 -4
  75. datahub/ingestion/source/fivetran/config.py +17 -16
  76. datahub/ingestion/source/fivetran/fivetran.py +2 -2
  77. datahub/ingestion/source/gc/datahub_gc.py +1 -1
  78. datahub/ingestion/source/gcs/gcs_source.py +8 -10
  79. datahub/ingestion/source/ge_profiling_config.py +8 -5
  80. datahub/ingestion/source/grafana/grafana_api.py +2 -2
  81. datahub/ingestion/source/grafana/grafana_config.py +4 -3
  82. datahub/ingestion/source/grafana/grafana_source.py +1 -1
  83. datahub/ingestion/source/grafana/models.py +23 -5
  84. datahub/ingestion/source/hex/api.py +7 -5
  85. datahub/ingestion/source/hex/hex.py +4 -3
  86. datahub/ingestion/source/iceberg/iceberg.py +1 -1
  87. datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +10 -10
  90. datahub/ingestion/source/kafka/kafka.py +1 -1
  91. datahub/ingestion/source/ldap.py +1 -1
  92. datahub/ingestion/source/looker/looker_common.py +7 -5
  93. datahub/ingestion/source/looker/looker_config.py +21 -20
  94. datahub/ingestion/source/looker/lookml_config.py +47 -47
  95. datahub/ingestion/source/metabase.py +8 -8
  96. datahub/ingestion/source/metadata/business_glossary.py +2 -2
  97. datahub/ingestion/source/metadata/lineage.py +13 -8
  98. datahub/ingestion/source/mlflow.py +1 -1
  99. datahub/ingestion/source/mode.py +6 -4
  100. datahub/ingestion/source/mongodb.py +4 -3
  101. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  102. datahub/ingestion/source/nifi.py +17 -23
  103. datahub/ingestion/source/openapi.py +6 -8
  104. datahub/ingestion/source/powerbi/config.py +33 -32
  105. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
  106. datahub/ingestion/source/powerbi/powerbi.py +1 -1
  107. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
  108. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
  109. datahub/ingestion/source/preset.py +8 -8
  110. datahub/ingestion/source/pulsar.py +1 -1
  111. datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
  112. datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
  113. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  114. datahub/ingestion/source/redshift/config.py +18 -20
  115. datahub/ingestion/source/redshift/redshift.py +2 -2
  116. datahub/ingestion/source/redshift/usage.py +23 -3
  117. datahub/ingestion/source/s3/config.py +83 -62
  118. datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
  119. datahub/ingestion/source/s3/source.py +8 -5
  120. datahub/ingestion/source/sac/sac.py +5 -4
  121. datahub/ingestion/source/salesforce.py +3 -2
  122. datahub/ingestion/source/schema/json_schema.py +2 -2
  123. datahub/ingestion/source/sigma/data_classes.py +3 -2
  124. datahub/ingestion/source/sigma/sigma.py +1 -1
  125. datahub/ingestion/source/sigma/sigma_api.py +7 -7
  126. datahub/ingestion/source/slack/slack.py +1 -1
  127. datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
  128. datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
  129. datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
  130. datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
  131. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
  132. datahub/ingestion/source/snowflake/snowflake_queries.py +28 -4
  133. datahub/ingestion/source/sql/athena.py +1 -1
  134. datahub/ingestion/source/sql/clickhouse.py +4 -2
  135. datahub/ingestion/source/sql/cockroachdb.py +1 -1
  136. datahub/ingestion/source/sql/druid.py +1 -1
  137. datahub/ingestion/source/sql/hana.py +1 -1
  138. datahub/ingestion/source/sql/hive.py +7 -5
  139. datahub/ingestion/source/sql/hive_metastore.py +1 -1
  140. datahub/ingestion/source/sql/mssql/source.py +13 -6
  141. datahub/ingestion/source/sql/mysql.py +1 -1
  142. datahub/ingestion/source/sql/oracle.py +17 -10
  143. datahub/ingestion/source/sql/postgres.py +2 -2
  144. datahub/ingestion/source/sql/presto.py +1 -1
  145. datahub/ingestion/source/sql/sql_config.py +8 -9
  146. datahub/ingestion/source/sql/sql_generic.py +1 -1
  147. datahub/ingestion/source/sql/teradata.py +1 -1
  148. datahub/ingestion/source/sql/trino.py +1 -1
  149. datahub/ingestion/source/sql/vertica.py +5 -4
  150. datahub/ingestion/source/sql_queries.py +174 -22
  151. datahub/ingestion/source/state/checkpoint.py +2 -2
  152. datahub/ingestion/source/state/entity_removal_state.py +2 -1
  153. datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
  154. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
  155. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  156. datahub/ingestion/source/superset.py +9 -9
  157. datahub/ingestion/source/tableau/tableau.py +14 -16
  158. datahub/ingestion/source/unity/azure_auth_config.py +15 -0
  159. datahub/ingestion/source/unity/config.py +51 -34
  160. datahub/ingestion/source/unity/connection.py +7 -1
  161. datahub/ingestion/source/unity/connection_test.py +1 -1
  162. datahub/ingestion/source/unity/proxy.py +216 -7
  163. datahub/ingestion/source/unity/proxy_types.py +91 -0
  164. datahub/ingestion/source/unity/source.py +29 -3
  165. datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
  166. datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
  167. datahub/ingestion/source/usage/usage_common.py +5 -3
  168. datahub/ingestion/source_config/csv_enricher.py +7 -6
  169. datahub/ingestion/source_config/operation_config.py +7 -4
  170. datahub/ingestion/source_config/pulsar.py +11 -15
  171. datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
  172. datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
  173. datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
  174. datahub/ingestion/transformer/add_dataset_properties.py +2 -2
  175. datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
  176. datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
  177. datahub/ingestion/transformer/add_dataset_tags.py +3 -3
  178. datahub/ingestion/transformer/add_dataset_terms.py +3 -3
  179. datahub/ingestion/transformer/dataset_domain.py +3 -3
  180. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
  181. datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
  182. datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
  183. datahub/ingestion/transformer/mark_dataset_status.py +1 -1
  184. datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
  185. datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
  186. datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
  187. datahub/ingestion/transformer/replace_external_url.py +2 -2
  188. datahub/ingestion/transformer/set_browse_path.py +1 -1
  189. datahub/ingestion/transformer/tags_to_terms.py +1 -1
  190. datahub/lite/duckdb_lite.py +1 -1
  191. datahub/lite/lite_util.py +2 -2
  192. datahub/metadata/_internal_schema_classes.py +62 -2
  193. datahub/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +2 -0
  194. datahub/metadata/schema.avsc +271 -91
  195. datahub/metadata/schemas/ApplicationProperties.avsc +5 -2
  196. datahub/metadata/schemas/AssertionInfo.avsc +48 -5
  197. datahub/metadata/schemas/BusinessAttributeInfo.avsc +8 -4
  198. datahub/metadata/schemas/ChartInfo.avsc +12 -5
  199. datahub/metadata/schemas/ContainerProperties.avsc +12 -5
  200. datahub/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
  201. datahub/metadata/schemas/CorpGroupInfo.avsc +7 -3
  202. datahub/metadata/schemas/CorpUserInfo.avsc +5 -2
  203. datahub/metadata/schemas/CorpUserSettings.avsc +4 -2
  204. datahub/metadata/schemas/DashboardInfo.avsc +16 -4
  205. datahub/metadata/schemas/DataFlowInfo.avsc +11 -5
  206. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +4 -2
  207. datahub/metadata/schemas/DataJobInfo.avsc +9 -4
  208. datahub/metadata/schemas/DataPlatformInfo.avsc +3 -1
  209. datahub/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
  210. datahub/metadata/schemas/DataProductProperties.avsc +5 -2
  211. datahub/metadata/schemas/DataTypeInfo.avsc +5 -0
  212. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  213. datahub/metadata/schemas/DatasetProperties.avsc +12 -5
  214. datahub/metadata/schemas/DomainProperties.avsc +7 -3
  215. datahub/metadata/schemas/EditableContainerProperties.avsc +2 -1
  216. datahub/metadata/schemas/EditableDashboardProperties.avsc +2 -1
  217. datahub/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
  218. datahub/metadata/schemas/EditableDataJobProperties.avsc +2 -1
  219. datahub/metadata/schemas/EditableDatasetProperties.avsc +2 -1
  220. datahub/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
  221. datahub/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
  222. datahub/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
  223. datahub/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
  224. datahub/metadata/schemas/EditableMLModelProperties.avsc +2 -1
  225. datahub/metadata/schemas/EditableNotebookProperties.avsc +2 -1
  226. datahub/metadata/schemas/EditableSchemaMetadata.avsc +5 -3
  227. datahub/metadata/schemas/EntityTypeInfo.avsc +5 -0
  228. datahub/metadata/schemas/GlobalTags.avsc +3 -2
  229. datahub/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
  230. datahub/metadata/schemas/GlossaryTermInfo.avsc +3 -1
  231. datahub/metadata/schemas/InputFields.avsc +3 -2
  232. datahub/metadata/schemas/MLFeatureKey.avsc +3 -1
  233. datahub/metadata/schemas/MLFeatureTableKey.avsc +3 -1
  234. datahub/metadata/schemas/MLModelDeploymentKey.avsc +3 -1
  235. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  236. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  237. datahub/metadata/schemas/MLModelProperties.avsc +4 -2
  238. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +3 -1
  239. datahub/metadata/schemas/MetadataChangeEvent.avsc +124 -50
  240. datahub/metadata/schemas/NotebookInfo.avsc +5 -2
  241. datahub/metadata/schemas/Ownership.avsc +3 -2
  242. datahub/metadata/schemas/QuerySubjects.avsc +1 -1
  243. datahub/metadata/schemas/RoleProperties.avsc +3 -1
  244. datahub/metadata/schemas/SchemaFieldInfo.avsc +3 -1
  245. datahub/metadata/schemas/SchemaMetadata.avsc +3 -2
  246. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
  247. datahub/metadata/schemas/TagProperties.avsc +3 -1
  248. datahub/metadata/schemas/TestInfo.avsc +2 -1
  249. datahub/sdk/__init__.py +1 -0
  250. datahub/sdk/_all_entities.py +2 -0
  251. datahub/sdk/search_filters.py +68 -40
  252. datahub/sdk/tag.py +112 -0
  253. datahub/secret/datahub_secret_store.py +7 -4
  254. datahub/secret/file_secret_store.py +1 -1
  255. datahub/sql_parsing/schema_resolver.py +29 -0
  256. datahub/sql_parsing/sql_parsing_aggregator.py +15 -0
  257. datahub/sql_parsing/sqlglot_lineage.py +5 -2
  258. datahub/testing/check_sql_parser_result.py +2 -2
  259. datahub/utilities/ingest_utils.py +1 -1
  260. {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/WHEEL +0 -0
  261. {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/entry_points.txt +0 -0
  262. {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/licenses/LICENSE +0 -0
  263. {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql_queries.py

@@ -1,12 +1,14 @@
  import json
  import logging
  import os
- from dataclasses import dataclass
+ import re
+ from dataclasses import dataclass, field
  from datetime import datetime
  from functools import partial
- from typing import ClassVar, Iterable, List, Optional, Union
+ from typing import Any, ClassVar, Iterable, List, Optional, Union, cast

- from pydantic import BaseModel, Field, validator
+ import smart_open
+ from pydantic import BaseModel, Field, field_validator

  from datahub.configuration.common import HiddenFromDocs
  from datahub.configuration.datetimes import parse_user_datetime
@@ -36,12 +38,13 @@ from datahub.ingestion.api.source import (
      SourceCapability,
      SourceReport,
  )
- from datahub.ingestion.api.source_helpers import auto_workunit_reporter
+ from datahub.ingestion.api.source_helpers import auto_workunit, auto_workunit_reporter
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.graph.client import DataHubGraph
+ from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
  from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
  from datahub.metadata.urns import CorpUserUrn, DatasetUrn
- from datahub.sql_parsing.schema_resolver import SchemaResolver
+ from datahub.sql_parsing.schema_resolver import SchemaResolver, SchemaResolverReport
  from datahub.sql_parsing.sql_parsing_aggregator import (
      KnownQueryLineageInfo,
      ObservedQuery,
@@ -82,6 +85,24 @@ class SqlQueriesSourceConfig(
          None,
          description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
      )
+     temp_table_patterns: List[str] = Field(
+         description="Regex patterns for temporary tables to filter in lineage ingestion. "
+         "Specify regex to match the entire table name. This is useful for platforms like Athena "
+         "that don't have native temp tables but use naming patterns for fake temp tables.",
+         default=[],
+     )
+
+     enable_lazy_schema_loading: bool = Field(
+         default=True,
+         description="Enable lazy schema loading for better performance. When enabled, schemas are fetched on-demand "
+         "instead of bulk loading all schemas upfront, reducing startup time and memory usage.",
+     )
+
+     # AWS/S3 configuration
+     aws_config: Optional[AwsConnectionConfig] = Field(
+         default=None,
+         description="AWS configuration for S3 access. Required when query_file is an S3 URI (s3://).",
+     )


  @dataclass
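For orientation, the new options above could be combined in a source config roughly as in the sketch below. All values are placeholders: the S3 path and the regex are hypothetical, and the aws_config keys follow AwsConnectionConfig, with aws_region shown as a typical field.

    # Hypothetical sql-queries source config dict; illustrative, not a complete or recommended setup.
    sql_queries_source_config = {
        "query_file": "s3://example-bucket/queries.jsonl",  # S3 URIs require aws_config
        "platform": "athena",
        "temp_table_patterns": [r".*\.tmp_.*"],             # regex for "fake" temp tables
        "enable_lazy_schema_loading": True,                  # default in this release
        "aws_config": {"aws_region": "us-east-1"},
    }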
@@ -89,8 +110,13 @@ class SqlQueriesSourceReport(SourceReport):
      num_entries_processed: int = 0
      num_entries_failed: int = 0
      num_queries_aggregator_failures: int = 0
+     num_queries_processed_sequential: int = 0
+     num_temp_tables_detected: int = 0
+     temp_table_patterns_used: List[str] = field(default_factory=list)
+     peak_memory_usage_mb: float = 0.0

      sql_aggregator: Optional[SqlAggregatorReport] = None
+     schema_resolver_report: Optional[SchemaResolverReport] = None


  @platform_name("SQL Queries", id="sql-queries")
@@ -115,6 +141,18 @@ class SqlQueriesSource(Source):
      - upstream_tables (optional): string[] - Fallback list of tables the query reads from,
        used if the query can't be parsed.

+     **Lazy Schema Loading**:
+     - Fetches schemas on-demand during query parsing instead of bulk loading all schemas upfront
+     - Caches fetched schemas for future lookups to avoid repeated network requests
+     - Reduces initial startup time and memory usage significantly
+     - Automatically handles large platforms efficiently without memory issues
+
+     **Query Processing**:
+     - Loads the entire query file into memory at once
+     - Processes all queries sequentially before generating metadata work units
+     - Preserves temp table mappings and lineage relationships to ensure consistent lineage tracking
+     - Query deduplication is handled automatically by the SQL parsing aggregator
+
      ### Incremental Lineage
      When `incremental_lineage` is enabled, this source will emit lineage as patches rather than full overwrites.
      This allows you to add lineage edges without removing existing ones, which is useful for:
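The docstring above describes the per-line JSON format of the query file. A hypothetical single entry, written as a Python dict for illustration (the values and the plain-username form of `user` are illustrative, not taken from this release):

    sample_query_entry = {
        "query": "INSERT INTO analytics.daily_orders SELECT * FROM raw.orders",
        "timestamp": "2024-01-01T00:00:00Z",
        "user": "etl_service",
        "downstream_tables": ["analytics.daily_orders"],
        "upstream_tables": ["raw.orders"],
    }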
@@ -124,6 +162,12 @@ class SqlQueriesSource(Source):

      Note: Incremental lineage only applies to UpstreamLineage aspects. Other aspects like queries and usage
      statistics will still be emitted normally.
+
+     ### Temporary Table Support
+     For platforms like Athena that don't have native temporary tables, you can use the `temp_table_patterns`
+     configuration to specify regex patterns that identify fake temporary tables. This allows the source to
+     process these tables like other sources that support native temp tables, enabling proper lineage tracking
+     across temporary table operations.
      """

      schema_resolver: Optional[SchemaResolver]
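To make the matching behaviour concrete, here is a small standalone sketch of the pattern check that `temp_table_patterns` drives. The patterns and table names are hypothetical; matching is anchored at the start of the name and case-insensitive, mirroring the `is_temp_table` helper added later in this diff.

    import re

    temp_table_patterns = [r".*\.tmp_.*", r".*\.temp_.*"]  # hypothetical naming conventions

    def looks_like_temp_table(name: str) -> bool:
        # re.match anchors at the start of the table name; IGNORECASE mirrors the source.
        return any(re.match(p, name, flags=re.IGNORECASE) for p in temp_table_patterns)

    print(looks_like_temp_table("analytics.TMP_daily_orders"))  # True
    print(looks_like_temp_table("analytics.daily_orders"))      # False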
@@ -141,13 +185,19 @@
          self.report = SqlQueriesSourceReport()

          if self.config.use_schema_resolver:
-             # TODO: `initialize_schema_resolver_from_datahub` does a bulk initialization by fetching all schemas
-             # for the given platform, platform instance, and env. Instead this should be configurable:
-             # bulk initialization vs lazy on-demand schema fetching.
-             self.schema_resolver = self.graph.initialize_schema_resolver_from_datahub(
+             # Create schema resolver report for tracking
+             self.report.schema_resolver_report = SchemaResolverReport()
+
+             # Use lazy loading - schemas will be fetched on-demand and cached
+             logger.info(
+                 "Using lazy schema loading - schemas will be fetched on-demand and cached"
+             )
+             self.schema_resolver = SchemaResolver(
                  platform=self.config.platform,
                  platform_instance=self.config.platform_instance,
                  env=self.config.env,
+                 graph=self.graph,
+                 report=self.report.schema_resolver_report,
              )
          else:
              self.schema_resolver = None
@@ -156,7 +206,9 @@
              platform=self.config.platform,
              platform_instance=self.config.platform_instance,
              env=self.config.env,
-             schema_resolver=self.schema_resolver,
+             schema_resolver=cast(SchemaResolver, self.schema_resolver)
+             if self.schema_resolver
+             else None,
              eager_graph_load=False,
              generate_lineage=True,  # TODO: make this configurable
              generate_queries=True,  # TODO: make this configurable
@@ -165,7 +217,9 @@
              generate_usage_statistics=True,
              generate_operations=True,  # TODO: make this configurable
              usage_config=self.config.usage,
-             is_temp_table=None,
+             is_temp_table=self.is_temp_table
+             if self.config.temp_table_patterns
+             else None,
              is_allowed_table=None,
              format_queries=False,
          )
@@ -193,20 +247,73 @@
      ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
          logger.info(f"Parsing queries from {os.path.basename(self.config.query_file)}")

+         logger.info("Processing all queries in batch mode")
+         yield from self._process_queries_batch()
+
+     def _process_queries_batch(
+         self,
+     ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
+         """Process all queries in memory (original behavior)."""
          with self.report.new_stage("Collecting queries from file"):
              queries = list(self._parse_query_file())
              logger.info(f"Collected {len(queries)} queries for processing")

          with self.report.new_stage("Processing queries through SQL parsing aggregator"):
-             for query_entry in queries:
-                 self._add_query_to_aggregator(query_entry)
+             logger.info("Using sequential processing")
+             self._process_queries_sequential(queries)

          with self.report.new_stage("Generating metadata work units"):
              logger.info("Generating workunits from SQL parsing aggregator")
-             yield from self.aggregator.gen_metadata()
+             yield from auto_workunit(self.aggregator.gen_metadata())

-     def _parse_query_file(self) -> Iterable["QueryEntry"]:
-         """Parse the query file and yield QueryEntry objects."""
+     def _is_s3_uri(self, path: str) -> bool:
+         """Check if the path is an S3 URI."""
+         return path.startswith("s3://")
+
+     def _parse_s3_query_file(self) -> Iterable["QueryEntry"]:
+         """Parse query file from S3 using smart_open."""
+         if not self.config.aws_config:
+             raise ValueError("AWS configuration required for S3 file access")
+
+         logger.info(f"Reading query file from S3: {self.config.query_file}")
+
+         try:
+             # Use smart_open for efficient S3 streaming, similar to S3FileSystem
+             s3_client = self.config.aws_config.get_s3_client()
+
+             with smart_open.open(
+                 self.config.query_file, mode="r", transport_params={"client": s3_client}
+             ) as file_stream:
+                 for line in file_stream:
+                     if line.strip():
+                         try:
+                             query_dict = json.loads(line, strict=False)
+                             entry = QueryEntry.create(query_dict, config=self.config)
+                             self.report.num_entries_processed += 1
+                             if self.report.num_entries_processed % 1000 == 0:
+                                 logger.info(
+                                     f"Processed {self.report.num_entries_processed} query entries from S3"
+                                 )
+                             yield entry
+                         except Exception as e:
+                             self.report.num_entries_failed += 1
+                             self.report.warning(
+                                 title="Error processing query from S3",
+                                 message="Query skipped due to parsing error",
+                                 context=line.strip(),
+                                 exc=e,
+                             )
+         except Exception as e:
+             self.report.warning(
+                 title="Error reading S3 file",
+                 message="Failed to read S3 file",
+                 context=self.config.query_file,
+                 exc=e,
+             )
+             raise
+
+     def _parse_local_query_file(self) -> Iterable["QueryEntry"]:
+         """Parse local query file (existing logic)."""
          with open(self.config.query_file) as f:
              for line in f:
                  try:
@@ -227,6 +334,30 @@
                          exc=e,
                      )

+     def _parse_query_file(self) -> Iterable["QueryEntry"]:
+         """Parse the query file and yield QueryEntry objects."""
+         if self._is_s3_uri(self.config.query_file):
+             yield from self._parse_s3_query_file()
+         else:
+             yield from self._parse_local_query_file()
+
+     def _process_queries_sequential(self, queries: List["QueryEntry"]) -> None:
+         """Process queries sequentially."""
+         total_queries = len(queries)
+         logger.info(f"Processing {total_queries} queries sequentially")
+
+         # Process each query sequentially
+         for i, query_entry in enumerate(queries):
+             self._add_query_to_aggregator(query_entry)
+             self.report.num_queries_processed_sequential += 1
+
+             # Simple progress reporting every 1000 queries
+             if (i + 1) % 1000 == 0:
+                 progress_pct = ((i + 1) / total_queries) * 100
+                 logger.info(
+                     f"Processed {i + 1}/{total_queries} queries ({progress_pct:.1f}%)"
+                 )
+
      def _add_query_to_aggregator(self, query_entry: "QueryEntry") -> None:
          """Add a query to the SQL parsing aggregator."""
          try:
@@ -285,6 +416,24 @@
                  exc=e,
              )

+     def is_temp_table(self, name: str) -> bool:
+         """Check if a table name matches any of the configured temp table patterns."""
+         if not self.config.temp_table_patterns:
+             return False
+
+         try:
+             for pattern in self.config.temp_table_patterns:
+                 if re.match(pattern, name, flags=re.IGNORECASE):
+                     logger.debug(
+                         f"Table '{name}' matched temp table pattern: {pattern}"
+                     )
+                     self.report.num_temp_tables_detected += 1
+                     return True
+         except re.error as e:
+             logger.warning(f"Invalid regex pattern '{pattern}': {e}")
+
+         return False
+

  class QueryEntry(BaseModel):
      query: str
@@ -301,19 +450,22 @@
      class Config:
          arbitrary_types_allowed = True

-     @validator("timestamp", pre=True)
-     def parse_timestamp(cls, v):
+     @field_validator("timestamp", mode="before")
+     @classmethod
+     def parse_timestamp(cls, v: Any) -> Any:
          return None if v is None else parse_user_datetime(str(v))

-     @validator("user", pre=True)
-     def parse_user(cls, v):
+     @field_validator("user", mode="before")
+     @classmethod
+     def parse_user(cls, v: Any) -> Any:
          if v is None:
              return None

          return v if isinstance(v, CorpUserUrn) else CorpUserUrn(v)

-     @validator("downstream_tables", "upstream_tables", pre=True)
-     def parse_tables(cls, v):
+     @field_validator("downstream_tables", "upstream_tables", mode="before")
+     @classmethod
+     def parse_tables(cls, v: Any) -> Any:
          if not v:
              return []

datahub/ingestion/source/state/checkpoint.py

@@ -163,7 +163,7 @@ class Checkpoint(Generic[StateType]):
          )
          state_as_dict["version"] = checkpoint_aspect.state.formatVersion
          state_as_dict["serde"] = checkpoint_aspect.state.serde
-         return state_class.parse_obj(state_as_dict)
+         return state_class.model_validate(state_as_dict)

      @staticmethod
      def _from_base85_json_bytes(
@@ -179,7 +179,7 @@
          state_as_dict = json.loads(state_uncompressed.decode("utf-8"))
          state_as_dict["version"] = checkpoint_aspect.state.formatVersion
          state_as_dict["serde"] = checkpoint_aspect.state.serde
-         return state_class.parse_obj(state_as_dict)
+         return state_class.model_validate(state_as_dict)

      def to_checkpoint_aspect(
          self, max_allowed_state_size: int
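The `parse_obj` to `model_validate` and `@validator(..., pre=True)` to `@field_validator(..., mode="before")` changes running through these files follow the standard pydantic v1 to v2 migration. A minimal self-contained sketch of the field-validator form, independent of DataHub:

    from datetime import datetime
    from typing import Optional

    from pydantic import BaseModel, field_validator

    class Entry(BaseModel):
        timestamp: Optional[datetime] = None

        # v2 equivalent of `@validator("timestamp", pre=True)`: runs on the raw input
        # before field parsing; the explicit @classmethod is the v2 convention.
        @field_validator("timestamp", mode="before")
        @classmethod
        def parse_timestamp(cls, v):
            if v is None or isinstance(v, datetime):
                return v
            return datetime.fromisoformat(str(v))

    print(Entry(timestamp="2024-01-01T12:30:00").timestamp)     # 2024-01-01 12:30:00
    print(Entry.model_validate({"timestamp": None}).timestamp)  # None, via the v2 replacement for parse_obj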
datahub/ingestion/source/state/entity_removal_state.py

@@ -1,6 +1,7 @@
  from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Tuple, Type

  import pydantic
+ from pydantic import model_validator

  from datahub.emitter.mce_builder import make_assertion_urn, make_container_urn
  from datahub.ingestion.source.state.checkpoint import CheckpointStateBase
@@ -59,7 +60,7 @@ def pydantic_state_migrator(mapping: Dict[str, str]) -> "V1RootValidator":

          return values

-     return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_rename)
+     return model_validator(mode="before")(_validate_field_rename)


  class GenericCheckpointState(CheckpointStateBase):

datahub/ingestion/source/state/stateful_ingestion_base.py

@@ -3,7 +3,7 @@ from dataclasses import dataclass
  from typing import Any, Dict, Generic, Optional, Type, TypeVar

  import pydantic
- from pydantic import root_validator
+ from pydantic import model_validator
  from pydantic.fields import Field

  from datahub.configuration.common import (
@@ -73,14 +73,14 @@ class StatefulIngestionConfig(ConfigModel):
          description="If set to True, ignores the current checkpoint state.",
      )

-     @pydantic.root_validator(skip_on_failure=True)
-     def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
-         if values.get("enabled"):
-             if values.get("state_provider") is None:
-                 values["state_provider"] = DynamicTypedStateProviderConfig(
+     @model_validator(mode="after")
+     def validate_config(self) -> "StatefulIngestionConfig":
+         if self.enabled:
+             if self.state_provider is None:
+                 self.state_provider = DynamicTypedStateProviderConfig(
                      type="datahub", config={}
                  )
-         return values
+         return self


  CustomConfig = TypeVar("CustomConfig", bound=StatefulIngestionConfig)
@@ -110,17 +110,19 @@ class StatefulLineageConfigMixin(ConfigModel):
          "store_last_lineage_extraction_timestamp", "enable_stateful_lineage_ingestion"
      )

-     @root_validator(skip_on_failure=True)
-     def lineage_stateful_option_validator(cls, values: Dict) -> Dict:
-         sti = values.get("stateful_ingestion")
-         if not sti or not sti.enabled:
-             if values.get("enable_stateful_lineage_ingestion"):
-                 logger.warning(
-                     "Stateful ingestion is disabled, disabling enable_stateful_lineage_ingestion config option as well"
-                 )
-                 values["enable_stateful_lineage_ingestion"] = False
-
-         return values
+     @model_validator(mode="after")
+     def lineage_stateful_option_validator(self) -> "StatefulLineageConfigMixin":
+         try:
+             sti = getattr(self, "stateful_ingestion", None)
+             if not sti or not getattr(sti, "enabled", False):
+                 if getattr(self, "enable_stateful_lineage_ingestion", False):
+                     logger.warning(
+                         "Stateful ingestion is disabled, disabling enable_stateful_lineage_ingestion config option as well"
+                     )
+                     self.enable_stateful_lineage_ingestion = False
+         except (AttributeError, RecursionError) as e:
+             logger.debug(f"Skipping stateful lineage validation due to: {e}")
+         return self


  class StatefulProfilingConfigMixin(ConfigModel):
@@ -135,16 +137,19 @@ class StatefulProfilingConfigMixin(ConfigModel):
          "store_last_profiling_timestamps", "enable_stateful_profiling"
      )

-     @root_validator(skip_on_failure=True)
-     def profiling_stateful_option_validator(cls, values: Dict) -> Dict:
-         sti = values.get("stateful_ingestion")
-         if not sti or not sti.enabled:
-             if values.get("enable_stateful_profiling"):
-                 logger.warning(
-                     "Stateful ingestion is disabled, disabling enable_stateful_profiling config option as well"
-                 )
-                 values["enable_stateful_profiling"] = False
-         return values
+     @model_validator(mode="after")
+     def profiling_stateful_option_validator(self) -> "StatefulProfilingConfigMixin":
+         try:
+             sti = getattr(self, "stateful_ingestion", None)
+             if not sti or not getattr(sti, "enabled", False):
+                 if getattr(self, "enable_stateful_profiling", False):
+                     logger.warning(
+                         "Stateful ingestion is disabled, disabling enable_stateful_profiling config option as well"
+                     )
+                     self.enable_stateful_profiling = False
+         except (AttributeError, RecursionError) as e:
+             logger.debug(f"Skipping stateful profiling validation due to: {e}")
+         return self


  class StatefulUsageConfigMixin(BaseTimeWindowConfig):
@@ -161,16 +166,21 @@ class StatefulUsageConfigMixin(BaseTimeWindowConfig):
          "store_last_usage_extraction_timestamp", "enable_stateful_usage_ingestion"
      )

-     @root_validator(skip_on_failure=True)
-     def last_usage_extraction_stateful_option_validator(cls, values: Dict) -> Dict:
-         sti = values.get("stateful_ingestion")
-         if not sti or not sti.enabled:
-             if values.get("enable_stateful_usage_ingestion"):
-                 logger.warning(
-                     "Stateful ingestion is disabled, disabling enable_stateful_usage_ingestion config option as well"
-                 )
-                 values["enable_stateful_usage_ingestion"] = False
-         return values
+     @model_validator(mode="after")
+     def last_usage_extraction_stateful_option_validator(
+         self,
+     ) -> "StatefulUsageConfigMixin":
+         try:
+             sti = getattr(self, "stateful_ingestion", None)
+             if not sti or not getattr(sti, "enabled", False):
+                 if getattr(self, "enable_stateful_usage_ingestion", False):
+                     logger.warning(
+                         "Stateful ingestion is disabled, disabling enable_stateful_usage_ingestion config option as well"
+                     )
+                     self.enable_stateful_usage_ingestion = False
+         except (AttributeError, RecursionError) as e:
+             logger.debug(f"Skipping stateful usage validation due to: {e}")
+         return self


  class StatefulTimeWindowConfigMixin(BaseTimeWindowConfig):
@@ -185,16 +195,16 @@ class StatefulTimeWindowConfigMixin(BaseTimeWindowConfig):
          "and queries together from a single audit log and uses a unified time window.",
      )

-     @root_validator(skip_on_failure=True)
-     def time_window_stateful_option_validator(cls, values: Dict) -> Dict:
-         sti = values.get("stateful_ingestion")
-         if not sti or not sti.enabled:
-             if values.get("enable_stateful_time_window"):
+     @model_validator(mode="after")
+     def time_window_stateful_option_validator(self) -> "StatefulTimeWindowConfigMixin":
+         sti = getattr(self, "stateful_ingestion", None)
+         if not sti or not getattr(sti, "enabled", False):
+             if getattr(self, "enable_stateful_time_window", False):
                  logger.warning(
                      "Stateful ingestion is disabled, disabling enable_stateful_time_window config option as well"
                  )
-                 values["enable_stateful_time_window"] = False
-         return values
+                 self.enable_stateful_time_window = False
+         return self


  @dataclass
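The whole-model validators above show the other half of the pydantic migration: `@root_validator(skip_on_failure=True)` becomes `@model_validator(mode="after")`, which receives the constructed instance instead of a values dict and returns it. A minimal sketch, again not DataHub code (the field names are simplified stand-ins):

    from typing import Optional

    from pydantic import BaseModel, model_validator

    class StatefulConfigSketch(BaseModel):
        enabled: bool = False
        state_provider: Optional[str] = None  # stand-in for the real provider config

        # v2 equivalent of the root_validator pattern: inspect and mutate `self`
        # after field validation, then return the instance.
        @model_validator(mode="after")
        def default_provider_when_enabled(self) -> "StatefulConfigSketch":
            if self.enabled and self.state_provider is None:
                self.state_provider = "datahub"
            return self

    print(StatefulConfigSketch(enabled=True).state_provider)  # datahub
    print(StatefulConfigSketch().state_provider)              # None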
datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py

@@ -40,7 +40,7 @@ class DatahubIngestionCheckpointingProvider(IngestionCheckpointingProviderBase):
      def create(
          cls, config_dict: Dict[str, Any], ctx: PipelineContext
      ) -> "DatahubIngestionCheckpointingProvider":
-         config = DatahubIngestionStateProviderConfig.parse_obj(config_dict)
+         config = DatahubIngestionStateProviderConfig.model_validate(config_dict)
          if config.datahub_api is not None:
              return cls(DataHubGraph(config.datahub_api))
          elif ctx.graph:

datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py

@@ -32,7 +32,7 @@ class FileIngestionCheckpointingProvider(IngestionCheckpointingProviderBase):
      def create(
          cls, config_dict: Dict[str, Any], ctx: PipelineContext
      ) -> "FileIngestionCheckpointingProvider":
-         config = FileIngestionStateProviderConfig.parse_obj(config_dict)
+         config = FileIngestionStateProviderConfig.model_validate(config_dict)
          return cls(config)

      def get_latest_checkpoint(

datahub/ingestion/source/superset.py

@@ -9,7 +9,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
  import dateutil.parser as dp
  import requests
  import sqlglot
- from pydantic import BaseModel, root_validator, validator
+ from pydantic import BaseModel, field_validator, model_validator
  from pydantic.fields import Field
  from requests.adapters import HTTPAdapter
  from urllib3.util.retry import Retry
@@ -246,16 +246,16 @@ class SupersetConfig(
          # This is required to allow preset configs to get parsed
          extra = "allow"

-     @validator("connect_uri", "display_uri")
-     def remove_trailing_slash(cls, v):
+     @field_validator("connect_uri", "display_uri", mode="after")
+     @classmethod
+     def remove_trailing_slash(cls, v: str) -> str:
          return config_clean.remove_trailing_slashes(v)

-     @root_validator(skip_on_failure=True)
-     def default_display_uri_to_connect_uri(cls, values):
-         base = values.get("display_uri")
-         if base is None:
-             values["display_uri"] = values.get("connect_uri")
-         return values
+     @model_validator(mode="after")
+     def default_display_uri_to_connect_uri(self) -> "SupersetConfig":
+         if self.display_uri is None:
+             self.display_uri = self.connect_uri
+         return self


  def get_metric_name(metric):

datahub/ingestion/source/tableau/tableau.py

@@ -25,7 +25,7 @@ from urllib.parse import quote, urlparse

  import dateutil.parser as dp
  import tableauserverclient as TSC
- from pydantic import root_validator, validator
+ from pydantic import field_validator, model_validator
  from pydantic.fields import Field
  from requests.adapters import HTTPAdapter
  from tableauserverclient import (
@@ -257,8 +257,9 @@ class TableauConnectionConfig(ConfigModel):
          description="When enabled, extracts column-level lineage from Tableau Datasources",
      )

-     @validator("connect_uri")
-     def remove_trailing_slash(cls, v):
+     @field_validator("connect_uri", mode="after")
+     @classmethod
+     def remove_trailing_slash(cls, v: str) -> str:
          return config_clean.remove_trailing_slashes(v)

      def get_tableau_auth(
@@ -652,8 +653,9 @@
          "fetch_size",
      )

-     # pre = True because we want to take some decision before pydantic initialize the configuration to default values
-     @root_validator(pre=True)
+     # mode = "before" because we want to take some decision before pydantic initialize the configuration to default values
+     @model_validator(mode="before")
+     @classmethod
      def projects_backward_compatibility(cls, values: Dict) -> Dict:
          # In-place update of the input dict would cause state contamination. This was discovered through test failures
          # in test_hex.py where the same dict is reused.
@@ -683,27 +685,23 @@

          return values

-     @root_validator(skip_on_failure=True)
-     def validate_config_values(cls, values: Dict) -> Dict:
-         tags_for_hidden_assets = values.get("tags_for_hidden_assets")
-         ingest_tags = values.get("ingest_tags")
+     @model_validator(mode="after")
+     def validate_config_values(self) -> "TableauConfig":
          if (
-             not ingest_tags
-             and tags_for_hidden_assets
-             and len(tags_for_hidden_assets) > 0
+             not self.ingest_tags
+             and self.tags_for_hidden_assets
+             and len(self.tags_for_hidden_assets) > 0
          ):
              raise ValueError(
                  "tags_for_hidden_assets is only allowed with ingest_tags enabled. Be aware that this will overwrite tags entered from the UI."
              )

-         use_email_as_username = values.get("use_email_as_username")
-         ingest_owner = values.get("ingest_owner")
-         if use_email_as_username and not ingest_owner:
+         if self.use_email_as_username and not self.ingest_owner:
              raise ValueError(
                  "use_email_as_username requires ingest_owner to be enabled."
              )

-         return values
+         return self


  class WorkbookKey(ContainerKey):

datahub/ingestion/source/unity/azure_auth_config.py

@@ -0,0 +1,15 @@
+ from pydantic import Field, SecretStr
+
+ from datahub.configuration import ConfigModel
+
+
+ class AzureAuthConfig(ConfigModel):
+     client_secret: SecretStr = Field(
+         description="Azure application client secret used for authentication. This is a confidential credential that should be kept secure."
+     )
+     client_id: str = Field(
+         description="Azure application (client) ID. This is the unique identifier for the registered Azure AD application.",
+     )
+     tenant_id: str = Field(
+         description="Azure tenant (directory) ID. This identifies the Azure AD tenant where the application is registered.",
+     )
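Because `client_secret` is a pydantic `SecretStr`, the value is masked in reprs and logs and only exposed through `get_secret_value()`. A hypothetical instantiation (all IDs and the secret are placeholders):

    from datahub.ingestion.source.unity.azure_auth_config import AzureAuthConfig

    auth = AzureAuthConfig(
        client_id="00000000-0000-0000-0000-000000000000",   # placeholder application ID
        tenant_id="11111111-1111-1111-1111-111111111111",   # placeholder tenant ID
        client_secret="not-a-real-secret",                   # coerced to SecretStr
    )
    print(auth.client_secret)                     # **********  (masked)
    print(auth.client_secret.get_secret_value())  # not-a-real-secret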