acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,5 @@
1
1
  from dataclasses import dataclass
2
+ from typing import Optional
2
3
 
3
4
  from tableauserverclient import Server, UserItem
4
5
 
@@ -10,6 +11,7 @@ class UserInfo:
10
11
  user_name: str
11
12
  site_role: str
12
13
  site_id: str
14
+ email: Optional[str] = None
13
15
 
14
16
  def has_site_administrator_explorer_privileges(self):
15
17
  return self.site_role in [
@@ -34,4 +36,5 @@ class UserInfo:
34
36
  user_name=user.name,
35
37
  site_role=user.site_role,
36
38
  site_id=server.site_id,
39
+ email=user.email,
37
40
  )
@@ -24,7 +24,7 @@ def check_user_role(
24
24
  mitigation_message_prefix: str = (
25
25
  "Assign `Site Administrator Explorer` role to the user"
26
26
  )
27
- mitigation_message_suffix: str = "Refer to the setup guide: https://datahubproject.io/docs/quick-ingestion-guides/tableau/setup"
27
+ mitigation_message_suffix: str = "Refer to the setup guide: https://docs.datahub.com/docs/quick-ingestion-guides/tableau/setup"
28
28
 
29
29
  try:
30
30
  # TODO: Add check for `Enable Derived Permissions`
@@ -2,22 +2,25 @@ import logging
2
2
  import os
3
3
  from datetime import datetime, timedelta, timezone
4
4
  from typing import Any, Dict, List, Optional, Union
5
- from urllib.parse import urlparse
6
5
 
7
6
  import pydantic
8
7
  from pydantic import Field
9
8
  from typing_extensions import Literal
10
9
 
11
- from datahub.configuration.common import AllowDenyPattern, ConfigModel
10
+ from datahub.configuration.common import (
11
+ AllowDenyPattern,
12
+ ConfigEnum,
13
+ ConfigModel,
14
+ HiddenFromDocs,
15
+ )
12
16
  from datahub.configuration.source_common import (
13
17
  DatasetSourceConfigMixin,
14
18
  LowerCaseDatasetUrnConfigMixin,
15
19
  )
16
20
  from datahub.configuration.validate_field_removal import pydantic_removed_field
17
21
  from datahub.configuration.validate_field_rename import pydantic_renamed_field
18
- from datahub.ingestion.source.ge_data_profiler import DATABRICKS
19
22
  from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
20
- from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
23
+ from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
21
24
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
22
25
  StatefulStaleMetadataRemovalConfig,
23
26
  )
@@ -25,6 +28,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
25
28
  StatefulIngestionConfigBase,
26
29
  StatefulProfilingConfigMixin,
27
30
  )
31
+ from datahub.ingestion.source.unity.connection import UnityCatalogConnectionConfig
28
32
  from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
29
33
  from datahub.ingestion.source_config.operation_config import (
30
34
  OperationConfig,
@@ -34,6 +38,22 @@ from datahub.utilities.global_warning_util import add_global_warning
34
38
 
35
39
  logger = logging.getLogger(__name__)
36
40
 
41
# Configuration default constants
# Each default is kept in one place because it is used twice: as the declared
# default of the matching `include_tags` / `include_hive_metastore` field and as
# the fallback value looked up in `UnityCatalogSourceConfig.__init__`.
INCLUDE_TAGS_DEFAULT = True
INCLUDE_HIVE_METASTORE_DEFAULT = True
44
+
45
+
46
# Allowed values for the `lineage_data_source` config option.
class LineageDataSource(ConfigEnum):
    # Use system tables when a SQL warehouse is available, otherwise fall back to the API.
    AUTO = "AUTO"
    # Force the system.access.table_lineage / system.access.column_lineage tables
    # (requires a SQL warehouse).
    SYSTEM_TABLES = "SYSTEM_TABLES"
    # Force the REST API endpoints for lineage data.
    API = "API"
50
+
51
+
52
# Allowed values for the `usage_data_source` config option.
class UsageDataSource(ConfigEnum):
    # Use the system.query.history table when a SQL warehouse is configured,
    # otherwise fall back to the REST API.
    AUTO = "AUTO"
    # Force the system.query.history table (requires a SQL warehouse and SELECT
    # permission on system.query.history).
    SYSTEM_TABLES = "SYSTEM_TABLES"
    # Force the REST API endpoints for query history.
    API = "API"
56
+
37
57
 
38
58
  class UnityCatalogProfilerConfig(ConfigModel):
39
59
  method: str = Field(
@@ -117,6 +137,7 @@ class UnityCatalogGEProfilerConfig(UnityCatalogProfilerConfig, GEProfilingConfig
117
137
 
118
138
 
119
139
  class UnityCatalogSourceConfig(
140
+ UnityCatalogConnectionConfig,
120
141
  SQLCommonConfig,
121
142
  StatefulIngestionConfigBase,
122
143
  BaseUsageConfig,
@@ -124,23 +145,6 @@ class UnityCatalogSourceConfig(
124
145
  StatefulProfilingConfigMixin,
125
146
  LowerCaseDatasetUrnConfigMixin,
126
147
  ):
127
- token: str = pydantic.Field(description="Databricks personal access token")
128
- workspace_url: str = pydantic.Field(
129
- description="Databricks workspace url. e.g. https://my-workspace.cloud.databricks.com"
130
- )
131
- warehouse_id: Optional[str] = pydantic.Field(
132
- default=None,
133
- description="SQL Warehouse id, for running queries. If not set, will use the default warehouse.",
134
- )
135
- include_hive_metastore: bool = pydantic.Field(
136
- default=True,
137
- description="Whether to ingest legacy `hive_metastore` catalog. This requires executing queries on SQL warehouse.",
138
- )
139
- workspace_name: Optional[str] = pydantic.Field(
140
- default=None,
141
- description="Name of the workspace. Default to deployment name present in workspace_url",
142
- )
143
-
144
148
  include_metastore: bool = pydantic.Field(
145
149
  default=False,
146
150
  description=(
@@ -228,6 +232,15 @@ class UnityCatalogSourceConfig(
228
232
  description="Option to enable/disable ownership generation for metastores, catalogs, schemas, and tables.",
229
233
  )
230
234
 
235
+ include_tags: bool = pydantic.Field(
236
+ default=INCLUDE_TAGS_DEFAULT,
237
+ description=(
238
+ "Option to enable/disable column/table tag extraction. "
239
+ "Requires warehouse_id to be set since tag extraction needs to query system.information_schema.tags. "
240
+ "If warehouse_id is not provided, this will be automatically disabled to allow ingestion to continue."
241
+ ),
242
+ )
243
+
231
244
  _rename_table_ownership = pydantic_renamed_field(
232
245
  "include_table_ownership", "include_ownership"
233
246
  )
@@ -237,15 +250,40 @@ class UnityCatalogSourceConfig(
237
250
  description="Option to enable/disable lineage generation. Currently we have to call a rest call per column to get column level lineage due to the Databrick api which can slow down ingestion. ",
238
251
  )
239
252
 
253
+ lineage_data_source: LineageDataSource = pydantic.Field(
254
+ default=LineageDataSource.AUTO,
255
+ description=(
256
+ "Source for lineage data extraction. Options: "
257
+ f"'{LineageDataSource.AUTO.value}' - Use system tables when SQL warehouse is available, fallback to API; "
258
+ f"'{LineageDataSource.SYSTEM_TABLES.value}' - Force use of system.access.table_lineage and system.access.column_lineage tables (requires SQL warehouse); "
259
+ f"'{LineageDataSource.API.value}' - Force use of REST API endpoints for lineage data"
260
+ ),
261
+ )
262
+
263
+ ignore_start_time_lineage: bool = pydantic.Field(
264
+ default=False,
265
+ description="Option to ignore the start_time and retrieve all available lineage. When enabled, the start_time filter will be set to zero to extract all lineage events regardless of the configured time window.",
266
+ )
267
+
240
268
  column_lineage_column_limit: int = pydantic.Field(
241
269
  default=300,
242
270
  description="Limit the number of columns to get column level lineage. ",
243
271
  )
244
272
 
245
- lineage_max_workers: int = pydantic.Field(
273
+ lineage_max_workers: HiddenFromDocs[int] = pydantic.Field(
246
274
  default=5 * (os.cpu_count() or 4),
247
275
  description="Number of worker threads to use for column lineage thread pool executor. Set to 1 to disable.",
248
- hidden_from_docs=True,
276
+ )
277
+
278
+ databricks_api_page_size: int = pydantic.Field(
279
+ default=0,
280
+ ge=0,
281
+ description=(
282
+ "Page size for Databricks API calls when listing resources (catalogs, schemas, tables, etc.). "
283
+ "When set to 0 (default), uses server-side configured page length (recommended). "
284
+ "When set to a positive value, the page length is the minimum of this value and the server configured value. "
285
+ "Must be a non-negative integer."
286
+ ),
249
287
  )
250
288
 
251
289
  include_usage_statistics: bool = Field(
@@ -253,6 +291,17 @@ class UnityCatalogSourceConfig(
253
291
  description="Generate usage statistics.",
254
292
  )
255
293
 
294
+ usage_data_source: UsageDataSource = pydantic.Field(
295
+ default=UsageDataSource.AUTO,
296
+ description=(
297
+ "Source for usage/query history data extraction. Options: "
298
+ f"'{UsageDataSource.AUTO.value}' (default) - Automatically use system.query.history table when SQL warehouse is configured, otherwise fall back to REST API. "
299
+ "This provides better performance for multi-workspace setups and large query volumes when warehouse_id is set. "
300
+ f"'{UsageDataSource.SYSTEM_TABLES.value}' - Force use of system.query.history table (requires SQL warehouse and SELECT permission on system.query.history). "
301
+ f"'{UsageDataSource.API.value}' - Force use of REST API endpoints for query history (legacy method, may have limitations with multiple workspaces)."
302
+ ),
303
+ )
304
+
256
305
  # TODO: Remove `type:ignore` by refactoring config
257
306
  profiling: Union[
258
307
  UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig
@@ -272,19 +321,68 @@ class UnityCatalogSourceConfig(
272
321
  description="Details about the delta lake, incase to emit siblings",
273
322
  )
274
323
 
275
- scheme: str = DATABRICKS
324
+ include_ml_model_aliases: bool = pydantic.Field(
325
+ default=False,
326
+ description="Whether to include ML model aliases in the ingestion.",
327
+ )
328
+
329
+ ml_model_max_results: int = pydantic.Field(
330
+ default=1000,
331
+ ge=0,
332
+ description="Maximum number of ML models to ingest.",
333
+ )
334
+
335
+ _forced_disable_tag_extraction: bool = pydantic.PrivateAttr(default=False)
336
+ _forced_disable_hive_metastore_extraction = pydantic.PrivateAttr(default=False)
337
+
338
+ include_hive_metastore: bool = pydantic.Field(
339
+ default=INCLUDE_HIVE_METASTORE_DEFAULT,
340
+ description="Whether to ingest legacy `hive_metastore` catalog. This requires executing queries on SQL warehouse.",
341
+ )
276
342
 
277
- def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
278
- uri_opts = {"http_path": f"/sql/1.0/warehouses/{self.warehouse_id}"}
279
- if database:
280
- uri_opts["catalog"] = database
281
- return make_sqlalchemy_uri(
282
- scheme=self.scheme,
283
- username="token",
284
- password=self.token,
285
- at=urlparse(self.workspace_url).netloc,
286
- db=database,
287
- uri_opts=uri_opts,
343
+ workspace_name: Optional[str] = pydantic.Field(
344
+ default=None,
345
+ description="Name of the workspace. Default to deployment name present in workspace_url",
346
+ )
347
+
348
def __init__(self, **data):
    """Build the config, then switch off features that need a SQL warehouse.

    Tag extraction and hive metastore ingestion are disabled (with a warning)
    when no ``warehouse_id`` is configured. The private ``_forced_disable_*``
    attributes record which of the two were disabled here.

    Args:
        **data: Raw configuration values, forwarded unchanged to the parent
            initializer for validation.
    """
    # Let the parent run field processing and root validators first, so the
    # final warehouse_id and the validated boolean flags are available below.
    super().__init__(**data)

    # Decide from the validated attributes, not from the raw input: a raw
    # value that is truthy in Python but coerced to False during validation
    # (for example the string "false") must not be reported as an enabled
    # feature that had to be force-disabled.
    warehouse_missing = not self.warehouse_id
    forced_disable_tag_extraction = bool(self.include_tags) and warehouse_missing
    forced_disable_hive_metastore_extraction = (
        bool(self.include_hive_metastore) and warehouse_missing
    )

    if forced_disable_tag_extraction:
        self.include_tags = False  # Modify the model attribute directly
        logger.warning(
            "warehouse_id is not set but include_tags=True. "
            "Automatically disabling tag extraction since it requires SQL queries. "
            "Set warehouse_id to enable tag extraction."
        )

    if forced_disable_hive_metastore_extraction:
        self.include_hive_metastore = False  # Modify the model attribute directly
        logger.warning(
            "warehouse_id is not set but include_hive_metastore=True. "
            "Automatically disabling hive metastore extraction since it requires SQL queries. "
            "Set warehouse_id to enable hive metastore extraction."
        )

    # Remember what was force-disabled so it can be told apart from a feature
    # the user switched off explicitly.
    self._forced_disable_tag_extraction = forced_disable_tag_extraction
    self._forced_disable_hive_metastore_extraction = (
        forced_disable_hive_metastore_extraction
    )
289
387
 
290
388
  def is_profiling_enabled(self) -> bool:
@@ -343,11 +441,6 @@ class UnityCatalogSourceConfig(
343
441
  "When `warehouse_id` is set, it must match the `warehouse_id` in `profiling`."
344
442
  )
345
443
 
346
- if values.get("include_hive_metastore") and not values.get("warehouse_id"):
347
- raise ValueError(
348
- "When `include_hive_metastore` is set, `warehouse_id` must be set."
349
- )
350
-
351
444
  if values.get("warehouse_id") and profiling and not profiling.warehouse_id:
352
445
  profiling.warehouse_id = values["warehouse_id"]
353
446
 
@@ -356,6 +449,34 @@ class UnityCatalogSourceConfig(
356
449
 
357
450
  return values
358
451
 
452
@pydantic.root_validator(skip_on_failure=True)
def validate_lineage_data_source_with_warehouse(
    cls, values: Dict[str, Any]
) -> Dict[str, Any]:
    """Reject SYSTEM_TABLES lineage when no SQL warehouse is configured.

    Args:
        values: Field values of the model being validated.

    Returns:
        The same ``values`` mapping, unmodified.

    Raises:
        ValueError: If ``lineage_data_source`` is SYSTEM_TABLES and
            ``warehouse_id`` is missing or empty.
    """
    selected_source = values.get("lineage_data_source", LineageDataSource.AUTO)
    needs_warehouse = selected_source == LineageDataSource.SYSTEM_TABLES

    if needs_warehouse and not values.get("warehouse_id"):
        raise ValueError(
            f"lineage_data_source='{LineageDataSource.SYSTEM_TABLES.value}' requires warehouse_id to be set"
        )

    return values
465
+
466
@pydantic.root_validator(skip_on_failure=True)
def validate_usage_data_source_with_warehouse(
    cls, values: Dict[str, Any]
) -> Dict[str, Any]:
    """Reject SYSTEM_TABLES usage extraction when no SQL warehouse is configured.

    Args:
        values: Field values of the model being validated.

    Returns:
        The same ``values`` mapping, unmodified.

    Raises:
        ValueError: If ``usage_data_source`` is SYSTEM_TABLES and
            ``warehouse_id`` is missing or empty.
    """
    selected_source = values.get("usage_data_source", UsageDataSource.AUTO)
    needs_warehouse = selected_source == UsageDataSource.SYSTEM_TABLES

    if needs_warehouse and not values.get("warehouse_id"):
        raise ValueError(
            f"usage_data_source='{UsageDataSource.SYSTEM_TABLES.value}' requires warehouse_id to be set"
        )

    return values
479
+
359
480
  @pydantic.validator("schema_pattern", always=True)
360
481
  def schema_pattern_should__always_deny_information_schema(
361
482
  cls, v: AllowDenyPattern
@@ -0,0 +1,61 @@
1
+ """Databricks Unity Catalog connection configuration."""
2
+
3
+ from typing import Any, Dict, Optional
4
+ from urllib.parse import urlparse
5
+
6
+ import pydantic
7
+ from pydantic import Field
8
+
9
+ from datahub.configuration.common import ConfigModel
10
+ from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
11
+
12
# Scheme name used as the default value of `UnityCatalogConnectionConfig.scheme`.
DATABRICKS = "databricks"
13
+
14
+
15
class UnityCatalogConnectionConfig(ConfigModel):
    """
    Configuration for connecting to Databricks Unity Catalog.
    Contains only connection-related fields that can be reused across different sources.
    """

    # Scheme passed to make_sqlalchemy_uri when building the connection URL.
    scheme: str = DATABRICKS
    token: str = Field(description="Databricks personal access token")
    workspace_url: str = Field(
        description="Databricks workspace url. e.g. https://my-workspace.cloud.databricks.com"
    )
    warehouse_id: Optional[str] = Field(
        default=None,
        description=(
            "SQL Warehouse id, for running queries. Must be explicitly provided to enable SQL-based features. "
            "Required for the following features that need SQL access: "
            "1) Tag extraction (include_tags=True) - queries system.information_schema.tags "
            "2) Hive Metastore catalog (include_hive_metastore=True) - queries legacy hive_metastore catalog "
            "3) System table lineage (lineage_data_source=SYSTEM_TABLES) - queries system.access.table_lineage/column_lineage "
            "4) Data profiling (profiling.enabled=True) - runs SELECT/ANALYZE queries on tables. "
            "When warehouse_id is missing, these features will be automatically disabled (with warnings) to allow ingestion to continue."
        ),
    )

    extra_client_options: Dict[str, Any] = Field(
        default={},
        description="Additional options to pass to Databricks SQLAlchemy client.",
    )

    def __init__(self, **data: Any):
        # Pure pass-through to the parent initializer; no extra processing here.
        super().__init__(**data)

    def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
        """Return the SQLAlchemy URL for the configured workspace and warehouse.

        Args:
            database: Optional name; when truthy it is sent both as the ``db``
                argument and as the ``catalog`` URI option.

        Returns:
            The URL string produced by ``make_sqlalchemy_uri``.
        """
        # NOTE(review): warehouse_id is Optional, so an unset value is
        # interpolated into the path as the text "None" -- confirm that
        # callers only reach this method when a warehouse is configured.
        http_path = f"/sql/1.0/warehouses/{self.warehouse_id}"
        if database:
            uri_opts = {"http_path": http_path, "catalog": database}
        else:
            uri_opts = {"http_path": http_path}

        # Only the host[:port] part of the workspace URL goes into the URI.
        workspace_host = urlparse(self.workspace_url).netloc
        return make_sqlalchemy_uri(
            scheme=self.scheme,
            username="token",
            password=self.token,
            at=workspace_host,
            db=database,
            uri_opts=uri_opts,
        )

    def get_options(self) -> dict:
        """Return the configured ``extra_client_options`` mapping as-is."""
        return self.extra_client_options
@@ -19,6 +19,7 @@ class UnityCatalogConnectionTest:
19
19
  self.config.token,
20
20
  self.config.profiling.warehouse_id,
21
21
  report=self.report,
22
+ databricks_api_page_size=self.config.databricks_api_page_size,
22
23
  )
23
24
 
24
25
  def get_connection_test(self) -> TestConnectionReport:
@@ -0,0 +1,19 @@
1
+ import logging
2
+
3
+ from datahub.api.entities.external.external_entities import (
4
+ PlatformResourceRepository,
5
+ )
6
+ from datahub.ingestion.source.unity.tag_entities import (
7
+ UnityCatalogTagPlatformResource,
8
+ UnityCatalogTagPlatformResourceId,
9
+ )
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class UnityCatalogPlatformResourceRepository(
    PlatformResourceRepository[
        UnityCatalogTagPlatformResourceId, UnityCatalogTagPlatformResource
    ]
):
    """Unity Catalog-specific platform resource repository with tag-related operations."""

    # The class body defines no members of its own: it only binds the generic
    # PlatformResourceRepository to the Unity Catalog tag id and resource types.