acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -2,23 +2,25 @@ import logging
2
2
  import os
3
3
  from datetime import datetime, timedelta, timezone
4
4
  from typing import Any, Dict, List, Optional, Union
5
- from urllib.parse import urlparse
6
5
 
7
6
  import pydantic
8
7
  from pydantic import Field
9
8
  from typing_extensions import Literal
10
9
 
11
- from datahub.configuration.common import AllowDenyPattern, ConfigModel
10
+ from datahub.configuration.common import (
11
+ AllowDenyPattern,
12
+ ConfigEnum,
13
+ ConfigModel,
14
+ HiddenFromDocs,
15
+ )
12
16
  from datahub.configuration.source_common import (
13
17
  DatasetSourceConfigMixin,
14
18
  LowerCaseDatasetUrnConfigMixin,
15
19
  )
16
20
  from datahub.configuration.validate_field_removal import pydantic_removed_field
17
21
  from datahub.configuration.validate_field_rename import pydantic_renamed_field
18
- from datahub.ingestion.source.ge_data_profiler import DATABRICKS
19
22
  from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
20
23
  from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
21
- from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
22
24
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
23
25
  StatefulStaleMetadataRemovalConfig,
24
26
  )
@@ -26,6 +28,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
26
28
  StatefulIngestionConfigBase,
27
29
  StatefulProfilingConfigMixin,
28
30
  )
31
+ from datahub.ingestion.source.unity.connection import UnityCatalogConnectionConfig
29
32
  from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
30
33
  from datahub.ingestion.source_config.operation_config import (
31
34
  OperationConfig,
@@ -35,6 +38,22 @@ from datahub.utilities.global_warning_util import add_global_warning
35
38
 
36
39
  logger = logging.getLogger(__name__)
37
40
 
41
+ # Configuration default constants
42
+ INCLUDE_TAGS_DEFAULT = True
43
+ INCLUDE_HIVE_METASTORE_DEFAULT = True
44
+
45
+
46
+ class LineageDataSource(ConfigEnum):
47
+ AUTO = "AUTO"
48
+ SYSTEM_TABLES = "SYSTEM_TABLES"
49
+ API = "API"
50
+
51
+
52
+ class UsageDataSource(ConfigEnum):
53
+ AUTO = "AUTO"
54
+ SYSTEM_TABLES = "SYSTEM_TABLES"
55
+ API = "API"
56
+
38
57
 
39
58
  class UnityCatalogProfilerConfig(ConfigModel):
40
59
  method: str = Field(
@@ -118,6 +137,7 @@ class UnityCatalogGEProfilerConfig(UnityCatalogProfilerConfig, GEProfilingConfig
118
137
 
119
138
 
120
139
  class UnityCatalogSourceConfig(
140
+ UnityCatalogConnectionConfig,
121
141
  SQLCommonConfig,
122
142
  StatefulIngestionConfigBase,
123
143
  BaseUsageConfig,
@@ -125,23 +145,6 @@ class UnityCatalogSourceConfig(
125
145
  StatefulProfilingConfigMixin,
126
146
  LowerCaseDatasetUrnConfigMixin,
127
147
  ):
128
- token: str = pydantic.Field(description="Databricks personal access token")
129
- workspace_url: str = pydantic.Field(
130
- description="Databricks workspace url. e.g. https://my-workspace.cloud.databricks.com"
131
- )
132
- warehouse_id: Optional[str] = pydantic.Field(
133
- default=None,
134
- description="SQL Warehouse id, for running queries. If not set, will use the default warehouse.",
135
- )
136
- include_hive_metastore: bool = pydantic.Field(
137
- default=True,
138
- description="Whether to ingest legacy `hive_metastore` catalog. This requires executing queries on SQL warehouse.",
139
- )
140
- workspace_name: Optional[str] = pydantic.Field(
141
- default=None,
142
- description="Name of the workspace. Default to deployment name present in workspace_url",
143
- )
144
-
145
148
  include_metastore: bool = pydantic.Field(
146
149
  default=False,
147
150
  description=(
@@ -229,6 +232,15 @@ class UnityCatalogSourceConfig(
229
232
  description="Option to enable/disable ownership generation for metastores, catalogs, schemas, and tables.",
230
233
  )
231
234
 
235
+ include_tags: bool = pydantic.Field(
236
+ default=INCLUDE_TAGS_DEFAULT,
237
+ description=(
238
+ "Option to enable/disable column/table tag extraction. "
239
+ "Requires warehouse_id to be set since tag extraction needs to query system.information_schema.tags. "
240
+ "If warehouse_id is not provided, this will be automatically disabled to allow ingestion to continue."
241
+ ),
242
+ )
243
+
232
244
  _rename_table_ownership = pydantic_renamed_field(
233
245
  "include_table_ownership", "include_ownership"
234
246
  )
@@ -238,15 +250,40 @@ class UnityCatalogSourceConfig(
238
250
  description="Option to enable/disable lineage generation. Currently we have to call a rest call per column to get column level lineage due to the Databrick api which can slow down ingestion. ",
239
251
  )
240
252
 
253
+ lineage_data_source: LineageDataSource = pydantic.Field(
254
+ default=LineageDataSource.AUTO,
255
+ description=(
256
+ "Source for lineage data extraction. Options: "
257
+ f"'{LineageDataSource.AUTO.value}' - Use system tables when SQL warehouse is available, fallback to API; "
258
+ f"'{LineageDataSource.SYSTEM_TABLES.value}' - Force use of system.access.table_lineage and system.access.column_lineage tables (requires SQL warehouse); "
259
+ f"'{LineageDataSource.API.value}' - Force use of REST API endpoints for lineage data"
260
+ ),
261
+ )
262
+
263
+ ignore_start_time_lineage: bool = pydantic.Field(
264
+ default=False,
265
+ description="Option to ignore the start_time and retrieve all available lineage. When enabled, the start_time filter will be set to zero to extract all lineage events regardless of the configured time window.",
266
+ )
267
+
241
268
  column_lineage_column_limit: int = pydantic.Field(
242
269
  default=300,
243
270
  description="Limit the number of columns to get column level lineage. ",
244
271
  )
245
272
 
246
- lineage_max_workers: int = pydantic.Field(
273
+ lineage_max_workers: HiddenFromDocs[int] = pydantic.Field(
247
274
  default=5 * (os.cpu_count() or 4),
248
275
  description="Number of worker threads to use for column lineage thread pool executor. Set to 1 to disable.",
249
- hidden_from_docs=True,
276
+ )
277
+
278
+ databricks_api_page_size: int = pydantic.Field(
279
+ default=0,
280
+ ge=0,
281
+ description=(
282
+ "Page size for Databricks API calls when listing resources (catalogs, schemas, tables, etc.). "
283
+ "When set to 0 (default), uses server-side configured page length (recommended). "
284
+ "When set to a positive value, the page length is the minimum of this value and the server configured value. "
285
+ "Must be a non-negative integer."
286
+ ),
250
287
  )
251
288
 
252
289
  include_usage_statistics: bool = Field(
@@ -254,6 +291,17 @@ class UnityCatalogSourceConfig(
254
291
  description="Generate usage statistics.",
255
292
  )
256
293
 
294
+ usage_data_source: UsageDataSource = pydantic.Field(
295
+ default=UsageDataSource.AUTO,
296
+ description=(
297
+ "Source for usage/query history data extraction. Options: "
298
+ f"'{UsageDataSource.AUTO.value}' (default) - Automatically use system.query.history table when SQL warehouse is configured, otherwise fall back to REST API. "
299
+ "This provides better performance for multi-workspace setups and large query volumes when warehouse_id is set. "
300
+ f"'{UsageDataSource.SYSTEM_TABLES.value}' - Force use of system.query.history table (requires SQL warehouse and SELECT permission on system.query.history). "
301
+ f"'{UsageDataSource.API.value}' - Force use of REST API endpoints for query history (legacy method, may have limitations with multiple workspaces)."
302
+ ),
303
+ )
304
+
257
305
  # TODO: Remove `type:ignore` by refactoring config
258
306
  profiling: Union[
259
307
  UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig
@@ -273,19 +321,68 @@ class UnityCatalogSourceConfig(
273
321
  description="Details about the delta lake, incase to emit siblings",
274
322
  )
275
323
 
276
- scheme: str = DATABRICKS
324
+ include_ml_model_aliases: bool = pydantic.Field(
325
+ default=False,
326
+ description="Whether to include ML model aliases in the ingestion.",
327
+ )
328
+
329
+ ml_model_max_results: int = pydantic.Field(
330
+ default=1000,
331
+ ge=0,
332
+ description="Maximum number of ML models to ingest.",
333
+ )
334
+
335
+ _forced_disable_tag_extraction: bool = pydantic.PrivateAttr(default=False)
336
+ _forced_disable_hive_metastore_extraction = pydantic.PrivateAttr(default=False)
337
+
338
+ include_hive_metastore: bool = pydantic.Field(
339
+ default=INCLUDE_HIVE_METASTORE_DEFAULT,
340
+ description="Whether to ingest legacy `hive_metastore` catalog. This requires executing queries on SQL warehouse.",
341
+ )
277
342
 
278
- def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
279
- uri_opts = {"http_path": f"/sql/1.0/warehouses/{self.warehouse_id}"}
280
- if database:
281
- uri_opts["catalog"] = database
282
- return make_sqlalchemy_uri(
283
- scheme=self.scheme,
284
- username="token",
285
- password=self.token,
286
- at=urlparse(self.workspace_url).netloc,
287
- db=database,
288
- uri_opts=uri_opts,
343
+ workspace_name: Optional[str] = pydantic.Field(
344
+ default=None,
345
+ description="Name of the workspace. Default to deployment name present in workspace_url",
346
+ )
347
+
348
+ def __init__(self, **data):
349
+ # First, let the parent handle the root validators and field processing
350
+ super().__init__(**data)
351
+
352
+ # After model creation, check if we need to auto-disable features
353
+ # based on the final warehouse_id value (which may have been set by root validators)
354
+ include_tags_original = data.get("include_tags", INCLUDE_TAGS_DEFAULT)
355
+ include_hive_metastore_original = data.get(
356
+ "include_hive_metastore", INCLUDE_HIVE_METASTORE_DEFAULT
357
+ )
358
+
359
+ # Track what we're force-disabling
360
+ forced_disable_tag_extraction = False
361
+ forced_disable_hive_metastore_extraction = False
362
+
363
+ # Check if features should be auto-disabled based on final warehouse_id
364
+ if include_tags_original and not self.warehouse_id:
365
+ forced_disable_tag_extraction = True
366
+ self.include_tags = False # Modify the model attribute directly
367
+ logger.warning(
368
+ "warehouse_id is not set but include_tags=True. "
369
+ "Automatically disabling tag extraction since it requires SQL queries. "
370
+ "Set warehouse_id to enable tag extraction."
371
+ )
372
+
373
+ if include_hive_metastore_original and not self.warehouse_id:
374
+ forced_disable_hive_metastore_extraction = True
375
+ self.include_hive_metastore = False # Modify the model attribute directly
376
+ logger.warning(
377
+ "warehouse_id is not set but include_hive_metastore=True. "
378
+ "Automatically disabling hive metastore extraction since it requires SQL queries. "
379
+ "Set warehouse_id to enable hive metastore extraction."
380
+ )
381
+
382
+ # Set private attributes
383
+ self._forced_disable_tag_extraction = forced_disable_tag_extraction
384
+ self._forced_disable_hive_metastore_extraction = (
385
+ forced_disable_hive_metastore_extraction
289
386
  )
290
387
 
291
388
  def is_profiling_enabled(self) -> bool:
@@ -344,11 +441,6 @@ class UnityCatalogSourceConfig(
344
441
  "When `warehouse_id` is set, it must match the `warehouse_id` in `profiling`."
345
442
  )
346
443
 
347
- if values.get("include_hive_metastore") and not values.get("warehouse_id"):
348
- raise ValueError(
349
- "When `include_hive_metastore` is set, `warehouse_id` must be set."
350
- )
351
-
352
444
  if values.get("warehouse_id") and profiling and not profiling.warehouse_id:
353
445
  profiling.warehouse_id = values["warehouse_id"]
354
446
 
@@ -357,6 +449,34 @@ class UnityCatalogSourceConfig(
357
449
 
358
450
  return values
359
451
 
452
+ @pydantic.root_validator(skip_on_failure=True)
453
+ def validate_lineage_data_source_with_warehouse(
454
+ cls, values: Dict[str, Any]
455
+ ) -> Dict[str, Any]:
456
+ lineage_data_source = values.get("lineage_data_source", LineageDataSource.AUTO)
457
+ warehouse_id = values.get("warehouse_id")
458
+
459
+ if lineage_data_source == LineageDataSource.SYSTEM_TABLES and not warehouse_id:
460
+ raise ValueError(
461
+ f"lineage_data_source='{LineageDataSource.SYSTEM_TABLES.value}' requires warehouse_id to be set"
462
+ )
463
+
464
+ return values
465
+
466
+ @pydantic.root_validator(skip_on_failure=True)
467
+ def validate_usage_data_source_with_warehouse(
468
+ cls, values: Dict[str, Any]
469
+ ) -> Dict[str, Any]:
470
+ usage_data_source = values.get("usage_data_source", UsageDataSource.AUTO)
471
+ warehouse_id = values.get("warehouse_id")
472
+
473
+ if usage_data_source == UsageDataSource.SYSTEM_TABLES and not warehouse_id:
474
+ raise ValueError(
475
+ f"usage_data_source='{UsageDataSource.SYSTEM_TABLES.value}' requires warehouse_id to be set"
476
+ )
477
+
478
+ return values
479
+
360
480
  @pydantic.validator("schema_pattern", always=True)
361
481
  def schema_pattern_should__always_deny_information_schema(
362
482
  cls, v: AllowDenyPattern
@@ -0,0 +1,61 @@
1
+ """Databricks Unity Catalog connection configuration."""
2
+
3
+ from typing import Any, Dict, Optional
4
+ from urllib.parse import urlparse
5
+
6
+ import pydantic
7
+ from pydantic import Field
8
+
9
+ from datahub.configuration.common import ConfigModel
10
+ from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
11
+
12
+ DATABRICKS = "databricks"
13
+
14
+
15
+ class UnityCatalogConnectionConfig(ConfigModel):
16
+ """
17
+ Configuration for connecting to Databricks Unity Catalog.
18
+ Contains only connection-related fields that can be reused across different sources.
19
+ """
20
+
21
+ scheme: str = DATABRICKS
22
+ token: str = pydantic.Field(description="Databricks personal access token")
23
+ workspace_url: str = pydantic.Field(
24
+ description="Databricks workspace url. e.g. https://my-workspace.cloud.databricks.com"
25
+ )
26
+ warehouse_id: Optional[str] = pydantic.Field(
27
+ default=None,
28
+ description=(
29
+ "SQL Warehouse id, for running queries. Must be explicitly provided to enable SQL-based features. "
30
+ "Required for the following features that need SQL access: "
31
+ "1) Tag extraction (include_tags=True) - queries system.information_schema.tags "
32
+ "2) Hive Metastore catalog (include_hive_metastore=True) - queries legacy hive_metastore catalog "
33
+ "3) System table lineage (lineage_data_source=SYSTEM_TABLES) - queries system.access.table_lineage/column_lineage "
34
+ "4) Data profiling (profiling.enabled=True) - runs SELECT/ANALYZE queries on tables. "
35
+ "When warehouse_id is missing, these features will be automatically disabled (with warnings) to allow ingestion to continue."
36
+ ),
37
+ )
38
+
39
+ extra_client_options: Dict[str, Any] = Field(
40
+ default={},
41
+ description="Additional options to pass to Databricks SQLAlchemy client.",
42
+ )
43
+
44
+ def __init__(self, **data: Any):
45
+ super().__init__(**data)
46
+
47
+ def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
48
+ uri_opts = {"http_path": f"/sql/1.0/warehouses/{self.warehouse_id}"}
49
+ if database:
50
+ uri_opts["catalog"] = database
51
+ return make_sqlalchemy_uri(
52
+ scheme=self.scheme,
53
+ username="token",
54
+ password=self.token,
55
+ at=urlparse(self.workspace_url).netloc,
56
+ db=database,
57
+ uri_opts=uri_opts,
58
+ )
59
+
60
+ def get_options(self) -> dict:
61
+ return self.extra_client_options
@@ -19,6 +19,7 @@ class UnityCatalogConnectionTest:
19
19
  self.config.token,
20
20
  self.config.profiling.warehouse_id,
21
21
  report=self.report,
22
+ databricks_api_page_size=self.config.databricks_api_page_size,
22
23
  )
23
24
 
24
25
  def get_connection_test(self) -> TestConnectionReport:
@@ -0,0 +1,19 @@
1
+ import logging
2
+
3
+ from datahub.api.entities.external.external_entities import (
4
+ PlatformResourceRepository,
5
+ )
6
+ from datahub.ingestion.source.unity.tag_entities import (
7
+ UnityCatalogTagPlatformResource,
8
+ UnityCatalogTagPlatformResourceId,
9
+ )
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ class UnityCatalogPlatformResourceRepository(
15
+ PlatformResourceRepository[
16
+ UnityCatalogTagPlatformResourceId, UnityCatalogTagPlatformResource
17
+ ]
18
+ ):
19
+ """Unity Catalog-specific platform resource repository with tag-related operations."""