acryl-datahub 1.0.0rc18 (py3-none-any.whl) → 1.3.0.1rc9 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,6 @@
1
1
  import logging
2
- import re
3
- from collections import defaultdict
4
2
  from concurrent.futures import ThreadPoolExecutor, as_completed
3
+ from dataclasses import dataclass
5
4
  from typing import Dict, Iterable, List, Optional
6
5
 
7
6
  from datahub.emitter.mce_builder import (
@@ -23,12 +22,16 @@ from datahub.ingestion.api.source import (
23
22
  SourceReport,
24
23
  )
25
24
  from datahub.ingestion.api.workunit import MetadataWorkUnit
25
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
26
26
  from datahub.ingestion.source.dremio.dremio_api import (
27
27
  DremioAPIOperations,
28
28
  DremioEdition,
29
29
  )
30
30
  from datahub.ingestion.source.dremio.dremio_aspects import DremioAspects
31
- from datahub.ingestion.source.dremio.dremio_config import DremioSourceConfig
31
+ from datahub.ingestion.source.dremio.dremio_config import (
32
+ DremioSourceConfig,
33
+ DremioSourceMapping,
34
+ )
32
35
  from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
33
36
  DremioToDataHubSourceTypeMapping,
34
37
  )
@@ -39,6 +42,7 @@ from datahub.ingestion.source.dremio.dremio_entities import (
39
42
  DremioDatasetType,
40
43
  DremioGlossaryTerm,
41
44
  DremioQuery,
45
+ DremioSourceContainer,
42
46
  )
43
47
  from datahub.ingestion.source.dremio.dremio_profiling import DremioProfiler
44
48
  from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
@@ -48,13 +52,17 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
48
52
  from datahub.ingestion.source.state.stateful_ingestion_base import (
49
53
  StatefulIngestionSourceBase,
50
54
  )
51
- from datahub.ingestion.source_report.ingestion_stage import PROFILING
55
+ from datahub.ingestion.source_report.ingestion_stage import (
56
+ LINEAGE_EXTRACTION,
57
+ METADATA_EXTRACTION,
58
+ IngestionHighStage,
59
+ )
52
60
  from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
53
61
  DatasetLineageTypeClass,
54
62
  UpstreamClass,
55
63
  UpstreamLineage,
56
64
  )
57
- from datahub.metadata.schema_classes import ChangeTypeClass, SchemaMetadataClass
65
+ from datahub.metadata.schema_classes import SchemaMetadataClass
58
66
  from datahub.metadata.urns import CorpUserUrn
59
67
  from datahub.sql_parsing.sql_parsing_aggregator import (
60
68
  KnownQueryLineageInfo,
@@ -65,16 +73,48 @@ from datahub.sql_parsing.sql_parsing_aggregator import (
65
73
  logger = logging.getLogger(__name__)
66
74
 
67
75
 
76
+ @dataclass
77
+ class DremioSourceMapEntry:
78
+ platform: str
79
+ source_name: str
80
+ dremio_source_category: str
81
+ root_path: str = ""
82
+ database_name: str = ""
83
+ platform_instance: Optional[str] = None
84
+ env: Optional[str] = None
85
+
86
+
68
87
  @platform_name("Dremio")
69
88
  @config_class(DremioSourceConfig)
70
89
  @support_status(SupportStatus.CERTIFIED)
71
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
90
+ @capability(
91
+ SourceCapability.CONTAINERS,
92
+ "Enabled by default",
93
+ subtype_modifier=[
94
+ SourceCapabilityModifier.DREMIO_SPACE,
95
+ SourceCapabilityModifier.DREMIO_SOURCE,
96
+ ],
97
+ )
72
98
  @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
73
99
  @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
74
100
  @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
75
- @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
101
+ @capability(
102
+ SourceCapability.LINEAGE_COARSE,
103
+ "Enabled by default",
104
+ subtype_modifier=[
105
+ SourceCapabilityModifier.TABLE,
106
+ ],
107
+ )
108
+ @capability(
109
+ SourceCapability.LINEAGE_FINE,
110
+ "Extract column-level lineage",
111
+ subtype_modifier=[
112
+ SourceCapabilityModifier.TABLE,
113
+ ],
114
+ )
76
115
  @capability(SourceCapability.OWNERSHIP, "Enabled by default")
77
116
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
117
+ @capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
78
118
  class DremioSource(StatefulIngestionSourceBase):
79
119
  """
80
120
  This plugin integrates with Dremio to extract and ingest metadata into DataHub.
@@ -112,7 +152,14 @@ class DremioSource(StatefulIngestionSourceBase):
112
152
  self.default_db = "dremio"
113
153
  self.config = config
114
154
  self.report = DremioSourceReport()
115
- self.source_map: Dict[str, Dict] = defaultdict()
155
+
156
+ # Set time window for query lineage extraction
157
+ self.report.window_start_time, self.report.window_end_time = (
158
+ self.config.start_time,
159
+ self.config.end_time,
160
+ )
161
+
162
+ self.source_map: Dict[str, DremioSourceMapEntry] = dict()
116
163
 
117
164
  # Initialize API operations
118
165
  dremio_api = DremioAPIOperations(self.config, self.report)
@@ -140,6 +187,7 @@ class DremioSource(StatefulIngestionSourceBase):
140
187
  generate_operations=True,
141
188
  usage_config=self.config.usage,
142
189
  )
190
+ self.report.sql_aggregator = self.sql_parsing_aggregator.report
143
191
 
144
192
  # For profiling
145
193
  self.profiler = DremioProfiler(config, self.report, dremio_api)
@@ -152,111 +200,12 @@ class DremioSource(StatefulIngestionSourceBase):
152
200
  def get_platform(self) -> str:
153
201
  return "dremio"
154
202
 
155
- def _build_source_map(self) -> Dict[str, Dict]:
156
- """
157
- Builds a source mapping dictionary to support external lineage generation across
158
- multiple Dremio sources, based on provided configuration mappings.
159
-
160
- This method operates as follows:
161
-
162
- 1. If a source mapping is present in the config:
163
- - For each source in the Dremio catalog, if the mapping's `source_name` matches
164
- the `dremio_source_type`, `root_path` and `database_name` are added to the mapping
165
- information, along with the platform, platform instance, and environment if they exist.
166
- This allows constructing the full URN for upstream lineage.
167
-
168
- 2. If a source mapping is absent in the configuration:
169
- - Default mappings are created for each source name, setting `env` and `platform_instance`
170
- to default values and classifying the source type. This ensures all sources have a
171
- mapping, even if specific configuration details are missing.
172
-
173
- Returns:
174
- Dict[str, Dict]: A dictionary (`source_map`) where each key is a source name
175
- (lowercased) and each value is another dictionary containing:
176
- - `platform`: The source platform.
177
- - `source_name`: The source name.
178
- - `dremio_source_type`: The type mapped to DataHub,
179
- e.g., "database", "folder".
180
- - Optional `root_path`, `database_name`, `platform_instance`,
181
- and `env` if provided in the configuration.
182
- Example:
183
- This method is used internally within the class to generate mappings before
184
- creating cross-platform lineage.
185
-
186
- """
187
-
188
- source_map = {}
203
+ def _build_source_map(self) -> Dict[str, DremioSourceMapEntry]:
189
204
  dremio_sources = self.dremio_catalog.get_sources()
205
+ source_mappings_config = self.config.source_mappings or []
190
206
 
191
- for source in dremio_sources:
192
- source_name = source.container_name
193
- if isinstance(source.dremio_source_type, str):
194
- source_type = source.dremio_source_type.lower()
195
- root_path = source.root_path.lower() if source.root_path else ""
196
- database_name = (
197
- source.database_name.lower() if source.database_name else ""
198
- )
199
- source_present = False
200
- source_platform_name = source_name
201
-
202
- for mapping in self.config.source_mappings or []:
203
- if re.search(mapping.source_name, source_type, re.IGNORECASE):
204
- source_platform_name = mapping.source_name.lower()
205
-
206
- datahub_source_type = (
207
- DremioToDataHubSourceTypeMapping.get_datahub_source_type(
208
- source_type
209
- )
210
- )
211
-
212
- if re.search(mapping.platform, datahub_source_type, re.IGNORECASE):
213
- source_platform_name = source_platform_name.lower()
214
- source_map[source_platform_name] = {
215
- "platform": mapping.platform,
216
- "source_name": mapping.source_name,
217
- "dremio_source_type": DremioToDataHubSourceTypeMapping.get_category(
218
- source_type,
219
- ),
220
- "root_path": root_path,
221
- "database_name": database_name,
222
- "platform_instance": mapping.platform_instance,
223
- "env": mapping.env,
224
- }
225
- source_present = True
226
- break
227
-
228
- if not source_present:
229
- try:
230
- dremio_source_type = (
231
- DremioToDataHubSourceTypeMapping.get_category(source_type)
232
- )
233
- except Exception as exc:
234
- logger.info(
235
- f"Source {source_type} is not a standard Dremio source type. "
236
- f"Adding source_type {source_type} to mapping as database. Error: {exc}"
237
- )
238
-
239
- DremioToDataHubSourceTypeMapping.add_mapping(
240
- source_type, source_name
241
- )
242
- dremio_source_type = (
243
- DremioToDataHubSourceTypeMapping.get_category(source_type)
244
- )
245
-
246
- source_map[source_platform_name.lower()] = {
247
- "platform": source_type,
248
- "source_name": source_name,
249
- "dremio_source_type": dremio_source_type,
250
- }
251
-
252
- else:
253
- logger.error(
254
- f'Source "{source.container_name}" is broken. Containers will not be created for source.'
255
- )
256
- logger.error(
257
- f'No new cross-platform lineage will be emitted for source "{source.container_name}".'
258
- )
259
- logger.error("Fix this source in Dremio to fix this issue.")
207
+ source_map = build_dremio_source_map(dremio_sources, source_mappings_config)
208
+ logger.info(f"Full source map: {source_map}")
260
209
 
261
210
  return source_map
262
211
 
@@ -275,84 +224,88 @@ class DremioSource(StatefulIngestionSourceBase):
275
224
 
276
225
  self.source_map = self._build_source_map()
277
226
 
278
- # Process Containers
279
- containers = self.dremio_catalog.get_containers()
280
- for container in containers:
281
- try:
282
- yield from self.process_container(container)
283
- logger.info(
284
- f"Dremio container {container.container_name} emitted successfully"
285
- )
286
- except Exception as exc:
287
- self.report.num_containers_failed += 1 # Increment failed containers
288
- self.report.report_failure(
289
- message="Failed to process Dremio container",
290
- context=f"{'.'.join(container.path)}.{container.container_name}",
291
- exc=exc,
292
- )
227
+ with self.report.new_stage(METADATA_EXTRACTION):
228
+ # Process Containers
229
+ containers = self.dremio_catalog.get_containers()
230
+ for container in containers:
231
+ try:
232
+ yield from self.process_container(container)
233
+ logger.info(
234
+ f"Dremio container {container.container_name} emitted successfully"
235
+ )
236
+ except Exception as exc:
237
+ self.report.num_containers_failed += 1
238
+ self.report.report_failure(
239
+ message="Failed to process Dremio container",
240
+ context=f"{'.'.join(container.path)}.{container.container_name}",
241
+ exc=exc,
242
+ )
293
243
 
294
- # Process Datasets
295
- datasets = self.dremio_catalog.get_datasets()
244
+ # Process Datasets
245
+ datasets = self.dremio_catalog.get_datasets()
296
246
 
297
- for dataset_info in datasets:
298
- try:
299
- yield from self.process_dataset(dataset_info)
300
- logger.info(
301
- f"Dremio dataset {'.'.join(dataset_info.path)}.{dataset_info.resource_name} emitted successfully"
302
- )
303
- except Exception as exc:
304
- self.report.num_datasets_failed += 1 # Increment failed datasets
305
- self.report.report_failure(
306
- message="Failed to process Dremio dataset",
307
- context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
308
- exc=exc,
309
- )
247
+ for dataset_info in datasets:
248
+ try:
249
+ yield from self.process_dataset(dataset_info)
250
+ logger.info(
251
+ f"Dremio dataset {'.'.join(dataset_info.path)}.{dataset_info.resource_name} emitted successfully"
252
+ )
253
+ except Exception as exc:
254
+ self.report.num_datasets_failed += 1 # Increment failed datasets
255
+ self.report.report_failure(
256
+ message="Failed to process Dremio dataset",
257
+ context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
258
+ exc=exc,
259
+ )
310
260
 
311
- # Optionally Process Query Lineage
312
- if self.config.include_query_lineage:
313
- self.get_query_lineage_workunits()
314
-
315
- # Process Glossary Terms
316
- glossary_terms = self.dremio_catalog.get_glossary_terms()
317
-
318
- for glossary_term in glossary_terms:
319
- try:
320
- yield from self.process_glossary_term(glossary_term)
321
- except Exception as exc:
322
- self.report.report_failure(
323
- message="Failed to process Glossary terms",
324
- context=f"{glossary_term.glossary_term}",
325
- exc=exc,
326
- )
261
+ # Process Glossary Terms
262
+ glossary_terms = self.dremio_catalog.get_glossary_terms()
263
+
264
+ for glossary_term in glossary_terms:
265
+ try:
266
+ yield from self.process_glossary_term(glossary_term)
267
+ except Exception as exc:
268
+ self.report.report_failure(
269
+ message="Failed to process Glossary terms",
270
+ context=f"{glossary_term.glossary_term}",
271
+ exc=exc,
272
+ )
327
273
 
328
- # Generate workunit for aggregated SQL parsing results
329
- for mcp in self.sql_parsing_aggregator.gen_metadata():
330
- self.report.report_workunit(mcp.as_workunit())
331
- yield mcp.as_workunit()
332
-
333
- # Profiling
334
- if self.config.is_profiling_enabled():
335
- with ThreadPoolExecutor(
336
- max_workers=self.config.profiling.max_workers
337
- ) as executor:
338
- future_to_dataset = {
339
- executor.submit(self.generate_profiles, dataset): dataset
340
- for dataset in datasets
341
- }
342
-
343
- for future in as_completed(future_to_dataset):
344
- dataset_info = future_to_dataset[future]
345
- try:
346
- yield from future.result()
347
- except Exception as exc:
348
- self.report.profiling_skipped_other[
349
- dataset_info.resource_name
350
- ] += 1
351
- self.report.report_failure(
352
- message="Failed to profile dataset",
353
- context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
354
- exc=exc,
355
- )
274
+ # Optionally Process Query Lineage
275
+ if self.config.include_query_lineage:
276
+ with self.report.new_stage(LINEAGE_EXTRACTION):
277
+ self.get_query_lineage_workunits()
278
+
279
+ # Generate workunit for aggregated SQL parsing results
280
+ for mcp in self.sql_parsing_aggregator.gen_metadata():
281
+ yield mcp.as_workunit()
282
+
283
+ # Profiling
284
+ if self.config.is_profiling_enabled():
285
+ with (
286
+ self.report.new_high_stage(IngestionHighStage.PROFILING),
287
+ ThreadPoolExecutor(
288
+ max_workers=self.config.profiling.max_workers
289
+ ) as executor,
290
+ ):
291
+ future_to_dataset = {
292
+ executor.submit(self.generate_profiles, dataset): dataset
293
+ for dataset in datasets
294
+ }
295
+
296
+ for future in as_completed(future_to_dataset):
297
+ dataset_info = future_to_dataset[future]
298
+ try:
299
+ yield from future.result()
300
+ except Exception as exc:
301
+ self.report.profiling_skipped_other[
302
+ dataset_info.resource_name
303
+ ] += 1
304
+ self.report.report_failure(
305
+ message="Failed to profile dataset",
306
+ context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
307
+ exc=exc,
308
+ )
356
309
 
357
310
  def process_container(
358
311
  self, container_info: DremioContainer
@@ -385,10 +338,10 @@ class DremioSource(StatefulIngestionSourceBase):
385
338
  return
386
339
 
387
340
  dataset_urn = make_dataset_urn_with_platform_instance(
388
- platform=make_data_platform_urn(self.get_platform()),
389
- name=f"dremio.{dataset_name}",
390
- env=self.config.env,
341
+ platform=self.get_platform(),
342
+ name=dataset_name,
391
343
  platform_instance=self.config.platform_instance,
344
+ env=self.config.env,
392
345
  )
393
346
 
394
347
  for dremio_mcp in self.dremio_aspects.populate_dataset_mcp(
@@ -431,6 +384,7 @@ class DremioSource(StatefulIngestionSourceBase):
431
384
  dremio_path=dataset_info.path,
432
385
  dremio_dataset=dataset_info.resource_name,
433
386
  )
387
+ logger.debug(f"Upstream dataset for {dataset_urn}: {upstream_urn}")
434
388
 
435
389
  if upstream_urn:
436
390
  upstream_lineage = UpstreamLineage(
@@ -467,13 +421,12 @@ class DremioSource(StatefulIngestionSourceBase):
467
421
  schema_str = ".".join(dataset_info.path)
468
422
  dataset_name = f"{schema_str}.{dataset_info.resource_name}".lower()
469
423
  dataset_urn = make_dataset_urn_with_platform_instance(
470
- platform=make_data_platform_urn(self.get_platform()),
471
- name=f"dremio.{dataset_name}",
472
- env=self.config.env,
424
+ platform=self.get_platform(),
425
+ name=dataset_name,
473
426
  platform_instance=self.config.platform_instance,
427
+ env=self.config.env,
474
428
  )
475
- with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"):
476
- yield from self.profiler.get_workunits(dataset_info, dataset_urn)
429
+ yield from self.profiler.get_workunits(dataset_info, dataset_urn)
477
430
 
478
431
  def generate_view_lineage(
479
432
  self, dataset_urn: str, parents: List[str]
@@ -483,10 +436,10 @@ class DremioSource(StatefulIngestionSourceBase):
483
436
  """
484
437
  upstream_urns = [
485
438
  make_dataset_urn_with_platform_instance(
486
- platform=make_data_platform_urn(self.get_platform()),
487
- name=f"dremio.{upstream_table.lower()}",
488
- env=self.config.env,
439
+ platform=self.get_platform(),
440
+ name=upstream_table.lower(),
489
441
  platform_instance=self.config.platform_instance,
442
+ env=self.config.env,
490
443
  )
491
444
  for upstream_table in parents
492
445
  ]
@@ -501,11 +454,8 @@ class DremioSource(StatefulIngestionSourceBase):
501
454
  ]
502
455
  )
503
456
  mcp = MetadataChangeProposalWrapper(
504
- entityType="dataset",
505
457
  entityUrn=dataset_urn,
506
- aspectName=lineage.ASPECT_NAME,
507
458
  aspect=lineage,
508
- changeType=ChangeTypeClass.UPSERT,
509
459
  )
510
460
 
511
461
  for upstream_urn in upstream_urns:
@@ -548,19 +498,19 @@ class DremioSource(StatefulIngestionSourceBase):
548
498
  if query.query and query.affected_dataset:
549
499
  upstream_urns = [
550
500
  make_dataset_urn_with_platform_instance(
551
- platform=make_data_platform_urn(self.get_platform()),
552
- name=f"dremio.{ds.lower()}",
553
- env=self.config.env,
501
+ platform=self.get_platform(),
502
+ name=ds.lower(),
554
503
  platform_instance=self.config.platform_instance,
504
+ env=self.config.env,
555
505
  )
556
506
  for ds in query.queried_datasets
557
507
  ]
558
508
 
559
509
  downstream_urn = make_dataset_urn_with_platform_instance(
560
- platform=make_data_platform_urn(self.get_platform()),
561
- name=f"dremio.{query.affected_dataset.lower()}",
562
- env=self.config.env,
510
+ platform=self.get_platform(),
511
+ name=query.affected_dataset.lower(),
563
512
  platform_instance=self.config.platform_instance,
513
+ env=self.config.env,
564
514
  )
565
515
 
566
516
  # Add query to SqlParsingAggregator
@@ -596,25 +546,23 @@ class DremioSource(StatefulIngestionSourceBase):
596
546
  if not mapping:
597
547
  return None
598
548
 
599
- platform = mapping.get("platform")
549
+ platform = mapping.platform
600
550
  if not platform:
601
551
  return None
602
552
 
603
- platform_instance = mapping.get(
604
- "platform_instance", self.config.platform_instance
605
- )
606
- env = mapping.get("env", self.config.env)
553
+ platform_instance = mapping.platform_instance
554
+ env = mapping.env or self.config.env
607
555
 
608
556
  root_path = ""
609
557
  database_name = ""
610
558
 
611
- if mapping.get("dremio_source_type") == "file_object_storage":
612
- if mapping.get("root_path"):
613
- root_path = f"{mapping['root_path'][1:]}/"
559
+ if mapping.dremio_source_category == "file_object_storage":
560
+ if mapping.root_path:
561
+ root_path = f"{mapping.root_path[1:]}/"
614
562
  dremio_dataset = f"{root_path}{'/'.join(dremio_path[1:])}/{dremio_dataset}"
615
563
  else:
616
- if mapping.get("database_name"):
617
- database_name = f"{mapping['database_name']}."
564
+ if mapping.database_name:
565
+ database_name = f"{mapping.database_name}."
618
566
  dremio_dataset = (
619
567
  f"{database_name}{'.'.join(dremio_path[1:])}.{dremio_dataset}"
620
568
  )
@@ -639,3 +587,68 @@ class DremioSource(StatefulIngestionSourceBase):
639
587
  Get the source report.
640
588
  """
641
589
  return self.report
590
+
591
+
592
+ def build_dremio_source_map(
593
+ dremio_sources: Iterable[DremioSourceContainer],
594
+ source_mappings_config: List[DremioSourceMapping],
595
+ ) -> Dict[str, DremioSourceMapEntry]:
596
+ """
597
+ Builds a source mapping dictionary to support external lineage generation across
598
+ multiple Dremio sources, based on provided configuration mappings.
599
+
600
+ This method operates as follows:
601
+
602
+ Returns:
603
+ Dict[str, Dict]: A dictionary (`source_map`) where each key is a source name
604
+ (lowercased) and each value is another entry containing:
605
+ - `platform`: The source platform.
606
+ - `source_name`: The source name.
607
+ - `dremio_source_category`: The type mapped to DataHub,
608
+ e.g., "database", "folder".
609
+ - Optional `root_path`, `database_name`, `platform_instance`,
610
+ and `env` if provided in the configuration.
611
+ Example:
612
+ This method is used internally within the class to generate mappings before
613
+ creating cross-platform lineage.
614
+
615
+ """
616
+ source_map = {}
617
+ for source in dremio_sources:
618
+ current_source_name = source.container_name
619
+
620
+ source_type = source.dremio_source_type.lower()
621
+ source_category = DremioToDataHubSourceTypeMapping.get_category(source_type)
622
+ datahub_platform = DremioToDataHubSourceTypeMapping.get_datahub_platform(
623
+ source_type
624
+ )
625
+ root_path = source.root_path.lower() if source.root_path else ""
626
+ database_name = source.database_name.lower() if source.database_name else ""
627
+ source_present = False
628
+
629
+ for mapping in source_mappings_config:
630
+ if mapping.source_name.lower() == current_source_name.lower():
631
+ source_map[current_source_name.lower()] = DremioSourceMapEntry(
632
+ platform=mapping.platform,
633
+ source_name=mapping.source_name,
634
+ dremio_source_category=source_category,
635
+ root_path=root_path,
636
+ database_name=database_name,
637
+ platform_instance=mapping.platform_instance,
638
+ env=mapping.env,
639
+ )
640
+ source_present = True
641
+ break
642
+
643
+ if not source_present:
644
+ source_map[current_source_name.lower()] = DremioSourceMapEntry(
645
+ platform=datahub_platform,
646
+ source_name=current_source_name,
647
+ dremio_source_category=source_category,
648
+ root_path=root_path,
649
+ database_name=database_name,
650
+ platform_instance=None,
651
+ env=None,
652
+ )
653
+
654
+ return source_map