acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,22 @@
1
+ from enum import Enum, auto
1
2
  from typing import Dict, List, Optional
2
3
 
3
4
  from datahub.configuration.common import ConfigModel
5
+ from datahub.configuration.env_vars import get_datahub_component
6
+
7
+
8
+ class ClientMode(Enum):
9
+ INGESTION = auto()
10
+ CLI = auto()
11
+ SDK = auto()
12
+
13
+
14
+ DATAHUB_COMPONENT_ENV: str = get_datahub_component().lower()
4
15
 
5
16
 
6
17
  class DatahubClientConfig(ConfigModel):
7
18
  """Configuration class for holding connectivity to datahub gms"""
8
19
 
9
- # TODO: Having a default for the server doesn't make a ton of sense. This should be handled
10
- # by callers / the CLI, but the actual client should not have any magic.
11
20
  server: str
12
21
  token: Optional[str] = None
13
22
  timeout_sec: Optional[float] = None
@@ -17,3 +26,10 @@ class DatahubClientConfig(ConfigModel):
17
26
  ca_certificate_path: Optional[str] = None
18
27
  client_certificate_path: Optional[str] = None
19
28
  disable_ssl_verification: bool = False
29
+ openapi_ingestion: Optional[bool] = None
30
+ client_mode: Optional[ClientMode] = None
31
+ datahub_component: Optional[str] = None
32
+ server_config_refresh_interval: Optional[int] = None
33
+
34
+ class Config:
35
+ extra = "ignore"
@@ -1,30 +1,58 @@
1
1
  import dataclasses
2
2
  import enum
3
- from typing import Any, Dict, List, Optional
3
+ import warnings
4
+ from typing import Dict, List, Literal, Optional, Union
5
+
6
+ from typing_extensions import TypeAlias
4
7
 
5
8
  from datahub.emitter.mce_builder import (
6
9
  make_data_platform_urn,
7
10
  make_dataplatform_instance_urn,
8
11
  )
12
+ from datahub.errors import SearchFilterWarning
9
13
  from datahub.utilities.urns.urn import guess_entity_type
10
14
 
11
- RawSearchFilterRule = Dict[str, Any]
15
+ RawSearchFilterRule: TypeAlias = Dict[str, Union[str, bool, List[str]]]
16
+
17
+ # This is a list of OR filters, each of which is a list of AND filters.
18
+ # This can be put directly into the orFilters parameter in GraphQL.
19
+ RawSearchFilter: TypeAlias = List[Dict[Literal["and"], List[RawSearchFilterRule]]]
20
+
21
+ # Mirrors our GraphQL enum: https://docs.datahub.com/docs/graphql/enums#filteroperator
22
+ FilterOperator: TypeAlias = Literal[
23
+ "CONTAIN",
24
+ "EQUAL",
25
+ "IEQUAL",
26
+ "IN",
27
+ "EXISTS",
28
+ "GREATER_THAN",
29
+ "GREATER_THAN_OR_EQUAL_TO",
30
+ "LESS_THAN",
31
+ "LESS_THAN_OR_EQUAL_TO",
32
+ "START_WITH",
33
+ "END_WITH",
34
+ "DESCENDANTS_INCL",
35
+ "ANCESTORS_INCL",
36
+ "RELATED_INCL",
37
+ ]
12
38
 
13
39
 
14
40
  @dataclasses.dataclass
15
41
  class SearchFilterRule:
16
42
  field: str
17
- condition: str # TODO: convert to an enum
43
+ condition: FilterOperator
18
44
  values: List[str]
19
45
  negated: bool = False
20
46
 
21
47
  def to_raw(self) -> RawSearchFilterRule:
22
- return {
48
+ rule: RawSearchFilterRule = {
23
49
  "field": self.field,
24
50
  "condition": self.condition,
25
51
  "values": self.values,
26
- "negated": self.negated,
27
52
  }
53
+ if self.negated:
54
+ rule["negated"] = True
55
+ return rule
28
56
 
29
57
  def negate(self) -> "SearchFilterRule":
30
58
  return SearchFilterRule(
@@ -48,15 +76,25 @@ class RemovedStatusFilter(enum.Enum):
48
76
  """Search only soft-deleted entities."""
49
77
 
50
78
 
79
+ def _validate_or_filter_structure(
80
+ or_filters: List[Dict[str, List[SearchFilterRule]]],
81
+ ) -> None:
82
+ for filter_list in or_filters:
83
+ if "and" not in filter_list:
84
+ raise ValueError(f"Invalid or filter: {filter_list}")
85
+ if not isinstance(filter_list["and"], list):
86
+ raise ValueError(f"Invalid or filter: {filter_list}")
87
+
88
+
51
89
  def generate_filter(
52
- platform: Optional[str],
90
+ platform: Union[None, str, List[str]],
53
91
  platform_instance: Optional[str],
54
92
  env: Optional[str],
55
- container: Optional[str],
56
- status: RemovedStatusFilter,
93
+ container: Union[None, str, List[str]],
94
+ status: Optional[RemovedStatusFilter],
57
95
  extra_filters: Optional[List[RawSearchFilterRule]],
58
- extra_or_filters: Optional[List[RawSearchFilterRule]] = None,
59
- ) -> List[Dict[str, List[RawSearchFilterRule]]]:
96
+ extra_or_filters: Optional[RawSearchFilter] = None,
97
+ ) -> RawSearchFilter:
60
98
  """
61
99
  Generate a search filter based on the provided parameters.
62
100
  :param platform: The platform to filter by.
@@ -65,8 +103,7 @@ def generate_filter(
65
103
  :param container: The container to filter by.
66
104
  :param status: The status to filter by.
67
105
  :param extra_filters: Extra AND filters to apply.
68
- :param extra_or_filters: Extra OR filters to apply. These are combined with
69
- the AND filters using an OR at the top level.
106
+ :param extra_or_filters: Extra OR filters to apply. These are combined with the AND filters using an OR at the top level.
70
107
  """
71
108
  and_filters: List[RawSearchFilterRule] = []
72
109
 
@@ -85,15 +122,16 @@ def generate_filter(
85
122
  and_filters.append(_get_container_filter(container).to_raw())
86
123
 
87
124
  # Status filter.
88
- status_filter = _get_status_filter(status)
89
- if status_filter:
90
- and_filters.append(status_filter.to_raw())
125
+ if status:
126
+ status_filter = _get_status_filter(status)
127
+ if status_filter:
128
+ and_filters.append(status_filter.to_raw())
91
129
 
92
130
  # Extra filters.
93
131
  if extra_filters:
94
132
  and_filters += extra_filters
95
133
 
96
- or_filters: List[Dict[str, List[RawSearchFilterRule]]] = [{"and": and_filters}]
134
+ or_filters: RawSearchFilter = [{"and": and_filters}]
97
135
 
98
136
  # Env filter
99
137
  if env:
@@ -107,11 +145,27 @@ def generate_filter(
107
145
 
108
146
  # Extra OR filters are distributed across the top level and lists.
109
147
  if extra_or_filters:
110
- or_filters = [
111
- {"and": and_filter["and"] + [extra_or_filter]}
112
- for extra_or_filter in extra_or_filters
113
- for and_filter in or_filters
114
- ]
148
+ new_or_filters: RawSearchFilter = []
149
+ for and_filter in or_filters:
150
+ for extra_or_filter in extra_or_filters:
151
+ if isinstance(extra_or_filter, dict) and "and" in extra_or_filter:
152
+ new_or_filters.append(
153
+ {"and": and_filter["and"] + extra_or_filter["and"]}
154
+ )
155
+ else:
156
+ # Hack for backwards compatibility.
157
+ # We have some code that erroneously passed a List[RawSearchFilterRule]
158
+ # instead of a List[Dict["and", List[RawSearchFilterRule]]].
159
+ warnings.warn(
160
+ "Passing a List[RawSearchFilterRule] to extra_or_filters is deprecated. "
161
+ "Please pass a List[Dict[str, List[RawSearchFilterRule]]] instead.",
162
+ SearchFilterWarning,
163
+ stacklevel=3,
164
+ )
165
+ new_or_filters.append(
166
+ {"and": and_filter["and"] + [extra_or_filter]} # type: ignore
167
+ )
168
+ or_filters = new_or_filters
115
169
 
116
170
  return or_filters
117
171
 
@@ -123,7 +177,7 @@ def _get_env_filters(env: str) -> List[RawSearchFilterRule]:
123
177
  # For most entity types, we look at the origin field.
124
178
  {
125
179
  "field": "origin",
126
- "value": env,
180
+ "values": [env],
127
181
  "condition": "EQUAL",
128
182
  },
129
183
  # For containers, we look at the customProperties field.
@@ -131,15 +185,15 @@ def _get_env_filters(env: str) -> List[RawSearchFilterRule]:
131
185
  # we look for the "env" property. Otherwise, we use the "instance" property.
132
186
  {
133
187
  "field": "customProperties",
134
- "value": f"env={env}",
188
+ "values": [f"env={env}"],
135
189
  },
136
190
  {
137
191
  "field": "customProperties",
138
- "value": f"instance={env}",
192
+ "values": [f"instance={env}"],
139
193
  },
140
194
  {
141
195
  "field": "env",
142
- "value": env,
196
+ "values": [env],
143
197
  },
144
198
  # Note that not all entity types have an env (e.g. dashboards / charts).
145
199
  # If the env filter is specified, these will be excluded.
@@ -173,23 +227,31 @@ def _get_status_filter(status: RemovedStatusFilter) -> Optional[SearchFilterRule
173
227
  raise ValueError(f"Invalid status filter: {status}")
174
228
 
175
229
 
176
- def _get_container_filter(container: str) -> SearchFilterRule:
230
+ def _get_container_filter(container: Union[str, List[str]]) -> SearchFilterRule:
231
+ if not isinstance(container, list):
232
+ container = [container]
233
+
177
234
  # Warn if container is not a fully qualified urn.
178
235
  # TODO: Change this once we have a first-class container urn type.
179
- if guess_entity_type(container) != "container":
180
- raise ValueError(f"Invalid container urn: {container}")
236
+ for cont in container:
237
+ if guess_entity_type(cont) != "container":
238
+ raise ValueError(f"Invalid container urn: {cont}")
181
239
 
182
240
  return SearchFilterRule(
183
241
  field="browsePathV2",
184
- values=[container],
242
+ values=container,
185
243
  condition="CONTAIN",
186
244
  )
187
245
 
188
246
 
189
247
  def _get_platform_instance_filter(
190
- platform: Optional[str], platform_instance: str
248
+ platform: Union[None, str, List[str]], platform_instance: str
191
249
  ) -> SearchFilterRule:
192
250
  if platform:
251
+ if isinstance(platform, list):
252
+ raise ValueError(
253
+ "Platform instance filter cannot be combined with a multi-value platform filter."
254
+ )
193
255
  # Massage the platform instance into a fully qualified urn, if necessary.
194
256
  platform_instance = make_dataplatform_instance_urn(platform, platform_instance)
195
257
 
@@ -205,9 +267,11 @@ def _get_platform_instance_filter(
205
267
  )
206
268
 
207
269
 
208
- def _get_platform_filter(platform: str) -> SearchFilterRule:
270
+ def _get_platform_filter(platform: Union[str, List[str]]) -> SearchFilterRule:
271
+ if not isinstance(platform, list):
272
+ platform = [platform]
209
273
  return SearchFilterRule(
210
274
  field="platform.keyword",
211
275
  condition="EQUAL",
212
- values=[make_data_platform_urn(platform)],
276
+ values=[make_data_platform_urn(plt) for plt in platform],
213
277
  )
@@ -0,0 +1,55 @@
1
+ import urllib.parse
2
+ from typing import Optional
3
+
4
+ import datahub.metadata.urns as urns
5
+ from datahub.utilities.urns.urn import guess_entity_type
6
+
7
+ _url_prefixes = {
8
+ # Atypical mappings.
9
+ urns.DataJobUrn.ENTITY_TYPE: "tasks",
10
+ urns.DataFlowUrn.ENTITY_TYPE: "pipelines",
11
+ urns.CorpUserUrn.ENTITY_TYPE: "user",
12
+ urns.CorpGroupUrn.ENTITY_TYPE: "group",
13
+ # Normal mappings - matches the entity type.
14
+ urns.ChartUrn.ENTITY_TYPE: "chart",
15
+ urns.ContainerUrn.ENTITY_TYPE: "container",
16
+ urns.DataProductUrn.ENTITY_TYPE: "dataProduct",
17
+ urns.DatasetUrn.ENTITY_TYPE: "dataset",
18
+ urns.DashboardUrn.ENTITY_TYPE: "dashboard",
19
+ urns.DomainUrn.ENTITY_TYPE: "domain",
20
+ urns.GlossaryNodeUrn.ENTITY_TYPE: "glossaryNode",
21
+ urns.GlossaryTermUrn.ENTITY_TYPE: "glossaryTerm",
22
+ urns.TagUrn.ENTITY_TYPE: "tag",
23
+ }
24
+
25
+
26
+ def make_url_for_urn(
27
+ frontend_base_url: str,
28
+ entity_urn: str,
29
+ *,
30
+ tab: Optional[str] = None,
31
+ ) -> str:
32
+ """Build the public-facing URL for an entity urn.
33
+
34
+ Args:
35
+ frontend_url: The public-facing base url of the frontend.
36
+ entity_urn: The urn of the entity to get the url for.
37
+ tab: The tab to deep link into. If not provided, the default tab for the entity will be shown.
38
+
39
+ Returns:
40
+ The public-facing url for the entity.
41
+
42
+ Examples:
43
+ >>> make_url_for_urn("https://demo.datahub.com", "urn:li:container:b41c14bc5cb3ccfbb0433c8cbdef2992", tab="Contents")
44
+ 'https://demo.datahub.com/container/urn%3Ali%3Acontainer%3Ab41c14bc5cb3ccfbb0433c8cbdef2992/Contents'
45
+ >>> make_url_for_urn("https://demo.datahub.com", "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.adoption.actuating,PROD)")
46
+ 'https://demo.datahub.com/dataset/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Asnowflake%2Clong_tail_companions.adoption.actuating%2CPROD%29/'
47
+ """
48
+ entity_type = guess_entity_type(entity_urn)
49
+ encoded_entity_urn = urllib.parse.quote(entity_urn, safe="")
50
+
51
+ url_prefix = _url_prefixes.get(entity_type, entity_type)
52
+ url = f"{frontend_base_url}/{url_prefix}/{encoded_entity_urn}/"
53
+ if tab:
54
+ url += f"{tab}"
55
+ return url
@@ -13,6 +13,7 @@ from datahub.configuration.common import (
13
13
  from datahub.emitter.aspect import JSON_CONTENT_TYPE
14
14
  from datahub.emitter.mce_builder import datahub_guid, make_data_platform_urn
15
15
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
16
+ from datahub.emitter.rest_emitter import EmitMode
16
17
  from datahub.ingestion.api.common import PipelineContext, RecordEnvelope
17
18
  from datahub.ingestion.api.pipeline_run_listener import PipelineRunListener
18
19
  from datahub.ingestion.api.sink import NoopWriteCallback, Sink
@@ -111,6 +112,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
111
112
  def __init__(self, sink: Sink, report_recipe: bool, ctx: PipelineContext) -> None:
112
113
  assert ctx.pipeline_config is not None
113
114
 
115
+ self.ctx = ctx
114
116
  self.sink: Sink = sink
115
117
  self.report_recipe = report_recipe
116
118
  ingestion_source_key = self.generate_unique_key(ctx.pipeline_config)
@@ -191,18 +193,25 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
191
193
  )
192
194
  return json.dumps(converted_recipe)
193
195
 
194
- def _emit_aspect(self, entity_urn: Urn, aspect_value: _Aspect) -> None:
195
- self.sink.write_record_async(
196
- RecordEnvelope(
197
- record=MetadataChangeProposalWrapper(
198
- entityUrn=str(entity_urn),
199
- aspect=aspect_value,
200
- ),
201
- metadata={},
202
- ),
203
- NoopWriteCallback(),
196
+ def _emit_aspect(
197
+ self, entity_urn: Urn, aspect_value: _Aspect, try_sync: bool = False
198
+ ) -> None:
199
+ mcp = MetadataChangeProposalWrapper(
200
+ entityUrn=str(entity_urn),
201
+ aspect=aspect_value,
204
202
  )
205
203
 
204
+ if try_sync and self.ctx.graph:
205
+ self.ctx.graph.emit_mcp(mcp, emit_mode=EmitMode.SYNC_PRIMARY)
206
+ else:
207
+ self.sink.write_record_async(
208
+ RecordEnvelope(
209
+ record=mcp,
210
+ metadata={},
211
+ ),
212
+ NoopWriteCallback(),
213
+ )
214
+
206
215
  def on_start(self, ctx: PipelineContext) -> None:
207
216
  assert ctx.pipeline_config is not None
208
217
  # Construct the dataHubExecutionRequestInput aspect
@@ -223,6 +232,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
223
232
  self._emit_aspect(
224
233
  entity_urn=self.execution_request_input_urn,
225
234
  aspect_value=execution_input_aspect,
235
+ try_sync=True,
226
236
  )
227
237
 
228
238
  def on_completion(
@@ -258,4 +268,4 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
258
268
  entity_urn=self.execution_request_input_urn,
259
269
  aspect_value=execution_result_aspect,
260
270
  )
261
- self.sink.close()
271
+ # Note: sink.close() is handled by the pipeline's context manager
@@ -31,6 +31,7 @@ from datahub.ingestion.api.source import Extractor, Source
31
31
  from datahub.ingestion.api.transform import Transformer
32
32
  from datahub.ingestion.extractor.extractor_registry import extractor_registry
33
33
  from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
34
+ from datahub.ingestion.graph.config import ClientMode
34
35
  from datahub.ingestion.reporting.reporting_provider_registry import (
35
36
  reporting_provider_registry,
36
37
  )
@@ -39,13 +40,14 @@ from datahub.ingestion.run.sink_callback import DeadLetterQueueCallback, Logging
39
40
  from datahub.ingestion.sink.datahub_rest import DatahubRestSink
40
41
  from datahub.ingestion.sink.sink_registry import sink_registry
41
42
  from datahub.ingestion.source.source_registry import source_registry
42
- from datahub.ingestion.transformer.system_metadata_transformer import (
43
- SystemMetadataTransformer,
44
- )
45
43
  from datahub.ingestion.transformer.transform_registry import transform_registry
46
44
  from datahub.sdk._attribution import KnownAttribution, change_default_attribution
47
45
  from datahub.telemetry import stats
48
46
  from datahub.telemetry.telemetry import telemetry_instance
47
+ from datahub.upgrade.upgrade import (
48
+ is_server_default_cli_ahead,
49
+ retrieve_version_stats,
50
+ )
49
51
  from datahub.utilities._custom_package_loader import model_version_name
50
52
  from datahub.utilities.global_warning_util import (
51
53
  clear_global_warnings,
@@ -139,9 +141,8 @@ class CliReport(Report):
139
141
 
140
142
 
141
143
  def _make_default_rest_sink(ctx: PipelineContext) -> DatahubRestSink:
142
- graph = get_default_graph()
144
+ graph = get_default_graph(ClientMode.INGESTION)
143
145
  sink_config = graph._make_rest_sink_config()
144
-
145
146
  return DatahubRestSink(ctx, sink_config)
146
147
 
147
148
 
@@ -174,10 +175,14 @@ class Pipeline:
174
175
  self.last_time_printed = int(time.time())
175
176
  self.cli_report = CliReport()
176
177
 
177
- with contextlib.ExitStack() as exit_stack, contextlib.ExitStack() as inner_exit_stack:
178
+ with (
179
+ contextlib.ExitStack() as exit_stack,
180
+ contextlib.ExitStack() as inner_exit_stack,
181
+ ):
178
182
  self.graph: Optional[DataHubGraph] = None
179
183
  with _add_init_error_context("connect to DataHub"):
180
184
  if self.config.datahub_api:
185
+ self.config.datahub_api.client_mode = ClientMode.INGESTION
181
186
  self.graph = exit_stack.enter_context(
182
187
  DataHubGraph(self.config.datahub_api)
183
188
  )
@@ -260,6 +265,11 @@ class Pipeline:
260
265
  with _add_init_error_context("configure transformers"):
261
266
  self._configure_transforms()
262
267
 
268
+ # Register completion callback with sink to handle final reporting
269
+ self.sink.register_pre_shutdown_callback(
270
+ self._notify_reporters_on_ingestion_completion
271
+ )
272
+
263
273
  # If all of the initialization succeeds, we can preserve the exit stack until the pipeline run.
264
274
  # We need to use an exit stack so that if we have an exception during initialization,
265
275
  # things that were already initialized are still cleaned up.
@@ -286,9 +296,6 @@ class Pipeline:
286
296
  f"Transformer type:{transformer_type},{transformer_class} configured"
287
297
  )
288
298
 
289
- # Add the system metadata transformer at the end of the list.
290
- self.transformers.append(SystemMetadataTransformer(self.ctx))
291
-
292
299
  def _configure_reporting(self, report_to: Optional[str]) -> None:
293
300
  if self.dry_run:
294
301
  # In dry run mode, we don't want to report anything.
@@ -342,8 +349,48 @@ class Pipeline:
342
349
  for reporter in self.reporters:
343
350
  try:
344
351
  reporter.on_start(ctx=self.ctx)
345
- except Exception as e:
346
- logger.warning("Reporting failed on start", exc_info=e)
352
+ except Exception:
353
+ logger.warning("Reporting failed on start", exc_info=True)
354
+
355
+ def _warn_old_cli_version(self) -> None:
356
+ """
357
+ Check if the server default CLI version is ahead of the CLI version being used.
358
+ If so, add a warning to the report.
359
+ """
360
+
361
+ try:
362
+ version_stats = retrieve_version_stats(timeout=2.0, graph=self.graph)
363
+ except RuntimeError as e:
364
+ # Handle case where there's no event loop available (e.g., in ThreadPoolExecutor)
365
+ if "no current event loop" in str(e):
366
+ logger.debug("Skipping version check - no event loop available")
367
+ return
368
+ raise
369
+
370
+ if not version_stats or not self.graph:
371
+ return
372
+
373
+ if is_server_default_cli_ahead(version_stats):
374
+ server_default_version = (
375
+ version_stats.server.current_server_default_cli_version.version
376
+ if version_stats.server.current_server_default_cli_version
377
+ else None
378
+ )
379
+ current_version = version_stats.client.current.version
380
+
381
+ logger.debug(
382
+ f"""
383
+ client_version: {current_version}
384
+ server_default_version: {server_default_version}
385
+ server_default_cli_ahead: True
386
+ """
387
+ )
388
+
389
+ self.source.get_report().warning(
390
+ title="Server default CLI version is ahead of CLI version",
391
+ message="Please upgrade the CLI version being used",
392
+ context=f"Server Default CLI version: {server_default_version}, Used CLI version: {current_version}",
393
+ )
347
394
 
348
395
  def _notify_reporters_on_ingestion_completion(self) -> None:
349
396
  for reporter in self.reporters:
@@ -365,8 +412,8 @@ class Pipeline:
365
412
  report=self._get_structured_report(),
366
413
  ctx=self.ctx,
367
414
  )
368
- except Exception as e:
369
- logger.warning("Reporting failed on completion", exc_info=e)
415
+ except Exception:
416
+ logger.warning("Reporting failed on completion", exc_info=True)
370
417
 
371
418
  @classmethod
372
419
  def create(
@@ -400,7 +447,20 @@ class Pipeline:
400
447
  return True
401
448
  return False
402
449
 
450
+ def _set_platform(self) -> None:
451
+ platform = self.source.infer_platform()
452
+ if platform:
453
+ self.source.get_report().set_platform(platform)
454
+ else:
455
+ self.source.get_report().warning(
456
+ message="Platform not found",
457
+ title="Platform not found",
458
+ context="Platform not found",
459
+ )
460
+
403
461
  def run(self) -> None:
462
+ self._set_platform()
463
+ self._warn_old_cli_version()
404
464
  with self.exit_stack, self.inner_exit_stack:
405
465
  if self.config.flags.generate_memory_profiles:
406
466
  import memray
@@ -466,10 +526,10 @@ class Pipeline:
466
526
 
467
527
  except (RuntimeError, SystemExit):
468
528
  raise
469
- except Exception as e:
529
+ except Exception:
470
530
  logger.error(
471
531
  "Failed to process some records. Continuing.",
472
- exc_info=e,
532
+ exc_info=True,
473
533
  )
474
534
  # TODO: Transformer errors should be reported more loudly / as part of the pipeline report.
475
535
 
@@ -498,9 +558,9 @@ class Pipeline:
498
558
 
499
559
  self.process_commits()
500
560
  self.final_status = PipelineStatus.COMPLETED
501
- except (SystemExit, KeyboardInterrupt) as e:
561
+ except (SystemExit, KeyboardInterrupt):
502
562
  self.final_status = PipelineStatus.CANCELLED
503
- logger.error("Caught error", exc_info=e)
563
+ logger.error("Caught error", exc_info=True)
504
564
  raise
505
565
  except Exception as exc:
506
566
  self.final_status = PipelineStatus.ERROR
@@ -508,8 +568,6 @@ class Pipeline:
508
568
  finally:
509
569
  clear_global_warnings()
510
570
 
511
- self._notify_reporters_on_ingestion_completion()
512
-
513
571
  def transform(self, records: Iterable[RecordEnvelope]) -> Iterable[RecordEnvelope]:
514
572
  """
515
573
  Transforms the given sequence of records by passing the records through the transformers
@@ -561,18 +619,20 @@ class Pipeline:
561
619
  def raise_from_status(self, raise_warnings: bool = False) -> None:
562
620
  if self.source.get_report().failures:
563
621
  raise PipelineExecutionError(
564
- "Source reported errors", self.source.get_report()
622
+ "Source reported errors", self.source.get_report().failures
565
623
  )
566
624
  if self.sink.get_report().failures:
567
- raise PipelineExecutionError("Sink reported errors", self.sink.get_report())
625
+ raise PipelineExecutionError(
626
+ "Sink reported errors", self.sink.get_report().failures
627
+ )
568
628
  if raise_warnings:
569
629
  if self.source.get_report().warnings:
570
630
  raise PipelineExecutionError(
571
- "Source reported warnings", self.source.get_report()
631
+ "Source reported warnings", self.source.get_report().warnings
572
632
  )
573
633
  if self.sink.get_report().warnings:
574
634
  raise PipelineExecutionError(
575
- "Sink reported warnings", self.sink.get_report()
635
+ "Sink reported warnings", self.sink.get_report().warnings
576
636
  )
577
637
 
578
638
  def log_ingestion_stats(self) -> None:
@@ -581,15 +641,22 @@ class Pipeline:
581
641
  sink_failures = len(self.sink.get_report().failures)
582
642
  sink_warnings = len(self.sink.get_report().warnings)
583
643
  global_warnings = len(get_global_warnings())
644
+ source_aspects = self.source.get_report().get_aspects_dict()
645
+ source_aspects_by_subtype = (
646
+ self.source.get_report().get_aspects_by_subtypes_dict()
647
+ )
584
648
 
585
649
  telemetry_instance.ping(
586
650
  "ingest_stats",
587
651
  {
588
652
  "source_type": self.source_type,
653
+ "source_aspects": source_aspects,
654
+ "source_aspects_by_subtype": source_aspects_by_subtype,
589
655
  "sink_type": self.sink_type,
590
656
  "transformer_types": [
591
657
  transformer.type for transformer in self.config.transformers or []
592
658
  ],
659
+ "extractor_type": self.config.source.extractor,
593
660
  "records_written": stats.discretize(
594
661
  self.sink.get_report().total_records_written
595
662
  ),
@@ -6,8 +6,8 @@ from typing import Any, Dict, List, Optional
6
6
 
7
7
  from pydantic import Field, validator
8
8
 
9
- from datahub.configuration.common import ConfigModel, DynamicTypedConfig
10
- from datahub.ingestion.graph.client import DatahubClientConfig
9
+ from datahub.configuration.common import ConfigModel, DynamicTypedConfig, HiddenFromDocs
10
+ from datahub.ingestion.graph.config import DatahubClientConfig
11
11
  from datahub.ingestion.sink.file import FileSinkConfig
12
12
 
13
13
  logger = logging.getLogger(__name__)
@@ -85,7 +85,7 @@ class PipelineConfig(ConfigModel):
85
85
  source: SourceConfig
86
86
  sink: Optional[DynamicTypedConfig] = None
87
87
  transformers: Optional[List[DynamicTypedConfig]] = None
88
- flags: FlagsConfig = Field(default=FlagsConfig(), hidden_from_docs=True)
88
+ flags: HiddenFromDocs[FlagsConfig] = FlagsConfig()
89
89
  reporting: List[ReporterConfig] = []
90
90
  run_id: str = DEFAULT_RUN_ID
91
91
  datahub_api: Optional[DatahubClientConfig] = None
@@ -74,4 +74,5 @@ class DatahubKafkaSink(Sink[KafkaSinkConfig, SinkReport]):
74
74
  callback(err, f"Failed to write record: {err}")
75
75
 
76
76
  def close(self) -> None:
77
+ super().close()
77
78
  self.emitter.flush()