acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503) hide show
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,309 @@
1
+ from __future__ import annotations
2
+
3
+ import warnings
4
+ from datetime import datetime
5
+ from typing import Dict, Optional, Type, Union
6
+
7
+ from typing_extensions import Self
8
+
9
+ import datahub.metadata.schema_classes as models
10
+ from datahub.cli.cli_utils import first_non_null
11
+ from datahub.emitter.mce_builder import DEFAULT_ENV
12
+ from datahub.errors import (
13
+ IngestionAttributionWarning,
14
+ )
15
+ from datahub.metadata.urns import DataFlowUrn, Urn
16
+ from datahub.sdk._attribution import is_ingestion_attribution
17
+ from datahub.sdk._shared import (
18
+ DomainInputType,
19
+ HasContainer,
20
+ HasDomain,
21
+ HasInstitutionalMemory,
22
+ HasOwnership,
23
+ HasPlatformInstance,
24
+ HasStructuredProperties,
25
+ HasSubtype,
26
+ HasTags,
27
+ HasTerms,
28
+ LinksInputType,
29
+ OwnersInputType,
30
+ ParentContainerInputType,
31
+ StructuredPropertyInputType,
32
+ TagsInputType,
33
+ TermsInputType,
34
+ make_time_stamp,
35
+ parse_time_stamp,
36
+ )
37
+ from datahub.sdk.entity import Entity, ExtraAspectsType
38
+ from datahub.utilities.sentinels import Unset, unset
39
+
40
+
41
class DataFlow(
    HasPlatformInstance,
    HasSubtype,
    HasOwnership,
    HasContainer,
    HasInstitutionalMemory,
    HasTags,
    HasTerms,
    HasDomain,
    HasStructuredProperties,
    Entity,
):
    """Represents a dataflow in DataHub.

    A dataflow is identified by its platform (orchestrator), a flow id and an
    environment; these are combined into a ``DataFlowUrn``. This class exposes
    the flow's ``DataFlowInfo`` properties (display name, description,
    external URL, custom properties, timestamps, env) together with the
    standard aspects provided by the mixins (ownership, tags, terms, ...).
    """

    __slots__ = ()

    @classmethod
    def get_urn_type(cls) -> Type[DataFlowUrn]:
        """Get the URN type for dataflows.

        Returns:
            The DataFlowUrn class.
        """
        return DataFlowUrn

    def __init__(
        self,
        *,
        # Identity.
        name: str,
        platform: str,
        display_name: Optional[str] = None,
        platform_instance: Optional[str] = None,
        env: str = DEFAULT_ENV,
        # Dataflow properties.
        description: Optional[str] = None,
        external_url: Optional[str] = None,
        custom_properties: Optional[Dict[str, str]] = None,
        created: Optional[datetime] = None,
        last_modified: Optional[datetime] = None,
        # Standard aspects.
        subtype: Optional[str] = None,
        owners: Optional[OwnersInputType] = None,
        links: Optional[LinksInputType] = None,
        tags: Optional[TagsInputType] = None,
        terms: Optional[TermsInputType] = None,
        domain: Optional[DomainInputType] = None,
        parent_container: ParentContainerInputType | Unset = unset,
        structured_properties: Optional[StructuredPropertyInputType] = None,
        extra_aspects: ExtraAspectsType = None,
    ):
        """Initialize a new DataFlow instance.

        Args:
            name: The name (flow id) of the dataflow; used to build the URN.
            platform: The platform/orchestrator this dataflow belongs to.
            display_name: Optional display name; defaults to ``name``.
            platform_instance: Optional platform instance identifier.
            env: The environment this dataflow belongs to (default: DEFAULT_ENV).
            description: Optional description of the dataflow.
            external_url: Optional URL to external documentation or source.
            custom_properties: Optional dictionary of custom properties.
            created: Optional creation timestamp.
            last_modified: Optional last modification timestamp.
            subtype: Optional subtype of the dataflow.
            owners: Optional list of owners.
            links: Optional list of links.
            tags: Optional list of tags.
            terms: Optional list of glossary terms.
            domain: Optional domain this dataflow belongs to.
            parent_container: Optional parent container; left untouched when
                not supplied (``unset``).
            structured_properties: Optional mapping of structured property
                URN to its value(s).
            extra_aspects: Optional list of additional aspects.
        """
        urn = DataFlowUrn.create_from_ids(
            orchestrator=platform,
            flow_id=name,
            env=env,
            platform_instance=platform_instance,
        )
        super().__init__(urn)
        self._set_extra_aspects(extra_aspects)

        self._set_platform_instance(urn.orchestrator, platform_instance)

        # Initialize DataFlowInfoClass directly with name
        self._setdefault_aspect(models.DataFlowInfoClass(name=display_name or name))
        self._ensure_dataflow_props().env = env

        if description is not None:
            self.set_description(description)
        if display_name is not None:
            self.set_display_name(display_name)
        if external_url is not None:
            self.set_external_url(external_url)
        if custom_properties is not None:
            self.set_custom_properties(custom_properties)
        if created is not None:
            self.set_created(created)
        if last_modified is not None:
            self.set_last_modified(last_modified)
        if subtype is not None:
            self.set_subtype(subtype)
        if owners is not None:
            self.set_owners(owners)
        if links is not None:
            self.set_links(links)
        if tags is not None:
            self.set_tags(tags)
        if terms is not None:
            self.set_terms(terms)
        if domain is not None:
            self.set_domain(domain)
        if parent_container is not unset:
            self._set_container(parent_container)
        if structured_properties is not None:
            for key, value in structured_properties.items():
                self.set_structured_property(property_urn=key, values=value)

    @classmethod
    def _new_from_graph(cls, urn: Urn, current_aspects: models.AspectBag) -> Self:
        """Build a DataFlow for ``urn`` and load ``current_aspects`` into it."""
        assert isinstance(urn, DataFlowUrn)
        entity = cls(
            platform=urn.orchestrator,
            name=urn.flow_id,
            # Carry the cluster over explicitly: the constructor otherwise
            # falls back to DEFAULT_ENV and the rebuilt URN would no longer
            # match the URN that was requested.
            env=urn.cluster,
        )
        return entity._init_from_graph(current_aspects)

    @property
    def urn(self) -> DataFlowUrn:
        """Get the URN of the dataflow."""
        return self._urn  # type: ignore

    def _ensure_dataflow_props(self) -> models.DataFlowInfoClass:
        """Return the DataFlowInfo aspect, creating it if it is missing."""
        props = self._get_aspect(models.DataFlowInfoClass)
        if props is None:
            # Use name from URN as fallback
            props = models.DataFlowInfoClass(name=self.urn.flow_id)
            self._set_aspect(props)
        return props

    def _get_editable_props(self) -> Optional[models.EditableDataFlowPropertiesClass]:
        """Return the editable-properties aspect, or None if it is not set."""
        return self._get_aspect(models.EditableDataFlowPropertiesClass)

    def _ensure_editable_props(self) -> models.EditableDataFlowPropertiesClass:
        """Return the editable-properties aspect, creating it if it is missing."""
        # Note that most of the fields in this aspect are not used.
        # The only one that's relevant for us is the description.
        return self._setdefault_aspect(models.EditableDataFlowPropertiesClass())

    @property
    def description(self) -> Optional[str]:
        """Get the description of the dataflow.

        The editable description takes precedence over the one stored in
        DataFlowInfo.

        Returns:
            The description if set, None otherwise.
        """
        editable_props = self._get_editable_props()
        return first_non_null(
            [
                editable_props.description if editable_props is not None else None,
                self._ensure_dataflow_props().description,
            ]
        )

    def set_description(self, description: str) -> None:
        """Set the description of the dataflow.

        Under ingestion attribution the description is written to
        DataFlowInfo; otherwise it is written to the editable properties.

        Args:
            description: The description to set.

        Note:
            If called during ingestion, this will warn if overwriting
            a non-ingestion description.
        """
        if is_ingestion_attribution():
            editable_props = self._get_editable_props()
            if editable_props is not None and editable_props.description is not None:
                warnings.warn(
                    "Overwriting non-ingestion description from ingestion is an anti-pattern.",
                    category=IngestionAttributionWarning,
                    stacklevel=2,
                )
                # Force the ingestion description to show up.
                editable_props.description = None

            self._ensure_dataflow_props().description = description
        else:
            self._ensure_editable_props().description = description

    @property
    def name(self) -> str:
        """Get the name of the dataflow.

        Returns:
            The flow id taken from the URN.
        """
        return self.urn.flow_id

    @property
    def display_name(self) -> Optional[str]:
        """Get the display name of the dataflow.

        Returns:
            The name stored in the DataFlowInfo aspect.
        """
        return self._ensure_dataflow_props().name

    def set_display_name(self, display_name: str) -> None:
        """Set the display name of the dataflow.

        Args:
            display_name: The display name to set.
        """
        self._ensure_dataflow_props().name = display_name

    @property
    def external_url(self) -> Optional[str]:
        """Get the external URL of the dataflow.

        Returns:
            The external URL if set, None otherwise.
        """
        return self._ensure_dataflow_props().externalUrl

    def set_external_url(self, external_url: str) -> None:
        """Set the external URL of the dataflow.

        Args:
            external_url: The external URL to set.
        """
        self._ensure_dataflow_props().externalUrl = external_url

    @property
    def custom_properties(self) -> Dict[str, str]:
        """Get the custom properties of the dataflow.

        Returns:
            Dictionary of custom properties.
        """
        return self._ensure_dataflow_props().customProperties

    def set_custom_properties(self, custom_properties: Dict[str, str]) -> None:
        """Set the custom properties of the dataflow.

        Args:
            custom_properties: Dictionary of custom properties to set.
        """
        self._ensure_dataflow_props().customProperties = custom_properties

    @property
    def created(self) -> Optional[datetime]:
        """Get the creation timestamp of the dataflow.

        Returns:
            The creation timestamp if set, None otherwise.
        """
        return parse_time_stamp(self._ensure_dataflow_props().created)

    def set_created(self, created: datetime) -> None:
        """Set the creation timestamp of the dataflow.

        Args:
            created: The creation timestamp to set.
        """
        self._ensure_dataflow_props().created = make_time_stamp(created)

    @property
    def last_modified(self) -> Optional[datetime]:
        """Get the last modification timestamp of the dataflow.

        Returns:
            The last modification timestamp if set, None otherwise.
        """
        return parse_time_stamp(self._ensure_dataflow_props().lastModified)

    def set_last_modified(self, last_modified: datetime) -> None:
        """Set the last modification timestamp of the dataflow.

        Args:
            last_modified: The last modification timestamp to set.
        """
        self._ensure_dataflow_props().lastModified = make_time_stamp(last_modified)

    @property
    def env(self) -> Optional[Union[str, models.FabricTypeClass]]:
        """Get the environment of the dataflow."""
        return self._ensure_dataflow_props().env
datahub/sdk/datajob.py ADDED
@@ -0,0 +1,367 @@
1
+ from __future__ import annotations
2
+
3
+ import warnings
4
+ from datetime import datetime
5
+ from typing import Dict, List, Optional, Type
6
+
7
+ from typing_extensions import Self
8
+
9
+ import datahub.emitter.mce_builder as builder
10
+ import datahub.metadata.schema_classes as models
11
+ from datahub.cli.cli_utils import first_non_null
12
+ from datahub.errors import IngestionAttributionWarning
13
+ from datahub.metadata.urns import (
14
+ DataFlowUrn,
15
+ DataJobUrn,
16
+ DatasetUrn,
17
+ Urn,
18
+ )
19
+ from datahub.sdk._attribution import is_ingestion_attribution
20
+ from datahub.sdk._shared import (
21
+ DataflowUrnOrStr,
22
+ DatasetUrnOrStr,
23
+ DomainInputType,
24
+ HasContainer,
25
+ HasDomain,
26
+ HasInstitutionalMemory,
27
+ HasOwnership,
28
+ HasPlatformInstance,
29
+ HasStructuredProperties,
30
+ HasSubtype,
31
+ HasTags,
32
+ HasTerms,
33
+ LinksInputType,
34
+ OwnersInputType,
35
+ StructuredPropertyInputType,
36
+ TagsInputType,
37
+ TermsInputType,
38
+ make_time_stamp,
39
+ parse_time_stamp,
40
+ )
41
+ from datahub.sdk.dataflow import DataFlow
42
+ from datahub.sdk.entity import Entity, ExtraAspectsType
43
+
44
+
45
class DataJob(
    HasPlatformInstance,
    HasSubtype,
    HasContainer,
    HasOwnership,
    HasInstitutionalMemory,
    HasTags,
    HasTerms,
    HasDomain,
    HasStructuredProperties,
    Entity,
):
    """Represents a data job in DataHub.

    A data job is an executable unit of a data pipeline, such as an Airflow task or a Spark job.
    Its URN is derived from the URN of the dataflow it belongs to plus a job id.
    """

    __slots__ = ()

    @classmethod
    def get_urn_type(cls) -> Type[DataJobUrn]:
        """Get the URN type for data jobs."""
        return DataJobUrn

    def __init__(  # noqa: C901
        self,
        *,
        name: str,
        flow: Optional[DataFlow] = None,
        flow_urn: Optional[DataflowUrnOrStr] = None,
        platform_instance: Optional[str] = None,
        display_name: Optional[str] = None,
        description: Optional[str] = None,
        external_url: Optional[str] = None,
        custom_properties: Optional[Dict[str, str]] = None,
        created: Optional[datetime] = None,
        last_modified: Optional[datetime] = None,
        # Standard aspects
        subtype: Optional[str] = None,
        owners: Optional[OwnersInputType] = None,
        links: Optional[LinksInputType] = None,
        tags: Optional[TagsInputType] = None,
        terms: Optional[TermsInputType] = None,
        domain: Optional[DomainInputType] = None,
        inlets: Optional[List[DatasetUrnOrStr]] = None,
        outlets: Optional[List[DatasetUrnOrStr]] = None,
        fine_grained_lineages: Optional[List[models.FineGrainedLineageClass]] = None,
        structured_properties: Optional[StructuredPropertyInputType] = None,
        extra_aspects: ExtraAspectsType = None,
    ):
        """Initialize a DataJob from either a DataFlow or a dataflow URN.

        Args:
            name: Name (job id) of the data job; used to build the URN.
            flow: The DataFlow this job belongs to. When given, ``flow_urn``
                and ``platform_instance`` are not consulted.
            flow_urn: URN of the parent dataflow; only used when ``flow`` is
                None.
            platform_instance: Platform instance name; only used together
                with ``flow_urn``. If the URN's flow id starts with
                ``"<platform_instance>."`` that prefix is stripped before the
                DataFlow is rebuilt, so it is not applied twice.
            display_name: Optional display name; defaults to ``name``.
            description: Optional description of the data job.
            external_url: Optional URL to external documentation or source.
            custom_properties: Optional dictionary of custom properties.
            created: Optional creation timestamp.
            last_modified: Optional last modification timestamp.
            subtype: Optional subtype of the data job.
            owners: Optional list of owners.
            links: Optional list of links.
            tags: Optional list of tags.
            terms: Optional list of glossary terms.
            domain: Optional domain this data job belongs to.
            inlets: Optional list of input dataset URNs.
            outlets: Optional list of output dataset URNs.
            fine_grained_lineages: Optional list of fine-grained lineages.
            structured_properties: Optional mapping of structured property
                URN to its value(s).
            extra_aspects: Optional list of additional aspects.

        Raises:
            ValueError: If neither ``flow`` nor ``flow_urn`` is provided.
        """
        if flow is None:
            if flow_urn is None:
                raise ValueError(
                    "You must provide either: 1. a DataFlow object, or 2. a DataFlowUrn (and a platform_instance config if required)"
                )
            flow_urn = DataFlowUrn.from_string(flow_urn)
            if platform_instance and flow_urn.flow_id.startswith(
                f"{platform_instance}."
            ):
                flow_name = flow_urn.flow_id[len(platform_instance) + 1 :]
            else:
                flow_name = flow_urn.flow_id
            flow = DataFlow(
                platform=flow_urn.orchestrator,
                name=flow_name,
                platform_instance=platform_instance,
                # Keep the cluster of the supplied URN; without this the flow
                # (and therefore the job URN) silently falls back to the
                # DataFlow default environment.
                env=flow_urn.cluster,
            )
        urn = DataJobUrn.create_from_ids(
            job_id=name,
            data_flow_urn=str(flow.urn),
        )
        super().__init__(urn)
        self._set_extra_aspects(extra_aspects)
        self._set_platform_instance(flow.urn.orchestrator, flow.platform_instance)
        self._set_browse_path_from_flow(flow)

        # Initialize DataJobInfoClass with default type
        job_info = models.DataJobInfoClass(
            name=display_name or name,
            type=models.AzkabanJobTypeClass.COMMAND,  # Default type
        )
        self._setdefault_aspect(job_info)
        self._ensure_datajob_props().flowUrn = str(flow.urn)
        if description is not None:
            self.set_description(description)
        if external_url is not None:
            self.set_external_url(external_url)
        if custom_properties is not None:
            self.set_custom_properties(custom_properties)
        if created is not None:
            self.set_created(created)
        if last_modified is not None:
            self.set_last_modified(last_modified)
        if subtype is not None:
            self.set_subtype(subtype)
        if owners is not None:
            self.set_owners(owners)
        if links is not None:
            self.set_links(links)
        if tags is not None:
            self.set_tags(tags)
        if terms is not None:
            self.set_terms(terms)
        if domain is not None:
            self.set_domain(domain)
        if structured_properties is not None:
            for key, value in structured_properties.items():
                self.set_structured_property(property_urn=key, values=value)
        if inlets is not None:
            self.set_inlets(inlets)
        if outlets is not None:
            self.set_outlets(outlets)
        if fine_grained_lineages is not None:
            self.set_fine_grained_lineages(fine_grained_lineages)

        # Only record env when the flow's cluster is a known environment name.
        if self.flow_urn.cluster.upper() in builder.ALL_ENV_TYPES:
            env = self.flow_urn.cluster.upper()
            self._ensure_datajob_props().env = env

    @classmethod
    def _new_from_graph(cls, urn: Urn, current_aspects: models.AspectBag) -> Self:
        """Build a DataJob for ``urn`` and load ``current_aspects`` into it."""
        assert isinstance(urn, DataJobUrn)
        # Extracting platform from the DataFlowUrn inside the DataJobUrn
        data_flow_urn = urn.get_data_flow_urn()

        entity = cls(
            flow=DataFlow(
                platform=data_flow_urn.orchestrator,
                name=data_flow_urn.flow_id,
                # Preserve the flow's cluster so the rebuilt job URN matches
                # the URN that was requested.
                env=data_flow_urn.cluster,
            ),
            name=urn.job_id,
        )
        return entity._init_from_graph(current_aspects)

    @property
    def urn(self) -> DataJobUrn:
        """Get the URN of the data job."""
        return self._urn  # type: ignore

    def _ensure_datajob_props(self) -> models.DataJobInfoClass:
        """Return the DataJobInfo aspect, creating it if it is missing."""
        props = self._get_aspect(models.DataJobInfoClass)
        if props is None:
            # Use name from URN as fallback with default type
            props = models.DataJobInfoClass(
                name=self.urn.job_id, type=models.AzkabanJobTypeClass.COMMAND
            )
            self._set_aspect(props)
        return props

    def _get_datajob_inputoutput_props(
        self,
    ) -> Optional[models.DataJobInputOutputClass]:
        """Return the DataJobInputOutput aspect, or None if it is not set."""
        return self._get_aspect(models.DataJobInputOutputClass)

    def _ensure_datajob_inputoutput_props(self) -> models.DataJobInputOutputClass:
        """Return the DataJobInputOutput aspect, creating an empty one if missing."""
        return self._setdefault_aspect(
            models.DataJobInputOutputClass(inputDatasets=[], outputDatasets=[])
        )

    def _get_editable_props(self) -> Optional[models.EditableDataJobPropertiesClass]:
        """Return the editable-properties aspect, or None if it is not set."""
        return self._get_aspect(models.EditableDataJobPropertiesClass)

    def _ensure_editable_props(self) -> models.EditableDataJobPropertiesClass:
        """Return the editable-properties aspect, creating it if it is missing."""
        return self._setdefault_aspect(models.EditableDataJobPropertiesClass())

    @property
    def description(self) -> Optional[str]:
        """Get the description of the data job.

        The editable description takes precedence over the one stored in
        DataJobInfo.
        """
        editable_props = self._get_editable_props()
        return first_non_null(
            [
                editable_props.description if editable_props is not None else None,
                self._ensure_datajob_props().description,
            ]
        )

    def set_description(self, description: str) -> None:
        """Set the description of the data job.

        Under ingestion attribution the description is written to
        DataJobInfo (warning if an editable description is being overridden);
        otherwise it is written to the editable properties.
        """
        if is_ingestion_attribution():
            editable_props = self._get_editable_props()
            if editable_props is not None and editable_props.description is not None:
                warnings.warn(
                    "Overwriting non-ingestion description from ingestion is an anti-pattern.",
                    category=IngestionAttributionWarning,
                    stacklevel=2,
                )
                # Force the ingestion description to show up.
                editable_props.description = None

            self._ensure_datajob_props().description = description
        else:
            self._ensure_editable_props().description = description

    @property
    def name(self) -> str:
        """Get the name of the data job."""
        return self.urn.job_id

    @property
    def display_name(self) -> Optional[str]:
        """Get the display name of the data job."""
        return self._ensure_datajob_props().name

    def set_display_name(self, display_name: str) -> None:
        """Set the display name of the data job."""
        self._ensure_datajob_props().name = display_name

    @property
    def external_url(self) -> Optional[str]:
        """Get the external URL of the data job."""
        return self._ensure_datajob_props().externalUrl

    def set_external_url(self, external_url: str) -> None:
        """Set the external URL of the data job."""
        self._ensure_datajob_props().externalUrl = external_url

    @property
    def custom_properties(self) -> Dict[str, str]:
        """Get the custom properties of the data job."""
        return self._ensure_datajob_props().customProperties

    def set_custom_properties(self, custom_properties: Dict[str, str]) -> None:
        """Set the custom properties of the data job."""
        self._ensure_datajob_props().customProperties = custom_properties

    @property
    def created(self) -> Optional[datetime]:
        """Get the creation timestamp of the data job."""
        return parse_time_stamp(self._ensure_datajob_props().created)

    def set_created(self, created: datetime) -> None:
        """Set the creation timestamp of the data job."""
        self._ensure_datajob_props().created = make_time_stamp(created)

    @property
    def last_modified(self) -> Optional[datetime]:
        """Get the last modification timestamp of the data job."""
        return parse_time_stamp(self._ensure_datajob_props().lastModified)

    def set_last_modified(self, last_modified: datetime) -> None:
        """Set the last modification timestamp of the data job."""
        self._ensure_datajob_props().lastModified = make_time_stamp(last_modified)

    @property
    def flow_urn(self) -> DataFlowUrn:
        """Get the URN of the data flow associated with the data job."""
        return self.urn.get_data_flow_urn()

    def _set_browse_path_from_flow(self, flow: DataFlow) -> None:
        """Set this job's browse path to the flow's browse path plus the flow."""
        flow_browse_path = flow._get_aspect(models.BrowsePathsV2Class)

        # extend the flow's browse path with this job
        browse_path: List[models.BrowsePathEntryClass] = []
        if flow_browse_path is not None:
            # Copy the entries rather than sharing the flow's objects.
            browse_path.extend(
                models.BrowsePathEntryClass(id=entry.id, urn=entry.urn)
                for entry in flow_browse_path.path
            )
        browse_path.append(models.BrowsePathEntryClass(id=flow.name, urn=str(flow.urn)))
        # Set the browse path aspect
        self._set_aspect(models.BrowsePathsV2Class(path=browse_path))

    # TODO: support datajob input/output
    @property
    def inlets(self) -> List[DatasetUrn]:
        """Get the inlets of the data job."""
        inlets = self._ensure_datajob_inputoutput_props().inputDatasets
        return [DatasetUrn.from_string(inlet) for inlet in inlets]

    def set_inlets(self, inlets: List[DatasetUrnOrStr]) -> None:
        """Add the given datasets to the inlets of the data job.

        Note:
            Entries are appended to any inlets already recorded; existing
            inlets are not cleared.
        """
        for inlet in inlets:
            inlet_urn = DatasetUrn.from_string(inlet)  # type checking
            self._ensure_datajob_inputoutput_props().inputDatasets.append(
                str(inlet_urn)
            )

    @property
    def outlets(self) -> List[DatasetUrn]:
        """Get the outlets of the data job."""
        outlets = self._ensure_datajob_inputoutput_props().outputDatasets
        return [DatasetUrn.from_string(outlet) for outlet in outlets]

    def set_outlets(self, outlets: List[DatasetUrnOrStr]) -> None:
        """Add the given datasets to the outlets of the data job.

        Note:
            Entries are appended to any outlets already recorded; existing
            outlets are not cleared.
        """
        for outlet in outlets:
            outlet_urn = DatasetUrn.from_string(outlet)  # type checking
            self._ensure_datajob_inputoutput_props().outputDatasets.append(
                str(outlet_urn)
            )

    @property
    def fine_grained_lineages(self) -> List[models.FineGrainedLineageClass]:
        """Get the fine-grained lineages of the data job (empty list if none)."""
        io_aspect = self._get_datajob_inputoutput_props()
        return (
            io_aspect.fineGrainedLineages
            if io_aspect and io_aspect.fineGrainedLineages
            else []
        )

    def set_fine_grained_lineages(
        self, lineages: List[models.FineGrainedLineageClass]
    ) -> None:
        """Add fine-grained lineages to the data job.

        Note:
            The given lineages are appended to any already recorded.
        """
        io_aspect = self._ensure_datajob_inputoutput_props()
        if io_aspect.fineGrainedLineages is None:
            io_aspect.fineGrainedLineages = []
        io_aspect.fineGrainedLineages.extend(lineages)

    @property
    def env(self) -> Optional[str]:
        """Get the environment of the data job.

        Returns:
            The environment as a string, or None when no environment is
            recorded on the DataJobInfo aspect.
        """
        env = self._ensure_datajob_props().env
        # str(None) would yield the literal "None", contradicting Optional[str].
        return None if env is None else str(env)