acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.

This version of acryl-datahub has been flagged as a potentially problematic release.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dbt/dbt_core.py

@@ -1,3 +1,4 @@
+import dataclasses
 import json
 import logging
 import re
@@ -12,7 +13,9 @@ from pydantic import BaseModel, Field, validator
 
 from datahub.configuration.git import GitReference
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
     capability,
     config_class,
@@ -21,7 +24,6 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import (
     CapabilityReport,
-    SourceCapability,
     TestableSource,
     TestConnectionReport,
 )
@@ -40,19 +42,28 @@ from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult
 logger = logging.getLogger(__name__)
 
 
+@dataclasses.dataclass
+class DBTCoreReport(DBTSourceReport):
+    catalog_info: Optional[dict] = None
+    manifest_info: Optional[dict] = None
+
+
 class DBTCoreConfig(DBTCommonConfig):
     manifest_path: str = Field(
-        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json Note "
-        "this can be a local file or a URI."
+        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json. "
+        "This can be a local file or a URI."
     )
-    catalog_path: str = Field(
-        description="Path to dbt catalog JSON. See https://docs.getdbt.com/reference/artifacts/catalog-json Note this "
-        "can be a local file or a URI."
+    catalog_path: Optional[str] = Field(
+        None,
+        description="Path to dbt catalog JSON. See https://docs.getdbt.com/reference/artifacts/catalog-json. "
+        "This file is optional, but highly recommended. Without it, some metadata like column info will be incomplete or missing. "
+        "This can be a local file or a URI.",
    )
     sources_path: Optional[str] = Field(
         default=None,
-        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json. If not "
-        "specified, last-modified fields will not be populated. Note this can be a local file or a URI.",
+        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json. "
+        "If not specified, last-modified fields will not be populated. "
+        "This can be a local file or a URI.",
     )
     run_results_paths: List[str] = Field(
         default=[],
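With `catalog_path` now optional on `DBTCoreConfig`, a dbt-core recipe only needs the manifest; supplying the catalog is still recommended so column-level metadata stays complete. Below is a minimal programmatic sketch of such a run; the file paths, `target_platform`, and server URL are placeholders, not values taken from this diff.

from datahub.ingestion.run.pipeline import Pipeline

# Sketch only: paths and server address below are hypothetical.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "dbt",
            "config": {
                "manifest_path": "./target/manifest.json",
                # catalog_path may now be omitted; include it when available so
                # column-level metadata is complete.
                # "catalog_path": "./target/catalog.json",
                "target_platform": "snowflake",
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()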
@@ -161,7 +172,7 @@ def get_columns(
 
 def extract_dbt_entities(
     all_manifest_entities: Dict[str, Dict[str, Any]],
-    all_catalog_entities: Dict[str, Dict[str, Any]],
+    all_catalog_entities: Optional[Dict[str, Dict[str, Any]]],
     sources_results: List[Dict[str, Any]],
     manifest_adapter: str,
     use_identifiers: bool,
@@ -186,15 +197,6 @@
         ):
             name = manifest_node["alias"]
 
-        # initialize comment to "" for consistency with descriptions
-        # (since dbt null/undefined descriptions as "")
-        comment = ""
-
-        if key in all_catalog_entities and all_catalog_entities[key]["metadata"].get(
-            "comment"
-        ):
-            comment = all_catalog_entities[key]["metadata"]["comment"]
-
         materialization = None
         if "materialized" in manifest_node.get("config", {}):
             # It's a model
@@ -204,8 +206,9 @@
         if "depends_on" in manifest_node and "nodes" in manifest_node["depends_on"]:
             upstream_nodes = manifest_node["depends_on"]["nodes"]
 
-        # It's a source
-        catalog_node = all_catalog_entities.get(key)
+        catalog_node = (
+            all_catalog_entities.get(key) if all_catalog_entities is not None else None
+        )
         missing_from_catalog = catalog_node is None
         catalog_type = None
 
@@ -214,16 +217,23 @@
                 # Test and ephemeral nodes will never show up in the catalog.
                 missing_from_catalog = False
             else:
-                if not only_include_if_in_catalog:
+                if all_catalog_entities is not None and not only_include_if_in_catalog:
+                    # If the catalog file is missing, we have already generated a general message.
                    report.warning(
                        title="Node missing from catalog",
                        message="Found a node in the manifest file but not in the catalog. "
                        "This usually means the catalog file was not generated by `dbt docs generate` and so is incomplete. "
-                        "Some metadata, such as column types and descriptions, will be impacted.",
+                        "Some metadata, particularly schema information, will be impacted.",
                        context=key,
                    )
         else:
-            catalog_type = all_catalog_entities[key]["metadata"]["type"]
+            catalog_type = catalog_node["metadata"]["type"]
+
+        # initialize comment to "" for consistency with descriptions
+        # (since dbt null/undefined descriptions as "")
+        comment = ""
+        if catalog_node is not None and catalog_node.get("metadata", {}).get("comment"):
+            comment = catalog_node["metadata"]["comment"]
 
         query_tag_props = manifest_node.get("query_tag", {})
 
@@ -231,12 +241,15 @@
 
         owner = meta.get("owner")
         if owner is None:
-            owner = manifest_node.get("config", {}).get("meta", {}).get("owner")
+            owner = (manifest_node.get("config", {}).get("meta") or {}).get("owner")
+
+        if not meta:
+            # On older versions of dbt, the meta field was nested under config
+            # for some node types.
+            meta = manifest_node.get("config", {}).get("meta") or {}
 
         tags = manifest_node.get("tags", [])
         tags = [tag_prefix + tag for tag in tags]
-        if not meta:
-            meta = manifest_node.get("config", {}).get("meta", {})
 
         max_loaded_at_str = sources_by_id.get(key, {}).get("max_loaded_at")
         max_loaded_at = None
@@ -343,6 +356,9 @@ class DBTRunResult(BaseModel):
     def timing_map(self) -> Dict[str, DBTRunTiming]:
         return {x.name: x for x in self.timing if x.name}
 
+    def has_success_status(self) -> bool:
+        return self.status in ("pass", "success")
+
 
 class DBTRunMetadata(BaseModel):
     dbt_schema_version: str
@@ -355,12 +371,7 @@ def _parse_test_result(
     dbt_metadata: DBTRunMetadata,
     run_result: DBTRunResult,
 ) -> Optional[DBTTestResult]:
-    if run_result.status == "success":
-        # This was probably a docs generate run result, so this isn't actually
-        # a test result.
-        return None
-
-    if run_result.status != "pass":
+    if not run_result.has_success_status():
         native_results = {"message": run_result.message or ""}
         if run_result.failures:
             native_results.update({"failures": str(run_result.failures)})
@@ -455,15 +466,19 @@
 @platform_name("dbt")
 @config_class(DBTCoreConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class DBTCoreSource(DBTSourceBase, TestableSource):
     config: DBTCoreConfig
+    report: DBTCoreReport
+
+    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
+        super().__init__(config, ctx)
+        self.report = DBTCoreReport()
 
     @classmethod
     def create(cls, config_dict, ctx):
         config = DBTCoreConfig.parse_obj(config_dict)
-        return cls(config, ctx, "dbt")
+        return cls(config, ctx)
 
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
@@ -473,9 +488,10 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             DBTCoreSource.load_file_as_json(
                 source_config.manifest_path, source_config.aws_connection
             )
-            DBTCoreSource.load_file_as_json(
-                source_config.catalog_path, source_config.aws_connection
-            )
+            if source_config.catalog_path is not None:
+                DBTCoreSource.load_file_as_json(
+                    source_config.catalog_path, source_config.aws_connection
+                )
             test_report.basic_connectivity = CapabilityReport(capable=True)
         except Exception as e:
             test_report.basic_connectivity = CapabilityReport(
@@ -513,11 +529,31 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         dbt_manifest_json = self.load_file_as_json(
             self.config.manifest_path, self.config.aws_connection
         )
-
-        dbt_catalog_json = self.load_file_as_json(
-            self.config.catalog_path, self.config.aws_connection
+        dbt_manifest_metadata = dbt_manifest_json["metadata"]
+        self.report.manifest_info = dict(
+            generated_at=dbt_manifest_metadata.get("generated_at", "unknown"),
+            dbt_version=dbt_manifest_metadata.get("dbt_version", "unknown"),
+            project_name=dbt_manifest_metadata.get("project_name", "unknown"),
         )
 
+        dbt_catalog_json = None
+        dbt_catalog_metadata = None
+        if self.config.catalog_path is not None:
+            dbt_catalog_json = self.load_file_as_json(
+                self.config.catalog_path, self.config.aws_connection
+            )
+            dbt_catalog_metadata = dbt_catalog_json.get("metadata", {})
+            self.report.catalog_info = dict(
+                generated_at=dbt_catalog_metadata.get("generated_at", "unknown"),
+                dbt_version=dbt_catalog_metadata.get("dbt_version", "unknown"),
+                project_name=dbt_catalog_metadata.get("project_name", "unknown"),
+            )
+        else:
+            self.report.warning(
+                title="No catalog file configured",
+                message="Some metadata, particularly schema information, will be missing.",
+            )
+
         if self.config.sources_path is not None:
             dbt_sources_json = self.load_file_as_json(
                 self.config.sources_path, self.config.aws_connection
@@ -530,18 +566,23 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         manifest_version = dbt_manifest_json["metadata"].get("dbt_version")
         manifest_adapter = dbt_manifest_json["metadata"].get("adapter_type")
 
-        catalog_schema = dbt_catalog_json.get("metadata", {}).get("dbt_schema_version")
-        catalog_version = dbt_catalog_json.get("metadata", {}).get("dbt_version")
+        catalog_schema = None
+        catalog_version = None
+        if dbt_catalog_metadata is not None:
+            catalog_schema = dbt_catalog_metadata.get("dbt_schema_version")
+            catalog_version = dbt_catalog_metadata.get("dbt_version")
 
         manifest_nodes = dbt_manifest_json["nodes"]
         manifest_sources = dbt_manifest_json["sources"]
 
         all_manifest_entities = {**manifest_nodes, **manifest_sources}
 
-        catalog_nodes = dbt_catalog_json["nodes"]
-        catalog_sources = dbt_catalog_json["sources"]
+        all_catalog_entities = None
+        if dbt_catalog_json is not None:
+            catalog_nodes = dbt_catalog_json["nodes"]
+            catalog_sources = dbt_catalog_json["sources"]
 
-        all_catalog_entities = {**catalog_nodes, **catalog_sources}
+            all_catalog_entities = {**catalog_nodes, **catalog_sources}
 
         nodes = extract_dbt_entities(
             all_manifest_entities=all_manifest_entities,
@@ -592,7 +633,7 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             )
         except Exception as e:
            self.report.info(
-                title="Dbt Catalog Version",
+                title="dbt Catalog Version",
                message="Failed to determine the catalog version",
                exc=e,
            )
datahub/ingestion/source/dbt/dbt_tests.py

@@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union
 
 from datahub.emitter import mce_builder
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     AssertionInfoClass,
     AssertionResultClass,
@@ -43,6 +42,9 @@ class DBTTestResult:
 
     native_results: Dict[str, str]
 
+    def has_success_status(self) -> bool:
+        return self.status in ("pass", "success")
+
 
 def _get_name_for_relationship_test(kw_args: Dict[str, str]) -> Optional[str]:
     """
@@ -157,7 +159,7 @@ def make_assertion_from_test(
     node: "DBTNode",
     assertion_urn: str,
     upstream_urn: str,
-) -> MetadataWorkUnit:
+) -> MetadataChangeProposalWrapper:
     assert node.test_info
     qualified_test_name = node.test_info.qualified_test_name
     column_name = node.test_info.column_name
@@ -231,7 +233,7 @@
     return MetadataChangeProposalWrapper(
         entityUrn=assertion_urn,
         aspect=assertion_info,
-    ).as_workunit()
+    )
 
 
 def make_assertion_result_from_test(
@@ -240,7 +242,7 @@
     assertion_urn: str,
     upstream_urn: str,
     test_warnings_are_errors: bool,
-) -> MetadataWorkUnit:
+) -> MetadataChangeProposalWrapper:
     assertionResult = AssertionRunEventClass(
         timestampMillis=int(test_result.execution_time.timestamp() * 1000.0),
         assertionUrn=assertion_urn,
@@ -249,7 +251,7 @@
         result=AssertionResultClass(
             type=(
                 AssertionResultTypeClass.SUCCESS
-                if test_result.status == "pass"
+                if test_result.has_success_status()
                 or (not test_warnings_are_errors and test_result.status == "warn")
                 else AssertionResultTypeClass.FAILURE
            ),
@@ -261,4 +263,4 @@
     return MetadataChangeProposalWrapper(
         entityUrn=assertion_urn,
         aspect=assertionResult,
-    ).as_workunit()
+    )
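The assertion helpers above now return a bare `MetadataChangeProposalWrapper` instead of a pre-wrapped `MetadataWorkUnit`, leaving the wrapping or emission decision to the caller. A minimal sketch of both options follows, assuming an MCP produced by one of these helpers; the server URL is a placeholder.

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.ingestion.api.workunit import MetadataWorkUnit


def to_workunit(mcp: MetadataChangeProposalWrapper) -> MetadataWorkUnit:
    # Option 1: keep the previous behavior by wrapping at the call site.
    return mcp.as_workunit()


def emit_directly(mcp: MetadataChangeProposalWrapper) -> None:
    # Option 2: send the proposal straight to a DataHub instance.
    emitter = DatahubRestEmitter(gms_server="http://localhost:8080")  # placeholder URL
    emitter.emit(mcp)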
datahub/ingestion/source/debug/__init__.py — added as an empty file (no content changes)
datahub/ingestion/source/debug/datahub_debug.py (new file)

@@ -0,0 +1,300 @@
+import logging
+import socket
+import time
+from typing import Iterable, Optional
+from urllib.parse import urlparse
+
+import dns.exception
+import dns.resolver
+import requests
+
+from datahub.configuration.common import ConfigModel
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SupportStatus,
+    config_class,
+    platform_name,
+    support_status,
+)
+from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+
+logger = logging.getLogger(__name__)
+
+
+class DataHubDebugSourceConfig(ConfigModel):
+    dns_probe_url: Optional[str] = None
+
+
+@platform_name("DataHubDebug")
+@config_class(DataHubDebugSourceConfig)
+@support_status(SupportStatus.TESTING)
+class DataHubDebugSource(Source):
+    """
+    DataHubDebugSource is helper to debug things in executor where ingestion is running.
+
+    This source can perform the following tasks:
+    1. Network probe of a URL. Different from test connection in sources as that is after source starts.
+
+    """
+
+    def __init__(self, ctx: PipelineContext, config: DataHubDebugSourceConfig):
+        self.ctx = ctx
+        self.config = config
+        self.report = SourceReport()
+        self.report.event_not_produced_warn = False
+
+    @classmethod
+    def create(cls, config_dict, ctx):
+        config = DataHubDebugSourceConfig.parse_obj(config_dict)
+        return cls(ctx, config)
+
+    def perform_dns_probe(self, url: str) -> None:
+        """
+        Perform comprehensive DNS probe and network connectivity tests.
+        Logs detailed information to help diagnose network issues.
+        """
+        logger.info(f"Starting DNS probe for URL: {url}")
+        logger.info("=" * 60)
+        logger.info(f"DNS PROBE REPORT FOR: {url}")
+        logger.info("=" * 60)
+
+        try:
+            # Parse the URL to extract hostname
+            parsed_url = urlparse(
+                url if url.startswith(("http://", "https://")) else f"http://{url}"
+            )
+            hostname = parsed_url.hostname or parsed_url.netloc
+            port = parsed_url.port or (443 if parsed_url.scheme == "https" else 80)
+
+            logger.info(f"Parsed hostname: {hostname}")
+            logger.info(f"Target port: {port}")
+            logger.info(f"URL scheme: {parsed_url.scheme}")
+            logger.info("-" * 60)
+
+            # Test 1: Enhanced DNS resolution with dnspython if available
+            logger.info("1. DNS RESOLUTION TEST")
+            self._dns_probe_with_dnspython(hostname)
+
+            logger.info("-" * 60)
+
+            # Test 2: HTTP/HTTPS connectivity test with requests if available
+            logger.info("2. HTTP CONNECTIVITY TEST")
+            self._http_probe_with_requests(url)
+
+            logger.info("-" * 60)
+
+            # Test 3: System network information
+            logger.info("3. SYSTEM NETWORK INFORMATION")
+            self._log_system_network_info()
+
+        except Exception as e:
+            logger.error(f"DNS probe failed with unexpected error: {e}", exc_info=True)
+
+        logger.info("=" * 60)
+        logger.info("DNS PROBE COMPLETED")
+        logger.info("=" * 60)
+
+    def _dns_probe_with_dnspython(self, hostname: str) -> None:
+        """Enhanced DNS probing using dnspython library"""
+        try:
+            # Test different record types
+            record_types = ["A", "AAAA", "CNAME", "MX"]
+
+            for record_type in record_types:
+                try:
+                    start_time = time.time()
+                    answers = dns.resolver.resolve(hostname, record_type)
+                    dns_time = time.time() - start_time
+
+                    logger.info(
+                        f"✓ {record_type} record resolution successful ({dns_time:.3f}s)"
+                    )
+                    for answer in answers:
+                        logger.info(f"  - {record_type}: {answer}")
+
+                except dns.resolver.NXDOMAIN:
+                    logger.info(f"✗ {record_type} record: Domain does not exist")
+                except dns.resolver.NoAnswer:
+                    logger.info(
+                        f"- {record_type} record: No answer (record type not available)"
+                    )
+                except dns.exception.Timeout:
+                    logger.error(f"✗ {record_type} record: DNS query timed out")
+                except Exception as e:
+                    logger.error(f"✗ {record_type} record query failed: {e}")
+
+            # Test different DNS servers
+            logger.info("Testing with different DNS servers:")
+            dns_servers = ["8.8.8.8", "1.1.1.1", "208.67.222.222"]
+
+            for dns_server in dns_servers:
+                try:
+                    resolver = dns.resolver.Resolver()
+                    resolver.nameservers = [dns_server]
+                    resolver.timeout = 5
+
+                    start_time = time.time()
+                    answers = resolver.resolve(hostname, "A")
+                    dns_time = time.time() - start_time
+
+                    logger.info(
+                        f"✓ DNS server {dns_server} responded ({dns_time:.3f}s)"
+                    )
+                    for answer in answers:
+                        logger.info(f"  - A: {answer}")
+
+                except Exception as e:
+                    logger.error(f"✗ DNS server {dns_server} failed: {e}")
+
+        except Exception as e:
+            logger.error(f"Enhanced DNS probe failed: {e}", exc_info=True)
+
+    def _http_probe_with_requests(self, url: str) -> None:
+        """HTTP connectivity test using requests library"""
+        try:
+            # Test with different timeouts and methods
+            timeout = 10
+            allow_redirects_head = True
+            allow_redirects_get = False
+
+            # Test HEAD request
+            try:
+                logger.info(f"Testing HEAD request with timeout {timeout}s")
+                start_time = time.time()
+
+                response = requests.head(
+                    url, timeout=timeout, allow_redirects=allow_redirects_head
+                )
+
+                request_time = time.time() - start_time
+
+                logger.info(f"✓ HEAD request successful ({request_time:.3f}s)")
+                logger.info(f"  Status code: {response.status_code}")
+                logger.info(
+                    f"  Response headers: {dict(list(response.headers.items())[:5])}"
+                )
+
+                if hasattr(response, "url") and response.url != url:
+                    logger.info(f"  Final URL after redirects: {response.url}")
+
+            except requests.exceptions.Timeout:
+                logger.error(f"✗ HEAD request timed out after {timeout}s")
+            except requests.exceptions.ConnectionError as e:
+                logger.error(f"✗ HEAD connection error: {e}")
+            except requests.exceptions.RequestException as e:
+                logger.error(f"✗ HEAD request failed: {e}")
+            except Exception as e:
+                logger.error(f"✗ HEAD unexpected error: {e}")
+
+            # Test GET request
+            try:
+                logger.info(f"Testing GET request with timeout {timeout}s")
+                start_time = time.time()
+
+                response = requests.get(
+                    url, timeout=timeout, allow_redirects=allow_redirects_get
+                )
+
+                request_time = time.time() - start_time
+
+                logger.info(f"✓ GET request successful ({request_time:.3f}s)")
+                logger.info(f"  Status code: {response.status_code}")
+                logger.info(
+                    f"  Response headers: {dict(list(response.headers.items())[:5])}"
+                )
+
+                if hasattr(response, "url") and response.url != url:
+                    logger.info(f"  Final URL after redirects: {response.url}")
+
+            except requests.exceptions.Timeout:
+                logger.error(f"✗ GET request timed out after {timeout}s")
+            except requests.exceptions.ConnectionError as e:
+                logger.error(f"✗ GET connection error: {e}")
+            except requests.exceptions.RequestException as e:
+                logger.error(f"✗ GET request failed: {e}")
+            except Exception as e:
+                logger.error(f"✗ GET unexpected error: {e}")
+
+        except Exception as e:
+            logger.error(f"HTTP probe failed: {e}", exc_info=True)
+
+    def _log_dns_troubleshooting(self) -> None:
+        """Log DNS troubleshooting information"""
+        logger.info("DNS TROUBLESHOOTING SUGGESTIONS:")
+        logger.info("- Check if the hostname is correct")
+        logger.info("- Verify DNS server configuration")
+        logger.info("- Check network connectivity")
+        logger.info("- Try using a different DNS server (8.8.8.8, 1.1.1.1)")
+        logger.info("- Check if there are firewall restrictions")
+
+    def _log_system_network_info(self) -> None:
+        """Log system network configuration information"""
+        try:
+            local_hostname = socket.gethostname()
+            logger.info(f"Local hostname: {local_hostname}")
+
+            try:
+                local_ips = socket.getaddrinfo(local_hostname, None)
+                logger.info("Local IP addresses:")
+                for addr_info in local_ips:
+                    if addr_info[0] in [socket.AF_INET, socket.AF_INET6]:
+                        family = "IPv4" if addr_info[0] == socket.AF_INET else "IPv6"
+                        logger.info(f"  - {addr_info[4][0]} ({family})")
+            except Exception as e:
+                logger.warning(f"Could not retrieve local IP addresses: {e}")
+
+            logger.info("DNS Server Connectivity:")
+            dns_servers = ["8.8.8.8", "1.1.1.1", "208.67.222.222"]
+            for dns_server in dns_servers:
+                try:
+                    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+                    sock.settimeout(5)
+                    result = sock.connect_ex((dns_server, 53))
+                    if result == 0:
+                        logger.info(f"  ✓ Can reach {dns_server}:53")
+                    else:
+                        logger.error(f"  ✗ Cannot reach {dns_server}:53")
+                    sock.close()
+                except Exception as e:
+                    logger.error(f"  ✗ Error testing {dns_server}:53 - {e}")
+
+        except Exception as e:
+            logger.warning(f"Could not gather system network info: {e}")
+
+    def _test_alternative_dns(self, hostname: str) -> None:
+        """Test hostname resolution using alternative methods"""
+        try:
+            families = [(socket.AF_INET, "IPv4"), (socket.AF_INET6, "IPv6")]
+
+            for family, family_name in families:
+                try:
+                    result = socket.getaddrinfo(hostname, None, family)
+                    if result:
+                        logger.info(f"✓ {family_name} resolution successful:")
+                        for addr_info in result[:3]:
+                            logger.info(f"  - {addr_info[4][0]}")
+                    else:
+                        logger.warning(
+                            f"✗ {family_name} resolution returned no results"
+                        )
+                except socket.gaierror:
+                    logger.error(f"✗ {family_name} resolution failed")
+                except Exception as e:
+                    logger.error(f"✗ {family_name} resolution error: {e}")
+
+        except Exception as e:
+            logger.error(f"Alternative DNS test failed: {e}")
+
+    def get_workunits_internal(
+        self,
+    ) -> Iterable[MetadataWorkUnit]:
+        if self.config.dns_probe_url is not None:
+            # Perform DNS probe
+            logger.info(f"Performing DNS probe for: {self.config.dns_probe_url}")
+            self.perform_dns_probe(self.config.dns_probe_url)
+
+        yield from []
+
+    def get_report(self) -> SourceReport:
+        return self.report
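The new `DataHubDebugSource` runs DNS and HTTP probes from inside the ingestion executor before any real source starts, which is useful for diagnosing network issues. A minimal sketch of running it programmatically; the source type key `datahub-debug` and the probe URL are assumptions for illustration, since the registry name comes from entry_points and is not shown in this section.

from datahub.ingestion.run.pipeline import Pipeline

# Sketch only: "datahub-debug" is an assumed registry name and the URL is a placeholder.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "datahub-debug",
            "config": {"dns_probe_url": "https://datahub.example.com:8080"},
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()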
datahub/ingestion/source/delta_lake/config.py

@@ -13,8 +13,9 @@ from datahub.configuration.source_common import (
 )
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
 from datahub.ingestion.source.aws.s3_util import is_s3_uri
-from datahub.ingestion.source.state.stateful_ingestion_base import (
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulIngestionConfigBase,
+    StatefulStaleMetadataRemovalConfig,
 )
 
 # hide annoying debug errors from py4j
@@ -39,9 +40,7 @@ class S3(ConfigModel):
 
 
 class DeltaLakeSourceConfig(
-    PlatformInstanceConfigMixin,
-    EnvConfigMixin,
-    StatefulIngestionConfigBase,
+    PlatformInstanceConfigMixin, EnvConfigMixin, StatefulIngestionConfigBase
 ):
     base_path: str = Field(
         description="Path to table (s3 or local file system). If path is not a delta table path "
@@ -78,7 +77,12 @@ class DeltaLakeSourceConfig(
         "When set to `False`, number_of_files in delta table can not be reported.",
     )
 
-    s3: Optional[S3] = Field()
+    s3: Optional[S3] = Field(None)
+
+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
+        default=None,
+        description="Stateful Ingestion Config with stale metadata removal",
+    )
 
     @cached_property
     def is_s3(self):
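With `stateful_ingestion` now exposed on `DeltaLakeSourceConfig`, stale-metadata removal can be enabled for delta-lake runs. A minimal programmatic sketch follows, assuming a named pipeline and a REST sink; the table path and server URL are placeholders.

from datahub.ingestion.run.pipeline import Pipeline

# Sketch only: base_path and server are hypothetical values.
pipeline = Pipeline.create(
    {
        "pipeline_name": "delta_lake_example",  # stateful ingestion needs a stable name
        "source": {
            "type": "delta-lake",
            "config": {
                "base_path": "s3://my-bucket/path/to/delta-table",
                "stateful_ingestion": {
                    "enabled": True,
                    "remove_stale_metadata": True,
                },
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()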