acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/datahub/config.py

@@ -4,7 +4,7 @@ from typing import Optional, Set
 import pydantic
 from pydantic import Field, root_validator

-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.kafka import KafkaConsumerConnectionConfig
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
 from datahub.ingestion.source.state.stateful_ingestion_base import (
@@ -98,16 +98,14 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         ),
     )

-    pull_from_datahub_api: bool = Field(
+    pull_from_datahub_api: HiddenFromDocs[bool] = Field(
         default=False,
         description="Use the DataHub API to fetch versioned aspects.",
-        hidden_from_docs=True,
     )

-    max_workers: int = Field(
+    max_workers: HiddenFromDocs[int] = Field(
         default=5 * (os.cpu_count() or 4),
         description="Number of worker threads to use for datahub api ingestion.",
-        hidden_from_docs=True,
     )

     urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())
@@ -118,10 +116,11 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
     )

-    structured_properties_template_cache_invalidation_interval: int = Field(
-        hidden_from_docs=True,
-        default=60,
-        description="Interval in seconds to invalidate the structured properties template cache.",
+    structured_properties_template_cache_invalidation_interval: HiddenFromDocs[int] = (
+        Field(
+            default=60,
+            description="Interval in seconds to invalidate the structured properties template cache.",
+        )
     )

     query_timeout: Optional[int] = Field(
@@ -129,6 +128,10 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         description="Timeout for each query in seconds. ",
     )

+    preserve_system_metadata: bool = Field(
+        default=True, description="Copy system metadata from the source system"
+    )
+
     @root_validator(skip_on_failure=True)
     def check_ingesting_data(cls, values):
         if (
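
The HiddenFromDocs[...] annotation used above replaces the older hidden_from_docs=True keyword on Field. A minimal, illustrative sketch of how an Annotated-based marker like this can be detected by a docs generator; this is an assumption about the mechanism, not the actual datahub.configuration.common implementation:

    # Illustrative only: one way a HiddenFromDocs[...] marker can be built so that a
    # docs generator can detect hidden fields purely from the type annotation.
    from typing import Annotated, get_type_hints


    class _HiddenFromDocsMarker:
        """Sentinel carried inside the Annotated metadata."""


    class HiddenFromDocs:
        def __class_getitem__(cls, item):
            return Annotated[item, _HiddenFromDocsMarker]


    class ExampleConfig:
        max_workers: HiddenFromDocs[int] = 5
        query_timeout: int = 30


    hints = get_type_hints(ExampleConfig, include_extras=True)
    hidden = [
        name
        for name, hint in hints.items()
        if _HiddenFromDocsMarker in getattr(hint, "__metadata__", ())
    ]
    print(hidden)  # ['max_workers']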

datahub/ingestion/source/datahub/datahub_database_reader.py

@@ -12,7 +12,7 @@ from datahub.emitter.serialization_helper import post_json_transform
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.report import DataHubSourceReport
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
-from datahub.metadata.schema_classes import ChangeTypeClass, SystemMetadataClass
+from datahub.metadata.schema_classes import SystemMetadataClass
 from datahub.utilities.lossy_collections import LossyDict, LossyList

 logger = logging.getLogger(__name__)
@@ -104,6 +104,22 @@ class DataHubDatabaseReader:
         ORDER BY mav.urn
     """

+    def _get_json_extract_expression(self) -> str:
+        """
+        Returns the appropriate JSON extraction expression based on the database dialect.
+
+        Returns:
+            Database-specific JSON extraction expression
+        """
+        # Return the correct JSON extraction expression for the "removed" field,
+        # depending on the database dialect.
+        if self.engine.dialect.name == "postgresql":
+            # For PostgreSQL, cast the metadata column to JSON and extract the 'removed' key as boolean.
+            return "((metadata::json)->>'removed')::boolean"
+        else:
+            # For other databases (e.g., MySQL), use JSON_EXTRACT.
+            return "JSON_EXTRACT(metadata, '$.removed')"
+
     def query(self, set_structured_properties_filter: bool) -> str:
         """
         Main query that gets data for specified date range with appropriate filters.
@@ -125,7 +141,7 @@
             LEFT JOIN (
                 SELECT
                     *,
-                    JSON_EXTRACT(metadata, '$.removed') as removed
+                    {self._get_json_extract_expression()} as removed
                 FROM {self.engine.dialect.identifier_preparer.quote(self.config.database_table_name)}
                 WHERE aspect = 'status'
                     AND version = 0
@@ -241,15 +257,10 @@
             "end_createdon": end_date.strftime(DATETIME_FORMAT),
             "limit": limit,
             "offset": offset,
+            # Always pass exclude_aspects as a tuple, postgres doesn't support lists
+            "exclude_aspects": tuple(self.config.exclude_aspects),
         }

-        # Add exclude_aspects if needed
-        if (
-            hasattr(self.config, "exclude_aspects")
-            and self.config.exclude_aspects
-        ):
-            params["exclude_aspects"] = tuple(self.config.exclude_aspects)
-
         logger.info(
             f"Querying data from {start_date.strftime(DATETIME_FORMAT)} to {end_date.strftime(DATETIME_FORMAT)} "
             f"with limit {limit} and offset {offset} (inclusive range)"
@@ -369,12 +380,16 @@
             json_metadata = post_json_transform(
                 json.loads(row["systemmetadata"] or "{}")
             )
-            system_metadata = SystemMetadataClass.from_obj(json_metadata)
+            system_metadata = None
+            if self.config.preserve_system_metadata:
+                system_metadata = SystemMetadataClass.from_obj(json_metadata)
+                if system_metadata.properties:
+                    is_no_op = system_metadata.properties.pop("isNoOp", None)
+                    logger.debug(f"Removed potential value for is_no_op={is_no_op}")
             return MetadataChangeProposalWrapper(
                 entityUrn=row["urn"],
                 aspect=ASPECT_MAP[row["aspect"]].from_obj(json_aspect),
                 systemMetadata=system_metadata,
-                changeType=ChangeTypeClass.UPSERT,
             )
         except Exception as e:
             logger.warning(
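
The new _get_json_extract_expression helper above selects dialect-specific SQL for reading the soft-delete flag. A small standalone sketch of the same branch; the sqlite engine is only a stand-in to show the non-Postgres fallback path:

    # Assumes SQLAlchemy is installed (it is a dependency of this package).
    from sqlalchemy import create_engine


    def removed_expression(engine) -> str:
        # Mirrors the branch above: Postgres needs a cast, other dialects use JSON_EXTRACT.
        if engine.dialect.name == "postgresql":
            return "((metadata::json)->>'removed')::boolean"
        return "JSON_EXTRACT(metadata, '$.removed')"


    engine = create_engine("sqlite://")
    print(removed_expression(engine))  # JSON_EXTRACT(metadata, '$.removed')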

datahub/ingestion/source/datahub/datahub_source.py

@@ -6,7 +6,9 @@ from typing import Dict, Iterable, List, Optional
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -17,6 +19,7 @@ from datahub.ingestion.api.source_helpers import (
     auto_workunit_reporter,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
 from datahub.ingestion.source.datahub.datahub_database_reader import (
@@ -37,6 +40,13 @@ logger = logging.getLogger(__name__)
 @platform_name("DataHub")
 @config_class(DataHubSourceConfig)
 @support_status(SupportStatus.TESTING)
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.DATABASE,
+    ],
+)
 class DataHubSource(StatefulIngestionSourceBase):
     platform: str = "datahub"


datahub/ingestion/source/dbt/dbt_cloud.py

@@ -9,7 +9,9 @@
 import requests
 from pydantic import Field, root_validator
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -24,6 +26,7 @@ from datahub.ingestion.source.dbt.dbt_common import (
     DBTCommonConfig,
     DBTNode,
     DBTSourceBase,
+    DBTSourceReport,
 )
 from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult

@@ -261,8 +264,10 @@ query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{
 @platform_name("dbt")
 @config_class(DBTCloudConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class DBTCloudSource(DBTSourceBase, TestableSource):
     config: DBTCloudConfig
+    report: DBTSourceReport  # nothing cloud-specific in the report

     @classmethod
     def create(cls, config_dict, ctx):
@@ -365,9 +370,12 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
         name = node["alias"]

         comment = node.get("comment", "")
-        description = node["description"]
-        if node.get("sourceDescription"):
-            description = node["sourceDescription"]
+
+        # In dbt sources, there are two types of descriptions:
+        # - description: table-level description (specific to the source table)
+        # - sourceDescription: schema-level description (describes the overall source schema)
+        # The table-level description should take precedence since it's more specific.
+        description = node["description"] or node.get("sourceDescription", "")

         if node["resourceType"] == "model":
             materialization = node["materializedType"]
@@ -401,8 +409,11 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
         if node["resourceType"] in {"model", "seed", "snapshot"}:
             status = node["status"]
             if status is None and materialization != "ephemeral":
-                self.report.report_warning(
-                    key, "node is missing a status, schema metadata will be incomplete"
+                self.report.warning(
+                    title="Schema information may be incomplete",
+                    message="Some nodes are missing the `status` field, which dbt uses to track the status of the node in the target database.",
+                    context=key,
+                    log=False,
                 )

         # The code fields are new in dbt 1.3, and replace the sql ones.
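
The description precedence change above can be illustrated with made-up node data: the table-level description wins, and an empty string now falls back to the schema-level sourceDescription.

    # Toy illustration only; the node dict mimics the fields read from the dbt Cloud API.
    node = {"description": "", "sourceDescription": "Raw events schema"}
    description = node["description"] or node.get("sourceDescription", "")
    print(description)  # Raw events schema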

datahub/ingestion/source/dbt/dbt_common.py

@@ -91,6 +91,7 @@ from datahub.metadata.schema_classes import (
     OwnershipClass,
     OwnershipSourceTypeClass,
     OwnershipTypeClass,
+    SiblingsClass,
     StatusClass,
     SubTypesClass,
     TagAssociationClass,
@@ -98,6 +99,7 @@ from datahub.metadata.schema_classes import (
     ViewPropertiesClass,
 )
 from datahub.metadata.urns import DatasetUrn
+from datahub.specific.dataset import DatasetPatchBuilder
 from datahub.sql_parsing.schema_resolver import SchemaInfo, SchemaResolver
 from datahub.sql_parsing.sqlglot_lineage import (
     SqlParsingDebugInfo,
@@ -120,6 +122,7 @@ logger = logging.getLogger(__name__)
 DBT_PLATFORM = "dbt"

 _DEFAULT_ACTOR = mce_builder.make_user_urn("unknown")
+_DBT_MAX_COMPILED_CODE_LENGTH = 1 * 1024 * 1024  # 1MB


 @dataclass
@@ -145,6 +148,9 @@ class DBTSourceReport(StaleEntityRemovalSourceReport):

     nodes_filtered: LossyList[str] = field(default_factory=LossyList)

+    duplicate_sources_dropped: Optional[int] = None
+    duplicate_sources_references_updated: Optional[int] = None
+

 class EmitDirective(ConfigEnum):
     """A holder for directives for emission for specific types of entities"""
@@ -240,6 +246,23 @@ class DBTEntitiesEnabled(ConfigModel):
         return self.model_performance == EmitDirective.YES


+class MaterializedNodePatternConfig(ConfigModel):
+    """Configuration for filtering materialized nodes based on their physical location"""
+
+    database_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for database names to filter materialized nodes.",
+    )
+    schema_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for schema names in format '{database}.{schema}' to filter materialized nodes.",
+    )
+    table_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for table/view names in format '{database}.{schema}.{table}' to filter materialized nodes.",
+    )
+
+
 class DBTCommonConfig(
     StatefulIngestionConfigBase,
     PlatformInstanceConfigMixin,
@@ -288,6 +311,11 @@ class DBTCommonConfig(
         default=AllowDenyPattern.allow_all(),
         description="regex patterns for dbt model names to filter in ingestion.",
     )
+    materialized_node_pattern: MaterializedNodePatternConfig = Field(
+        default=MaterializedNodePatternConfig(),
+        description="Advanced filtering for materialized nodes based on their physical database location. "
+        "Provides fine-grained control over database.schema.table patterns for catalog consistency.",
+    )
     meta_mapping: Dict = Field(
         default={},
         description="mapping rules that will be executed against dbt meta properties. Refer to the section below on dbt meta automated mappings.",
@@ -355,7 +383,7 @@ class DBTCommonConfig(
     # override default value to True.
     incremental_lineage: bool = Field(
         default=True,
-        description="When enabled, emits incremental/patch lineage for non-dbt entities. When disabled, re-states lineage on each run.",
+        description="When enabled, emits incremental/patch lineage for non-dbt entities. When disabled, re-states lineage on each run. This would also require enabling 'incremental_lineage' in the counterpart warehouse ingestion (_e.g._ BigQuery, Redshift, etc).",
     )

     _remove_use_compiled_code = pydantic_removed_field("use_compiled_code")
@@ -370,6 +398,20 @@ class DBTCommonConfig(
         "Set to False to skip it for engines like AWS Athena where it's not required.",
     )

+    dbt_is_primary_sibling: bool = Field(
+        default=True,
+        description="Experimental: Controls sibling relationship primary designation between dbt entities and target platform entities. "
+        "When True (default), dbt entities are primary and target platform entities are secondary. "
+        "When False, target platform entities are primary and dbt entities are secondary. "
+        "Uses aspect patches for precise control. Requires DataHub server 1.3.0+.",
+    )
+
+    drop_duplicate_sources: bool = Field(
+        default=True,
+        description="When enabled, drops sources that have the same name in the target platform as a model. "
+        "This ensures that lineage is generated reliably, but will lose any documentation associated only with the source.",
+    )
+
     @validator("target_platform")
     def validate_target_platform_value(cls, target_platform: str) -> str:
         if target_platform.lower() == DBT_PLATFORM:
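
For reference, a hedged sketch of how the new DBTCommonConfig options added above (materialized_node_pattern, dbt_is_primary_sibling, drop_duplicate_sources) might be set in a dbt ingestion recipe, written here as a Python dict; only the key names come from the config fields above, while the regex values and surrounding recipe layout are made up.

    # Hypothetical recipe fragment; AllowDenyPattern blocks take "allow"/"deny" regex lists.
    dbt_source_config = {
        "materialized_node_pattern": {
            "database_pattern": {"allow": ["analytics"]},
            "schema_pattern": {"deny": [r"analytics\.tmp_.*"]},
            "table_pattern": {"allow": [".*"]},
        },
        "dbt_is_primary_sibling": False,  # warehouse entities become the primary siblings
        "drop_duplicate_sources": True,  # default; drop sources that shadow a model of the same name
    }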
@@ -509,7 +551,7 @@ class DBTNode:
     raw_code: Optional[str]

     dbt_adapter: str
-    dbt_name: str
+    dbt_name: str  # dbt unique identifier
     dbt_file_path: Optional[str]
     dbt_package_name: Optional[str]  # this is pretty much always present

@@ -823,7 +865,9 @@ def get_column_type(
 @platform_name("dbt")
 @config_class(DBTCommonConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(
     SourceCapability.LINEAGE_FINE,
@@ -973,6 +1017,8 @@ class DBTSourceBase(StatefulIngestionSourceBase):
         self._infer_schemas_and_update_cll(all_nodes_map)

         nodes = self._filter_nodes(all_nodes)
+        nodes = self._drop_duplicate_sources(nodes)
+
         non_test_nodes = [
             dataset_node for dataset_node in nodes if dataset_node.node_type != "test"
         ]
@@ -994,15 +1040,53 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             all_nodes_map,
         )

-    def _is_allowed_node(self, key: str) -> bool:
-        return self.config.node_name_pattern.allowed(key)
+    def _is_allowed_node(self, node: DBTNode) -> bool:
+        """
+        Check whether a node should be processed, using multi-layer rules. Checks for materialized nodes might need to be restricted in the future to some cases
+        """
+        if not self.config.node_name_pattern.allowed(node.dbt_name):
+            return False
+
+        if not self._is_allowed_materialized_node(node):
+            return False
+
+        return True
+
+    def _is_allowed_materialized_node(self, node: DBTNode) -> bool:
+        """Filter nodes based on their materialized database location for catalog consistency"""
+
+        # Database level filtering
+        if not node.database:
+            return True
+        if not self.config.materialized_node_pattern.database_pattern.allowed(
+            node.database
+        ):
+            return False
+
+        # Schema level filtering: {database}.{schema}
+        if not node.schema:
+            return True
+        if not self.config.materialized_node_pattern.schema_pattern.allowed(
+            node._join_parts([node.database, node.schema])
+        ):
+            return False
+
+        # Table level filtering: {database}.{schema}.{table}
+        if not node.name:
+            return True
+        if not self.config.materialized_node_pattern.table_pattern.allowed(
+            node.get_db_fqn()
+        ):
+            return False
+
+        return True

     def _filter_nodes(self, all_nodes: List[DBTNode]) -> List[DBTNode]:
-        nodes = []
+        nodes: List[DBTNode] = []
         for node in all_nodes:
             key = node.dbt_name

-            if not self._is_allowed_node(key):
+            if not self._is_allowed_node(node):
                 self.report.nodes_filtered.append(key)
                 continue

@@ -1010,6 +1094,62 @@ class DBTSourceBase(StatefulIngestionSourceBase):

         return nodes

+    def _drop_duplicate_sources(self, original_nodes: List[DBTNode]) -> List[DBTNode]:
+        """Detect and correct cases where a model and source have the same name.
+
+        In these cases, we don't want to generate both because they'll have the same
+        urn and hence overwrite each other. Instead, we drop the source and update
+        references to it to point at the model.
+
+        The risk here is that the source might have documentation that'd be lost,
+        which is why we maintain optionality with a config flag.
+        """
+        if not self.config.drop_duplicate_sources:
+            return original_nodes
+
+        self.report.duplicate_sources_dropped = 0
+        self.report.duplicate_sources_references_updated = 0
+
+        # Pass 1 - find all model names in the warehouse.
+        warehouse_model_names: Dict[str, str] = {}  # warehouse name -> model unique id
+        for node in original_nodes:
+            if node.node_type == "model" and node.exists_in_target_platform:
+                warehouse_model_names[node.get_db_fqn()] = node.dbt_name
+
+        # Pass 2 - identify + drop duplicate sources.
+        source_references_to_update: Dict[
+            str, str
+        ] = {}  # source unique id -> model unique id
+        nodes: List[DBTNode] = []
+        for node in original_nodes:
+            if (
+                node.node_type == "source"
+                and node.exists_in_target_platform
+                and (model_name := warehouse_model_names.get(node.get_db_fqn()))
+            ):
+                self.report.warning(
+                    title="Duplicate model and source names detected",
+                    message="We found a dbt model and dbt source with the same name. To ensure reliable lineage generation, the source node was ignored. "
+                    "If you associated documentation/tags/other metadata with the source, it will be lost. "
+                    "To avoid this, you should remove the source node from your dbt project and replace any `source(<source_name>)` calls with `ref(<model_name>)`.",
+                    context=f"{node.dbt_name} (called {node.get_db_fqn()} in {self.config.target_platform}) duplicates {model_name}",
+                )
+                self.report.duplicate_sources_dropped += 1
+                source_references_to_update[node.dbt_name] = model_name
+            else:
+                nodes.append(node)
+
+        # Pass 3 - update references to the dropped sources.
+        for node in nodes:
+            for i, current_upstream in enumerate(node.upstream_nodes):
+                if current_upstream in source_references_to_update:
+                    node.upstream_nodes[i] = source_references_to_update[
+                        current_upstream
+                    ]
+                    self.report.duplicate_sources_references_updated += 1
+
+        return nodes
+
     @staticmethod
     def _to_schema_info(schema_fields: List[SchemaField]) -> SchemaInfo:
         return {column.fieldPath: column.nativeDataType for column in schema_fields}
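
Pass 3 above rewrites upstream references from the dropped source to the surviving model. A toy walkthrough with made-up node ids:

    # Illustrative data only; node ids here do not correspond to any real dbt project.
    source_references_to_update = {"source.proj.raw_orders": "model.proj.raw_orders"}
    upstream_nodes = ["source.proj.raw_orders", "model.proj.customers"]
    upstream_nodes = [source_references_to_update.get(u, u) for u in upstream_nodes]
    print(upstream_nodes)  # ['model.proj.raw_orders', 'model.proj.customers']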
@@ -1038,8 +1178,8 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             cll_nodes.add(dbt_name)
             schema_nodes.add(dbt_name)

-        for dbt_name in all_nodes_map:
-            if self._is_allowed_node(dbt_name):
+        for dbt_name, dbt_node in all_nodes_map.items():
+            if self._is_allowed_node(dbt_node):
                 add_node_to_cll_list(dbt_name)

         return schema_nodes, cll_nodes
@@ -1334,6 +1474,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             self.config.tag_prefix,
             "SOURCE_CONTROL",
             self.config.strip_user_ids_from_email,
+            match_nested_props=True,
         )

         action_processor_tag = OperationProcessor(
@@ -1405,6 +1546,23 @@ class DBTSourceBase(StatefulIngestionSourceBase):
         dataset_snapshot = DatasetSnapshot(
             urn=node_datahub_urn, aspects=list(snapshot_aspects)
         )
+        # Emit sibling aspect for dbt entity (dbt is authoritative source for sibling relationships)
+        if self._should_create_sibling_relationships(node):
+            # Get the target platform URN
+            target_platform_urn = node.get_urn(
+                self.config.target_platform,
+                self.config.env,
+                self.config.target_platform_instance,
+            )
+
+            yield MetadataChangeProposalWrapper(
+                entityUrn=node_datahub_urn,
+                aspect=SiblingsClass(
+                    siblings=[target_platform_urn],
+                    primary=self.config.dbt_is_primary_sibling,
+                ),
+            ).as_workunit()
+
         mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
         if self.config.write_semantics == "PATCH":
             mce = self.get_patched_mce(mce)
@@ -1508,6 +1666,31 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             if not node.exists_in_target_platform:
                 continue

+            # Emit sibling patch for target platform entity BEFORE any other aspects.
+            # This ensures the hook can detect explicit primary settings when processing later aspects.
+            if self._should_create_sibling_relationships(node):
+                # Get the dbt platform URN
+                dbt_platform_urn = node.get_urn(
+                    DBT_PLATFORM,
+                    self.config.env,
+                    self.config.platform_instance,
+                )
+
+                # Create patch for target platform entity (make it primary when dbt_is_primary_sibling=False)
+                target_patch = DatasetPatchBuilder(node_datahub_urn)
+                target_patch.add_sibling(
+                    dbt_platform_urn, primary=not self.config.dbt_is_primary_sibling
+                )
+
+                yield from auto_workunit(
+                    MetadataWorkUnit(
+                        id=MetadataWorkUnit.generate_workunit_id(mcp),
+                        mcp_raw=mcp,
+                        is_primary_source=False,  # Not authoritative over warehouse metadata
+                    )
+                    for mcp in target_patch.build()
+                )
+
             # This code block is run when we are generating entities of platform type.
             # We will not link the platform not to the dbt node for type "source" because
             # in this case the platform table existed first.
@@ -1614,6 +1797,12 @@ class DBTSourceBase(StatefulIngestionSourceBase):
     def get_external_url(self, node: DBTNode) -> Optional[str]:
         pass

+    @staticmethod
+    def _truncate_code(code: str, max_length: int) -> str:
+        if len(code) > max_length:
+            return code[:max_length] + "..."
+        return code
+
     def _create_view_properties_aspect(
         self, node: DBTNode
     ) -> Optional[ViewPropertiesClass]:
@@ -1625,6 +1814,9 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             compiled_code = try_format_query(
                 node.compiled_code, platform=self.config.target_platform
            )
+            compiled_code = self._truncate_code(
+                compiled_code, _DBT_MAX_COMPILED_CODE_LENGTH
+            )

         materialized = node.materialization in {"table", "incremental", "snapshot"}
         view_properties = ViewPropertiesClass(
@@ -1705,6 +1897,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             self.config.tag_prefix,
             "SOURCE_CONTROL",
             self.config.strip_user_ids_from_email,
+            match_nested_props=True,
         )

         canonical_schema: List[SchemaField] = []
@@ -2053,5 +2246,27 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             term_id_set.add(existing_term.urn)
         return [GlossaryTermAssociation(term_urn) for term_urn in sorted(term_id_set)]

+    def _should_create_sibling_relationships(self, node: DBTNode) -> bool:
+        """
+        Determines whether to emit sibling relationships for a dbt node.
+
+        Sibling relationships (both dbt entity's aspect and target entity's patch) are only
+        emitted when dbt_is_primary_sibling=False to establish explicit primary/secondary
+        relationships. When dbt_is_primary_sibling=True,
+        the SiblingAssociationHook handles sibling creation automatically.
+
+        Args:
+            node: The dbt node to evaluate
+
+        Returns:
+            True if sibling patches should be emitted for this node
+        """
+        # Only create siblings for entities that exist in target platform
+        if not node.exists_in_target_platform:
+            return False
+
+        # Only emit patches when explicit primary/secondary control is needed
+        return self.config.dbt_is_primary_sibling is False
+
     def get_report(self):
         return self.report
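
The target-platform sibling patch above is built with the DatasetPatchBuilder.add_sibling helper (added via datahub/specific/aspect_helpers/siblings.py in the file list). A hedged, isolated sketch of that call with made-up URNs; the printed attributes assume the builder yields standard MetadataChangeProposal objects:

    # Sketch only: marks the warehouse entity as the primary sibling of the dbt entity,
    # which is what the source emits when dbt_is_primary_sibling=False.
    from datahub.specific.dataset import DatasetPatchBuilder

    warehouse_urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders,PROD)"
    dbt_urn = "urn:li:dataset:(urn:li:dataPlatform:dbt,db.schema.orders,PROD)"

    patch = DatasetPatchBuilder(warehouse_urn)
    patch.add_sibling(dbt_urn, primary=True)  # primary = not dbt_is_primary_sibling
    for mcp in patch.build():
        print(mcp.entityUrn, mcp.aspectName)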

datahub/ingestion/source/dbt/dbt_core.py

@@ -15,7 +15,9 @@ from datahub.configuration.git import GitReference
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -464,6 +466,7 @@ def load_run_results(
 @platform_name("dbt")
 @config_class(DBTCoreConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class DBTCoreSource(DBTSourceBase, TestableSource):
     config: DBTCoreConfig
     report: DBTCoreReport