acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -4,7 +4,7 @@ from abc import abstractmethod
4
4
  from dataclasses import dataclass, field
5
5
  from datetime import datetime
6
6
  from enum import auto
7
- from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
7
+ from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
8
8
 
9
9
  import more_itertools
10
10
  import pydantic
@@ -91,6 +91,7 @@ from datahub.metadata.schema_classes import (
91
91
  OwnershipClass,
92
92
  OwnershipSourceTypeClass,
93
93
  OwnershipTypeClass,
94
+ SiblingsClass,
94
95
  StatusClass,
95
96
  SubTypesClass,
96
97
  TagAssociationClass,
@@ -98,6 +99,7 @@ from datahub.metadata.schema_classes import (
98
99
  ViewPropertiesClass,
99
100
  )
100
101
  from datahub.metadata.urns import DatasetUrn
102
+ from datahub.specific.dataset import DatasetPatchBuilder
101
103
  from datahub.sql_parsing.schema_resolver import SchemaInfo, SchemaResolver
102
104
  from datahub.sql_parsing.sqlglot_lineage import (
103
105
  SqlParsingDebugInfo,
@@ -120,17 +122,25 @@ logger = logging.getLogger(__name__)
120
122
  DBT_PLATFORM = "dbt"
121
123
 
122
124
  _DEFAULT_ACTOR = mce_builder.make_user_urn("unknown")
125
+ _DBT_MAX_COMPILED_CODE_LENGTH = 1 * 1024 * 1024 # 1MB
123
126
 
124
127
 
125
128
  @dataclass
126
129
  class DBTSourceReport(StaleEntityRemovalSourceReport):
127
130
  sql_parser_skipped_missing_code: LossyList[str] = field(default_factory=LossyList)
131
+ sql_parser_skipped_non_sql_model: LossyList[str] = field(default_factory=LossyList)
128
132
  sql_parser_parse_failures: int = 0
129
133
  sql_parser_detach_ctes_failures: int = 0
130
134
  sql_parser_table_errors: int = 0
131
135
  sql_parser_column_errors: int = 0
132
136
  sql_parser_successes: int = 0
133
137
 
138
+ # Details on where column info comes from.
139
+ nodes_with_catalog_columns: int = 0
140
+ nodes_with_inferred_columns: int = 0
141
+ nodes_with_graph_columns: int = 0
142
+ nodes_with_no_columns: int = 0
143
+
134
144
  sql_parser_parse_failures_list: LossyList[str] = field(default_factory=LossyList)
135
145
  sql_parser_detach_ctes_failures_list: LossyList[str] = field(
136
146
  default_factory=LossyList
@@ -138,6 +148,9 @@ class DBTSourceReport(StaleEntityRemovalSourceReport):
138
148
 
139
149
  nodes_filtered: LossyList[str] = field(default_factory=LossyList)
140
150
 
151
+ duplicate_sources_dropped: Optional[int] = None
152
+ duplicate_sources_references_updated: Optional[int] = None
153
+
141
154
 
142
155
  class EmitDirective(ConfigEnum):
143
156
  """A holder for directives for emission for specific types of entities"""
@@ -233,6 +246,23 @@ class DBTEntitiesEnabled(ConfigModel):
233
246
  return self.model_performance == EmitDirective.YES
234
247
 
235
248
 
249
+ class MaterializedNodePatternConfig(ConfigModel):
250
+ """Configuration for filtering materialized nodes based on their physical location"""
251
+
252
+ database_pattern: AllowDenyPattern = Field(
253
+ default=AllowDenyPattern.allow_all(),
254
+ description="Regex patterns for database names to filter materialized nodes.",
255
+ )
256
+ schema_pattern: AllowDenyPattern = Field(
257
+ default=AllowDenyPattern.allow_all(),
258
+ description="Regex patterns for schema names in format '{database}.{schema}' to filter materialized nodes.",
259
+ )
260
+ table_pattern: AllowDenyPattern = Field(
261
+ default=AllowDenyPattern.allow_all(),
262
+ description="Regex patterns for table/view names in format '{database}.{schema}.{table}' to filter materialized nodes.",
263
+ )
264
+
265
+
236
266
  class DBTCommonConfig(
237
267
  StatefulIngestionConfigBase,
238
268
  PlatformInstanceConfigMixin,
@@ -281,6 +311,11 @@ class DBTCommonConfig(
281
311
  default=AllowDenyPattern.allow_all(),
282
312
  description="regex patterns for dbt model names to filter in ingestion.",
283
313
  )
314
+ materialized_node_pattern: MaterializedNodePatternConfig = Field(
315
+ default=MaterializedNodePatternConfig(),
316
+ description="Advanced filtering for materialized nodes based on their physical database location. "
317
+ "Provides fine-grained control over database.schema.table patterns for catalog consistency.",
318
+ )
284
319
  meta_mapping: Dict = Field(
285
320
  default={},
286
321
  description="mapping rules that will be executed against dbt meta properties. Refer to the section below on dbt meta automated mappings.",
@@ -348,7 +383,7 @@ class DBTCommonConfig(
348
383
  # override default value to True.
349
384
  incremental_lineage: bool = Field(
350
385
  default=True,
351
- description="When enabled, emits incremental/patch lineage for non-dbt entities. When disabled, re-states lineage on each run.",
386
+ description="When enabled, emits incremental/patch lineage for non-dbt entities. When disabled, re-states lineage on each run. This would also require enabling 'incremental_lineage' in the counterpart warehouse ingestion (_e.g._ BigQuery, Redshift, etc).",
352
387
  )
353
388
 
354
389
  _remove_use_compiled_code = pydantic_removed_field("use_compiled_code")
@@ -363,6 +398,20 @@ class DBTCommonConfig(
363
398
  "Set to False to skip it for engines like AWS Athena where it's not required.",
364
399
  )
365
400
 
401
+ dbt_is_primary_sibling: bool = Field(
402
+ default=True,
403
+ description="Experimental: Controls sibling relationship primary designation between dbt entities and target platform entities. "
404
+ "When True (default), dbt entities are primary and target platform entities are secondary. "
405
+ "When False, target platform entities are primary and dbt entities are secondary. "
406
+ "Uses aspect patches for precise control. Requires DataHub server 1.3.0+.",
407
+ )
408
+
409
+ drop_duplicate_sources: bool = Field(
410
+ default=True,
411
+ description="When enabled, drops sources that have the same name in the target platform as a model. "
412
+ "This ensures that lineage is generated reliably, but will lose any documentation associated only with the source.",
413
+ )
414
+
366
415
  @validator("target_platform")
367
416
  def validate_target_platform_value(cls, target_platform: str) -> str:
368
417
  if target_platform.lower() == DBT_PLATFORM:
@@ -502,7 +551,7 @@ class DBTNode:
502
551
  raw_code: Optional[str]
503
552
 
504
553
  dbt_adapter: str
505
- dbt_name: str
554
+ dbt_name: str # dbt unique identifier
506
555
  dbt_file_path: Optional[str]
507
556
  dbt_package_name: Optional[str] # this is pretty much always present
508
557
 
@@ -618,14 +667,8 @@ class DBTNode:
618
667
  def exists_in_target_platform(self):
619
668
  return not (self.is_ephemeral_model() or self.node_type == "test")
620
669
 
621
- def columns_setdefault(self, schema_fields: List[SchemaField]) -> None:
622
- """
623
- Update the column list if they are not already set.
624
- """
625
-
626
- if self.columns:
627
- # If we already have columns, don't overwrite them.
628
- return
670
+ def set_columns(self, schema_fields: List[SchemaField]) -> None:
671
+ """Update the column list."""
629
672
 
630
673
  self.columns = [
631
674
  DBTColumn(
@@ -822,18 +865,22 @@ def get_column_type(
822
865
  @platform_name("dbt")
823
866
  @config_class(DBTCommonConfig)
824
867
  @support_status(SupportStatus.CERTIFIED)
825
- @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
868
+ @capability(
869
+ SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
870
+ )
826
871
  @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
827
872
  @capability(
828
873
  SourceCapability.LINEAGE_FINE,
829
874
  "Enabled by default, configure using `include_column_lineage`",
830
875
  )
831
876
  class DBTSourceBase(StatefulIngestionSourceBase):
832
- def __init__(self, config: DBTCommonConfig, ctx: PipelineContext, platform: str):
877
+ def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
833
878
  super().__init__(config, ctx)
879
+ self.platform: str = "dbt"
880
+
834
881
  self.config = config
835
- self.platform: str = platform
836
882
  self.report: DBTSourceReport = DBTSourceReport()
883
+
837
884
  self.compiled_owner_extraction_pattern: Optional[Any] = None
838
885
  if self.config.owner_extraction_pattern:
839
886
  self.compiled_owner_extraction_pattern = re.compile(
@@ -849,7 +896,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
849
896
  test_nodes: List[DBTNode],
850
897
  extra_custom_props: Dict[str, str],
851
898
  all_nodes_map: Dict[str, DBTNode],
852
- ) -> Iterable[MetadataWorkUnit]:
899
+ ) -> Iterable[MetadataChangeProposalWrapper]:
853
900
  for node in sorted(test_nodes, key=lambda n: n.dbt_name):
854
901
  upstreams = get_upstreams_for_test(
855
902
  test_node=node,
@@ -902,7 +949,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
902
949
  yield MetadataChangeProposalWrapper(
903
950
  entityUrn=assertion_urn,
904
951
  aspect=self._make_data_platform_instance_aspect(),
905
- ).as_workunit()
952
+ )
906
953
 
907
954
  yield make_assertion_from_test(
908
955
  custom_props,
@@ -949,7 +996,9 @@ class DBTSourceBase(StatefulIngestionSourceBase):
949
996
  ),
950
997
  )
951
998
 
952
- def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
999
+ def get_workunits_internal(
1000
+ self,
1001
+ ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
953
1002
  if self.config.write_semantics == "PATCH":
954
1003
  self.ctx.require_graph("Using dbt with write_semantics=PATCH")
955
1004
 
@@ -968,6 +1017,8 @@ class DBTSourceBase(StatefulIngestionSourceBase):
968
1017
  self._infer_schemas_and_update_cll(all_nodes_map)
969
1018
 
970
1019
  nodes = self._filter_nodes(all_nodes)
1020
+ nodes = self._drop_duplicate_sources(nodes)
1021
+
971
1022
  non_test_nodes = [
972
1023
  dataset_node for dataset_node in nodes if dataset_node.node_type != "test"
973
1024
  ]
@@ -989,15 +1040,53 @@ class DBTSourceBase(StatefulIngestionSourceBase):
989
1040
  all_nodes_map,
990
1041
  )
991
1042
 
992
- def _is_allowed_node(self, key: str) -> bool:
993
- return self.config.node_name_pattern.allowed(key)
1043
+ def _is_allowed_node(self, node: DBTNode) -> bool:
1044
+ """
1045
+ Check whether a node should be processed, using multi-layer rules. The checks for materialized nodes may need to be restricted to certain cases in the future.
1046
+ """
1047
+ if not self.config.node_name_pattern.allowed(node.dbt_name):
1048
+ return False
1049
+
1050
+ if not self._is_allowed_materialized_node(node):
1051
+ return False
1052
+
1053
+ return True
1054
+
1055
+ def _is_allowed_materialized_node(self, node: DBTNode) -> bool:
1056
+ """Filter nodes based on their materialized database location for catalog consistency"""
1057
+
1058
+ # Database level filtering
1059
+ if not node.database:
1060
+ return True
1061
+ if not self.config.materialized_node_pattern.database_pattern.allowed(
1062
+ node.database
1063
+ ):
1064
+ return False
1065
+
1066
+ # Schema level filtering: {database}.{schema}
1067
+ if not node.schema:
1068
+ return True
1069
+ if not self.config.materialized_node_pattern.schema_pattern.allowed(
1070
+ node._join_parts([node.database, node.schema])
1071
+ ):
1072
+ return False
1073
+
1074
+ # Table level filtering: {database}.{schema}.{table}
1075
+ if not node.name:
1076
+ return True
1077
+ if not self.config.materialized_node_pattern.table_pattern.allowed(
1078
+ node.get_db_fqn()
1079
+ ):
1080
+ return False
1081
+
1082
+ return True
994
1083
 
995
1084
  def _filter_nodes(self, all_nodes: List[DBTNode]) -> List[DBTNode]:
996
- nodes = []
1085
+ nodes: List[DBTNode] = []
997
1086
  for node in all_nodes:
998
1087
  key = node.dbt_name
999
1088
 
1000
- if not self._is_allowed_node(key):
1089
+ if not self._is_allowed_node(node):
1001
1090
  self.report.nodes_filtered.append(key)
1002
1091
  continue
1003
1092
 
@@ -1005,6 +1094,62 @@ class DBTSourceBase(StatefulIngestionSourceBase):
1005
1094
 
1006
1095
  return nodes
1007
1096
 
1097
+ def _drop_duplicate_sources(self, original_nodes: List[DBTNode]) -> List[DBTNode]:
1098
+ """Detect and correct cases where a model and source have the same name.
1099
+
1100
+ In these cases, we don't want to generate both because they'll have the same
1101
+ urn and hence overwrite each other. Instead, we drop the source and update
1102
+ references to it to point at the model.
1103
+
1104
+ The risk here is that the source might have documentation that'd be lost,
1105
+ which is why we maintain optionality with a config flag.
1106
+ """
1107
+ if not self.config.drop_duplicate_sources:
1108
+ return original_nodes
1109
+
1110
+ self.report.duplicate_sources_dropped = 0
1111
+ self.report.duplicate_sources_references_updated = 0
1112
+
1113
+ # Pass 1 - find all model names in the warehouse.
1114
+ warehouse_model_names: Dict[str, str] = {} # warehouse name -> model unique id
1115
+ for node in original_nodes:
1116
+ if node.node_type == "model" and node.exists_in_target_platform:
1117
+ warehouse_model_names[node.get_db_fqn()] = node.dbt_name
1118
+
1119
+ # Pass 2 - identify + drop duplicate sources.
1120
+ source_references_to_update: Dict[
1121
+ str, str
1122
+ ] = {} # source unique id -> model unique id
1123
+ nodes: List[DBTNode] = []
1124
+ for node in original_nodes:
1125
+ if (
1126
+ node.node_type == "source"
1127
+ and node.exists_in_target_platform
1128
+ and (model_name := warehouse_model_names.get(node.get_db_fqn()))
1129
+ ):
1130
+ self.report.warning(
1131
+ title="Duplicate model and source names detected",
1132
+ message="We found a dbt model and dbt source with the same name. To ensure reliable lineage generation, the source node was ignored. "
1133
+ "If you associated documentation/tags/other metadata with the source, it will be lost. "
1134
+ "To avoid this, you should remove the source node from your dbt project and replace any `source(<source_name>)` calls with `ref(<model_name>)`.",
1135
+ context=f"{node.dbt_name} (called {node.get_db_fqn()} in {self.config.target_platform}) duplicates {model_name}",
1136
+ )
1137
+ self.report.duplicate_sources_dropped += 1
1138
+ source_references_to_update[node.dbt_name] = model_name
1139
+ else:
1140
+ nodes.append(node)
1141
+
1142
+ # Pass 3 - update references to the dropped sources.
1143
+ for node in nodes:
1144
+ for i, current_upstream in enumerate(node.upstream_nodes):
1145
+ if current_upstream in source_references_to_update:
1146
+ node.upstream_nodes[i] = source_references_to_update[
1147
+ current_upstream
1148
+ ]
1149
+ self.report.duplicate_sources_references_updated += 1
1150
+
1151
+ return nodes
1152
+
1008
1153
  @staticmethod
1009
1154
  def _to_schema_info(schema_fields: List[SchemaField]) -> SchemaInfo:
1010
1155
  return {column.fieldPath: column.nativeDataType for column in schema_fields}
@@ -1033,8 +1178,8 @@ class DBTSourceBase(StatefulIngestionSourceBase):
1033
1178
  cll_nodes.add(dbt_name)
1034
1179
  schema_nodes.add(dbt_name)
1035
1180
 
1036
- for dbt_name in all_nodes_map:
1037
- if self._is_allowed_node(dbt_name):
1181
+ for dbt_name, dbt_node in all_nodes_map.items():
1182
+ if self._is_allowed_node(dbt_node):
1038
1183
  add_node_to_cll_list(dbt_name)
1039
1184
 
1040
1185
  return schema_nodes, cll_nodes
@@ -1175,6 +1320,11 @@ class DBTSourceBase(StatefulIngestionSourceBase):
1175
1320
  logger.debug(
1176
1321
  f"Not generating CLL for {node.dbt_name} because we don't need it."
1177
1322
  )
1323
+ elif node.language != "sql":
1324
+ logger.debug(
1325
+ f"Not generating CLL for {node.dbt_name} because it is not a SQL model."
1326
+ )
1327
+ self.report.sql_parser_skipped_non_sql_model.append(node.dbt_name)
1178
1328
  elif node.compiled_code:
1179
1329
  # Add CTE stops based on the upstreams list.
1180
1330
  cte_mapping = {
@@ -1238,9 +1388,28 @@ class DBTSourceBase(StatefulIngestionSourceBase):
1238
1388
  target_node_urn, self._to_schema_info(inferred_schema_fields)
1239
1389
  )
1240
1390
 
1241
- # Save the inferred schema fields into the dbt node.
1242
- if inferred_schema_fields:
1243
- node.columns_setdefault(inferred_schema_fields)
1391
+ # When updating the node's columns, our order of preference is:
1392
+ # 1. Schema from the dbt catalog
1393
+ # 2. Inferred schema
1394
+ # 3. Schema fetched from the graph
1395
+ if node.columns:
1396
+ self.report.nodes_with_catalog_columns += 1
1397
+ pass # we already have columns from the dbt catalog
1398
+ elif inferred_schema_fields:
1399
+ logger.debug(
1400
+ f"Using {len(inferred_schema_fields)} inferred columns for {node.dbt_name}"
1401
+ )
1402
+ self.report.nodes_with_inferred_columns += 1
1403
+ node.set_columns(inferred_schema_fields)
1404
+ elif schema_fields:
1405
+ logger.debug(
1406
+ f"Using {len(schema_fields)} graph columns for {node.dbt_name}"
1407
+ )
1408
+ self.report.nodes_with_graph_columns += 1
1409
+ node.set_columns(schema_fields)
1410
+ else:
1411
+ logger.debug(f"No columns found for {node.dbt_name}")
1412
+ self.report.nodes_with_no_columns += 1
1244
1413
 
1245
1414
  def _parse_cll(
1246
1415
  self,
@@ -1305,6 +1474,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
1305
1474
  self.config.tag_prefix,
1306
1475
  "SOURCE_CONTROL",
1307
1476
  self.config.strip_user_ids_from_email,
1477
+ match_nested_props=True,
1308
1478
  )
1309
1479
 
1310
1480
  action_processor_tag = OperationProcessor(
@@ -1376,6 +1546,23 @@ class DBTSourceBase(StatefulIngestionSourceBase):
1376
1546
  dataset_snapshot = DatasetSnapshot(
1377
1547
  urn=node_datahub_urn, aspects=list(snapshot_aspects)
1378
1548
  )
1549
+ # Emit sibling aspect for dbt entity (dbt is authoritative source for sibling relationships)
1550
+ if self._should_create_sibling_relationships(node):
1551
+ # Get the target platform URN
1552
+ target_platform_urn = node.get_urn(
1553
+ self.config.target_platform,
1554
+ self.config.env,
1555
+ self.config.target_platform_instance,
1556
+ )
1557
+
1558
+ yield MetadataChangeProposalWrapper(
1559
+ entityUrn=node_datahub_urn,
1560
+ aspect=SiblingsClass(
1561
+ siblings=[target_platform_urn],
1562
+ primary=self.config.dbt_is_primary_sibling,
1563
+ ),
1564
+ ).as_workunit()
1565
+
1379
1566
  mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
1380
1567
  if self.config.write_semantics == "PATCH":
1381
1568
  mce = self.get_patched_mce(mce)
@@ -1479,6 +1666,31 @@ class DBTSourceBase(StatefulIngestionSourceBase):
1479
1666
  if not node.exists_in_target_platform:
1480
1667
  continue
1481
1668
 
1669
+ # Emit sibling patch for target platform entity BEFORE any other aspects.
1670
+ # This ensures the hook can detect explicit primary settings when processing later aspects.
1671
+ if self._should_create_sibling_relationships(node):
1672
+ # Get the dbt platform URN
1673
+ dbt_platform_urn = node.get_urn(
1674
+ DBT_PLATFORM,
1675
+ self.config.env,
1676
+ self.config.platform_instance,
1677
+ )
1678
+
1679
+ # Create patch for target platform entity (make it primary when dbt_is_primary_sibling=False)
1680
+ target_patch = DatasetPatchBuilder(node_datahub_urn)
1681
+ target_patch.add_sibling(
1682
+ dbt_platform_urn, primary=not self.config.dbt_is_primary_sibling
1683
+ )
1684
+
1685
+ yield from auto_workunit(
1686
+ MetadataWorkUnit(
1687
+ id=MetadataWorkUnit.generate_workunit_id(mcp),
1688
+ mcp_raw=mcp,
1689
+ is_primary_source=False, # Not authoritative over warehouse metadata
1690
+ )
1691
+ for mcp in target_patch.build()
1692
+ )
1693
+
1482
1694
  # This code block is run when we are generating entities of platform type.
1483
1695
  # We will not link the platform node to the dbt node for type "source" because
1484
1696
  # in this case the platform table existed first.
@@ -1585,6 +1797,12 @@ class DBTSourceBase(StatefulIngestionSourceBase):
1585
1797
  def get_external_url(self, node: DBTNode) -> Optional[str]:
1586
1798
  pass
1587
1799
 
1800
+ @staticmethod
1801
+ def _truncate_code(code: str, max_length: int) -> str:
1802
+ if len(code) > max_length:
1803
+ return code[:max_length] + "..."
1804
+ return code
1805
+
1588
1806
  def _create_view_properties_aspect(
1589
1807
  self, node: DBTNode
1590
1808
  ) -> Optional[ViewPropertiesClass]:
@@ -1596,6 +1814,9 @@ class DBTSourceBase(StatefulIngestionSourceBase):
1596
1814
  compiled_code = try_format_query(
1597
1815
  node.compiled_code, platform=self.config.target_platform
1598
1816
  )
1817
+ compiled_code = self._truncate_code(
1818
+ compiled_code, _DBT_MAX_COMPILED_CODE_LENGTH
1819
+ )
1599
1820
 
1600
1821
  materialized = node.materialization in {"table", "incremental", "snapshot"}
1601
1822
  view_properties = ViewPropertiesClass(
@@ -1676,6 +1897,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
1676
1897
  self.config.tag_prefix,
1677
1898
  "SOURCE_CONTROL",
1678
1899
  self.config.strip_user_ids_from_email,
1900
+ match_nested_props=True,
1679
1901
  )
1680
1902
 
1681
1903
  canonical_schema: List[SchemaField] = []
@@ -2024,5 +2246,27 @@ class DBTSourceBase(StatefulIngestionSourceBase):
2024
2246
  term_id_set.add(existing_term.urn)
2025
2247
  return [GlossaryTermAssociation(term_urn) for term_urn in sorted(term_id_set)]
2026
2248
 
2249
+ def _should_create_sibling_relationships(self, node: DBTNode) -> bool:
2250
+ """
2251
+ Determines whether to emit sibling relationships for a dbt node.
2252
+
2253
+ Sibling relationships (both dbt entity's aspect and target entity's patch) are only
2254
+ emitted when dbt_is_primary_sibling=False to establish explicit primary/secondary
2255
+ relationships. When dbt_is_primary_sibling=True,
2256
+ the SiblingAssociationHook handles sibling creation automatically.
2257
+
2258
+ Args:
2259
+ node: The dbt node to evaluate
2260
+
2261
+ Returns:
2262
+ True if sibling patches should be emitted for this node
2263
+ """
2264
+ # Only create siblings for entities that exist in target platform
2265
+ if not node.exists_in_target_platform:
2266
+ return False
2267
+
2268
+ # Only emit patches when explicit primary/secondary control is needed
2269
+ return self.config.dbt_is_primary_sibling is False
2270
+
2027
2271
  def get_report(self):
2028
2272
  return self.report