acryl-datahub: 1.0.0rc18 (py3-none-any.whl) → 1.3.0.1rc9 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,10 @@
1
- import contextlib
2
1
  import json
3
2
  import logging
3
+ import time
4
4
  from datetime import datetime
5
5
  from typing import Any, Dict, Generic, Iterable, List, Optional, Tuple, TypeVar
6
6
 
7
- from sqlalchemy import create_engine
7
+ from sqlalchemy import create_engine, text
8
8
 
9
9
  from datahub.emitter.aspect import ASPECT_MAP
10
10
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -12,13 +12,14 @@ from datahub.emitter.serialization_helper import post_json_transform
12
12
  from datahub.ingestion.source.datahub.config import DataHubSourceConfig
13
13
  from datahub.ingestion.source.datahub.report import DataHubSourceReport
14
14
  from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
15
- from datahub.metadata.schema_classes import ChangeTypeClass, SystemMetadataClass
15
+ from datahub.metadata.schema_classes import SystemMetadataClass
16
16
  from datahub.utilities.lossy_collections import LossyDict, LossyList
17
17
 
18
18
  logger = logging.getLogger(__name__)
19
19
 
20
20
  # Should work for at least mysql, mariadb, postgres
21
21
  DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
22
+ DATE_FORMAT = "%Y-%m-%d"
22
23
 
23
24
  ROW = TypeVar("ROW", bound=Dict[str, Any])
24
25
 
@@ -85,6 +86,9 @@ class DataHubDatabaseReader:
85
86
  **connection_config.options,
86
87
  )
87
88
 
89
+ # Cache for available dates to avoid redundant queries
90
+ self.available_dates_cache: Optional[List[datetime]] = None
91
+
88
92
  @property
89
93
  def soft_deleted_urns_query(self) -> str:
90
94
  return f"""
@@ -100,14 +104,28 @@ class DataHubDatabaseReader:
100
104
  ORDER BY mav.urn
101
105
  """
102
106
 
103
- @property
104
- def query(self) -> str:
105
- # May repeat rows for the same date
106
- # Offset is generally 0, unless we repeat the same createdon twice
107
+ def _get_json_extract_expression(self) -> str:
108
+ """
109
+ Returns the appropriate JSON extraction expression based on the database dialect.
110
+
111
+ Returns:
112
+ Database-specific JSON extraction expression
113
+ """
114
+ # Return the correct JSON extraction expression for the "removed" field,
115
+ # depending on the database dialect.
116
+ if self.engine.dialect.name == "postgresql":
117
+ # For PostgreSQL, cast the metadata column to JSON and extract the 'removed' key as boolean.
118
+ return "((metadata::json)->>'removed')::boolean"
119
+ else:
120
+ # For other databases (e.g., MySQL), use JSON_EXTRACT.
121
+ return "JSON_EXTRACT(metadata, '$.removed')"
122
+
123
+ def query(self, set_structured_properties_filter: bool) -> str:
124
+ """
125
+ Main query that gets data for specified date range with appropriate filters.
126
+ """
127
+ structured_prop_filter = f" AND urn {'' if set_structured_properties_filter else 'NOT'} like 'urn:li:structuredProperty:%%'"
107
128
 
108
- # Ensures stable order, chronological per (urn, aspect)
109
- # Relies on createdon order to reflect version order
110
- # Ordering of entries with the same createdon is handled by VersionOrderer
111
129
  return f"""
112
130
  SELECT *
113
131
  FROM (
@@ -123,7 +141,7 @@ class DataHubDatabaseReader:
123
141
  LEFT JOIN (
124
142
  SELECT
125
143
  *,
126
- JSON_EXTRACT(metadata, '$.removed') as removed
144
+ {self._get_json_extract_expression()} as removed
127
145
  FROM {self.engine.dialect.identifier_preparer.quote(self.config.database_table_name)}
128
146
  WHERE aspect = 'status'
129
147
  AND version = 0
@@ -132,6 +150,7 @@ class DataHubDatabaseReader:
132
150
  {"" if self.config.include_all_versions else "AND mav.version = 0"}
133
151
  {"" if not self.config.exclude_aspects else "AND mav.aspect NOT IN %(exclude_aspects)s"}
134
152
  AND mav.createdon >= %(since_createdon)s
153
+ AND mav.createdon < %(end_createdon)s
135
154
  ORDER BY
136
155
  createdon,
137
156
  urn,
@@ -139,50 +158,189 @@ class DataHubDatabaseReader:
139
158
  version
140
159
  ) as t
141
160
  WHERE 1=1
142
- {"" if self.config.include_soft_deleted_entities else "AND (removed = false or removed is NULL)"}
161
+ {"" if self.config.include_soft_deleted_entities else " AND (removed = false or removed is NULL)"}
162
+ {structured_prop_filter}
143
163
  ORDER BY
144
164
  createdon,
145
165
  urn,
146
166
  aspect,
147
167
  version
168
+ LIMIT %(limit)s
169
+ OFFSET %(offset)s
148
170
  """
149
171
 
172
+ def execute_with_params(
173
+ self, query: str, params: Dict[str, Any]
174
+ ) -> List[Dict[str, Any]]:
175
+ """Execute query with proper parameter binding that works with your database"""
176
+ with self.engine.connect() as conn:
177
+ result = conn.execute(query, params or {})
178
+ return [dict(row) for row in result.fetchall()]
179
+
150
180
  def execute_server_cursor(
151
181
  self, query: str, params: Dict[str, Any]
152
182
  ) -> Iterable[Dict[str, Any]]:
183
+ """Execute a query with server-side cursor"""
153
184
  with self.engine.connect() as conn:
154
185
  if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
155
186
  with (
156
187
  conn.begin()
157
188
  ): # Transaction required for PostgreSQL server-side cursor
158
- # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
159
- # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
189
+ # Set query timeout at the connection level
190
+ if self.config.query_timeout:
191
+ if self.engine.dialect.name == "postgresql":
192
+ conn.execute(
193
+ text(
194
+ f"SET statement_timeout = {self.config.query_timeout * 1000}"
195
+ )
196
+ ) # milliseconds
197
+ elif self.engine.dialect.name in ["mysql", "mariadb"]:
198
+ conn.execute(
199
+ text(
200
+ f"SET max_execution_time = {self.config.query_timeout * 1000}"
201
+ )
202
+ ) # milliseconds
203
+
204
+ # Stream results with batch size
160
205
  conn = conn.execution_options(
161
206
  stream_results=True,
162
207
  yield_per=self.config.database_query_batch_size,
163
208
  )
209
+
210
+ # Execute query - using native parameterization without text()
211
+ # to maintain compatibility with your original code
164
212
  result = conn.execute(query, params)
165
213
  for row in result:
166
214
  yield dict(row)
215
+
216
+ return # Success, exit the retry loop
167
217
  else:
168
218
  raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
169
219
 
170
220
  def _get_rows(
171
- self, from_createdon: datetime, stop_time: datetime
221
+ self,
222
+ start_date: datetime,
223
+ end_date: datetime,
224
+ set_structured_properties_filter: bool,
225
+ limit: int,
172
226
  ) -> Iterable[Dict[str, Any]]:
173
- params = {
174
- "exclude_aspects": list(self.config.exclude_aspects),
175
- "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
176
- }
177
- yield from self.execute_server_cursor(self.query, params)
227
+ """
228
+ Retrieves data rows within a specified date range using pagination.
178
229
 
179
- def get_aspects(
230
+ Implements a hybrid pagination strategy that switches between time-based and
231
+ offset-based approaches depending on the returned data. Uses server-side
232
+ cursors for efficient memory usage.
233
+
234
+ Note: May return duplicate rows across batch boundaries when multiple rows
235
+ share the same 'createdon' timestamp. This is expected behavior when
236
+ transitioning between pagination methods.
237
+
238
+ Args:
239
+ start_date: Beginning of date range (inclusive)
240
+ end_date: End of date range (exclusive)
241
+ set_structured_properties_filter: Whether to apply structured filtering
242
+ limit: Maximum rows to fetch per query
243
+
244
+ Returns:
245
+ An iterable of database rows as dictionaries
246
+ """
247
+ offset = 0
248
+ last_createdon = None
249
+ first_iteration = True
250
+
251
+ while True:
252
+ try:
253
+ # Set up query and parameters - using named parameters
254
+ query = self.query(set_structured_properties_filter)
255
+ params: Dict[str, Any] = {
256
+ "since_createdon": start_date.strftime(DATETIME_FORMAT),
257
+ "end_createdon": end_date.strftime(DATETIME_FORMAT),
258
+ "limit": limit,
259
+ "offset": offset,
260
+ # Always pass exclude_aspects as a tuple, postgres doesn't support lists
261
+ "exclude_aspects": tuple(self.config.exclude_aspects),
262
+ }
263
+
264
+ logger.info(
265
+ f"Querying data from {start_date.strftime(DATETIME_FORMAT)} to {end_date.strftime(DATETIME_FORMAT)} "
266
+ f"with limit {limit} and offset {offset} (inclusive range)"
267
+ )
268
+
269
+ # Execute query with server-side cursor
270
+ rows = self.execute_server_cursor(query, params)
271
+ # Process and yield rows
272
+ rows_processed = 0
273
+ for row in rows:
274
+ if first_iteration:
275
+ start_date = row.get("createdon", start_date)
276
+ first_iteration = False
277
+
278
+ last_createdon = row.get("createdon")
279
+ rows_processed += 1
280
+ yield row
281
+
282
+ # If we processed fewer than the limit or no last_createdon, we're done
283
+ if rows_processed < limit or not last_createdon:
284
+ break
285
+
286
+ # Update parameters for next iteration
287
+ if start_date != last_createdon:
288
+ start_date = last_createdon
289
+ offset = 0
290
+ else:
291
+ offset += limit
292
+
293
+ logger.info(
294
+ f"Processed {rows_processed} rows for date range {start_date} to {end_date}. Continuing to next batch."
295
+ )
296
+
297
+ except Exception as e:
298
+ logger.error(
299
+ f"Error processing date range {start_date} to {end_date}: {str(e)}"
300
+ )
301
+ # Re-raise the exception after logging
302
+ raise
303
+
304
+ def get_all_aspects(
180
305
  self, from_createdon: datetime, stop_time: datetime
306
+ ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
307
+ logger.info("Fetching Structured properties aspects")
308
+ yield from self.get_aspects(
309
+ from_createdon=from_createdon,
310
+ stop_time=stop_time,
311
+ set_structured_properties_filter=True,
312
+ )
313
+
314
+ logger.info(
315
+ f"Waiting for {self.config.structured_properties_template_cache_invalidation_interval} seconds for structured properties cache to invalidate"
316
+ )
317
+
318
+ time.sleep(
319
+ self.config.structured_properties_template_cache_invalidation_interval
320
+ )
321
+
322
+ logger.info("Fetching aspects")
323
+ yield from self.get_aspects(
324
+ from_createdon=from_createdon,
325
+ stop_time=stop_time,
326
+ set_structured_properties_filter=False,
327
+ )
328
+
329
+ def get_aspects(
330
+ self,
331
+ from_createdon: datetime,
332
+ stop_time: datetime,
333
+ set_structured_properties_filter: bool = False,
181
334
  ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
182
335
  orderer = VersionOrderer[Dict[str, Any]](
183
336
  enabled=self.config.include_all_versions
184
337
  )
185
- rows = self._get_rows(from_createdon=from_createdon, stop_time=stop_time)
338
+ rows = self._get_rows(
339
+ start_date=from_createdon,
340
+ end_date=stop_time,
341
+ set_structured_properties_filter=set_structured_properties_filter,
342
+ limit=self.config.database_query_batch_size,
343
+ )
186
344
  for row in orderer(rows):
187
345
  mcp = self._parse_row(row)
188
346
  if mcp:
@@ -190,22 +348,29 @@ class DataHubDatabaseReader:
190
348
 
191
349
  def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]:
192
350
  """
193
- Fetches all soft-deleted entities from the database.
351
+ Fetches all soft-deleted entities from the database using pagination.
194
352
 
195
353
  Yields:
196
354
  Row objects containing URNs of soft-deleted entities
197
355
  """
198
- with self.engine.connect() as conn:
199
- with contextlib.closing(conn.connection.cursor()) as cursor:
200
- logger.debug("Polling soft-deleted urns from database")
201
- cursor.execute(self.soft_deleted_urns_query)
202
- columns = [desc[0] for desc in cursor.description]
203
- while True:
204
- rows = cursor.fetchmany(self.config.database_query_batch_size)
205
- if not rows:
206
- return
207
- for row in rows:
208
- yield dict(zip(columns, row))
356
+ try:
357
+ params: Dict = {}
358
+
359
+ logger.debug("Fetching soft-deleted URNs")
360
+
361
+ # Use server-side cursor implementation
362
+ rows = self.execute_server_cursor(self.soft_deleted_urns_query, params)
363
+ processed_rows = 0
364
+ # Process and yield rows
365
+ for row in rows:
366
+ processed_rows += 1
367
+ yield row
368
+
369
+ logger.debug(f"Fetched batch of {processed_rows} soft-deleted URNs")
370
+
371
+ except Exception:
372
+ logger.exception("Error fetching soft-deleted row", exc_info=True)
373
+ raise
209
374
 
210
375
  def _parse_row(
211
376
  self, row: Dict[str, Any]
@@ -215,12 +380,16 @@ class DataHubDatabaseReader:
215
380
  json_metadata = post_json_transform(
216
381
  json.loads(row["systemmetadata"] or "{}")
217
382
  )
218
- system_metadata = SystemMetadataClass.from_obj(json_metadata)
383
+ system_metadata = None
384
+ if self.config.preserve_system_metadata:
385
+ system_metadata = SystemMetadataClass.from_obj(json_metadata)
386
+ if system_metadata.properties:
387
+ is_no_op = system_metadata.properties.pop("isNoOp", None)
388
+ logger.debug(f"Removed potential value for is_no_op={is_no_op}")
219
389
  return MetadataChangeProposalWrapper(
220
390
  entityUrn=row["urn"],
221
391
  aspect=ASPECT_MAP[row["aspect"]].from_obj(json_aspect),
222
392
  systemMetadata=system_metadata,
223
- changeType=ChangeTypeClass.UPSERT,
224
393
  )
225
394
  except Exception as e:
226
395
  logger.warning(
@@ -6,7 +6,9 @@ from typing import Dict, Iterable, List, Optional
6
6
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
7
7
  from datahub.ingestion.api.common import PipelineContext
8
8
  from datahub.ingestion.api.decorators import (
9
+ SourceCapability,
9
10
  SupportStatus,
11
+ capability,
10
12
  config_class,
11
13
  platform_name,
12
14
  support_status,
@@ -17,6 +19,7 @@ from datahub.ingestion.api.source_helpers import (
17
19
  auto_workunit_reporter,
18
20
  )
19
21
  from datahub.ingestion.api.workunit import MetadataWorkUnit
22
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
20
23
  from datahub.ingestion.source.datahub.config import DataHubSourceConfig
21
24
  from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
22
25
  from datahub.ingestion.source.datahub.datahub_database_reader import (
@@ -37,6 +40,13 @@ logger = logging.getLogger(__name__)
37
40
  @platform_name("DataHub")
38
41
  @config_class(DataHubSourceConfig)
39
42
  @support_status(SupportStatus.TESTING)
43
+ @capability(
44
+ SourceCapability.CONTAINERS,
45
+ "Enabled by default",
46
+ subtype_modifier=[
47
+ SourceCapabilityModifier.DATABASE,
48
+ ],
49
+ )
40
50
  class DataHubSource(StatefulIngestionSourceBase):
41
51
  platform: str = "datahub"
42
52
 
@@ -117,7 +127,7 @@ class DataHubSource(StatefulIngestionSourceBase):
117
127
  ) -> Iterable[MetadataWorkUnit]:
118
128
  logger.info(f"Fetching database aspects starting from {from_createdon}")
119
129
  progress = ProgressTimer(report_every=timedelta(seconds=60))
120
- mcps = reader.get_aspects(from_createdon, self.report.stop_time)
130
+ mcps = reader.get_all_aspects(from_createdon, self.report.stop_time)
121
131
  for i, (mcp, createdon) in enumerate(mcps):
122
132
  if not self.urn_pattern.allowed(str(mcp.entityUrn)):
123
133
  continue
@@ -9,6 +9,7 @@ import requests
9
9
  from pydantic import Field, root_validator
10
10
 
11
11
  from datahub.ingestion.api.decorators import (
12
+ SourceCapability,
12
13
  SupportStatus,
13
14
  capability,
14
15
  config_class,
@@ -17,7 +18,6 @@ from datahub.ingestion.api.decorators import (
17
18
  )
18
19
  from datahub.ingestion.api.source import (
19
20
  CapabilityReport,
20
- SourceCapability,
21
21
  TestableSource,
22
22
  TestConnectionReport,
23
23
  )
@@ -26,6 +26,7 @@ from datahub.ingestion.source.dbt.dbt_common import (
26
26
  DBTCommonConfig,
27
27
  DBTNode,
28
28
  DBTSourceBase,
29
+ DBTSourceReport,
29
30
  )
30
31
  from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult
31
32
 
@@ -262,16 +263,16 @@ query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{
262
263
 
263
264
  @platform_name("dbt")
264
265
  @config_class(DBTCloudConfig)
265
- @support_status(SupportStatus.INCUBATING)
266
- @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
267
- @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
266
+ @support_status(SupportStatus.CERTIFIED)
267
+ @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
268
268
  class DBTCloudSource(DBTSourceBase, TestableSource):
269
269
  config: DBTCloudConfig
270
+ report: DBTSourceReport # nothing cloud-specific in the report
270
271
 
271
272
  @classmethod
272
273
  def create(cls, config_dict, ctx):
273
274
  config = DBTCloudConfig.parse_obj(config_dict)
274
- return cls(config, ctx, "dbt")
275
+ return cls(config, ctx)
275
276
 
276
277
  @staticmethod
277
278
  def test_connection(config_dict: dict) -> TestConnectionReport:
@@ -369,9 +370,12 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
369
370
  name = node["alias"]
370
371
 
371
372
  comment = node.get("comment", "")
372
- description = node["description"]
373
- if node.get("sourceDescription"):
374
- description = node["sourceDescription"]
373
+
374
+ # In dbt sources, there are two types of descriptions:
375
+ # - description: table-level description (specific to the source table)
376
+ # - sourceDescription: schema-level description (describes the overall source schema)
377
+ # The table-level description should take precedence since it's more specific.
378
+ description = node["description"] or node.get("sourceDescription", "")
375
379
 
376
380
  if node["resourceType"] == "model":
377
381
  materialization = node["materializedType"]
@@ -405,8 +409,11 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
405
409
  if node["resourceType"] in {"model", "seed", "snapshot"}:
406
410
  status = node["status"]
407
411
  if status is None and materialization != "ephemeral":
408
- self.report.report_warning(
409
- key, "node is missing a status, schema metadata will be incomplete"
412
+ self.report.warning(
413
+ title="Schema information may be incomplete",
414
+ message="Some nodes are missing the `status` field, which dbt uses to track the status of the node in the target database.",
415
+ context=key,
416
+ log=False,
410
417
  )
411
418
 
412
419
  # The code fields are new in dbt 1.3, and replace the sql ones.