acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/entrypoints.py CHANGED
@@ -10,6 +10,7 @@ import click
 import datahub._version as datahub_version
 from datahub.cli.check_cli import check
 from datahub.cli.cli_utils import (
+    enable_auto_decorators,
     fixup_gms_url,
     generate_access_token,
     make_shim_command,
@@ -21,6 +22,7 @@ from datahub.cli.docker_cli import docker
 from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.cli.exists_cli import exists
 from datahub.cli.get_cli import get
+from datahub.cli.graphql_cli import graphql
 from datahub.cli.ingest_cli import ingest
 from datahub.cli.migrate import migrate
 from datahub.cli.put_cli import put
@@ -37,7 +39,7 @@ from datahub.cli.telemetry import telemetry as telemetry_cli
 from datahub.cli.timeline_cli import timeline
 from datahub.configuration.common import should_show_stack_trace
 from datahub.ingestion.graph.client import get_default_graph
-from datahub.telemetry import telemetry
+from datahub.ingestion.graph.config import ClientMode
 from datahub.utilities._custom_package_loader import model_version_name
 from datahub.utilities.logging_manager import configure_logging
 from datahub.utilities.server_config_util import get_gms_config
@@ -49,8 +51,9 @@ MAX_CONTENT_WIDTH = 120
 
 if sys.version_info >= (3, 12):
     click.secho(
-        "Python versions above 3.11 are not tested with. Please use Python 3.11.",
+        "Python versions above 3.11 are not actively tested with yet. Please use Python 3.11 for now.",
         fg="red",
+        err=True,
     )
 
 
@@ -109,7 +112,6 @@ def datahub(
     default=False,
     help="If passed will show server config. Assumes datahub init has happened.",
 )
-@telemetry.with_telemetry()
 def version(include_server: bool = False) -> None:
     """Print version number and exit."""
 
@@ -117,7 +119,7 @@ def version(include_server: bool = False) -> None:
     click.echo(f"Models: {model_version_name()}")
     click.echo(f"Python version: {sys.version}")
     if include_server:
-        server_config = get_default_graph().get_config()
+        server_config = get_default_graph(ClientMode.CLI).get_config()
         click.echo(f"Server config: {server_config}")
 
 
@@ -129,7 +131,6 @@ def version(include_server: bool = False) -> None:
     default=False,
     help="If passed then uses password to initialise token.",
 )
-@telemetry.with_telemetry()
 def init(use_password: bool = False) -> None:
     """Configure which datahub instance to connect to"""
 
@@ -169,6 +170,7 @@ datahub.add_command(ingest)
 datahub.add_command(delete)
 datahub.add_command(exists)
 datahub.add_command(get)
+datahub.add_command(graphql)
 datahub.add_command(put)
 datahub.add_command(state)
 datahub.add_command(telemetry_cli)
@@ -216,6 +218,9 @@ except ImportError as e:
         make_shim_command("actions", "run `pip install acryl-datahub-actions`")
     )
 
+# Adding telemetry and upgrade decorators to all commands
+enable_auto_decorators(datahub)
+
 
 def main(**kwargs):
     # We use threads in a variety of places within our CLI. The multiprocessing
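Note on the change above: the per-command @telemetry.with_telemetry() decorators on version and init are removed, and a single enable_auto_decorators(datahub) call (described in the diff as adding telemetry and upgrade decorators to all commands) now runs once after every command is registered. The helper itself lives in datahub.cli.cli_utils and is not part of this diff; the following is only a rough sketch of that "walk a click group and wrap every callback" pattern, assuming a plain recursive traversal.

    import click

    def apply_to_all_commands(group: click.Group, decorator) -> None:
        # Hypothetical helper: recursively wrap the callback of every registered command.
        for cmd in group.commands.values():
            if isinstance(cmd, click.Group):
                apply_to_all_commands(cmd, decorator)
            if cmd.callback is not None:
                cmd.callback = decorator(cmd.callback)

    # e.g. apply_to_all_commands(datahub, telemetry.with_telemetry()) -- illustrative only;
    # the real enable_auto_decorators may differ.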
datahub/errors.py CHANGED
@@ -31,5 +31,17 @@ class MultipleSubtypesWarning(Warning):
     pass
 
 
+class SearchFilterWarning(Warning):
+    pass
+
+
 class ExperimentalWarning(Warning):
     pass
+
+
+class APITracingWarning(Warning):
+    pass
+
+
+class DataHubDeprecationWarning(DeprecationWarning):
+    pass
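The new warning categories above (SearchFilterWarning, APITracingWarning, DataHubDeprecationWarning) are plain subclasses of the standard Warning/DeprecationWarning hierarchy, so they work with the regular warnings machinery. A small usage sketch, assuming only what this file defines:

    import warnings

    from datahub.errors import APITracingWarning, DataHubDeprecationWarning

    # Emit a category-specific warning so callers can react to (or filter) it precisely.
    warnings.warn("server did not return trace headers", APITracingWarning)

    # Silence one DataHub-specific category without hiding other DeprecationWarnings.
    warnings.filterwarnings("ignore", category=DataHubDeprecationWarning)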
datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py CHANGED
@@ -1,5 +1,6 @@
 import json
 import logging
+import os
 from typing import TYPE_CHECKING, Iterable, List
 
 from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
@@ -7,15 +8,36 @@ from datahub.emitter.serialization_helper import pre_json_transform
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     DatasetProfileClass,
+    QueryPropertiesClass,
+    QuerySubjectsClass,
     SchemaFieldClass,
     SchemaMetadataClass,
+    UpstreamLineageClass,
 )
 
 if TYPE_CHECKING:
     from datahub.ingestion.api.source import SourceReport
 
+
+# TODO: ordering
+# In the cases where we trim collections of data (e.g. fields in schema, upstream lineage, query subjects), given
+# those collections are typically unordered, we should consider sorting them by some criteria (e.g. size, alphabetically)
+# so that the trimming is deterministic and predictable and more importantly consistent across executions.
+# In the case of schemaMetadata, that's more relevant as currently we may be trimming fields while adding nested ones,
+# which may lead to poorly schema rendering in the UI.
+
 logger = logging.getLogger(__name__)
 
+DEFAULT_QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES = 5 * 1024 * 1024  # 5MB
+QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES = int(
+    os.environ.get(
+        "QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES",
+        DEFAULT_QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES,
+    )
+)
+
+QUERY_STATEMENT_TRUNCATION_BUFFER = 100
+
 
 class EnsureAspectSizeProcessor:
     def __init__(
@@ -23,6 +45,7 @@ class EnsureAspectSizeProcessor:
     ):
         self.report = report
         self.payload_constraint = payload_constraint
+        self.schema_size_constraint = int(self.payload_constraint * 0.985)
 
     def ensure_dataset_profile_size(
         self, dataset_urn: str, profile: DatasetProfileClass
@@ -68,7 +91,7 @@
         for field in schema.fields:
             field_size = len(json.dumps(pre_json_transform(field.to_obj())))
             logger.debug(f"Field {field.fieldPath} takes total {field_size}")
-            if total_fields_size + field_size < self.payload_constraint:
+            if total_fields_size + field_size < self.schema_size_constraint:
                 accepted_fields.append(field)
                 total_fields_size += field_size
             else:
@@ -80,6 +103,274 @@ class EnsureAspectSizeProcessor:
 
         schema.fields = accepted_fields
 
+    def ensure_query_subjects_size(
+        self, entity_urn: str, query_subjects: QuerySubjectsClass
+    ) -> None:
+        """
+        Ensure query subjects aspect does not exceed allowed size by removing column-level lineage first,
+        then table lineage if necessary.
+        """
+        if not query_subjects.subjects:
+            return
+
+        total_subjects_size = 0
+        accepted_table_level_subjects = []
+        accepted_column_level_subjects = []
+        column_level_subjects_with_sizes = []
+        table_level_subjects_with_sizes = []
+
+        # Separate column-level and table-level subjects
+        for subject in query_subjects.subjects:
+            subject_size = len(json.dumps(pre_json_transform(subject.to_obj())))
+
+            if subject.entity.startswith("urn:li:schemaField:"):
+                column_level_subjects_with_sizes.append((subject, subject_size))
+            else:
+                table_level_subjects_with_sizes.append((subject, subject_size))
+
+        # Once we find one that doesn't fit, stop everything else to prevent inconsistencies
+        first_skip_done = False
+
+        # First, try to include all table-level subjects
+        for subject, subject_size in table_level_subjects_with_sizes:
+            if total_subjects_size + subject_size < self.payload_constraint:
+                accepted_table_level_subjects.append(subject)
+                total_subjects_size += subject_size
+            else:
+                first_skip_done = True
+                break
+
+        # Then, add column-level subjects if there's remaining space
+        # Only process if we successfully included all table-level subjects
+        if not first_skip_done:
+            for subject, subject_size in column_level_subjects_with_sizes:
+                if total_subjects_size + subject_size < self.payload_constraint:
+                    accepted_column_level_subjects.append(subject)
+                    total_subjects_size += subject_size
+                else:
+                    first_skip_done = True
+                    break
+
+        if first_skip_done:
+            # Log aggregate warnings
+            table_level_skipped_count = len(table_level_subjects_with_sizes) - len(
+                accepted_table_level_subjects
+            )
+            column_level_skipped_count = len(column_level_subjects_with_sizes) - len(
+                accepted_column_level_subjects
+            )
+
+            self._maybe_warn_query_subjects(
+                entity_urn, table_level_skipped_count, "table-level lineage subjects"
+            )
+            self._maybe_warn_query_subjects(
+                entity_urn, column_level_skipped_count, "column-level lineage subjects"
+            )
+
+        query_subjects.subjects = (
+            accepted_table_level_subjects + accepted_column_level_subjects
+        )
+
+    def _maybe_warn_query_subjects(
+        self, entity_urn: str, skipped_count: int, item_type: str
+    ) -> None:
+        """Log warning for query subjects truncation if any items were skipped."""
+        if skipped_count > 0:
+            self.report.warning(
+                title="Query subjects truncated due to size constraint",
+                message="Query subjects contained too much data and would have caused ingestion to fail",
+                context=f"Skipped {skipped_count} {item_type} for {entity_urn} due to aspect size constraints",
+            )
+
+    def _maybe_warn_upstream_lineage(
+        self, entity_urn: str, skipped_count: int, item_type: str
+    ) -> None:
+        """Log warning for upstream lineage truncation if any items were skipped."""
+        if skipped_count > 0:
+            self.report.warning(
+                title="Upstream lineage truncated due to size constraint",
+                message="Upstream lineage contained too much data and would have caused ingestion to fail",
+                context=f"Skipped {skipped_count} {item_type} for {entity_urn} due to aspect size constraints",
+            )
+
+    def ensure_upstream_lineage_size(  # noqa: C901
+        self, entity_urn: str, upstream_lineage: UpstreamLineageClass
+    ) -> None:
+        """
+        Ensure upstream lineage aspect does not exceed allowed size by removing lineage in priority order:
+        first NONE fine-grained lineages (lowest priority), then FIELD_SET fine-grained lineages,
+        then DATASET fine-grained lineages, and finally upstreams (highest priority).
+        """
+        if not upstream_lineage.fineGrainedLineages and not upstream_lineage.upstreams:
+            return
+
+        total_lineage_size = 0
+        accepted_upstreams = []
+        accepted_dataset_fg_lineages = []
+        accepted_field_set_fg_lineages = []
+        accepted_none_fg_lineages = []
+        upstream_items_with_sizes = []
+        dataset_fg_items_with_sizes = []
+        field_set_fg_items_with_sizes = []
+        none_fg_items_with_sizes = []
+
+        # Add upstreams (highest priority)
+        if upstream_lineage.upstreams:
+            for upstream in upstream_lineage.upstreams:
+                upstream_size = len(json.dumps(pre_json_transform(upstream.to_obj())))
+                upstream_items_with_sizes.append((upstream, upstream_size))
+
+        # Separate fine-grained lineage items by upstreamType: DATASET > FIELD_SET > NONE
+        if upstream_lineage.fineGrainedLineages:
+            for fg_lineage in upstream_lineage.fineGrainedLineages:
+                fg_lineage_size = len(
+                    json.dumps(pre_json_transform(fg_lineage.to_obj()))
+                )
+
+                upstream_type_str = str(fg_lineage.upstreamType)
+                if upstream_type_str == "DATASET":
+                    dataset_fg_items_with_sizes.append((fg_lineage, fg_lineage_size))
+                elif upstream_type_str == "FIELD_SET":
+                    field_set_fg_items_with_sizes.append((fg_lineage, fg_lineage_size))
+                elif upstream_type_str == "NONE":
+                    none_fg_items_with_sizes.append((fg_lineage, fg_lineage_size))
+
+        # Once we find one that doesn't fit, stop everything else to prevent inconsistencies
+        first_skip_done = False
+
+        # First, include all upstreams (highest priority)
+        for item, item_size in upstream_items_with_sizes:
+            if total_lineage_size + item_size < self.payload_constraint:
+                accepted_upstreams.append(item)
+                total_lineage_size += item_size
+            else:
+                first_skip_done = True
+                break
+
+        # Second, include DATASET fine-grained lineages if no upstreams were skipped
+        if not first_skip_done:
+            for fg_lineage, fg_lineage_size in dataset_fg_items_with_sizes:
+                if total_lineage_size + fg_lineage_size < self.payload_constraint:
+                    accepted_dataset_fg_lineages.append(fg_lineage)
+                    total_lineage_size += fg_lineage_size
+                else:
+                    first_skip_done = True
+                    break
+
+        # Third, include FIELD_SET fine-grained lineages if no higher priority items were skipped
+        if not first_skip_done:
+            for fg_lineage, fg_lineage_size in field_set_fg_items_with_sizes:
+                if total_lineage_size + fg_lineage_size < self.payload_constraint:
+                    accepted_field_set_fg_lineages.append(fg_lineage)
+                    total_lineage_size += fg_lineage_size
+                else:
+                    first_skip_done = True
+                    break
+
+        # Finally, include NONE fine-grained lineages if no higher priority items were skipped
+        if not first_skip_done:
+            for fg_lineage, fg_lineage_size in none_fg_items_with_sizes:
+                if total_lineage_size + fg_lineage_size < self.payload_constraint:
+                    accepted_none_fg_lineages.append(fg_lineage)
+                    total_lineage_size += fg_lineage_size
+                else:
+                    first_skip_done = True
+                    break
+
+        # Log aggregate warnings instead of per-item warnings
+        if first_skip_done:
+            upstreams_skipped_count = len(upstream_items_with_sizes) - len(
+                accepted_upstreams
+            )
+            dataset_fg_skipped_count = len(dataset_fg_items_with_sizes) - len(
+                accepted_dataset_fg_lineages
+            )
+            field_set_fg_skipped_count = len(field_set_fg_items_with_sizes) - len(
+                accepted_field_set_fg_lineages
+            )
+            none_fg_skipped_count = len(none_fg_items_with_sizes) - len(
+                accepted_none_fg_lineages
+            )
+
+            self._maybe_warn_upstream_lineage(
+                entity_urn, upstreams_skipped_count, "upstream datasets"
+            )
+            self._maybe_warn_upstream_lineage(
+                entity_urn,
+                dataset_fg_skipped_count,
+                "dataset-level fine-grained lineages",
+            )
+            self._maybe_warn_upstream_lineage(
+                entity_urn,
+                field_set_fg_skipped_count,
+                "field-set-level fine-grained lineages",
+            )
+            self._maybe_warn_upstream_lineage(
+                entity_urn, none_fg_skipped_count, "none-level fine-grained lineages"
+            )
+
+        # Combine all accepted fine-grained lineages
+        accepted_fine_grained_lineages = (
+            accepted_dataset_fg_lineages
+            + accepted_field_set_fg_lineages
+            + accepted_none_fg_lineages
+        )
+
+        upstream_lineage.upstreams = accepted_upstreams
+        upstream_lineage.fineGrainedLineages = (
+            accepted_fine_grained_lineages if accepted_fine_grained_lineages else None
+        )
+
+    def ensure_query_properties_size(
+        self, entity_urn: str, query_properties: QueryPropertiesClass
+    ) -> None:
+        """
+        Ensure query properties aspect does not exceed allowed size by truncating the query statement value.
+        Uses a configurable max payload size that is the minimum between QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES
+        and INGEST_MAX_PAYLOAD_BYTES.
+
+        We have found surprisingly large query statements (e.g. 20MB+) that caused ingestion to fail;
+        that was INSERT INTO VALUES with huge list of values.
+        """
+        if not query_properties.statement or not query_properties.statement.value:
+            return
+
+        max_payload_size = min(
+            QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES, self.payload_constraint
+        )
+
+        current_size = len(json.dumps(pre_json_transform(query_properties.to_obj())))
+
+        if current_size < max_payload_size:
+            return
+
+        reduction_needed = (
+            current_size - max_payload_size + QUERY_STATEMENT_TRUNCATION_BUFFER
+        )
+
+        statement_value_size = len(query_properties.statement.value)
+        original_statement_size = statement_value_size
+
+        # Only truncate if reduction is actually needed and possible
+        if statement_value_size > reduction_needed > 0:
+            new_statement_length = statement_value_size - reduction_needed
+            truncated_statement = query_properties.statement.value[
+                :new_statement_length
+            ]
+
+            truncation_message = f"... [original value was {original_statement_size} bytes and truncated to {new_statement_length} bytes]"
+            query_properties.statement.value = truncated_statement + truncation_message
+
+            self.report.warning(
+                title="Query properties truncated due to size constraint",
+                message="Query properties contained too much data and would have caused ingestion to fail",
+                context=f"Query statement was truncated from {original_statement_size} to {new_statement_length} characters for {entity_urn} due to aspect size constraints",
+            )
+        else:
+            logger.warning(
+                f"Cannot truncate query statement for {entity_urn} as it is smaller than or equal to the required reduction size {reduction_needed}. That means that 'ensure_query_properties_size' must be extended to trim other fields different than statement."
+            )
+
     def ensure_aspect_size(
         self,
         stream: Iterable[MetadataWorkUnit],
@@ -89,10 +380,16 @@ class EnsureAspectSizeProcessor:
         on GMS side and failure of the entire ingestion. This processor will attempt to trim suspected aspects.
         """
         for wu in stream:
-            logger.debug(f"Ensuring size of workunit: {wu.id}")
+            # logger.debug(f"Ensuring size of workunit: {wu.id}")
 
             if schema := wu.get_aspect_of_type(SchemaMetadataClass):
                 self.ensure_schema_metadata_size(wu.get_urn(), schema)
            elif profile := wu.get_aspect_of_type(DatasetProfileClass):
                 self.ensure_dataset_profile_size(wu.get_urn(), profile)
+            elif query_subjects := wu.get_aspect_of_type(QuerySubjectsClass):
+                self.ensure_query_subjects_size(wu.get_urn(), query_subjects)
+            elif upstream_lineage := wu.get_aspect_of_type(UpstreamLineageClass):
+                self.ensure_upstream_lineage_size(wu.get_urn(), upstream_lineage)
+            elif query_properties := wu.get_aspect_of_type(QueryPropertiesClass):
+                self.ensure_query_properties_size(wu.get_urn(), query_properties)
             yield wu
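The ensure_query_properties_size method above trims only the SQL statement text, keeping the serialized aspect under min(QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES, payload_constraint) plus a 100-byte truncation buffer. A toy walk-through of that arithmetic, mirroring the code above (the 15 MiB payload constraint is illustrative and not taken from this diff):

    # Toy numbers to trace the truncation math in ensure_query_properties_size.
    payload_constraint = 15 * 1024 * 1024        # overall ingest payload limit (illustrative)
    statement_cap = 5 * 1024 * 1024              # default QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES
    max_payload_size = min(statement_cap, payload_constraint)      # 5 MiB wins here

    current_size = 20 * 1024 * 1024              # e.g. a huge INSERT INTO ... VALUES statement
    reduction_needed = current_size - max_payload_size + 100       # + QUERY_STATEMENT_TRUNCATION_BUFFER

    statement_value_size = 20 * 1024 * 1024
    new_statement_length = statement_value_size - reduction_needed
    print(new_statement_length)                  # 5242780 characters kept, plus a truncation note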
datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py ADDED
@@ -0,0 +1,87 @@
+import logging
+from typing import TYPE_CHECKING, Iterable, List
+
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.metadata.schema_classes import InputFieldClass, InputFieldsClass
+
+if TYPE_CHECKING:
+    from datahub.ingestion.api.source import SourceReport
+
+logger = logging.getLogger(__name__)
+
+
+class ValidateInputFieldsProcessor:
+    def __init__(self, report: "SourceReport"):
+        self.report = report
+
+    def validate_input_fields(
+        self,
+        stream: Iterable[MetadataWorkUnit],
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        Validate input fields and filter out invalid ones.
+
+        Invalid input fields have empty or missing fieldPath values, which would cause
+        URN generation to fail when sent to the server. This processor filters them out
+        and reports them as warnings.
+        """
+        for wu in stream:
+            input_fields_aspect = wu.get_aspect_of_type(InputFieldsClass)
+            if input_fields_aspect and input_fields_aspect.fields:
+                valid_fields: List[InputFieldClass] = []
+                invalid_count = 0
+
+                for input_field in input_fields_aspect.fields:
+                    if (
+                        input_field.schemaField
+                        and input_field.schemaField.fieldPath
+                        and input_field.schemaField.fieldPath.strip()
+                    ):
+                        valid_fields.append(input_field)
+                    else:
+                        invalid_count += 1
+
+                if invalid_count > 0:
+                    logger.debug(
+                        f"Filtered {invalid_count} invalid input field(s) with empty fieldPath for {wu.get_urn()}"
+                    )
+                    self.report.num_input_fields_filtered += invalid_count
+                    self.report.warning(
+                        title="Invalid input fields filtered",
+                        message="Input fields with empty fieldPath values were filtered out to prevent ingestion errors",
+                        context=f"Filtered {invalid_count} invalid input field(s) for {wu.get_urn()}",
+                    )
+
+                # Update the aspect with only valid fields
+                if valid_fields:
+                    input_fields_aspect.fields = valid_fields
+                else:
+                    # If no valid fields remain, skip this workunit entirely
+                    logger.debug(
+                        f"All input fields were invalid for {wu.get_urn()}, skipping InputFieldsClass workunit"
+                    )
+                    # Don't yield this workunit
+                    continue
+
+            yield wu
+
+    def _remove_input_fields_aspect(self, wu: MetadataWorkUnit) -> MetadataWorkUnit:
+        """Remove InputFieldsClass aspect from a workunit."""
+        # For MCPs, we can simply not yield the aspect
+        # For MCEs, we need to remove it from the snapshot
+        if hasattr(wu.metadata, "aspect") and isinstance(
+            wu.metadata.aspect, InputFieldsClass
+        ):
+            # This is an MCP with InputFieldsClass, skip it
+            return wu
+
+        if hasattr(wu.metadata, "proposedSnapshot"):
+            snapshot = wu.metadata.proposedSnapshot
+            if hasattr(snapshot, "aspects"):
+                snapshot.aspects = [
+                    aspect
+                    for aspect in snapshot.aspects
+                    if not isinstance(aspect, InputFieldsClass)
+                ]
+
+        return wu
datahub/ingestion/api/common.py CHANGED
@@ -12,6 +12,9 @@ if TYPE_CHECKING:
 
 T = TypeVar("T")
 
+if TYPE_CHECKING:
+    from datahub.ingestion.run.pipeline_config import FlagsConfig
+
 
 @dataclass
 class RecordEnvelope(Generic[T]):
@@ -60,6 +63,12 @@ class PipelineContext:
 
         self._set_dataset_urn_to_lower_if_needed()
 
+    @property
+    def flags(self) -> "FlagsConfig":
+        from datahub.ingestion.run.pipeline_config import FlagsConfig
+
+        return self.pipeline_config.flags if self.pipeline_config else FlagsConfig()
+
     def _set_dataset_urn_to_lower_if_needed(self) -> None:
         # TODO: Get rid of this function once lower-casing is the standard.
         if self.graph:
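The flags property added above imports FlagsConfig inside the property body because datahub.ingestion.run.pipeline_config itself depends on this module; the TYPE_CHECKING import keeps the annotation available to type checkers while the runtime import is deferred until first use. A hedged usage sketch (PipelineContext construction details beyond run_id are not shown in this diff):

    from datahub.ingestion.api.common import PipelineContext

    ctx = PipelineContext(run_id="demo-run")   # no pipeline_config attached here
    flags = ctx.flags                          # falls back to a default FlagsConfig()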
datahub/ingestion/api/decorators.py CHANGED
@@ -1,12 +1,16 @@
+# So that SourceCapabilityModifier can be resolved at runtime
+from __future__ import annotations
+
 from dataclasses import dataclass
 from enum import Enum, auto
-from typing import Callable, Dict, Optional, Type
+from typing import Callable, Dict, List, Optional, Type
 
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import (
     Source,
     SourceCapability as SourceCapability,
 )
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 
 
 def config_class(config_cls: Type) -> Callable[[Type], Type]:
@@ -88,10 +92,14 @@ class CapabilitySetting:
     capability: SourceCapability
     description: str
     supported: bool
+    subtype_modifier: Optional[List[SourceCapabilityModifier]] = None
 
 
 def capability(
-    capability_name: SourceCapability, description: str, supported: bool = True
+    capability_name: SourceCapability,
+    description: str,
+    supported: bool = True,
+    subtype_modifier: Optional[List[SourceCapabilityModifier]] = None,
 ) -> Callable[[Type], Type]:
     """
     A decorator to mark a source as having a certain capability
@@ -104,6 +112,7 @@ def capability(
             for base in cls.__bases__
         ):
             cls.__capabilities = {}
+
            cls.get_capabilities = lambda: cls.__capabilities.values()
 
             # If the superclasses have capability annotations, copy those over.
@@ -113,7 +122,10 @@
                     cls.__capabilities.update(base_caps)
 
         cls.__capabilities[capability_name] = CapabilitySetting(
-            capability=capability_name, description=description, supported=supported
+            capability=capability_name,
+            description=description,
+            supported=supported,
+            subtype_modifier=subtype_modifier,
         )
         return cls
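With the expanded signature above, sources can attach subtype modifiers to a capability declaration. A hedged example of how the decorator might be applied; SourceCapabilityModifier members are defined in datahub.ingestion.source.common.subtypes, which is not shown in this diff, so the TABLE member used below is a placeholder:

    from datahub.ingestion.api.decorators import capability
    from datahub.ingestion.api.source import Source, SourceCapability
    from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier


    @capability(
        SourceCapability.LINEAGE_COARSE,
        "Table-level lineage is extracted",
        subtype_modifier=[SourceCapabilityModifier.TABLE],  # placeholder member name
    )
    class MyCustomSource(Source):
        ...  # the rest of the source implementation is unaffected by the decorator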