acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only.

This version of acryl-datahub has been flagged as a potentially problematic release.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py
+++ b/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py
@@ -1,5 +1,6 @@
 import json
 import logging
+import os
 from typing import TYPE_CHECKING, Iterable, List
 
 from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
@@ -7,15 +8,36 @@ from datahub.emitter.serialization_helper import pre_json_transform
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     DatasetProfileClass,
+    QueryPropertiesClass,
+    QuerySubjectsClass,
     SchemaFieldClass,
     SchemaMetadataClass,
+    UpstreamLineageClass,
 )
 
 if TYPE_CHECKING:
     from datahub.ingestion.api.source import SourceReport
 
+
+# TODO: ordering
+# In the cases where we trim collections of data (e.g. fields in schema, upstream lineage, query subjects), given
+# those collections are typically unordered, we should consider sorting them by some criteria (e.g. size, alphabetically)
+# so that the trimming is deterministic and predictable and more importantly consistent across executions.
+# In the case of schemaMetadata, that's more relevant as currently we may be trimming fields while adding nested ones,
+# which may lead to poorly schema rendering in the UI.
+
 logger = logging.getLogger(__name__)
 
+DEFAULT_QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES = 5 * 1024 * 1024  # 5MB
+QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES = int(
+    os.environ.get(
+        "QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES",
+        DEFAULT_QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES,
+    )
+)
+
+QUERY_STATEMENT_TRUNCATION_BUFFER = 100
+
 
 class EnsureAspectSizeProcessor:
     def __init__(
@@ -81,6 +103,274 @@ class EnsureAspectSizeProcessor:
 
         schema.fields = accepted_fields
 
+    def ensure_query_subjects_size(
+        self, entity_urn: str, query_subjects: QuerySubjectsClass
+    ) -> None:
+        """
+        Ensure query subjects aspect does not exceed allowed size by removing column-level lineage first,
+        then table lineage if necessary.
+        """
+        if not query_subjects.subjects:
+            return
+
+        total_subjects_size = 0
+        accepted_table_level_subjects = []
+        accepted_column_level_subjects = []
+        column_level_subjects_with_sizes = []
+        table_level_subjects_with_sizes = []
+
+        # Separate column-level and table-level subjects
+        for subject in query_subjects.subjects:
+            subject_size = len(json.dumps(pre_json_transform(subject.to_obj())))
+
+            if subject.entity.startswith("urn:li:schemaField:"):
+                column_level_subjects_with_sizes.append((subject, subject_size))
+            else:
+                table_level_subjects_with_sizes.append((subject, subject_size))
+
+        # Once we find one that doesn't fit, stop everything else to prevent inconsistencies
+        first_skip_done = False
+
+        # First, try to include all table-level subjects
+        for subject, subject_size in table_level_subjects_with_sizes:
+            if total_subjects_size + subject_size < self.payload_constraint:
+                accepted_table_level_subjects.append(subject)
+                total_subjects_size += subject_size
+            else:
+                first_skip_done = True
+                break
+
+        # Then, add column-level subjects if there's remaining space
+        # Only process if we successfully included all table-level subjects
+        if not first_skip_done:
+            for subject, subject_size in column_level_subjects_with_sizes:
+                if total_subjects_size + subject_size < self.payload_constraint:
+                    accepted_column_level_subjects.append(subject)
+                    total_subjects_size += subject_size
+                else:
+                    first_skip_done = True
+                    break
+
+        if first_skip_done:
+            # Log aggregate warnings
+            table_level_skipped_count = len(table_level_subjects_with_sizes) - len(
+                accepted_table_level_subjects
+            )
+            column_level_skipped_count = len(column_level_subjects_with_sizes) - len(
+                accepted_column_level_subjects
+            )
+
+            self._maybe_warn_query_subjects(
+                entity_urn, table_level_skipped_count, "table-level lineage subjects"
+            )
+            self._maybe_warn_query_subjects(
+                entity_urn, column_level_skipped_count, "column-level lineage subjects"
+            )
+
+        query_subjects.subjects = (
+            accepted_table_level_subjects + accepted_column_level_subjects
+        )
+
+    def _maybe_warn_query_subjects(
+        self, entity_urn: str, skipped_count: int, item_type: str
+    ) -> None:
+        """Log warning for query subjects truncation if any items were skipped."""
+        if skipped_count > 0:
+            self.report.warning(
+                title="Query subjects truncated due to size constraint",
+                message="Query subjects contained too much data and would have caused ingestion to fail",
+                context=f"Skipped {skipped_count} {item_type} for {entity_urn} due to aspect size constraints",
+            )
+
+    def _maybe_warn_upstream_lineage(
+        self, entity_urn: str, skipped_count: int, item_type: str
+    ) -> None:
+        """Log warning for upstream lineage truncation if any items were skipped."""
+        if skipped_count > 0:
+            self.report.warning(
+                title="Upstream lineage truncated due to size constraint",
+                message="Upstream lineage contained too much data and would have caused ingestion to fail",
+                context=f"Skipped {skipped_count} {item_type} for {entity_urn} due to aspect size constraints",
+            )
+
+    def ensure_upstream_lineage_size(  # noqa: C901
+        self, entity_urn: str, upstream_lineage: UpstreamLineageClass
+    ) -> None:
+        """
+        Ensure upstream lineage aspect does not exceed allowed size by removing lineage in priority order:
+        first NONE fine-grained lineages (lowest priority), then FIELD_SET fine-grained lineages,
+        then DATASET fine-grained lineages, and finally upstreams (highest priority).
+        """
+        if not upstream_lineage.fineGrainedLineages and not upstream_lineage.upstreams:
+            return
+
+        total_lineage_size = 0
+        accepted_upstreams = []
+        accepted_dataset_fg_lineages = []
+        accepted_field_set_fg_lineages = []
+        accepted_none_fg_lineages = []
+        upstream_items_with_sizes = []
+        dataset_fg_items_with_sizes = []
+        field_set_fg_items_with_sizes = []
+        none_fg_items_with_sizes = []
+
+        # Add upstreams (highest priority)
+        if upstream_lineage.upstreams:
+            for upstream in upstream_lineage.upstreams:
+                upstream_size = len(json.dumps(pre_json_transform(upstream.to_obj())))
+                upstream_items_with_sizes.append((upstream, upstream_size))
+
+        # Separate fine-grained lineage items by upstreamType: DATASET > FIELD_SET > NONE
+        if upstream_lineage.fineGrainedLineages:
+            for fg_lineage in upstream_lineage.fineGrainedLineages:
+                fg_lineage_size = len(
+                    json.dumps(pre_json_transform(fg_lineage.to_obj()))
+                )
+
+                upstream_type_str = str(fg_lineage.upstreamType)
+                if upstream_type_str == "DATASET":
+                    dataset_fg_items_with_sizes.append((fg_lineage, fg_lineage_size))
+                elif upstream_type_str == "FIELD_SET":
+                    field_set_fg_items_with_sizes.append((fg_lineage, fg_lineage_size))
+                elif upstream_type_str == "NONE":
+                    none_fg_items_with_sizes.append((fg_lineage, fg_lineage_size))
+
+        # Once we find one that doesn't fit, stop everything else to prevent inconsistencies
+        first_skip_done = False
+
+        # First, include all upstreams (highest priority)
+        for item, item_size in upstream_items_with_sizes:
+            if total_lineage_size + item_size < self.payload_constraint:
+                accepted_upstreams.append(item)
+                total_lineage_size += item_size
+            else:
+                first_skip_done = True
+                break
+
+        # Second, include DATASET fine-grained lineages if no upstreams were skipped
+        if not first_skip_done:
+            for fg_lineage, fg_lineage_size in dataset_fg_items_with_sizes:
+                if total_lineage_size + fg_lineage_size < self.payload_constraint:
+                    accepted_dataset_fg_lineages.append(fg_lineage)
+                    total_lineage_size += fg_lineage_size
+                else:
+                    first_skip_done = True
+                    break
+
+        # Third, include FIELD_SET fine-grained lineages if no higher priority items were skipped
+        if not first_skip_done:
+            for fg_lineage, fg_lineage_size in field_set_fg_items_with_sizes:
+                if total_lineage_size + fg_lineage_size < self.payload_constraint:
+                    accepted_field_set_fg_lineages.append(fg_lineage)
+                    total_lineage_size += fg_lineage_size
+                else:
+                    first_skip_done = True
+                    break
+
+        # Finally, include NONE fine-grained lineages if no higher priority items were skipped
+        if not first_skip_done:
+            for fg_lineage, fg_lineage_size in none_fg_items_with_sizes:
+                if total_lineage_size + fg_lineage_size < self.payload_constraint:
+                    accepted_none_fg_lineages.append(fg_lineage)
+                    total_lineage_size += fg_lineage_size
+                else:
+                    first_skip_done = True
+                    break
+
+        # Log aggregate warnings instead of per-item warnings
+        if first_skip_done:
+            upstreams_skipped_count = len(upstream_items_with_sizes) - len(
+                accepted_upstreams
+            )
+            dataset_fg_skipped_count = len(dataset_fg_items_with_sizes) - len(
+                accepted_dataset_fg_lineages
+            )
+            field_set_fg_skipped_count = len(field_set_fg_items_with_sizes) - len(
+                accepted_field_set_fg_lineages
+            )
+            none_fg_skipped_count = len(none_fg_items_with_sizes) - len(
+                accepted_none_fg_lineages
+            )
+
+            self._maybe_warn_upstream_lineage(
+                entity_urn, upstreams_skipped_count, "upstream datasets"
+            )
+            self._maybe_warn_upstream_lineage(
+                entity_urn,
+                dataset_fg_skipped_count,
+                "dataset-level fine-grained lineages",
+            )
+            self._maybe_warn_upstream_lineage(
+                entity_urn,
+                field_set_fg_skipped_count,
+                "field-set-level fine-grained lineages",
+            )
+            self._maybe_warn_upstream_lineage(
+                entity_urn, none_fg_skipped_count, "none-level fine-grained lineages"
+            )
+
+        # Combine all accepted fine-grained lineages
+        accepted_fine_grained_lineages = (
+            accepted_dataset_fg_lineages
+            + accepted_field_set_fg_lineages
+            + accepted_none_fg_lineages
+        )
+
+        upstream_lineage.upstreams = accepted_upstreams
+        upstream_lineage.fineGrainedLineages = (
+            accepted_fine_grained_lineages if accepted_fine_grained_lineages else None
+        )
+
+    def ensure_query_properties_size(
+        self, entity_urn: str, query_properties: QueryPropertiesClass
+    ) -> None:
+        """
+        Ensure query properties aspect does not exceed allowed size by truncating the query statement value.
+        Uses a configurable max payload size that is the minimum between QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES
+        and INGEST_MAX_PAYLOAD_BYTES.
+
+        We have found surprisingly large query statements (e.g. 20MB+) that caused ingestion to fail;
+        that was INSERT INTO VALUES with huge list of values.
+        """
+        if not query_properties.statement or not query_properties.statement.value:
+            return
+
+        max_payload_size = min(
+            QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES, self.payload_constraint
+        )
+
+        current_size = len(json.dumps(pre_json_transform(query_properties.to_obj())))
+
+        if current_size < max_payload_size:
+            return
+
+        reduction_needed = (
+            current_size - max_payload_size + QUERY_STATEMENT_TRUNCATION_BUFFER
+        )
+
+        statement_value_size = len(query_properties.statement.value)
+        original_statement_size = statement_value_size
+
+        # Only truncate if reduction is actually needed and possible
+        if statement_value_size > reduction_needed > 0:
+            new_statement_length = statement_value_size - reduction_needed
+            truncated_statement = query_properties.statement.value[
+                :new_statement_length
+            ]
+
+            truncation_message = f"... [original value was {original_statement_size} bytes and truncated to {new_statement_length} bytes]"
+            query_properties.statement.value = truncated_statement + truncation_message
+
+            self.report.warning(
+                title="Query properties truncated due to size constraint",
+                message="Query properties contained too much data and would have caused ingestion to fail",
+                context=f"Query statement was truncated from {original_statement_size} to {new_statement_length} characters for {entity_urn} due to aspect size constraints",
+            )
+        else:
+            logger.warning(
+                f"Cannot truncate query statement for {entity_urn} as it is smaller than or equal to the required reduction size {reduction_needed}. That means that 'ensure_query_properties_size' must be extended to trim other fields different than statement."
+            )
+
     def ensure_aspect_size(
         self,
         stream: Iterable[MetadataWorkUnit],
@@ -90,10 +380,16 @@ class EnsureAspectSizeProcessor:
         on GMS side and failure of the entire ingestion. This processor will attempt to trim suspected aspects.
         """
         for wu in stream:
-            logger.debug(f"Ensuring size of workunit: {wu.id}")
+            # logger.debug(f"Ensuring size of workunit: {wu.id}")
 
             if schema := wu.get_aspect_of_type(SchemaMetadataClass):
                 self.ensure_schema_metadata_size(wu.get_urn(), schema)
             elif profile := wu.get_aspect_of_type(DatasetProfileClass):
                 self.ensure_dataset_profile_size(wu.get_urn(), profile)
+            elif query_subjects := wu.get_aspect_of_type(QuerySubjectsClass):
+                self.ensure_query_subjects_size(wu.get_urn(), query_subjects)
+            elif upstream_lineage := wu.get_aspect_of_type(UpstreamLineageClass):
+                self.ensure_upstream_lineage_size(wu.get_urn(), upstream_lineage)
+            elif query_properties := wu.get_aspect_of_type(QueryPropertiesClass):
+                self.ensure_query_properties_size(wu.get_urn(), query_properties)
             yield wu
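
To make the new query-statement cap above concrete, here is a minimal, hypothetical sketch (not part of the package) of how the effective limit and the truncation amount are derived from the constants this diff introduces. The INGEST_MAX_PAYLOAD_BYTES value below is an illustrative assumption; the real constant is imported from datahub.emitter.rest_emitter.

import os

INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024  # illustrative only; real value comes from datahub.emitter.rest_emitter
DEFAULT_QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES = 5 * 1024 * 1024  # 5MB default, as in the diff
QUERY_STATEMENT_TRUNCATION_BUFFER = 100

# The statement cap can be overridden via an environment variable.
statement_cap = int(
    os.environ.get(
        "QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES",
        DEFAULT_QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES,
    )
)

# The effective limit is the smaller of the statement cap and the overall ingest payload limit.
effective_cap = min(statement_cap, INGEST_MAX_PAYLOAD_BYTES)

# For a 6 MiB serialized queryProperties aspect, roughly this much of the statement
# text is dropped (plus a small buffer reserved for the truncation notice).
current_size = 6 * 1024 * 1024
reduction_needed = current_size - effective_cap + QUERY_STATEMENT_TRUNCATION_BUFFER
print(reduction_needed)  # 1048676 bytes with the default 5 MiB cap
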
--- /dev/null
+++ b/datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py
@@ -0,0 +1,87 @@
+import logging
+from typing import TYPE_CHECKING, Iterable, List
+
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.metadata.schema_classes import InputFieldClass, InputFieldsClass
+
+if TYPE_CHECKING:
+    from datahub.ingestion.api.source import SourceReport
+
+logger = logging.getLogger(__name__)
+
+
+class ValidateInputFieldsProcessor:
+    def __init__(self, report: "SourceReport"):
+        self.report = report
+
+    def validate_input_fields(
+        self,
+        stream: Iterable[MetadataWorkUnit],
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        Validate input fields and filter out invalid ones.
+
+        Invalid input fields have empty or missing fieldPath values, which would cause
+        URN generation to fail when sent to the server. This processor filters them out
+        and reports them as warnings.
+        """
+        for wu in stream:
+            input_fields_aspect = wu.get_aspect_of_type(InputFieldsClass)
+            if input_fields_aspect and input_fields_aspect.fields:
+                valid_fields: List[InputFieldClass] = []
+                invalid_count = 0
+
+                for input_field in input_fields_aspect.fields:
+                    if (
+                        input_field.schemaField
+                        and input_field.schemaField.fieldPath
+                        and input_field.schemaField.fieldPath.strip()
+                    ):
+                        valid_fields.append(input_field)
+                    else:
+                        invalid_count += 1
+
+                if invalid_count > 0:
+                    logger.debug(
+                        f"Filtered {invalid_count} invalid input field(s) with empty fieldPath for {wu.get_urn()}"
+                    )
+                    self.report.num_input_fields_filtered += invalid_count
+                    self.report.warning(
+                        title="Invalid input fields filtered",
+                        message="Input fields with empty fieldPath values were filtered out to prevent ingestion errors",
+                        context=f"Filtered {invalid_count} invalid input field(s) for {wu.get_urn()}",
+                    )
+
+                # Update the aspect with only valid fields
+                if valid_fields:
+                    input_fields_aspect.fields = valid_fields
+                else:
+                    # If no valid fields remain, skip this workunit entirely
+                    logger.debug(
+                        f"All input fields were invalid for {wu.get_urn()}, skipping InputFieldsClass workunit"
+                    )
+                    # Don't yield this workunit
+                    continue
+
+            yield wu
+
+    def _remove_input_fields_aspect(self, wu: MetadataWorkUnit) -> MetadataWorkUnit:
+        """Remove InputFieldsClass aspect from a workunit."""
+        # For MCPs, we can simply not yield the aspect
+        # For MCEs, we need to remove it from the snapshot
+        if hasattr(wu.metadata, "aspect") and isinstance(
+            wu.metadata.aspect, InputFieldsClass
+        ):
+            # This is an MCP with InputFieldsClass, skip it
+            return wu
+
+        if hasattr(wu.metadata, "proposedSnapshot"):
+            snapshot = wu.metadata.proposedSnapshot
+            if hasattr(snapshot, "aspects"):
+                snapshot.aspects = [
+                    aspect
+                    for aspect in snapshot.aspects
+                    if not isinstance(aspect, InputFieldsClass)
+                ]
+
+        return wu
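
The keep/drop rule in validate_input_fields boils down to a single predicate on each field's fieldPath. Below is a small, hypothetical standalone sketch of that rule (not the package's code), useful for reasoning about which InputField entries survive the filter:

from typing import Optional


def is_valid_field_path(field_path: Optional[str]) -> bool:
    # Mirrors the check above: the schemaField must carry a fieldPath that is
    # non-empty after stripping whitespace; anything else is filtered out.
    return bool(field_path and field_path.strip())


assert is_valid_field_path("user.id")
assert not is_valid_field_path("")
assert not is_valid_field_path("   ")
assert not is_valid_field_path(None)

Note that a work unit whose InputFields aspect ends up with no valid fields at all is dropped entirely rather than emitted with an empty list.
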
--- a/datahub/ingestion/api/decorators.py
+++ b/datahub/ingestion/api/decorators.py
@@ -1,12 +1,16 @@
+# So that SourceCapabilityModifier can be resolved at runtime
+from __future__ import annotations
+
 from dataclasses import dataclass
 from enum import Enum, auto
-from typing import Callable, Dict, Optional, Type
+from typing import Callable, Dict, List, Optional, Type
 
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import (
     Source,
     SourceCapability as SourceCapability,
 )
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 
 
 def config_class(config_cls: Type) -> Callable[[Type], Type]:
@@ -88,10 +92,14 @@ class CapabilitySetting:
     capability: SourceCapability
     description: str
     supported: bool
+    subtype_modifier: Optional[List[SourceCapabilityModifier]] = None
 
 
 def capability(
-    capability_name: SourceCapability, description: str, supported: bool = True
+    capability_name: SourceCapability,
+    description: str,
+    supported: bool = True,
+    subtype_modifier: Optional[List[SourceCapabilityModifier]] = None,
 ) -> Callable[[Type], Type]:
     """
     A decorator to mark a source as having a certain capability
@@ -104,6 +112,7 @@ def capability(
             for base in cls.__bases__
         ):
            cls.__capabilities = {}
+
            cls.get_capabilities = lambda: cls.__capabilities.values()
 
            # If the superclasses have capability annotations, copy those over.
@@ -113,7 +122,10 @@ def capability(
                    cls.__capabilities.update(base_caps)
 
        cls.__capabilities[capability_name] = CapabilitySetting(
-            capability=capability_name, description=description, supported=supported
+            capability=capability_name,
+            description=description,
+            supported=supported,
+            subtype_modifier=subtype_modifier,
        )
        return cls
 
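
A hedged usage sketch of the extended @capability decorator: the new subtype_modifier argument lets a source scope a capability claim to particular subtypes. The specific SourceCapability and SourceCapabilityModifier members used here are illustrative assumptions, not values confirmed by this diff; consult the real enums for what exists in this release.

from datahub.ingestion.api.decorators import capability
from datahub.ingestion.api.source import Source, SourceCapability
from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier


# NOTE: CONTAINERS and SCHEMA below are assumed member names for illustration.
@capability(
    SourceCapability.CONTAINERS,
    "Enabled by default",
    subtype_modifier=[SourceCapabilityModifier.SCHEMA],
)
class MyExampleSource(Source):  # hypothetical source, not part of the package
    ...

Presumably this per-capability subtype metadata feeds the new autogenerated capability summary added in this release (datahub/ingestion/autogenerated/capability_summary.json), though that is an inference from the file list rather than something shown in these hunks.
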