acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -40,11 +40,11 @@ class TimeoutHTTPAdapter(HTTPAdapter):
40
40
  del kwargs["timeout"]
41
41
  super().__init__(*args, **kwargs)
42
42
 
43
- def send(self, request, **kwargs):
43
+ def send(self, request, *args, **kwargs):
44
44
  timeout = kwargs.get("timeout")
45
45
  if timeout is None and hasattr(self, "timeout"):
46
46
  kwargs["timeout"] = self.timeout
47
- return super().send(request, **kwargs)
47
+ return super().send(request, *args, **kwargs)
48
48
 
49
49
 
50
50
  class IcebergProfilingConfig(ConfigModel):
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import Any, Callable, Dict, Iterable, Union, cast
2
+ from typing import Any, Callable, Dict, Iterable, Optional, cast
3
3
 
4
4
  from pyiceberg.conversions import from_bytes
5
5
  from pyiceberg.schema import Schema
@@ -12,6 +12,7 @@ from pyiceberg.types import (
12
12
  IcebergType,
13
13
  IntegerType,
14
14
  LongType,
15
+ PrimitiveType,
15
16
  TimestampType,
16
17
  TimestamptzType,
17
18
  TimeType,
@@ -22,10 +23,9 @@ from pyiceberg.utils.datetime import (
22
23
  to_human_timestamp,
23
24
  to_human_timestamptz,
24
25
  )
26
+ from typing_extensions import TypeGuard
25
27
 
26
28
  from datahub.emitter.mce_builder import get_sys_time
27
- from datahub.emitter.mcp import MetadataChangeProposalWrapper
28
- from datahub.ingestion.api.workunit import MetadataWorkUnit
29
29
  from datahub.ingestion.source.iceberg.iceberg_common import (
30
30
  IcebergProfilingConfig,
31
31
  IcebergSourceReport,
@@ -33,6 +33,7 @@ from datahub.ingestion.source.iceberg.iceberg_common import (
33
33
  from datahub.metadata.schema_classes import (
34
34
  DatasetFieldProfileClass,
35
35
  DatasetProfileClass,
36
+ _Aspect,
36
37
  )
37
38
  from datahub.utilities.perf_timer import PerfTimer
38
39
 
@@ -66,7 +67,7 @@ class IcebergProfiler:
66
67
  aggregated_values: Dict[int, Any],
67
68
  manifest_values: Dict[int, bytes],
68
69
  ) -> None:
69
- for field_id, value_encoded in manifest_values.items(): # type: int, Any
70
+ for field_id, value_encoded in manifest_values.items():
70
71
  try:
71
72
  field = schema.find_field(field_id)
72
73
  except ValueError:
@@ -86,9 +87,8 @@ class IcebergProfiler:
86
87
  def profile_table(
87
88
  self,
88
89
  dataset_name: str,
89
- dataset_urn: str,
90
90
  table: Table,
91
- ) -> Iterable[MetadataWorkUnit]:
91
+ ) -> Iterable[_Aspect]:
92
92
  """This method will profile the supplied Iceberg table by looking at the table's manifest.
93
93
 
94
94
  The overall profile of the table is aggregated from the individual manifest files.
@@ -167,11 +167,11 @@ class IcebergProfiler:
167
167
  )
168
168
  total_count += data_file.record_count
169
169
  except Exception as e:
170
- # Catch any errors that arise from attempting to read the Iceberg table's manifests
171
- # This will prevent stateful ingestion from being blocked by an error (profiling is not critical)
172
- self.report.report_warning(
173
- "profiling",
174
- f"Error while profiling dataset {dataset_name}: {e}",
170
+ self.report.warning(
171
+ title="Error when profiling a table",
172
+ message="Skipping profiling of the table due to errors",
173
+ context=dataset_name,
174
+ exc=e,
175
175
  )
176
176
  if row_count:
177
177
  # Iterating through fieldPaths introduces unwanted stats for list element fields...
@@ -211,14 +211,11 @@ class IcebergProfiler:
211
211
  f"Finished profiling of dataset: {dataset_name} in {time_taken}"
212
212
  )
213
213
 
214
- yield MetadataChangeProposalWrapper(
215
- entityUrn=dataset_urn,
216
- aspect=dataset_profile,
217
- ).as_workunit()
214
+ yield dataset_profile
218
215
 
219
216
  def _render_value(
220
217
  self, dataset_name: str, value_type: IcebergType, value: Any
221
- ) -> Union[str, None]:
218
+ ) -> Optional[str]:
222
219
  try:
223
220
  if isinstance(value_type, TimestampType):
224
221
  return to_human_timestamp(value)
@@ -230,14 +227,22 @@ class IcebergProfiler:
230
227
  return to_human_time(value)
231
228
  return str(value)
232
229
  except Exception as e:
233
- self.report.report_warning(
234
- "profiling",
235
- f"Error in dataset {dataset_name} when profiling a {value_type} field with value {value}: {e}",
230
+ self.report.warning(
231
+ title="Couldn't render value when profiling a table",
232
+ message="Encountered error, when trying to redner a value for table profile.",
233
+ context=str(
234
+ {
235
+ "value": value,
236
+ "value_type": value_type,
237
+ "dataset_name": dataset_name,
238
+ }
239
+ ),
240
+ exc=e,
236
241
  )
237
242
  return None
238
243
 
239
244
  @staticmethod
240
- def _is_numeric_type(type: IcebergType) -> bool:
245
+ def _is_numeric_type(type: IcebergType) -> TypeGuard[PrimitiveType]:
241
246
  return isinstance(
242
247
  type,
243
248
  (
@@ -167,7 +167,7 @@ class AzureADSourceReport(StaleEntityRemovalSourceReport):
167
167
  @config_class(AzureADConfig)
168
168
  @support_status(SupportStatus.CERTIFIED)
169
169
  @capability(
170
- SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
170
+ SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
171
171
  )
172
172
  class AzureADSource(StatefulIngestionSourceBase):
173
173
  """
@@ -41,7 +41,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
41
41
  )
42
42
  from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
43
43
  from datahub.metadata.schema_classes import (
44
- ChangeTypeClass,
45
44
  CorpGroupInfoClass,
46
45
  CorpUserInfoClass,
47
46
  GroupMembershipClass,
@@ -202,7 +201,7 @@ class OktaSourceReport(StaleEntityRemovalSourceReport):
202
201
  @support_status(SupportStatus.CERTIFIED)
203
202
  @capability(SourceCapability.DESCRIPTIONS, "Optionally enabled via configuration")
204
203
  @capability(
205
- SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
204
+ SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
206
205
  )
207
206
  class OktaSource(StatefulIngestionSourceBase):
208
207
  """
@@ -332,18 +331,12 @@ class OktaSource(StatefulIngestionSourceBase):
332
331
  yield MetadataWorkUnit(id=wu_id, mce=mce)
333
332
 
334
333
  yield MetadataChangeProposalWrapper(
335
- entityType="corpGroup",
336
334
  entityUrn=datahub_corp_group_snapshot.urn,
337
- changeType=ChangeTypeClass.UPSERT,
338
- aspectName="origin",
339
335
  aspect=OriginClass(OriginTypeClass.EXTERNAL, "OKTA"),
340
336
  ).as_workunit()
341
337
 
342
338
  yield MetadataChangeProposalWrapper(
343
- entityType="corpGroup",
344
339
  entityUrn=datahub_corp_group_snapshot.urn,
345
- changeType=ChangeTypeClass.UPSERT,
346
- aspectName="status",
347
340
  aspect=StatusClass(removed=False),
348
341
  ).as_workunit()
349
342
 
@@ -418,18 +411,12 @@ class OktaSource(StatefulIngestionSourceBase):
418
411
  yield MetadataWorkUnit(id=wu_id, mce=mce)
419
412
 
420
413
  yield MetadataChangeProposalWrapper(
421
- entityType="corpuser",
422
414
  entityUrn=datahub_corp_user_snapshot.urn,
423
- changeType=ChangeTypeClass.UPSERT,
424
- aspectName="origin",
425
415
  aspect=OriginClass(OriginTypeClass.EXTERNAL, "OKTA"),
426
416
  ).as_workunit()
427
417
 
428
418
  yield MetadataChangeProposalWrapper(
429
- entityType="corpuser",
430
419
  entityUrn=datahub_corp_user_snapshot.urn,
431
- changeType=ChangeTypeClass.UPSERT,
432
- aspectName="status",
433
420
  aspect=StatusClass(removed=False),
434
421
  ).as_workunit()
435
422
 
@@ -7,7 +7,6 @@ from typing import Any, Dict, Iterable, List, Optional, Type, cast
7
7
  import avro.schema
8
8
  import confluent_kafka
9
9
  import confluent_kafka.admin
10
- import pydantic
11
10
  from confluent_kafka.admin import (
12
11
  AdminClient,
13
12
  ConfigEntry,
@@ -16,13 +15,8 @@ from confluent_kafka.admin import (
16
15
  )
17
16
  from confluent_kafka.schema_registry.schema_registry_client import SchemaRegistryClient
18
17
 
19
- from datahub.configuration.common import AllowDenyPattern
20
18
  from datahub.configuration.kafka import KafkaConsumerConnectionConfig
21
19
  from datahub.configuration.kafka_consumer_config import CallableConsumerConfig
22
- from datahub.configuration.source_common import (
23
- DatasetSourceConfigMixin,
24
- LowerCaseDatasetUrnConfigMixin,
25
- )
26
20
  from datahub.emitter import mce_builder
27
21
  from datahub.emitter.mce_builder import (
28
22
  make_data_platform_urn,
@@ -50,16 +44,15 @@ from datahub.ingestion.api.source import (
50
44
  )
51
45
  from datahub.ingestion.api.workunit import MetadataWorkUnit
52
46
  from datahub.ingestion.source.common.subtypes import DatasetSubTypes
47
+ from datahub.ingestion.source.kafka.kafka_config import KafkaSourceConfig
53
48
  from datahub.ingestion.source.kafka.kafka_schema_registry_base import (
54
49
  KafkaSchemaRegistryBase,
55
50
  )
56
51
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
57
52
  StaleEntityRemovalHandler,
58
53
  StaleEntityRemovalSourceReport,
59
- StatefulStaleMetadataRemovalConfig,
60
54
  )
61
55
  from datahub.ingestion.source.state.stateful_ingestion_base import (
62
- StatefulIngestionConfigBase,
63
56
  StatefulIngestionSourceBase,
64
57
  )
65
58
  from datahub.metadata.com.linkedin.pegasus2avro.common import Status
@@ -90,64 +83,6 @@ class KafkaTopicConfigKeys(StrEnum):
90
83
  UNCLEAN_LEADER_ELECTION_CONFIG = "unclean.leader.election.enable"
91
84
 
92
85
 
93
- class KafkaSourceConfig(
94
- StatefulIngestionConfigBase,
95
- DatasetSourceConfigMixin,
96
- LowerCaseDatasetUrnConfigMixin,
97
- ):
98
- connection: KafkaConsumerConnectionConfig = KafkaConsumerConnectionConfig()
99
-
100
- topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["^_.*"])
101
- domain: Dict[str, AllowDenyPattern] = pydantic.Field(
102
- default={},
103
- description="A map of domain names to allow deny patterns. Domains can be urn-based (`urn:li:domain:13ae4d85-d955-49fc-8474-9004c663a810`) or bare (`13ae4d85-d955-49fc-8474-9004c663a810`).",
104
- )
105
- topic_subject_map: Dict[str, str] = pydantic.Field(
106
- default={},
107
- description="Provides the mapping for the `key` and the `value` schemas of a topic to the corresponding schema registry subject name. Each entry of this map has the form `<topic_name>-key`:`<schema_registry_subject_name_for_key_schema>` and `<topic_name>-value`:`<schema_registry_subject_name_for_value_schema>` for the key and the value schemas associated with the topic, respectively. This parameter is mandatory when the [RecordNameStrategy](https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#how-the-naming-strategies-work) is used as the subject naming strategy in the kafka schema registry. NOTE: When provided, this overrides the default subject name resolution even when the `TopicNameStrategy` or the `TopicRecordNameStrategy` are used.",
108
- )
109
- stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
110
- schema_registry_class: str = pydantic.Field(
111
- default="datahub.ingestion.source.confluent_schema_registry.ConfluentSchemaRegistry",
112
- description="The fully qualified implementation class(custom) that implements the KafkaSchemaRegistryBase interface.",
113
- )
114
- schema_tags_field: str = pydantic.Field(
115
- default="tags",
116
- description="The field name in the schema metadata that contains the tags to be added to the dataset.",
117
- )
118
- enable_meta_mapping: bool = pydantic.Field(
119
- default=True,
120
- description="When enabled, applies the mappings that are defined through the meta_mapping directives.",
121
- )
122
- meta_mapping: Dict = pydantic.Field(
123
- default={},
124
- description="mapping rules that will be executed against top-level schema properties. Refer to the section below on meta automated mappings.",
125
- )
126
- field_meta_mapping: Dict = pydantic.Field(
127
- default={},
128
- description="mapping rules that will be executed against field-level schema properties. Refer to the section below on meta automated mappings.",
129
- )
130
- strip_user_ids_from_email: bool = pydantic.Field(
131
- default=False,
132
- description="Whether or not to strip email id while adding owners using meta mappings.",
133
- )
134
- tag_prefix: str = pydantic.Field(
135
- default="", description="Prefix added to tags during ingestion."
136
- )
137
- ignore_warnings_on_schema_type: bool = pydantic.Field(
138
- default=False,
139
- description="Disables warnings reported for non-AVRO/Protobuf value or key schemas if set.",
140
- )
141
- disable_topic_record_naming_strategy: bool = pydantic.Field(
142
- default=False,
143
- description="Disables the utilization of the TopicRecordNameStrategy for Schema Registry subjects. For more information, visit: https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#handling-differences-between-preregistered-and-client-derived-schemas:~:text=io.confluent.kafka.serializers.subject.TopicRecordNameStrategy",
144
- )
145
- ingest_schemas_as_entities: bool = pydantic.Field(
146
- default=False,
147
- description="Enables ingesting schemas from schema registry as separate entities, in addition to the topics",
148
- )
149
-
150
-
151
86
  def get_kafka_consumer(
152
87
  connection: KafkaConsumerConnectionConfig,
153
88
  ) -> confluent_kafka.Consumer:
@@ -254,6 +189,22 @@ class KafkaConnectionTest:
254
189
  SourceCapability.SCHEMA_METADATA,
255
190
  "Schemas associated with each topic are extracted from the schema registry. Avro and Protobuf (certified), JSON (incubating). Schema references are supported.",
256
191
  )
192
+ @capability(
193
+ SourceCapability.DATA_PROFILING,
194
+ "Not supported",
195
+ supported=False,
196
+ )
197
+ @capability(
198
+ SourceCapability.LINEAGE_COARSE,
199
+ "Not supported. If you use Kafka Connect, the kafka-connect source can generate lineage.",
200
+ supported=False,
201
+ )
202
+ @capability(
203
+ SourceCapability.LINEAGE_FINE,
204
+ "Not supported",
205
+ supported=False,
206
+ )
207
+ @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
257
208
  class KafkaSource(StatefulIngestionSourceBase, TestableSource):
258
209
  """
259
210
  This plugin extracts the following:
@@ -430,6 +381,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
430
381
 
431
382
  # 4. Set dataset's description, tags, ownership, etc, if topic schema type is avro
432
383
  description: Optional[str] = None
384
+ external_url: Optional[str] = None
433
385
  if (
434
386
  schema_metadata is not None
435
387
  and isinstance(schema_metadata.platformSchema, KafkaSchemaClass)
@@ -481,8 +433,16 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
481
433
  mce_builder.make_global_tag_aspect_with_tag_list(all_tags)
482
434
  )
483
435
 
436
+ if self.source_config.external_url_base:
437
+ # Remove trailing slash from base URL if present
438
+ base_url = self.source_config.external_url_base.rstrip("/")
439
+ external_url = f"{base_url}/{dataset_name}"
440
+
484
441
  dataset_properties = DatasetPropertiesClass(
485
- name=dataset_name, customProperties=custom_props, description=description
442
+ name=dataset_name,
443
+ customProperties=custom_props,
444
+ description=description,
445
+ externalUrl=external_url,
486
446
  )
487
447
  dataset_snapshot.aspects.append(dataset_properties)
488
448
 
@@ -568,10 +528,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
568
528
 
569
529
  for config_key in KafkaTopicConfigKeys:
570
530
  try:
571
- if (
572
- config_key in topic_config.keys()
573
- and topic_config[config_key] is not None
574
- ):
531
+ if config_key in topic_config and topic_config[config_key] is not None:
575
532
  config_value = topic_config[config_key].value
576
533
  custom_props[config_key] = (
577
534
  config_value
@@ -0,0 +1,78 @@
1
+ from typing import Dict, Optional
2
+
3
+ from pydantic import Field
4
+
5
+ from datahub.configuration.common import AllowDenyPattern
6
+ from datahub.configuration.kafka import KafkaConsumerConnectionConfig
7
+ from datahub.configuration.source_common import (
8
+ DatasetSourceConfigMixin,
9
+ LowerCaseDatasetUrnConfigMixin,
10
+ )
11
+ from datahub.ingestion.source.state.stale_entity_removal_handler import (
12
+ StatefulStaleMetadataRemovalConfig,
13
+ )
14
+ from datahub.ingestion.source.state.stateful_ingestion_base import (
15
+ StatefulIngestionConfigBase,
16
+ )
17
+
18
+
19
+ class KafkaSourceConfig(
20
+ StatefulIngestionConfigBase,
21
+ DatasetSourceConfigMixin,
22
+ LowerCaseDatasetUrnConfigMixin,
23
+ ):
24
+ connection: KafkaConsumerConnectionConfig = KafkaConsumerConnectionConfig()
25
+
26
+ topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["^_.*"])
27
+ domain: Dict[str, AllowDenyPattern] = Field(
28
+ default={},
29
+ description="A map of domain names to allow deny patterns. Domains can be urn-based (`urn:li:domain:13ae4d85-d955-49fc-8474-9004c663a810`) or bare (`13ae4d85-d955-49fc-8474-9004c663a810`).",
30
+ )
31
+ topic_subject_map: Dict[str, str] = Field(
32
+ default={},
33
+ description="Provides the mapping for the `key` and the `value` schemas of a topic to the corresponding schema registry subject name. Each entry of this map has the form `<topic_name>-key`:`<schema_registry_subject_name_for_key_schema>` and `<topic_name>-value`:`<schema_registry_subject_name_for_value_schema>` for the key and the value schemas associated with the topic, respectively. This parameter is mandatory when the [RecordNameStrategy](https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#how-the-naming-strategies-work) is used as the subject naming strategy in the kafka schema registry. NOTE: When provided, this overrides the default subject name resolution even when the `TopicNameStrategy` or the `TopicRecordNameStrategy` are used.",
34
+ )
35
+ stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
36
+ schema_registry_class: str = Field(
37
+ default="datahub.ingestion.source.confluent_schema_registry.ConfluentSchemaRegistry",
38
+ description="The fully qualified implementation class(custom) that implements the KafkaSchemaRegistryBase interface.",
39
+ )
40
+ schema_tags_field: str = Field(
41
+ default="tags",
42
+ description="The field name in the schema metadata that contains the tags to be added to the dataset.",
43
+ )
44
+ enable_meta_mapping: bool = Field(
45
+ default=True,
46
+ description="When enabled, applies the mappings that are defined through the meta_mapping directives.",
47
+ )
48
+ meta_mapping: Dict = Field(
49
+ default={},
50
+ description="mapping rules that will be executed against top-level schema properties. Refer to the section below on meta automated mappings.",
51
+ )
52
+ field_meta_mapping: Dict = Field(
53
+ default={},
54
+ description="mapping rules that will be executed against field-level schema properties. Refer to the section below on meta automated mappings.",
55
+ )
56
+ strip_user_ids_from_email: bool = Field(
57
+ default=False,
58
+ description="Whether or not to strip email id while adding owners using meta mappings.",
59
+ )
60
+ tag_prefix: str = Field(
61
+ default="", description="Prefix added to tags during ingestion."
62
+ )
63
+ ignore_warnings_on_schema_type: bool = Field(
64
+ default=False,
65
+ description="Disables warnings reported for non-AVRO/Protobuf value or key schemas if set.",
66
+ )
67
+ disable_topic_record_naming_strategy: bool = Field(
68
+ default=False,
69
+ description="Disables the utilization of the TopicRecordNameStrategy for Schema Registry subjects. For more information, visit: https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#handling-differences-between-preregistered-and-client-derived-schemas:~:text=io.confluent.kafka.serializers.subject.TopicRecordNameStrategy",
70
+ )
71
+ ingest_schemas_as_entities: bool = Field(
72
+ default=False,
73
+ description="Enables ingesting schemas from schema registry as separate entities, in addition to the topics",
74
+ )
75
+ external_url_base: Optional[str] = Field(
76
+ default=None,
77
+ description="Base URL for external platform (e.g. Aiven) where topics can be viewed. The topic name will be appended to this base URL.",
78
+ )
@@ -4,7 +4,7 @@ from typing import Dict, Iterable, List, Optional
4
4
 
5
5
  from pydantic.fields import Field
6
6
 
7
- from datahub.configuration.common import AllowDenyPattern, ConfigModel
7
+ from datahub.configuration.common import AllowDenyPattern, ConfigModel, LaxStr
8
8
  from datahub.configuration.source_common import (
9
9
  DatasetLineageProviderConfigBase,
10
10
  PlatformInstanceConfigMixin,
@@ -29,7 +29,7 @@ CONNECTOR_CLASS = "connector.class"
29
29
  class ProvidedConfig(ConfigModel):
30
30
  provider: str
31
31
  path_key: str
32
- value: str
32
+ value: LaxStr
33
33
 
34
34
 
35
35
  class GenericConnectorConfig(ConfigModel):