acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub has been flagged as potentially problematic; review the advisory details below before upgrading.

Files changed (414) hide show
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,4 @@
1
+ import logging
1
2
  import re
2
3
  from dataclasses import dataclass
3
4
  from typing import Dict, Iterable, List, Optional, Tuple
@@ -9,6 +10,81 @@ from datahub.ingestion.source.kafka_connect.common import (
9
10
  KafkaConnectLineage,
10
11
  )
11
12
 
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class RegexRouterTransform:
17
+ """Helper class to handle RegexRouter transformations for topic/table names."""
18
+
19
+ def __init__(self, config: Dict[str, str]) -> None:
20
+ self.transforms = self._parse_transforms(config)
21
+
22
+ def _parse_transforms(self, config: Dict[str, str]) -> List[Dict[str, str]]:
23
+ """Parse transforms configuration from connector config."""
24
+ transforms_list: List[Dict[str, str]] = []
25
+
26
+ # Get the transforms parameter
27
+ transforms_param: str = config.get("transforms", "")
28
+ if not transforms_param:
29
+ return transforms_list
30
+
31
+ # Parse individual transforms
32
+ transform_names: List[str] = [
33
+ name.strip() for name in transforms_param.split(",")
34
+ ]
35
+
36
+ for transform_name in transform_names:
37
+ if not transform_name:
38
+ continue
39
+ transform_config: Dict[str, str] = {}
40
+ transform_prefix: str = f"transforms.{transform_name}."
41
+
42
+ # Extract transform configuration
43
+ for key, value in config.items():
44
+ if key.startswith(transform_prefix):
45
+ config_key: str = key[len(transform_prefix) :]
46
+ transform_config[config_key] = value
47
+
48
+ # Only process RegexRouter transforms
49
+ if (
50
+ transform_config.get("type")
51
+ == "org.apache.kafka.connect.transforms.RegexRouter"
52
+ ):
53
+ transform_config["name"] = transform_name
54
+ transforms_list.append(transform_config)
55
+
56
+ return transforms_list
57
+
58
+ def apply_transforms(self, topic_name: str) -> str:
59
+ """Apply RegexRouter transforms to the topic name using Java regex."""
60
+ result: str = topic_name
61
+
62
+ for transform in self.transforms:
63
+ regex_pattern: Optional[str] = transform.get("regex")
64
+ replacement: str = transform.get("replacement", "")
65
+
66
+ if regex_pattern:
67
+ try:
68
+ # Use Java Pattern and Matcher for exact Kafka Connect compatibility
69
+ from java.util.regex import Pattern
70
+
71
+ pattern = Pattern.compile(regex_pattern)
72
+ matcher = pattern.matcher(result)
73
+
74
+ if matcher.find():
75
+ # Reset matcher to beginning for replaceFirst
76
+ matcher.reset()
77
+ result = matcher.replaceFirst(replacement)
78
+ logger.debug(
79
+ f"Applied transform {transform['name']}: {topic_name} -> {result}"
80
+ )
81
+ except Exception as e:
82
+ logger.warning(
83
+ f"Invalid regex pattern in transform {transform['name']}: {e}"
84
+ )
85
+
86
+ return str(result)
87
+
12
88
 
13
89
  @dataclass
14
90
  class ConfluentS3SinkConnector(BaseConnector):
@@ -18,28 +94,35 @@ class ConfluentS3SinkConnector(BaseConnector):
18
94
  bucket: str
19
95
  topics_dir: str
20
96
  topics: Iterable[str]
97
+ regex_router: RegexRouterTransform
21
98
 
22
99
  def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser:
23
100
  # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3
24
- bucket = connector_manifest.config.get("s3.bucket.name")
101
+ bucket: Optional[str] = connector_manifest.config.get("s3.bucket.name")
25
102
  if not bucket:
26
103
  raise ValueError(
27
104
  "Could not find 's3.bucket.name' in connector configuration"
28
105
  )
29
106
 
30
107
  # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage
31
- topics_dir = connector_manifest.config.get("topics.dir", "topics")
108
+ topics_dir: str = connector_manifest.config.get("topics.dir", "topics")
109
+
110
+ # Create RegexRouterTransform instance
111
+ regex_router: RegexRouterTransform = RegexRouterTransform(
112
+ connector_manifest.config
113
+ )
32
114
 
33
115
  return self.S3SinkParser(
34
116
  target_platform="s3",
35
117
  bucket=bucket,
36
118
  topics_dir=topics_dir,
37
119
  topics=connector_manifest.topic_names,
120
+ regex_router=regex_router,
38
121
  )
39
122
 
40
123
  def extract_flow_property_bag(self) -> Dict[str, str]:
41
124
  # Mask/Remove properties that may reveal credentials
42
- flow_property_bag = {
125
+ flow_property_bag: Dict[str, str] = {
43
126
  k: v
44
127
  for k, v in self.connector_manifest.config.items()
45
128
  if k
@@ -54,11 +137,17 @@ class ConfluentS3SinkConnector(BaseConnector):
54
137
 
55
138
  def extract_lineages(self) -> List[KafkaConnectLineage]:
56
139
  try:
57
- parser = self._get_parser(self.connector_manifest)
140
+ parser: ConfluentS3SinkConnector.S3SinkParser = self._get_parser(
141
+ self.connector_manifest
142
+ )
58
143
 
59
144
  lineages: List[KafkaConnectLineage] = list()
60
145
  for topic in parser.topics:
61
- target_dataset = f"{parser.bucket}/{parser.topics_dir}/{topic}"
146
+ # Apply RegexRouter transformations using the RegexRouterTransform class
147
+ transformed_topic: str = parser.regex_router.apply_transforms(topic)
148
+ target_dataset: str = (
149
+ f"{parser.bucket}/{parser.topics_dir}/{transformed_topic}"
150
+ )
62
151
 
63
152
  lineages.append(
64
153
  KafkaConnectLineage(
@@ -86,6 +175,7 @@ class SnowflakeSinkConnector(BaseConnector):
86
175
  database_name: str
87
176
  schema_name: str
88
177
  topics_to_tables: Dict[str, str]
178
+ regex_router: RegexRouterTransform
89
179
 
90
180
  def get_table_name_from_topic_name(self, topic_name: str) -> str:
91
181
  """
@@ -93,7 +183,7 @@ class SnowflakeSinkConnector(BaseConnector):
93
183
  Refer below link for more info
94
184
  https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics
95
185
  """
96
- table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
186
+ table_name: str = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
97
187
  if re.match("^[^a-zA-Z_].*", table_name):
98
188
  table_name = "_" + table_name
99
189
  # Connector may append original topic's hash code as suffix for conflict resolution
@@ -106,8 +196,13 @@ class SnowflakeSinkConnector(BaseConnector):
106
196
  self,
107
197
  connector_manifest: ConnectorManifest,
108
198
  ) -> SnowflakeParser:
109
- database_name = connector_manifest.config["snowflake.database.name"]
110
- schema_name = connector_manifest.config["snowflake.schema.name"]
199
+ database_name: str = connector_manifest.config["snowflake.database.name"]
200
+ schema_name: str = connector_manifest.config["snowflake.schema.name"]
201
+
202
+ # Create RegexRouterTransform instance
203
+ regex_router: RegexRouterTransform = RegexRouterTransform(
204
+ connector_manifest.config
205
+ )
111
206
 
112
207
  # Fetch user provided topic to table map
113
208
  provided_topics_to_tables: Dict[str, str] = {}
@@ -121,24 +216,30 @@ class SnowflakeSinkConnector(BaseConnector):
121
216
  topics_to_tables: Dict[str, str] = {}
122
217
  # Extract lineage for only those topics whose data ingestion started
123
218
  for topic in connector_manifest.topic_names:
219
+ # Apply transforms first to get the transformed topic name
220
+ transformed_topic: str = regex_router.apply_transforms(topic)
221
+
124
222
  if topic in provided_topics_to_tables:
125
223
  # If user provided which table to get mapped with this topic
126
224
  topics_to_tables[topic] = provided_topics_to_tables[topic]
127
225
  else:
128
- # Else connector converts topic name to a valid Snowflake table name.
129
- topics_to_tables[topic] = self.get_table_name_from_topic_name(topic)
226
+ # Use the transformed topic name to generate table name
227
+ topics_to_tables[topic] = self.get_table_name_from_topic_name(
228
+ transformed_topic
229
+ )
130
230
 
131
231
  return self.SnowflakeParser(
132
232
  database_name=database_name,
133
233
  schema_name=schema_name,
134
234
  topics_to_tables=topics_to_tables,
235
+ regex_router=regex_router,
135
236
  )
136
237
 
137
238
  def extract_flow_property_bag(self) -> Dict[str, str]:
138
239
  # For all snowflake sink connector properties, refer below link
139
240
  # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector
140
241
  # remove private keys, secrets from properties
141
- flow_property_bag = {
242
+ flow_property_bag: Dict[str, str] = {
142
243
  k: v
143
244
  for k, v in self.connector_manifest.config.items()
144
245
  if k
@@ -153,10 +254,12 @@ class SnowflakeSinkConnector(BaseConnector):
153
254
 
154
255
  def extract_lineages(self) -> List[KafkaConnectLineage]:
155
256
  lineages: List[KafkaConnectLineage] = list()
156
- parser = self.get_parser(self.connector_manifest)
257
+ parser: SnowflakeSinkConnector.SnowflakeParser = self.get_parser(
258
+ self.connector_manifest
259
+ )
157
260
 
158
261
  for topic, table in parser.topics_to_tables.items():
159
- target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}"
262
+ target_dataset: str = f"{parser.database_name}.{parser.schema_name}.{table}"
160
263
  lineages.append(
161
264
  KafkaConnectLineage(
162
265
  source_dataset=topic,
@@ -176,7 +279,8 @@ class BigQuerySinkConnector(BaseConnector):
176
279
  project: str
177
280
  target_platform: str
178
281
  sanitizeTopics: bool
179
- transforms: list
282
+ transforms: List[Dict[str, str]]
283
+ regex_router: RegexRouterTransform
180
284
  topicsToTables: Optional[str] = None
181
285
  datasets: Optional[str] = None
182
286
  defaultDataset: Optional[str] = None
@@ -186,16 +290,18 @@ class BigQuerySinkConnector(BaseConnector):
186
290
  self,
187
291
  connector_manifest: ConnectorManifest,
188
292
  ) -> BQParser:
189
- project = connector_manifest.config["project"]
190
- sanitizeTopics = connector_manifest.config.get("sanitizeTopics") or "false"
191
- transform_names = (
293
+ project: str = connector_manifest.config["project"]
294
+ sanitizeTopics: str = connector_manifest.config.get("sanitizeTopics") or "false"
295
+
296
+ # Parse ALL transforms (original BigQuery logic)
297
+ transform_names: List[str] = (
192
298
  self.connector_manifest.config.get("transforms", "").split(",")
193
299
  if self.connector_manifest.config.get("transforms")
194
300
  else []
195
301
  )
196
- transforms = []
302
+ transforms: List[Dict[str, str]] = []
197
303
  for name in transform_names:
198
- transform = {"name": name}
304
+ transform: Dict[str, str] = {"name": name}
199
305
  transforms.append(transform)
200
306
  for key in self.connector_manifest.config:
201
307
  if key.startswith(f"transforms.{name}."):
@@ -203,8 +309,13 @@ class BigQuerySinkConnector(BaseConnector):
203
309
  self.connector_manifest.config[key]
204
310
  )
205
311
 
312
+ # Create RegexRouterTransform instance for RegexRouter-specific handling
313
+ regex_router: RegexRouterTransform = RegexRouterTransform(
314
+ connector_manifest.config
315
+ )
316
+
206
317
  if "defaultDataset" in connector_manifest.config:
207
- defaultDataset = connector_manifest.config["defaultDataset"]
318
+ defaultDataset: str = connector_manifest.config["defaultDataset"]
208
319
  return self.BQParser(
209
320
  project=project,
210
321
  defaultDataset=defaultDataset,
@@ -212,11 +323,14 @@ class BigQuerySinkConnector(BaseConnector):
212
323
  sanitizeTopics=sanitizeTopics.lower() == "true",
213
324
  version="v2",
214
325
  transforms=transforms,
326
+ regex_router=regex_router,
215
327
  )
216
328
  else:
217
329
  # version 1.6.x and similar configs supported
218
- datasets = connector_manifest.config["datasets"]
219
- topicsToTables = connector_manifest.config.get("topicsToTables")
330
+ datasets: str = connector_manifest.config["datasets"]
331
+ topicsToTables: Optional[str] = connector_manifest.config.get(
332
+ "topicsToTables"
333
+ )
220
334
 
221
335
  return self.BQParser(
222
336
  project=project,
@@ -225,10 +339,11 @@ class BigQuerySinkConnector(BaseConnector):
225
339
  target_platform="bigquery",
226
340
  sanitizeTopics=sanitizeTopics.lower() == "true",
227
341
  transforms=transforms,
342
+ regex_router=regex_router,
228
343
  )
229
344
 
230
345
  def get_list(self, property: str) -> Iterable[Tuple[str, str]]:
231
- entries = property.split(",")
346
+ entries: List[str] = property.split(",")
232
347
  for entry in entries:
233
348
  key, val = entry.rsplit("=")
234
349
  yield (key.strip(), val.strip())
@@ -243,7 +358,7 @@ class BigQuerySinkConnector(BaseConnector):
243
358
  return dataset
244
359
  return None
245
360
 
246
- def sanitize_table_name(self, table_name):
361
+ def sanitize_table_name(self, table_name: str) -> str:
247
362
  table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name)
248
363
  if re.match("^[^a-zA-Z_].*", table_name):
249
364
  table_name = "_" + table_name
@@ -254,8 +369,8 @@ class BigQuerySinkConnector(BaseConnector):
254
369
  self, topic: str, parser: BQParser
255
370
  ) -> Optional[str]:
256
371
  if parser.version == "v2":
257
- dataset = parser.defaultDataset
258
- parts = topic.split(":")
372
+ dataset: Optional[str] = parser.defaultDataset
373
+ parts: List[str] = topic.split(":")
259
374
  if len(parts) == 2:
260
375
  dataset = parts[0]
261
376
  table = parts[1]
@@ -283,21 +398,9 @@ class BigQuerySinkConnector(BaseConnector):
283
398
  table = self.sanitize_table_name(table)
284
399
  return f"{dataset}.{table}"
285
400
 
286
- def apply_transformations(
287
- self, topic: str, transforms: List[Dict[str, str]]
288
- ) -> str:
289
- for transform in transforms:
290
- if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter":
291
- regex = transform["regex"]
292
- replacement = transform["replacement"]
293
- pattern = re.compile(regex)
294
- if pattern.match(topic):
295
- topic = pattern.sub(replacement, topic, count=1)
296
- return topic
297
-
298
401
  def extract_flow_property_bag(self) -> Dict[str, str]:
299
402
  # Mask/Remove properties that may reveal credentials
300
- flow_property_bag = {
403
+ flow_property_bag: Dict[str, str] = {
301
404
  k: v
302
405
  for k, v in self.connector_manifest.config.items()
303
406
  if k not in ["keyfile"]
@@ -307,27 +410,33 @@ class BigQuerySinkConnector(BaseConnector):
307
410
 
308
411
  def extract_lineages(self) -> List[KafkaConnectLineage]:
309
412
  lineages: List[KafkaConnectLineage] = list()
310
- parser = self.get_parser(self.connector_manifest)
413
+ parser: BigQuerySinkConnector.BQParser = self.get_parser(
414
+ self.connector_manifest
415
+ )
311
416
  if not parser:
312
417
  return lineages
313
- target_platform = parser.target_platform
314
- project = parser.project
315
- transforms = parser.transforms
418
+ target_platform: str = parser.target_platform
419
+ project: str = parser.project
316
420
 
317
421
  for topic in self.connector_manifest.topic_names:
318
- transformed_topic = self.apply_transformations(topic, transforms)
319
- dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser)
422
+ # Apply RegexRouter transformations using the RegexRouterTransform class
423
+ transformed_topic: str = parser.regex_router.apply_transforms(topic)
424
+
425
+ # Use the transformed topic to determine dataset/table
426
+ dataset_table: Optional[str] = self.get_dataset_table_for_topic(
427
+ transformed_topic, parser
428
+ )
320
429
  if dataset_table is None:
321
430
  self.report.warning(
322
431
  "Could not find target dataset for topic, please check your connector configuration"
323
432
  f"{self.connector_manifest.name} : {transformed_topic} ",
324
433
  )
325
434
  continue
326
- target_dataset = f"{project}.{dataset_table}"
435
+ target_dataset: str = f"{project}.{dataset_table}"
327
436
 
328
437
  lineages.append(
329
438
  KafkaConnectLineage(
330
- source_dataset=transformed_topic,
439
+ source_dataset=topic, # Keep original topic as source
331
440
  source_platform=KAFKA,
332
441
  target_dataset=target_dataset,
333
442
  target_platform=target_platform,
@@ -20,6 +20,8 @@ from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
20
20
  get_platform_from_sqlalchemy_uri,
21
21
  )
22
22
 
23
+ logger = logging.getLogger(__name__)
24
+
23
25
 
24
26
  @dataclass
25
27
  class ConfluentJDBCSourceConnector(BaseConnector):
@@ -392,7 +394,7 @@ class MongoSourceConnector(BaseConnector):
392
394
  db_connection_url=connector_manifest.config.get("connection.uri"),
393
395
  source_platform="mongodb",
394
396
  database_name=connector_manifest.config.get("database"),
395
- topic_prefix=connector_manifest.config.get("topic_prefix"),
397
+ topic_prefix=connector_manifest.config.get("topic.prefix"),
396
398
  transforms=(
397
399
  connector_manifest.config["transforms"].split(",")
398
400
  if "transforms" in connector_manifest.config
@@ -406,7 +408,14 @@ class MongoSourceConnector(BaseConnector):
406
408
  lineages: List[KafkaConnectLineage] = list()
407
409
  parser = self.get_parser(self.connector_manifest)
408
410
  source_platform = parser.source_platform
409
- topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)"
411
+ topic_prefix = parser.topic_prefix or ""
412
+
413
+ # Escape topic_prefix to handle cases where it contains dots
414
+ # Some users configure topic.prefix like "my.mongodb" which breaks the regex
415
+
416
+ # \w is equivalent to [a-zA-Z0-9_]
417
+ # So [\w-]+ matches alphanumeric characters, underscores, and hyphens
418
+ topic_naming_pattern = rf"{re.escape(topic_prefix)}\.([\w-]+)\.([\w-]+)"
410
419
 
411
420
  if not self.connector_manifest.topic_names:
412
421
  return lineages
@@ -429,6 +438,26 @@ class MongoSourceConnector(BaseConnector):
429
438
 
430
439
  @dataclass
431
440
  class DebeziumSourceConnector(BaseConnector):
441
+ # Debezium topic naming patterns by connector type
442
+ # - MySQL: {topic.prefix}.{database}.{table}
443
+ # - PostgreSQL: {topic.prefix}.{schema}.{table}
444
+ # - SQL Server: {topic.prefix}.{database}.{schema}.{table}
445
+ # - Oracle: {topic.prefix}.{schema}.{table}
446
+ # - DB2: {topic.prefix}.{schema}.{table}
447
+ # - MongoDB: {topic.prefix}.{database}.{collection}
448
+ # - Vitess: {topic.prefix}.{keyspace}.{table}
449
+
450
+ # Note SQL Server allows for "database.names" (multiple databases) config,
451
+ # and so database is in the topic naming pattern.
452
+ # However, others have "database.dbname" which is a single database name. For these connectors,
453
+ # additional databases would require a different connector instance
454
+
455
+ # Connectors with 2-level container in pattern (database + schema)
456
+ # Others have either database XOR schema, but not both
457
+ DEBEZIUM_CONNECTORS_WITH_2_LEVEL_CONTAINER_IN_PATTERN = {
458
+ "io.debezium.connector.sqlserver.SqlServerConnector",
459
+ }
460
+
432
461
  @dataclass
433
462
  class DebeziumParser:
434
463
  source_platform: str
@@ -514,16 +543,45 @@ class DebeziumSourceConnector(BaseConnector):
514
543
  source_platform = parser.source_platform
515
544
  server_name = parser.server_name
516
545
  database_name = parser.database_name
517
- topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)"
546
+ # Escape server_name to handle cases where topic.prefix contains dots
547
+ # Some users configure topic.prefix like "my.server" which breaks the regex
548
+ server_name = server_name or ""
549
+ # Regex pattern (\w+\.\w+(?:\.\w+)?) supports BOTH 2-part and 3-part table names
550
+ topic_naming_pattern = rf"({re.escape(server_name)})\.(\w+\.\w+(?:\.\w+)?)"
518
551
 
519
552
  if not self.connector_manifest.topic_names:
520
553
  return lineages
521
554
 
555
+ # Handle connectors with 2-level container (database + schema) in topic pattern
556
+ connector_class = self.connector_manifest.config.get(CONNECTOR_CLASS, "")
557
+ maybe_duplicated_database_name = (
558
+ connector_class
559
+ in self.DEBEZIUM_CONNECTORS_WITH_2_LEVEL_CONTAINER_IN_PATTERN
560
+ )
561
+
522
562
  for topic in self.connector_manifest.topic_names:
523
563
  found = re.search(re.compile(topic_naming_pattern), topic)
564
+ logger.debug(
565
+ f"Processing topic: '{topic}' with regex pattern '{topic_naming_pattern}', found: {found}"
566
+ )
524
567
 
525
568
  if found:
526
- table_name = get_dataset_name(database_name, found.group(2))
569
+ # Extract the table part after server_name
570
+ table_part = found.group(2)
571
+
572
+ if (
573
+ maybe_duplicated_database_name
574
+ and database_name
575
+ and table_part.startswith(f"{database_name}.")
576
+ ):
577
+ table_part = table_part[len(database_name) + 1 :]
578
+
579
+ logger.debug(
580
+ f"Extracted table part: '{table_part}' from topic '{topic}'"
581
+ )
582
+ # Apply database name to create final dataset name
583
+ table_name = get_dataset_name(database_name, table_part)
584
+ logger.debug(f"Final table name: '{table_name}'")
527
585
 
528
586
  lineage = KafkaConnectLineage(
529
587
  source_dataset=table_name,