acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as possibly problematic. See the package registry's advisory page for details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
import re
|
|
2
3
|
from dataclasses import dataclass
|
|
3
4
|
from typing import Dict, Iterable, List, Optional, Tuple
|
|
@@ -9,6 +10,81 @@ from datahub.ingestion.source.kafka_connect.common import (
|
|
|
9
10
|
KafkaConnectLineage,
|
|
10
11
|
)
|
|
11
12
|
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class RegexRouterTransform:
    """Helper class to handle RegexRouter transformations for topic/table names.

    Kafka Connect's ``org.apache.kafka.connect.transforms.RegexRouter`` SMT
    rewrites a record's topic using a regex plus a replacement string. This
    helper parses any such transforms out of a connector's configuration and
    replays them on topic names, so downstream lineage reflects the routed
    (final) names rather than the raw topic names.
    """

    def __init__(self, config: Dict[str, str]) -> None:
        """Extract the RegexRouter transform definitions from *config*."""
        self.transforms = self._parse_transforms(config)

    def _parse_transforms(self, config: Dict[str, str]) -> List[Dict[str, str]]:
        """Parse transforms configuration from connector config.

        Returns only the transforms whose ``type`` is RegexRouter, in
        declaration order, each augmented with its ``name``.
        """
        transforms_list: List[Dict[str, str]] = []

        # "transforms" is a comma-separated list of transform aliases.
        transforms_param: str = config.get("transforms", "")
        if not transforms_param:
            return transforms_list

        transform_names: List[str] = [
            name.strip() for name in transforms_param.split(",")
        ]

        for transform_name in transform_names:
            if not transform_name:
                continue
            transform_config: Dict[str, str] = {}
            transform_prefix: str = f"transforms.{transform_name}."

            # Collect this transform's settings with the prefix stripped,
            # e.g. "transforms.route.regex" -> "regex".
            for key, value in config.items():
                if key.startswith(transform_prefix):
                    config_key: str = key[len(transform_prefix) :]
                    transform_config[config_key] = value

            # Only RegexRouter transforms are relevant for name rewriting.
            if (
                transform_config.get("type")
                == "org.apache.kafka.connect.transforms.RegexRouter"
            ):
                transform_config["name"] = transform_name
                transforms_list.append(transform_config)

        return transforms_list

    @staticmethod
    def _to_python_replacement(replacement: str) -> str:
        """Translate Java-style ``$N`` group references to Python ``\\N``."""
        return re.sub(r"\$(\d+)", r"\\\1", replacement)

    def apply_transforms(self, topic_name: str) -> str:
        """Apply RegexRouter transforms to the topic name.

        Mirrors Java's ``Matcher.find()`` + ``Matcher.replaceFirst``: only the
        first match of each transform's regex is replaced, and transforms are
        applied in declaration order to the running result.
        """
        result: str = topic_name

        for transform in self.transforms:
            regex_pattern: Optional[str] = transform.get("regex")
            replacement: str = transform.get("replacement", "")

            if regex_pattern:
                try:
                    # BUG FIX: the previous implementation imported
                    # java.util.regex.Pattern, which does not exist in CPython;
                    # the resulting ImportError was swallowed by the broad
                    # except below, so transforms were silently never applied.
                    # Use Python's re module instead (Java and Python regex
                    # syntax agree for the patterns RegexRouter typically uses).
                    pattern = re.compile(regex_pattern)
                    if pattern.search(result):
                        # count=1 replicates Java's replaceFirst semantics.
                        result = pattern.sub(
                            self._to_python_replacement(replacement),
                            result,
                            count=1,
                        )
                        logger.debug(
                            f"Applied transform {transform['name']}: {topic_name} -> {result}"
                        )
                except re.error as e:
                    logger.warning(
                        f"Invalid regex pattern in transform {transform['name']}: {e}"
                    )

        return str(result)
|
|
87
|
+
|
|
12
88
|
|
|
13
89
|
@dataclass
|
|
14
90
|
class ConfluentS3SinkConnector(BaseConnector):
|
|
@@ -18,28 +94,35 @@ class ConfluentS3SinkConnector(BaseConnector):
|
|
|
18
94
|
bucket: str
|
|
19
95
|
topics_dir: str
|
|
20
96
|
topics: Iterable[str]
|
|
97
|
+
regex_router: RegexRouterTransform
|
|
21
98
|
|
|
22
99
|
def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser:
|
|
23
100
|
# https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3
|
|
24
|
-
bucket = connector_manifest.config.get("s3.bucket.name")
|
|
101
|
+
bucket: Optional[str] = connector_manifest.config.get("s3.bucket.name")
|
|
25
102
|
if not bucket:
|
|
26
103
|
raise ValueError(
|
|
27
104
|
"Could not find 's3.bucket.name' in connector configuration"
|
|
28
105
|
)
|
|
29
106
|
|
|
30
107
|
# https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage
|
|
31
|
-
topics_dir = connector_manifest.config.get("topics.dir", "topics")
|
|
108
|
+
topics_dir: str = connector_manifest.config.get("topics.dir", "topics")
|
|
109
|
+
|
|
110
|
+
# Create RegexRouterTransform instance
|
|
111
|
+
regex_router: RegexRouterTransform = RegexRouterTransform(
|
|
112
|
+
connector_manifest.config
|
|
113
|
+
)
|
|
32
114
|
|
|
33
115
|
return self.S3SinkParser(
|
|
34
116
|
target_platform="s3",
|
|
35
117
|
bucket=bucket,
|
|
36
118
|
topics_dir=topics_dir,
|
|
37
119
|
topics=connector_manifest.topic_names,
|
|
120
|
+
regex_router=regex_router,
|
|
38
121
|
)
|
|
39
122
|
|
|
40
123
|
def extract_flow_property_bag(self) -> Dict[str, str]:
|
|
41
124
|
# Mask/Remove properties that may reveal credentials
|
|
42
|
-
flow_property_bag = {
|
|
125
|
+
flow_property_bag: Dict[str, str] = {
|
|
43
126
|
k: v
|
|
44
127
|
for k, v in self.connector_manifest.config.items()
|
|
45
128
|
if k
|
|
@@ -54,11 +137,17 @@ class ConfluentS3SinkConnector(BaseConnector):
|
|
|
54
137
|
|
|
55
138
|
def extract_lineages(self) -> List[KafkaConnectLineage]:
|
|
56
139
|
try:
|
|
57
|
-
parser = self._get_parser(
|
|
140
|
+
parser: ConfluentS3SinkConnector.S3SinkParser = self._get_parser(
|
|
141
|
+
self.connector_manifest
|
|
142
|
+
)
|
|
58
143
|
|
|
59
144
|
lineages: List[KafkaConnectLineage] = list()
|
|
60
145
|
for topic in parser.topics:
|
|
61
|
-
|
|
146
|
+
# Apply RegexRouter transformations using the RegexRouterTransform class
|
|
147
|
+
transformed_topic: str = parser.regex_router.apply_transforms(topic)
|
|
148
|
+
target_dataset: str = (
|
|
149
|
+
f"{parser.bucket}/{parser.topics_dir}/{transformed_topic}"
|
|
150
|
+
)
|
|
62
151
|
|
|
63
152
|
lineages.append(
|
|
64
153
|
KafkaConnectLineage(
|
|
@@ -86,6 +175,7 @@ class SnowflakeSinkConnector(BaseConnector):
|
|
|
86
175
|
database_name: str
|
|
87
176
|
schema_name: str
|
|
88
177
|
topics_to_tables: Dict[str, str]
|
|
178
|
+
regex_router: RegexRouterTransform
|
|
89
179
|
|
|
90
180
|
def get_table_name_from_topic_name(self, topic_name: str) -> str:
|
|
91
181
|
"""
|
|
@@ -93,7 +183,7 @@ class SnowflakeSinkConnector(BaseConnector):
|
|
|
93
183
|
Refer below link for more info
|
|
94
184
|
https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics
|
|
95
185
|
"""
|
|
96
|
-
table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
|
|
186
|
+
table_name: str = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
|
|
97
187
|
if re.match("^[^a-zA-Z_].*", table_name):
|
|
98
188
|
table_name = "_" + table_name
|
|
99
189
|
# Connector may append original topic's hash code as suffix for conflict resolution
|
|
@@ -106,8 +196,13 @@ class SnowflakeSinkConnector(BaseConnector):
|
|
|
106
196
|
self,
|
|
107
197
|
connector_manifest: ConnectorManifest,
|
|
108
198
|
) -> SnowflakeParser:
|
|
109
|
-
database_name = connector_manifest.config["snowflake.database.name"]
|
|
110
|
-
schema_name = connector_manifest.config["snowflake.schema.name"]
|
|
199
|
+
database_name: str = connector_manifest.config["snowflake.database.name"]
|
|
200
|
+
schema_name: str = connector_manifest.config["snowflake.schema.name"]
|
|
201
|
+
|
|
202
|
+
# Create RegexRouterTransform instance
|
|
203
|
+
regex_router: RegexRouterTransform = RegexRouterTransform(
|
|
204
|
+
connector_manifest.config
|
|
205
|
+
)
|
|
111
206
|
|
|
112
207
|
# Fetch user provided topic to table map
|
|
113
208
|
provided_topics_to_tables: Dict[str, str] = {}
|
|
@@ -121,24 +216,30 @@ class SnowflakeSinkConnector(BaseConnector):
|
|
|
121
216
|
topics_to_tables: Dict[str, str] = {}
|
|
122
217
|
# Extract lineage for only those topics whose data ingestion started
|
|
123
218
|
for topic in connector_manifest.topic_names:
|
|
219
|
+
# Apply transforms first to get the transformed topic name
|
|
220
|
+
transformed_topic: str = regex_router.apply_transforms(topic)
|
|
221
|
+
|
|
124
222
|
if topic in provided_topics_to_tables:
|
|
125
223
|
# If user provided which table to get mapped with this topic
|
|
126
224
|
topics_to_tables[topic] = provided_topics_to_tables[topic]
|
|
127
225
|
else:
|
|
128
|
-
#
|
|
129
|
-
topics_to_tables[topic] = self.get_table_name_from_topic_name(
|
|
226
|
+
# Use the transformed topic name to generate table name
|
|
227
|
+
topics_to_tables[topic] = self.get_table_name_from_topic_name(
|
|
228
|
+
transformed_topic
|
|
229
|
+
)
|
|
130
230
|
|
|
131
231
|
return self.SnowflakeParser(
|
|
132
232
|
database_name=database_name,
|
|
133
233
|
schema_name=schema_name,
|
|
134
234
|
topics_to_tables=topics_to_tables,
|
|
235
|
+
regex_router=regex_router,
|
|
135
236
|
)
|
|
136
237
|
|
|
137
238
|
def extract_flow_property_bag(self) -> Dict[str, str]:
|
|
138
239
|
# For all snowflake sink connector properties, refer below link
|
|
139
240
|
# https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector
|
|
140
241
|
# remove private keys, secrets from properties
|
|
141
|
-
flow_property_bag = {
|
|
242
|
+
flow_property_bag: Dict[str, str] = {
|
|
142
243
|
k: v
|
|
143
244
|
for k, v in self.connector_manifest.config.items()
|
|
144
245
|
if k
|
|
@@ -153,10 +254,12 @@ class SnowflakeSinkConnector(BaseConnector):
|
|
|
153
254
|
|
|
154
255
|
def extract_lineages(self) -> List[KafkaConnectLineage]:
|
|
155
256
|
lineages: List[KafkaConnectLineage] = list()
|
|
156
|
-
parser = self.get_parser(
|
|
257
|
+
parser: SnowflakeSinkConnector.SnowflakeParser = self.get_parser(
|
|
258
|
+
self.connector_manifest
|
|
259
|
+
)
|
|
157
260
|
|
|
158
261
|
for topic, table in parser.topics_to_tables.items():
|
|
159
|
-
target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}"
|
|
262
|
+
target_dataset: str = f"{parser.database_name}.{parser.schema_name}.{table}"
|
|
160
263
|
lineages.append(
|
|
161
264
|
KafkaConnectLineage(
|
|
162
265
|
source_dataset=topic,
|
|
@@ -176,7 +279,8 @@ class BigQuerySinkConnector(BaseConnector):
|
|
|
176
279
|
project: str
|
|
177
280
|
target_platform: str
|
|
178
281
|
sanitizeTopics: bool
|
|
179
|
-
transforms:
|
|
282
|
+
transforms: List[Dict[str, str]]
|
|
283
|
+
regex_router: RegexRouterTransform
|
|
180
284
|
topicsToTables: Optional[str] = None
|
|
181
285
|
datasets: Optional[str] = None
|
|
182
286
|
defaultDataset: Optional[str] = None
|
|
@@ -186,25 +290,32 @@ class BigQuerySinkConnector(BaseConnector):
|
|
|
186
290
|
self,
|
|
187
291
|
connector_manifest: ConnectorManifest,
|
|
188
292
|
) -> BQParser:
|
|
189
|
-
project = connector_manifest.config["project"]
|
|
190
|
-
sanitizeTopics = connector_manifest.config.get("sanitizeTopics") or "false"
|
|
191
|
-
|
|
293
|
+
project: str = connector_manifest.config["project"]
|
|
294
|
+
sanitizeTopics: str = connector_manifest.config.get("sanitizeTopics") or "false"
|
|
295
|
+
|
|
296
|
+
# Parse ALL transforms (original BigQuery logic)
|
|
297
|
+
transform_names: List[str] = (
|
|
192
298
|
self.connector_manifest.config.get("transforms", "").split(",")
|
|
193
299
|
if self.connector_manifest.config.get("transforms")
|
|
194
300
|
else []
|
|
195
301
|
)
|
|
196
|
-
transforms = []
|
|
302
|
+
transforms: List[Dict[str, str]] = []
|
|
197
303
|
for name in transform_names:
|
|
198
|
-
transform = {"name": name}
|
|
304
|
+
transform: Dict[str, str] = {"name": name}
|
|
199
305
|
transforms.append(transform)
|
|
200
|
-
for key in self.connector_manifest.config
|
|
306
|
+
for key in self.connector_manifest.config:
|
|
201
307
|
if key.startswith(f"transforms.{name}."):
|
|
202
308
|
transform[key.replace(f"transforms.{name}.", "")] = (
|
|
203
309
|
self.connector_manifest.config[key]
|
|
204
310
|
)
|
|
205
311
|
|
|
312
|
+
# Create RegexRouterTransform instance for RegexRouter-specific handling
|
|
313
|
+
regex_router: RegexRouterTransform = RegexRouterTransform(
|
|
314
|
+
connector_manifest.config
|
|
315
|
+
)
|
|
316
|
+
|
|
206
317
|
if "defaultDataset" in connector_manifest.config:
|
|
207
|
-
defaultDataset = connector_manifest.config["defaultDataset"]
|
|
318
|
+
defaultDataset: str = connector_manifest.config["defaultDataset"]
|
|
208
319
|
return self.BQParser(
|
|
209
320
|
project=project,
|
|
210
321
|
defaultDataset=defaultDataset,
|
|
@@ -212,11 +323,14 @@ class BigQuerySinkConnector(BaseConnector):
|
|
|
212
323
|
sanitizeTopics=sanitizeTopics.lower() == "true",
|
|
213
324
|
version="v2",
|
|
214
325
|
transforms=transforms,
|
|
326
|
+
regex_router=regex_router,
|
|
215
327
|
)
|
|
216
328
|
else:
|
|
217
329
|
# version 1.6.x and similar configs supported
|
|
218
|
-
datasets = connector_manifest.config["datasets"]
|
|
219
|
-
topicsToTables = connector_manifest.config.get(
|
|
330
|
+
datasets: str = connector_manifest.config["datasets"]
|
|
331
|
+
topicsToTables: Optional[str] = connector_manifest.config.get(
|
|
332
|
+
"topicsToTables"
|
|
333
|
+
)
|
|
220
334
|
|
|
221
335
|
return self.BQParser(
|
|
222
336
|
project=project,
|
|
@@ -225,10 +339,11 @@ class BigQuerySinkConnector(BaseConnector):
|
|
|
225
339
|
target_platform="bigquery",
|
|
226
340
|
sanitizeTopics=sanitizeTopics.lower() == "true",
|
|
227
341
|
transforms=transforms,
|
|
342
|
+
regex_router=regex_router,
|
|
228
343
|
)
|
|
229
344
|
|
|
230
345
|
def get_list(self, property: str) -> Iterable[Tuple[str, str]]:
|
|
231
|
-
entries = property.split(",")
|
|
346
|
+
entries: List[str] = property.split(",")
|
|
232
347
|
for entry in entries:
|
|
233
348
|
key, val = entry.rsplit("=")
|
|
234
349
|
yield (key.strip(), val.strip())
|
|
@@ -243,7 +358,7 @@ class BigQuerySinkConnector(BaseConnector):
|
|
|
243
358
|
return dataset
|
|
244
359
|
return None
|
|
245
360
|
|
|
246
|
-
def sanitize_table_name(self, table_name):
|
|
361
|
+
def sanitize_table_name(self, table_name: str) -> str:
|
|
247
362
|
table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name)
|
|
248
363
|
if re.match("^[^a-zA-Z_].*", table_name):
|
|
249
364
|
table_name = "_" + table_name
|
|
@@ -254,8 +369,8 @@ class BigQuerySinkConnector(BaseConnector):
|
|
|
254
369
|
self, topic: str, parser: BQParser
|
|
255
370
|
) -> Optional[str]:
|
|
256
371
|
if parser.version == "v2":
|
|
257
|
-
dataset = parser.defaultDataset
|
|
258
|
-
parts = topic.split(":")
|
|
372
|
+
dataset: Optional[str] = parser.defaultDataset
|
|
373
|
+
parts: List[str] = topic.split(":")
|
|
259
374
|
if len(parts) == 2:
|
|
260
375
|
dataset = parts[0]
|
|
261
376
|
table = parts[1]
|
|
@@ -283,21 +398,9 @@ class BigQuerySinkConnector(BaseConnector):
|
|
|
283
398
|
table = self.sanitize_table_name(table)
|
|
284
399
|
return f"{dataset}.{table}"
|
|
285
400
|
|
|
286
|
-
def apply_transformations(
|
|
287
|
-
self, topic: str, transforms: List[Dict[str, str]]
|
|
288
|
-
) -> str:
|
|
289
|
-
for transform in transforms:
|
|
290
|
-
if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter":
|
|
291
|
-
regex = transform["regex"]
|
|
292
|
-
replacement = transform["replacement"]
|
|
293
|
-
pattern = re.compile(regex)
|
|
294
|
-
if pattern.match(topic):
|
|
295
|
-
topic = pattern.sub(replacement, topic, count=1)
|
|
296
|
-
return topic
|
|
297
|
-
|
|
298
401
|
def extract_flow_property_bag(self) -> Dict[str, str]:
|
|
299
402
|
# Mask/Remove properties that may reveal credentials
|
|
300
|
-
flow_property_bag = {
|
|
403
|
+
flow_property_bag: Dict[str, str] = {
|
|
301
404
|
k: v
|
|
302
405
|
for k, v in self.connector_manifest.config.items()
|
|
303
406
|
if k not in ["keyfile"]
|
|
@@ -307,27 +410,33 @@ class BigQuerySinkConnector(BaseConnector):
|
|
|
307
410
|
|
|
308
411
|
def extract_lineages(self) -> List[KafkaConnectLineage]:
|
|
309
412
|
lineages: List[KafkaConnectLineage] = list()
|
|
310
|
-
parser = self.get_parser(
|
|
413
|
+
parser: BigQuerySinkConnector.BQParser = self.get_parser(
|
|
414
|
+
self.connector_manifest
|
|
415
|
+
)
|
|
311
416
|
if not parser:
|
|
312
417
|
return lineages
|
|
313
|
-
target_platform = parser.target_platform
|
|
314
|
-
project = parser.project
|
|
315
|
-
transforms = parser.transforms
|
|
418
|
+
target_platform: str = parser.target_platform
|
|
419
|
+
project: str = parser.project
|
|
316
420
|
|
|
317
421
|
for topic in self.connector_manifest.topic_names:
|
|
318
|
-
|
|
319
|
-
|
|
422
|
+
# Apply RegexRouter transformations using the RegexRouterTransform class
|
|
423
|
+
transformed_topic: str = parser.regex_router.apply_transforms(topic)
|
|
424
|
+
|
|
425
|
+
# Use the transformed topic to determine dataset/table
|
|
426
|
+
dataset_table: Optional[str] = self.get_dataset_table_for_topic(
|
|
427
|
+
transformed_topic, parser
|
|
428
|
+
)
|
|
320
429
|
if dataset_table is None:
|
|
321
430
|
self.report.warning(
|
|
322
431
|
"Could not find target dataset for topic, please check your connector configuration"
|
|
323
432
|
f"{self.connector_manifest.name} : {transformed_topic} ",
|
|
324
433
|
)
|
|
325
434
|
continue
|
|
326
|
-
target_dataset = f"{project}.{dataset_table}"
|
|
435
|
+
target_dataset: str = f"{project}.{dataset_table}"
|
|
327
436
|
|
|
328
437
|
lineages.append(
|
|
329
438
|
KafkaConnectLineage(
|
|
330
|
-
source_dataset=
|
|
439
|
+
source_dataset=topic, # Keep original topic as source
|
|
331
440
|
source_platform=KAFKA,
|
|
332
441
|
target_dataset=target_dataset,
|
|
333
442
|
target_platform=target_platform,
|
|
@@ -20,6 +20,8 @@ from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
|
|
|
20
20
|
get_platform_from_sqlalchemy_uri,
|
|
21
21
|
)
|
|
22
22
|
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
23
25
|
|
|
24
26
|
@dataclass
|
|
25
27
|
class ConfluentJDBCSourceConnector(BaseConnector):
|
|
@@ -121,7 +123,7 @@ class ConfluentJDBCSourceConnector(BaseConnector):
|
|
|
121
123
|
for name in transform_names:
|
|
122
124
|
transform = {"name": name}
|
|
123
125
|
transforms.append(transform)
|
|
124
|
-
for key in self.connector_manifest.config
|
|
126
|
+
for key in self.connector_manifest.config:
|
|
125
127
|
if key.startswith(f"transforms.{name}."):
|
|
126
128
|
transform[key.replace(f"transforms.{name}.", "")] = (
|
|
127
129
|
self.connector_manifest.config[key]
|
|
@@ -392,7 +394,7 @@ class MongoSourceConnector(BaseConnector):
|
|
|
392
394
|
db_connection_url=connector_manifest.config.get("connection.uri"),
|
|
393
395
|
source_platform="mongodb",
|
|
394
396
|
database_name=connector_manifest.config.get("database"),
|
|
395
|
-
topic_prefix=connector_manifest.config.get("
|
|
397
|
+
topic_prefix=connector_manifest.config.get("topic.prefix"),
|
|
396
398
|
transforms=(
|
|
397
399
|
connector_manifest.config["transforms"].split(",")
|
|
398
400
|
if "transforms" in connector_manifest.config
|
|
@@ -406,7 +408,14 @@ class MongoSourceConnector(BaseConnector):
|
|
|
406
408
|
lineages: List[KafkaConnectLineage] = list()
|
|
407
409
|
parser = self.get_parser(self.connector_manifest)
|
|
408
410
|
source_platform = parser.source_platform
|
|
409
|
-
|
|
411
|
+
topic_prefix = parser.topic_prefix or ""
|
|
412
|
+
|
|
413
|
+
# Escape topic_prefix to handle cases where it contains dots
|
|
414
|
+
# Some users configure topic.prefix like "my.mongodb" which breaks the regex
|
|
415
|
+
|
|
416
|
+
# \w is equivalent to [a-zA-Z0-9_]
|
|
417
|
+
# So [\w-]+ matches alphanumeric characters, underscores, and hyphens
|
|
418
|
+
topic_naming_pattern = rf"{re.escape(topic_prefix)}\.([\w-]+)\.([\w-]+)"
|
|
410
419
|
|
|
411
420
|
if not self.connector_manifest.topic_names:
|
|
412
421
|
return lineages
|
|
@@ -429,6 +438,26 @@ class MongoSourceConnector(BaseConnector):
|
|
|
429
438
|
|
|
430
439
|
@dataclass
|
|
431
440
|
class DebeziumSourceConnector(BaseConnector):
|
|
441
|
+
# Debezium topic naming patterns by connector type
|
|
442
|
+
# - MySQL: {topic.prefix}.{database}.{table}
|
|
443
|
+
# - PostgreSQL: {topic.prefix}.{schema}.{table}
|
|
444
|
+
# - SQL Server: {topic.prefix}.{database}.{schema}.{table}
|
|
445
|
+
# - Oracle: {topic.prefix}.{schema}.{table}
|
|
446
|
+
# - DB2: {topic.prefix}.{schema}.{table}
|
|
447
|
+
# - MongoDB: {topic.prefix}.{database}.{collection}
|
|
448
|
+
# - Vitess: {topic.prefix}.{keyspace}.{table}
|
|
449
|
+
|
|
450
|
+
# Note SQL Server allows for "database.names" (multiple databases) config,
|
|
451
|
+
# and so database is in the topic naming pattern.
|
|
452
|
+
# However, others have "database.dbname" which is a single database name. For these connectors,
|
|
453
|
+
# additional databases would require a different connector instance
|
|
454
|
+
|
|
455
|
+
# Connectors with 2-level container in pattern (database + schema)
|
|
456
|
+
# Others have either database XOR schema, but not both
|
|
457
|
+
DEBEZIUM_CONNECTORS_WITH_2_LEVEL_CONTAINER_IN_PATTERN = {
|
|
458
|
+
"io.debezium.connector.sqlserver.SqlServerConnector",
|
|
459
|
+
}
|
|
460
|
+
|
|
432
461
|
@dataclass
|
|
433
462
|
class DebeziumParser:
|
|
434
463
|
source_platform: str
|
|
@@ -514,16 +543,45 @@ class DebeziumSourceConnector(BaseConnector):
|
|
|
514
543
|
source_platform = parser.source_platform
|
|
515
544
|
server_name = parser.server_name
|
|
516
545
|
database_name = parser.database_name
|
|
517
|
-
|
|
546
|
+
# Escape server_name to handle cases where topic.prefix contains dots
|
|
547
|
+
# Some users configure topic.prefix like "my.server" which breaks the regex
|
|
548
|
+
server_name = server_name or ""
|
|
549
|
+
# Regex pattern (\w+\.\w+(?:\.\w+)?) supports BOTH 2-part and 3-part table names
|
|
550
|
+
topic_naming_pattern = rf"({re.escape(server_name)})\.(\w+\.\w+(?:\.\w+)?)"
|
|
518
551
|
|
|
519
552
|
if not self.connector_manifest.topic_names:
|
|
520
553
|
return lineages
|
|
521
554
|
|
|
555
|
+
# Handle connectors with 2-level container (database + schema) in topic pattern
|
|
556
|
+
connector_class = self.connector_manifest.config.get(CONNECTOR_CLASS, "")
|
|
557
|
+
maybe_duplicated_database_name = (
|
|
558
|
+
connector_class
|
|
559
|
+
in self.DEBEZIUM_CONNECTORS_WITH_2_LEVEL_CONTAINER_IN_PATTERN
|
|
560
|
+
)
|
|
561
|
+
|
|
522
562
|
for topic in self.connector_manifest.topic_names:
|
|
523
563
|
found = re.search(re.compile(topic_naming_pattern), topic)
|
|
564
|
+
logger.debug(
|
|
565
|
+
f"Processing topic: '{topic}' with regex pattern '{topic_naming_pattern}', found: {found}"
|
|
566
|
+
)
|
|
524
567
|
|
|
525
568
|
if found:
|
|
526
|
-
|
|
569
|
+
# Extract the table part after server_name
|
|
570
|
+
table_part = found.group(2)
|
|
571
|
+
|
|
572
|
+
if (
|
|
573
|
+
maybe_duplicated_database_name
|
|
574
|
+
and database_name
|
|
575
|
+
and table_part.startswith(f"{database_name}.")
|
|
576
|
+
):
|
|
577
|
+
table_part = table_part[len(database_name) + 1 :]
|
|
578
|
+
|
|
579
|
+
logger.debug(
|
|
580
|
+
f"Extracted table part: '{table_part}' from topic '{topic}'"
|
|
581
|
+
)
|
|
582
|
+
# Apply database name to create final dataset name
|
|
583
|
+
table_name = get_dataset_name(database_name, table_part)
|
|
584
|
+
logger.debug(f"Final table name: '{table_name}'")
|
|
527
585
|
|
|
528
586
|
lineage = KafkaConnectLineage(
|
|
529
587
|
source_dataset=table_name,
|
datahub/ingestion/source/ldap.py
CHANGED