acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/config.py

@@ -4,13 +4,15 @@ from enum import Enum
 from typing import Dict, List, Literal, Optional, Union
 
 import pydantic
-from pydantic import validator
-from pydantic.class_validators import root_validator
+from pydantic import root_validator, validator
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
+from datahub.ingestion.api.incremental_lineage_helper import (
+    IncrementalLineageConfigMixin,
+)
 from datahub.ingestion.source.common.subtypes import BIAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
@@ -19,6 +21,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.global_warning_util import add_global_warning
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer
 
@@ -183,6 +186,16 @@ class SupportedDataPlatform(Enum):
         datahub_data_platform_name="databricks",
     )
 
+    MYSQL = DataPlatformPair(
+        powerbi_data_platform_name="MySQL",
+        datahub_data_platform_name="mysql",
+    )
+
+    ODBC = DataPlatformPair(
+        powerbi_data_platform_name="Odbc",
+        datahub_data_platform_name="odbc",
+    )
+
 
 @dataclass
 class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
@@ -275,44 +288,47 @@ class PowerBiProfilingConfig(ConfigModel):
 
 
 class PowerBiDashboardSourceConfig(
-    StatefulIngestionConfigBase, DatasetSourceConfigMixin
+    StatefulIngestionConfigBase, DatasetSourceConfigMixin, IncrementalLineageConfigMixin
 ):
-    platform_name: str = pydantic.Field(
-        default=Constant.PLATFORM_NAME, hidden_from_docs=True
-    )
+    platform_name: HiddenFromDocs[str] = pydantic.Field(default=Constant.PLATFORM_NAME)
 
-    platform_urn: str = pydantic.Field(
+    platform_urn: HiddenFromDocs[str] = pydantic.Field(
         default=builder.make_data_platform_urn(platform=Constant.PLATFORM_NAME),
-        hidden_from_docs=True,
     )
 
     # Organization Identifier
     tenant_id: str = pydantic.Field(description="PowerBI tenant identifier")
     # PowerBi workspace identifier
-    workspace_id: Optional[str] = pydantic.Field(
+    workspace_id: HiddenFromDocs[Optional[str]] = pydantic.Field(
         default=None,
         description="[deprecated] Use workspace_id_pattern instead",
-        hidden_from_docs=True,
     )
     # PowerBi workspace identifier
     workspace_id_pattern: AllowDenyPattern = pydantic.Field(
         default=AllowDenyPattern.allow_all(),
-        description="Regex patterns to filter PowerBI workspaces in ingestion."
+        description="Regex patterns to filter PowerBI workspaces in ingestion by ID."
+        " By default all IDs are allowed unless they are filtered by name using 'workspace_name_pattern'."
+        " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
+    )
+    # PowerBi workspace name
+    workspace_name_pattern: AllowDenyPattern = pydantic.Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns to filter PowerBI workspaces in ingestion by name."
+        " By default all names are allowed unless they are filtered by ID using 'workspace_id_pattern'."
         " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
     )
 
     # Dataset type mapping PowerBI support many type of data-sources. Here user needs to define what type of PowerBI
     # DataSource needs to be mapped to corresponding DataHub Platform DataSource. For example, PowerBI `Snowflake` is
     # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on.
-    dataset_type_mapping:
-
-
-
-
-
-
-
-    )
+    dataset_type_mapping: HiddenFromDocs[
+        Union[Dict[str, str], Dict[str, PlatformDetail]]
+    ] = pydantic.Field(
+        default_factory=default_for_dataset_type_mapping,
+        description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
+        "DataHub supported datasources."
+        "You can configured platform instance for dataset lineage. "
+        "See Quickstart Recipe for mapping",
     )
     # PowerBI datasource's server to platform instance mapping
     server_to_platform_instance: Dict[
@@ -324,6 +340,26 @@ class PowerBiDashboardSourceConfig(
         "For Google BigQuery the datasource's server is google bigquery project name. "
         "For Databricks Unity Catalog the datasource's server is workspace FQDN.",
     )
+    # ODBC DSN to platform mapping
+    dsn_to_platform_name: Dict[str, str] = pydantic.Field(
+        default={},
+        description="A mapping of ODBC DSN to DataHub data platform name. "
+        "For example with an ODBC connection string 'DSN=database' where the database type "
+        "is 'PostgreSQL' you would configure the mapping as 'database: postgres'.",
+    )
+    # ODBC DSN to database (or database.schema) mapping
+    dsn_to_database_schema: Dict[str, str] = pydantic.Field(
+        default={},
+        description="A mapping of ODBC DSN to database names with optional schema names "
+        "(some database platforms such a MySQL use the table name pattern 'database.table', "
+        "while others use the pattern 'database.schema.table'). "
+        "This mapping is used in conjunction with ODBC SQL query parsing. "
+        "If SQL queries used with ODBC do not reference fully qualified tables names, "
+        "then you should configure mappings for your DSNs. "
+        "For example with an ODBC connection string 'DSN=database' where the database "
+        "is 'prod' you would configure the mapping as 'database: prod'. "
+        "If the database is 'prod' and the schema is 'data' then mapping would be 'database: prod.data'.",
+    )
     # deprecated warning
     _dataset_type_mapping = pydantic_field_deprecated(
         "dataset_type_mapping",
@@ -373,8 +409,9 @@ class PowerBiDashboardSourceConfig(
     )
     # Enable/Disable extracting dataset schema
     extract_dataset_schema: bool = pydantic.Field(
-        default=
-        description="Whether to ingest PBI Dataset Table columns and measures"
+        default=True,
+        description="Whether to ingest PBI Dataset Table columns and measures."
+        " Note: this setting must be `true` for schema extraction and column lineage to be enabled.",
     )
     # Enable/Disable extracting lineage information of PowerBI Dataset
     extract_lineage: bool = pydantic.Field(
@@ -483,7 +520,7 @@ class PowerBiDashboardSourceConfig(
     include_workspace_name_in_dataset_urn: bool = pydantic.Field(
         default=False,
         description="It is recommended to set this to true, as it helps prevent the overwriting of datasets."
-        "Read section #11560 at https://
+        "Read section #11560 at https://docs.datahub.com/docs/how/updating-datahub/ before enabling this option."
        "To maintain backward compatibility, this is set to False.",
     )
 
@@ -498,10 +535,9 @@ class PowerBiDashboardSourceConfig(
         "Increase this value if you encounter the 'M-Query Parsing Timeout' message in the connector report.",
     )
 
-    metadata_api_timeout: int = pydantic.Field(
+    metadata_api_timeout: HiddenFromDocs[int] = pydantic.Field(
         default=30,
         description="timeout in seconds for Metadata Rest Api.",
-        hidden_from_docs=True,
     )
 
     @root_validator(skip_on_failure=True)
@@ -510,6 +546,7 @@ class PowerBiDashboardSourceConfig(
             "native_query_parsing",
             "enable_advance_lineage_sql_construct",
             "extract_lineage",
+            "extract_dataset_schema",
         ]
 
         if (
@@ -536,7 +573,7 @@ class PowerBiDashboardSourceConfig(
     def map_data_platform(cls, value):
         # For backward compatibility convert input PostgreSql to PostgreSQL
         # PostgreSQL is name of the data-platform in M-Query
-        if "PostgreSql" in value
+        if "PostgreSql" in value:
             platform_name = value["PostgreSql"]
             del value["PostgreSql"]
             value["PostgreSQL"] = platform_name
@@ -575,3 +612,31 @@ class PowerBiDashboardSourceConfig(
         )
 
         return values
+
+    @root_validator(skip_on_failure=True)
+    def validate_extract_dataset_schema(cls, values: Dict) -> Dict:
+        if values.get("extract_dataset_schema") is False:
+            add_global_warning(
+                "Please use `extract_dataset_schema: true`, otherwise dataset schema extraction will be skipped."
+            )
+        return values
+
+    @root_validator(skip_on_failure=True)
+    def validate_dsn_to_database_schema(cls, values: Dict) -> Dict:
+        if values.get("dsn_to_database_schema") is not None:
+            dsn_mapping = values.get("dsn_to_database_schema")
+            if not isinstance(dsn_mapping, dict):
+                raise ValueError("dsn_to_database_schema must contain key-value pairs")
+
+            for _key, value in dsn_mapping.items():
+                if not isinstance(value, str):
+                    raise ValueError(
+                        "dsn_to_database_schema mapping values must be strings"
+                    )
+                parts = value.split(".")
+                if len(parts) != 1 and len(parts) != 2:
+                    raise ValueError(
+                        f"dsn_to_database_schema invalid mapping value: {value}"
+                    )
+
+        return values
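The new `dsn_to_database_schema` option is checked by the `validate_dsn_to_database_schema` root validator shown above: each mapping value must be either `database` or `database.schema`. A minimal standalone sketch of that rule (the DSN names and values below are illustrative, not taken from the diff):

```python
from typing import Dict


def check_dsn_mapping(dsn_to_database_schema: Dict[str, str]) -> None:
    """Mirror of the validator's rule: values must be 'database' or 'database.schema'."""
    for _dsn, value in dsn_to_database_schema.items():
        if not isinstance(value, str):
            raise ValueError("dsn_to_database_schema mapping values must be strings")
        if len(value.split(".")) not in (1, 2):
            raise ValueError(f"dsn_to_database_schema invalid mapping value: {value}")


check_dsn_mapping({"sales_dsn": "prod"})       # database only: accepted
check_dsn_mapping({"sales_dsn": "prod.data"})  # database.schema: accepted
# check_dsn_mapping({"sales_dsn": "a.b.c"})    # three parts: would raise ValueError
```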
datahub/ingestion/source/powerbi/m_query/data_classes.py

@@ -74,3 +74,6 @@ class FunctionName(Enum):
     GOOGLE_BIGQUERY_DATA_ACCESS = "GoogleBigQuery.Database"
     AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database"
     DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
+    MYSQL_DATA_ACCESS = "MySQL.Database"
+    ODBC_DATA_ACCESS = "Odbc.DataSource"
+    ODBC_QUERY = "Odbc.Query"
datahub/ingestion/source/powerbi/m_query/odbc.py (new file)

@@ -0,0 +1,185 @@
+import re
+from typing import Optional, Tuple, Union
+
+server_patterns = [
+    r"Server=([^:]+)[:][0-9]+/.*",
+    r"SERVER=\{([^}]*)\}",
+    r"SERVER=([^;]*)",
+    r"HOST=\{([^}]*)\}",
+    r"HOST=([^;]*)",
+    r"DATA SOURCE=\{([^}]*)\}",
+    r"DATA SOURCE=([^;]*)",
+    r"DSN=\{([^}]*)\}",
+    r"DSN=([^;]*)",
+    r"Server=([^;]*)",
+    r"S3OutputLocation=([^;]*)",
+    r"HTTPPath=([^;]*)",
+    r"Host=([^;]*)",
+]
+
+dsn_patterns = [
+    r"DSN\s*=\s*\"([^\"]+)\"",
+    r"DSN\s*=\s*\'([^\']+)\'",
+    r"DSN\s*=\s*([^;]+)",
+]
+
+platform_patterns = {
+    "mysql": r"mysql",
+    "postgres": r"post(gre(s|sql)?|gres)",
+    "mssql": r"(sql\s*server|mssql|sqlncli)",
+    "oracle": r"oracle",
+    "db2": r"db2",
+    "sqlite": r"sqlite",
+    "access": r"(access|\.mdb|\.accdb)",
+    "excel": r"(excel|\.xls)",
+    "firebird": r"firebird",
+    "informix": r"informix",
+    "sybase": r"sybase",
+    "teradata": r"teradata",
+    "hadoop": r"(hadoop|hive)",
+    "snowflake": r"snowflake",
+    "redshift": r"redshift",
+    "bigquery": r"bigquery",
+    "athena": r"(athena|aws\s*athena)",
+    "databricks": r"(databricks|spark)",
+}
+
+powerbi_platform_names = {
+    "mysql": "MySQL",
+    "postgres": "PostgreSQL",
+    "mssql": "SQL Server",
+    "oracle": "Oracle",
+    "db2": "IBM DB2",
+    "sqlite": "SQLite",
+    "access": "Microsoft Access",
+    "excel": "Microsoft Excel",
+    "firebird": "Firebird",
+    "informix": "IBM Informix",
+    "sybase": "SAP Sybase",
+    "teradata": "Teradata",
+    "hadoop": "Hadoop",
+    "snowflake": "Snowflake",
+    "redshift": "Amazon Redshift",
+    "bigquery": "Google BigQuery",
+    "athena": "Amazon Athena",
+    "databricks": "Databricks",
+}
+
+
+def extract_driver(connection_string: str) -> Union[str, None]:
+    """
+    Parse an ODBC connection string and extract the driver name.
+    Handles whitespace in driver names and various connection string formats.
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        str: The extracted driver name, or None if not found
+    """
+    # Match DRIVER={driver name} pattern
+    driver_match = re.search(r"DRIVER=\{([^}]*)}", connection_string, re.IGNORECASE)
+
+    if driver_match:
+        return driver_match.group(1).strip()
+
+    # Alternative pattern for DRIVER=driver
+    driver_match = re.search(r"DRIVER=([^;]*)", connection_string, re.IGNORECASE)
+
+    if driver_match:
+        return driver_match.group(1).strip()
+
+    return None
+
+
+def extract_dsn(connection_string: str) -> Union[str, None]:
+    """
+    Extract the DSN value from an ODBC connection string.
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        str or None: The extracted DSN value, or None if not found
+    """
+    for pattern in dsn_patterns:
+        match = re.search(pattern, connection_string, re.IGNORECASE)
+        if match:
+            return match.group(1).strip()
+
+    return None
+
+
+def extract_server(connection_string: str) -> Union[str, None]:
+    """
+    Parse an ODBC connection string and extract the server name.
+    Handles various parameter names for server (SERVER, Host, Data Source, etc.)
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        str: The extracted server name, or None if not found
+    """
+    for pattern in server_patterns:
+        server_match = re.search(pattern, connection_string, re.IGNORECASE)
+        if server_match:
+            return server_match.group(1).strip()
+
+    # Special case for Athena: extract from AwsRegion if no server found
+    region_match = re.search(r"AwsRegion=([^;]*)", connection_string, re.IGNORECASE)
+    if region_match:
+        return f"aws-athena-{region_match.group(1).strip()}"
+
+    # Special case for Databricks: try to extract hostname from JDBC URL
+    jdbc_match = re.search(r"jdbc:spark://([^:;/]+)", connection_string, re.IGNORECASE)
+    if jdbc_match:
+        return jdbc_match.group(1).strip()
+
+    return None
+
+
+def extract_platform(connection_string: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Extract the database platform name from the ODBC driver name.
+    Returns the lowercase platform name.
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        tuple: A tuple containing the normalized platform name and the corresponding
+               Power BI platform name, or None if not recognized.
+    """
+    driver_name = extract_driver(connection_string)
+    if not driver_name:
+        return None, None
+
+    driver_lower = driver_name.lower()
+
+    for platform, pattern in platform_patterns.items():
+        if re.search(pattern, driver_lower):
+            return platform, powerbi_platform_names.get(platform)
+
+    return None, None
+
+
+def normalize_platform_name(platform: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Normalizes the platform name by matching it with predefined patterns and maps it to
+    a corresponding Power BI platform name.
+
+    Args:
+        platform (str): The platform name to normalize
+
+    Returns:
+        tuple: A tuple containing the normalized platform name and the corresponding
+               Power BI platform name, or None if not recognized.
+    """
+    platform_lower = platform.lower()
+
+    for platform, pattern in platform_patterns.items():
+        if re.search(pattern, platform_lower):
+            return platform, powerbi_platform_names.get(platform)
+
+    return None, None
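Since the helpers in the new `odbc.py` module are plain functions over connection strings, they can be exercised directly. A short usage sketch; the connection string below is made up for illustration:

```python
from datahub.ingestion.source.powerbi.m_query.odbc import (
    extract_driver,
    extract_dsn,
    extract_platform,
    extract_server,
)

# Illustrative ODBC connection string; driver, server, and DSN values are made up.
conn = "DRIVER={PostgreSQL Unicode};SERVER=db.example.com;DSN=analytics_dsn"

print(extract_driver(conn))    # PostgreSQL Unicode
print(extract_dsn(conn))       # analytics_dsn
print(extract_server(conn))    # db.example.com
print(extract_platform(conn))  # ('postgres', 'PostgreSQL')
```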
datahub/ingestion/source/powerbi/m_query/parser.py

@@ -1,13 +1,13 @@
 import functools
 import importlib.resources as pkg_resource
 import logging
-import os
 from typing import Dict, List, Optional
 
 import lark
 from lark import Lark, Tree
 
 import datahub.ingestion.source.powerbi.m_query.data_classes
+from datahub.configuration.env_vars import get_powerbi_m_query_parse_timeout
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.source.powerbi.config import (
     PowerBiDashboardSourceConfig,
@@ -25,7 +25,7 @@ from datahub.utilities.threading_timeout import TimeoutException, threading_timeout
 
 logger = logging.getLogger(__name__)
 
-_M_QUERY_PARSE_TIMEOUT =
+_M_QUERY_PARSE_TIMEOUT = get_powerbi_m_query_parse_timeout()
 
 
 @functools.lru_cache(maxsize=1)
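For context on the last hunk: the module-level `_M_QUERY_PARSE_TIMEOUT` is now obtained from the centralized env-var helpers (`datahub/configuration/env_vars.py`, also new in this release) instead of an inline `os.getenv` lookup. A rough sketch of how such a timeout is typically applied around the lark parse, assuming `threading_timeout` (imported alongside `TimeoutException` in this module) is used as a context manager; the wrapper function name and body are illustrative, not the connector's actual implementation:

```python
from typing import Optional

from lark import Lark, Tree

from datahub.configuration.env_vars import get_powerbi_m_query_parse_timeout
from datahub.utilities.threading_timeout import TimeoutException, threading_timeout

_M_QUERY_PARSE_TIMEOUT = get_powerbi_m_query_parse_timeout()


def parse_with_timeout(parser: Lark, expression: str) -> Optional[Tree]:
    """Illustrative wrapper: abort the M-Query parse if it exceeds the configured timeout."""
    try:
        with threading_timeout(_M_QUERY_PARSE_TIMEOUT):
            return parser.parse(expression)
    except TimeoutException:
        # The connector surfaces this condition as an 'M-Query Parsing Timeout'
        # message in the connector report.
        return None
```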
|