acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
import re
|
|
3
|
-
from collections import defaultdict
|
|
4
2
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
3
|
+
from dataclasses import dataclass
|
|
5
4
|
from typing import Dict, Iterable, List, Optional
|
|
6
5
|
|
|
7
6
|
from datahub.emitter.mce_builder import (
|
|
@@ -23,12 +22,16 @@ from datahub.ingestion.api.source import (
|
|
|
23
22
|
SourceReport,
|
|
24
23
|
)
|
|
25
24
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
25
|
+
from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
|
|
26
26
|
from datahub.ingestion.source.dremio.dremio_api import (
|
|
27
27
|
DremioAPIOperations,
|
|
28
28
|
DremioEdition,
|
|
29
29
|
)
|
|
30
30
|
from datahub.ingestion.source.dremio.dremio_aspects import DremioAspects
|
|
31
|
-
from datahub.ingestion.source.dremio.dremio_config import
|
|
31
|
+
from datahub.ingestion.source.dremio.dremio_config import (
|
|
32
|
+
DremioSourceConfig,
|
|
33
|
+
DremioSourceMapping,
|
|
34
|
+
)
|
|
32
35
|
from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
|
|
33
36
|
DremioToDataHubSourceTypeMapping,
|
|
34
37
|
)
|
|
@@ -39,6 +42,7 @@ from datahub.ingestion.source.dremio.dremio_entities import (
|
|
|
39
42
|
DremioDatasetType,
|
|
40
43
|
DremioGlossaryTerm,
|
|
41
44
|
DremioQuery,
|
|
45
|
+
DremioSourceContainer,
|
|
42
46
|
)
|
|
43
47
|
from datahub.ingestion.source.dremio.dremio_profiling import DremioProfiler
|
|
44
48
|
from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
|
|
@@ -48,13 +52,17 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
|
48
52
|
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
49
53
|
StatefulIngestionSourceBase,
|
|
50
54
|
)
|
|
51
|
-
from datahub.ingestion.source_report.ingestion_stage import
|
|
55
|
+
from datahub.ingestion.source_report.ingestion_stage import (
|
|
56
|
+
LINEAGE_EXTRACTION,
|
|
57
|
+
METADATA_EXTRACTION,
|
|
58
|
+
IngestionHighStage,
|
|
59
|
+
)
|
|
52
60
|
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
|
|
53
61
|
DatasetLineageTypeClass,
|
|
54
62
|
UpstreamClass,
|
|
55
63
|
UpstreamLineage,
|
|
56
64
|
)
|
|
57
|
-
from datahub.metadata.schema_classes import
|
|
65
|
+
from datahub.metadata.schema_classes import SchemaMetadataClass
|
|
58
66
|
from datahub.metadata.urns import CorpUserUrn
|
|
59
67
|
from datahub.sql_parsing.sql_parsing_aggregator import (
|
|
60
68
|
KnownQueryLineageInfo,
|
|
@@ -65,16 +73,48 @@ from datahub.sql_parsing.sql_parsing_aggregator import (
|
|
|
65
73
|
logger = logging.getLogger(__name__)
|
|
66
74
|
|
|
67
75
|
|
|
76
|
+
@dataclass
|
|
77
|
+
class DremioSourceMapEntry:
|
|
78
|
+
platform: str
|
|
79
|
+
source_name: str
|
|
80
|
+
dremio_source_category: str
|
|
81
|
+
root_path: str = ""
|
|
82
|
+
database_name: str = ""
|
|
83
|
+
platform_instance: Optional[str] = None
|
|
84
|
+
env: Optional[str] = None
|
|
85
|
+
|
|
86
|
+
|
|
68
87
|
@platform_name("Dremio")
|
|
69
88
|
@config_class(DremioSourceConfig)
|
|
70
89
|
@support_status(SupportStatus.CERTIFIED)
|
|
71
|
-
@capability(
|
|
90
|
+
@capability(
|
|
91
|
+
SourceCapability.CONTAINERS,
|
|
92
|
+
"Enabled by default",
|
|
93
|
+
subtype_modifier=[
|
|
94
|
+
SourceCapabilityModifier.DREMIO_SPACE,
|
|
95
|
+
SourceCapabilityModifier.DREMIO_SOURCE,
|
|
96
|
+
],
|
|
97
|
+
)
|
|
72
98
|
@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
|
|
73
99
|
@capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
|
|
74
100
|
@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
|
|
75
|
-
@capability(
|
|
101
|
+
@capability(
|
|
102
|
+
SourceCapability.LINEAGE_COARSE,
|
|
103
|
+
"Enabled by default",
|
|
104
|
+
subtype_modifier=[
|
|
105
|
+
SourceCapabilityModifier.TABLE,
|
|
106
|
+
],
|
|
107
|
+
)
|
|
108
|
+
@capability(
|
|
109
|
+
SourceCapability.LINEAGE_FINE,
|
|
110
|
+
"Extract column-level lineage",
|
|
111
|
+
subtype_modifier=[
|
|
112
|
+
SourceCapabilityModifier.TABLE,
|
|
113
|
+
],
|
|
114
|
+
)
|
|
76
115
|
@capability(SourceCapability.OWNERSHIP, "Enabled by default")
|
|
77
116
|
@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
|
|
117
|
+
@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
|
|
78
118
|
class DremioSource(StatefulIngestionSourceBase):
|
|
79
119
|
"""
|
|
80
120
|
This plugin integrates with Dremio to extract and ingest metadata into DataHub.
|
|
@@ -112,7 +152,14 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
112
152
|
self.default_db = "dremio"
|
|
113
153
|
self.config = config
|
|
114
154
|
self.report = DremioSourceReport()
|
|
115
|
-
|
|
155
|
+
|
|
156
|
+
# Set time window for query lineage extraction
|
|
157
|
+
self.report.window_start_time, self.report.window_end_time = (
|
|
158
|
+
self.config.start_time,
|
|
159
|
+
self.config.end_time,
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
self.source_map: Dict[str, DremioSourceMapEntry] = dict()
|
|
116
163
|
|
|
117
164
|
# Initialize API operations
|
|
118
165
|
dremio_api = DremioAPIOperations(self.config, self.report)
|
|
@@ -140,6 +187,7 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
140
187
|
generate_operations=True,
|
|
141
188
|
usage_config=self.config.usage,
|
|
142
189
|
)
|
|
190
|
+
self.report.sql_aggregator = self.sql_parsing_aggregator.report
|
|
143
191
|
|
|
144
192
|
# For profiling
|
|
145
193
|
self.profiler = DremioProfiler(config, self.report, dremio_api)
|
|
@@ -152,111 +200,12 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
152
200
|
def get_platform(self) -> str:
|
|
153
201
|
return "dremio"
|
|
154
202
|
|
|
155
|
-
def _build_source_map(self) -> Dict[str,
|
|
156
|
-
"""
|
|
157
|
-
Builds a source mapping dictionary to support external lineage generation across
|
|
158
|
-
multiple Dremio sources, based on provided configuration mappings.
|
|
159
|
-
|
|
160
|
-
This method operates as follows:
|
|
161
|
-
|
|
162
|
-
1. If a source mapping is present in the config:
|
|
163
|
-
- For each source in the Dremio catalog, if the mapping's `source_name` matches
|
|
164
|
-
the `dremio_source_type`, `root_path` and `database_name` are added to the mapping
|
|
165
|
-
information, along with the platform, platform instance, and environment if they exist.
|
|
166
|
-
This allows constructing the full URN for upstream lineage.
|
|
167
|
-
|
|
168
|
-
2. If a source mapping is absent in the configuration:
|
|
169
|
-
- Default mappings are created for each source name, setting `env` and `platform_instance`
|
|
170
|
-
to default values and classifying the source type. This ensures all sources have a
|
|
171
|
-
mapping, even if specific configuration details are missing.
|
|
172
|
-
|
|
173
|
-
Returns:
|
|
174
|
-
Dict[str, Dict]: A dictionary (`source_map`) where each key is a source name
|
|
175
|
-
(lowercased) and each value is another dictionary containing:
|
|
176
|
-
- `platform`: The source platform.
|
|
177
|
-
- `source_name`: The source name.
|
|
178
|
-
- `dremio_source_type`: The type mapped to DataHub,
|
|
179
|
-
e.g., "database", "folder".
|
|
180
|
-
- Optional `root_path`, `database_name`, `platform_instance`,
|
|
181
|
-
and `env` if provided in the configuration.
|
|
182
|
-
Example:
|
|
183
|
-
This method is used internally within the class to generate mappings before
|
|
184
|
-
creating cross-platform lineage.
|
|
185
|
-
|
|
186
|
-
"""
|
|
187
|
-
|
|
188
|
-
source_map = {}
|
|
203
|
+
def _build_source_map(self) -> Dict[str, DremioSourceMapEntry]:
|
|
189
204
|
dremio_sources = self.dremio_catalog.get_sources()
|
|
205
|
+
source_mappings_config = self.config.source_mappings or []
|
|
190
206
|
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
if isinstance(source.dremio_source_type, str):
|
|
194
|
-
source_type = source.dremio_source_type.lower()
|
|
195
|
-
root_path = source.root_path.lower() if source.root_path else ""
|
|
196
|
-
database_name = (
|
|
197
|
-
source.database_name.lower() if source.database_name else ""
|
|
198
|
-
)
|
|
199
|
-
source_present = False
|
|
200
|
-
source_platform_name = source_name
|
|
201
|
-
|
|
202
|
-
for mapping in self.config.source_mappings or []:
|
|
203
|
-
if re.search(mapping.source_name, source_type, re.IGNORECASE):
|
|
204
|
-
source_platform_name = mapping.source_name.lower()
|
|
205
|
-
|
|
206
|
-
datahub_source_type = (
|
|
207
|
-
DremioToDataHubSourceTypeMapping.get_datahub_source_type(
|
|
208
|
-
source_type
|
|
209
|
-
)
|
|
210
|
-
)
|
|
211
|
-
|
|
212
|
-
if re.search(mapping.platform, datahub_source_type, re.IGNORECASE):
|
|
213
|
-
source_platform_name = source_platform_name.lower()
|
|
214
|
-
source_map[source_platform_name] = {
|
|
215
|
-
"platform": mapping.platform,
|
|
216
|
-
"source_name": mapping.source_name,
|
|
217
|
-
"dremio_source_type": DremioToDataHubSourceTypeMapping.get_category(
|
|
218
|
-
source_type,
|
|
219
|
-
),
|
|
220
|
-
"root_path": root_path,
|
|
221
|
-
"database_name": database_name,
|
|
222
|
-
"platform_instance": mapping.platform_instance,
|
|
223
|
-
"env": mapping.env,
|
|
224
|
-
}
|
|
225
|
-
source_present = True
|
|
226
|
-
break
|
|
227
|
-
|
|
228
|
-
if not source_present:
|
|
229
|
-
try:
|
|
230
|
-
dremio_source_type = (
|
|
231
|
-
DremioToDataHubSourceTypeMapping.get_category(source_type)
|
|
232
|
-
)
|
|
233
|
-
except Exception as exc:
|
|
234
|
-
logger.info(
|
|
235
|
-
f"Source {source_type} is not a standard Dremio source type. "
|
|
236
|
-
f"Adding source_type {source_type} to mapping as database. Error: {exc}"
|
|
237
|
-
)
|
|
238
|
-
|
|
239
|
-
DremioToDataHubSourceTypeMapping.add_mapping(
|
|
240
|
-
source_type, source_name
|
|
241
|
-
)
|
|
242
|
-
dremio_source_type = (
|
|
243
|
-
DremioToDataHubSourceTypeMapping.get_category(source_type)
|
|
244
|
-
)
|
|
245
|
-
|
|
246
|
-
source_map[source_platform_name.lower()] = {
|
|
247
|
-
"platform": source_type,
|
|
248
|
-
"source_name": source_name,
|
|
249
|
-
"dremio_source_type": dremio_source_type,
|
|
250
|
-
}
|
|
251
|
-
|
|
252
|
-
else:
|
|
253
|
-
logger.error(
|
|
254
|
-
f'Source "{source.container_name}" is broken. Containers will not be created for source.'
|
|
255
|
-
)
|
|
256
|
-
logger.error(
|
|
257
|
-
f'No new cross-platform lineage will be emitted for source "{source.container_name}".'
|
|
258
|
-
)
|
|
259
|
-
logger.error("Fix this source in Dremio to fix this issue.")
|
|
207
|
+
source_map = build_dremio_source_map(dremio_sources, source_mappings_config)
|
|
208
|
+
logger.info(f"Full source map: {source_map}")
|
|
260
209
|
|
|
261
210
|
return source_map
|
|
262
211
|
|
|
@@ -275,84 +224,88 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
275
224
|
|
|
276
225
|
self.source_map = self._build_source_map()
|
|
277
226
|
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
227
|
+
with self.report.new_stage(METADATA_EXTRACTION):
|
|
228
|
+
# Process Containers
|
|
229
|
+
containers = self.dremio_catalog.get_containers()
|
|
230
|
+
for container in containers:
|
|
231
|
+
try:
|
|
232
|
+
yield from self.process_container(container)
|
|
233
|
+
logger.info(
|
|
234
|
+
f"Dremio container {container.container_name} emitted successfully"
|
|
235
|
+
)
|
|
236
|
+
except Exception as exc:
|
|
237
|
+
self.report.num_containers_failed += 1
|
|
238
|
+
self.report.report_failure(
|
|
239
|
+
message="Failed to process Dremio container",
|
|
240
|
+
context=f"{'.'.join(container.path)}.{container.container_name}",
|
|
241
|
+
exc=exc,
|
|
242
|
+
)
|
|
293
243
|
|
|
294
|
-
|
|
295
|
-
|
|
244
|
+
# Process Datasets
|
|
245
|
+
datasets = self.dremio_catalog.get_datasets()
|
|
296
246
|
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
247
|
+
for dataset_info in datasets:
|
|
248
|
+
try:
|
|
249
|
+
yield from self.process_dataset(dataset_info)
|
|
250
|
+
logger.info(
|
|
251
|
+
f"Dremio dataset {'.'.join(dataset_info.path)}.{dataset_info.resource_name} emitted successfully"
|
|
252
|
+
)
|
|
253
|
+
except Exception as exc:
|
|
254
|
+
self.report.num_datasets_failed += 1 # Increment failed datasets
|
|
255
|
+
self.report.report_failure(
|
|
256
|
+
message="Failed to process Dremio dataset",
|
|
257
|
+
context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
|
|
258
|
+
exc=exc,
|
|
259
|
+
)
|
|
310
260
|
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
message="Failed to process Glossary terms",
|
|
324
|
-
context=f"{glossary_term.glossary_term}",
|
|
325
|
-
exc=exc,
|
|
326
|
-
)
|
|
261
|
+
# Process Glossary Terms
|
|
262
|
+
glossary_terms = self.dremio_catalog.get_glossary_terms()
|
|
263
|
+
|
|
264
|
+
for glossary_term in glossary_terms:
|
|
265
|
+
try:
|
|
266
|
+
yield from self.process_glossary_term(glossary_term)
|
|
267
|
+
except Exception as exc:
|
|
268
|
+
self.report.report_failure(
|
|
269
|
+
message="Failed to process Glossary terms",
|
|
270
|
+
context=f"{glossary_term.glossary_term}",
|
|
271
|
+
exc=exc,
|
|
272
|
+
)
|
|
327
273
|
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
274
|
+
# Optionally Process Query Lineage
|
|
275
|
+
if self.config.include_query_lineage:
|
|
276
|
+
with self.report.new_stage(LINEAGE_EXTRACTION):
|
|
277
|
+
self.get_query_lineage_workunits()
|
|
278
|
+
|
|
279
|
+
# Generate workunit for aggregated SQL parsing results
|
|
280
|
+
for mcp in self.sql_parsing_aggregator.gen_metadata():
|
|
281
|
+
yield mcp.as_workunit()
|
|
282
|
+
|
|
283
|
+
# Profiling
|
|
284
|
+
if self.config.is_profiling_enabled():
|
|
285
|
+
with (
|
|
286
|
+
self.report.new_high_stage(IngestionHighStage.PROFILING),
|
|
287
|
+
ThreadPoolExecutor(
|
|
288
|
+
max_workers=self.config.profiling.max_workers
|
|
289
|
+
) as executor,
|
|
290
|
+
):
|
|
291
|
+
future_to_dataset = {
|
|
292
|
+
executor.submit(self.generate_profiles, dataset): dataset
|
|
293
|
+
for dataset in datasets
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
for future in as_completed(future_to_dataset):
|
|
297
|
+
dataset_info = future_to_dataset[future]
|
|
298
|
+
try:
|
|
299
|
+
yield from future.result()
|
|
300
|
+
except Exception as exc:
|
|
301
|
+
self.report.profiling_skipped_other[
|
|
302
|
+
dataset_info.resource_name
|
|
303
|
+
] += 1
|
|
304
|
+
self.report.report_failure(
|
|
305
|
+
message="Failed to profile dataset",
|
|
306
|
+
context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
|
|
307
|
+
exc=exc,
|
|
308
|
+
)
|
|
356
309
|
|
|
357
310
|
def process_container(
|
|
358
311
|
self, container_info: DremioContainer
|
|
@@ -385,10 +338,10 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
385
338
|
return
|
|
386
339
|
|
|
387
340
|
dataset_urn = make_dataset_urn_with_platform_instance(
|
|
388
|
-
platform=
|
|
389
|
-
name=
|
|
390
|
-
env=self.config.env,
|
|
341
|
+
platform=self.get_platform(),
|
|
342
|
+
name=dataset_name,
|
|
391
343
|
platform_instance=self.config.platform_instance,
|
|
344
|
+
env=self.config.env,
|
|
392
345
|
)
|
|
393
346
|
|
|
394
347
|
for dremio_mcp in self.dremio_aspects.populate_dataset_mcp(
|
|
@@ -431,6 +384,7 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
431
384
|
dremio_path=dataset_info.path,
|
|
432
385
|
dremio_dataset=dataset_info.resource_name,
|
|
433
386
|
)
|
|
387
|
+
logger.debug(f"Upstream dataset for {dataset_urn}: {upstream_urn}")
|
|
434
388
|
|
|
435
389
|
if upstream_urn:
|
|
436
390
|
upstream_lineage = UpstreamLineage(
|
|
@@ -467,13 +421,12 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
467
421
|
schema_str = ".".join(dataset_info.path)
|
|
468
422
|
dataset_name = f"{schema_str}.{dataset_info.resource_name}".lower()
|
|
469
423
|
dataset_urn = make_dataset_urn_with_platform_instance(
|
|
470
|
-
platform=
|
|
471
|
-
name=
|
|
472
|
-
env=self.config.env,
|
|
424
|
+
platform=self.get_platform(),
|
|
425
|
+
name=dataset_name,
|
|
473
426
|
platform_instance=self.config.platform_instance,
|
|
427
|
+
env=self.config.env,
|
|
474
428
|
)
|
|
475
|
-
|
|
476
|
-
yield from self.profiler.get_workunits(dataset_info, dataset_urn)
|
|
429
|
+
yield from self.profiler.get_workunits(dataset_info, dataset_urn)
|
|
477
430
|
|
|
478
431
|
def generate_view_lineage(
|
|
479
432
|
self, dataset_urn: str, parents: List[str]
|
|
@@ -483,10 +436,10 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
483
436
|
"""
|
|
484
437
|
upstream_urns = [
|
|
485
438
|
make_dataset_urn_with_platform_instance(
|
|
486
|
-
platform=
|
|
487
|
-
name=
|
|
488
|
-
env=self.config.env,
|
|
439
|
+
platform=self.get_platform(),
|
|
440
|
+
name=upstream_table.lower(),
|
|
489
441
|
platform_instance=self.config.platform_instance,
|
|
442
|
+
env=self.config.env,
|
|
490
443
|
)
|
|
491
444
|
for upstream_table in parents
|
|
492
445
|
]
|
|
@@ -501,11 +454,8 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
501
454
|
]
|
|
502
455
|
)
|
|
503
456
|
mcp = MetadataChangeProposalWrapper(
|
|
504
|
-
entityType="dataset",
|
|
505
457
|
entityUrn=dataset_urn,
|
|
506
|
-
aspectName=lineage.ASPECT_NAME,
|
|
507
458
|
aspect=lineage,
|
|
508
|
-
changeType=ChangeTypeClass.UPSERT,
|
|
509
459
|
)
|
|
510
460
|
|
|
511
461
|
for upstream_urn in upstream_urns:
|
|
@@ -548,19 +498,19 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
548
498
|
if query.query and query.affected_dataset:
|
|
549
499
|
upstream_urns = [
|
|
550
500
|
make_dataset_urn_with_platform_instance(
|
|
551
|
-
platform=
|
|
552
|
-
name=
|
|
553
|
-
env=self.config.env,
|
|
501
|
+
platform=self.get_platform(),
|
|
502
|
+
name=ds.lower(),
|
|
554
503
|
platform_instance=self.config.platform_instance,
|
|
504
|
+
env=self.config.env,
|
|
555
505
|
)
|
|
556
506
|
for ds in query.queried_datasets
|
|
557
507
|
]
|
|
558
508
|
|
|
559
509
|
downstream_urn = make_dataset_urn_with_platform_instance(
|
|
560
|
-
platform=
|
|
561
|
-
name=
|
|
562
|
-
env=self.config.env,
|
|
510
|
+
platform=self.get_platform(),
|
|
511
|
+
name=query.affected_dataset.lower(),
|
|
563
512
|
platform_instance=self.config.platform_instance,
|
|
513
|
+
env=self.config.env,
|
|
564
514
|
)
|
|
565
515
|
|
|
566
516
|
# Add query to SqlParsingAggregator
|
|
@@ -596,25 +546,23 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
596
546
|
if not mapping:
|
|
597
547
|
return None
|
|
598
548
|
|
|
599
|
-
platform = mapping.
|
|
549
|
+
platform = mapping.platform
|
|
600
550
|
if not platform:
|
|
601
551
|
return None
|
|
602
552
|
|
|
603
|
-
platform_instance = mapping.
|
|
604
|
-
|
|
605
|
-
)
|
|
606
|
-
env = mapping.get("env", self.config.env)
|
|
553
|
+
platform_instance = mapping.platform_instance
|
|
554
|
+
env = mapping.env or self.config.env
|
|
607
555
|
|
|
608
556
|
root_path = ""
|
|
609
557
|
database_name = ""
|
|
610
558
|
|
|
611
|
-
if mapping.
|
|
612
|
-
if mapping.
|
|
613
|
-
root_path = f"{mapping
|
|
559
|
+
if mapping.dremio_source_category == "file_object_storage":
|
|
560
|
+
if mapping.root_path:
|
|
561
|
+
root_path = f"{mapping.root_path[1:]}/"
|
|
614
562
|
dremio_dataset = f"{root_path}{'/'.join(dremio_path[1:])}/{dremio_dataset}"
|
|
615
563
|
else:
|
|
616
|
-
if mapping.
|
|
617
|
-
database_name = f"{mapping
|
|
564
|
+
if mapping.database_name:
|
|
565
|
+
database_name = f"{mapping.database_name}."
|
|
618
566
|
dremio_dataset = (
|
|
619
567
|
f"{database_name}{'.'.join(dremio_path[1:])}.{dremio_dataset}"
|
|
620
568
|
)
|
|
@@ -639,3 +587,68 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
639
587
|
Get the source report.
|
|
640
588
|
"""
|
|
641
589
|
return self.report
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
def build_dremio_source_map(
    dremio_sources: Iterable[DremioSourceContainer],
    source_mappings_config: List[DremioSourceMapping],
) -> Dict[str, DremioSourceMapEntry]:
    """
    Create one `DremioSourceMapEntry` per Dremio source, keyed by the
    lowercased source (container) name, to support cross-platform lineage.

    For each source, the first configured mapping whose `source_name` equals
    the source's container name (compared case-insensitively) supplies the
    entry's `platform`, `source_name`, `platform_instance` and `env`. When no
    mapping matches, the platform is derived from the Dremio source type, the
    container name is used as `source_name`, and `platform_instance` / `env`
    are left as None.

    In both cases the entry carries the source category, plus the lowercased
    `root_path` and `database_name` of the source (empty string when unset).

    Args:
        dremio_sources: Source containers discovered in the Dremio catalog.
        source_mappings_config: User-configured source mappings; may be empty.

    Returns:
        Dict[str, DremioSourceMapEntry]: entries keyed by lowercased source
        name. If two sources share a lowercased name, the later one wins.
    """
    entries: Dict[str, DremioSourceMapEntry] = {}

    for dremio_source in dremio_sources:
        container_name = dremio_source.container_name

        type_key = dremio_source.dremio_source_type.lower()
        category = DremioToDataHubSourceTypeMapping.get_category(type_key)
        default_platform = DremioToDataHubSourceTypeMapping.get_datahub_platform(
            type_key
        )

        lowered_root = ""
        if dremio_source.root_path:
            lowered_root = dremio_source.root_path.lower()

        lowered_database = ""
        if dremio_source.database_name:
            lowered_database = dremio_source.database_name.lower()

        # Only the first matching mapping is honoured, mirroring a loop with `break`.
        configured = next(
            (
                candidate
                for candidate in source_mappings_config
                if candidate.source_name.lower() == container_name.lower()
            ),
            None,
        )

        if configured is None:
            # No user mapping: fall back to values derived from the source itself.
            entry = DremioSourceMapEntry(
                platform=default_platform,
                source_name=container_name,
                dremio_source_category=category,
                root_path=lowered_root,
                database_name=lowered_database,
                platform_instance=None,
                env=None,
            )
        else:
            entry = DremioSourceMapEntry(
                platform=configured.platform,
                source_name=configured.source_name,
                dremio_source_category=category,
                root_path=lowered_root,
                database_name=lowered_database,
                platform_instance=configured.platform_instance,
                env=configured.env,
            )

        entries[container_name.lower()] = entry

    return entries
|