acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -29,6 +29,7 @@ from datahub.ingestion.source.aws.s3_util import (
|
|
|
29
29
|
get_key_prefix,
|
|
30
30
|
strip_s3_prefix,
|
|
31
31
|
)
|
|
32
|
+
from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
|
|
32
33
|
from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
|
|
33
34
|
from datahub.ingestion.source.delta_lake.config import DeltaLakeSourceConfig
|
|
34
35
|
from datahub.ingestion.source.delta_lake.delta_lake_utils import (
|
|
@@ -85,6 +86,13 @@ OPERATION_STATEMENT_TYPES = {
|
|
|
85
86
|
@config_class(DeltaLakeSourceConfig)
|
|
86
87
|
@support_status(SupportStatus.INCUBATING)
|
|
87
88
|
@capability(SourceCapability.TAGS, "Can extract S3 object/bucket tags if enabled")
|
|
89
|
+
@capability(
|
|
90
|
+
SourceCapability.CONTAINERS,
|
|
91
|
+
"Enabled by default",
|
|
92
|
+
subtype_modifier=[
|
|
93
|
+
SourceCapabilityModifier.FOLDER,
|
|
94
|
+
],
|
|
95
|
+
)
|
|
88
96
|
class DeltaLakeSource(StatefulIngestionSourceBase):
|
|
89
97
|
"""
|
|
90
98
|
This plugin extracts:
|
|
@@ -7,7 +7,7 @@ from collections import defaultdict
|
|
|
7
7
|
from enum import Enum
|
|
8
8
|
from itertools import product
|
|
9
9
|
from time import sleep, time
|
|
10
|
-
from typing import Any, Deque, Dict, List, Optional, Union
|
|
10
|
+
from typing import TYPE_CHECKING, Any, Deque, Dict, List, Optional, Union
|
|
11
11
|
from urllib.parse import quote
|
|
12
12
|
|
|
13
13
|
import requests
|
|
@@ -15,12 +15,17 @@ from requests.adapters import HTTPAdapter
|
|
|
15
15
|
from urllib3 import Retry
|
|
16
16
|
from urllib3.exceptions import InsecureRequestWarning
|
|
17
17
|
|
|
18
|
+
from datahub.emitter.request_helper import make_curl_command
|
|
18
19
|
from datahub.ingestion.source.dremio.dremio_config import DremioSourceConfig
|
|
19
20
|
from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
|
|
20
21
|
DremioToDataHubSourceTypeMapping,
|
|
21
22
|
)
|
|
22
23
|
from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
|
|
23
24
|
from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries
|
|
25
|
+
from datahub.utilities.perf_timer import PerfTimer
|
|
26
|
+
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from datahub.ingestion.source.dremio.dremio_entities import DremioContainer
|
|
24
29
|
|
|
25
30
|
logger = logging.getLogger(__name__)
|
|
26
31
|
|
|
@@ -54,6 +59,8 @@ class DremioAPIOperations:
|
|
|
54
59
|
self.deny_schema_pattern: List[str] = connection_args.schema_pattern.deny
|
|
55
60
|
self._max_workers: int = connection_args.max_workers
|
|
56
61
|
self.is_dremio_cloud = connection_args.is_dremio_cloud
|
|
62
|
+
self.start_time = connection_args.start_time
|
|
63
|
+
self.end_time = connection_args.end_time
|
|
57
64
|
self.report = report
|
|
58
65
|
self.session = requests.Session()
|
|
59
66
|
if connection_args.is_dremio_cloud:
|
|
@@ -178,6 +185,7 @@ class DremioAPIOperations:
|
|
|
178
185
|
self.session.headers.update(
|
|
179
186
|
{"Authorization": f"Bearer {connection_args.password}"}
|
|
180
187
|
)
|
|
188
|
+
logger.debug("Configured Dremio cloud API session to use PAT")
|
|
181
189
|
return
|
|
182
190
|
|
|
183
191
|
# On-prem Dremio authentication (PAT or Basic Auth)
|
|
@@ -189,6 +197,7 @@ class DremioAPIOperations:
|
|
|
189
197
|
"Authorization": f"Bearer {connection_args.password}",
|
|
190
198
|
}
|
|
191
199
|
)
|
|
200
|
+
logger.debug("Configured Dremio API session to use PAT")
|
|
192
201
|
return
|
|
193
202
|
else:
|
|
194
203
|
assert connection_args.username and connection_args.password, (
|
|
@@ -212,10 +221,10 @@ class DremioAPIOperations:
|
|
|
212
221
|
response.raise_for_status()
|
|
213
222
|
token = response.json().get("token")
|
|
214
223
|
if token:
|
|
224
|
+
logger.debug("Exchanged username and password for Dremio token")
|
|
215
225
|
self.session.headers.update(
|
|
216
226
|
{"Authorization": f"_dremio{token}"}
|
|
217
227
|
)
|
|
218
|
-
|
|
219
228
|
return
|
|
220
229
|
else:
|
|
221
230
|
self.report.failure("Failed to authenticate", login_url)
|
|
@@ -231,49 +240,76 @@ class DremioAPIOperations:
|
|
|
231
240
|
"Credentials cannot be refreshed. Please check your username and password."
|
|
232
241
|
)
|
|
233
242
|
|
|
243
|
+
def _request(self, method: str, url: str, data: Union[str, None] = None) -> Dict:
|
|
244
|
+
"""Send a request to the Dremio API."""
|
|
245
|
+
|
|
246
|
+
logger.debug(f"{method} request to {self.base_url + url}")
|
|
247
|
+
self.report.api_calls_total += 1
|
|
248
|
+
self.report.api_calls_by_method_and_path[f"{method} {url}"] += 1
|
|
249
|
+
|
|
250
|
+
with PerfTimer() as timer:
|
|
251
|
+
response = self.session.request(
|
|
252
|
+
method=method,
|
|
253
|
+
url=(self.base_url + url),
|
|
254
|
+
data=data,
|
|
255
|
+
verify=self._verify,
|
|
256
|
+
timeout=self._timeout,
|
|
257
|
+
)
|
|
258
|
+
self.report.api_call_secs_by_method_and_path[f"{method} {url}"] += (
|
|
259
|
+
timer.elapsed_seconds()
|
|
260
|
+
)
|
|
261
|
+
# response.raise_for_status() # Enabling this line, makes integration tests to fail
|
|
262
|
+
try:
|
|
263
|
+
return response.json()
|
|
264
|
+
except requests.exceptions.JSONDecodeError as e:
|
|
265
|
+
logger.info(
|
|
266
|
+
f"On {method} request to {url}, failed to parse JSON from response (status {response.status_code}): {response.text}"
|
|
267
|
+
)
|
|
268
|
+
logger.debug(
|
|
269
|
+
f"Request curl equivalent: {make_curl_command(self.session, method, url, data)}"
|
|
270
|
+
)
|
|
271
|
+
raise DremioAPIException(
|
|
272
|
+
f"Failed to parse JSON from response (status {response.status_code}): {response.text}"
|
|
273
|
+
) from e
|
|
274
|
+
|
|
234
275
|
def get(self, url: str) -> Dict:
|
|
235
|
-
"""
|
|
236
|
-
|
|
237
|
-
url=(self.base_url + url),
|
|
238
|
-
verify=self._verify,
|
|
239
|
-
timeout=self._timeout,
|
|
240
|
-
)
|
|
241
|
-
return response.json()
|
|
276
|
+
"""Send a GET request to the Dremio API."""
|
|
277
|
+
return self._request("GET", url)
|
|
242
278
|
|
|
243
279
|
def post(self, url: str, data: str) -> Dict:
|
|
244
|
-
"""
|
|
245
|
-
|
|
246
|
-
url=(self.base_url + url),
|
|
247
|
-
data=data,
|
|
248
|
-
verify=self._verify,
|
|
249
|
-
timeout=self._timeout,
|
|
250
|
-
)
|
|
251
|
-
return response.json()
|
|
280
|
+
"""Send a POST request to the Dremio API."""
|
|
281
|
+
return self._request("POST", url, data=data)
|
|
252
282
|
|
|
253
283
|
def execute_query(self, query: str, timeout: int = 3600) -> List[Dict[str, Any]]:
|
|
254
284
|
"""Execute SQL query with timeout and error handling"""
|
|
255
285
|
try:
|
|
256
|
-
|
|
286
|
+
with PerfTimer() as timer:
|
|
287
|
+
logger.info(f"Executing query: {query}")
|
|
288
|
+
response = self.post(url="/sql", data=json.dumps({"sql": query}))
|
|
257
289
|
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
290
|
+
if "errorMessage" in response:
|
|
291
|
+
self.report.failure(
|
|
292
|
+
message="SQL Error", context=f"{response['errorMessage']}"
|
|
293
|
+
)
|
|
294
|
+
raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
|
|
263
295
|
|
|
264
|
-
|
|
296
|
+
job_id = response["id"]
|
|
265
297
|
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
298
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
|
|
299
|
+
future = executor.submit(self.fetch_results, job_id)
|
|
300
|
+
try:
|
|
301
|
+
result = future.result(timeout=timeout)
|
|
302
|
+
logger.info(
|
|
303
|
+
f"Query executed in {timer.elapsed_seconds()} seconds with {len(result)} results"
|
|
304
|
+
)
|
|
305
|
+
return result
|
|
306
|
+
except concurrent.futures.TimeoutError:
|
|
307
|
+
self.cancel_query(job_id)
|
|
308
|
+
raise DremioAPIException(
|
|
309
|
+
f"Query execution timed out after {timeout} seconds"
|
|
310
|
+
) from None
|
|
311
|
+
except RuntimeError as e:
|
|
312
|
+
raise DremioAPIException() from e
|
|
277
313
|
|
|
278
314
|
except requests.RequestException as e:
|
|
279
315
|
raise DremioAPIException("Error executing query") from e
|
|
@@ -462,7 +498,9 @@ class DremioAPIOperations:
|
|
|
462
498
|
pattern_str = "|".join(f"({p})" for p in patterns)
|
|
463
499
|
return f"AND {operator}({field}, '{pattern_str}')"
|
|
464
500
|
|
|
465
|
-
def get_all_tables_and_columns(
|
|
501
|
+
def get_all_tables_and_columns(
|
|
502
|
+
self, containers: Deque["DremioContainer"]
|
|
503
|
+
) -> List[Dict]:
|
|
466
504
|
if self.edition == DremioEdition.ENTERPRISE:
|
|
467
505
|
query_template = DremioSQLQueries.QUERY_DATASETS_EE
|
|
468
506
|
elif self.edition == DremioEdition.CLOUD:
|
|
@@ -603,10 +641,25 @@ class DremioAPIOperations:
|
|
|
603
641
|
return parents_list
|
|
604
642
|
|
|
605
643
|
def extract_all_queries(self) -> List[Dict[str, Any]]:
|
|
644
|
+
# Convert datetime objects to string format for SQL queries
|
|
645
|
+
start_timestamp_str = None
|
|
646
|
+
end_timestamp_str = None
|
|
647
|
+
|
|
648
|
+
if self.start_time:
|
|
649
|
+
start_timestamp_str = self.start_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
|
|
650
|
+
if self.end_time:
|
|
651
|
+
end_timestamp_str = self.end_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
|
|
652
|
+
|
|
606
653
|
if self.edition == DremioEdition.CLOUD:
|
|
607
|
-
jobs_query = DremioSQLQueries.
|
|
654
|
+
jobs_query = DremioSQLQueries.get_query_all_jobs_cloud(
|
|
655
|
+
start_timestamp_millis=start_timestamp_str,
|
|
656
|
+
end_timestamp_millis=end_timestamp_str,
|
|
657
|
+
)
|
|
608
658
|
else:
|
|
609
|
-
jobs_query = DremioSQLQueries.
|
|
659
|
+
jobs_query = DremioSQLQueries.get_query_all_jobs(
|
|
660
|
+
start_timestamp_millis=start_timestamp_str,
|
|
661
|
+
end_timestamp_millis=end_timestamp_str,
|
|
662
|
+
)
|
|
610
663
|
|
|
611
664
|
return self.execute_query(query=jobs_query)
|
|
612
665
|
|
|
@@ -685,6 +738,27 @@ class DremioAPIOperations:
|
|
|
685
738
|
|
|
686
739
|
return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)
|
|
687
740
|
|
|
741
|
+
def _could_match_pattern(self, pattern: str, path_components: List[str]) -> bool:
|
|
742
|
+
"""
|
|
743
|
+
Check if a container path could potentially match a schema pattern.
|
|
744
|
+
This handles hierarchical path matching for container filtering.
|
|
745
|
+
"""
|
|
746
|
+
if pattern == ".*":
|
|
747
|
+
return True
|
|
748
|
+
|
|
749
|
+
current_path = ".".join(path_components)
|
|
750
|
+
|
|
751
|
+
# Handle simple .* patterns (like "a.b.c.*")
|
|
752
|
+
if pattern.endswith(".*") and not any(c in pattern for c in "^$[](){}+?\\"):
|
|
753
|
+
# Simple dotstar pattern - check prefix matching
|
|
754
|
+
pattern_prefix = pattern[:-2] # Remove ".*"
|
|
755
|
+
return current_path.lower().startswith(
|
|
756
|
+
pattern_prefix.lower()
|
|
757
|
+
) or pattern_prefix.lower().startswith(current_path.lower())
|
|
758
|
+
else:
|
|
759
|
+
# Complex regex pattern - use existing regex matching logic
|
|
760
|
+
return self._check_pattern_match(pattern, [current_path], allow_prefix=True)
|
|
761
|
+
|
|
688
762
|
def should_include_container(self, path: List[str], name: str) -> bool:
|
|
689
763
|
"""
|
|
690
764
|
Helper method to check if a container should be included based on schema patterns.
|
|
@@ -711,41 +785,8 @@ class DremioAPIOperations:
|
|
|
711
785
|
|
|
712
786
|
# Check allow patterns
|
|
713
787
|
for pattern in self.allow_schema_pattern:
|
|
714
|
-
#
|
|
715
|
-
if
|
|
716
|
-
pattern_parts = pattern.split(".")
|
|
717
|
-
path_parts = path_components
|
|
718
|
-
|
|
719
|
-
# If pattern has exact same number of parts, check each component
|
|
720
|
-
if len(pattern_parts) == len(path_parts):
|
|
721
|
-
matches = True
|
|
722
|
-
for p_part, c_part in zip(pattern_parts, path_parts):
|
|
723
|
-
if p_part != "*" and p_part.lower() != c_part.lower():
|
|
724
|
-
matches = False
|
|
725
|
-
break
|
|
726
|
-
if matches:
|
|
727
|
-
self.report.report_container_scanned(full_path)
|
|
728
|
-
return True
|
|
729
|
-
# Otherwise check if current path is prefix match
|
|
730
|
-
else:
|
|
731
|
-
# Remove the trailing wildcard if present
|
|
732
|
-
if pattern_parts[-1] == "*":
|
|
733
|
-
pattern_parts = pattern_parts[:-1]
|
|
734
|
-
|
|
735
|
-
for i in range(len(path_parts)):
|
|
736
|
-
current_path = ".".join(path_parts[: i + 1])
|
|
737
|
-
pattern_prefix = ".".join(pattern_parts[: i + 1])
|
|
738
|
-
|
|
739
|
-
if pattern_prefix.startswith(current_path):
|
|
740
|
-
self.report.report_container_scanned(full_path)
|
|
741
|
-
return True
|
|
742
|
-
|
|
743
|
-
# Direct pattern matching
|
|
744
|
-
if self._check_pattern_match(
|
|
745
|
-
pattern=pattern,
|
|
746
|
-
paths=[full_path],
|
|
747
|
-
allow_prefix=True,
|
|
748
|
-
):
|
|
788
|
+
# Check if current path could potentially match this pattern
|
|
789
|
+
if self._could_match_pattern(pattern, path_components):
|
|
749
790
|
self.report.report_container_scanned(full_path)
|
|
750
791
|
return True
|
|
751
792
|
|
|
@@ -14,6 +14,7 @@ from datahub.emitter.mce_builder import (
|
|
|
14
14
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
15
15
|
from datahub.emitter.mcp_builder import ContainerKey
|
|
16
16
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
17
|
+
from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
|
|
17
18
|
from datahub.ingestion.source.dremio.dremio_entities import (
|
|
18
19
|
DremioContainer,
|
|
19
20
|
DremioDataset,
|
|
@@ -364,9 +365,9 @@ class DremioAspects:
|
|
|
364
365
|
) -> Optional[BrowsePathsV2Class]:
|
|
365
366
|
paths = []
|
|
366
367
|
|
|
367
|
-
if entity.subclass ==
|
|
368
|
+
if entity.subclass == DatasetContainerSubTypes.DREMIO_SPACE.value:
|
|
368
369
|
paths.append(BrowsePathEntryClass(id="Spaces"))
|
|
369
|
-
elif entity.subclass ==
|
|
370
|
+
elif entity.subclass == DatasetContainerSubTypes.DREMIO_SOURCE.value:
|
|
370
371
|
paths.append(BrowsePathEntryClass(id="Sources"))
|
|
371
372
|
if paths:
|
|
372
373
|
return BrowsePathsV2Class(path=paths)
|
|
@@ -4,11 +4,12 @@ from typing import List, Literal, Optional
|
|
|
4
4
|
import certifi
|
|
5
5
|
from pydantic import Field, validator
|
|
6
6
|
|
|
7
|
-
from datahub.configuration.common import AllowDenyPattern, ConfigModel
|
|
7
|
+
from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
|
|
8
8
|
from datahub.configuration.source_common import (
|
|
9
9
|
EnvConfigMixin,
|
|
10
10
|
PlatformInstanceConfigMixin,
|
|
11
11
|
)
|
|
12
|
+
from datahub.configuration.time_window_config import BaseTimeWindowConfig
|
|
12
13
|
from datahub.ingestion.source.ge_profiling_config import GEProfilingBaseConfig
|
|
13
14
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
14
15
|
StatefulStaleMetadataRemovalConfig,
|
|
@@ -99,10 +100,9 @@ class ProfileConfig(GEProfilingBaseConfig):
|
|
|
99
100
|
query_timeout: int = Field(
|
|
100
101
|
default=300, description="Time before cancelling Dremio profiling query"
|
|
101
102
|
)
|
|
102
|
-
include_field_median_value: bool = Field(
|
|
103
|
+
include_field_median_value: HiddenFromDocs[bool] = Field(
|
|
104
|
+
# Hidden because median causes a number of issues in Dremio.
|
|
103
105
|
default=False,
|
|
104
|
-
hidden_from_docs=True,
|
|
105
|
-
description="Median causes a number of issues in Dremio.",
|
|
106
106
|
)
|
|
107
107
|
|
|
108
108
|
|
|
@@ -118,6 +118,7 @@ class DremioSourceMapping(EnvConfigMixin, PlatformInstanceConfigMixin, ConfigMod
|
|
|
118
118
|
class DremioSourceConfig(
|
|
119
119
|
DremioConnectionConfig,
|
|
120
120
|
StatefulIngestionConfigBase,
|
|
121
|
+
BaseTimeWindowConfig,
|
|
121
122
|
EnvConfigMixin,
|
|
122
123
|
PlatformInstanceConfigMixin,
|
|
123
124
|
):
|
|
@@ -294,7 +294,7 @@ class DremioContainer:
|
|
|
294
294
|
)
|
|
295
295
|
|
|
296
296
|
|
|
297
|
-
class
|
|
297
|
+
class DremioSourceContainer(DremioContainer):
|
|
298
298
|
subclass: str = "Dremio Source"
|
|
299
299
|
dremio_source_type: str
|
|
300
300
|
root_path: Optional[str]
|
|
@@ -337,7 +337,7 @@ class DremioCatalog:
|
|
|
337
337
|
self.dremio_api = dremio_api
|
|
338
338
|
self.edition = dremio_api.edition
|
|
339
339
|
self.datasets: Deque[DremioDataset] = deque()
|
|
340
|
-
self.sources: Deque[
|
|
340
|
+
self.sources: Deque[DremioSourceContainer] = deque()
|
|
341
341
|
self.spaces: Deque[DremioSpace] = deque()
|
|
342
342
|
self.folders: Deque[DremioFolder] = deque()
|
|
343
343
|
self.glossary_terms: Deque[DremioGlossaryTerm] = deque()
|
|
@@ -380,12 +380,13 @@ class DremioCatalog:
|
|
|
380
380
|
container_type = container.get("container_type")
|
|
381
381
|
if container_type == DremioEntityContainerType.SOURCE:
|
|
382
382
|
self.sources.append(
|
|
383
|
-
|
|
383
|
+
DremioSourceContainer(
|
|
384
384
|
container_name=container.get("name"),
|
|
385
385
|
location_id=container.get("id"),
|
|
386
386
|
path=[],
|
|
387
387
|
api_operations=self.dremio_api,
|
|
388
|
-
dremio_source_type=container.get("source_type")
|
|
388
|
+
dremio_source_type=container.get("source_type")
|
|
389
|
+
or "unknown",
|
|
389
390
|
root_path=container.get("root_path"),
|
|
390
391
|
database_name=container.get("database_name"),
|
|
391
392
|
)
|
|
@@ -426,7 +427,7 @@ class DremioCatalog:
|
|
|
426
427
|
self.set_containers()
|
|
427
428
|
return deque(itertools.chain(self.sources, self.spaces, self.folders))
|
|
428
429
|
|
|
429
|
-
def get_sources(self) -> Deque[
|
|
430
|
+
def get_sources(self) -> Deque[DremioSourceContainer]:
|
|
430
431
|
self.set_containers()
|
|
431
432
|
return self.sources
|
|
432
433
|
|
|
@@ -1,22 +1,41 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
2
|
from datetime import datetime
|
|
3
|
+
from typing import Optional
|
|
3
4
|
|
|
4
5
|
from datahub.ingestion.source.sql.sql_report import SQLSourceReport
|
|
5
6
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
6
7
|
StaleEntityRemovalSourceReport,
|
|
7
8
|
)
|
|
8
|
-
from datahub.ingestion.source_report.
|
|
9
|
+
from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
|
|
10
|
+
from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
|
|
11
|
+
from datahub.utilities.stats_collections import (
|
|
12
|
+
TopKDict,
|
|
13
|
+
float_top_k_dict,
|
|
14
|
+
int_top_k_dict,
|
|
15
|
+
)
|
|
9
16
|
|
|
10
17
|
|
|
11
18
|
@dataclass
|
|
12
19
|
class DremioSourceReport(
|
|
13
|
-
SQLSourceReport,
|
|
20
|
+
SQLSourceReport,
|
|
21
|
+
StaleEntityRemovalSourceReport,
|
|
22
|
+
BaseTimeWindowReport,
|
|
14
23
|
):
|
|
15
24
|
num_containers_failed: int = 0
|
|
16
25
|
num_datasets_failed: int = 0
|
|
17
26
|
containers_scanned: int = 0
|
|
18
27
|
containers_filtered: int = 0
|
|
19
28
|
|
|
29
|
+
api_calls_total: int = 0
|
|
30
|
+
api_calls_by_method_and_path: TopKDict[str, int] = field(
|
|
31
|
+
default_factory=int_top_k_dict
|
|
32
|
+
)
|
|
33
|
+
api_call_secs_by_method_and_path: TopKDict[str, float] = field(
|
|
34
|
+
default_factory=float_top_k_dict
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
sql_aggregator: Optional[SqlAggregatorReport] = None
|
|
38
|
+
|
|
20
39
|
def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
|
|
21
40
|
# recording total combined latency is not very useful, keeping this method as a placeholder
|
|
22
41
|
# for future implementation of min / max / percentiles etc.
|