acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -4,11 +4,12 @@ from typing import List, Literal, Optional
|
|
|
4
4
|
import certifi
|
|
5
5
|
from pydantic import Field, validator
|
|
6
6
|
|
|
7
|
-
from datahub.configuration.common import AllowDenyPattern, ConfigModel
|
|
7
|
+
from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
|
|
8
8
|
from datahub.configuration.source_common import (
|
|
9
9
|
EnvConfigMixin,
|
|
10
10
|
PlatformInstanceConfigMixin,
|
|
11
11
|
)
|
|
12
|
+
from datahub.configuration.time_window_config import BaseTimeWindowConfig
|
|
12
13
|
from datahub.ingestion.source.ge_profiling_config import GEProfilingBaseConfig
|
|
13
14
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
14
15
|
StatefulStaleMetadataRemovalConfig,
|
|
@@ -99,10 +100,9 @@ class ProfileConfig(GEProfilingBaseConfig):
|
|
|
99
100
|
query_timeout: int = Field(
|
|
100
101
|
default=300, description="Time before cancelling Dremio profiling query"
|
|
101
102
|
)
|
|
102
|
-
include_field_median_value: bool = Field(
|
|
103
|
+
include_field_median_value: HiddenFromDocs[bool] = Field(
|
|
104
|
+
# Hidden because median causes a number of issues in Dremio.
|
|
103
105
|
default=False,
|
|
104
|
-
hidden_from_docs=True,
|
|
105
|
-
description="Median causes a number of issues in Dremio.",
|
|
106
106
|
)
|
|
107
107
|
|
|
108
108
|
|
|
@@ -118,6 +118,7 @@ class DremioSourceMapping(EnvConfigMixin, PlatformInstanceConfigMixin, ConfigMod
|
|
|
118
118
|
class DremioSourceConfig(
|
|
119
119
|
DremioConnectionConfig,
|
|
120
120
|
StatefulIngestionConfigBase,
|
|
121
|
+
BaseTimeWindowConfig,
|
|
121
122
|
EnvConfigMixin,
|
|
122
123
|
PlatformInstanceConfigMixin,
|
|
123
124
|
):
|
|
@@ -1,22 +1,41 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
2
|
from datetime import datetime
|
|
3
|
+
from typing import Optional
|
|
3
4
|
|
|
4
5
|
from datahub.ingestion.source.sql.sql_report import SQLSourceReport
|
|
5
6
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
6
7
|
StaleEntityRemovalSourceReport,
|
|
7
8
|
)
|
|
8
|
-
from datahub.ingestion.source_report.
|
|
9
|
+
from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
|
|
10
|
+
from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
|
|
11
|
+
from datahub.utilities.stats_collections import (
|
|
12
|
+
TopKDict,
|
|
13
|
+
float_top_k_dict,
|
|
14
|
+
int_top_k_dict,
|
|
15
|
+
)
|
|
9
16
|
|
|
10
17
|
|
|
11
18
|
@dataclass
|
|
12
19
|
class DremioSourceReport(
|
|
13
|
-
SQLSourceReport,
|
|
20
|
+
SQLSourceReport,
|
|
21
|
+
StaleEntityRemovalSourceReport,
|
|
22
|
+
BaseTimeWindowReport,
|
|
14
23
|
):
|
|
15
24
|
num_containers_failed: int = 0
|
|
16
25
|
num_datasets_failed: int = 0
|
|
17
26
|
containers_scanned: int = 0
|
|
18
27
|
containers_filtered: int = 0
|
|
19
28
|
|
|
29
|
+
api_calls_total: int = 0
|
|
30
|
+
api_calls_by_method_and_path: TopKDict[str, int] = field(
|
|
31
|
+
default_factory=int_top_k_dict
|
|
32
|
+
)
|
|
33
|
+
api_call_secs_by_method_and_path: TopKDict[str, float] = field(
|
|
34
|
+
default_factory=float_top_k_dict
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
sql_aggregator: Optional[SqlAggregatorReport] = None
|
|
38
|
+
|
|
20
39
|
def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
|
|
21
40
|
# recording total combined latency is not very useful, keeping this method as a placeholder
|
|
22
41
|
# for future implementation of min / max / percentiles etc.
|
|
@@ -22,6 +22,7 @@ from datahub.ingestion.api.source import (
|
|
|
22
22
|
SourceReport,
|
|
23
23
|
)
|
|
24
24
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
25
|
+
from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
|
|
25
26
|
from datahub.ingestion.source.dremio.dremio_api import (
|
|
26
27
|
DremioAPIOperations,
|
|
27
28
|
DremioEdition,
|
|
@@ -51,13 +52,17 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
|
51
52
|
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
52
53
|
StatefulIngestionSourceBase,
|
|
53
54
|
)
|
|
54
|
-
from datahub.ingestion.source_report.ingestion_stage import
|
|
55
|
+
from datahub.ingestion.source_report.ingestion_stage import (
|
|
56
|
+
LINEAGE_EXTRACTION,
|
|
57
|
+
METADATA_EXTRACTION,
|
|
58
|
+
IngestionHighStage,
|
|
59
|
+
)
|
|
55
60
|
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
|
|
56
61
|
DatasetLineageTypeClass,
|
|
57
62
|
UpstreamClass,
|
|
58
63
|
UpstreamLineage,
|
|
59
64
|
)
|
|
60
|
-
from datahub.metadata.schema_classes import
|
|
65
|
+
from datahub.metadata.schema_classes import SchemaMetadataClass
|
|
61
66
|
from datahub.metadata.urns import CorpUserUrn
|
|
62
67
|
from datahub.sql_parsing.sql_parsing_aggregator import (
|
|
63
68
|
KnownQueryLineageInfo,
|
|
@@ -82,13 +87,34 @@ class DremioSourceMapEntry:
|
|
|
82
87
|
@platform_name("Dremio")
|
|
83
88
|
@config_class(DremioSourceConfig)
|
|
84
89
|
@support_status(SupportStatus.CERTIFIED)
|
|
85
|
-
@capability(
|
|
90
|
+
@capability(
|
|
91
|
+
SourceCapability.CONTAINERS,
|
|
92
|
+
"Enabled by default",
|
|
93
|
+
subtype_modifier=[
|
|
94
|
+
SourceCapabilityModifier.DREMIO_SPACE,
|
|
95
|
+
SourceCapabilityModifier.DREMIO_SOURCE,
|
|
96
|
+
],
|
|
97
|
+
)
|
|
86
98
|
@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
|
|
87
99
|
@capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
|
|
88
100
|
@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
|
|
89
|
-
@capability(
|
|
101
|
+
@capability(
|
|
102
|
+
SourceCapability.LINEAGE_COARSE,
|
|
103
|
+
"Enabled by default",
|
|
104
|
+
subtype_modifier=[
|
|
105
|
+
SourceCapabilityModifier.TABLE,
|
|
106
|
+
],
|
|
107
|
+
)
|
|
108
|
+
@capability(
|
|
109
|
+
SourceCapability.LINEAGE_FINE,
|
|
110
|
+
"Extract column-level lineage",
|
|
111
|
+
subtype_modifier=[
|
|
112
|
+
SourceCapabilityModifier.TABLE,
|
|
113
|
+
],
|
|
114
|
+
)
|
|
90
115
|
@capability(SourceCapability.OWNERSHIP, "Enabled by default")
|
|
91
116
|
@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
|
|
117
|
+
@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
|
|
92
118
|
class DremioSource(StatefulIngestionSourceBase):
|
|
93
119
|
"""
|
|
94
120
|
This plugin integrates with Dremio to extract and ingest metadata into DataHub.
|
|
@@ -126,6 +152,13 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
126
152
|
self.default_db = "dremio"
|
|
127
153
|
self.config = config
|
|
128
154
|
self.report = DremioSourceReport()
|
|
155
|
+
|
|
156
|
+
# Set time window for query lineage extraction
|
|
157
|
+
self.report.window_start_time, self.report.window_end_time = (
|
|
158
|
+
self.config.start_time,
|
|
159
|
+
self.config.end_time,
|
|
160
|
+
)
|
|
161
|
+
|
|
129
162
|
self.source_map: Dict[str, DremioSourceMapEntry] = dict()
|
|
130
163
|
|
|
131
164
|
# Initialize API operations
|
|
@@ -154,6 +187,7 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
154
187
|
generate_operations=True,
|
|
155
188
|
usage_config=self.config.usage,
|
|
156
189
|
)
|
|
190
|
+
self.report.sql_aggregator = self.sql_parsing_aggregator.report
|
|
157
191
|
|
|
158
192
|
# For profiling
|
|
159
193
|
self.profiler = DremioProfiler(config, self.report, dremio_api)
|
|
@@ -190,84 +224,88 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
190
224
|
|
|
191
225
|
self.source_map = self._build_source_map()
|
|
192
226
|
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
227
|
+
with self.report.new_stage(METADATA_EXTRACTION):
|
|
228
|
+
# Process Containers
|
|
229
|
+
containers = self.dremio_catalog.get_containers()
|
|
230
|
+
for container in containers:
|
|
231
|
+
try:
|
|
232
|
+
yield from self.process_container(container)
|
|
233
|
+
logger.info(
|
|
234
|
+
f"Dremio container {container.container_name} emitted successfully"
|
|
235
|
+
)
|
|
236
|
+
except Exception as exc:
|
|
237
|
+
self.report.num_containers_failed += 1
|
|
238
|
+
self.report.report_failure(
|
|
239
|
+
message="Failed to process Dremio container",
|
|
240
|
+
context=f"{'.'.join(container.path)}.{container.container_name}",
|
|
241
|
+
exc=exc,
|
|
242
|
+
)
|
|
208
243
|
|
|
209
|
-
|
|
210
|
-
|
|
244
|
+
# Process Datasets
|
|
245
|
+
datasets = self.dremio_catalog.get_datasets()
|
|
211
246
|
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
247
|
+
for dataset_info in datasets:
|
|
248
|
+
try:
|
|
249
|
+
yield from self.process_dataset(dataset_info)
|
|
250
|
+
logger.info(
|
|
251
|
+
f"Dremio dataset {'.'.join(dataset_info.path)}.{dataset_info.resource_name} emitted successfully"
|
|
252
|
+
)
|
|
253
|
+
except Exception as exc:
|
|
254
|
+
self.report.num_datasets_failed += 1 # Increment failed datasets
|
|
255
|
+
self.report.report_failure(
|
|
256
|
+
message="Failed to process Dremio dataset",
|
|
257
|
+
context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
|
|
258
|
+
exc=exc,
|
|
259
|
+
)
|
|
225
260
|
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
self.get_query_lineage_workunits()
|
|
229
|
-
|
|
230
|
-
# Process Glossary Terms
|
|
231
|
-
glossary_terms = self.dremio_catalog.get_glossary_terms()
|
|
232
|
-
|
|
233
|
-
for glossary_term in glossary_terms:
|
|
234
|
-
try:
|
|
235
|
-
yield from self.process_glossary_term(glossary_term)
|
|
236
|
-
except Exception as exc:
|
|
237
|
-
self.report.report_failure(
|
|
238
|
-
message="Failed to process Glossary terms",
|
|
239
|
-
context=f"{glossary_term.glossary_term}",
|
|
240
|
-
exc=exc,
|
|
241
|
-
)
|
|
261
|
+
# Process Glossary Terms
|
|
262
|
+
glossary_terms = self.dremio_catalog.get_glossary_terms()
|
|
242
263
|
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
264
|
+
for glossary_term in glossary_terms:
|
|
265
|
+
try:
|
|
266
|
+
yield from self.process_glossary_term(glossary_term)
|
|
267
|
+
except Exception as exc:
|
|
268
|
+
self.report.report_failure(
|
|
269
|
+
message="Failed to process Glossary terms",
|
|
270
|
+
context=f"{glossary_term.glossary_term}",
|
|
271
|
+
exc=exc,
|
|
272
|
+
)
|
|
273
|
+
|
|
274
|
+
# Optionally Process Query Lineage
|
|
275
|
+
if self.config.include_query_lineage:
|
|
276
|
+
with self.report.new_stage(LINEAGE_EXTRACTION):
|
|
277
|
+
self.get_query_lineage_workunits()
|
|
278
|
+
|
|
279
|
+
# Generate workunit for aggregated SQL parsing results
|
|
280
|
+
for mcp in self.sql_parsing_aggregator.gen_metadata():
|
|
281
|
+
yield mcp.as_workunit()
|
|
282
|
+
|
|
283
|
+
# Profiling
|
|
284
|
+
if self.config.is_profiling_enabled():
|
|
285
|
+
with (
|
|
286
|
+
self.report.new_high_stage(IngestionHighStage.PROFILING),
|
|
287
|
+
ThreadPoolExecutor(
|
|
288
|
+
max_workers=self.config.profiling.max_workers
|
|
289
|
+
) as executor,
|
|
290
|
+
):
|
|
291
|
+
future_to_dataset = {
|
|
292
|
+
executor.submit(self.generate_profiles, dataset): dataset
|
|
293
|
+
for dataset in datasets
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
for future in as_completed(future_to_dataset):
|
|
297
|
+
dataset_info = future_to_dataset[future]
|
|
298
|
+
try:
|
|
299
|
+
yield from future.result()
|
|
300
|
+
except Exception as exc:
|
|
301
|
+
self.report.profiling_skipped_other[
|
|
302
|
+
dataset_info.resource_name
|
|
303
|
+
] += 1
|
|
304
|
+
self.report.report_failure(
|
|
305
|
+
message="Failed to profile dataset",
|
|
306
|
+
context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
|
|
307
|
+
exc=exc,
|
|
308
|
+
)
|
|
271
309
|
|
|
272
310
|
def process_container(
|
|
273
311
|
self, container_info: DremioContainer
|
|
@@ -300,10 +338,10 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
300
338
|
return
|
|
301
339
|
|
|
302
340
|
dataset_urn = make_dataset_urn_with_platform_instance(
|
|
303
|
-
platform=
|
|
304
|
-
name=
|
|
305
|
-
env=self.config.env,
|
|
341
|
+
platform=self.get_platform(),
|
|
342
|
+
name=dataset_name,
|
|
306
343
|
platform_instance=self.config.platform_instance,
|
|
344
|
+
env=self.config.env,
|
|
307
345
|
)
|
|
308
346
|
|
|
309
347
|
for dremio_mcp in self.dremio_aspects.populate_dataset_mcp(
|
|
@@ -383,13 +421,12 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
383
421
|
schema_str = ".".join(dataset_info.path)
|
|
384
422
|
dataset_name = f"{schema_str}.{dataset_info.resource_name}".lower()
|
|
385
423
|
dataset_urn = make_dataset_urn_with_platform_instance(
|
|
386
|
-
platform=
|
|
387
|
-
name=
|
|
388
|
-
env=self.config.env,
|
|
424
|
+
platform=self.get_platform(),
|
|
425
|
+
name=dataset_name,
|
|
389
426
|
platform_instance=self.config.platform_instance,
|
|
427
|
+
env=self.config.env,
|
|
390
428
|
)
|
|
391
|
-
|
|
392
|
-
yield from self.profiler.get_workunits(dataset_info, dataset_urn)
|
|
429
|
+
yield from self.profiler.get_workunits(dataset_info, dataset_urn)
|
|
393
430
|
|
|
394
431
|
def generate_view_lineage(
|
|
395
432
|
self, dataset_urn: str, parents: List[str]
|
|
@@ -399,10 +436,10 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
399
436
|
"""
|
|
400
437
|
upstream_urns = [
|
|
401
438
|
make_dataset_urn_with_platform_instance(
|
|
402
|
-
platform=
|
|
403
|
-
name=
|
|
404
|
-
env=self.config.env,
|
|
439
|
+
platform=self.get_platform(),
|
|
440
|
+
name=upstream_table.lower(),
|
|
405
441
|
platform_instance=self.config.platform_instance,
|
|
442
|
+
env=self.config.env,
|
|
406
443
|
)
|
|
407
444
|
for upstream_table in parents
|
|
408
445
|
]
|
|
@@ -417,11 +454,8 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
417
454
|
]
|
|
418
455
|
)
|
|
419
456
|
mcp = MetadataChangeProposalWrapper(
|
|
420
|
-
entityType="dataset",
|
|
421
457
|
entityUrn=dataset_urn,
|
|
422
|
-
aspectName=lineage.ASPECT_NAME,
|
|
423
458
|
aspect=lineage,
|
|
424
|
-
changeType=ChangeTypeClass.UPSERT,
|
|
425
459
|
)
|
|
426
460
|
|
|
427
461
|
for upstream_urn in upstream_urns:
|
|
@@ -464,19 +498,19 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
464
498
|
if query.query and query.affected_dataset:
|
|
465
499
|
upstream_urns = [
|
|
466
500
|
make_dataset_urn_with_platform_instance(
|
|
467
|
-
platform=
|
|
468
|
-
name=
|
|
469
|
-
env=self.config.env,
|
|
501
|
+
platform=self.get_platform(),
|
|
502
|
+
name=ds.lower(),
|
|
470
503
|
platform_instance=self.config.platform_instance,
|
|
504
|
+
env=self.config.env,
|
|
471
505
|
)
|
|
472
506
|
for ds in query.queried_datasets
|
|
473
507
|
]
|
|
474
508
|
|
|
475
509
|
downstream_urn = make_dataset_urn_with_platform_instance(
|
|
476
|
-
platform=
|
|
477
|
-
name=
|
|
478
|
-
env=self.config.env,
|
|
510
|
+
platform=self.get_platform(),
|
|
511
|
+
name=query.affected_dataset.lower(),
|
|
479
512
|
platform_instance=self.config.platform_instance,
|
|
513
|
+
env=self.config.env,
|
|
480
514
|
)
|
|
481
515
|
|
|
482
516
|
# Add query to SqlParsingAggregator
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
from datetime import datetime, timedelta
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
|
|
1
5
|
class DremioSQLQueries:
|
|
2
6
|
QUERY_DATASETS_CE = """
|
|
3
7
|
SELECT* FROM
|
|
@@ -235,28 +239,83 @@ class DremioSQLQueries:
|
|
|
235
239
|
TABLE_NAME ASC
|
|
236
240
|
"""
|
|
237
241
|
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
242
|
+
@staticmethod
def _get_default_start_timestamp_millis() -> str:
    """Return the default start of the job-query window as a timestamp string.

    The default window starts one day before the current UTC time.

    Returns:
        A string formatted as 'YYYY-MM-DD HH:MM:SS.mmm' (millisecond
        precision, no timezone suffix).
    """
    # Function-scope import: the module-level import only brings in
    # `datetime` and `timedelta`.
    from datetime import timezone

    # Use UTC instead of the host's local clock so the bound does not shift
    # with the machine's timezone.
    # NOTE(review): assumes Dremio's submitted_ts values are UTC - confirm.
    one_day_ago = datetime.now(tz=timezone.utc) - timedelta(days=1)
    # %f renders six microsecond digits; dropping the last three leaves
    # milliseconds.
    return one_day_ago.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
|
|
249
|
+
|
|
250
|
+
@staticmethod
def _get_default_end_timestamp_millis() -> str:
    """Return the default end of the job-query window as a timestamp string.

    The default window ends at the current UTC time.

    Returns:
        A string formatted as 'YYYY-MM-DD HH:MM:SS.mmm' (millisecond
        precision, no timezone suffix).
    """
    # Function-scope import: the module-level import only brings in
    # `datetime` and `timedelta`.
    from datetime import timezone

    # Use UTC instead of the host's local clock so the bound does not shift
    # with the machine's timezone.
    # NOTE(review): assumes Dremio's submitted_ts values are UTC - confirm.
    now = datetime.now(tz=timezone.utc)
    # %f renders six microsecond digits; dropping the last three leaves
    # milliseconds.
    return now.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
|
|
255
|
+
|
|
256
|
+
@staticmethod
def get_query_all_jobs(
    start_timestamp_millis: Optional[str] = None,
    end_timestamp_millis: Optional[str] = None,
) -> str:
    """
    Build the SQL that selects jobs from SYS.JOBS_RECENT within a time window.

    The query keeps only rows with STATUS = 'COMPLETED', a non-empty
    queried_datasets value, a user other than '$dremio$', and a query_type
    not matching '%INTERNAL%'. A time window is always applied; both bounds
    are inclusive.

    Args:
        start_timestamp_millis: Start timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to 1 day ago)
        end_timestamp_millis: End timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to now)

    Returns:
        SQL query string with time filtering applied

    Raises:
        ValueError: If either timestamp contains a single quote, which would
            terminate the SQL TIMESTAMP literal it is embedded in.
    """
    if start_timestamp_millis is None:
        start_timestamp_millis = (
            DremioSQLQueries._get_default_start_timestamp_millis()
        )
    if end_timestamp_millis is None:
        end_timestamp_millis = DremioSQLQueries._get_default_end_timestamp_millis()

    # The values are interpolated into single-quoted SQL literals below, so a
    # quote character would break out of the literal. Reject it up front
    # instead of sending malformed (or injected) SQL to the server.
    for label, value in (
        ("start_timestamp_millis", start_timestamp_millis),
        ("end_timestamp_millis", end_timestamp_millis),
    ):
        if "'" in value:
            raise ValueError(
                f"{label} must not contain a single quote: {value!r}"
            )

    return f"""
        SELECT
            job_id,
            user_name,
            submitted_ts,
            query,
            queried_datasets
        FROM
            SYS.JOBS_RECENT
        WHERE
            STATUS = 'COMPLETED'
            AND LENGTH(queried_datasets)>0
            AND user_name != '$dremio$'
            AND query_type not like '%INTERNAL%'
            AND submitted_ts >= TIMESTAMP '{start_timestamp_millis}'
            AND submitted_ts <= TIMESTAMP '{end_timestamp_millis}'
        """
|
|
295
|
+
|
|
296
|
+
@staticmethod
|
|
297
|
+
def get_query_all_jobs_cloud(
|
|
298
|
+
start_timestamp_millis: Optional[str] = None,
|
|
299
|
+
end_timestamp_millis: Optional[str] = None,
|
|
300
|
+
) -> str:
|
|
301
|
+
"""
|
|
302
|
+
Get query for all jobs in Dremio Cloud with optional time filtering.
|
|
303
|
+
|
|
304
|
+
Args:
|
|
305
|
+
start_timestamp_millis: Start timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to 1 day ago)
|
|
306
|
+
end_timestamp_millis: End timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to now)
|
|
307
|
+
|
|
308
|
+
Returns:
|
|
309
|
+
SQL query string with time filtering applied
|
|
310
|
+
"""
|
|
311
|
+
if start_timestamp_millis is None:
|
|
312
|
+
start_timestamp_millis = (
|
|
313
|
+
DremioSQLQueries._get_default_start_timestamp_millis()
|
|
314
|
+
)
|
|
315
|
+
if end_timestamp_millis is None:
|
|
316
|
+
end_timestamp_millis = DremioSQLQueries._get_default_end_timestamp_millis()
|
|
256
317
|
|
|
257
|
-
|
|
258
|
-
# queried_datasets correctly documented as [varchar]
|
|
259
|
-
QUERY_ALL_JOBS_CLOUD = """
|
|
318
|
+
return f"""
|
|
260
319
|
SELECT
|
|
261
320
|
job_id,
|
|
262
321
|
user_name,
|
|
@@ -270,6 +329,8 @@ class DremioSQLQueries:
|
|
|
270
329
|
AND ARRAY_SIZE(queried_datasets)>0
|
|
271
330
|
AND user_name != '$dremio$'
|
|
272
331
|
AND query_type not like '%INTERNAL%'
|
|
332
|
+
AND submitted_ts >= TIMESTAMP '{start_timestamp_millis}'
|
|
333
|
+
AND submitted_ts <= TIMESTAMP '{end_timestamp_millis}'
|
|
273
334
|
"""
|
|
274
335
|
|
|
275
336
|
QUERY_TYPES = [
|
|
@@ -12,7 +12,7 @@ from typing import (
|
|
|
12
12
|
Union,
|
|
13
13
|
)
|
|
14
14
|
|
|
15
|
-
from pydantic
|
|
15
|
+
from pydantic import Field, PositiveInt
|
|
16
16
|
|
|
17
17
|
from datahub.configuration.common import AllowDenyPattern
|
|
18
18
|
from datahub.configuration.source_common import DatasetSourceConfigMixin
|
|
@@ -73,7 +73,6 @@ from datahub.utilities.registries.domain_registry import DomainRegistry
|
|
|
73
73
|
|
|
74
74
|
MAX_ITEMS_TO_RETRIEVE = 100
|
|
75
75
|
PAGE_SIZE = 100
|
|
76
|
-
MAX_SCHEMA_SIZE = 300
|
|
77
76
|
MAX_PRIMARY_KEYS_SIZE = 100
|
|
78
77
|
FIELD_DELIMITER = "."
|
|
79
78
|
|
|
@@ -107,6 +106,10 @@ class DynamoDBConfig(
|
|
|
107
106
|
'Refer "Advanced Configurations" section for more details',
|
|
108
107
|
)
|
|
109
108
|
|
|
109
|
+
max_schema_size: PositiveInt = Field(
|
|
110
|
+
default=300, description="Maximum number of fields to include in the schema."
|
|
111
|
+
)
|
|
112
|
+
|
|
110
113
|
table_pattern: AllowDenyPattern = Field(
|
|
111
114
|
default=AllowDenyPattern.allow_all(),
|
|
112
115
|
description="Regex patterns for tables to filter in ingestion. The table name format is 'region.table'",
|
|
@@ -160,7 +163,7 @@ _attribute_type_to_field_type_mapping: Dict[str, Type] = {
|
|
|
160
163
|
|
|
161
164
|
@platform_name("DynamoDB", id="dynamodb")
|
|
162
165
|
@config_class(DynamoDBConfig)
|
|
163
|
-
@support_status(SupportStatus.
|
|
166
|
+
@support_status(SupportStatus.INCUBATING)
|
|
164
167
|
@capability(
|
|
165
168
|
SourceCapability.PLATFORM_INSTANCE,
|
|
166
169
|
"By default, platform_instance will use the AWS account id",
|
|
@@ -455,25 +458,25 @@ class DynamoDBSource(StatefulIngestionSourceBase):
|
|
|
455
458
|
) -> SchemaMetadataClass:
|
|
456
459
|
""" "
|
|
457
460
|
To construct the schema metadata, it will first sort the schema by the occurrence of attribute names
|
|
458
|
-
in descending order and truncate the schema by
|
|
461
|
+
in descending order and truncate the schema by max_schema_size, and then start to construct the
|
|
459
462
|
schema metadata sorted by attribute name
|
|
460
463
|
"""
|
|
461
464
|
|
|
462
465
|
canonical_schema: List[SchemaField] = []
|
|
463
466
|
schema_size = len(schema.values())
|
|
464
467
|
table_fields = list(schema.values())
|
|
465
|
-
if schema_size >
|
|
468
|
+
if schema_size > self.config.max_schema_size:
|
|
466
469
|
# downsample the schema, using frequency as the sort key
|
|
467
470
|
self.report.report_warning(
|
|
468
471
|
title="Schema Size Too Large",
|
|
469
|
-
message=f"Downsampling the table schema because
|
|
472
|
+
message=f"Downsampling the table schema because `max_schema_size` threshold is {self.config.max_schema_size}",
|
|
470
473
|
context=f"Collection: {dataset_urn}",
|
|
471
474
|
)
|
|
472
475
|
|
|
473
476
|
# Add this information to the custom properties so user can know they are looking at down sampled schema
|
|
474
477
|
dataset_properties.customProperties["schema.downsampled"] = "True"
|
|
475
478
|
dataset_properties.customProperties["schema.totalFields"] = f"{schema_size}"
|
|
476
|
-
# append each schema field, schema will be sorted by count descending and delimited_name ascending and sliced to only include
|
|
479
|
+
# append each schema field, schema will be sorted by count descending and delimited_name ascending and sliced to only include max_schema_size items
|
|
477
480
|
primary_keys = []
|
|
478
481
|
for schema_field in sorted(
|
|
479
482
|
table_fields,
|
|
@@ -481,7 +484,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
|
|
|
481
484
|
-x["count"],
|
|
482
485
|
x["delimited_name"],
|
|
483
486
|
), # Negate `count` for descending order, `delimited_name` stays the same for ascending
|
|
484
|
-
)[
|
|
487
|
+
)[: self.config.max_schema_size]:
|
|
485
488
|
field_path = schema_field["delimited_name"]
|
|
486
489
|
native_data_type = self.get_native_type(schema_field["type"], table_name)
|
|
487
490
|
type = self.get_field_type(schema_field["type"], table_name)
|
|
File without changes
|