acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/unity/tag_entities.py
@@ -0,0 +1,197 @@
+import logging
+from typing import TYPE_CHECKING, List, Optional
+
+if TYPE_CHECKING:
+    from datahub.ingestion.source.unity.platform_resource_repository import (
+        UnityCatalogPlatformResourceRepository,
+    )
+
+from pydantic import BaseModel
+
+from datahub.api.entities.external.external_entities import (
+    ExternalEntity,
+    ExternalEntityId,
+    LinkedResourceSet,
+)
+from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
+from datahub.api.entities.platformresource.platform_resource import (
+    PlatformResource,
+    PlatformResourceKey,
+)
+from datahub.ingestion.graph.client import DataHubGraph
+from datahub.metadata.urns import TagUrn
+from datahub.utilities.urns.urn import Urn
+
+
+class UnityCatalogTagSyncContext(BaseModel):
+    # it is intentionally empty
+    platform_instance: Optional[str] = None
+
+
+logger = logging.getLogger(__name__)
+
+
+class UnityCatalogTagPlatformResourceId(ExternalEntityId):
+    """
+    A Unity Catalog tag platform resource ID.
+    """
+
+    tag_key: str
+    tag_value: Optional[str] = None
+    platform_instance: Optional[str] = None
+    exists_in_unity_catalog: bool = False
+    persisted: bool = False
+
+    # this is a hack to make sure the property is a string and not private pydantic field
+    @staticmethod
+    def _RESOURCE_TYPE() -> str:
+        return "UnityCatalogTagPlatformResource"
+
+    def to_platform_resource_key(self) -> PlatformResourceKey:
+        return PlatformResourceKey(
+            platform="databricks",
+            resource_type=str(UnityCatalogTagPlatformResourceId._RESOURCE_TYPE()),
+            primary_key=f"{self.tag_key}:{self.tag_value}",
+            platform_instance=self.platform_instance,
+        )
+
+    @classmethod
+    def get_or_create_from_tag(
+        cls,
+        tag: UnityCatalogTag,
+        platform_resource_repository: "UnityCatalogPlatformResourceRepository",
+        exists_in_unity_catalog: bool = False,
+    ) -> "UnityCatalogTagPlatformResourceId":
+        """
+        Creates a UnityCatalogTagPlatformResourceId from a UnityCatalogTag.
+        """
+
+        existing_platform_resource = platform_resource_repository.search_entity_by_urn(
+            tag.to_datahub_tag_urn().urn()
+        )
+        if existing_platform_resource:
+            logger.debug(
+                f"Found existing UnityCatalogTagPlatformResourceId for tag {tag.key.raw_text}: {existing_platform_resource}"
+            )
+            return existing_platform_resource
+
+        return UnityCatalogTagPlatformResourceId(
+            tag_key=tag.key.raw_text,
+            tag_value=tag.value.raw_text if tag.value is not None else None,
+            platform_instance=platform_resource_repository.platform_instance,
+            exists_in_unity_catalog=exists_in_unity_catalog,
+            persisted=False,
+        )
+
+    @classmethod
+    def from_datahub_urn(
+        cls,
+        urn: str,
+        tag_sync_context: UnityCatalogTagSyncContext,
+        platform_resource_repository: "UnityCatalogPlatformResourceRepository",
+        graph: DataHubGraph,
+    ) -> "UnityCatalogTagPlatformResourceId":
+        """
+        Creates a UnityCatalogTagPlatformResourceId from a DataHub URN.
+        """
+        existing_platform_resource_id = (
+            platform_resource_repository.search_entity_by_urn(urn)
+        )
+        if existing_platform_resource_id:
+            return existing_platform_resource_id
+
+        new_unity_catalog_tag_id = cls.generate_tag_id(graph, tag_sync_context, urn)
+        if new_unity_catalog_tag_id:
+            resource_key = platform_resource_repository.get(
+                new_unity_catalog_tag_id.to_platform_resource_key()
+            )
+            if resource_key:
+                # Create a new ID with the correct state instead of mutating
+                return UnityCatalogTagPlatformResourceId(
+                    tag_key=new_unity_catalog_tag_id.tag_key,
+                    tag_value=new_unity_catalog_tag_id.tag_value,
+                    platform_instance=new_unity_catalog_tag_id.platform_instance,
+                    exists_in_unity_catalog=True,  # This tag exists in Unity Catalog
+                    persisted=new_unity_catalog_tag_id.persisted,
+                )
+            return new_unity_catalog_tag_id
+        raise ValueError(
+            f"Unable to create Unity Catalog tag ID from DataHub URN: {urn}"
+        )
+
+    @classmethod
+    def generate_tag_id(
+        cls, graph: DataHubGraph, tag_sync_context: UnityCatalogTagSyncContext, urn: str
+    ) -> "UnityCatalogTagPlatformResourceId":
+        parsed_urn = Urn.from_string(urn)
+        entity_type = parsed_urn.entity_type
+        if entity_type == "tag":
+            return UnityCatalogTagPlatformResourceId.from_datahub_tag(
+                TagUrn.from_string(urn), tag_sync_context
+            )
+        else:
+            raise ValueError(f"Unsupported entity type {entity_type} for URN {urn}")
+
+    @classmethod
+    def from_datahub_tag(
+        cls, tag_urn: TagUrn, tag_sync_context: UnityCatalogTagSyncContext
+    ) -> "UnityCatalogTagPlatformResourceId":
+        uc_tag = UnityCatalogTag.from_urn(tag_urn)
+
+        return UnityCatalogTagPlatformResourceId(
+            tag_key=str(uc_tag.key),
+            tag_value=str(uc_tag.value) if uc_tag.value is not None else None,
+            platform_instance=tag_sync_context.platform_instance,
+            exists_in_unity_catalog=False,
+        )
+
+
+class UnityCatalogTagPlatformResource(ExternalEntity):
+    datahub_urns: LinkedResourceSet
+    managed_by_datahub: bool
+    id: UnityCatalogTagPlatformResourceId
+    allowed_values: Optional[List[str]] = None
+
+    def get_id(self) -> ExternalEntityId:
+        return self.id
+
+    def is_managed_by_datahub(self) -> bool:
+        return self.managed_by_datahub
+
+    def datahub_linked_resources(self) -> LinkedResourceSet:
+        return self.datahub_urns
+
+    def as_platform_resource(self) -> PlatformResource:
+        return PlatformResource.create(
+            key=self.id.to_platform_resource_key(),
+            secondary_keys=[u for u in self.datahub_urns.urns],
+            value=self,
+        )
+
+    @classmethod
+    def create_default(
+        cls,
+        entity_id: ExternalEntityId,
+        managed_by_datahub: bool,
+    ) -> "UnityCatalogTagPlatformResource":
+        """Create a default Unity Catalog tag entity when none found in DataHub."""
+        # Type narrowing: we know this will be a UnityCatalogTagPlatformResourceId
+        assert isinstance(entity_id, UnityCatalogTagPlatformResourceId), (
+            f"Expected UnityCatalogTagPlatformResourceId, got {type(entity_id)}"
+        )
+
+        # Create a new entity ID with correct default state instead of mutating
+        default_entity_id = UnityCatalogTagPlatformResourceId(
+            tag_key=entity_id.tag_key,
+            tag_value=entity_id.tag_value,
+            platform_instance=entity_id.platform_instance,
+            exists_in_unity_catalog=False,  # New entities don't exist in Unity Catalog yet
+            persisted=False,  # New entities are not persisted yet
+        )
+
+        return cls(
+            id=default_entity_id,
+            datahub_urns=LinkedResourceSet(urns=[]),
+            managed_by_datahub=managed_by_datahub,
+            allowed_values=None,
+        )
datahub/ingestion/source/unity/usage.py
@@ -11,7 +11,10 @@ from databricks.sdk.service.sql import QueryStatementType
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.source_helpers import auto_empty_dataset_usage_statistics
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.unity.config import UnityCatalogSourceConfig
+from datahub.ingestion.source.unity.config import (
+    UnityCatalogSourceConfig,
+    UsageDataSource,
+)
 from datahub.ingestion.source.unity.proxy import UnityCatalogApiProxy
 from datahub.ingestion.source.unity.proxy_types import (
     OPERATION_STATEMENT_TYPES,
@@ -164,11 +167,50 @@ class UnityCatalogUsageExtractor:
             aspect=operation_aspect,
         ).as_workunit()
 
+    def _validate_usage_data_source_config(self) -> None:
+        """Validate usage data source configuration before execution."""
+        usage_data_source = self.config.usage_data_source
+
+        if (
+            usage_data_source == UsageDataSource.SYSTEM_TABLES
+            and not self.proxy.warehouse_id
+        ):
+            raise ValueError(
+                "usage_data_source is set to SYSTEM_TABLES but warehouse_id is not configured. "
+                "Either set warehouse_id or use AUTO/API mode."
+            )
+
     def _get_queries(self) -> Iterable[Query]:
         try:
-            yield from self.proxy.query_history(
-                self.config.start_time, self.config.end_time
-            )
+            self._validate_usage_data_source_config()
+            usage_data_source = self.config.usage_data_source
+
+            if usage_data_source == UsageDataSource.AUTO:
+                if self.proxy.warehouse_id:
+                    logger.info(
+                        "Using system tables for usage query history (AUTO mode)"
+                    )
+                    yield from self.proxy.get_query_history_via_system_tables(
+                        self.config.start_time, self.config.end_time
+                    )
+                else:
+                    logger.info(
+                        "Using API for usage query history (AUTO mode, no warehouse)"
+                    )
+                    yield from self.proxy.query_history(
+                        self.config.start_time, self.config.end_time
+                    )
+            elif usage_data_source == UsageDataSource.SYSTEM_TABLES:
+                logger.info("Using system tables for usage query history (forced)")
+                yield from self.proxy.get_query_history_via_system_tables(
+                    self.config.start_time, self.config.end_time
+                )
+            elif usage_data_source == UsageDataSource.API:
+                logger.info("Using API for usage query history (forced)")
+                yield from self.proxy.query_history(
+                    self.config.start_time, self.config.end_time
+                )
+
         except Exception as e:
             logger.warning("Error getting queries", exc_info=True)
             self.report.report_warning("get-queries", str(e))
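In effect, the new validation makes the forced SYSTEM_TABLES mode fail fast when no SQL warehouse is configured; a sketch of the observable behavior (the extractor setup is hypothetical, the names come from the hunk above):

    from datahub.ingestion.source.unity.config import UsageDataSource

    extractor.config.usage_data_source = UsageDataSource.SYSTEM_TABLES
    # With extractor.proxy.warehouse_id unset, _validate_usage_data_source_config
    # raises ValueError; _get_queries catches it and records it via
    # report.report_warning("get-queries", ...) instead of yielding queries.
    queries = list(extractor._get_queries())  # -> []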
datahub/ingestion/source/usage/clickhouse_usage.py
@@ -85,8 +85,11 @@ class ClickHouseUsageConfig(ClickHouseConfig, BaseUsageConfig, EnvConfigMixin):
 @platform_name("ClickHouse")
 @config_class(ClickHouseUsageConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
+@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
 @dataclasses.dataclass
 class ClickHouseUsageSource(Source):
     """
datahub/ingestion/source/usage/starburst_trino_usage.py
@@ -15,7 +15,9 @@ from sqlalchemy.engine import Engine
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.time_window_config import get_time_bucket
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -58,7 +60,7 @@ AggregatedDataset = GenericAggregatedDataset[TrinoTableRef]
 
 class TrinoConnectorInfo(BaseModel):
     partitionIds: List[str]
-    truncated: Optional[bool]
+    truncated: Optional[bool] = None
 
 
 class TrinoAccessedMetadata(BaseModel):
@@ -78,7 +80,7 @@ class TrinoJoinedAccessEvent(BaseModel):
     table: Optional[str] = None
     accessed_metadata: List[TrinoAccessedMetadata]
     starttime: datetime = Field(alias="create_time")
-    endtime: Optional[datetime] = Field(alias="end_time")
+    endtime: Optional[datetime] = Field(None, alias="end_time")
 
 
 class EnvBasedSourceBaseConfig:
@@ -112,6 +114,7 @@ class TrinoUsageReport(SourceReport):
 @platform_name("Trino")
 @config_class(TrinoUsageConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
 @dataclasses.dataclass
 class TrinoUsageSource(Source):
     """
datahub/ingestion/source/usage/usage_common.py
@@ -18,7 +18,7 @@ import pydantic
 from pydantic.fields import Field
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.time_window_config import (
     BaseTimeWindowConfig,
     BucketDuration,
@@ -194,13 +194,13 @@ class GenericAggregatedDataset(Generic[ResourceType]):
 
 
 class BaseUsageConfig(BaseTimeWindowConfig):
-    queries_character_limit: int = Field(
+    queries_character_limit: HiddenFromDocs[int] = Field(
+        # Hidden since we don't want to encourage people to break elasticsearch.
         default=DEFAULT_QUERIES_CHARACTER_LIMIT,
         description=(
             "Total character limit for all queries in a single usage aspect."
            " Queries will be truncated to length `queries_character_limit / top_n_queries`."
         ),
-        hidden_from_docs=True,  # Don't want to encourage people to break elasticsearch
     )
 
     top_n_queries: pydantic.PositiveInt = Field(
@@ -268,6 +268,7 @@ class UsageAggregator(Generic[ResourceType]):
                 user,
                 query,
                 fields,
+                user_email_pattern=self.config.user_email_pattern,
                 count=count,
             )
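The per-query truncation rule documented above is plain integer division; illustrative numbers:

    queries_character_limit = 24_000  # hypothetical aspect-wide budget
    top_n_queries = 10
    max_chars_per_query = queries_character_limit // top_n_queries  # 2,400 characters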
datahub/ingestion/source/vertexai/vertexai.py
@@ -145,7 +145,7 @@ class PipelineMetadata:
 
 @platform_name("Vertex AI", id="vertexai")
 @config_class(VertexAIConfig)
-@support_status(SupportStatus.
+@support_status(SupportStatus.INCUBATING)
 @capability(
     SourceCapability.DESCRIPTIONS,
     "Extract descriptions for Vertex AI Registered Models and Model Versions",
datahub/ingestion/source_config/pulsar.py
@@ -2,6 +2,7 @@ import re
 from typing import Dict, List, Optional, Union
 from urllib.parse import urlparse
 
+import pydantic
 from pydantic import Field, validator
 
 from datahub.configuration.common import AllowDenyPattern
@@ -121,7 +122,8 @@ class PulsarSourceConfig(
         )
         return client_secret
 
-    @validator("web_service_url")
+    @pydantic.field_validator("web_service_url", mode="after")
+    @classmethod
     def web_service_url_scheme_host_port(cls, val: str) -> str:
         # Tokenize the web url
         url = urlparse(val)
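This is the standard pydantic v1 to v2 validator migration; a minimal self-contained sketch of the same pattern (a toy model, not the Pulsar config):

    import pydantic

    class Example(pydantic.BaseModel):
        url: str

        @pydantic.field_validator("url", mode="after")
        @classmethod
        def check_scheme(cls, val: str) -> str:
            # mode="after" runs once the field is parsed to its declared type
            if not val.startswith(("http://", "https://")):
                raise ValueError("url must include an http(s) scheme")
            return val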
datahub/ingestion/source_report/ingestion_stage.py
@@ -1,7 +1,9 @@
 import logging
+from collections import defaultdict
 from contextlib import AbstractContextManager
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
+from enum import Enum
 
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
@@ -20,31 +22,68 @@ QUERIES_EXTRACTION = "Queries Extraction"
 PROFILING = "Profiling"
 
 
+class IngestionHighStage(Enum):
+    """
+    The high-level stages at the framework level
+    Team to add more stages as needed
+    """
+
+    PROFILING = "Profiling"
+    _UNDEFINED = "Ingestion"
+
+
 @dataclass
 class IngestionStageReport:
+    ingestion_high_stage_seconds: dict[IngestionHighStage, float] = field(
+        default_factory=lambda: defaultdict(float)
+    )
     ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict)
 
-    def new_stage(self, stage: str) -> "IngestionStageContext":
-        return IngestionStageContext(stage, self)
+    def new_stage(
+        self, stage: str, high_stage: IngestionHighStage = IngestionHighStage._UNDEFINED
+    ) -> "IngestionStageContext":
+        return IngestionStageContext(stage, self, high_stage)
+
+    def new_high_stage(self, stage: IngestionHighStage) -> "IngestionStageContext":
+        return IngestionStageContext("", self, stage)
 
 
 @dataclass
 class IngestionStageContext(AbstractContextManager):
-    def __init__(self, stage: str, report: IngestionStageReport):
-        self._ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
+    def __init__(
+        self,
+        stage: str,
+        report: IngestionStageReport,
+        high_stage: IngestionHighStage = IngestionHighStage._UNDEFINED,
+    ):
+        self._high_stage = high_stage
+        self._ingestion_stage = (
+            f"{stage} at {datetime.now(timezone.utc)}" if stage else ""
+        )
         self._timer: PerfTimer = PerfTimer()
         self._report = report
 
     def __enter__(self) -> "IngestionStageContext":
-        logger.info(f"Stage started: {self._ingestion_stage}")
+        if self._ingestion_stage:
+            logger.info(f"Stage started: {self._ingestion_stage}")
+        else:
+            logger.info(f"High stage started: {self._high_stage.value}")
         self._timer.start()
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         elapsed = self._timer.elapsed_seconds(digits=2)
-        logger.info(
-            f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds",
-            stacklevel=2,
-        )
-        self._report.ingestion_stage_durations[self._ingestion_stage] = elapsed
+        if self._ingestion_stage:
+            logger.info(
+                f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds",
+                stacklevel=2,
+            )
+            # Store tuple as string to avoid serialization errors
+            key = f"({self._high_stage.value}, {self._ingestion_stage})"
+            self._report.ingestion_stage_durations[key] = elapsed
+        else:
+            logger.info(
+                f"Time spent in stage <{self._high_stage.value}>: {elapsed} seconds",
+                stacklevel=2,
+            )
+            self._report.ingestion_high_stage_seconds[self._high_stage] += elapsed
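Putting the two entry points together, a brief usage sketch (the report contents shown in comments are illustrative):

    report = IngestionStageReport()

    with report.new_stage("Schemas"):  # fine-grained; lands in ingestion_stage_durations
        pass  # do work

    with report.new_high_stage(IngestionHighStage.PROFILING):  # coarse; summed per enum value
        pass  # do work

    # report.ingestion_high_stage_seconds[IngestionHighStage.PROFILING] -> elapsed seconds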
datahub/ingestion/transformer/add_dataset_ownership.py
@@ -71,8 +71,24 @@ class AddDatasetOwnership(OwnershipTransformer):
 
         server_ownership = graph.get_ownership(entity_urn=urn)
         if server_ownership:
-            owners = {owner.owner: owner for owner in server_ownership.owners}
-            owners.update({owner.owner: owner for owner in mce_ownership.owners})
+            owners = {
+                (
+                    owner.owner,
+                    owner.type,
+                    owner.typeUrn,
+                ): owner
+                for owner in server_ownership.owners
+            }
+            owners.update(
+                {
+                    (
+                        owner.owner,
+                        owner.type,
+                        owner.typeUrn,
+                    ): owner
+                    for owner in mce_ownership.owners
+                }
+            )
             mce_ownership.owners = list(owners.values())
 
         return mce_ownership
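The dedup key widens from the bare owner URN to the (owner, type, typeUrn) triple, so one principal can now hold several ownership types at once; a runnable illustration with plain dicts (the values are hypothetical):

    a = {"owner": "urn:li:corpuser:jdoe", "type": "TECHNICAL_OWNER", "typeUrn": None}
    b = {"owner": "urn:li:corpuser:jdoe", "type": "BUSINESS_OWNER", "typeUrn": None}

    old_keyed = {o["owner"]: o for o in [a, b]}
    new_keyed = {(o["owner"], o["type"], o["typeUrn"]): o for o in [a, b]}
    assert len(old_keyed) == 1  # old behavior: the second entry overwrote the first
    assert len(new_keyed) == 2  # new behavior: both ownership types survive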
datahub/ingestion/transformer/base_transformer.py
@@ -281,11 +281,14 @@ class BaseTransformer(Transformer, metaclass=ABCMeta):
             )
         )
 
-        record_metadata = _update_work_unit_id(
-            envelope=envelope,
-            aspect_name=mcp.aspect.get_aspect_name(),  # type: ignore
-            urn=mcp.entityUrn,  # type: ignore
-        )
+        if mcp.entityUrn:
+            record_metadata = _update_work_unit_id(
+                envelope=envelope,
+                aspect_name=mcp.aspect.get_aspect_name(),  # type: ignore
+                urn=mcp.entityUrn,
+            )
+        else:
+            record_metadata = envelope.metadata.copy()
 
         yield RecordEnvelope(
             record=mcp,
datahub/ingestion/transformer/set_browse_path.py
@@ -0,0 +1,112 @@
+import re
+from collections import defaultdict
+from typing import Dict, List, Optional, cast
+
+from datahub.configuration.common import (
+    TransformerSemanticsConfigModel,
+)
+from datahub.emitter.mce_builder import Aspect
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.transformer.base_transformer import (
+    BaseTransformer,
+    SingleAspectTransformer,
+)
+from datahub.metadata.schema_classes import (
+    BrowsePathEntryClass,
+    BrowsePathsV2Class,
+)
+from datahub.utilities.urns.urn import guess_entity_type
+
+
+class SetBrowsePathTransformerConfig(TransformerSemanticsConfigModel):
+    path: List[str]
+
+
+class SetBrowsePathTransformer(BaseTransformer, SingleAspectTransformer):
+    ctx: PipelineContext
+    config: SetBrowsePathTransformerConfig
+
+    def __init__(self, config: SetBrowsePathTransformerConfig, ctx: PipelineContext):
+        super().__init__()
+        self.ctx = ctx
+        self.config = config
+
+    def aspect_name(self) -> str:
+        return "browsePathsV2"
+
+    def entity_types(self) -> List[str]:
+        # This is an arbitrary list, might be adjusted if it makes sense. It might be reasonable to make it configurable
+        return ["dataset", "dataJob", "dataFlow", "chart", "dashboard", "container"]
+
+    @classmethod
+    def create(
+        cls, config_dict: dict, ctx: PipelineContext
+    ) -> "SetBrowsePathTransformer":
+        config = SetBrowsePathTransformerConfig.parse_obj(config_dict)
+        return cls(config, ctx)
+
+    @staticmethod
+    def _build_model(existing_browse_paths: BrowsePathsV2Class) -> Dict[str, List[str]]:
+        template_vars: Dict[str, List[str]] = {}
+        model: Dict[str, List[str]] = defaultdict(list)
+        for entry in existing_browse_paths.path or []:
+            if entry.urn:
+                entity_type = guess_entity_type(entry.urn)
+                model[entity_type].append(entry.urn)
+
+        for entity_type, urns in model.items():
+            template_vars[f"{entity_type}[*]"] = urns
+            for i, urn in enumerate(urns):
+                template_vars[f"{entity_type}[{i}]"] = [urn]
+
+        return template_vars
+
+    @classmethod
+    def _expand_nodes(
+        cls, templates: List[str], template_vars: Dict[str, List[str]]
+    ) -> BrowsePathsV2Class:
+        expanded_nodes: List[str] = []
+        for node in templates:
+            resolved_nodes = cls._resolve_template_to_nodes(node, template_vars)
+            expanded_nodes.extend(resolved_nodes)
+
+        processed_entries: List[BrowsePathEntryClass] = []
+        for node in expanded_nodes:
+            if not node or node.isspace():
+                continue
+            processed_entries.append(
+                BrowsePathEntryClass(
+                    id=node, urn=node if node.startswith("urn:") else None
+                )
+            )
+        return BrowsePathsV2Class(path=processed_entries)
+
+    def transform_aspect(
+        self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect]
+    ) -> Optional[Aspect]:
+        template_vars: Dict[str, List[str]] = {}
+        if aspect is not None:
+            assert isinstance(aspect, BrowsePathsV2Class)
+            template_vars = self._build_model(aspect)
+        new_browse_paths: BrowsePathsV2Class = self._expand_nodes(
+            self.config.path, template_vars
+        )
+        if aspect is not None and not self.config.replace_existing:
+            for node in aspect.path:
+                new_browse_paths.path.append(node)
+
+        return cast(Aspect, new_browse_paths)
+
+    @staticmethod
+    def _resolve_template_to_nodes(
+        template_str: str, template_vars: Dict[str, List[str]]
+    ) -> List[str]:
+        # This mechanism can be made simpler (match against known variables only) or more complex (e.g. by using a
+        # proper templating engine, like jinja).
+        template_str = template_str.strip()
+        var_pattern = re.findall(r"^\$([a-zA-Z]+\[[0-9*]+]$)", template_str)
+
+        if not var_pattern:
+            return [template_str]
+
+        return template_vars.get(var_pattern[0], [])
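The template syntax this new transformer accepts is defined entirely by the regex in _resolve_template_to_nodes; a quick runnable check of what counts as a variable versus a literal path node:

    import re

    pattern = r"^\$([a-zA-Z]+\[[0-9*]+]$)"
    print(re.findall(pattern, "$container[0]"))  # ['container[0]'] -> looked up in template_vars
    print(re.findall(pattern, "$container[*]"))  # ['container[*]'] -> expands to all container urns
    print(re.findall(pattern, "MyFolder"))       # []              -> kept verbatim as a path node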
datahub/integrations/assertion/snowflake/compiler.py
@@ -84,9 +84,10 @@ class SnowflakeAssertionCompiler(AssertionCompiler):
 
         dmf_definitions_path = self.output_dir / DMF_DEFINITIONS_FILE_NAME
         dmf_associations_path = self.output_dir / DMF_ASSOCIATIONS_FILE_NAME
-        with (dmf_definitions_path).open("w") as definitions, (
-            dmf_associations_path
-        ).open("w") as associations:
+        with (
+            (dmf_definitions_path).open("w") as definitions,
+            (dmf_associations_path).open("w") as associations,
+        ):
             for assertion_spec in assertion_config_spec.assertions:
                 result.report.num_processed += 1
                 try:
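The rewritten block uses parenthesized context managers, officially part of the grammar from Python 3.10; the same pattern in isolation (the paths are hypothetical):

    from pathlib import Path

    with (
        Path("dmf_definitions.sql").open("w") as definitions,
        Path("dmf_associations.sql").open("w") as associations,
    ):
        definitions.write("-- definitions\n")
        associations.write("-- associations\n")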