acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import TYPE_CHECKING, List, Optional
|
|
3
|
+
|
|
4
|
+
if TYPE_CHECKING:
|
|
5
|
+
from datahub.ingestion.source.unity.platform_resource_repository import (
|
|
6
|
+
UnityCatalogPlatformResourceRepository,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
|
|
11
|
+
from datahub.api.entities.external.external_entities import (
|
|
12
|
+
ExternalEntity,
|
|
13
|
+
ExternalEntityId,
|
|
14
|
+
LinkedResourceSet,
|
|
15
|
+
)
|
|
16
|
+
from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
|
|
17
|
+
from datahub.api.entities.platformresource.platform_resource import (
|
|
18
|
+
PlatformResource,
|
|
19
|
+
PlatformResourceKey,
|
|
20
|
+
)
|
|
21
|
+
from datahub.ingestion.graph.client import DataHubGraph
|
|
22
|
+
from datahub.metadata.urns import TagUrn
|
|
23
|
+
from datahub.utilities.urns.urn import Urn
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class UnityCatalogTagSyncContext(BaseModel):
    # Context object handed to the tag-ID factory methods. Its only setting is
    # the optional platform instance, which defaults to None when not supplied.
    platform_instance: Optional[str] = None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class UnityCatalogTagPlatformResourceId(ExternalEntityId):
|
|
35
|
+
"""
|
|
36
|
+
A Unity Catalog tag platform resource ID.
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
tag_key: str
|
|
40
|
+
tag_value: Optional[str] = None
|
|
41
|
+
platform_instance: Optional[str] = None
|
|
42
|
+
exists_in_unity_catalog: bool = False
|
|
43
|
+
persisted: bool = False
|
|
44
|
+
|
|
45
|
+
# this is a hack to make sure the property is a string and not private pydantic field
|
|
46
|
+
@staticmethod
def _RESOURCE_TYPE() -> str:
    """Return the resource-type name used for Unity Catalog tag resources.

    Exposed as a static method instead of an attribute so the value is a
    plain string rather than a private pydantic field.
    """
    resource_type_name = "UnityCatalogTagPlatformResource"
    return resource_type_name
|
|
49
|
+
|
|
50
|
+
def to_platform_resource_key(self) -> PlatformResourceKey:
    """Build the ``PlatformResourceKey`` that identifies this tag.

    The platform is always ``"databricks"``. The primary key is the tag key
    and the tag value joined by a colon; the value is interpolated as-is, so
    an unset value is rendered as the text ``None``.
    """
    resource_type = str(UnityCatalogTagPlatformResourceId._RESOURCE_TYPE())
    primary_key = f"{self.tag_key}:{self.tag_value}"
    return PlatformResourceKey(
        platform="databricks",
        resource_type=resource_type,
        primary_key=primary_key,
        platform_instance=self.platform_instance,
    )
|
|
57
|
+
|
|
58
|
+
@classmethod
def get_or_create_from_tag(
    cls,
    tag: UnityCatalogTag,
    platform_resource_repository: "UnityCatalogPlatformResourceRepository",
    exists_in_unity_catalog: bool = False,
) -> "UnityCatalogTagPlatformResourceId":
    """
    Return the platform resource ID for a UnityCatalogTag.

    The repository is searched by the tag's DataHub tag URN first. If it
    yields an entry, that entry is returned unchanged. Otherwise a new,
    not-yet-persisted ID is built from the tag's key and value, the
    repository's platform instance and the given ``exists_in_unity_catalog``
    flag.
    """
    tag_urn = tag.to_datahub_tag_urn().urn()
    found = platform_resource_repository.search_entity_by_urn(tag_urn)

    if not found:
        # Nothing known for this URN yet: build a fresh, unpersisted ID.
        key_text = tag.key.raw_text
        value_text = None if tag.value is None else tag.value.raw_text
        return UnityCatalogTagPlatformResourceId(
            tag_key=key_text,
            tag_value=value_text,
            platform_instance=platform_resource_repository.platform_instance,
            exists_in_unity_catalog=exists_in_unity_catalog,
            persisted=False,
        )

    logger.debug(
        f"Found existing UnityCatalogTagPlatformResourceId for tag {tag.key.raw_text}: {found}"
    )
    return found
|
|
85
|
+
|
|
86
|
+
@classmethod
def from_datahub_urn(
    cls,
    urn: str,
    tag_sync_context: UnityCatalogTagSyncContext,
    platform_resource_repository: "UnityCatalogPlatformResourceRepository",
    graph: DataHubGraph,
) -> "UnityCatalogTagPlatformResourceId":
    """
    Resolve a DataHub URN to a UnityCatalogTagPlatformResourceId.

    Resolution order:
      1. An entry the repository already holds for the URN is returned as-is.
      2. Otherwise an ID is generated via ``generate_tag_id``. If the
         repository has a resource stored under that ID's key, a new ID
         with ``exists_in_unity_catalog=True`` is returned instead of
         mutating the generated one.
      3. Otherwise the generated ID itself is returned.

    Raises:
        ValueError: if ``generate_tag_id`` yields a falsy result.
    """
    known_id = platform_resource_repository.search_entity_by_urn(urn)
    if known_id:
        return known_id

    generated_id = cls.generate_tag_id(graph, tag_sync_context, urn)
    if not generated_id:
        raise ValueError(
            f"Unable to create Unity Catalog tag ID from DataHub URN: {urn}"
        )

    stored_resource = platform_resource_repository.get(
        generated_id.to_platform_resource_key()
    )
    if not stored_resource:
        return generated_id

    # Build a new ID carrying the updated state rather than mutating in place.
    return UnityCatalogTagPlatformResourceId(
        tag_key=generated_id.tag_key,
        tag_value=generated_id.tag_value,
        platform_instance=generated_id.platform_instance,
        exists_in_unity_catalog=True,  # This tag exists in Unity Catalog
        persisted=generated_id.persisted,
    )
|
|
121
|
+
|
|
122
|
+
@classmethod
|
|
123
|
+
def generate_tag_id(
|
|
124
|
+
cls, graph: DataHubGraph, tag_sync_context: UnityCatalogTagSyncContext, urn: str
|
|
125
|
+
) -> "UnityCatalogTagPlatformResourceId":
|
|
126
|
+
parsed_urn = Urn.from_string(urn)
|
|
127
|
+
entity_type = parsed_urn.entity_type
|
|
128
|
+
if entity_type == "tag":
|
|
129
|
+
return UnityCatalogTagPlatformResourceId.from_datahub_tag(
|
|
130
|
+
TagUrn.from_string(urn), tag_sync_context
|
|
131
|
+
)
|
|
132
|
+
else:
|
|
133
|
+
raise ValueError(f"Unsupported entity type {entity_type} for URN {urn}")
|
|
134
|
+
|
|
135
|
+
@classmethod
|
|
136
|
+
def from_datahub_tag(
|
|
137
|
+
cls, tag_urn: TagUrn, tag_sync_context: UnityCatalogTagSyncContext
|
|
138
|
+
) -> "UnityCatalogTagPlatformResourceId":
|
|
139
|
+
uc_tag = UnityCatalogTag.from_urn(tag_urn)
|
|
140
|
+
|
|
141
|
+
return UnityCatalogTagPlatformResourceId(
|
|
142
|
+
tag_key=str(uc_tag.key),
|
|
143
|
+
tag_value=str(uc_tag.value) if uc_tag.value is not None else None,
|
|
144
|
+
platform_instance=tag_sync_context.platform_instance,
|
|
145
|
+
exists_in_unity_catalog=False,
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class UnityCatalogTagPlatformResource(ExternalEntity):
|
|
150
|
+
datahub_urns: LinkedResourceSet
|
|
151
|
+
managed_by_datahub: bool
|
|
152
|
+
id: UnityCatalogTagPlatformResourceId
|
|
153
|
+
allowed_values: Optional[List[str]] = None
|
|
154
|
+
|
|
155
|
+
def get_id(self) -> ExternalEntityId:
|
|
156
|
+
return self.id
|
|
157
|
+
|
|
158
|
+
def is_managed_by_datahub(self) -> bool:
|
|
159
|
+
return self.managed_by_datahub
|
|
160
|
+
|
|
161
|
+
def datahub_linked_resources(self) -> LinkedResourceSet:
|
|
162
|
+
return self.datahub_urns
|
|
163
|
+
|
|
164
|
+
def as_platform_resource(self) -> PlatformResource:
|
|
165
|
+
return PlatformResource.create(
|
|
166
|
+
key=self.id.to_platform_resource_key(),
|
|
167
|
+
secondary_keys=[u for u in self.datahub_urns.urns],
|
|
168
|
+
value=self,
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
@classmethod
|
|
172
|
+
def create_default(
|
|
173
|
+
cls,
|
|
174
|
+
entity_id: ExternalEntityId,
|
|
175
|
+
managed_by_datahub: bool,
|
|
176
|
+
) -> "UnityCatalogTagPlatformResource":
|
|
177
|
+
"""Create a default Unity Catalog tag entity when none found in DataHub."""
|
|
178
|
+
# Type narrowing: we know this will be a UnityCatalogTagPlatformResourceId
|
|
179
|
+
assert isinstance(entity_id, UnityCatalogTagPlatformResourceId), (
|
|
180
|
+
f"Expected UnityCatalogTagPlatformResourceId, got {type(entity_id)}"
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
# Create a new entity ID with correct default state instead of mutating
|
|
184
|
+
default_entity_id = UnityCatalogTagPlatformResourceId(
|
|
185
|
+
tag_key=entity_id.tag_key,
|
|
186
|
+
tag_value=entity_id.tag_value,
|
|
187
|
+
platform_instance=entity_id.platform_instance,
|
|
188
|
+
exists_in_unity_catalog=False, # New entities don't exist in Unity Catalog yet
|
|
189
|
+
persisted=False, # New entities are not persisted yet
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
return cls(
|
|
193
|
+
id=default_entity_id,
|
|
194
|
+
datahub_urns=LinkedResourceSet(urns=[]),
|
|
195
|
+
managed_by_datahub=managed_by_datahub,
|
|
196
|
+
allowed_values=None,
|
|
197
|
+
)
|
|
@@ -11,7 +11,10 @@ from databricks.sdk.service.sql import QueryStatementType
|
|
|
11
11
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
12
12
|
from datahub.ingestion.api.source_helpers import auto_empty_dataset_usage_statistics
|
|
13
13
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
14
|
-
from datahub.ingestion.source.unity.config import
|
|
14
|
+
from datahub.ingestion.source.unity.config import (
|
|
15
|
+
UnityCatalogSourceConfig,
|
|
16
|
+
UsageDataSource,
|
|
17
|
+
)
|
|
15
18
|
from datahub.ingestion.source.unity.proxy import UnityCatalogApiProxy
|
|
16
19
|
from datahub.ingestion.source.unity.proxy_types import (
|
|
17
20
|
OPERATION_STATEMENT_TYPES,
|
|
@@ -164,11 +167,50 @@ class UnityCatalogUsageExtractor:
|
|
|
164
167
|
aspect=operation_aspect,
|
|
165
168
|
).as_workunit()
|
|
166
169
|
|
|
170
|
+
def _validate_usage_data_source_config(self) -> None:
|
|
171
|
+
"""Validate usage data source configuration before execution."""
|
|
172
|
+
usage_data_source = self.config.usage_data_source
|
|
173
|
+
|
|
174
|
+
if (
|
|
175
|
+
usage_data_source == UsageDataSource.SYSTEM_TABLES
|
|
176
|
+
and not self.proxy.warehouse_id
|
|
177
|
+
):
|
|
178
|
+
raise ValueError(
|
|
179
|
+
"usage_data_source is set to SYSTEM_TABLES but warehouse_id is not configured. "
|
|
180
|
+
"Either set warehouse_id or use AUTO/API mode."
|
|
181
|
+
)
|
|
182
|
+
|
|
167
183
|
def _get_queries(self) -> Iterable[Query]:
|
|
168
184
|
try:
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
185
|
+
self._validate_usage_data_source_config()
|
|
186
|
+
usage_data_source = self.config.usage_data_source
|
|
187
|
+
|
|
188
|
+
if usage_data_source == UsageDataSource.AUTO:
|
|
189
|
+
if self.proxy.warehouse_id:
|
|
190
|
+
logger.info(
|
|
191
|
+
"Using system tables for usage query history (AUTO mode)"
|
|
192
|
+
)
|
|
193
|
+
yield from self.proxy.get_query_history_via_system_tables(
|
|
194
|
+
self.config.start_time, self.config.end_time
|
|
195
|
+
)
|
|
196
|
+
else:
|
|
197
|
+
logger.info(
|
|
198
|
+
"Using API for usage query history (AUTO mode, no warehouse)"
|
|
199
|
+
)
|
|
200
|
+
yield from self.proxy.query_history(
|
|
201
|
+
self.config.start_time, self.config.end_time
|
|
202
|
+
)
|
|
203
|
+
elif usage_data_source == UsageDataSource.SYSTEM_TABLES:
|
|
204
|
+
logger.info("Using system tables for usage query history (forced)")
|
|
205
|
+
yield from self.proxy.get_query_history_via_system_tables(
|
|
206
|
+
self.config.start_time, self.config.end_time
|
|
207
|
+
)
|
|
208
|
+
elif usage_data_source == UsageDataSource.API:
|
|
209
|
+
logger.info("Using API for usage query history (forced)")
|
|
210
|
+
yield from self.proxy.query_history(
|
|
211
|
+
self.config.start_time, self.config.end_time
|
|
212
|
+
)
|
|
213
|
+
|
|
172
214
|
except Exception as e:
|
|
173
215
|
logger.warning("Error getting queries", exc_info=True)
|
|
174
216
|
self.report.report_warning("get-queries", str(e))
|
|
@@ -2,7 +2,7 @@ import collections
|
|
|
2
2
|
import dataclasses
|
|
3
3
|
import logging
|
|
4
4
|
from datetime import datetime
|
|
5
|
-
from typing import Dict, Iterable, List
|
|
5
|
+
from typing import Any, Dict, Iterable, List, Optional
|
|
6
6
|
|
|
7
7
|
from dateutil import parser
|
|
8
8
|
from pydantic.fields import Field
|
|
@@ -74,15 +74,22 @@ class ClickHouseUsageConfig(ClickHouseConfig, BaseUsageConfig, EnvConfigMixin):
|
|
|
74
74
|
options: dict = Field(default={}, description="")
|
|
75
75
|
query_log_table: str = Field(default="system.query_log", exclude=True)
|
|
76
76
|
|
|
77
|
-
def get_sql_alchemy_url(
|
|
78
|
-
|
|
77
|
+
def get_sql_alchemy_url(
|
|
78
|
+
self,
|
|
79
|
+
uri_opts: Optional[Dict[str, Any]] = None,
|
|
80
|
+
current_db: Optional[str] = None,
|
|
81
|
+
) -> str:
|
|
82
|
+
return super().get_sql_alchemy_url(uri_opts=uri_opts, current_db=current_db)
|
|
79
83
|
|
|
80
84
|
|
|
81
85
|
@platform_name("ClickHouse")
|
|
82
86
|
@config_class(ClickHouseUsageConfig)
|
|
83
87
|
@support_status(SupportStatus.CERTIFIED)
|
|
84
|
-
@capability(
|
|
88
|
+
@capability(
|
|
89
|
+
SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
|
|
90
|
+
)
|
|
85
91
|
@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
|
|
92
|
+
@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
|
|
86
93
|
@dataclasses.dataclass
|
|
87
94
|
class ClickHouseUsageSource(Source):
|
|
88
95
|
"""
|
|
@@ -4,7 +4,7 @@ import json
|
|
|
4
4
|
import logging
|
|
5
5
|
from datetime import datetime
|
|
6
6
|
from email.utils import parseaddr
|
|
7
|
-
from typing import Dict, Iterable, List, Optional
|
|
7
|
+
from typing import Any, Dict, Iterable, List, Optional
|
|
8
8
|
|
|
9
9
|
from dateutil import parser
|
|
10
10
|
from pydantic.fields import Field
|
|
@@ -15,7 +15,9 @@ from sqlalchemy.engine import Engine
|
|
|
15
15
|
import datahub.emitter.mce_builder as builder
|
|
16
16
|
from datahub.configuration.time_window_config import get_time_bucket
|
|
17
17
|
from datahub.ingestion.api.decorators import (
|
|
18
|
+
SourceCapability,
|
|
18
19
|
SupportStatus,
|
|
20
|
+
capability,
|
|
19
21
|
config_class,
|
|
20
22
|
platform_name,
|
|
21
23
|
support_status,
|
|
@@ -58,7 +60,7 @@ AggregatedDataset = GenericAggregatedDataset[TrinoTableRef]
|
|
|
58
60
|
|
|
59
61
|
class TrinoConnectorInfo(BaseModel):
|
|
60
62
|
partitionIds: List[str]
|
|
61
|
-
truncated: Optional[bool]
|
|
63
|
+
truncated: Optional[bool] = None
|
|
62
64
|
|
|
63
65
|
|
|
64
66
|
class TrinoAccessedMetadata(BaseModel):
|
|
@@ -78,7 +80,7 @@ class TrinoJoinedAccessEvent(BaseModel):
|
|
|
78
80
|
table: Optional[str] = None
|
|
79
81
|
accessed_metadata: List[TrinoAccessedMetadata]
|
|
80
82
|
starttime: datetime = Field(alias="create_time")
|
|
81
|
-
endtime: Optional[datetime] = Field(alias="end_time")
|
|
83
|
+
endtime: Optional[datetime] = Field(None, alias="end_time")
|
|
82
84
|
|
|
83
85
|
|
|
84
86
|
class EnvBasedSourceBaseConfig:
|
|
@@ -98,8 +100,10 @@ class TrinoUsageConfig(TrinoConfig, BaseUsageConfig, EnvBasedSourceBaseConfig):
|
|
|
98
100
|
options: dict = Field(default={}, description="")
|
|
99
101
|
database: str = Field(description="The name of the catalog from getting the usage")
|
|
100
102
|
|
|
101
|
-
def get_sql_alchemy_url(
|
|
102
|
-
|
|
103
|
+
def get_sql_alchemy_url(
|
|
104
|
+
self, uri_opts: Optional[Dict[str, Any]] = None, database: Optional[str] = None
|
|
105
|
+
) -> str:
|
|
106
|
+
return super().get_sql_alchemy_url(uri_opts=uri_opts, database=database)
|
|
103
107
|
|
|
104
108
|
|
|
105
109
|
@dataclasses.dataclass
|
|
@@ -110,6 +114,7 @@ class TrinoUsageReport(SourceReport):
|
|
|
110
114
|
@platform_name("Trino")
|
|
111
115
|
@config_class(TrinoUsageConfig)
|
|
112
116
|
@support_status(SupportStatus.CERTIFIED)
|
|
117
|
+
@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
|
|
113
118
|
@dataclasses.dataclass
|
|
114
119
|
class TrinoUsageSource(Source):
|
|
115
120
|
"""
|
|
@@ -12,15 +12,13 @@ from typing import (
|
|
|
12
12
|
Optional,
|
|
13
13
|
Tuple,
|
|
14
14
|
TypeVar,
|
|
15
|
-
Union,
|
|
16
15
|
)
|
|
17
16
|
|
|
18
17
|
import pydantic
|
|
19
|
-
from deprecated import deprecated
|
|
20
18
|
from pydantic.fields import Field
|
|
21
19
|
|
|
22
20
|
import datahub.emitter.mce_builder as builder
|
|
23
|
-
from datahub.configuration.common import AllowDenyPattern
|
|
21
|
+
from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
|
|
24
22
|
from datahub.configuration.time_window_config import (
|
|
25
23
|
BaseTimeWindowConfig,
|
|
26
24
|
BucketDuration,
|
|
@@ -28,19 +26,13 @@ from datahub.configuration.time_window_config import (
|
|
|
28
26
|
)
|
|
29
27
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
30
28
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
31
|
-
from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetUsageStatistics
|
|
32
29
|
from datahub.metadata.schema_classes import (
|
|
33
|
-
CalendarIntervalClass,
|
|
34
30
|
DatasetFieldUsageCountsClass,
|
|
35
31
|
DatasetUsageStatisticsClass,
|
|
36
32
|
DatasetUserUsageCountsClass,
|
|
37
33
|
TimeWindowSizeClass,
|
|
38
|
-
UsageAggregationClass,
|
|
39
|
-
WindowDurationClass,
|
|
40
34
|
)
|
|
41
35
|
from datahub.utilities.sql_formatter import format_sql_query, trim_query
|
|
42
|
-
from datahub.utilities.urns.dataset_urn import DatasetUrn
|
|
43
|
-
from datahub.utilities.urns.urn import guess_entity_type
|
|
44
36
|
|
|
45
37
|
logger = logging.getLogger(__name__)
|
|
46
38
|
|
|
@@ -202,13 +194,13 @@ class GenericAggregatedDataset(Generic[ResourceType]):
|
|
|
202
194
|
|
|
203
195
|
|
|
204
196
|
class BaseUsageConfig(BaseTimeWindowConfig):
|
|
205
|
-
queries_character_limit: int = Field(
|
|
197
|
+
queries_character_limit: HiddenFromDocs[int] = Field(
|
|
198
|
+
# Hidden since we don't want to encourage people to break elasticsearch.
|
|
206
199
|
default=DEFAULT_QUERIES_CHARACTER_LIMIT,
|
|
207
200
|
description=(
|
|
208
201
|
"Total character limit for all queries in a single usage aspect."
|
|
209
202
|
" Queries will be truncated to length `queries_character_limit / top_n_queries`."
|
|
210
203
|
),
|
|
211
|
-
hidden_from_docs=True, # Don't want to encourage people to break elasticsearch
|
|
212
204
|
)
|
|
213
205
|
|
|
214
206
|
top_n_queries: pydantic.PositiveInt = Field(
|
|
@@ -276,6 +268,7 @@ class UsageAggregator(Generic[ResourceType]):
|
|
|
276
268
|
user,
|
|
277
269
|
query,
|
|
278
270
|
fields,
|
|
271
|
+
user_email_pattern=self.config.user_email_pattern,
|
|
279
272
|
count=count,
|
|
280
273
|
)
|
|
281
274
|
|
|
@@ -295,60 +288,3 @@ class UsageAggregator(Generic[ResourceType]):
|
|
|
295
288
|
user_urn_builder=user_urn_builder,
|
|
296
289
|
queries_character_limit=self.config.queries_character_limit,
|
|
297
290
|
)
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
@deprecated
|
|
301
|
-
def convert_usage_aggregation_class(
|
|
302
|
-
obj: UsageAggregationClass,
|
|
303
|
-
) -> MetadataChangeProposalWrapper:
|
|
304
|
-
# Legacy usage aggregation only supported dataset usage stats
|
|
305
|
-
if guess_entity_type(obj.resource) == DatasetUrn.ENTITY_TYPE:
|
|
306
|
-
aspect = DatasetUsageStatistics(
|
|
307
|
-
timestampMillis=obj.bucket,
|
|
308
|
-
eventGranularity=TimeWindowSizeClass(
|
|
309
|
-
unit=convert_window_to_interval(obj.duration)
|
|
310
|
-
),
|
|
311
|
-
uniqueUserCount=obj.metrics.uniqueUserCount,
|
|
312
|
-
totalSqlQueries=obj.metrics.totalSqlQueries,
|
|
313
|
-
topSqlQueries=obj.metrics.topSqlQueries,
|
|
314
|
-
userCounts=(
|
|
315
|
-
[
|
|
316
|
-
DatasetUserUsageCountsClass(
|
|
317
|
-
user=u.user, count=u.count, userEmail=u.userEmail
|
|
318
|
-
)
|
|
319
|
-
for u in obj.metrics.users
|
|
320
|
-
if u.user is not None
|
|
321
|
-
]
|
|
322
|
-
if obj.metrics.users
|
|
323
|
-
else None
|
|
324
|
-
),
|
|
325
|
-
fieldCounts=(
|
|
326
|
-
[
|
|
327
|
-
DatasetFieldUsageCountsClass(fieldPath=f.fieldName, count=f.count)
|
|
328
|
-
for f in obj.metrics.fields
|
|
329
|
-
]
|
|
330
|
-
if obj.metrics.fields
|
|
331
|
-
else None
|
|
332
|
-
),
|
|
333
|
-
)
|
|
334
|
-
return MetadataChangeProposalWrapper(entityUrn=obj.resource, aspect=aspect)
|
|
335
|
-
else:
|
|
336
|
-
raise Exception(
|
|
337
|
-
f"Skipping unsupported usage aggregation - invalid entity type: {obj}"
|
|
338
|
-
)
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
@deprecated
|
|
342
|
-
def convert_window_to_interval(window: Union[str, WindowDurationClass]) -> str:
|
|
343
|
-
if window == WindowDurationClass.YEAR:
|
|
344
|
-
return CalendarIntervalClass.YEAR
|
|
345
|
-
elif window == WindowDurationClass.MONTH:
|
|
346
|
-
return CalendarIntervalClass.MONTH
|
|
347
|
-
elif window == WindowDurationClass.WEEK:
|
|
348
|
-
return CalendarIntervalClass.WEEK
|
|
349
|
-
elif window == WindowDurationClass.DAY:
|
|
350
|
-
return CalendarIntervalClass.DAY
|
|
351
|
-
elif window == WindowDurationClass.HOUR:
|
|
352
|
-
return CalendarIntervalClass.HOUR
|
|
353
|
-
else:
|
|
354
|
-
raise Exception(f"Unsupported window duration: {window}")
|
|
File without changes
|