acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/mode.py
CHANGED
|
@@ -1,14 +1,25 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
import logging
|
|
3
|
+
import os
|
|
3
4
|
import re
|
|
4
5
|
import time
|
|
5
6
|
from dataclasses import dataclass
|
|
6
7
|
from datetime import datetime, timezone
|
|
7
8
|
from functools import lru_cache
|
|
8
9
|
from json import JSONDecodeError
|
|
9
|
-
from typing import
|
|
10
|
+
from typing import (
|
|
11
|
+
Dict,
|
|
12
|
+
Iterable,
|
|
13
|
+
Iterator,
|
|
14
|
+
List,
|
|
15
|
+
Optional,
|
|
16
|
+
Set,
|
|
17
|
+
Tuple,
|
|
18
|
+
Union,
|
|
19
|
+
)
|
|
10
20
|
|
|
11
21
|
import dateutil.parser as dp
|
|
22
|
+
import psutil
|
|
12
23
|
import pydantic
|
|
13
24
|
import requests
|
|
14
25
|
import sqlglot
|
|
@@ -22,7 +33,7 @@ from requests.models import HTTPBasicAuth, HTTPError
|
|
|
22
33
|
from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential
|
|
23
34
|
|
|
24
35
|
import datahub.emitter.mce_builder as builder
|
|
25
|
-
from datahub.configuration.common import AllowDenyPattern, ConfigModel
|
|
36
|
+
from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
|
|
26
37
|
from datahub.configuration.source_common import (
|
|
27
38
|
DatasetLineageProviderConfigBase,
|
|
28
39
|
)
|
|
@@ -33,6 +44,7 @@ from datahub.emitter.mcp_builder import (
|
|
|
33
44
|
add_dataset_to_container,
|
|
34
45
|
gen_containers,
|
|
35
46
|
)
|
|
47
|
+
from datahub.emitter.request_helper import make_curl_command
|
|
36
48
|
from datahub.ingestion.api.common import PipelineContext
|
|
37
49
|
from datahub.ingestion.api.decorators import (
|
|
38
50
|
SourceCapability,
|
|
@@ -113,8 +125,12 @@ from datahub.sql_parsing.sqlglot_lineage import (
|
|
|
113
125
|
)
|
|
114
126
|
from datahub.utilities import config_clean
|
|
115
127
|
from datahub.utilities.lossy_collections import LossyList
|
|
128
|
+
from datahub.utilities.perf_timer import PerfTimer
|
|
116
129
|
|
|
117
130
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
131
|
+
# Default API limit for items returned per API call
|
|
132
|
+
# Used for the default per_page value for paginated API requests
|
|
133
|
+
DEFAULT_API_ITEMS_PER_PAGE = 30
|
|
118
134
|
|
|
119
135
|
|
|
120
136
|
class SpaceKey(ContainerKey):
|
|
@@ -193,15 +209,37 @@ class ModeConfig(
|
|
|
193
209
|
default=True, description="Tag measures and dimensions in the schema"
|
|
194
210
|
)
|
|
195
211
|
|
|
212
|
+
items_per_page: HiddenFromDocs[int] = Field(
|
|
213
|
+
DEFAULT_API_ITEMS_PER_PAGE,
|
|
214
|
+
description="Number of items per page for paginated API requests.",
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
exclude_archived: bool = Field(
|
|
218
|
+
default=False, description="Exclude archived reports"
|
|
219
|
+
)
|
|
220
|
+
|
|
196
221
|
@validator("connect_uri")
|
|
197
222
|
def remove_trailing_slash(cls, v):
|
|
198
223
|
return config_clean.remove_trailing_slashes(v)
|
|
199
224
|
|
|
225
|
+
@validator("items_per_page")
|
|
226
|
+
def validate_items_per_page(cls, v):
|
|
227
|
+
if 1 <= v <= DEFAULT_API_ITEMS_PER_PAGE:
|
|
228
|
+
return v
|
|
229
|
+
else:
|
|
230
|
+
raise ValueError(
|
|
231
|
+
f"items_per_page must be between 1 and {DEFAULT_API_ITEMS_PER_PAGE}"
|
|
232
|
+
)
|
|
233
|
+
|
|
200
234
|
|
|
201
235
|
class HTTPError429(HTTPError):
|
|
202
236
|
pass
|
|
203
237
|
|
|
204
238
|
|
|
239
|
+
class HTTPError504(HTTPError):
|
|
240
|
+
pass
|
|
241
|
+
|
|
242
|
+
|
|
205
243
|
ModeRequestError = (HTTPError, JSONDecodeError)
|
|
206
244
|
|
|
207
245
|
|
|
@@ -216,6 +254,23 @@ class ModeSourceReport(StaleEntityRemovalSourceReport):
|
|
|
216
254
|
num_query_template_render: int = 0
|
|
217
255
|
num_query_template_render_failures: int = 0
|
|
218
256
|
num_query_template_render_success: int = 0
|
|
257
|
+
num_requests_exceeding_rate_limit: int = 0
|
|
258
|
+
num_requests_retried_on_timeout: int = 0
|
|
259
|
+
num_spaces_retrieved: int = 0
|
|
260
|
+
space_get_api_called: int = 0
|
|
261
|
+
report_get_api_called: int = 0
|
|
262
|
+
dataset_get_api_called: int = 0
|
|
263
|
+
query_get_api_called: int = 0
|
|
264
|
+
chart_get_api_called: int = 0
|
|
265
|
+
get_cache_hits: int = 0
|
|
266
|
+
get_cache_misses: int = 0
|
|
267
|
+
get_cache_size: int = 0
|
|
268
|
+
process_memory_used_mb: float = 0
|
|
269
|
+
space_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
|
|
270
|
+
report_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
|
|
271
|
+
dataset_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
|
|
272
|
+
query_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
|
|
273
|
+
chart_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
|
|
219
274
|
|
|
220
275
|
def report_dropped_space(self, ent_name: str) -> None:
|
|
221
276
|
self.filtered_spaces.append(ent_name)
|
|
@@ -339,7 +394,8 @@ class ModeSource(StatefulIngestionSourceBase):
|
|
|
339
394
|
|
|
340
395
|
# Test the connection
|
|
341
396
|
try:
|
|
342
|
-
self._get_request_json(f"{self.config.connect_uri}/api/verify")
|
|
397
|
+
key_info = self._get_request_json(f"{self.config.connect_uri}/api/verify")
|
|
398
|
+
logger.debug(f"Auth info: {key_info}")
|
|
343
399
|
except ModeRequestError as e:
|
|
344
400
|
self.report.report_failure(
|
|
345
401
|
title="Failed to Connect",
|
|
@@ -454,9 +510,23 @@ class ModeSource(StatefulIngestionSourceBase):
|
|
|
454
510
|
# Datasets
|
|
455
511
|
datasets = []
|
|
456
512
|
for imported_dataset_name in report_info.get("imported_datasets", {}):
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
513
|
+
try:
|
|
514
|
+
mode_dataset = self._get_request_json(
|
|
515
|
+
f"{self.workspace_uri}/reports/{imported_dataset_name.get('token')}"
|
|
516
|
+
)
|
|
517
|
+
except HTTPError as http_error:
|
|
518
|
+
status_code = http_error.response.status_code
|
|
519
|
+
if status_code == 404:
|
|
520
|
+
self.report.report_warning(
|
|
521
|
+
title="Report Not Found",
|
|
522
|
+
message="Referenced report for reusable dataset was not found.",
|
|
523
|
+
context=f"Report: {report_info.get('id')}, "
|
|
524
|
+
f"Imported Dataset Report: {imported_dataset_name.get('token')}",
|
|
525
|
+
)
|
|
526
|
+
continue
|
|
527
|
+
else:
|
|
528
|
+
raise http_error
|
|
529
|
+
|
|
460
530
|
dataset_urn = builder.make_dataset_urn_with_platform_instance(
|
|
461
531
|
self.platform,
|
|
462
532
|
str(mode_dataset.get("id")),
|
|
@@ -560,29 +630,38 @@ class ModeSource(StatefulIngestionSourceBase):
|
|
|
560
630
|
space_info = {}
|
|
561
631
|
try:
|
|
562
632
|
logger.debug(f"Retrieving spaces for {self.workspace_uri}")
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
for s in spaces:
|
|
569
|
-
logger.debug(f"Space: {s.get('name')}")
|
|
570
|
-
space_name = s.get("name", "")
|
|
571
|
-
# Using both restricted and default_access_level because
|
|
572
|
-
# there is a current bug with restricted returning False everytime
|
|
573
|
-
# which has been reported to Mode team
|
|
574
|
-
if self.config.exclude_restricted and (
|
|
575
|
-
s.get("restricted") or s.get("default_access_level") == "restricted"
|
|
633
|
+
with self.report.space_get_timer:
|
|
634
|
+
for spaces_page in self._get_paged_request_json(
|
|
635
|
+
f"{self.workspace_uri}/spaces?filter=all",
|
|
636
|
+
"spaces",
|
|
637
|
+
self.config.items_per_page,
|
|
576
638
|
):
|
|
577
|
-
|
|
578
|
-
|
|
639
|
+
self.report.space_get_api_called += 1
|
|
640
|
+
logger.debug(
|
|
641
|
+
f"Read {len(spaces_page)} spaces records from workspace {self.workspace_uri}"
|
|
579
642
|
)
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
643
|
+
self.report.num_spaces_retrieved += len(spaces_page)
|
|
644
|
+
for s in spaces_page:
|
|
645
|
+
logger.debug(f"Space: {s.get('name')}")
|
|
646
|
+
space_name = s.get("name", "")
|
|
647
|
+
# Using both restricted and default_access_level because
|
|
648
|
+
# there is a current bug with restricted returning False everytime
|
|
649
|
+
# which has been reported to Mode team
|
|
650
|
+
if self.config.exclude_restricted and (
|
|
651
|
+
s.get("restricted")
|
|
652
|
+
or s.get("default_access_level") == "restricted"
|
|
653
|
+
):
|
|
654
|
+
logging.debug(
|
|
655
|
+
f"Skipping space {space_name} due to exclude restricted"
|
|
656
|
+
)
|
|
657
|
+
continue
|
|
658
|
+
if not self.config.space_pattern.allowed(space_name):
|
|
659
|
+
self.report.report_dropped_space(space_name)
|
|
660
|
+
logging.debug(
|
|
661
|
+
f"Skipping space {space_name} due to space pattern"
|
|
662
|
+
)
|
|
663
|
+
continue
|
|
664
|
+
space_info[s.get("token", "")] = s.get("name", "")
|
|
586
665
|
except ModeRequestError as e:
|
|
587
666
|
self.report.report_failure(
|
|
588
667
|
title="Failed to Retrieve Spaces",
|
|
@@ -897,7 +976,7 @@ class ModeSource(StatefulIngestionSourceBase):
|
|
|
897
976
|
for match in matches:
|
|
898
977
|
definition = Template(source=match).render()
|
|
899
978
|
parameters = yaml.safe_load(definition)
|
|
900
|
-
for key in parameters
|
|
979
|
+
for key in parameters:
|
|
901
980
|
jinja_params[key] = parameters[key].get("default", "")
|
|
902
981
|
|
|
903
982
|
normalized_query = re.sub(
|
|
@@ -1386,125 +1465,217 @@ class ModeSource(StatefulIngestionSourceBase):
|
|
|
1386
1465
|
mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
|
|
1387
1466
|
yield MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
|
|
1388
1467
|
|
|
1389
|
-
|
|
1390
|
-
def _get_reports(self, space_token: str) -> List[dict]:
|
|
1391
|
-
reports = []
|
|
1468
|
+
def _get_reports(self, space_token: str) -> Iterator[List[dict]]:
|
|
1392
1469
|
try:
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1470
|
+
with self.report.report_get_timer:
|
|
1471
|
+
for reports_page in self._get_paged_request_json(
|
|
1472
|
+
f"{self.workspace_uri}/spaces/{space_token}/reports?filter=all",
|
|
1473
|
+
"reports",
|
|
1474
|
+
self.config.items_per_page,
|
|
1475
|
+
):
|
|
1476
|
+
self.report.report_get_api_called += 1
|
|
1477
|
+
logger.debug(
|
|
1478
|
+
f"Read {len(reports_page)} reports records from workspace {self.workspace_uri} space {space_token}"
|
|
1479
|
+
)
|
|
1480
|
+
if self.config.exclude_archived:
|
|
1481
|
+
logger.debug(
|
|
1482
|
+
f"Excluding archived reports since exclude_archived: {self.config.exclude_archived}"
|
|
1483
|
+
)
|
|
1484
|
+
reports_page = [
|
|
1485
|
+
report
|
|
1486
|
+
for report in reports_page
|
|
1487
|
+
if not report.get("archived", False)
|
|
1488
|
+
]
|
|
1489
|
+
yield reports_page
|
|
1397
1490
|
except ModeRequestError as e:
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
|
|
1491
|
+
if isinstance(e, HTTPError) and e.response.status_code == 404:
|
|
1492
|
+
self.report.report_warning(
|
|
1493
|
+
title="No Reports Found in Space",
|
|
1494
|
+
message="No reports were found in the space. It may have been recently deleted.",
|
|
1495
|
+
context=f"Space Token: {space_token}, Error: {str(e)}",
|
|
1496
|
+
)
|
|
1497
|
+
else:
|
|
1498
|
+
self.report.report_failure(
|
|
1499
|
+
title="Failed to Retrieve Reports for Space",
|
|
1500
|
+
message="Unable to retrieve reports for space token.",
|
|
1501
|
+
context=f"Space Token: {space_token}, Error: {str(e)}",
|
|
1502
|
+
)
|
|
1404
1503
|
|
|
1405
|
-
|
|
1406
|
-
def _get_datasets(self, space_token: str) -> List[dict]:
|
|
1504
|
+
def _get_datasets(self, space_token: str) -> Iterator[List[dict]]:
|
|
1407
1505
|
"""
|
|
1408
1506
|
Retrieves datasets for a given space token.
|
|
1409
1507
|
"""
|
|
1410
|
-
datasets = []
|
|
1411
1508
|
try:
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1509
|
+
with self.report.dataset_get_timer:
|
|
1510
|
+
for dataset_page in self._get_paged_request_json(
|
|
1511
|
+
f"{self.workspace_uri}/spaces/{space_token}/datasets?filter=all",
|
|
1512
|
+
"reports",
|
|
1513
|
+
self.config.items_per_page,
|
|
1514
|
+
):
|
|
1515
|
+
self.report.dataset_get_api_called += 1
|
|
1516
|
+
logger.debug(
|
|
1517
|
+
f"Read {len(dataset_page)} datasets records from workspace {self.workspace_uri} space {space_token}"
|
|
1518
|
+
)
|
|
1519
|
+
yield dataset_page
|
|
1415
1520
|
except ModeRequestError as e:
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1521
|
+
if isinstance(e, HTTPError) and e.response.status_code == 404:
|
|
1522
|
+
self.report.report_warning(
|
|
1523
|
+
title="No Datasets Found in Space",
|
|
1524
|
+
message="No datasets were found in the space. It may have been recently deleted.",
|
|
1525
|
+
context=f"Space Token: {space_token}, Error: {str(e)}",
|
|
1526
|
+
)
|
|
1527
|
+
else:
|
|
1528
|
+
self.report.report_failure(
|
|
1529
|
+
title="Failed to Retrieve Datasets for Space",
|
|
1530
|
+
message=f"Unable to retrieve datasets for space token {space_token}.",
|
|
1531
|
+
context=f"Space Token: {space_token}, Error: {str(e)}",
|
|
1532
|
+
)
|
|
1422
1533
|
|
|
1423
|
-
|
|
1424
|
-
def _get_queries(self, report_token: str) -> list:
|
|
1425
|
-
queries = []
|
|
1534
|
+
def _get_queries(self, report_token: str) -> List[dict]:
|
|
1426
1535
|
try:
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
|
|
1430
|
-
|
|
1536
|
+
with self.report.query_get_timer:
|
|
1537
|
+
# This endpoint does not handle pagination properly
|
|
1538
|
+
queries = self._get_request_json(
|
|
1539
|
+
f"{self.workspace_uri}/reports/{report_token}/queries"
|
|
1540
|
+
)
|
|
1541
|
+
self.report.query_get_api_called += 1
|
|
1542
|
+
logger.debug(
|
|
1543
|
+
f"Read {len(queries)} queries records from workspace {self.workspace_uri} report {report_token}"
|
|
1544
|
+
)
|
|
1545
|
+
return queries.get("_embedded", {}).get("queries", [])
|
|
1431
1546
|
except ModeRequestError as e:
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1547
|
+
if isinstance(e, HTTPError) and e.response.status_code == 404:
|
|
1548
|
+
self.report.report_warning(
|
|
1549
|
+
title="No Queries Found",
|
|
1550
|
+
message="No queries found for the report token. Maybe the report is deleted...",
|
|
1551
|
+
context=f"Report Token: {report_token}, Error: {str(e)}",
|
|
1552
|
+
)
|
|
1553
|
+
else:
|
|
1554
|
+
self.report.report_failure(
|
|
1555
|
+
title="Failed to Retrieve Queries",
|
|
1556
|
+
message="Unable to retrieve queries for report token.",
|
|
1557
|
+
context=f"Report Token: {report_token}, Error: {str(e)}",
|
|
1558
|
+
)
|
|
1559
|
+
return []
|
|
1438
1560
|
|
|
1439
1561
|
@lru_cache(maxsize=None)
|
|
1440
|
-
def _get_last_query_run(
|
|
1441
|
-
|
|
1442
|
-
|
|
1562
|
+
def _get_last_query_run(self, report_token: str, report_run_id: str) -> list:
|
|
1563
|
+
# This function is unused and may be subject to removal in a future revision of this source
|
|
1564
|
+
query_runs = []
|
|
1443
1565
|
try:
|
|
1444
|
-
|
|
1445
|
-
f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs
|
|
1446
|
-
|
|
1447
|
-
|
|
1566
|
+
for query_run_page in self._get_paged_request_json(
|
|
1567
|
+
f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs?filter=all",
|
|
1568
|
+
"query_runs",
|
|
1569
|
+
self.config.items_per_page,
|
|
1570
|
+
):
|
|
1571
|
+
query_runs.extend(query_run_page)
|
|
1448
1572
|
except ModeRequestError as e:
|
|
1449
1573
|
self.report.report_failure(
|
|
1450
1574
|
title="Failed to Retrieve Queries for Report",
|
|
1451
1575
|
message="Unable to retrieve queries for report token.",
|
|
1452
1576
|
context=f"Report Token:{report_token}, Error: {str(e)}",
|
|
1453
1577
|
)
|
|
1454
|
-
|
|
1455
|
-
return queries
|
|
1578
|
+
return query_runs
|
|
1456
1579
|
|
|
1457
|
-
|
|
1458
|
-
def _get_charts(self, report_token: str, query_token: str) -> list:
|
|
1459
|
-
charts = []
|
|
1580
|
+
def _get_charts(self, report_token: str, query_token: str) -> List[dict]:
|
|
1460
1581
|
try:
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1582
|
+
with self.report.chart_get_timer:
|
|
1583
|
+
# This endpoint does not handle pagination properly
|
|
1584
|
+
charts = self._get_request_json(
|
|
1585
|
+
f"{self.workspace_uri}/reports/{report_token}/queries/{query_token}/charts"
|
|
1586
|
+
)
|
|
1587
|
+
self.report.chart_get_api_called += 1
|
|
1588
|
+
logger.debug(
|
|
1589
|
+
f"Read {len(charts)} charts records from workspace {self.workspace_uri} report {report_token} query {query_token}"
|
|
1590
|
+
)
|
|
1591
|
+
return charts.get("_embedded", {}).get("charts", [])
|
|
1466
1592
|
except ModeRequestError as e:
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
|
|
1473
|
-
|
|
1474
|
-
|
|
1593
|
+
if isinstance(e, HTTPError) and e.response.status_code == 404:
|
|
1594
|
+
self.report.report_warning(
|
|
1595
|
+
title="No Charts Found for Query",
|
|
1596
|
+
message="No charts were found for the query. The query may have been recently deleted.",
|
|
1597
|
+
context=f"Report Token: {report_token}, Query Token: {query_token}, Error: {str(e)}",
|
|
1598
|
+
)
|
|
1599
|
+
else:
|
|
1600
|
+
self.report.report_failure(
|
|
1601
|
+
title="Failed to Retrieve Charts",
|
|
1602
|
+
message="Unable to retrieve charts from Mode.",
|
|
1603
|
+
context=f"Report Token: {report_token}, Query Token: {query_token}, Error: {str(e)}",
|
|
1604
|
+
)
|
|
1605
|
+
return []
|
|
1475
1606
|
|
|
1607
|
+
def _get_paged_request_json(
|
|
1608
|
+
self, url: str, key: str, per_page: int
|
|
1609
|
+
) -> Iterator[List[Dict]]:
|
|
1610
|
+
page: int = 1
|
|
1611
|
+
while True:
|
|
1612
|
+
page_url = f"{url}&per_page={per_page}&page={page}"
|
|
1613
|
+
response = self._get_request_json(page_url)
|
|
1614
|
+
data: List[Dict] = response.get("_embedded", {}).get(key, [])
|
|
1615
|
+
if not data:
|
|
1616
|
+
break
|
|
1617
|
+
yield data
|
|
1618
|
+
page += 1
|
|
1619
|
+
|
|
1620
|
+
@lru_cache(maxsize=None)
|
|
1476
1621
|
def _get_request_json(self, url: str) -> Dict:
|
|
1477
1622
|
r = tenacity.Retrying(
|
|
1478
1623
|
wait=wait_exponential(
|
|
1479
1624
|
multiplier=self.config.api_options.retry_backoff_multiplier,
|
|
1480
1625
|
max=self.config.api_options.max_retry_interval,
|
|
1481
1626
|
),
|
|
1482
|
-
retry=retry_if_exception_type(
|
|
1627
|
+
retry=retry_if_exception_type(
|
|
1628
|
+
(HTTPError429, HTTPError504, ConnectionError)
|
|
1629
|
+
),
|
|
1483
1630
|
stop=stop_after_attempt(self.config.api_options.max_attempts),
|
|
1484
1631
|
)
|
|
1485
1632
|
|
|
1486
1633
|
@r.wraps
|
|
1487
1634
|
def get_request():
|
|
1635
|
+
curl_command = make_curl_command(self.session, "GET", url, "")
|
|
1636
|
+
logger.debug(f"Issuing request; curl equivalent: {curl_command}")
|
|
1637
|
+
|
|
1488
1638
|
try:
|
|
1489
1639
|
response = self.session.get(
|
|
1490
1640
|
url, timeout=self.config.api_options.timeout
|
|
1491
1641
|
)
|
|
1492
1642
|
if response.status_code == 204: # No content, don't parse json
|
|
1493
1643
|
return {}
|
|
1644
|
+
|
|
1645
|
+
response.raise_for_status()
|
|
1494
1646
|
return response.json()
|
|
1495
1647
|
except HTTPError as http_error:
|
|
1496
1648
|
error_response = http_error.response
|
|
1497
1649
|
if error_response.status_code == 429:
|
|
1650
|
+
self.report.num_requests_exceeding_rate_limit += 1
|
|
1498
1651
|
# respect Retry-After
|
|
1499
1652
|
sleep_time = error_response.headers.get("retry-after")
|
|
1500
1653
|
if sleep_time is not None:
|
|
1501
1654
|
time.sleep(float(sleep_time))
|
|
1502
1655
|
raise HTTPError429 from None
|
|
1656
|
+
elif error_response.status_code == 504:
|
|
1657
|
+
self.report.num_requests_retried_on_timeout += 1
|
|
1658
|
+
time.sleep(0.1)
|
|
1659
|
+
raise HTTPError504 from None
|
|
1503
1660
|
|
|
1661
|
+
logger.debug(
|
|
1662
|
+
f"Error response ({error_response.status_code}): {error_response.text}"
|
|
1663
|
+
)
|
|
1504
1664
|
raise http_error
|
|
1505
1665
|
|
|
1506
1666
|
return get_request()
|
|
1507
1667
|
|
|
1668
|
+
@staticmethod
|
|
1669
|
+
def _get_process_memory():
|
|
1670
|
+
process = psutil.Process(os.getpid())
|
|
1671
|
+
mem_info = process.memory_info()
|
|
1672
|
+
return {
|
|
1673
|
+
"rss": mem_info.rss / (1024 * 1024),
|
|
1674
|
+
"vms": mem_info.vms / (1024 * 1024),
|
|
1675
|
+
"shared": getattr(mem_info, "shared", 0) / (1024 * 1024),
|
|
1676
|
+
"data": getattr(mem_info, "data", 0) / (1024 * 1024),
|
|
1677
|
+
}
|
|
1678
|
+
|
|
1508
1679
|
@staticmethod
|
|
1509
1680
|
def create_embed_aspect_mcp(
|
|
1510
1681
|
entity_urn: str, embed_url: str
|
|
@@ -1540,115 +1711,116 @@ class ModeSource(StatefulIngestionSourceBase):
|
|
|
1540
1711
|
yield from self.construct_space_container(space_token, space_name)
|
|
1541
1712
|
space_container_key = self.gen_space_key(space_token)
|
|
1542
1713
|
|
|
1543
|
-
|
|
1544
|
-
|
|
1545
|
-
|
|
1546
|
-
|
|
1547
|
-
|
|
1548
|
-
|
|
1549
|
-
|
|
1550
|
-
|
|
1551
|
-
|
|
1552
|
-
if dashboard_tuple_from_report is None:
|
|
1553
|
-
continue
|
|
1554
|
-
(
|
|
1555
|
-
dashboard_snapshot_from_report,
|
|
1556
|
-
browse_mcpw,
|
|
1557
|
-
) = dashboard_tuple_from_report
|
|
1714
|
+
for report_page in self._get_reports(space_token):
|
|
1715
|
+
for report in report_page:
|
|
1716
|
+
logger.debug(
|
|
1717
|
+
f"Report: name: {report.get('name')} token: {report.get('token')}"
|
|
1718
|
+
)
|
|
1719
|
+
dashboard_tuple_from_report = self.construct_dashboard(
|
|
1720
|
+
space_token=space_token, report_info=report
|
|
1721
|
+
)
|
|
1558
1722
|
|
|
1559
|
-
|
|
1560
|
-
|
|
1561
|
-
|
|
1723
|
+
if dashboard_tuple_from_report is None:
|
|
1724
|
+
continue
|
|
1725
|
+
(
|
|
1726
|
+
dashboard_snapshot_from_report,
|
|
1727
|
+
browse_mcpw,
|
|
1728
|
+
) = dashboard_tuple_from_report
|
|
1562
1729
|
|
|
1563
|
-
|
|
1564
|
-
|
|
1565
|
-
|
|
1566
|
-
)
|
|
1567
|
-
yield mcpw.as_workunit()
|
|
1568
|
-
yield from add_dataset_to_container(
|
|
1569
|
-
container_key=space_container_key,
|
|
1570
|
-
dataset_urn=dashboard_snapshot_from_report.urn,
|
|
1571
|
-
)
|
|
1572
|
-
yield browse_mcpw.as_workunit()
|
|
1730
|
+
mce = MetadataChangeEvent(
|
|
1731
|
+
proposedSnapshot=dashboard_snapshot_from_report
|
|
1732
|
+
)
|
|
1573
1733
|
|
|
1574
|
-
|
|
1575
|
-
|
|
1576
|
-
|
|
1577
|
-
|
|
1734
|
+
mcpw = MetadataChangeProposalWrapper(
|
|
1735
|
+
entityUrn=dashboard_snapshot_from_report.urn,
|
|
1736
|
+
aspect=SubTypesClass(typeNames=[BIAssetSubTypes.MODE_REPORT]),
|
|
1737
|
+
)
|
|
1738
|
+
yield mcpw.as_workunit()
|
|
1739
|
+
yield from add_dataset_to_container(
|
|
1740
|
+
container_key=space_container_key,
|
|
1741
|
+
dataset_urn=dashboard_snapshot_from_report.urn,
|
|
1742
|
+
)
|
|
1743
|
+
yield browse_mcpw.as_workunit()
|
|
1578
1744
|
|
|
1579
|
-
|
|
1580
|
-
|
|
1581
|
-
|
|
1582
|
-
|
|
1745
|
+
usage_statistics = DashboardUsageStatisticsClass(
|
|
1746
|
+
timestampMillis=round(datetime.now().timestamp() * 1000),
|
|
1747
|
+
viewsCount=report.get("view_count", 0),
|
|
1748
|
+
)
|
|
1583
1749
|
|
|
1584
|
-
|
|
1585
|
-
|
|
1586
|
-
|
|
1587
|
-
embed_url=f"{self.config.connect_uri}/{self.config.workspace}/reports/{report.get('token')}/embed",
|
|
1750
|
+
yield MetadataChangeProposalWrapper(
|
|
1751
|
+
entityUrn=dashboard_snapshot_from_report.urn,
|
|
1752
|
+
aspect=usage_statistics,
|
|
1588
1753
|
).as_workunit()
|
|
1589
1754
|
|
|
1590
|
-
|
|
1755
|
+
if self.config.ingest_embed_url is True:
|
|
1756
|
+
yield self.create_embed_aspect_mcp(
|
|
1757
|
+
entity_urn=dashboard_snapshot_from_report.urn,
|
|
1758
|
+
embed_url=f"{self.config.connect_uri}/{self.config.workspace}/reports/{report.get('token')}/embed",
|
|
1759
|
+
).as_workunit()
|
|
1760
|
+
|
|
1761
|
+
yield MetadataWorkUnit(
|
|
1762
|
+
id=dashboard_snapshot_from_report.urn, mce=mce
|
|
1763
|
+
)
|
|
1591
1764
|
|
|
1592
1765
|
def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
|
|
1593
1766
|
# Space/collection -> report -> query -> Chart
|
|
1594
|
-
for space_token in self.space_tokens
|
|
1595
|
-
|
|
1596
|
-
|
|
1597
|
-
|
|
1598
|
-
|
|
1599
|
-
|
|
1600
|
-
|
|
1601
|
-
|
|
1602
|
-
|
|
1603
|
-
query,
|
|
1604
|
-
space_token=space_token,
|
|
1605
|
-
report_info=report,
|
|
1606
|
-
is_mode_dataset=False,
|
|
1607
|
-
)
|
|
1608
|
-
chart_fields: Dict[str, SchemaFieldClass] = {}
|
|
1609
|
-
for wu in query_mcps:
|
|
1610
|
-
if isinstance(
|
|
1611
|
-
wu.metadata, MetadataChangeProposalWrapper
|
|
1612
|
-
) and isinstance(wu.metadata.aspect, SchemaMetadataClass):
|
|
1613
|
-
schema_metadata = wu.metadata.aspect
|
|
1614
|
-
for field in schema_metadata.fields:
|
|
1615
|
-
chart_fields.setdefault(field.fieldPath, field)
|
|
1616
|
-
|
|
1617
|
-
yield wu
|
|
1618
|
-
|
|
1619
|
-
charts = self._get_charts(report_token, query.get("token", ""))
|
|
1620
|
-
# build charts
|
|
1621
|
-
for i, chart in enumerate(charts):
|
|
1622
|
-
yield from self.construct_chart_from_api_data(
|
|
1623
|
-
i,
|
|
1624
|
-
chart,
|
|
1625
|
-
chart_fields,
|
|
1767
|
+
for space_token in self.space_tokens:
|
|
1768
|
+
for report_page in self._get_reports(space_token):
|
|
1769
|
+
for report in report_page:
|
|
1770
|
+
report_token = report.get("token", "")
|
|
1771
|
+
|
|
1772
|
+
queries = self._get_queries(report_token)
|
|
1773
|
+
for query in queries:
|
|
1774
|
+
query_mcps = self.construct_query_or_dataset(
|
|
1775
|
+
report_token,
|
|
1626
1776
|
query,
|
|
1627
1777
|
space_token=space_token,
|
|
1628
1778
|
report_info=report,
|
|
1629
|
-
|
|
1779
|
+
is_mode_dataset=False,
|
|
1630
1780
|
)
|
|
1781
|
+
chart_fields: Dict[str, SchemaFieldClass] = {}
|
|
1782
|
+
for wu in query_mcps:
|
|
1783
|
+
if isinstance(
|
|
1784
|
+
wu.metadata, MetadataChangeProposalWrapper
|
|
1785
|
+
) and isinstance(wu.metadata.aspect, SchemaMetadataClass):
|
|
1786
|
+
schema_metadata = wu.metadata.aspect
|
|
1787
|
+
for field in schema_metadata.fields:
|
|
1788
|
+
chart_fields.setdefault(field.fieldPath, field)
|
|
1789
|
+
|
|
1790
|
+
yield wu
|
|
1791
|
+
|
|
1792
|
+
charts = self._get_charts(report_token, query.get("token", ""))
|
|
1793
|
+
# build charts
|
|
1794
|
+
for i, chart in enumerate(charts):
|
|
1795
|
+
yield from self.construct_chart_from_api_data(
|
|
1796
|
+
i,
|
|
1797
|
+
chart,
|
|
1798
|
+
chart_fields,
|
|
1799
|
+
query,
|
|
1800
|
+
space_token=space_token,
|
|
1801
|
+
report_info=report,
|
|
1802
|
+
query_name=query["name"],
|
|
1803
|
+
)
|
|
1631
1804
|
|
|
1632
1805
|
def emit_dataset_mces(self):
|
|
1633
1806
|
"""
|
|
1634
1807
|
Emits MetadataChangeEvents (MCEs) for datasets within each space.
|
|
1635
1808
|
"""
|
|
1636
1809
|
for space_token, _ in self.space_tokens.items():
|
|
1637
|
-
|
|
1638
|
-
|
|
1639
|
-
|
|
1640
|
-
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
yield wu
|
|
1810
|
+
for dataset_page in self._get_datasets(space_token):
|
|
1811
|
+
for report in dataset_page:
|
|
1812
|
+
report_token = report.get("token", "")
|
|
1813
|
+
queries = self._get_queries(report_token)
|
|
1814
|
+
for query in queries:
|
|
1815
|
+
query_mcps = self.construct_query_or_dataset(
|
|
1816
|
+
report_token,
|
|
1817
|
+
query,
|
|
1818
|
+
space_token=space_token,
|
|
1819
|
+
report_info=report,
|
|
1820
|
+
is_mode_dataset=True,
|
|
1821
|
+
)
|
|
1822
|
+
for wu in query_mcps:
|
|
1823
|
+
yield wu
|
|
1652
1824
|
|
|
1653
1825
|
@classmethod
|
|
1654
1826
|
def create(cls, config_dict: dict, ctx: PipelineContext) -> "ModeSource":
|
|
@@ -1667,6 +1839,12 @@ class ModeSource(StatefulIngestionSourceBase):
|
|
|
1667
1839
|
yield from self.emit_dashboard_mces()
|
|
1668
1840
|
yield from self.emit_dataset_mces()
|
|
1669
1841
|
yield from self.emit_chart_mces()
|
|
1842
|
+
cache_info = self._get_request_json.cache_info()
|
|
1843
|
+
self.report.get_cache_hits = cache_info.hits
|
|
1844
|
+
self.report.get_cache_misses = cache_info.misses
|
|
1845
|
+
self.report.get_cache_size = cache_info.currsize
|
|
1846
|
+
memory_used = self._get_process_memory()
|
|
1847
|
+
self.report.process_memory_used_mb = round(memory_used["rss"], 2)
|
|
1670
1848
|
|
|
1671
1849
|
def get_report(self) -> SourceReport:
|
|
1672
1850
|
return self.report
|