acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/grafana/lineage.py
@@ -0,0 +1,202 @@
+import logging
+from typing import Dict, List, Optional, Tuple
+
+from datahub.emitter.mce_builder import (
+    make_dataset_urn_with_platform_instance,
+    make_schema_field_urn,
+)
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.graph.client import DataHubGraph
+from datahub.ingestion.source.grafana.grafana_config import PlatformConnectionConfig
+from datahub.ingestion.source.grafana.models import (
+    DatasourceRef,
+    GrafanaQueryTarget,
+    Panel,
+)
+from datahub.ingestion.source.grafana.report import GrafanaSourceReport
+from datahub.metadata.schema_classes import (
+    DatasetLineageTypeClass,
+    FineGrainedLineageClass,
+    FineGrainedLineageDownstreamTypeClass,
+    FineGrainedLineageUpstreamTypeClass,
+    UpstreamClass,
+    UpstreamLineageClass,
+)
+from datahub.sql_parsing.sqlglot_lineage import (
+    SqlParsingResult,
+    create_lineage_sql_parsed_result,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class LineageExtractor:
+    """Handles extraction of lineage information from Grafana panels"""
+
+    def __init__(
+        self,
+        platform: str,
+        platform_instance: Optional[str],
+        env: str,
+        connection_to_platform_map: Dict[str, PlatformConnectionConfig],
+        report: GrafanaSourceReport,
+        graph: Optional[DataHubGraph] = None,
+        include_column_lineage: bool = True,
+    ):
+        self.platform = platform
+        self.platform_instance = platform_instance
+        self.env = env
+        self.connection_map = connection_to_platform_map
+        self.graph = graph
+        self.report = report
+        self.include_column_lineage = include_column_lineage
+
+    def extract_panel_lineage(
+        self, panel: Panel
+    ) -> Optional[MetadataChangeProposalWrapper]:
+        """Extract lineage information from a panel."""
+        if not panel.datasource_ref:
+            return None
+
+        ds_type, ds_uid = self._extract_datasource_info(panel.datasource_ref)
+        raw_sql = self._extract_raw_sql(panel.query_targets)
+        ds_urn = self._build_dataset_urn(ds_type, ds_uid, panel.id)
+
+        # Handle platform-specific lineage
+        if ds_uid in self.connection_map:
+            if raw_sql:
+                parsed_sql = self._parse_sql(raw_sql, self.connection_map[ds_uid])
+                if parsed_sql:
+                    lineage = self._create_column_lineage(ds_urn, parsed_sql)
+                    if lineage:
+                        return lineage
+
+            # Fall back to basic lineage if SQL parsing fails or no column lineage created
+            return self._create_basic_lineage(
+                ds_uid, self.connection_map[ds_uid], ds_urn
+            )
+
+        return None
+
+    def _extract_datasource_info(
+        self, datasource_ref: "DatasourceRef"
+    ) -> Tuple[str, str]:
+        """Extract datasource type and UID."""
+        return datasource_ref.type or "unknown", datasource_ref.uid or "unknown"
+
+    def _extract_raw_sql(
+        self, query_targets: List["GrafanaQueryTarget"]
+    ) -> Optional[str]:
+        """Extract raw SQL from panel query targets."""
+        for target in query_targets:
+            if target.get("rawSql"):
+                return target["rawSql"]
+        return None
+
+    def _build_dataset_urn(self, ds_type: str, ds_uid: str, panel_id: str) -> str:
+        """Build dataset URN."""
+        dataset_name = f"{ds_type}.{ds_uid}.{panel_id}"
+        return make_dataset_urn_with_platform_instance(
+            platform=self.platform,
+            name=dataset_name,
+            platform_instance=self.platform_instance,
+            env=self.env,
+        )
+
+    def _create_basic_lineage(
+        self, ds_uid: str, platform_config: PlatformConnectionConfig, ds_urn: str
+    ) -> MetadataChangeProposalWrapper:
+        """Create basic upstream lineage."""
+        name = (
+            f"{platform_config.database}.{ds_uid}"
+            if platform_config.database
+            else ds_uid
+        )
+
+        upstream_urn = make_dataset_urn_with_platform_instance(
+            platform=platform_config.platform,
+            name=name,
+            platform_instance=platform_config.platform_instance,
+            env=platform_config.env,
+        )
+
+        logger.info(f"Generated upstream URN: {upstream_urn}")
+
+        return MetadataChangeProposalWrapper(
+            entityUrn=ds_urn,
+            aspect=UpstreamLineageClass(
+                upstreams=[
+                    UpstreamClass(
+                        dataset=upstream_urn,
+                        type=DatasetLineageTypeClass.TRANSFORMED,
+                    )
+                ]
+            ),
+        )
+
+    def _parse_sql(
+        self, sql: str, platform_config: PlatformConnectionConfig
+    ) -> Optional[SqlParsingResult]:
+        """Parse SQL query for lineage information."""
+        if not self.graph:
+            logger.warning("No DataHub graph specified for SQL parsing.")
+            return None
+
+        try:
+            return create_lineage_sql_parsed_result(
+                query=sql,
+                platform=platform_config.platform,
+                platform_instance=platform_config.platform_instance,
+                env=platform_config.env,
+                default_db=platform_config.database,
+                default_schema=platform_config.database_schema,
+                graph=self.graph,
+            )
+        except ValueError as e:
+            logger.error(f"SQL parsing error for query: {sql}", exc_info=e)
+        except Exception as e:
+            logger.exception(f"Unexpected error during SQL parsing: {sql}", exc_info=e)
+
+        return None
+
+    def _create_column_lineage(
+        self,
+        dataset_urn: str,
+        parsed_sql: SqlParsingResult,
+    ) -> Optional[MetadataChangeProposalWrapper]:
+        """Create column-level lineage"""
+        if not parsed_sql.column_lineage or not self.include_column_lineage:
+            return None
+
+        upstream_lineages = []
+        for col_lineage in parsed_sql.column_lineage:
+            upstream_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=[
+                        make_schema_field_urn(
+                            dataset_urn, col_lineage.downstream.column
+                        )
+                    ],
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=[
+                        make_schema_field_urn(upstream_dataset, col.column)
+                        for col in col_lineage.upstreams
+                        for upstream_dataset in parsed_sql.in_tables
+                    ],
+                )
+            )
+
+        return MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn,
+            aspect=UpstreamLineageClass(
+                upstreams=[
+                    UpstreamClass(
+                        dataset=table,
+                        type=DatasetLineageTypeClass.TRANSFORMED,
+                    )
+                    for table in parsed_sql.in_tables
+                ],
+                fineGrainedLineages=upstream_lineages,
+            ),
+        )
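Note on the new LineageExtractor: lineage is keyed off the datasource UID, and column-level lineage is only produced when a DataHubGraph is available for SQL parsing. A minimal usage sketch, assuming PlatformConnectionConfig accepts the fields that lineage.py reads from it (platform, platform_instance, env, database) and that GrafanaSourceReport constructs with defaults; the UID, database, and query are illustrative:

    from datahub.ingestion.source.grafana.grafana_config import PlatformConnectionConfig
    from datahub.ingestion.source.grafana.lineage import LineageExtractor
    from datahub.ingestion.source.grafana.models import Panel
    from datahub.ingestion.source.grafana.report import GrafanaSourceReport

    # Map a Grafana datasource UID to the warehouse it queries (values illustrative).
    connection_map = {
        "postgres-ds": PlatformConnectionConfig(
            platform="postgres",
            platform_instance=None,
            env="PROD",
            database="analytics",
        )
    }

    extractor = LineageExtractor(
        platform="grafana",
        platform_instance=None,
        env="PROD",
        connection_to_platform_map=connection_map,
        report=GrafanaSourceReport(),
        graph=None,  # without a DataHubGraph, _parse_sql returns None
    )

    panel = Panel.parse_obj(
        {
            "id": 1,  # coerced to "1" by coerce_numbers_to_str
            "title": "Revenue",
            "datasource": {"type": "postgres", "uid": "postgres-ds"},
            "targets": [{"refId": "A", "rawSql": "SELECT day, revenue FROM sales"}],
        }
    )

    # With graph=None, SQL parsing is skipped and the extractor falls back to
    # _create_basic_lineage: a single TRANSFORMED upstream, "analytics.postgres-ds".
    mcp = extractor.extract_panel_lineage(panel)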
datahub/ingestion/source/grafana/models.py
@@ -0,0 +1,137 @@
+"""Grafana data models for DataHub ingestion.
+
+References:
+- Grafana HTTP API: https://grafana.com/docs/grafana/latest/developers/http_api/
+- Dashboard API: https://grafana.com/docs/grafana/latest/developers/http_api/dashboard/
+- Folder API: https://grafana.com/docs/grafana/latest/developers/http_api/folder/
+- Search API: https://grafana.com/docs/grafana/latest/developers/http_api/other/#search-api
+- Dashboard JSON structure: https://grafana.com/docs/grafana/latest/dashboards/build-dashboards/view-dashboard-json-model/
+"""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from datahub.emitter.mcp_builder import ContainerKey
+
+logger = logging.getLogger(__name__)
+# Grafana-specific type definitions for better type safety
+GrafanaQueryTarget = Dict[
+    str, Any
+]  # Query targets: refId, expr/query, datasource, hide, etc.
+GrafanaFieldConfig = Dict[
+    str, Any
+]  # Field config: defaults, overrides, display settings
+GrafanaTransformation = Dict[str, Any]  # Transformations: id, options
+
+
+class _GrafanaBaseModel(BaseModel):
+    model_config = ConfigDict(coerce_numbers_to_str=True)
+
+
+class DatasourceRef(_GrafanaBaseModel):
+    """Reference to a Grafana datasource."""
+
+    type: Optional[str] = None  # Datasource type (prometheus, mysql, postgres, etc.)
+    uid: Optional[str] = None  # Datasource unique identifier
+    name: Optional[str] = None  # Datasource display name
+
+
+class Panel(_GrafanaBaseModel):
+    """Represents a Grafana dashboard panel."""
+
+    id: str
+    title: str
+    description: str = ""
+    type: Optional[str] = None
+    # Query targets - each contains refId (A,B,C...), query/expr, datasource ref, etc.
+    query_targets: List[GrafanaQueryTarget] = Field(
+        default_factory=list, alias="targets"
+    )
+    # Datasource reference - contains type, uid, name
+    datasource_ref: Optional[DatasourceRef] = Field(default=None, alias="datasource")
+    # Field configuration - display settings, defaults, overrides
+    field_config: GrafanaFieldConfig = Field(default_factory=dict, alias="fieldConfig")
+    # Data transformations - each contains id and transformation-specific options
+    transformations: List[GrafanaTransformation] = Field(default_factory=list)
+
+
+class Dashboard(_GrafanaBaseModel):
+    """Represents a Grafana dashboard."""
+
+    uid: str
+    title: str
+    description: str = ""
+    version: Optional[str] = None
+    panels: List[Panel]
+    tags: List[str]
+    timezone: Optional[str] = None
+    refresh: Optional[str] = None
+    schema_version: Optional[str] = Field(default=None, alias="schemaVersion")
+    folder_id: Optional[str] = Field(default=None, alias="meta.folderId")
+    created_by: Optional[str] = None
+
+    @staticmethod
+    def extract_panels(panels_data: List[Dict[str, Any]]) -> List[Panel]:
+        """Extract panels, including nested ones."""
+        panels: List[Panel] = []
+        for panel_data in panels_data:
+            if panel_data.get("type") == "row" and "panels" in panel_data:
+                panels.extend(
+                    Panel.parse_obj(p)
+                    for p in panel_data["panels"]
+                    if p.get("type") != "row"
+                )
+            elif panel_data.get("type") != "row":
+                panels.append(Panel.parse_obj(panel_data))
+        return panels
+
+    @classmethod
+    def parse_obj(cls, data: Dict[str, Any]) -> "Dashboard":
+        """Custom parsing to handle nested panel extraction."""
+        dashboard_data = data.get("dashboard", {})
+        _panel_data = dashboard_data.get("panels", [])
+        panels = []
+        try:
+            panels = cls.extract_panels(_panel_data)
+        except Exception as e:
+            logger.warning(
+                f"Error extracting panels from dashboard for dashboard panels {_panel_data} : {e}"
+            )
+
+        # Extract meta.folderId from nested structure
+        meta = dashboard_data.get("meta", {})
+        folder_id = meta.get("folderId")
+
+        # Create dashboard data without meta to avoid conflicts
+        dashboard_dict = {**dashboard_data, "panels": panels, "folder_id": folder_id}
+        if "meta" in dashboard_dict:
+            del dashboard_dict["meta"]
+
+        # Handle refresh field type mismatch - convert boolean to string
+        if "refresh" in dashboard_dict and isinstance(dashboard_dict["refresh"], bool):
+            dashboard_dict["refresh"] = str(dashboard_dict["refresh"])
+
+        return super().parse_obj(dashboard_dict)
+
+
+class Folder(_GrafanaBaseModel):
+    """Represents a Grafana folder."""
+
+    id: str
+    title: str
+    description: Optional[str] = ""
+
+
+class FolderKey(ContainerKey):
+    """Key for identifying a Grafana folder."""
+
+    folder_id: str
+
+
+class DashboardContainerKey(ContainerKey):
+    """Key for identifying a Grafana dashboard."""
+
+    dashboard_id: str
+    folder_id: Optional[str] = None  # Reference to parent folder
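Note on Dashboard.parse_obj: it expects the payload wrapped under a "dashboard" key, flattens row panels into a single list, pulls folder metadata from the nested "meta" object before dropping it, and coerces a boolean refresh value to a string. An illustrative input (field values are made up):

    from datahub.ingestion.source.grafana.models import Dashboard

    api_response = {
        "dashboard": {
            "uid": "abc123",
            "title": "Service Health",
            "tags": ["prod"],
            "refresh": False,  # boolean is converted to the string "False"
            "meta": {"folderId": 42},  # extracted, then "meta" is dropped
            "panels": [
                {"id": 1, "type": "timeseries", "title": "Latency"},
                {   # row panel: its children are lifted to the top level
                    "id": 2,
                    "type": "row",
                    "title": "Details",
                    "panels": [{"id": 3, "type": "table", "title": "Errors"}],
                },
            ],
        }
    }

    dashboard = Dashboard.parse_obj(api_response)
    assert [p.title for p in dashboard.panels] == ["Latency", "Errors"]
    assert dashboard.refresh == "False"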
datahub/ingestion/source/grafana/report.py
@@ -0,0 +1,90 @@
+from dataclasses import dataclass
+
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalSourceReport,
+)
+
+
+@dataclass
+class GrafanaSourceReport(StaleEntityRemovalSourceReport):
+    # Entity counters
+    dashboards_scanned: int = 0
+    charts_scanned: int = 0
+    folders_scanned: int = 0
+    datasets_scanned: int = 0
+
+    # Lineage counters
+    panels_with_lineage: int = 0
+    panels_without_lineage: int = 0
+    lineage_extraction_failures: int = 0
+    sql_parsing_attempts: int = 0
+    sql_parsing_successes: int = 0
+    sql_parsing_failures: int = 0
+
+    # Schema extraction counters
+    panels_with_schema_fields: int = 0
+    panels_without_schema_fields: int = 0
+
+    # Warning counters
+    permission_warnings: int = 0
+    datasource_warnings: int = 0
+    panel_parsing_warnings: int = 0
+
+    def report_dashboard_scanned(self) -> None:
+        self.dashboards_scanned += 1
+
+    def report_chart_scanned(self) -> None:
+        self.charts_scanned += 1
+
+    def report_folder_scanned(self) -> None:
+        self.folders_scanned += 1
+
+    def report_dataset_scanned(self) -> None:
+        self.datasets_scanned += 1
+
+    # Lineage reporting methods
+    def report_lineage_extracted(self) -> None:
+        """Report successful lineage extraction for a panel"""
+        self.panels_with_lineage += 1
+
+    def report_no_lineage(self) -> None:
+        """Report that no lineage was found for a panel"""
+        self.panels_without_lineage += 1
+
+    def report_lineage_extraction_failure(self) -> None:
+        """Report failure to extract lineage for a panel"""
+        self.lineage_extraction_failures += 1
+
+    def report_sql_parsing_attempt(self) -> None:
+        """Report attempt to parse SQL"""
+        self.sql_parsing_attempts += 1
+
+    def report_sql_parsing_success(self) -> None:
+        """Report successful SQL parsing"""
+        self.sql_parsing_successes += 1
+
+    def report_sql_parsing_failure(self) -> None:
+        """Report failed SQL parsing"""
+        self.sql_parsing_failures += 1
+
+    # Schema field reporting methods
+    def report_schema_fields_extracted(self) -> None:
+        """Report that schema fields were extracted for a panel"""
+        self.panels_with_schema_fields += 1
+
+    def report_no_schema_fields(self) -> None:
+        """Report that no schema fields were found for a panel"""
+        self.panels_without_schema_fields += 1
+
+    # Warning reporting methods
+    def report_permission_warning(self) -> None:
+        """Report a permission-related warning"""
+        self.permission_warnings += 1
+
+    def report_datasource_warning(self) -> None:
+        """Report a datasource-related warning"""
+        self.datasource_warnings += 1
+
+    def report_panel_parsing_warning(self) -> None:
+        """Report a panel parsing warning"""
+        self.panel_parsing_warnings += 1
datahub/ingestion/source/grafana/types.py
@@ -0,0 +1,16 @@
+from datahub.metadata.schema_classes import (
+    ChartTypeClass,
+)
+
+CHART_TYPE_MAPPINGS = {
+    "graph": ChartTypeClass.LINE,
+    "timeseries": ChartTypeClass.LINE,
+    "table": ChartTypeClass.TABLE,
+    "stat": ChartTypeClass.TEXT,
+    "gauge": ChartTypeClass.TEXT,
+    "bargauge": ChartTypeClass.TEXT,
+    "bar": ChartTypeClass.BAR,
+    "pie": ChartTypeClass.PIE,
+    "heatmap": ChartTypeClass.TABLE,
+    "histogram": ChartTypeClass.BAR,
+}
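Note: CHART_TYPE_MAPPINGS covers only the common Grafana panel types; presumably the source leaves the chart type unset for anything else. A hedged lookup sketch (map_chart_type is a hypothetical helper, not part of this diff):

    from typing import Optional

    from datahub.ingestion.source.grafana.types import CHART_TYPE_MAPPINGS
    from datahub.metadata.schema_classes import ChartTypeClass

    def map_chart_type(panel_type: Optional[str]) -> Optional[str]:
        # Unmapped panel types (e.g. "geomap") yield None rather than a wrong type.
        return CHART_TYPE_MAPPINGS.get(panel_type) if panel_type else None

    assert map_chart_type("timeseries") == ChartTypeClass.LINE
    assert map_chart_type("geomap") is None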
datahub/ingestion/source/hex/api.py
@@ -5,7 +5,9 @@ from typing import Any, Dict, Generator, List, Optional, Union
 
 import requests
 from pydantic import BaseModel, Field, ValidationError, validator
+from requests.adapters import HTTPAdapter
 from typing_extensions import assert_never
+from urllib3.util.retry import Retry
 
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.source.hex.constants import (
@@ -220,6 +222,7 @@ class HexApi:
         self.base_url = base_url
         self.report = report
         self.page_size = page_size
+        self.session = self._create_retry_session()
 
     def _list_projects_url(self):
        return f"{self.base_url}/projects"
@@ -227,6 +230,28 @@ class HexApi:
     def _auth_header(self):
         return {"Authorization": f"Bearer {self.token}"}
 
+    def _create_retry_session(self) -> requests.Session:
+        """Create a requests session with retry logic for rate limiting.
+
+        Hex API rate limit: 60 requests per minute
+        https://learn.hex.tech/docs/api/api-overview#kernel-and-rate-limits
+        """
+        session = requests.Session()
+
+        # Configure retry strategy for 429 (Too Many Requests) with exponential backoff
+        retry_strategy = Retry(
+            total=5,  # Maximum number of retries
+            status_forcelist=[429],  # Only retry on 429 status code
+            backoff_factor=2,  # Exponential backoff: 2, 4, 8, 16, 32 seconds
+            raise_on_status=True,  # Raise exception after max retries
+        )
+
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+
+        return session
+
     def fetch_projects(
         self,
         include_components: bool = True,
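Note on the retry handling: because the Retry policy is mounted on the session through HTTPAdapter, throttled calls are retried at the transport layer and every session.get() benefits with no change at the call sites. The same pattern in isolation (the URL is a placeholder; exact sleep times depend on the urllib3 version, which sleeps backoff_factor * 2**(retry - 1) and in some versions skips the first sleep):

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    session = requests.Session()
    session.mount(
        "https://",
        HTTPAdapter(
            max_retries=Retry(
                total=5,                 # give up after 5 retries
                status_forcelist=[429],  # retry only on rate-limit responses
                backoff_factor=2,        # sleeps of roughly 2, 4, 8, 16, 32 seconds
                raise_on_status=True,    # surface an error once retries are exhausted
            )
        ),
    )

    # Call sites stay unchanged; backoff happens inside the adapter.
    response = session.get("https://app.hex.tech/api/v1/projects")  # placeholder URL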
@@ -259,7 +284,7 @@ class HexApi:
             logger.debug(f"Fetching projects page with params: {params}")
             self.report.fetch_projects_page_calls += 1
             try:
-                response = requests.get(
+                response = self.session.get(
                     url=self._list_projects_url(),
                     headers=self._auth_header(),
                     params=params,
@@ -350,6 +375,7 @@ class HexApi:
                     description=hex_item.description,
                     created_at=hex_item.created_at,
                     last_edited_at=hex_item.last_edited_at,
+                    last_published_at=hex_item.last_published_at,
                     status=status,
                     categories=categories,
                     collections=collections,
@@ -364,6 +390,7 @@ class HexApi:
                     description=hex_item.description,
                     created_at=hex_item.created_at,
                     last_edited_at=hex_item.last_edited_at,
+                    last_published_at=hex_item.last_published_at,
                     status=status,
                     categories=categories,
                     collections=collections,
datahub/ingestion/source/hex/hex.py
@@ -1,3 +1,4 @@
+from copy import deepcopy
 from dataclasses import dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Any, Dict, Iterable, List, Optional
@@ -22,6 +23,7 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.hex.api import HexApi, HexApiReport
 from datahub.ingestion.source.hex.constants import (
     DATAHUB_API_PAGE_SIZE_DEFAULT,
@@ -44,7 +46,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
     StatefulIngestionSourceBase,
 )
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.sdk.main_client import DataHubClient
 
 
@@ -68,7 +69,7 @@ class HexSourceConfig(
     )
     include_components: bool = Field(
         default=True,
-
+        description="Include Hex Components in the ingestion",
     )
     page_size: int = Field(
         default=HEX_API_PAGE_SIZE_DEFAULT,
@@ -121,7 +122,11 @@ class HexSourceConfig(
 
     @root_validator(pre=True)
     def validate_lineage_times(cls, data: Dict[str, Any]) -> Dict[str, Any]:
-        #
+        # In-place update of the input dict would cause state contamination. This was discovered through test failures
+        # in test_hex.py where the same dict is reused.
+        # So a deepcopy is performed first.
+        data = deepcopy(data)
+
         if "lineage_end_time" not in data or data["lineage_end_time"] is None:
             data["lineage_end_time"] = datetime.now(tz=timezone.utc)
         # if string is given, parse it
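Note on the deepcopy: @root_validator(pre=True) receives the caller's raw dict, so writing defaults into it leaks state back into whatever object was passed in (the test failures mentioned in the comment above). A minimal illustration of the failure mode, using plain functions instead of pydantic:

    from copy import deepcopy
    from datetime import datetime, timezone

    def validate_in_place(data):
        # BUG: fills the default into the caller's dict
        data.setdefault("lineage_end_time", datetime.now(tz=timezone.utc))
        return data

    def validate_with_copy(data):
        data = deepcopy(data)  # work on a private copy
        data.setdefault("lineage_end_time", datetime.now(tz=timezone.utc))
        return data

    recipe = {"workspace_name": "acme"}  # reused across validations
    validate_in_place(recipe)
    assert "lineage_end_time" in recipe  # caller's dict was contaminated

    recipe = {"workspace_name": "acme"}
    validate_with_copy(recipe)
    assert "lineage_end_time" not in recipe  # caller's dict untouched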
@@ -166,7 +171,6 @@ class HexSourceConfig(
 class HexReport(
     StaleEntityRemovalSourceReport,
     HexApiReport,
-    IngestionStageReport,
     HexQueryFetcherReport,
 ):
     pass
@@ -174,11 +178,18 @@
 
 @platform_name("Hex")
 @config_class(HexSourceConfig)
-@support_status(SupportStatus.
+@support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.DESCRIPTIONS, "Supported by default")
 @capability(SourceCapability.OWNERSHIP, "Supported by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.CONTAINERS, "Enabled by default")
+@capability(
+    SourceCapability.USAGE_STATS,
+    "Supported by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.HEX_PROJECT,
+    ],
+)
 class HexSource(StatefulIngestionSourceBase):
     def __init__(self, config: HexSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
datahub/ingestion/source/hex/mapper.py
@@ -122,7 +122,7 @@ class Mapper:
             lastModified=self._change_audit_stamps(
                 created_at=project.created_at, last_edited_at=project.last_edited_at
             ),
-            externalUrl=
+            externalUrl=self._get_project_or_component_external_url(project),
             customProperties=dict(id=project.id),
             datasetEdges=self._dataset_edges(project.upstream_datasets),
             # TODO: support schema field upstream, maybe InputFields?
@@ -173,7 +173,7 @@ class Mapper:
             lastModified=self._change_audit_stamps(
                 created_at=component.created_at, last_edited_at=component.last_edited_at
             ),
-            externalUrl=
+            externalUrl=self._get_project_or_component_external_url(component),
             customProperties=dict(id=component.id),
         )
 
@@ -242,6 +242,20 @@ class Mapper:
         assert isinstance(dashboard_urn, DashboardUrn)
         return dashboard_urn
 
+    def _get_project_or_component_external_url(
+        self,
+        project_or_component: Union[Project, Component],
+    ) -> Optional[str]:
+        if project_or_component.last_published_at is None:
+            return (
+                f"{self._base_url}/{self._workspace_name}/hex/{project_or_component.id}"
+            )
+        else:
+            # published Projects/Components have a different URL that everybody, not just editors, can access
+            return (
+                f"{self._base_url}/{self._workspace_name}/app/{project_or_component.id}"
+            )
+
     def _change_audit_stamps(
         self, created_at: Optional[datetime], last_edited_at: Optional[datetime]
     ) -> ChangeAuditStampsClass:
datahub/ingestion/source/hex/model.py
@@ -46,6 +46,7 @@ class Project:
     title: str
     description: Optional[str]
     last_edited_at: Optional[datetime] = None
+    last_published_at: Optional[datetime] = None
     created_at: Optional[datetime] = None
     status: Optional[Status] = None
     categories: Optional[List[Category]] = None  # TODO: emit category description!
@@ -67,6 +68,7 @@ class Component:
     title: str
     description: Optional[str]
     last_edited_at: Optional[datetime] = None
+    last_published_at: Optional[datetime] = None
     created_at: Optional[datetime] = None
     status: Optional[Status] = None
     categories: Optional[List[Category]] = None
datahub/ingestion/source/hex/query_fetcher.py
@@ -97,7 +97,7 @@ class HexQueryFetcher:
         if not query_urns or not entities_by_urn:
             self.report.warning(
                 title="No Queries found with Hex as origin",
-                message="No lineage because of no Queries found with Hex as origin in the given time range
+                message="No lineage because of no Queries found with Hex as origin in the given time range. You may need to set use_queries_v2: true on your warehouse ingestion or you may consider extending the time range to fetch more queries.",
                 context=str(
                     dict(
                         workspace_name=self.workspace_name,