acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Any, Dict, List, Optional, Union
|
|
3
|
+
|
|
4
|
+
from datahub.ingestion.graph.client import DataHubGraph
|
|
5
|
+
from datahub.ingestion.source.grafana.models import Panel
|
|
6
|
+
from datahub.metadata.schema_classes import (
|
|
7
|
+
NumberTypeClass,
|
|
8
|
+
SchemaFieldClass,
|
|
9
|
+
SchemaFieldDataTypeClass,
|
|
10
|
+
StringTypeClass,
|
|
11
|
+
TimeTypeClass,
|
|
12
|
+
)
|
|
13
|
+
from datahub.sql_parsing.sqlglot_lineage import (
|
|
14
|
+
create_lineage_sql_parsed_result,
|
|
15
|
+
infer_output_schema,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def extract_sql_column_fields(target: Dict[str, Any]) -> List[SchemaFieldClass]:
    """Extract schema fields from a target's SQL-style column list.

    Walks ``target["sql"]["columns"]`` and emits one field for every
    parameter whose ``type`` is ``"column"`` and whose ``name`` is non-empty.
    The enclosing column's ``type`` selects the field type: ``"time"`` maps
    to a time type, ``"number"`` to a number type, and anything else to a
    string type.

    Missing or null ``sql`` / ``columns`` / ``parameters`` entries are treated
    as empty. A column without a ``type`` is reported as a string field whose
    native data type is ``"unknown"`` instead of raising ``KeyError``.

    Args:
        target: A query target dictionary.

    Returns:
        The extracted fields in encounter order; an empty list when the
        target has no usable columns.
    """
    fields = []
    # Use `or` instead of a .get() default: a key that is present but null
    # bypasses the default and would break the chained access below.
    columns = (target.get("sql") or {}).get("columns") or []
    for col in columns:
        col_type = col.get("type")
        # nativeDataType needs a value even when the column declares no type.
        native_type = col_type if col_type is not None else "unknown"
        for param in col.get("parameters") or []:
            if param.get("type") != "column" or not param.get("name"):
                continue
            # A fresh type instance per field, so fields never share one.
            field_type: Union[NumberTypeClass, StringTypeClass, TimeTypeClass]
            if col_type == "time":
                field_type = TimeTypeClass()
            elif col_type == "number":
                field_type = NumberTypeClass()
            else:
                field_type = StringTypeClass()
            fields.append(
                SchemaFieldClass(
                    fieldPath=param["name"],
                    type=SchemaFieldDataTypeClass(type=field_type),
                    nativeDataType=native_type,
                )
            )
    return fields
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def extract_prometheus_fields(target: Dict[str, Any]) -> List[SchemaFieldClass]:
    """Extract a single schema field from a Prometheus-style target.

    A target with a non-empty ``expr`` yields one number-typed field whose
    native data type is ``"prometheus_metric"``. The field path is the
    target's ``legendFormat`` when that is non-empty, otherwise the
    expression itself.

    Args:
        target: A query target dictionary.

    Returns:
        A one-element list for a target with an expression, otherwise an
        empty list.
    """
    expr = target.get("expr")
    if not expr:
        return []

    # Fall back to the expression when the legend is absent *or* blank.
    # A plain .get() default only covers the absent case and would let an
    # empty legend become an empty field path.
    legend = target.get("legendFormat") or expr
    return [
        SchemaFieldClass(
            fieldPath=legend,
            type=SchemaFieldDataTypeClass(type=NumberTypeClass()),
            nativeDataType="prometheus_metric",
        )
    ]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def extract_raw_sql_fields(
    target: Dict[str, Any],
    panel: Optional[Panel] = None,
    connection_to_platform_map: Optional[Dict[str, Any]] = None,
    graph: Optional[DataHubGraph] = None,
    report: Optional[Any] = None,
) -> List[SchemaFieldClass]:
    """Derive schema fields from a target's ``rawSql`` query.

    The query is handed to ``create_lineage_sql_parsed_result`` and the output
    schema is taken from ``infer_output_schema``. When that yields nothing, or
    when anything in the parsing path raises, the basic string-based fallback
    parser is used instead.

    Args:
        target: A single query target dictionary; only ``rawSql`` is read here.
        panel: Panel owning the target; its ``datasource_ref`` (``uid`` first,
            then ``type``) is used to look up a platform mapping.
        connection_to_platform_map: Datasource UID/type -> platform config.
        graph: Optional DataHub graph client; together with a mapped platform
            it switches on schema-aware parsing.
        report: Optional report object notified of parsing attempts,
            successes and failures.

    Returns:
        The inferred fields, the fallback parser's fields, or an empty list
        when the target has no ``rawSql``.
    """
    query_text = target.get("rawSql", "")
    if not query_text:
        return []

    # Defaults used when the datasource cannot be mapped to a platform.
    upstream_platform = "unknown"
    upstream_env = "PROD"
    db_name = None
    schema_name = None
    instance_name = None
    use_schema_resolution = False

    if panel and panel.datasource_ref and connection_to_platform_map:
        type_key = panel.datasource_ref.type or "unknown"
        uid_key = panel.datasource_ref.uid or "unknown"

        # A UID-specific mapping wins over a type-wide one.
        mapping = connection_to_platform_map.get(uid_key)
        if not mapping:
            mapping = connection_to_platform_map.get(type_key)

        if mapping:
            upstream_platform = mapping.platform
            upstream_env = getattr(mapping, "env", upstream_env)
            db_name = getattr(mapping, "database", None)
            schema_name = getattr(mapping, "database_schema", None)
            instance_name = getattr(mapping, "platform_instance", None)

            # Schema-aware parsing needs a known platform and graph access.
            use_schema_resolution = bool(graph and upstream_platform != "unknown")

    if report:
        report.report_sql_parsing_attempt()

    try:
        parsed = create_lineage_sql_parsed_result(
            query=query_text,
            default_db=db_name,
            default_schema=schema_name,
            platform=upstream_platform,
            platform_instance=instance_name,
            env=upstream_env,
            schema_aware=use_schema_resolution,
            graph=graph,
        )
        inferred = infer_output_schema(parsed)

        if inferred:
            if report:
                report.report_sql_parsing_success()
            return inferred

        # Parsing worked but produced no schema: try the basic parser and
        # count the attempt by whether that parser found anything.
        logger.debug(f"No schema inferred from SQL: {query_text}")
        recovered = _extract_raw_sql_fields_fallback(target)
        if report:
            if recovered:
                report.report_sql_parsing_success()
            else:
                report.report_sql_parsing_failure()
        return recovered

    except Exception as e:
        logger.debug(f"Failed to parse SQL with DataHub parser: {query_text}, error: {e}")
        if report:
            report.report_sql_parsing_failure()
        # Basic parsing is kept as a last resort for backwards compatibility.
        return _extract_raw_sql_fields_fallback(target)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _extract_raw_sql_fields_fallback(target: Dict[str, Any]) -> List[SchemaFieldClass]:
    """Best-effort extraction of output column names from ``rawSql``.

    Used when the sqlglot-based parser fails or infers no schema. The query is
    lower-cased, the text between the first ``select`` and the first ``from``
    is split on top-level commas (commas inside parentheses are kept), and
    each piece yields one string-typed field named after its ``as`` alias or,
    failing that, its last whitespace-separated token.

    Args:
        target: A single query target dictionary; only ``rawSql`` is read.

    Returns:
        One field per select-list entry (names are lower-cased because the
        whole query is), or an empty list when there is no query or the
        ``select``/``from`` keywords cannot be located.
    """
    sql = target.get("rawSql", "").lower()
    if not sql:
        return []

    try:
        select_start = sql.index("select") + len("select")
        from_start = sql.index("from")
        select_part = sql[select_start:from_start].strip()

        # Split by comma, handling nested parentheses
        columns = []
        current_column = ""
        paren_count = 0

        for char in select_part:
            if char == "," and paren_count == 0:
                if current_column.strip():
                    columns.append(current_column.strip())
                current_column = ""
            else:
                if char == "(":
                    paren_count += 1
                elif char == ")":
                    paren_count -= 1
                current_column += char

        if current_column.strip():
            columns.append(current_column.strip())

        # For each column, extract the alias if it exists
        fields = []
        for col in columns:
            if " as " in col:
                field_name = col.split(" as ")[-1].strip()
            else:
                # No alias: use the last whitespace-separated token, which
                # covers plain columns and un-aliased function calls alike.
                field_name = col.split()[-1].strip()

            # Clean up any remaining quotes or parentheses
            field_name = field_name.strip("\"'()")

            fields.append(
                SchemaFieldClass(
                    fieldPath=field_name,
                    type=SchemaFieldDataTypeClass(type=StringTypeClass()),
                    nativeDataType="sql_column",
                )
            )

        return fields

    except (IndexError, ValueError, StopIteration) as e:
        # Pass the values as lazy %-style arguments; handing the exception to
        # a message without placeholders makes the logging call itself fail.
        logger.warning("Failed to parse SQL: %s (%s)", target.get("rawSql"), e)
        return []
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def extract_fields_from_panel(
    panel: Panel,
    connection_to_platform_map: Optional[Dict[str, Any]] = None,
    graph: Optional[DataHubGraph] = None,
    report: Optional[Any] = None,
) -> List[SchemaFieldClass]:
    """Collect every schema field that can be derived from a panel.

    Fields come from three places, concatenated in this order: the panel's
    query targets, its field configuration, and its transformations.

    Args:
        panel: The panel to inspect.
        connection_to_platform_map: Forwarded to target extraction.
        graph: Forwarded to target extraction.
        report: Optional report object; forwarded to target extraction and
            told whether any fields were found for this panel.

    Returns:
        The combined list of fields (possibly empty).
    """
    all_fields = [
        *extract_fields_from_targets(
            panel.query_targets, panel, connection_to_platform_map, graph, report
        ),
        *get_fields_from_field_config(panel.field_config),
        *get_fields_from_transformations(panel.transformations),
    ]

    # Record on the report whether this panel produced any schema fields.
    if report:
        notify = (
            report.report_schema_fields_extracted
            if all_fields
            else report.report_no_schema_fields
        )
        notify()

    return all_fields
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def extract_fields_from_targets(
    targets: List[Dict[str, Any]],
    panel: Optional[Panel] = None,
    connection_to_platform_map: Optional[Dict[str, Any]] = None,
    graph: Optional[DataHubGraph] = None,
    report: Optional[Any] = None,
) -> List[SchemaFieldClass]:
    """Run every per-target extractor over each query target.

    For each target the extractors run in a fixed order: structured SQL
    columns, Prometheus expression, raw SQL, then time/table format. Results
    are concatenated without de-duplication.

    Args:
        targets: Query target dictionaries of a panel.
        panel: Forwarded to the raw-SQL extractor.
        connection_to_platform_map: Forwarded to the raw-SQL extractor.
        graph: Forwarded to the raw-SQL extractor.
        report: Forwarded to the raw-SQL extractor.

    Returns:
        All fields found across all targets.
    """
    collected = []
    for query_target in targets:
        collected += extract_sql_column_fields(query_target)
        collected += extract_prometheus_fields(query_target)
        collected += extract_raw_sql_fields(
            query_target, panel, connection_to_platform_map, graph, report
        )
        collected += extract_time_format_fields(query_target)
    return collected
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def extract_time_format_fields(target: Dict[str, Any]) -> List[SchemaFieldClass]:
    """Add the implicit ``time`` field for time-series and table targets.

    Args:
        target: A single query target dictionary; only ``format`` is read.

    Returns:
        A one-element list with a time-typed ``time`` field when ``format``
        is ``"time_series"`` or ``"table"``, otherwise an empty list.
    """
    if target.get("format") not in {"time_series", "table"}:
        return []

    time_field = SchemaFieldClass(
        fieldPath="time",
        type=SchemaFieldDataTypeClass(type=TimeTypeClass()),
        nativeDataType="timestamp",
    )
    return [time_field]
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def get_fields_from_field_config(
    field_config: Dict[str, Any],
) -> List[SchemaFieldClass]:
    """Derive schema fields from a panel's field configuration.

    Two sources are read:

    * ``defaults.unit`` - when set, yields a number field ``value_<unit>``.
    * ``overrides`` - each override whose matcher has id ``"byName"`` and a
      non-empty ``options`` value yields a number field with that name.

    Args:
        field_config: The panel's field configuration dictionary.

    Returns:
        The derived fields, default-unit field first.
    """
    result = []

    default_unit = field_config.get("defaults", {}).get("unit")
    if default_unit:
        result.append(
            SchemaFieldClass(
                fieldPath=f"value_{default_unit}",
                type=SchemaFieldDataTypeClass(type=NumberTypeClass()),
                nativeDataType="value",
            )
        )

    for override in field_config.get("overrides", []):
        # Only name-based matchers identify a concrete field.
        if override.get("matcher", {}).get("id") != "byName":
            continue
        matched_name = override.get("matcher", {}).get("options")
        if not matched_name:
            continue
        result.append(
            SchemaFieldClass(
                fieldPath=matched_name,
                type=SchemaFieldDataTypeClass(type=NumberTypeClass()),
                nativeDataType="metric",
            )
        )

    return result
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def get_fields_from_transformations(
    transformations: List[Dict[str, Any]],
) -> List[SchemaFieldClass]:
    """Derive schema fields from a panel's ``organize`` transformations.

    Every key of ``options.indexByName`` in a transformation of type
    ``"organize"`` becomes a string-typed field; other transformation types
    are ignored.

    Args:
        transformations: The panel's transformation dictionaries.

    Returns:
        The derived fields, in transformation order.
    """
    return [
        SchemaFieldClass(
            fieldPath=organized_name,
            type=SchemaFieldDataTypeClass(type=StringTypeClass()),
            nativeDataType="transformed",
        )
        for step in transformations
        if step.get("type") == "organize"
        for organized_name in step.get("options", {}).get("indexByName", {})
    ]
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""API client for Grafana metadata extraction"""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Dict, List, Optional, Union
|
|
5
|
+
|
|
6
|
+
import requests
|
|
7
|
+
import urllib3.exceptions
|
|
8
|
+
from pydantic import SecretStr
|
|
9
|
+
|
|
10
|
+
from datahub.ingestion.source.grafana.models import Dashboard, Folder
|
|
11
|
+
from datahub.ingestion.source.grafana.report import GrafanaSourceReport
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class GrafanaAPIClient:
    """Client for making requests to Grafana API.

    Wraps a ``requests.Session`` carrying a bearer token and exposes paginated
    fetches for folders and dashboards. Request failures are recorded on the
    supplied report instead of being raised to the caller.
    """

    def __init__(
        self,
        base_url: str,
        token: SecretStr,
        verify_ssl: bool,
        page_size: int,
        report: GrafanaSourceReport,
    ) -> None:
        """Store connection settings and open the HTTP session.

        Args:
            base_url: Grafana base URL; API paths are appended directly to it.
            token: Token sent as ``Authorization: Bearer <token>``.
            verify_ssl: Whether the session verifies TLS certificates.
            page_size: Value sent as ``limit`` on paginated requests.
            report: Report that receives warnings and failures.
        """
        self.base_url = base_url
        self.verify_ssl = verify_ssl
        self.page_size = page_size
        self.report = report
        # Must come last: session creation reads verify_ssl and report.
        self.session = self._create_session(token)

    def _create_session(self, token: SecretStr) -> requests.Session:
        """Build the session with auth/JSON headers and the TLS setting."""
        session = requests.Session()
        session.headers.update(
            {
                "Authorization": f"Bearer {token.get_secret_value()}",
                "Accept": "application/json",
                "Content-Type": "application/json",
            }
        )
        session.verify = self.verify_ssl

        # If SSL verification is disabled, suppress the warnings
        if not self.verify_ssl:
            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
            self.report.warning(
                title="SSL Configuration Warning",
                message="SSL Verification is recommended.",
            )

        return session

    @staticmethod
    def _is_permission_error(exc: requests.exceptions.RequestException) -> bool:
        """Return True when *exc* carries an HTTP 401 or 403 response."""
        # A ``requests.Response`` is falsy for every 4xx/5xx status, so the
        # response has to be compared against None rather than truth-tested.
        response = exc.response
        return response is not None and response.status_code in (401, 403)

    def get_folders(self) -> List[Folder]:
        """Fetch all folders from Grafana with pagination.

        Pages are requested starting at 1 until an empty batch comes back.
        On a request error the failure is reported and whatever was collected
        so far is returned.
        """
        folders: List[Folder] = []
        page = 1
        per_page = self.page_size

        while True:
            try:
                response = self.session.get(
                    f"{self.base_url}/api/folders",
                    params={"page": page, "limit": per_page},
                )
                response.raise_for_status()

                batch = response.json()
                if not batch:
                    break

                folders.extend(Folder.parse_obj(folder) for folder in batch)
                page += 1
            except requests.exceptions.RequestException as e:
                self.report.report_failure(
                    title="Folder Fetch Error",
                    message="Failed to fetch folders on page",
                    context=str(page),
                    exc=e,
                )
                # Reported for any folder failure, regardless of status code.
                self.report.report_permission_warning()  # Likely a permission issue
                break

        return folders

    def get_dashboard(self, uid: str) -> Optional[Dashboard]:
        """Fetch a specific dashboard by UID.

        Returns:
            The parsed dashboard, or ``None`` when the request fails (a
            warning is reported, plus a permission warning on 401/403).
        """
        try:
            response = self.session.get(f"{self.base_url}/api/dashboards/uid/{uid}")
            response.raise_for_status()
            return Dashboard.parse_obj(response.json())
        except requests.exceptions.RequestException as e:
            self.report.warning(
                title="Dashboard Fetch Error",
                message="Failed to fetch dashboard",
                context=uid,
                exc=e,
            )
            if self._is_permission_error(e):
                self.report.report_permission_warning()
            return None

    def get_dashboards(self) -> List[Dashboard]:
        """Fetch all dashboards from search endpoint with pagination.

        Each search hit is resolved through :meth:`get_dashboard`; hits that
        cannot be fetched are skipped. On a search request error the failure
        is reported and the dashboards collected so far are returned.
        """
        dashboards: List[Dashboard] = []
        page = 1
        per_page = self.page_size

        while True:
            try:
                params: Dict[str, Union[str, int]] = {
                    "type": "dash-db",
                    "page": page,
                    "limit": per_page,
                }
                response = self.session.get(
                    f"{self.base_url}/api/search",
                    params=params,
                )
                response.raise_for_status()

                batch = response.json()
                if not batch:
                    break

                for result in batch:
                    dashboard = self.get_dashboard(result["uid"])
                    if dashboard:
                        dashboards.append(dashboard)
                page += 1
            except requests.exceptions.RequestException as e:
                self.report.report_failure(
                    title="Dashboard Search Error",
                    message="Failed to fetch dashboards on page",
                    context=str(page),
                    exc=e,
                )
                if self._is_permission_error(e):
                    self.report.report_permission_warning()
                break

        return dashboards
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from typing import Dict, Optional
|
|
2
|
+
|
|
3
|
+
from pydantic import Field, SecretStr, validator
|
|
4
|
+
|
|
5
|
+
from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
|
|
6
|
+
from datahub.configuration.source_common import (
|
|
7
|
+
DatasetLineageProviderConfigBase,
|
|
8
|
+
EnvConfigMixin,
|
|
9
|
+
PlatformInstanceConfigMixin,
|
|
10
|
+
)
|
|
11
|
+
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
12
|
+
StatefulIngestionConfigBase,
|
|
13
|
+
)
|
|
14
|
+
from datahub.utilities import config_clean
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PlatformConnectionConfig(
    EnvConfigMixin,
    PlatformInstanceConfigMixin,
):
    """Platform connection configuration for mapping Grafana datasources to their actual platforms.

    Instances are used as the values of
    ``GrafanaSourceConfig.connection_to_platform_map``. Besides the fields
    declared here, the class inherits whatever ``EnvConfigMixin`` and
    ``PlatformInstanceConfigMixin`` define (presumably ``env`` and
    ``platform_instance`` - confirm against those mixins).
    """

    # Required: there is no default, so every mapping entry must name a platform.
    platform: str = Field(
        description="The platform name (e.g., 'postgres', 'mysql', 'snowflake')"
    )
    # Optional defaults for the database and schema of the mapped platform.
    database: Optional[str] = Field(default=None, description="Default database name")
    database_schema: Optional[str] = Field(
        default=None, description="Default schema name"
    )
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class GrafanaSourceConfig(
    DatasetLineageProviderConfigBase,
    StatefulIngestionConfigBase,
    PlatformInstanceConfigMixin,
    EnvConfigMixin,
):
    """Configuration for Grafana source.

    Groups the connection settings (URL, token, TLS verification), pagination,
    extraction mode, allow/deny filters, feature toggles, lineage options and
    the datasource-to-platform mapping. The ``url`` value is passed through
    ``remove_trailing_slash`` during validation.
    """

    # Platform identifier; defaults to "grafana". Wrapped in HiddenFromDocs,
    # presumably to keep it out of generated docs - confirm in
    # datahub.configuration.common.
    platform: HiddenFromDocs[str] = Field(default="grafana")
    # Connection settings. ``url`` and ``service_account_token`` are required.
    url: str = Field(
        description="Grafana URL in the format http://your-grafana-instance with no trailing slash"
    )
    service_account_token: SecretStr = Field(
        description="Service account token for Grafana"
    )
    verify_ssl: bool = Field(
        default=True,
        description="Whether to verify SSL certificates when connecting to Grafana",
    )

    # API pagination configuration
    page_size: int = Field(
        default=100,
        description="Number of items to fetch per API call when paginating through folders and dashboards",
    )

    # Extraction mode configuration
    basic_mode: bool = Field(
        default=False,
        description="Enable basic extraction mode for users with limited permissions. "
        "In basic mode, only dashboard metadata is extracted without detailed panel information, "
        "lineage, or folder hierarchy. This requires only basic dashboard read permissions.",
    )

    # Content filtering: both patterns allow everything by default.
    dashboard_pattern: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description="Regex pattern to filter dashboards for ingestion",
    )
    folder_pattern: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description="Regex pattern to filter folders for ingestion",
    )

    # Feature toggles: all enabled by default.
    ingest_tags: bool = Field(
        default=True, description="Whether to ingest dashboard and chart tags"
    )
    ingest_owners: bool = Field(
        default=True, description="Whether to ingest dashboard ownership information"
    )

    include_lineage: bool = Field(
        default=True,
        description="Whether to extract lineage between charts and data sources. "
        "When enabled, the source will parse SQL queries and datasource configurations "
        "to build lineage relationships.",
    )
    include_column_lineage: bool = Field(
        default=True,
        description="Whether to extract column-level lineage from SQL queries. "
        "Only applicable when include_lineage is enabled.",
    )

    # Platform connection mappings, keyed by Grafana datasource type or UID.
    # Empty by default.
    connection_to_platform_map: Dict[str, PlatformConnectionConfig] = Field(
        default_factory=dict,
        description="Map of Grafana datasource types/UIDs to platform connection configs for lineage extraction",
    )

    @validator("url", allow_reuse=True)
    def remove_trailing_slash(cls, v):
        """Normalize ``url`` by delegating to ``config_clean.remove_trailing_slashes``."""
        return config_clean.remove_trailing_slashes(v)
|