acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -1,18 +1,33 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import re
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
|
+
from datetime import datetime
|
|
4
5
|
from functools import lru_cache
|
|
5
6
|
from typing import Dict, List, Optional
|
|
6
7
|
|
|
8
|
+
from looker_sdk.sdk.api40.models import (
|
|
9
|
+
WriteQuery,
|
|
10
|
+
)
|
|
11
|
+
|
|
7
12
|
from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
|
|
8
13
|
from datahub.ingestion.api.common import PipelineContext
|
|
9
14
|
from datahub.ingestion.source.looker.looker_common import (
|
|
10
15
|
LookerExplore,
|
|
11
16
|
LookerViewId,
|
|
12
17
|
ViewField,
|
|
18
|
+
ViewFieldDimensionGroupType,
|
|
13
19
|
ViewFieldType,
|
|
14
20
|
)
|
|
15
21
|
from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition
|
|
22
|
+
from datahub.ingestion.source.looker.looker_constant import (
|
|
23
|
+
NAME,
|
|
24
|
+
VIEW_FIELD_INTERVALS_ATTRIBUTE,
|
|
25
|
+
VIEW_FIELD_TIMEFRAMES_ATTRIBUTE,
|
|
26
|
+
VIEW_FIELD_TYPE_ATTRIBUTE,
|
|
27
|
+
)
|
|
28
|
+
from datahub.ingestion.source.looker.looker_lib_wrapper import (
|
|
29
|
+
LookerAPI,
|
|
30
|
+
)
|
|
16
31
|
from datahub.ingestion.source.looker.looker_view_id_cache import LookerViewIdCache
|
|
17
32
|
from datahub.ingestion.source.looker.lookml_concept_context import (
|
|
18
33
|
LookerFieldContext,
|
|
@@ -20,7 +35,6 @@ from datahub.ingestion.source.looker.lookml_concept_context import (
|
|
|
20
35
|
)
|
|
21
36
|
from datahub.ingestion.source.looker.lookml_config import (
|
|
22
37
|
DERIVED_VIEW_SUFFIX,
|
|
23
|
-
NAME,
|
|
24
38
|
LookMLSourceConfig,
|
|
25
39
|
LookMLSourceReport,
|
|
26
40
|
)
|
|
@@ -280,6 +294,447 @@ class AbstractViewUpstream(ABC):
|
|
|
280
294
|
return upstream_column_refs
|
|
281
295
|
|
|
282
296
|
|
|
297
|
+
class LookerQueryAPIBasedViewUpstream(AbstractViewUpstream):
|
|
298
|
+
"""
|
|
299
|
+
Implements Looker view upstream lineage extraction using the Looker Query API.
|
|
300
|
+
|
|
301
|
+
This class leverages the Looker API to generate the fully resolved SQL for a Looker view by constructing a WriteQuery
|
|
302
|
+
that includes all dimensions, dimension groups and measures. The SQL is then parsed to extract column-level lineage.
|
|
303
|
+
The Looker client is required for this class, as it is used to execute the WriteQuery and retrieve the SQL.
|
|
304
|
+
|
|
305
|
+
Other view upstream implementations use string parsing to extract lineage information from the SQL, which does not cover all the edge cases.
|
|
306
|
+
Limitations of string based lineage extraction: Ref: https://cloud.google.com/looker/docs/reference/param-field-sql#sql_for_dimensions
|
|
307
|
+
|
|
308
|
+
Key Features:
|
|
309
|
+
- Requires a Looker client (`looker_client`) to execute queries and retrieve SQL for the view.
|
|
310
|
+
- Requires a `view_to_explore_map` to map view names to their corresponding explore name
|
|
311
|
+
- Field name translation is handled: Looker API field names are constructed as `<view_name>.<field_name>`, and helper
|
|
312
|
+
methods are provided to convert between Looker API field names and raw field names.
|
|
313
|
+
- SQL parsing is cached for efficiency, and the class is designed to gracefully fall back if the Looker Query API fails.
|
|
314
|
+
- All lineage extraction is based on the SQL returned by the Looker API, ensuring accurate and up-to-date lineage.
|
|
315
|
+
|
|
316
|
+
Why view_to_explore_map is required:
|
|
317
|
+
The Looker Query API expects the explore name (not the view name) as the "view" parameter in the WriteQuery.
|
|
318
|
+
In Looker, a view can be referenced by multiple explores, but the API needs any one of the
|
|
319
|
+
explores to access the view's fields
|
|
320
|
+
|
|
321
|
+
Example WriteQuery request (see `_execute_query` for details):
|
|
322
|
+
{
|
|
323
|
+
"model": "test_model",
|
|
324
|
+
"view": "users_explore", # This is the explore name, not the view name
|
|
325
|
+
"fields": [
|
|
326
|
+
"users.email", "users.lifetime_purchase_count"
|
|
327
|
+
],
|
|
328
|
+
"limit": "1",
|
|
329
|
+
"cache": true
|
|
330
|
+
}
|
|
331
|
+
The SQL response is then parsed to extract upstream tables and column-level lineage.
|
|
332
|
+
|
|
333
|
+
For further details, see the method-level docstrings, especially:
|
|
334
|
+
- `__get_spr`: SQL parsing and lineage extraction workflow
|
|
335
|
+
- `_get_sql_write_query`: WriteQuery construction and field enumeration
|
|
336
|
+
- `_execute_query`: Looker API invocation and SQL retrieval - this only generates the SQL query, does not execute it
|
|
337
|
+
- Field name translation: `_get_looker_api_field_name` and `_get_field_name_from_looker_api_field_name`
|
|
338
|
+
|
|
339
|
+
Note: This class is intended to be robust and raise exceptions if SQL parsing or API calls fail, and will fall back to
|
|
340
|
+
other implementations - custom regex-based parsing if necessary.
|
|
341
|
+
"""
|
|
342
|
+
|
|
343
|
+
def __init__(
    self,
    view_context: LookerViewContext,
    looker_view_id_cache: LookerViewIdCache,
    config: LookMLSourceConfig,
    reporter: LookMLSourceReport,
    ctx: PipelineContext,
    looker_client: LookerAPI,
    view_to_explore_map: Dict[str, str],
):
    """
    Set up Looker-Query-API-based lineage extraction for a single view.

    Eagerly resolves the SQL parse result so that a failing Looker API
    call surfaces here and lets the caller fall back to another
    implementation (see view_upstream.create_view_upstream).
    """
    super().__init__(view_context, looker_view_id_cache, config, reporter, ctx)
    self.looker_client = looker_client
    self.view_to_explore_map = view_to_explore_map

    # Per-instance memoization: a fresh instance is created per view
    # (Ref: view_upstream.create_view_upstream), so one cached entry suffices.
    memoize = lru_cache(maxsize=1)
    self._get_spr = memoize(self.__get_spr)
    self._get_upstream_dataset_urn = memoize(self.__get_upstream_dataset_urn)

    # Warm the cache now; an exception here triggers the fallback path.
    self._get_spr()
def __get_spr(self) -> SqlParsingResult:
    """
    Build the SQL parsing result (lineage) for the current Looker view.

    Workflow:
        1. Construct a WriteQuery enumerating every field of the view.
        2. Ask the Looker API for the SQL of that query (SQL is only
           generated, never executed against the warehouse).
        3. Parse the SQL to extract table- and column-level lineage.

    Returns:
        SqlParsingResult with upstream tables and column lineage.

    Raises:
        ValueError: If no SQL is found in the response, no fields are found
            for the view, the explore name is missing, or SQL parsing fails
            for upstream tables or column lineage.
    """
    # Build the WriteQuery for the current view and have Looker render it
    # as SQL. Both helpers raise ValueError on failure, which propagates to
    # the caller so it can fall back to a non-API lineage implementation.
    sql_query: WriteQuery = self._get_sql_write_query()
    sql_response = self._execute_query(sql_query)

    # Parse the SQL to extract lineage information.
    spr = create_lineage_sql_parsed_result(
        query=sql_response,
        default_schema=self.view_context.view_connection.default_schema,
        default_db=self.view_context.view_connection.default_db,
        platform=self.view_context.view_connection.platform,
        platform_instance=self.view_context.view_connection.platform_instance,
        env=self.view_context.view_connection.platform_env or self.config.env,
        graph=self.ctx.graph,
    )

    # Surface parser failures uniformly: warn in the report, then raise so
    # the constructor's eager call fails and the fallback path is taken.
    for title, subject, error in (
        (
            "Table Level Lineage Extraction Failed",
            "upstream tables",
            spr.debug_info.table_error,
        ),
        (
            "Column Level Lineage Extraction Failed",
            "column lineage",
            spr.debug_info.column_error,
        ),
    ):
        if error is not None:
            self.reporter.report_warning(
                title=title,
                message="Error in parsing derived sql",
                context=f"View-name: {self.view_context.name()}",
                exc=error,
            )
            raise ValueError(f"Error in parsing SQL for {subject}: {error}")

    return spr
def _get_time_dim_group_field_name(self, dim_group: dict) -> str:
    """
    Build the referenceable field name for a time dimension group.

    Time dimension groups must be addressed by one of their
    timeframe-suffixed dimensions. Example:
        dimension_group: created {
            type: time
            timeframes: [date, week, month]
            sql: ${TABLE}.created_at ;;
        }
        created -> created_date, created_week, created_month

    When no timeframes are declared (rare), Looker exposes all possible
    timeframes and we pick the "raw" variant.

    # Ref: https://cloud.google.com/looker/docs/reference/param-field-dimension-group#dimension_groups_must_be_referenced_by_their_individual_dimensions
    """
    base_name = dim_group.get(NAME)
    declared_timeframes = dim_group.get(VIEW_FIELD_TIMEFRAMES_ATTRIBUTE)
    if declared_timeframes:
        return f"{base_name}_{declared_timeframes[0]}"
    return f"{base_name}_raw"
def _get_duration_dim_group_field_name(self, dim_group: dict) -> str:
    """
    Build the referenceable field name for a duration dimension group.

    Duration dimension groups must be addressed with the pluralized
    interval value as a prefix. Example:
        dimension_group: since_event {
            type: duration
            intervals: [hour, day, week, month, quarter, year]
            sql_start: ${faa_event_date_raw} ;;
            sql_end: CURRENT_TIMESTAMP();;
        }
        since_event -> hours_since_event, days_since_event, ...

    When no intervals are declared (rare), Looker exposes all possible
    intervals and we pick "days".

    # Ref: https://cloud.google.com/looker/docs/reference/param-field-dimension-group#referencing_intervals_from_another_lookml_field
    """
    base_name = dim_group.get(NAME)
    declared_intervals = dim_group.get(VIEW_FIELD_INTERVALS_ATTRIBUTE)
    if declared_intervals:
        return f"{declared_intervals[0]}s_{base_name}"
    return f"days_{base_name}"
def _get_sql_write_query(self) -> WriteQuery:
    """
    Construct a WriteQuery that yields the fully resolved SQL for this view.

    All dimensions, measures and dimension-group fields of the view are
    enumerated so the generated SQL covers every column. The Looker Query
    API expects the *explore* name (not the view name) as the ``view``
    parameter, which is why ``view_to_explore_map`` is consulted.

    Ref: https://cloud.google.com/looker/docs/reference/param-field-sql#sql_for_dimensions

    Returns:
        WriteQuery: query whose generated SQL is later parsed for lineage.

    Raises:
        ValueError: If the explore name is not found in view_to_explore_map.
        ValueError: If no fields are found for the view.
    """
    # Collect all dimension and measure fields as <view_name>.<field_name>.
    view_fields: List[str] = []
    for field in self.view_context.dimensions() + self.view_context.measures():
        field_name = field.get(NAME)
        if not field_name:
            # A field without a name cannot be queried; skip it rather than
            # aborting lineage for the whole view.
            continue
        view_fields.append(self._get_looker_api_field_name(field_name))

    # Dimension groups are referenced via their timeframe/interval variants.
    for dim_group in self.view_context.dimension_groups():
        dim_group_type: ViewFieldDimensionGroupType = ViewFieldDimensionGroupType(
            dim_group.get(VIEW_FIELD_TYPE_ATTRIBUTE)
        )

        if dim_group_type == ViewFieldDimensionGroupType.TIME:
            view_fields.append(
                self._get_looker_api_field_name(
                    self._get_time_dim_group_field_name(dim_group)
                )
            )
        elif dim_group_type == ViewFieldDimensionGroupType.DURATION:
            view_fields.append(
                self._get_looker_api_field_name(
                    self._get_duration_dim_group_field_name(dim_group)
                )
            )

    # The Looker Query API needs any explore that can reach this view.
    # Raise ValueError per the documented contract (the previous `assert`
    # raised AssertionError and was stripped under `python -O`).
    explore_name = self.view_to_explore_map.get(self.view_context.name())
    if not explore_name:
        raise ValueError(
            f"No explore name found for view '{self.view_context.name()}'. Cannot proceed with Looker API for view lineage."
        )

    if not view_fields:
        raise ValueError(
            f"No fields found for view '{self.view_context.name()}'. Cannot proceed with Looker API for view lineage."
        )

    # 'limit' is "1" because the query is only used to obtain SQL, not data.
    return WriteQuery(
        model=self.looker_view_id_cache.model_name,
        view=explore_name,
        fields=view_fields,
        filters={},
        limit="1",
    )
def _execute_query(self, query: WriteQuery) -> str:
    """
    Ask the Looker API to render the given WriteQuery as SQL.

    Only the SQL text is generated; the query is NOT executed against the
    warehouse. The latency of the API call is recorded in the report.

    Ref: https://cloud.google.com/looker/docs/reference/looker-api/latest/methods/Query/run_inline_query

    Example request:
        WriteQuery:
            {
                "model": "test_model",
                "view": "users",
                "fields": [
                    "users.email", "users.lifetime_purchase_count"
                ],
                "limit": "1",
                "cache": true
            }

    Example response:
        SELECT
            users."EMAIL" AS "users.email",
            COUNT(DISTINCT ( purchases."PK" ) ) AS "users.lifetime_purchase_count"
        FROM "ECOMMERCE"."USERS" AS users
        LEFT JOIN "ECOMMERCE"."PURCHASES" AS purchases ON (users."PK") = (purchases."USER_FK")
        GROUP BY
            1
        ORDER BY
            2 DESC
        FETCH NEXT 1 ROWS ONLY

    Args:
        query (WriteQuery): The Looker WriteQuery object to render as SQL.

    Returns:
        str: The SQL string returned by the Looker API.

    Raises:
        ValueError: If the Looker API returns an empty response.
    """
    # Record the start time for latency measurement.
    start_time = datetime.now()

    # Have the Looker client generate the SQL for the query.
    sql_response = self.looker_client.generate_sql_query(
        write_query=query, use_cache=self.config.use_api_cache_for_view_lineage
    )

    # Record the end time after the API call.
    end_time = datetime.now()

    # Attempt to get the LookerViewId for reporting.
    looker_view_id: Optional[LookerViewId] = (
        self.looker_view_id_cache.get_looker_view_id(
            view_name=self.view_context.name(),
            base_folder_path=self.view_context.base_folder_path,
        )
    )

    # Report the query API latency if the view ID is available.
    if looker_view_id is not None:
        self.reporter.report_looker_query_api_latency(
            looker_view_id.get_urn(self.config),
            end_time - start_time,
        )

    # Validate the response: an empty response means no SQL was generated.
    if not sql_response:
        raise ValueError(
            f"No SQL found in response for view '{self.view_context.name()}'. Response: {sql_response}"
        )

    return sql_response
def __get_upstream_dataset_urn(self) -> List[Urn]:
    """
    Derive upstream dataset URNs from the parsed SQL of this view.

    Returns:
        List[Urn]: upstream dataset URNs with any 'hive.' prefix removed
        and derived-view references rewritten.
    """
    parse_result: SqlParsingResult = self._get_spr()

    # Normalize: strip the 'hive.' prefix from each upstream table URN.
    normalized_urns: List[str] = [
        _drop_hive_dot(table_urn) for table_urn in parse_result.in_tables
    ]

    # Rewrite any URNs that actually point at derived views, then return.
    return fix_derived_view_urn(
        urns=normalized_urns,
        looker_view_id_cache=self.looker_view_id_cache,
        base_folder_path=self.view_context.base_folder_path,
        config=self.config,
    )
def _get_looker_api_field_name(self, field_name: str) -> str:
|
|
640
|
+
"""
|
|
641
|
+
Translate the field name to the looker api field name
|
|
642
|
+
|
|
643
|
+
Example:
|
|
644
|
+
pk -> purchases.pk
|
|
645
|
+
"""
|
|
646
|
+
return f"{self.view_context.name()}.{field_name}"
|
|
647
|
+
|
|
648
|
+
def _get_field_name_from_looker_api_field_name(
|
|
649
|
+
self, looker_api_field_name: str
|
|
650
|
+
) -> str:
|
|
651
|
+
"""
|
|
652
|
+
Translate the looker api field name to the field name
|
|
653
|
+
|
|
654
|
+
Example:
|
|
655
|
+
purchases.pk -> pk
|
|
656
|
+
"""
|
|
657
|
+
# Remove the view name at the start and the dot from the looker_api_field_name, but only if it matches the current view name
|
|
658
|
+
prefix = f"{self.view_context.name()}."
|
|
659
|
+
if looker_api_field_name.startswith(prefix):
|
|
660
|
+
return looker_api_field_name[len(prefix) :]
|
|
661
|
+
else:
|
|
662
|
+
# Don't throw an error, just return the original field name
|
|
663
|
+
return looker_api_field_name
|
|
664
|
+
|
|
665
|
+
def get_upstream_dataset_urn(self) -> List[Urn]:
    """Return the (cached) upstream dataset URNs for this view."""
    return self._get_upstream_dataset_urn()
def get_upstream_column_ref(
    self, field_context: LookerFieldContext
) -> List[ColumnRef]:
    """
    Resolve the upstream column references for a single view field.

    Dimension-group fields are first translated to their referenceable
    variant (timeframe- or interval-qualified name) before being matched
    against the parsed column lineage.
    """
    parse_result: SqlParsingResult = self._get_spr()
    if not parse_result.column_lineage:
        return []

    field_name = field_context.name()
    try:
        # If the field is a dimension group, its lineage is recorded under
        # the derived (timeframe/interval) name — translate accordingly.
        group_kind = ViewFieldDimensionGroupType(
            field_context.raw_field.get(VIEW_FIELD_TYPE_ATTRIBUTE)
        )

        if group_kind == ViewFieldDimensionGroupType.TIME:
            field_name = self._get_time_dim_group_field_name(
                field_context.raw_field
            )
        elif group_kind == ViewFieldDimensionGroupType.DURATION:
            field_name = self._get_duration_dim_group_field_name(
                field_context.raw_field
            )

    except Exception:
        # Not a dimension group; the raw field name is already correct.
        logger.debug(
            f"view-name={self.view_context.name()}, field-name={field_name}, field-type={field_context.raw_field.get(VIEW_FIELD_TYPE_ATTRIBUTE)}"
        )

    # Lineage downstream columns are lower-cased API names; match on those.
    field_api_name = self._get_looker_api_field_name(field_name).lower()

    upstream_refs: List[ColumnRef] = [
        ColumnRef(table=upstream.table, column=upstream.column)
        for lineage in parse_result.column_lineage
        if lineage.downstream.column.lower() == field_api_name
        for upstream in lineage.upstreams
    ]

    return _drop_hive_dot_from_upstream(upstream_refs)
def create_fields(self) -> List[ViewField]:
    """Materialize ViewField objects from the parsed column lineage."""
    parse_result: SqlParsingResult = self._get_spr()

    if not parse_result.column_lineage:
        return []

    # One ViewField per downstream column in the lineage; metadata such as
    # label/description is not recoverable from SQL, so it is left blank.
    return [
        ViewField(
            name=self._get_field_name_from_looker_api_field_name(
                lineage.downstream.column
            ),
            label="",
            type=lineage.downstream.native_column_type or "unknown",
            description="",
            field_type=ViewFieldType.UNKNOWN,
            upstream_fields=_drop_hive_dot_from_upstream(lineage.upstreams),
        )
        for lineage in parse_result.column_lineage
    ]
|
|
283
738
|
class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
|
|
284
739
|
"""
|
|
285
740
|
Handle the case where upstream dataset is defined in derived_table.sql
|
|
@@ -674,7 +1129,45 @@ def create_view_upstream(
|
|
|
674
1129
|
config: LookMLSourceConfig,
|
|
675
1130
|
ctx: PipelineContext,
|
|
676
1131
|
reporter: LookMLSourceReport,
|
|
1132
|
+
looker_client: Optional["LookerAPI"] = None,
|
|
1133
|
+
view_to_explore_map: Optional[Dict[str, str]] = None,
|
|
677
1134
|
) -> AbstractViewUpstream:
|
|
1135
|
+
# Looker client is required for LookerQueryAPIBasedViewUpstream also enforced by config.use_api_for_view_lineage
|
|
1136
|
+
# view_to_explore_map is required for Looker query API args
|
|
1137
|
+
# Only process if view exists in view_to_explore_map, because we cannot query views which are not reachable from an explore
|
|
1138
|
+
if (
|
|
1139
|
+
config.use_api_for_view_lineage
|
|
1140
|
+
and looker_client
|
|
1141
|
+
and view_to_explore_map
|
|
1142
|
+
and view_context.name() in view_to_explore_map
|
|
1143
|
+
):
|
|
1144
|
+
try:
|
|
1145
|
+
return LookerQueryAPIBasedViewUpstream(
|
|
1146
|
+
view_context=view_context,
|
|
1147
|
+
config=config,
|
|
1148
|
+
reporter=reporter,
|
|
1149
|
+
ctx=ctx,
|
|
1150
|
+
looker_view_id_cache=looker_view_id_cache,
|
|
1151
|
+
looker_client=looker_client,
|
|
1152
|
+
view_to_explore_map=view_to_explore_map,
|
|
1153
|
+
)
|
|
1154
|
+
except Exception as e:
|
|
1155
|
+
# Falling back to custom regex-based parsing - best effort approach
|
|
1156
|
+
reporter.report_warning(
|
|
1157
|
+
title="Looker Query API based View Upstream Failed",
|
|
1158
|
+
message="Error in getting upstream lineage for view using Looker Query API",
|
|
1159
|
+
context=f"View-name: {view_context.name()}",
|
|
1160
|
+
exc=e,
|
|
1161
|
+
)
|
|
1162
|
+
else:
|
|
1163
|
+
logger.debug(
|
|
1164
|
+
f"Skipping Looker Query API for view: {view_context.name()} because one or more conditions are not met: "
|
|
1165
|
+
f"use_api_for_view_lineage={config.use_api_for_view_lineage}, "
|
|
1166
|
+
f"looker_client={'set' if looker_client else 'not set'}, "
|
|
1167
|
+
f"view_to_explore_map={'set' if view_to_explore_map else 'not set'}, "
|
|
1168
|
+
f"view_in_view_to_explore_map={view_context.name() in view_to_explore_map if view_to_explore_map else False}"
|
|
1169
|
+
)
|
|
1170
|
+
|
|
678
1171
|
if view_context.is_regular_case():
|
|
679
1172
|
return RegularViewUpstream(
|
|
680
1173
|
view_context=view_context,
|
|
@@ -13,7 +13,10 @@ from pydantic import Field, root_validator, validator
|
|
|
13
13
|
from requests.models import HTTPError
|
|
14
14
|
|
|
15
15
|
import datahub.emitter.mce_builder as builder
|
|
16
|
-
from datahub.configuration.source_common import
|
|
16
|
+
from datahub.configuration.source_common import (
|
|
17
|
+
DatasetLineageProviderConfigBase,
|
|
18
|
+
LowerCaseDatasetUrnConfigMixin,
|
|
19
|
+
)
|
|
17
20
|
from datahub.ingestion.api.common import PipelineContext
|
|
18
21
|
from datahub.ingestion.api.decorators import (
|
|
19
22
|
SourceCapability,
|
|
@@ -49,6 +52,7 @@ from datahub.metadata.schema_classes import (
|
|
|
49
52
|
ChartQueryTypeClass,
|
|
50
53
|
ChartTypeClass,
|
|
51
54
|
DashboardInfoClass,
|
|
55
|
+
EdgeClass,
|
|
52
56
|
OwnerClass,
|
|
53
57
|
OwnershipClass,
|
|
54
58
|
OwnershipTypeClass,
|
|
@@ -61,7 +65,11 @@ logger = logging.getLogger(__name__)
|
|
|
61
65
|
DATASOURCE_URN_RECURSION_LIMIT = 5
|
|
62
66
|
|
|
63
67
|
|
|
64
|
-
class MetabaseConfig(
|
|
68
|
+
class MetabaseConfig(
|
|
69
|
+
DatasetLineageProviderConfigBase,
|
|
70
|
+
StatefulIngestionConfigBase,
|
|
71
|
+
LowerCaseDatasetUrnConfigMixin,
|
|
72
|
+
):
|
|
65
73
|
# See the Metabase /api/session endpoint for details
|
|
66
74
|
# https://www.metabase.com/docs/latest/api-documentation.html#post-apisession
|
|
67
75
|
connect_uri: str = Field(default="localhost:3000", description="Metabase host URL.")
|
|
@@ -331,19 +339,25 @@ class MetabaseSource(StatefulIngestionSourceBase):
|
|
|
331
339
|
lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
|
|
332
340
|
)
|
|
333
341
|
|
|
334
|
-
|
|
342
|
+
# Convert chart URNs to chart edges (instead of deprecated charts field)
|
|
343
|
+
chart_edges = []
|
|
335
344
|
cards_data = dashboard_details.get("dashcards", {})
|
|
336
345
|
for card_info in cards_data:
|
|
337
346
|
card_id = card_info.get("card").get("id", "")
|
|
338
347
|
if not card_id:
|
|
339
348
|
continue # most likely a virtual card without an id (text or heading), not relevant.
|
|
340
349
|
chart_urn = builder.make_chart_urn(self.platform, str(card_id))
|
|
341
|
-
|
|
350
|
+
chart_edges.append(
|
|
351
|
+
EdgeClass(
|
|
352
|
+
destinationUrn=chart_urn,
|
|
353
|
+
lastModified=last_modified.lastModified,
|
|
354
|
+
)
|
|
355
|
+
)
|
|
342
356
|
|
|
343
357
|
dashboard_info_class = DashboardInfoClass(
|
|
344
358
|
description=description,
|
|
345
359
|
title=title,
|
|
346
|
-
|
|
360
|
+
chartEdges=chart_edges,
|
|
347
361
|
lastModified=last_modified,
|
|
348
362
|
dashboardUrl=f"{self.config.display_uri}/dashboard/{dashboard_id}",
|
|
349
363
|
customProperties={},
|
|
@@ -481,13 +495,25 @@ class MetabaseSource(StatefulIngestionSourceBase):
|
|
|
481
495
|
datasource_urn = self.get_datasource_urn(card_details)
|
|
482
496
|
custom_properties = self.construct_card_custom_properties(card_details)
|
|
483
497
|
|
|
498
|
+
input_edges = (
|
|
499
|
+
[
|
|
500
|
+
EdgeClass(
|
|
501
|
+
destinationUrn=urn,
|
|
502
|
+
lastModified=last_modified.lastModified,
|
|
503
|
+
)
|
|
504
|
+
for urn in datasource_urn
|
|
505
|
+
]
|
|
506
|
+
if datasource_urn
|
|
507
|
+
else None
|
|
508
|
+
)
|
|
509
|
+
|
|
484
510
|
chart_info = ChartInfoClass(
|
|
485
511
|
type=chart_type,
|
|
486
512
|
description=description,
|
|
487
513
|
title=title,
|
|
488
514
|
lastModified=last_modified,
|
|
489
515
|
chartUrl=f"{self.config.display_uri}/card/{card_id}",
|
|
490
|
-
|
|
516
|
+
inputEdges=input_edges,
|
|
491
517
|
customProperties=custom_properties,
|
|
492
518
|
)
|
|
493
519
|
chart_snapshot.aspects.append(chart_info)
|
|
@@ -5,11 +5,11 @@ import time
|
|
|
5
5
|
from dataclasses import dataclass, field
|
|
6
6
|
from typing import Any, Dict, Iterable, List, Optional, TypeVar, Union
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
import pydantic
|
|
9
9
|
from pydantic.fields import Field
|
|
10
10
|
|
|
11
11
|
import datahub.metadata.schema_classes as models
|
|
12
|
-
from datahub.configuration.common import ConfigModel
|
|
12
|
+
from datahub.configuration.common import ConfigModel, LaxStr
|
|
13
13
|
from datahub.configuration.config_loader import load_config_file
|
|
14
14
|
from datahub.emitter.mce_builder import (
|
|
15
15
|
datahub_guid,
|
|
@@ -66,7 +66,7 @@ class GlossaryTermConfig(ConfigModel):
|
|
|
66
66
|
contains: Optional[List[str]] = None
|
|
67
67
|
values: Optional[List[str]] = None
|
|
68
68
|
related_terms: Optional[List[str]] = None
|
|
69
|
-
custom_properties: Optional[Dict[str,
|
|
69
|
+
custom_properties: Optional[Dict[str, LaxStr]] = None
|
|
70
70
|
knowledge_links: Optional[List[KnowledgeCard]] = None
|
|
71
71
|
domain: Optional[str] = None
|
|
72
72
|
|
|
@@ -82,7 +82,7 @@ class GlossaryNodeConfig(ConfigModel):
|
|
|
82
82
|
terms: Optional[List["GlossaryTermConfig"]] = None
|
|
83
83
|
nodes: Optional[List["GlossaryNodeConfig"]] = None
|
|
84
84
|
knowledge_links: Optional[List[KnowledgeCard]] = None
|
|
85
|
-
custom_properties: Optional[Dict[str,
|
|
85
|
+
custom_properties: Optional[Dict[str, LaxStr]] = None
|
|
86
86
|
|
|
87
87
|
# Private fields.
|
|
88
88
|
_urn: str
|
|
@@ -108,12 +108,12 @@ class BusinessGlossarySourceConfig(ConfigModel):
|
|
|
108
108
|
|
|
109
109
|
|
|
110
110
|
class BusinessGlossaryConfig(DefaultConfig):
|
|
111
|
-
version:
|
|
111
|
+
version: LaxStr
|
|
112
112
|
terms: Optional[List["GlossaryTermConfig"]] = None
|
|
113
113
|
nodes: Optional[List["GlossaryNodeConfig"]] = None
|
|
114
114
|
|
|
115
|
-
@
|
|
116
|
-
def version_must_be_1(cls, v):
|
|
115
|
+
@pydantic.field_validator("version", mode="after")
|
|
116
|
+
def version_must_be_1(cls, v: str) -> str:
|
|
117
117
|
if v != "1":
|
|
118
118
|
raise ValueError("Only version 1 is supported")
|
|
119
119
|
return v
|