acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
import os
|
|
3
2
|
import re
|
|
3
|
+
from copy import deepcopy
|
|
4
4
|
from datetime import timedelta
|
|
5
|
-
from typing import
|
|
5
|
+
from typing import Dict, List, Optional, Union
|
|
6
6
|
|
|
7
|
-
from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
|
|
8
|
-
from google.cloud.logging_v2.client import Client as GCPLoggingClient
|
|
9
7
|
from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator
|
|
10
8
|
|
|
11
|
-
from datahub.configuration.common import AllowDenyPattern, ConfigModel
|
|
9
|
+
from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
|
|
10
|
+
from datahub.configuration.env_vars import get_bigquery_schema_parallelism
|
|
12
11
|
from datahub.configuration.source_common import (
|
|
13
12
|
EnvConfigMixin,
|
|
14
13
|
LowerCaseDatasetUrnConfigMixin,
|
|
@@ -18,21 +17,22 @@ from datahub.configuration.validate_field_removal import pydantic_removed_field
|
|
|
18
17
|
from datahub.ingestion.glossary.classification_mixin import (
|
|
19
18
|
ClassificationSourceConfigMixin,
|
|
20
19
|
)
|
|
21
|
-
from datahub.ingestion.source.
|
|
20
|
+
from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
|
|
21
|
+
BigQueryConnectionConfig,
|
|
22
|
+
)
|
|
22
23
|
from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
|
|
23
24
|
from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterConfig
|
|
24
25
|
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
25
26
|
StatefulLineageConfigMixin,
|
|
26
27
|
StatefulProfilingConfigMixin,
|
|
28
|
+
StatefulTimeWindowConfigMixin,
|
|
27
29
|
StatefulUsageConfigMixin,
|
|
28
30
|
)
|
|
29
31
|
from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
|
|
30
32
|
|
|
31
33
|
logger = logging.getLogger(__name__)
|
|
32
34
|
|
|
33
|
-
DEFAULT_BQ_SCHEMA_PARALLELISM =
|
|
34
|
-
os.getenv("DATAHUB_BIGQUERY_SCHEMA_PARALLELISM", 20)
|
|
35
|
-
)
|
|
35
|
+
DEFAULT_BQ_SCHEMA_PARALLELISM = get_bigquery_schema_parallelism()
|
|
36
36
|
|
|
37
37
|
# Regexp for sharded tables.
|
|
38
38
|
# A sharded table is a table that has a suffix of the form _yyyymmdd or yyyymmdd, where yyyymmdd is a date.
|
|
@@ -73,8 +73,10 @@ class BigQueryBaseConfig(ConfigModel):
|
|
|
73
73
|
) from e
|
|
74
74
|
return v
|
|
75
75
|
|
|
76
|
-
@root_validator(pre=True
|
|
76
|
+
@root_validator(pre=True)
|
|
77
77
|
def project_id_backward_compatibility_configs_set(cls, values: Dict) -> Dict:
|
|
78
|
+
# Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
|
|
79
|
+
values = deepcopy(values)
|
|
78
80
|
project_id = values.pop("project_id", None)
|
|
79
81
|
project_ids = values.get("project_ids")
|
|
80
82
|
|
|
@@ -105,64 +107,6 @@ class BigQueryUsageConfig(BaseUsageConfig):
|
|
|
105
107
|
)
|
|
106
108
|
|
|
107
109
|
|
|
108
|
-
class BigQueryConnectionConfig(ConfigModel):
|
|
109
|
-
credential: Optional[GCPCredential] = Field(
|
|
110
|
-
default=None, description="BigQuery credential informations"
|
|
111
|
-
)
|
|
112
|
-
|
|
113
|
-
_credentials_path: Optional[str] = PrivateAttr(None)
|
|
114
|
-
|
|
115
|
-
extra_client_options: Dict[str, Any] = Field(
|
|
116
|
-
default={},
|
|
117
|
-
description="Additional options to pass to google.cloud.logging_v2.client.Client.",
|
|
118
|
-
)
|
|
119
|
-
|
|
120
|
-
project_on_behalf: Optional[str] = Field(
|
|
121
|
-
default=None,
|
|
122
|
-
description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.",
|
|
123
|
-
)
|
|
124
|
-
|
|
125
|
-
def __init__(self, **data: Any):
|
|
126
|
-
super().__init__(**data)
|
|
127
|
-
|
|
128
|
-
if self.credential:
|
|
129
|
-
self._credentials_path = self.credential.create_credential_temp_file()
|
|
130
|
-
logger.debug(
|
|
131
|
-
f"Creating temporary credential file at {self._credentials_path}"
|
|
132
|
-
)
|
|
133
|
-
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path
|
|
134
|
-
|
|
135
|
-
def get_bigquery_client(self) -> bigquery.Client:
|
|
136
|
-
client_options = self.extra_client_options
|
|
137
|
-
return bigquery.Client(self.project_on_behalf, **client_options)
|
|
138
|
-
|
|
139
|
-
def get_projects_client(self) -> resourcemanager_v3.ProjectsClient:
|
|
140
|
-
return resourcemanager_v3.ProjectsClient()
|
|
141
|
-
|
|
142
|
-
def get_policy_tag_manager_client(self) -> datacatalog_v1.PolicyTagManagerClient:
|
|
143
|
-
return datacatalog_v1.PolicyTagManagerClient()
|
|
144
|
-
|
|
145
|
-
def make_gcp_logging_client(
|
|
146
|
-
self, project_id: Optional[str] = None
|
|
147
|
-
) -> GCPLoggingClient:
|
|
148
|
-
# See https://github.com/googleapis/google-cloud-python/issues/2674 for
|
|
149
|
-
# why we disable gRPC here.
|
|
150
|
-
client_options = self.extra_client_options.copy()
|
|
151
|
-
client_options["_use_grpc"] = False
|
|
152
|
-
if project_id is not None:
|
|
153
|
-
return GCPLoggingClient(**client_options, project=project_id)
|
|
154
|
-
else:
|
|
155
|
-
return GCPLoggingClient(**client_options)
|
|
156
|
-
|
|
157
|
-
def get_sql_alchemy_url(self) -> str:
|
|
158
|
-
if self.project_on_behalf:
|
|
159
|
-
return f"bigquery://{self.project_on_behalf}"
|
|
160
|
-
# When project_id is not set, we will attempt to detect the project ID
|
|
161
|
-
# based on the credentials or environment variables.
|
|
162
|
-
# See https://github.com/mxmzdlv/pybigquery#authentication.
|
|
163
|
-
return "bigquery://"
|
|
164
|
-
|
|
165
|
-
|
|
166
110
|
class GcsLineageProviderConfig(ConfigModel):
|
|
167
111
|
"""
|
|
168
112
|
Any source that produces gcs lineage from/to Datasets should inherit this class.
|
|
@@ -240,13 +184,14 @@ class BigQueryFilterConfig(SQLFilterConfig):
|
|
|
240
184
|
)
|
|
241
185
|
|
|
242
186
|
# NOTE: `schema_pattern` is added here only to hide it from docs.
|
|
243
|
-
schema_pattern: AllowDenyPattern = Field(
|
|
187
|
+
schema_pattern: HiddenFromDocs[AllowDenyPattern] = Field(
|
|
244
188
|
default=AllowDenyPattern.allow_all(),
|
|
245
|
-
hidden_from_docs=True,
|
|
246
189
|
)
|
|
247
190
|
|
|
248
191
|
@root_validator(pre=False, skip_on_failure=True)
|
|
249
192
|
def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
|
|
193
|
+
# Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
|
|
194
|
+
values = deepcopy(values)
|
|
250
195
|
dataset_pattern: Optional[AllowDenyPattern] = values.get("dataset_pattern")
|
|
251
196
|
schema_pattern = values.get("schema_pattern")
|
|
252
197
|
if (
|
|
@@ -327,6 +272,7 @@ class BigQueryV2Config(
|
|
|
327
272
|
SQLCommonConfig,
|
|
328
273
|
StatefulUsageConfigMixin,
|
|
329
274
|
StatefulLineageConfigMixin,
|
|
275
|
+
StatefulTimeWindowConfigMixin,
|
|
330
276
|
StatefulProfilingConfigMixin,
|
|
331
277
|
ClassificationSourceConfigMixin,
|
|
332
278
|
):
|
|
@@ -378,8 +324,7 @@ class BigQueryV2Config(
|
|
|
378
324
|
description="Include full payload into events. It is only for debugging and internal use.",
|
|
379
325
|
)
|
|
380
326
|
|
|
381
|
-
number_of_datasets_process_in_batch: int = Field(
|
|
382
|
-
hidden_from_docs=True,
|
|
327
|
+
number_of_datasets_process_in_batch: HiddenFromDocs[int] = Field(
|
|
383
328
|
default=10000,
|
|
384
329
|
description="Number of table queried in batch when getting metadata. This is a low level config property "
|
|
385
330
|
"which should be touched with care.",
|
|
@@ -400,7 +345,7 @@ class BigQueryV2Config(
|
|
|
400
345
|
)
|
|
401
346
|
|
|
402
347
|
use_queries_v2: bool = Field(
|
|
403
|
-
default=
|
|
348
|
+
default=True,
|
|
404
349
|
description="If enabled, uses the new queries extractor to extract queries from bigquery.",
|
|
405
350
|
)
|
|
406
351
|
include_queries: bool = Field(
|
|
@@ -494,17 +439,15 @@ class BigQueryV2Config(
|
|
|
494
439
|
|
|
495
440
|
upstream_lineage_in_report: bool = Field(
|
|
496
441
|
default=False,
|
|
497
|
-
description="Useful for debugging lineage information. Set to True to see the raw lineage created internally.",
|
|
442
|
+
description="Useful for debugging lineage information. Set to True to see the raw lineage created internally. Only works with legacy approach (`use_queries_v2: False`).",
|
|
498
443
|
)
|
|
499
444
|
|
|
500
|
-
run_optimized_column_query: bool = Field(
|
|
501
|
-
hidden_from_docs=True,
|
|
445
|
+
run_optimized_column_query: HiddenFromDocs[bool] = Field(
|
|
502
446
|
default=False,
|
|
503
447
|
description="Run optimized column query to get column information. This is an experimental feature and may not work for all cases.",
|
|
504
448
|
)
|
|
505
449
|
|
|
506
|
-
file_backed_cache_size: int = Field(
|
|
507
|
-
hidden_from_docs=True,
|
|
450
|
+
file_backed_cache_size: HiddenFromDocs[int] = Field(
|
|
508
451
|
default=2000,
|
|
509
452
|
description="Maximum number of entries for the in-memory caches of FileBacked data structures.",
|
|
510
453
|
)
|
|
@@ -514,10 +457,9 @@ class BigQueryV2Config(
|
|
|
514
457
|
description="Option to exclude empty projects from being ingested.",
|
|
515
458
|
)
|
|
516
459
|
|
|
517
|
-
schema_resolution_batch_size: int = Field(
|
|
460
|
+
schema_resolution_batch_size: HiddenFromDocs[int] = Field(
|
|
518
461
|
default=100,
|
|
519
462
|
description="The number of tables to process in a batch when resolving schema from DataHub.",
|
|
520
|
-
hidden_from_schema=True,
|
|
521
463
|
)
|
|
522
464
|
|
|
523
465
|
max_threads_dataset_parallelism: int = Field(
|
|
@@ -538,6 +480,8 @@ class BigQueryV2Config(
|
|
|
538
480
|
|
|
539
481
|
@root_validator(pre=True)
|
|
540
482
|
def set_include_schema_metadata(cls, values: Dict) -> Dict:
|
|
483
|
+
# Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
|
|
484
|
+
values = deepcopy(values)
|
|
541
485
|
# Historically this is used to disable schema ingestion
|
|
542
486
|
if (
|
|
543
487
|
"include_tables" in values
|
|
@@ -556,6 +500,8 @@ class BigQueryV2Config(
|
|
|
556
500
|
|
|
557
501
|
@root_validator(skip_on_failure=True)
|
|
558
502
|
def profile_default_settings(cls, values: Dict) -> Dict:
|
|
503
|
+
# Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
|
|
504
|
+
values = deepcopy(values)
|
|
559
505
|
# Extra default SQLAlchemy option for better connection pooling and threading.
|
|
560
506
|
# https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
|
|
561
507
|
values["options"].setdefault("max_overflow", -1)
|
|
@@ -573,9 +519,33 @@ class BigQueryV2Config(
|
|
|
573
519
|
|
|
574
520
|
return v
|
|
575
521
|
|
|
522
|
+
@validator("upstream_lineage_in_report")
|
|
523
|
+
def validate_upstream_lineage_in_report(cls, v: bool, values: Dict) -> bool:
|
|
524
|
+
if v and values.get("use_queries_v2", True):
|
|
525
|
+
logging.warning(
|
|
526
|
+
"`upstream_lineage_in_report` is enabled but will be ignored because `use_queries_v2` is enabled."
|
|
527
|
+
"This debugging feature only works with the legacy lineage approach (`use_queries_v2: false`)."
|
|
528
|
+
)
|
|
529
|
+
|
|
530
|
+
return v
|
|
531
|
+
|
|
532
|
+
@root_validator(pre=False, skip_on_failure=True)
|
|
533
|
+
def validate_queries_v2_stateful_ingestion(cls, values: Dict) -> Dict:
|
|
534
|
+
if values.get("use_queries_v2"):
|
|
535
|
+
if values.get("enable_stateful_lineage_ingestion") or values.get(
|
|
536
|
+
"enable_stateful_usage_ingestion"
|
|
537
|
+
):
|
|
538
|
+
logger.warning(
|
|
539
|
+
"enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion are deprecated "
|
|
540
|
+
"when using use_queries_v2=True. These configs only work with the legacy (non-queries v2) extraction path. "
|
|
541
|
+
"For queries v2, use enable_stateful_time_window instead to enable stateful ingestion "
|
|
542
|
+
"for the unified time window extraction (lineage + usage + operations + queries)."
|
|
543
|
+
)
|
|
544
|
+
return values
|
|
545
|
+
|
|
576
546
|
def get_table_pattern(self, pattern: List[str]) -> str:
|
|
577
547
|
return "|".join(pattern) if pattern else ""
|
|
578
548
|
|
|
579
|
-
|
|
549
|
+
_platform_instance_not_supported_for_bigquery = pydantic_removed_field(
|
|
580
550
|
"platform_instance"
|
|
581
551
|
)
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from typing import Any, Dict, Optional
|
|
4
|
+
|
|
5
|
+
from google.api_core.client_info import ClientInfo
|
|
6
|
+
from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
|
|
7
|
+
from google.cloud.logging_v2.client import Client as GCPLoggingClient
|
|
8
|
+
from pydantic import Field, PrivateAttr
|
|
9
|
+
|
|
10
|
+
from datahub._version import __version__
|
|
11
|
+
from datahub.configuration.common import ConfigModel
|
|
12
|
+
from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _get_bigquery_client_info() -> ClientInfo:
|
|
18
|
+
"""Get ClientInfo with DataHub user-agent for BigQuery client identification"""
|
|
19
|
+
return ClientInfo(user_agent=f"datahub/{__version__}")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class BigQueryConnectionConfig(ConfigModel):
|
|
23
|
+
credential: Optional[GCPCredential] = Field(
|
|
24
|
+
default=None, description="BigQuery credential informations"
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
_credentials_path: Optional[str] = PrivateAttr(None)
|
|
28
|
+
|
|
29
|
+
extra_client_options: Dict[str, Any] = Field(
|
|
30
|
+
default={},
|
|
31
|
+
description="Additional options to pass to google.cloud.logging_v2.client.Client.",
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
project_on_behalf: Optional[str] = Field(
|
|
35
|
+
default=None,
|
|
36
|
+
description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.",
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
def __init__(self, **data: Any):
|
|
40
|
+
super().__init__(**data)
|
|
41
|
+
|
|
42
|
+
if self.credential:
|
|
43
|
+
self._credentials_path = self.credential.create_credential_temp_file()
|
|
44
|
+
logger.debug(
|
|
45
|
+
f"Creating temporary credential file at {self._credentials_path}"
|
|
46
|
+
)
|
|
47
|
+
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path
|
|
48
|
+
|
|
49
|
+
def get_bigquery_client(self) -> bigquery.Client:
|
|
50
|
+
client_options = self.extra_client_options
|
|
51
|
+
return bigquery.Client(
|
|
52
|
+
self.project_on_behalf,
|
|
53
|
+
client_info=_get_bigquery_client_info(),
|
|
54
|
+
**client_options,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
def get_projects_client(self) -> resourcemanager_v3.ProjectsClient:
|
|
58
|
+
return resourcemanager_v3.ProjectsClient()
|
|
59
|
+
|
|
60
|
+
def get_policy_tag_manager_client(self) -> datacatalog_v1.PolicyTagManagerClient:
|
|
61
|
+
return datacatalog_v1.PolicyTagManagerClient()
|
|
62
|
+
|
|
63
|
+
def make_gcp_logging_client(
|
|
64
|
+
self, project_id: Optional[str] = None
|
|
65
|
+
) -> GCPLoggingClient:
|
|
66
|
+
# See https://github.com/googleapis/google-cloud-python/issues/2674 for
|
|
67
|
+
# why we disable gRPC here.
|
|
68
|
+
client_options = self.extra_client_options.copy()
|
|
69
|
+
client_options["_use_grpc"] = False
|
|
70
|
+
if project_id is not None:
|
|
71
|
+
return GCPLoggingClient(**client_options, project=project_id)
|
|
72
|
+
else:
|
|
73
|
+
return GCPLoggingClient(**client_options)
|
|
74
|
+
|
|
75
|
+
def get_sql_alchemy_url(self) -> str:
|
|
76
|
+
if self.project_on_behalf:
|
|
77
|
+
return f"bigquery://{self.project_on_behalf}"
|
|
78
|
+
# When project_id is not set, we will attempt to detect the project ID
|
|
79
|
+
# based on the credentials or environment variables.
|
|
80
|
+
# See https://github.com/mxmzdlv/pybigquery#authentication.
|
|
81
|
+
return "bigquery://"
|
|
@@ -7,13 +7,16 @@ from typing_extensions import Self
|
|
|
7
7
|
|
|
8
8
|
from datahub.configuration.time_window_config import BaseTimeWindowConfig
|
|
9
9
|
from datahub.ingestion.api.common import PipelineContext
|
|
10
|
+
from datahub.ingestion.api.decorators import SupportStatus, support_status
|
|
10
11
|
from datahub.ingestion.api.source import Source, SourceReport
|
|
11
12
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
12
13
|
from datahub.ingestion.source.bigquery_v2.bigquery_config import (
|
|
13
|
-
BigQueryConnectionConfig,
|
|
14
14
|
BigQueryFilterConfig,
|
|
15
15
|
BigQueryIdentifierConfig,
|
|
16
16
|
)
|
|
17
|
+
from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
|
|
18
|
+
BigQueryConnectionConfig,
|
|
19
|
+
)
|
|
17
20
|
from datahub.ingestion.source.bigquery_v2.bigquery_report import (
|
|
18
21
|
BigQueryQueriesExtractorReport,
|
|
19
22
|
BigQuerySchemaApiPerfReport,
|
|
@@ -48,6 +51,7 @@ class BigQueryQueriesSourceConfig(
|
|
|
48
51
|
)
|
|
49
52
|
|
|
50
53
|
|
|
54
|
+
@support_status(SupportStatus.CERTIFIED)
|
|
51
55
|
class BigQueryQueriesSource(Source):
|
|
52
56
|
def __init__(self, ctx: PipelineContext, config: BigQueryQueriesSourceConfig):
|
|
53
57
|
self.ctx = ctx
|
|
@@ -92,3 +96,4 @@ class BigQueryQueriesSource(Source):
|
|
|
92
96
|
def close(self) -> None:
|
|
93
97
|
self.queries_extractor.close()
|
|
94
98
|
self.connection.close()
|
|
99
|
+
super().close()
|
|
@@ -9,7 +9,6 @@ import pydantic
|
|
|
9
9
|
from datahub.ingestion.api.report import Report
|
|
10
10
|
from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin
|
|
11
11
|
from datahub.ingestion.source.sql.sql_report import SQLSourceReport
|
|
12
|
-
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
|
|
13
12
|
from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
|
|
14
13
|
from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
|
|
15
14
|
from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
|
|
@@ -78,7 +77,6 @@ class BigQueryQueriesExtractorReport(Report):
|
|
|
78
77
|
@dataclass
|
|
79
78
|
class BigQueryV2Report(
|
|
80
79
|
SQLSourceReport,
|
|
81
|
-
IngestionStageReport,
|
|
82
80
|
BaseTimeWindowReport,
|
|
83
81
|
ClassificationReportMixin,
|
|
84
82
|
):
|
|
@@ -283,23 +283,30 @@ class BigQuerySchemaApi:
|
|
|
283
283
|
with self.report.list_datasets_timer:
|
|
284
284
|
self.report.num_list_datasets_api_requests += 1
|
|
285
285
|
datasets = self.bq_client.list_datasets(project_id, max_results=maxResults)
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
)
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
286
|
+
result = []
|
|
287
|
+
for d in datasets:
|
|
288
|
+
# TODO: Fetch dataset description individually impacts overall performance if the number of datasets is high (hundreds); instead we should fetch in batch for all datasets.
|
|
289
|
+
# https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_get_dataset
|
|
290
|
+
# https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dataset.Dataset
|
|
291
|
+
dataset = self.bq_client.get_dataset(d.reference)
|
|
292
|
+
|
|
293
|
+
location = (
|
|
294
|
+
d._properties.get("location")
|
|
295
|
+
if hasattr(d, "_properties") and isinstance(d._properties, dict)
|
|
296
|
+
else None
|
|
297
|
+
)
|
|
298
|
+
|
|
299
|
+
result.append(
|
|
300
|
+
BigqueryDataset(
|
|
301
|
+
name=d.dataset_id,
|
|
302
|
+
labels=d.labels,
|
|
303
|
+
location=location,
|
|
304
|
+
comment=dataset.description,
|
|
305
|
+
created=dataset.created,
|
|
306
|
+
last_altered=dataset.modified,
|
|
307
|
+
)
|
|
300
308
|
)
|
|
301
|
-
|
|
302
|
-
]
|
|
309
|
+
return result
|
|
303
310
|
|
|
304
311
|
# This is not used anywhere
|
|
305
312
|
def get_datasets_for_project_id_with_information_schema(
|
|
@@ -12,6 +12,7 @@ from datahub.emitter.mce_builder import (
|
|
|
12
12
|
make_dataset_urn_with_platform_instance,
|
|
13
13
|
make_schema_field_urn,
|
|
14
14
|
make_tag_urn,
|
|
15
|
+
make_ts_millis,
|
|
15
16
|
)
|
|
16
17
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
17
18
|
from datahub.emitter.mcp_builder import BigQueryDatasetKey, ContainerKey, ProjectIdKey
|
|
@@ -65,7 +66,7 @@ from datahub.ingestion.source.sql.sql_utils import (
|
|
|
65
66
|
)
|
|
66
67
|
from datahub.ingestion.source_report.ingestion_stage import (
|
|
67
68
|
METADATA_EXTRACTION,
|
|
68
|
-
|
|
69
|
+
IngestionHighStage,
|
|
69
70
|
)
|
|
70
71
|
from datahub.metadata.com.linkedin.pegasus2avro.common import (
|
|
71
72
|
Status,
|
|
@@ -286,6 +287,7 @@ class BigQuerySchemaGenerator:
|
|
|
286
287
|
yield from gen_database_container(
|
|
287
288
|
database=database,
|
|
288
289
|
name=database,
|
|
290
|
+
qualified_name=database,
|
|
289
291
|
sub_types=[DatasetContainerSubTypes.BIGQUERY_PROJECT],
|
|
290
292
|
domain_registry=self.domain_registry,
|
|
291
293
|
domain_config=self.config.domain,
|
|
@@ -299,6 +301,8 @@ class BigQuerySchemaGenerator:
|
|
|
299
301
|
description: Optional[str] = None,
|
|
300
302
|
tags: Optional[Dict[str, str]] = None,
|
|
301
303
|
extra_properties: Optional[Dict[str, str]] = None,
|
|
304
|
+
created: Optional[int] = None,
|
|
305
|
+
last_modified: Optional[int] = None,
|
|
302
306
|
) -> Iterable[MetadataWorkUnit]:
|
|
303
307
|
schema_container_key = self.gen_dataset_key(project_id, dataset)
|
|
304
308
|
|
|
@@ -332,6 +336,7 @@ class BigQuerySchemaGenerator:
|
|
|
332
336
|
yield from gen_schema_container(
|
|
333
337
|
database=project_id,
|
|
334
338
|
schema=dataset,
|
|
339
|
+
qualified_name=f"{project_id}.{dataset}",
|
|
335
340
|
sub_types=[DatasetContainerSubTypes.BIGQUERY_DATASET],
|
|
336
341
|
domain_registry=self.domain_registry,
|
|
337
342
|
domain_config=self.config.domain,
|
|
@@ -347,6 +352,8 @@ class BigQuerySchemaGenerator:
|
|
|
347
352
|
),
|
|
348
353
|
tags=tags_joined,
|
|
349
354
|
extra_properties=extra_properties,
|
|
355
|
+
created=created,
|
|
356
|
+
last_modified=last_modified,
|
|
350
357
|
)
|
|
351
358
|
|
|
352
359
|
def _process_project(
|
|
@@ -409,7 +416,7 @@ class BigQuerySchemaGenerator:
|
|
|
409
416
|
|
|
410
417
|
if self.config.is_profiling_enabled():
|
|
411
418
|
logger.info(f"Starting profiling project {project_id}")
|
|
412
|
-
with self.report.
|
|
419
|
+
with self.report.new_high_stage(IngestionHighStage.PROFILING):
|
|
413
420
|
yield from self.profiler.get_workunits(
|
|
414
421
|
project_id=project_id,
|
|
415
422
|
tables=db_tables,
|
|
@@ -442,10 +449,12 @@ class BigQuerySchemaGenerator:
|
|
|
442
449
|
):
|
|
443
450
|
yield wu
|
|
444
451
|
except Exception as e:
|
|
445
|
-
|
|
446
|
-
|
|
452
|
+
# If configuration indicates we need table data access (for profiling or use_tables_list_query_v2),
|
|
453
|
+
# include bigquery.tables.getData in the error message since that's likely the missing permission
|
|
454
|
+
if self.config.have_table_data_read_permission:
|
|
455
|
+
action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list, bigquery.tables.getData permissions?"
|
|
447
456
|
else:
|
|
448
|
-
action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list
|
|
457
|
+
action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permissions?"
|
|
449
458
|
|
|
450
459
|
self.report.failure(
|
|
451
460
|
title="Unable to get tables for dataset",
|
|
@@ -482,6 +491,12 @@ class BigQuerySchemaGenerator:
|
|
|
482
491
|
else None
|
|
483
492
|
),
|
|
484
493
|
description=bigquery_dataset.comment,
|
|
494
|
+
created=make_ts_millis(bigquery_dataset.created)
|
|
495
|
+
if bigquery_dataset.created
|
|
496
|
+
else None,
|
|
497
|
+
last_modified=make_ts_millis(bigquery_dataset.last_altered)
|
|
498
|
+
if bigquery_dataset.last_altered
|
|
499
|
+
else None,
|
|
485
500
|
)
|
|
486
501
|
|
|
487
502
|
columns = None
|
|
@@ -63,7 +63,7 @@ class BigQueryIdentifierBuilder:
|
|
|
63
63
|
)
|
|
64
64
|
|
|
65
65
|
def gen_user_urn(self, user_email: str) -> str:
|
|
66
|
-
return make_user_urn(user_email
|
|
66
|
+
return make_user_urn(user_email)
|
|
67
67
|
|
|
68
68
|
def make_data_platform_urn(self) -> str:
|
|
69
69
|
return make_data_platform_urn(self.platform)
|
|
@@ -375,7 +375,7 @@ class BigqueryLineageExtractor:
|
|
|
375
375
|
memory_footprint.total_size(lineage)
|
|
376
376
|
)
|
|
377
377
|
|
|
378
|
-
for lineage_key in lineage
|
|
378
|
+
for lineage_key in lineage:
|
|
379
379
|
# For views, we do not use the upstreams obtained by parsing audit logs
|
|
380
380
|
# as they may contain indirectly referenced tables.
|
|
381
381
|
if (
|
|
@@ -189,6 +189,7 @@ WHERE
|
|
|
189
189
|
|
|
190
190
|
if len(profile_requests) == 0:
|
|
191
191
|
return
|
|
192
|
+
|
|
192
193
|
yield from self.generate_profile_workunits(
|
|
193
194
|
profile_requests,
|
|
194
195
|
max_workers=self.config.profiling.max_workers,
|
|
@@ -226,10 +227,11 @@ WHERE
|
|
|
226
227
|
db_name, schema_name, bq_table, self.config.profiling.partition_datetime
|
|
227
228
|
)
|
|
228
229
|
|
|
229
|
-
if partition
|
|
230
|
+
# For partitioned tables, if it has a row count but not a valid partition, that means something went wrong with the partition detection.
|
|
231
|
+
if partition is None and bq_table.partition_info and bq_table.rows_count:
|
|
230
232
|
self.report.report_warning(
|
|
231
233
|
title="Profile skipped for partitioned table",
|
|
232
|
-
message="profile skipped as
|
|
234
|
+
message="profile skipped as partition id or type was invalid",
|
|
233
235
|
context=profile_request.pretty_name,
|
|
234
236
|
)
|
|
235
237
|
return None
|
|
@@ -45,12 +45,12 @@ SELECT
|
|
|
45
45
|
tos.OPTION_VALUE as comment,
|
|
46
46
|
t.is_insertable_into,
|
|
47
47
|
t.ddl,
|
|
48
|
-
ts.row_count,
|
|
48
|
+
ts.row_count as row_count,
|
|
49
49
|
ts.size_bytes as bytes,
|
|
50
50
|
p.num_partitions,
|
|
51
51
|
p.max_partition_id,
|
|
52
|
-
p.active_billable_bytes,
|
|
53
|
-
p.long_term_billable_bytes,
|
|
52
|
+
p.active_billable_bytes as active_billable_bytes,
|
|
53
|
+
IFNULL(p.long_term_billable_bytes, 0) as long_term_billable_bytes,
|
|
54
54
|
REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix,
|
|
55
55
|
REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base
|
|
56
56
|
|