acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their public registries. It is provided for informational purposes only.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/bigquery_v2/bigquery_config.py

```diff
@@ -1,12 +1,13 @@
 import logging
-import os
 import re
+from copy import deepcopy
 from datetime import timedelta
 from typing import Dict, List, Optional, Union
 
 from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator
 
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
+from datahub.configuration.env_vars import get_bigquery_schema_parallelism
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     LowerCaseDatasetUrnConfigMixin,
@@ -24,15 +25,14 @@ from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterConfig
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulLineageConfigMixin,
     StatefulProfilingConfigMixin,
+    StatefulTimeWindowConfigMixin,
     StatefulUsageConfigMixin,
 )
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
 
 logger = logging.getLogger(__name__)
 
-DEFAULT_BQ_SCHEMA_PARALLELISM = int(
-    os.getenv("DATAHUB_BIGQUERY_SCHEMA_PARALLELISM", 20)
-)
+DEFAULT_BQ_SCHEMA_PARALLELISM = get_bigquery_schema_parallelism()
 
 # Regexp for sharded tables.
 # A sharded table is a table that has a suffix of the form _yyyymmdd or yyyymmdd, where yyyymmdd is a date.
@@ -73,8 +73,10 @@ class BigQueryBaseConfig(ConfigModel):
             ) from e
         return v
 
-    @root_validator(pre=True
+    @root_validator(pre=True)
     def project_id_backward_compatibility_configs_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         project_id = values.pop("project_id", None)
         project_ids = values.get("project_ids")
 
@@ -182,13 +184,14 @@ class BigQueryFilterConfig(SQLFilterConfig):
     )
 
     # NOTE: `schema_pattern` is added here only to hide it from docs.
-    schema_pattern: AllowDenyPattern = Field(
+    schema_pattern: HiddenFromDocs[AllowDenyPattern] = Field(
         default=AllowDenyPattern.allow_all(),
-        hidden_from_docs=True,
     )
 
     @root_validator(pre=False, skip_on_failure=True)
     def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         dataset_pattern: Optional[AllowDenyPattern] = values.get("dataset_pattern")
         schema_pattern = values.get("schema_pattern")
         if (
@@ -269,6 +272,7 @@ class BigQueryV2Config(
     SQLCommonConfig,
     StatefulUsageConfigMixin,
     StatefulLineageConfigMixin,
+    StatefulTimeWindowConfigMixin,
     StatefulProfilingConfigMixin,
     ClassificationSourceConfigMixin,
 ):
@@ -320,8 +324,7 @@ class BigQueryV2Config(
         description="Include full payload into events. It is only for debugging and internal use.",
     )
 
-    number_of_datasets_process_in_batch: int = Field(
-        hidden_from_docs=True,
+    number_of_datasets_process_in_batch: HiddenFromDocs[int] = Field(
         default=10000,
         description="Number of table queried in batch when getting metadata. This is a low level config property "
         "which should be touched with care.",
@@ -342,7 +345,7 @@ class BigQueryV2Config(
     )
 
     use_queries_v2: bool = Field(
-        default=False,
+        default=True,
         description="If enabled, uses the new queries extractor to extract queries from bigquery.",
     )
     include_queries: bool = Field(
@@ -436,17 +439,15 @@ class BigQueryV2Config(
 
     upstream_lineage_in_report: bool = Field(
         default=False,
-        description="Useful for debugging lineage information. Set to True to see the raw lineage created internally.",
+        description="Useful for debugging lineage information. Set to True to see the raw lineage created internally. Only works with legacy approach (`use_queries_v2: False`).",
     )
 
-    run_optimized_column_query: bool = Field(
-        hidden_from_docs=True,
+    run_optimized_column_query: HiddenFromDocs[bool] = Field(
         default=False,
         description="Run optimized column query to get column information. This is an experimental feature and may not work for all cases.",
     )
 
-    file_backed_cache_size: int = Field(
-        hidden_from_docs=True,
+    file_backed_cache_size: HiddenFromDocs[int] = Field(
         default=2000,
         description="Maximum number of entries for the in-memory caches of FileBacked data structures.",
     )
@@ -456,10 +457,9 @@ class BigQueryV2Config(
         description="Option to exclude empty projects from being ingested.",
     )
 
-    schema_resolution_batch_size: int = Field(
+    schema_resolution_batch_size: HiddenFromDocs[int] = Field(
         default=100,
         description="The number of tables to process in a batch when resolving schema from DataHub.",
-        hidden_from_schema=True,
     )
 
     max_threads_dataset_parallelism: int = Field(
@@ -480,6 +480,8 @@ class BigQueryV2Config(
 
     @root_validator(pre=True)
     def set_include_schema_metadata(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         # Historically this is used to disable schema ingestion
         if (
             "include_tables" in values
@@ -498,6 +500,8 @@ class BigQueryV2Config(
 
     @root_validator(skip_on_failure=True)
     def profile_default_settings(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         # Extra default SQLAlchemy option for better connection pooling and threading.
         # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
         values["options"].setdefault("max_overflow", -1)
@@ -515,9 +519,33 @@ class BigQueryV2Config(
 
         return v
 
+    @validator("upstream_lineage_in_report")
+    def validate_upstream_lineage_in_report(cls, v: bool, values: Dict) -> bool:
+        if v and values.get("use_queries_v2", True):
+            logging.warning(
+                "`upstream_lineage_in_report` is enabled but will be ignored because `use_queries_v2` is enabled."
+                "This debugging feature only works with the legacy lineage approach (`use_queries_v2: false`)."
+            )
+
+        return v
+
+    @root_validator(pre=False, skip_on_failure=True)
+    def validate_queries_v2_stateful_ingestion(cls, values: Dict) -> Dict:
+        if values.get("use_queries_v2"):
+            if values.get("enable_stateful_lineage_ingestion") or values.get(
+                "enable_stateful_usage_ingestion"
+            ):
+                logger.warning(
+                    "enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion are deprecated "
+                    "when using use_queries_v2=True. These configs only work with the legacy (non-queries v2) extraction path. "
+                    "For queries v2, use enable_stateful_time_window instead to enable stateful ingestion "
+                    "for the unified time window extraction (lineage + usage + operations + queries)."
+                )
+        return values
+
     def get_table_pattern(self, pattern: List[str]) -> str:
         return "|".join(pattern) if pattern else ""
 
-
+    _platform_instance_not_supported_for_bigquery = pydantic_removed_field(
         "platform_instance"
     )
```
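Several hunks above add `values = deepcopy(values)` at the top of pydantic root validators. A minimal sketch of why, assuming pydantic v1 semantics (matching the `root_validator` import in the diff; the class and field names here are illustrative, not DataHub's): in-place edits to nested objects inside `values` are visible to the dict the caller passed in, so re-using that dict later, for example across tests, picks up the mutation.

```python
from copy import deepcopy
from typing import Dict

from pydantic import BaseModel, root_validator


class ExampleConfig(BaseModel):
    options: Dict[str, int] = {}

    @root_validator(pre=True)
    def apply_defaults(cls, values: Dict) -> Dict:
        # Copy first: nested objects in `values` are shared with the dict
        # the caller passed in, so in-place edits like setdefault() would
        # leak back out -- the "state contamination in tests" the diff's
        # comments mention.
        values = deepcopy(values)
        values.setdefault("options", {}).setdefault("max_overflow", -1)
        return values


raw = {"options": {}}
ExampleConfig(**raw)
assert raw == {"options": {}}  # caller's nested dict is left untouched
```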
datahub/ingestion/source/bigquery_v2/bigquery_connection.py

```diff
@@ -2,16 +2,23 @@ import logging
 import os
 from typing import Any, Dict, Optional
 
+from google.api_core.client_info import ClientInfo
 from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
 from google.cloud.logging_v2.client import Client as GCPLoggingClient
 from pydantic import Field, PrivateAttr
 
+from datahub._version import __version__
 from datahub.configuration.common import ConfigModel
 from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
 
 logger = logging.getLogger(__name__)
 
 
+def _get_bigquery_client_info() -> ClientInfo:
+    """Get ClientInfo with DataHub user-agent for BigQuery client identification"""
+    return ClientInfo(user_agent=f"datahub/{__version__}")
+
+
 class BigQueryConnectionConfig(ConfigModel):
     credential: Optional[GCPCredential] = Field(
         default=None, description="BigQuery credential informations"
@@ -41,7 +48,11 @@ class BigQueryConnectionConfig(ConfigModel):
 
     def get_bigquery_client(self) -> bigquery.Client:
         client_options = self.extra_client_options
-        return bigquery.Client(
+        return bigquery.Client(
+            self.project_on_behalf,
+            client_info=_get_bigquery_client_info(),
+            **client_options,
+        )
 
     def get_projects_client(self) -> resourcemanager_v3.ProjectsClient:
         return resourcemanager_v3.ProjectsClient()
```
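The connection change threads a `datahub/<version>` user-agent into the BigQuery client via `google.api_core`'s `ClientInfo`, so API traffic from ingestion is attributable in GCP logs. The same pattern in isolation (project id and agent string are placeholders):

```python
from google.api_core.client_info import ClientInfo
from google.cloud import bigquery

# Requests sent by this client carry "my-tool/0.1.0" in the User-Agent
# header, which lets operators attribute API traffic to the tool that
# issued it.
client = bigquery.Client(
    project="my-project",  # placeholder project id
    client_info=ClientInfo(user_agent="my-tool/0.1.0"),
)
```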
datahub/ingestion/source/bigquery_v2/bigquery_queries.py

```diff
@@ -7,6 +7,7 @@ from typing_extensions import Self
 
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import SupportStatus, support_status
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.bigquery_v2.bigquery_config import (
@@ -50,6 +51,7 @@ class BigQueryQueriesSourceConfig(
 )
 
 
+@support_status(SupportStatus.CERTIFIED)
 class BigQueryQueriesSource(Source):
     def __init__(self, ctx: PipelineContext, config: BigQueryQueriesSourceConfig):
         self.ctx = ctx
@@ -94,3 +96,4 @@ class BigQueryQueriesSource(Source):
     def close(self) -> None:
         self.queries_extractor.close()
         self.connection.close()
+        super().close()
```
datahub/ingestion/source/bigquery_v2/bigquery_report.py

```diff
@@ -9,7 +9,6 @@ import pydantic
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
 from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
 from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
@@ -78,7 +77,6 @@ class BigQueryQueriesExtractorReport(Report):
 @dataclass
 class BigQueryV2Report(
     SQLSourceReport,
-    IngestionStageReport,
     BaseTimeWindowReport,
     ClassificationReportMixin,
 ):
```
datahub/ingestion/source/bigquery_v2/bigquery_schema.py

```diff
@@ -283,23 +283,30 @@ class BigQuerySchemaApi:
         with self.report.list_datasets_timer:
             self.report.num_list_datasets_api_requests += 1
             datasets = self.bq_client.list_datasets(project_id, max_results=maxResults)
-
-
-
-
-
-
-
-
-            )
-
-
-
-
-
+            result = []
+            for d in datasets:
+                # TODO: Fetch dataset description individually impacts overall performance if the number of datasets is high (hundreds); instead we should fetch in batch for all datasets.
+                # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_get_dataset
+                # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dataset.Dataset
+                dataset = self.bq_client.get_dataset(d.reference)
+
+                location = (
+                    d._properties.get("location")
+                    if hasattr(d, "_properties") and isinstance(d._properties, dict)
+                    else None
+                )
+
+                result.append(
+                    BigqueryDataset(
+                        name=d.dataset_id,
+                        labels=d.labels,
+                        location=location,
+                        comment=dataset.description,
+                        created=dataset.created,
+                        last_altered=dataset.modified,
+                    )
                 )
-
-        ]
+            return result
 
     # This is not used anywhere
     def get_datasets_for_project_id_with_information_schema(
```
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py

```diff
@@ -12,6 +12,7 @@ from datahub.emitter.mce_builder import (
     make_dataset_urn_with_platform_instance,
     make_schema_field_urn,
     make_tag_urn,
+    make_ts_millis,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import BigQueryDatasetKey, ContainerKey, ProjectIdKey
@@ -65,7 +66,7 @@ from datahub.ingestion.source.sql.sql_utils import (
 )
 from datahub.ingestion.source_report.ingestion_stage import (
     METADATA_EXTRACTION,
-
+    IngestionHighStage,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     Status,
@@ -286,6 +287,7 @@ class BigQuerySchemaGenerator:
         yield from gen_database_container(
             database=database,
             name=database,
+            qualified_name=database,
             sub_types=[DatasetContainerSubTypes.BIGQUERY_PROJECT],
             domain_registry=self.domain_registry,
             domain_config=self.config.domain,
@@ -299,6 +301,8 @@ class BigQuerySchemaGenerator:
         description: Optional[str] = None,
         tags: Optional[Dict[str, str]] = None,
         extra_properties: Optional[Dict[str, str]] = None,
+        created: Optional[int] = None,
+        last_modified: Optional[int] = None,
     ) -> Iterable[MetadataWorkUnit]:
         schema_container_key = self.gen_dataset_key(project_id, dataset)
 
@@ -332,6 +336,7 @@ class BigQuerySchemaGenerator:
         yield from gen_schema_container(
             database=project_id,
             schema=dataset,
+            qualified_name=f"{project_id}.{dataset}",
             sub_types=[DatasetContainerSubTypes.BIGQUERY_DATASET],
             domain_registry=self.domain_registry,
             domain_config=self.config.domain,
@@ -347,6 +352,8 @@ class BigQuerySchemaGenerator:
             ),
             tags=tags_joined,
             extra_properties=extra_properties,
+            created=created,
+            last_modified=last_modified,
         )
 
     def _process_project(
@@ -409,7 +416,7 @@ class BigQuerySchemaGenerator:
 
         if self.config.is_profiling_enabled():
             logger.info(f"Starting profiling project {project_id}")
-            with self.report.
+            with self.report.new_high_stage(IngestionHighStage.PROFILING):
                 yield from self.profiler.get_workunits(
                     project_id=project_id,
                     tables=db_tables,
@@ -442,10 +449,12 @@ class BigQuerySchemaGenerator:
             ):
                 yield wu
         except Exception as e:
-
-
+            # If configuration indicates we need table data access (for profiling or use_tables_list_query_v2),
+            # include bigquery.tables.getData in the error message since that's likely the missing permission
+            if self.config.have_table_data_read_permission:
+                action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list, bigquery.tables.getData permissions?"
             else:
-                action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list
+                action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permissions?"
 
             self.report.failure(
                 title="Unable to get tables for dataset",
@@ -482,6 +491,12 @@ class BigQuerySchemaGenerator:
                 else None
             ),
             description=bigquery_dataset.comment,
+            created=make_ts_millis(bigquery_dataset.created)
+            if bigquery_dataset.created
+            else None,
+            last_modified=make_ts_millis(bigquery_dataset.last_altered)
+            if bigquery_dataset.last_altered
+            else None,
         )
 
         columns = None
```
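`make_ts_millis` is used above to turn the dataset's `created`/`modified` datetimes into the epoch-millisecond integers that the container properties carry. A sketch of that conversion (a stand-in, not necessarily DataHub's exact implementation):

```python
from datetime import datetime, timezone


def ts_millis(ts: datetime) -> int:
    # Epoch milliseconds, the form DataHub timestamp fields expect.
    return int(ts.timestamp() * 1000)


assert ts_millis(datetime(2024, 1, 1, tzinfo=timezone.utc)) == 1704067200000
```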
datahub/ingestion/source/bigquery_v2/common.py

```diff
@@ -63,7 +63,7 @@ class BigQueryIdentifierBuilder:
         )
 
     def gen_user_urn(self, user_email: str) -> str:
-        return make_user_urn(user_email
+        return make_user_urn(user_email)
 
     def make_data_platform_urn(self) -> str:
         return make_data_platform_urn(self.platform)
```
datahub/ingestion/source/bigquery_v2/profiler.py

```diff
@@ -189,6 +189,7 @@ WHERE
 
         if len(profile_requests) == 0:
             return
+
         yield from self.generate_profile_workunits(
             profile_requests,
             max_workers=self.config.profiling.max_workers,
@@ -226,10 +227,11 @@ WHERE
             db_name, schema_name, bq_table, self.config.profiling.partition_datetime
         )
 
-        if partition
+        # For partitioned tables, if it has a row count but not a valid partition, that means something went wrong with the partition detection.
+        if partition is None and bq_table.partition_info and bq_table.rows_count:
             self.report.report_warning(
                 title="Profile skipped for partitioned table",
-                message="profile skipped as
+                message="profile skipped as partition id or type was invalid",
                 context=profile_request.pretty_name,
             )
             return None
```
datahub/ingestion/source/bigquery_v2/queries.py

```diff
@@ -45,12 +45,12 @@ SELECT
     tos.OPTION_VALUE as comment,
     t.is_insertable_into,
     t.ddl,
-    ts.row_count,
+    ts.row_count as row_count,
     ts.size_bytes as bytes,
     p.num_partitions,
     p.max_partition_id,
-    p.active_billable_bytes,
-    p.long_term_billable_bytes,
+    p.active_billable_bytes as active_billable_bytes,
+    IFNULL(p.long_term_billable_bytes, 0) as long_term_billable_bytes,
     REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix,
     REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base
 
```
datahub/ingestion/source/bigquery_v2/queries_extractor.py

```diff
@@ -8,7 +8,7 @@ from typing import Collection, Dict, Iterable, List, Optional, TypedDict
 from google.cloud.bigquery import Client
 from pydantic import Field, PositiveInt
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.time_window_config import (
     BaseTimeWindowConfig,
     get_time_bucket,
@@ -36,6 +36,9 @@ from datahub.ingestion.source.bigquery_v2.common import (
     BigQueryFilter,
     BigQueryIdentifierBuilder,
 )
+from datahub.ingestion.source.state.redundant_run_skip_handler import (
+    RedundantQueriesRunSkipHandler,
+)
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
 from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
@@ -86,12 +89,11 @@ class BigQueryQueriesExtractorConfig(BigQueryBaseConfig):
     # TODO: Support stateful ingestion for the time windows.
     window: BaseTimeWindowConfig = BaseTimeWindowConfig()
 
-    local_temp_path: Optional[pathlib.Path] = Field(
-        default=None,
-        description="Local path to store the audit log.",
+    local_temp_path: HiddenFromDocs[Optional[pathlib.Path]] = Field(
         # TODO: For now, this is simply an advanced config to make local testing easier.
         # Eventually, we will want to store date-specific files in the directory and use it as a cache.
-
+        default=None,
+        description="Local path to store the audit log.",
     )
 
     user_email_pattern: AllowDenyPattern = Field(
@@ -136,6 +138,7 @@ class BigQueryQueriesExtractor(Closeable):
         structured_report: SourceReport,
         filters: BigQueryFilter,
         identifiers: BigQueryIdentifierBuilder,
+        redundant_run_skip_handler: Optional[RedundantQueriesRunSkipHandler] = None,
         graph: Optional[DataHubGraph] = None,
         schema_resolver: Optional[SchemaResolver] = None,
         discovered_tables: Optional[Collection[str]] = None,
@@ -159,6 +162,9 @@ class BigQueryQueriesExtractor(Closeable):
         )
 
         self.structured_report = structured_report
+        self.redundant_run_skip_handler = redundant_run_skip_handler
+
+        self.start_time, self.end_time = self._get_time_window()
 
         self.aggregator = SqlParsingAggregator(
             platform=self.identifiers.platform,
@@ -173,8 +179,8 @@ class BigQueryQueriesExtractor(Closeable):
             generate_query_usage_statistics=self.config.include_query_usage_statistics,
             usage_config=BaseUsageConfig(
                 bucket_duration=self.config.window.bucket_duration,
-                start_time=self.
-                end_time=self.
+                start_time=self.start_time,
+                end_time=self.end_time,
                 user_email_pattern=self.config.user_email_pattern,
                 top_n_queries=self.config.top_n_queries,
             ),
@@ -200,6 +206,34 @@ class BigQueryQueriesExtractor(Closeable):
         logger.info(f"Using local temp path: {path}")
         return path
 
+    def _get_time_window(self) -> tuple[datetime, datetime]:
+        if self.redundant_run_skip_handler:
+            start_time, end_time = (
+                self.redundant_run_skip_handler.suggest_run_time_window(
+                    self.config.window.start_time,
+                    self.config.window.end_time,
+                )
+            )
+        else:
+            start_time = self.config.window.start_time
+            end_time = self.config.window.end_time
+
+        # Usage statistics are aggregated per bucket (typically per day).
+        # To ensure accurate aggregated metrics, we need to align the start_time
+        # to the beginning of a bucket so that we include complete bucket periods.
+        if self.config.include_usage_statistics:
+            start_time = get_time_bucket(start_time, self.config.window.bucket_duration)
+
+        return start_time, end_time
+
+    def _update_state(self) -> None:
+        if self.redundant_run_skip_handler:
+            self.redundant_run_skip_handler.update_state(
+                self.config.window.start_time,
+                self.config.window.end_time,
+                self.config.window.bucket_duration,
+            )
+
     def is_temp_table(self, name: str) -> bool:
         try:
             table = BigqueryTableIdentifier.from_string_name(name)
@@ -300,6 +334,8 @@ class BigQueryQueriesExtractor(Closeable):
         shared_connection.close()
         audit_log_file.unlink(missing_ok=True)
 
+        self._update_state()
+
     def deduplicate_queries(
         self, queries: FileBackedList[ObservedQuery]
     ) -> FileBackedDict[Dict[int, ObservedQuery]]:
@@ -356,8 +392,8 @@ class BigQueryQueriesExtractor(Closeable):
         query_log_query = _build_enriched_query_log_query(
             project_id=project.id,
             region=region,
-            start_time=self.
-            end_time=self.
+            start_time=self.start_time,
+            end_time=self.end_time,
         )
 
         logger.info(f"Fetching query log from BQ Project {project.id} for {region}")
```
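The new `_get_time_window` helper aligns `start_time` to a bucket boundary whenever usage statistics are enabled. A standalone sketch of the reasoning, assuming daily buckets (`truncate_to_day` stands in for DataHub's `get_time_bucket`):

```python
from datetime import datetime, timezone


def truncate_to_day(ts: datetime) -> datetime:
    # Align a timestamp to the start of its daily usage bucket.
    return ts.replace(hour=0, minute=0, second=0, microsecond=0)


start = datetime(2024, 5, 14, 9, 30, tzinfo=timezone.utc)
# Pulling the window start back to midnight means the first day's usage
# counts are computed over a complete bucket instead of a partial one,
# so aggregated metrics for that day are not undercounted.
assert truncate_to_day(start) == datetime(2024, 5, 14, tzinfo=timezone.utc)
```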
datahub/ingestion/source/cassandra/cassandra.py

```diff
@@ -80,7 +80,7 @@ class KeyspaceKey(ContainerKey):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class CassandraSource(StatefulIngestionSourceBase):
@@ -296,13 +296,11 @@ class CassandraSource(StatefulIngestionSourceBase):
                 qualified_name=dataset_name,
                 description=view.comment,
                 custom_properties=self._get_dataset_custom_props(view),
-
-
-
-
-
-                ),
-            ],
+                view_definition=ViewPropertiesClass(
+                    materialized=True,
+                    viewLogic=view.where_clause,  # Use the WHERE clause as view logic
+                    viewLanguage="CQL",  # Use "CQL" as the language
+                ),
             )
 
         # Construct and emit lineage off of 'base_table_name'
```
datahub/ingestion/source/cassandra/cassandra_api.py

```diff
@@ -132,7 +132,23 @@ class CassandraAPI:
 
         ssl_context = None
         if self.config.ssl_ca_certs:
-
+            # Map SSL version string to ssl module constant
+            ssl_version_map = {
+                "TLS_CLIENT": ssl.PROTOCOL_TLS_CLIENT,
+                "TLSv1": ssl.PROTOCOL_TLSv1,
+                "TLSv1_1": ssl.PROTOCOL_TLSv1_1,
+                "TLSv1_2": ssl.PROTOCOL_TLSv1_2,
+                "TLSv1_3": ssl.PROTOCOL_TLSv1_2,  # Python's ssl module uses TLSv1_2 for TLS 1.3
+            }
+
+            ssl_protocol = (
+                ssl_version_map.get(
+                    self.config.ssl_version, ssl.PROTOCOL_TLS_CLIENT
+                )
+                if self.config.ssl_version
+                else ssl.PROTOCOL_TLS_CLIENT
+            )
+            ssl_context = ssl.SSLContext(ssl_protocol)
             ssl_context.load_verify_locations(self.config.ssl_ca_certs)
             if self.config.ssl_certfile and self.config.ssl_keyfile:
                 ssl_context.load_cert_chain(
```
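For reference, the `ssl` calls used above compose like this outside the connector. `PROTOCOL_TLS_CLIENT` enables hostname checking and certificate verification by default, and Python's `ssl` module exposes no `PROTOCOL_TLSv1_3` constant, which is why the map falls back for that key (file paths below are placeholders):

```python
import ssl

# Client-side TLS context: trust a CA bundle and optionally present a
# client certificate for mutual TLS.
ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
ctx.load_verify_locations("/etc/ssl/cassandra-ca.pem")  # placeholder path
ctx.load_cert_chain(
    certfile="/etc/ssl/client.pem",  # placeholder path
    keyfile="/etc/ssl/client.key",  # placeholder path
)
```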
datahub/ingestion/source/cassandra/cassandra_config.py

```diff
@@ -94,6 +94,11 @@ class CassandraSourceConfig(
         description="Path to the SSL key file for SSL connections.",
     )
 
+    ssl_version: Optional[str] = Field(
+        default="TLS_CLIENT",
+        description="SSL protocol version to use for connections. Options: TLS_CLIENT, TLSv1, TLSv1_1, TLSv1_2, TLSv1_3. Defaults to TLS_CLIENT.",
+    )
+
     keyspace_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns to filter keyspaces for ingestion.",
```
datahub/ingestion/source/cassandra/cassandra_profiling.py

```diff
@@ -18,7 +18,7 @@ from datahub.ingestion.source.cassandra.cassandra_api import (
 )
 from datahub.ingestion.source.cassandra.cassandra_config import CassandraSourceConfig
 from datahub.ingestion.source.cassandra.cassandra_utils import CassandraSourceReport
-from datahub.ingestion.source_report.ingestion_stage import
+from datahub.ingestion.source_report.ingestion_stage import IngestionHighStage
 from datahub.metadata.schema_classes import (
     DatasetFieldProfileClass,
     DatasetProfileClass,
@@ -70,11 +70,12 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            with
-
-
-
-
+            with (
+                self.report.new_high_stage(IngestionHighStage.PROFILING),
+                ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor,
+            ):
                 future_to_dataset = {
                     executor.submit(
                         self.generate_profile,
```
datahub/ingestion/source/cassandra/cassandra_utils.py

```diff
@@ -6,7 +6,6 @@ from datahub.ingestion.source.cassandra.cassandra_api import CassandraColumn
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaField,
     SchemaFieldDataType,
@@ -35,7 +34,7 @@ SYSTEM_KEYSPACE_LIST = set(
 
 
 @dataclass
-class CassandraSourceReport(StaleEntityRemovalSourceReport, IngestionStageReport):
+class CassandraSourceReport(StaleEntityRemovalSourceReport):
     num_tables_failed: int = 0
     num_views_failed: int = 0
     tables_scanned: int = 0
```
|