acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -27,6 +27,7 @@ from sqlalchemy.exc import ProgrammingError
|
|
|
27
27
|
from sqlalchemy.sql import sqltypes as types
|
|
28
28
|
from sqlalchemy.types import TypeDecorator, TypeEngine
|
|
29
29
|
|
|
30
|
+
from datahub.configuration.common import AllowDenyPattern
|
|
30
31
|
from datahub.emitter.mce_builder import (
|
|
31
32
|
make_data_platform_urn,
|
|
32
33
|
make_dataplatform_instance_urn,
|
|
@@ -45,6 +46,7 @@ from datahub.ingestion.api.source import (
|
|
|
45
46
|
TestableSource,
|
|
46
47
|
TestConnectionReport,
|
|
47
48
|
)
|
|
49
|
+
from datahub.ingestion.api.source_protocols import MetadataWorkUnitIterable
|
|
48
50
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
49
51
|
from datahub.ingestion.glossary.classification_mixin import (
|
|
50
52
|
SAMPLE_SIZE_MULTIPLIER,
|
|
@@ -54,6 +56,7 @@ from datahub.ingestion.source.common.data_reader import DataReader
|
|
|
54
56
|
from datahub.ingestion.source.common.subtypes import (
|
|
55
57
|
DatasetContainerSubTypes,
|
|
56
58
|
DatasetSubTypes,
|
|
59
|
+
SourceCapabilityModifier,
|
|
57
60
|
)
|
|
58
61
|
from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
|
|
59
62
|
from datahub.ingestion.source.sql.sql_report import SQLSourceReport
|
|
@@ -70,39 +73,47 @@ from datahub.ingestion.source.sql.sql_utils import (
|
|
|
70
73
|
from datahub.ingestion.source.sql.sqlalchemy_data_reader import (
|
|
71
74
|
SqlAlchemyTableDataReader,
|
|
72
75
|
)
|
|
76
|
+
from datahub.ingestion.source.sql.stored_procedures.base import (
|
|
77
|
+
BaseProcedure,
|
|
78
|
+
generate_procedure_container_workunits,
|
|
79
|
+
generate_procedure_workunits,
|
|
80
|
+
)
|
|
73
81
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
74
82
|
StaleEntityRemovalHandler,
|
|
75
83
|
)
|
|
76
84
|
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
77
85
|
StatefulIngestionSourceBase,
|
|
78
86
|
)
|
|
79
|
-
from datahub.metadata.
|
|
80
|
-
from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
|
|
81
|
-
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
|
|
82
|
-
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
|
|
87
|
+
from datahub.metadata.schema_classes import (
|
|
83
88
|
ArrayTypeClass,
|
|
84
89
|
BooleanTypeClass,
|
|
85
90
|
BytesTypeClass,
|
|
91
|
+
DataPlatformInstanceClass,
|
|
92
|
+
DatasetLineageTypeClass,
|
|
93
|
+
DatasetPropertiesClass,
|
|
94
|
+
DatasetSnapshotClass,
|
|
86
95
|
DateTypeClass,
|
|
87
96
|
EnumTypeClass,
|
|
88
|
-
|
|
89
|
-
|
|
97
|
+
FineGrainedLineageClass,
|
|
98
|
+
FineGrainedLineageDownstreamTypeClass,
|
|
99
|
+
FineGrainedLineageUpstreamTypeClass,
|
|
100
|
+
ForeignKeyConstraintClass,
|
|
101
|
+
GlobalTagsClass,
|
|
102
|
+
MetadataChangeEventClass,
|
|
103
|
+
MySqlDDLClass,
|
|
90
104
|
NullTypeClass,
|
|
91
105
|
NumberTypeClass,
|
|
92
106
|
RecordTypeClass,
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
107
|
+
SchemaFieldClass,
|
|
108
|
+
SchemaFieldDataTypeClass,
|
|
109
|
+
SchemaMetadataClass,
|
|
110
|
+
StatusClass,
|
|
96
111
|
StringTypeClass,
|
|
97
|
-
TimeTypeClass,
|
|
98
|
-
)
|
|
99
|
-
from datahub.metadata.schema_classes import (
|
|
100
|
-
DataPlatformInstanceClass,
|
|
101
|
-
DatasetLineageTypeClass,
|
|
102
|
-
DatasetPropertiesClass,
|
|
103
|
-
GlobalTagsClass,
|
|
104
112
|
SubTypesClass,
|
|
105
113
|
TagAssociationClass,
|
|
114
|
+
TimeTypeClass,
|
|
115
|
+
UpstreamClass,
|
|
116
|
+
UpstreamLineageClass,
|
|
106
117
|
ViewPropertiesClass,
|
|
107
118
|
)
|
|
108
119
|
from datahub.sql_parsing.schema_resolver import SchemaResolver
|
|
@@ -112,6 +123,7 @@ from datahub.utilities.registries.domain_registry import DomainRegistry
|
|
|
112
123
|
from datahub.utilities.sqlalchemy_type_converter import (
|
|
113
124
|
get_native_data_type_for_sqlalchemy_type,
|
|
114
125
|
)
|
|
126
|
+
from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path
|
|
115
127
|
|
|
116
128
|
if TYPE_CHECKING:
|
|
117
129
|
from datahub.ingestion.source.ge_data_profiler import (
|
|
@@ -198,7 +210,7 @@ def make_sqlalchemy_type(name: str) -> Type[TypeEngine]:
|
|
|
198
210
|
|
|
199
211
|
def get_column_type(
|
|
200
212
|
sql_report: SQLSourceReport, dataset_name: str, column_type: Any
|
|
201
|
-
) ->
|
|
213
|
+
) -> SchemaFieldDataTypeClass:
|
|
202
214
|
"""
|
|
203
215
|
Maps SQLAlchemy types (https://docs.sqlalchemy.org/en/13/core/type_basics.html) to corresponding schema types
|
|
204
216
|
"""
|
|
@@ -223,7 +235,7 @@ def get_column_type(
|
|
|
223
235
|
)
|
|
224
236
|
TypeClass = NullTypeClass
|
|
225
237
|
|
|
226
|
-
return
|
|
238
|
+
return SchemaFieldDataTypeClass(type=TypeClass())
|
|
227
239
|
|
|
228
240
|
|
|
229
241
|
def get_schema_metadata(
|
|
@@ -232,10 +244,10 @@ def get_schema_metadata(
|
|
|
232
244
|
platform: str,
|
|
233
245
|
columns: List[dict],
|
|
234
246
|
pk_constraints: Optional[dict] = None,
|
|
235
|
-
foreign_keys: Optional[List[
|
|
236
|
-
canonical_schema: Optional[List[
|
|
247
|
+
foreign_keys: Optional[List[ForeignKeyConstraintClass]] = None,
|
|
248
|
+
canonical_schema: Optional[List[SchemaFieldClass]] = None,
|
|
237
249
|
simplify_nested_field_paths: bool = False,
|
|
238
|
-
) ->
|
|
250
|
+
) -> SchemaMetadataClass:
|
|
239
251
|
if (
|
|
240
252
|
simplify_nested_field_paths
|
|
241
253
|
and canonical_schema is not None
|
|
@@ -243,12 +255,12 @@ def get_schema_metadata(
|
|
|
243
255
|
):
|
|
244
256
|
canonical_schema = downgrade_schema_from_v2(canonical_schema)
|
|
245
257
|
|
|
246
|
-
schema_metadata =
|
|
258
|
+
schema_metadata = SchemaMetadataClass(
|
|
247
259
|
schemaName=dataset_name,
|
|
248
260
|
platform=make_data_platform_urn(platform),
|
|
249
261
|
version=0,
|
|
250
262
|
hash="",
|
|
251
|
-
platformSchema=
|
|
263
|
+
platformSchema=MySqlDDLClass(tableSchema=""),
|
|
252
264
|
fields=canonical_schema or [],
|
|
253
265
|
)
|
|
254
266
|
if foreign_keys is not None and foreign_keys != []:
|
|
@@ -287,6 +299,10 @@ class ProfileMetadata:
|
|
|
287
299
|
SourceCapability.CONTAINERS,
|
|
288
300
|
"Enabled by default",
|
|
289
301
|
supported=True,
|
|
302
|
+
subtype_modifier=[
|
|
303
|
+
SourceCapabilityModifier.DATABASE,
|
|
304
|
+
SourceCapabilityModifier.SCHEMA,
|
|
305
|
+
],
|
|
290
306
|
)
|
|
291
307
|
@capability(
|
|
292
308
|
SourceCapability.DESCRIPTIONS,
|
|
@@ -298,6 +314,20 @@ class ProfileMetadata:
|
|
|
298
314
|
"Enabled by default",
|
|
299
315
|
supported=True,
|
|
300
316
|
)
|
|
317
|
+
@capability(
|
|
318
|
+
SourceCapability.LINEAGE_COARSE,
|
|
319
|
+
"Enabled by default to get lineage for views via `include_view_lineage`",
|
|
320
|
+
subtype_modifier=[SourceCapabilityModifier.VIEW],
|
|
321
|
+
)
|
|
322
|
+
@capability(
|
|
323
|
+
SourceCapability.LINEAGE_FINE,
|
|
324
|
+
"Enabled by default to get lineage for views via `include_view_column_lineage`",
|
|
325
|
+
subtype_modifier=[SourceCapabilityModifier.VIEW],
|
|
326
|
+
)
|
|
327
|
+
@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
|
|
328
|
+
@capability(
|
|
329
|
+
SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
|
|
330
|
+
)
|
|
301
331
|
class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
302
332
|
"""A Base class for all SQL Sources that use SQLAlchemy to extend"""
|
|
303
333
|
|
|
@@ -508,6 +538,24 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
508
538
|
if self.config.include_views:
|
|
509
539
|
yield from self.loop_views(inspector, schema, self.config)
|
|
510
540
|
|
|
541
|
+
if getattr(self.config, "include_stored_procedures", False):
|
|
542
|
+
try:
|
|
543
|
+
yield from self.loop_stored_procedures(inspector, schema, self.config)
|
|
544
|
+
except NotImplementedError as e:
|
|
545
|
+
self.report.warning(
|
|
546
|
+
title="Stored procedures not supported",
|
|
547
|
+
message="The current SQL dialect does not support stored procedures.",
|
|
548
|
+
context=f"{database}.{schema}",
|
|
549
|
+
exc=e,
|
|
550
|
+
)
|
|
551
|
+
except Exception as e:
|
|
552
|
+
self.report.failure(
|
|
553
|
+
title="Failed to list stored procedures for schema",
|
|
554
|
+
message="An error occurred while listing procedures for the schema.",
|
|
555
|
+
context=f"{database}.{schema}",
|
|
556
|
+
exc=e,
|
|
557
|
+
)
|
|
558
|
+
|
|
511
559
|
def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
|
|
512
560
|
return [
|
|
513
561
|
*super().get_workunit_processors(),
|
|
@@ -531,19 +579,6 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
531
579
|
self._add_default_options(sql_config)
|
|
532
580
|
|
|
533
581
|
for inspector in self.get_inspectors():
|
|
534
|
-
profiler = None
|
|
535
|
-
profile_requests: List["GEProfilerRequest"] = []
|
|
536
|
-
if sql_config.is_profiling_enabled():
|
|
537
|
-
profiler = self.get_profiler_instance(inspector)
|
|
538
|
-
try:
|
|
539
|
-
self.add_profile_metadata(inspector)
|
|
540
|
-
except Exception as e:
|
|
541
|
-
self.warn(
|
|
542
|
-
logger,
|
|
543
|
-
"profile_metadata",
|
|
544
|
-
f"Failed to get enrichment data for profile {e}",
|
|
545
|
-
)
|
|
546
|
-
|
|
547
582
|
db_name = self.get_db_name(inspector)
|
|
548
583
|
yield from self.get_database_level_workunits(
|
|
549
584
|
inspector=inspector,
|
|
@@ -559,17 +594,41 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
559
594
|
database=db_name,
|
|
560
595
|
)
|
|
561
596
|
|
|
597
|
+
# Generate workunit for aggregated SQL parsing results
|
|
598
|
+
yield from self._generate_aggregator_workunits()
|
|
599
|
+
|
|
600
|
+
def is_profiling_enabled_internal(self) -> bool:
    """Report whether profiling is enabled, as answered by the source config."""
    source_config = self.config
    return source_config.is_profiling_enabled()
|
|
602
|
+
|
|
603
|
+
def get_profiling_internal(
    self,
) -> MetadataWorkUnitIterable:
    """Yield profiling work units for every inspector of this source.

    For each inspector a profiler instance is obtained, profile requests are
    collected schema by schema through ``loop_profiler_requests``, and, when
    both a profiler and at least one request exist, the requests are handed
    to ``loop_profiler``.
    """
    sql_config = self.config
    for inspector in self.get_inspectors():
        profile_requests: List["GEProfilerRequest"] = []
        profiler = self.get_profiler_instance(inspector)
        try:
            self.add_profile_metadata(inspector)
        except Exception as e:
            # A failure here is reported as a warning; profiling continues.
            self.warn(
                logger,
                "profile_metadata",
                f"Failed to get enrichment data for profile {e}",
            )
        db_name = self.get_db_name(inspector)
        for schema in self.get_allowed_schemas(inspector, db_name):
            if profiler:
                profile_requests += list(
                    self.loop_profiler_requests(inspector, schema, sql_config)
                )
        # Requests from all schemas of this inspector are profiled together.
        if profiler and profile_requests:
            yield from self.loop_profiler(
                profile_requests, profiler, platform=self.platform
            )
|
|
571
629
|
|
|
572
|
-
|
|
630
|
+
def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
    """Emit one work unit per metadata item produced by the SQL parsing aggregator.

    Subclasses may override this hook.
    """
    yield from (
        proposal.as_workunit() for proposal in self.aggregator.gen_metadata()
    )
|
|
575
634
|
|
|
@@ -590,7 +649,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
590
649
|
schema: str,
|
|
591
650
|
fk_dict: Dict[str, str],
|
|
592
651
|
inspector: Inspector,
|
|
593
|
-
) ->
|
|
652
|
+
) -> ForeignKeyConstraintClass:
|
|
594
653
|
referred_schema: Optional[str] = fk_dict.get("referred_schema")
|
|
595
654
|
|
|
596
655
|
if not referred_schema:
|
|
@@ -617,7 +676,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
617
676
|
for f in fk_dict["referred_columns"]
|
|
618
677
|
]
|
|
619
678
|
|
|
620
|
-
return
|
|
679
|
+
return ForeignKeyConstraintClass(
|
|
621
680
|
fk_dict["name"], foreign_fields, source_fields, foreign_dataset
|
|
622
681
|
)
|
|
623
682
|
|
|
@@ -714,7 +773,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
714
773
|
self.config.platform_instance,
|
|
715
774
|
self.config.env,
|
|
716
775
|
)
|
|
717
|
-
dataset_snapshot =
|
|
776
|
+
dataset_snapshot = DatasetSnapshotClass(
|
|
718
777
|
urn=dataset_urn,
|
|
719
778
|
aspects=[StatusClass(removed=False)],
|
|
720
779
|
)
|
|
@@ -742,6 +801,30 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
742
801
|
tags=extra_tags,
|
|
743
802
|
partition_keys=partitions,
|
|
744
803
|
)
|
|
804
|
+
|
|
805
|
+
if self.config.include_table_location_lineage and location_urn:
|
|
806
|
+
self.aggregator.add_known_lineage_mapping(
|
|
807
|
+
upstream_urn=location_urn,
|
|
808
|
+
downstream_urn=dataset_snapshot.urn,
|
|
809
|
+
lineage_type=DatasetLineageTypeClass.COPY,
|
|
810
|
+
)
|
|
811
|
+
external_upstream_table = UpstreamClass(
|
|
812
|
+
dataset=location_urn,
|
|
813
|
+
type=DatasetLineageTypeClass.COPY,
|
|
814
|
+
)
|
|
815
|
+
|
|
816
|
+
yield MetadataChangeProposalWrapper(
|
|
817
|
+
entityUrn=dataset_snapshot.urn,
|
|
818
|
+
aspect=UpstreamLineageClass(
|
|
819
|
+
upstreams=[external_upstream_table],
|
|
820
|
+
fineGrainedLineages=self.get_fine_grained_lineages(
|
|
821
|
+
dataset_urn=dataset_snapshot.urn,
|
|
822
|
+
upstream_dataset_urn=location_urn,
|
|
823
|
+
schema_fields=schema_fields,
|
|
824
|
+
),
|
|
825
|
+
),
|
|
826
|
+
).as_workunit()
|
|
827
|
+
|
|
745
828
|
schema_metadata = get_schema_metadata(
|
|
746
829
|
self.report,
|
|
747
830
|
dataset_name,
|
|
@@ -762,7 +845,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
762
845
|
yield from self.add_table_to_schema_container(
|
|
763
846
|
dataset_urn=dataset_urn, db_name=db_name, schema=schema
|
|
764
847
|
)
|
|
765
|
-
mce =
|
|
848
|
+
mce = MetadataChangeEventClass(proposedSnapshot=dataset_snapshot)
|
|
766
849
|
yield SqlWorkUnit(id=dataset_name, mce=mce)
|
|
767
850
|
dpi_aspect = self.get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
|
|
768
851
|
if dpi_aspect:
|
|
@@ -797,7 +880,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
797
880
|
schema: str,
|
|
798
881
|
table: str,
|
|
799
882
|
data_reader: Optional[DataReader],
|
|
800
|
-
schema_metadata:
|
|
883
|
+
schema_metadata: SchemaMetadataClass,
|
|
801
884
|
) -> None:
|
|
802
885
|
try:
|
|
803
886
|
if (
|
|
@@ -908,7 +991,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
908
991
|
|
|
909
992
|
def _get_foreign_keys(
|
|
910
993
|
self, dataset_urn: str, inspector: Inspector, schema: str, table: str
|
|
911
|
-
) -> List[
|
|
994
|
+
) -> List[ForeignKeyConstraintClass]:
|
|
912
995
|
try:
|
|
913
996
|
foreign_keys = [
|
|
914
997
|
self.get_foreign_key_metadata(dataset_urn, schema, fk_rec, inspector)
|
|
@@ -922,6 +1005,42 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
922
1005
|
foreign_keys = []
|
|
923
1006
|
return foreign_keys
|
|
924
1007
|
|
|
1008
|
+
def get_fine_grained_lineages(
    self,
    dataset_urn: str,
    upstream_dataset_urn: str,
    schema_fields: List[SchemaFieldClass],
) -> Optional[List[FineGrainedLineageClass]]:
    """Build field-to-field lineage between a dataset and one upstream dataset.

    Every schema field is linked to the field with the same simple (v1) path
    in ``upstream_dataset_urn``.

    Args:
        dataset_urn: URN of the downstream dataset.
        upstream_dataset_urn: URN of the upstream dataset.
        schema_fields: Fields of the downstream dataset.

    Returns:
        The lineage entries, or ``None`` when no entry could be built.
    """
    fine_grained_lineages: List[FineGrainedLineageClass] = []

    for schema_field in schema_fields:
        try:
            # Converted once and reused for both the downstream and upstream URN.
            field_path_v1 = get_simple_field_path_from_v2_field_path(
                schema_field.fieldPath
            )
            fine_grained_lineages.append(
                FineGrainedLineageClass(
                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
                    downstreams=[make_schema_field_urn(dataset_urn, field_path_v1)],
                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
                    upstreams=[
                        make_schema_field_urn(upstream_dataset_urn, field_path_v1)
                    ],
                )
            )
        except Exception as e:
            # A field that cannot be processed is logged and skipped.
            logger.warning(
                f"Error processing field path for {dataset_urn}: {str(e)}"
            )

    return fine_grained_lineages or None
|
|
1043
|
+
|
|
925
1044
|
def get_schema_fields(
|
|
926
1045
|
self,
|
|
927
1046
|
dataset_name: str,
|
|
@@ -930,7 +1049,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
930
1049
|
pk_constraints: Optional[dict] = None,
|
|
931
1050
|
partition_keys: Optional[List[str]] = None,
|
|
932
1051
|
tags: Optional[Dict[str, List[str]]] = None,
|
|
933
|
-
) -> List[
|
|
1052
|
+
) -> List[SchemaFieldClass]:
|
|
934
1053
|
canonical_schema = []
|
|
935
1054
|
for column in columns:
|
|
936
1055
|
column_tags: Optional[List[str]] = None
|
|
@@ -955,14 +1074,14 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
955
1074
|
pk_constraints: Optional[dict] = None,
|
|
956
1075
|
partition_keys: Optional[List[str]] = None,
|
|
957
1076
|
tags: Optional[List[str]] = None,
|
|
958
|
-
) -> List[
|
|
1077
|
+
) -> List[SchemaFieldClass]:
|
|
959
1078
|
gtc: Optional[GlobalTagsClass] = None
|
|
960
1079
|
if tags:
|
|
961
1080
|
tags_str = [make_tag_urn(t) for t in tags]
|
|
962
1081
|
tags_tac = [TagAssociationClass(t) for t in tags_str]
|
|
963
1082
|
gtc = GlobalTagsClass(tags_tac)
|
|
964
1083
|
full_type = column.get("full_type")
|
|
965
|
-
field =
|
|
1084
|
+
field = SchemaFieldClass(
|
|
966
1085
|
fieldPath=column["name"],
|
|
967
1086
|
type=get_column_type(self.report, dataset_name, column["type"]),
|
|
968
1087
|
nativeDataType=(
|
|
@@ -1092,7 +1211,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
1092
1211
|
default_schema=default_schema,
|
|
1093
1212
|
)
|
|
1094
1213
|
|
|
1095
|
-
dataset_snapshot =
|
|
1214
|
+
dataset_snapshot = DatasetSnapshotClass(
|
|
1096
1215
|
urn=dataset_urn,
|
|
1097
1216
|
aspects=[StatusClass(removed=False)],
|
|
1098
1217
|
)
|
|
@@ -1111,7 +1230,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
1111
1230
|
dataset_snapshot.aspects.append(dataset_properties)
|
|
1112
1231
|
if schema_metadata:
|
|
1113
1232
|
dataset_snapshot.aspects.append(schema_metadata)
|
|
1114
|
-
mce =
|
|
1233
|
+
mce = MetadataChangeEventClass(proposedSnapshot=dataset_snapshot)
|
|
1115
1234
|
yield SqlWorkUnit(id=dataset_name, mce=mce)
|
|
1116
1235
|
dpi_aspect = self.get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
|
|
1117
1236
|
if dpi_aspect:
|
|
@@ -1350,3 +1469,116 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
1350
1469
|
|
|
1351
1470
|
def get_report(self):
|
|
1352
1471
|
return self.report
|
|
1472
|
+
|
|
1473
|
+
def loop_stored_procedures(
    self,
    inspector: Inspector,
    schema: str,
    config: Union[SQLCommonConfig, Type[SQLCommonConfig]],
) -> Iterable[MetadataWorkUnit]:
    """
    Loop over the schema's stored procedures and emit them as dataJobs.

    Nothing is yielded when ``fetch_procedures_for_schema`` returns no procedures.
    """
    database = self.get_db_name(inspector)
    found = self.fetch_procedures_for_schema(inspector, schema, database)
    if not found:
        return
    yield from self._process_procedures(found, database, schema)
|
|
1487
|
+
|
|
1488
|
+
def fetch_procedures_for_schema(
    self, inspector: Inspector, schema: str, db_name: str
) -> List[BaseProcedure]:
    """Return the stored procedures of ``schema`` that pass the procedure pattern.

    Procedures come from ``get_procedures_for_schema``. Each is checked by
    its identifier against ``config.procedure_pattern`` (allow-all when the
    config has no such attribute). Rejected procedures are reported as dropped.

    Returns:
        The allowed procedures, or an empty list when fetching fails.

    Raises:
        NotImplementedError: Propagated unchanged so the caller can tell
            "not supported" apart from a fetch failure.
    """
    try:
        raw_procedures: List[BaseProcedure] = self.get_procedures_for_schema(
            inspector, schema, db_name
        )
        # The pattern is the same for every procedure, so resolve it once.
        procedure_pattern = getattr(
            self.config, "procedure_pattern", AllowDenyPattern.allow_all()
        )
        procedures: List[BaseProcedure] = []
        for procedure in raw_procedures:
            procedure_qualified_name = self.get_identifier(
                schema=schema,
                entity=procedure.name,
                inspector=inspector,
            )
            if procedure_pattern.allowed(procedure_qualified_name):
                procedures.append(procedure)
            else:
                self.report.report_dropped(procedure_qualified_name)
        return procedures
    except NotImplementedError:
        raise
    except Exception as e:
        self.report.warning(
            title="Failed to get procedures for schema",
            message="An error occurred while fetching procedures for the schema.",
            context=f"{db_name}.{schema}",
            exc=e,
        )
        return []
|
|
1521
|
+
|
|
1522
|
+
def get_procedures_for_schema(
    self, inspector: Inspector, schema: str, db_name: str
) -> List[BaseProcedure]:
    """Hook for listing the stored procedures of a schema.

    Raises:
        NotImplementedError: Always, in this implementation.
    """
    message = "Subclasses must implement the 'get_procedures_for_schema' method."
    raise NotImplementedError(message)
|
|
1528
|
+
|
|
1529
|
+
def _process_procedures(
    self,
    procedures: List[BaseProcedure],
    db_name: str,
    schema: str,
) -> Iterable[MetadataWorkUnit]:
    """Emit the procedure container work units, then the work units of each procedure.

    The container work units are generated only when ``procedures`` is non-empty.
    """
    if procedures:
        platform_instance = self.config.platform_instance
        environment = self.config.env
        container_database_key = gen_database_key(
            database=db_name,
            platform=self.platform,
            platform_instance=platform_instance,
            env=environment,
        )
        container_schema_key = gen_schema_key(
            db_name=db_name,
            schema=schema,
            platform=self.platform,
            platform_instance=platform_instance,
            env=environment,
        )
        yield from generate_procedure_container_workunits(
            database_key=container_database_key,
            schema_key=container_schema_key,
        )
    for item in procedures:
        yield from self._process_procedure(item, schema, db_name)
|
|
1553
|
+
|
|
1554
|
+
def _process_procedure(
    self,
    procedure: BaseProcedure,
    schema: str,
    db_name: str,
) -> Iterable[MetadataWorkUnit]:
    """Emit the work units for one stored procedure.

    Any exception raised while building or yielding the work units is
    recorded as a report warning instead of propagating.
    """
    try:
        # Built inside the try so that key-generation errors are reported too.
        database_key = gen_database_key(
            database=db_name,
            platform=self.platform,
            platform_instance=self.config.platform_instance,
            env=self.config.env,
        )
        schema_key = gen_schema_key(
            db_name=db_name,
            schema=schema,
            platform=self.platform,
            platform_instance=self.config.platform_instance,
            env=self.config.env,
        )
        resolver = self.get_schema_resolver()
        yield from generate_procedure_workunits(
            procedure=procedure,
            database_key=database_key,
            schema_key=schema_key,
            schema_resolver=resolver,
        )
    except Exception as e:
        self.report.warning(
            title="Failed to emit stored procedure",
            message="An error occurred while emitting stored procedure",
            context=procedure.name,
            exc=e,
        )
|
|
@@ -57,10 +57,11 @@ class GenericProfiler:
|
|
|
57
57
|
platform: Optional[str] = None,
|
|
58
58
|
profiler_args: Optional[Dict] = None,
|
|
59
59
|
) -> Iterable[MetadataWorkUnit]:
|
|
60
|
+
# We don't run ge profiling queries if table profiling is enabled or if the row count is 0.
|
|
60
61
|
ge_profile_requests: List[GEProfilerRequest] = [
|
|
61
62
|
cast(GEProfilerRequest, request)
|
|
62
63
|
for request in requests
|
|
63
|
-
if not request.profile_table_level_only
|
|
64
|
+
if not request.profile_table_level_only or request.table.rows_count == 0
|
|
64
65
|
]
|
|
65
66
|
table_level_profile_requests: List[TableProfilerRequest] = [
|
|
66
67
|
request for request in requests if request.profile_table_level_only
|
|
@@ -459,6 +459,25 @@ VERTICA_SQL_TYPES_MAP: Dict[str, Any] = {
|
|
|
459
459
|
"uuid": StringType,
|
|
460
460
|
}
|
|
461
461
|
|
|
462
|
+
# Neo4j property types mapping
|
|
463
|
+
# https://neo4j.com/docs/cypher-manual/current/values-and-types/property-structural-constructed/
|
|
464
|
+
NEO4J_TYPES_MAP: Dict[str, Any] = {
|
|
465
|
+
"boolean": BooleanType,
|
|
466
|
+
"date": DateType,
|
|
467
|
+
"duration": TimeType, # Neo4j duration represents a temporal amount
|
|
468
|
+
"float": NumberType,
|
|
469
|
+
"integer": NumberType,
|
|
470
|
+
"list": ArrayType,
|
|
471
|
+
"local_date_time": TimeType,
|
|
472
|
+
"local_time": TimeType,
|
|
473
|
+
"point": StringType, # Neo4j point - spatial coordinate, represented as string
|
|
474
|
+
"string": StringType,
|
|
475
|
+
"zoned_date_time": TimeType,
|
|
476
|
+
"zoned_time": TimeType,
|
|
477
|
+
"node": StringType, # Neo4j object type
|
|
478
|
+
"relationship": StringType, # Neo4j object type
|
|
479
|
+
}
|
|
480
|
+
|
|
462
481
|
|
|
463
482
|
_merged_mapping = {
|
|
464
483
|
"boolean": BooleanType,
|
|
@@ -478,6 +497,7 @@ _merged_mapping = {
|
|
|
478
497
|
**TRINO_SQL_TYPES_MAP,
|
|
479
498
|
**ATHENA_SQL_TYPES_MAP,
|
|
480
499
|
**VERTICA_SQL_TYPES_MAP,
|
|
500
|
+
**NEO4J_TYPES_MAP,
|
|
481
501
|
}
|
|
482
502
|
|
|
483
503
|
|
|
@@ -487,6 +507,8 @@ def resolve_sql_type(
|
|
|
487
507
|
) -> Optional[DATAHUB_FIELD_TYPE]:
|
|
488
508
|
# In theory, we should use the platform-specific mapping where available.
|
|
489
509
|
# However, the types don't ever conflict, so the merged mapping is fine.
|
|
510
|
+
# Wrong assumption - there ARE conflicts as the test_type_conflicts_across_platforms in test_sql_types.py shows.
|
|
511
|
+
# TODO: revisit this and make platform-specific mappings work.
|
|
490
512
|
TypeClass: Optional[Type[DATAHUB_FIELD_TYPE]] = (
|
|
491
513
|
_merged_mapping.get(column_type) if column_type else None
|
|
492
514
|
)
|
|
@@ -1,8 +1,45 @@
|
|
|
1
|
-
from typing import Any, Dict, Optional
|
|
1
|
+
from typing import Any, Dict, Optional, Tuple
|
|
2
2
|
|
|
3
3
|
from sqlalchemy.engine import URL
|
|
4
4
|
|
|
5
5
|
|
|
6
|
+
def parse_host_port(
    host_port: str, default_port: Optional[int] = None
) -> Tuple[str, Optional[int]]:
    """
    Parse a host:port string into separate host and port components.

    Args:
        host_port: String in format "host:port" or just "host". A bracketed
            IPv6 literal ("[::1]" or "[::1]:5432") is also accepted; the
            brackets are kept as part of the returned host.
        default_port: Optional default port to use if not specified in host_port

    Returns:
        Tuple of (hostname, port) where port may be None if not specified

    Examples:
        >>> parse_host_port("localhost:3306")
        ('localhost', 3306)
        >>> parse_host_port("localhost")
        ('localhost', None)
        >>> parse_host_port("localhost", 5432)
        ('localhost', 5432)
        >>> parse_host_port("db.example.com:invalid", 3306)
        ('db.example.com', 3306)
        >>> parse_host_port("[::1]:5432")
        ('[::1]', 5432)
        >>> parse_host_port("[::1]")
        ('[::1]', None)
    """
    # No colon means the entire string is the hostname. A value ending in "]"
    # is a bracketed IPv6 literal without a port: its colons belong to the
    # address and must not be treated as the host/port separator.
    if ":" not in host_port or host_port.endswith("]"):
        return host_port, default_port

    host, port_str = host_port.rsplit(":", 1)
    try:
        return host, int(port_str)
    except ValueError:
        # Port is not a valid integer
        return host, default_port
|
|
41
|
+
|
|
42
|
+
|
|
6
43
|
def make_sqlalchemy_uri(
|
|
7
44
|
scheme: str,
|
|
8
45
|
username: Optional[str],
|
|
@@ -14,12 +51,7 @@ def make_sqlalchemy_uri(
|
|
|
14
51
|
host: Optional[str] = None
|
|
15
52
|
port: Optional[int] = None
|
|
16
53
|
if at:
|
|
17
|
-
|
|
18
|
-
host, port_str = at.rsplit(":", 1)
|
|
19
|
-
port = int(port_str)
|
|
20
|
-
except ValueError:
|
|
21
|
-
host = at
|
|
22
|
-
port = None
|
|
54
|
+
host, port = parse_host_port(at)
|
|
23
55
|
if uri_opts:
|
|
24
56
|
uri_opts = {k: v for k, v in uri_opts.items() if v is not None}
|
|
25
57
|
|