acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/mock_data/table_naming_helper.py ADDED
@@ -0,0 +1,97 @@
+from typing import Dict, Optional
+
+
+class TableNamingHelper:
+    """
+    Helper class for managing table naming conventions in mock data generation.
+
+    Table naming pattern: "hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}"
+    """
+
+    @staticmethod
+    def generate_table_name(
+        lineage_hops: int,
+        lineage_fan_out: int,
+        level: int,
+        table_index: int,
+        prefix: Optional[str] = None,
+    ) -> str:
+        """
+        Generate a table name following the standard naming convention.
+
+        Args:
+            lineage_hops: Total number of hops in the lineage graph
+            lineage_fan_out: Number of downstream tables per upstream table
+            level: Level of the table in the lineage graph (0-based)
+            table_index: Index of the table within its level (0-based)
+            prefix: Optional prefix to add to the table name
+
+        Returns:
+            Table name following the pattern: "{prefix}hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}"
+        """
+        base_name = f"hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}"
+        return f"{prefix}{base_name}" if prefix else base_name
+
+    @staticmethod
+    def parse_table_name(table_name: str) -> Dict[str, int]:
+        """
+        Parse a table name to extract its components.
+
+        Args:
+            table_name: Table name following the standard naming convention
+
+        Returns:
+            Dictionary containing parsed components:
+            - lineage_hops: Total number of hops in the lineage graph
+            - lineage_fan_out: Number of downstream tables per upstream table
+            - level: Level of the table in the lineage graph (0-based)
+            - table_index: Index of the table within its level (0-based)
+
+        Raises:
+            ValueError: If the table name doesn't follow the expected pattern
+        """
+        try:
+            # Expected pattern: "hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}"
+            parts = table_name.split("_")
+
+            if (
+                len(parts) != 6
+                or parts[0] != "hops"
+                or parts[2] != "f"
+                or not parts[4].startswith("h")
+                or not parts[5].startswith("t")
+            ):
+                raise ValueError(f"Invalid table name format: {table_name}")
+
+            lineage_hops = int(parts[1])
+            lineage_fan_out = int(parts[3])  # lineage_fan_out is at index 3
+            level = int(parts[4][1:])  # Remove 'h' prefix from parts[4]
+            table_index = int(parts[5][1:])  # Remove 't' prefix from parts[5]
+
+            return {
+                "lineage_hops": lineage_hops,
+                "lineage_fan_out": lineage_fan_out,
+                "level": level,
+                "table_index": table_index,
+            }
+        except (ValueError, IndexError) as e:
+            raise ValueError(
+                f"Failed to parse table name '{table_name}': {str(e)}"
+            ) from e
+
+    @staticmethod
+    def is_valid_table_name(table_name: str) -> bool:
+        """
+        Check if a table name follows the expected naming convention.
+
+        Args:
+            table_name: Table name to validate
+
+        Returns:
+            True if the table name follows the expected pattern, False otherwise
+        """
+        try:
+            TableNamingHelper.parse_table_name(table_name)
+            return True
+        except ValueError:
+            return False
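A quick round-trip of the new helper (a minimal sketch; the assertions just restate the pattern documented in the docstrings above, and the "myprefix_" prefix is illustrative):

name = TableNamingHelper.generate_table_name(
    lineage_hops=2, lineage_fan_out=3, level=0, table_index=1
)
assert name == "hops_2_f_3_h0_t1"
assert TableNamingHelper.parse_table_name(name) == {
    "lineage_hops": 2,
    "lineage_fan_out": 3,
    "level": 0,
    "table_index": 1,
}
# Note: prefixed names do not round-trip, because split("_") then yields
# more than 6 parts, so is_valid_table_name rejects them.
assert not TableNamingHelper.is_valid_table_name("myprefix_" + name)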
datahub/ingestion/source/mode.py CHANGED
@@ -7,7 +7,16 @@ from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import lru_cache
 from json import JSONDecodeError
-from typing import
+from typing import (
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+)
 
 import dateutil.parser as dp
 import psutil
@@ -24,7 +33,7 @@ from requests.models import HTTPBasicAuth, HTTPError
 from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
     DatasetLineageProviderConfigBase,
 )
@@ -200,10 +209,13 @@ class ModeConfig(
         default=True, description="Tag measures and dimensions in the schema"
     )
 
-    items_per_page: int = Field(
+    items_per_page: HiddenFromDocs[int] = Field(
+        DEFAULT_API_ITEMS_PER_PAGE,
         description="Number of items per page for paginated API requests.",
+    )
+
+    exclude_archived: bool = Field(
+        default=False, description="Exclude archived reports"
     )
 
     @validator("connect_uri")
@@ -1465,6 +1477,15 @@ class ModeSource(StatefulIngestionSourceBase):
            logger.debug(
                f"Read {len(reports_page)} reports records from workspace {self.workspace_uri} space {space_token}"
            )
+            if self.config.exclude_archived:
+                logger.debug(
+                    f"Excluding archived reports since exclude_archived: {self.config.exclude_archived}"
+                )
+                reports_page = [
+                    report
+                    for report in reports_page
+                    if not report.get("archived", False)
+                ]
            yield reports_page
        except ModeRequestError as e:
            if isinstance(e, HTTPError) and e.response.status_code == 404:
datahub/ingestion/source/mongodb.py CHANGED
@@ -36,7 +36,10 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.common.subtypes import
+from datahub.ingestion.source.common.subtypes import (
+    DatasetContainerSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.schema_inference.object import (
     SchemaDescription,
     construct_schema,
@@ -249,6 +252,13 @@ def construct_schema_pymongo(
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.DATABASE,
+    ],
+)
 @dataclass
 class MongoDBSource(StatefulIngestionSourceBase):
     """
datahub/ingestion/source/neo4j/neo4j_source.py CHANGED
@@ -1,7 +1,6 @@
 import logging
-import time
 from dataclasses import dataclass
-from typing import
+from typing import Dict, Iterable, List, Optional, Tuple
 
 import pandas as pd
 from neo4j import GraphDatabase
@@ -11,11 +10,6 @@ from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 )
-from datahub.emitter.mce_builder import (
-    make_data_platform_urn,
-    make_dataset_urn_with_platform_instance,
-)
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -28,7 +22,6 @@ from datahub.ingestion.api.source import (
     MetadataWorkUnitProcessor,
     SourceCapability,
 )
-from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -40,36 +33,14 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionReport,
     StatefulIngestionSourceBase,
 )
-from datahub.
-from datahub.metadata.schema_classes import (
-    AuditStampClass,
-    BooleanTypeClass,
-    DatasetPropertiesClass,
-    DateTypeClass,
-    NullTypeClass,
-    NumberTypeClass,
-    OtherSchemaClass,
-    SchemaFieldClass,
-    SchemaMetadataClass,
-    StringTypeClass,
-    SubTypesClass,
-    UnionTypeClass,
-)
+from datahub.sdk.dataset import Dataset
 
 log = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 
-    "integer": NumberTypeClass,
-    "local_date_time": DateTypeClass,
-    "float": NumberTypeClass,
-    "string": StringTypeClass,
-    "date": DateTypeClass,
-    "node": StringTypeClass,
-    "relationship": StringTypeClass,
-}
+# Neo4j object types
+_NODE = "node"
+_RELATIONSHIP = "relationship"
 
 
 class Neo4jConfig(
@@ -78,7 +49,6 @@ class Neo4jConfig(
     username: str = Field(description="Neo4j Username")
     password: str = Field(description="Neo4j Password")
     uri: str = Field(description="The URI for the Neo4j server")
-    env: str = Field(description="Neo4j env")
 
     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
 
@@ -96,8 +66,6 @@ class Neo4jSourceReport(StatefulIngestionReport):
 )
 @support_status(SupportStatus.CERTIFIED)
 class Neo4jSource(StatefulIngestionSourceBase):
-    NODE = "node"
-    RELATIONSHIP = "relationship"
     config: Neo4jConfig
     report: Neo4jSourceReport
 
@@ -113,78 +81,59 @@ class Neo4jSource(StatefulIngestionSourceBase):
         config = Neo4jConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
-    def
-                nativeDataType=col_type,
-                description=col_type.upper()
-                if col_type in (self.NODE, self.RELATIONSHIP)
-                else col_type,
-                lastModified=AuditStampClass(
-                    time=round(time.time() * 1000), actor="urn:li:corpuser:ingestion"
-                ),
+    def create_schema_field_tuple(
+        self, col_name: str, col_type: str, obj_type: Optional[str]
+    ) -> Tuple[str, str, str]:
+        """Convert Neo4j property to (field_name, field_type, description) tuple."""
+        # Special case: when a node has a relationship-typed property, treat it as a node reference
+        # This ensures relationship properties within nodes are described as "NODE" rather than "RELATIONSHIP"
+        column_type = (
+            _NODE if obj_type == _NODE and col_type == _RELATIONSHIP else col_type
+        )
+
+        description = (
+            column_type.upper()
+            if column_type in (_NODE, _RELATIONSHIP)
+            else column_type
         )
 
+        return (col_name, column_type, description)
+
+    def get_subtype_from_obj_type(self, obj_type: str) -> str:
+        """Map Neo4j object type to DataHub subtype."""
+        if obj_type == _NODE:
+            return DatasetSubTypes.NEO4J_NODE
+        elif obj_type == _RELATIONSHIP:
+            return DatasetSubTypes.NEO4J_RELATIONSHIP
+        return DatasetSubTypes.NEO4J_NODE  # default fallback
+
+    def create_neo4j_dataset(
         self,
         dataset: str,
+        columns: list,
+        obj_type: Optional[str] = None,
         description: Optional[str] = None,
-        dataset_properties = DatasetPropertiesClass(
-            description=description,
-            customProperties=custom_properties,
-        )
-        yield MetadataChangeProposalWrapper(
-            entityUrn=make_dataset_urn_with_platform_instance(
-                platform=self.platform,
-                name=dataset,
-                platform_instance=self.config.platform_instance,
-                env=self.config.env,
-            ),
-            aspect=dataset_properties,
-        ).as_workunit()
-
-    def generate_neo4j_object(
-        self, dataset: str, columns: list, obj_type: Optional[str] = None
-    ) -> Optional[MetadataChangeProposalWrapper]:
+    ) -> Optional[Dataset]:
+        """Create Dataset entity with Neo4j schema and metadata."""
        try:
-            self.
+            schema_fields = [
+                self.create_schema_field_tuple(
+                    col_name=key, col_type=value.lower(), obj_type=obj_type
+                )
                for d in columns
                for key, value in d.items()
            ]
-                platform=make_data_platform_urn(self.platform),
-                version=0,
-                hash="",
-                platformSchema=OtherSchemaClass(rawSchema=""),
-                lastModified=AuditStampClass(
-                    time=round(time.time() * 1000),
-                    actor="urn:li:corpuser:ingestion",
-                ),
-                fields=fields,
-            ),
+
+            return Dataset(
+                platform=self.platform,
+                name=dataset,
+                platform_instance=self.config.platform_instance,
+                env=self.config.env,
+                schema=schema_fields,
+                subtype=self.get_subtype_from_obj_type(obj_type or _NODE),
+                description=description,
            )
+
        except Exception as e:
            log.error(e)
            self.report.report_failure(
@@ -199,21 +148,24 @@ class Neo4jSource(StatefulIngestionSourceBase):
             self.config.uri, auth=(self.config.username, self.config.password)
         )
         """
-        This process retrieves the metadata for Neo4j objects using an APOC query,
-        with two columns: key and value. The key represents
-        corresponding metadata.
+        This process retrieves the metadata for Neo4j objects using an APOC query,
+        which returns a dictionary with two columns: key and value. The key represents
+        the Neo4j object, while the value contains the corresponding metadata.
 
-        When data is returned from Neo4j, much of the relationship metadata is stored
-        metadata. Consequently, the objects are organized
-        relationships.
+        When data is returned from Neo4j, much of the relationship metadata is stored
+        with the relevant node's metadata. Consequently, the objects are organized
+        into two separate dataframes: one for nodes and one for relationships.
 
-        In the node dataframe, several fields are extracted and added as new columns.
-        dataframe, certain fields are parsed out,
+        In the node dataframe, several fields are extracted and added as new columns.
+        Similarly, in the relationship dataframe, certain fields are parsed out,
+        while others require metadata from the nodes dataframe.
 
-        Once the data is parsed and these two dataframes are created, we combine
-        single dataframe, which will be used to
+        Once the data is parsed and these two dataframes are created, we combine
+        a subset of their columns into a single dataframe, which will be used to
+        create the DataHub objects.
 
-        See the docs for examples of metadata:
+        See the docs for examples of metadata:
+        metadata-ingestion/docs/sources/neo4j/neo4j.md
         """
         try:
             log.info(f"{query}")
@@ -238,7 +190,7 @@ class Neo4jSource(StatefulIngestionSourceBase):
         return None
 
     def process_nodes(self, data: list) -> pd.DataFrame:
-        nodes = [record for record in data if record["value"]["type"] == self.NODE]
+        nodes = [record for record in data if record["value"]["type"] == _NODE]
         node_df = pd.DataFrame(
             nodes,
             columns=["key", "value"],
@@ -261,9 +213,7 @@ class Neo4jSource(StatefulIngestionSourceBase):
         return node_df
 
     def process_relationships(self, data: list, node_df: pd.DataFrame) -> pd.DataFrame:
-        rels = [
-            record for record in data if record["value"]["type"] == self.RELATIONSHIP
-        ]
+        rels = [record for record in data if record["value"]["type"] == _RELATIONSHIP]
         rel_df = pd.DataFrame(rels, columns=["key", "value"])
         rel_df["obj_type"] = rel_df["value"].apply(
             lambda record: self.get_obj_type(record)
@@ -331,51 +281,40 @@ class Neo4jSource(StatefulIngestionSourceBase):
     ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-            "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key
+        query = (
+            "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key "
+            "RETURN key, value[key] AS value;"
         )
+        df = self.get_neo4j_metadata(query)
        if df is None:
            log.warning("No metadata retrieved from Neo4j")
            return
 
        for _, row in df.iterrows():
            try:
-                columns=row["property_data_types"],
-                dataset=row["key"],
-                )
-                if neo4j_obj:
-                    yield from auto_workunit([neo4j_obj])
-
-                yield MetadataChangeProposalWrapper(
-                    entityUrn=make_dataset_urn_with_platform_instance(
-                        platform=self.platform,
-                        name=row["key"],
-                        platform_instance=self.config.platform_instance,
-                        env=self.config.env,
-                    ),
-                    aspect=SubTypesClass(
-                        typeNames=[
-                            DatasetSubTypes.NEO4J_NODE
-                            if row["obj_type"] == self.NODE
-                            else DatasetSubTypes.NEO4J_RELATIONSHIP
-                        ]
-                    ),
-                ).as_workunit()
-
-                yield from self.add_properties(
+                dataset_obj = self.create_neo4j_dataset(
                    dataset=row["key"],
+                    columns=row["property_data_types"],
+                    obj_type=row["obj_type"],
                    description=row["description"],
                )
 
+                if dataset_obj:
+                    yield from dataset_obj.as_workunits()
+                    self.report.obj_created += 1
+                else:
+                    log.warning(f"Failed to create dataset object for {row['key']}")
+                    self.report.obj_failures += 1
+
            except Exception as e:
-                log.
-                self.report.
+                log.warning(f"Failed to process row {row['key']}: {str(e)}")
+                self.report.report_warning(
+                    title="Error processing Neo4j metadata",
+                    message="Some entities will be missed",
                    context=row["key"],
                    exc=e,
                )
+                self.report.obj_failures += 1
 
     def get_report(self) -> "Neo4jSourceReport":
         return self.report
datahub/ingestion/source/nifi.py CHANGED
@@ -72,7 +72,7 @@ NIFI = "nifi"
 # and here - https://github.com/psf/requests/issues/1573
 class SSLAdapter(HTTPAdapter):
     def __init__(self, certfile, keyfile, password=None):
-        self.context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
+        self.context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
         self.context.load_cert_chain(
             certfile=certfile, keyfile=keyfile, password=password
         )
@@ -166,7 +166,7 @@ class NifiSourceConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     )
 
     @root_validator(skip_on_failure=True)
-    def validate_auth_params(
+    def validate_auth_params(cls, values):
         if values.get("auth") is NifiAuthType.CLIENT_CERT and not values.get(
             "client_cert_file"
         ):
datahub/ingestion/source/openapi.py CHANGED
@@ -333,7 +333,7 @@ class APISource(Source, ABC):
                 ),
             )
             yield wu
-        elif endpoint_dets["method"] != "
+        elif endpoint_dets["method"] != "GET":
             self.report.report_warning(
                 title="Failed to Extract Endpoint Metadata",
                 message=f"No example provided for {endpoint_dets['method']}",
datahub/ingestion/source/powerbi/config.py CHANGED
@@ -4,11 +4,10 @@ from enum import Enum
 from typing import Dict, List, Literal, Optional, Union
 
 import pydantic
-from pydantic import validator
-from pydantic.class_validators import root_validator
+from pydantic import root_validator, validator
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.ingestion.api.incremental_lineage_helper import (
@@ -291,22 +290,18 @@ class PowerBiProfilingConfig(ConfigModel):
 class PowerBiDashboardSourceConfig(
     StatefulIngestionConfigBase, DatasetSourceConfigMixin, IncrementalLineageConfigMixin
 ):
-    platform_name: str = pydantic.Field(
-        default=Constant.PLATFORM_NAME, hidden_from_docs=True
-    )
+    platform_name: HiddenFromDocs[str] = pydantic.Field(default=Constant.PLATFORM_NAME)
 
-    platform_urn: str = pydantic.Field(
+    platform_urn: HiddenFromDocs[str] = pydantic.Field(
         default=builder.make_data_platform_urn(platform=Constant.PLATFORM_NAME),
-        hidden_from_docs=True,
     )
 
     # Organization Identifier
     tenant_id: str = pydantic.Field(description="PowerBI tenant identifier")
     # PowerBi workspace identifier
-    workspace_id: Optional[str] = pydantic.Field(
+    workspace_id: HiddenFromDocs[Optional[str]] = pydantic.Field(
         default=None,
         description="[deprecated] Use workspace_id_pattern instead",
-        hidden_from_docs=True,
     )
     # PowerBi workspace identifier
     workspace_id_pattern: AllowDenyPattern = pydantic.Field(
@@ -326,15 +321,14 @@ class PowerBiDashboardSourceConfig(
     # Dataset type mapping PowerBI support many type of data-sources. Here user needs to define what type of PowerBI
     # DataSource needs to be mapped to corresponding DataHub Platform DataSource. For example, PowerBI `Snowflake` is
     # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on.
-    dataset_type_mapping:
-    )
+    dataset_type_mapping: HiddenFromDocs[
+        Union[Dict[str, str], Dict[str, PlatformDetail]]
+    ] = pydantic.Field(
+        default_factory=default_for_dataset_type_mapping,
+        description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
+        "DataHub supported datasources."
+        "You can configured platform instance for dataset lineage. "
+        "See Quickstart Recipe for mapping",
     )
     # PowerBI datasource's server to platform instance mapping
     server_to_platform_instance: Dict[
@@ -353,6 +347,19 @@ class PowerBiDashboardSourceConfig(
         "For example with an ODBC connection string 'DSN=database' where the database type "
         "is 'PostgreSQL' you would configure the mapping as 'database: postgres'.",
     )
+    # ODBC DSN to database (or database.schema) mapping
+    dsn_to_database_schema: Dict[str, str] = pydantic.Field(
+        default={},
+        description="A mapping of ODBC DSN to database names with optional schema names "
+        "(some database platforms such a MySQL use the table name pattern 'database.table', "
+        "while others use the pattern 'database.schema.table'). "
+        "This mapping is used in conjunction with ODBC SQL query parsing. "
+        "If SQL queries used with ODBC do not reference fully qualified tables names, "
+        "then you should configure mappings for your DSNs. "
+        "For example with an ODBC connection string 'DSN=database' where the database "
+        "is 'prod' you would configure the mapping as 'database: prod'. "
+        "If the database is 'prod' and the schema is 'data' then mapping would be 'database: prod.data'.",
+    )
     # deprecated warning
     _dataset_type_mapping = pydantic_field_deprecated(
         "dataset_type_mapping",
@@ -528,10 +535,9 @@ class PowerBiDashboardSourceConfig(
         "Increase this value if you encounter the 'M-Query Parsing Timeout' message in the connector report.",
     )
 
-    metadata_api_timeout: int = pydantic.Field(
+    metadata_api_timeout: HiddenFromDocs[int] = pydantic.Field(
         default=30,
         description="timeout in seconds for Metadata Rest Api.",
-        hidden_from_docs=True,
     )
 
     @root_validator(skip_on_failure=True)
@@ -614,3 +620,23 @@ class PowerBiDashboardSourceConfig(
             "Please use `extract_dataset_schema: true`, otherwise dataset schema extraction will be skipped."
         )
         return values
+
+    @root_validator(skip_on_failure=True)
+    def validate_dsn_to_database_schema(cls, values: Dict) -> Dict:
+        if values.get("dsn_to_database_schema") is not None:
+            dsn_mapping = values.get("dsn_to_database_schema")
+            if not isinstance(dsn_mapping, dict):
+                raise ValueError("dsn_to_database_schema must contain key-value pairs")
+
+            for _key, value in dsn_mapping.items():
+                if not isinstance(value, str):
+                    raise ValueError(
+                        "dsn_to_database_schema mapping values must be strings"
+                    )
+                parts = value.split(".")
+                if len(parts) != 1 and len(parts) != 2:
+                    raise ValueError(
+                        f"dsn_to_database_schema invalid mapping value: {value}"
+                    )
+
+        return values
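The new dsn_to_database_schema validator accepts mapping values with exactly one or two dot-separated parts (database, or database.schema) and rejects anything else. A small sketch with hypothetical DSN names, following the examples in the field description:

dsn_to_database_schema = {
    "sales_dsn": "prod",         # DSN resolves to database 'prod'
    "finance_dsn": "prod.data",  # DSN resolves to database 'prod', schema 'data'
}

# Mirrors the validator's check: split on "." and allow exactly 1 or 2 parts.
for value in dsn_to_database_schema.values():
    assert len(value.split(".")) in (1, 2)

# A three-part value such as "prod.data.extra" would raise
# "dsn_to_database_schema invalid mapping value: prod.data.extra".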