acryl-datahub 1.0.0rc18 (py3-none-any.whl) → 1.3.0.1rc9 (py3-none-any.whl)
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -36,7 +36,10 @@ from datahub.ingestion.api.decorators import (
|
|
|
36
36
|
)
|
|
37
37
|
from datahub.ingestion.api.source import MetadataWorkUnitProcessor
|
|
38
38
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
39
|
-
from datahub.ingestion.source.common.subtypes import
|
|
39
|
+
from datahub.ingestion.source.common.subtypes import (
|
|
40
|
+
DatasetContainerSubTypes,
|
|
41
|
+
SourceCapabilityModifier,
|
|
42
|
+
)
|
|
40
43
|
from datahub.ingestion.source.schema_inference.object import (
|
|
41
44
|
SchemaDescription,
|
|
42
45
|
construct_schema,
|
|
@@ -249,6 +252,13 @@ def construct_schema_pymongo(
|
|
|
249
252
|
@support_status(SupportStatus.CERTIFIED)
|
|
250
253
|
@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
|
|
251
254
|
@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
|
|
255
|
+
@capability(
|
|
256
|
+
SourceCapability.CONTAINERS,
|
|
257
|
+
"Enabled by default",
|
|
258
|
+
subtype_modifier=[
|
|
259
|
+
SourceCapabilityModifier.DATABASE,
|
|
260
|
+
],
|
|
261
|
+
)
|
|
252
262
|
@dataclass
|
|
253
263
|
class MongoDBSource(StatefulIngestionSourceBase):
|
|
254
264
|
"""
|
|
@@ -1,74 +1,56 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
import time
|
|
3
2
|
from dataclasses import dataclass
|
|
4
|
-
from typing import
|
|
3
|
+
from typing import Dict, Iterable, List, Optional, Tuple
|
|
5
4
|
|
|
6
5
|
import pandas as pd
|
|
7
6
|
from neo4j import GraphDatabase
|
|
8
|
-
from pydantic
|
|
7
|
+
from pydantic import Field
|
|
9
8
|
|
|
10
9
|
from datahub.configuration.source_common import (
|
|
11
10
|
EnvConfigMixin,
|
|
11
|
+
PlatformInstanceConfigMixin,
|
|
12
12
|
)
|
|
13
|
-
from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
|
|
14
|
-
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
15
13
|
from datahub.ingestion.api.common import PipelineContext
|
|
16
14
|
from datahub.ingestion.api.decorators import (
|
|
17
15
|
SupportStatus,
|
|
16
|
+
capability,
|
|
18
17
|
config_class,
|
|
19
18
|
platform_name,
|
|
20
19
|
support_status,
|
|
21
20
|
)
|
|
22
21
|
from datahub.ingestion.api.source import (
|
|
23
22
|
MetadataWorkUnitProcessor,
|
|
23
|
+
SourceCapability,
|
|
24
24
|
)
|
|
25
25
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
26
26
|
from datahub.ingestion.source.common.subtypes import DatasetSubTypes
|
|
27
27
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
28
28
|
StaleEntityRemovalHandler,
|
|
29
|
+
StatefulStaleMetadataRemovalConfig,
|
|
29
30
|
)
|
|
30
31
|
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
31
32
|
StatefulIngestionConfigBase,
|
|
32
33
|
StatefulIngestionReport,
|
|
33
34
|
StatefulIngestionSourceBase,
|
|
34
35
|
)
|
|
35
|
-
from datahub.
|
|
36
|
-
from datahub.metadata.schema_classes import (
|
|
37
|
-
AuditStampClass,
|
|
38
|
-
BooleanTypeClass,
|
|
39
|
-
DatasetPropertiesClass,
|
|
40
|
-
DateTypeClass,
|
|
41
|
-
NullTypeClass,
|
|
42
|
-
NumberTypeClass,
|
|
43
|
-
OtherSchemaClass,
|
|
44
|
-
SchemaFieldClass,
|
|
45
|
-
SchemaMetadataClass,
|
|
46
|
-
StringTypeClass,
|
|
47
|
-
SubTypesClass,
|
|
48
|
-
UnionTypeClass,
|
|
49
|
-
)
|
|
36
|
+
from datahub.sdk.dataset import Dataset
|
|
50
37
|
|
|
51
38
|
log = logging.getLogger(__name__)
|
|
52
39
|
logging.basicConfig(level=logging.INFO)
|
|
53
40
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
"integer": NumberTypeClass,
|
|
58
|
-
"local_date_time": DateTypeClass,
|
|
59
|
-
"float": NumberTypeClass,
|
|
60
|
-
"string": StringTypeClass,
|
|
61
|
-
"date": DateTypeClass,
|
|
62
|
-
"node": StringTypeClass,
|
|
63
|
-
"relationship": StringTypeClass,
|
|
64
|
-
}
|
|
41
|
+
# Neo4j object types
|
|
42
|
+
_NODE = "node"
|
|
43
|
+
_RELATIONSHIP = "relationship"
|
|
65
44
|
|
|
66
45
|
|
|
67
|
-
class Neo4jConfig(
|
|
46
|
+
class Neo4jConfig(
    StatefulIngestionConfigBase,
    EnvConfigMixin,
    PlatformInstanceConfigMixin,
):
    # NOTE: documented with comments rather than a class docstring so the
    # generated config schema for this model is left untouched.

    # Credentials and endpoint handed to the Neo4j driver when the source
    # connects to the server.
    username: str = Field(description="Neo4j Username")
    password: str = Field(description="Neo4j Password")
    uri: str = Field(description="The URI for the Neo4j server")

    # Stale-metadata removal settings; unset (None) unless configured.
    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
|
|
72
54
|
|
|
73
55
|
|
|
74
56
|
@dataclass
|
|
@@ -79,114 +61,111 @@ class Neo4jSourceReport(StatefulIngestionReport):
|
|
|
79
61
|
|
|
80
62
|
@platform_name("Neo4j", id="neo4j")
|
|
81
63
|
@config_class(Neo4jConfig)
|
|
64
|
+
@capability(
|
|
65
|
+
SourceCapability.PLATFORM_INSTANCE, "Supported via the `platform_instance` config"
|
|
66
|
+
)
|
|
82
67
|
@support_status(SupportStatus.CERTIFIED)
|
|
83
68
|
class Neo4jSource(StatefulIngestionSourceBase):
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
PLATFORM = "neo4j"
|
|
69
|
+
config: Neo4jConfig
|
|
70
|
+
report: Neo4jSourceReport
|
|
87
71
|
|
|
88
|
-
def __init__(self,
|
|
72
|
+
def __init__(self, config: Neo4jConfig, ctx: PipelineContext):
    """Initialise the source from a parsed config and the pipeline context."""
    # The stateful-ingestion base class is initialised first.
    super().__init__(config, ctx)
    self.platform = "neo4j"
    self.config = config
    self.ctx = ctx
    # Each source instance accumulates its results in its own report.
    self.report = Neo4jSourceReport()
|
|
92
78
|
|
|
93
79
|
@classmethod
def create(cls, config_dict: Dict, ctx: PipelineContext) -> "Neo4jSource":
    """Build a source by parsing a raw config dict into a ``Neo4jConfig``."""
    return cls(Neo4jConfig.parse_obj(config_dict), ctx)
|
|
83
|
+
|
|
84
|
+
def create_schema_field_tuple(
    self, col_name: str, col_type: str, obj_type: Optional[str]
) -> Tuple[str, str, str]:
    """Return the (field_name, field_type, description) triple for a property.

    Args:
        col_name: Name of the Neo4j property; returned unchanged.
        col_type: Type of the property as reported for the object.
        obj_type: Type of the owning object (node or relationship), if known.
    """
    # A relationship-typed property that lives on a node is reported as a
    # node reference, so it is described as "NODE" rather than "RELATIONSHIP".
    if obj_type == _NODE and col_type == _RELATIONSHIP:
        column_type = _NODE
    else:
        column_type = col_type

    # Only the two Neo4j object types are upper-cased in the description;
    # every other type is used verbatim.
    if column_type in (_NODE, _RELATIONSHIP):
        description = column_type.upper()
    else:
        description = column_type

    return (col_name, column_type, description)
|
|
101
|
+
|
|
102
|
+
def get_subtype_from_obj_type(self, obj_type: str) -> str:
    """Translate a Neo4j object type into the matching DataHub dataset subtype."""
    if obj_type == _RELATIONSHIP:
        return DatasetSubTypes.NEO4J_RELATIONSHIP
    # Nodes, and any object type that is not recognised, map to the node subtype.
    return DatasetSubTypes.NEO4J_NODE
|
|
109
|
+
|
|
110
|
+
def create_neo4j_dataset(
    self,
    dataset: str,
    columns: list,
    obj_type: Optional[str] = None,
    description: Optional[str] = None,
) -> Optional[Dataset]:
    """Build the ``Dataset`` entity for one Neo4j object.

    Args:
        dataset: Name of the Neo4j object, used as the dataset name.
        columns: Iterable of mappings from property name to property type.
        obj_type: Neo4j object type; treated as a node when missing/empty.
        description: Optional dataset description.

    Returns:
        The constructed ``Dataset``, or ``None`` when anything fails; the
        failure is logged and recorded on the report instead of being raised.
    """
    try:
        schema_fields = []
        for property_map in columns:
            for prop_name, prop_type in property_map.items():
                # Property types are lower-cased before being classified.
                schema_fields.append(
                    self.create_schema_field_tuple(
                        col_name=prop_name,
                        col_type=prop_type.lower(),
                        obj_type=obj_type,
                    )
                )

        return Dataset(
            platform=self.platform,
            name=dataset,
            platform_instance=self.config.platform_instance,
            env=self.config.env,
            schema=schema_fields,
            subtype=self.get_subtype_from_obj_type(obj_type or _NODE),
            description=description,
        )

    except Exception as e:
        # Best effort: one bad object must not abort the whole ingestion run.
        log.error(e)
        self.report.report_failure(
            message="Failed to process dataset",
            context=dataset,
            exc=e,
        )
        return None
|
|
169
145
|
|
|
170
|
-
def get_neo4j_metadata(self, query: str) -> pd.DataFrame:
|
|
146
|
+
def get_neo4j_metadata(self, query: str) -> Optional[pd.DataFrame]:
|
|
171
147
|
driver = GraphDatabase.driver(
|
|
172
148
|
self.config.uri, auth=(self.config.username, self.config.password)
|
|
173
149
|
)
|
|
174
150
|
"""
|
|
175
|
-
This process retrieves the metadata for Neo4j objects using an APOC query,
|
|
176
|
-
with two columns: key and value. The key represents
|
|
177
|
-
corresponding metadata.
|
|
151
|
+
This process retrieves the metadata for Neo4j objects using an APOC query,
|
|
152
|
+
which returns a dictionary with two columns: key and value. The key represents
|
|
153
|
+
the Neo4j object, while the value contains the corresponding metadata.
|
|
178
154
|
|
|
179
|
-
When data is returned from Neo4j, much of the relationship metadata is stored
|
|
180
|
-
metadata. Consequently, the objects are organized
|
|
181
|
-
relationships.
|
|
155
|
+
When data is returned from Neo4j, much of the relationship metadata is stored
|
|
156
|
+
with the relevant node's metadata. Consequently, the objects are organized
|
|
157
|
+
into two separate dataframes: one for nodes and one for relationships.
|
|
182
158
|
|
|
183
|
-
In the node dataframe, several fields are extracted and added as new columns.
|
|
184
|
-
dataframe, certain fields are parsed out,
|
|
159
|
+
In the node dataframe, several fields are extracted and added as new columns.
|
|
160
|
+
Similarly, in the relationship dataframe, certain fields are parsed out,
|
|
161
|
+
while others require metadata from the nodes dataframe.
|
|
185
162
|
|
|
186
|
-
Once the data is parsed and these two dataframes are created, we combine
|
|
187
|
-
single dataframe, which will be used to
|
|
163
|
+
Once the data is parsed and these two dataframes are created, we combine
|
|
164
|
+
a subset of their columns into a single dataframe, which will be used to
|
|
165
|
+
create the DataHub objects.
|
|
188
166
|
|
|
189
|
-
See the docs for examples of metadata:
|
|
167
|
+
See the docs for examples of metadata:
|
|
168
|
+
metadata-ingestion/docs/sources/neo4j/neo4j.md
|
|
190
169
|
"""
|
|
191
170
|
try:
|
|
192
171
|
log.info(f"{query}")
|
|
@@ -201,16 +180,17 @@ class Neo4jSource(StatefulIngestionSourceBase):
|
|
|
201
180
|
|
|
202
181
|
union_cols = ["key", "obj_type", "property_data_types", "description"]
|
|
203
182
|
df = pd.concat([node_df[union_cols], rel_df[union_cols]])
|
|
183
|
+
return df
|
|
204
184
|
except Exception as e:
|
|
205
185
|
self.report.failure(
|
|
206
186
|
message="Failed to get neo4j metadata",
|
|
207
187
|
exc=e,
|
|
208
188
|
)
|
|
209
189
|
|
|
210
|
-
return
|
|
190
|
+
return None
|
|
211
191
|
|
|
212
192
|
def process_nodes(self, data: list) -> pd.DataFrame:
|
|
213
|
-
nodes = [record for record in data if record["value"]["type"] ==
|
|
193
|
+
nodes = [record for record in data if record["value"]["type"] == _NODE]
|
|
214
194
|
node_df = pd.DataFrame(
|
|
215
195
|
nodes,
|
|
216
196
|
columns=["key", "value"],
|
|
@@ -233,9 +213,7 @@ class Neo4jSource(StatefulIngestionSourceBase):
|
|
|
233
213
|
return node_df
|
|
234
214
|
|
|
235
215
|
def process_relationships(self, data: list, node_df: pd.DataFrame) -> pd.DataFrame:
|
|
236
|
-
rels = [
|
|
237
|
-
record for record in data if record["value"]["type"] == self.RELATIONSHIP
|
|
238
|
-
]
|
|
216
|
+
rels = [record for record in data if record["value"]["type"] == _RELATIONSHIP]
|
|
239
217
|
rel_df = pd.DataFrame(rels, columns=["key", "value"])
|
|
240
218
|
rel_df["obj_type"] = rel_df["value"].apply(
|
|
241
219
|
lambda record: self.get_obj_type(record)
|
|
@@ -303,49 +281,40 @@ class Neo4jSource(StatefulIngestionSourceBase):
|
|
|
303
281
|
]
|
|
304
282
|
|
|
305
283
|
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
    """Yield the work units for every Neo4j object found by the schema query.

    Runs the APOC schema query, builds one dataset per returned row and
    emits that dataset's work units. Rows that fail are counted in
    ``report.obj_failures``; successful ones in ``report.obj_created``.
    """
    query = (
        "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key "
        "RETURN key, value[key] AS value;"
    )
    df = self.get_neo4j_metadata(query)
    if df is None:
        # The metadata lookup already reported its own failure.
        log.warning("No metadata retrieved from Neo4j")
        return

    for _, row in df.iterrows():
        try:
            dataset_obj = self.create_neo4j_dataset(
                dataset=row["key"],
                columns=row["property_data_types"],
                obj_type=row["obj_type"],
                description=row["description"],
            )

            if not dataset_obj:
                log.warning(f"Failed to create dataset object for {row['key']}")
                self.report.obj_failures += 1
                continue

            yield from dataset_obj.as_workunits()
            # Counted only after all of the dataset's work units were emitted.
            self.report.obj_created += 1

        except Exception as e:
            # Keep going with the remaining rows; record what was missed.
            log.warning(f"Failed to process row {row['key']}: {str(e)}")
            self.report.report_warning(
                title="Error processing Neo4j metadata",
                message="Some entities will be missed",
                context=row["key"],
                exc=e,
            )
            self.report.obj_failures += 1
|
|
349
318
|
|
|
350
|
-
def get_report(self):
|
|
319
|
+
def get_report(self) -> "Neo4jSourceReport":
    """Return the report object this source records its results on."""
    return self.report
|
datahub/ingestion/source/nifi.py
CHANGED
|
@@ -72,7 +72,7 @@ NIFI = "nifi"
|
|
|
72
72
|
# and here - https://github.com/psf/requests/issues/1573
|
|
73
73
|
class SSLAdapter(HTTPAdapter):
|
|
74
74
|
def __init__(self, certfile, keyfile, password=None):
|
|
75
|
-
self.context = ssl.create_default_context(ssl.Purpose.
|
|
75
|
+
self.context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
|
|
76
76
|
self.context.load_cert_chain(
|
|
77
77
|
certfile=certfile, keyfile=keyfile, password=password
|
|
78
78
|
)
|
|
@@ -166,7 +166,7 @@ class NifiSourceConfig(StatefulIngestionConfigBase, EnvConfigMixin):
|
|
|
166
166
|
)
|
|
167
167
|
|
|
168
168
|
@root_validator(skip_on_failure=True)
|
|
169
|
-
def validate_auth_params(
|
|
169
|
+
def validate_auth_params(cls, values):
|
|
170
170
|
if values.get("auth") is NifiAuthType.CLIENT_CERT and not values.get(
|
|
171
171
|
"client_cert_file"
|
|
172
172
|
):
|
|
@@ -703,7 +703,7 @@ class NifiSource(StatefulIngestionSourceBase):
|
|
|
703
703
|
if (
|
|
704
704
|
component.nifi_type is NifiType.PROCESSOR
|
|
705
705
|
and component.type
|
|
706
|
-
not in NifiProcessorProvenanceEventAnalyzer.KNOWN_INGRESS_EGRESS_PROCESORS
|
|
706
|
+
not in NifiProcessorProvenanceEventAnalyzer.KNOWN_INGRESS_EGRESS_PROCESORS
|
|
707
707
|
) or component.nifi_type not in [
|
|
708
708
|
NifiType.PROCESSOR,
|
|
709
709
|
NifiType.REMOTE_INPUT_PORT,
|
|
@@ -977,7 +977,7 @@ class NifiSource(StatefulIngestionSourceBase):
|
|
|
977
977
|
)
|
|
978
978
|
|
|
979
979
|
for incoming_from in incoming:
|
|
980
|
-
if incoming_from in self.nifi_flow.remotely_accessible_ports
|
|
980
|
+
if incoming_from in self.nifi_flow.remotely_accessible_ports:
|
|
981
981
|
dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[incoming_from].name}"
|
|
982
982
|
dataset_urn = builder.make_dataset_urn(
|
|
983
983
|
NIFI, dataset_name, self.config.env
|
|
@@ -994,7 +994,7 @@ class NifiSource(StatefulIngestionSourceBase):
|
|
|
994
994
|
)
|
|
995
995
|
|
|
996
996
|
for outgoing_to in outgoing:
|
|
997
|
-
if outgoing_to in self.nifi_flow.remotely_accessible_ports
|
|
997
|
+
if outgoing_to in self.nifi_flow.remotely_accessible_ports:
|
|
998
998
|
dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[outgoing_to].name}"
|
|
999
999
|
dataset_urn = builder.make_dataset_urn(
|
|
1000
1000
|
NIFI, dataset_name, self.config.env
|