acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snaplogic/snaplogic.py
@@ -0,0 +1,355 @@
+from typing import Iterable, List, Optional
+
+from datahub.emitter.mce_builder import (
+    make_data_flow_urn,
+    make_data_job_urn,
+    make_data_platform_urn,
+    make_dataset_urn_with_platform_instance,
+    make_schema_field_urn,
+)
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SupportStatus,
+    capability,
+    config_class,
+    platform_name,
+    support_status,
+)
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceCapability,
+    SourceReport,
+)
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.graph.client import DataHubGraph
+from datahub.ingestion.source.snaplogic.snaplogic_config import SnaplogicConfig
+from datahub.ingestion.source.snaplogic.snaplogic_lineage_extractor import (
+    SnaplogicLineageExtractor,
+)
+from datahub.ingestion.source.snaplogic.snaplogic_parser import (
+    ColumnMapping,
+    Dataset,
+    SnapLogicParser,
+)
+from datahub.ingestion.source.snaplogic.snaplogic_utils import SnaplogicUtils
+from datahub.ingestion.source.state.redundant_run_skip_handler import (
+    RedundantLineageRunSkipHandler,
+)
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionSourceBase,
+)
+from datahub.metadata.schema_classes import (
+    DataFlowInfoClass,
+    DataJobInfoClass,
+    DataJobInputOutputClass,
+    DatasetPropertiesClass,
+    FineGrainedLineageClass,
+    FineGrainedLineageDownstreamTypeClass,
+    OtherSchemaClass,
+    SchemaFieldClass,
+    SchemaMetadataClass,
+)
+
+
+@platform_name("SnapLogic")
+@config_class(SnaplogicConfig)
+@support_status(SupportStatus.TESTING)
+@capability(
+    SourceCapability.PLATFORM_INSTANCE,
+    "SnapLogic does not support platform instances",
+    supported=False,
+)
+@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+@capability(SourceCapability.LINEAGE_FINE, "Enabled by default")
+@capability(SourceCapability.DELETION_DETECTION, "Not supported yet", supported=False)
+class SnaplogicSource(StatefulIngestionSourceBase):
+    """
+    A source plugin for ingesting lineage and metadata from SnapLogic.
+    """
+
+    def __init__(self, config: SnaplogicConfig, ctx: PipelineContext):
+        super().__init__(config, ctx)
+        self.config = config
+        self.report = StaleEntityRemovalSourceReport()
+        self.graph: Optional[DataHubGraph] = ctx.graph
+        self.snaplogic_parser = SnapLogicParser(
+            config.case_insensitive_namespaces, self.config.namespace_mapping
+        )
+        self.redundant_lineage_run_skip_handler: Optional[
+            RedundantLineageRunSkipHandler
+        ] = None
+        if self.config.enable_stateful_lineage_ingestion:
+            self.redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler(
+                source=self,
+                config=self.config,
+                pipeline_name=ctx.pipeline_name,
+                run_id=ctx.run_id,
+            )
+        self.snaplogic_lineage_extractor = SnaplogicLineageExtractor(
+            config=config,
+            redundant_run_skip_handler=self.redundant_lineage_run_skip_handler,
+            report=self.report,
+        )
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        try:
+            self.report.info(
+                message="Starting lineage ingestion from SnapLogic",
+                title="Lineage Ingestion",
+            )
+
+            records_processed = 0
+            for lineage in self.snaplogic_lineage_extractor.get_lineages():
+                try:
+                    for workunit in self._process_lineage_record(lineage):
+                        yield workunit
+                    records_processed += 1
+
+                    if records_processed % 20 == 0:
+                        self.report.info(
+                            message=f"Processed {records_processed} lineage records",
+                            title="Lineage Ingestion Progress",
+                        )
+                except Exception as e:
+                    self.report.report_failure(
+                        message="Failed to process lineage record",
+                        context=str(lineage),
+                        exc=e,
+                    )
+            self.report.info(
+                message=f"Completed processing {records_processed} lineage records",
+                title="Lineage Ingestion Complete",
+            )
+            self.snaplogic_lineage_extractor.report_status("lineage_ingestion", True)
+            self.snaplogic_lineage_extractor.update_stats()
+        except Exception as e:
+            self.report.report_failure(message="Failed to fetch lineages", exc=e)
+            self.snaplogic_lineage_extractor.report_status("lineage_ingestion", False)
+
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
+    def _process_lineage_record(self, lineage: dict) -> Iterable[MetadataWorkUnit]:
+        """Process a lineage record to create pipeline and task workunits with relationships."""
+        producer = lineage.get("producer")
+        if not producer:
+            return
+        pipeline_snode_id = producer.split("#pipe_snode=")[1]
+        if not pipeline_snode_id:
+            return
+        datasets = self.snaplogic_parser.extract_datasets_from_lineage(lineage)
+        pipeline = self.snaplogic_parser.extract_pipeline_from_lineage(lineage)
+        task = self.snaplogic_parser.extract_task_from_lineage(lineage)
+        columns_mapping = self.snaplogic_parser.extract_columns_mapping_from_lineage(
+            lineage
+        )
+
+        # Create pipeline MCP
+        for pipeline_workunit in self.create_pipeline_mcp(
+            name=pipeline.name,
+            pipeline_snode_id=pipeline.id,
+            namespace=pipeline.namespace,
+        ):
+            self.report.report_workunit(pipeline_workunit)
+            yield pipeline_workunit
+
+        # Create dataset MCP
+        for dataset in datasets:
+            for dataset_workunit in self.create_dataset_mcp(
+                dataset_name=dataset.name,
+                dataset_display_name=dataset.display_name,
+                fields=dataset.fields,
+                platform=dataset.platform,
+                platform_instance=dataset.platform_instance,
+            ):
+                self.report.report_workunit(dataset_workunit)
+                yield dataset_workunit
+
+        # Create task MCP
+        for task_workunit in self.create_task_mcp(
+            name=task.name,
+            task_id=task.id,
+            namespace=task.namespace,
+            pipeline_snode_id=pipeline_snode_id,
+            input_datasets=[dataset for dataset in datasets if dataset.type == "INPUT"],
+            output_datasets=[
+                dataset for dataset in datasets if dataset.type == "OUTPUT"
+            ],
+            columns_mapping=columns_mapping,
+        ):
+            self.report.report_workunit(task_workunit)
+            yield task_workunit
+
+    def create_task_mcp(
+        self,
+        task_id: str,
+        name: str,
+        namespace: str,
+        pipeline_snode_id: str,
+        input_datasets: list[Dataset],
+        output_datasets: list[Dataset],
+        columns_mapping: list[ColumnMapping],
+    ) -> Iterable[MetadataWorkUnit]:
+        """Create MCPs for a task (snap) including metadata and lineage."""
+        job_urn = make_data_job_urn(
+            orchestrator=namespace,
+            flow_id=pipeline_snode_id,
+            job_id=task_id,
+            cluster="PROD",
+        )
+        yield MetadataChangeProposalWrapper(
+            entityUrn=job_urn,
+            aspect=DataJobInfoClass(
+                name=name,
+                description="",
+                externalUrl=f"{self.config.base_url}/sl/designer.html?v=21818#pipe_snode={pipeline_snode_id}",
+                type="SNAPLOGIC_SNAP",
+            ),
+        ).as_workunit()
+
+        # Helper functions
+        def dataset_urn(d: Dataset) -> str:
+            return make_dataset_urn_with_platform_instance(
+                d.platform, d.name, d.platform_instance
+            )
+
+        def field_urn(d, f):
+            return make_schema_field_urn(dataset_urn(d), f["name"])
+
+        # Emit lineage
+        yield MetadataChangeProposalWrapper(
+            entityUrn=job_urn,
+            aspect=DataJobInputOutputClass(
+                inputDatasets=[dataset_urn(d) for d in input_datasets],
+                outputDatasets=[dataset_urn(d) for d in output_datasets],
+                inputDatasetFields=[
+                    field_urn(d, f) for d in input_datasets for f in d.fields
+                ],
+                outputDatasetFields=[
+                    field_urn(d, f) for d in output_datasets for f in d.fields
+                ],
+                fineGrainedLineages=[
+                    FineGrainedLineageClass(
+                        upstreamType=FineGrainedLineageDownstreamTypeClass.FIELD_SET,
+                        upstreams=[
+                            make_schema_field_urn(
+                                make_dataset_urn_with_platform_instance(
+                                    cl.input_dataset.platform,
+                                    cl.input_dataset.name,
+                                    cl.input_dataset.platform_instance,
+                                    cl.input_dataset.env,
+                                ),
+                                cl.input_field,
+                            )
+                        ],
+                        downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD_SET,
+                        downstreams=[
+                            make_schema_field_urn(
+                                make_dataset_urn_with_platform_instance(
+                                    cl.output_dataset.platform,
+                                    cl.output_dataset.name,
+                                    cl.output_dataset.platform_instance,
+                                    cl.output_dataset.env,
+                                ),
+                                cl.output_field,
+                            )
+                        ],
+                    )
+                    for cl in columns_mapping
+                ],
+            ),
+        ).as_workunit()
+
+    def create_dataset_mcp(
+        self,
+        dataset_name: str,
+        dataset_display_name: str,
+        fields: list[dict],
+        platform: str = "snaplogic",
+        env: str = "PROD",
+        platform_instance: Optional[str] = None,
+    ) -> Iterable[MetadataWorkUnit]:
+        dataset_urn = make_dataset_urn_with_platform_instance(
+            platform=platform,
+            name=dataset_name,
+            env=env,
+            platform_instance=platform_instance,
+        )
+
+        # Skip dataset creation if:
+        # 1. The platform is not "snaplogic" AND
+        # 2. Either:
+        #    a) The config `create_non_snaplogic_datasets` is disabled (False), meaning
+        #       we do not create datasets for non-snaplogic platforms, OR
+        #    b) The dataset already exists in DataHub (`self.graph.exists(dataset_urn)`).
+        if platform != "snaplogic" and (
+            not self.config.create_non_snaplogic_datasets
+            or (self.graph and self.graph.exists(dataset_urn))
+        ):
+            return
+
+        dataset_properties = DatasetPropertiesClass(
+            name=dataset_display_name,
+            qualifiedName=dataset_name,
+        )
+        schema_fields = [
+            SchemaFieldClass(
+                fieldPath=field["name"],
+                type=SnaplogicUtils.get_datahub_type(field.get("type", "Varchar")),
+                nativeDataType=field.get("type", "Varchar"),
+            )
+            for field in fields
+        ]
+        schema_metadata = SchemaMetadataClass(
+            schemaName=dataset_name,
+            platform=make_data_platform_urn(platform),
+            version=0,
+            hash="",
+            platformSchema=OtherSchemaClass(rawSchema=""),
+            fields=schema_fields,
+        )
+
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn, aspect=dataset_properties
+        ).as_workunit()
+
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn, aspect=schema_metadata
+        ).as_workunit()
+
+    def create_pipeline_mcp(
+        self, name: str, namespace: str, pipeline_snode_id: str
+    ) -> Iterable[MetadataWorkUnit]:
+        flow_urn = make_data_flow_urn(
+            orchestrator=namespace, flow_id=pipeline_snode_id, cluster="PROD"
+        )
+
+        yield MetadataChangeProposalWrapper(
+            entityUrn=flow_urn,
+            aspect=DataFlowInfoClass(
+                name=name,
+                description="",
+                externalUrl=f"{self.config.base_url}/sl/designer.html?v=21818#pipe_snode={pipeline_snode_id}",
+            ),
+        ).as_workunit()
+
+    def get_report(self) -> SourceReport:
+        return self.report
+
+    def close(self) -> None:
+        super().close()
+
+    @classmethod
+    def create(cls, config_dict: dict, ctx: PipelineContext) -> "SnaplogicSource":
+        config = SnaplogicConfig.parse_obj(config_dict)
+        return cls(config, ctx)
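The new SnapLogic source follows the standard DataHub source lifecycle: `create()` parses the config dict, and `get_workunits_internal()` streams one work unit per emitted aspect. For orientation, a rough sketch of driving it directly (placeholder credentials; a reachable SnapLogic org is assumed; this snippet is not part of the diff):

    from datahub.ingestion.api.common import PipelineContext
    from datahub.ingestion.source.snaplogic.snaplogic import SnaplogicSource

    # Placeholder values for illustration only.
    config_dict = {
        "username": "user@example.com",
        "password": "secret",
        "org_name": "my-org",
    }
    source = SnaplogicSource.create(
        config_dict, PipelineContext(run_id="snaplogic-demo")
    )
    for wu in source.get_workunits_internal():
        # Each work unit wraps an MCP for a pipeline (DataFlow),
        # a snap (DataJob), or a dataset.
        print(wu.id)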
datahub/ingestion/source/snaplogic/snaplogic_config.py
@@ -0,0 +1,37 @@
+from typing import Optional
+
+from pydantic import Field, SecretStr
+
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StatefulIngestionConfigBase,
+    StatefulStaleMetadataRemovalConfig,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulLineageConfigMixin,
+    StatefulUsageConfigMixin,
+)
+
+
+class SnaplogicConfig(
+    StatefulIngestionConfigBase, StatefulLineageConfigMixin, StatefulUsageConfigMixin
+):
+    platform: str = "SnapLogic"
+    username: str = Field(description="Username")
+    password: SecretStr = Field(description="Password")
+    base_url: str = Field(
+        default="https://elastic.snaplogic.com",
+        description="Url to your SnapLogic instance: `https://elastic.snaplogic.com`, or similar. Used for making API calls to SnapLogic.",
+    )
+    org_name: str = Field(description="Organization name from SnapLogic instance")
+    namespace_mapping: dict = Field(
+        default={}, description="Mapping of namespaces to platform instances"
+    )
+    case_insensitive_namespaces: list = Field(
+        default=[],
+        description="List of namespaces that should be treated as case insensitive",
+    )
+    create_non_snaplogic_datasets: bool = Field(
+        default=False,
+        description="Whether to create datasets for non-SnapLogic datasets (e.g., databases, S3, etc.)",
+    )
+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
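Since `SnaplogicConfig` is a pydantic model, a recipe's `source.config` block maps directly onto these fields; only `username`, `password`, and `org_name` lack defaults. A minimal sketch with placeholder values:

    from datahub.ingestion.source.snaplogic.snaplogic_config import SnaplogicConfig

    config = SnaplogicConfig.parse_obj(
        {
            "username": "user@example.com",  # placeholder
            "password": "secret",  # stored as a pydantic SecretStr
            "org_name": "my-org",
            "namespace_mapping": {"sqlserver://db-host": "prod-instance"},
            "case_insensitive_namespaces": ["sqlserver://db-host"],
        }
    )
    assert config.base_url == "https://elastic.snaplogic.com"  # default
    assert config.password.get_secret_value() == "secret"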
datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py
@@ -0,0 +1,107 @@
+from datetime import datetime
+from typing import Iterable, Optional, Tuple
+
+import requests
+
+from datahub.ingestion.api.source import (
+    SourceReport,
+)
+from datahub.ingestion.source.snaplogic.snaplogic_config import SnaplogicConfig
+from datahub.ingestion.source.state.redundant_run_skip_handler import (
+    RedundantLineageRunSkipHandler,
+)
+
+
+class SnaplogicLineageExtractor:
+    """
+    A class to interact with the SnapLogic API.
+    """
+
+    def __init__(
+        self,
+        config: SnaplogicConfig,
+        redundant_run_skip_handler: Optional[RedundantLineageRunSkipHandler],
+        report: SourceReport,
+    ):
+        self.config = config
+        self.report = report
+        self.redundant_run_skip_handler = redundant_run_skip_handler
+        self.start_time, self.end_time = self._get_time_window()
+
+    def get_lineages(self) -> Iterable[dict]:
+        """Generator function that yields lineage records one at a time as they are fetched."""
+        page = 0
+        has_more = True
+        records_processed = 0
+
+        try:
+            while has_more:
+                params = {
+                    "format": "OPENLINEAGE",
+                    "start_ts": str(int(self.start_time.timestamp() * 1000)),
+                    "end_ts": str(int(self.end_time.timestamp() * 1000)),
+                    "page": str(page),
+                }
+
+                self.report.info(
+                    message=f"Fetching lineage data - page: {page}, start_ts: {self.start_time}, end_ts: {self.end_time}",
+                    title="Lineage Fetch",
+                )
+                headers = {"User-Agent": "datahub-connector/1.0"}
+                response = requests.get(
+                    url=f"{self.config.base_url}/api/1/rest/public/catalog/{self.config.org_name}/lineage",
+                    params=params,
+                    headers=headers,
+                    auth=(
+                        self.config.username,
+                        self.config.password.get_secret_value(),
+                    ),
+                )
+                response.raise_for_status()
+
+                data = response.json()
+                content = data["content"]
+
+                # Yield records one at a time
+                for record in content:
+                    records_processed += 1
+                    yield record
+
+                # Check if we need to fetch more pages
+                has_more = (
+                    len(content) >= 20
+                )  # If we got full page size, there might be more
+                page += 1
+
+            self.report.info(
+                message=f"Completed fetching lineage data. Total records processed: {records_processed}",
+                title="Lineage Fetch Complete",
+            )
+
+        except Exception as e:
+            self.report.report_failure(
+                message="Error fetching lineage data",
+                exc=e,
+                title="Lineage Fetch Error",
+            )
+            raise
+
+    def _get_time_window(self) -> Tuple[datetime, datetime]:
+        if self.redundant_run_skip_handler:
+            return self.redundant_run_skip_handler.suggest_run_time_window(
+                self.config.start_time, self.config.end_time
+            )
+        else:
+            return self.config.start_time, self.config.end_time
+
+    def update_stats(self):
+        if self.redundant_run_skip_handler:
+            # Update the checkpoint state for this run.
+            self.redundant_run_skip_handler.update_state(
+                self.config.start_time,
+                self.config.end_time,
+            )
+
+    def report_status(self, step: str, status: bool) -> None:
+        if self.redundant_run_skip_handler:
+            self.redundant_run_skip_handler.report_current_run_status(step, status)
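The extractor pages through `/api/1/rest/public/catalog/<org_name>/lineage` with millisecond-epoch bounds and keeps fetching while a page returns at least 20 records (the assumed server page size). A small sketch of the request parameters it builds, for a hypothetical one-day window:

    from datetime import datetime, timezone

    start_time = datetime(2024, 1, 1, tzinfo=timezone.utc)  # hypothetical window
    end_time = datetime(2024, 1, 2, tzinfo=timezone.utc)
    params = {
        "format": "OPENLINEAGE",
        "start_ts": str(int(start_time.timestamp() * 1000)),  # "1704067200000"
        "end_ts": str(int(end_time.timestamp() * 1000)),  # "1704153600000"
        "page": "0",  # incremented until a short page comes back
    }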
datahub/ingestion/source/snaplogic/snaplogic_parser.py
@@ -0,0 +1,168 @@
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+
+
+@dataclass
+class Dataset:
+    name: str
+    display_name: str
+    fields: List[Dict] = field(default_factory=list)
+    platform: str = "snaplogic"
+    platform_instance: Optional[str] = None
+    type: Optional[str] = None  # INPUT or OUTPUT
+    env: str = "PROD"
+
+
+@dataclass
+class Pipeline:
+    name: str
+    id: str
+    namespace: str
+
+
+@dataclass
+class Task:
+    name: str
+    id: str
+    namespace: str
+
+
+@dataclass
+class ColumnMapping:
+    input_dataset: Dataset
+    output_dataset: Dataset
+    input_field: str
+    output_field: str
+
+
+class SnapLogicParser:
+    def __init__(self, case_insensitive_namespaces: list[str], namespace_mapping: dict):
+        self.case_insensitive_namespaces = case_insensitive_namespaces
+        self.namespace_mapping = namespace_mapping
+        self.platform_mapping = {
+            "sqlserver": "mssql",
+        }
+
+    def _parse_platform(self, namespace: str) -> str:
+        type_part = namespace.split("://")[0] if "://" in namespace else namespace
+
+        return self.platform_mapping.get(type_part.lower(), type_part.lower())
+
+    def extract_task_from_lineage(self, lineage: dict) -> Task:
+        job = lineage.get("job")
+        if not job:
+            raise ValueError("Job information is missing in the lineage data.")
+        name = job.get("name")
+        namespace = job.get("namespace")
+
+        return Task(
+            id=name,
+            name=name.rsplit(":", 1)[0],
+            namespace=self._parse_platform(namespace),
+        )
+
+    def extract_pipeline_from_lineage(self, lineage: dict) -> Pipeline:
+        parent_run = lineage.get("run", {}).get("facets", {}).get("parent", {})
+        job = parent_run.get("job", {})
+        name = job.get("name")
+        namespace = job.get("namespace")
+        pipeline_snode_id = parent_run.get("_producer").split("#pipe_snode=")[1]
+        return Pipeline(
+            id=pipeline_snode_id, name=name, namespace=self._parse_platform(namespace)
+        )
+
+    def _get_case_sensitive_value(self, value: str, namespace: str) -> str:
+        """Transform value to lowercase if namespace is case-insensitive."""
+        return value.lower() if namespace in self.case_insensitive_namespaces else value
+
+    def _create_dataset_info(
+        self,
+        namespace: str,
+        name: str,
+        display_name: str,
+        type: str,
+        fields: Optional[List[Dict]] = None,
+    ) -> Dataset:
+        """Create a Dataset instance with proper case sensitivity."""
+        return Dataset(
+            platform=self._parse_platform(namespace),
+            name=self._get_case_sensitive_value(name, namespace),
+            display_name=display_name or name,
+            fields=fields or [],
+            env="PROD",
+            platform_instance=self.namespace_mapping.get(namespace, None),
+            type=type,
+        )
+
+    def extract_columns_mapping_from_lineage(
+        self, lineage: dict
+    ) -> List[ColumnMapping]:
+        outputs = lineage.get("outputs", [])
+        lineages = []
+
+        for output in outputs:
+            output_namespace = output.get("namespace")
+            output_name = output.get("name", "")
+            column_lineage = (
+                output.get("facets", {}).get("columnLineage", {}).get("fields", {})
+            )
+
+            for field_name, field_dict in column_lineage.items():
+                output_field = self._get_case_sensitive_value(
+                    field_name, output_namespace
+                )
+
+                for input_field in field_dict.get("inputFields", []):
+                    input_namespace = input_field.get("namespace")
+                    input_name = input_field.get("name", "")
+                    input_field_name = input_field.get("field", "")
+
+                    lineages.append(
+                        ColumnMapping(
+                            input_dataset=self._create_dataset_info(
+                                input_namespace, input_name, input_name, "INPUT"
+                            ),
+                            output_dataset=self._create_dataset_info(
+                                output_namespace, output_name, output_name, "OUTPUT"
+                            ),
+                            input_field=self._get_case_sensitive_value(
+                                input_field_name, input_namespace
+                            ),
+                            output_field=output_field,
+                        )
+                    )
+
+        return lineages
+
+    def extract_datasets_from_lineage(self, lineage: dict) -> List[Dataset]:
+        inputs = lineage.get("inputs", {})
+        outputs = lineage.get("outputs", {})
+
+        datasets = []
+        for dataset, dataset_type in [
+            *[(input_dataset, "INPUT") for input_dataset in inputs],
+            *[(output_dataset, "OUTPUT") for output_dataset in outputs],
+        ]:
+            namespace = dataset.get("namespace")
+            name = dataset.get("name", "")
+            fields = dataset.get("facets", {}).get("schema", {}).get("fields", [])
+            display_name = name
+
+            # Transform names to lowercase if namespace is in case_insensitive_namespaces
+            if namespace in self.case_insensitive_namespaces:
+                name = name.lower()
+                fields = [
+                    {**field, "name": field.get("name", "").lower()} for field in fields
+                ]
+
+            datasets.append(
+                self._create_dataset_info(
+                    namespace=namespace,
+                    name=name,
+                    fields=fields,
+                    display_name=display_name,
+                    type=dataset_type,
+                )
+            )
+
+        return datasets
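To make the expected OpenLineage payload shape concrete, here is a sketch run against the parser; the event below is hand-built for illustration, not real SnapLogic output:

    from datahub.ingestion.source.snaplogic.snaplogic_parser import SnapLogicParser

    parser = SnapLogicParser(
        case_insensitive_namespaces=["sqlserver://db-host"],
        namespace_mapping={"sqlserver://db-host": "prod-instance"},
    )
    lineage = {
        "inputs": [
            {
                "namespace": "sqlserver://db-host",
                "name": "Sales.DBO.Orders",
                "facets": {"schema": {"fields": [{"name": "ID", "type": "int"}]}},
            }
        ],
        "outputs": [],
    }
    (dataset,) = parser.extract_datasets_from_lineage(lineage)
    # The namespace is case-insensitive, so the name and fields are lowercased,
    # and "sqlserver" is remapped to DataHub's "mssql" platform.
    assert dataset.platform == "mssql"
    assert dataset.name == "sales.dbo.orders"
    assert dataset.platform_instance == "prod-instance"
    assert dataset.type == "INPUT"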
datahub/ingestion/source/snaplogic/snaplogic_utils.py
@@ -0,0 +1,31 @@
+from datahub.metadata.schema_classes import (
+    BooleanTypeClass,
+    NumberTypeClass,
+    SchemaFieldDataTypeClass,
+    StringTypeClass,
+)
+
+
+class SnaplogicUtils:
+    @staticmethod
+    def get_datahub_type(type_str: str) -> SchemaFieldDataTypeClass:
+        """
+        Maps a string-based type to a DataHub SchemaFieldDataTypeClass.
+
+        Args:
+            type_str (str): The input type (e.g., "string", "int", "boolean").
+
+        Returns:
+            SchemaFieldDataTypeClass: The mapped DataHub type.
+        """
+        normalized_type = type_str.lower()
+
+        if normalized_type in ["string", "varchar"]:
+            return SchemaFieldDataTypeClass(type=StringTypeClass())
+        elif normalized_type in ["number", "long", "float", "double", "int"]:
+            return SchemaFieldDataTypeClass(type=NumberTypeClass())
+        elif normalized_type == "boolean":
+            return SchemaFieldDataTypeClass(type=BooleanTypeClass())
+        else:
+            # Default fallback: String
+            return SchemaFieldDataTypeClass(type=StringTypeClass())
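The type mapper is intentionally coarse: anything it does not recognize falls back to a string type. A quick sketch of that behaviour:

    from datahub.ingestion.source.snaplogic.snaplogic_utils import SnaplogicUtils
    from datahub.metadata.schema_classes import NumberTypeClass, StringTypeClass

    assert isinstance(SnaplogicUtils.get_datahub_type("Int").type, NumberTypeClass)
    # Unrecognized native types (e.g., "geometry") default to String.
    assert isinstance(SnaplogicUtils.get_datahub_type("geometry").type, StringTypeClass)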