acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -1,14 +1,15 @@
|
|
|
1
|
+
import json
|
|
1
2
|
import logging
|
|
2
|
-
import textwrap
|
|
3
3
|
from dataclasses import dataclass
|
|
4
|
-
from typing import Iterable, List, Optional, Tuple
|
|
4
|
+
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
|
5
5
|
|
|
6
|
-
from pydantic import Field, SecretStr
|
|
6
|
+
from pydantic import BaseModel, Field, SecretStr
|
|
7
7
|
from slack_sdk import WebClient
|
|
8
8
|
from tenacity import retry, wait_exponential
|
|
9
9
|
from tenacity.before_sleep import before_sleep_log
|
|
10
10
|
|
|
11
11
|
import datahub.emitter.mce_builder as builder
|
|
12
|
+
from datahub.emitter.mce_builder import datahub_guid, make_dataplatform_instance_urn
|
|
12
13
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
13
14
|
from datahub.ingestion.api.common import PipelineContext
|
|
14
15
|
from datahub.ingestion.api.decorators import (
|
|
@@ -22,6 +23,7 @@ from datahub.ingestion.api.source import (
|
|
|
22
23
|
SourceReport,
|
|
23
24
|
)
|
|
24
25
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
26
|
+
from datahub.ingestion.source.common.subtypes import DatasetSubTypes
|
|
25
27
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
26
28
|
StaleEntityRemovalHandler,
|
|
27
29
|
StaleEntityRemovalSourceReport,
|
|
@@ -32,16 +34,153 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
|
32
34
|
)
|
|
33
35
|
from datahub.metadata.schema_classes import (
|
|
34
36
|
CorpUserEditableInfoClass,
|
|
37
|
+
CorpUserSettingsClass,
|
|
38
|
+
DataPlatformInstanceClass,
|
|
39
|
+
DataPlatformInstancePropertiesClass,
|
|
35
40
|
DatasetPropertiesClass,
|
|
36
41
|
DeprecationClass,
|
|
42
|
+
NotificationSettingsClass,
|
|
43
|
+
PlatformResourceInfoClass,
|
|
44
|
+
SerializedValueClass,
|
|
45
|
+
SerializedValueContentTypeClass,
|
|
46
|
+
SerializedValueSchemaTypeClass,
|
|
47
|
+
SlackNotificationSettingsClass,
|
|
48
|
+
SlackUserInfoClass as SlackUserInfo,
|
|
49
|
+
StatusClass,
|
|
37
50
|
SubTypesClass,
|
|
51
|
+
_Aspect,
|
|
38
52
|
)
|
|
39
53
|
from datahub.utilities.ratelimiter import RateLimiter
|
|
54
|
+
from datahub.utilities.str_enum import StrEnum
|
|
40
55
|
from datahub.utilities.urns.urn import Urn
|
|
41
56
|
|
|
42
57
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
43
58
|
|
|
44
59
|
|
|
60
|
+
# TODO: Relocate this function to a utility module
|
|
61
|
+
def is_picture_default_or_missing(picture_link: Optional[str]) -> bool:
    """Return True when no picture link is set or it points at a default avatar.

    A link is treated as the default avatar when it ends with
    ``default_avatar.png``; ``None`` and the empty string count as missing.
    """
    # Short-circuit: a falsy link is "missing", otherwise check the suffix.
    return not picture_link or picture_link.endswith("default_avatar.png")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def is_slack_image(picture_link: Optional[str]) -> bool:
    """Guess whether the picture link is a Slack-hosted image.

    This is a heuristic: the link is considered a Slack image when it
    contains ``slack-edge.com`` anywhere. ``None`` or an empty link is
    never a Slack image.
    """
    # bool(...) keeps the return type a real bool for None / "" inputs.
    return bool(picture_link) and "slack-edge.com" in picture_link
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class ResourceType(StrEnum):
    """String-valued identifiers for the kinds of Slack resource handled here."""

    # NOTE(review): presumably tags resources describing a Slack user — confirm against usage.
    USER_INFO = "user-info"
    # NOTE(review): presumably tags resources describing a Slack channel — confirm against usage.
    CHANNEL_INFO = "channel-info"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class SlackInstance(BaseModel):
|
|
82
|
+
id: str
|
|
83
|
+
name: Optional[str] = None
|
|
84
|
+
description: Optional[str] = None
|
|
85
|
+
external_url: Optional[str] = None
|
|
86
|
+
custom_properties: Optional[Dict[str, str]] = None
|
|
87
|
+
|
|
88
|
+
def to_platform_instance_urn(self) -> str:
    """Build the data platform instance URN for this Slack instance.

    The URN is derived from the Slack platform URN constant and this
    instance's ``id``.
    """
    instance_urn = make_dataplatform_instance_urn(
        platform=DATA_PLATFORM_SLACK_URN,
        instance=self.id,
    )
    return instance_urn
|
|
92
|
+
|
|
93
|
+
def with_slack_team_info(self, team_info: dict) -> "SlackInstance":
|
|
94
|
+
"""
|
|
95
|
+
team_info looks like this
|
|
96
|
+
{'id': 'T22BUCL1LKW', 'name': 'DataHub', 'url': 'https://datahubspace.slack.com/', 'domain': 'datahub', 'email_domain': '', 'icon': {'image_default': False, 'image_34': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_34.png', 'image_44': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_44.png', 'image_68': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_68.png', 'image_88': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_88.png', 'image_102': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_102.png', 'image_230': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_230.png', 'image_132': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_132.png'}, 'avatar_base_url': 'https://ca.slack-edge.com/', 'is_verified': False, 'external_org_migrations': {'date_updated': 1722672564, 'current': []}, 'discoverable': 'closed', 'enterprise_id': 'E06TPM5T1G9', 'enterprise_name': 'DataHub', 'enterprise_domain': 'datahubspace', 'lob_sales_home_enabled': False}
|
|
97
|
+
"""
|
|
98
|
+
self.name = team_info.get("name")
|
|
99
|
+
self.description = team_info.get("name")
|
|
100
|
+
self.external_url = team_info.get("url")
|
|
101
|
+
self.custom_properties = {
|
|
102
|
+
k: v
|
|
103
|
+
for k, v in {
|
|
104
|
+
"domain": team_info.get("domain"),
|
|
105
|
+
"enterprise_id": team_info.get("enterprise_id"),
|
|
106
|
+
"enterprise_name": team_info.get("enterprise_name"),
|
|
107
|
+
"enterprise_domain": team_info.get("enterprise_domain"),
|
|
108
|
+
"icon": team_info.get("icon", {}).get("image_102"),
|
|
109
|
+
}.items()
|
|
110
|
+
if v is not None
|
|
111
|
+
}
|
|
112
|
+
return self
|
|
113
|
+
|
|
114
|
+
def to_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
|
|
115
|
+
return [
|
|
116
|
+
MetadataChangeProposalWrapper(
|
|
117
|
+
entityUrn=self.to_platform_instance_urn(),
|
|
118
|
+
aspect=DataPlatformInstancePropertiesClass(
|
|
119
|
+
name=self.name or self.id,
|
|
120
|
+
description=self.description,
|
|
121
|
+
externalUrl=self.external_url or None,
|
|
122
|
+
customProperties=self.custom_properties or {},
|
|
123
|
+
),
|
|
124
|
+
)
|
|
125
|
+
]
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def to_serialized_value(value: _Aspect) -> SerializedValueClass:
    """Wrap an aspect in a ``SerializedValueClass`` holding its JSON encoding.

    The blob is the UTF-8 encoded JSON of ``value.to_obj()``; ``schemaRef``
    carries the aspect's record schema name.
    """
    # HACK: strip ".pegasus2avro" from the schema name so that the reference
    # points at the original pdl type.
    pdl_type_name = value.RECORD_SCHEMA.fullname.replace(".pegasus2avro", "")
    json_blob = json.dumps(value.to_obj()).encode("utf-8")
    return SerializedValueClass(
        blob=json_blob,
        contentType=SerializedValueContentTypeClass.JSON,
        schemaType=SerializedValueSchemaTypeClass.PEGASUS,
        schemaRef=pdl_type_name,
    )
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class SlackUserDetails:
    """Wraps a ``SlackUserInfo`` and turns it into platform resource metadata."""

    def __init__(self, slack_user_info: SlackUserInfo):
        self.slack_user_info = slack_user_info

    def to_guid(self) -> str:
        """
        Build the guid for this user from their id and teamId, the pair that
        uniquely identifies a slack user.
        """
        info = self.slack_user_info
        return datahub_guid({"id": info.id, "dpi": info.teamId})

    def get_resource_urn(self) -> str:
        """Return the platform resource URN derived from the user's guid."""
        guid = self.to_guid()
        return f"urn:li:platformResource:{guid}"

    def to_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
        """Yield the change proposals for this user's platform resource.

        Three aspects are emitted on the resource URN: the platform instance,
        the resource info (with the serialized user info as its value) and the
        status, whose ``removed`` flag mirrors the user's ``isDeleted``.
        """
        info = self.slack_user_info
        urn = self.get_resource_urn()

        platform_instance = DataPlatformInstanceClass(
            platform=DATA_PLATFORM_SLACK_URN,
            instance=info.slackInstance,
        )

        # The email, when present, is the only secondary key.
        resource_info = PlatformResourceInfoClass(
            resourceType=ResourceType.USER_INFO.value,
            value=to_serialized_value(info),
            primaryKey=info.id,
            secondaryKeys=[info.email] if info.email else [],
        )

        removal_status = StatusClass(removed=info.isDeleted)

        yield from MetadataChangeProposalWrapper.construct_many(
            urn, aspects=[platform_instance, resource_info, removal_status]
        )
|
|
182
|
+
|
|
183
|
+
|
|
45
184
|
@dataclass
|
|
46
185
|
class CorpUser:
|
|
47
186
|
urn: Optional[str] = None
|
|
@@ -52,42 +191,43 @@ class CorpUser:
|
|
|
52
191
|
phone: Optional[str] = None
|
|
53
192
|
real_name: Optional[str] = None
|
|
54
193
|
slack_display_name: Optional[str] = None
|
|
194
|
+
team_id: Optional[str] = None
|
|
195
|
+
team_domain: Optional[str] = None
|
|
196
|
+
is_team_enterprise: Optional[bool] = None
|
|
55
197
|
|
|
56
198
|
|
|
57
199
|
class SlackSourceConfig(
|
|
58
200
|
StatefulIngestionConfigBase,
|
|
59
201
|
):
|
|
60
202
|
bot_token: SecretStr = Field(
|
|
61
|
-
description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email
|
|
203
|
+
description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email`, `users.profile:read`, and `team:read` scopes.",
|
|
62
204
|
)
|
|
63
205
|
enrich_user_metadata: bool = Field(
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
206
|
+
True,
|
|
207
|
+
description="When enabled, will enrich provisioned DataHub users' metadata with information from Slack.",
|
|
208
|
+
)
|
|
209
|
+
ingest_users: bool = Field(
|
|
210
|
+
True,
|
|
211
|
+
description="Whether to ingest users. When set to true, will ingest all users in the Slack workspace (as platform resources) to simplify user enrichment after they are provisioned on DataHub.",
|
|
67
212
|
)
|
|
68
213
|
api_requests_per_min: int = Field(
|
|
69
|
-
|
|
70
|
-
default=10,
|
|
214
|
+
10,
|
|
71
215
|
description="Number of API requests per minute. Low-level config. Do not tweak unless you are facing any issues.",
|
|
72
216
|
)
|
|
73
217
|
ingest_public_channels: bool = Field(
|
|
74
|
-
|
|
75
|
-
default=False,
|
|
218
|
+
False,
|
|
76
219
|
description="Whether to ingest public channels. If set to true needs `channels:read` scope.",
|
|
77
220
|
)
|
|
78
221
|
channels_iteration_limit: int = Field(
|
|
79
|
-
|
|
80
|
-
default=200,
|
|
222
|
+
200,
|
|
81
223
|
description="Limit the number of channels to be ingested in a iteration. Low-level config. Do not tweak unless you are facing any issues.",
|
|
82
224
|
)
|
|
83
225
|
channel_min_members: int = Field(
|
|
84
|
-
|
|
85
|
-
default=2,
|
|
226
|
+
2,
|
|
86
227
|
description="Ingest channels with at least this many members.",
|
|
87
228
|
)
|
|
88
229
|
should_ingest_archived_channels: bool = Field(
|
|
89
|
-
|
|
90
|
-
default=False,
|
|
230
|
+
False,
|
|
91
231
|
description="Whether to ingest archived channels.",
|
|
92
232
|
)
|
|
93
233
|
|
|
@@ -96,14 +236,16 @@ class SlackSourceConfig(
|
|
|
96
236
|
class SlackSourceReport(StaleEntityRemovalSourceReport):
    """Ingestion report counters for the Slack source."""

    # Channel counters.
    channels_reported: int = 0
    archived_channels_reported: int = 0
    # User counter.
    users_reported: int = 0
|
|
99
240
|
|
|
100
241
|
|
|
101
242
|
PLATFORM_NAME = "slack"
|
|
243
|
+
DATA_PLATFORM_SLACK_URN: str = builder.make_data_platform_urn(PLATFORM_NAME)
|
|
102
244
|
|
|
103
245
|
|
|
104
246
|
@platform_name("Slack")
|
|
105
247
|
@config_class(SlackSourceConfig)
|
|
106
|
-
@support_status(SupportStatus.
|
|
248
|
+
@support_status(SupportStatus.CERTIFIED)
|
|
107
249
|
class SlackSource(StatefulIngestionSourceBase):
|
|
108
250
|
def __init__(self, ctx: PipelineContext, config: SlackSourceConfig):
|
|
109
251
|
super().__init__(config, ctx)
|
|
@@ -124,6 +266,38 @@ class SlackSource(StatefulIngestionSourceBase):
|
|
|
124
266
|
def get_slack_client(self) -> WebClient:
    """Create a Slack ``WebClient`` authenticated with the configured bot token."""
    bot_token = self.config.bot_token.get_secret_value()
    return WebClient(token=bot_token)
|
|
126
268
|
|
|
269
|
+
@staticmethod
def populate_slack_member_from_response(
    user: Dict[str, Any], slack_instance: SlackInstance
) -> SlackUserDetails:
    """Convert one Slack user record into ``SlackUserDetails``.

    ``id``, ``name`` and ``team_id`` are required keys of ``user``; every
    other field is read with ``.get`` and falls back to the default shown
    below. Profile-level fields come from the nested ``profile`` dictionary.
    """
    profile = user.get("profile", {})

    fields = {
        "slackInstance": slack_instance.to_platform_instance_urn(),
        # Identity.
        "id": user["id"],
        "name": user["name"],
        "realName": user.get("real_name", ""),
        "displayName": profile.get("display_name", ""),
        "email": profile.get("email"),
        "teamId": user["team_id"],
        # Account flags.
        "isDeleted": user.get("deleted", False),
        "isAdmin": user.get("is_admin", False),
        "isOwner": user.get("is_owner", False),
        "isPrimaryOwner": user.get("is_primary_owner", False),
        "isBot": user.get("is_bot", False),
        # Locale.
        "timezone": user.get("tz"),
        "timezoneOffset": user.get("tz_offset"),
        # Profile details.
        "title": profile.get("title"),
        "phone": profile.get("phone"),
        # Using 192px image as an example
        "profilePictureUrl": profile.get("image_192"),
        "statusText": profile.get("status_text"),
        "statusEmoji": profile.get("status_emoji"),
        "lastUpdatedSeconds": user.get("updated"),
    }
    return SlackUserDetails(slack_user_info=SlackUserInfo(**fields))
|
|
300
|
+
|
|
127
301
|
def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
|
|
128
302
|
return [
|
|
129
303
|
*super().get_workunit_processors(),
|
|
@@ -143,46 +317,108 @@ class SlackSource(StatefulIngestionSourceBase):
|
|
|
143
317
|
logger.info(auth_resp.data)
|
|
144
318
|
if self.config.ingest_public_channels:
|
|
145
319
|
yield from self.get_public_channels()
|
|
146
|
-
if self.config.enrich_user_metadata:
|
|
320
|
+
if self.config.enrich_user_metadata or self.config.ingest_users:
|
|
147
321
|
yield from self.get_user_info()
|
|
148
322
|
|
|
323
|
+
def _get_datahub_user_info(
    self,
) -> Dict[str, Tuple[CorpUser, Optional[CorpUserEditableInfoClass]]]:
    """Index the DataHub users eligible for enrichment by their email.

    Returns a mapping from email to ``(user, editable properties)``. Users
    without an email are left out; when two users share an email the one
    seen last wins.
    """
    users_by_email: Dict[
        str, Tuple[CorpUser, Optional[CorpUserEditableInfoClass]]
    ] = {}
    # get_user_to_be_updated ensures that the email field is not None
    for corp_user, editable_info in self.get_user_to_be_updated():
        if not corp_user.email:
            continue
        users_by_email[corp_user.email] = (corp_user, editable_info)
    return users_by_email
|
|
333
|
+
|
|
149
334
|
def get_user_info(self) -> Iterable[MetadataWorkUnit]:
    """Emit Slack workspace and user metadata.

    Steps:
      1. Fetch the team info and emit the Slack platform instance.
      2. Page through all Slack users. When ``ingest_users`` is set, each
         user is emitted as a platform resource.
      3. When ``enrich_user_metadata`` is set, Slack users whose email
         matches a DataHub user also get their editable info, Slack user
         info and Slack notification settings updated.

    A failure to fetch the team info or a page of users is recorded on the
    report and ends the generator; it does not raise.
    """
    # One client is enough for every request made by this method.
    client = self.get_slack_client()

    # Get team information to populate for all users
    slack_instance: Optional[SlackInstance] = None
    with self.rate_limiter:
        team_response = client.team_info()
    if team_response and "team" in team_response:
        team_info = team_response["team"]
        slack_instance = SlackInstance(id=team_info.get("id")).with_slack_team_info(
            team_info
        )

    if slack_instance is None:
        # Every user record needs the instance URN, so there is nothing
        # useful to do without it. Stop here instead of tripping an assert.
        logger.error("Failed to fetch team information")
        self.report.report_failure(
            "team_info", "Failed to fetch team information for users"
        )
        return

    for mcp in slack_instance.to_mcps():
        yield mcp.as_workunit()

    # Fetch all DataHub users that need to be updated
    if self.config.enrich_user_metadata:
        datahub_users = self._get_datahub_user_info()
    else:
        datahub_users = {}

    cursor: Optional[str] = None
    while True:
        with self.rate_limiter:
            response = client.users_list(cursor=cursor)
        data = response.data
        if not isinstance(data, dict) or not data.get("ok"):
            self.report.report_failure("users", "Failed to fetch users")
            return

        # No graph assertion here: the graph is only needed for enrichment,
        # and the enrichment helpers assert it themselves.
        for user in data.get("members") or []:
            # Query all slack users and ingest them into the generic
            # slackMember aspect
            slack_user_details: SlackUserDetails = (
                self.populate_slack_member_from_response(user, slack_instance)
            )
            if self.config.ingest_users:
                for mcp in slack_user_details.to_mcps():
                    yield mcp.as_workunit()
                self.report.users_reported += 1

            platform_resource_urn = slack_user_details.get_resource_urn()
            # If user is in DataHub, compute and emit CorpUserEditableInfo
            # aspect. This code will be removed once we have server side
            # processing of raw slackMember aspects. This code path can also
            # be turned off by setting enrich_user_metadata to False.
            slack_user_profile = user.get("profile") or {}
            user_obj_props_tuple = datahub_users.get(slack_user_profile.get("email"))
            if user_obj_props_tuple is None:
                # User is not in DataHub or enrichment is disabled
                continue

            user_obj, editable_properties = user_obj_props_tuple
            user_obj.slack_id = user.get("id")
            user_obj.title = slack_user_profile.get("title")
            user_obj.image_url = slack_user_profile.get("image_192")
            user_obj.phone = slack_user_profile.get("phone")
            user_obj.real_name = slack_user_profile.get("real_name")
            user_obj.slack_display_name = slack_user_profile.get("display_name")

            corpuser_editable_info = editable_properties or (
                CorpUserEditableInfoClass()
            )
            emittable_corpuser_editable_info = self.populate_corpuser_editable_info(
                corpuser_editable_info,
                user_obj,
                platform_resource_urn=platform_resource_urn,
                slack_instance=slack_instance,
            )
            if emittable_corpuser_editable_info:
                yield MetadataChangeProposalWrapper(
                    entityUrn=user_obj.urn, aspect=emittable_corpuser_editable_info
                ).as_workunit()
                # if we update corpusereditable info, we also update
                # slackuserinfo. This will be removed once we have server
                # side processing of raw slackMember aspects.
                yield MetadataChangeProposalWrapper(
                    entityUrn=user_obj.urn,
                    aspect=slack_user_details.slack_user_info,
                ).as_workunit()
            yield from self.emit_corp_user_slack_settings(user_obj)

        # A missing or empty cursor marks the last page. Check before
        # stringifying: str(None) is the truthy "None" and would be sent
        # back to Slack as a bogus cursor.
        next_cursor = (data.get("response_metadata") or {}).get("next_cursor")
        if not next_cursor:
            break
        cursor = str(next_cursor)
|
|
421
|
+
|
|
186
422
|
def _get_channel_info(
|
|
187
423
|
self, cursor: Optional[str]
|
|
188
424
|
) -> Tuple[List[MetadataWorkUnit], Optional[str]]:
|
|
@@ -251,7 +487,7 @@ class SlackSource(StatefulIngestionSourceBase):
|
|
|
251
487
|
mcp=MetadataChangeProposalWrapper(
|
|
252
488
|
entityUrn=urn_channel,
|
|
253
489
|
aspect=SubTypesClass(
|
|
254
|
-
typeNames=[
|
|
490
|
+
typeNames=[DatasetSubTypes.SLACK_CHANNEL],
|
|
255
491
|
),
|
|
256
492
|
),
|
|
257
493
|
)
|
|
@@ -259,6 +495,58 @@ class SlackSource(StatefulIngestionSourceBase):
|
|
|
259
495
|
cursor = str(response.data["response_metadata"]["next_cursor"])
|
|
260
496
|
return result_channels, cursor
|
|
261
497
|
|
|
498
|
+
def populate_corpuser_editable_info(
    self,
    corpuser_editable_info: CorpUserEditableInfoClass,
    user_obj: CorpUser,
    platform_resource_urn: str,
    slack_instance: SlackInstance,
) -> Optional[CorpUserEditableInfoClass]:
    """
    Fill gaps in a CorpUserEditableInfo aspect with what Slack knows about
    the user. The aspect is modified in place.

    Returns the aspect when at least one field changed, otherwise None.
    Whenever it changes, ``platform_resource_urn`` is recorded in its
    ``informationSources``.
    """
    info = corpuser_editable_info
    changed = False

    # Plain fields: only filled when DataHub has no value yet.
    if user_obj.email and not info.email:
        info.email = user_obj.email
        changed = True
    if user_obj.slack_id and not info.slack:
        info.slack = user_obj.slack_id
        changed = True
    if user_obj.title and not info.title:
        info.title = user_obj.title
        changed = True

    # Picture: replace a missing/default picture, or refresh one that came
    # from Slack and has since changed.
    slack_picture = user_obj.image_url
    if slack_picture:
        current_picture = info.pictureLink
        stale_slack_picture = (
            is_slack_image(current_picture) and slack_picture != current_picture
        )
        if is_picture_default_or_missing(current_picture) or stale_slack_picture:
            info.pictureLink = slack_picture
            changed = True

    if user_obj.phone and not info.phone:
        info.phone = user_obj.phone
        changed = True

    # Display name: an empty name, or one that merely repeats the email, is
    # replaced by the real name.
    name_is_placeholder = not info.displayName or info.displayName == info.email
    if name_is_placeholder and user_obj.real_name:
        info.displayName = user_obj.real_name
        changed = True

    if not changed:
        return None

    # update informationSources
    info.informationSources = info.informationSources or []
    if platform_resource_urn not in info.informationSources:
        info.informationSources.append(platform_resource_urn)
    return info
|
|
549
|
+
|
|
262
550
|
def get_public_channels(self) -> Iterable[MetadataWorkUnit]:
|
|
263
551
|
cursor = None
|
|
264
552
|
while True:
|
|
@@ -270,103 +558,78 @@ class SlackSource(StatefulIngestionSourceBase):
|
|
|
270
558
|
if not cursor:
|
|
271
559
|
break
|
|
272
560
|
|
|
273
|
-
def
|
|
274
|
-
|
|
561
|
+
def emit_slack_member_aspect(
    self, user: SlackUserInfo
) -> Iterable[MetadataWorkUnit]:
    """Yield one work unit per change proposal generated for the Slack user."""
    details = SlackUserDetails(slack_user_info=user)
    yield from (proposal.as_workunit() for proposal in details.to_mcps())
|
|
567
|
+
|
|
568
|
+
def emit_corp_user_slack_settings(
    self, user_obj: CorpUser
) -> Iterable[MetadataWorkUnit]:
    """Set the user's Slack handle in their notification settings if unset.

    Nothing is emitted when the user has no URN or no Slack id, when they
    have no stored ``CorpUserSettings`` aspect, or when a Slack user handle
    is already configured. Otherwise the updated settings aspect is yielded
    as a single work unit.
    """
    assert self.ctx.graph is not None

    # Without a Slack id there is no handle to write.
    if not user_obj.urn or not user_obj.slack_id:
        return

    corp_user_settings = self.ctx.graph.get_aspect(
        user_obj.urn, CorpUserSettingsClass
    )
    if not corp_user_settings:
        return

    notification_settings = corp_user_settings.notificationSettings

    if not notification_settings:
        corp_user_settings.notificationSettings = NotificationSettingsClass(
            sinkTypes=[],
            slackSettings=SlackNotificationSettingsClass(
                userHandle=user_obj.slack_id
            ),
        )
    elif not notification_settings.slackSettings:
        notification_settings.slackSettings = SlackNotificationSettingsClass(
            userHandle=user_obj.slack_id
        )
    elif not notification_settings.slackSettings.userHandle:
        # Fill in the handle in place: replacing the whole object would
        # discard any other fields already stored on the Slack settings.
        notification_settings.slackSettings.userHandle = user_obj.slack_id
    else:
        # A handle is already configured; leave it alone.
        return

    yield MetadataWorkUnit(
        id=f"{user_obj.urn}",
        mcp=MetadataChangeProposalWrapper(
            entityUrn=user_obj.urn,
            aspect=corp_user_settings,
        ),
    )
|
|
319
608
|
|
|
320
609
|
# NOTE(review): this decorates a generator function, so the call itself only
# creates the generator. Errors raised while iterating presumably fall
# outside the retry; confirm against the tenacity documentation.
@retry(
    wait=wait_exponential(multiplier=2, min=4, max=60),
    before_sleep=before_sleep_log(logger, logging.ERROR, True),
)
def get_user_to_be_updated(
    self,
) -> Iterable[Tuple[CorpUser, Optional[CorpUserEditableInfoClass]]]:
    """Yield every DataHub corp user for which an email can be determined.

    The email comes from the user's editable properties when set there;
    otherwise the URN's entity id is used if it contains an ``@``. Users with
    no email are skipped. Each item is ``(user, editable properties)``, where
    the second element is whatever the graph returned for that aspect.
    """
    assert self.ctx.graph is not None
    graph = self.ctx.graph

    for urn in graph.get_urns_by_filter(entity_types=["corpuser"], query="*"):
        user_obj = CorpUser()
        user_obj.urn = urn

        editable_properties = graph.get_aspect(urn, CorpUserEditableInfoClass)
        stored_email = editable_properties.email if editable_properties else None

        if stored_email:
            user_obj.email = stored_email
        else:
            # Fall back to the URN id when it looks like an email address.
            urn_id = Urn.from_string(user_obj.urn).get_entity_id_as_string()
            if "@" in urn_id:
                user_obj.email = urn_id

        if user_obj.email is None:
            continue
        yield (user_obj, editable_properties)
|
|
370
633
|
|
|
371
634
|
def get_report(self) -> SourceReport:
    """Return the report object this source records its progress on."""
    return self.report
|
|
File without changes
|