acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -178,7 +178,9 @@ class SACSourceReport(StaleEntityRemovalSourceReport):
|
|
|
178
178
|
SourceCapability.LINEAGE_COARSE,
|
|
179
179
|
"Enabled by default (only for Live Data Models)",
|
|
180
180
|
)
|
|
181
|
-
@capability(
|
|
181
|
+
@capability(
|
|
182
|
+
SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
|
|
183
|
+
)
|
|
182
184
|
@capability(
|
|
183
185
|
SourceCapability.SCHEMA_METADATA,
|
|
184
186
|
"Enabled by default (only for Import Data Models)",
|
|
@@ -33,7 +33,10 @@ from datahub.ingestion.api.decorators import (
|
|
|
33
33
|
)
|
|
34
34
|
from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
|
|
35
35
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
36
|
-
from datahub.ingestion.source.common.subtypes import
|
|
36
|
+
from datahub.ingestion.source.common.subtypes import (
|
|
37
|
+
DatasetSubTypes,
|
|
38
|
+
SourceCapabilityModifier,
|
|
39
|
+
)
|
|
37
40
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
38
41
|
StaleEntityRemovalHandler,
|
|
39
42
|
StaleEntityRemovalSourceReport,
|
|
@@ -107,30 +110,33 @@ class SalesforceConfig(
|
|
|
107
110
|
auth: SalesforceAuthType = SalesforceAuthType.USERNAME_PASSWORD
|
|
108
111
|
|
|
109
112
|
# Username, Password Auth
|
|
110
|
-
username: Optional[str] = Field(description="Salesforce username")
|
|
111
|
-
password: Optional[str] = Field(description="Password for Salesforce user")
|
|
113
|
+
username: Optional[str] = Field(None, description="Salesforce username")
|
|
114
|
+
password: Optional[str] = Field(None, description="Password for Salesforce user")
|
|
112
115
|
consumer_key: Optional[str] = Field(
|
|
113
|
-
description="Consumer key for Salesforce JSON web token access"
|
|
116
|
+
None, description="Consumer key for Salesforce JSON web token access"
|
|
114
117
|
)
|
|
115
118
|
private_key: Optional[str] = Field(
|
|
116
|
-
description="Private key as a string for Salesforce JSON web token access"
|
|
119
|
+
None, description="Private key as a string for Salesforce JSON web token access"
|
|
117
120
|
)
|
|
118
121
|
security_token: Optional[str] = Field(
|
|
119
|
-
description="Security token for Salesforce username"
|
|
122
|
+
None, description="Security token for Salesforce username"
|
|
120
123
|
)
|
|
121
124
|
# client_id, client_secret not required
|
|
122
125
|
|
|
123
126
|
# Direct - Instance URL, Access Token Auth
|
|
124
127
|
instance_url: Optional[str] = Field(
|
|
125
|
-
|
|
128
|
+
None,
|
|
129
|
+
description="Salesforce instance url. e.g. https://MyDomainName.my.salesforce.com",
|
|
126
130
|
)
|
|
127
131
|
# Flag to indicate whether the instance is production or sandbox
|
|
128
132
|
is_sandbox: bool = Field(
|
|
129
133
|
default=False, description="Connect to Sandbox instance of your Salesforce"
|
|
130
134
|
)
|
|
131
|
-
access_token: Optional[str] = Field(
|
|
135
|
+
access_token: Optional[str] = Field(
|
|
136
|
+
None, description="Access token for instance url"
|
|
137
|
+
)
|
|
132
138
|
|
|
133
|
-
ingest_tags:
|
|
139
|
+
ingest_tags: bool = Field(
|
|
134
140
|
default=False,
|
|
135
141
|
description="Ingest Tags from source. This will override Tags entered from UI",
|
|
136
142
|
)
|
|
@@ -144,7 +150,8 @@ class SalesforceConfig(
|
|
|
144
150
|
description='Regex patterns for tables/schemas to describe domain_key domain key (domain_key can be any string like "sales".) There can be multiple domain keys specified.',
|
|
145
151
|
)
|
|
146
152
|
api_version: Optional[str] = Field(
|
|
147
|
-
|
|
153
|
+
None,
|
|
154
|
+
description="If specified, overrides default version used by the Salesforce package. Example value: '59.0'",
|
|
148
155
|
)
|
|
149
156
|
|
|
150
157
|
profiling: SalesforceProfilingConfig = SalesforceProfilingConfig()
|
|
@@ -520,7 +527,7 @@ class SalesforceApi:
|
|
|
520
527
|
|
|
521
528
|
@platform_name("Salesforce")
|
|
522
529
|
@config_class(SalesforceConfig)
|
|
523
|
-
@support_status(SupportStatus.
|
|
530
|
+
@support_status(SupportStatus.CERTIFIED)
|
|
524
531
|
@capability(
|
|
525
532
|
capability_name=SourceCapability.PLATFORM_INSTANCE,
|
|
526
533
|
description="Can be equivalent to Salesforce organization",
|
|
@@ -532,11 +539,11 @@ class SalesforceApi:
|
|
|
532
539
|
@capability(
|
|
533
540
|
capability_name=SourceCapability.DATA_PROFILING,
|
|
534
541
|
description="Only table level profiling is supported via `profiling.enabled` config field",
|
|
542
|
+
subtype_modifier=[SourceCapabilityModifier.TABLE],
|
|
535
543
|
)
|
|
536
544
|
@capability(
|
|
537
545
|
capability_name=SourceCapability.DELETION_DETECTION,
|
|
538
|
-
description="
|
|
539
|
-
supported=False,
|
|
546
|
+
description="Enabled by default via stateful ingestion",
|
|
540
547
|
)
|
|
541
548
|
@capability(
|
|
542
549
|
capability_name=SourceCapability.SCHEMA_METADATA,
|
|
@@ -546,6 +553,14 @@ class SalesforceApi:
|
|
|
546
553
|
capability_name=SourceCapability.TAGS,
|
|
547
554
|
description="Enabled by default",
|
|
548
555
|
)
|
|
556
|
+
@capability(
|
|
557
|
+
capability_name=SourceCapability.LINEAGE_COARSE,
|
|
558
|
+
description="Extract table-level lineage for Salesforce objects",
|
|
559
|
+
subtype_modifier=[
|
|
560
|
+
SourceCapabilityModifier.SALESFORCE_CUSTOM_OBJECT,
|
|
561
|
+
SourceCapabilityModifier.SALESFORCE_STANDARD_OBJECT,
|
|
562
|
+
],
|
|
563
|
+
)
|
|
549
564
|
class SalesforceSource(StatefulIngestionSourceBase):
|
|
550
565
|
def __init__(self, config: SalesforceConfig, ctx: PipelineContext) -> None:
|
|
551
566
|
super().__init__(config, ctx)
|
|
@@ -4,7 +4,6 @@ import logging
|
|
|
4
4
|
import os
|
|
5
5
|
import tempfile
|
|
6
6
|
import unittest
|
|
7
|
-
import urllib.request
|
|
8
7
|
from dataclasses import dataclass
|
|
9
8
|
from os.path import basename, dirname
|
|
10
9
|
from pathlib import Path
|
|
@@ -12,6 +11,7 @@ from typing import Any, Iterable, List, Optional, Union
|
|
|
12
11
|
from urllib.parse import urlparse
|
|
13
12
|
|
|
14
13
|
import jsonref
|
|
14
|
+
import requests
|
|
15
15
|
from pydantic import AnyHttpUrl, DirectoryPath, FilePath, validator
|
|
16
16
|
from pydantic.fields import Field
|
|
17
17
|
|
|
@@ -91,19 +91,18 @@ class JsonSchemaSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMix
|
|
|
91
91
|
)
|
|
92
92
|
|
|
93
93
|
@validator("path")
|
|
94
|
-
def download_http_url_to_temp_file(v):
|
|
94
|
+
def download_http_url_to_temp_file(cls, v):
|
|
95
95
|
if isinstance(v, AnyHttpUrl):
|
|
96
96
|
try:
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
)
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
return tmp_file.name
|
|
97
|
+
response = requests.get(str(v))
|
|
98
|
+
response.raise_for_status()
|
|
99
|
+
schema_dict = response.json()
|
|
100
|
+
if not JsonSchemaTranslator._get_id_from_any_schema(schema_dict):
|
|
101
|
+
schema_dict["$id"] = str(v)
|
|
102
|
+
with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
|
|
103
|
+
tmp_file.write(json.dumps(schema_dict))
|
|
104
|
+
tmp_file.flush()
|
|
105
|
+
return tmp_file.name
|
|
107
106
|
except Exception as e:
|
|
108
107
|
logger.error(
|
|
109
108
|
f"Failed to localize url {v} due to {e}. Run with --debug to get full stacktrace"
|
|
@@ -353,7 +352,7 @@ class JsonSchemaSource(StatefulIngestionSourceBase):
|
|
|
353
352
|
if self.config.platform_instance:
|
|
354
353
|
browse_prefix = f"/{self.config.env.lower()}/{self.config.platform}/{self.config.platform_instance}"
|
|
355
354
|
|
|
356
|
-
if
|
|
355
|
+
if isinstance(self.config.path, Path) and self.config.path.is_dir():
|
|
357
356
|
for root, _, files in os.walk(self.config.path, topdown=False):
|
|
358
357
|
for file_name in [f for f in files if f.endswith(".json")]:
|
|
359
358
|
try:
|
|
@@ -373,10 +372,11 @@ class JsonSchemaSource(StatefulIngestionSourceBase):
|
|
|
373
372
|
|
|
374
373
|
else:
|
|
375
374
|
try:
|
|
375
|
+
assert isinstance(self.config.path, Path)
|
|
376
376
|
yield from self._load_one_file(
|
|
377
377
|
ref_loader,
|
|
378
378
|
browse_prefix=browse_prefix,
|
|
379
|
-
root_dir=
|
|
379
|
+
root_dir=self.config.path.parent,
|
|
380
380
|
file_name=str(self.config.path),
|
|
381
381
|
)
|
|
382
382
|
except Exception as e:
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from collections import Counter
|
|
1
|
+
from collections import Counter, defaultdict
|
|
2
2
|
from typing import Any, Counter as CounterType, Dict, Sequence, Tuple, Union
|
|
3
3
|
|
|
4
4
|
from typing_extensions import TypedDict
|
|
@@ -84,7 +84,7 @@ def is_nullable_collection(
|
|
|
84
84
|
|
|
85
85
|
|
|
86
86
|
def construct_schema(
|
|
87
|
-
collection: Sequence[Dict[str, Any]], delimiter: str
|
|
87
|
+
collection: Sequence[Dict[str, Any]], delimiter: str = "."
|
|
88
88
|
) -> Dict[Tuple[str, ...], SchemaDescription]:
|
|
89
89
|
"""
|
|
90
90
|
Construct (infer) a schema from a collection of documents.
|
|
@@ -104,9 +104,11 @@ def construct_schema(
|
|
|
104
104
|
string to concatenate field names by
|
|
105
105
|
"""
|
|
106
106
|
|
|
107
|
-
schema: Dict[Tuple[str, ...], BasicSchemaDescription] =
|
|
107
|
+
schema: Dict[Tuple[str, ...], BasicSchemaDescription] = defaultdict(
|
|
108
|
+
lambda: {"types": Counter(), "count": 0}
|
|
109
|
+
)
|
|
108
110
|
|
|
109
|
-
def append_to_schema(doc: Dict[str, Any], parent_prefix: Tuple[str, ...]) ->
|
|
111
|
+
def append_to_schema(doc: Dict[str, Any], parent_prefix: Tuple[str, ...]) -> int:
|
|
110
112
|
"""
|
|
111
113
|
Recursively update the schema with a document, which may/may not contain nested fields.
|
|
112
114
|
|
|
@@ -118,18 +120,24 @@ def construct_schema(
|
|
|
118
120
|
prefix of fields that the document is under, pass an empty tuple when initializing
|
|
119
121
|
"""
|
|
120
122
|
|
|
123
|
+
# we want to make sure that parents of nested structures are included first, before their children, so that
|
|
124
|
+
# they are displayed properly in the UI, also in the event of trimming the list (which happens, for example,
|
|
125
|
+
# in mongodb ingestor)
|
|
126
|
+
max_count = 0
|
|
121
127
|
for key, value in doc.items():
|
|
122
128
|
new_parent_prefix = parent_prefix + (key,)
|
|
123
129
|
|
|
124
130
|
# if nested value, look at the types within
|
|
125
131
|
if isinstance(value, dict):
|
|
126
|
-
append_to_schema(value, new_parent_prefix)
|
|
132
|
+
max_count = max(append_to_schema(value, new_parent_prefix), max_count)
|
|
127
133
|
# if array of values, check what types are within
|
|
128
134
|
if isinstance(value, list):
|
|
129
135
|
for item in value:
|
|
130
136
|
# if dictionary, add it as a nested object
|
|
131
137
|
if isinstance(item, dict):
|
|
132
|
-
|
|
138
|
+
max_count = max(
|
|
139
|
+
append_to_schema(item, new_parent_prefix), max_count
|
|
140
|
+
)
|
|
133
141
|
|
|
134
142
|
# don't record None values (counted towards nullable)
|
|
135
143
|
if value is not None:
|
|
@@ -143,6 +151,14 @@ def construct_schema(
|
|
|
143
151
|
# update the type count
|
|
144
152
|
schema[new_parent_prefix]["types"].update({type(value): 1})
|
|
145
153
|
schema[new_parent_prefix]["count"] += 1
|
|
154
|
+
max_count = max(schema[new_parent_prefix]["count"], max_count)
|
|
155
|
+
|
|
156
|
+
if parent_prefix != ():
|
|
157
|
+
schema[parent_prefix]["count"] = max(
|
|
158
|
+
schema[parent_prefix]["count"], max_count
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
return max_count
|
|
146
162
|
|
|
147
163
|
for document in collection:
|
|
148
164
|
append_to_schema(document, ())
|
|
@@ -1,14 +1,16 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from dataclasses import dataclass, field
|
|
3
|
-
from typing import Dict, Optional
|
|
3
|
+
from typing import Dict, List, Optional
|
|
4
4
|
|
|
5
5
|
import pydantic
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
6
7
|
|
|
7
8
|
from datahub.configuration.common import AllowDenyPattern
|
|
8
9
|
from datahub.configuration.source_common import (
|
|
9
10
|
EnvConfigMixin,
|
|
10
11
|
PlatformInstanceConfigMixin,
|
|
11
12
|
)
|
|
13
|
+
from datahub.ingestion.api.report import EntityFilterReport
|
|
12
14
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
13
15
|
StaleEntityRemovalSourceReport,
|
|
14
16
|
StatefulStaleMetadataRemovalConfig,
|
|
@@ -52,17 +54,82 @@ class Constant:
|
|
|
52
54
|
DEFAULT_API_URL = "https://aws-api.sigmacomputing.com/v2"
|
|
53
55
|
|
|
54
56
|
|
|
57
|
+
class WorkspaceCounts(BaseModel):
|
|
58
|
+
workbooks_count: int = 0
|
|
59
|
+
datasets_count: int = 0
|
|
60
|
+
elements_count: int = 0
|
|
61
|
+
pages_count: int = 0
|
|
62
|
+
|
|
63
|
+
def is_empty(self) -> bool:
|
|
64
|
+
return (
|
|
65
|
+
self.workbooks_count == 0
|
|
66
|
+
and self.datasets_count == 0
|
|
67
|
+
and self.elements_count == 0
|
|
68
|
+
and self.pages_count == 0
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
def as_obj(self) -> dict:
|
|
72
|
+
return {
|
|
73
|
+
"workbooks_count": self.workbooks_count,
|
|
74
|
+
"datasets_count": self.datasets_count,
|
|
75
|
+
"elements_count": self.elements_count,
|
|
76
|
+
"pages_count": self.pages_count,
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class SigmaWorkspaceEntityFilterReport(EntityFilterReport):
|
|
81
|
+
type: str = "workspace"
|
|
82
|
+
|
|
83
|
+
workspace_counts: Dict[str, WorkspaceCounts] = Field(
|
|
84
|
+
default_factory=dict,
|
|
85
|
+
description="Counts of workbooks, datasets, elements and pages in each workspace.",
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
def increment_workbooks_count(self, workspace_id: str) -> None:
|
|
89
|
+
if workspace_id not in self.workspace_counts:
|
|
90
|
+
self.workspace_counts[workspace_id] = WorkspaceCounts()
|
|
91
|
+
self.workspace_counts[workspace_id].workbooks_count += 1
|
|
92
|
+
|
|
93
|
+
def increment_datasets_count(self, workspace_id: str) -> None:
|
|
94
|
+
if workspace_id not in self.workspace_counts:
|
|
95
|
+
self.workspace_counts[workspace_id] = WorkspaceCounts()
|
|
96
|
+
self.workspace_counts[workspace_id].datasets_count += 1
|
|
97
|
+
|
|
98
|
+
def increment_elements_count(self, workspace_id: str) -> None:
|
|
99
|
+
if workspace_id not in self.workspace_counts:
|
|
100
|
+
self.workspace_counts[workspace_id] = WorkspaceCounts()
|
|
101
|
+
self.workspace_counts[workspace_id].elements_count += 1
|
|
102
|
+
|
|
103
|
+
def increment_pages_count(self, workspace_id: str) -> None:
|
|
104
|
+
if workspace_id not in self.workspace_counts:
|
|
105
|
+
self.workspace_counts[workspace_id] = WorkspaceCounts()
|
|
106
|
+
self.workspace_counts[workspace_id].pages_count += 1
|
|
107
|
+
|
|
108
|
+
def as_obj(self) -> dict:
|
|
109
|
+
return {
|
|
110
|
+
"filtered": self.dropped_entities.as_obj(),
|
|
111
|
+
"processed": self.processed_entities.as_obj(),
|
|
112
|
+
"workspace_counts": {
|
|
113
|
+
key: item.as_obj() for key, item in self.workspace_counts.items()
|
|
114
|
+
},
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
|
|
55
118
|
@dataclass
|
|
56
119
|
class SigmaSourceReport(StaleEntityRemovalSourceReport):
|
|
57
|
-
|
|
120
|
+
workspaces: SigmaWorkspaceEntityFilterReport = field(
|
|
121
|
+
default_factory=SigmaWorkspaceEntityFilterReport
|
|
122
|
+
)
|
|
58
123
|
non_accessible_workspaces_count: int = 0
|
|
59
|
-
shared_entities_count: int = 0
|
|
60
|
-
number_of_datasets: int = 0
|
|
61
|
-
number_of_workbooks: int = 0
|
|
62
|
-
number_of_files_metadata: Dict[str, int] = field(default_factory=dict)
|
|
63
124
|
|
|
64
|
-
|
|
65
|
-
|
|
125
|
+
datasets: EntityFilterReport = EntityFilterReport.field(type="dataset")
|
|
126
|
+
datasets_without_workspace: int = 0
|
|
127
|
+
|
|
128
|
+
workbooks: EntityFilterReport = EntityFilterReport.field(type="workbook")
|
|
129
|
+
workbooks_without_workspace: int = 0
|
|
130
|
+
|
|
131
|
+
number_of_files_metadata: Dict[str, int] = field(default_factory=dict)
|
|
132
|
+
empty_workspaces: List[str] = field(default_factory=list)
|
|
66
133
|
|
|
67
134
|
|
|
68
135
|
class PlatformDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from copy import deepcopy
|
|
1
2
|
from datetime import datetime
|
|
2
3
|
from typing import Dict, List, Optional
|
|
3
4
|
|
|
@@ -23,6 +24,8 @@ class Workspace(BaseModel):
|
|
|
23
24
|
|
|
24
25
|
@root_validator(pre=True)
|
|
25
26
|
def update_values(cls, values: Dict) -> Dict:
|
|
27
|
+
# Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
|
|
28
|
+
values = deepcopy(values)
|
|
26
29
|
# Update name if presonal workspace
|
|
27
30
|
if values["name"] == "User Folder":
|
|
28
31
|
values["name"] = "My documents"
|
|
@@ -30,11 +30,13 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
|
30
30
|
from datahub.ingestion.source.common.subtypes import (
|
|
31
31
|
BIContainerSubTypes,
|
|
32
32
|
DatasetSubTypes,
|
|
33
|
+
SourceCapabilityModifier,
|
|
33
34
|
)
|
|
34
35
|
from datahub.ingestion.source.sigma.config import (
|
|
35
36
|
PlatformDetail,
|
|
36
37
|
SigmaSourceConfig,
|
|
37
38
|
SigmaSourceReport,
|
|
39
|
+
WorkspaceCounts,
|
|
38
40
|
)
|
|
39
41
|
from datahub.ingestion.source.sigma.data_classes import (
|
|
40
42
|
Element,
|
|
@@ -94,7 +96,11 @@ logger = logging.getLogger(__name__)
|
|
|
94
96
|
@platform_name("Sigma")
|
|
95
97
|
@config_class(SigmaSourceConfig)
|
|
96
98
|
@support_status(SupportStatus.INCUBATING)
|
|
97
|
-
@capability(
|
|
99
|
+
@capability(
|
|
100
|
+
SourceCapability.CONTAINERS,
|
|
101
|
+
"Enabled by default",
|
|
102
|
+
subtype_modifier=[SourceCapabilityModifier.SIGMA_WORKSPACE],
|
|
103
|
+
)
|
|
98
104
|
@capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
|
|
99
105
|
@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default.")
|
|
100
106
|
@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
|
|
@@ -104,6 +110,7 @@ logger = logging.getLogger(__name__)
|
|
|
104
110
|
SourceCapability.OWNERSHIP,
|
|
105
111
|
"Enabled by default, configured using `ingest_owner`",
|
|
106
112
|
)
|
|
113
|
+
@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
|
|
107
114
|
class SigmaSource(StatefulIngestionSourceBase, TestableSource):
|
|
108
115
|
"""
|
|
109
116
|
This plugin extracts the following:
|
|
@@ -162,14 +169,18 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
162
169
|
|
|
163
170
|
def _get_allowed_workspaces(self) -> List[Workspace]:
|
|
164
171
|
all_workspaces = self.sigma_api.workspaces.values()
|
|
165
|
-
allowed_workspaces = [
|
|
166
|
-
workspace
|
|
167
|
-
for workspace in all_workspaces
|
|
168
|
-
if self.config.workspace_pattern.allowed(workspace.name)
|
|
169
|
-
]
|
|
170
172
|
logger.info(f"Number of workspaces = {len(all_workspaces)}")
|
|
171
|
-
|
|
173
|
+
|
|
174
|
+
allowed_workspaces = []
|
|
175
|
+
for workspace in all_workspaces:
|
|
176
|
+
if self.config.workspace_pattern.allowed(workspace.name):
|
|
177
|
+
allowed_workspaces.append(workspace)
|
|
178
|
+
else:
|
|
179
|
+
self.reporter.workspaces.dropped(
|
|
180
|
+
f"{workspace.name} ({workspace.workspaceId})"
|
|
181
|
+
)
|
|
172
182
|
logger.info(f"Number of allowed workspaces = {len(allowed_workspaces)}")
|
|
183
|
+
|
|
173
184
|
return allowed_workspaces
|
|
174
185
|
|
|
175
186
|
def _gen_workspace_workunit(
|
|
@@ -280,6 +291,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
280
291
|
yield self._gen_dataset_properties(dataset_urn, dataset)
|
|
281
292
|
|
|
282
293
|
if dataset.workspaceId:
|
|
294
|
+
self.reporter.workspaces.increment_datasets_count(dataset.workspaceId)
|
|
283
295
|
yield from add_entity_to_container(
|
|
284
296
|
container_key=self._gen_workspace_key(dataset.workspaceId),
|
|
285
297
|
entity_type="dataset",
|
|
@@ -463,6 +475,8 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
463
475
|
).as_workunit()
|
|
464
476
|
|
|
465
477
|
if workbook.workspaceId:
|
|
478
|
+
self.reporter.workspaces.increment_elements_count(workbook.workspaceId)
|
|
479
|
+
|
|
466
480
|
yield self._gen_entity_browsepath_aspect(
|
|
467
481
|
entity_urn=chart_urn,
|
|
468
482
|
parent_entity_urn=builder.make_container_urn(
|
|
@@ -520,6 +534,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
520
534
|
all_input_fields: List[InputFieldClass] = []
|
|
521
535
|
|
|
522
536
|
if workbook.workspaceId:
|
|
537
|
+
self.reporter.workspaces.increment_pages_count(workbook.workspaceId)
|
|
523
538
|
yield self._gen_entity_browsepath_aspect(
|
|
524
539
|
entity_urn=dashboard_urn,
|
|
525
540
|
parent_entity_urn=builder.make_container_urn(
|
|
@@ -609,6 +624,8 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
609
624
|
|
|
610
625
|
paths = workbook.path.split("/")[1:]
|
|
611
626
|
if workbook.workspaceId:
|
|
627
|
+
self.reporter.workspaces.increment_workbooks_count(workbook.workspaceId)
|
|
628
|
+
|
|
612
629
|
yield self._gen_entity_browsepath_aspect(
|
|
613
630
|
entity_urn=dashboard_urn,
|
|
614
631
|
parent_entity_urn=builder.make_container_urn(
|
|
@@ -658,7 +675,19 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
658
675
|
yield from self._gen_workbook_workunit(workbook)
|
|
659
676
|
|
|
660
677
|
for workspace in self._get_allowed_workspaces():
|
|
678
|
+
self.reporter.workspaces.processed(
|
|
679
|
+
f"{workspace.name} ({workspace.workspaceId})"
|
|
680
|
+
)
|
|
661
681
|
yield from self._gen_workspace_workunit(workspace)
|
|
682
|
+
if self.reporter.workspaces.workspace_counts.get(
|
|
683
|
+
workspace.workspaceId, WorkspaceCounts()
|
|
684
|
+
).is_empty():
|
|
685
|
+
logger.warning(
|
|
686
|
+
f"Workspace {workspace.name} ({workspace.workspaceId}) is empty. If this is not expected, add the user associated with the Client ID/Secret to each workspace with missing metadata"
|
|
687
|
+
)
|
|
688
|
+
self.reporter.empty_workspaces.append(
|
|
689
|
+
f"{workspace.name} ({workspace.workspaceId})"
|
|
690
|
+
)
|
|
662
691
|
yield from self._gen_sigma_dataset_upstream_lineage_workunit()
|
|
663
692
|
|
|
664
693
|
def get_report(self) -> SourceReport:
|