acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -40,11 +40,11 @@ class TimeoutHTTPAdapter(HTTPAdapter):
|
|
|
40
40
|
del kwargs["timeout"]
|
|
41
41
|
super().__init__(*args, **kwargs)
|
|
42
42
|
|
|
43
|
-
def send(self, request, **kwargs):
|
|
43
|
+
def send(self, request, *args, **kwargs):
|
|
44
44
|
timeout = kwargs.get("timeout")
|
|
45
45
|
if timeout is None and hasattr(self, "timeout"):
|
|
46
46
|
kwargs["timeout"] = self.timeout
|
|
47
|
-
return super().send(request, **kwargs)
|
|
47
|
+
return super().send(request, *args, **kwargs)
|
|
48
48
|
|
|
49
49
|
|
|
50
50
|
class IcebergProfilingConfig(ConfigModel):
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Any, Callable, Dict, Iterable,
|
|
2
|
+
from typing import Any, Callable, Dict, Iterable, Optional, cast
|
|
3
3
|
|
|
4
4
|
from pyiceberg.conversions import from_bytes
|
|
5
5
|
from pyiceberg.schema import Schema
|
|
@@ -12,6 +12,7 @@ from pyiceberg.types import (
|
|
|
12
12
|
IcebergType,
|
|
13
13
|
IntegerType,
|
|
14
14
|
LongType,
|
|
15
|
+
PrimitiveType,
|
|
15
16
|
TimestampType,
|
|
16
17
|
TimestamptzType,
|
|
17
18
|
TimeType,
|
|
@@ -22,10 +23,9 @@ from pyiceberg.utils.datetime import (
|
|
|
22
23
|
to_human_timestamp,
|
|
23
24
|
to_human_timestamptz,
|
|
24
25
|
)
|
|
26
|
+
from typing_extensions import TypeGuard
|
|
25
27
|
|
|
26
28
|
from datahub.emitter.mce_builder import get_sys_time
|
|
27
|
-
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
28
|
-
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
29
29
|
from datahub.ingestion.source.iceberg.iceberg_common import (
|
|
30
30
|
IcebergProfilingConfig,
|
|
31
31
|
IcebergSourceReport,
|
|
@@ -33,6 +33,7 @@ from datahub.ingestion.source.iceberg.iceberg_common import (
|
|
|
33
33
|
from datahub.metadata.schema_classes import (
|
|
34
34
|
DatasetFieldProfileClass,
|
|
35
35
|
DatasetProfileClass,
|
|
36
|
+
_Aspect,
|
|
36
37
|
)
|
|
37
38
|
from datahub.utilities.perf_timer import PerfTimer
|
|
38
39
|
|
|
@@ -66,7 +67,7 @@ class IcebergProfiler:
|
|
|
66
67
|
aggregated_values: Dict[int, Any],
|
|
67
68
|
manifest_values: Dict[int, bytes],
|
|
68
69
|
) -> None:
|
|
69
|
-
for field_id, value_encoded in manifest_values.items():
|
|
70
|
+
for field_id, value_encoded in manifest_values.items():
|
|
70
71
|
try:
|
|
71
72
|
field = schema.find_field(field_id)
|
|
72
73
|
except ValueError:
|
|
@@ -86,9 +87,8 @@ class IcebergProfiler:
|
|
|
86
87
|
def profile_table(
|
|
87
88
|
self,
|
|
88
89
|
dataset_name: str,
|
|
89
|
-
dataset_urn: str,
|
|
90
90
|
table: Table,
|
|
91
|
-
) -> Iterable[
|
|
91
|
+
) -> Iterable[_Aspect]:
|
|
92
92
|
"""This method will profile the supplied Iceberg table by looking at the table's manifest.
|
|
93
93
|
|
|
94
94
|
The overall profile of the table is aggregated from the individual manifest files.
|
|
@@ -167,11 +167,11 @@ class IcebergProfiler:
|
|
|
167
167
|
)
|
|
168
168
|
total_count += data_file.record_count
|
|
169
169
|
except Exception as e:
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
170
|
+
self.report.warning(
|
|
171
|
+
title="Error when profiling a table",
|
|
172
|
+
message="Skipping profiling of the table due to errors",
|
|
173
|
+
context=dataset_name,
|
|
174
|
+
exc=e,
|
|
175
175
|
)
|
|
176
176
|
if row_count:
|
|
177
177
|
# Iterating through fieldPaths introduces unwanted stats for list element fields...
|
|
@@ -211,14 +211,11 @@ class IcebergProfiler:
|
|
|
211
211
|
f"Finished profiling of dataset: {dataset_name} in {time_taken}"
|
|
212
212
|
)
|
|
213
213
|
|
|
214
|
-
yield
|
|
215
|
-
entityUrn=dataset_urn,
|
|
216
|
-
aspect=dataset_profile,
|
|
217
|
-
).as_workunit()
|
|
214
|
+
yield dataset_profile
|
|
218
215
|
|
|
219
216
|
def _render_value(
|
|
220
217
|
self, dataset_name: str, value_type: IcebergType, value: Any
|
|
221
|
-
) ->
|
|
218
|
+
) -> Optional[str]:
|
|
222
219
|
try:
|
|
223
220
|
if isinstance(value_type, TimestampType):
|
|
224
221
|
return to_human_timestamp(value)
|
|
@@ -230,14 +227,22 @@ class IcebergProfiler:
|
|
|
230
227
|
return to_human_time(value)
|
|
231
228
|
return str(value)
|
|
232
229
|
except Exception as e:
|
|
233
|
-
self.report.
|
|
234
|
-
"profiling",
|
|
235
|
-
|
|
230
|
+
self.report.warning(
|
|
231
|
+
title="Couldn't render value when profiling a table",
|
|
232
|
+
message="Encountered error, when trying to redner a value for table profile.",
|
|
233
|
+
context=str(
|
|
234
|
+
{
|
|
235
|
+
"value": value,
|
|
236
|
+
"value_type": value_type,
|
|
237
|
+
"dataset_name": dataset_name,
|
|
238
|
+
}
|
|
239
|
+
),
|
|
240
|
+
exc=e,
|
|
236
241
|
)
|
|
237
242
|
return None
|
|
238
243
|
|
|
239
244
|
@staticmethod
|
|
240
|
-
def _is_numeric_type(type: IcebergType) ->
|
|
245
|
+
def _is_numeric_type(type: IcebergType) -> TypeGuard[PrimitiveType]:
|
|
241
246
|
return isinstance(
|
|
242
247
|
type,
|
|
243
248
|
(
|
|
@@ -167,7 +167,7 @@ class AzureADSourceReport(StaleEntityRemovalSourceReport):
|
|
|
167
167
|
@config_class(AzureADConfig)
|
|
168
168
|
@support_status(SupportStatus.CERTIFIED)
|
|
169
169
|
@capability(
|
|
170
|
-
SourceCapability.DELETION_DETECTION, "
|
|
170
|
+
SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
|
|
171
171
|
)
|
|
172
172
|
class AzureADSource(StatefulIngestionSourceBase):
|
|
173
173
|
"""
|
|
@@ -41,7 +41,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
|
|
|
41
41
|
)
|
|
42
42
|
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
|
|
43
43
|
from datahub.metadata.schema_classes import (
|
|
44
|
-
ChangeTypeClass,
|
|
45
44
|
CorpGroupInfoClass,
|
|
46
45
|
CorpUserInfoClass,
|
|
47
46
|
GroupMembershipClass,
|
|
@@ -202,7 +201,7 @@ class OktaSourceReport(StaleEntityRemovalSourceReport):
|
|
|
202
201
|
@support_status(SupportStatus.CERTIFIED)
|
|
203
202
|
@capability(SourceCapability.DESCRIPTIONS, "Optionally enabled via configuration")
|
|
204
203
|
@capability(
|
|
205
|
-
SourceCapability.DELETION_DETECTION, "
|
|
204
|
+
SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
|
|
206
205
|
)
|
|
207
206
|
class OktaSource(StatefulIngestionSourceBase):
|
|
208
207
|
"""
|
|
@@ -332,18 +331,12 @@ class OktaSource(StatefulIngestionSourceBase):
|
|
|
332
331
|
yield MetadataWorkUnit(id=wu_id, mce=mce)
|
|
333
332
|
|
|
334
333
|
yield MetadataChangeProposalWrapper(
|
|
335
|
-
entityType="corpGroup",
|
|
336
334
|
entityUrn=datahub_corp_group_snapshot.urn,
|
|
337
|
-
changeType=ChangeTypeClass.UPSERT,
|
|
338
|
-
aspectName="origin",
|
|
339
335
|
aspect=OriginClass(OriginTypeClass.EXTERNAL, "OKTA"),
|
|
340
336
|
).as_workunit()
|
|
341
337
|
|
|
342
338
|
yield MetadataChangeProposalWrapper(
|
|
343
|
-
entityType="corpGroup",
|
|
344
339
|
entityUrn=datahub_corp_group_snapshot.urn,
|
|
345
|
-
changeType=ChangeTypeClass.UPSERT,
|
|
346
|
-
aspectName="status",
|
|
347
340
|
aspect=StatusClass(removed=False),
|
|
348
341
|
).as_workunit()
|
|
349
342
|
|
|
@@ -418,18 +411,12 @@ class OktaSource(StatefulIngestionSourceBase):
|
|
|
418
411
|
yield MetadataWorkUnit(id=wu_id, mce=mce)
|
|
419
412
|
|
|
420
413
|
yield MetadataChangeProposalWrapper(
|
|
421
|
-
entityType="corpuser",
|
|
422
414
|
entityUrn=datahub_corp_user_snapshot.urn,
|
|
423
|
-
changeType=ChangeTypeClass.UPSERT,
|
|
424
|
-
aspectName="origin",
|
|
425
415
|
aspect=OriginClass(OriginTypeClass.EXTERNAL, "OKTA"),
|
|
426
416
|
).as_workunit()
|
|
427
417
|
|
|
428
418
|
yield MetadataChangeProposalWrapper(
|
|
429
|
-
entityType="corpuser",
|
|
430
419
|
entityUrn=datahub_corp_user_snapshot.urn,
|
|
431
|
-
changeType=ChangeTypeClass.UPSERT,
|
|
432
|
-
aspectName="status",
|
|
433
420
|
aspect=StatusClass(removed=False),
|
|
434
421
|
).as_workunit()
|
|
435
422
|
|
|
@@ -7,7 +7,6 @@ from typing import Any, Dict, Iterable, List, Optional, Type, cast
|
|
|
7
7
|
import avro.schema
|
|
8
8
|
import confluent_kafka
|
|
9
9
|
import confluent_kafka.admin
|
|
10
|
-
import pydantic
|
|
11
10
|
from confluent_kafka.admin import (
|
|
12
11
|
AdminClient,
|
|
13
12
|
ConfigEntry,
|
|
@@ -16,13 +15,8 @@ from confluent_kafka.admin import (
|
|
|
16
15
|
)
|
|
17
16
|
from confluent_kafka.schema_registry.schema_registry_client import SchemaRegistryClient
|
|
18
17
|
|
|
19
|
-
from datahub.configuration.common import AllowDenyPattern
|
|
20
18
|
from datahub.configuration.kafka import KafkaConsumerConnectionConfig
|
|
21
19
|
from datahub.configuration.kafka_consumer_config import CallableConsumerConfig
|
|
22
|
-
from datahub.configuration.source_common import (
|
|
23
|
-
DatasetSourceConfigMixin,
|
|
24
|
-
LowerCaseDatasetUrnConfigMixin,
|
|
25
|
-
)
|
|
26
20
|
from datahub.emitter import mce_builder
|
|
27
21
|
from datahub.emitter.mce_builder import (
|
|
28
22
|
make_data_platform_urn,
|
|
@@ -50,16 +44,15 @@ from datahub.ingestion.api.source import (
|
|
|
50
44
|
)
|
|
51
45
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
52
46
|
from datahub.ingestion.source.common.subtypes import DatasetSubTypes
|
|
47
|
+
from datahub.ingestion.source.kafka.kafka_config import KafkaSourceConfig
|
|
53
48
|
from datahub.ingestion.source.kafka.kafka_schema_registry_base import (
|
|
54
49
|
KafkaSchemaRegistryBase,
|
|
55
50
|
)
|
|
56
51
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
57
52
|
StaleEntityRemovalHandler,
|
|
58
53
|
StaleEntityRemovalSourceReport,
|
|
59
|
-
StatefulStaleMetadataRemovalConfig,
|
|
60
54
|
)
|
|
61
55
|
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
62
|
-
StatefulIngestionConfigBase,
|
|
63
56
|
StatefulIngestionSourceBase,
|
|
64
57
|
)
|
|
65
58
|
from datahub.metadata.com.linkedin.pegasus2avro.common import Status
|
|
@@ -90,64 +83,6 @@ class KafkaTopicConfigKeys(StrEnum):
|
|
|
90
83
|
UNCLEAN_LEADER_ELECTION_CONFIG = "unclean.leader.election.enable"
|
|
91
84
|
|
|
92
85
|
|
|
93
|
-
class KafkaSourceConfig(
|
|
94
|
-
StatefulIngestionConfigBase,
|
|
95
|
-
DatasetSourceConfigMixin,
|
|
96
|
-
LowerCaseDatasetUrnConfigMixin,
|
|
97
|
-
):
|
|
98
|
-
connection: KafkaConsumerConnectionConfig = KafkaConsumerConnectionConfig()
|
|
99
|
-
|
|
100
|
-
topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["^_.*"])
|
|
101
|
-
domain: Dict[str, AllowDenyPattern] = pydantic.Field(
|
|
102
|
-
default={},
|
|
103
|
-
description="A map of domain names to allow deny patterns. Domains can be urn-based (`urn:li:domain:13ae4d85-d955-49fc-8474-9004c663a810`) or bare (`13ae4d85-d955-49fc-8474-9004c663a810`).",
|
|
104
|
-
)
|
|
105
|
-
topic_subject_map: Dict[str, str] = pydantic.Field(
|
|
106
|
-
default={},
|
|
107
|
-
description="Provides the mapping for the `key` and the `value` schemas of a topic to the corresponding schema registry subject name. Each entry of this map has the form `<topic_name>-key`:`<schema_registry_subject_name_for_key_schema>` and `<topic_name>-value`:`<schema_registry_subject_name_for_value_schema>` for the key and the value schemas associated with the topic, respectively. This parameter is mandatory when the [RecordNameStrategy](https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#how-the-naming-strategies-work) is used as the subject naming strategy in the kafka schema registry. NOTE: When provided, this overrides the default subject name resolution even when the `TopicNameStrategy` or the `TopicRecordNameStrategy` are used.",
|
|
108
|
-
)
|
|
109
|
-
stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
|
|
110
|
-
schema_registry_class: str = pydantic.Field(
|
|
111
|
-
default="datahub.ingestion.source.confluent_schema_registry.ConfluentSchemaRegistry",
|
|
112
|
-
description="The fully qualified implementation class(custom) that implements the KafkaSchemaRegistryBase interface.",
|
|
113
|
-
)
|
|
114
|
-
schema_tags_field: str = pydantic.Field(
|
|
115
|
-
default="tags",
|
|
116
|
-
description="The field name in the schema metadata that contains the tags to be added to the dataset.",
|
|
117
|
-
)
|
|
118
|
-
enable_meta_mapping: bool = pydantic.Field(
|
|
119
|
-
default=True,
|
|
120
|
-
description="When enabled, applies the mappings that are defined through the meta_mapping directives.",
|
|
121
|
-
)
|
|
122
|
-
meta_mapping: Dict = pydantic.Field(
|
|
123
|
-
default={},
|
|
124
|
-
description="mapping rules that will be executed against top-level schema properties. Refer to the section below on meta automated mappings.",
|
|
125
|
-
)
|
|
126
|
-
field_meta_mapping: Dict = pydantic.Field(
|
|
127
|
-
default={},
|
|
128
|
-
description="mapping rules that will be executed against field-level schema properties. Refer to the section below on meta automated mappings.",
|
|
129
|
-
)
|
|
130
|
-
strip_user_ids_from_email: bool = pydantic.Field(
|
|
131
|
-
default=False,
|
|
132
|
-
description="Whether or not to strip email id while adding owners using meta mappings.",
|
|
133
|
-
)
|
|
134
|
-
tag_prefix: str = pydantic.Field(
|
|
135
|
-
default="", description="Prefix added to tags during ingestion."
|
|
136
|
-
)
|
|
137
|
-
ignore_warnings_on_schema_type: bool = pydantic.Field(
|
|
138
|
-
default=False,
|
|
139
|
-
description="Disables warnings reported for non-AVRO/Protobuf value or key schemas if set.",
|
|
140
|
-
)
|
|
141
|
-
disable_topic_record_naming_strategy: bool = pydantic.Field(
|
|
142
|
-
default=False,
|
|
143
|
-
description="Disables the utilization of the TopicRecordNameStrategy for Schema Registry subjects. For more information, visit: https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#handling-differences-between-preregistered-and-client-derived-schemas:~:text=io.confluent.kafka.serializers.subject.TopicRecordNameStrategy",
|
|
144
|
-
)
|
|
145
|
-
ingest_schemas_as_entities: bool = pydantic.Field(
|
|
146
|
-
default=False,
|
|
147
|
-
description="Enables ingesting schemas from schema registry as separate entities, in addition to the topics",
|
|
148
|
-
)
|
|
149
|
-
|
|
150
|
-
|
|
151
86
|
def get_kafka_consumer(
|
|
152
87
|
connection: KafkaConsumerConnectionConfig,
|
|
153
88
|
) -> confluent_kafka.Consumer:
|
|
@@ -254,6 +189,22 @@ class KafkaConnectionTest:
|
|
|
254
189
|
SourceCapability.SCHEMA_METADATA,
|
|
255
190
|
"Schemas associated with each topic are extracted from the schema registry. Avro and Protobuf (certified), JSON (incubating). Schema references are supported.",
|
|
256
191
|
)
|
|
192
|
+
@capability(
|
|
193
|
+
SourceCapability.DATA_PROFILING,
|
|
194
|
+
"Not supported",
|
|
195
|
+
supported=False,
|
|
196
|
+
)
|
|
197
|
+
@capability(
|
|
198
|
+
SourceCapability.LINEAGE_COARSE,
|
|
199
|
+
"Not supported. If you use Kafka Connect, the kafka-connect source can generate lineage.",
|
|
200
|
+
supported=False,
|
|
201
|
+
)
|
|
202
|
+
@capability(
|
|
203
|
+
SourceCapability.LINEAGE_FINE,
|
|
204
|
+
"Not supported",
|
|
205
|
+
supported=False,
|
|
206
|
+
)
|
|
207
|
+
@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
|
|
257
208
|
class KafkaSource(StatefulIngestionSourceBase, TestableSource):
|
|
258
209
|
"""
|
|
259
210
|
This plugin extracts the following:
|
|
@@ -430,6 +381,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
430
381
|
|
|
431
382
|
# 4. Set dataset's description, tags, ownership, etc, if topic schema type is avro
|
|
432
383
|
description: Optional[str] = None
|
|
384
|
+
external_url: Optional[str] = None
|
|
433
385
|
if (
|
|
434
386
|
schema_metadata is not None
|
|
435
387
|
and isinstance(schema_metadata.platformSchema, KafkaSchemaClass)
|
|
@@ -481,8 +433,16 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
481
433
|
mce_builder.make_global_tag_aspect_with_tag_list(all_tags)
|
|
482
434
|
)
|
|
483
435
|
|
|
436
|
+
if self.source_config.external_url_base:
|
|
437
|
+
# Remove trailing slash from base URL if present
|
|
438
|
+
base_url = self.source_config.external_url_base.rstrip("/")
|
|
439
|
+
external_url = f"{base_url}/{dataset_name}"
|
|
440
|
+
|
|
484
441
|
dataset_properties = DatasetPropertiesClass(
|
|
485
|
-
name=dataset_name,
|
|
442
|
+
name=dataset_name,
|
|
443
|
+
customProperties=custom_props,
|
|
444
|
+
description=description,
|
|
445
|
+
externalUrl=external_url,
|
|
486
446
|
)
|
|
487
447
|
dataset_snapshot.aspects.append(dataset_properties)
|
|
488
448
|
|
|
@@ -568,10 +528,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
568
528
|
|
|
569
529
|
for config_key in KafkaTopicConfigKeys:
|
|
570
530
|
try:
|
|
571
|
-
if
|
|
572
|
-
config_key in topic_config.keys()
|
|
573
|
-
and topic_config[config_key] is not None
|
|
574
|
-
):
|
|
531
|
+
if config_key in topic_config and topic_config[config_key] is not None:
|
|
575
532
|
config_value = topic_config[config_key].value
|
|
576
533
|
custom_props[config_key] = (
|
|
577
534
|
config_value
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from typing import Dict, Optional
|
|
2
|
+
|
|
3
|
+
from pydantic import Field
|
|
4
|
+
|
|
5
|
+
from datahub.configuration.common import AllowDenyPattern
|
|
6
|
+
from datahub.configuration.kafka import KafkaConsumerConnectionConfig
|
|
7
|
+
from datahub.configuration.source_common import (
|
|
8
|
+
DatasetSourceConfigMixin,
|
|
9
|
+
LowerCaseDatasetUrnConfigMixin,
|
|
10
|
+
)
|
|
11
|
+
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
12
|
+
StatefulStaleMetadataRemovalConfig,
|
|
13
|
+
)
|
|
14
|
+
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
15
|
+
StatefulIngestionConfigBase,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class KafkaSourceConfig(
|
|
20
|
+
StatefulIngestionConfigBase,
|
|
21
|
+
DatasetSourceConfigMixin,
|
|
22
|
+
LowerCaseDatasetUrnConfigMixin,
|
|
23
|
+
):
|
|
24
|
+
connection: KafkaConsumerConnectionConfig = KafkaConsumerConnectionConfig()
|
|
25
|
+
|
|
26
|
+
topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["^_.*"])
|
|
27
|
+
domain: Dict[str, AllowDenyPattern] = Field(
|
|
28
|
+
default={},
|
|
29
|
+
description="A map of domain names to allow deny patterns. Domains can be urn-based (`urn:li:domain:13ae4d85-d955-49fc-8474-9004c663a810`) or bare (`13ae4d85-d955-49fc-8474-9004c663a810`).",
|
|
30
|
+
)
|
|
31
|
+
topic_subject_map: Dict[str, str] = Field(
|
|
32
|
+
default={},
|
|
33
|
+
description="Provides the mapping for the `key` and the `value` schemas of a topic to the corresponding schema registry subject name. Each entry of this map has the form `<topic_name>-key`:`<schema_registry_subject_name_for_key_schema>` and `<topic_name>-value`:`<schema_registry_subject_name_for_value_schema>` for the key and the value schemas associated with the topic, respectively. This parameter is mandatory when the [RecordNameStrategy](https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#how-the-naming-strategies-work) is used as the subject naming strategy in the kafka schema registry. NOTE: When provided, this overrides the default subject name resolution even when the `TopicNameStrategy` or the `TopicRecordNameStrategy` are used.",
|
|
34
|
+
)
|
|
35
|
+
stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
|
|
36
|
+
schema_registry_class: str = Field(
|
|
37
|
+
default="datahub.ingestion.source.confluent_schema_registry.ConfluentSchemaRegistry",
|
|
38
|
+
description="The fully qualified implementation class(custom) that implements the KafkaSchemaRegistryBase interface.",
|
|
39
|
+
)
|
|
40
|
+
schema_tags_field: str = Field(
|
|
41
|
+
default="tags",
|
|
42
|
+
description="The field name in the schema metadata that contains the tags to be added to the dataset.",
|
|
43
|
+
)
|
|
44
|
+
enable_meta_mapping: bool = Field(
|
|
45
|
+
default=True,
|
|
46
|
+
description="When enabled, applies the mappings that are defined through the meta_mapping directives.",
|
|
47
|
+
)
|
|
48
|
+
meta_mapping: Dict = Field(
|
|
49
|
+
default={},
|
|
50
|
+
description="mapping rules that will be executed against top-level schema properties. Refer to the section below on meta automated mappings.",
|
|
51
|
+
)
|
|
52
|
+
field_meta_mapping: Dict = Field(
|
|
53
|
+
default={},
|
|
54
|
+
description="mapping rules that will be executed against field-level schema properties. Refer to the section below on meta automated mappings.",
|
|
55
|
+
)
|
|
56
|
+
strip_user_ids_from_email: bool = Field(
|
|
57
|
+
default=False,
|
|
58
|
+
description="Whether or not to strip email id while adding owners using meta mappings.",
|
|
59
|
+
)
|
|
60
|
+
tag_prefix: str = Field(
|
|
61
|
+
default="", description="Prefix added to tags during ingestion."
|
|
62
|
+
)
|
|
63
|
+
ignore_warnings_on_schema_type: bool = Field(
|
|
64
|
+
default=False,
|
|
65
|
+
description="Disables warnings reported for non-AVRO/Protobuf value or key schemas if set.",
|
|
66
|
+
)
|
|
67
|
+
disable_topic_record_naming_strategy: bool = Field(
|
|
68
|
+
default=False,
|
|
69
|
+
description="Disables the utilization of the TopicRecordNameStrategy for Schema Registry subjects. For more information, visit: https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#handling-differences-between-preregistered-and-client-derived-schemas:~:text=io.confluent.kafka.serializers.subject.TopicRecordNameStrategy",
|
|
70
|
+
)
|
|
71
|
+
ingest_schemas_as_entities: bool = Field(
|
|
72
|
+
default=False,
|
|
73
|
+
description="Enables ingesting schemas from schema registry as separate entities, in addition to the topics",
|
|
74
|
+
)
|
|
75
|
+
external_url_base: Optional[str] = Field(
|
|
76
|
+
default=None,
|
|
77
|
+
description="Base URL for external platform (e.g. Aiven) where topics can be viewed. The topic name will be appended to this base URL.",
|
|
78
|
+
)
|
|
@@ -4,7 +4,7 @@ from typing import Dict, Iterable, List, Optional
|
|
|
4
4
|
|
|
5
5
|
from pydantic.fields import Field
|
|
6
6
|
|
|
7
|
-
from datahub.configuration.common import AllowDenyPattern, ConfigModel
|
|
7
|
+
from datahub.configuration.common import AllowDenyPattern, ConfigModel, LaxStr
|
|
8
8
|
from datahub.configuration.source_common import (
|
|
9
9
|
DatasetLineageProviderConfigBase,
|
|
10
10
|
PlatformInstanceConfigMixin,
|
|
@@ -29,7 +29,7 @@ CONNECTOR_CLASS = "connector.class"
|
|
|
29
29
|
class ProvidedConfig(ConfigModel):
|
|
30
30
|
provider: str
|
|
31
31
|
path_key: str
|
|
32
|
-
value:
|
|
32
|
+
value: LaxStr
|
|
33
33
|
|
|
34
34
|
|
|
35
35
|
class GenericConnectorConfig(ConfigModel):
|