acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
from datetime import datetime, timedelta
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
|
|
1
5
|
class DremioSQLQueries:
|
|
2
6
|
QUERY_DATASETS_CE = """
|
|
3
7
|
SELECT* FROM
|
|
@@ -235,28 +239,83 @@ class DremioSQLQueries:
|
|
|
235
239
|
TABLE_NAME ASC
|
|
236
240
|
"""
|
|
237
241
|
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
242
|
+
@staticmethod
|
|
243
|
+
def _get_default_start_timestamp_millis() -> str:
|
|
244
|
+
"""Get default start timestamp (1 day ago) in milliseconds precision format"""
|
|
245
|
+
one_day_ago = datetime.now() - timedelta(days=1)
|
|
246
|
+
return one_day_ago.strftime("%Y-%m-%d %H:%M:%S.%f")[
|
|
247
|
+
:-3
|
|
248
|
+
] # Truncate to milliseconds
|
|
249
|
+
|
|
250
|
+
@staticmethod
|
|
251
|
+
def _get_default_end_timestamp_millis() -> str:
|
|
252
|
+
"""Get default end timestamp (now) in milliseconds precision format"""
|
|
253
|
+
now = datetime.now()
|
|
254
|
+
return now.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] # Truncate to milliseconds
|
|
255
|
+
|
|
256
|
+
@staticmethod
|
|
257
|
+
def get_query_all_jobs(
|
|
258
|
+
start_timestamp_millis: Optional[str] = None,
|
|
259
|
+
end_timestamp_millis: Optional[str] = None,
|
|
260
|
+
) -> str:
|
|
261
|
+
"""
|
|
262
|
+
Get query for all jobs with optional time filtering.
|
|
263
|
+
|
|
264
|
+
Args:
|
|
265
|
+
start_timestamp_millis: Start timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to 1 day ago)
|
|
266
|
+
end_timestamp_millis: End timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to now)
|
|
267
|
+
|
|
268
|
+
Returns:
|
|
269
|
+
SQL query string with time filtering applied
|
|
270
|
+
"""
|
|
271
|
+
if start_timestamp_millis is None:
|
|
272
|
+
start_timestamp_millis = (
|
|
273
|
+
DremioSQLQueries._get_default_start_timestamp_millis()
|
|
274
|
+
)
|
|
275
|
+
if end_timestamp_millis is None:
|
|
276
|
+
end_timestamp_millis = DremioSQLQueries._get_default_end_timestamp_millis()
|
|
277
|
+
|
|
278
|
+
return f"""
|
|
279
|
+
SELECT
|
|
280
|
+
job_id,
|
|
281
|
+
user_name,
|
|
282
|
+
submitted_ts,
|
|
283
|
+
query,
|
|
284
|
+
queried_datasets
|
|
285
|
+
FROM
|
|
286
|
+
SYS.JOBS_RECENT
|
|
287
|
+
WHERE
|
|
288
|
+
STATUS = 'COMPLETED'
|
|
289
|
+
AND LENGTH(queried_datasets)>0
|
|
290
|
+
AND user_name != '$dremio$'
|
|
291
|
+
AND query_type not like '%INTERNAL%'
|
|
292
|
+
AND submitted_ts >= TIMESTAMP '{start_timestamp_millis}'
|
|
293
|
+
AND submitted_ts <= TIMESTAMP '{end_timestamp_millis}'
|
|
294
|
+
"""
|
|
295
|
+
|
|
296
|
+
@staticmethod
|
|
297
|
+
def get_query_all_jobs_cloud(
|
|
298
|
+
start_timestamp_millis: Optional[str] = None,
|
|
299
|
+
end_timestamp_millis: Optional[str] = None,
|
|
300
|
+
) -> str:
|
|
301
|
+
"""
|
|
302
|
+
Get query for all jobs in Dremio Cloud with optional time filtering.
|
|
303
|
+
|
|
304
|
+
Args:
|
|
305
|
+
start_timestamp_millis: Start timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to 7 days ago)
|
|
306
|
+
end_timestamp_millis: End timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to now)
|
|
307
|
+
|
|
308
|
+
Returns:
|
|
309
|
+
SQL query string with time filtering applied
|
|
310
|
+
"""
|
|
311
|
+
if start_timestamp_millis is None:
|
|
312
|
+
start_timestamp_millis = (
|
|
313
|
+
DremioSQLQueries._get_default_start_timestamp_millis()
|
|
314
|
+
)
|
|
315
|
+
if end_timestamp_millis is None:
|
|
316
|
+
end_timestamp_millis = DremioSQLQueries._get_default_end_timestamp_millis()
|
|
256
317
|
|
|
257
|
-
|
|
258
|
-
# queried_datasets correctly documented as [varchar]
|
|
259
|
-
QUERY_ALL_JOBS_CLOUD = """
|
|
318
|
+
return f"""
|
|
260
319
|
SELECT
|
|
261
320
|
job_id,
|
|
262
321
|
user_name,
|
|
@@ -270,6 +329,8 @@ class DremioSQLQueries:
|
|
|
270
329
|
AND ARRAY_SIZE(queried_datasets)>0
|
|
271
330
|
AND user_name != '$dremio$'
|
|
272
331
|
AND query_type not like '%INTERNAL%'
|
|
332
|
+
AND submitted_ts >= TIMESTAMP '{start_timestamp_millis}'
|
|
333
|
+
AND submitted_ts <= TIMESTAMP '{end_timestamp_millis}'
|
|
273
334
|
"""
|
|
274
335
|
|
|
275
336
|
QUERY_TYPES = [
|
|
@@ -12,7 +12,7 @@ from typing import (
|
|
|
12
12
|
Union,
|
|
13
13
|
)
|
|
14
14
|
|
|
15
|
-
from pydantic.fields import Field
|
|
15
|
+
from pydantic import Field, PositiveInt
|
|
16
16
|
|
|
17
17
|
from datahub.configuration.common import AllowDenyPattern
|
|
18
18
|
from datahub.configuration.source_common import DatasetSourceConfigMixin
|
|
@@ -73,7 +73,6 @@ from datahub.utilities.registries.domain_registry import DomainRegistry
|
|
|
73
73
|
|
|
74
74
|
MAX_ITEMS_TO_RETRIEVE = 100
|
|
75
75
|
PAGE_SIZE = 100
|
|
76
|
-
MAX_SCHEMA_SIZE = 300
|
|
77
76
|
MAX_PRIMARY_KEYS_SIZE = 100
|
|
78
77
|
FIELD_DELIMITER = "."
|
|
79
78
|
|
|
@@ -107,6 +106,10 @@ class DynamoDBConfig(
|
|
|
107
106
|
'Refer "Advanced Configurations" section for more details',
|
|
108
107
|
)
|
|
109
108
|
|
|
109
|
+
max_schema_size: PositiveInt = Field(
|
|
110
|
+
default=300, description="Maximum number of fields to include in the schema."
|
|
111
|
+
)
|
|
112
|
+
|
|
110
113
|
table_pattern: AllowDenyPattern = Field(
|
|
111
114
|
default=AllowDenyPattern.allow_all(),
|
|
112
115
|
description="Regex patterns for tables to filter in ingestion. The table name format is 'region.table'",
|
|
@@ -160,7 +163,7 @@ _attribute_type_to_field_type_mapping: Dict[str, Type] = {
|
|
|
160
163
|
|
|
161
164
|
@platform_name("DynamoDB", id="dynamodb")
|
|
162
165
|
@config_class(DynamoDBConfig)
|
|
163
|
-
@support_status(SupportStatus.
|
|
166
|
+
@support_status(SupportStatus.INCUBATING)
|
|
164
167
|
@capability(
|
|
165
168
|
SourceCapability.PLATFORM_INSTANCE,
|
|
166
169
|
"By default, platform_instance will use the AWS account id",
|
|
@@ -362,7 +365,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
|
|
|
362
365
|
if self.config.include_table_item is None:
|
|
363
366
|
return
|
|
364
367
|
dataset_name = f"{region}.{table_name}"
|
|
365
|
-
if dataset_name not in self.config.include_table_item
|
|
368
|
+
if dataset_name not in self.config.include_table_item:
|
|
366
369
|
return
|
|
367
370
|
primary_key_list = self.config.include_table_item.get(dataset_name)
|
|
368
371
|
assert isinstance(primary_key_list, List)
|
|
@@ -455,51 +458,53 @@ class DynamoDBSource(StatefulIngestionSourceBase):
|
|
|
455
458
|
) -> SchemaMetadataClass:
|
|
456
459
|
""" "
|
|
457
460
|
To construct the schema metadata, it will first sort the schema by the occurrence of attribute names
|
|
458
|
-
in descending order and truncate the schema by MAX_SCHEMA_SIZE, and then start to construct the
|
|
461
|
+
in descending order and truncate the schema by max_schema_size, and then start to construct the
|
|
459
462
|
schema metadata sorted by attribute name
|
|
460
463
|
"""
|
|
461
464
|
|
|
462
465
|
canonical_schema: List[SchemaField] = []
|
|
463
466
|
schema_size = len(schema.values())
|
|
464
467
|
table_fields = list(schema.values())
|
|
465
|
-
if schema_size > MAX_SCHEMA_SIZE:
|
|
468
|
+
if schema_size > self.config.max_schema_size:
|
|
466
469
|
# downsample the schema, using frequency as the sort key
|
|
467
470
|
self.report.report_warning(
|
|
468
471
|
title="Schema Size Too Large",
|
|
469
|
-
message=f"Downsampling the table schema because MAX_SCHEMA_SIZE threshold is {MAX_SCHEMA_SIZE}",
|
|
472
|
+
message=f"Downsampling the table schema because `max_schema_size` threshold is {self.config.max_schema_size}",
|
|
470
473
|
context=f"Collection: {dataset_urn}",
|
|
471
474
|
)
|
|
472
475
|
|
|
473
476
|
# Add this information to the custom properties so user can know they are looking at down sampled schema
|
|
474
477
|
dataset_properties.customProperties["schema.downsampled"] = "True"
|
|
475
478
|
dataset_properties.customProperties["schema.totalFields"] = f"{schema_size}"
|
|
476
|
-
# append each schema field, schema will be sorted by count descending and delimited_name ascending and sliced to only include MAX_SCHEMA_SIZE items
|
|
479
|
+
# append each schema field, schema will be sorted by count descending and delimited_name ascending and sliced to only include max_schema_size items
|
|
480
|
+
primary_keys = []
|
|
477
481
|
for schema_field in sorted(
|
|
478
482
|
table_fields,
|
|
479
483
|
key=lambda x: (
|
|
480
484
|
-x["count"],
|
|
481
485
|
x["delimited_name"],
|
|
482
486
|
), # Negate `count` for descending order, `delimited_name` stays the same for ascending
|
|
483
|
-
)[0:MAX_SCHEMA_SIZE]:
|
|
487
|
+
)[: self.config.max_schema_size]:
|
|
484
488
|
field_path = schema_field["delimited_name"]
|
|
485
489
|
native_data_type = self.get_native_type(schema_field["type"], table_name)
|
|
486
490
|
type = self.get_field_type(schema_field["type"], table_name)
|
|
487
|
-
description = None
|
|
488
491
|
nullable = True
|
|
489
492
|
if field_path in primary_key_dict:
|
|
490
|
-
|
|
493
|
+
# primary key should not be nullable
|
|
494
|
+
type_key = (
|
|
491
495
|
"Partition Key"
|
|
492
496
|
if primary_key_dict.get(field_path) == "HASH"
|
|
493
497
|
else "Sort Key"
|
|
494
498
|
)
|
|
495
|
-
|
|
499
|
+
dataset_properties.customProperties[type_key] = field_path
|
|
496
500
|
nullable = False
|
|
501
|
+
primary_keys.append(field_path)
|
|
497
502
|
|
|
498
503
|
field = SchemaField(
|
|
499
504
|
fieldPath=field_path,
|
|
500
505
|
nativeDataType=native_data_type,
|
|
501
506
|
type=type,
|
|
502
|
-
description=description,
|
|
507
|
+
description=None,
|
|
503
508
|
nullable=nullable,
|
|
504
509
|
recursive=False,
|
|
505
510
|
)
|
|
@@ -513,6 +518,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
|
|
|
513
518
|
hash="",
|
|
514
519
|
platformSchema=SchemalessClass(),
|
|
515
520
|
fields=canonical_schema,
|
|
521
|
+
primaryKeys=primary_keys,
|
|
516
522
|
)
|
|
517
523
|
return schema_metadata
|
|
518
524
|
|
|
File without changes
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
from typing import List, Optional, Union
|
|
2
|
+
|
|
3
|
+
from pydantic.fields import Field
|
|
4
|
+
|
|
5
|
+
from datahub.configuration.common import AllowDenyPattern
|
|
6
|
+
from datahub.configuration.source_common import DatasetSourceConfigMixin
|
|
7
|
+
from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
|
|
8
|
+
from datahub.ingestion.source.azure.azure_common import AzureConnectionConfig
|
|
9
|
+
from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
|
|
10
|
+
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
11
|
+
StatefulStaleMetadataRemovalConfig,
|
|
12
|
+
)
|
|
13
|
+
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
14
|
+
StatefulIngestionConfigBase,
|
|
15
|
+
)
|
|
16
|
+
from datahub.ingestion.source_config.operation_config import is_profiling_enabled
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ExcelSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
|
|
20
|
+
path_list: List[str] = Field(
|
|
21
|
+
description="List of paths to Excel files or folders to ingest."
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
path_pattern: AllowDenyPattern = Field(
|
|
25
|
+
default=AllowDenyPattern.allow_all(),
|
|
26
|
+
description="Regex patterns for file paths to filter in ingestion.",
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
aws_config: Optional[AwsConnectionConfig] = Field(
|
|
30
|
+
default=None, description="AWS configuration"
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
use_s3_bucket_tags: Optional[bool] = Field(
|
|
34
|
+
default=False,
|
|
35
|
+
description="Whether or not to create tags in datahub from the s3 bucket",
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
use_s3_object_tags: Optional[bool] = Field(
|
|
39
|
+
default=False,
|
|
40
|
+
description="Whether or not to create tags in datahub from the s3 object",
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
verify_ssl: Union[bool, str] = Field(
|
|
44
|
+
default=True,
|
|
45
|
+
description="Either a boolean, in which case it controls whether we verify the server's TLS certificate, or a string, in which case it must be a path to a CA bundle to use.",
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
azure_config: Optional[AzureConnectionConfig] = Field(
|
|
49
|
+
default=None, description="Azure configuration"
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
use_abs_blob_tags: Optional[bool] = Field(
|
|
53
|
+
default=False,
|
|
54
|
+
description="Whether to create tags in datahub from the abs blob tags",
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
convert_urns_to_lowercase: bool = Field(
|
|
58
|
+
default=False,
|
|
59
|
+
description="Enable to convert the Excel asset urns to lowercase",
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
active_sheet_only: bool = Field(
|
|
63
|
+
default=False,
|
|
64
|
+
description="Enable to only ingest the active sheet of the workbook. If not set, all sheets will be ingested.",
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
worksheet_pattern: AllowDenyPattern = Field(
|
|
68
|
+
default=AllowDenyPattern.allow_all(),
|
|
69
|
+
description="Regex patterns for worksheets to ingest. Worksheets are specified as 'filename_without_extension.worksheet_name'. "
|
|
70
|
+
"For example to allow the worksheet Sheet1 from file report.xlsx, use the pattern: 'report.Sheet1'.",
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
profile_pattern: AllowDenyPattern = Field(
|
|
74
|
+
default=AllowDenyPattern.allow_all(),
|
|
75
|
+
description="Regex patterns for worksheets to profile. Worksheets are specified as 'filename_without_extension.worksheet_name'. "
|
|
76
|
+
"For example to allow the worksheet Sheet1 from file report.xlsx, use the pattern: 'report.Sheet1'.",
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
profiling: GEProfilingConfig = Field(
|
|
80
|
+
default=GEProfilingConfig(),
|
|
81
|
+
description="Configuration for profiling",
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
|
|
85
|
+
default=None,
|
|
86
|
+
description="Configuration for stateful ingestion and stale metadata removal.",
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
def is_profiling_enabled(self) -> bool:
|
|
90
|
+
return self.profiling.enabled and is_profiling_enabled(
|
|
91
|
+
self.profiling.operation_config
|
|
92
|
+
)
|