acryl_datahub-1.0.0rc18-py3-none-any.whl → acryl_datahub-1.3.0.1rc9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/source/iceberg/iceberg.py
+++ b/datahub/ingestion/source/iceberg/iceberg.py
@@ -2,6 +2,7 @@ import json
 import logging
 import threading
 import uuid
+from functools import partial
 from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 from dateutil import parser as dateutil_parser
@@ -11,11 +12,11 @@ from pyiceberg.exceptions import (
     NoSuchNamespaceError,
     NoSuchPropertyException,
     NoSuchTableError,
-    ServerError,
+    RESTError,
 )
 from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
 from pyiceberg.table import Table
-from pyiceberg.typedef import Identifier
+from pyiceberg.typedef import Identifier, Properties
 from pyiceberg.types import (
     BinaryType,
     BooleanType,
@@ -38,6 +39,7 @@ from pyiceberg.types import (
 )
 
 from datahub.emitter.mce_builder import (
+    make_container_urn,
     make_data_platform_urn,
     make_dataplatform_instance_urn,
     make_dataset_urn_with_platform_instance,
@@ -45,6 +47,13 @@ from datahub.emitter.mce_builder import (
     make_user_urn,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import NamespaceKey
+from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
+    auto_patch_last_modified,
+)
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -55,8 +64,20 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source_helpers import (
+    AutoSystemMetadata,
+    auto_fix_duplicate_schema_field_paths,
+    auto_fix_empty_field_paths,
+    auto_lowercase_urns,
+    auto_materialize_referenced_tags_terms,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
+from datahub.ingestion.source.common.subtypes import (
+    DatasetContainerSubTypes,
+    DatasetSubTypes,
+)
 from datahub.ingestion.source.iceberg.iceberg_common import (
     IcebergSourceConfig,
     IcebergSourceReport,
@@ -68,21 +89,24 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.common import Status
-from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
+from datahub.metadata.com.linkedin.pegasus2avro.common import Status, SubTypes
+from datahub.metadata.com.linkedin.pegasus2avro.container import ContainerProperties
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     OtherSchema,
     SchemaField,
     SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
+    BrowsePathEntryClass,
+    BrowsePathsV2Class,
+    ContainerClass,
     DataPlatformInstanceClass,
     DatasetPropertiesClass,
     OwnerClass,
     OwnershipClass,
     OwnershipTypeClass,
     TimeStampClass,
+    _Aspect,
 )
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
@@ -94,7 +118,7 @@ logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
 
 
 @platform_name("Iceberg")
-@support_status(SupportStatus.
+@support_status(SupportStatus.INCUBATING)
 @config_class(IcebergSourceConfig)
 @capability(
     SourceCapability.PLATFORM_INSTANCE,
@@ -110,7 +134,9 @@ logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
     SourceCapability.OWNERSHIP,
     "Automatically ingests ownership information from table properties based on `user_ownership_property` and `group_ownership_property`",
 )
-@capability(
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 class IcebergSource(StatefulIngestionSourceBase):
     """
     ## Integration Details
@@ -121,11 +147,17 @@ class IcebergSource(StatefulIngestionSourceBase):
     [pyiceberg library](https://py.iceberg.apache.org/).
     """
 
+    platform: str = "iceberg"
+
     def __init__(self, config: IcebergSourceConfig, ctx: PipelineContext) -> None:
         super().__init__(config, ctx)
-        self.platform: str = "iceberg"
         self.report: IcebergSourceReport = IcebergSourceReport()
         self.config: IcebergSourceConfig = config
+        self.ctx: PipelineContext = ctx
+        self.stamping_processor = AutoSystemMetadata(
+            self.ctx
+        )  # single instance used only when processing namespaces
+        self.namespaces: List[Tuple[Identifier, str]] = []
 
     @classmethod
     def create(cls, config_dict: Dict, ctx: PipelineContext) -> "IcebergSource":
@@ -133,20 +165,58 @@ class IcebergSource(StatefulIngestionSourceBase):
         return cls(config, ctx)
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        # This source needs to overwrite standard `get_workunit_processor`, because it is unique in terms of usage
+        # of parallelism. Because of this, 2 processors won't work as expected:
+        # 1. browse_path_processor - it needs aspects for a single entity to be continuous - which is not guaranteed
+        #    in this source
+        # 2. automatic stamping with systemMetadata - in current implementation of the Source class this processor
+        #    would have been applied in a thread (single) shared between the source, processors and transformers.
+        #    Since the metadata scraping happens in separate threads, this could lead to difference between
+        #    time used by systemMetadata and actual time at which metadata was read
+        auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None
+        if (
+            self.ctx.pipeline_config
+            and self.ctx.pipeline_config.source
+            and self.ctx.pipeline_config.source.config
+            and (
+                (
+                    hasattr(
+                        self.ctx.pipeline_config.source.config,
+                        "convert_urns_to_lowercase",
+                    )
+                    and self.ctx.pipeline_config.source.config.convert_urns_to_lowercase
+                )
+                or (
+                    hasattr(self.ctx.pipeline_config.source.config, "get")
+                    and self.ctx.pipeline_config.source.config.get(
+                        "convert_urns_to_lowercase"
+                    )
+                )
+            )
+        ):
+            auto_lowercase_dataset_urns = auto_lowercase_urns
+
         return [
-
+            auto_lowercase_dataset_urns,
+            auto_materialize_referenced_tags_terms,
+            partial(
+                auto_fix_duplicate_schema_field_paths, platform=self.infer_platform()
+            ),
+            partial(auto_fix_empty_field_paths, platform=self.infer_platform()),
+            partial(auto_workunit_reporter, self.get_report()),
+            auto_patch_last_modified,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
            ).workunit_processor,
        ]

-    def
+    def _get_namespaces(self, catalog: Catalog) -> Iterable[Identifier]:
         namespaces = catalog.list_namespaces()
         LOGGER.debug(
             f"Retrieved {len(namespaces)} namespaces, first 10: {namespaces[:10]}"
         )
         self.report.report_no_listed_namespaces(len(namespaces))
-        tables_count = 0
         for namespace in namespaces:
             namespace_repr = ".".join(namespace)
             if not self.config.namespace_pattern.allowed(namespace_repr):
@@ -155,6 +225,14 @@ class IcebergSource(StatefulIngestionSourceBase):
                 )
                 self.report.report_dropped(f"{namespace_repr}.*")
                 continue
+            yield namespace
+
+    def _get_datasets(
+        self, catalog: Catalog, namespaces: Iterable[Tuple[Identifier, str]]
+    ) -> Iterable[Tuple[Identifier, str]]:
+        LOGGER.debug("Starting to retrieve tables")
+        tables_count = 0
+        for namespace, namespace_urn in namespaces:
             try:
                 tables = catalog.list_tables(namespace)
                 tables_count += len(tables)
@@ -164,29 +242,34 @@
                 self.report.report_listed_tables_for_namespace(
                     ".".join(namespace), len(tables)
                 )
-                yield from tables
-            except NoSuchNamespaceError:
-                self.report.
-                    "
-
+                yield from [(table, namespace_urn) for table in tables]
+            except NoSuchNamespaceError as e:
+                self.report.warning(
+                    title="No such namespace",
+                    message="Skipping the missing namespace.",
+                    context=str(namespace),
+                    exc=e,
                 )
-
-
+            except RESTError as e:
+                self.report.warning(
+                    title="Iceberg REST Server Error",
+                    message="Iceberg REST Server returned error status when trying to list tables for a namespace, skipping it.",
+                    context=str(namespace),
+                    exc=e,
                 )
             except Exception as e:
                 self.report.report_failure(
-                    "
-
-
-
-                    f"Unexpected exception while trying to get list of tables for namespace {namespace}, skipping it"
+                    title="Error when processing a namespace",
+                    message="Skipping the namespace due to errors while processing it.",
+                    context=str(namespace),
+                    exc=e,
                 )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         thread_local = threading.local()
 
         def _try_processing_dataset(
-            dataset_path: Tuple[str, ...], dataset_name: str
+            dataset_path: Tuple[str, ...], dataset_name: str, namespace_urn: str
         ) -> Iterable[MetadataWorkUnit]:
             try:
                 if not hasattr(thread_local, "local_catalog"):
@@ -195,6 +278,12 @@
                     )
                     thread_local.local_catalog = self.config.get_catalog()
 
+                if not hasattr(thread_local, "stamping_processor"):
+                    LOGGER.debug(
+                        f"Didn't find stamping_processor in thread_local ({thread_local}), initializing new workunit processor"
+                    )
+                    thread_local.stamping_processor = AutoSystemMetadata(self.ctx)
+
                 with PerfTimer() as timer:
                     table = thread_local.local_catalog.load_table(dataset_path)
                     time_taken = timer.elapsed_seconds()
@@ -202,56 +291,68 @@
                     time_taken, dataset_name, table.metadata_location
                 )
                 LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}")
-
-
-
-
-
+                dataset_urn: str = make_dataset_urn_with_platform_instance(
+                    self.platform,
+                    dataset_name,
+                    self.config.platform_instance,
+                    self.config.env,
                 )
-
-
+                for aspect in self._create_iceberg_table_aspects(
+                    dataset_name, table, namespace_urn
+                ):
+                    yield thread_local.stamping_processor.stamp_wu(
+                        MetadataChangeProposalWrapper(
+                            entityUrn=dataset_urn, aspect=aspect
+                        ).as_workunit()
+                    )
+            except NoSuchPropertyException as e:
+                self.report.warning(
+                    title="Unable to process table",
+                    message="Table was not processed due to expected property missing (table is probably not an iceberg table).",
+                    context=dataset_name,
+                    exc=e,
                 )
             except NoSuchIcebergTableError as e:
-                self.report.
-                    "
-
-
-
-                    f"NoSuchIcebergTableError while processing table {dataset_path}, skipping it.",
+                self.report.warning(
+                    title="Skipped non-iceberg table",
+                    message="Table was recognized as non-iceberg and skipped.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except NoSuchTableError as e:
-                self.report.
-                    "
-
-
-
-                    f"NoSuchTableError while processing table {dataset_path}, skipping it.",
+                self.report.warning(
+                    title="Table not found",
+                    message="Table was returned by the catalog in the list of table but catalog can't find its details, table was skipped.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except FileNotFoundError as e:
-                self.report.
-                    "file
-
-
-
-                    f"FileNotFoundError while processing table {dataset_path}, skipping it."
-                )
-            except ServerError as e:
-                self.report.report_warning(
-                    "iceberg-rest-server-error",
-                    f"Iceberg Rest Catalog returned 500 status due to an unhandled exception for {dataset_name}. Exception: {e}",
+                self.report.warning(
+                    title="Manifest file not found",
+                    message="Couldn't find manifest file to read for the table, skipping it.",
+                    context=dataset_name,
+                    exc=e,
                 )
-
-
+            except RESTError as e:
+                self.report.warning(
+                    title="Iceberg REST Server Error",
+                    message="Iceberg REST Server returned error status when trying to process a table, skipping it.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except ValueError as e:
                 if "Could not initialize FileIO" not in str(e):
                     raise
                 self.report.warning(
-                    "Could not initialize FileIO",
-
+                    title="Could not initialize FileIO",
+                    message="Could not initialize FileIO for a table (are you using custom FileIO?). Skipping the table.",
+                    context=dataset_name,
+                    exc=e,
                 )
 
-        def _process_dataset(
+        def _process_dataset(
+            dataset_path: Identifier, namespace_urn: str
+        ) -> Iterable[MetadataWorkUnit]:
             try:
                 LOGGER.debug(f"Processing dataset for path {dataset_path}")
                 dataset_name = ".".join(dataset_path)
@@ -263,106 +364,153 @@
                     )
                     return
 
-            yield from _try_processing_dataset(
+                yield from _try_processing_dataset(
+                    dataset_path, dataset_name, namespace_urn
+                )
             except Exception as e:
                 self.report.report_failure(
-                    "
-
-
-
-                    f"Exception while processing table {dataset_path}, skipping it.",
+                    title="Error when processing a table",
+                    message="Skipping the table due to errors when processing it.",
+                    context=str(dataset_path),
+                    exc=e,
                 )
 
         try:
-            catalog = self.config.get_catalog()
+            self.catalog = self.config.get_catalog()
+        except Exception as e:
+            self.report.report_failure(
+                title="Failed to initialize catalog object",
+                message="Couldn't start the ingestion due to failure to initialize catalog object.",
+                exc=e,
+            )
+            return
+
+        try:
+            yield from self._process_namespaces()
         except Exception as e:
-            self.report.report_failure(
+            self.report.report_failure(
+                title="Failed to list namespaces",
+                message="Couldn't start the ingestion due to a failure to process the list of the namespaces",
+                exc=e,
+            )
             return
 
         for wu in ThreadedIteratorExecutor.process(
             worker_func=_process_dataset,
-            args_list=[
+            args_list=[
+                (dataset_path, namespace_urn)
+                for dataset_path, namespace_urn in self._get_datasets(
+                    self.catalog, self.namespaces
+                )
+            ],
             max_workers=self.config.processing_threads,
         ):
             yield wu
 
-    def
-        self,
+    def _try_processing_namespace(
+        self, namespace: Identifier
     ) -> Iterable[MetadataWorkUnit]:
+        namespace_repr = ".".join(namespace)
+        try:
+            LOGGER.debug(f"Processing namespace {namespace_repr}")
+            namespace_urn = make_container_urn(
+                NamespaceKey(
+                    namespace=namespace_repr,
+                    platform=self.platform,
+                    instance=self.config.platform_instance,
+                    env=self.config.env,
+                )
+            )
+
+            namespace_properties: Properties = self.catalog.load_namespace_properties(
+                namespace
+            )
+            for aspect in self._create_iceberg_namespace_aspects(
+                namespace, namespace_properties
+            ):
+                yield self.stamping_processor.stamp_wu(
+                    MetadataChangeProposalWrapper(
+                        entityUrn=namespace_urn, aspect=aspect
+                    ).as_workunit()
+                )
+            self.namespaces.append((namespace, namespace_urn))
+        except NoSuchNamespaceError as e:
+            self.report.report_warning(
+                title="Failed to retrieve namespace properties",
+                message="Couldn't find the namespace, was it deleted during the ingestion?",
+                context=namespace_repr,
+                exc=e,
+            )
+            return
+        except RESTError as e:
+            self.report.warning(
+                title="Iceberg REST Server Error",
+                message="Iceberg REST Server returned error status when trying to retrieve namespace properties, skipping it.",
+                context=str(namespace),
+                exc=e,
+            )
+        except Exception as e:
+            self.report.report_failure(
+                title="Failed to process namespace",
+                message="Unhandled exception happened during processing of the namespace",
+                context=namespace_repr,
+                exc=e,
+            )
+
+    def _process_namespaces(self) -> Iterable[MetadataWorkUnit]:
+        namespace_ids = self._get_namespaces(self.catalog)
+        for namespace in namespace_ids:
+            yield from self._try_processing_namespace(namespace)
+
+        LOGGER.debug("Namespaces ingestion completed")
+
+    def _create_iceberg_table_aspects(
+        self, dataset_name: str, table: Table, namespace_urn: str
+    ) -> Iterable[_Aspect]:
         with PerfTimer() as timer:
             self.report.report_table_scanned(dataset_name)
             LOGGER.debug(f"Processing table {dataset_name}")
-
-
-                dataset_name,
-                self.config.platform_instance,
-                self.config.env,
-            )
-            dataset_snapshot = DatasetSnapshot(
-                urn=dataset_urn,
-                aspects=[Status(removed=False)],
-            )
+            yield Status(removed=False)
+            yield SubTypes(typeNames=[DatasetSubTypes.TABLE])
 
-
-            additional_properties = {}
-            custom_properties = table.metadata.properties.copy()
-            custom_properties["location"] = table.metadata.location
-            custom_properties["format-version"] = str(table.metadata.format_version)
-            custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
-            if table.current_snapshot():
-                custom_properties["snapshot-id"] = str(
-                    table.current_snapshot().snapshot_id
-                )
-                custom_properties["manifest-list"] = (
-                    table.current_snapshot().manifest_list
-                )
-                additional_properties["lastModified"] = TimeStampClass(
-                    int(table.current_snapshot().timestamp_ms)
-                )
-            if "created-at" in custom_properties:
-                try:
-                    dt = dateutil_parser.isoparse(custom_properties["created-at"])
-                    additional_properties["created"] = TimeStampClass(
-                        int(dt.timestamp() * 1000)
-                    )
-                except Exception as ex:
-                    LOGGER.warning(
-                        f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
-                    )
+            yield self._get_dataset_properties_aspect(dataset_name, table)
 
-            dataset_properties = DatasetPropertiesClass(
-                name=table.name()[-1],
-                description=table.metadata.properties.get("comment", None),
-                customProperties=custom_properties,
-                lastModified=additional_properties.get("lastModified"),
-                created=additional_properties.get("created"),
-                qualifiedName=dataset_name,
-            )
-            dataset_snapshot.aspects.append(dataset_properties)
-            # Dataset ownership aspect.
             dataset_ownership = self._get_ownership_aspect(table)
             if dataset_ownership:
                 LOGGER.debug(
                     f"Adding ownership: {dataset_ownership} to the dataset {dataset_name}"
                 )
-
+                yield dataset_ownership
 
-
-
+            yield self._create_schema_metadata(dataset_name, table)
+            dpi = self._get_dataplatform_instance_aspect()
+            yield dpi
+            yield self._create_browse_paths_aspect(dpi.instance, str(namespace_urn))
+            yield ContainerClass(container=str(namespace_urn))
 
-            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
             self.report.report_table_processing_time(
                 timer.elapsed_seconds(), dataset_name, table.metadata_location
             )
-            yield MetadataWorkUnit(id=dataset_name, mce=mce)
-
-            dpi_aspect = self._get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
-            if dpi_aspect:
-                yield dpi_aspect
 
         if self.config.is_profiling_enabled():
             profiler = IcebergProfiler(self.report, self.config.profiling)
-            yield from profiler.profile_table(dataset_name,
+            yield from profiler.profile_table(dataset_name, table)
+
+    def _create_browse_paths_aspect(
+        self,
+        platform_instance_urn: Optional[str] = None,
+        container_urn: Optional[str] = None,
+    ) -> BrowsePathsV2Class:
+        path = []
+        if platform_instance_urn:
+            path.append(
+                BrowsePathEntryClass(
+                    id=platform_instance_urn, urn=platform_instance_urn
+                )
+            )
+        if container_urn:
+            path.append(BrowsePathEntryClass(id=container_urn, urn=container_urn))
+        return BrowsePathsV2Class(path=path)
 
     def _get_partition_aspect(self, table: Table) -> Optional[str]:
         """Extracts partition information from the provided table and returns a JSON array representing the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) of the table.
@@ -401,12 +549,48 @@
                 ]
             )
         except Exception as e:
-            self.report.
-                "extract
-
+            self.report.warning(
+                title="Failed to extract partition information",
+                message="Failed to extract partition information for a table. Table metadata will be ingested without it.",
+                context=str(table.name),
+                exc=e,
            )
        return None

+    def _get_dataset_properties_aspect(
+        self, dataset_name: str, table: Table
+    ) -> DatasetPropertiesClass:
+        created: Optional[TimeStampClass] = None
+        custom_properties = table.metadata.properties.copy()
+        custom_properties["location"] = table.metadata.location
+        custom_properties["format-version"] = str(table.metadata.format_version)
+        custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
+        last_modified: Optional[int] = table.metadata.last_updated_ms
+        if current_snapshot := table.current_snapshot():
+            custom_properties["snapshot-id"] = str(current_snapshot.snapshot_id)
+            custom_properties["manifest-list"] = current_snapshot.manifest_list
+            if not last_modified:
+                last_modified = int(current_snapshot.timestamp_ms)
+        if "created-at" in custom_properties:
+            try:
+                dt = dateutil_parser.isoparse(custom_properties["created-at"])
+                created = TimeStampClass(int(dt.timestamp() * 1000))
+            except Exception as ex:
+                LOGGER.warning(
+                    f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
+                )
+
+        return DatasetPropertiesClass(
+            name=table.name()[-1],
+            description=table.metadata.properties.get("comment", None),
+            customProperties=custom_properties,
+            lastModified=TimeStampClass(last_modified)
+            if last_modified is not None
+            else None,
+            created=created,
+            qualifiedName=dataset_name,
+        )
+
     def _get_ownership_aspect(self, table: Table) -> Optional[OwnershipClass]:
         owners = []
         if self.config.user_ownership_property:
@@ -435,22 +619,15 @@
             )
         return OwnershipClass(owners=owners) if owners else None
 
-    def _get_dataplatform_instance_aspect(
-
-
-
-
-
-
-
-
-                instance=make_dataplatform_instance_urn(
-                    self.platform, self.config.platform_instance
-                ),
-            ),
-        ).as_workunit()
-
-        return None
+    def _get_dataplatform_instance_aspect(self) -> DataPlatformInstanceClass:
+        return DataPlatformInstanceClass(
+            platform=make_data_platform_urn(self.platform),
+            instance=make_dataplatform_instance_urn(
+                self.platform, self.config.platform_instance
+            )
+            if self.config.platform_instance
+            else None,
+        )
 
     def _create_schema_metadata(
         self, dataset_name: str, table: Table
@@ -479,6 +656,30 @@
     def get_report(self) -> SourceReport:
         return self.report
 
+    def _create_iceberg_namespace_aspects(
+        self, namespace: Identifier, properties: Properties
+    ) -> Iterable[_Aspect]:
+        namespace_repr = ".".join(namespace)
+        custom_properties: Dict[str, str] = {}
+        for k, v in properties.items():
+            try:
+                custom_properties[str(k)] = str(v)
+            except Exception as e:
+                LOGGER.warning(
+                    f"Exception when trying to parse namespace properties for {namespace_repr}. Exception: {e}"
+                )
+        yield Status(removed=False)
+        yield ContainerProperties(
+            name=namespace_repr,
+            qualifiedName=namespace_repr,
+            env=self.config.env,
+            customProperties=custom_properties,
+        )
+        yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
+        dpi = self._get_dataplatform_instance_aspect()
+        yield dpi
+        yield self._create_browse_paths_aspect(dpi.instance)
+
 
 class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
     """Implementation of a visitor to build an Avro schema as a dictionary from an Iceberg schema."""
@@ -635,9 +836,6 @@
             "native_data_type": str(timestamp_type),
         }
 
-    # visit_timestamptz() is required when using pyiceberg >= 0.5.0, which is essentially a duplicate
-    # of visit_timestampz(). The function has been renamed from visit_timestampz().
-    # Once Datahub can upgrade its pyiceberg dependency to >=0.5.0, the visit_timestampz() function can be safely removed.
     def visit_timestamptz(self, timestamptz_type: TimestamptzType) -> Dict[str, Any]:
         # Avro supports 2 types of timestamp:
         # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
@@ -654,22 +852,6 @@
             "native_data_type": str(timestamptz_type),
         }
 
-    def visit_timestampz(self, timestamptz_type: TimestamptzType) -> Dict[str, Any]:
-        # Avro supports 2 types of timestamp:
-        # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
-        # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local
-        # utcAdjustment: bool = True
-        return {
-            "type": "long",
-            "logicalType": "timestamp-micros",
-            # Commented out since Avro's Python implementation (1.11.0) does not support local-timestamp-micros, even though it exists in the spec.
-            # See bug report: https://issues.apache.org/jira/browse/AVRO-3476 and PR https://github.com/apache/avro/pull/1634
-            # "logicalType": "timestamp-micros"
-            # if timestamp_type.adjust_to_utc
-            # else "local-timestamp-micros",
-            "native_data_type": str(timestamptz_type),
-        }
-
     def visit_string(self, string_type: StringType) -> Dict[str, Any]:
         return {
             "type": "string",
@@ -688,3 +870,42 @@
             "type": "bytes",
             "native_data_type": str(binary_type),
         }
+
+    def visit_timestamp_ns(self, timestamp_ns_type: Any) -> Dict[str, Any]:
+        # Handle nanosecond precision timestamps
+        # Avro supports 2 types of timestamp:
+        # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
+        # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local
+        return {
+            "type": "long",
+            "logicalType": "timestamp-micros",
+            # Commented out since Avro's Python implementation (1.11.0) does not support local-timestamp-micros, even though it exists in the spec.
+            # See bug report: https://issues.apache.org/jira/browse/AVRO-3476 and PR https://github.com/apache/avro/pull/1634
+            # "logicalType": "timestamp-micros"
+            # if timestamp_ns_type.adjust_to_utc
+            # else "local-timestamp-micros",
+            "native_data_type": str(timestamp_ns_type),
+        }
+
+    def visit_timestamptz_ns(self, timestamptz_ns_type: Any) -> Dict[str, Any]:
+        # Handle nanosecond precision timestamps with timezone
+        # Avro supports 2 types of timestamp:
+        # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
+        # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local
+        return {
+            "type": "long",
+            "logicalType": "timestamp-micros",
+            # Commented out since Avro's Python implementation (1.11.0) does not support local-timestamp-micros, even though it exists in the spec.
+            # See bug report: https://issues.apache.org/jira/browse/AVRO-3476 and PR https://github.com/apache/avro/pull/1634
+            # "logicalType": "timestamp-micros"
+            # if timestamptz_ns_type.adjust_to_utc
+            # else "local-timestamp-micros",
+            "native_data_type": str(timestamptz_ns_type),
+        }
+
+    def visit_unknown(self, unknown_type: Any) -> Dict[str, Any]:
+        # Handle unknown types
+        return {
+            "type": "string",
+            "native_data_type": str(unknown_type),
+        }