acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
import contextlib
|
|
2
1
|
import json
|
|
3
2
|
import logging
|
|
3
|
+
import time
|
|
4
4
|
from datetime import datetime
|
|
5
5
|
from typing import Any, Dict, Generic, Iterable, List, Optional, Tuple, TypeVar
|
|
6
6
|
|
|
7
|
-
from sqlalchemy import create_engine
|
|
7
|
+
from sqlalchemy import create_engine, text
|
|
8
8
|
|
|
9
9
|
from datahub.emitter.aspect import ASPECT_MAP
|
|
10
10
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
@@ -12,13 +12,14 @@ from datahub.emitter.serialization_helper import post_json_transform
|
|
|
12
12
|
from datahub.ingestion.source.datahub.config import DataHubSourceConfig
|
|
13
13
|
from datahub.ingestion.source.datahub.report import DataHubSourceReport
|
|
14
14
|
from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
|
|
15
|
-
from datahub.metadata.schema_classes import
|
|
15
|
+
from datahub.metadata.schema_classes import SystemMetadataClass
|
|
16
16
|
from datahub.utilities.lossy_collections import LossyDict, LossyList
|
|
17
17
|
|
|
18
18
|
logger = logging.getLogger(__name__)
|
|
19
19
|
|
|
20
20
|
# Should work for at least mysql, mariadb, postgres
|
|
21
21
|
DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
|
|
22
|
+
DATE_FORMAT = "%Y-%m-%d"
|
|
22
23
|
|
|
23
24
|
ROW = TypeVar("ROW", bound=Dict[str, Any])
|
|
24
25
|
|
|
@@ -85,6 +86,9 @@ class DataHubDatabaseReader:
|
|
|
85
86
|
**connection_config.options,
|
|
86
87
|
)
|
|
87
88
|
|
|
89
|
+
# Cache for available dates to avoid redundant queries
|
|
90
|
+
self.available_dates_cache: Optional[List[datetime]] = None
|
|
91
|
+
|
|
88
92
|
@property
|
|
89
93
|
def soft_deleted_urns_query(self) -> str:
|
|
90
94
|
return f"""
|
|
@@ -100,14 +104,28 @@ class DataHubDatabaseReader:
|
|
|
100
104
|
ORDER BY mav.urn
|
|
101
105
|
"""
|
|
102
106
|
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
+
def _get_json_extract_expression(self) -> str:
|
|
108
|
+
"""
|
|
109
|
+
Returns the appropriate JSON extraction expression based on the database dialect.
|
|
110
|
+
|
|
111
|
+
Returns:
|
|
112
|
+
Database-specific JSON extraction expression
|
|
113
|
+
"""
|
|
114
|
+
# Return the correct JSON extraction expression for the "removed" field,
|
|
115
|
+
# depending on the database dialect.
|
|
116
|
+
if self.engine.dialect.name == "postgresql":
|
|
117
|
+
# For PostgreSQL, cast the metadata column to JSON and extract the 'removed' key as boolean.
|
|
118
|
+
return "((metadata::json)->>'removed')::boolean"
|
|
119
|
+
else:
|
|
120
|
+
# For other databases (e.g., MySQL), use JSON_EXTRACT.
|
|
121
|
+
return "JSON_EXTRACT(metadata, '$.removed')"
|
|
122
|
+
|
|
123
|
+
def query(self, set_structured_properties_filter: bool) -> str:
|
|
124
|
+
"""
|
|
125
|
+
Main query that gets data for specified date range with appropriate filters.
|
|
126
|
+
"""
|
|
127
|
+
structured_prop_filter = f" AND urn {'' if set_structured_properties_filter else 'NOT'} like 'urn:li:structuredProperty:%%'"
|
|
107
128
|
|
|
108
|
-
# Ensures stable order, chronological per (urn, aspect)
|
|
109
|
-
# Relies on createdon order to reflect version order
|
|
110
|
-
# Ordering of entries with the same createdon is handled by VersionOrderer
|
|
111
129
|
return f"""
|
|
112
130
|
SELECT *
|
|
113
131
|
FROM (
|
|
@@ -123,7 +141,7 @@ class DataHubDatabaseReader:
|
|
|
123
141
|
LEFT JOIN (
|
|
124
142
|
SELECT
|
|
125
143
|
*,
|
|
126
|
-
|
|
144
|
+
{self._get_json_extract_expression()} as removed
|
|
127
145
|
FROM {self.engine.dialect.identifier_preparer.quote(self.config.database_table_name)}
|
|
128
146
|
WHERE aspect = 'status'
|
|
129
147
|
AND version = 0
|
|
@@ -132,6 +150,7 @@ class DataHubDatabaseReader:
|
|
|
132
150
|
{"" if self.config.include_all_versions else "AND mav.version = 0"}
|
|
133
151
|
{"" if not self.config.exclude_aspects else "AND mav.aspect NOT IN %(exclude_aspects)s"}
|
|
134
152
|
AND mav.createdon >= %(since_createdon)s
|
|
153
|
+
AND mav.createdon < %(end_createdon)s
|
|
135
154
|
ORDER BY
|
|
136
155
|
createdon,
|
|
137
156
|
urn,
|
|
@@ -139,50 +158,189 @@ class DataHubDatabaseReader:
|
|
|
139
158
|
version
|
|
140
159
|
) as t
|
|
141
160
|
WHERE 1=1
|
|
142
|
-
{"" if self.config.include_soft_deleted_entities else "AND (removed = false or removed is NULL)"}
|
|
161
|
+
{"" if self.config.include_soft_deleted_entities else " AND (removed = false or removed is NULL)"}
|
|
162
|
+
{structured_prop_filter}
|
|
143
163
|
ORDER BY
|
|
144
164
|
createdon,
|
|
145
165
|
urn,
|
|
146
166
|
aspect,
|
|
147
167
|
version
|
|
168
|
+
LIMIT %(limit)s
|
|
169
|
+
OFFSET %(offset)s
|
|
148
170
|
"""
|
|
149
171
|
|
|
172
|
+
def execute_with_params(
|
|
173
|
+
self, query: str, params: Dict[str, Any]
|
|
174
|
+
) -> List[Dict[str, Any]]:
|
|
175
|
+
"""Execute query with proper parameter binding that works with your database"""
|
|
176
|
+
with self.engine.connect() as conn:
|
|
177
|
+
result = conn.execute(query, params or {})
|
|
178
|
+
return [dict(row) for row in result.fetchall()]
|
|
179
|
+
|
|
150
180
|
def execute_server_cursor(
|
|
151
181
|
self, query: str, params: Dict[str, Any]
|
|
152
182
|
) -> Iterable[Dict[str, Any]]:
|
|
183
|
+
"""Execute a query with server-side cursor"""
|
|
153
184
|
with self.engine.connect() as conn:
|
|
154
185
|
if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
|
|
155
186
|
with (
|
|
156
187
|
conn.begin()
|
|
157
188
|
): # Transaction required for PostgreSQL server-side cursor
|
|
158
|
-
#
|
|
159
|
-
|
|
189
|
+
# Set query timeout at the connection level
|
|
190
|
+
if self.config.query_timeout:
|
|
191
|
+
if self.engine.dialect.name == "postgresql":
|
|
192
|
+
conn.execute(
|
|
193
|
+
text(
|
|
194
|
+
f"SET statement_timeout = {self.config.query_timeout * 1000}"
|
|
195
|
+
)
|
|
196
|
+
) # milliseconds
|
|
197
|
+
elif self.engine.dialect.name in ["mysql", "mariadb"]:
|
|
198
|
+
conn.execute(
|
|
199
|
+
text(
|
|
200
|
+
f"SET max_execution_time = {self.config.query_timeout * 1000}"
|
|
201
|
+
)
|
|
202
|
+
) # milliseconds
|
|
203
|
+
|
|
204
|
+
# Stream results with batch size
|
|
160
205
|
conn = conn.execution_options(
|
|
161
206
|
stream_results=True,
|
|
162
207
|
yield_per=self.config.database_query_batch_size,
|
|
163
208
|
)
|
|
209
|
+
|
|
210
|
+
# Execute query - using native parameterization without text()
|
|
211
|
+
# to maintain compatibility with your original code
|
|
164
212
|
result = conn.execute(query, params)
|
|
165
213
|
for row in result:
|
|
166
214
|
yield dict(row)
|
|
215
|
+
|
|
216
|
+
return # Success, exit the retry loop
|
|
167
217
|
else:
|
|
168
218
|
raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
|
|
169
219
|
|
|
170
220
|
def _get_rows(
|
|
171
|
-
self,
|
|
221
|
+
self,
|
|
222
|
+
start_date: datetime,
|
|
223
|
+
end_date: datetime,
|
|
224
|
+
set_structured_properties_filter: bool,
|
|
225
|
+
limit: int,
|
|
172
226
|
) -> Iterable[Dict[str, Any]]:
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
"since_createdon": from_createdon.strftime(DATETIME_FORMAT),
|
|
176
|
-
}
|
|
177
|
-
yield from self.execute_server_cursor(self.query, params)
|
|
227
|
+
"""
|
|
228
|
+
Retrieves data rows within a specified date range using pagination.
|
|
178
229
|
|
|
179
|
-
|
|
230
|
+
Implements a hybrid pagination strategy that switches between time-based and
|
|
231
|
+
offset-based approaches depending on the returned data. Uses server-side
|
|
232
|
+
cursors for efficient memory usage.
|
|
233
|
+
|
|
234
|
+
Note: May return duplicate rows across batch boundaries when multiple rows
|
|
235
|
+
share the same 'createdon' timestamp. This is expected behavior when
|
|
236
|
+
transitioning between pagination methods.
|
|
237
|
+
|
|
238
|
+
Args:
|
|
239
|
+
start_date: Beginning of date range (inclusive)
|
|
240
|
+
end_date: End of date range (exclusive)
|
|
241
|
+
set_structured_properties_filter: Whether to apply structured filtering
|
|
242
|
+
limit: Maximum rows to fetch per query
|
|
243
|
+
|
|
244
|
+
Returns:
|
|
245
|
+
An iterable of database rows as dictionaries
|
|
246
|
+
"""
|
|
247
|
+
offset = 0
|
|
248
|
+
last_createdon = None
|
|
249
|
+
first_iteration = True
|
|
250
|
+
|
|
251
|
+
while True:
|
|
252
|
+
try:
|
|
253
|
+
# Set up query and parameters - using named parameters
|
|
254
|
+
query = self.query(set_structured_properties_filter)
|
|
255
|
+
params: Dict[str, Any] = {
|
|
256
|
+
"since_createdon": start_date.strftime(DATETIME_FORMAT),
|
|
257
|
+
"end_createdon": end_date.strftime(DATETIME_FORMAT),
|
|
258
|
+
"limit": limit,
|
|
259
|
+
"offset": offset,
|
|
260
|
+
# Always pass exclude_aspects as a tuple, postgres doesn't support lists
|
|
261
|
+
"exclude_aspects": tuple(self.config.exclude_aspects),
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
logger.info(
|
|
265
|
+
f"Querying data from {start_date.strftime(DATETIME_FORMAT)} to {end_date.strftime(DATETIME_FORMAT)} "
|
|
266
|
+
f"with limit {limit} and offset {offset} (inclusive range)"
|
|
267
|
+
)
|
|
268
|
+
|
|
269
|
+
# Execute query with server-side cursor
|
|
270
|
+
rows = self.execute_server_cursor(query, params)
|
|
271
|
+
# Process and yield rows
|
|
272
|
+
rows_processed = 0
|
|
273
|
+
for row in rows:
|
|
274
|
+
if first_iteration:
|
|
275
|
+
start_date = row.get("createdon", start_date)
|
|
276
|
+
first_iteration = False
|
|
277
|
+
|
|
278
|
+
last_createdon = row.get("createdon")
|
|
279
|
+
rows_processed += 1
|
|
280
|
+
yield row
|
|
281
|
+
|
|
282
|
+
# If we processed fewer than the limit or no last_createdon, we're done
|
|
283
|
+
if rows_processed < limit or not last_createdon:
|
|
284
|
+
break
|
|
285
|
+
|
|
286
|
+
# Update parameters for next iteration
|
|
287
|
+
if start_date != last_createdon:
|
|
288
|
+
start_date = last_createdon
|
|
289
|
+
offset = 0
|
|
290
|
+
else:
|
|
291
|
+
offset += limit
|
|
292
|
+
|
|
293
|
+
logger.info(
|
|
294
|
+
f"Processed {rows_processed} rows for date range {start_date} to {end_date}. Continuing to next batch."
|
|
295
|
+
)
|
|
296
|
+
|
|
297
|
+
except Exception as e:
|
|
298
|
+
logger.error(
|
|
299
|
+
f"Error processing date range {start_date} to {end_date}: {str(e)}"
|
|
300
|
+
)
|
|
301
|
+
# Re-raise the exception after logging
|
|
302
|
+
raise
|
|
303
|
+
|
|
304
|
+
def get_all_aspects(
|
|
180
305
|
self, from_createdon: datetime, stop_time: datetime
|
|
306
|
+
) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
|
|
307
|
+
logger.info("Fetching Structured properties aspects")
|
|
308
|
+
yield from self.get_aspects(
|
|
309
|
+
from_createdon=from_createdon,
|
|
310
|
+
stop_time=stop_time,
|
|
311
|
+
set_structured_properties_filter=True,
|
|
312
|
+
)
|
|
313
|
+
|
|
314
|
+
logger.info(
|
|
315
|
+
f"Waiting for {self.config.structured_properties_template_cache_invalidation_interval} seconds for structured properties cache to invalidate"
|
|
316
|
+
)
|
|
317
|
+
|
|
318
|
+
time.sleep(
|
|
319
|
+
self.config.structured_properties_template_cache_invalidation_interval
|
|
320
|
+
)
|
|
321
|
+
|
|
322
|
+
logger.info("Fetching aspects")
|
|
323
|
+
yield from self.get_aspects(
|
|
324
|
+
from_createdon=from_createdon,
|
|
325
|
+
stop_time=stop_time,
|
|
326
|
+
set_structured_properties_filter=False,
|
|
327
|
+
)
|
|
328
|
+
|
|
329
|
+
def get_aspects(
|
|
330
|
+
self,
|
|
331
|
+
from_createdon: datetime,
|
|
332
|
+
stop_time: datetime,
|
|
333
|
+
set_structured_properties_filter: bool = False,
|
|
181
334
|
) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
|
|
182
335
|
orderer = VersionOrderer[Dict[str, Any]](
|
|
183
336
|
enabled=self.config.include_all_versions
|
|
184
337
|
)
|
|
185
|
-
rows = self._get_rows(
|
|
338
|
+
rows = self._get_rows(
|
|
339
|
+
start_date=from_createdon,
|
|
340
|
+
end_date=stop_time,
|
|
341
|
+
set_structured_properties_filter=set_structured_properties_filter,
|
|
342
|
+
limit=self.config.database_query_batch_size,
|
|
343
|
+
)
|
|
186
344
|
for row in orderer(rows):
|
|
187
345
|
mcp = self._parse_row(row)
|
|
188
346
|
if mcp:
|
|
@@ -190,22 +348,29 @@ class DataHubDatabaseReader:
|
|
|
190
348
|
|
|
191
349
|
def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]:
|
|
192
350
|
"""
|
|
193
|
-
Fetches all soft-deleted entities from the database.
|
|
351
|
+
Fetches all soft-deleted entities from the database using pagination.
|
|
194
352
|
|
|
195
353
|
Yields:
|
|
196
354
|
Row objects containing URNs of soft-deleted entities
|
|
197
355
|
"""
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
356
|
+
try:
|
|
357
|
+
params: Dict = {}
|
|
358
|
+
|
|
359
|
+
logger.debug("Fetching soft-deleted URNs")
|
|
360
|
+
|
|
361
|
+
# Use server-side cursor implementation
|
|
362
|
+
rows = self.execute_server_cursor(self.soft_deleted_urns_query, params)
|
|
363
|
+
processed_rows = 0
|
|
364
|
+
# Process and yield rows
|
|
365
|
+
for row in rows:
|
|
366
|
+
processed_rows += 1
|
|
367
|
+
yield row
|
|
368
|
+
|
|
369
|
+
logger.debug(f"Fetched batch of {processed_rows} soft-deleted URNs")
|
|
370
|
+
|
|
371
|
+
except Exception:
|
|
372
|
+
logger.exception("Error fetching soft-deleted row", exc_info=True)
|
|
373
|
+
raise
|
|
209
374
|
|
|
210
375
|
def _parse_row(
|
|
211
376
|
self, row: Dict[str, Any]
|
|
@@ -215,12 +380,16 @@ class DataHubDatabaseReader:
|
|
|
215
380
|
json_metadata = post_json_transform(
|
|
216
381
|
json.loads(row["systemmetadata"] or "{}")
|
|
217
382
|
)
|
|
218
|
-
system_metadata =
|
|
383
|
+
system_metadata = None
|
|
384
|
+
if self.config.preserve_system_metadata:
|
|
385
|
+
system_metadata = SystemMetadataClass.from_obj(json_metadata)
|
|
386
|
+
if system_metadata.properties:
|
|
387
|
+
is_no_op = system_metadata.properties.pop("isNoOp", None)
|
|
388
|
+
logger.debug(f"Removed potential value for is_no_op={is_no_op}")
|
|
219
389
|
return MetadataChangeProposalWrapper(
|
|
220
390
|
entityUrn=row["urn"],
|
|
221
391
|
aspect=ASPECT_MAP[row["aspect"]].from_obj(json_aspect),
|
|
222
392
|
systemMetadata=system_metadata,
|
|
223
|
-
changeType=ChangeTypeClass.UPSERT,
|
|
224
393
|
)
|
|
225
394
|
except Exception as e:
|
|
226
395
|
logger.warning(
|
|
@@ -6,7 +6,9 @@ from typing import Dict, Iterable, List, Optional
|
|
|
6
6
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
7
7
|
from datahub.ingestion.api.common import PipelineContext
|
|
8
8
|
from datahub.ingestion.api.decorators import (
|
|
9
|
+
SourceCapability,
|
|
9
10
|
SupportStatus,
|
|
11
|
+
capability,
|
|
10
12
|
config_class,
|
|
11
13
|
platform_name,
|
|
12
14
|
support_status,
|
|
@@ -17,6 +19,7 @@ from datahub.ingestion.api.source_helpers import (
|
|
|
17
19
|
auto_workunit_reporter,
|
|
18
20
|
)
|
|
19
21
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
22
|
+
from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
|
|
20
23
|
from datahub.ingestion.source.datahub.config import DataHubSourceConfig
|
|
21
24
|
from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
|
|
22
25
|
from datahub.ingestion.source.datahub.datahub_database_reader import (
|
|
@@ -37,6 +40,13 @@ logger = logging.getLogger(__name__)
|
|
|
37
40
|
@platform_name("DataHub")
|
|
38
41
|
@config_class(DataHubSourceConfig)
|
|
39
42
|
@support_status(SupportStatus.TESTING)
|
|
43
|
+
@capability(
|
|
44
|
+
SourceCapability.CONTAINERS,
|
|
45
|
+
"Enabled by default",
|
|
46
|
+
subtype_modifier=[
|
|
47
|
+
SourceCapabilityModifier.DATABASE,
|
|
48
|
+
],
|
|
49
|
+
)
|
|
40
50
|
class DataHubSource(StatefulIngestionSourceBase):
|
|
41
51
|
platform: str = "datahub"
|
|
42
52
|
|
|
@@ -117,7 +127,7 @@ class DataHubSource(StatefulIngestionSourceBase):
|
|
|
117
127
|
) -> Iterable[MetadataWorkUnit]:
|
|
118
128
|
logger.info(f"Fetching database aspects starting from {from_createdon}")
|
|
119
129
|
progress = ProgressTimer(report_every=timedelta(seconds=60))
|
|
120
|
-
mcps = reader.
|
|
130
|
+
mcps = reader.get_all_aspects(from_createdon, self.report.stop_time)
|
|
121
131
|
for i, (mcp, createdon) in enumerate(mcps):
|
|
122
132
|
if not self.urn_pattern.allowed(str(mcp.entityUrn)):
|
|
123
133
|
continue
|
|
@@ -9,6 +9,7 @@ import requests
|
|
|
9
9
|
from pydantic import Field, root_validator
|
|
10
10
|
|
|
11
11
|
from datahub.ingestion.api.decorators import (
|
|
12
|
+
SourceCapability,
|
|
12
13
|
SupportStatus,
|
|
13
14
|
capability,
|
|
14
15
|
config_class,
|
|
@@ -17,7 +18,6 @@ from datahub.ingestion.api.decorators import (
|
|
|
17
18
|
)
|
|
18
19
|
from datahub.ingestion.api.source import (
|
|
19
20
|
CapabilityReport,
|
|
20
|
-
SourceCapability,
|
|
21
21
|
TestableSource,
|
|
22
22
|
TestConnectionReport,
|
|
23
23
|
)
|
|
@@ -26,6 +26,7 @@ from datahub.ingestion.source.dbt.dbt_common import (
|
|
|
26
26
|
DBTCommonConfig,
|
|
27
27
|
DBTNode,
|
|
28
28
|
DBTSourceBase,
|
|
29
|
+
DBTSourceReport,
|
|
29
30
|
)
|
|
30
31
|
from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult
|
|
31
32
|
|
|
@@ -262,16 +263,16 @@ query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{
|
|
|
262
263
|
|
|
263
264
|
@platform_name("dbt")
|
|
264
265
|
@config_class(DBTCloudConfig)
|
|
265
|
-
@support_status(SupportStatus.
|
|
266
|
-
@capability(SourceCapability.
|
|
267
|
-
@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
|
|
266
|
+
@support_status(SupportStatus.CERTIFIED)
|
|
267
|
+
@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
|
|
268
268
|
class DBTCloudSource(DBTSourceBase, TestableSource):
|
|
269
269
|
config: DBTCloudConfig
|
|
270
|
+
report: DBTSourceReport # nothing cloud-specific in the report
|
|
270
271
|
|
|
271
272
|
@classmethod
|
|
272
273
|
def create(cls, config_dict, ctx):
|
|
273
274
|
config = DBTCloudConfig.parse_obj(config_dict)
|
|
274
|
-
return cls(config, ctx
|
|
275
|
+
return cls(config, ctx)
|
|
275
276
|
|
|
276
277
|
@staticmethod
|
|
277
278
|
def test_connection(config_dict: dict) -> TestConnectionReport:
|
|
@@ -369,9 +370,12 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
|
|
|
369
370
|
name = node["alias"]
|
|
370
371
|
|
|
371
372
|
comment = node.get("comment", "")
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
373
|
+
|
|
374
|
+
# In dbt sources, there are two types of descriptions:
|
|
375
|
+
# - description: table-level description (specific to the source table)
|
|
376
|
+
# - sourceDescription: schema-level description (describes the overall source schema)
|
|
377
|
+
# The table-level description should take precedence since it's more specific.
|
|
378
|
+
description = node["description"] or node.get("sourceDescription", "")
|
|
375
379
|
|
|
376
380
|
if node["resourceType"] == "model":
|
|
377
381
|
materialization = node["materializedType"]
|
|
@@ -405,8 +409,11 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
|
|
|
405
409
|
if node["resourceType"] in {"model", "seed", "snapshot"}:
|
|
406
410
|
status = node["status"]
|
|
407
411
|
if status is None and materialization != "ephemeral":
|
|
408
|
-
self.report.
|
|
409
|
-
|
|
412
|
+
self.report.warning(
|
|
413
|
+
title="Schema information may be incomplete",
|
|
414
|
+
message="Some nodes are missing the `status` field, which dbt uses to track the status of the node in the target database.",
|
|
415
|
+
context=key,
|
|
416
|
+
log=False,
|
|
410
417
|
)
|
|
411
418
|
|
|
412
419
|
# The code fields are new in dbt 1.3, and replace the sql ones.
|