acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -8,7 +8,7 @@ from typing import Collection, Dict, Iterable, List, Optional, TypedDict
 from google.cloud.bigquery import Client
 from pydantic import Field, PositiveInt
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.time_window_config import (
     BaseTimeWindowConfig,
     get_time_bucket,
@@ -36,6 +36,9 @@ from datahub.ingestion.source.bigquery_v2.common import (
     BigQueryFilter,
     BigQueryIdentifierBuilder,
 )
+from datahub.ingestion.source.state.redundant_run_skip_handler import (
+    RedundantQueriesRunSkipHandler,
+)
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
 from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
@@ -86,12 +89,11 @@ class BigQueryQueriesExtractorConfig(BigQueryBaseConfig):
     # TODO: Support stateful ingestion for the time windows.
     window: BaseTimeWindowConfig = BaseTimeWindowConfig()
 
-    local_temp_path: Optional[pathlib.Path] = Field(
-        default=None,
-        description="Local path to store the audit log.",
+    local_temp_path: HiddenFromDocs[Optional[pathlib.Path]] = Field(
         # TODO: For now, this is simply an advanced config to make local testing easier.
         # Eventually, we will want to store date-specific files in the directory and use it as a cache.
-
+        default=None,
+        description="Local path to store the audit log.",
     )
 
     user_email_pattern: AllowDenyPattern = Field(
@@ -136,6 +138,7 @@ class BigQueryQueriesExtractor(Closeable):
         structured_report: SourceReport,
         filters: BigQueryFilter,
         identifiers: BigQueryIdentifierBuilder,
+        redundant_run_skip_handler: Optional[RedundantQueriesRunSkipHandler] = None,
         graph: Optional[DataHubGraph] = None,
         schema_resolver: Optional[SchemaResolver] = None,
         discovered_tables: Optional[Collection[str]] = None,
@@ -159,6 +162,9 @@ class BigQueryQueriesExtractor(Closeable):
         )
 
         self.structured_report = structured_report
+        self.redundant_run_skip_handler = redundant_run_skip_handler
+
+        self.start_time, self.end_time = self._get_time_window()
 
         self.aggregator = SqlParsingAggregator(
             platform=self.identifiers.platform,
@@ -173,8 +179,8 @@ class BigQueryQueriesExtractor(Closeable):
             generate_query_usage_statistics=self.config.include_query_usage_statistics,
             usage_config=BaseUsageConfig(
                 bucket_duration=self.config.window.bucket_duration,
-                start_time=self.
-                end_time=self.
+                start_time=self.start_time,
+                end_time=self.end_time,
                 user_email_pattern=self.config.user_email_pattern,
                 top_n_queries=self.config.top_n_queries,
             ),
@@ -200,6 +206,34 @@ class BigQueryQueriesExtractor(Closeable):
         logger.info(f"Using local temp path: {path}")
         return path
 
+    def _get_time_window(self) -> tuple[datetime, datetime]:
+        if self.redundant_run_skip_handler:
+            start_time, end_time = (
+                self.redundant_run_skip_handler.suggest_run_time_window(
+                    self.config.window.start_time,
+                    self.config.window.end_time,
+                )
+            )
+        else:
+            start_time = self.config.window.start_time
+            end_time = self.config.window.end_time
+
+        # Usage statistics are aggregated per bucket (typically per day).
+        # To ensure accurate aggregated metrics, we need to align the start_time
+        # to the beginning of a bucket so that we include complete bucket periods.
+        if self.config.include_usage_statistics:
+            start_time = get_time_bucket(start_time, self.config.window.bucket_duration)
+
+        return start_time, end_time
+
+    def _update_state(self) -> None:
+        if self.redundant_run_skip_handler:
+            self.redundant_run_skip_handler.update_state(
+                self.config.window.start_time,
+                self.config.window.end_time,
+                self.config.window.bucket_duration,
+            )
+
     def is_temp_table(self, name: str) -> bool:
         try:
             table = BigqueryTableIdentifier.from_string_name(name)
@@ -300,6 +334,8 @@ class BigQueryQueriesExtractor(Closeable):
             shared_connection.close()
             audit_log_file.unlink(missing_ok=True)
 
+        self._update_state()
+
     def deduplicate_queries(
         self, queries: FileBackedList[ObservedQuery]
     ) -> FileBackedDict[Dict[int, ObservedQuery]]:
@@ -356,8 +392,8 @@ class BigQueryQueriesExtractor(Closeable):
             query_log_query = _build_enriched_query_log_query(
                 project_id=project.id,
                 region=region,
-                start_time=self.
-                end_time=self.
+                start_time=self.start_time,
+                end_time=self.end_time,
             )
 
             logger.info(f"Fetching query log from BQ Project {project.id} for {region}")
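The hunks above appear to come from datahub/ingestion/source/bigquery_v2/queries_extractor.py (listed in the file summary): the new _get_time_window helper aligns the usage window to a bucket boundary before querying. Below is a minimal, self-contained sketch of that alignment idea only; it is an illustration, not DataHub's get_time_bucket, and the "HOUR"/"DAY" bucket names are assumptions.

from datetime import datetime, timezone

# Illustrative only: truncate a timestamp to the start of its usage bucket,
# so aggregated usage statistics always cover complete buckets.
def truncate_to_bucket(ts: datetime, bucket: str) -> datetime:
    if bucket == "HOUR":
        return ts.replace(minute=0, second=0, microsecond=0)
    if bucket == "DAY":
        return ts.replace(hour=0, minute=0, second=0, microsecond=0)
    raise ValueError(f"Unsupported bucket duration: {bucket}")

start = datetime(2024, 5, 3, 14, 27, tzinfo=timezone.utc)
print(truncate_to_bucket(start, "DAY"))   # 2024-05-03 00:00:00+00:00
print(truncate_to_bucket(start, "HOUR"))  # 2024-05-03 14:00:00+00:00

Aligning start_time this way avoids a partially counted first bucket when include_usage_statistics is enabled, which is the rationale given in the hunk's comments.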
@@ -80,7 +80,7 @@ class KeyspaceKey(ContainerKey):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class CassandraSource(StatefulIngestionSourceBase):
@@ -123,16 +123,7 @@ class CassandraSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def get_workunits_internal(
-        self,
-    ) -> Iterable[MetadataWorkUnit]:
-        for metadata in self._get_metadata():
-            if isinstance(metadata, MetadataWorkUnit):
-                yield metadata
-            else:
-                yield from metadata.as_workunits()
-
-    def _get_metadata(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
+    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         if not self.cassandra_api.authenticate():
             return
         keyspaces: List[CassandraKeyspace] = self.cassandra_api.get_keyspaces()
@@ -305,13 +296,11 @@ class CassandraSource(StatefulIngestionSourceBase):
                 qualified_name=dataset_name,
                 description=view.comment,
                 custom_properties=self._get_dataset_custom_props(view),
-
-
-
-
-
-                ),
-            ],
+                view_definition=ViewPropertiesClass(
+                    materialized=True,
+                    viewLogic=view.where_clause,  # Use the WHERE clause as view logic
+                    viewLanguage="CQL",  # Use "CQL" as the language
+                ),
             )
 
             # Construct and emit lineage off of 'base_table_name'
@@ -1,3 +1,4 @@
+import ssl
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional
 
@@ -128,6 +129,39 @@ class CassandraAPI:
 
             self._cassandra_session = cluster.connect()
             return True
+
+        ssl_context = None
+        if self.config.ssl_ca_certs:
+            # Map SSL version string to ssl module constant
+            ssl_version_map = {
+                "TLS_CLIENT": ssl.PROTOCOL_TLS_CLIENT,
+                "TLSv1": ssl.PROTOCOL_TLSv1,
+                "TLSv1_1": ssl.PROTOCOL_TLSv1_1,
+                "TLSv1_2": ssl.PROTOCOL_TLSv1_2,
+                "TLSv1_3": ssl.PROTOCOL_TLSv1_2,  # Python's ssl module uses TLSv1_2 for TLS 1.3
+            }
+
+            ssl_protocol = (
+                ssl_version_map.get(
+                    self.config.ssl_version, ssl.PROTOCOL_TLS_CLIENT
+                )
+                if self.config.ssl_version
+                else ssl.PROTOCOL_TLS_CLIENT
+            )
+            ssl_context = ssl.SSLContext(ssl_protocol)
+            ssl_context.load_verify_locations(self.config.ssl_ca_certs)
+            if self.config.ssl_certfile and self.config.ssl_keyfile:
+                ssl_context.load_cert_chain(
+                    certfile=self.config.ssl_certfile,
+                    keyfile=self.config.ssl_keyfile,
+                )
+            elif self.config.ssl_certfile or self.config.ssl_keyfile:
+                # If one is provided, the other must be too.
+                # This is a simplification; real-world scenarios might allow one without the other depending on setup.
+                raise ValueError(
+                    "Both ssl_certfile and ssl_keyfile must be provided if one is specified."
+                )
+
         if self.config.username and self.config.password:
             auth_provider = PlainTextAuthProvider(
                 username=self.config.username, password=self.config.password
@@ -136,12 +170,14 @@ class CassandraAPI:
                 [self.config.contact_point],
                 port=self.config.port,
                 auth_provider=auth_provider,
+                ssl_context=ssl_context,
                 load_balancing_policy=None,
            )
        else:
            cluster = Cluster(
                [self.config.contact_point],
                port=self.config.port,
+                ssl_context=ssl_context,
                load_balancing_policy=None,
            )
 
@@ -79,6 +79,26 @@ class CassandraSourceConfig(
         description="Configuration for cloud-based Cassandra, such as DataStax Astra DB.",
     )
 
+    ssl_ca_certs: Optional[str] = Field(
+        default=None,
+        description="Path to the CA certificate file for SSL connections.",
+    )
+
+    ssl_certfile: Optional[str] = Field(
+        default=None,
+        description="Path to the SSL certificate file for SSL connections.",
+    )
+
+    ssl_keyfile: Optional[str] = Field(
+        default=None,
+        description="Path to the SSL key file for SSL connections.",
+    )
+
+    ssl_version: Optional[str] = Field(
+        default="TLS_CLIENT",
+        description="SSL protocol version to use for connections. Options: TLS_CLIENT, TLSv1, TLSv1_1, TLSv1_2, TLSv1_3. Defaults to TLS_CLIENT.",
+    )
+
     keyspace_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns to filter keyspaces for ingestion.",
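The Cassandra hunks above add client-side TLS options to the source config. A minimal sketch of how they might be used in a programmatic recipe follows; the ssl_* fields and contact_point/port/username/password come from the config shown in this diff, while the file paths, hostname, environment variable, and sink server URL are placeholder assumptions, not values from the release.

from datahub.ingestion.run.pipeline import Pipeline

# Sketch only: a Cassandra recipe exercising the new SSL options.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "cassandra",
            "config": {
                "contact_point": "cassandra.internal.example",  # placeholder host
                "port": 9042,
                "username": "datahub",
                "password": "${CASSANDRA_PASSWORD}",  # placeholder secret reference
                # New in this release: client-side TLS configuration.
                "ssl_ca_certs": "/etc/ssl/certs/cassandra-ca.pem",
                "ssl_certfile": "/etc/ssl/certs/client-cert.pem",
                "ssl_keyfile": "/etc/ssl/private/client-key.pem",
                "ssl_version": "TLSv1_2",
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},  # placeholder GMS endpoint
        },
    }
)
pipeline.run()
pipeline.raise_from_status()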
@@ -18,7 +18,7 @@ from datahub.ingestion.source.cassandra.cassandra_api import (
 )
 from datahub.ingestion.source.cassandra.cassandra_config import CassandraSourceConfig
 from datahub.ingestion.source.cassandra.cassandra_utils import CassandraSourceReport
-from datahub.ingestion.source_report.ingestion_stage import
+from datahub.ingestion.source_report.ingestion_stage import IngestionHighStage
 from datahub.metadata.schema_classes import (
     DatasetFieldProfileClass,
     DatasetProfileClass,
@@ -70,30 +70,32 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            with
-
+            with (
+                self.report.new_high_stage(IngestionHighStage.PROFILING),
+                ThreadPoolExecutor(
                    max_workers=self.config.profiling.max_workers
-            ) as executor
-
-
-
-
-
-
-            )
-
-
-
-
-
-
-
-
-
-
-
-
-
+                ) as executor,
+            ):
+                future_to_dataset = {
+                    executor.submit(
+                        self.generate_profile,
+                        keyspace_name,
+                        table_name,
+                        cassandra_data.columns.get(table_name, []),
+                    ): table_name
+                    for table_name in tables
+                }
+                for future in as_completed(future_to_dataset):
+                    table_name = future_to_dataset[future]
+                    try:
+                        yield from future.result()
+                    except Exception as exc:
+                        self.report.profiling_skipped_other[table_name] += 1
+                        self.report.failure(
+                            message="Failed to profile for table",
+                            context=f"{keyspace_name}.{table_name}",
+                            exc=exc,
+                        )
 
     def generate_profile(
         self,
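The profiling hunk above fans per-table work out over a thread pool and records failures per table instead of aborting the keyspace loop. A generic, self-contained sketch of the same submit/as_completed pattern (illustration only, not the DataHub profiler; the parenthesized multi-item with statement used in the hunk requires Python 3.10+):

from concurrent.futures import ThreadPoolExecutor, as_completed

def profile_table(table: str) -> str:
    # Placeholder for per-table profiling work.
    if table == "bad_table":
        raise RuntimeError("simulated profiling failure")
    return f"profile({table})"

tables = ["users", "bad_table", "orders"]
with ThreadPoolExecutor(max_workers=4) as executor:
    future_to_table = {executor.submit(profile_table, t): t for t in tables}
    for future in as_completed(future_to_table):
        table = future_to_table[future]
        try:
            print(future.result())
        except Exception as exc:
            # Record the failure and keep going, mirroring report.failure() in the hunk.
            print(f"failed to profile {table}: {exc}")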
@@ -6,7 +6,6 @@ from datahub.ingestion.source.cassandra.cassandra_api import CassandraColumn
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaField,
     SchemaFieldDataType,
@@ -35,7 +34,7 @@ SYSTEM_KEYSPACE_LIST = set(
 
 
 @dataclass
-class CassandraSourceReport(StaleEntityRemovalSourceReport
+class CassandraSourceReport(StaleEntityRemovalSourceReport):
     num_tables_failed: int = 0
     num_views_failed: int = 0
     tables_scanned: int = 0
@@ -0,0 +1,23 @@
+# This is a pretty limited list, and is not really complete yet. Right now it's only used to allow
+# automatic platform mapping when generating lineage and we have a manual override, so
+# it being incomplete is ok. This should not be used for urn validation.
+KNOWN_VALID_PLATFORM_NAMES = [
+    "bigquery",
+    "cassandra",
+    "databricks",
+    "delta-lake",
+    "dbt",
+    "feast",
+    "file",
+    "gcs",
+    "hdfs",
+    "hive",
+    "mssql",
+    "mysql",
+    "oracle",
+    "postgres",
+    "redshift",
+    "s3",
+    "sagemaker",
+    "snowflake",
+]
@@ -9,7 +9,9 @@ from datahub.configuration.validate_multiline_string import pydantic_multiline_s
 
 
 class GCPCredential(ConfigModel):
-    project_id: Optional[str] = Field(
+    project_id: Optional[str] = Field(
+        None, description="Project id to set the credentials"
+    )
     private_key_id: str = Field(description="Private key id")
     private_key: str = Field(
         description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'"
@@ -51,3 +53,9 @@ class GCPCredential(ConfigModel):
         cred_json = json.dumps(configs, indent=4, separators=(",", ": "))
         fp.write(cred_json.encode())
         return fp.name
+
+    def to_dict(self, project_id: Optional[str] = None) -> Dict[str, str]:
+        configs = self.dict()
+        if project_id:
+            configs["project_id"] = project_id
+        return configs
@@ -1,5 +1,10 @@
+import logging
+from typing import Any, Dict
+
 from datahub.utilities.str_enum import StrEnum
 
+logger = logging.getLogger(__name__)
+
 
 class DatasetSubTypes(StrEnum):
     # Generic SubTypes
@@ -25,6 +30,12 @@ class DatasetSubTypes(StrEnum):
     NEO4J_NODE = "Neo4j Node"
     NEO4J_RELATIONSHIP = "Neo4j Relationship"
     SNOWFLAKE_STREAM = "Snowflake Stream"
+    DYNAMIC_TABLE = "Dynamic Table"
+    API_ENDPOINT = "API Endpoint"
+    SLACK_CHANNEL = "Slack Channel"
+    PROJECTIONS = "Projections"
+    GOOGLE_SHEETS = "Google Sheets"
+    GOOGLE_SHEETS_NAMED_RANGE = "Google Sheets Named Range"
 
     # TODO: Create separate entity...
     NOTEBOOK = "Notebook"
@@ -44,13 +55,19 @@ class DatasetContainerSubTypes(StrEnum):
     GCS_BUCKET = "GCS bucket"
     ABS_CONTAINER = "ABS container"
     KEYSPACE = "Keyspace"  # Cassandra
+    NAMESPACE = "Namespace"  # Iceberg
+    DREMIO_SPACE = "Dremio Space"
+    DREMIO_SOURCE = "Dremio Source"
 
 
 class BIContainerSubTypes(StrEnum):
     LOOKER_FOLDER = "Folder"
     LOOKML_PROJECT = "LookML Project"
     LOOKML_MODEL = "LookML Model"
+    TABLEAU_SITE = "Site"
+    TABLEAU_PROJECT = "Project"
     TABLEAU_WORKBOOK = "Workbook"
+    POWERBI_WORKSPACE = "Workspace"
     POWERBI_DATASET = "Semantic Model"
     POWERBI_DATASET_TABLE = "Table"
     QLIK_SPACE = "Qlik Space"
@@ -58,6 +75,8 @@ class BIContainerSubTypes(StrEnum):
     SIGMA_WORKSPACE = "Sigma Workspace"
     SIGMA_WORKBOOK = "Sigma Workbook"
     MODE_COLLECTION = "Collection"
+    GRAFANA_FOLDER = "Folder"
+    GRAFANA_DASHBOARD = "Dashboard"
 
 
 class FlowContainerSubTypes(StrEnum):
@@ -68,10 +87,13 @@ class FlowContainerSubTypes(StrEnum):
 class JobContainerSubTypes(StrEnum):
     NIFI_PROCESS_GROUP = "Process Group"
     MSSQL_JOBSTEP = "Job Step"
-
+    STORED_PROCEDURE = "Stored Procedure"
 
 
 class BIAssetSubTypes(StrEnum):
+    DASHBOARD = "Dashboard"
+    CHART = "Chart"
+
     # Generic SubTypes
     REPORT = "Report"
 
@@ -93,7 +115,57 @@ class BIAssetSubTypes(StrEnum):
     SAC_STORY = "Story"
     SAC_APPLICATION = "Application"
 
+    # Hex
+    HEX_PROJECT = "Project"
+    HEX_COMPONENT = "Component"
+
 
 class MLAssetSubTypes(StrEnum):
     MLFLOW_TRAINING_RUN = "ML Training Run"
     MLFLOW_EXPERIMENT = "ML Experiment"
+    VERTEX_EXPERIMENT = "Experiment"
+    VERTEX_EXPERIMENT_RUN = "Experiment Run"
+    VERTEX_EXECUTION = "Execution"
+
+    VERTEX_MODEL = "ML Model"
+    VERTEX_MODEL_GROUP = "ML Model Group"
+    VERTEX_TRAINING_JOB = "Training Job"
+    VERTEX_ENDPOINT = "Endpoint"
+    VERTEX_DATASET = "Dataset"
+    VERTEX_PROJECT = "Project"
+    VERTEX_PIPELINE = "Pipeline Job"
+    VERTEX_PIPELINE_TASK = "Pipeline Task"
+    VERTEX_PIPELINE_TASK_RUN = "Pipeline Task Run"
+
+
+def create_source_capability_modifier_enum():
+    all_values: Dict[str, Any] = {}
+    source_enums = [
+        DatasetSubTypes,
+        DatasetContainerSubTypes,
+        BIContainerSubTypes,
+        FlowContainerSubTypes,
+        JobContainerSubTypes,
+        BIAssetSubTypes,
+        MLAssetSubTypes,
+    ]
+
+    for enum_class in source_enums:
+        for member in enum_class:  # type: ignore[var-annotated]
+            if member.name in all_values:
+                logger.debug(
+                    f"Warning: {member.name} already exists with value {all_values[member.name]}, skipping {member.value}"
+                )
+                continue
+            all_values[member.name] = member.value
+
+    enum_code = "class SourceCapabilityModifier(StrEnum):\n"
+    for name, value in all_values.items():
+        enum_code += f'    {name} = "{value}"\n'
+
+    exec(enum_code, globals())
+    return globals()["SourceCapabilityModifier"]
+
+
+# This will have all values from the enums above
+SourceCapabilityModifier = create_source_capability_modifier_enum()
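The hunk above (from the subtypes module listed in the file summary) assembles SourceCapabilityModifier by generating class source text and exec-ing it. The same merge can also be expressed with the standard Enum functional API; the sketch below shows that alternative with two stand-in enums and is an illustration, not the code shipped in this release.

from enum import Enum

class StrEnum(str, Enum):
    """Minimal stand-in for datahub.utilities.str_enum.StrEnum."""

class DatasetSubTypes(StrEnum):
    TABLE = "Table"
    VIEW = "View"

class BIAssetSubTypes(StrEnum):
    REPORT = "Report"

merged = {}
for enum_class in (DatasetSubTypes, BIAssetSubTypes):
    for member in enum_class:
        merged.setdefault(member.name, member.value)  # first definition wins, as in the hunk

# Functional API: build the combined enum without exec().
SourceCapabilityModifier = StrEnum("SourceCapabilityModifier", merged)

print(SourceCapabilityModifier.TABLE.value)  # "Table"

The exec-based version in the diff keeps the generated class visible as plain source text; the functional API trades that readability for avoiding exec, so the choice is mostly stylistic.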
@@ -11,23 +11,30 @@ from datahub.emitter.mcp_builder import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import (
-    get_bucket_name,
     get_bucket_relative_path,
     get_s3_prefix,
     is_s3_uri,
 )
 from datahub.ingestion.source.azure.abs_utils import (
     get_abs_prefix,
-    get_container_name,
     get_container_relative_path,
     is_abs_uri,
 )
 from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
+from datahub.ingestion.source.data_lake_common.object_store import (
+    get_object_store_bucket_name,
+    get_object_store_for_uri,
+)
+from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
 from datahub.ingestion.source.gcs.gcs_utils import (
-    get_gcs_bucket_name,
     get_gcs_prefix,
     is_gcs_uri,
 )
+from datahub.metadata.schema_classes import (
+    SchemaFieldClass,
+    SchemaFieldDataTypeClass,
+    StringTypeClass,
+)
 
 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
@@ -38,6 +45,37 @@ PLATFORM_GCS = "gcs"
 PLATFORM_ABS = "abs"
 
 
+def add_partition_columns_to_schema(
+    path_spec: PathSpec, full_path: str, fields: List[SchemaFieldClass]
+) -> None:
+    # Check if using fieldPath v2 format
+    is_fieldpath_v2 = any(
+        field.fieldPath.startswith("[version=2.0]") for field in fields
+    )
+
+    # Extract partition information from path
+    partition_keys = path_spec.get_partition_from_path(full_path)
+    if not partition_keys:
+        return
+
+    # Add partition fields to schema
+    for partition_key in partition_keys:
+        fields.append(
+            SchemaFieldClass(
+                fieldPath=(
+                    f"{partition_key[0]}"
+                    if not is_fieldpath_v2
+                    else f"[version=2.0].[type=string].{partition_key[0]}"
+                ),
+                nativeDataType="string",
+                type=SchemaFieldDataTypeClass(StringTypeClass()),
+                isPartitioningKey=True,
+                nullable=False,
+                recursive=False,
+            )
+        )
+
+
 class ContainerWUCreator:
     processed_containers: List[str]
 
@@ -87,6 +125,13 @@ class ContainerWUCreator:
 
     @staticmethod
     def get_protocol(path: str) -> str:
+        object_store = get_object_store_for_uri(path)
+        if object_store:
+            prefix = object_store.get_prefix(path)
+            if prefix:
+                return prefix
+
+        # Legacy fallback
         protocol: Optional[str] = None
         if is_s3_uri(path):
             protocol = get_s3_prefix(path)
@@ -104,13 +149,12 @@ class ContainerWUCreator:
 
     @staticmethod
     def get_bucket_name(path: str) -> str:
-
-
-
-
-
-
-        raise ValueError(f"Unable to get bucket name from path: {path}")
+        """
+        Get the bucket/container name from any supported object store URI.
+
+        Delegates to the abstract get_object_store_bucket_name function.
+        """
+        return get_object_store_bucket_name(path)
 
     def get_sub_types(self) -> str:
         if self.platform == PLATFORM_S3:
@@ -122,6 +166,11 @@ class ContainerWUCreator:
         raise ValueError(f"Unable to sub type for platform: {self.platform}")
 
     def get_base_full_path(self, path: str) -> str:
+        object_store = get_object_store_for_uri(path)
+        if object_store:
+            return object_store.get_object_key(path)
+
+        # Legacy fallback
         if self.platform == "s3" or self.platform == "gcs":
             return get_bucket_relative_path(path)
         elif self.platform == "abs":
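In the data-lake hunks above, get_protocol, get_bucket_name, and get_base_full_path now try a generic object-store lookup first and only fall back to the per-platform helpers. Below is a minimal sketch of that kind of URI-prefix dispatch; it is an illustration of the pattern only, with made-up class and registry names, and the real object_store.py introduced in this release (732 lines) handles many more URI forms.

from dataclasses import dataclass
from typing import Optional

@dataclass(frozen=True)
class ObjectStore:
    prefix: str  # e.g. "s3://"

    def get_prefix(self, uri: str) -> Optional[str]:
        return self.prefix if uri.startswith(self.prefix) else None

    def get_bucket_name(self, uri: str) -> str:
        return uri[len(self.prefix):].split("/", 1)[0]

    def get_object_key(self, uri: str) -> str:
        rest = uri[len(self.prefix):].split("/", 1)
        return rest[1] if len(rest) > 1 else ""

# Hypothetical registry; the real module supports more schemes and edge cases.
_OBJECT_STORES = [ObjectStore("s3://"), ObjectStore("gs://"), ObjectStore("abfss://")]

def get_object_store_for_uri(uri: str) -> Optional[ObjectStore]:
    return next((s for s in _OBJECT_STORES if uri.startswith(s.prefix)), None)

store = get_object_store_for_uri("s3://my-bucket/path/to/file.parquet")
assert store is not None
print(store.get_bucket_name("s3://my-bucket/path/to/file.parquet"))  # my-bucket
print(store.get_object_key("s3://my-bucket/path/to/file.parquet"))   # path/to/file.parquet

Centralizing the dispatch this way lets the container helpers stay scheme-agnostic while the legacy is_s3_uri/is_gcs_uri/is_abs_uri branches remain as a fallback.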