acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -9,6 +9,7 @@ from sqlalchemy import create_engine
|
|
|
9
9
|
|
|
10
10
|
from datahub.configuration.common import AllowDenyPattern, ConfigurationError
|
|
11
11
|
from datahub.ingestion.source.fivetran.config import (
|
|
12
|
+
DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES,
|
|
12
13
|
Constant,
|
|
13
14
|
FivetranLogConfig,
|
|
14
15
|
FivetranSourceReport,
|
|
@@ -54,7 +55,7 @@ class FivetranLogAPI:
|
|
|
54
55
|
snowflake_destination_config.database,
|
|
55
56
|
)
|
|
56
57
|
)
|
|
57
|
-
fivetran_log_query.
|
|
58
|
+
fivetran_log_query.set_schema(
|
|
58
59
|
snowflake_destination_config.log_schema,
|
|
59
60
|
)
|
|
60
61
|
fivetran_log_database = snowflake_destination_config.database
|
|
@@ -66,8 +67,26 @@ class FivetranLogAPI:
|
|
|
66
67
|
engine = create_engine(
|
|
67
68
|
bigquery_destination_config.get_sql_alchemy_url(),
|
|
68
69
|
)
|
|
69
|
-
fivetran_log_query.
|
|
70
|
-
|
|
70
|
+
fivetran_log_query.set_schema(bigquery_destination_config.dataset)
|
|
71
|
+
|
|
72
|
+
# The "database" should be the BigQuery project name.
|
|
73
|
+
result = engine.execute("SELECT @@project_id").fetchone()
|
|
74
|
+
if result is None:
|
|
75
|
+
raise ValueError("Failed to retrieve BigQuery project ID")
|
|
76
|
+
fivetran_log_database = result[0]
|
|
77
|
+
elif destination_platform == "databricks":
|
|
78
|
+
databricks_destination_config = (
|
|
79
|
+
self.fivetran_log_config.databricks_destination_config
|
|
80
|
+
)
|
|
81
|
+
if databricks_destination_config is not None:
|
|
82
|
+
engine = create_engine(
|
|
83
|
+
databricks_destination_config.get_sql_alchemy_url(
|
|
84
|
+
databricks_destination_config.catalog
|
|
85
|
+
),
|
|
86
|
+
**databricks_destination_config.get_options(),
|
|
87
|
+
)
|
|
88
|
+
fivetran_log_query.set_schema(databricks_destination_config.log_schema)
|
|
89
|
+
fivetran_log_database = databricks_destination_config.catalog
|
|
71
90
|
else:
|
|
72
91
|
raise ConfigurationError(
|
|
73
92
|
f"Destination platform '{destination_platform}' is not yet supported."
|
|
@@ -94,7 +113,11 @@ class FivetranLogAPI:
|
|
|
94
113
|
"""
|
|
95
114
|
Returns dict of column lineage metadata with key as (<SOURCE_TABLE_ID>, <DESTINATION_TABLE_ID>)
|
|
96
115
|
"""
|
|
97
|
-
all_column_lineage = defaultdict(list)
|
|
116
|
+
all_column_lineage: Dict[Tuple[str, str], List] = defaultdict(list)
|
|
117
|
+
|
|
118
|
+
if not connector_ids:
|
|
119
|
+
return dict(all_column_lineage)
|
|
120
|
+
|
|
98
121
|
column_lineage_result = self._query(
|
|
99
122
|
self.fivetran_log_query.get_column_lineage_query(
|
|
100
123
|
connector_ids=connector_ids
|
|
@@ -112,7 +135,11 @@ class FivetranLogAPI:
|
|
|
112
135
|
"""
|
|
113
136
|
Returns dict of table lineage metadata with key as 'CONNECTOR_ID'
|
|
114
137
|
"""
|
|
115
|
-
connectors_table_lineage_metadata = defaultdict(list)
|
|
138
|
+
connectors_table_lineage_metadata: Dict[str, List] = defaultdict(list)
|
|
139
|
+
|
|
140
|
+
if not connector_ids:
|
|
141
|
+
return dict(connectors_table_lineage_metadata)
|
|
142
|
+
|
|
116
143
|
table_lineage_result = self._query(
|
|
117
144
|
self.fivetran_log_query.get_table_lineage_query(connector_ids=connector_ids)
|
|
118
145
|
)
|
|
@@ -190,7 +217,7 @@ class FivetranLogAPI:
|
|
|
190
217
|
jobs: List[Job] = []
|
|
191
218
|
if connector_sync_log is None:
|
|
192
219
|
return jobs
|
|
193
|
-
for sync_id in connector_sync_log
|
|
220
|
+
for sync_id in connector_sync_log:
|
|
194
221
|
if len(connector_sync_log[sync_id]) != 2:
|
|
195
222
|
# If both sync-start and sync-end event log not present for this sync that means sync is still in progress
|
|
196
223
|
continue
|
|
@@ -228,9 +255,15 @@ class FivetranLogAPI:
|
|
|
228
255
|
return self._get_users().get(user_id)
|
|
229
256
|
|
|
230
257
|
def _fill_connectors_lineage(self, connectors: List[Connector]) -> None:
|
|
231
|
-
connector_ids
|
|
232
|
-
|
|
233
|
-
|
|
258
|
+
# Create 2 filtered connector_ids lists - one for table lineage and one for column lineage
|
|
259
|
+
tll_connector_ids: List[str] = []
|
|
260
|
+
cll_connector_ids: List[str] = []
|
|
261
|
+
for connector in connectors:
|
|
262
|
+
tll_connector_ids.append(connector.connector_id)
|
|
263
|
+
if connector.connector_type not in DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES:
|
|
264
|
+
cll_connector_ids.append(connector.connector_id)
|
|
265
|
+
table_lineage_metadata = self._get_table_lineage_metadata(tll_connector_ids)
|
|
266
|
+
column_lineage_metadata = self._get_column_lineage_metadata(cll_connector_ids)
|
|
234
267
|
for connector in connectors:
|
|
235
268
|
connector.lineage = self._extract_connector_lineage(
|
|
236
269
|
table_lineage_result=table_lineage_metadata.get(connector.connector_id),
|
|
@@ -6,34 +6,56 @@ MAX_COLUMN_LINEAGE_PER_CONNECTOR = 1000
|
|
|
6
6
|
MAX_JOBS_PER_CONNECTOR = 500
|
|
7
7
|
|
|
8
8
|
|
|
9
|
+
"""
|
|
10
|
+
------------------------------------------------------------------------------------------------------------
|
|
11
|
+
Fivetran Platform Connector Handling
|
|
12
|
+
------------------------------------------------------------------------------------------------------------
|
|
13
|
+
Current Query Change Log: August 2025 (See: https://fivetran.com/docs/changelog/2025/august-2025)
|
|
14
|
+
|
|
15
|
+
All queries must be updated to match each Fivetran Platform Connector release, if any. We expect customers
|
|
16
|
+
and Fivetran to keep the Platform Connector configured for DataHub with auto sync enabled to get the latest changes.
|
|
17
|
+
|
|
18
|
+
References:
|
|
19
|
+
- Fivetran Release Notes: https://fivetran.com/docs/changelog (Look for "Fivetran Platform Connector")
|
|
20
|
+
- Latest Platform Connector Schema: https://fivetran.com/docs/logs/fivetran-platform?erdModal=open
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
|
|
9
24
|
class FivetranLogQuery:
|
|
10
25
|
# Note: All queries are written in Snowflake SQL.
|
|
11
26
|
# They will be transpiled to the target database's SQL dialect at runtime.
|
|
12
27
|
|
|
13
28
|
def __init__(self) -> None:
|
|
14
29
|
# Select query db clause
|
|
15
|
-
self.
|
|
16
|
-
|
|
17
|
-
def set_db(self, db_name: str) -> None:
|
|
18
|
-
self.db_clause = f"{db_name}."
|
|
30
|
+
self.schema_clause: str = ""
|
|
19
31
|
|
|
20
32
|
def use_database(self, db_name: str) -> str:
|
|
21
33
|
return f"use database {db_name}"
|
|
22
34
|
|
|
35
|
+
def set_schema(self, schema_name: str) -> None:
|
|
36
|
+
"""
|
|
37
|
+
Using Snowflake quoted identifiers convention
|
|
38
|
+
|
|
39
|
+
Add double quotes around an identifier
|
|
40
|
+
Use two quotes to use the double quote character inside a quoted identifier
|
|
41
|
+
"""
|
|
42
|
+
schema_name = schema_name.replace('"', '""')
|
|
43
|
+
self.schema_clause = f'"{schema_name}".'
|
|
44
|
+
|
|
23
45
|
def get_connectors_query(self) -> str:
|
|
24
46
|
return f"""\
|
|
25
47
|
SELECT
|
|
26
|
-
|
|
48
|
+
connection_id,
|
|
27
49
|
connecting_user_id,
|
|
28
50
|
connector_type_id,
|
|
29
|
-
|
|
51
|
+
connection_name,
|
|
30
52
|
paused,
|
|
31
53
|
sync_frequency,
|
|
32
54
|
destination_id
|
|
33
|
-
FROM {self.
|
|
55
|
+
FROM {self.schema_clause}connection
|
|
34
56
|
WHERE
|
|
35
57
|
_fivetran_deleted = FALSE
|
|
36
|
-
QUALIFY ROW_NUMBER() OVER (PARTITION BY
|
|
58
|
+
QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY _fivetran_synced DESC) = 1
|
|
37
59
|
"""
|
|
38
60
|
|
|
39
61
|
def get_users_query(self) -> str:
|
|
@@ -42,7 +64,7 @@ SELECT id as user_id,
|
|
|
42
64
|
given_name,
|
|
43
65
|
family_name,
|
|
44
66
|
email
|
|
45
|
-
FROM {self.
|
|
67
|
+
FROM {self.schema_clause}user
|
|
46
68
|
"""
|
|
47
69
|
|
|
48
70
|
def get_sync_logs_query(
|
|
@@ -56,20 +78,20 @@ FROM {self.db_clause}user
|
|
|
56
78
|
return f"""\
|
|
57
79
|
WITH ranked_syncs AS (
|
|
58
80
|
SELECT
|
|
59
|
-
|
|
81
|
+
connection_id,
|
|
60
82
|
sync_id,
|
|
61
83
|
MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time,
|
|
62
84
|
MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time,
|
|
63
85
|
MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data,
|
|
64
|
-
ROW_NUMBER() OVER (PARTITION BY
|
|
65
|
-
FROM {self.
|
|
86
|
+
ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY MAX(time_stamp) DESC) as rn
|
|
87
|
+
FROM {self.schema_clause}log
|
|
66
88
|
WHERE message_event in ('sync_start', 'sync_end')
|
|
67
89
|
AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days'
|
|
68
|
-
AND
|
|
69
|
-
GROUP BY
|
|
90
|
+
AND connection_id IN ({formatted_connector_ids})
|
|
91
|
+
GROUP BY connection_id, sync_id
|
|
70
92
|
)
|
|
71
93
|
SELECT
|
|
72
|
-
|
|
94
|
+
connection_id,
|
|
73
95
|
sync_id,
|
|
74
96
|
start_time,
|
|
75
97
|
end_time,
|
|
@@ -78,7 +100,7 @@ FROM ranked_syncs
|
|
|
78
100
|
WHERE rn <= {MAX_JOBS_PER_CONNECTOR}
|
|
79
101
|
AND start_time IS NOT NULL
|
|
80
102
|
AND end_time IS NOT NULL
|
|
81
|
-
ORDER BY
|
|
103
|
+
ORDER BY connection_id, end_time DESC
|
|
82
104
|
"""
|
|
83
105
|
|
|
84
106
|
def get_table_lineage_query(self, connector_ids: List[str]) -> str:
|
|
@@ -90,7 +112,7 @@ SELECT
|
|
|
90
112
|
*
|
|
91
113
|
FROM (
|
|
92
114
|
SELECT
|
|
93
|
-
stm.
|
|
115
|
+
stm.connection_id as connection_id,
|
|
94
116
|
stm.id as source_table_id,
|
|
95
117
|
stm.name as source_table_name,
|
|
96
118
|
ssm.name as source_schema_name,
|
|
@@ -98,18 +120,18 @@ FROM (
|
|
|
98
120
|
dtm.name as destination_table_name,
|
|
99
121
|
dsm.name as destination_schema_name,
|
|
100
122
|
tl.created_at as created_at,
|
|
101
|
-
ROW_NUMBER() OVER (PARTITION BY stm.
|
|
102
|
-
FROM {self.
|
|
103
|
-
JOIN {self.
|
|
104
|
-
JOIN {self.
|
|
105
|
-
JOIN {self.
|
|
106
|
-
JOIN {self.
|
|
107
|
-
WHERE stm.
|
|
123
|
+
ROW_NUMBER() OVER (PARTITION BY stm.connection_id, stm.id, dtm.id ORDER BY tl.created_at DESC) as table_combo_rn
|
|
124
|
+
FROM {self.schema_clause}table_lineage as tl
|
|
125
|
+
JOIN {self.schema_clause}source_table as stm on tl.source_table_id = stm.id -- stm: source_table_metadata
|
|
126
|
+
JOIN {self.schema_clause}destination_table as dtm on tl.destination_table_id = dtm.id -- dtm: destination_table_metadata
|
|
127
|
+
JOIN {self.schema_clause}source_schema as ssm on stm.schema_id = ssm.id -- ssm: source_schema_metadata
|
|
128
|
+
JOIN {self.schema_clause}destination_schema as dsm on dtm.schema_id = dsm.id -- dsm: destination_schema_metadata
|
|
129
|
+
WHERE stm.connection_id IN ({formatted_connector_ids})
|
|
108
130
|
)
|
|
109
131
|
-- Ensure that we only get back one entry per source and destination pair.
|
|
110
132
|
WHERE table_combo_rn = 1
|
|
111
|
-
QUALIFY ROW_NUMBER() OVER (PARTITION BY
|
|
112
|
-
ORDER BY
|
|
133
|
+
QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY created_at DESC) <= {MAX_TABLE_LINEAGE_PER_CONNECTOR}
|
|
134
|
+
ORDER BY connection_id, created_at DESC
|
|
113
135
|
"""
|
|
114
136
|
|
|
115
137
|
def get_column_lineage_query(self, connector_ids: List[str]) -> str:
|
|
@@ -124,25 +146,25 @@ SELECT
|
|
|
124
146
|
destination_column_name
|
|
125
147
|
FROM (
|
|
126
148
|
SELECT
|
|
127
|
-
stm.
|
|
149
|
+
stm.connection_id as connection_id,
|
|
128
150
|
scm.table_id as source_table_id,
|
|
129
151
|
dcm.table_id as destination_table_id,
|
|
130
152
|
scm.name as source_column_name,
|
|
131
153
|
dcm.name as destination_column_name,
|
|
132
154
|
cl.created_at as created_at,
|
|
133
|
-
ROW_NUMBER() OVER (PARTITION BY stm.
|
|
134
|
-
FROM {self.
|
|
135
|
-
JOIN {self.
|
|
155
|
+
ROW_NUMBER() OVER (PARTITION BY stm.connection_id, cl.source_column_id, cl.destination_column_id ORDER BY cl.created_at DESC) as column_combo_rn
|
|
156
|
+
FROM {self.schema_clause}column_lineage as cl
|
|
157
|
+
JOIN {self.schema_clause}source_column as scm -- scm: source_column_metadata
|
|
136
158
|
ON cl.source_column_id = scm.id
|
|
137
|
-
JOIN {self.
|
|
159
|
+
JOIN {self.schema_clause}destination_column as dcm -- dcm: destination_column_metadata
|
|
138
160
|
ON cl.destination_column_id = dcm.id
|
|
139
|
-
-- Only joining
|
|
140
|
-
JOIN {self.
|
|
161
|
+
-- Only joining source_table to get the connection_id.
|
|
162
|
+
JOIN {self.schema_clause}source_table as stm -- stm: source_table_metadata
|
|
141
163
|
ON scm.table_id = stm.id
|
|
142
|
-
WHERE stm.
|
|
164
|
+
WHERE stm.connection_id IN ({formatted_connector_ids})
|
|
143
165
|
)
|
|
144
166
|
-- Ensure that we only get back one entry per (connector, source column, destination column) pair.
|
|
145
167
|
WHERE column_combo_rn = 1
|
|
146
|
-
QUALIFY ROW_NUMBER() OVER (PARTITION BY
|
|
147
|
-
ORDER BY
|
|
168
|
+
QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY created_at DESC) <= {MAX_COLUMN_LINEAGE_PER_CONNECTOR}
|
|
169
|
+
ORDER BY connection_id, created_at DESC
|
|
148
170
|
"""
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
import requests
|
|
4
|
+
from requests.adapters import HTTPAdapter
|
|
5
|
+
from urllib3.util import Retry
|
|
6
|
+
|
|
7
|
+
from datahub.ingestion.source.fivetran.config import (
|
|
8
|
+
FivetranAPIConfig,
|
|
9
|
+
)
|
|
10
|
+
from datahub.ingestion.source.fivetran.response_models import FivetranConnectionDetails
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
# Retry configuration constants
|
|
15
|
+
RETRY_MAX_TIMES = 3
|
|
16
|
+
RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
|
|
17
|
+
RETRY_BACKOFF_FACTOR = 1
|
|
18
|
+
RETRY_ALLOWED_METHODS = ["GET"]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class FivetranAPIClient:
|
|
22
|
+
"""Client for interacting with the Fivetran REST API."""
|
|
23
|
+
|
|
24
|
+
def __init__(self, config: FivetranAPIConfig) -> None:
|
|
25
|
+
self.config = config
|
|
26
|
+
self._session = self._create_session()
|
|
27
|
+
|
|
28
|
+
def _create_session(self) -> requests.Session:
|
|
29
|
+
"""
|
|
30
|
+
Create a session with retry logic and basic authentication
|
|
31
|
+
"""
|
|
32
|
+
requests_session = requests.Session()
|
|
33
|
+
|
|
34
|
+
# Configure retry strategy for transient failures
|
|
35
|
+
retry_strategy = Retry(
|
|
36
|
+
total=RETRY_MAX_TIMES,
|
|
37
|
+
backoff_factor=RETRY_BACKOFF_FACTOR,
|
|
38
|
+
status_forcelist=RETRY_STATUS_CODES,
|
|
39
|
+
allowed_methods=RETRY_ALLOWED_METHODS,
|
|
40
|
+
raise_on_status=True,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
adapter = HTTPAdapter(max_retries=retry_strategy)
|
|
44
|
+
requests_session.mount("http://", adapter)
|
|
45
|
+
requests_session.mount("https://", adapter)
|
|
46
|
+
|
|
47
|
+
# Set up basic authentication
|
|
48
|
+
requests_session.auth = (self.config.api_key, self.config.api_secret)
|
|
49
|
+
requests_session.headers.update(
|
|
50
|
+
{
|
|
51
|
+
"Content-Type": "application/json",
|
|
52
|
+
"Accept": "application/json",
|
|
53
|
+
}
|
|
54
|
+
)
|
|
55
|
+
return requests_session
|
|
56
|
+
|
|
57
|
+
def get_connection_details_by_id(
|
|
58
|
+
self, connection_id: str
|
|
59
|
+
) -> FivetranConnectionDetails:
|
|
60
|
+
"""Get details for a specific connection."""
|
|
61
|
+
connection_details = self._session.get(
|
|
62
|
+
f"{self.config.base_url}/v1/connections/{connection_id}",
|
|
63
|
+
timeout=self.config.request_timeout_sec,
|
|
64
|
+
)
|
|
65
|
+
return FivetranConnectionDetails(**connection_details.json().get("data", {}))
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
from typing import Dict, List
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class FivetranConnectionWarnings(BaseModel):
|
|
8
|
+
code: str # Warning Code
|
|
9
|
+
message: str # Warning Message
|
|
10
|
+
details: Dict # Warning Details
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class FivetranConnectionStatus(BaseModel):
|
|
14
|
+
setup_state: str # Setup State
|
|
15
|
+
schema_status: str # Schema Status
|
|
16
|
+
sync_state: str # Sync State
|
|
17
|
+
update_state: str # Update State
|
|
18
|
+
is_historical_sync: bool # Is Historical Sync
|
|
19
|
+
warnings: List[FivetranConnectionWarnings] # Warnings
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class FivetranConnectionConfig(BaseModel):
|
|
23
|
+
# Note: Connection Config is different for different connectors
|
|
24
|
+
auth_type: str # Auth Type
|
|
25
|
+
sheet_id: str # Sheet ID - URL to the Google Sheet
|
|
26
|
+
named_range: str # Named Range
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class FivetranConnectionSourceSyncDetails(BaseModel):
|
|
30
|
+
last_synced: datetime.datetime # Last Synced
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class FivetranConnectionDetails(BaseModel):
|
|
34
|
+
"""
|
|
35
|
+
Note: This response class only captures fields that are relevant to the Google Sheets Connector
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
id: str # Source ID
|
|
39
|
+
group_id: str # Destination ID
|
|
40
|
+
service: str # Connector Type
|
|
41
|
+
created_at: datetime.datetime
|
|
42
|
+
succeeded_at: datetime.datetime
|
|
43
|
+
paused: bool # Paused Status
|
|
44
|
+
sync_frequency: int # Sync Frequency (minutes)
|
|
45
|
+
status: FivetranConnectionStatus # Status
|
|
46
|
+
config: FivetranConnectionConfig # Connection Config
|
|
47
|
+
source_sync_details: FivetranConnectionSourceSyncDetails # Source Sync Details
|
|
48
|
+
|
|
49
|
+
"""
|
|
50
|
+
# Sample Response for Google Sheets Connector
|
|
51
|
+
{
|
|
52
|
+
"code": "Success",
|
|
53
|
+
"data": {
|
|
54
|
+
"id": "dialectical_remindful",
|
|
55
|
+
"group_id": "empties_classification",
|
|
56
|
+
"service": "google_sheets",
|
|
57
|
+
"service_version": 1,
|
|
58
|
+
"schema": "fivetran_google_sheets.fivetran_google_sheets",
|
|
59
|
+
"connected_by": "sewn_restrained",
|
|
60
|
+
"created_at": "2025-10-06T17:53:01.554289Z",
|
|
61
|
+
"succeeded_at": "2025-10-06T22:55:45.275000Z",
|
|
62
|
+
"failed_at": null,
|
|
63
|
+
"paused": true,
|
|
64
|
+
"pause_after_trial": false,
|
|
65
|
+
"sync_frequency": 360,
|
|
66
|
+
"data_delay_threshold": 0,
|
|
67
|
+
"data_delay_sensitivity": "NORMAL",
|
|
68
|
+
"private_link_id": null,
|
|
69
|
+
"networking_method": "Directly",
|
|
70
|
+
"proxy_agent_id": null,
|
|
71
|
+
"schedule_type": "auto",
|
|
72
|
+
"status": {
|
|
73
|
+
"setup_state": "connected",
|
|
74
|
+
"schema_status": "ready",
|
|
75
|
+
"sync_state": "paused",
|
|
76
|
+
"update_state": "on_schedule",
|
|
77
|
+
"is_historical_sync": false,
|
|
78
|
+
"tasks": [],
|
|
79
|
+
"warnings": [
|
|
80
|
+
{
|
|
81
|
+
"code": "snowflake_discontinuing_password_auth",
|
|
82
|
+
"message": "Snowflake is discontinuing username/password authentication",
|
|
83
|
+
"details": {}
|
|
84
|
+
}
|
|
85
|
+
]
|
|
86
|
+
},
|
|
87
|
+
"config": {
|
|
88
|
+
"auth_type": "ServiceAccount",
|
|
89
|
+
"sheet_id": "https://docs.google.com/spreadsheets/d/1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo/edit?gid=0#gid=0",
|
|
90
|
+
"named_range": "Fivetran_Test_Range"
|
|
91
|
+
},
|
|
92
|
+
"source_sync_details": {
|
|
93
|
+
"last_synced": "2025-10-06T22:55:27.371Z"
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
"""
|
|
@@ -34,7 +34,6 @@ from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
|
|
|
34
34
|
SoftDeletedEntitiesCleanupConfig,
|
|
35
35
|
SoftDeletedEntitiesReport,
|
|
36
36
|
)
|
|
37
|
-
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
|
|
38
37
|
|
|
39
38
|
logger = logging.getLogger(__name__)
|
|
40
39
|
|
|
@@ -87,7 +86,6 @@ class DataHubGcSourceReport(
|
|
|
87
86
|
DataProcessCleanupReport,
|
|
88
87
|
SoftDeletedEntitiesReport,
|
|
89
88
|
DatahubExecutionRequestCleanupReport,
|
|
90
|
-
IngestionStageReport,
|
|
91
89
|
):
|
|
92
90
|
expired_tokens_revoked: int = 0
|
|
93
91
|
|