acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/source/fivetran/fivetran_log_api.py
+++ b/datahub/ingestion/source/fivetran/fivetran_log_api.py
@@ -9,6 +9,7 @@ from sqlalchemy import create_engine
 
 from datahub.configuration.common import AllowDenyPattern, ConfigurationError
 from datahub.ingestion.source.fivetran.config import (
+    DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES,
     Constant,
     FivetranLogConfig,
     FivetranSourceReport,
@@ -69,9 +70,23 @@ class FivetranLogAPI:
             fivetran_log_query.set_schema(bigquery_destination_config.dataset)
 
             # The "database" should be the BigQuery project name.
-
-
-
+            result = engine.execute("SELECT @@project_id").fetchone()
+            if result is None:
+                raise ValueError("Failed to retrieve BigQuery project ID")
+            fivetran_log_database = result[0]
+        elif destination_platform == "databricks":
+            databricks_destination_config = (
+                self.fivetran_log_config.databricks_destination_config
+            )
+            if databricks_destination_config is not None:
+                engine = create_engine(
+                    databricks_destination_config.get_sql_alchemy_url(
+                        databricks_destination_config.catalog
+                    ),
+                    **databricks_destination_config.get_options(),
+                )
+                fivetran_log_query.set_schema(databricks_destination_config.log_schema)
+                fivetran_log_database = databricks_destination_config.catalog
         else:
             raise ConfigurationError(
                 f"Destination platform '{destination_platform}' is not yet supported."
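
Note on the BigQuery branch above: "SELECT @@project_id" is a BigQuery system variable holding the project the engine is connected to, which the connector then uses as the Fivetran log "database". A minimal standalone sketch, assuming the sqlalchemy-bigquery dialect and the SQLAlchemy 1.x execute style that the connector code itself uses:

    from sqlalchemy import create_engine

    # Resolve the project id the engine is bound to (application-default credentials).
    engine = create_engine("bigquery://")
    row = engine.execute("SELECT @@project_id").fetchone()
    project_id = row[0] if row is not None else None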
@@ -98,7 +113,11 @@ class FivetranLogAPI:
         """
         Returns dict of column lineage metadata with key as (<SOURCE_TABLE_ID>, <DESTINATION_TABLE_ID>)
         """
-        all_column_lineage = defaultdict(list)
+        all_column_lineage: Dict[Tuple[str, str], List] = defaultdict(list)
+
+        if not connector_ids:
+            return dict(all_column_lineage)
+
         column_lineage_result = self._query(
             self.fivetran_log_query.get_column_lineage_query(
                 connector_ids=connector_ids
@@ -116,7 +135,11 @@ class FivetranLogAPI:
         """
         Returns dict of table lineage metadata with key as 'CONNECTOR_ID'
         """
-        connectors_table_lineage_metadata = defaultdict(list)
+        connectors_table_lineage_metadata: Dict[str, List] = defaultdict(list)
+
+        if not connector_ids:
+            return dict(connectors_table_lineage_metadata)
+
         table_lineage_result = self._query(
             self.fivetran_log_query.get_table_lineage_query(connector_ids=connector_ids)
         )
@@ -232,9 +255,15 @@ class FivetranLogAPI:
         return self._get_users().get(user_id)
 
     def _fill_connectors_lineage(self, connectors: List[Connector]) -> None:
-        connector_ids
-
-
+        # Create 2 filtered connector_ids lists - one for table lineage and one for column lineage
+        tll_connector_ids: List[str] = []
+        cll_connector_ids: List[str] = []
+        for connector in connectors:
+            tll_connector_ids.append(connector.connector_id)
+            if connector.connector_type not in DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES:
+                cll_connector_ids.append(connector.connector_id)
+        table_lineage_metadata = self._get_table_lineage_metadata(tll_connector_ids)
+        column_lineage_metadata = self._get_column_lineage_metadata(cll_connector_ids)
         for connector in connectors:
             connector.lineage = self._extract_connector_lineage(
                 table_lineage_result=table_lineage_metadata.get(connector.connector_id),
--- a/datahub/ingestion/source/fivetran/fivetran_query.py
+++ b/datahub/ingestion/source/fivetran/fivetran_query.py
@@ -6,6 +6,21 @@ MAX_COLUMN_LINEAGE_PER_CONNECTOR = 1000
 MAX_JOBS_PER_CONNECTOR = 500
 
 
+"""
+------------------------------------------------------------------------------------------------------------
+Fivetran Platform Connector Handling
+------------------------------------------------------------------------------------------------------------
+Current Query Change Log: August 2025 (See: https://fivetran.com/docs/changelog/2025/august-2025)
+
+All queries have to be updated as per Fivetran Platform Connector release if any. We expect customers
+and fivetran to keep platform connector configured for DataHub with auto sync enabled to get latest changes.
+
+References:
+- Fivetran Release Notes: https://fivetran.com/docs/changelog (Look for "Fivetran Platform Connector")
+- Latest Platform Connector Schema: https://fivetran.com/docs/logs/fivetran-platform?erdModal=open
+"""
+
+
 class FivetranLogQuery:
     # Note: All queries are written in Snowflake SQL.
     # They will be transpiled to the target database's SQL dialect at runtime.
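
The trailing context lines above note that all queries are authored in Snowflake SQL and transpiled to the destination's dialect at runtime. A sketch of what that transpilation step looks like with sqlglot; this is illustrative only and not the connector's exact code path:

    import sqlglot

    # Authored once in Snowflake SQL...
    query = (
        "SELECT connection_id FROM log "
        "QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY time_stamp DESC) = 1"
    )

    # ...then rendered for whichever warehouse hosts the Fivetran platform connector tables.
    for dialect in ("bigquery", "databricks", "redshift"):
        print(dialect, "->", sqlglot.transpile(query, read="snowflake", write=dialect)[0])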
@@ -18,22 +33,29 @@ class FivetranLogQuery:
         return f"use database {db_name}"
 
     def set_schema(self, schema_name: str) -> None:
-
+        """
+        Using Snowflake quoted identifiers convention
+
+        Add double quotes around an identifier
+        Use two quotes to use the double quote character inside a quoted identifier
+        """
+        schema_name = schema_name.replace('"', '""')
+        self.schema_clause = f'"{schema_name}".'
 
     def get_connectors_query(self) -> str:
         return f"""\
 SELECT
-
+    connection_id,
     connecting_user_id,
     connector_type_id,
-
+    connection_name,
     paused,
     sync_frequency,
     destination_id
-FROM {self.schema_clause}
+FROM {self.schema_clause}connection
 WHERE
     _fivetran_deleted = FALSE
-QUALIFY ROW_NUMBER() OVER (PARTITION BY
+QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY _fivetran_synced DESC) = 1
 """
 
     def get_users_query(self) -> str:
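
The new set_schema body applies Snowflake's quoted-identifier rules: double any embedded double quote, then wrap the identifier in double quotes. The same escaping as a minimal standalone sketch:

    def quote_snowflake_identifier(name: str) -> str:
        # Double embedded double quotes, then wrap the identifier in quotes.
        return '"' + name.replace('"', '""') + '"'

    assert quote_snowflake_identifier("fivetran_log") == '"fivetran_log"'
    assert quote_snowflake_identifier('my"schema') == '"my""schema"'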
@@ -56,20 +78,20 @@ FROM {self.schema_clause}user
         return f"""\
 WITH ranked_syncs AS (
     SELECT
-
+        connection_id,
         sync_id,
         MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time,
         MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time,
         MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data,
-        ROW_NUMBER() OVER (PARTITION BY
+        ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY MAX(time_stamp) DESC) as rn
     FROM {self.schema_clause}log
     WHERE message_event in ('sync_start', 'sync_end')
         AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days'
-        AND
-    GROUP BY
+        AND connection_id IN ({formatted_connector_ids})
+    GROUP BY connection_id, sync_id
 )
 SELECT
-
+    connection_id,
     sync_id,
     start_time,
     end_time,
@@ -78,7 +100,7 @@ FROM ranked_syncs
 WHERE rn <= {MAX_JOBS_PER_CONNECTOR}
     AND start_time IS NOT NULL
     AND end_time IS NOT NULL
-ORDER BY
+ORDER BY connection_id, end_time DESC
 """
 
     def get_table_lineage_query(self, connector_ids: List[str]) -> str:
@@ -90,7 +112,7 @@ SELECT
     *
 FROM (
     SELECT
-        stm.
+        stm.connection_id as connection_id,
         stm.id as source_table_id,
         stm.name as source_table_name,
         ssm.name as source_schema_name,
@@ -98,18 +120,18 @@ FROM (
         dtm.name as destination_table_name,
         dsm.name as destination_schema_name,
         tl.created_at as created_at,
-        ROW_NUMBER() OVER (PARTITION BY stm.
+        ROW_NUMBER() OVER (PARTITION BY stm.connection_id, stm.id, dtm.id ORDER BY tl.created_at DESC) as table_combo_rn
     FROM {self.schema_clause}table_lineage as tl
-    JOIN {self.schema_clause}
-    JOIN {self.schema_clause}
-    JOIN {self.schema_clause}
-    JOIN {self.schema_clause}
-    WHERE stm.
+    JOIN {self.schema_clause}source_table as stm on tl.source_table_id = stm.id -- stm: source_table_metadata
+    JOIN {self.schema_clause}destination_table as dtm on tl.destination_table_id = dtm.id -- dtm: destination_table_metadata
+    JOIN {self.schema_clause}source_schema as ssm on stm.schema_id = ssm.id -- ssm: source_schema_metadata
+    JOIN {self.schema_clause}destination_schema as dsm on dtm.schema_id = dsm.id -- dsm: destination_schema_metadata
+    WHERE stm.connection_id IN ({formatted_connector_ids})
 )
 -- Ensure that we only get back one entry per source and destination pair.
 WHERE table_combo_rn = 1
-QUALIFY ROW_NUMBER() OVER (PARTITION BY
-ORDER BY
+QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY created_at DESC) <= {MAX_TABLE_LINEAGE_PER_CONNECTOR}
+ORDER BY connection_id, created_at DESC
 """
 
     def get_column_lineage_query(self, connector_ids: List[str]) -> str:
@@ -124,25 +146,25 @@ SELECT
     destination_column_name
 FROM (
     SELECT
-        stm.
+        stm.connection_id as connection_id,
         scm.table_id as source_table_id,
         dcm.table_id as destination_table_id,
         scm.name as source_column_name,
         dcm.name as destination_column_name,
         cl.created_at as created_at,
-        ROW_NUMBER() OVER (PARTITION BY stm.
+        ROW_NUMBER() OVER (PARTITION BY stm.connection_id, cl.source_column_id, cl.destination_column_id ORDER BY cl.created_at DESC) as column_combo_rn
     FROM {self.schema_clause}column_lineage as cl
-    JOIN {self.schema_clause}
+    JOIN {self.schema_clause}source_column as scm -- scm: source_column_metadata
         ON cl.source_column_id = scm.id
-    JOIN {self.schema_clause}
+    JOIN {self.schema_clause}destination_column as dcm -- dcm: destination_column_metadata
         ON cl.destination_column_id = dcm.id
-    -- Only joining
-    JOIN {self.schema_clause}
+    -- Only joining source_table to get the connection_id.
+    JOIN {self.schema_clause}source_table as stm -- stm: source_table_metadata
         ON scm.table_id = stm.id
-    WHERE stm.
+    WHERE stm.connection_id IN ({formatted_connector_ids})
 )
 -- Ensure that we only get back one entry per (connector, source column, destination column) pair.
 WHERE column_combo_rn = 1
-QUALIFY ROW_NUMBER() OVER (PARTITION BY
-ORDER BY
+QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY created_at DESC) <= {MAX_COLUMN_LINEAGE_PER_CONNECTOR}
+ORDER BY connection_id, created_at DESC
 """
--- /dev/null
+++ b/datahub/ingestion/source/fivetran/fivetran_rest_api.py
@@ -0,0 +1,65 @@
+import logging
+
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util import Retry
+
+from datahub.ingestion.source.fivetran.config import (
+    FivetranAPIConfig,
+)
+from datahub.ingestion.source.fivetran.response_models import FivetranConnectionDetails
+
+logger = logging.getLogger(__name__)
+
+# Retry configuration constants
+RETRY_MAX_TIMES = 3
+RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
+RETRY_BACKOFF_FACTOR = 1
+RETRY_ALLOWED_METHODS = ["GET"]
+
+
+class FivetranAPIClient:
+    """Client for interacting with the Fivetran REST API."""
+
+    def __init__(self, config: FivetranAPIConfig) -> None:
+        self.config = config
+        self._session = self._create_session()
+
+    def _create_session(self) -> requests.Session:
+        """
+        Create a session with retry logic and basic authentication
+        """
+        requests_session = requests.Session()
+
+        # Configure retry strategy for transient failures
+        retry_strategy = Retry(
+            total=RETRY_MAX_TIMES,
+            backoff_factor=RETRY_BACKOFF_FACTOR,
+            status_forcelist=RETRY_STATUS_CODES,
+            allowed_methods=RETRY_ALLOWED_METHODS,
+            raise_on_status=True,
+        )
+
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        requests_session.mount("http://", adapter)
+        requests_session.mount("https://", adapter)
+
+        # Set up basic authentication
+        requests_session.auth = (self.config.api_key, self.config.api_secret)
+        requests_session.headers.update(
+            {
+                "Content-Type": "application/json",
+                "Accept": "application/json",
+            }
+        )
+        return requests_session
+
+    def get_connection_details_by_id(
+        self, connection_id: str
+    ) -> FivetranConnectionDetails:
+        """Get details for a specific connection."""
+        connection_details = self._session.get(
+            f"{self.config.base_url}/v1/connections/{connection_id}",
+            timeout=self.config.request_timeout_sec,
+        )
+        return FivetranConnectionDetails(**connection_details.json().get("data", {}))
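
A hypothetical usage sketch for the new client. The FivetranAPIConfig values are placeholders, and base_url plus request_timeout_sec are assumed to carry defaults in the config class:

    from datahub.ingestion.source.fivetran.config import FivetranAPIConfig
    from datahub.ingestion.source.fivetran.fivetran_rest_api import FivetranAPIClient

    client = FivetranAPIClient(FivetranAPIConfig(api_key="...", api_secret="..."))
    details = client.get_connection_details_by_id("dialectical_remindful")
    print(details.service, details.status.sync_state)

Because the session mounts an HTTPAdapter with the Retry policy above, GET calls retry transparently on 429 and 5xx responses with backoff before raising.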
--- /dev/null
+++ b/datahub/ingestion/source/fivetran/response_models.py
@@ -0,0 +1,97 @@
+import datetime
+from typing import Dict, List
+
+from pydantic import BaseModel
+
+
+class FivetranConnectionWarnings(BaseModel):
+    code: str  # Warning Code
+    message: str  # Warning Message
+    details: Dict  # Warning Details
+
+
+class FivetranConnectionStatus(BaseModel):
+    setup_state: str  # Setup State
+    schema_status: str  # Schema Status
+    sync_state: str  # Sync State
+    update_state: str  # Update State
+    is_historical_sync: bool  # Is Historical Sync
+    warnings: List[FivetranConnectionWarnings]  # Warnings
+
+
+class FivetranConnectionConfig(BaseModel):
+    # Note: Connection Config is different for different connectors
+    auth_type: str  # Auth Type
+    sheet_id: str  # Sheet ID - URL to the Google Sheet
+    named_range: str  # Named Range
+
+
+class FivetranConnectionSourceSyncDetails(BaseModel):
+    last_synced: datetime.datetime  # Last Synced
+
+
+class FivetranConnectionDetails(BaseModel):
+    """
+    Note: This reponse class only captures fields that are relevant to the Google Sheets Connector
+    """
+
+    id: str  # Source ID
+    group_id: str  # Destination ID
+    service: str  # Connector Type
+    created_at: datetime.datetime
+    succeeded_at: datetime.datetime
+    paused: bool  # Paused Status
+    sync_frequency: int  # Sync Frequency (minutes)
+    status: FivetranConnectionStatus  # Status
+    config: FivetranConnectionConfig  # Connection Config
+    source_sync_details: FivetranConnectionSourceSyncDetails  # Source Sync Details
+
+    """
+    # Sample Response for Google Sheets Connector
+    {
+        "code": "Success",
+        "data": {
+            "id": "dialectical_remindful",
+            "group_id": "empties_classification",
+            "service": "google_sheets",
+            "service_version": 1,
+            "schema": "fivetran_google_sheets.fivetran_google_sheets",
+            "connected_by": "sewn_restrained",
+            "created_at": "2025-10-06T17:53:01.554289Z",
+            "succeeded_at": "2025-10-06T22:55:45.275000Z",
+            "failed_at": null,
+            "paused": true,
+            "pause_after_trial": false,
+            "sync_frequency": 360,
+            "data_delay_threshold": 0,
+            "data_delay_sensitivity": "NORMAL",
+            "private_link_id": null,
+            "networking_method": "Directly",
+            "proxy_agent_id": null,
+            "schedule_type": "auto",
+            "status": {
+                "setup_state": "connected",
+                "schema_status": "ready",
+                "sync_state": "paused",
+                "update_state": "on_schedule",
+                "is_historical_sync": false,
+                "tasks": [],
+                "warnings": [
+                    {
+                        "code": "snowflake_discontinuing_password_auth",
+                        "message": "Snowflake is discontinuing username/password authentication",
+                        "details": {}
+                    }
+                ]
+            },
+            "config": {
+                "auth_type": "ServiceAccount",
+                "sheet_id": "https://docs.google.com/spreadsheets/d/1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo/edit?gid=0#gid=0",
+                "named_range": "Fivetran_Test_Range"
+            },
+            "source_sync_details": {
+                "last_synced": "2025-10-06T22:55:27.371Z"
+            }
+        }
+    }
+    """
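
To see how a payload like the sample above maps onto these models, here is a runnable sketch using a trimmed copy of the sample "data" object; with pydantic's default extra-field handling, fields the models do not declare (e.g. "schedule_type") are simply not captured:

    from datahub.ingestion.source.fivetran.response_models import FivetranConnectionDetails

    # Trimmed version of the sample "data" object from the docstring above.
    data = {
        "id": "dialectical_remindful",
        "group_id": "empties_classification",
        "service": "google_sheets",
        "created_at": "2025-10-06T17:53:01.554289Z",
        "succeeded_at": "2025-10-06T22:55:45.275000Z",
        "paused": True,
        "sync_frequency": 360,
        "status": {
            "setup_state": "connected",
            "schema_status": "ready",
            "sync_state": "paused",
            "update_state": "on_schedule",
            "is_historical_sync": False,
            "warnings": [],
        },
        "config": {
            "auth_type": "ServiceAccount",
            "sheet_id": "https://docs.google.com/spreadsheets/d/...",
            "named_range": "Fivetran_Test_Range",
        },
        "source_sync_details": {"last_synced": "2025-10-06T22:55:27.371Z"},
    }

    details = FivetranConnectionDetails(**data)
    assert details.service == "google_sheets"
    assert details.status.sync_state == "paused"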
--- a/datahub/ingestion/source/gc/datahub_gc.py
+++ b/datahub/ingestion/source/gc/datahub_gc.py
@@ -34,7 +34,6 @@ from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
     SoftDeletedEntitiesCleanupConfig,
     SoftDeletedEntitiesReport,
 )
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 
 logger = logging.getLogger(__name__)
 
@@ -87,7 +86,6 @@ class DataHubGcSourceReport(
     DataProcessCleanupReport,
     SoftDeletedEntitiesReport,
     DatahubExecutionRequestCleanupReport,
-    IngestionStageReport,
 ):
     expired_tokens_revoked: int = 0
 
--- a/datahub/ingestion/source/gcs/gcs_source.py
+++ b/datahub/ingestion/source/gcs/gcs_source.py
@@ -16,6 +16,7 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin
 from datahub.ingestion.source.data_lake_common.data_lake_utils import PLATFORM_GCS
 from datahub.ingestion.source.data_lake_common.object_store import (
@@ -36,6 +37,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 
 logger: logging.Logger = logging.getLogger(__name__)
 
+GCS_ENDPOINT_URL = "https://storage.googleapis.com"
+
 
 class HMACKey(ConfigModel):
     hmac_access_id: str = Field(description="Access ID")
@@ -82,7 +85,14 @@ class GCSSourceReport(DataLakeSourceReport):
 @platform_name("Google Cloud Storage", id=PLATFORM_GCS)
 @config_class(GCSSourceConfig)
 @support_status(SupportStatus.INCUBATING)
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.GCS_BUCKET,
+        SourceCapabilityModifier.FOLDER,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(SourceCapability.DATA_PROFILING, "Not supported", supported=False)
 class GCSSource(StatefulIngestionSourceBase):
@@ -104,7 +114,7 @@ class GCSSource(StatefulIngestionSourceBase):
         s3_config = DataLakeSourceConfig(
             path_specs=s3_path_specs,
             aws_config=AwsConnectionConfig(
-                aws_endpoint_url=
+                aws_endpoint_url=GCS_ENDPOINT_URL,
                 aws_access_key_id=self.config.credential.hmac_access_id,
                 aws_secret_access_key=self.config.credential.hmac_access_secret.get_secret_value(),
                 aws_region="auto",
@@ -112,15 +122,26 @@ class GCSSource(StatefulIngestionSourceBase):
             env=self.config.env,
             max_rows=self.config.max_rows,
             number_of_files_to_sample=self.config.number_of_files_to_sample,
+            platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
+            platform_instance=self.config.platform_instance,
         )
         return s3_config
 
     def create_equivalent_s3_path_specs(self):
         s3_path_specs = []
         for path_spec in self.config.path_specs:
+            # PathSpec modifies the passed-in include to add /** to the end if
+            # autodetecting partitions. Remove that, otherwise creating a new
+            # PathSpec will complain.
+            # TODO: this should be handled inside PathSpec, which probably shouldn't
+            # modify its input.
+            include = path_spec.include
+            if include.endswith("{table}/**") and not path_spec.allow_double_stars:
+                include = include.removesuffix("**")
+
             s3_path_specs.append(
                 PathSpec(
-                    include=
+                    include=include.replace("gs://", "s3://"),
                     exclude=(
                         [exc.replace("gs://", "s3://") for exc in path_spec.exclude]
                         if path_spec.exclude
@@ -131,6 +152,11 @@ class GCSSource(StatefulIngestionSourceBase):
                     table_name=path_spec.table_name,
                     enable_compression=path_spec.enable_compression,
                     sample_files=path_spec.sample_files,
+                    allow_double_stars=path_spec.allow_double_stars,
+                    autodetect_partitions=path_spec.autodetect_partitions,
+                    include_hidden_folders=path_spec.include_hidden_folders,
+                    tables_filter_pattern=path_spec.tables_filter_pattern,
+                    traversal_method=path_spec.traversal_method,
                 )
             )
 
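
The net effect of create_equivalent_s3_path_specs on a single include pattern, as a standalone sketch with hypothetical values:

    include = "gs://my-bucket/landing/{table}/**"
    allow_double_stars = False

    # Mirrors the hunks above: strip the "**" that PathSpec auto-appends when
    # autodetecting partitions, then swap the scheme so S3Source can consume it.
    if include.endswith("{table}/**") and not allow_double_stars:
        include = include.removesuffix("**")
    s3_include = include.replace("gs://", "s3://")
    print(s3_include)  # s3://my-bucket/landing/{table}/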
@@ -138,7 +164,9 @@ class GCSSource(StatefulIngestionSourceBase):
 
     def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
         config = self.create_equivalent_s3_config()
-
+        # Create a new context for S3 source without graph to avoid duplicate checkpointer registration
+        s3_ctx = PipelineContext(run_id=ctx.run_id, pipeline_name=ctx.pipeline_name)
+        s3_source = S3Source(config, s3_ctx)
         return self.s3_source_overrides(s3_source)
 
     def s3_source_overrides(self, source: S3Source) -> S3Source: